/*
 * Decompiled with CFR 0.152.
 */
package org.gbif.utils.file.tabular;

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.function.BiFunction;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import org.gbif.common.shaded.com.google.common.base.Preconditions;
import org.gbif.common.shaded.com.google.common.collect.Sets;
import org.gbif.utils.file.CharsetDetection;
import org.gbif.utils.file.UnknownCharsetException;
import org.gbif.utils.file.tabular.TabularFileMetadata;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Utility class that guesses the metadata of a tabular (delimited text) file: its character
 * encoding, its delimiter character and its quote character.
 *
 * <p>The delimiter and quote characters are inferred from a sample of lines by counting the
 * occurrences of a fixed list of candidate characters. The class is stateless and cannot be
 * instantiated.
 */
public class TabularFileMetadataExtractor {
    private static final Logger LOG = LoggerFactory.getLogger(TabularFileMetadataExtractor.class);

    /** Maximum number of lines read from a file to build the sample. */
    private static final int MAX_SAMPLE_SIZE = 15;

    /** Buffer length handed to the charset detection (0x100000 = 1,048,576). */
    private static final int CHARSET_DETECTION_BUFFER_LENGTH = 0x100000;

    /** Candidate delimiters, in evaluation order (the first one wins on equal per-line counts). */
    private static final Character[] POTENTIAL_DELIMITER_CHAR = {',', '\t', ';', '|'};

    /** Candidate quote characters, in evaluation order (the first one wins on equal per-line counts). */
    private static final Character[] POTENTIAL_QUOTES_CHAR = {'"', '\''};

    /** Keeps only the stats of delimiters that appear at least once in their line. */
    private static final Predicate<LineDelimiterStats> CONTAINS_FREQUENCY = lineStats -> lineStats.getFrequency() > 0;

    /** Orders map entries by their {@code Long} value, highest first. */
    private static final Comparator<Map.Entry<Character, Long>> BY_VALUE_LONG_DESC =
        Comparator.comparing(Map.Entry::getValue, Collections.reverseOrder());

    /**
     * Builds, for a (delimiter, quote) pair, the pattern: delimiter, optional spaces, quote,
     * optional spaces, then one character that is not the delimiter.
     */
    private static final BiFunction<Character, Character, Pattern> COMPILE_QUOTE_PATTERN_FCT =
        (delimiter, quoteChar) -> Pattern.compile("[" + delimiter + "][ ]*[" + quoteChar + "][ ]*[^" + delimiter + "]");

    private TabularFileMetadataExtractor() {
    }

    /**
     * Extracts the metadata of the tabular file located at {@code filePath}.
     *
     * <p>The encoding is detected first, then at most {@link #MAX_SAMPLE_SIZE} lines are read with
     * that encoding and analysed by {@link #extractTabularMetadata(List)}.
     *
     * @param filePath path of the file to analyse, must not be {@code null} nor a directory
     * @return the metadata with the encoding set; delimiter and quote character are only set when
     *         a delimiter could be determined from the sample
     * @throws IOException if the file cannot be read
     * @throws UnknownCharsetException if the encoding cannot be detected or the detection fails
     *         with an {@link IOException}
     */
    public static TabularFileMetadata extractTabularFileMetadata(Path filePath) throws IOException, UnknownCharsetException {
        Objects.requireNonNull(filePath, "filePath shall be provided");
        Preconditions.checkArgument(!Files.isDirectory(filePath), "filePath should point to a file, not a directory");

        Charset encoding;
        try {
            encoding = CharsetDetection.detectEncoding(filePath.toFile(), CHARSET_DETECTION_BUFFER_LENGTH);
            if (encoding == null) {
                throw new UnknownCharsetException("Unable to detect the file's character encoding");
            }
        } catch (IOException e) {
            throw new UnknownCharsetException(e);
        }

        // Only the head of the file is needed: stop at end of file or once the sample is full.
        List<String> lines = new ArrayList<>(MAX_SAMPLE_SIZE);
        try (BufferedReader reader = Files.newBufferedReader(filePath, encoding)) {
            String line;
            while (lines.size() < MAX_SAMPLE_SIZE && (line = reader.readLine()) != null) {
                lines.add(line);
            }
        }

        TabularFileMetadata tabularFileMetadata = extractTabularMetadata(lines);
        tabularFileMetadata.setEncoding(encoding);
        return tabularFileMetadata;
    }

    /**
     * Extracts the delimiter and the quote character from a sample of lines.
     *
     * @param sample lines to analyse, must not be {@code null}
     * @return a new metadata object; it is returned untouched when no delimiter can be determined,
     *         otherwise the delimiter is set and the quote character is set to the detected one or
     *         {@code null} when none is detected
     */
    static TabularFileMetadata extractTabularMetadata(List<String> sample) {
        Objects.requireNonNull(sample, "sample shall be provided");
        TabularFileMetadata tabularFileMetadata = new TabularFileMetadata();

        Optional<Character> delimiterFound = getDelimiterChar(sample);
        if (!delimiterFound.isPresent()) {
            // Without a delimiter the quote detection has nothing to anchor on.
            return tabularFileMetadata;
        }

        Character delimiter = delimiterFound.get();
        Optional<Character> quoteFound =
            getHighestCountOf(sample, line -> getQuoteCharWithHighestCount(line, delimiter));

        tabularFileMetadata.setDelimiter(delimiter);
        tabularFileMetadata.setQuotedBy(quoteFound.orElse(null));
        return tabularFileMetadata;
    }

    /**
     * Applies {@code characterExtractor} to every line and returns the character that was
     * extracted for the highest number of lines.
     *
     * @param sample lines to analyse
     * @param characterExtractor function returning the character selected for a single line, if any
     * @return the character selected most often, or empty when no line yields a character
     */
    private static Optional<Character> getHighestCountOf(List<String> sample, Function<String, Optional<Character>> characterExtractor) {
        Map<Character, Long> countPerCharacter = sample.stream()
            .map(characterExtractor)
            .flatMap(o -> o.map(Stream::of).orElseGet(Stream::empty))
            .collect(Collectors.groupingBy(Function.identity(), Collectors.counting()));

        return countPerCharacter.entrySet().stream()
            .sorted(BY_VALUE_LONG_DESC)
            .findFirst()
            .map(Map.Entry::getKey);
    }

    /**
     * Determines the delimiter used in a sample of lines.
     *
     * <p>Three sets of candidates are computed:
     * <ul>
     *   <li>most stable: delimiters with the fewest distinct per-line counts, ignoring delimiters
     *       that never appear;</li>
     *   <li>most frequent: delimiters with the highest total count over the sample;</li>
     *   <li>most frequent per line: delimiters that are the top delimiter of the most lines.</li>
     * </ul>
     * The result is the first pairwise intersection, in the order (stable, frequent),
     * (stable, frequent per line), (frequent, frequent per line), that contains exactly one
     * character.
     *
     * @param sample lines to analyse
     * @return the detected delimiter, or empty when no single candidate can be isolated
     */
    public static Optional<Character> getDelimiterChar(List<String> sample) {
        List<LineDelimiterStats> linesStats = computeLineDelimiterStats(sample);

        // Drop delimiters whose only observed count is 0, then order by number of distinct counts.
        Map<Character, Set<Integer>> delimiterDistinctFrequency =
            computeDelimiterDistinctFrequency(linesStats).entrySet().stream()
                .filter(entry -> entry.getValue().size() > 1 || !entry.getValue().contains(0))
                .sorted(Comparator.comparing(e -> e.getValue().size()))
                .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e2, LinkedHashMap::new));
        Set<Character> mostStableDelimiter =
            getAllEqualsToFirst(delimiterDistinctFrequency, (s1, s2) -> s1.size() == s2.size());

        Map<Character, Integer> delimiterFrequencySums =
            computeDelimiterFrequencySums(linesStats).entrySet().stream()
                .sorted(Map.Entry.<Character, Integer>comparingByValue().reversed())
                .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e2, LinkedHashMap::new));
        Set<Character> mostFrequentDelimiter = getAllEqualsToFirst(delimiterFrequencySums, Integer::equals);

        Map<Character, Long> delimiterHighestFrequencyPerLine =
            computeDelimiterHighestFrequencyPerLine(sample).entrySet().stream()
                .sorted(Map.Entry.<Character, Long>comparingByValue().reversed())
                .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e2, LinkedHashMap::new));
        Set<Character> mostFrequentDelimiterPerLine = getAllEqualsToFirst(delimiterHighestFrequencyPerLine, Long::equals);

        LOG.debug("delimiterDistinctFrequency -> {}", delimiterDistinctFrequency);
        LOG.debug("mostStableDelimiter -> {}", mostStableDelimiter);
        LOG.debug("delimiterFrequencySums -> {}", delimiterFrequencySums);
        LOG.debug("mostFrequentDelimiter -> {}", mostFrequentDelimiter);
        LOG.debug("delimiterHighestFrequencyPerLine->{}", delimiterHighestFrequencyPerLine);
        LOG.debug("mostFrequentDelimiterPerLine ->{}", mostFrequentDelimiterPerLine);

        Optional<Character> resultCharacter = intersectSingle(mostStableDelimiter, mostFrequentDelimiter);
        if (resultCharacter.isPresent()) {
            return resultCharacter;
        }
        resultCharacter = intersectSingle(mostStableDelimiter, mostFrequentDelimiterPerLine);
        if (resultCharacter.isPresent()) {
            return resultCharacter;
        }
        return intersectSingle(mostFrequentDelimiter, mostFrequentDelimiterPerLine);
    }

    /**
     * Returns the single character shared by both sets.
     *
     * @return the common character, or empty when the intersection does not contain exactly one
     *         element
     */
    private static Optional<Character> intersectSingle(Set<Character> set1, Set<Character> set2) {
        Sets.SetView<Character> intersection = Sets.intersection(set1, set2);
        return intersection.size() == 1 ? intersection.stream().findFirst() : Optional.empty();
    }

    /**
     * Returns the keys whose value is considered equal to the value of the first entry of the map,
     * in the map's iteration order.
     *
     * @param map map to inspect; callers pass maps with a meaningful iteration order
     * @param equalsPredicate receives the first value and the value of each entry
     * @return the matching keys (always including the first key when the predicate is reflexive),
     *         or an empty set for an empty map
     */
    private static <T> Set<Character> getAllEqualsToFirst(Map<Character, T> map, BiFunction<T, T, Boolean> equalsPredicate) {
        if (map.isEmpty()) {
            return Collections.emptySet();
        }
        T firstValue = map.entrySet().iterator().next().getValue();
        return map.entrySet().stream()
            .filter(e -> equalsPredicate.apply(firstValue, e.getValue()))
            .map(Map.Entry::getKey)
            .collect(Collectors.toSet());
    }

    /**
     * Computes, for every line of the sample, one {@link LineDelimiterStats} per candidate
     * delimiter (including those with a count of 0).
     */
    static List<LineDelimiterStats> computeLineDelimiterStats(List<String> sample) {
        return sample.stream()
            .map(TabularFileMetadataExtractor::lineToLineDelimiterStats)
            .flatMap(Collection::stream)
            .collect(Collectors.toList());
    }

    /** Counts the occurrences of each candidate delimiter in a single line. */
    private static List<LineDelimiterStats> lineToLineDelimiterStats(String line) {
        return Arrays.stream(POTENTIAL_DELIMITER_CHAR)
            .map(delimiter -> new LineDelimiterStats(delimiter, StringUtils.countMatches(line, delimiter.charValue())))
            .collect(Collectors.toList());
    }

    /** Groups the stats by delimiter and collects the distinct per-line counts of each delimiter. */
    static Map<Character, Set<Integer>> computeDelimiterDistinctFrequency(List<LineDelimiterStats> linesStats) {
        return linesStats.stream()
            .collect(Collectors.groupingBy(LineDelimiterStats::getDelimiter,
                Collectors.mapping(LineDelimiterStats::getFrequency, Collectors.toSet())));
    }

    /** Counts, per delimiter, the number of lines in which it is the delimiter with the highest count. */
    static Map<Character, Long> computeDelimiterHighestFrequencyPerLine(List<String> lines) {
        return lines.stream()
            .map(TabularFileMetadataExtractor::getDelimiterWithHighestCount2)
            .flatMap(o -> o.map(Stream::of).orElseGet(Stream::empty))
            .collect(Collectors.groupingBy(LineDelimiterStats::getDelimiter, Collectors.counting()));
    }

    /** Sums, per delimiter, its counts over all lines; delimiters that never appear are absent. */
    static Map<Character, Integer> computeDelimiterFrequencySums(List<LineDelimiterStats> linesStats) {
        return linesStats.stream()
            .filter(CONTAINS_FREQUENCY)
            .collect(Collectors.groupingBy(LineDelimiterStats::getDelimiter,
                Collectors.summingInt(LineDelimiterStats::getFrequency)));
    }

    /**
     * Returns the candidate delimiter with the highest number of occurrences in the line.
     *
     * @return the delimiter, or empty when no candidate appears in the line; on equal counts the
     *         first candidate in evaluation order is returned
     */
    static Optional<Character> getDelimiterWithHighestCount(String line) {
        return getDelimiterWithHighestCount2(line).map(LineDelimiterStats::getDelimiter);
    }

    /**
     * Same as {@link #getDelimiterWithHighestCount(String)} but also reports the count.
     *
     * @return the stats of the top delimiter, or empty when no candidate appears in the line
     */
    static Optional<LineDelimiterStats> getDelimiterWithHighestCount2(String line) {
        int highestCount = 0;
        LineDelimiterStats lineDelimiterStats = null;
        for (Character delimiter : POTENTIAL_DELIMITER_CHAR) {
            int currentCount = StringUtils.countMatches(line, delimiter.charValue());
            // Strictly greater: an earlier candidate is kept on equal counts.
            if (currentCount > highestCount) {
                highestCount = currentCount;
                lineDelimiterStats = new LineDelimiterStats(delimiter, currentCount);
            }
        }
        return Optional.ofNullable(lineDelimiterStats);
    }

    /**
     * Returns the candidate quote character that most often follows the delimiter in the line,
     * according to the pattern built by {@link #COMPILE_QUOTE_PATTERN_FCT}.
     *
     * @param line line to analyse
     * @param delimiter delimiter used to locate quoted values
     * @return the quote character, or empty when no candidate matches; on equal counts the first
     *         candidate in evaluation order is returned
     */
    static Optional<Character> getQuoteCharWithHighestCount(String line, Character delimiter) {
        int highestCount = 0;
        Character highestCountQuoteChar = null;
        for (Character quoteChar : POTENTIAL_QUOTES_CHAR) {
            Matcher matcher = COMPILE_QUOTE_PATTERN_FCT.apply(delimiter, quoteChar).matcher(line);
            int currentCount = 0;
            while (matcher.find()) {
                currentCount++;
            }
            if (currentCount > highestCount) {
                highestCount = currentCount;
                highestCountQuoteChar = quoteChar;
            }
        }
        return Optional.ofNullable(highestCountQuoteChar);
    }

    /** Immutable pair of a delimiter and the number of times it occurs in one line. */
    static class LineDelimiterStats {
        private final Character delimiter;
        private final int frequency;

        LineDelimiterStats(Character delimiter, int frequency) {
            this.delimiter = delimiter;
            this.frequency = frequency;
        }

        Character getDelimiter() {
            return this.delimiter;
        }

        int getFrequency() {
            return this.frequency;
        }
    }
}

