/*
 * Decompiled with CFR 0.152.
 */
package org.gbif.nameparser;

import com.google.common.base.Strings;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.Nullable;
import org.apache.commons.lang3.StringEscapeUtils;
import org.gbif.api.model.checklistbank.ParsedName;
import org.gbif.api.vocabulary.NameType;
import org.gbif.api.vocabulary.Rank;
import org.gbif.nameparser.NormalisedNameParser;
import org.gbif.nameparser.UnparsableException;
import org.gbif.utils.text.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class NameParser {
    // Class-wide logger.
    private static Logger LOG = LoggerFactory.getLogger(NameParser.class);
    // Parser that does the actual work once a name string has been cleaned and normalised.
    private final NormalisedNameParser nnParser;
    // Quote characters stripped from both ends of a name by preClean; the four slots are
    // filled in the static initializer with the double quote and the apostrophe, twice each.
    private static char[] QUOTES = new char[4];

    // All Pattern constants below are blank finals that are compiled in the static
    // initializer at the bottom of this class.

    // --- Markers and whole-name classifiers used by parse() ---
    public static final String HYBRID_MARKER = "\u00d7";
    public static final Pattern HYBRID_FORMULA_PATTERN;
    public static final String EXTINCT_MARKER = "\u2020";
    private static final Pattern EXTINCT_PATTERN;
    protected static final Pattern CULTIVAR;
    private static final Pattern STRAIN;
    public static final Pattern IS_VIRUS_PATTERN;
    public static final Pattern IS_VIRUS_PATTERN_CASE_SENSITIVE;
    // Only consulted by parse() after every parsing attempt has failed.
    private static final Pattern IS_VIRUS_PATTERN_POSTFAIL;
    public static final Pattern IS_GENE;
    private static final String CANDIDATUS = "(Candidatus\\s|Ca\\.)\\s*";
    private static final Pattern IS_CANDIDATUS_PATTERN;
    private static final Pattern IS_CANDIDATUS_QUOTE_PATTERN;
    private static final Pattern RANK_MARKER_AT_END;

    // --- Extractors: parse() stores group 1 on the ParsedName and removes the match ---
    private static final String SENSU = "(s\\.(?:l\\.|str\\.)|sensu\\s+(?:latu|strictu?)|(sec|sensu|auct|non)((\\.|\\s)(.*))?)";
    private static final Pattern EXTRACT_SENSU;
    private static final String NOV_RANKS = "fam|gen|sp|ssp|var|forma";
    private static final Pattern NOV_RANK_MARKER;
    protected static final Pattern EXTRACT_NOMSTATUS;
    private static final Pattern EXTRACT_REMARKS;
    private static final Pattern EXTRACT_YEAR;

    // --- Normalisation patterns applied by normalize() and normalizeStrong() ---
    private static final Pattern COMMA_BEFORE_YEAR;
    private static final Pattern REPLACE_QUOTES;
    private static final Pattern NORM_QUOTES;
    private static final Pattern NORM_UPPERCASE_WORDS;
    private static final Pattern NORM_WHITESPACE;
    private static final Pattern NORM_NO_SQUARE_BRACKETS;
    private static final Pattern NORM_BRACKETS_OPEN;
    private static final Pattern NORM_BRACKETS_CLOSE;
    private static final Pattern NORM_BRACKETS_OPEN_STRONG;
    private static final Pattern NORM_BRACKETS_CLOSE_STRONG;
    private static final Pattern NORM_AND;
    private static final Pattern NORM_ET_AL;
    private static final Pattern NORM_AMPERSAND_WS;
    private static final Pattern NORM_HYPHENS;
    private static final Pattern NORM_SUBGENUS;
    private static final Pattern NO_Q_MARKS;
    private static final Pattern NORM_COMMAS;
    private static final Pattern NORM_ORIG_AUTH;
    private static final Pattern NORM_ORIG_AUTH2;
    private static final Pattern NORM_IMPRINT_YEAR;
    private static final Pattern NORM_HYBRIDS_GENUS;
    private static final Pattern NORM_HYBRIDS_EPITH;
    private static final Pattern NORM_HYBRIDS_FORM;
    private static final Pattern NORM_INDET;
    private static final Pattern NORM_DOTS;
    private static final Pattern NORM_TF_GENUS;
    private static final Pattern NORM_IN_COMMA;
    // NOTE(review): compiled in the static initializer but not referenced by any method of this class.
    private static final Pattern NORM_IN_BIB;
    private static final Pattern NORM_PREFIXES;
    private static final Pattern NORM_SUFFIXES;

    // --- Validity checks used by parse() ---
    private static final Pattern NO_LETTERS;
    private static final Pattern PLACEHOLDER;
    private static final Pattern DOUBTFUL;
    private static final Pattern DOUBTFUL2;

    // --- Patterns used by cleanStrong() and preClean() ---
    private static final Pattern BAD_NAME_SUFFICES;
    private static final Pattern XML_ENTITY_STRIP;
    private static final Pattern AMPERSAND_ENTITY;
    private static final Pattern XML_TAGS;
    private static final Pattern FIRST_WORD;
    private static final String WEIRD_CHARS = "[\u00a7$%/#+!;:_|\"=*]";
    private static final Pattern NORM_WEIRD_CHARS;
    // Rewrites microbial "...type" rank words to their "...var" form; built from Rank names.
    private static final Pattern TYPE_TO_VAR;
    // NOTE(review): compiled in the static initializer but not referenced by any method of this class.
    private static final Pattern COMB_BAS_AUTHOR_SWAP;

    /**
     * Creates a parser with the default timeout.
     * <p>
     * Delegates to {@link #NameParser(long)} with 1000, which hands half of that value (500)
     * to the underlying {@link NormalisedNameParser}.
     */
    public NameParser() {
        this(1000L);
    }

    public NameParser(long timeout) {
        this.nnParser = new NormalisedNameParser(timeout / 2L);
    }

    /**
     * Aggressive clean-up that {@link #parse(String, Rank)} falls back to when the normalised
     * name could not be parsed.
     * <p>
     * Steps, in order:
     * <ol>
     *   <li>remove whatever BAD_NAME_SUFFICES matches;</li>
     *   <li>replace every character matched by NORM_WEIRD_CHARS with a blank;</li>
     *   <li>capitalise the first word (first letter upper case, rest lower case) unless it
     *       starts with a hybrid marker glued to a capital letter (group 2 of FIRST_WORD);</li>
     *   <li>normalise a leading hybrid sign in front of the genus.</li>
     * </ol>
     *
     * @param name the name to clean, may be null
     * @return the cleaned name, or null if {@code name} was null
     */
    protected static String cleanStrong(String name) {
        if (name == null) {
            return null;
        }
        name = BAD_NAME_SUFFICES.matcher(name).replaceAll("");
        name = NORM_WEIRD_CHARS.matcher(name).replaceAll(" ");

        Matcher firstWord = FIRST_WORD.matcher(name);
        if (firstWord.find() && firstWord.group(2) == null) {
            // Locale.ROOT keeps the ASCII case mapping stable; with the default locale a
            // Turkish JVM would turn "i" into a dotted capital I and corrupt the genus.
            String capitalised = org.apache.commons.lang3.StringUtils.defaultString(firstWord.group(1))
                + firstWord.group(3).toUpperCase(Locale.ROOT)
                + firstWord.group(4).toLowerCase(Locale.ROOT)
                + " ";
            name = firstWord.replaceFirst(capitalised);
        }
        return NORM_HYBRIDS_GENUS.matcher(name).replaceFirst("\u00d7$1");
    }

    /**
     * Carefully normalises a name string: punctuation spacing, hybrid markers, all-caps words
     * and whitespace are brought into a canonical layout.
     * <p>
     * Steps, in order: unescape unicode sequences, put a blank after the dots matched by
     * NORM_DOTS, separate a year from what precedes it with ", ", tighten hyphens, pad
     * ampersands, normalise spacing around brackets and commas, normalise the three hybrid
     * notations to the multiplication sign, turn words written entirely in capitals into
     * capitalised words, and finally collapse whitespace.
     *
     * @param name the name to normalise, may be null
     * @return the normalised and trimmed name (never null for non-null input), or null if
     *         {@code name} was null
     */
    public static String normalize(String name) {
        if (name == null) {
            return null;
        }
        name = StringUtils.unescapeUnicodeChars(name);
        name = NORM_DOTS.matcher(name).replaceAll("$1. ");
        name = COMMA_BEFORE_YEAR.matcher(name).replaceAll("$1, $2");
        name = NORM_HYPHENS.matcher(name).replaceAll("-");
        name = NORM_AMPERSAND_WS.matcher(name).replaceAll(" & ");
        name = NORM_BRACKETS_OPEN.matcher(name).replaceAll(" $1");
        name = NORM_BRACKETS_CLOSE.matcher(name).replaceAll("$1 ");
        name = NORM_COMMAS.matcher(name).replaceAll(", ");
        name = NORM_HYBRIDS_GENUS.matcher(name).replaceFirst("\u00d7$1");
        name = NORM_HYBRIDS_EPITH.matcher(name).replaceFirst("$1 \u00d7$2");
        name = NORM_HYBRIDS_FORM.matcher(name).replaceAll(" \u00d7 ");

        // Capitalise all-caps words exactly where they were matched. Replacing by text
        // (String.replaceFirst) could hit an earlier, unrelated occurrence of the same letters
        // inside another word. Locale.ROOT keeps the case mapping independent of the JVM locale.
        Matcher upperWords = NORM_UPPERCASE_WORDS.matcher(name);
        StringBuffer capitalised = new StringBuffer(name.length());
        while (upperWords.find()) {
            String word = upperWords.group(1) + upperWords.group(2).toLowerCase(Locale.ROOT);
            upperWords.appendReplacement(capitalised, Matcher.quoteReplacement(word));
        }
        upperWords.appendTail(capitalised);
        name = capitalised.toString();

        name = NORM_WHITESPACE.matcher(name).replaceAll(" ");
        return org.apache.commons.lang3.StringUtils.trimToEmpty(name);
    }

    /**
     * Stronger normalisation applied right before the name is handed to the
     * {@link NormalisedNameParser}; it ends by delegating to {@link #normalize(String)}.
     * <p>
     * Steps, in order: unify quote characters and drop quotes/commas at either end, remove
     * question marks that follow a lower case letter or hyphen, strip leading rank prefixes,
     * fix a genus abbreviation written as {@code X(yz).}, drop imprint years that follow a
     * year, rewrite the first ", in " to " in ", capitalise all-caps words, move a year next
     * to its bracketed original author, remove square brackets, normalise round brackets,
     * "and" words and "et al.", strip trailing suffixes and bracket an unbracketed subgenus.
     *
     * @param name the name to normalise, may be null
     * @return the normalised and trimmed name, or null if {@code name} was null
     */
    protected static String normalizeStrong(String name) {
        if (name == null) {
            return null;
        }
        name = NORM_QUOTES.matcher(name).replaceAll("'");
        name = REPLACE_QUOTES.matcher(name).replaceAll("");
        name = NO_Q_MARKS.matcher(name).replaceAll("$1");
        name = NORM_PREFIXES.matcher(name).replaceAll("");
        name = NORM_TF_GENUS.matcher(name).replaceAll("$1$2 ");
        name = NORM_IMPRINT_YEAR.matcher(name).replaceAll("$1");
        name = NORM_IN_COMMA.matcher(name).replaceFirst(" in ");

        // Capitalise all-caps words exactly where they were matched. Replacing by text
        // (String.replaceFirst) could hit an earlier, unrelated occurrence of the same letters
        // inside another word. Locale.ROOT keeps the case mapping independent of the JVM locale.
        Matcher upperWords = NORM_UPPERCASE_WORDS.matcher(name);
        StringBuffer capitalised = new StringBuffer(name.length());
        while (upperWords.find()) {
            String word = upperWords.group(1) + upperWords.group(2).toLowerCase(Locale.ROOT);
            upperWords.appendReplacement(capitalised, Matcher.quoteReplacement(word));
        }
        upperWords.appendTail(capitalised);
        name = capitalised.toString();

        // The author/year rewrites only run for names shorter than 80 characters that contain
        // something year-like. NOTE(review): the length cap presumably bounds the cost of the
        // large author regexes - confirm.
        if (name.length() < 80 && EXTRACT_YEAR.matcher(name).find()) {
            name = NORM_ORIG_AUTH.matcher(name).replaceAll("($1 $2)");
            name = NORM_ORIG_AUTH2.matcher(name).replaceAll("($1 $2)");
        }

        name = NORM_NO_SQUARE_BRACKETS.matcher(name).replaceAll(" $1 ");
        name = NORM_BRACKETS_OPEN_STRONG.matcher(name).replaceAll(" (");
        name = NORM_BRACKETS_CLOSE_STRONG.matcher(name).replaceAll(") ");
        name = NORM_AND.matcher(name).replaceAll(" & ");
        name = NORM_ET_AL.matcher(name).replaceAll(" et al.");
        name = NORM_SUFFIXES.matcher(name).replaceAll("");
        name = NORM_SUBGENUS.matcher(name).replaceAll("$1 ($2) $3");

        name = NameParser.normalize(name);
        return org.apache.commons.lang3.StringUtils.trimToEmpty(name);
    }

    /**
     * First-pass clean-up of a raw name string before any name-specific processing.
     * <p>
     * Unescapes unicode sequences, repairs XML entities that contain stray whitespace and
     * unescapes HTML entities, rewrites a broken "&amp; amp " to "&amp; ", removes tags matched
     * by XML_TAGS, strips quote characters that wrap the name and collapses whitespace.
     *
     * @param name the raw name, may be null
     * @return the cleaned and trimmed name, or null if {@code name} was null (consistent with
     *         {@link #normalize(String)} and the other cleaning helpers)
     */
    protected static String preClean(String name) {
        if (name == null) {
            return null;
        }
        name = StringUtils.unescapeUnicodeChars(name);
        name = XML_ENTITY_STRIP.matcher(name).replaceAll("&$1;");
        name = StringEscapeUtils.unescapeHtml4(name);
        name = AMPERSAND_ENTITY.matcher(name).replaceAll("& ");
        name = XML_TAGS.matcher(name).replaceAll("");
        name = name.trim();

        // Each quote character is tried in turn (QUOTES lists both twice, so two levels of
        // nesting are peeled). Trailing quotes are only removed when leading ones were found.
        for (char quote : QUOTES) {
            int leading = 0;
            while (leading < name.length()
                && (name.charAt(leading) == quote || Character.isWhitespace(name.charAt(leading)))) {
                leading++;
            }
            if (leading == 0) {
                continue;
            }
            int trailing = 0;
            // Bounds test first: never strip into the leading run that is about to be cut off.
            while (name.length() - leading - trailing > 0
                && name.charAt(name.length() - 1 - trailing) == quote) {
                trailing++;
            }
            name = name.substring(leading, name.length() - trailing);
        }

        name = NORM_WHITESPACE.matcher(name).replaceAll(" ");
        return org.apache.commons.lang3.StringUtils.trimToEmpty(name);
    }

    /**
     * Parses a verbatim scientific name into a {@link ParsedName}.
     * <p>
     * The name is pre-cleaned, classified (candidatus, strain, placeholder, virus, gene,
     * cultivar, hybrid formula), stripped of nomenclatural status, sensu and remark parts,
     * strongly normalised and then handed to the {@link NormalisedNameParser}. If that fails the
     * name is cleaned more aggressively and retried, finally retried while ignoring authors.
     *
     * @param scientificName the verbatim name string
     * @param rank optional rank hint passed to the normalised parser, may be null
     * @return the parsed name; its scientific name is always the verbatim input
     * @throws UnparsableException with type NO_NAME for null, empty or letter-free input,
     *         PLACEHOLDER for placeholder names, VIRUS for virus names, HYBRID for hybrid
     *         formulas and DOUBTFUL when all parsing attempts fail or the result is implausible
     */
    public ParsedName parse(String scientificName, @Nullable Rank rank) throws UnparsableException {
        if (Strings.isNullOrEmpty(scientificName)) {
            throw new UnparsableException(NameType.NO_NAME, scientificName);
        }
        // Only pay for the clock when the timing is going to be logged.
        final boolean debug = LOG.isDebugEnabled();
        final long start = debug ? System.currentTimeMillis() : 0L;

        ParsedName pn = new ParsedName();
        pn.setScientificName(scientificName);

        String name = NameParser.preClean(scientificName);
        name = EXTINCT_PATTERN.matcher(name).replaceFirst("");

        // NOTE(review): this matches the raw input, so on a hit the preClean/extinct-marker work
        // above is discarded - presumably because preClean strips the surrounding quotes this
        // pattern requires; confirm.
        Matcher m = IS_CANDIDATUS_QUOTE_PATTERN.matcher(scientificName);
        if (m.find()) {
            pn.setType(NameType.CANDIDATUS);
            // group(2) is arbitrary user text: quote it so "$" or "\" cannot be interpreted as
            // a group reference and blow up with an unchecked exception.
            name = m.replaceFirst(Matcher.quoteReplacement(m.group(2)));
        }

        name = TYPE_TO_VAR.matcher(name).replaceAll("$1var");
        m = STRAIN.matcher(name);
        if (m.find()) {
            String strain = m.group(2);
            name = m.replaceFirst(m.group(1));
            pn.setType(NameType.INFORMAL);
            pn.setStrain(strain);
            LOG.debug("Strain: {}", strain);
        }

        if (PLACEHOLDER.matcher(name).find()) {
            throw new UnparsableException(NameType.PLACEHOLDER, scientificName);
        }
        if (IS_VIRUS_PATTERN.matcher(name).find() || IS_VIRUS_PATTERN_CASE_SENSITIVE.matcher(name).find()) {
            throw new UnparsableException(NameType.VIRUS, scientificName);
        }
        if (IS_GENE.matcher(name).find()) {
            pn.setType(NameType.INFORMAL);
        }

        name = NameParser.normalize(name);
        if (Strings.isNullOrEmpty(name)) {
            throw new UnparsableException(NameType.NO_NAME, scientificName);
        }

        m = CULTIVAR.matcher(name);
        if (m.find()) {
            pn.setCultivarEpithet(m.group(1));
            name = m.replaceFirst(" ");
            pn.setType(NameType.CULTIVAR);
            pn.setRank(Rank.CULTIVAR);
            LOG.debug("Cultivar: {}", pn.getCultivarEpithet());
        }

        if (NO_LETTERS.matcher(name).find()) {
            throw new UnparsableException(NameType.NO_NAME, scientificName);
        }
        if (HYBRID_FORMULA_PATTERN.matcher(name).find()) {
            throw new UnparsableException(NameType.HYBRID, scientificName);
        }

        m = IS_CANDIDATUS_PATTERN.matcher(name);
        if (m.find()) {
            pn.setType(NameType.CANDIDATUS);
            name = m.replaceFirst("");
        }

        m = EXTRACT_NOMSTATUS.matcher(name);
        if (m.find()) {
            pn.setNomStatus(org.apache.commons.lang3.StringUtils.trimToNull(m.group(1)));
            name = m.replaceFirst("");
            // A status such as "sp. nov." also tells us the rank marker.
            if (pn.getNomStatus() != null) {
                Matcher rm = NOV_RANK_MARKER.matcher(pn.getNomStatus());
                if (rm.find()) {
                    pn.setRankMarker(rm.group(1).trim());
                }
            }
        }

        m = EXTRACT_SENSU.matcher(name);
        if (m.find()) {
            pn.setSensu(org.apache.commons.lang3.StringUtils.trimToNull(m.group(1)));
            name = m.replaceFirst("");
        }

        m = EXTRACT_REMARKS.matcher(name);
        if (m.find()) {
            pn.setRemarks(org.apache.commons.lang3.StringUtils.trimToNull(m.group(1)));
            name = m.replaceFirst("");
        }

        // A dangling rank marker makes the name informal; a trailing " f." / " f" is left
        // alone. NOTE(review): presumably because it can be the filius author suffix - confirm.
        m = RANK_MARKER_AT_END.matcher(name);
        if (m.find() && !name.endsWith(" f.") && !name.endsWith(" f")) {
            pn.setType(NameType.INFORMAL);
            pn.setRankMarker(m.group(2));
            name = m.replaceAll(" ");
        }

        m = NORM_INDET.matcher(name);
        if (m.find()) {
            pn.setType(NameType.INFORMAL);
            name = m.replaceAll(" ");
        }

        name = NameParser.normalizeStrong(name);
        if (Strings.isNullOrEmpty(name)) {
            throw new UnparsableException(NameType.DOUBTFUL, scientificName);
        }

        // Remember a rank set above (cultivar) so it can be restored after parsing.
        Rank origRank = pn.getRank();
        boolean parsed = this.nnParser.parseNormalisedName(pn, name, rank);
        if (!parsed) {
            LOG.debug("Can't parse, use dirty normalizer");
            String deDirtedName = NameParser.cleanStrong(name);
            parsed = this.nnParser.parseNormalisedName(pn, deDirtedName, rank);
            if (!parsed) {
                LOG.debug("Still can't parse, try to ignore authors");
                parsed = this.nnParser.parseNormalisedNameIgnoreAuthors(pn, deDirtedName, rank);
                pn.setAuthorsParsed(false);
                if (!parsed) {
                    if (IS_VIRUS_PATTERN_POSTFAIL.matcher(name).find()) {
                        throw new UnparsableException(NameType.VIRUS, scientificName);
                    }
                    throw new UnparsableException(NameType.DOUBTFUL, scientificName);
                }
            }
        }
        if (origRank != null) {
            pn.setRank(origRank);
        }

        this.postAssertParsing(pn, scientificName, name);

        if (pn.getType() == null) {
            // SCIENTIFIC only if the verbatim string matches DOUBTFUL (the set of expected
            // characters) and does not contain the word "null"; otherwise DOUBTFUL.
            boolean expectedCharsOnly = DOUBTFUL.matcher(scientificName).find();
            boolean containsNullWord = DOUBTFUL2.matcher(scientificName).find();
            pn.setType(expectedCharsOnly && !containsNullWord ? NameType.SCIENTIFIC : NameType.DOUBTFUL);
        }

        if (debug) {
            LOG.debug("Parsing time: {}", System.currentTimeMillis() - start);
        }
        return pn;
    }

    /**
     * Sanity-checks a freshly parsed name and downgrades its type where rank and parsed parts
     * contradict each other.
     *
     * @param pn the parsed name; its type may be changed to INFORMAL or DOUBTFUL
     * @param rawName the verbatim name, only used for the exception
     * @param normedName the normalised name that was parsed; must not be empty when the parsed
     *                   name has a genus-or-above part and is not a binomial
     * @throws UnparsableException of type DOUBTFUL if a non-binomial name with a genus-or-above
     *         part starts with a lower case letter
     */
    private void postAssertParsing(ParsedName pn, String rawName, String normedName) throws UnparsableException {
        if (pn.getGenusOrAbove() != null && !pn.isBinomial() && Character.isLowerCase(normedName.charAt(0))) {
            throw new UnparsableException(NameType.DOUBTFUL, rawName);
        }

        final Rank parsedRank = pn.getRank();
        if (parsedRank == null) {
            return;
        }

        // Informal: a species-or-lower (non cultivar) rank without a binomial, a cultivar
        // without a cultivar epithet, or an infraspecific rank without an infraspecific epithet.
        final boolean informal =
            (parsedRank.isSpeciesOrBelow() && !pn.isBinomial() && !parsedRank.equals(Rank.CULTIVAR))
            || (parsedRank.equals(Rank.CULTIVAR) && pn.getCultivarEpithet() == null)
            || (parsedRank.isInfraspecific() && pn.getInfraSpecificEpithet() == null);

        if (informal) {
            pn.setType(NameType.INFORMAL);
            return;
        }
        // Doubtful: a binomial that carries a rank above species.
        if (!parsedRank.isSpeciesOrBelow() && pn.isBinomial()) {
            pn.setType(NameType.DOUBTFUL);
        }
    }

    /**
     * Parses a name and returns only its canonical form.
     *
     * @param scientificName the verbatim name string, may be null or empty
     * @param rank optional rank hint passed through to {@link #parse(String, Rank)}
     * @return the canonical name of the parsed name, or null when the input is null or empty or
     *         the name cannot be parsed (a warning is logged in the latter case)
     */
    public String parseToCanonical(String scientificName, @Nullable Rank rank) {
        if (Strings.isNullOrEmpty(scientificName)) {
            return null;
        }
        try {
            ParsedName pn = this.parse(scientificName, rank);
            if (pn != null) {
                return pn.canonicalName();
            }
        }
        catch (UnparsableException e) {
            // Parameterized like the other log calls in this class; the rendered text is unchanged.
            LOG.warn("Unparsable name {} >>> {}", scientificName, e.getMessage());
        }
        return null;
    }

    /**
     * Parses a name and returns its canonical form, falling back to the whitespace-normalised
     * verbatim name when the name cannot be parsed.
     *
     * @param scientificName the verbatim name string, may be null or empty
     * @param rank optional rank hint passed through to {@link #parse(String, Rank)}
     * @return null for null or empty input; otherwise whatever {@code canonicalName()} returns
     *         for the parsed name, or the trimmed, space-normalised input if parsing fails
     *         (a warning is logged in that case)
     */
    public String parseToCanonicalOrScientificName(String scientificName, @Nullable Rank rank) {
        if (Strings.isNullOrEmpty(scientificName)) {
            return null;
        }
        try {
            ParsedName pn = this.parse(scientificName, rank);
            if (pn != null) {
                return pn.canonicalName();
            }
        }
        catch (UnparsableException e) {
            // Parameterized like the other log calls in this class; the rendered text is unchanged.
            LOG.warn("Unparsable name {} >>> {}", scientificName, e.getMessage());
        }
        return org.apache.commons.lang3.StringUtils.normalizeSpace(scientificName.trim());
    }

    /**
     * @return the underlying parser for normalised names that this instance delegates to
     */
    public NormalisedNameParser getNormalisedNameParser() {
        return nnParser;
    }

    static {
        NameParser.QUOTES[0] = 34;
        NameParser.QUOTES[1] = 39;
        NameParser.QUOTES[2] = 34;
        NameParser.QUOTES[3] = 39;
        HYBRID_FORMULA_PATTERN = Pattern.compile(" \u00d7 ");
        EXTINCT_PATTERN = Pattern.compile("\u2020\\s*");
        CULTIVAR = Pattern.compile("(?: cv\\.? ?)?[\"'] ?((?:[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152]?[a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153]+[- ]?){1,3}) ?[\"']");
        STRAIN = Pattern.compile("([a-z]\\.?) +([A-Z]+ *[0-9]+T?)$");
        IS_VIRUS_PATTERN = Pattern.compile("virus(es)?\\b|\\b(viroid|(bacterio|viro)?phage(in|s)?|(alpha|beta) ?satellites?|particles?|ictv$)\\b", 2);
        IS_VIRUS_PATTERN_CASE_SENSITIVE = Pattern.compile("\\b(:?[MS]?NP|G)V\\b");
        IS_VIRUS_PATTERN_POSTFAIL = Pattern.compile("(\\b(vector)\\b)", 2);
        IS_GENE = Pattern.compile("(RNA|DNA)[0-9]*(?:\\b|_)");
        IS_CANDIDATUS_PATTERN = Pattern.compile(CANDIDATUS, 2);
        IS_CANDIDATUS_QUOTE_PATTERN = Pattern.compile("\"(Candidatus\\s|Ca\\.)\\s*(.+)\"", 2);
        RANK_MARKER_AT_END = Pattern.compile(" " + NormalisedNameParser.RANK_MARKER_ALL.substring(0, NormalisedNameParser.RANK_MARKER_ALL.lastIndexOf(41)) + "|" + NormalisedNameParser.RANK_MARKER_MICROBIAL.substring(3) + "\\.?$");
        EXTRACT_SENSU = Pattern.compile(",?\\s+((s\\.(?:l\\.|str\\.)|sensu\\s+(?:latu|strictu?)|(sec|sensu|auct|non)((\\.|\\s)(.*))?)$|\\((s\\.(?:l\\.|str\\.)|sensu\\s+(?:latu|strictu?)|(sec|sensu|auct|non)((\\.|\\s)(.*))?)\\))");
        NOV_RANK_MARKER = Pattern.compile("(fam|gen|sp|ssp|var|forma)");
        EXTRACT_NOMSTATUS = Pattern.compile("(?:, ?| )\\(?((?:comb|fam|gen|sp|ssp|var|forma)?[\\. ] ?nov[\\. $](?: ?ined\\.?)?|ined\\.|nom(?:\\s+|\\.\\s*|en\\s+)(?:utiq(?:ue\\s+|\\.\\s*))?(?:ambig|alter|alt|correct|cons|dubium|dub|herb|illeg|invalid|inval|negatum|neg|novum|nov|nudum|nud|oblitum|obl|praeoccup|prov|prot|transf|superfl|super|rejic|rej)\\.?(?:\\s+(?:prop|proposed)\\.?)?)\\)?");
        EXTRACT_REMARKS = Pattern.compile("\\s+(anon\\.?)(\\s.+)?$");
        EXTRACT_YEAR = Pattern.compile("([12][0-9][0-9][0-9?][abcdh?]?(?:[/-][0-9]{1,4})?\\s*\\)?)");
        COMMA_BEFORE_YEAR = Pattern.compile("(,+|[^0-9\\(\\[\"])\\s*(\\d{3})");
        REPLACE_QUOTES = Pattern.compile("(^\\s*[\"',]+)|([\"',]+\\s*$)");
        NORM_QUOTES = Pattern.compile("([\"'`\u00b4]+)");
        NORM_UPPERCASE_WORDS = Pattern.compile("\\b(\\p{Lu})(\\p{Lu}{2,})\\b");
        NORM_WHITESPACE = Pattern.compile("\\s+");
        NORM_NO_SQUARE_BRACKETS = Pattern.compile("\\[(.*?)\\]");
        NORM_BRACKETS_OPEN = Pattern.compile("([{(\\[])\\s*,?");
        NORM_BRACKETS_CLOSE = Pattern.compile(",?\\s*([})\\]])");
        NORM_BRACKETS_OPEN_STRONG = Pattern.compile("( ?[{(\\[] ?)+");
        NORM_BRACKETS_CLOSE_STRONG = Pattern.compile("( ?[})\\]] ?)+");
        NORM_AND = Pattern.compile(" (and|et|und) ");
        NORM_ET_AL = Pattern.compile("(?:& )+al\\.?");
        NORM_AMPERSAND_WS = Pattern.compile("&");
        NORM_HYPHENS = Pattern.compile("\\s*-\\s*");
        NORM_SUBGENUS = Pattern.compile("([A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152](?:\\.|[a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153]+)(?:-[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152]?[a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153]+)?) ([A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152](?:\\.|[a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153]+)(?:-[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152]?[a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153]+)?) ([a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153+-]{5,})");
        NO_Q_MARKS = Pattern.compile("([a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153\\p{Ll}-])\\?+");
        NORM_COMMAS = Pattern.compile("\\s*,+");
        NORM_ORIG_AUTH = Pattern.compile("(?<=[ \\(])((?:(?:(?:[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}]{1,3}\\.?[ -]?){0,3}|[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}][a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153\\p{Ll}-?]{3,} )?(?:[vV](?:an)(?:[ -](?:den|der) )? ?|von[ -](?:den |der |dem )?|(?:del|Des|De|de|di|Di|da|N)[`' _]|(?:de )?(?:la|le) |d'|D'|Mac|Mc|Le|St\\.? ?|Ou|O')?(?:v\\. )?[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}]+[a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153\\p{Ll}-?]*\\.?(?:(?:[- ](?:de|da|du)?[- ]?)[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}]+[a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153\\p{Ll}-?]*\\.?)?(?: ?(?:f|fil|j|jr|jun|junior|sr|sen|senior|ms)\\.?)?(?: *: *(?:Pers|Fr)\\.?)?)?(?:(?: ?ex\\.? | & | et | in |, ?|; ?|\\.)(?:(?:(?:(?:[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}]{1,3}\\.?[ -]?){0,3}|[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}][a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153\\p{Ll}-?]{3,} )?(?:[vV](?:an)(?:[ -](?:den|der) )? ?|von[ -](?:den |der |dem )?|(?:del|Des|De|de|di|Di|da|N)[`' _]|(?:de )?(?:la|le) |d'|D'|Mac|Mc|Le|St\\.? ?|Ou|O')?(?:v\\. 
)?[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}]+[a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153\\p{Ll}-?]*\\.?(?:(?:[- ](?:de|da|du)?[- ]?)[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}]+[a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153\\p{Ll}-?]*\\.?)?(?: ?(?:f|fil|j|jr|jun|junior|sr|sen|senior|ms)\\.?)?(?: *: *(?:Pers|Fr)\\.?)?)|al\\.?))*) ?\\( ?([12][0-9][0-9][0-9?][abcdh?]?(?:[/-][0-9]{1,4})?)\\)");
        NORM_ORIG_AUTH2 = Pattern.compile("\\(((?:(?:(?:[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}]{1,3}\\.?[ -]?){0,3}|[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}][a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153\\p{Ll}-?]{3,} )?(?:[vV](?:an)(?:[ -](?:den|der) )? ?|von[ -](?:den |der |dem )?|(?:del|Des|De|de|di|Di|da|N)[`' _]|(?:de )?(?:la|le) |d'|D'|Mac|Mc|Le|St\\.? ?|Ou|O')?(?:v\\. )?[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}]+[a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153\\p{Ll}-?]*\\.?(?:(?:[- ](?:de|da|du)?[- ]?)[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}]+[a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153\\p{Ll}-?]*\\.?)?(?: ?(?:f|fil|j|jr|jun|junior|sr|sen|senior|ms)\\.?)?(?: *: *(?:Pers|Fr)\\.?)?)?(?:(?: ?ex\\.? | & | et | in |, ?|; ?|\\.)(?:(?:(?:(?:[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}]{1,3}\\.?[ -]?){0,3}|[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}][a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153\\p{Ll}-?]{3,} )?(?:[vV](?:an)(?:[ -](?:den|der) )? ?|von[ -](?:den |der |dem )?|(?:del|Des|De|de|di|Di|da|N)[`' _]|(?:de )?(?:la|le) |d'|D'|Mac|Mc|Le|St\\.? ?|Ou|O')?(?:v\\. )?[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}]+[a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153\\p{Ll}-?]*\\.?(?:(?:[- ](?:de|da|du)?[- ]?)[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}]+[a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153\\p{Ll}-?]*\\.?)?(?: ?(?:f|fil|j|jr|jun|junior|sr|sen|senior|ms)\\.?)?(?: *: *(?:Pers|Fr)\\.?)?)|al\\.?))*)\\) ?,? 
?([12][0-9][0-9][0-9?][abcdh?]?(?:[/-][0-9]{1,4})?)");
        NORM_IMPRINT_YEAR = Pattern.compile("([12][0-9][0-9][0-9?][abcdh?]?(?:[/-][0-9]{1,4})?)\\s*(?:\\(\"?[\\s0-9-_,?]+\"?\\)|\\[\"?[0-9 -,]+\"?\\]|\"[0-9 -,]+\")");
        NORM_HYBRIDS_GENUS = Pattern.compile("^\\s*[+\u00d7xX]\\s*([A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152])");
        NORM_HYBRIDS_EPITH = Pattern.compile("^\\s*(\u00d7?[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152](?:\\.|[a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153]+)(?:-[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152]?[a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153]+)?)\\s+(?:\u00d7|[xX]\\s)\\s*((?:[0-9]+-|[doml]')?(?:(?:van|novae) [a-z])?[a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153+-]{1,}(?<! d)[a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153](?<!(?:\\bex|bacilliform|coliform|coryneform|cytoform|chemoform|biovar|serovar|genomovar|agamovar|cultivar|genotype|serotype|subtype|ribotype|isolate))(?=\\b))");
        NORM_HYBRIDS_FORM = Pattern.compile(" [\u00d7xX] ");
        NORM_INDET = Pattern.compile("((^| )(undet|indet|aff|cf)[#!?\\.]?)+(?![a-z])");
        NORM_DOTS = Pattern.compile("(^\\s*[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152]|" + NormalisedNameParser.RANK_MARKER_ALL + ")\\.");
        NORM_TF_GENUS = Pattern.compile("^([A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152])\\(([a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153-]+)\\)\\.? ");
        NORM_IN_COMMA = Pattern.compile(", in ", 2);
        NORM_IN_BIB = Pattern.compile("( in .+$| ?: ?[0-9]+)", 2);
        NORM_PREFIXES = Pattern.compile("^(sub)?(fossil|" + org.apache.commons.lang3.StringUtils.join(Rank.RANK_MARKER_MAP_SUPRAGENERIC.keySet(), (String)"|") + ")\\.?\\s+", 2);
        NORM_SUFFIXES = Pattern.compile("[,;:]? (sp|anon|spp|hort|ms|&|[a-zA-Z][0-9])?\\.? *$", 2);
        NO_LETTERS = Pattern.compile("^[^a-zA-Z]+$");
        PLACEHOLDER = Pattern.compile("\\b(unnamed|mixed|unassigned|unallocated|unplaced|undetermined|unclassified|uncultured|unknown|unspecified|uncertain|incertae sedis|not assigned|awaiting allocation|temp|dummy)\\b", 2);
        DOUBTFUL = Pattern.compile("^[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153\\p{Ll}-\u00d7&*+ ,.()/'`\u00b40-9-]+$");
        DOUBTFUL2 = Pattern.compile("\\bnull\\b");
        BAD_NAME_SUFFICES = Pattern.compile(" (author|unknown|unassigned|not_stated)$", 2);
        XML_ENTITY_STRIP = Pattern.compile("&\\s*([a-z]+)\\s*;");
        AMPERSAND_ENTITY = Pattern.compile("& *amp +");
        XML_TAGS = Pattern.compile("< */? *[a-zA-Z] *>");
        FIRST_WORD = Pattern.compile("^([\u00d7xX]\\s+)?([\u00d7x][A-Z])?([a-zA-Z])([a-zA-Z]+) ");
        NORM_WEIRD_CHARS = Pattern.compile(WEIRD_CHARS);
        StringBuilder sb = new StringBuilder();
        sb.append("\\b(");
        for (Rank r : Rank.INFRASUBSPECIFIC_MICROBIAL_RANKS) {
            if (!r.name().endsWith("VAR")) continue;
            if (sb.length() > 4) {
                sb.append("|");
            }
            sb.append(r.name().toLowerCase().substring(0, r.name().length() - 3));
        }
        sb.append(")type\\b");
        TYPE_TO_VAR = Pattern.compile(sb.toString());
        COMB_BAS_AUTHOR_SWAP = Pattern.compile("( (?:(?:(?:[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}]{1,3}\\.?[ -]?){0,3}|[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}][a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153\\p{Ll}-?]{3,} )?(?:[vV](?:an)(?:[ -](?:den|der) )? ?|von[ -](?:den |der |dem )?|(?:del|Des|De|de|di|Di|da|N)[`' _]|(?:de )?(?:la|le) |d'|D'|Mac|Mc|Le|St\\.? ?|Ou|O')?(?:v\\. )?[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}]+[a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153\\p{Ll}-?]*\\.?(?:(?:[- ](?:de|da|du)?[- ]?)[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}]+[a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153\\p{Ll}-?]*\\.?)?(?: ?(?:f|fil|j|jr|jun|junior|sr|sen|senior|ms)\\.?)?(?: *: *(?:Pers|Fr)\\.?)?)?(?:(?: ?ex\\.? | & | et | in |, ?|; ?|\\.)(?:(?:(?:(?:[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}]{1,3}\\.?[ -]?){0,3}|[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}][a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153\\p{Ll}-?]{3,} )?(?:[vV](?:an)(?:[ -](?:den|der) )? ?|von[ -](?:den |der |dem )?|(?:del|Des|De|de|di|Di|da|N)[`' _]|(?:de )?(?:la|le) |d'|D'|Mac|Mc|Le|St\\.? ?|Ou|O')?(?:v\\. )?[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}]+[a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153\\p{Ll}-?]*\\.?(?:(?:[- ](?:de|da|du)?[- ]?)[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}]+[a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153\\p{Ll}-?]*\\.?)?(?: ?(?:f|fil|j|jr|jun|junior|sr|sen|senior|ms)\\.?)?(?: *: *(?:Pers|Fr)\\.?)?)|al\\.?))*)(?:( ?,? 
?[12][0-9][0-9][0-9?][abcdh?]?(?:[/-][0-9]{1,4})?))? ?\\(( ?(?:(?:(?:[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}]{1,3}\\.?[ -]?){0,3}|[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}][a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153\\p{Ll}-?]{3,} )?(?:[vV](?:an)(?:[ -](?:den|der) )? ?|von[ -](?:den |der |dem )?|(?:del|Des|De|de|di|Di|da|N)[`' _]|(?:de )?(?:la|le) |d'|D'|Mac|Mc|Le|St\\.? ?|Ou|O')?(?:v\\. )?[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}]+[a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153\\p{Ll}-?]*\\.?(?:(?:[- ](?:de|da|du)?[- ]?)[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}]+[a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153\\p{Ll}-?]*\\.?)?(?: ?(?:f|fil|j|jr|jun|junior|sr|sen|senior|ms)\\.?)?(?: *: *(?:Pers|Fr)\\.?)?)?(?:(?: ?ex\\.? | & | et | in |, ?|; ?|\\.)(?:(?:(?:(?:[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}]{1,3}\\.?[ -]?){0,3}|[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}][a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153\\p{Ll}-?]{3,} )?(?:[vV](?:an)(?:[ -](?:den|der) )? ?|von[ -](?:den |der |dem )?|(?:del|Des|De|de|di|Di|da|N)[`' _]|(?:de )?(?:la|le) |d'|D'|Mac|Mc|Le|St\\.? ?|Ou|O')?(?:v\\. )?[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}]+[a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153\\p{Ll}-?]*\\.?(?:(?:[- ](?:de|da|du)?[- ]?)[A-Z\u00cf\u00cb\u00d6\u00dc\u00c4\u00c9\u00c8\u010c\u00c1\u00c0\u00c6\u0152\\p{Lu}]+[a-z\u00ef\u00eb\u00f6\u00fc\u00e4\u00e5\u00e9\u00e8\u010d\u00e1\u00e0\u00e6\u0153\\p{Ll}-?]*\\.?)?(?: ?(?:f|fil|j|jr|jun|junior|sr|sen|senior|ms)\\.?)?(?: *: *(?:Pers|Fr)\\.?)?)|al\\.?))*)( ?,? 
?[12][0-9][0-9][0-9?][abcdh?]?(?:[/-][0-9]{1,4})?)?\\)");
    }
}

