public List extract()

in Java/libraries/recognizers-text-number-with-unit/src/main/java/com/microsoft/recognizers/text/numberwithunit/extractors/NumberWithUnitExtractor.java [76:267]


    /**
     * Extracts numbers together with an adjacent unit prefix and/or suffix from {@code source}.
     *
     * <p>The steps run in this order: numbers are located with the configured unit-number
     * extractor; for currency extraction, a comma inside a number that is directly enclosed by a
     * prefix match and a suffix match is replaced by a space and the numbers are extracted again;
     * a number containing exactly one ambiguous-multiplier match is shortened by the length of
     * that match; a unit prefix directly left of each number is recorded; the furthest-reaching
     * unit suffix to the right of each number is attached; finally separate units, ambiguity
     * filtering and half-suffix expansion are applied to the collected results.
     *
     * @param source the text to scan
     * @return the extracted results (empty when {@code preCheckStr} rejects the input); every
     *         result built here from a number carries that number's {@link ExtractResult} as its
     *         data, with the number's start rewritten to be relative to the result's start
     */
    public List<ExtractResult> extract(String source) {
        List<ExtractResult> result = new ArrayList<>();

        if (!preCheckStr(source)) {
            return result;
        }

        Map<Integer, PrefixUnitResult> mappingPrefix = new HashMap<>();
        // Marks the characters covered by number+suffix results; it is only written in this method.
        boolean[] matched = new boolean[source.length()];
        Arrays.fill(matched, false);
        List<ExtractResult> numbers = this.config.getUnitNumExtractor().extract(source);
        int sourceLen = source.length();

        // NOTE(review): only the first occurrence of each pattern is recorded below, so a second
        // currency amount in the same text may not be recognised by the comma handling - confirm
        // whether every occurrence should be collected instead.
        List<Matcher> prefixMatch = new ArrayList<>();
        List<Matcher> suffixMatch = new ArrayList<>();

        for (Pattern regex : prefixRegexes) {
            Matcher match = regex.matcher(source);
            if (match.find()) {
                prefixMatch.add(match);
            }
        }

        for (Pattern regex : suffixRegexes) {
            Matcher match = regex.matcher(source);
            if (match.find()) {
                suffixMatch.add(match);
            }
        }

        // Compare by value: reference equality on strings only holds for interned instances.
        if (!numbers.isEmpty()
                && Constants.SYS_UNIT_CURRENCY.equals(this.config.getExtractType())
                && !prefixMatch.isEmpty()
                && !suffixMatch.isEmpty()) {

            for (ExtractResult number : numbers) {
                if (number.getStart() == null || number.getLength() == null) {
                    continue;
                }

                int start = number.getStart();
                int end = start + number.getLength();
                boolean numberPrefix = prefixMatch.stream().anyMatch(m -> m.end() == start);
                boolean numberSuffix = suffixMatch.stream().anyMatch(m -> m.start() == end);

                int commaOffset = number.getText().indexOf(",");
                if (numberPrefix && numberSuffix && commaOffset >= 0) {
                    // Swapping one character for another keeps all offsets (and sourceLen) valid.
                    int commaIndex = start + commaOffset;
                    source = source.substring(0, commaIndex) + " " + source.substring(commaIndex + 1);
                }
            }
            numbers = this.config.getUnitNumExtractor().extract(source);
        }

        /* Special case for cases where number multipliers clash with unit */
        Pattern ambiguousMultiplierRegex = this.config.getAmbiguousUnitNumberMultiplierRegex();
        if (ambiguousMultiplierRegex != null) {
            for (int i = 0; i < numbers.size(); i++) {
                ExtractResult number = numbers.get(i);
                if (number.getLength() == null) {
                    continue;
                }

                Match[] matches = RegExpUtility.getMatches(ambiguousMultiplierRegex, number.getText());
                if (matches.length == 1) {
                    // Drop as many trailing characters as the multiplier match is long.
                    int newLength = number.getLength() - matches[0].length;
                    numbers.set(i, new ExtractResult(number.getStart(), newLength, number.getText().substring(0, newLength),
                            number.getType(), number.getData()));
                }
            }
        }

        /* Mix prefix and numbers, make up a prefix-number combination */
        if (maxPrefixMatchLen != 0) {
            for (ExtractResult number : numbers) {
                if (number.getStart() == null || number.getLength() == null) {
                    continue;
                }

                int numberStart = number.getStart();
                int maxFindPref = Math.min(maxPrefixMatchLen, numberStart);
                if (maxFindPref == 0) {
                    continue;
                }

                /* Scan from left to right , find the longest match */
                String leftStr = source.substring(numberStart - maxFindPref, numberStart);
                int lastIndex = leftStr.length();

                MatchResult bestMatch = null;
                for (Pattern regex : prefixRegexes) {
                    Matcher match = regex.matcher(leftStr);
                    while (match.find()) {
                        // Accept only a match whose text, ignoring surrounding whitespace, is
                        // everything between its start and the number.
                        boolean reachesNumber = leftStr.substring(match.start(), lastIndex).trim().equals(match.group());
                        // The smallest start wins; on a tie the later match replaces the earlier.
                        if (reachesNumber && (bestMatch == null || bestMatch.start() >= match.start())) {
                            bestMatch = match.toMatchResult();
                        }
                    }
                }

                if (bestMatch != null) {
                    int offset = lastIndex - bestMatch.start();
                    String unitStr = leftStr.substring(bestMatch.start(), lastIndex);
                    mappingPrefix.put(numberStart, new PrefixUnitResult(offset, unitStr));
                }
            }
        }

        for (ExtractResult number : numbers) {
            if (number.getStart() == null || number.getLength() == null) {
                continue;
            }

            int start = number.getStart();
            int length = number.getLength();
            int maxFindLen = sourceLen - start - length;

            // Null when no prefix was recorded for this number.
            PrefixUnitResult prefixUnit = mappingPrefix.get(start);

            if (maxFindLen > 0) {
                String rightSub = source.substring(start + length, start + length + maxFindLen);
                List<Matcher> unitMatch = suffixRegexes.stream().map(p -> p.matcher(rightSub)).collect(Collectors.toList());

                // Furthest end position of a suffix that follows the number directly, or is
                // separated from it only by whitespace or the connector token.
                int maxlen = 0;
                for (Matcher m : unitMatch) {
                    while (m.find()) {
                        int endpos = m.end();
                        String midStr = rightSub.substring(0, m.start()).trim();
                        if (maxlen < endpos && (midStr.isEmpty() || midStr.equalsIgnoreCase(this.config.getConnectorToken()))) {
                            maxlen = endpos;
                        }
                    }
                }

                if (maxlen != 0) {
                    for (int i = 0; i < length + maxlen; i++) {
                        matched[i + start] = true;
                    }

                    String substr = source.substring(start, start + length + maxlen);
                    ExtractResult er = new ExtractResult(start, length + maxlen, substr, this.config.getExtractType(), null);

                    if (prefixUnit != null) {
                        // Grow the result leftwards so that it also covers the prefix unit.
                        er.setStart(er.getStart() - prefixUnit.offset);
                        er.setLength(er.getLength() + prefixUnit.offset);
                        er.setText(prefixUnit.unitStr + er.getText());
                    }

                    /* Relative position will be used in Parser */
                    number.setStart(start - er.getStart());
                    er.setData(number);
                    result.add(er);

                    continue;
                }
            }

            // No suffix was attached: fall back to a prefix-only result when a prefix exists.
            if (prefixUnit != null) {
                ExtractResult er = new ExtractResult(
                        number.getStart() - prefixUnit.offset,
                        number.getLength() + prefixUnit.offset,
                        prefixUnit.unitStr + number.getText(),
                        this.config.getExtractType(),
                        null);

                /* Relative position will be used in Parser */
                number.setStart(start - er.getStart());
                er.setData(number);
                result.add(er);
            }
        }

        // Extract Separate unit
        if (separateRegex != null) {
            extractSeparateUnits(source, result);
        }

        // Remove common ambiguous cases
        result = filterAmbiguity(result, source);

        // Expand Chinese phrase to the `half` patterns when it follows closely origin phrase.
        result = this.config.expandHalfSuffix(source, result, numbers);

        return result;
    }