in Java/libraries/recognizers-text-number-with-unit/src/main/java/com/microsoft/recognizers/text/numberwithunit/extractors/NumberWithUnitExtractor.java [76:267]
/**
 * Extracts number-with-unit expressions from {@code source}.
 *
 * <p>Processing steps, in order:
 * <ol>
 *   <li>Numbers are located with the configured unit-number extractor.</li>
 *   <li>For the currency extract type only: when a number is immediately preceded by a
 *       prefix-unit match, immediately followed by a suffix-unit match and contains a comma,
 *       the first comma is replaced by a space in {@code source} and the numbers are
 *       extracted again from the rewritten text.</li>
 *   <li>If the configuration supplies an ambiguous-multiplier regex and it matches exactly
 *       once inside a number's text, the number is shortened by the length of that match.</li>
 *   <li>The text to the left of each number is scanned for a prefix unit.</li>
 *   <li>The text to the right of each number is scanned for a suffix unit; a suffix is
 *       accepted only if the text between number and unit is blank or equals the
 *       configured connector token (case-insensitive).</li>
 *   <li>Separate units are extracted (when a separate regex exists), ambiguous results are
 *       filtered, and the configuration's half-suffix expansion is applied.</li>
 * </ol>
 *
 * <p>Each result built by this method stores the originating number as its data, and that
 * number's start is rewritten to be relative to the start of the result.
 *
 * @param source the text to scan
 * @return the extracted results; an empty list when {@code preCheckStr(source)} fails
 */
public List<ExtractResult> extract(String source) {
    List<ExtractResult> result = new ArrayList<>();
    if (!preCheckStr(source)) {
        return result;
    }

    // The comma rewrite below swaps one character for another, so this length stays valid.
    int sourceLen = source.length();
    List<ExtractResult> numbers = this.config.getUnitNumExtractor().extract(source);

    /* Currency only: split "<prefix><n>,<m><suffix>" style inputs at the comma.
       The extract type is compared with equals() because reference comparison of strings
       only works when both sides happen to be the same instance. */
    if (!numbers.isEmpty() && Constants.SYS_UNIT_CURRENCY.equals(this.config.getExtractType())) {
        // NOTE(review): only the first match of each regex is retained (find() is called once);
        // confirm whether every occurrence in the source should be considered instead.
        List<Matcher> prefixMatch = new ArrayList<>();
        for (Pattern regex : prefixRegexes) {
            Matcher match = regex.matcher(source);
            if (match.find()) {
                prefixMatch.add(match);
            }
        }
        List<Matcher> suffixMatch = new ArrayList<>();
        for (Pattern regex : suffixRegexes) {
            Matcher match = regex.matcher(source);
            if (match.find()) {
                suffixMatch.add(match);
            }
        }

        if (!prefixMatch.isEmpty() && !suffixMatch.isEmpty()) {
            for (ExtractResult number : numbers) {
                // Same guard as the later loops: skip numbers without a position.
                if (number.getStart() == null || number.getLength() == null) {
                    continue;
                }
                int start = number.getStart();
                int end = start + number.getLength();

                boolean numberPrefix = false;
                for (Matcher match : prefixMatch) {
                    if (match.end() == start) {
                        numberPrefix = true;
                        break;
                    }
                }
                boolean numberSuffix = false;
                for (Matcher match : suffixMatch) {
                    if (match.start() == end) {
                        numberSuffix = true;
                        break;
                    }
                }

                if (numberPrefix && numberSuffix) {
                    int commaOffset = number.getText().indexOf(",");
                    if (commaOffset >= 0) {
                        int commaIndex = start + commaOffset;
                        source = source.substring(0, commaIndex) + " " + source.substring(commaIndex + 1);
                    }
                }
            }
            // Positions may differ once commas have been replaced, so extract again.
            numbers = this.config.getUnitNumExtractor().extract(source);
        }
    }

    /* Special case for cases where number multipliers clash with unit */
    Pattern ambiguousMultiplierRegex = this.config.getAmbiguousUnitNumberMultiplierRegex();
    if (ambiguousMultiplierRegex != null) {
        for (int i = 0; i < numbers.size(); i++) {
            ExtractResult number = numbers.get(i);
            Match[] matches = RegExpUtility.getMatches(ambiguousMultiplierRegex, number.getText());
            if (matches.length == 1) {
                // Drop the matched multiplier from the number by shortening it.
                int newLength = number.getLength() - matches[0].length;
                numbers.set(i, new ExtractResult(number.getStart(), newLength, number.getText().substring(0, newLength),
                        number.getType(), number.getData()));
            }
        }
    }

    /* Mix prefix and numbers, make up a prefix-number combination */
    Map<Integer, PrefixUnitResult> mappingPrefix = new HashMap<>();
    if (maxPrefixMatchLen != 0) {
        for (ExtractResult number : numbers) {
            if (number.getStart() == null || number.getLength() == null) {
                continue;
            }
            int maxFindPref = Math.min(maxPrefixMatchLen, number.getStart());
            if (maxFindPref == 0) {
                continue;
            }

            /* Scan from left to right, find the longest match */
            String leftStr = source.substring(number.getStart() - maxFindPref, number.getStart());
            int lastIndex = leftStr.length();
            MatchResult bestMatch = null;
            for (Pattern regex : prefixRegexes) {
                Matcher match = regex.matcher(leftStr);
                while (match.find()) {
                    // Accept only matches that reach the number, ignoring surrounding whitespace.
                    boolean touchesNumber = leftStr.substring(match.start(), lastIndex).trim().equals(match.group());
                    // Prefer the match that starts furthest to the left.
                    if (touchesNumber && (bestMatch == null || bestMatch.start() >= match.start())) {
                        bestMatch = match.toMatchResult();
                    }
                }
            }

            if (bestMatch != null) {
                int offset = lastIndex - bestMatch.start();
                String unitStr = leftStr.substring(bestMatch.start(), lastIndex);
                mappingPrefix.put(number.getStart(), new PrefixUnitResult(offset, unitStr));
            }
        }
    }

    /* Look for a suffix unit after each number and assemble the results */
    for (ExtractResult number : numbers) {
        if (number.getStart() == null || number.getLength() == null) {
            continue;
        }
        int start = number.getStart();
        int length = number.getLength();
        // Only non-null values are ever stored, so a plain get() is enough.
        PrefixUnitResult prefixUnit = mappingPrefix.get(start);

        if (sourceLen - start - length > 0) {
            String rightSub = source.substring(start + length);
            int maxlen = 0;
            for (Pattern regex : suffixRegexes) {
                Matcher m = regex.matcher(rightSub);
                while (m.find()) {
                    int endpos = m.end();
                    if (maxlen < endpos) {
                        // Text between the number and the unit must be blank or the connector token.
                        String midStr = rightSub.substring(0, m.start()).trim();
                        if (midStr.isEmpty() || midStr.equalsIgnoreCase(this.config.getConnectorToken())) {
                            maxlen = endpos;
                        }
                    }
                }
            }

            if (maxlen != 0) {
                String substr = source.substring(start, start + length + maxlen);
                ExtractResult er = new ExtractResult(start, length + maxlen, substr, this.config.getExtractType(), null);
                if (prefixUnit != null) {
                    // Widen the result to the left so it also covers the prefix unit.
                    er.setStart(er.getStart() - prefixUnit.offset);
                    er.setLength(er.getLength() + prefixUnit.offset);
                    er.setText(prefixUnit.unitStr + er.getText());
                }
                /* Relative position will be used in Parser */
                number.setStart(start - er.getStart());
                er.setData(number);
                result.add(er);
                continue;
            }
        }

        // No suffix unit found: fall back to a prefix-only result, if a prefix exists.
        if (prefixUnit != null) {
            ExtractResult er = new ExtractResult(
                    number.getStart() - prefixUnit.offset,
                    number.getLength() + prefixUnit.offset,
                    prefixUnit.unitStr + number.getText(),
                    this.config.getExtractType(),
                    null);
            /* Relative position will be used in Parser */
            number.setStart(start - er.getStart());
            er.setData(number);
            result.add(er);
        }
    }

    // Extract Separate unit
    if (separateRegex != null) {
        extractSeparateUnits(source, result);
    }

    // Remove common ambiguous cases
    result = filterAmbiguity(result, source);

    // Expand Chinese phrase to the `half` patterns when it follows closely origin phrase.
    result = this.config.expandHalfSuffix(source, result, numbers);

    return result;
}