in Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/extractors.py [0:0]
def extract(self, source: str) -> List[ExtractResult]:
    """Extract number-with-unit spans from ``source``.

    The method works in three stages:

    1. Numbers found by ``self.config.unit_num_extractor`` are paired with
       adjacent matches from ``self.prefix_matcher`` / ``self.suffix_matcher``.
    2. If ``self.separate_regex`` is set, ``_extract_separate_units`` is
       called to add further results and ``_filter_ambiguity`` post-filters
       the list.
    3. ``self.config.expand_half_suffix`` is given the final list.

    Args:
        source: Text to scan. For currency extraction it may be rewritten
            locally (a comma replaced by a space); the caller's string is
            not affected.

    Returns:
        A list of ``ExtractResult``. For results built in stage 1, ``data``
        holds the underlying number result, whose ``start`` has been
        rewritten to be relative to the start of the enclosing result.
        An empty list is returned when ``_pre_check_str`` rejects the input.
    """
    if not self._pre_check_str(source):
        return []

    non_unit_match = None
    numbers = None
    mapping_prefix: Dict[float, PrefixUnitResult] = {}
    result = []
    # NOTE(review): this flag is never reset between numbers, so once one
    # number has been emitted with both a prefix and a suffix, no later
    # prefix-only result is produced. Kept as-is; confirm this is intended.
    prefix_matched = False
    bracket_pairs = {'(': ')', '[': ']', '{': '}', '<': '>'}

    prefix_match: List[MatchResult] = sorted(self.prefix_matcher.find(source), key=lambda o: o.start)
    suffix_match: List[MatchResult] = sorted(self.suffix_matcher.find(source), key=lambda o: o.start)

    if prefix_match or suffix_match:
        numbers = sorted(self.config.unit_num_extractor.extract(source), key=lambda o: o.start)

        # Compare by value: identity (`is`) only works if both sides happen
        # to be the very same object.
        if (numbers and self.config.extract_type == Constants.SYS_UNIT_CURRENCY
                and prefix_match and suffix_match):
            # A number that contains a comma and is touched by a prefix unit
            # on its left and a suffix unit on its right gets its first comma
            # replaced by a space, then numbers are extracted again from the
            # rewritten text (presumably so it is read as two amounts).
            for number in numbers:
                start = number.start
                end = start + number.length
                has_prefix = any(mr.start + mr.length == start for mr in prefix_match)
                has_suffix = any(mr.start == end for mr in suffix_match)
                if has_prefix and has_suffix and "," in number.text:
                    comma_index = start + number.text.index(",")
                    source = source[:comma_index] + " " + source[comma_index + 1:]

            numbers = sorted(self.config.unit_num_extractor.extract(source), key=lambda o: o.start)

        # Special case for cases where number multipliers clash with unit
        ambiguous_multiplier_regex = self.config.ambiguous_unit_number_multiplier_regex
        if ambiguous_multiplier_regex is not None:
            for num in numbers:
                hits = [m for m in regex.finditer(ambiguous_multiplier_regex, num.text) if m.group()]
                if len(hits) == 1:
                    # Shorten the number by the width of the single hit
                    # (assumes the multiplier sits at the end of the text).
                    new_length = num.length - (hits[0].end() - hits[0].start())
                    num.text = num.text[0:new_length]
                    num.length = new_length

        for number in numbers:
            if number.start is None or number.length is None:
                continue

            start = int(number.start)
            length = int(number.length)
            max_find_pref = min(self.max_prefix_match_len, number.start)
            max_find_suff = len(source) - start - length

            if max_find_pref != 0:
                # Take the first non-empty prefix match whose text, ignoring
                # surrounding whitespace, fills the gap up to the number.
                # Stop as soon as a match ends beyond the number's start.
                best_match = None
                for m in prefix_match:
                    if m.length <= 0:
                        continue
                    if m.end > start:
                        break
                    if source[m.start:start].strip() == m.text:
                        best_match = m
                        break

                if best_match is not None:
                    off_set = start - best_match.start
                    unit_str = source[best_match.start:start]
                    self.add_element(mapping_prefix, number.start, PrefixUnitResult(off_set, unit_str))

            # Only the first entry stored for this start offset is used below.
            prefix_unit = mapping_prefix.get(start, None)

            if max_find_suff > 0:
                max_len = 0
                first_index = start + length
                for m in suffix_match:
                    if m.length <= 0 or m.start < first_index:
                        continue
                    end_pos = m.start + m.length - first_index
                    if max_len >= end_pos:
                        continue

                    mid_str = source[first_index:m.start]
                    # The unit may follow directly, after whitespace, or
                    # after the configured connector token.
                    if not mid_str or mid_str.isspace() \
                            or mid_str.strip() == self.config.connector_token:
                        max_len = end_pos

                    # Unit wrapped in brackets: include the closing bracket.
                    closing = bracket_pairs.get(mid_str[-1:])
                    if closing is not None and m.end < len(source) and source[m.end] == closing:
                        max_len = m.end - first_index + 1

                if max_len != 0:
                    er = ExtractResult()
                    er.start = start
                    er.length = length + max_len
                    er.text = source[start: start + length + max_len]
                    er.type = self.config.extract_type

                    if prefix_unit is not None:
                        prefix_matched = True
                        er.start -= prefix_unit[0].offset
                        er.length += prefix_unit[0].offset
                        er.text = prefix_unit[0].unit + er.text

                    # Relative position will be used in Parser
                    number.start = start - er.start
                    er.data = number

                    # Special treatment, handle cases like '2:00 pm', '00 pm' is not dimension
                    if er.type == Constants.SYS_UNIT_DIMENSION:
                        if non_unit_match is None:
                            non_unit_match = list(self.config.non_unit_regex.finditer(source))
                        er_end = er.start + er.length
                        # Drop the candidate when it lies entirely inside a
                        # non-unit match.
                        if any(time.start() <= er.start and er_end <= time.start() + len(time.group())
                               for time in non_unit_match):
                            continue

                    result.append(er)

            if prefix_unit and not prefix_matched:
                er = ExtractResult()
                er.start = number.start - prefix_unit[0].offset
                er.length = number.length + prefix_unit[0].offset
                er.text = prefix_unit[0].unit + number.text
                er.type = self.config.extract_type

                # Relative position will be used in Parser
                number.start = start - er.start
                er.data = number
                result.append(er)

    # Extract Separate unit
    if self.separate_regex:
        if non_unit_match is None:
            try:
                # Collect every non-unit match. `match()` would return a
                # single, non-iterable match object (or None), and wrapping
                # that in list() always fails.
                non_unit_match = list(self.config.non_unit_regex.finditer(source))
            except (AttributeError, TypeError):
                # Best effort: no usable non-unit pattern is configured, so
                # there is nothing to exclude.
                non_unit_match = []

        self._extract_separate_units(source, result, non_unit_match)

        # Remove common ambiguous cases
        result = self._filter_ambiguity(result, source)

    # Expand Chinese phrase to the `half` patterns when it follows closely origin phrase.
    self.config.expand_half_suffix(source, result, numbers)

    return result