def extract()

in Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/extractors.py [0:0]


    def extract(self, source: str) -> List[ExtractResult]:
        """Extract number-with-unit spans from ``source``.

        Numbers reported by ``self.config.unit_num_extractor`` are combined
        with adjacent unit tokens reported by ``self.prefix_matcher`` and
        ``self.suffix_matcher``. When ``self.separate_regex`` is set,
        stand-alone units are added through ``self._extract_separate_units``
        and the list is passed through ``self._filter_ambiguity``. Finally
        ``self.config.expand_half_suffix`` is invoked on the results.

        Args:
            source: Text to scan.

        Returns:
            The list of results. For number-based results, ``data`` holds the
            number's own ``ExtractResult`` whose ``start`` has been rewritten
            relative to the combined span. An empty list is returned when
            ``self._pre_check_str(source)`` is falsy.
        """
        if not self._pre_check_str(source):
            return []

        non_unit_match = None
        numbers = None

        mapping_prefix: Dict[float, PrefixUnitResult] = dict()
        result = []
        # NOTE(review): this flag is never reset between numbers, so once one
        # number consumed a prefix unit, later prefix-only numbers are skipped.
        # Kept as-is; confirm whether that is intended.
        prefix_matched = False
        prefix_match: List[MatchResult] = sorted(self.prefix_matcher.find(source), key=lambda o: o.start)
        suffix_match: List[MatchResult] = sorted(self.suffix_matcher.find(source), key=lambda o: o.start)

        bracket_pairs = (('(', ')'), ('[', ']'), ('{', '}'), ('<', '>'))

        if prefix_match or suffix_match:

            numbers = sorted(self.config.unit_num_extractor.extract(source), key=lambda o: o.start)

            if numbers and self.config.extract_type == Constants.SYS_UNIT_CURRENCY \
                    and prefix_match and suffix_match:

                # A currency number that is directly preceded by a prefix unit
                # and directly followed by a suffix unit has its first comma
                # replaced by a space (same length, so offsets stay valid);
                # numbers are then extracted again from the adjusted text.
                for number in numbers:
                    number_start = number.start
                    number_end = number.start + number.length
                    touches_prefix = any(
                        (mr.start + mr.length) == number_start for mr in prefix_match)
                    touches_suffix = any(
                        mr.start == number_end for mr in suffix_match)
                    if touches_prefix and touches_suffix and "," in number.text:
                        comma_index = number.start + number.text.index(",")
                        source = source[:comma_index] + " " + source[comma_index + 1:]

                numbers = sorted(self.config.unit_num_extractor.extract(source), key=lambda o: o.start)

            # Special case for cases where number multipliers clash with unit
            ambiguous_multiplier_regex = self.config.ambiguous_unit_number_multiplier_regex
            if ambiguous_multiplier_regex is not None:

                for num in numbers:
                    multiplier_hits = [
                        hit for hit in regex.finditer(ambiguous_multiplier_regex, num.text)
                        if hit.group()]
                    if len(multiplier_hits) == 1:
                        # Shorten the number by the length of the single match.
                        hit_start, hit_end = multiplier_hits[0].span()
                        new_length = num.length - (hit_end - hit_start)
                        num.text = num.text[0:new_length]
                        num.length = new_length

            for number in numbers:
                if number.start is None or number.length is None:
                    continue
                start = int(number.start)
                length = int(number.length)
                max_find_pref = min(self.max_prefix_match_len, number.start)
                max_find_suff = len(source) - start - length

                if max_find_pref != 0:
                    best_match = None

                    # prefix_match is sorted by start: stop at the first
                    # non-empty match that either runs past the number's start
                    # or whose text fills the gap up to the number.
                    for m in prefix_match:
                        if m.length <= 0:
                            continue
                        if m.end > start:
                            break
                        if source[m.start:start].strip() == m.text:
                            best_match = m
                            break

                    if best_match is not None:
                        off_set = start - best_match.start
                        unit_str = source[best_match.start:start]
                        self.add_element(mapping_prefix, number.start, (PrefixUnitResult(off_set, unit_str)))
                prefix_unit = mapping_prefix.get(start, None)
                if max_find_suff > 0:

                    max_len = 0
                    first_index = start + length

                    for m in suffix_match:

                        if m.length > 0 and m.start >= first_index:

                            end_pos = m.start + m.length - first_index
                            if max_len < end_pos:
                                # Text between the number and the candidate unit.
                                mid_str = source[first_index:m.start]
                                if not mid_str or mid_str.isspace() \
                                        or mid_str.strip() == self.config.connector_token:
                                    max_len = end_pos
                                # Unit wrapped in brackets: include the closer.
                                if m.end < len(source) and any(
                                        mid_str.endswith(opener) and source[m.end] == closer
                                        for opener, closer in bracket_pairs):
                                    max_len = m.end - first_index + 1

                    if max_len != 0:
                        er = ExtractResult()

                        er.start = start
                        er.length = length + max_len
                        er.text = source[start: start + length + max_len]
                        er.type = self.config.extract_type

                        if prefix_unit is not None:
                            prefix_matched = True
                            er.start -= prefix_unit[0].offset
                            er.length += prefix_unit[0].offset
                            er.text = prefix_unit[0].unit + er.text

                        # Relative position will be used in Parser
                        number.start = start - er.start
                        er.data = number

                        # Special treatment, handle cases like '2:00 pm', '00 pm' is not dimension
                        is_not_unit = False

                        if er.type == Constants.SYS_UNIT_DIMENSION:
                            if non_unit_match is None:
                                non_unit_match = list(self.config.non_unit_regex.finditer(source))
                            # Drop the candidate when it lies entirely inside
                            # a non-unit match.
                            er_end = er.start + er.length
                            is_not_unit = any(
                                er.start >= non_unit.start() and er_end <= non_unit.end()
                                for non_unit in non_unit_match)

                        if is_not_unit:
                            continue

                        result.append(er)

                if prefix_unit and not prefix_matched:
                    er = ExtractResult()
                    er.start = number.start - prefix_unit[0].offset
                    er.length = number.length + prefix_unit[0].offset
                    er.text = prefix_unit[0].unit + number.text
                    er.type = self.config.extract_type

                    # Relative position will be used in Parser
                    number.start = start - er.start
                    er.data = number
                    result.append(er)

        # Extract Separate unit
        if self.separate_regex:
            if non_unit_match is None:
                try:
                    # finditer yields the match objects; a single match object
                    # (as returned by .match) is not iterable.
                    non_unit_match = list(self.config.non_unit_regex.finditer(source))
                except (AttributeError, TypeError):
                    # No usable compiled pattern on this config: fall back to
                    # "no non-unit matches".
                    non_unit_match = []

            self._extract_separate_units(source, result, non_unit_match)

            # Remove common ambiguous cases
            result = self._filter_ambiguity(result, source)

        # Expand Chinese phrase to the `half` patterns when it follows closely origin phrase.
        self.config.expand_half_suffix(source, result, numbers)

        return result