def nlp()

in hgext/pushlog/parsedatetime/__init__.py [0:0]
211 lines of code
52 McCabe index (conditional complexity)

    def nlp(self, inputString, sourceTime=None, version=None):
        """Utilizes parse() after making judgements about what datetime
        information belongs together.

        It makes logical groupings based on proximity and returns a parsed
        datetime for each matched grouping of datetime text, along with
        location info within the given inputString.

        @type  inputString: string
        @param inputString: natural language text to evaluate
        @type  sourceTime:  struct_time
        @param sourceTime:  C{struct_time} value to use as the base
        @type  version:     integer
        @param version:     style version, default will use L{Calendar}
                            parameter version value

        @rtype:  tuple or None
        @return: tuple of tuples in the format (parsed_datetime as
                 datetime.datetime, flags as int, start_pos as int,
                 end_pos as int, matched_text as string) or None if there
                 were no matches
        """

        orig_inputstring = inputString

        # replace periods at the end of sentences w/ spaces
        # opposed to removing them altogether in order to
        # retain relative positions (identified by alpha, period, space).
        # this is required for some of the regex patterns to match
        inputString = re.sub(r'(\w)(\.)(\s)', r'\1 \3', inputString).lower()
        inputString = re.sub(r'(\w)(\'|")(\s|$)', r'\1 \3', inputString)
        inputString = re.sub(r'(\s|^)(\'|")(\w)', r'\1 \3', inputString)

        startpos = 0  # the start position in the inputString during the loop

        # list of lists in format:
        # [startpos, endpos, matchedstring, flags, type]
        matches = []

        while startpos < len(inputString):

            # empty match
            leftmost_match = [0, 0, None, 0, None]

            # Modifier like next\prev..
            m = self.ptc.CRE_MODIFIER.search(inputString[startpos:])
            if m is not None:
                if leftmost_match[1] == 0 or \
                        leftmost_match[0] > m.start() + startpos:
                    leftmost_match[0] = m.start() + startpos
                    leftmost_match[1] = m.end() + startpos
                    leftmost_match[2] = m.group()
                    leftmost_match[3] = 0
                    leftmost_match[4] = 'modifier'

            # Quantity + Units
            m = self.ptc.CRE_UNITS.search(inputString[startpos:])
            if m is not None:
                debug and log.debug('CRE_UNITS matched')
                if self._UnitsTrapped(inputString[startpos:], m, 'units'):
                    debug and log.debug('day suffix trapped by unit match')
                else:

                    if leftmost_match[1] == 0 or \
                            leftmost_match[0] > m.start('qty') + startpos:
                        leftmost_match[0] = m.start('qty') + startpos
                        leftmost_match[1] = m.end('qty') + startpos
                        leftmost_match[2] = m.group('qty')
                        leftmost_match[3] = 3
                        leftmost_match[4] = 'units'

                        if m.start('qty') > 0 and \
                                inputString[m.start('qty') - 1] == '-':
                            leftmost_match[0] = leftmost_match[0] - 1
                            leftmost_match[2] = '-' + leftmost_match[2]

            # Quantity + Units
            m = self.ptc.CRE_QUNITS.search(inputString[startpos:])
            if m is not None:
                debug and log.debug('CRE_QUNITS matched')
                if self._UnitsTrapped(inputString[startpos:], m, 'qunits'):
                    debug and log.debug('day suffix trapped by qunit match')
                else:
                    if leftmost_match[1] == 0 or \
                            leftmost_match[0] > m.start('qty') + startpos:
                        leftmost_match[0] = m.start('qty') + startpos
                        leftmost_match[1] = m.end('qty') + startpos
                        leftmost_match[2] = m.group('qty')
                        leftmost_match[3] = 3
                        leftmost_match[4] = 'qunits'

                        if m.start('qty') > 0 and \
                                inputString[m.start('qty') - 1] == '-':
                            leftmost_match[0] = leftmost_match[0] - 1
                            leftmost_match[2] = '-' + leftmost_match[2]

            m = self.ptc.CRE_DATE3.search(inputString[startpos:])
            # NO LONGER NEEDED, THE REGEXP HANDLED MTHNAME NOW
            # for match in self.ptc.CRE_DATE3.finditer(inputString[startpos:]):
            # to prevent "HH:MM(:SS) time strings" expressions from
            # triggering this regex, we checks if the month field exists
            # in the searched expression, if it doesn't exist, the date
            # field is not valid
            #     if match.group('mthname'):
            #         m = self.ptc.CRE_DATE3.search(inputString[startpos:],
            #                                       match.start())
            #         break

            # String date format
            if m is not None:
                if leftmost_match[1] == 0 or \
                        leftmost_match[0] > m.start('date') + startpos:
                    leftmost_match[0] = m.start('date') + startpos
                    leftmost_match[1] = m.end('date') + startpos
                    leftmost_match[2] = m.group('date')
                    leftmost_match[3] = 1
                    leftmost_match[4] = 'dateStr'

            # Standard date format
            m = self.ptc.CRE_DATE.search(inputString[startpos:])
            if m is not None:
                if leftmost_match[1] == 0 or \
                        leftmost_match[0] > m.start('date') + startpos:
                    leftmost_match[0] = m.start('date') + startpos
                    leftmost_match[1] = m.end('date') + startpos
                    leftmost_match[2] = m.group('date')
                    leftmost_match[3] = 1
                    leftmost_match[4] = 'dateStd'

            # Natural language day strings
            m = self.ptc.CRE_DAY.search(inputString[startpos:])
            if m is not None:
                if leftmost_match[1] == 0 or \
                        leftmost_match[0] > m.start() + startpos:
                    leftmost_match[0] = m.start() + startpos
                    leftmost_match[1] = m.end() + startpos
                    leftmost_match[2] = m.group()
                    leftmost_match[3] = 1
                    leftmost_match[4] = 'dayStr'

            # Weekday
            m = self.ptc.CRE_WEEKDAY.search(inputString[startpos:])
            if m is not None:
                if inputString[startpos:] not in self.ptc.dayOffsets:
                    if leftmost_match[1] == 0 or \
                            leftmost_match[0] > m.start() + startpos:
                        leftmost_match[0] = m.start() + startpos
                        leftmost_match[1] = m.end() + startpos
                        leftmost_match[2] = m.group()
                        leftmost_match[3] = 1
                        leftmost_match[4] = 'weekdy'

            # Natural language time strings
            m = self.ptc.CRE_TIME.search(inputString[startpos:])
            if m is not None:
                if leftmost_match[1] == 0 or \
                        leftmost_match[0] > m.start() + startpos:
                    leftmost_match[0] = m.start() + startpos
                    leftmost_match[1] = m.end() + startpos
                    leftmost_match[2] = m.group()
                    leftmost_match[3] = 2
                    leftmost_match[4] = 'timeStr'

            # HH:MM(:SS) am/pm time strings
            m = self.ptc.CRE_TIMEHMS2.search(inputString[startpos:])
            if m is not None:
                if leftmost_match[1] == 0 or \
                        leftmost_match[0] > m.start('hours') + startpos:
                    leftmost_match[0] = m.start('hours') + startpos
                    leftmost_match[1] = m.end('meridian') + startpos
                    leftmost_match[2] = inputString[leftmost_match[0]:
                                                    leftmost_match[1]]
                    leftmost_match[3] = 2
                    leftmost_match[4] = 'meridian'

            # HH:MM(:SS) time strings
            m = self.ptc.CRE_TIMEHMS.search(inputString[startpos:])
            if m is not None:
                if leftmost_match[1] == 0 or \
                        leftmost_match[0] > m.start('hours') + startpos:
                    leftmost_match[0] = m.start('hours') + startpos
                    if m.group('seconds') is not None:
                        leftmost_match[1] = m.end('seconds') + startpos
                    else:
                        leftmost_match[1] = m.end('minutes') + startpos
                    leftmost_match[2] = inputString[leftmost_match[0]:
                                                    leftmost_match[1]]
                    leftmost_match[3] = 2
                    leftmost_match[4] = 'timeStd'

            # Units only; must be preceded by a modifier
            if len(matches) > 0 and matches[-1][3] == 0:
                m = self.ptc.CRE_UNITS_ONLY.search(inputString[startpos:])
                # Ensure that any match is immediately proceded by the
                # modifier. "Next is the word 'month'" should not parse as a
                # date while "next month" should
                if m is not None and \
                        inputString[startpos:startpos +
                                    m.start()].strip() == '':
                    debug and log.debug('CRE_UNITS_ONLY matched [%s]',
                                        m.group())
                    if leftmost_match[1] == 0 or \
                            leftmost_match[0] > m.start() + startpos:
                        leftmost_match[0] = m.start() + startpos
                        leftmost_match[1] = m.end() + startpos
                        leftmost_match[2] = m.group()
                        leftmost_match[3] = 3
                        leftmost_match[4] = 'unitsOnly'

            # set the start position to the end pos of the leftmost match
            startpos = leftmost_match[1]

            # nothing was detected
            # so break out of the loop
            if startpos == 0:
                startpos = len(inputString)
            else:
                if leftmost_match[3] > 0:
                    m = self.ptc.CRE_NLP_PREFIX.search(
                        inputString[:leftmost_match[0]] +
                        ' ' + str(leftmost_match[3]))
                    if m is not None:
                        leftmost_match[0] = m.start('nlp_prefix')
                        leftmost_match[2] = inputString[leftmost_match[0]:
                                                        leftmost_match[1]]
                matches.append(leftmost_match)

        # find matches in proximity with one another and
        # return all the parsed values
        proximity_matches = []
        if len(matches) > 1:
            combined = ''
            from_match_index = 0
            date = matches[0][3] == 1
            time = matches[0][3] == 2
            units = matches[0][3] == 3
            for i in range(1, len(matches)):

                # test proximity (are there characters between matches?)
                endofprevious = matches[i - 1][1]
                begofcurrent = matches[i][0]
                if orig_inputstring[endofprevious:
                                    begofcurrent].lower().strip() != '':
                    # this one isn't in proximity, but maybe
                    # we have enough to make a datetime
                    # TODO: make sure the combination of
                    # formats (modifier, dateStd, etc) makes logical sense
                    # before parsing together
                    if date or time or units:
                        combined = orig_inputstring[matches[from_match_index]
                                                    [0]:matches[i - 1][1]]
                        parsed_datetime, flags = self.parse(combined,
                                                            sourceTime,
                                                            version)
                        proximity_matches.append((
                            datetime.datetime(*parsed_datetime[:6]),
                            flags,
                            matches[from_match_index][0],
                            matches[i - 1][1],
                            combined))
                    # not in proximity, reset starting from current
                    from_match_index = i
                    date = matches[i][3] == 1
                    time = matches[i][3] == 2
                    units = matches[i][3] == 3
                    continue
                else:
                    if matches[i][3] == 1:
                        date = True
                    if matches[i][3] == 2:
                        time = True
                    if matches[i][3] == 3:
                        units = True

            # check last
            # we have enough to make a datetime
            if date or time or units:
                combined = orig_inputstring[matches[from_match_index][0]:
                                            matches[len(matches) - 1][1]]
                parsed_datetime, flags = self.parse(combined, sourceTime,
                                                    version)
                proximity_matches.append((
                    datetime.datetime(*parsed_datetime[:6]),
                    flags,
                    matches[from_match_index][0],
                    matches[len(matches) - 1][1],
                    combined))

        elif len(matches) == 0:
            return None
        else:
            if matches[0][3] == 0:  # not enough info to parse
                return None
            else:
                combined = orig_inputstring[matches[0][0]:matches[0][1]]
                parsed_datetime, flags = self.parse(matches[0][2], sourceTime,
                                                    version)
                proximity_matches.append((
                    datetime.datetime(*parsed_datetime[:6]),
                    flags,
                    matches[0][0],
                    matches[0][1],
                    combined))

        return tuple(proximity_matches)