def countChunks()

in metrics/conlleval.py [0:0]


def countChunks(fileIterator, delimiter=None, raw=False, oTag="O"):
    """
    Process input in the given format and count chunks using the last two columns.

    Args:
        fileIterator: either an input stream/stdin yielding text lines, or a
            sequence of already-split rows (each non-boundary row must have
            at least 3 columns; the last two are read as y_true and y_pred).
            list example:
                [
                    ['', y_true, y_pred],
                    ...,
                    [SENT_BOUNDARY, 'O', 'O'],      # as sentence boundary
                    ['', y_true, y_pred],        # next sentence
                    ...
                ]
        delimiter: column separator handed to ``str.split`` for text lines;
            ``None`` splits on runs of whitespace. Ignored for pre-split rows.
        raw: forwarded unchanged to ``splitTag``.
        oTag: the "outside" tag; used to fill the synthetic boundary row and
            forwarded to ``splitTag``.

    Returns:
        A tuple ``(correctChunk, foundGuessed, foundCorrect, correctTags,
        tokenCounter)``. The first three are ``defaultdict(int)`` keyed by
        chunk type; the last two are ints counting matching tags and
        non-boundary tokens respectively.

    Raises:
        IOError: if a non-empty, non-boundary line has fewer than 3 columns.
    """
    correctChunk = defaultdict(int)     # number of correctly identified chunks
    foundCorrect = defaultdict(int)     # number of chunks in corpus per type
    foundGuessed = defaultdict(int)     # number of identified chunks per type

    tokenCounter = 0     # token counter (ignores sentence breaks)
    correctTags = 0      # number of correct chunk tags

    inCorrect = False    # currently processed chunk is correct until now
    lastCorrect, lastCorrectType = "O", None    # previous chunk tag in corpus
    lastGuessed, lastGuessedType = "O", None    # previously identified chunk tag

    for line in fileIterator:
        # each non-empty line must contain >= 3 columns
        if isinstance(line, str):
            stripped = line.strip()
            # A blank line is a sentence boundary whatever the delimiter is;
            # "".split(",") would yield [""] and be mistaken for a short row.
            features = stripped.split(delimiter) if stripped else []
        else:   # support online evaluation with pre-split rows
            features = line
        if not features or features[0] == SENT_BOUNDARY:
            # insert a sentence boundary
            features = [SENT_BOUNDARY, oTag, oTag]
        elif len(features) < 3:
            # Wrap in a 1-tuple so a tuple-shaped row is printed as a whole
            # rather than unpacked as multiple format arguments.
            raise IOError(
                "conlleval: unexpected number of features in line %s\n" % (line,)
            )

        # extract tags from last 2 columns
        guessed, guessedType = splitTag(features[-1], oTag=oTag, raw=raw)
        correct, correctType = splitTag(features[-2], oTag=oTag, raw=raw)

        # 1999-06-26 sentence breaks should always be counted as out of chunk
        firstItem = features[0]
        isBoundary = firstItem == SENT_BOUNDARY
        if isBoundary:
            guessed, guessedType = "O", None

        # decide whether current chunk is correct until now
        if inCorrect:
            endOfCorrect = endOfChunk(lastCorrect, correct, lastCorrectType, correctType)
            endOfGuessed = endOfChunk(lastGuessed, guessed, lastGuessedType, guessedType)
            if endOfCorrect and endOfGuessed and lastGuessedType == lastCorrectType:
                # both chunks closed at the same token with the same type
                inCorrect = False
                correctChunk[lastCorrectType] += 1
            elif endOfCorrect != endOfGuessed or guessedType != correctType:
                # the two chunks diverged in extent or in type
                inCorrect = False

        startOfGuessed = startOfChunk(lastGuessed, guessed, lastGuessedType, guessedType)
        startOfCorrect = startOfChunk(lastCorrect, correct, lastCorrectType, correctType)
        if startOfCorrect and startOfGuessed and guessedType == correctType:
            inCorrect = True
        if startOfCorrect:
            foundCorrect[correctType] += 1
        if startOfGuessed:
            foundGuessed[guessedType] += 1

        if not isBoundary:
            if correct == guessed and guessedType == correctType:
                correctTags += 1
            tokenCounter += 1

        lastGuessed, lastGuessedType = guessed, guessedType
        lastCorrect, lastCorrectType = correct, correctType

    # a chunk still open (and still matching) at end of input counts as correct
    if inCorrect:
        correctChunk[lastCorrectType] += 1

    return correctChunk, foundGuessed, foundCorrect, correctTags, tokenCounter