in metrics/conlleval.py [0:0]
def countChunks(fileIterator, delimiter=None, raw=False, oTag="O"):
    """Count chunks and tags by comparing the last two columns of each row.

    The second-to-last column of a row is treated as the gold ("correct")
    tag and the last column as the predicted ("guessed") tag.

    Args:
        fileIterator: an iterable of rows. Each row is either a ``str``
            (stripped and split on ``delimiter``) or an already-split
            sequence of columns (online evaluation). Every row that is not
            a sentence boundary must have at least 3 columns. A row is a
            sentence boundary when it is empty/blank or when its first
            column equals ``SENT_BOUNDARY``. Sequence example::

                [
                    ['', y_true, y_pred],
                    ...,
                    [SENT_BOUNDARY, 'O', 'O'],  # sentence boundary
                    ['', y_true, y_pred],       # next sentence
                    ...
                ]
        delimiter: separator handed to ``str.split`` for ``str`` rows;
            ``None`` splits on runs of whitespace.
        raw: forwarded unchanged to ``splitTag``.
        oTag: forwarded to ``splitTag`` and used to fill the tag columns of
            the synthetic sentence-boundary row.

    Returns:
        A tuple ``(correctChunk, foundGuessed, foundCorrect, correctTags,
        tokenCounter)`` where the first three are ``defaultdict(int)``
        mappings from chunk type to a count (chunks matched in both
        columns, chunks started in the predicted column, chunks started in
        the gold column), ``correctTags`` is the number of non-boundary
        rows whose gold and predicted tags agree, and ``tokenCounter`` is
        the number of non-boundary rows.

    Raises:
        IOError: if a non-boundary row has fewer than 3 columns.
    """
    correctChunk = defaultdict(int)  # number of correctly identified chunks
    foundCorrect = defaultdict(int)  # number of chunks in corpus per type
    foundGuessed = defaultdict(int)  # number of identified chunks per type

    tokenCounter = 0  # token counter (ignores sentence breaks)
    correctTags = 0  # number of correct chunk tags

    inCorrect = False  # currently processed chunk is correct until now
    lastCorrect, lastCorrectType = "O", None  # previous chunk tag in corpus
    lastGuessed, lastGuessedType = "O", None  # previously identified chunk tag

    for line in fileIterator:
        # each non-empty line must contain >= 3 columns
        if isinstance(line, str):
            stripped = line.strip()
            # "".split(sep) returns [""] (not []) when sep is given, so a
            # blank line has to be detected before splitting.
            features = stripped.split(delimiter) if stripped else []
        else:  # support online evaluation
            features = line

        if not features or features[0] == SENT_BOUNDARY:
            # insert a sentence boundary
            features = [SENT_BOUNDARY, oTag, oTag]
        elif len(features) < 3:
            # Wrap in a 1-tuple so that a tuple-valued row is formatted as a
            # single value instead of being unpacked as format arguments.
            raise IOError(
                "conlleval: unexpected number of features in line %s\n" % (line,)
            )

        # extract tags from last 2 columns
        guessed, guessedType = splitTag(features[-1], oTag=oTag, raw=raw)
        correct, correctType = splitTag(features[-2], oTag=oTag, raw=raw)

        # 1999-06-26 sentence breaks should always be counted as out of chunk
        firstItem = features[0]
        if firstItem == SENT_BOUNDARY:
            guessed, guessedType = "O", None

        # decide whether current chunk is correct until now
        if inCorrect:
            endOfCorrect = endOfChunk(lastCorrect, correct, lastCorrectType, correctType)
            endOfGuessed = endOfChunk(lastGuessed, guessed, lastGuessedType, guessedType)
            if endOfCorrect and endOfGuessed and lastGuessedType == lastCorrectType:
                # both chunks closed on the same row with the same type
                inCorrect = False
                correctChunk[lastCorrectType] += 1
            elif endOfCorrect != endOfGuessed or guessedType != correctType:
                # boundaries or types diverged: the open chunk is a miss
                inCorrect = False

        startOfGuessed = startOfChunk(lastGuessed, guessed, lastGuessedType, guessedType)
        startOfCorrect = startOfChunk(lastCorrect, correct, lastCorrectType, correctType)
        if startOfCorrect and startOfGuessed and guessedType == correctType:
            inCorrect = True
        if startOfCorrect:
            foundCorrect[correctType] += 1
        if startOfGuessed:
            foundGuessed[guessedType] += 1

        if firstItem != SENT_BOUNDARY:
            if correct == guessed and guessedType == correctType:
                correctTags += 1
            tokenCounter += 1

        lastGuessed, lastGuessedType = guessed, guessedType
        lastCorrect, lastCorrectType = correct, correctType

    # a matching chunk still open at end of input counts as correct
    if inCorrect:
        correctChunk[lastCorrectType] += 1

    return correctChunk, foundGuessed, foundCorrect, correctTags, tokenCounter