in datasets/iamdb.py [0:0]
def load_metadata(data_path, wordsep, use_words=False):
    """Parse a metadata listing into per-form entries.

    Reads ``words.txt`` (when ``use_words`` is true) or ``lines.txt`` from
    ``data_path``. Every non-comment, non-blank line is split on whitespace
    into fields:

    * field 0 is a hyphen-separated identifier; its first two components
      form the form key and its first three the entry key,
    * four consecutive integer fields give the bounding box, starting at
      field 3 for ``words.txt`` and field 4 for ``lines.txt``,
    * fields 8 onward are the transcription.

    Args:
        data_path: Directory containing the metadata file.
        wordsep: Replacement for word separators in the transcription
            (runs of ``|`` and any single whitespace character).
        use_words: Read word-level metadata instead of line-level metadata.

    Returns:
        A ``collections.defaultdict(list)`` mapping each form key to a list
        of ``{"key", "box", "text"}`` dicts in file order.
    """
    forms = collections.defaultdict(list)
    filename = "words.txt" if use_words else "lines.txt"
    # The box columns sit one field earlier in words.txt than in lines.txt.
    box_idx = 3 if use_words else 4
    sep_pattern = re.compile(r"\|+|\s")

    with open(os.path.join(data_path, filename), "r") as fid:
        for raw in fid:
            stripped = raw.strip()
            # Skip blank lines (which previously raised IndexError) as well
            # as comment lines, including comments with leading whitespace.
            if not stripped or stripped.startswith("#"):
                continue
            fields = stripped.split()

            # skip word segmentation errors
            if use_words and fields[1] == "err":
                continue

            text = " ".join(fields[8:])
            # remove garbage tokens:
            text = text.replace("#", "")
            # swap word sep from | to wordsep
            text = sep_pattern.sub(wordsep, text).strip(wordsep)

            id_parts = fields[0].split("-")
            form_key = "-".join(id_parts[:2])
            line_key = "-".join(id_parts[:3])
            box = tuple(int(val) for val in fields[box_idx : box_idx + 4])

            forms[form_key].append(
                {
                    "key": line_key,
                    "box": box,
                    "text": text,
                }
            )
    return forms