in genre/utils.py [0:0]
def extract_pages(filename):
docs = {}
with open(filename) as f:
for line in f:
# CASE 1: beginning of the document
if line.startswith("<doc id="):
doc = ET.fromstring("{}{}".format(line, "</doc>")).attrib
doc["paragraphs"] = []
doc["anchors"] = []
# CASE 2: end of the document
elif line.startswith("</doc>"):
assert doc["id"] not in docs, "{} ({}) already in dict as {}".format(
doc["id"], doc["title"], docs[doc["id"]]["title"]
)
docs[doc["id"]] = doc
# CASE 3: in the document
else:
doc["paragraphs"].append("")
try:
line = BeautifulSoup(line, "html.parser")
except:
print("error line `{}`".format(line))
line = [line]
for span in line:
if isinstance(span, bs4.element.Tag):
if span.get("href", None):
doc["anchors"].append(
{
"text": span.get_text(),
"href": span["href"],
"paragraph_id": len(doc["paragraphs"]) - 1,
"start": len(doc["paragraphs"][-1]),
"end": len(doc["paragraphs"][-1])
+ len(span.get_text()),
}
)
doc["paragraphs"][-1] += span.get_text()
else:
doc["paragraphs"][-1] += str(span)
return docs