wikipedia/_tools/parse_documents.py:
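"""Stream pages out of bz2-compressed MediaWiki XML export dumps and print them as
newline-delimited JSON: an Elasticsearch-style bulk action line followed by the
document itself for each page.

Usage: python parse_documents.py <dump.xml.bz2> [<dump.xml.bz2> ...]
"""
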
import bz2
import json
import sys
from xml.etree import ElementTree  # cElementTree was removed in Python 3.9

PAGE_TAG = "page"
SITEINFO_TAG = "siteinfo"
XML_NAMESPACES = {"": "http://www.mediawiki.org/xml/export-0.11/"}
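

# Incrementally parse a decompressed dump stream and yield one dict per <page> element.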
def doc_generator(f):
namespaces = dict()
    for _, element in ElementTree.iterparse(f):
        # Tags arrive namespace-qualified, e.g. "{http://...}page"; keep only the local name.
        _, tag = element.tag.split("}")
        if tag == PAGE_TAG:
            yield parse_page(element, namespaces)
            element.clear()  # release the parsed element to keep memory bounded
        if tag == SITEINFO_TAG:
            # <siteinfo> precedes every <page> in a dump, so the namespace map is
            # populated before the first page is parsed.
            namespaces = parse_namespaces(element)
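

# Build the bulk-action metadata line; the page title doubles as the document id.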
def get_doc_meta(doc_data, op_type="index"):
return {op_type: {"_index": "wikipedia", "_id": doc_data["title"]}}
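

# Decompress a single dump file and print action/document line pairs to stdout.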
def to_json(f):
with bz2.BZ2File(f, "r") as fp:
for doc_data in doc_generator(fp):
print(json.dumps(get_doc_meta(doc_data)))
print(json.dumps(doc_data))
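

# Map each namespace key (e.g. "0", "1") to its name from the <siteinfo> block.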
def parse_namespaces(element) -> dict:
namespaces = dict()
for namespace_element in element.findall("namespaces/namespace", XML_NAMESPACES):
namespaces[namespace_element.get("key")] = namespace_element.text
return namespaces
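

# Extract title, redirect target or revision text, and namespace from a <page> element.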
def parse_page(element, namespaces):
page_data = {
"title": element.find("title", XML_NAMESPACES).text,
}
redirect = element.find("redirect", XML_NAMESPACES)
if redirect is not None:
page_data["redirect"] = redirect.get("title")
else:
page_data["content"] = element.find("revision/text", XML_NAMESPACES).text
    # The main namespace (key "0") is an empty element in <siteinfo>, so its mapped
    # value is None and the field is omitted for ordinary articles.
    namespace = namespaces[element.find("ns", XML_NAMESPACES).text]
    if namespace is not None:
        page_data["namespace"] = namespace
return page_data


if __name__ == "__main__":
    for file_name in sys.argv[1:]:  # each argument is a path to a .bz2 dump file
        to_json(file_name)