in minihack/wiki.py
from collections import defaultdict
from typing import List
from urllib.parse import unquote


def process_json(wiki_json: List[dict], ignore_inpage_anchors: bool) -> dict:
    """Process a list of json pages of the wiki into one dict of all pages."""
    result: dict = {}
    redirects = {}
    # Anchor counts aggregated across every page, keyed by target page name.
    result["_global_counts"] = defaultdict(int)

    def href_normalise(x: str) -> str:
        # Local name avoids shadowing the enclosing `result` dict.
        normalised = unquote(x.lower())
        if ignore_inpage_anchors:
            normalised = normalised.split("#")[0]
        return normalised.replace("_", " ")

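    # Build one entry per page, keyed by its lower-cased title.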
    for page in wiki_json:
        relevant_page_info = dict(
            title=page["wikipedia_title"].lower(),
            length=len("".join(page["text"])),
            categories=page["categories"].split(","),
            raw_text="".join(page["text"]),
            text=clean_page_text(page["page_data"]),
        )
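        # Normalise each anchor: lower-cased link text, canonical target
        # page, and the character offset where the anchor starts.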
relevant_page_info["anchors"] = [
dict(
text=anchor["text"].lower(),
page=href_normalise(anchor.get("title", anchor.get("href"))),
start=anchor["start"],
)
for anchor in page["anchors"]
]
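        # An anchor whose href differs from its title indicates a redirect,
        # so record the href -> title mapping for aliasing below.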
        redirect_anchors = [
            anchor
            for anchor in page["anchors"]
            if anchor.get("title")
            and href_normalise(anchor["href"]) != href_normalise(anchor["title"])
        ]
        redirects.update(
            {
                href_normalise(anchor["href"]): href_normalise(anchor["title"])
                for anchor in redirect_anchors
            }
        )
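        # Tally anchor targets, both per page and across the whole wiki.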
        unique_anchors: dict = defaultdict(int)
        for anchor in relevant_page_info["anchors"]:
            unique_anchors[anchor["page"]] += 1
            result["_global_counts"][anchor["page"]] += 1
        relevant_page_info["unique_anchors"] = dict(unique_anchors)
        result[relevant_page_info["title"]] = relevant_page_info
    # Alias each redirect source to its target's entry; skip targets that
    # never appeared in the dump, which would otherwise raise a KeyError.
    for alias, page in redirects.items():
        if page in result:
            result[alias] = result[page]
    return result
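

# A minimal usage sketch. The sample record below is hypothetical; it only
# mirrors the fields process_json reads ("wikipedia_title", "text",
# "page_data", "categories", "anchors") and assumes clean_page_text is
# defined earlier in this module.
if __name__ == "__main__":
    sample = [
        {
            "wikipedia_title": "Yendor",
            "text": ["The Amulet of Yendor is a quest item."],
            "page_data": "The Amulet of Yendor is a quest item.",
            "categories": "artifacts,quest items",
            "anchors": [
                {
                    "text": "Amulet",
                    "href": "amulet_of_yendor",
                    "title": "Amulet of Yendor",
                    "start": 4,
                },
            ],
        },
    ]
    pages = process_json(sample, ignore_inpage_anchors=True)
    print(pages["yendor"]["unique_anchors"])  # {'amulet of yendor': 1}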