in src/extract_wikidata_info.py [0:0]
def extract_popularities(popularity_dump):
    """Iterate through the Wikipedia popularity dump without decompressing
    it, storing each English Wikipedia page's number of page views.

    The compressed file is streamed line by line in text mode. Each line is
    split on whitespace; a line is counted only if it has exactly six fields
    and its first field is ``"en.wikipedia"``. For a counted line, the
    integer value of the fifth field is added to the running total stored
    under the second field (the page name), so a page that appears on
    several lines has its counts summed.

    Args:
        popularity_dump: ``str`` A path to a .BZ2 file containing Wikipedia
            page views for a day.

    Returns:
        wiki_popularity: ``dict`` Maps from a Wikipedia page to the daily
            page view count. The concrete type is
            ``collections.defaultdict(int)``, so indexing a page that was
            never seen yields 0 (and inserts that key).

    Raises:
        ValueError: If the fifth field of a counted line cannot be parsed
            as an integer.
    """
    wiki_popularity = collections.defaultdict(int)
    # Decode explicitly as UTF-8: without ``encoding`` the text layer falls
    # back to the platform's locale encoding, which garbles (or fails on)
    # non-ASCII page titles on systems whose default is not UTF-8.
    with bz2.open(popularity_dump, "rt", encoding="utf-8") as bz_file:
        # Each line corresponds to the number of page views for a Wikipedia page
        for line in tqdm.tqdm(bz_file, desc="Loading Wikipedia popularity values"):
            # ``split()`` with no argument already discards leading and
            # trailing whitespace, including the newline.
            fields = line.split()
            # Skip lines w/o right len or Wikipedia pages that aren't in English
            if len(fields) == 6 and fields[0] == "en.wikipedia":
                wiki_popularity[fields[1]] += int(fields[4])
    print(f"Found {len(wiki_popularity)} English Wikipedia pages")
    return wiki_popularity