in scripts_mgenre/preprocess_anchors.py [0:0]
def clean_anchor_lang(anchor, lang):
    """Strip wiki-link prefixes from a percent-encoded anchor target.

    Handled forms (``%3A`` is the percent-encoded ``:``):

    * ``https%3A//<host>.wikipedia.org/wiki/<title>`` -- the title is
      everything after ``/wiki/`` and the language becomes the first
      dot-separated label of the host.
    * leading ``%3A<lang>``, ``%3A``, ``w%3A<lang>`` or ``w%3A`` -- the prefix
      is removed and the language is left unchanged.

    Prefixes are removed repeatedly until none of the forms applies.

    Args:
        anchor: Percent-encoded link target.
        lang: Language code currently associated with the anchor.

    Returns:
        A ``(anchor, lang)`` tuple with the cleaned anchor and the language
        it belongs to.
    """
    if re.match(r"^https%3A//(.*)\.wikipedia\.org/wiki/", anchor):
        # Drop the 10-character "https%3A//" scheme, then split off only the
        # host and the "wiki" segment, so that titles which themselves
        # contain "/" (e.g. "AC/DC") are kept whole instead of being cut at
        # their first slash.
        host, _, title = anchor[10:].split("/", 2)
        return clean_anchor_lang(title, host.split(".")[0])

    # Order matters: the language-qualified form of each prefix is tried
    # before its bare form so the language code is stripped along with it.
    # NOTE(review): the language-qualified prefixes do not require a
    # separator after the code, so e.g. "%3Afile..." with lang "fi" loses
    # "%3Afi" -- confirm whether that is intended.
    prefixes = (
        "%3A{}".format(lang),
        "%3A",
        "w%3A{}".format(lang),
        "w%3A",
    )
    for prefix in prefixes:
        if anchor.startswith(prefix):
            return clean_anchor_lang(anchor[len(prefix):], lang)

    return anchor, lang