def clean_anchor_lang()

in scripts_mgenre/preprocess_anchors.py [0:0]


def clean_anchor_lang(anchor, lang):
    """Strip Wikipedia URL and ``%3A``-style prefixes from an anchor.

    The function looks for the literal sequence ``%3A`` (the percent-encoding
    of ``:``) and removes recognised prefixes repeatedly until none applies:

    * ``https%3A//<host>.wikipedia.org/wiki/<title>`` -- the anchor becomes
      ``<title>`` and ``lang`` becomes the first dot-separated label of
      ``<host>``;
    * ``%3A<lang>``, ``%3A``, ``w%3A<lang>`` or ``w%3A`` (tried in that
      order) -- the prefix is dropped and ``lang`` is kept.

    Args:
        anchor: Anchor string to clean.
        lang: Language code currently associated with the anchor.

    Returns:
        An ``(anchor, lang)`` tuple with every recognised prefix removed.
    """
    scheme = "https%3A//"
    # Iterate instead of recursing so that an anchor consisting of a long run
    # of prefixes cannot hit the interpreter's recursion limit.
    while True:
        if re.match(r"^https%3A//(.*)\.wikipedia\.org/wiki/", anchor):
            # After dropping the scheme the string is "<host>/wiki/<title>".
            # maxsplit=2 keeps titles that themselves contain "/" intact
            # (e.g. "AC/DC") instead of truncating them at the first slash.
            host, _, anchor = anchor[len(scheme):].split("/", 2)
            lang = host.split(".")[0]
            continue

        # NOTE(review): "%3A<lang>" is matched without checking what follows
        # the language code, so e.g. "%3Aenergy" with lang "en" loses "en".
        # Kept as in the original; confirm whether that is intended.
        prefixes = (
            "%3A{}".format(lang),
            "%3A",
            "w%3A{}".format(lang),
            "w%3A",
        )
        for prefix in prefixes:
            if anchor.startswith(prefix):
                anchor = anchor[len(prefix):]
                break
        else:
            # No prefix matched: the anchor is fully cleaned.
            return anchor, lang