# load_hathitrust_book()
#
# in data_preparation/text_retrieval/hathitrust.py [0:0]


def load_hathitrust_book(url):
    """Return the text of the HathiTrust book referenced by *url*.

    Two kinds of url are recognised:

    * a url containing ``catalog.hathitrust.org``: the page is fetched and
      fed to ``CatalogParser``; the parser's ``candidatesID`` gives the
      ids to try;
    * any other url must contain ``cgi/ssd?``; the book id is taken from
      the ``id`` parameter that follows it (parameters may be separated
      by ``;`` or ``&``).

    Candidate ids are passed to ``load_whole_book`` in order and the first
    text obtained is returned.

    Raises:
        RuntimeError: ``"Invalid url"`` when no book id can be extracted
            from *url*; ``"Couldn't find any transcription"`` when
            ``load_whole_book`` produced no text for any candidate id.
    """
    if "catalog.hathitrust.org" in url:
        catalog_parser = CatalogParser()
        response = requests.get(url)
        # Use the public ``content`` bytes rather than the private
        # ``_content`` attribute; the page is decoded as UTF-8 as before.
        catalog_parser.feed(response.content.decode('utf-8'))

        candidate_ids = catalog_parser.candidatesID
        if not candidate_ids:
            raise RuntimeError("Invalid url")
    else:
        key = "cgi/ssd?"
        _, found, query = url.partition(key)
        if not found:
            raise RuntimeError("Invalid url")

        candidate_ids = None
        # Accept both ';' and '&' as parameter separators.
        for marker in query.replace('&', ';').split(';'):
            # partition() never raises, so markers without '=' (or with
            # several) are simply skipped / kept whole instead of
            # crashing with a ValueError.
            name, _, value = marker.partition('=')
            if name == "id" and value:
                candidate_ids = [value]
                break

        if candidate_ids is None:
            raise RuntimeError("Invalid url")

    text = None
    for book_id in candidate_ids:
        try:
            text = load_whole_book(book_id)
        except RuntimeError:
            # This candidate has no usable transcription; try the next.
            continue
        if text is not None:
            # Stop at the first success instead of fetching (and possibly
            # overwriting the result with) the remaining candidates.
            break

    if text is None:
        raise RuntimeError("Couldn't find any transcription")

    return text