in data_preparation/text_retrieval/hathitrust.py [0:0]
def load_hathitrust_book(url):
    """Download the transcription of a HathiTrust book given its url.

    Two kinds of url are handled:

    * urls containing ``catalog.hathitrust.org``: the page is downloaded and
      fed to a ``CatalogParser``; the ids it collects in ``candidatesID``
      are the candidates to try;
    * urls containing ``cgi/ssd?``: the single candidate is the value of the
      ``id`` parameter of the query string. Parameters may be separated by
      ``;`` or ``&``.

    Args:
        url: address of the HathiTrust page.

    Returns:
        The result of ``load_whole_book`` for the first candidate id that
        could be loaded.

    Raises:
        RuntimeError: "Invalid url" when no book id can be extracted from
            the url, "Couldn't find any transcription" when
            ``load_whole_book`` fails for every candidate.
    """
    if "catalog.hathitrust.org" in url:
        catalog_parser = CatalogParser()
        response = requests.get(url)
        # Use the public ``content`` accessor rather than the private
        # ``_content`` attribute of the response.
        catalog_parser.feed(response.content.decode('utf-8'))
        candidate_ids = catalog_parser.candidatesID
        if not candidate_ids:
            raise RuntimeError("Invalid url")
    else:
        key = "cgi/ssd?"
        start_offset = url.find(key)
        if start_offset < 0:
            raise RuntimeError("Invalid url")

        query = url[start_offset + len(key):]
        candidate_ids = None
        # Normalise '&' to ';' so both separator styles are accepted.
        for marker in query.replace('&', ';').split(';'):
            # partition() never fails: fragments without '=' (bare flags,
            # empty strings) are simply skipped instead of raising
            # ValueError on unpacking.
            name, _, value = marker.partition('=')
            if name == "id" and value:
                candidate_ids = [value]
                break

        if candidate_ids is None:
            raise RuntimeError("Invalid url")

    text = None
    for book_id in candidate_ids:
        try:
            text = load_whole_book(book_id)
        except RuntimeError:
            # This candidate has no usable transcription: try the next one.
            continue
        if text is not None:
            # Stop at the first success instead of downloading every
            # remaining candidate.
            break

    if text is None:
        raise RuntimeError("Couldn't find any transcription")
    return text