in data_preparation/download_librivox.py [0:0]
def import_page(baseUrl: str, offset: int, limit: int, dirOut: str,
                language: str) -> int:
    """Fetch one page of a JSON book catalogue and download its books.

    Requests ``{baseUrl}/?offset={offset}&format=json&limit={limit}`` and,
    for every listed book whose ``language`` field equals ``language``,
    writes into ``dirOut``:

    * ``<name>_speaker_data.json`` -- chapter names and readers as returned
      by ``get_reader_data`` (both ``None`` when that lookup fails),
    * ``<name>_metadata.json`` -- the raw catalogue entry of the book,
    * ``<name>_text.txt`` -- the text returned by ``get_text_data``,
    * ``<name>.zip`` -- the audio archive, unless the file already exists.

    ``<name>`` is the base name of the book's ``url_zip_file`` without its
    extension.  Failures while loading the speaker data, the text or the
    archive of one book are reported and do not stop the remaining books.

    Args:
        baseUrl: Root URL of the catalogue endpoint.
        offset: Index of the first catalogue entry to request.
        limit: Maximum number of catalogue entries to request.
        dirOut: Existing directory receiving all the output files.
        language: Only books whose ``language`` field equals this value
            are processed.

    Returns:
        The cumulated size, in bytes, of the text files written and of the
        archives successfully downloaded by this call, or ``-1`` when the
        response contains an ``"error"`` key or lists no book.

    Raises:
        RuntimeError: If the response has neither an ``"error"`` nor a
            ``"books"`` key.
    """
    urlRequest = f'{baseUrl}/?offset={offset}&format=json&limit={limit}'
    response = requests.get(urlRequest).json()
    if "error" in response:
        return -1
    if "books" not in response:
        raise RuntimeError(f"Invalid url {baseUrl}")

    bookList = response['books']
    if not bookList:
        return -1

    fullSize = 0
    for bookData in bookList:
        if bookData['language'] != language:
            continue

        title = bookData['title']
        print(f'Loading title {title}...')

        # Speaker data is best-effort: on failure both fields stay None and
        # are still written out, so every book gets a speaker file.
        chapterReaders, chaptersNames = None, None
        try:
            chaptersNames, chapterReaders = \
                get_reader_data(bookData['url_librivox'])
        except Exception:
            # `Exception` rather than a bare except, so that Ctrl-C and
            # sys.exit() are not mistaken for a metadata error.
            print(colored(f'Error when loading title {title} metadata', 'red'))
            print(colored(sys.exc_info(), 'red'))

        name = os.path.splitext(os.path.basename(bookData['url_zip_file']))[0]
        outSpeaker = os.path.join(dirOut, f'{name}_speaker_data.json')
        with open(outSpeaker, 'w') as file:
            json.dump({"names": chaptersNames,
                       "readers": chapterReaders},
                      file, indent=2)
        print(f'{title}\'s speaker data loaded')

        outMetadata = os.path.join(dirOut, f'{name}_metadata.json')
        with open(outMetadata, 'w') as file:
            json.dump(bookData, file, indent=2)
        print(f'{title}\'s metadata loaded')

        try:
            txtData = get_text_data(bookData['url_text_source'])
            outTxt = os.path.join(dirOut, f'{name}_text.txt')
            # Explicit UTF-8: book text may contain characters that the
            # platform's default encoding cannot represent.
            with open(outTxt, 'w', encoding='utf-8') as file:
                file.write(txtData)
            print('... text data loaded')
            fullSize += os.path.getsize(outTxt)
        except Exception:
            print(colored(f'Error when loading {title}\'s text data', 'red'))

        print(f'Loading audio data at {bookData["url_zip_file"]}')
        outPath = os.path.join(dirOut, name + ".zip")
        if not os.path.isfile(outPath):
            try:
                urllib.request.urlretrieve(bookData['url_zip_file'], outPath,
                                           RequestPBar())
                # Count the archive only once it is fully on disk. Measuring
                # the file avoids a second HTTP request (whose response was
                # never closed) and works without a Content-Length header.
                fullSize += os.path.getsize(outPath)
            except KeyboardInterrupt:
                # Never leave a truncated archive behind: a later run would
                # see the file and skip the download.
                if os.path.isfile(outPath):
                    os.remove(outPath)
                sys.exit()
            except Exception:
                if os.path.isfile(outPath):
                    os.remove(outPath)
                print(colored(f'Error when loading {title}\'s audio data',
                              'red'))

        print('')
    return fullSize