in data_preparation/split_librilight/split.py [0:0]
def subselect(fnames, files2jsons, divisor=10):
    """Greedily pick files until roughly ``1 / divisor`` of the time is taken.

    Files are visited in the order given by ``fnames``. A file is accepted
    only if, for every one of its genres that has a budget, the remaining
    budget of that genre is strictly larger than the file's length. Each
    genre's budget starts at ``genre_time // divisor``, where the per-genre
    times come from ``get_genre2time``. Selection stops as soon as the
    accumulated time exceeds ``overall_time // divisor``.

    Args:
        fnames: File names to choose from. Iterated more than once, so it
            must be re-iterable (e.g. a list, not a generator).
        files2jsons: Mapping from file name to its metadata. Each entry must
            provide ``'file_length_sec'`` and ``'book_meta'``; ``'genre'``
            inside ``'book_meta'`` may be absent or ``None``.
            NOTE(review): a present genre value is iterated, so it is
            presumably a list of genre names -- confirm against the data.
        divisor: Divides the overall and the per-genre time to obtain the
            respective budgets.

    Returns:
        List of the selected file names, in the order they were visited.
    """
    overall_time = sum(
        files2jsons[fname]['file_length_sec'] for fname in fnames)
    print('Selecting from', overall_time / 60 / 60, 'hours')

    genre2time = get_genre2time(fnames, files2jsons)
    genre2budget = {
        genre: time // divisor for genre, time in genre2time.items()}
    # Loop-invariant: the total amount of time we aim to select.
    time_budget = overall_time // divisor

    time_selected = 0
    selected_files = []
    for fname in fnames:
        if time_selected > time_budget:
            break

        data = files2jsons[fname]
        book_meta = data['book_meta']
        if 'genre' not in book_meta or book_meta['genre'] is None:
            # Files without genre information share one placeholder genre.
            file_genres = ['<none>']
        else:
            file_genres = book_meta['genre']
        length = data['file_length_sec']

        # Genres without a budget entry never block a file.
        fits = all(
            file_genre not in genre2budget
            or genre2budget[file_genre] > length
            for file_genre in file_genres)
        if not fits:
            continue

        time_selected += length
        selected_files.append(fname)
        # Charge the file's full length to each of its budgeted genres.
        for file_genre in file_genres:
            if file_genre in genre2budget:
                genre2budget[file_genre] -= length

    selected_time = sum(
        files2jsons[fname]['file_length_sec'] for fname in selected_files)
    print('Selected', selected_time / 60 / 60, 'hours')
    return selected_files