def subselect()

in data_preparation/split_librilight/split.py [0:0]


def subselect(fnames, files2jsons, divisor=10):
    """Greedily pick a subset of files worth about ``1 / divisor`` of the total time.

    Files are visited in the order given by ``fnames``. A file is taken only
    if its length is strictly below the remaining budget of every one of its
    genres that has a budget; taking it charges its length to each of those
    genres. Iteration stops once the selected time exceeds
    ``overall_time // divisor``.

    Args:
        fnames: iterable of file names; each must be a key of ``files2jsons``.
            It is iterated more than once, so it should be a re-iterable
            container rather than a one-shot generator.
        files2jsons: mapping from file name to its metadata dict. The code
            reads ``['file_length_sec']`` and ``['book_meta']`` (whose
            optional ``'genre'`` entry is iterated as a collection of genre
            names) from each entry.
        divisor: the fraction ``1 / divisor`` of the overall and per-genre
            time is used as the selection budget.

    Returns:
        List of the selected file names, in their original order.
    """
    # Use the ``files2jsons`` argument throughout; the previous body referred
    # to an undefined/global ``fnames2jsons`` and ignored this parameter.
    overall_time = sum(
        files2jsons[fname]['file_length_sec'] for fname in fnames)
    print('Selecting from', overall_time / 60 / 60, 'hours')

    # NOTE(review): get_genre2time is defined elsewhere; presumably it maps
    # each genre to the total time of its files -- confirm against the helper.
    genre2time = get_genre2time(fnames, files2jsons)
    genre2budget = {
        genre: genre_time // divisor
        for genre, genre_time in genre2time.items()
    }

    total_budget = overall_time // divisor
    time_selected = 0
    selected_files = []

    for fname in fnames:
        if time_selected > total_budget:
            break

        data = files2jsons[fname]
        # Files with a missing or null genre are accounted under '<none>'.
        file_genres = data['book_meta'].get('genre')
        if file_genres is None:
            file_genres = ['<none>']
        length = data['file_length_sec']

        # Genres without a budget entry never block a file.
        fits = all(
            genre not in genre2budget or genre2budget[genre] > length
            for genre in file_genres)
        if not fits:
            continue

        time_selected += length
        selected_files.append(fname)
        for genre in file_genres:
            if genre in genre2budget:
                genre2budget[genre] -= length

    # time_selected is the sum of the selected files' lengths, accumulated in
    # selection order, so it can be reported directly.
    print('Selected', time_selected / 60 / 60, 'hours')

    return selected_files