in data_preparation/complete_metadata.py
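# Imports assumed by this excerpt (the real ones live at the top of the file,
# outside the snippet). `ut` (the utilities module), `gather_all_genres`,
# `update_all_speaker_data`, `get_books_duplicates`, `clean_all_text_data`
# and the folding constants are project-internal helpers whose import paths
# are not shown here.
import json
import sys
from pathlib import Path
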
def main(argv):
    parser = parse_args()
    args = parser.parse_args(argv)
    if args.in_place:
        path_out = Path(args.path_metadata)
    elif args.out_dir is not None:
        path_out = Path(args.out_dir)
        path_out.mkdir(exist_ok=True)
    else:
        print("You must provide either an output directory or activate the "
              "in-place flag")
        parser.print_help()
        sys.exit(1)
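    # Invocation sketch (assuming parse_args() defines a positional
    # path_metadata argument plus --out_dir / --in_place flags):
    #   python complete_metadata.py <path_metadata> --out_dir <dir>
    #   python complete_metadata.py <path_metadata> --in_place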
    path_cache = path_out / ".cache"
    path_cache.mkdir(exist_ok=True)
    path_global_data_dir = path_out / "global"
    path_global_data_dir.mkdir(exist_ok=True)
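    # Layout created under the output directory:
    #   .cache/  pickled intermediate results (see load_cache below)
    #   global/  corpus-level files such as the duplicates index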
    # Get the list of all metadata
    print("Gathering the list of metadata")
    path_cache_metadata = path_cache / "metadata.pkl"
    list_metadata = ut.load_cache(path_cache_metadata,
                                  ut.get_all_metadata,
                                  args=(args.path_metadata,),
                                  ignore_cache=args.ignore_cache)
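    # ut.load_cache above acts as a memoizer: presumably it unpickles
    # path_cache_metadata when the file exists (and ignore_cache is not set),
    # and otherwise calls ut.get_all_metadata(args.path_metadata) and writes
    # the result back to the cache.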
    if args.debug:
        list_metadata = list_metadata[:10]
    # Retrieve the genres
    genre_list = gather_all_genres(args.path_metadata,
                                   list_metadata)
    ut.get_updated_metadata(genre_list, args.path_metadata, path_out, "genre")
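    # ut.get_updated_metadata appears to read each book's metadata from the
    # input directory (args.path_metadata here), attach the given values under
    # the given field name, and write the result to path_out; the later calls
    # pass path_out as both input and output so the updates accumulate there.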
    # Fold the genres
    reverse_folding_unique = ut.build_reverse_folding(UNIQUE_GENRE_FOLDING)
    reverse_folding_super = ut.build_reverse_folding(SUPER_GENDER_FOLDING)
    final_reverse_folding = ut.combine_reverse_foldings(reverse_folding_super,
                                                        reverse_folding_unique)
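    # Folding sketch (illustrative values, not the real tables): a reverse
    # folding maps each raw tag to its folded name, e.g.
    #   {"Epistolary Fiction": "Fiction", "War Stories": "Fiction", ...}
    # so chaining the unique- and super-level foldings sends a raw genre
    # straight to its super-genre in one lookup.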
    # Convert the "dramatic reading" option into a binary tag
    has_dramatic_reading = [(name, 'Dramatic Readings' in vals)
                            for name, vals in genre_list]
    ut.get_updated_metadata(has_dramatic_reading, path_out,
                            path_out, 'Dramatic Readings')
    genre_list = [(name, ut.remove_tag(vals, 'Dramatic Readings', 'Undefined'))
                  for name, vals in genre_list]
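    # 'Dramatic Readings' now lives in its own boolean field, so it is removed
    # from the tag lists ('Undefined' presumably being the fallback when no
    # tag remains) to keep it from also surfacing as a genre below.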
    folded_genres = [
        (name,
         ut.remove_multiple_tags(ut.apply_folding('+'.join(vals),
                                                  final_reverse_folding),
                                 SUPER_GENDER_ORDERING))
        for name, vals in genre_list]
    ut.get_updated_metadata(folded_genres, path_out,
                            path_out, "meta_genre")
    # Retrieve the readers' names
    update_all_speaker_data(list_metadata, args.path_metadata, path_out)
    # Look for duplicates
    duplicate_list = get_books_duplicates(args.path_metadata, list_metadata)
    path_out_duplicates = path_global_data_dir / "duplicates.json"
    print(f"Saving the duplicates index at {path_out_duplicates}")
    with open(path_out_duplicates, 'w') as file:
        json.dump(duplicate_list, file, indent=2)
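    # The exact shape of duplicates.json depends on get_books_duplicates, but
    # it groups books detected as copies of the same work, and being plain
    # JSON it can be reloaded later with json.load for filtering.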
    # Clean text data when possible
    text_status = clean_all_text_data(list_metadata, args.path_metadata,
                                      str(path_out))
    ut.get_updated_metadata(text_status, path_out,
                            path_out, "transcription_status")