in datasets/imdb/pipelines/_images/run_csv_transform_kub/csv_transform.py [0:0]
def get_title_basics(source_file: dict) -> pd.DataFrame:
    """Load the title-basics file and normalize selected columns.

    The file at ``source_file["url_data"]`` is read as a tab-separated,
    gzip-compressed CSV. Rows whose ``isAdult`` value is not one of the
    accepted 0/1 spellings (or the ``\\N`` placeholder) are dropped, then
    the column-specific clean-up helpers are applied.

    Args:
        source_file: Mapping that provides the input location under the
            ``"url_data"`` key; an empty string is used when the key is
            missing.

    Returns:
        The filtered DataFrame after the per-column helpers have run.
    """
    url = source_file.get("url_data", "")
    logging.info(f"Reading data from {url}")
    df = pd.read_csv(url, sep="\t", compression="gzip")

    # Keep only rows where isAdult holds a recognised flag value.
    df = df[df.isAdult.isin([0, "0", "0.0", 0.0, 1, "1", "1.0", 1.0, "\\N"])]

    # The helpers' return values are not used, so they are expected to
    # modify ``df`` in place.
    for col in df:
        # Compare with ``==``: ``col in ("isAdult")`` is a substring test
        # against a plain string (the parentheses do not make a tuple), which
        # would also match labels such as "is" or "Adult".
        if col == "isAdult":
            coldata_replace(df, col, {**REPLACE_BINARY_DICT, **REPLACE_DICT})
            convert_int(df, col, "Int64")
        elif col in ("startYear", "endYear", "runtimeMinutes"):
            convert_digit(df, col)
            convert_int(df, col, "Int64")
        elif col == "genres":
            coldata_replace(df, col, REPLACE_DICT)
    return df