in datasets/imdb/pipelines/_images/run_csv_transform_kub/csv_transform.py [0:0]
def create_dataframe(extract_here: pathlib.Path) -> pd.DataFrame:
    """Build a single reviews DataFrame from the extracted ``aclImdb`` files.

    For every ``parent``/``child`` pair taken from ``TEST_TRAIN`` and
    ``NEG_POS_UNSUP`` (the ``test``/``unsup`` pair is skipped), the ``*.txt``
    files under ``{extract_here}/aclImdb/{parent}/{child}/`` are processed in
    sorted path order with ``allign_data``. Each per-folder frame then gets a
    ``movie_url`` column (from ``add_movie_url``) and a ``movie_id`` column
    (from ``add_movie_id``) before all frames are stacked together.

    Args:
        extract_here: Directory that contains the extracted ``aclImdb`` folder.

    Returns:
        One DataFrame holding the rows of every processed folder, re-indexed
        from 0. When no folder is processed the result is an empty frame with
        the ``REVIEW_COLS`` columns.
    """
    logging.info("Started creating Dataframe for reviews(data).")
    # Seed with an empty frame so the result always carries REVIEW_COLS (in
    # that order) even if no folder yields any rows.
    frames = [pd.DataFrame(columns=REVIEW_COLS)]
    for parent in TEST_TRAIN:
        for child in NEG_POS_UNSUP:
            if parent == "test" and child == "unsup":
                # Skip only this pair. `continue` (rather than `break`) keeps
                # the remaining children even if "unsup" is not the last entry
                # of NEG_POS_UNSUP.
                # NOTE(review): presumably the dataset ships no test/unsup
                # folder -- confirm.
                continue
            path = f"{extract_here}/aclImdb/{parent}/{child}/"
            csv_files = sorted(glob.glob(path + "*.txt"))
            logging.info(
                f"\tCreating Dataframe by reading files from {parent}-->{child}."
            )
            df_child = pd.DataFrame(
                [allign_data(file) for file in csv_files], columns=REVIEW_COLS
            )
            logging.info("\tAdding movie_url column")
            df_child["movie_url"] = add_movie_url(parent, child, df_child, "id_tag")
            logging.info("\tAdding movie_id column")
            df_child["movie_id"] = add_movie_id(df_child, "movie_url", -2)
            logging.info(
                f"\tTrying to concatenating main dataframe & child dataframe for {parent}-->{child} (folder)."
            )
            # Collect now and concatenate once after the loops: calling
            # pd.concat on every iteration re-copies all accumulated rows.
            frames.append(df_child)
            logging.info("\tChild Dataframe concatenated with main Dataframe df")
    df_reviews = pd.concat(frames, ignore_index=True)
    logging.info("Successfully Created Dataframe and assigned to variable df_reviews.")
    return df_reviews