in datasets/hacker_news/pipelines/_images/run_csv_transform_kub/csv_transform.py [0:0]
# Columns written to the batch CSV, in output order.
_OUTPUT_COLUMNS = (
    "title",
    "url",
    "text",
    "dead",
    "by",
    "score",
    "time",
    "timestamp",
    "type",
    "id",
    "parent",
    "descendants",
    "ranking",
    "deleted",
)

# Numeric columns rendered as integer strings, with nulls written as "".
_INTEGER_COLUMNS = ("time", "descendants", "score", "parent")

# Free-text columns scrubbed of line breaks and NUL characters.
_FREE_TEXT_COLUMNS = ("text", "title")


def _csv_path_for(source_json_file: str) -> str:
    """Return the CSV path that sits next to *source_json_file*.

    Only the final extension is swapped. A path that does not end in
    ``.json`` gets ``.csv`` appended, so the result never equals the source
    path (the caller deletes any pre-existing file at the returned path).
    """
    root, ext = os.path.splitext(source_json_file)
    if ext.lower() == ".json":
        return root + ".csv"
    return source_json_file + ".csv"


def _nullable_int_to_text(column: pd.Series) -> pd.Series:
    """Render a numeric column as integer strings, mapping nulls to ""."""
    # Going through the nullable "Int64" dtype keeps 123.0 from being written
    # as "123.0"; missing values stringify as "<NA>" and are blanked out.
    as_text = column.astype("Int64").astype("str")
    return as_text.apply(lambda value: "" if value == "<NA>" else value)


def convert_json_file_to_csv(source_json_file: str) -> str:
    """Convert one JSON batch file into a pipe-delimited CSV file.

    The JSON document must be an object whose ``"data"`` entry is a list of
    item records. The records are flattened one level, integer-like columns
    are rendered without a fractional part, a human-readable UTC
    ``timestamp`` column is derived from the epoch ``time`` column, line
    breaks and NUL characters are stripped from ``text`` and ``title``, and
    the columns are written in a fixed order via ``save_to_new_file``.

    Args:
        source_json_file: Path of the JSON batch file to read.

    Returns:
        Path of the CSV file that was written.

    Raises:
        OSError: If the source file cannot be read or a stale CSV cannot be
            removed.
        KeyError: If the JSON document has no ``"data"`` entry.
    """
    # Function-scope import: only ``datetime`` is known to be imported by
    # the surrounding file.
    from datetime import timezone

    target_file_batch_csv = _csv_path_for(source_json_file)
    if os.path.isfile(target_file_batch_csv):
        # Remove the [local] batch csv file if it exists
        os.remove(target_file_batch_csv)

    with open(source_json_file, "r", encoding="utf-8") as source_json:
        data = json.load(source_json)
    df = pd.json_normalize(data["data"], max_level=0)

    # A batch in which no record carries a given key (or an empty batch)
    # yields no such column; add it as all-null so the steps below and the
    # final projection do not raise KeyError.
    for name in _OUTPUT_COLUMNS:
        if name not in df.columns:
            df[name] = float("nan")

    for name in _INTEGER_COLUMNS:
        df[name] = _nullable_int_to_text(df[name])

    # The value is labelled "UTC", so the conversion must be done in UTC and
    # not in the host's local timezone.
    df["timestamp"] = df["time"].apply(
        lambda epoch: (
            ""
            if epoch == ""
            else (
                datetime.fromtimestamp(int(epoch), tz=timezone.utc).strftime(
                    "%Y-%m-%d %H:%M:%S"
                )
                + " UTC"
            )
        )
    )

    df["ranking"] = ""

    for name in _FREE_TEXT_COLUMNS:
        # Embedded line breaks would split a CSV row; NUL characters are
        # dropped entirely.
        df[name] = df[name].replace(r"[\n\r]", " ", regex=True)
        df[name] = df[name].replace(r"\x00", "", regex=True)

    # Keep only the literal strings "True"/"False"; anything else is blanked.
    # NOTE(review): values decoded from JSON booleans are Python bools, not
    # strings, and are therefore blanked as well -- confirm this is intended.
    # Done before the projection below so the assignment does not target a
    # slice of another frame (SettingWithCopyWarning).
    df["deleted"] = df["deleted"].apply(
        lambda flag: flag if (flag == "True" or flag == "False") else ""
    )

    df = df[list(_OUTPUT_COLUMNS)]
    save_to_new_file(df, file_path=str(target_file_batch_csv), sep="|")
    return str(target_file_batch_csv)