in dataset_prep.py [0:0]
def prepare_data(df_raw: "pd.DataFrame"):
    """Build the region/state hierarchy and the daily quantity table.

    Only rows whose ``item`` column equals ``"mens_clothing"`` are used.
    Each remaining row is labelled with a bottom-level node name
    ``"<region>_<state>"``; the quantities are pivoted into one column per
    bottom-level node (indexed by ``date``), one aggregated column is added
    per region, plus a ``"total"`` column that sums the region columns.
    Finally the table is resampled to daily frequency using ``sum``.

    Args:
        df_raw: Frame providing at least the columns ``item``, ``region``,
            ``state``, ``date`` and ``quantity``.

    Returns:
        A 3-tuple ``(hierarchy, product_bottom_level, region_states)``:

        * ``hierarchy`` -- dict mapping each parent node to the list of its
          children: ``"total"`` -> regions, region -> ``region_state`` labels.
        * ``product_bottom_level`` -- daily-resampled frame with one column
          per ``region_state``, one per region, and ``"total"``.
        * ``region_states`` -- unique ``region_state`` labels as returned by
          ``Series.unique()``.

    Raises:
        ValueError: propagated from ``DataFrame.pivot`` when a
            ``(date, region_state)`` pair occurs more than once.
    """
    print('******************* Prepare Data **********************')

    # Take an explicit copy: the column added below must be written to an
    # independent frame, not to a filtered slice of ``df_raw`` (which makes
    # pandas emit SettingWithCopyWarning).
    product = df_raw[df_raw['item'] == "mens_clothing"].copy()
    print(product.head())

    # Vectorised label construction instead of a row-wise ``apply``.
    product["region_state"] = (
        product["region"].astype(str) + "_" + product["state"].astype(str)
    )
    region_states = product["region_state"].unique()

    region_values = product["region"].unique()
    regions = region_values.tolist()

    # Edges of the tree: root -> region, then region -> region_state.
    root_node = "total"
    edges = [(root_node, region) for region in region_values]
    edges += list(product.groupby(["region", "region_state"]).groups.keys())

    # Each key is a node of the hierarchy, each value the list of its children.
    hierarchy = {}
    for parent, child in edges:
        hierarchy.setdefault(parent, []).append(child)

    product_bottom_level = product.pivot(
        index="date", columns="region_state", values="quantity"
    )

    # Add one aggregated column per region.
    # NOTE(review): get_region_columns is defined elsewhere; presumably it
    # returns the bottom-level columns belonging to ``region`` -- confirm.
    # It is called on the frame as it grows, exactly as before.
    for region in regions:
        region_cols = get_region_columns(product_bottom_level, region)
        product_bottom_level[region] = product_bottom_level[region_cols].sum(axis=1)
    product_bottom_level["total"] = product_bottom_level[regions].sum(axis=1)

    # Resampling requires a datetime-like index.
    product_bottom_level.index = pd.to_datetime(product_bottom_level.index)
    product_bottom_level = product_bottom_level.resample("D").sum()

    print('******************* End Prepare Data **********************')
    return hierarchy, product_bottom_level, region_states