in tools/visualize.py [0:0]
def analyse_categorical_data(X_cat, n_days=10, output_dir=""):
    """Report and plot how each categorical column's value set changes per split.

    The rows of ``X_cat`` are divided into ``n_days`` equally sized chunks.
    For every column and every split point ``d`` in ``1 .. n_days - 1`` the
    rows are cut into a "before" part (the first ``d`` chunks) and an "after"
    part (the remaining rows), and the following counts are collected:

    * number of distinct values before the split,
    * number of distinct values after the split,
    * number of distinct values present on both sides,
    * number of distinct values that occur only before the split ("removed").

    The counts are printed to stdout and, for each column ``i``, plotted
    against ``d`` and saved as ``cat-<i zero-padded to 3 digits>.png`` inside
    ``output_dir``.

    Args:
        X_cat: Two-dimensional data, one row per sample and one entry per
            categorical column (anything ``np.array`` turns into a 2-D
            array). Presumably rows are in chronological order so that the
            chunks correspond to days -- confirm against the caller.
        n_days: Number of equally sized chunks the rows are divided into.
        output_dir: Directory the plots are written to. An empty string
            means the current working directory.

    Returns:
        None. Results are printed and written as PNG files.
    """
    # Local import keeps this block self-contained regardless of what the
    # module imports at file level.
    import os

    n_vec = len(X_cat)
    # An empty input has no first row to measure; treat it as zero columns so
    # the function reports the sizes and returns instead of raising IndexError.
    n_cat = len(X_cat[0]) if n_vec else 0
    print("n_vec", n_vec, "n_cat", n_cat)

    all_cat = np.array(X_cat)
    print("all_cat.shape", all_cat.shape)
    # Fractional chunk size; split offsets are truncated to whole rows below.
    day_size = all_cat.shape[0] / n_days

    for i in range(n_cat):
        l_d = []
        l_s1 = []
        l_s2 = []
        l_int = []
        l_rem = []

        cat = all_cat[:, i]
        print("cat", i, cat.shape)

        for d in range(1, n_days):
            offset = int(d * day_size)
            s1 = set(cat[:offset])
            s2 = set(cat[offset:])

            n_before = len(s1)
            n_after = len(s2)
            n_common = len(s1 & s2)
            # Values seen before the split that never show up after it.
            n_removed = n_before - n_common

            l_d.append(d)
            l_s1.append(n_before)
            l_s2.append(n_after)
            l_int.append(n_common)
            l_rem.append(n_removed)

            print(d, ",", n_before, ",", n_after, ",", n_common, ",", n_removed)

        # NOTE: the "spit" label (sic) is kept unchanged in case anything
        # downstream parses this output.
        print("spit", l_d)
        print("before", l_s1)
        print("after", l_s2)
        print("inters.", l_int)
        print("removed", l_rem)

        plt.figure(figsize=(8, 8))
        plt.plot(l_d, l_s1, "g", label="before")
        plt.plot(l_d, l_s2, "r", label="after")
        plt.plot(l_d, l_int, "b", label="intersect")
        plt.plot(l_d, l_rem, "y", label="removed")
        plt.title("categorical var. " + str(i))
        plt.legend()
        # os.path.join avoids the leading "/" that plain concatenation yields
        # for the default output_dir="", which pointed at the filesystem root.
        plt.savefig(os.path.join(output_dir, "cat-" + str(i).zfill(3) + ".png"))
        plt.close()