def analyse_categorical_data()

in tools/visualize.py [0:0]


def analyse_categorical_data(X_cat, n_days=10, output_dir=""):
    """Plot how the distinct values of each categorical column split over time.

    The rows of ``X_cat`` are cut into ``n_days`` equally sized consecutive
    chunks. For every column and every split point ``d`` in
    ``1 .. n_days - 1`` the rows are divided into a "before" part (first
    ``d`` chunks) and an "after" part (the rest), and four counts are
    collected:

    * before    - number of distinct values in the "before" part
    * after     - number of distinct values in the "after" part
    * intersect - number of distinct values present in both parts
    * removed   - distinct values that occur only in the "before" part

    The counts are printed and, for each column ``i``, drawn into one
    figure saved as ``cat-<i, zero-padded to 3 digits>.png``.

    Args:
        X_cat: Sequence of rows, each row being a sequence with one value per
            categorical column. It is converted with ``np.array`` and indexed
            as a 2-D array, so rows are expected to have equal length.
        n_days: Number of chunks the rows are divided into. Must be >= 1.
        output_dir: Directory the figures are written to. An empty string
            means the current working directory. The directory is not
            created by this function.

    Raises:
        ValueError: If ``n_days`` is smaller than 1.
    """
    # Function-scope import: the module-level import block is not visible
    # from here, so the new dependency is brought in locally.
    import os

    if n_days < 1:
        # Previously n_days == 0 surfaced as an opaque ZeroDivisionError.
        raise ValueError("n_days must be >= 1, got " + str(n_days))

    n_vec = len(X_cat)
    if n_vec == 0:
        # No rows means no columns to inspect; X_cat[0] would raise IndexError.
        print("n_vec", 0, "n_cat", 0)
        return
    n_cat = len(X_cat[0])

    print("n_vec", n_vec, "n_cat", n_cat)

    all_cat = np.array(X_cat)
    print("all_cat.shape", all_cat.shape)
    # Fractional chunk size; each split offset is truncated to an int below.
    day_size = all_cat.shape[0] / n_days

    for i in range(n_cat):
        l_d = []
        l_s1 = []
        l_s2 = []
        l_int = []
        l_rem = []

        cat = all_cat[:, i]
        print("cat", i, cat.shape)
        for d in range(1, n_days):
            offset = int(d * day_size)

            s1 = set(cat[:offset])
            s2 = set(cat[offset:])

            n_before = len(s1)
            n_after = len(s2)
            n_common = len(s1 & s2)
            # Values seen before the split that never show up after it.
            n_removed = n_before - n_common

            l_d.append(d)
            l_s1.append(n_before)
            l_s2.append(n_after)
            l_int.append(n_common)
            l_rem.append(n_removed)

            print(d, ",", n_before, ",", n_after, ",", n_common, ",", n_removed)

        print("spit",    l_d)
        print("before",  l_s1)
        print("after",   l_s2)
        print("inters.", l_int)
        print("removed", l_rem)

        plt.figure(figsize=(8, 8))
        plt.plot(l_d, l_s1,  "g", label="before")
        plt.plot(l_d, l_s2,  "r", label="after")
        plt.plot(l_d, l_int, "b", label="intersect")
        plt.plot(l_d, l_rem, "y", label="removed")
        plt.title("categorical var. " + str(i))
        plt.legend()
        # os.path.join keeps the default output_dir="" relative to the current
        # directory instead of resolving to "/cat-NNN.png" at the filesystem root.
        plt.savefig(os.path.join(output_dir, "cat-" + str(i).zfill(3) + ".png"))
        plt.close()