def preprocess_criteo()

in scripts/preprocess.py [0:0]


def preprocess_criteo(datafile):
    """Convert a raw Criteo ``train.txt`` into value/index/label text files.

    ``<datafile>/train.txt`` is read twice. Every non-blank line is expected
    to hold 40 tab-separated fields: the label, 13 numerical columns and
    26 categorical columns.

    Pass 1 counts how often each categorical token occurs in its column. A
    token seen more than 10 times gets its own per-column index (2, 3, ...);
    all rarer tokens of a column share index 1. The empty string is counted
    like any other token.

    Pass 2 writes one line per input line to three files in ``datafile``:

    * ``train_x.txt`` -- space-separated feature values. Numerical columns
      keep their raw text (``'0'`` when empty); categorical columns become
      ``'1'`` (``'0'`` when empty).
    * ``train_i.txt`` -- space-separated feature indices. Numerical columns
      use 1..13; each categorical column uses its per-column index shifted
      by the cumulative size of everything before it, so indices are unique
      across columns.
    * ``train_y.txt`` -- the label field.

    Blank lines are skipped. Progress and the per-column sizes are printed
    to stdout.

    Args:
        datafile: Directory containing ``train.txt``; the three output files
            are created (or overwritten) in the same directory.

    Raises:
        IndexError: If a non-blank line has fewer than 40 fields.
        OSError: If ``train.txt`` cannot be read or an output cannot be
            written.
    """
    num_dense, num_sparse = 13, 26
    first_sparse = num_dense + 1  # field 0 is the label
    rare_threshold = 10  # tokens must occur more often than this to get an index
    rare_index = 1  # index shared by all infrequent tokens of a column

    train_path = os.path.join(datafile, "train.txt")

    # ---- Pass 1: build the per-column vocabulary of frequent tokens. ----
    token_count = [{} for _ in range(num_sparse)]
    token_index = [{} for _ in range(num_sparse)]
    # Highest index handed out so far in each column (1 == the rare bucket).
    last_index = [rare_index] * num_sparse

    cnt_train = 0
    with open(train_path, 'r') as f1:
        for line in f1:
            row = line.strip('\n')
            if not row:
                # e.g. a stray trailing newline; not a data entry.
                continue
            cnt_train += 1
            if cnt_train % 100000 == 0:
                print('now train cnt : %d\n' % cnt_train)
            fields = row.split('\t')
            for j in range(num_sparse):
                token = fields[first_sparse + j]
                seen = token_count[j].get(token, 0) + 1
                token_count[j][token] = seen
                # Promote the token the moment it crosses the threshold, so
                # indices follow the order in which tokens become frequent.
                if seen == rare_threshold + 1:
                    last_index[j] += 1
                    token_index[j][token] = last_index[j]
    print('total entries :%d\n' % cnt_train)

    # Size of each "dimension": the dense block, then every sparse column.
    kinds = [num_dense] + last_index
    print('number of dimensions : %d' % (len(kinds) - 1))
    print(kinds)

    # Turn sizes into cumulative sums; kinds[j] is then the offset to add to
    # the local indices of sparse column j.
    for i in range(1, len(kinds)):
        kinds[i] += kinds[i - 1]
    print(kinds)

    # ---- Pass 2: rewrite the data as value / index / label files. ----
    dense_indices = [str(i + 1) for i in range(num_dense)]

    print('remake training data...\n')
    cnt_train = 0
    # Outputs are opened only now, so a failure in pass 1 does not truncate
    # existing files, and `with` closes every handle even on error.
    with open(train_path, 'r') as f1, \
            open(os.path.join(datafile, 'train_x.txt'), 'w') as f_train_value, \
            open(os.path.join(datafile, 'train_i.txt'), 'w') as f_train_index, \
            open(os.path.join(datafile, 'train_y.txt'), 'w') as f_train_label:
        for line in f1:
            row = line.strip('\n')
            if not row:
                continue
            cnt_train += 1
            if cnt_train % 100000 == 0:
                print('now train cnt : %d\n' % cnt_train)
            fields = row.split('\t')

            entry = [fields[i + 1] if fields[i + 1] != '' else '0'
                     for i in range(num_dense)]
            index = list(dense_indices)
            for j in range(num_sparse):
                token = fields[first_sparse + j]
                entry.append('1' if token != '' else '0')
                local = token_index[j].get(token, rare_index)
                index.append(str(local + kinds[j]))

            f_train_value.write(' '.join(entry) + '\n')
            f_train_index.write(' '.join(index) + '\n')
            f_train_label.write(str(fields[0]) + '\n')