in scripts/preprocess.py [0:0]
def _build_sparse_vocab(train_path, num_dense, num_sparse, rare_threshold):
    """Scan the raw file once and assign ids to frequent categorical values.

    Returns a tuple ``(vocab, field_sizes, num_rows)``:

    * ``vocab[k]`` maps each raw value seen in sparse field ``k`` to a
      two-item list ``[local_id, count]``.  ``local_id`` stays ``1`` (the
      shared "rare" bucket) until the value has been seen more than
      ``rare_threshold`` times, at which point it receives the next free id
      of that field.
    * ``field_sizes[k]`` is the number of distinct ids used by field ``k``
      (the rare bucket plus one per frequent value).
    * ``num_rows`` is the number of lines read.
    """
    vocab = [{} for _ in range(num_sparse)]
    field_sizes = [1] * num_sparse
    first_sparse_col = num_dense + 1  # column 0 holds the label
    num_rows = 0
    with open(train_path, 'r') as raw:
        for line in raw:
            num_rows += 1
            if num_rows % 100000 == 0:
                print('now train cnt : %d\n' % num_rows)
            columns = line.strip('\n').split('\t')
            for field in range(num_sparse):
                token = columns[first_sparse_col + field]
                seen = vocab[field]
                stats = seen.get(token)
                if stats is None:
                    stats = seen[token] = [1, 0]
                stats[1] += 1
                # Promote the value out of the rare bucket exactly once, the
                # first time its count exceeds the threshold.
                if stats[0] == 1 and stats[1] > rare_threshold:
                    field_sizes[field] += 1
                    stats[0] = field_sizes[field]
    return vocab, field_sizes, num_rows


def preprocess_criteo(datafile, num_dense=13, num_sparse=26, rare_threshold=10):
    """Convert ``<datafile>/train.txt`` into value / index / label files.

    The input is read as tab-separated lines laid out as: column 0 the label,
    the next ``num_dense`` columns numerical features, and the following
    ``num_sparse`` columns categorical features.  Three files are written
    into the same directory, one line per input line:

    * ``train_x.txt`` -- feature values, space separated.  Numerical columns
      are copied verbatim; categorical columns become ``1``.  An empty cell
      of either kind becomes ``0``.
    * ``train_i.txt`` -- global feature indices, space separated.  Numerical
      column ``i`` (0-based) gets index ``i + 1``.  Each categorical field
      owns a contiguous index range placed after the numerical ones; values
      seen at most ``rare_threshold`` times share the first index of their
      field's range.
    * ``train_y.txt`` -- the label column.

    Args:
        datafile: Directory that contains ``train.txt`` and receives the
            output files.
        num_dense: Number of numerical feature columns.
        num_sparse: Number of categorical feature columns.
        rare_threshold: A categorical value needs strictly more occurrences
            than this to receive its own index.

    Raises:
        IndexError: If a line has fewer columns than
            ``1 + num_dense + num_sparse``.
        OSError: If ``train.txt`` cannot be read or an output cannot be
            written.
    """
    train_path = os.path.join(datafile, 'train.txt')

    # Pass 1: count categorical values and hand out per-field ids.
    vocab, field_sizes, num_rows = _build_sparse_vocab(
        train_path, num_dense, num_sparse, rare_threshold)
    # NOTE: the total is reported as (lines read - 1); kept unchanged so the
    # log output stays identical to earlier runs.
    print('total entries :%d\n' % (num_rows - 1))

    # calculate number of category features of every dimension
    kinds = [num_dense] + field_sizes
    print('number of dimensions : %d' % (len(kinds)-1))
    print(kinds)

    # Running total: afterwards kinds[k] is the offset added to the local ids
    # of sparse field k, so every field gets a disjoint global index range.
    for pos in range(1, len(kinds)):
        kinds[pos] += kinds[pos - 1]
    print(kinds)

    # Indices of the numerical columns never change, so build them once.
    dense_indices = [str(col + 1) for col in range(num_dense)]
    first_sparse_col = num_dense + 1

    # Pass 2: re-read the input and emit the encoded rows.  Outputs are only
    # opened here, after pass 1 succeeded, and every handle is closed even if
    # a malformed row raises.
    print('remake training data...\n')
    with open(train_path, 'r') as raw, \
            open(os.path.join(datafile, 'train_x.txt'), 'w') as f_train_value, \
            open(os.path.join(datafile, 'train_i.txt'), 'w') as f_train_index, \
            open(os.path.join(datafile, 'train_y.txt'), 'w') as f_train_label:
        for row_no, line in enumerate(raw, 1):
            if row_no % 100000 == 0:
                print('now train cnt : %d\n' % row_no)
            columns = line.strip('\n').split('\t')
            label = str(columns[0])

            # Explicit indexing (not slicing) so a short row fails loudly.
            values = [columns[col + 1] or '0' for col in range(num_dense)]
            indices = list(dense_indices)
            for field in range(num_sparse):
                token = columns[first_sparse_col + field]
                values.append('1' if token != '' else '0')
                indices.append(str(vocab[field][token][0] + kinds[field]))

            f_train_value.write(' '.join(values) + '\n')
            f_train_index.write(' '.join(indices) + '\n')
            f_train_label.write(label + '\n')