in scripts/preprocess.py [0:0]
def preprocess_kdd(datafile):
    #coding=utf-8
    #Email of the author: zjduan@pku.edu.cn
    """Encode ``./training.txt`` into parallel index / value / label files.

    Reads two tab-separated files from the current working directory:

    * ``./userid_profile.txt`` -- ``UserID <TAB> Gender <TAB> Age``
    * ``./training.txt`` -- one row per line with columns 0..11 below

    and writes one line per training row to each of

    * ``./train_i.txt`` -- 13 space-separated feature indices
    * ``./train_x.txt`` -- 13 space-separated feature values
    * ``./train_y.txt`` -- the binarised label (``0`` or ``1``)

    Fields (position ``k`` in the list maps to output column ``k - 1``)::

        0. Click: label; any value other than '0' is written as '1'
        1. Impression (numerical)
        2. DisplayURL: (categorical)
        3. AdID: (categorical)
        4. AdvertiserID: (categorical)
        5. Depth: (numerical)
        6. Position: (numerical)
        7. QueryID: (categorical) the key of the data file 'queryid_tokensid.txt'.
        8. KeywordID: (categorical) the key of 'purchasedkeyword_tokensid.txt'.
        9. TitleID: (categorical) the key of 'titleid_tokensid.txt'.
        10. DescriptionID: (categorical) the key of 'descriptionid_tokensid.txt'.
        11. UserID: (categorical) the key of 'userid_profile.txt'
        12. User's Gender: (categorical)
        13. User's Age: (categorical)

    Encoding rules:

    * Numerical fields keep a fixed index; the value is the raw string, or
      ``'0'`` when the column is empty.
    * Every categorical field has one shared default index.  A categorical
      token receives its own index on its 10th occurrence.  The file is
      processed in a single pass, so rows written before that 10th
      occurrence keep the shared index.
    * Gender and age are looked up through the row's UserID; unknown or
      empty user ids produce index 0 / value ``'0'`` for both columns.

    Progress and summary statistics are printed to stdout.

    Args:
        datafile: Not read by this function; all input and output paths are
            hard-coded relative to the current working directory.

    Returns:
        None.

    Raises:
        IOError / OSError: if an input file cannot be opened.
        IndexError: if a profile row has fewer than 3 columns or a training
            row has fewer than 12.
        ValueError: if a non-empty numerical column is not an integer.
    """
    debug = False  # when True, stop after 10000 training rows
    tune = False   # when True, only build the vocabulary; write no rows

    field_count = 13
    numeric_fields = [1, 5, 6]
    categorical_fields = [2, 3, 4, 7, 8, 9, 10, 11]
    user_id_field = 11
    gender_field = 12
    age_field = 13
    min_occurrences = 10  # occurrences needed before a token gets its own index

    numeric_set = set(numeric_fields)

    # Reserve one index per numerical field, then one shared default index
    # per categorical field.  Gender and age get no reserved slot, so their
    # default index stays 0.
    next_index = 0
    default_index = [0] * (field_count + 1)
    for field in numeric_fields + categorical_fields:
        default_index[field] = next_index
        next_index += 1

    # vocab[field][token] == [assigned_index, occurrence_count]
    vocab = dict((field, {}) for field in range(field_count + 1))
    max_seen = dict((field, 0) for field in numeric_fields)

    # One ``with`` for all five handles guarantees each is closed even when a
    # malformed row raises part-way through; inputs are opened before outputs
    # so a missing input fails before any output file is created.
    with open('./training.txt', 'r') as train_file, \
            open('./userid_profile.txt', 'r') as profile_file, \
            open('./train_x.txt', 'w') as value_out, \
            open('./train_i.txt', 'w') as index_out, \
            open('./train_y.txt', 'w') as label_out:

        ### init user profiles: every distinct gender / age gets an index
        user_profile = {}
        for line_no, line in enumerate(profile_file, 1):
            if line_no % 1000000 == 0:
                print("cnt_line = %d, index_cnt = %d" % (line_no, next_index))
            cols = line.strip('\n').split('\t')
            gender = cols[1]
            age = cols[2]
            user_profile[cols[0]] = [gender, age]
            if gender not in vocab[gender_field]:
                vocab[gender_field][gender] = [next_index, 0]
                next_index += 1
            if age not in vocab[age_field]:
                vocab[age_field][age] = [next_index, 0]
                next_index += 1

        for line_no, line in enumerate(train_file, 1):
            if line_no % 1000000 == 0:
                print("cnt_line = %d, index_cnt = %d" % (line_no, next_index))
            if debug and line_no >= 10000:
                break
            cols = line.strip('\n').split('\t')

            # Update occurrence counts first, so the row that brings a token
            # to its 10th occurrence is already written with the new index.
            for field in categorical_fields:
                token = cols[field]
                if token == '':
                    continue
                entry = vocab[field].get(token)
                if entry is None:
                    entry = [default_index[field], 0]
                    vocab[field][token] = entry
                entry[1] += 1
                if (entry[0] == default_index[field]
                        and entry[1] == min_occurrences):
                    entry[0] = next_index
                    next_index += 1

            if tune:
                continue

            label = '0' if cols[0] == '0' else '1'
            index = [0] * field_count
            value = ['0'] * field_count
            for field in range(1, 12):
                token = cols[field]
                slot = field - 1
                if field in numeric_set:
                    # The index is set even when the column is empty.
                    index[slot] = default_index[field]
                    if token != '':
                        value[slot] = token
                        max_seen[field] = max(int(token), max_seen[field])
                elif token != '':
                    index[slot] = vocab[field][token][0]
                    value[slot] = '1'

            user_id = cols[user_id_field]
            # A UserID of '0' keeps its index but is written with value '0'.
            if user_id == '0':
                value[user_id_field - 1] = '0'

            ### gender and age
            profile = user_profile.get(user_id) if user_id != '' else None
            if profile is None:
                index[gender_field - 1] = default_index[gender_field]
                value[gender_field - 1] = '0'
                index[age_field - 1] = default_index[age_field]
                value[age_field - 1] = '0'
            else:
                index[gender_field - 1] = vocab[gender_field][profile[0]][0]
                value[gender_field - 1] = '1'
                index[age_field - 1] = vocab[age_field][profile[1]][0]
                value[age_field - 1] = '1'

            index_out.write(' '.join(str(i) for i in index) + '\n')
            value_out.write(' '.join(value) + '\n')
            label_out.write(label + '\n')

    print("Finished!")
    print("index_cnt = %d" % next_index)
    print("max number for numerical features:")
    for field in numeric_fields:
        print("no.:%d max: %d" % (field, max_seen[field]))