def produce_neg_item_hist_with_cate()

in tools/taobao_prepare.py [0:0]


def _parse_item_cate_hist(line):
    """Return the (item, cate) history pairs of one tab-separated sample line."""
    units = line.strip().split("\t")
    # Fields 4 and 5 hold the comma-separated item and category histories.
    return list(zip(units[4].split(","), units[5].split(",")))


def produce_neg_item_hist_with_cate(train_file, test_file,
                                    train_out=None, test_out=None):
    """Append randomly sampled negative item/category histories to each sample.

    Every input line is tab-separated; field 4 is a comma-separated item
    history and field 5 the matching category history.  For each line, up to
    ``len(history)`` (item, cate) pairs that do not occur in that line's own
    history are drawn (with replacement) from the pool of all pairs seen in
    both inputs.  The padding pair ``('0', '0')`` is never used as a negative.
    The stripped line is written back with two extra tab-separated columns:
    the negative items and the negative categories.

    Args:
        train_file: iterable of training sample lines (list, file handle, ...).
        test_file: iterable of test sample lines.
        train_out: object with ``write`` receiving the augmented training
            lines.  Defaults to the module-level ``Train_handle``.
        test_out: object with ``write`` receiving the augmented test lines.
            Defaults to the module-level ``Test_handle``.

    Raises:
        ValueError: if there are samples but no non-padding pair to draw from.

    Note:
        A line gets fewer negatives than its history length (possibly none)
        when the random draws for that line do not contain enough pairs
        outside its history.
    """
    # Only touch the module-level handles when the caller did not pass any.
    if train_out is None:
        train_out = Train_handle
    if test_out is None:
        test_out = Test_handle

    # Both inputs are traversed twice; materialise them so one-shot iterables
    # such as open file handles or generators are not exhausted by pass one.
    train_lines = list(train_file)
    test_lines = list(test_file)
    sample_count = len(train_lines) + len(test_lines)
    if sample_count == 0:
        return

    # Pass 1: collect every distinct (item, cate) pair and the longest history.
    padding = ("0", "0")
    seen_pairs = {}
    max_hist_len = 0
    for lines in (train_lines, test_lines):
        for line in lines:
            hist = _parse_item_cate_hist(line)
            max_hist_len = max(max_hist_len, len(hist))
            for pair in hist:
                seen_pairs.setdefault(pair, 0)

    # Tuples are kept as-is (no str()/eval() round trip); padding is dropped
    # without failing when the data happens to contain no padding at all.
    candidates = [pair for pair in seen_pairs if pair != padding]
    if not candidates:
        raise ValueError(
            "no non-padding (item, cate) pairs available for negative sampling")

    # Draw candidate indices rather than the pairs themselves: a small integer
    # matrix with 20 spare draws per row to absorb collisions with the history.
    draws = np.random.choice(len(candidates), (sample_count, max_hist_len + 20))

    # Pass 2: pick the negatives for each line and write the augmented line.
    row = 0
    for lines, out in ((train_lines, train_out), (test_lines, test_out)):
        for line in lines:
            hist = _parse_item_cate_hist(line)
            hist_set = set(hist)  # O(1) membership instead of a list scan
            wanted = len(hist)
            negatives = []
            for idx in draws[row].tolist():
                if len(negatives) == wanted:
                    break
                pair = candidates[idx]
                if pair not in hist_set:
                    negatives.append(pair)
            row += 1

            neg_items = ",".join(pair[0] for pair in negatives)
            neg_cates = ",".join(pair[1] for pair in negatives)
            out.write(line.strip() + "\t" + neg_items + "\t" + neg_cates + "\n")