in torchbenchmark/models/nvidia_deeprecommender/data_utils/netflix_data_convert.py [0:0]
import datetime
import random
import time


def create_NETFLIX_data_timesplit(all_data,
                                   train_min,
                                   train_max,
                                   test_min,
                                   test_max):
    """
    Creates a time-based split of NETFLIX data into train and (validation, test) sets.
    :param all_data: dict mapping userId to a list of rating tuples; each tuple's first
        element is the item id and its third element is the rating timestamp
        (seconds since the epoch)
    :param train_min: start date of the training window, "YYYY-MM-DD"
    :param train_max: end date of the training window, "YYYY-MM-DD"
    :param test_min: start date of the (validation, test) window, "YYYY-MM-DD"
    :param test_max: end date of the (validation, test) window, "YYYY-MM-DD"
    :return: (training_data, validation_data, test_data) dicts keyed by userId
    """
    train_min_ts = time.mktime(datetime.datetime.strptime(train_min, "%Y-%m-%d").timetuple())
    train_max_ts = time.mktime(datetime.datetime.strptime(train_max, "%Y-%m-%d").timetuple())
    test_min_ts = time.mktime(datetime.datetime.strptime(test_min, "%Y-%m-%d").timetuple())
    test_max_ts = time.mktime(datetime.datetime.strptime(test_max, "%Y-%m-%d").timetuple())

    training_data = dict()
    validation_data = dict()
    test_data = dict()
    train_set_items = set()
    for userId, userRatings in all_data.items():
        time_sorted_ratings = sorted(userRatings, key=lambda x: x[2])  # sort by timestamp
        for rating_item in time_sorted_ratings:
            if train_min_ts <= rating_item[2] <= train_max_ts:
                if userId not in training_data:
                    training_data[userId] = []
                training_data[userId].append(rating_item)
                train_set_items.add(rating_item[0])  # keep track of items from training set
            elif test_min_ts <= rating_item[2] <= test_max_ts:
                if userId not in training_data:  # only include users seen in the training set
                    continue
                # split the test-window ratings roughly 50/50 between validation and test
                p = random.random()
                if p <= 0.5:
                    if userId not in validation_data:
                        validation_data[userId] = []
                    validation_data[userId].append(rating_item)
                else:
                    if userId not in test_data:
                        test_data[userId] = []
                    test_data[userId].append(rating_item)
    # remove items not seen in the training set from the validation and test data
    for userId, userRatings in test_data.items():
        test_data[userId] = [rating for rating in userRatings if rating[0] in train_set_items]
    for userId, userRatings in validation_data.items():
        validation_data[userId] = [rating for rating in userRatings if rating[0] in train_set_items]

    return training_data, validation_data, test_data
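
A minimal usage sketch (not part of the original file): the user/item ids, ratings and dates below are hypothetical, and the (itemId, rating, timestamp) tuple layout is inferred from the indexing above; it assumes the time/datetime imports added at the top of the excerpt.

def _ts(date_str):
    # hypothetical helper: "YYYY-MM-DD" -> Unix timestamp, mirroring the parsing above
    return time.mktime(datetime.datetime.strptime(date_str, "%Y-%m-%d").timetuple())

# toy input: userId -> list of (itemId, rating, timestamp) tuples
toy_data = {
    1: [(10, 5.0, _ts("2005-03-01")), (11, 3.0, _ts("2005-11-20"))],
    2: [(11, 4.0, _ts("2005-04-02")), (10, 2.0, _ts("2005-12-01"))],
}

train, valid, test = create_NETFLIX_data_timesplit(
    toy_data,
    train_min="2005-01-01", train_max="2005-09-30",
    test_min="2005-10-01", test_max="2005-12-31")

# both items also appear in the training window, so the test-window ratings survive
# the final "seen in training" filter and land randomly in valid or test
print(len(train), "train users,", len(valid), "validation users,", len(test), "test users")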