in tensorflow_datasets/structured/movielens_parsing.py [0:0]
def parse_100k_ratings_data(
    dir_path: str) -> Iterator[Tuple[int, Dict[str, Any]]]:
  """Yields one joined example per row of the 100k ratings file.

  Reads `u.user` and `u.data` from `dir_path` and merges every rating with the
  movie features produced by `parse_100k_movies_data` and with the features of
  the user who gave the rating.

  Args:
    dir_path: Directory containing the `u.user` and `u.data` files.

  Yields:
    `(row_num, example)` tuples. `row_num` is the zero-based index of the line
    in `u.data`; `example` holds the rating fields (`user_id`, `user_rating`,
    `timestamp`) followed by the movie features and then the user features.

  Raises:
    KeyError: If a rating references a movie or user id that was not parsed, or
      a user's occupation is missing from the conversion table.
    ValueError: If a line does not split into the expected number of fields, or
      a user's age is not an integer.
  """
  # Movie features keyed by movie id, for joining onto each rating row.
  movie_info_map = {
      movie['movie_id']: movie
      for _, movie in parse_100k_movies_data(dir_path)
  }

  # Lower bounds of the age buckets, in ascending order. Ages below the first
  # bound fall into bucket 1.
  age_bounds = (18, 25, 35, 45, 50, 56)

  # Converts 100K occupation labels to the standardized labels shared by the
  # 100k and 1m datasets. Some labels are merged according to the labels in the
  # 1m dataset.
  standardized_occupation = {
      'administrator': 'clerical/admin',
      'artist': 'artist',
      'doctor': 'doctor/health care',
      'educator': 'academic/educator',
      'engineer': 'technician/engineer',
      'entertainment': 'entertainment',
      'executive': 'executive/managerial',
      'healthcare': 'doctor/health care',
      'homemaker': 'homemaker',
      'lawyer': 'lawyer',
      'librarian': 'librarian',
      'marketing': 'sales/marketing',
      'none': 'other/not specified',
      'other': 'other/not specified',
      'programmer': 'programmer',
      'retired': 'retired',
      'salesman': 'sales/marketing',
      'scientist': 'scientist',
      'student': 'student',
      'technician': 'technician/engineer',
      'writer': 'writer',
  }

  # User features keyed by user id (kept as the string read from the file).
  user_info_map = {}
  with tf.io.gfile.GFile(
      os.path.join(dir_path, 'u.user'), mode='rb') as user_rows:
    for raw_row in user_rows:
      fields = codecs.decode(
          raw_row, encoding='ISO-8859-1').strip().split('|')
      user_id, age_str, gender_str, occupation_str, zip_code = fields
      raw_age = int(age_str)
      # The bucket is the largest bound that does not exceed the age.
      bucketized_age = max(
          (bound for bound in age_bounds if bound <= raw_age), default=1)
      user_info_map[user_id] = {
          'user_gender': gender_str == 'M',
          'bucketized_user_age': bucketized_age,
          'raw_user_age': raw_age,
          'user_occupation_label': standardized_occupation[occupation_str],
          'user_occupation_text': occupation_str,
          'user_zip_code': zip_code,
      }

  with tf.io.gfile.GFile(
      os.path.join(dir_path, 'u.data'), mode='rb') as rating_rows:
    for row_num, raw_row in enumerate(rating_rows):
      fields = codecs.decode(
          raw_row, encoding='ISO-8859-1').strip().split('\t')
      user_id, movie_id, rating, timestamp = fields
      # Look up the movie before the user so a missing movie id is reported
      # first.
      movie_info = movie_info_map[movie_id]
      user_info = user_info_map[user_id]
      # Later entries win on key clashes: movie features override the rating
      # fields, and user features override both.
      yield row_num, {
          'user_id': user_id,
          'user_rating': rating,
          'timestamp': timestamp,
          **movie_info,
          **user_info,
      }