def parse_100k_ratings_data()

in tensorflow_datasets/structured/movielens_parsing.py [0:0]


def parse_100k_ratings_data(
    dir_path: str) -> Iterator[Tuple[int, Dict[str, Any]]]:
  """Yields 100k rating examples joined with movie and user features.

  Movie features are taken from `parse_100k_movies_data(dir_path)`. User
  features are read from the pipe-separated `u.user` file and ratings from the
  tab-separated `u.data` file, both inside `dir_path` and both decoded as
  ISO-8859-1.

  Args:
    dir_path: Directory holding `u.user` and `u.data`; it is also forwarded to
      `parse_100k_movies_data`.

  Yields:
    `(row_num, example)` pairs. `row_num` is the zero-based index of the line
    in `u.data`. `example` contains `user_id`, `user_rating` and `timestamp`
    (all kept as the strings read from the file), followed by the features of
    the rated movie and then the features of the rating user.

  Raises:
    KeyError: If a rating refers to an unknown movie or user id, or a user has
      an occupation that is missing from the conversion table.
    ValueError: If a line does not have the expected number of fields or a
      user's age is not an integer.
  """
  # Index movie features by id; a later entry with the same id wins.
  movies_by_id = {
      movie['movie_id']: movie
      for _, movie in parse_100k_movies_data(dir_path)
  }

  # Ascending lower bounds of the age buckets. Ages below the first bound fall
  # into bucket 1.
  age_lower_bounds = (18, 25, 35, 45, 50, 56)
  # Converts 100K occupation labels to standardized labels that are consistent
  # across the 100k and 1m datasets. Some labels are merged according to the
  # labels in the 1m dataset.
  standardized_occupations = {
      'administrator': 'clerical/admin',
      'artist': 'artist',
      'doctor': 'doctor/health care',
      'educator': 'academic/educator',
      'engineer': 'technician/engineer',
      'entertainment': 'entertainment',
      'executive': 'executive/managerial',
      'healthcare': 'doctor/health care',
      'homemaker': 'homemaker',
      'lawyer': 'lawyer',
      'librarian': 'librarian',
      'marketing': 'sales/marketing',
      'none': 'other/not specified',
      'other': 'other/not specified',
      'programmer': 'programmer',
      'retired': 'retired',
      'salesman': 'sales/marketing',
      'scientist': 'scientist',
      'student': 'student',
      'technician': 'technician/engineer',
      'writer': 'writer',
  }

  users_by_id = {}
  with tf.io.gfile.GFile(
      os.path.join(dir_path, 'u.user'), mode='rb') as users_file:
    for raw_line in users_file:
      record = codecs.decode(raw_line, encoding='ISO-8859-1').strip()
      user_id, age_text, gender, occupation, zip_code = record.split('|')
      age = int(age_text)
      # The bounds are ascending, so the largest bound not exceeding the age
      # is the bucket; `default` covers ages below every bound.
      age_bucket = max(
          (bound for bound in age_lower_bounds if age >= bound), default=1)
      occupation_label = standardized_occupations[occupation]
      users_by_id[user_id] = {
          'user_gender': gender == 'M',
          'bucketized_user_age': age_bucket,
          'raw_user_age': age,
          'user_occupation_label': occupation_label,
          'user_occupation_text': occupation,
          'user_zip_code': zip_code,
      }

  with tf.io.gfile.GFile(
      os.path.join(dir_path, 'u.data'), mode='rb') as ratings_file:
    for row_num, raw_line in enumerate(ratings_file):
      record = codecs.decode(raw_line, encoding='ISO-8859-1').strip()
      user_id, movie_id, rating, timestamp = record.split('\t')
      movie_features = movies_by_id[movie_id]
      user_features = users_by_id[user_id]
      # Later entries override earlier ones on key clashes: movie features
      # first, then user features.
      yield row_num, {
          'user_id': user_id,
          'user_rating': rating,
          'timestamp': timestamp,
          **movie_features,
          **user_features,
      }