in tensorflow_examples/lite/model_maker/core/data_util/recommendation_dataloader.py
def from_movielens(cls,
data_dir,
data_tag,
input_spec: recommendation_config.InputSpec,
generated_examples_dir=None,
min_timeline_length=3,
max_context_length=10,
max_context_movie_genre_length=10,
min_rating=None,
train_data_fraction=0.9,
build_vocabs=True,
train_filename='train_movielens_1m.tfrecord',
test_filename='test_movielens_1m.tfrecord',
vocab_filename='movie_vocab.json',
meta_filename='meta.json'):
"""Generates data loader from movielens dataset.
The method downloads and prepares dataset, then generates for train/eval.
For `movielens` data format, see:
- function `_generate_fake_data` in `recommendation_testutil.py`
- Or, zip file: http://files.grouplens.org/datasets/movielens/ml-1m.zip
Args:
data_dir: str, path to dataset containing (unzipped) text data.
data_tag: str, specify dataset in {'train', 'test'}.
input_spec: InputSpec, specify data format for input and embedding.
generated_examples_dir: str, path to generate preprocessed examples.
(default: same as data_dir)
min_timeline_length: int, min timeline length to split train/eval set.
max_context_length: int, max context length as one input.
max_context_movie_genre_length: int, max context length of movie genre as
one input.
min_rating: int or None, include examples with min rating.
train_data_fraction: float, percentage of training data [0.0, 1.0].
build_vocabs: boolean, whether to build vocabs.
train_filename: str, generated file name for training data.
test_filename: str, generated file name for test data.
vocab_filename: str, generated file name for vocab data.
meta_filename: str, generated file name for meta data.
Returns:
Data Loader.
"""
if data_tag not in ('train', 'test'):
raise ValueError(
"Expected data_tag to be 'train' or 'test', but got {}".format(data_tag))
if not generated_examples_dir:
# By default, set generated examples dir to data_dir
generated_examples_dir = data_dir
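# generate_movielens_dataset() preprocesses the raw MovieLens data into
# TFRecord example files and returns a metadata dict containing the generated
# file paths and example counts used below.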
meta = cls.generate_movielens_dataset(
data_dir,
generated_examples_dir,
train_filename=train_filename,
test_filename=test_filename,
vocab_filename=vocab_filename,
meta_filename=meta_filename,
min_timeline_length=min_timeline_length,
max_context_length=max_context_length,
max_context_movie_genre_length=max_context_movie_genre_length,
min_rating=min_rating,
train_data_fraction=train_data_fraction,
build_vocabs=build_vocabs,
)
vocab = cls.load_vocab(meta['vocab_file'])
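# Read the generated examples for the requested split and wrap them, together
# with the split size and the vocab, in a data loader.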
if data_tag == 'train':
ds = cls._read_dataset(meta['train_file'], input_spec,
generated_examples_dir)
return cls(ds, meta['train_size'], vocab)
elif data_tag == 'test':
ds = cls._read_dataset(meta['test_file'], input_spec,
generated_examples_dir)
return cls(ds, meta['test_size'], vocab)
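# Illustrative usage sketch, assuming the enclosing class is
# `RecommendationDataLoader` and that `data_dir` already holds the unzipped
# MovieLens ml-1m text files; building a concrete
# `recommendation_config.InputSpec` is model-specific and is left to the caller.
def _example_usage(data_dir: str,
                   input_spec: recommendation_config.InputSpec):
  """Loads the train/test splits from an already-prepared ml-1m directory."""
  train_data = RecommendationDataLoader.from_movielens(
      data_dir=data_dir,
      data_tag='train',
      input_spec=input_spec)
  test_data = RecommendationDataLoader.from_movielens(
      data_dir=data_dir,
      data_tag='test',
      input_spec=input_spec)
  return train_data, test_data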