in tensorflow_examples/lite/model_maker/core/data_util/text_dataloader.py [0:0]
@classmethod
def from_folder(cls,
                filename,
                model_spec='average_word_vec',
                is_training=True,
                class_labels=None,
                shuffle=True,
                cache_dir=None):
"""Loads text with labels and preproecess text according to `model_spec`.
Assume the text data of the same label are in the same subdirectory. each
file is one text.
Args:
filename: Name of the file.
model_spec: Specification for the model.
is_training: Whether the loaded data is for training or not.
class_labels: Class labels that should be considered. Name of the
subdirectory not in `class_labels` will be ignored. If None, all the
subdirectories will be considered.
shuffle: boolean, if shuffle, random shuffle data.
cache_dir: The cache directory to save preprocessed data. If None,
generates a temporary directory to cache preprocessed data.
Returns:
TextDataset containing text, labels and other related info.
"""
  model_spec = ms.get(model_spec)
  data_root = os.path.abspath(filename)
  folder_name = os.path.basename(data_root)

  is_cached, tfrecord_file, meta_data_file, vocab_file = cls._get_cache_info(
      cache_dir, folder_name, model_spec, is_training)
  # If cached, directly loads data from cache directory.
  if is_cached:
    return cls._load_data(tfrecord_file, meta_data_file, model_spec)

  # Gets paths of all text.
  if class_labels:
    all_text_paths = []
    for class_label in class_labels:
      all_text_paths.extend(
          list(
              tf.io.gfile.glob(os.path.join(data_root, class_label) + r'/*')))
  else:
    all_text_paths = list(tf.io.gfile.glob(data_root + r'/*/*'))

  all_text_size = len(all_text_paths)
  if all_text_size == 0:
    raise ValueError('Text size is zero')

  if shuffle:
    random.shuffle(all_text_paths)

  # Gets label and its index.
  if class_labels:
    label_names = sorted(class_labels)
  else:
    label_names = sorted(
        name for name in os.listdir(data_root)
        if os.path.isdir(os.path.join(data_root, name)))

  # Generates text examples from folder.
  examples = []
  for i, path in enumerate(all_text_paths):
    with tf.io.gfile.GFile(path, 'r') as f:
      text = f.read()
    guid = '%s-%d' % (folder_name, i)
    label = os.path.basename(os.path.dirname(path))
    examples.append(classifier_data_lib.InputExample(guid, text, None, label))

  # Saves preprocessed data and other assets into files.
  cls._save_data(examples, model_spec, label_names, tfrecord_file,
                 meta_data_file, vocab_file, is_training)

  # Loads data from cache directory.
  return cls._load_data(tfrecord_file, meta_data_file, model_spec)
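
# Illustrative usage sketch, not part of the module above. It assumes a folder
# layout like `movie_reviews/pos/*.txt` and `movie_reviews/neg/*.txt`; the
# directory name, class names, and the `TextClassifierDataLoader` class name
# are assumptions for the example rather than facts taken from this excerpt.
#
#   data = TextClassifierDataLoader.from_folder(
#       filename='movie_reviews',
#       model_spec='average_word_vec',
#       is_training=True,
#       class_labels=['pos', 'neg'],
#       shuffle=True)
#
#   # A second call with the same `cache_dir` and arguments returns the data
#   # loaded from the cached TFRecord/metadata files instead of re-reading and
#   # re-preprocessing the raw text files.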