in tensorflow_examples/lite/model_maker/core/data_util/text_dataloader.py [0:0]
@classmethod
def from_folder(cls,
                filename,
                model_spec='average_word_vec',
                is_training=True,
                class_labels=None,
                shuffle=True,
                cache_dir=None):
"""Loads text with labels and preproecess text according to `model_spec`.
Assume the text data of the same label are in the same subdirectory. each
file is one text.
Args:
filename: Name of the file.
model_spec: Specification for the model.
is_training: Whether the loaded data is for training or not.
class_labels: Class labels that should be considered. Name of the
subdirectory not in `class_labels` will be ignored. If None, all the
subdirectories will be considered.
shuffle: boolean, if shuffle, random shuffle data.
cache_dir: The cache directory to save preprocessed data. If None,
generates a temporary directory to cache preprocessed data.
Returns:
TextDataset containing text, labels and other related info.
"""
  model_spec = ms.get(model_spec)
  data_root = os.path.abspath(filename)
  folder_name = os.path.basename(data_root)

  is_cached, tfrecord_file, meta_data_file, vocab_file = cls._get_cache_info(
      cache_dir, folder_name, model_spec, is_training)
  # If cached, directly loads data from cache directory.
  if is_cached:
    return cls._load_data(tfrecord_file, meta_data_file, model_spec)

  # Gets paths of all text.
  if class_labels:
    all_text_paths = []
    for class_label in class_labels:
      all_text_paths.extend(
          list(
              tf.io.gfile.glob(os.path.join(data_root, class_label) + r'/*')))
  else:
    all_text_paths = list(tf.io.gfile.glob(data_root + r'/*/*'))

  all_text_size = len(all_text_paths)
  if all_text_size == 0:
    raise ValueError('Text size is zero')

  if shuffle:
    random.shuffle(all_text_paths)

  # Gets label and its index.
  if class_labels:
    label_names = sorted(class_labels)
  else:
    label_names = sorted(
        name for name in os.listdir(data_root)
        if os.path.isdir(os.path.join(data_root, name)))

  # Generates text examples from folder.
  examples = []
  for i, path in enumerate(all_text_paths):
    with tf.io.gfile.GFile(path, 'r') as f:
      text = f.read()
    guid = '%s-%d' % (folder_name, i)
    label = os.path.basename(os.path.dirname(path))
    examples.append(classifier_data_lib.InputExample(guid, text, None, label))

  # Saves preprocessed data and other assets into files.
  cls._save_data(examples, model_spec, label_names, tfrecord_file,
                 meta_data_file, vocab_file, is_training)

  # Loads data from cache directory.
  return cls._load_data(tfrecord_file, meta_data_file, model_spec)
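
# Illustrative usage sketch, not part of the module above. It assumes a folder
# layout like `movie_reviews/pos/*.txt` and `movie_reviews/neg/*.txt`; the
# directory name, class names, and the `TextClassifierDataLoader` class name
# are assumptions for the example rather than facts taken from this excerpt.
#
#   data = TextClassifierDataLoader.from_folder(
#       filename='movie_reviews',
#       model_spec='average_word_vec',
#       is_training=True,
#       class_labels=['pos', 'neg'],
#       shuffle=True)
#
#   # A second call with the same `cache_dir` and arguments returns the data
#   # loaded from the cached TFRecord/metadata files instead of re-reading and
#   # re-preprocessing the raw text files.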