tensorflow_text/tools/wordpiece_vocab/generate_vocab.py [130:155]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
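      # Analyze and transform the raw records: extract tokens and their language codes.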
      (transformed_data, _), _ = (
          (raw_data, raw_metadata)
          | 'FilterLangAndExtractToken' >> tft_beam.AnalyzeAndTransformDataset(
              utils.count_preprocessing_fn(FLAGS.text_key,
                                           FLAGS.language_code_key)))

      # Filter by languages.
      tokens = (
          transformed_data
          | 'FilterByLang' >> beam.ParDo(utils.FilterTokensByLang(lang_set)))

      # Calculate smoothing coefficients.
      coeffs = (
          tokens
          | 'CalculateSmoothingCoefficients' >> beam.CombineGlobally(
              utils.CalculateCoefficients(FLAGS.smoothing_exponent)))

      # Apply smoothing, aggregate counts, and sort words by count.
      _ = (
          tokens
          | 'ApplyExponentialSmoothing' >> beam.ParDo(
              utils.ExponentialSmoothing(), beam.pvalue.AsSingleton(coeffs))
          | 'SumCounts' >> beam.CombinePerKey(sum)
          | 'FilterLowCounts' >> beam.ParDo(utils.FilterByCount(
              FLAGS.max_word_length, min_token_frequency))
          | 'MergeAndSortCounts' >> beam.CombineGlobally(utils.SortByCount())
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
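
Both excerpts share the same counting pipeline: tokens are filtered by language, a per-language smoothing coefficient is derived from FLAGS.smoothing_exponent, counts are rescaled by that coefficient, summed per token, filtered by length and frequency, and sorted. The plain-Python sketch below approximates the smoothing idea for reference only; the internals of utils.CalculateCoefficients and utils.ExponentialSmoothing are assumed here (per-language weights proportional to the language's total count raised to the smoothing exponent), not taken from the actual implementation.

from collections import Counter

def smoothed_counts(token_lang_counts, smoothing_exponent=0.5):
  """Sketch of exponent-based language smoothing (assumed semantics).

  token_lang_counts: iterable of (token, lang, count) triples.
  """
  # Total token count per language.
  totals = Counter()
  for _, lang, count in token_lang_counts:
    totals[lang] += count

  # Assumed coefficient: count ** exponent, normalized across languages.
  powered = {lang: c ** smoothing_exponent for lang, c in totals.items()}
  norm = sum(powered.values())
  coeffs = {lang: p / norm for lang, p in powered.items()}

  # Rescale each count by its language coefficient, then aggregate per
  # token (the CombinePerKey(sum) stage) and sort by count.
  smoothed = Counter()
  for token, lang, count in token_lang_counts:
    smoothed[token] += count * coeffs[lang]
  return smoothed.most_common()

print(smoothed_counts([('hello', 'en', 100), ('hallo', 'de', 10)]))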



tensorflow_text/tools/wordpiece_vocab/generate_word_counts.py [77:102]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
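    # Analyze and transform the raw records: extract tokens and their language codes.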
    (transformed_data, _), _ = (
        (raw_data, raw_metadata)
        | 'FilterLangAndExtractToken' >> tft_beam.AnalyzeAndTransformDataset(
            utils.count_preprocessing_fn(FLAGS.text_key,
                                         FLAGS.language_code_key)))

    # Filter by languages.
    tokens = (
        transformed_data
        | 'FilterByLang' >> beam.ParDo(utils.FilterTokensByLang(lang_set)))

    # Calculate smoothing coefficients.
    coeffs = (
        tokens
        | 'CalculateSmoothingCoefficients' >> beam.CombineGlobally(
            utils.CalculateCoefficients(FLAGS.smoothing_exponent)))

    # Apply smoothing, aggregate counts, and sort words by count.
    _ = (
        tokens
        | 'ApplyExponentialSmoothing' >> beam.ParDo(
            utils.ExponentialSmoothing(), beam.pvalue.AsSingleton(coeffs))
        | 'SumCounts' >> beam.CombinePerKey(sum)
        | 'FilterLowCounts' >> beam.ParDo(utils.FilterByCount(
            FLAGS.max_word_length, min_token_frequency))
        | 'MergeAndSortCounts' >> beam.CombineGlobally(utils.SortByCount())
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
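
In both files the coefficients PCollection is handed to ExponentialSmoothing as a singleton side input via beam.pvalue.AsSingleton. The minimal, self-contained sketch below only illustrates that Beam side-input pattern with hypothetical names (ScaleByCoeff and toy data); it is not the actual utils.ExponentialSmoothing DoFn.

import apache_beam as beam


class ScaleByCoeff(beam.DoFn):
  """Toy DoFn: receives the side-input value as an extra process() arg."""

  def process(self, element, coeffs):
    token, count = element
    # `coeffs` is the single element of the side-input PCollection,
    # materialized and passed in by beam.pvalue.AsSingleton.
    yield token, count * coeffs.get(token, 1.0)


with beam.Pipeline() as p:
  counts = p | 'Counts' >> beam.Create([('hello', 3), ('hallo', 5)])
  coeffs = p | 'Coeffs' >> beam.Create([{'hello': 0.5, 'hallo': 0.8}])
  _ = (
      counts
      | 'Scale' >> beam.ParDo(ScaleByCoeff(), beam.pvalue.AsSingleton(coeffs))
      | 'Print' >> beam.Map(print))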



