# def _run_stat()
#
# in spark_scripts/stat_for_ner_category_to_wh_words.py [0:0]


def _print_whxx_ngram_rows(whxx_ngrams, category_total, fptr):
    """Write one ``count= n / total (pct%) "ngram"`` line per entry to ``fptr``."""
    for whxx_ngram in whxx_ngrams:
        print('count= {:4d} / {:4d} ({:5.2f}%) "{}"'.format(
            whxx_ngram.count,
            category_total,
            100.0 * whxx_ngram.count / category_total,
            whxx_ngram.ngram), file=fptr)


def _run_stat(question_rdd, metric_fptr, metric_per_category_fptr, toml_fptr, output_dir,
              min_count=10, topk_per_category=10):
    """Aggregate leading n-gram counts per NER category and write the reports.

    The counts are produced on Spark (``flatMap`` + ``reduceByKey``), saved as
    text under ``<output_dir>/count_prdd``, collected to the driver, and then
    rendered into three outputs:

    * ``metric_fptr``: per category and per n-gram size, the ``TOPK`` most
      frequent n-grams.
    * ``metric_per_category_fptr``: per category, the ``topk_per_category``
      most frequent n-grams regardless of size.
    * ``toml_fptr``: ``toml.dumps`` of the dict filled by ``_add_to_toml``.

    Args:
        question_rdd: RDD passed to ``_compute_count_per_category`` and
            flat-mapped with ``_extract_leading_ngrams``.
        metric_fptr: writable text file object for the per-size report.
        metric_per_category_fptr: writable text file object for the
            per-category report.
        toml_fptr: writable text file object for the TOML dump.
        output_dir: directory under which ``count_prdd`` is saved.
        min_count: n-grams whose aggregated count is below this value are
            dropped (default 10, the previously hard-coded threshold).
        topk_per_category: number of rows per category in the per-category
            report (default 10, the previously hard-coded limit).
    """
    count_per_category = _compute_count_per_category(question_rdd)

    unit_count_prdd = question_rdd.flatMap(_extract_leading_ngrams)

    count_prdd = unit_count_prdd.reduceByKey(add).filter(lambda x: x[1] >= min_count)
    # Cached because the RDD is consumed by two actions (save + collect).
    count_prdd.cache()
    try:
        count_prdd.saveAsTextFile(os.path.join(output_dir, 'count_prdd'))
        count_dct = count_prdd.collectAsMap()
    finally:
        # The RDD is not used past this point; release the cached partitions
        # even if one of the actions above failed.
        count_prdd.unpersist()

    # Group the collected counts by NER category. Keys are 3-tuples whose
    # first element is the category; the remaining two are forwarded to
    # WhxxNgram positionally (presumably n-gram size and n-gram text --
    # confirm against the WhxxNgram definition).
    category2list = defaultdict(list)
    for k, count in count_dct.items():
        category2list[k[0]].append(WhxxNgram(k[1], k[2], count))

    toml_dct = {}

    for category, whxx_ngram_lst in sorted(category2list.items()):
        category_total = count_per_category[category]

        size2list = defaultdict(list)  # ngram-size to list-of-whxx_ngram
        for whxx_ngram in whxx_ngram_lst:
            size2list[whxx_ngram.ngram_size].append(whxx_ngram)

        # Most frequent first; the sort is stable, so ties keep their order.
        for lst in size2list.values():
            lst.sort(key=lambda x: -x.count)

        for size, lst in sorted(size2list.items()):
            print('{} {}-gram'.format(category, size), file=metric_fptr)
            _print_whxx_ngram_rows(lst[:TOPK], category_total, metric_fptr)
            print(file=metric_fptr)

        _add_to_toml(toml_dct, category, size2list)

    # Kept as a separate pass so each report is written in the same order as
    # before, even if both file arguments refer to the same stream.
    for category, whxx_ngram_lst in sorted(category2list.items()):
        top_lst = sorted(whxx_ngram_lst, key=lambda x: -x.count)[:topk_per_category]

        print('{}'.format(category), file=metric_per_category_fptr)
        _print_whxx_ngram_rows(top_lst, count_per_category[category],
                               metric_per_category_fptr)
        print(file=metric_per_category_fptr)

    print(toml.dumps(toml_dct), file=toml_fptr)