in spark_scripts/stat_for_ner_category_to_wh_words.py [0:0]
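# Module-level imports this excerpt relies on (assumed to sit at the top of
# the file; the helpers _compute_count_per_category, _extract_leading_ngrams,
# _add_to_toml, the WhxxNgram type, and the TOPK constant are defined
# elsewhere in the module and are not reproduced here):
import os
from collections import defaultdict
from operator import add

import toml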
def _run_stat(question_rdd, metric_fptr, metric_per_category_fptr, toml_fptr, output_dir):
    count_per_category = _compute_count_per_category(question_rdd)
    # _extract_leading_ngrams emits (key, count) pairs whose key starts with
    # the NER category; sum counts per key and keep keys seen >= 10 times.
    unit_count_prdd = question_rdd.flatMap(_extract_leading_ngrams)
    count_prdd = unit_count_prdd.reduceByKey(add).filter(lambda x: x[1] >= 10)
    count_prdd.cache()
    count_prdd.saveAsTextFile(os.path.join(output_dir, 'count_prdd'))
    count_dct = count_prdd.collectAsMap()
    toml_dct = {}
    # Group the surviving n-grams by their NER category.
    category2list = defaultdict(list)
    for k, count in count_dct.items():
        ner_category = k[0]
        whxx_ngram = WhxxNgram(k[1], k[2], count)
        category2list[ner_category].append(whxx_ngram)
    # Per category: bucket n-grams by size, sort each bucket by descending
    # count, and report the TOPK entries of every size.
    for category, whxx_ngram_lst in sorted(category2list.items()):
        size2list = defaultdict(list)  # ngram size -> list of WhxxNgram
        for whxx_ngram in whxx_ngram_lst:
            size2list[whxx_ngram.ngram_size].append(whxx_ngram)
        for size in size2list:
            size2list[size] = sorted(size2list[size], key=lambda x: -x.count)
        for size, lst in sorted(size2list.items()):
            print('{} {}-gram'.format(category, size), file=metric_fptr)
            for whxx_ngram in lst[:TOPK]:
                print('count= {:4d} / {:4d} ({:5.2f}%) "{}"'.format(
                    whxx_ngram.count,
                    count_per_category[category],
                    100.0 * whxx_ngram.count / count_per_category[category],
                    whxx_ngram.ngram), file=metric_fptr)
            print(file=metric_fptr)
        _add_to_toml(toml_dct, category, size2list)
    # Per category: report the top-10 n-grams overall, regardless of size.
    for category, whxx_ngram_lst in sorted(category2list.items()):
        sorted_lst = sorted(whxx_ngram_lst, key=lambda x: -x.count)[:10]
        print('{}'.format(category), file=metric_per_category_fptr)
        for whxx_ngram in sorted_lst:
            print('count= {:4d} / {:4d} ({:5.2f}%) "{}"'.format(
                whxx_ngram.count,
                count_per_category[category],
                100.0 * whxx_ngram.count / count_per_category[category],
                whxx_ngram.ngram), file=metric_per_category_fptr)
        print(file=metric_per_category_fptr)
    print(toml.dumps(toml_dct), file=toml_fptr)
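
# A minimal driver sketch for invoking _run_stat, assuming a pyspark context.
# The input path, output filenames, and one-record-per-line format are
# hypothetical; the actual record shape is whatever _extract_leading_ngrams
# and _compute_count_per_category expect, which is defined elsewhere.
if __name__ == '__main__':
    from pyspark import SparkContext

    sc = SparkContext(appName='stat_for_ner_category_to_wh_words')
    question_rdd = sc.textFile('questions.txt')  # assumed: one record per line

    output_dir = 'stat_output'  # must not already contain a count_prdd subdir
    os.makedirs(output_dir, exist_ok=True)
    with open(os.path.join(output_dir, 'metric.txt'), 'w') as metric_fptr, \
            open(os.path.join(output_dir, 'metric_per_category.txt'), 'w') as metric_per_category_fptr, \
            open(os.path.join(output_dir, 'whxx.toml'), 'w') as toml_fptr:
        _run_stat(question_rdd, metric_fptr, metric_per_category_fptr,
                  toml_fptr, output_dir)
    sc.stop()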