in custom/metrics.py [0:0]
def ngram_metrics(token_list, pad=1):
    """Return the fraction of repeated n-grams in a token sequence for n = 1..4.

    Args:
        token_list: Sequence of hashable tokens that supports ``in``,
            ``.index`` and slicing (e.g. a list of token ids).
        pad: Padding token. If present, the sequence is truncated at its
            first occurrence before any statistics are computed.

    Returns:
        A ``defaultdict(float)`` with the keys ``'pct_repeat_1grams'`` through
        ``'pct_repeat_4grams'``. Each value is ``1 - unique / total`` over the
        contiguous n-grams of that order, or ``0.0`` when the (truncated)
        sequence is too short to contain any n-gram of that order.
    """
    if pad in token_list:
        token_list = token_list[:token_list.index(pad)]  # remove possible padding

    stats = defaultdict(float)
    for n in range(1, 5):
        key = 'pct_repeat_%dgrams' % n
        # Contiguous windows of n tokens: zip over the sequence shifted by
        # 0..n-1 positions; zip stops at the shortest slice, so a sequence
        # shorter than n yields no windows at all.
        ngs = list(zip(*(token_list[i:] for i in range(n))))
        if not ngs:
            # No n-grams means nothing can repeat; this also avoids a
            # division by zero on short or fully padded sequences.
            stats[key] = 0.0
            continue
        stats[key] = 1.0 - len(set(ngs)) / len(ngs)
    return stats