in utils_nlp/models/glove/src/vocab_count.c [167:224]
/**
 * Build a vocabulary from the token stream on stdin and print it to stdout.
 *
 * Tokens are read with get_word() and counted via hashinsert(). The hash
 * table is then flattened into an array, optionally truncated to max_vocab
 * entries, sorted with CompareVocabTie, and written as "<word> <count>" lines
 * until an entry's count drops below min_count.
 *
 * Side effect: when max_vocab is not positive, or is not smaller than the
 * number of unique words, max_vocab is overwritten with the number of unique
 * words.
 *
 * Returns 0 on success, 1 if the corpus contains the literal token "<unk>"
 * or if a memory allocation fails.
 *
 * NOTE(review): the hash table returned by inithashtable() is not released
 * here because no destructor for it is visible in this part of the file.
 */
int get_counts() {
    long long i = 0, j = 0, vocab_size = 12500;
    char str[MAX_STRING_LENGTH + 1];
    HASHREC **vocab_hash = inithashtable();
    HASHREC *htmp;
    VOCAB *vocab;
    FILE *fid = stdin;

    if (vocab_hash == NULL) {
        fprintf(stderr, "Error, unable to allocate memory for the vocabulary hash table.\n");
        return 1;
    }

    fprintf(stderr, "BUILDING VOCABULARY\n");
    if (verbose > 1) fprintf(stderr, "Processed %lld tokens.", i);

    while (!feof(fid)) {
        // Insert all tokens into hashtable
        int nl = get_word(str, fid);
        if (nl) continue; // just a newline marker or feof
        if (strcmp(str, "<unk>") == 0) {
            fprintf(stderr, "\nError, <unk> vector found in corpus.\nPlease remove <unk>s from your corpus (e.g. cat text8 | sed -e 's/<unk>/<raw_unk>/g' > text8.new)");
            return 1;
        }
        hashinsert(vocab_hash, str);
        if (((++i) % 100000) == 0) {
            if (verbose > 1) fprintf(stderr, "\033[11G%lld tokens.", i);
        }
    }
    if (verbose > 1) fprintf(stderr, "\033[0GProcessed %lld tokens.\n", i);

    vocab = malloc(sizeof *vocab * (size_t)vocab_size);
    if (vocab == NULL) {
        fprintf(stderr, "Error, unable to allocate memory for the vocabulary array.\n");
        return 1;
    }

    for (i = 0; i < TSIZE; i++) { // Migrate vocab to array
        htmp = vocab_hash[i];
        while (htmp != NULL) {
            // The array borrows the word pointer; the hash record still owns the string.
            vocab[j].word = htmp->word;
            vocab[j].count = htmp->count;
            j++;
            if (j >= vocab_size) {
                // Grow through a temporary so the old block is not lost if realloc fails.
                VOCAB *grown;
                vocab_size += 2500;
                grown = realloc(vocab, sizeof *vocab * (size_t)vocab_size);
                if (grown == NULL) {
                    fprintf(stderr, "Error, unable to grow the vocabulary array.\n");
                    free(vocab);
                    return 1;
                }
                vocab = grown;
            }
            htmp = htmp->next;
        }
    }
    if (verbose > 1) fprintf(stderr, "Counted %lld unique words.\n", j);

    if (max_vocab > 0 && max_vocab < j) {
        // If the vocabulary exceeds limit, first sort full vocab by frequency without alphabetical tie-breaks.
        // This results in pseudo-random ordering for words with same frequency, so that when truncated, the words span whole alphabet
        qsort(vocab, (size_t)j, sizeof(VOCAB), CompareVocab);
    } else {
        max_vocab = j;
    }
    // After (possibly) truncating, sort (possibly again), breaking ties alphabetically
    qsort(vocab, (size_t)max_vocab, sizeof(VOCAB), CompareVocabTie);

    for (i = 0; i < max_vocab; i++) {
        if (vocab[i].count < min_count) { // If a minimum frequency cutoff exists, truncate vocabulary
            if (verbose > 0) fprintf(stderr, "Truncating vocabulary at min count %lld.\n", min_count);
            break;
        }
        printf("%s %lld\n", vocab[i].word, vocab[i].count);
    }

    if (i == max_vocab && max_vocab < j) {
        if (verbose > 0) fprintf(stderr, "Truncating vocabulary at size %lld.\n", max_vocab);
    }
    fprintf(stderr, "Using vocabulary of size %lld.\n\n", i);

    // Only the array itself is released; the word strings belong to the hash table.
    free(vocab);
    return 0;
}