in utils_nlp/models/glove/src/cooccur.c [308:449]
int get_cooccurrence() {
    int flag, x, y, fidcounter = 1;
    long long a, j = 0, k, id, counter = 0, ind = 0, vocab_size, w1, w2, *lookup, *history;
    char format[20], filename[200], str[MAX_STRING_LENGTH + 1];
    FILE *fid, *foverflow;
    real *bigram_table, r;
    HASHREC *htmp, **vocab_hash = inithashtable();
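    /* Two buffers drive the counting: cr, an in-memory overflow buffer for the sparse
       (rare-rare) pair counts that are later sorted and spilled to disk, and history,
       a circular buffer of the frequency ranks of the last window_size tokens. */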
    CREC *cr = malloc(sizeof(CREC) * (overflow_length + 1));
    history = malloc(sizeof(long long) * window_size);
    if (cr == NULL || history == NULL) {fprintf(stderr, "Couldn't allocate memory!"); return 1;}
fprintf(stderr, "COUNTING COOCCURRENCES\n");
if (verbose > 0) {
fprintf(stderr, "window size: %d\n", window_size);
if (symmetric == 0) fprintf(stderr, "context: asymmetric\n");
else fprintf(stderr, "context: symmetric\n");
}
if (verbose > 1) fprintf(stderr, "max product: %lld\n", max_product);
if (verbose > 1) fprintf(stderr, "overflow length: %lld\n", overflow_length);
sprintf(format,"%%%ds %%lld", MAX_STRING_LENGTH); // Format to read from vocab file, which has (irrelevant) frequency data
if (verbose > 1) fprintf(stderr, "Reading vocab from file \"%s\"...", vocab_file);
fid = fopen(vocab_file,"r");
if (fid == NULL) {fprintf(stderr,"Unable to open vocab file %s.\n",vocab_file); return 1;}
while (fscanf(fid, format, str, &id) != EOF) hashinsert(vocab_hash, str, ++j); // Here id is not used: inserting vocab words into hash table with their frequency rank, j
fclose(fid);
vocab_size = j;
j = 0;
if (verbose > 1) fprintf(stderr, "loaded %lld words.\nBuilding lookup table...", vocab_size);
    /* Build auxiliary lookup table used to index into bigram_table */
    lookup = (long long *)calloc(vocab_size + 1, sizeof(long long));
    if (lookup == NULL) {
        fprintf(stderr, "Couldn't allocate memory!");
        return 1;
    }
    lookup[0] = 1;
    for (a = 1; a <= vocab_size; a++) {
        if ((lookup[a] = max_product / a) < vocab_size) lookup[a] += lookup[a-1];
        else lookup[a] = lookup[a-1] + vocab_size;
    }
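    /* Worked example with hypothetical sizes: for vocab_size = 4 and max_product = 10
       the loop yields lookup = {1, 5, 9, 12, 14}. Row w1 of the dense table begins at
       index lookup[w1-1] - 1 and pair (w1, w2) lands at lookup[w1-1] + w2 - 2, so
       (w1=2, w2=3) maps to 5 + 3 - 2 = 6; the table is calloc'd below with
       lookup[vocab_size] = 14 cells. */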
if (verbose > 1) fprintf(stderr, "table contains %lld elements.\n",lookup[a-1]);
/* Allocate memory for full array which will store all cooccurrence counts for words whose product of frequency ranks is less than max_product */
bigram_table = (real *)calloc( lookup[a-1] , sizeof(real) );
if (bigram_table == NULL) {
fprintf(stderr, "Couldn't allocate memory!");
return 1;
}
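    /* The dense array holds roughly sum over a of min(max_product/a, vocab_size) reals,
       so max_product is the memory knob; in the upstream GloVe driver it is derived
       from the -memory budget when not set explicitly. */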
    fid = stdin;
    // sprintf(format,"%%%ds",MAX_STRING_LENGTH);
    sprintf(filename, "%s_%04d.bin", file_head, fidcounter);
    foverflow = fopen(filename, "wb");
    if (verbose > 1) fprintf(stderr, "Processing token: 0");
    /* For each token in input stream, calculate a weighted cooccurrence sum within window_size */
    while (1) {
        if (ind >= overflow_length - 2 * window_size) { // If overflow buffer is (almost) full, sort it and write it to temporary file; a symmetric context can add up to 2 * window_size entries per token
            qsort(cr, ind, sizeof(CREC), compare_crec);
            write_chunk(cr, ind, foverflow);
            fclose(foverflow);
            fidcounter++;
            sprintf(filename, "%s_%04d.bin", file_head, fidcounter);
            foverflow = fopen(filename, "wb");
            ind = 0;
        }
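        /* Each flushed chunk is sorted by (word1, word2) via compare_crec before it is
           written, so the final merge_files pass can stream-merge the chunks and
           accumulate duplicate pairs without reloading them all into memory. */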
        flag = get_word(str, fid);
        if (verbose > 2) fprintf(stderr, "Maybe processing token: %s\n", str);
        if (flag == 1) {
            // Newline: reset line index (j); maybe EOF.
            if (feof(fid)) {
                if (verbose > 2) fprintf(stderr, "Not getting cooccurrences as at EOF\n");
                break;
            }
            j = 0;
            if (verbose > 2) fprintf(stderr, "Not getting cooccurrences as at newline\n");
            continue;
        }
        counter++;
        if ((counter % 100000) == 0 && verbose > 1) fprintf(stderr, "\033[19G%lld", counter);
        htmp = hashsearch(vocab_hash, str);
        if (htmp == NULL) {
            if (verbose > 2) fprintf(stderr, "Not getting cooccurrences as word not in vocab\n");
            continue; // Skip out-of-vocabulary words
        }
        w2 = htmp->id; // Target word (frequency rank)
        for (k = j - 1; k >= ((j > window_size) ? j - window_size : 0); k--) { // Iterate over all words to the left of target word, but not past beginning of line
            w1 = history[k % window_size]; // Context word (frequency rank)
            if (verbose > 2) fprintf(stderr, "Adding cooccur between words %lld and %lld.\n", w1, w2);
            if (w1 < max_product / w2) { // Product is small enough to store in a full array
                bigram_table[lookup[w1-1] + w2 - 2] += distance_weighting ? 1.0/((real)(j-k)) : 1.0; // Weight by inverse of distance between words if needed
                if (symmetric > 0) bigram_table[lookup[w2-1] + w1 - 2] += distance_weighting ? 1.0/((real)(j-k)) : 1.0; // If symmetric context is used, exchange roles of w2 and w1 (i.e. look at right context too)
            }
            else { // Product is too big, data is likely to be sparse. Store these entries in a temporary buffer to be sorted, merged (accumulated), and written to file when it gets full.
                cr[ind].word1 = w1;
                cr[ind].word2 = w2;
                cr[ind].val = distance_weighting ? 1.0/((real)(j-k)) : 1.0;
                ind++; // Keep track of how full temporary buffer is
                if (symmetric > 0) { // Symmetric context
                    cr[ind].word1 = w2;
                    cr[ind].word2 = w1;
                    cr[ind].val = distance_weighting ? 1.0/((real)(j-k)) : 1.0;
                    ind++;
                }
            }
        }
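        /* With distance_weighting enabled, a context word d = j-k tokens to the left
           contributes 1/d: the immediate neighbor adds 1.0, a word 4 positions away
           adds 0.25. */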
        history[j % window_size] = w2; // Target word is stored in circular buffer to become context word in the future
        j++;
    }
    /* Write out temp buffer for the final time (it may not be full) */
    if (verbose > 1) fprintf(stderr, "\033[0GProcessed %lld tokens.\n", counter);
    qsort(cr, ind, sizeof(CREC), compare_crec);
    write_chunk(cr, ind, foverflow);
    sprintf(filename, "%s_0000.bin", file_head);
    /* Write out full bigram_table, skipping zeros */
    if (verbose > 1) fprintf(stderr, "Writing cooccurrences to disk");
    fid = fopen(filename, "wb");
    j = 1e6; // Sentinel above any value the progress-dot bucket below can take
    for (x = 1; x <= vocab_size; x++) {
        if ((long long) (0.75*log(vocab_size / x)) < j) {
            j = (long long) (0.75*log(vocab_size / x));
            if (verbose > 1) fprintf(stderr, ".");
        } // log's to make it look (sort of) pretty
        for (y = 1; y <= (lookup[x] - lookup[x-1]); y++) {
            if ((r = bigram_table[lookup[x-1] - 2 + y]) != 0) {
                fwrite(&x, sizeof(int), 1, fid);
                fwrite(&y, sizeof(int), 1, fid);
                fwrite(&r, sizeof(real), 1, fid);
            }
        }
    }
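    /* Each nonzero dense cell is written as an (int, int, real) triple matching the
       CREC records in the overflow chunks, and the loop order keeps file 0000 sorted
       by (word1, word2), so merge_files can treat it uniformly with the temp files. */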
if (verbose > 1) fprintf(stderr,"%d files in total.\n",fidcounter + 1);
fclose(fid);
fclose(foverflow);
free(cr);
free(lookup);
free(bigram_table);
free(vocab_hash);
return merge_files(fidcounter + 1); // Merge the sorted temporary files
}
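/* Typical pipeline, sketched from the upstream GloVe demo (flag names assume the
   upstream vocab_count/cooccur builds that ship alongside this source):
       ./vocab_count -min-count 5 -verbose 2 < corpus.txt > vocab.txt
       ./cooccur -vocab-file vocab.txt -window-size 15 -memory 4.0 -verbose 2 \
           < corpus.txt > cooccurrences.bin
   get_cooccurrence() reads tokens from stdin and, via merge_files, streams the merged
   CREC triples to stdout. */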