in utils_nlp/models/glove/src/cooccur.c [251:305]
/* Pop the top record of the priority queue and replace it with the next
 * record read from the temporary file the popped record came from.
 *
 *   pq   - priority queue currently holding `size` records
 *   fid  - open temporary files, indexed by the `id` stored in each record
 *   size - number of records in the queue before the pop
 *
 * Returns the queue size afterwards: unchanged when a replacement record was
 * read, one smaller when the file had no further complete record (EOF or
 * read error). */
static int merge_pop_and_refill(CRECID *pq, FILE **fid, int size) {
    CRECID next;
    int src = pq[0].id;

    delete(pq, size);
    /* Test the fread result rather than feof(): a read error that does not
     * set EOF would otherwise re-insert a stale record forever. */
    if (fread(&next, sizeof(CREC), 1, fid[src]) != 1) return size - 1;
    next.id = src;
    /* NOTE(review): insert() is assumed to take the queue size *after*
     * insertion, matching how the loading loop in merge_files calls it. */
    insert(pq, next, size);
    return size;
}

/* Merge the `num` temporary files "<file_head>_0000.bin" ...
 * "<file_head>_<num-1>.bin" into a single record stream on stdout.
 *
 * The first record of every file is placed in a priority queue; the top
 * record is then repeatedly popped, combined with the previously popped
 * record by merge_write(), and replaced by the next record from the same
 * file until every file is exhausted. Progress is reported on stderr.
 *
 * On success the temporary files are closed and removed and 0 is returned.
 * Returns 1 if memory cannot be allocated or a temporary file cannot be
 * opened; in that case the temporary files are left on disk. A non-positive
 * `num` is treated as "nothing to merge" and returns 0 without output. */
int merge_files(int num) {
    int i, size = 0, status = 1;
    long long counter = 0;
    CRECID *pq = NULL, new, old;
    char filename[200];
    FILE **fid = NULL, *fout = stdout;

    if (num <= 0) return 0;

    /* calloc leaves unopened slots NULL so cleanup knows which to close. */
    fid = calloc((size_t)num, sizeof *fid);
    pq = malloc(sizeof *pq * (size_t)num);
    if (fid == NULL || pq == NULL) {
        fprintf(stderr, "Unable to allocate memory for merging %d files.\n", num);
        goto cleanup;
    }

    if (verbose > 1) fprintf(stderr, "Merging cooccurrence files: processed 0 lines.");

    /* Open all files and add first entry of each to priority queue */
    for (i = 0; i < num; i++) {
        snprintf(filename, sizeof filename, "%s_%04d.bin", file_head, i);
        fid[i] = fopen(filename, "rb");
        if (fid[i] == NULL) {
            fprintf(stderr, "Unable to open file %s.\n", filename);
            goto cleanup;
        }
        /* A file without a complete first record contributes nothing; it must
         * not push a stale copy of the previous file's record. */
        if (fread(&new, sizeof(CREC), 1, fid[i]) != 1) continue;
        new.id = i;
        insert(pq, new, ++size);
    }

    if (size > 0) {
        /* Pop top node, save it in old to see if the next entry is a duplicate */
        old = pq[0];
        size = merge_pop_and_refill(pq, fid, size);

        /* Repeatedly pop top node and fill priority queue until files have reached EOF */
        while (size > 0) {
            counter += merge_write(pq[0], &old, fout); // Only count the lines written to file, not duplicates
            if ((counter % 100000) == 0) {
                if (verbose > 1) fprintf(stderr, "\033[39G%lld lines.", counter);
            }
            size = merge_pop_and_refill(pq, fid, size);
        }

        /* The last pending record has not been written yet. */
        fwrite(&old, sizeof(CREC), 1, fout);
        counter++;
    }
    fprintf(stderr, "\033[0GMerging cooccurrence files: processed %lld lines.\n", counter);
    status = 0;

cleanup:
    if (fid != NULL) {
        for (i = 0; i < num; i++) {
            /* Close before removing: some platforms refuse to delete open files. */
            if (fid[i] != NULL) fclose(fid[i]);
            if (status == 0) {
                snprintf(filename, sizeof filename, "%s_%04d.bin", file_head, i);
                remove(filename);
            }
        }
    }
    free(fid);
    free(pq);
    if (status == 0) fprintf(stderr, "\n");
    return status;
}