in Source/Readers/Kaldi2Reader/msra_mgram.h [3196:3453]
// read() -- load an m-gram language model from an ARPA-style text file.
//
// Parameters:
//  - pathname:         file to read; also remembered in 'filename' for debugging
//  - userSymMap:       the caller's symbol table. A unigram word that is not yet
//                      known to it is either added (filterVocabulary == false) or
//                      marked so that every m-gram containing it is dropped
//                      (filterVocabulary == true).
//  - filterVocabulary: see userSymMap
//  - maxM:             highest m-gram order to load (must be >= 1); sections of
//                      higher order in the file are not read.
//
// Fills the members M, refs, entries, lmSymbols, idToSymIndex, userToLMSymMap and
// entries1Unmapped. Scores in the file are log10 values and are converted to
// natural log. Malformed input is reported through RuntimeError(); progress is
// logged to stderr.
void read (const std::wstring & pathname, SYMMAP & userSymMap, bool filterVocabulary, int maxM)
{
    int lineNo = 0;
    msra::basetypes::auto_file_ptr f = fopenOrDie (pathname, L"rbS");
    fprintf (stderr, "read: reading %S", pathname.c_str());
    filename = pathname;    // (keep this info for debugging)

    // --- read header information

    // search for header line
    char buf[1024];
    lineNo++, fgetline (f, buf);
    while (strcmp (buf, "\\data\\") != 0 && !feof (f))
        lineNo++, fgetline (f, buf);
    lineNo++, fgetline (f, buf);

    // get the dimensions ("ngram <order>=<count>" lines, orders must be consecutive from 1)
    std::vector<int> dims; dims.reserve (4);
    while (buf[0] == 0 && !feof (f))
        lineNo++, fgetline (f, buf);
    int n = 0, dim = 0;
    dims.push_back (1);     // dummy zerogram entry
    while (sscanf (buf, "ngram %d=%d", &n, &dim) == 2 && n == (int) dims.size())
    {
        dims.push_back (dim);
        lineNo++, fgetline (f, buf);
    }

    M = (int) dims.size() -1;
    if (M == 0)
        RuntimeError ("read: mal-formed LM file, no dimension information (%d): %S", lineNo, pathname.c_str());
    int fileM = M;
    if (M > maxM)
        M = maxM;
    // Everything below indexes refs[0] and entries[1], so an order below 1 cannot be honored.
    if (M < 1)
        RuntimeError ("read: invalid maximum m-gram order %d: %S", maxM, pathname.c_str());

    // allocate main storage
    refs.resize (M);
    for (int m = 0; m < M; m++)
        refs[m].reserve (dims[m] +1);
    entries.resize (M +1);
    for (int m = 0; m <= M; m++)
        entries[m].reserve (dims[m]);
    lmSymbols.reserve (dims[1]);    // one symbol is added per unigram line
    refs[0].push_back (LMHIST (0, 0.0));
    refs[0].push_back (LMHIST (0, -99.0));      // this one gets updated
    entries[0].push_back (LMSCORE (-1, -99.0)); // zerogram score -- gets updated later

    std::vector<bool> skipWord;     // true: skip entry containing this word
    skipWord.reserve (lmSymbols.capacity());

    // --- read main sections

    const double ln10xLMF = log (10.0);     // ARPA scores are strangely scaled
    for (int m = 1; m <= M; m++)
    {
        while (buf[0] == 0 && !feof (f))
            lineNo++, fgetline (f, buf);

        if (sscanf (buf, "\\%d-grams:", &n) != 1 || n != m)
            RuntimeError ("read: mal-formed LM file, bad section header (%d): %S", lineNo, pathname.c_str());
        lineNo++, fgetline (f, buf);

        std::vector<int> mgram (m +1);          // current mgram being read
        std::vector<int> prevmgram (m +1, -1);  // previous mgram read
        std::vector<int> histEntry (m);         // sub-array ranges

        histEntry[0] = 0;

        // read all the m-grams (the section ends at the next line starting with a backslash)
        while (buf[0] != '\\')
        {
            if (buf[0] == 0)
            {
                // Like the other read loops in this function, stop at end of file.
                // NOTE(review): presumably fgetline() yields an empty line at EOF, in
                // which case a truncated file would otherwise never leave this loop.
                // The section-header / \end\ checks below then report the problem.
                if (feof (f))
                    break;
                lineNo++, fgetline (f, buf);
                continue;
            }

            // -- parse the line: <log10 score> <word 1> .. <word m> [<log10 back-off weight>]
            const char * delim = " \t\n\r";
            const char * score = strtok (&buf[0], delim);
            if (score == NULL || score[0] == 0) // not checking whether it is numeric
                RuntimeError ("read: mal-formed LM file, no score (%d): %S", lineNo, pathname.c_str());
            double scoreVal = atof (score);
            double logP = scoreVal * ln10xLMF;  // convert to natural log

            bool skipEntry = false;
            for (int k = 1; k <= m; k++)
            {
                /*const*/ char * tok = strtok (NULL, delim);
                if (tok == NULL)
                    RuntimeError ("read: mal-formed LM file, not enough words in mgram (%d): %S", lineNo, pathname.c_str());
                // map to id
                int id;
                if (m == 1)     // unigram: build vocab table
                {
                    id = (int) lmSymbols.size();    // unique id for this symbol
                    lmSymbols.push_back (SYMBOL (id, tok));
                    bool toSkip = false;
                    if (userSymMap.sym2existingId (lmSymbols.back().symbol) == -1)
                    {
                        if (filterVocabulary)
                            toSkip = true;          // unknown word
                        else
                            userSymMap.sym2id (lmSymbols.back().symbol);    // create it in user's space
                    }
                    skipWord.push_back (toSkip);
                }
                else            // mgram: look up word in vocabulary
                {
                    // shortcut: same word as in the previous line at this position
                    if (prevmgram[k] >= 0 && strcmp (idToSymbol (prevmgram[k]), tok) == 0)
                        id = prevmgram[k];
                    else
                    {
                        id = symbolToId (tok);
                        if (id == -1)
                            RuntimeError ("read: mal-formed LM file, m-gram contains unknown word (%d): %S", lineNo, pathname.c_str());
                    }
                }
                mgram[k] = id;              // that's our id
                skipEntry |= skipWord[id];  // skip entry if any token is unknown
            }

            // Back-off weight. ARPA files may omit it (e.g. for m-grams that are not
            // a prefix of any longer m-gram); a missing weight means log10 weight 0.
            double logB = 0.0;
            if (m < M)
            {
                const char * bo = strtok (NULL, delim);
                if (bo != NULL && bo[0] != 0)   // not checking whether it is numeric
                    logB = atof (bo) * ln10xLMF;    // convert to natural log
            }

            if (!skipEntry)     // (entries containing unknown vocabulary are dropped entirely)
            {
                // -- enter the information into our data structure

                // locate the corresponding entries
                // histEntry[k] is valid iff mgram[k'] == prevmgram[k'] for all k' <= k
                bool prevValid = true;
                for (int k = 1; k < m; k++)
                {
                    if (prevValid && mgram[k] == prevmgram[k])
                        continue;

                    if (prevValid && mgram[k] < prevmgram[k])
                        RuntimeError ("read: mal-formed LM file, m-gram out of order (%d): %S", lineNo, pathname.c_str());

                    // a history token differs from previous mgram. That history must exist.
                    const std::vector<LMSCORE> & entries_k = entries[k];
                    const std::vector<LMHIST> & refs_h = refs[k -1];    // history
                    int beg = refs_h[histEntry[k -1]].firstEntry;       // sub-array range for next level
                    int end = refs_h[histEntry[k -1] +1].firstEntry;
                    int i = findEntry (entries_k, beg, end, mgram[k]);
                    if (i == -1)    // unknown history: fall back
                        RuntimeError ("read: mal-formed LM file, m-gram history not defined (%d): %S", lineNo, pathname.c_str());
                    // found it: narrow down search range
                    histEntry[k] = i;
                    prevValid = false;
                }

                if (prevValid && mgram[m] <= prevmgram[m])
                    RuntimeError ("read: mal-formed LM file, m-gram out of order (%d): %S", lineNo, pathname.c_str());

                if (m < M)      // create history entry
                    refs[m].push_back (LMHIST (0, logB));
                entries[m].push_back (LMSCORE (mgram[m], logP));    // score entry

                refs[m-1][histEntry[m-1]].firstEntry++;     // for now count how many histories we got
            }

            // remember current mgram for next iteration
            mgram.swap (prevmgram);

            // Advance to the next line only now, so that lineNo in the error
            // messages above refers to the line that was actually being processed.
            lineNo++, fgetline (f, buf);
        }

        // Update previous level history from #entries to firstEntry.
        // We do this afterwards because some histories may not be used and
        // therefore not occur in higher-order m-grams, such that we cannot
        // rely on touching them in the loop above. Counting entries instead
        // leaves those at 0, which is correct.
        std::vector<LMHIST> & refs_h = refs[m -1];  // history
        int n0 = 0;
        for (int i = 0; i < (int) refs_h.size(); i++)
        {
            int num = refs_h[i].firstEntry;
            refs_h[i].firstEntry = n0;
            n0 += num;
        }
        assert (refs_h.back().firstEntry == (int) entries[m].size());

        // create closing history entry
        if (m < M)
            refs[m].push_back (LMHIST (0, -99.0));

        // fix the symbol set -- now we can binary-search in them with symbolToId()
        if (m == 1)
        {
            std::sort (lmSymbols.begin(), lmSymbols.end());
            idToSymIndex.resize (lmSymbols.size(), -1);
            for (int i = 0; i < (int) lmSymbols.size(); i++)
            {
                idToSymIndex[lmSymbols[i].id] = i;
            }
        }

        // (size() is a size_t; it must be converted to match the %d format)
        fprintf (stderr, ", %d %d-grams", (int) entries[m].size(), m);
    }

    fprintf (stderr, "\n");

    // check end tag
    if (M == fileM)
    {   // only if caller did not restrict us to a lower order
        while (buf[0] == 0 && !feof (f))
            lineNo++, fgetline (f, buf);
        if (strcmp (buf, "\\end\\") != 0)
            RuntimeError ("read: mal-formed LM file, no \\end\\ tag (%d): %S", lineNo, pathname.c_str());
    }

    // update zerogram score
    // We use the minimum of all unigram scores.
    const std::vector<LMSCORE> & entries_1 = entries[1];
    float unknownLogP = 0.0f;
    for (int i = 0; i < (int) entries_1.size(); i++)
    {
        if (entries_1[i].logP < -98.9f) continue;   // disabled token does not count
        if (entries_1[i].logP < unknownLogP)
            unknownLogP = entries_1[i].logP;
    }
    entries[0][0].logP = unknownLogP;
    // = (float) -log ((double) lmSymbols.size()); // zerogram score

    // establish mapping of word ids from user to LM space
    userToLMSymMap.resize (userSymMap.size());
    for (int i = 0; i < (int) userSymMap.size(); i++)
    {
        const char * sym = userSymMap.id2sym (i);
        int id = symbolToId (sym);  // may be -1 if not found
        userToLMSymMap[i] = id;
    }

    // check whether first-level unigrams need mapping
    // We don't unless user provided a dictionary to filter.
    entries1Unmapped = true;    // assume findEntry (id) == id
    for (int i = 0; i < (int) entries_1.size(); i++)
    {
        if (entries_1[i].id != i)
        {
            entries1Unmapped = false;
            break;
        }
    }
}