void read()

in Source/Readers/Kaldi2Reader/msra_mgram.h [3196:3453]


    void read (const std::wstring & pathname, SYMMAP & userSymMap, bool filterVocabulary, int maxM)
    {
        int lineNo = 0;
        msra::basetypes::auto_file_ptr f = fopenOrDie (pathname, L"rbS");
        fprintf (stderr, "read: reading %S", pathname.c_str());
        filename = pathname;            // (keep this info for debugging)

        // --- read header information

        // search for header line
        char buf[1024];
        lineNo++, fgetline (f, buf);
        while (strcmp (buf, "\\data\\") != 0 && !feof (f))
            lineNo++, fgetline (f, buf);
        lineNo++, fgetline (f, buf);

        // get the dimensions
        std::vector<int> dims; dims.reserve (4);

        while (buf[0] == 0 && !feof (f))
            lineNo++, fgetline (f, buf);

        int n, dim;
        dims.push_back (1); // dummy zerogram entry
        while (sscanf (buf, "ngram %d=%d", &n, &dim) == 2 && n == (int) dims.size())
        {
            dims.push_back (dim);
            lineNo++, fgetline (f, buf);
        }

        M = (int) dims.size() -1;
        if (M == 0)
            RuntimeError ("read: mal-formed LM file, no dimension information (%d): %S", lineNo, pathname.c_str());
        int fileM = M;
        if (M > maxM)
            M = maxM;

        // allocate main storage
        refs.resize (M);
        for (int m = 0; m < M; m++)
            refs[m].reserve (dims[m] +1);
        entries.resize (M +1);
        for (int m = 0; m <= M; m++)
            entries[m].reserve (dims[m]);
        lmSymbols.reserve (dims[0]);

        refs[0].push_back (LMHIST (0, 0.0));
        refs[0].push_back (LMHIST (0, -99.0));  // this one gets updated
        entries[0].push_back (LMSCORE (-1, -99.0));    // zerogram score -- gets updated later

        std::vector<bool> skipWord; // true: skip entry containing this word
        skipWord.reserve (lmSymbols.capacity());

        // --- read main sections

        const double ln10xLMF = log (10.0);     // ARPA scores are strangely scaled
        for (int m = 1; m <= M; m++)
        {
            while (buf[0] == 0 && !feof (f))
                lineNo++, fgetline (f, buf);

            if (sscanf (buf, "\\%d-grams:", &n) != 1 || n != m)
                RuntimeError ("read: mal-formed LM file, bad section header (%d): %S", lineNo, pathname.c_str());
            lineNo++, fgetline (f, buf);

            std::vector<int> mgram (m +1);          // current mgram being read
            std::vector<int> prevmgram (m +1, -1);  // previous mgram read
            std::vector<int> histEntry (m);         // sub-array ranges

            histEntry[0] = 0;

            // read all the m-grams
            while (buf[0] != '\\')
            {
                if (buf[0] == 0)
                {
                    lineNo++, fgetline (f, buf);
                    continue;
                }

                // -- parse the line
                const char * delim = " \t\n\r";
                const char * score = strtok (&buf[0], delim);
                if (score == NULL || score[0] == 0) // not checking whether it is numeric
                    RuntimeError ("read: mal-formed LM file, no score (%d): %S", lineNo, pathname.c_str());
                double scoreVal = atof (score);
                double logP = scoreVal * ln10xLMF;  // convert to natural log

                bool skipEntry = false;
                for (int n = 1; n <= m; n++)
                {
                    /*const*/ char * tok = strtok (NULL, delim);
                    if (tok == NULL)
                        RuntimeError ("read: mal-formed LM file, not enough words in mgram (%d): %S", lineNo, pathname.c_str());
                    // map to id
                    int id;
                    if (m == 1)     // unigram: build vocab table
                    {
                        id = (int) lmSymbols.size();        // unique id for this symbol
                        lmSymbols.push_back (SYMBOL (id, tok));
                        bool toSkip = false;
                        if (userSymMap.sym2existingId (lmSymbols.back().symbol) == -1)
                        {
                            if (filterVocabulary)
                                toSkip = true;              // unknown word
                            else
                                userSymMap.sym2id (lmSymbols.back().symbol);    // create it in user's space
                        }
                        skipWord.push_back (toSkip);
                    }
                    else            // mgram: look up word in vocabulary
                    {
                        if (prevmgram[n] >= 0 && strcmp (idToSymbol (prevmgram[n]), tok) == 0)
                            id = prevmgram[n];
                        else
                        {
                            id = symbolToId (tok);
                            if (id == -1)
                                RuntimeError ("read: mal-formed LM file, m-gram contains unknown word (%d): %S", lineNo, pathname.c_str());
                        }
                    }
                    mgram[n] = id;          // that's our id
                    skipEntry |= skipWord[id];   // skip entry if any token is unknown
                }

                double logB = 0.0;
                if (m < M)
                {
                    const char * bo = strtok (NULL, delim);
                    if (score == NULL || score[0] == 0) // not checking whether it is numeric
                        RuntimeError ("read: mal-formed LM file, no score (%d): %S", lineNo, pathname.c_str());
                    double boVal = atof (bo);
                    logB = boVal * ln10xLMF;        // convert to natural log
                }

                lineNo++, fgetline (f, buf);

                if (skipEntry)                      // word contained unknown vocabulary: skip entire entry
                    goto skipMGram;

                // -- enter the information into our data structure

                // locate the corresponding entries
                // histEntry[n] are valid iff mgram[n'] == prevmgram[n'] for all n' <= '

                bool prevValid = true;
                for (int n = 1; n < m; n++)
                {
                    if (prevValid && mgram[n] == prevmgram[n])
                        continue;

                    if (prevValid && mgram[n] < prevmgram[n])
                        RuntimeError ("read: mal-formed LM file, m-gram out of order (%d): %S", lineNo, pathname.c_str());

                    // a history token differs from previous mgram. That history must exist.
                    const std::vector<LMSCORE> & entries_n = entries[n];
                    const std::vector<LMHIST> & refs_h = refs[n -1];    // history
                    int beg = refs_h[histEntry[n -1]].firstEntry;       // sub-array range for next level
                    int end = refs_h[histEntry[n -1] +1].firstEntry;
                    int i = findEntry (entries_n, beg, end, mgram[n]);
                    if (i == -1)    // unknown history: fall back
                        RuntimeError ("read: mal-formed LM file, m-gram history not defined (%d): %S", lineNo, pathname.c_str());
                    // found it: narrow down search range
                    histEntry[n] = i;
                    prevValid = false;
                }

                if (prevValid && mgram[m] <= prevmgram[m])
                    RuntimeError ("read: mal-formed LM file, m-gram out of order (%d): %S", lineNo, pathname.c_str());

                if (m < M)              // create history entry
                    refs[m].push_back (LMHIST (0, logB));
                entries[m].push_back (LMSCORE (mgram[m], logP));   // score entry

                refs[m-1][histEntry[m-1]].firstEntry++;     // for now count how many histories we got

skipMGram:
                // remember current mgram for next iteration
                ::swap (mgram, prevmgram);
            }

            // Update previous level history from #entries to firstEntry.
            // We do this afterwards because some histories may not be used and
            // therefore not occur in higher-order m-grams, such that we cannot
            // rely on touching them in the loop above. Counting entries instead
            // leaves those at 0, which is correct.
            std::vector<LMHIST> & refs_h = refs[m -1];    // history
            int n0 = 0;
            for (int i = 0; i < (int) refs_h.size(); i++)
            {
                int num = refs_h[i].firstEntry;
                refs_h[i].firstEntry =  n0;
                n0 += num;
            }
            assert (refs_h.back().firstEntry == (int) entries[m].size());

            // create closing history entry
            if (m < M)
                refs[m].push_back (LMHIST (0, -99.0));

            // fix the symbol set -- now we can binary-search in them with symbolToId()
            if (m == 1)
            {
                std::sort (lmSymbols.begin(), lmSymbols.end());
                idToSymIndex.resize (lmSymbols.size(), -1);
                for (int i = 0; i < (int) lmSymbols.size(); i++)
                {
                    idToSymIndex[lmSymbols[i].id] = i;
                }
            }

            fprintf (stderr, ", %d %d-grams", entries[m].size(), m);
        }
        fprintf (stderr, "\n");

        // check end tag
        if (M == fileM)
        {   // only if caller did not restrict us to a lower order
            while (buf[0] == 0 && !feof (f))
                lineNo++, fgetline (f, buf);
            if (strcmp (buf, "\\end\\") != 0)
                RuntimeError ("read: mal-formed LM file, no \\end\\ tag (%d): %S", lineNo, pathname.c_str());
        }

        // update zerogram score
        // We use the minimum of all unigram scores.
        const std::vector<LMSCORE> & entries_1 = entries[1];
        float unknownLogP = 0.0f;
        for (int i = 0; i < (int) entries_1.size(); i++)
        {
            if (entries_1[i].logP < -98.9f) continue;   // disabled token does not count
            if (entries_1[i].logP < unknownLogP)
                unknownLogP = entries_1[i].logP;
        }
        entries[0][0].logP = unknownLogP;;
        // = (float) -log ((double) lmSymbols.size());         // zerogram score

        // establish mapping of word ids from user to LM space
        userToLMSymMap.resize (userSymMap.size());
        for (int i = 0; i < userSymMap.size(); i++)
        {
            const char * sym = userSymMap.id2sym (i);
            int id = symbolToId (sym);    // may be -1 if not found
            userToLMSymMap[i] = id;
        }

        // check whether first-level unigrams need mapping
        // We don't unless user provided a dictionary to filter.
        entries1Unmapped = true;    // assume findEntry (id) == id
        for (int i = 0; i < (int) entries_1.size(); i++)
        {
            if (entries_1[i].id != i)
            {
                entries1Unmapped = false;
                break;
            }
        }
    }