int __cdecl main()

in blingfiretools/fa_lex/fa_lex.cpp [214:460]


int __cdecl main (int argc, char ** argv)
{
    __PROG__ = argv [0];

    --argc, ++argv;

    ::FAIOSetup ();

    process_args (argc, argv);

    try {

        FATagSet tagset (&g_alloc);
        FACorpusIOTools_utf8 text_io (&g_alloc);
        FAMapIOTools map_io (&g_alloc);
        FATaggedText text (&g_alloc);

        text_io.SetTagSet (&tagset);
        text_io.SetNoPosTags (g_no_postags);

        FAImageDump StageImg;

        FARSDfa_pack_triv Dfa;
        FAState2Ow_pack_triv State2Ow;
        FAMultiMap_pack Acts;
        FAWbdConfKeeper Conf;
        FALDB Ldb;

        FALexTools_t < int > lex;

        // adjust IO pointers
        if (g_pInFile) {
            g_ifs.open (g_pInFile, std::ios::in);
            FAAssertStream (&g_ifs, g_pInFile);
            g_pIs = &g_ifs;
        }
        if (g_pOutFile) {
            g_ofs.open (g_pOutFile, std::ios::out);
            g_pOs = &g_ofs;
        }
        if (g_pTagSetFile) {
            std::ifstream tagset_ifs (g_pTagSetFile, std::ios::in);
            FAAssertStream (&tagset_ifs, g_pTagSetFile);
            map_io.Read (tagset_ifs, &tagset);
        }

        // get EOS tag ID from input args
        if (g_pEosTagName) {
            const int EosStrLen = (int) strlen (g_pEosTagName);
            g_EosTag = tagset.Str2Tag (g_pEosTagName, EosStrLen);
            if (-1 == g_EosTag) {
                std::cerr << "ERROR: Unknown EOS tag " << g_pEosTagName << '\n';
                return 1;
            }
        }

        // load and set up the compiled rules
        if (!g_pLdbFile) {

            FAAssert (g_pStageFile, FAMsg::InvalidParameters);

            StageImg.Load (g_pStageFile);
            const unsigned char * pImg = StageImg.GetImageDump ();
            FAAssert (pImg, FAMsg::IOError);

            const int * pA = (const int *) pImg ;
            const int Count = *pA;
            FAAssert (2 == Count, FAMsg::IOError);

            Dfa.SetImage (pImg + *++pA);
            State2Ow.SetImage (pImg + *pA);
            Acts.SetImage (pImg + *++pA);

            Conf.SetRsDfa (&Dfa);
            Conf.SetState2Ow (&State2Ow);
            Conf.SetMMap (&Acts);
            Conf.SetIgnoreCase (g_ignore_case);
            Conf.SetMaxDepth (g_max_depth);

        } else {

            StageImg.Load (g_pLdbFile);
            const unsigned char * pImg = StageImg.GetImageDump ();
            FAAssert (pImg, FAMsg::IOError);

            Ldb.SetImage (pImg);

            const int * pValues = NULL;
            const int iSize = Ldb.GetHeader ()->Get (FAFsmConst::FUNC_WBD, &pValues);
            Conf.Initialize (&Ldb, pValues, iSize);
        }

        // setup parameters and data
        lex.SetConf (&Conf);

        while (!(g_pIs->eof ())) {

            if (!std::getline (*g_pIs, line))
                break;

            LineNum++;

            const char * pLine = line.c_str ();
            int LineLen = (const int) line.length ();

            // echo the input, if needed
            if (g_print_input && false == g_no_output) {
                (*g_pOs) << line << '\n';
            }

            if (0 < LineLen) {
                DebugLogAssert (pLine);
                if (0x0D == (unsigned char) pLine [LineLen - 1])
                    LineLen--;
            }
            if (0 < LineLen) {

                // UTF-8 --> UTF-32
                int BuffSize = ::FAStrUtf8ToArray (pLine, LineLen, g_RawBuff, g_Offsets, MaxBuffSize);
                FAAssert (0 < BuffSize && MaxBuffSize >= BuffSize, FAMsg::IOError);
                int * g_Buff = g_RawBuff;

                if (false == g_no_process) {

                    // see if we want to normalize the buffer first
                    if (g_normalize_input && 0 < BuffSize) {
                        BuffSize = ::FANormalize(g_Buff, BuffSize, g_NormBuff, MaxBuffSize, Conf.GetCharMap ());
                        FAAssert (0 < BuffSize && MaxBuffSize >= BuffSize, FAMsg::IOError);
                        g_Buff = g_NormBuff;
                    }

                    const int OutSize = \
                        lex.Process (g_Buff, BuffSize, g_Out, MaxOutputSize);
                    FAAssert (OutSize <= MaxOutputSize && 0 == OutSize % 3, \
                        FAMsg::IOError);

                    if (false == g_no_output) {

                        if(g_p2s_mode) {

                            // set previous sentence end to -1
                            int PrevEnd = -1;

                            for (int i = 0; i < OutSize; i += 3) {

                                // we don't care about Tag or From for p2s task
                                const int From = PrevEnd + 1;
                                const int To = g_Out [i + 2];
                                const int Len = To - From + 1;
                                PrevEnd = To;

                                // adjust sentence start if needed
                                const int Delta = FAGetFirstNonWhiteSpace(g_Buff + From, Len);

                                if(Delta < Len) {
                                    // convert buffer to a UTF-8 string
                                    const int StrOutSize = ::FAArrayToStrUtf8 (g_Buff + From + Delta, Len - Delta, g_OutUtf8, sizeof(g_OutUtf8)-1);
                                    FAAssert (0 < StrOutSize && StrOutSize < (int) sizeof(g_OutUtf8)-1, FAMsg::IOError);

                                    // print the sentence
                                    g_OutUtf8 [StrOutSize] = 0;
                                    (*g_pOs) << g_OutUtf8 << '\n';
                                }
                            }

                            // always use the end of paragraph as the end of sentence
                            if(PrevEnd + 1 < BuffSize)
                            {
                                const int From = PrevEnd + 1;
                                const int To = BuffSize - 1;
                                const int Len = To - From + 1;

                                // adjust sentence start if needed
                                const int Delta = FAGetFirstNonWhiteSpace(g_Buff + From, Len);

                                if(Delta < Len) {
                                    // convert buffer to a UTF-8 string
                                    const int StrOutSize = ::FAArrayToStrUtf8 (g_Buff + From + Delta, Len - Delta, g_OutUtf8, sizeof(g_OutUtf8)-1);
                                    FAAssert (0 < StrOutSize && StrOutSize < (int) sizeof(g_OutUtf8)-1, FAMsg::IOError);

                                    // print the sentence
                                    g_OutUtf8 [StrOutSize] = 0;
                                    (*g_pOs) << g_OutUtf8 << '\n';
                                }
                            }

                            (*g_pOs) << '\n';

                        } else {

                            bool print_after_loop = true;

                            text.Clear ();

                            for (int i = 0; i < OutSize; i += 3) {

                                const int Tag = g_Out [i];
                                const int From = g_Out [i + 1];
                                const int Len = g_Out [i + 2] - From + 1;

                                text.AddWord (g_Buff + From, Len, Tag, From);
                                print_after_loop = true;

                                if (NULL != g_pEosTagName && Tag == g_EosTag) {
                                    text_io.Print (*g_pOs, &text);
                                    text.Clear ();
                                    print_after_loop = false;
                                }
                            }

                            if (print_after_loop)
                                text_io.Print (*g_pOs, &text);

                        } // of if(g_p2s_mode) ...
                    }

                } // of if (false == g_no_process) ...

            } // of if (0 < LineLen) ...

        } // of while (!(g_pIs->eof ())) ...

    } catch (const FAException & e) {

        const char * const pErrMsg = e.GetErrMsg ();
        const char * const pFile = e.GetSourceName ();
        const int Line = e.GetSourceLine ();

        std::cerr << "ERROR: " << pErrMsg << " in " << pFile \
            << " at line " << Line << " in program " << __PROG__ << '\n';

        std::cerr << "ERROR: in data at line: " << LineNum << " in \"" \
            << line << "\"\n";

        return 2;

    } catch (...) {

        std::cerr << "ERROR: Unknown error in program " << __PROG__ << '\n';
        return 1;
    }

    // print out memory leaks, if any
    FAPrintLeaks(&g_alloc, std::cerr);

    return 0;
}