in blingfiretools/fa_lex/fa_lex.cpp [214:460]
int __cdecl main (int argc, char ** argv)
{
__PROG__ = argv [0];
--argc, ++argv;
::FAIOSetup ();
process_args (argc, argv);
try {
FATagSet tagset (&g_alloc);
FACorpusIOTools_utf8 text_io (&g_alloc);
FAMapIOTools map_io (&g_alloc);
FATaggedText text (&g_alloc);
text_io.SetTagSet (&tagset);
text_io.SetNoPosTags (g_no_postags);
FAImageDump StageImg;
FARSDfa_pack_triv Dfa;
FAState2Ow_pack_triv State2Ow;
FAMultiMap_pack Acts;
FAWbdConfKeeper Conf;
FALDB Ldb;
FALexTools_t < int > lex;
// adjust IO pointers
if (g_pInFile) {
g_ifs.open (g_pInFile, std::ios::in);
FAAssertStream (&g_ifs, g_pInFile);
g_pIs = &g_ifs;
}
if (g_pOutFile) {
g_ofs.open (g_pOutFile, std::ios::out);
g_pOs = &g_ofs;
}
if (g_pTagSetFile) {
std::ifstream tagset_ifs (g_pTagSetFile, std::ios::in);
FAAssertStream (&tagset_ifs, g_pTagSetFile);
map_io.Read (tagset_ifs, &tagset);
}
// get EOS tag ID from input args
if (g_pEosTagName) {
const int EosStrLen = (int) strlen (g_pEosTagName);
g_EosTag = tagset.Str2Tag (g_pEosTagName, EosStrLen);
if (-1 == g_EosTag) {
std::cerr << "ERROR: Unknown EOS tag " << g_pEosTagName << '\n';
return 1;
}
}
// load and set up the compiled rules
if (!g_pLdbFile) {
FAAssert (g_pStageFile, FAMsg::InvalidParameters);
StageImg.Load (g_pStageFile);
const unsigned char * pImg = StageImg.GetImageDump ();
FAAssert (pImg, FAMsg::IOError);
const int * pA = (const int *) pImg ;
const int Count = *pA;
FAAssert (2 == Count, FAMsg::IOError);
Dfa.SetImage (pImg + *++pA);
State2Ow.SetImage (pImg + *pA);
Acts.SetImage (pImg + *++pA);
Conf.SetRsDfa (&Dfa);
Conf.SetState2Ow (&State2Ow);
Conf.SetMMap (&Acts);
Conf.SetIgnoreCase (g_ignore_case);
Conf.SetMaxDepth (g_max_depth);
} else {
StageImg.Load (g_pLdbFile);
const unsigned char * pImg = StageImg.GetImageDump ();
FAAssert (pImg, FAMsg::IOError);
Ldb.SetImage (pImg);
const int * pValues = NULL;
const int iSize = Ldb.GetHeader ()->Get (FAFsmConst::FUNC_WBD, &pValues);
Conf.Initialize (&Ldb, pValues, iSize);
}
// setup parameters and data
lex.SetConf (&Conf);
while (!(g_pIs->eof ())) {
if (!std::getline (*g_pIs, line))
break;
LineNum++;
const char * pLine = line.c_str ();
int LineLen = (const int) line.length ();
// echo the input, if needed
if (g_print_input && false == g_no_output) {
(*g_pOs) << line << '\n';
}
if (0 < LineLen) {
DebugLogAssert (pLine);
if (0x0D == (unsigned char) pLine [LineLen - 1])
LineLen--;
}
if (0 < LineLen) {
// UTF-8 --> UTF-32
int BuffSize = ::FAStrUtf8ToArray (pLine, LineLen, g_RawBuff, g_Offsets, MaxBuffSize);
FAAssert (0 < BuffSize && MaxBuffSize >= BuffSize, FAMsg::IOError);
int * g_Buff = g_RawBuff;
if (false == g_no_process) {
// see if we want to normalize the buffer first
if (g_normalize_input && 0 < BuffSize) {
BuffSize = ::FANormalize(g_Buff, BuffSize, g_NormBuff, MaxBuffSize, Conf.GetCharMap ());
FAAssert (0 < BuffSize && MaxBuffSize >= BuffSize, FAMsg::IOError);
g_Buff = g_NormBuff;
}
const int OutSize = \
lex.Process (g_Buff, BuffSize, g_Out, MaxOutputSize);
FAAssert (OutSize <= MaxOutputSize && 0 == OutSize % 3, \
FAMsg::IOError);
if (false == g_no_output) {
if(g_p2s_mode) {
// set previous sentence end to -1
int PrevEnd = -1;
for (int i = 0; i < OutSize; i += 3) {
// we don't care about Tag or From for p2s task
const int From = PrevEnd + 1;
const int To = g_Out [i + 2];
const int Len = To - From + 1;
PrevEnd = To;
// adjust sentence start if needed
const int Delta = FAGetFirstNonWhiteSpace(g_Buff + From, Len);
if(Delta < Len) {
// convert buffer to a UTF-8 string
const int StrOutSize = ::FAArrayToStrUtf8 (g_Buff + From + Delta, Len - Delta, g_OutUtf8, sizeof(g_OutUtf8)-1);
FAAssert (0 < StrOutSize && StrOutSize < (int) sizeof(g_OutUtf8)-1, FAMsg::IOError);
// print the sentence
g_OutUtf8 [StrOutSize] = 0;
(*g_pOs) << g_OutUtf8 << '\n';
}
}
// always use the end of paragraph as the end of sentence
if(PrevEnd + 1 < BuffSize)
{
const int From = PrevEnd + 1;
const int To = BuffSize - 1;
const int Len = To - From + 1;
// adjust sentence start if needed
const int Delta = FAGetFirstNonWhiteSpace(g_Buff + From, Len);
if(Delta < Len) {
// convert buffer to a UTF-8 string
const int StrOutSize = ::FAArrayToStrUtf8 (g_Buff + From + Delta, Len - Delta, g_OutUtf8, sizeof(g_OutUtf8)-1);
FAAssert (0 < StrOutSize && StrOutSize < (int) sizeof(g_OutUtf8)-1, FAMsg::IOError);
// print the sentence
g_OutUtf8 [StrOutSize] = 0;
(*g_pOs) << g_OutUtf8 << '\n';
}
}
(*g_pOs) << '\n';
} else {
bool print_after_loop = true;
text.Clear ();
for (int i = 0; i < OutSize; i += 3) {
const int Tag = g_Out [i];
const int From = g_Out [i + 1];
const int Len = g_Out [i + 2] - From + 1;
text.AddWord (g_Buff + From, Len, Tag, From);
print_after_loop = true;
if (NULL != g_pEosTagName && Tag == g_EosTag) {
text_io.Print (*g_pOs, &text);
text.Clear ();
print_after_loop = false;
}
}
if (print_after_loop)
text_io.Print (*g_pOs, &text);
} // of if(g_p2s_mode) ...
}
} // of if (false == g_no_process) ...
} // of if (0 < LineLen) ...
} // of while (!(g_pIs->eof ())) ...
} catch (const FAException & e) {
const char * const pErrMsg = e.GetErrMsg ();
const char * const pFile = e.GetSourceName ();
const int Line = e.GetSourceLine ();
std::cerr << "ERROR: " << pErrMsg << " in " << pFile \
<< " at line " << Line << " in program " << __PROG__ << '\n';
std::cerr << "ERROR: in data at line: " << LineNum << " in \"" \
<< line << "\"\n";
return 2;
} catch (...) {
std::cerr << "ERROR: Unknown error in program " << __PROG__ << '\n';
return 1;
}
// print out memory leaks, if any
FAPrintLeaks(&g_alloc, std::cerr);
return 0;
}