in src/doc_parser.cpp [25:69]
/**
 * Tokenize `s` and append one weighted feature per dictionary-known token.
 *
 * Steps performed:
 *  1. `s` is split on any character contained in `sep`.
 *  2. If the first piece contains the marker "__weight__", whatever follows
 *     the first occurrence of `args_->weightSep` in that piece is parsed with
 *     atof() and used as a multiplier for every feature emitted by this call;
 *     the marker piece itself is not looked up in the dictionary.
 *  3. For each remaining piece: when `args_->useWeight` is set and the piece
 *     contains `args_->weightSep`, the part before it is the word and the part
 *     after it is parsed with atof() as the per-token weight (otherwise the
 *     weight is 1). The word is optionally passed through normalize_text()
 *     and then resolved with `dict_->getId()`; an id of -1 is skipped.
 *  4. When `args_->ngrams > 1`, addNgrams() is called with the pieces from
 *     step 1.
 *
 * @param s     Text to parse.
 * @param feats Output vector; features are appended, it is never cleared here.
 * @param sep   Set of delimiter characters used to split `s`.
 * @return true when `feats` is non-empty after the call (this includes any
 *         entries that were already present on entry).
 */
bool LayerDataParser::parse(
    string& s,
    vector<Base>& feats,
    const string& sep) {
  // Split the input on any of the delimiter characters.
  vector<string> parts;
  boost::split(parts, s, boost::is_any_of(sep));

  // boost::split yields at least one (possibly empty) piece, so front() is
  // always valid here.
  const string& head = parts.front();

  std::size_t firstFeature = 0;
  float exampleWeight = 1.0f;
  if (head.find("__weight__") != std::string::npos) {
    // Leading marker piece: pull out the example-level weight, if one is given.
    const std::size_t cut = head.find(args_->weightSep);
    if (cut != std::string::npos) {
      exampleWeight = atof(head.substr(cut + 1).c_str());
    }
    // The marker piece is consumed whether or not a weight was found.
    firstFeature = 1;
  }

  for (std::size_t idx = firstFeature; idx < parts.size(); ++idx) {
    const string& raw = parts[idx];
    string word = raw;
    float tokenWeight = 1.0f;

    if (args_->useWeight) {
      const std::size_t cut = raw.find(args_->weightSep);
      if (cut != std::string::npos) {
        // Text after the separator is the weight, text before it is the word.
        tokenWeight = atof(raw.substr(cut + 1).c_str());
        word.erase(cut);
      }
    }

    if (args_->normalizeText) {
      normalize_text(word);
    }

    const int32_t wid = dict_->getId(word);
    if (wid == -1) {
      // getId() returned -1 for this word: emit nothing for it.
      continue;
    }
    feats.push_back(make_pair(wid, tokenWeight * exampleWeight));
  }

  if (args_->ngrams > 1) {
    // NOTE(review): the unmodified pieces are handed over here, i.e. including
    // a leading "__weight__" piece and any un-stripped weight suffixes —
    // confirm that addNgrams() expects that.
    addNgrams(parts, feats, args_->ngrams);
  }

  return !feats.empty();
}