in nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPTokenEnricher.scala [76:106]
/**
  * Enriches the given tokens with part-of-speech tags and lemmas, storing them on each token
  * under the `pos` and `lemma` keys.
  *
  * When no tagger is configured every POS value is an empty string; when no lemmatizer is
  * configured every lemma is the token's own text.
  *
  * @param req Request descriptor. Not used by this implementation.
  * @param cfg Model configuration. Not used by this implementation.
  * @param toks Tokens to enrich. They are updated in place.
  */
override def enrich(req: NCRequest, cfg: NCModelConfig, toks: List[NCToken]): Unit =
    // Marker the lemmatizer returns when it cannot find a lemma. This is the capital letter 'O'
    // (as emitted by OpenNLP's dictionary lemmatizer), NOT the digit zero.
    // NOTE(review): assumes `lemmatizer` is an OpenNLP dictionary-based lemmatizer - confirm.
    val NoLemma = "O"

    val txts = toks.map(_.getText).toArray

    // Tagger and lemmatizer are only ever invoked while holding this instance's monitor.
    this.synchronized {
        val poses = if tagger != null then tagger.tag(txts) else txts.map(_ => "")
        val rawLemmas = if lemmatizer != null then lemmatizer.lemmatize(txts, poses) else txts

        require(toks.sizeIs == poses.length && toks.sizeIs == rawLemmas.length)

        // For some reasons lemmatizer (en-lemmatizer.dict) marks some words with non-existent POS 'NNN'.
        // Valid POS list: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
        // Collect indexes of 'NN' tokens whose lemma was not found - they are retried with 'NNN' below.
        val suspIdxs = rawLemmas.zip(poses).zipWithIndex.collect {
            case ((lemma, pos), i) if lemma == NoLemma && pos == "NN" => i
        }

        val lemmas =
            if suspIdxs.nonEmpty && lemmatizer != null then
                // Second attempt with the 'NNN' POS; only successful look-ups are kept,
                // keyed by the index of the token in the original sequence.
                val fixes = lemmatizer.
                    lemmatize(suspIdxs.map(i => txts(i)), suspIdxs.map(_ => "NNN")).
                    zipWithIndex.
                    collect { case (lemma, i) if lemma != NoLemma => suspIdxs(i) -> lemma }.
                    toMap

                rawLemmas.zipWithIndex.map { (lemma, idx) => fixes.getOrElse(idx, lemma) }
            else
                rawLemmas

        toks.zip(poses).zip(lemmas).foreach { case ((t, pos), lemma) =>
            t.put("pos", pos)
            t.put("lemma", lemma)
            () // Explicit Unit result is required here. Otherwise - NPE.
        }
    }