in nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala [49:105]
/** Checks whether the given token carries the `stopword` flag; a missing flag counts as `false`. */
private def isStopWord(t: NCToken): Boolean = t.get[Boolean]("stopword").contains(true)
/**
  * Builds all token "pieces" used for semantic element matching.
  *
  * 1. Prepares all contiguous combinations of tokens (sliding windows of every length,
  *    longest first). Example: 'A B C' -> {'A B C', 'A B', 'B C', 'A', 'B', 'C'} —
  *    one 3-token sentence is converted into 6 pieces.
  *
  * 2. Additionally, each piece is expanded into a set of variants with all possible
  *    permutations of its inner stopwords deleted (the piece's border tokens are never
  *    treated as deletable). Example: piece 'x1, x2(stopword), x3(stopword), x4' is
  *    expanded into {'x1, x2, x3, x4', 'x1, x2, x4', 'x1, x3, x4', 'x1, x4'}.
  *
  * 3. Runs of more than 2 consecutive stopwords are deleted only as whole runs, to keep
  *    the number of generated variants from exploding combinatorially.
  *
  * @param toks Tokens of one sentence to build the pieces from.
  * @return Pieces, each holding the original token combination plus its non-empty
  *         stopword-deleted variants sorted by descending size.
  */
private def getPieces(toks: Seq[NCToken]): Seq[Piece] =
    // All sliding windows, from the full sentence down to single tokens.
    // NOTE: the former no-op `.map(p => p)` stage has been removed.
    (for (n <- toks.size until 0 by -1) yield toks.sliding(n)).flatten.map(combo => {
        // Inner stopwords only: the first and last tokens of a piece are kept as anchors.
        val stops = combo.filter(s => isStopWord(s) && s != combo.head && s != combo.last)

        // Group consecutive stopwords into runs by adjacent token index.
        val slides = mutable.ArrayBuffer.empty[mutable.ArrayBuffer[NCToken]]

        for (stop <- stops)
            if slides.nonEmpty && slides.last.last.getIndex + 1 == stop.getIndex then
                slides.last += stop
            else
                slides += mutable.ArrayBuffer(stop)

        // Runs with too many stopwords are deleted only as whole units (explosion guard).
        val bigSlides = slides.filter(_.size > 2)

        // All stopword deletion candidates: every combination of freely-deletable stopwords
        // crossed with every combination of whole big runs.
        val stops4Delete: Seq[Seq[NCToken]] =
            if bigSlides.nonEmpty then
                val allBig = bigSlides.flatten
                // Stopwords outside the big runs can still be deleted in any combination.
                val stops4AllCombs = stops.filter(p => !allBig.contains(p))

                if stops4AllCombs.nonEmpty then
                    for (
                        seq1 <- Range.inclusive(0, stops4AllCombs.size).flatMap(stops4AllCombs.combinations);
                        seq2 <- Range.inclusive(0, bigSlides.size).flatMap(bigSlides.combinations)
                    )
                    yield seq1 ++ seq2.flatten
                else
                    for (seq <- Range.inclusive(0, bigSlides.size).flatMap(bigSlides.combinations))
                        yield seq.toSeq.flatten
            else
                Range.inclusive(1, stops.size).flatMap(stops.combinations)

        Piece(
            combo.toList,
            // The border-token filter is defensive: `stops` already excludes the piece's head
            // and last, but value-equal duplicates inside the piece could still match them.
            stops4Delete.
                filter(seq => !seq.contains(combo.head) && !seq.contains(combo.last)).
                map(_.toSet).
                map(del => combo.filter(t => !del.contains(t)).toList).filter(_.nonEmpty).sortBy(-_.size).
                toList
        )
    })