in nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala [116:257]
/**
  * Builds every sequence that can be formed by picking, for each position, the element found at that
  * position in either `data1` or `data2`.
  *
  * @param data1 First sequence of alternatives.
  * @param data2 Second sequence of alternatives. Must have the same size as `data1`.
  * @param i Position handled by the current step. Recursion state, `0` for the initial call.
  * @param tmp Combinations built so far for the positions before `i`. Recursion state.
  * @return Set of all combinations, or an empty set if the input sequences are empty.
  */
@tailrec private def combine(data1: Seq[String], data2: Seq[String], i: Int = 0, tmp: Set[List[String]] = Set(List.empty)): Set[List[String]] =
    require(data1.sizeIs == data2.size)

    if data1.isEmpty then Set.empty
    else if i >= data1.size then tmp
    else
        // Each prefix built so far forks into two: one per alternative at position `i`.
        def fork(prefix: List[String]): Set[List[String]] = Set(prefix :+ data1(i), prefix :+ data2(i))

        combine(data1, data2, i + 1, tmp.flatMap(fork))
import NCSemanticEntityParser.*
/**
* Semantic entity parser implementation.
*
* This synonym-based parser provides a simple yet powerful way to find domain-specific data in the input text.
* It is configured via a list of [[NCSemanticElement]] instances which represents all possible [[NCEntity named entities]] that
* this parser can detect.
*
* [[NCSemanticElement Semantic elements]] can be configured via YAML or JSON files in a special format or
* passed to this parser as a programmatically prepared list. [[NCSemanticElement Semantic elements]] contain a set of
* synonyms which can use special [[https://nlpcraft.apache.org/built-in-entity-parser.html#macros macros]].
* These macros can also be provided via YAML and JSON files or passed directly in the case of a programmatically prepared
* [[NCSemanticElement]] list.
*
* Example of YAML elements definition.
* <pre>
* macros:
* "<OF>": "{of|for|per}"
* "<CUR>": "{current|present|now|local}"
* "<TIME>": "{time <OF> day|day time|date|time|moment|datetime|hour|o'clock|clock|date time}"
* elements:
* - id: "x:time"
* description: "Date and/or time token indicator."
* synonyms:
* - "{<CUR>|_} <TIME>"
* - "what <TIME> {is it now|now|is it|_}"
* </pre>
* Given this simple definition the **x:time** element can be detected by a large number of synonyms like *day time*,
* *local day time*, *time of day*, *local time of day*, *what hour is it*, etc.
*
* @param stemmer [[NCStemmer]] implementation which is used to match tokens with the given [[NCSemanticElement]] synonyms.
* @param parser [[NCTokenParser]] implementation which will be used for [[NCSemanticElement]] synonyms tokenization.
* It should be the same implementation as the one used in [[NCPipeline.getTokenParser]].
* @param macros Macros map which is used for extracting [[NCSemanticElement]] synonyms defined via **macros**.
* More information at [[https://nlpcraft.apache.org/built-in-entity-parser.html#macros]].
* @param elements Programmatically prepared [[NCSemanticElement]] instances. Note that at least one of the model
* or the elements must be supplied.
* @param mdlResOpt Optional relative path, absolute path, classpath resource or URL to YAML or JSON semantic model
* which contains [[NCSemanticElement]] definitions. Note that at least one of the model or the elements must be supplied.
*
* @see [[NCSemanticElement]]
*/
class NCSemanticEntityParser private (
stemmer: NCStemmer,
parser: NCTokenParser,
macros: Map[String, String],
elements: List[NCSemanticElement],
mdlResOpt: Option[String]
) extends NCEntityParser with LazyLogging:
// Fail fast with `IllegalArgumentException` on invalid constructor arguments.
require(stemmer != null, "Stemmer cannot be null.")
require(parser != null, "Token parser cannot be null.")
require(macros != null, "Macros cannot be null.")
// `&&` binds tighter than `||`: passes when a non-empty elements list is given or a model resource is defined.
require(elements != null && elements.nonEmpty || mdlResOpt.isDefined, "Either elements or external YAML/JSON model must be supplied.")
/**
  * Creates a new [[NCSemanticEntityParser]] from programmatically prepared elements and macros.
  * No external model resource is passed to the primary constructor.
  *
  * @param stemmer [[NCStemmer]] implementation for the synonyms language.
  * @param parser [[NCTokenParser]] implementation.
  * @param macros Macros map.
  * @param elements [[NCSemanticElement]] list.
  */
def this(stemmer: NCStemmer, parser: NCTokenParser, macros: Map[String, String], elements: List[NCSemanticElement]) =
    this(stemmer, parser, macros, elements, mdlResOpt = None)
/**
  * Creates a new [[NCSemanticEntityParser]] from programmatically prepared elements.
  * An empty macros map and no external model resource are passed to the primary constructor.
  *
  * @param stemmer [[NCStemmer]] implementation for the synonyms language.
  * @param parser [[NCTokenParser]] implementation.
  * @param elements [[NCSemanticElement]] list.
  */
def this(stemmer: NCStemmer, parser: NCTokenParser, elements: List[NCSemanticElement]) =
    this(stemmer, parser, macros = Map.empty, elements = elements, mdlResOpt = None)
/**
  * Creates a new [[NCSemanticEntityParser]] configured by an external semantic model definition.
  * An empty macros map and an empty elements list are passed to the primary constructor.
  *
  * @param stemmer [[NCStemmer]] implementation for the synonyms language.
  * @param parser [[NCTokenParser]] implementation.
  * @param mdlRes Relative path, absolute path, classpath resource or URL to YAML or JSON semantic model definition.
  */
def this(stemmer: NCStemmer, parser: NCTokenParser, mdlRes: String) =
    this(stemmer, parser, macros = Map.empty, elements = List.empty, mdlResOpt = mdlRes.?)
/**
  * Source type of the external model resource, as detected by `NCSemanticSourceType.detect`.
  * Evaluated lazily; fails with `IllegalArgumentException` if accessed when no model resource is defined.
  */
private lazy val scrType =
    require(mdlResOpt.nonEmpty)

    val mdlRes = mdlResOpt.get

    NCSemanticSourceType.detect(mdlRes)
// Synonyms holder and element map. Both are left default-initialized here and assigned by `init()`.
private var synsHolder: NCSemanticSynonymsHolder = _
private var elemsMap: Map[String, NCSemanticElement] = _
// Populate the fields above as part of instance construction.
init()
/**
  * Prepares the synonyms holder and the map of elements keyed by their type.
  *
  * Macros and elements are read from the external model resource when `mdlResOpt` is defined;
  * otherwise the constructor-supplied `macros` and `elements` are used.
  */
private def init(): Unit =
    val (mdlMacros, mdlElems) =
        mdlResOpt match
            case Some(mdlSrc) =>
                // Always release the stream opened here, whether or not reading succeeds.
                // Closing an already closed stream is a no-op, so this is safe even if the reader closes it too.
                val src = scala.util.Using.resource(new BufferedInputStream(NCUtils.getStream(mdlSrc))) { is =>
                    NCSemanticSourceReader.read(is, scrType)
                }

                // Log the resource string itself rather than its `Option` wrapper.
                logger.trace(s"Loaded resource: $mdlSrc")

                (src.macros, src.elements)
            case None => (this.macros, this.elements)

    // Note: `toMap` keeps the last element when several elements share the same type.
    val elemsByType = mdlElems.map(e => e.getType -> e).toMap

    this.synsHolder = NCSemanticSynonymsProcessor.prepare(stemmer, parser, mdlMacros, mdlElems)
    this.elemsMap = elemsByType
/**
  * Logs a warning about a token property that was expected but is absent.
  *
  * @param name Name of the missing token property.
  */
private def warnMissedProperty(name: String): Unit =
    logger.warn(s"'$name' property not found. Is proper token enricher configured?")
/** @inheritdoc */
override def parse(req: NCRequest, cfg: NCModelConfig, toks: List[NCToken]): List[NCEntity] =
if toks.exists(_.get[String]("stopword").isEmpty) then warnMissedProperty("stopword")
val stems = toks.map(p => p -> stemmer.stem(p.getText.toLowerCase)).toMap
val stems4Lemms =
var ok = true
val seq =
for (t <- toks; lemmaOpt = t.get[String]("lemma") if ok)
yield
ok = lemmaOpt.isDefined
t -> lemmaOpt.orNull
if ok then
seq.toMap.map { (tok, lemma) => tok -> stemmer.stem(lemma.toLowerCase) }