in nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/impl/NCSemanticSynonymsProcessor.scala [66:199]
/** Returns `true` if any element of `iter` is `null` or blank (empty after whitespace stripping). */
private def hasNullOrEmpty(iter: Iterable[String]): Boolean = !iter.forall(s => s != null && s.strip.nonEmpty)
/**
  * Validates the given macros against the given semantic elements.
  * Fails on `null`/empty macro names or bodies and logs warnings for macros that are
  * never referenced or that contain suspicious characters.
  *
  * @param macros Macro name -> macro body map. Can be `null`.
  * @param elements Semantic elements whose synonyms may reference the macros. Cannot be `null`.
  */
private def checkMacros(macros: Map[String, String], elements: Seq[NCSemanticElement]): Unit =
    require(elements != null)

    if macros != null then
        if hasNullOrEmpty(macros.keySet) then E("Some macro names are null or empty.")
        if hasNullOrEmpty(macros.values) then E("Some macro bodies are null or empty.")

        // All texts where a macro reference could legally appear: element synonyms plus other macro bodies.
        val refs = elements.filter(_.getSynonyms != null).flatMap(_.getSynonyms) ++ macros.values

        macros.keys.filterNot(name => refs.exists(_.contains(name))).foreach(name =>
            logger.warn(s"Unused macro detected [macro=$name]")
        )

        def isSuspicious(s: String): Boolean = SUSP_SYNS_CHARS.exists(s.contains)

        // Ignore suspicious chars if regex is used in macro...
        macros.foreach((name, value) =>
            if isSuspicious(name) || (isSuspicious(value) && !value.contains("//")) then
                logger.warn(s"Suspicious macro definition (use of ${SUSP_SYNS_CHARS.map(s => s"'$s'").mkString(", ")} chars) [macro=$name]")
        )
/**
  * Validates the given synonyms set for one element type (and optionally one of its values).
  * Fails on `null`/empty synonyms and logs a warning when synonyms contain suspicious characters.
  *
  * @param syns Synonyms to validate. Can be `null`.
  * @param elemType Element type the synonyms belong to (used in diagnostics).
  * @param valueName Optional value name the synonyms belong to (used in diagnostics).
  */
private def checkSynonyms(syns: Set[String], elemType: String, valueName: Option[String] = None): Unit =
    // Diagnostics suffix, e.g. "[type=x, value=y]" or "[type=x]".
    def mkDesc: String = s"[type=$elemType${valueName.fold("")(v => s", value=$v")}]"

    if syns != null then
        if hasNullOrEmpty(syns) then E(s"Some synonyms are null or empty $mkDesc")

        // Synonyms containing the regex marker "//" may legitimately use special characters.
        def looksSuspicious(syn: String): Boolean = !syn.contains("//") && SUSP_SYNS_CHARS.exists(syn.contains)

        if syns.exists(looksSuspicious) then
            logger.warn(
                s"Suspicious synonyms detected (use of ${SUSP_SYNS_CHARS.map(s => s"'$s'").mkString(", ")} chars) $mkDesc"
            )
/**
  * Validates the given semantic elements: the sequence must be non-empty with no `null`
  * entries, element types must be unique and well-formed, and each element's synonyms
  * and values are validated in turn.
  *
  * @param elems Semantic elements to validate.
  */
private def checkElements(elems: Seq[NCSemanticElement]): Unit =
    if elems == null || elems.isEmpty then E("Elements cannot be null or empty.")
    if elems.contains(null) then E("Some elements are null.")

    // Duplicates. `HashSet.add` returns `false` when the type was already seen.
    val seen = mutable.HashSet.empty[String]
    elems.map(_.getType).foreach(typ =>
        if !seen.add(typ) then E(s"Duplicate element type [type=$typ]")
    )

    for e <- elems do
        val typ = e.getType

        if typ == null || typ.isEmpty then E(s"Some element types are not provided or empty.")
        else if !typ.matches(TYPE_REGEX) then E(s"Element type does not match regex [type=$typ, regex=$TYPE_REGEX]")
        else if typ.exists(_.isWhitespace) then E(s"Element type cannot have whitespaces [type=$typ]")

        checkSynonyms(e.getSynonyms, typ)

        val vals = e.getValues

        if vals != null then
            if hasNullOrEmpty(vals.keySet) then E(s"Some values names are null or empty [element=$typ]")
            for ((name, syns) <- vals) checkSynonyms(syns, typ, name.?)
/**
  * Converts raw synonym strings into lists of synonym chunks.
  *
  * Each synonym is macro-expanded first, then split on spaces: pieces wrapped in the
  * `REGEX_FIX` markers become compiled regex chunks (used as-is, never tokenized),
  * while the remaining text is tokenized and stemmed into text chunks.
  *
  * @param stemmer Stemmer applied (lower-cased) to each text token.
  * @param tokParser Token parser used to tokenize the non-regex parts of a synonym.
  * @param macroParser Macro parser used to expand macros inside synonyms.
  * @param elemType Element type, used in error messages only.
  * @param syns Raw synonym strings to convert.
  * @return One chunk list per (expanded, non-empty) synonym.
  */
private def convertSynonyms(
    stemmer: NCStemmer,
    tokParser: NCTokenParser,
    macroParser: NCMacroParser,
    elemType: String,
    syns: Set[String]
): List[List[NCSemanticSynonymChunk]] =
    // Holds one regex-marked piece of a synonym. `used` ensures the holder emits its
    // chunk only once even when several tokens fall inside the same character span.
    case class RegexHolder(text: String, var used: Boolean = false):
        // NOTE(review): despite the name, this removes `fix` from BOTH ends of `s`
        // (slice from `fix.length` up to `s.length - fix.length`).
        private def stripSuffix(fix: String, s: String): String = s.slice(fix.length, s.length - fix.length)

        // Compiles the pattern between the REGEX_FIX markers into a regex chunk.
        // Fails via E() on an empty or syntactically invalid pattern.
        def mkChunk(): NCSemanticSynonymChunk =
            val ptrn = stripSuffix(REGEX_FIX, text)

            if ptrn.nonEmpty then
                try NCSemanticSynonymChunk(REGEX, text, regex = Pattern.compile(ptrn))
                catch case e: PatternSyntaxException => E(s"Invalid regex synonym syntax detected [element=$elemType, chunk=$text]", e)
            else E(s"Empty regex synonym detected [element=$elemType]")

    // Character index -> regex holder covering that index. Rebuilt (cleared) for each synonym.
    val regexes = mutable.HashMap.empty[Int, RegexHolder]

    // Finds the holder (if any) covering any character position of the given token.
    def findRegex(t: NCToken): Option[RegexHolder] =
        if regexes.nonEmpty then (t.getStartCharIndex to t.getEndCharIndex).flatMap(regexes.get).to(LazyList).headOption
        else None

    // NOTE: `syns` is a Set, so synonyms that expand to identical strings are collapsed.
    syns.flatMap(macroParser.expand).
        map(syn => {
            // Drops redundant spaces without any warnings.
            val normSyn = syn.split(" ").map(_.strip).filter(_.nonEmpty)

            // Running character span [start, end] of the current piece within the normalized synonym.
            var start = 0
            var end = -1

            regexes.clear()

            // Saves regex chunks positions. Regex chunks can be found without tokenizer, just split by spaces.
            for (ch <- normSyn)
                start = end + 1
                end = start + ch.length

                if ch.startsWith(REGEX_FIX) && ch.endsWith(REGEX_FIX) then
                    val r = RegexHolder(ch)
                    // Register the same holder for every character index the piece covers.
                    (start to end).foreach(regexes += _ -> r)

            // Tokenizes synonym without regex chunks. Regex chunks are used as is, without tokenization.
            tokParser.tokenize(normSyn.mkString(" ")).flatMap(tok =>
                findRegex(tok) match
                    case Some(regex) =>
                        // Emit the regex chunk only for the first token inside its span.
                        if regex.used then None
                        else
                            regex.used = true
                            regex.mkChunk().?
                    case None => NCSemanticSynonymChunk(TEXT, tok.getText, stemmer.stem(tok.getText.toLowerCase)).?
            ).toList
        // Drops synonyms that produced no chunks at all.
        }).toList.filter(_.nonEmpty)