in backend/app/services/index/ElasticsearchResources.scala [114:220]
/**
  * Indexes the blob identified by `uri`, or merges this ingestion into its existing document.
  *
  * Sends a single update-by-id request (document id = `uri.value`) against `indexName`, carrying
  * both a Painless script and an upsert document. Per the Elasticsearch update API, the upsert
  * document is used when no document with that id exists yet; otherwise the script runs against
  * the existing `_source`.
  *
  * What the script merges into an existing document:
  *  - mime types that are not already listed;
  *  - file URIs that do not already appear as a value of any existing file URI entry, each added
  *    as a map from every language key to the URI;
  *  - the collection and the ingestion, if absent;
  *  - parent blobs that are not already listed (the field is created first if it is null);
  *  - a workspace entry, appended when workspace data is present. Unlike the fields above, no
  *    duplicate check is made for workspace entries.
  *
  * The script does not touch the flags, the extraction markers, the file size, `createdAt` or
  * `lastModifiedAt`; those are only present in the upsert document.
  *
  * @param uri           the blob URI; its `value` is the document id
  * @param fileSize      size of the file, stored under the metadata field of the upsert document
  * @param ingestionData source of the file URIs, mime types, parent blobs, ingestion name,
  *                      optional workspace details and optional created/modified timestamps
  * @param languages     languages whose keys are used to store each file URI once per language
  * @return the result of `executeUpdate` for the request
  */
override def ingestDocument(uri: Uri, fileSize: Long, ingestionData: IngestionData, languages: List[Language]): Attempt[Unit] = {
  val fileUris = ingestionData.uris.map(_.value)
  val mimeTypes = ingestionData.mimeTypes.map(_.mimeType)
  val parentBlobs = ingestionData.parentBlobs.map(_.value)

  logger.info(s"Indexing ${uri.value} with types: ${mimeTypes.mkString(", ")} and file URIs: '${fileUris.mkString(", ")}'")

  // The collection is the first path segment of the ingestion. `split` only returns an empty
  // array for slash-only input, so "unknown" is a rarely-hit fallback (an empty ingestion yields "").
  val collection = ingestionData.ingestion.split("/").headOption.getOrElse("unknown")

  // Script parameters pass through several APIs that accept plain 'Object': from the Scala
  // collection, via the Elasticsearch client, to a collection inside the Painless script.
  // The script calls removeIf/addAll on them, so they are handed over as Java lists.
  val javaFileUris = fileUris.toList.asJava
  val javaMimeTypes = mimeTypes.toList.asJava
  val javaParentBlobs = parentBlobs.toList.asJava

  // Document used when nothing is indexed under this id yet.
  val defaultFields = Map(
    IndexFields.flags -> Flags.unseen,
    IndexFields.`type` -> "blob",
    IndexFields.extracted -> false,
    IndexFields.ocrExtracted -> false,
    IndexFields.transcriptExtracted -> false,
    IndexFields.collection -> Set(collection),
    IndexFields.ingestion -> Set(ingestionData.ingestion),
    IndexFields.parentBlobs -> parentBlobs,
    IndexFields.metadataField -> Map(
      IndexFields.metadata.mimeTypes -> mimeTypes,
      IndexFields.metadata.fileUris -> fileUris.map(multiLanguageValue(languages, _)),
      IndexFields.metadata.fileSize -> Long.box(fileSize)
    )
  ) ++ getWorkspaceFields(ingestionData.workspace)

  // Timestamps are optional; `++` with an Option adds the entry only when it is defined.
  val createdAtField = ingestionData.createdAt.map(IndexFields.createdAt -> _)
  val lastModifiedAtField = ingestionData.lastModifiedAt.map(IndexFields.lastModifiedAt -> _)
  val upsertFields = defaultFields ++ createdAtField ++ lastModifiedAtField

  executeUpdate {
    updateById(indexName, uri.value)
      .script {
        Script(
          // The script is collapsed onto a single line before sending: the regex below matches an
          // optional CR followed by LF and replaces it with nothing.
          s"""
             |params.mimeTypes.removeIf(mime -> ctx._source.metadata.${IndexFields.metadata.mimeTypes}.contains(mime));
             |ctx._source.metadata.${IndexFields.metadata.mimeTypes}.addAll(params.mimeTypes);
             |
             |params.fileUris
             |  .removeIf(fileUri ->
             |    ctx._source.metadata.${IndexFields.metadata.fileUris}
             |      .stream()
             |      .anyMatch(v -> v.values().contains(fileUri))
             |  );
             |
             |for(uri in params.fileUris) {
             |  def fileUri = [:];
             |
             |  for(language in params.languages) {
             |    fileUri[language] = uri;
             |  }
             |
             |  ctx._source.metadata.${IndexFields.metadata.fileUris}.add(fileUri);
             |}
             |
             |if(!ctx._source.${IndexFields.collection}.contains(params.collection)) {
             |  ctx._source.${IndexFields.collection}.add(params.collection);
             |}
             |
             |if(!ctx._source.${IndexFields.ingestion}.contains(params.ingestion)) {
             |  ctx._source.${IndexFields.ingestion}.add(params.ingestion);
             |}
             |
             |if(ctx._source.${IndexFields.parentBlobs} == null) {
             |  ctx._source.${IndexFields.parentBlobs} = [];
             |}
             |
             |params.parentBlobs.removeIf(uri -> ctx._source.${IndexFields.parentBlobs}.contains(uri));
             |ctx._source.${IndexFields.parentBlobs}.addAll(params.parentBlobs);
             |
             |if(params.workspaceBlobUri != null && params.workspaceId != null && params.workspaceNodeId != null) {
             |  if(ctx._source.${IndexFields.workspacesField} == null) {
             |    ctx._source.${IndexFields.workspacesField} = [[
             |      "${IndexFields.workspaces.uri}": params.workspaceBlobUri,
             |      "${IndexFields.workspaces.workspaceId}": params.workspaceId,
             |      "${IndexFields.workspaces.workspaceNodeId}": params.workspaceNodeId
             |    ]];
             |  } else {
             |    ctx._source.${IndexFields.workspacesField}.add([
             |      "${IndexFields.workspaces.uri}": params.workspaceBlobUri,
             |      "${IndexFields.workspaces.workspaceId}": params.workspaceId,
             |      "${IndexFields.workspaces.workspaceNodeId}": params.workspaceNodeId
             |    ]);
             |  }
             |}
          """.stripMargin.replaceAll("\\\r?\\\n", "").trim()).params(Map(
            "mimeTypes" -> javaMimeTypes,
            "fileUris" -> javaFileUris,
            "collection" -> collection,
            "ingestion" -> ingestionData.ingestion,
            "parentBlobs" -> javaParentBlobs,
            // Absent workspace data is passed as null; the script checks for null before using it.
            "workspaceBlobUri" -> ingestionData.workspace.map(_.blobAddedToWorkspace).orNull,
            "workspaceId" -> ingestionData.workspace.map(_.workspaceId).orNull,
            "workspaceNodeId" -> ingestionData.workspace.map(_.workspaceNodeId).orNull,
            "languages" -> languages.map(_.key)
          )
        ).lang("painless")
      }.upsert(upsertFields)
  }
}