override def ingestDocument()

in backend/app/services/index/ElasticsearchResources.scala [114:220]


  /**
   * Indexes a blob under the id `uri.value` in `indexName` via a scripted update with an upsert document.
   *
   * Two payloads are sent in a single update request:
   *  - the upsert document (`upsertFields`): default flags, the collection / ingestion / parent blob sets,
   *    the metadata object (mime types, per-language file URIs, file size), any workspace fields from
   *    `getWorkspaceFields`, and the optional created / last-modified timestamps;
   *  - a painless script that merges the same information into an already-indexed document: it adds mime
   *    types and file URIs that are not yet present, adds the collection and ingestion if missing,
   *    creates and extends the parent blob list, and appends a workspace entry when all three workspace
   *    params are non-null.
   *
   * @param uri           identifier of the blob; `uri.value` is used as the document id
   * @param fileSize      stored (boxed) in the metadata object of the upsert document
   * @param ingestionData source of the file URIs, mime types, parent blobs, ingestion name, optional
   *                      workspace details and optional timestamps
   * @param languages     languages used to build the per-language file URI entries
   * @return the result of `executeUpdate` for the update request
   */
  override def ingestDocument(uri: Uri, fileSize: Long, ingestionData: IngestionData, languages: List[Language]): Attempt[Unit] = {
    val fileUris = ingestionData.uris.map(_.value)
    val mimeTypes = ingestionData.mimeTypes.map(_.mimeType)
    val parentBlobs = ingestionData.parentBlobs.map(_.value)

    logger.info(s"Indexing ${uri.value} with types: ${mimeTypes.mkString(", ")} and file URIs: '${fileUris.mkString(", ")}'")

    // The collection is the first path segment of the ingestion name.
    val collection = ingestionData.ingestion.split("/").headOption.getOrElse("unknown")

    // We're playing a weird game with several functions which take 'Object'.
    // From the scala Set, via the elasticsearch transport client and finally to an elasticsearch "painless" collection
    val javaFileUris = fileUris.toList.asJava
    val javaMimeTypes = mimeTypes.toList.asJava
    val javaParentBlobs = parentBlobs.toList.asJava

    val defaultFields = Map(
      IndexFields.flags -> Flags.unseen,
      IndexFields.`type` -> "blob",
      IndexFields.extracted -> false,
      IndexFields.ocrExtracted -> false,
      IndexFields.transcriptExtracted -> false,
      IndexFields.collection -> Set(collection),
      IndexFields.ingestion -> Set(ingestionData.ingestion),
      IndexFields.parentBlobs -> parentBlobs,
      IndexFields.metadataField -> Map(
        IndexFields.metadata.mimeTypes -> mimeTypes,
        IndexFields.metadata.fileUris -> fileUris.map(multiLanguageValue(languages, _)),
        IndexFields.metadata.fileSize -> Long.box(fileSize)
      )
    ) ++ getWorkspaceFields(ingestionData.workspace)

    val createdAtField = ingestionData.createdAt.map(IndexFields.createdAt -> _)
    val lastModifiedAtField = ingestionData.lastModifiedAt.map(IndexFields.lastModifiedAt -> _)

    val upsertFields = defaultFields ++ createdAtField ++ lastModifiedAtField

    // Painless source that merges this ingestion into an existing document.
    // The metadata object is addressed through IndexFields.metadataField so the script reads and writes
    // the same field that the upsert document above creates.
    // Line breaks are stripped before sending, so the script must not contain `//` line comments.
    val mergeScript =
      s"""
         |params.mimeTypes.removeIf(mime -> ctx._source.${IndexFields.metadataField}.${IndexFields.metadata.mimeTypes}.contains(mime));
         |ctx._source.${IndexFields.metadataField}.${IndexFields.metadata.mimeTypes}.addAll(params.mimeTypes);
         |
         |params.fileUris
         |  .removeIf(fileUri ->
         |    ctx._source.${IndexFields.metadataField}.${IndexFields.metadata.fileUris}
         |      .stream()
         |      .anyMatch(v -> v.values().contains(fileUri))
         |    );
         |
         |for(uri in params.fileUris) {
         |  def fileUri = [:];
         |
         |  for(language in params.languages) {
         |    fileUri[language] = uri;
         |  }
         |
         |  ctx._source.${IndexFields.metadataField}.${IndexFields.metadata.fileUris}.add(fileUri);
         |}
         |
         |if(!ctx._source.${IndexFields.collection}.contains(params.collection)) {
         |  ctx._source.${IndexFields.collection}.add(params.collection);
         |}
         |
         |if(!ctx._source.${IndexFields.ingestion}.contains(params.ingestion)) {
         |  ctx._source.${IndexFields.ingestion}.add(params.ingestion);
         |}
         |
         |if(ctx._source.${IndexFields.parentBlobs} == null) {
         |  ctx._source.${IndexFields.parentBlobs} = [];
         |}
         |
         |params.parentBlobs.removeIf(uri -> ctx._source.${IndexFields.parentBlobs}.contains(uri));
         |ctx._source.${IndexFields.parentBlobs}.addAll(params.parentBlobs);
         |
         |if(params.workspaceBlobUri != null && params.workspaceId != null && params.workspaceNodeId != null) {
         |  if(ctx._source.${IndexFields.workspacesField} == null) {
         |    ctx._source.${IndexFields.workspacesField} = [[
         |      "${IndexFields.workspaces.uri}": params.workspaceBlobUri,
         |      "${IndexFields.workspaces.workspaceId}": params.workspaceId,
         |      "${IndexFields.workspaces.workspaceNodeId}": params.workspaceNodeId
         |    ]];
         |  } else {
         |    ctx._source.${IndexFields.workspacesField}.add([
         |      "${IndexFields.workspaces.uri}": params.workspaceBlobUri,
         |      "${IndexFields.workspaces.workspaceId}": params.workspaceId,
         |      "${IndexFields.workspaces.workspaceNodeId}": params.workspaceNodeId
         |    ]);
         |  }
         |}
         |""".stripMargin.replaceAll("\\r?\\n", "").trim()

    executeUpdate {
      updateById(indexName, uri.value)
        .script {
          Script(mergeScript).params(Map(
            "mimeTypes" -> javaMimeTypes,
            "fileUris" -> javaFileUris,
            "collection" -> collection,
            "ingestion" -> ingestionData.ingestion,
            "parentBlobs" -> javaParentBlobs,
            // Absent workspace details are passed as null; the script checks all three before using them.
            "workspaceBlobUri" -> ingestionData.workspace.map(_.blobAddedToWorkspace).orNull,
            "workspaceId" -> ingestionData.workspace.map(_.workspaceId).orNull,
            "workspaceNodeId" -> ingestionData.workspace.map(_.workspaceNodeId).orNull,
            "languages" -> languages.map(_.key)
          )
          ).lang("painless")
        }.upsert(upsertFields)
    }
  }