nginx/lua/tika-response-body.lua (55 lines of code) (raw):
--[[
Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
or more contributor license agreements. Licensed under the Elastic License 2.0;
you may not use this file except in compliance with the Elastic License 2.0.
]]
-- Body filter for Tika responses. This script reads ngx.arg[1] (the current
-- body chunk) and ngx.arg[2] (the end-of-body flag), so it is invoked once per
-- chunk. Chunks are accumulated in ngx.ctx.buffered and withheld from the
-- output; on the last chunk the whole upstream body is replaced by a single
-- JSON document of the form:
--   { _meta = {...}, extracted_text = "..." }            on success
--   { _meta = {...}, error = "...", message = "..." }    on failure
local chunk, eof = ngx.arg[1], ngx.arg[2]

local buffered = ngx.ctx.buffered
if not buffered then
  buffered = {}
  ngx.ctx.buffered = buffered
end

if chunk ~= "" then
  buffered[#buffered + 1] = chunk
  -- Emit nothing for this chunk; the rewritten body is sent at eof.
  ngx.arg[1] = nil
end

if eof then
  local whole = table.concat(buffered)
  ngx.ctx.buffered = nil

  local cjson = require "cjson"
  local response = {
    _meta = {
      ["X-ELASTIC:service"] = "tika"
    }
  }

  if ngx.status == 200 then
    -- cjson.decode raises on malformed input; guard it so a truncated or
    -- non-JSON upstream body yields an error envelope instead of a Lua error.
    local ok, body = pcall(cjson.decode, whole)

    if not ok or type(body) ~= "table" then
      response["error"] = "Unexpected Extraction Failure"
      response["message"] = "Tikaserver could not extract the file content."
      ngx.log(ngx.STDERR, response["error"])
    elseif not body["X-TIKA:content"] then
      for k, v in pairs(body) do
        -- Plain search (4th argument true): "-" is a pattern quantifier in
        -- Lua, so the key name must not be interpreted as a pattern.
        if type(k) == "string" and string.find(k, "X-TIKA:EXCEPTION", 1, true) then
          -- The value may decode to something other than a string (e.g. a
          -- JSON array); use its first element and coerce to text.
          if type(v) == "table" then
            v = v[1]
          end
          local text = ""
          if v ~= nil then
            text = tostring(v)
          end

          local i = string.find(text, "\n", 1, true)
          local message
          if not i then
            message = text
          else
            -- Tika errors are often massive Java stack traces.
            -- We can see these in full in the tikaserver.log so only send first line back.
            message = string.sub(text, 1, i - 1)
          end

          response["error"] = "Content Extraction Error"
          response["message"] = k .. " - " .. message
          ngx.log(ngx.STDERR, response["error"])
          break
        end
      end

      if not response["error"] then
        -- if no exceptions are returned, content was extracted but it was likely a blank document
        response["extracted_text"] = ""
      end
    else
      response["extracted_text"] = body["X-TIKA:content"]
      response["_meta"]["X-ELASTIC:TIKA:parsed_by"] = body["X-TIKA:Parsed-By"]
    end
  elseif ngx.status == 422 then
    response["error"] = "Unprocessable Entity"
    response["message"] = "Tikaserver could not process file. File may be corrupt or encrypted."
    ngx.log(ngx.STDERR, response["error"])
  else
    response["error"] = "Unexpected Extraction Failure"
    response["message"] = "Tikaserver could not extract the file content."
    ngx.log(ngx.STDERR, response["error"])
  end

  -- Replace the final chunk with the rewritten JSON body.
  ngx.arg[1] = cjson.encode(response)
end