sample-configs/customocr/tika-config-rendered.xml (13 lines of code) (raw):

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  ~ Licensed to the Apache Software Foundation (ASF) under one or more
  ~ contributor license agreements.  See the NOTICE file distributed with
  ~ this work for additional information regarding copyright ownership.
  ~ The ASF licenses this file to You under the Apache License, Version 2.0
  ~ (the "License"); you may not use this file except in compliance with
  ~ the License.  You may obtain a copy of the License at
  ~
  ~     http://www.apache.org/licenses/LICENSE-2.0
  ~
  ~ Unless required by applicable law or agreed to in writing, software
  ~ distributed under the License is distributed on an "AS IS" BASIS,
  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ~ See the License for the specific language governing permissions and
  ~ limitations under the License.
  -->
<!--
  Tika configuration that runs OCR over rendered PDF pages.
  Two parsers are declared: the Tesseract OCR parser, and the PDF parser
  configured to hand its pages to OCR instead of extracting embedded text.
-->
<properties>
  <parsers>

    <!--
      Register TesseractOCRParser explicitly.
      DefaultParser could be used here instead if the other parsers
      should be loaded as well.
    -->
    <parser class="org.apache.tika.parser.ocr.TesseractOCRParser"/>

    <!-- PDF parser set up to OCR the rendered pages. -->
    <parser class="org.apache.tika.parser.pdf.PDFParser">
      <params>

        <!--
          ocrStrategy accepts one of:
            no_ocr        extract text only
            ocr_only      do not extract text; just attempt OCR
            ocr_and_text  extract text and attempt OCR (from Tika 1.24)
            auto          extract text, but try OCR if fewer than
                          10 characters are found
          This configuration selects ocr_only.
        -->
        <param name="ocrStrategy" type="string">ocr_only</param>

        <!-- Image type and DPI settings that accompany the OCR strategy above. -->
        <param name="ocrImageType" type="string">rgb</param>
        <param name="ocrDPI" type="int">100</param>

      </params>
    </parser>

  </parsers>
</properties>