tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java (14 lines): - line 59: * TODO: we should cutover to a "real" tokenizer (eg JFlex); - line 236: // TODO: in theory these other charsets are simple - line 326: //TODO: - line 539: // TODO: log a warning here, somehow? - line 852: // TODO: afN? (associated font number) - line 854: // TODO: do these alter text output...? - line 873: // TODO: inefficient that we check equals N times; - line 984: // TODO: log a warning? Throw an exc? - line 1123: // TODO: we can also parse \creatim, \revtim, - line 1255: // TODO: we should produce a table output here? - line 1275: // TODO: create img tag? but can that support - line 1488: // TODO: log a warning? - line 1527: // TODO: what other instructions can be in a - line 1543: // TODO: we could process the other known field tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/SlowCompositeReaderWrapper.java (13 lines): - line 77: // TODO: consider ConcurrentHashMap ? - line 78: // TODO: this could really be a weak map somewhere else on the coreCacheKey, - line 145: // TODO: this is trappy as the expectation is that core keys live for a long - line 175: return MultiDocValues.getNumericValues(in, field); // TODO cache? - line 181: return MultiDocValues.getBinaryValues(in, field); // TODO cache? - line 187: return MultiDocValues.getSortedNumericValues(in, field); // TODO cache? - line 291: return MultiDocValues.getNormValues(in, field); // TODO cache? - line 296: //TODO figure out how to implement this... if needed - line 302: //TODO figure out how to implement this... if needed - line 308: //TODO figure out how to implement this... if needed - line 313: //TODO figure out how to implement this... if needed - line 357: return MultiBits.getLiveDocs(in); // TODO cache? - line 373: // TODO: as this is a wrapper, should we really close the delegate? tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java (13 lines): - line 77: COMMON_TAG_MAP.put("td", new HtmlTag("td"));//TODO -- convert to th if in thead? - line 199: //TODO: is there a better way of getting these/doing the mapping? - line 239: //TODO: LOG! piece of text that wasn't referenced in the marked content - line 254: //TODO: figure out when we're crossing page boundaries during the recursion - line 279: //TODO should be merged with COSDictionary segment below? - line 316: //TODO: currently suppressing span and lbody... - line 355: //TODO: log can't find mcid - line 358: //TODO: check for other types of dictionary? - line 375: //TODO: handle a different object? - line 426: //TODO: at some point also handle - line 434: //TODO: sort text positions? Figure out when to add/remove a newline and/or space? - line 443: TODO: do we want to do anything with these? - line 444: TODO: Are there other types of objects we need to handle here? tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java (10 lines): - line 102: //TODO: parameterize this ? - line 105: //TODO: this is an embarrassment of an initializer...fix - line 174: //TODO: log or otherwise report - line 233: //TODO: is there a better way to do this rather than reprocessing the page - line 273: //TODO: handle image metadata: xobject.getMetadata() - line 286: //TODO: should we use the hash of the PDImage to check for seen - line 405: //TODO -- should we look for image rotation metadata in the PDImage or elsewhere? - line 449: //TODO: what else can we extract from the PDImage without rendering? - line 482: //TODO: determine if we need to add more image types - line 496: //TODO -- is there a cleaner way of checking for: tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java (9 lines): - line 111: // TODO Service Loader section - line 128: // TODO Implement the reverse of ExecutorServiceXmlLoader - line 129: // TODO Make it possible to detect if we have the default executor - line 130: // TODO Make it possible to get the current values from ConfigurableThreadPoolExecutor - line 298: // TODO Parser Exclusions - line 346: //TODO -- check code base for setters with lowercase initial letters?! - line 366: //TODO -- check code base for setX() zero parameters that set boolean to true - line 391: //TODO -- remove nonprimitive setters/getters that have a string equivalent - line 566: //TODO -- if both string and integer, which one do we pick? tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java (9 lines): - line 165: * TODO Make this thread-safe - line 455: // TODO: other metadata? - line 579: //TODO: we need to prevent this if only a portion of the page or portions - line 581: //TODO: we should also figure out how to not reuse the rendering if - line 673: //TODO -- get suffix based on OcrImageType - line 676: //TODO: get output format from TesseractConfig - line 744: // TODO: remove once PDFBOX-1143 is fixed: - line 763: // TODO: maybe also annotationMarkup.getRichContents()? - line 1077: //TODO figure out better way of managing this tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java (8 lines): - line 47: * TODO: move this into POI? - line 119: //in run or in field. TODO: convert this to an integer because you can have a run within a run - line 184: //TODO: checkBox, textBox, sym, headerReference, footerReference, commentRangeEnd - line 221: } else if (B.equals(localName)) { //TODO: add bCs - line 230: } else if (I.equals(localName)) { //TODO: add iCs - line 293: } //TODO: add sdt, sdtPr, sdtContent goes here statistically - line 313: //TODO: clean this up and ...want to get ProgID? - line 406: //a run...TODO: should we swallow whitespace that doesn't occur in a run? tika-core/src/main/java/org/apache/tika/language/detect/LanguageNames.java (8 lines): - line 27: * TODO change to LanguageTag, and use these vs. strings everywhere in the - line 46: // TODO make it so. - line 52: // TODO make it so - line 65: // TODO make it so - line 73: // TODO Fill in script if missing and something we could derive from lang+region - line 76: // TODO Treat missing script == present script, if present script is default - line 79: // TODO probably OK to ignore extensions - line 81: // TODO Do we want/need a fuzzy match for region (and script) tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (7 lines): - line 299: //TODO -- can we use the PDFBox parser's RandomAccessRead - line 312: //TODO figure out better way of managing this - line 361: //TODO -- consider parsing the metadata - line 408: //TODO: test that this is not AUTO with no OCR parser installed - line 516: //TODO: are there other checks we need to perform? - line 649: //TODO: Let's try to move this into PDFBox. - line 1123: //TODO -- figure out how to deserialize this in TikaConfigSerializer tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java (6 lines): - line 49: @SuppressWarnings("unused") int singleByteCharCount = 0; //TODO Do we really need this? - line 185: // TODO: This set of data comes from the character frequency- - line 243: // TODO: This set of data comes from the character frequency- - line 376: // TODO: This set of data comes from the character frequency- - line 411: // TODO: This set of data comes from the character frequency- - line 447: // TODO: This set of data comes from the character frequency- tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (5 lines): - line 3210: - line 3236: - line 4871: TODO: add detection for wb2 and wb1 --> - line 5659: - line 7418: tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java (5 lines): - line 219: // TODO Fetch the array values and output - line 221: // TODO Fetch the vector values and output - line 223: // TODO Decode, if possible - line 226: // TODO Decode, if possible - line 228: // TODO Decode, if possible tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePtr.java (5 lines): - line 343: // TODO check that next is OK - line 544: // FIXME: deal with ObjectDataEncryptionKey - line 648: // TODO - call postprocessObjectDeclarationContents on this object? - line 766: // TODO - the expected header is different per version of one note. - line 788: // TODO - the expected footer is per version of one note. tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSPageContentHandler.java (5 lines): - line 49: * TODO: integrate table markup - line 365: // TODO: Implement splitting for RTL too - line 479: // TODO: This could be optimised using a binary search since we know rows is sorted - line 500: //TODO: use name in conjunction with Frag information - line 573: // TODO: Parse other elements of GlyphIndex tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/mutool/MuPDFRenderer.java (5 lines): - line 86: //TODO -- this assumes files have been deleted first - line 94: //TODO -- run mutool pages to get page sizes - line 102: //TODO: parameterize timeout - line 107: //TODO -- fix this - line 126: //TODO parameterize all the things; mutool path, colorspace and size and format and... tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/CSVSniffer.java (5 lines): - line 86: //TODO: take into consideration the filename. Perhaps require - line 168: //TODO: test to make sure cell doesn't start with escaped - line 345: //TODO -- do some analysis to make sure you don't have - line 355: //TODO -- add tests for long tokens containing - line 397: //TODO: convert this to continuous vs vague heuristic step function tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java (5 lines): - line 145: // TODO: 0x20 might not be a space in all character sets... - line 167: // TODO: Is this OK? The buffer could have ended in the middle of a word... - line 176: // TODO - This is a bit of a hack to take care of a case - line 290: // TODO: 0x20 might not be a space in all character sets... - line 301: // TODO: 0x20 might not be a space in all character sets... tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java (4 lines): - line 71: //TODO: what else - line 231: //TODO: consider: getPackage().getPartsByName(Pattern.compile("/ppt/embeddings/.*? - line 232: //TODO: consider: getPackage().getPartsByName(Pattern.compile("/ppt/media/.*? - line 387: //TODO: require that we're in ? tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java (4 lines): - line 176: //TODO -- if it was detected as a non-csv subtype of text - line 216: //TODO -- figure out how to improve this - line 222: //TODO -- consider dumping what's left in the reader as text - line 332: //TODO: log bad/unrecognized delimiter string tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java (4 lines): - line 87: //TODO: record warnings in metadata: warcreader.onWarning(); - line 115: //TODO - other warc record types - line 130: //TODO handle missing payload? Report or ignore? - line 147: //TODO check Content-Encoding on the warcResponse.http.headers and wrap the stream. tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java (4 lines): - line 170: // TODO: enable this, but some parsers currently - line 198: // TODO: enable this, but some parsers currently - line 210: // TODO: we could strengthen this to do full - line 242: // TODO: enable this, but some parsers currently tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/fsshttpb/unsigned/UInteger.java (4 lines): - line 137: // FIXME: should we log this somewhere? - line 145: // FIXME: should we log this somewhere? - line 154: // FIXME: should we log this somewhere? - line 163: // FIXME: should we log this somewhere? tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerConfig.java (3 lines): - line 86: TODO: integrate these settings: - line 109: //TODO: make parameterizable for debugging - line 580: //TODO fix this tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (3 lines): - line 166: //TODO: figure out if we can send in the PDPage in the TikaInputStream - line 178: //TODO: add markup here? - line 197: //TODO: modernize to ImageStratey != rawImages tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteLegacyDumpStrings.java (3 lines): - line 41: // TODO - parameterize this - line 43: // TODO - parameterize this - line 45: // TODO - parameterize this tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java (3 lines): - line 83: // TODO Can we find more general properties for these / move - line 105: // TODO Find keys to record the format and the type - line 135: //TODO: initialize this on the first row and then apply tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/dbf/DBFCell.java (3 lines): - line 63: //TODO: find examples of other cell types for testing - line 127: //TODO: add heuristic for deciding; - line 128: //TODO: find example of file with time != 0 tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java (3 lines): - line 326: //TODO: implement this - line 582: //TODO: especially clean this up. - line 763: //TODO allowed named configurations in tika config tika-detectors/tika-detector-magika/src/main/java/org/apache/tika/detect/magika/MagikaDetector.java (3 lines): - line 80: //TODO -- grab errors and warnings - line 232: //TODO -- should we get values in "dl" instead or in addition? - line 257: //TODO -- should we get values in "dl" instead or in addition? tika-serialization/src/main/java/org/apache/tika/serialization/TikaJsonDeserializer.java (3 lines): - line 214: //TODO this should try to match the map setters with the data types - line 243: //TODO -- maybe check for string? - line 394: //TODO -- check for over/underflow tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java (3 lines): - line 110: // TODO warning - line 114: // TODO warning - line 118: String investigation = investigationList[0]; // TODO add to metadata? tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/PSDParser.java (3 lines): - line 100: // TODO Identify a suitable metadata key for this - line 141: // TODO Parse the EXIF info via ImageMetadataExtractor - line 143: // TODO Parse the EXIF info via ImageMetadataExtractor tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/main/java/org/apache/tika/parser/xmp/JempboxExtractor.java (3 lines): - line 47: //TODO: change signature to require parsecontext from parse - line 93: // TODO should we set SUBJECT too? - line 153: //TODO: not yet supported by XMPBox...extract OriginalDocumentID tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java (3 lines): - line 981: //TODO -- figure out if this is actual BEST or whatever - line 1000: //TODO: add AUTO? - line 1052: //TODO: add LOGICAL_IMAGES tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java (3 lines): - line 77: //TODO -- specific handling for other multipart subtypes? mixed, parallel, digest - line 454: // TODO Auto-generated method stub - line 569: //TODO -- is this the right definition in rfc822 for rich text?! tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java (3 lines): - line 205: //TODO -- clean this up -- only load as necessary - line 214: //TODO -- figure out how to turn this back on - line 309: //TODO figure out which ones to turn off tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java (3 lines): - line 103: // TODO Only offer those in common to several/all parser - line 104: // TODO Some sort of specialisation / subtype support - line 250: * TODO Do we need to return all the ContentHandler instances we created? tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java (2 lines): - line 309: //TODO check for targetMode=INTERNAL? - line 487: //TODO -- should we record the literal name of the embedded file? tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java (2 lines): - line 439: // TODO: identify bullet/list type - line 576: //TODO -- inject progId into the metadata of the embedded file tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-news-module/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java (2 lines): - line 105: // TODO: put body content here - line 294: // TODO: this only pulls back 8K of data on a read, regardless of buffer size tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java (2 lines): - line 316: // TODO: replace w/ XPath/XQuery: - line 343: //TODO: get ProgID, while we're here? tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java (2 lines): - line 136: //TODO: add in character after number - line 164: //not yet handled by NumberFormatter...TODO: add to NumberFormatter? tika-parsers/tika-parsers-ml/tika-parser-advancedmedia-module/src/main/java/org/apache/tika/parser/captioning/tf/TensorflowRESTCaptioner.java (2 lines): - line 128: //TODO -- what do we want to check? - line 141: //TODO: convert this to stream, this might cause OOM issue tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java (2 lines): - line 55: //TODO: move this into POI? - line 165: //TODO: XWPFStyles styles = loadStyles(documentPart); tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/java/org/apache/tika/pipes/fetcher/gcs/GCSFetcher.java (2 lines): - line 124: //TODO: parameterize extracting other blob metadata, eg. md5, crc, etc. - line 135: //TODO -- add other params to the builder as needed tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/wordperfect/WP6Charsets.java (2 lines): - line 313: { //TODO implement Current Font Symbols - line 387: //TODO map multi-characters tika-pipes/tika-pipes-iterators/tika-pipes-iterator-az-blob/src/main/java/org/apache/tika/pipes/pipesiterator/azblob/AZBlobPipesIterator.java (2 lines): - line 124: //TODO -- extract metadata about content length etc from properties - line 137: //TODO -- allow authentication via other methods tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/BPGParser.java (2 lines): - line 86: // TODO Identify a suitable metadata key for this - line 90: // TODO Identify a suitable metadata key for this+hasAlphaPlane2 tika-parsers/tika-parsers-ml/tika-parser-advancedmedia-module/src/main/java/org/apache/tika/parser/recognition/tf/TensorflowRESTRecogniser.java (2 lines): - line 129: //TODO -- what do we want to check? - line 142: //TODO: convert this to stream, this might cause OOM issue tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/updates/StartXRefScanner.java (2 lines): - line 81: //TODO: if we're opening a new file for the source - line 120: //TODO -- make this more robust tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/audio/MidiParser.java (2 lines): - line 77: // TODO: Use XMPDM.TRACKS? - line 103: // FIXME: What's the encoding? tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java (2 lines): - line 112: //TODO -- figure out how to get IOExceptions out of boxhandler. Mp4Reader - line 234: //TODO Replace this with a 2dp Duration Property Converter tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (2 lines): - line 480: //TODO -- would be good to find an example triggering file and - line 734: // TODO: do we need to replicate this in Tika? If we wind up tika-example/src/main/java/org/apache/tika/example/PickBestTextEncodingParser.java (2 lines): - line 54: * TODO Implement proper "Junk" detection - line 106: // TODO Do this in a more generic, less english-only way! tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java (2 lines): - line 137: // TODO Add support for the remaining CompressorInputStream formats: - line 154: // TODO Add unit tests for this format tika-batch/src/main/java/org/apache/tika/batch/BatchProcess.java (2 lines): - line 223: //TODO: figure out safe way to shutdown resource crawler - line 521: //TODO: get rid of this? tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java (2 lines): - line 274: //TODO: figure out if the equivalent of OLE 1.0's - line 330: //TODO: modify getEntry to case insensitive when available in POI tika-pipes/tika-emitters/tika-emitter-az-blob/src/main/java/org/apache/tika/pipes/emitter/azblob/AZBlobEmitter.java (2 lines): - line 89: //TODO: estimate size of metadata list. Above a certain size, - line 250: //TODO -- allow authentication via other methods tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteFlipper.java (2 lines): - line 34: //TODO add something about protecting first x bytes? - line 45: //TODO -- don't load the full thing into memory tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java (2 lines): - line 194: //TODO: add check for targetmode=external into POI - line 268: //TODO: Need to wait for fix in POI to test for hyperlink first tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DeprecatedZipContainerDetector.java (2 lines): - line 19: //see TODO then delete this class - line 31: TODO: move this to the apple module tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/EvilCOSWriter.java (2 lines): - line 93: //TODO PDFBOX30 replace COSWriterXRefEntry with XReferenceEntry (and much more) - line 636: //TODO: parameterize wonkifying length and filters tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/main/java/org/apache/tika/parser/xmp/XMPPacketScanner.java (2 lines): - line 101: //TODO Inspect "begin" attribute! - line 105: //TODO Do with TeeInputStream when Commons IO 1.4 is available tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java (2 lines): - line 237: //TODO: shouldn't be fileA!!!! - line 396: //TODO: could make this more robust tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java (2 lines): - line 271: //TODO: consider getting parsing "Format" field from - line 323: //TODO: find test file that has this kind of attachment tika-core/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java (2 lines): - line 91: // TODO: Support for matching processing instructions - line 95: // TODO: Can skipped entities refer to more than text? tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java (2 lines): - line 73: //TODO: make x/y info public in POI so that we can use it here - line 77: //TODO: do what Graphics does by maintaining the stack, etc.! tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java (2 lines): - line 113: /* TODO: The following is not correct, the cell should be repeated not spanned! - line 517: //TODO: figure out whether we're in an inline image or a regular tika-core/src/main/java/org/apache/tika/config/TikaConfig.java (2 lines): - line 738: //TODO : allow duplicate instances with different configurations - line 797: // TODO Support arguments, needed for Translators etc tika-pipes/tika-pipes-iterators/tika-pipes-iterator-gcs/src/main/java/org/apache/tika/pipes/pipesiterator/gcs/GCSPipesIterator.java (2 lines): - line 78: //TODO -- add other params to the builder as needed - line 116: //TODO -- allow user specified metadata as the "id"? tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java (2 lines): - line 271: //TODO -- add other httpclient configurations?? - line 313: //TODO -- there's more that we need to pass through, including ssl etc. tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/dif/DIFParser.java (2 lines): - line 50: // TODO Auto-generated method stub - line 57: // TODO Auto-generated method stub tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java (2 lines): - line 114: //TODO -- improve the efficiency of this so that we're not - line 166: //TODO fix this tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java (2 lines): - line 79: //TODO: add this to the signatures from the actual parse - line 552: //TODO: should we try to process ExifSubIFDDirectory.TAG_TIME_ZONE_OFFSET tika-pipes/tika-pipes-iterators/tika-pipes-iterator-solr/src/main/java/org/apache/tika/pipes/pipesiterator/solr/SolrPipesIterator.java (2 lines): - line 149: //TODO -- add other httpclient configurations?? - line 238: //TODO -- there's more that we need to pass through, including ssl etc. tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java (2 lines): - line 97: // TODO warning - line 102: // TODO warning tika-batch/src/main/java/org/apache/tika/batch/builders/BatchProcessBuilder.java (2 lines): - line 131: * TODO: This is a bit smelly. NumConsumers needs to be used by the crawler - line 227: //TODO: should we have a max range check? tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DeprecatedStreamingZipContainerDetector.java (2 lines): - line 82: //odt -- TODO -- check that the results are valid - line 94: //TODO: do something if the full stream hasn't been read? tika-langdetect/tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/OptimaizeLangDetector.java (2 lines): - line 138: // TODO what happens if you request a language that has no profile? - line 205: // TODO figure out right level for confidence brackets. tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/java/org/apache/tika/pipes/fetcher/azblob/AZBlobFetcher.java (1 line): - line 183: //TODO -- allow authentication via other methods tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-commons/src/main/java/org/apache/tika/parser/mailcommons/MailUtil.java (1 line): - line 90: //TODO: warn if more than one email is found? tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmCommons.java (1 line): - line 72: // TODO add int_max_value checking tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java (1 line): - line 93: // TODO Metadata.DATE is used as modified, should it be here? tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/ProbingLanguageDetector.java (1 line): - line 169: // TODO -- once OPENNLP-1261 is fixed, tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/MimeBuffer.java (1 line): - line 75: //TODO: see if MimeType now works for these tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/wordperfect/WP5DocumentAreaExtractor.java (1 line): - line 108: //TODO Are there functions containing data? Like footnotes? tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchEmitter.java (1 line): - line 164: //TODO -- add other httpclient configurations?? tika-example/src/main/java/org/apache/tika/example/Language.java (1 line): - line 40: // TODO support version of LanguageWriter that doesn't need a detector. tika-core/src/main/java/org/apache/tika/metadata/XMPRights.java (1 line): - line 62: * TODO This is actually a language alternative property tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmItspHeader.java (1 line): - line 41: // TODO: refactor all unmarshals tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java (1 line): - line 183: // TODO Log the short read tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/ExtendedMetadataExtractor.java (1 line): - line 314: //TODO: tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/audio/AudioParser.java (1 line): - line 92: // TODO: Use XMPDM.TRACKS? (see also frame rate in AudioFormat) tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/XMLErrorLogUpdater.java (1 line): - line 163: //TODO: log tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java (1 line): - line 165: //TODO: parameterize whether or not to un-rotate page? tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java (1 line): - line 380: // TODO Can there be custom ones? tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java (1 line): - line 224: // TODO: investigate a way to do both using ExternalParser tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/UnrarParser.java (1 line): - line 103: //TODO: process stdout to extract status for each file: tika-core/src/main/java/org/apache/tika/metadata/Metadata.java (1 line): - line 374: //TODO: optimize this to not copy if all tika-pipes/tika-emitters/tika-emitter-gcs/src/main/java/org/apache/tika/pipes/emitter/gcs/GCSEmitter.java (1 line): - line 181: //TODO -- add other params to the builder as needed tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/wordperfect/WP6DocumentAreaExtractor.java (1 line): - line 140: //TODO Are there functions containing data? Like footnotes? tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteInjector.java (1 line): - line 45: //TODO -- don't load the full thing into memory tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/Report.java (1 line): - line 48: final String NULL_VALUE = "";//TODO: make this configurable!!! tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfiler.java (1 line): - line 57: //TODO: we should allow users to select digest type/encoding and file detector(s). tika-core/src/main/java/org/apache/tika/detect/DefaultProbDetector.java (1 line): - line 31: * TODO Link to documentation on configuring these probabilities tika-parsers/tika-parsers-ml/tika-age-recogniser/src/main/java/org/apache/tika/parser/recognition/AgeRecogniser.java (1 line): - line 81: //TODO: what do we want to check here? tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java (1 line): - line 79: //TODO - replace this with multivalued map? This isn't tika-translate/src/main/java/org/apache/tika/language/translate/impl/YandexTranslator.java (1 line): - line 101: //TODO Add support for text over 10k characters tika-parsers/tika-parsers-ml/tika-parser-advancedmedia-module/src/main/java/org/apache/tika/parser/recognition/tf/TensorflowImageRecParser.java (1 line): - line 157: //TODO -- what do we want to check? tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java (1 line): - line 412: THUMBNAIL, //TODO: set this in parsers that handle thumbnails tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java (1 line): - line 87: //TODO: something better than this tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java (1 line): - line 523: //TODO -- these seem to be somewhat broken font files and other tika-detectors/tika-detector-siegfried/src/main/java/org/apache/tika/detect/siegfried/SiegfriedDetector.java (1 line): - line 75: //TODO -- grab errors and warnings tika-core/src/main/java/org/apache/tika/metadata/Photoshop.java (1 line): - line 41: // TODO Replace this with proper indexed choices support tika-parsers/tika-parsers-ml/tika-dl/src/main/java/org/apache/tika/dl/imagerec/DL4JInceptionV3Net.java (1 line): - line 283: //TODO: what do we want to check here? tika-batch/src/main/java/org/apache/tika/batch/fs/FSOutputStreamFactory.java (1 line): - line 73: //TODO: shouldn't need this any more in java 7, right? tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java (1 line): - line 96: //TODO: should we do a singleton for dwfx+xps? tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java (1 line): - line 68: //TODO -- fill out rest tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java (1 line): - line 112: // TODO - fill out this set. Include core, i18n, etc sets where appropriate. tika-core/src/main/java/org/apache/tika/language/detect/LanguageDetector.java (1 line): - line 35: // TODO decide how deep to go into supporting extended language tags, see tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/video/FLVParser.java (1 line): - line 239: // TODO if there are multiple metadata values with same key (in tika-parsers/tika-parsers-ml/tika-dl/src/main/java/org/apache/tika/dl/imagerec/DL4JVGG16Net.java (1 line): - line 93: //TODO: what do we want to check here? tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/detect/ole/MiscOLEDetector.java (1 line): - line 39: * TODO: refactor this copy/paste from POIFSContainerDetector tika-batch/src/main/java/org/apache/tika/batch/fs/builders/FSCrawlerBuilder.java (1 line): - line 75: //TODO: change to logger warn or throw RuntimeException? tika-core/src/main/java/org/apache/tika/metadata/PagedText.java (1 line): - line 36: //TODO MaxPageSize, Fonts, Colorants, PlateNames tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java (1 line): - line 213: // TODO - key off content-type in headers to tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java (1 line): - line 89: // TODO: we should make this dynamic depending on the size of the image tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/wordperfect/WP5Charsets.java (1 line): - line 168: //TODO map multi-characters tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/LanguageProfilerBuilder.java (1 line): - line 127: // TODO: Compute the initial capacity using minlen and maxlen. tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java (1 line): - line 158: //TODO -- figure out how to get the version of sqlite3 that last modified this file and tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java (1 line): - line 83: // TODO Get the version and type, to set as the Content Type tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java (1 line): - line 339: //TODO add metadata about iframe content? tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/main/java/org/apache/tika/parser/xmp/XMPMetadataExtractor.java (1 line): - line 89: //TODO PDFBOX30 this segment no longer needed with 3.0 tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java (1 line): - line 81: //TODO find what happened to SUPPORTED_TYPES tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java (1 line): - line 123: //if there's no html, back off to straight text -- TODO maybe add RTF parsing? tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStoreExtractorFactory.java (1 line): - line 29: * TODO: Figure out how to simplify this and allow for emitting of the source document. tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java (1 line): - line 159: // TODO Resolve it so we don't need this try/catch block tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/wacz/WACZParser.java (1 line): - line 100: //TODO -- process pages (jsonl); process indexes?! tika-core/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java (1 line): - line 109: //TODO extract owner or group? tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TlsConfig.java (1 line): - line 29: //TODO make this configurable tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java (1 line): - line 78: // TODO: would be nice to somehow log that we tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java (1 line): - line 344: //TODO -- create a mime for active x tika-pipes/tika-pipes-iterators/tika-pipes-iterator-jdbc/src/main/java/org/apache/tika/pipes/pipesiterator/jdbc/JDBCPipesIterator.java (1 line): - line 282: //TODO: improve this later with special handling for numerals/dates/timestamps, etc tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/main/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribe.java (1 line): - line 395: //TODO alert user if they've gotten 1 or 2 out of three? tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (1 line): - line 657: * TODO When POI 3.18 is out, replace this with PictureRunMapper, tika-pipes/tika-httpclient-commons/src/main/java/org/apache/tika/client/HttpClientFactory.java (1 line): - line 438: //TODO: sha-256? tika-parsers/tika-parsers-ml/tika-parser-advancedmedia-module/src/main/java/org/apache/tika/parser/pot/PooledTimeSeriesParser.java (1 line): - line 76: // TODO: Add all supported video types tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/Truncator.java (1 line): - line 43: //TODO -- redo streaming tika-core/src/main/java/org/apache/tika/fork/ForkParser.java (1 line): - line 326: //TODO: make this more useful tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/sentiment/SentimentAnalysisParser.java (1 line): - line 114: //TODO -- what do we want to check? tika-pipes/tika-pipes-reporters/tika-pipes-reporter-opensearch/src/main/java/org/apache/tika/pipes/reporters/opensearch/OpenSearchPipesReporter.java (1 line): - line 89: //TODO -- we're not currently doing anything with the message tika-pipes/tika-pipes-iterators/tika-pipes-iterator-s3/src/main/java/org/apache/tika/pipes/pipesiterator/s3/S3PipesIterator.java (1 line): - line 200: //TODO -- allow user specified metadata as the "id"? tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/HTMLHelper.java (1 line): - line 30: * TODO Decide if this would be better done as a MessageBodyWriter tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java (1 line): - line 82: //TODO: read class name from context or config tika-translate/src/main/java/org/apache/tika/language/translate/impl/CachedTranslator.java (1 line): - line 161: // TODO what to do if we get an error? tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java (1 line): - line 242: //TODO: possibly open up the different handle-existings in the future tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java (1 line): - line 117: //TODO: figure out if the rw lock is any better than a simple lock tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/fsshttpb/MSOneStorePackage.java (1 line): - line 248: //TODO -- these seem to be somewhat broken font files and other tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/dbf/DBFParser.java (1 line): - line 123: //TODO: potentially use codepage info in the header tika-xmp/src/main/java/org/apache/tika/xmp/convert/GenericConverter.java (1 line): - line 86: // TODO Add support for structs and lang-alts, but those types are tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/JackcessParser.java (1 line): - line 63: //TODO: figure out how to get this info tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java (1 line): - line 315: //TODO: find an example where basic.getThumbNail is not null tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java (1 line): - line 323: // TODO Decode the snappy stream, and check for the Message Type tika-core/src/main/java/org/apache/tika/pipes/emitter/Emitter.java (1 line): - line 32: //TODO -- add this later for xhtml? tika-xmp/src/main/java/org/apache/tika/xmp/XMPMetadata.java (1 line): - line 320: // TODO Timezone is currently lost tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/GeneralTransformer.java (1 line): - line 81: //TODO -- make this actually streaming tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (1 line): - line 566: //TODO: consider allowing multiple text pieces tika-serialization/src/main/java/org/apache/tika/serialization/TikaJsonSerializer.java (1 line): - line 160: //Also, TODO -- this should favor getters and setters with Strings over those tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java (1 line): - line 224: //TODO: should we limit the number of field values? tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java (1 line): - line 66: // TODO: log a warning? tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenStatistics.java (1 line): - line 112: //TODO: consider adding others... tika-core/src/main/java/org/apache/tika/detect/TrainedModelDetector.java (1 line): - line 91: // TODO: any reason to avoid closing of input & inputChannel? tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java (1 line): - line 100: // TODO: we should probably add a readlimiting wrapper around this tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmExtractor.java (1 line): - line 279: // TODO: Improve the caching tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java (1 line): - line 268: //SEE TIKA-2703 TODO: add unit test tika-core/src/main/java/org/apache/tika/config/ConfigBase.java (1 line): - line 301: //TODO -- we could do more with info from the node -- is it complex, does it have tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java (1 line): - line 59: //TODO: file has some diff mimes names for some very common mimes tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java (1 line): - line 176: //TODO: make throwOnWriteLimitReached configurable tika-server/tika-server-core/src/main/resources/tikaserver-template.html (1 line): - line 24: tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/DataURISchemeUtil.java (1 line): - line 65: //TODO: handle encodings tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/detect/apple/BPListDetector.java (1 line): - line 121: //TODO: extract the version with the next two bytes if they were read tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/dbf/DBFFileHeader.java (1 line): - line 71: IOUtils.skipFully(is, 20);//TODO: can get useful info out of here tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmLzxState.java (1 line): - line 68: //TODO: position_slots is not used ? tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubContentParser.java (1 line): - line 38: * For the time being, assume XHTML (TODO: DTBook) tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java (1 line): - line 277: //TODO -- we've probably already detected the stream by here. We should tika-core/src/main/java/org/apache/tika/detect/NNTrainedModel.java (1 line): - line 62: // TODO Auto-generated method stub tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteParser.java (1 line): - line 68: // TODO - add onetoc and other onenote mime types tika-batch/src/main/java/org/apache/tika/batch/builders/DefaultContentHandlerFactoryBuilder.java (1 line): - line 50: //TODO: should we throw a RuntimeException? tika-parsers/tika-parsers-ml/tika-parser-advancedmedia-module/src/main/java/org/apache/tika/parser/recognition/ObjectRecognitionParser.java (1 line): - line 95: //TODO -- what do we want to check? tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java (1 line): - line 35: * TODO This property is of type RessourceRef which is a struct tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java (1 line): - line 55: //TODO: fix this tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java (1 line): - line 88: // TODO: add plain ASCII as an explicitly detected type. tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/OpenDocumentDetector.java (1 line): - line 65: //odt -- TODO -- check that the results are valid tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java (1 line): - line 275: // TODO: A better implementation would be to copy the detect loop from tika-core/src/main/java/org/apache/tika/metadata/WARC.java (1 line): - line 30: //TODO: lots tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLI.java (1 line): - line 136: //TODO - parameterize this tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java (1 line): - line 41: //TODO: we're relying on metadata to bring in a bunch of info. tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/wordperfect/QPWTextExtractor.java (1 line): - line 99: //TODO shall we validate and throw warning/error if the file does not