public void parse()

in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadParser.java [98:233]


    /**
     * Parses a DWG stream by running the configured dwgread executable and streaming
     * the JSON file it produces.
     * <p>
     * The input stream is copied to a temporary file, dwgread is invoked to convert it
     * to JSON, the JSON is optionally cleaned line by line (see
     * {@code DWGParserConfig#isCleanDwgReadOutput()}), and the result is read token by
     * token. Only the top-level {@code OBJECTS}, {@code FILEHEADER} and
     * {@code SummaryInfo} fields are handled; every other structured field is skipped.
     * All temporary files are deleted before the method returns or throws.
     *
     * @param stream   the DWG content; it is copied to a temporary file
     * @param handler  receives the XHTML events for the extracted text
     * @param metadata handed to the XHTML handler and to the header / summary-info parsing
     * @param context  used to configure this parser and to look up the {@code DWGParserConfig}
     * @throws IOException   if reading or writing the temporary files or the JSON fails
     * @throws SAXException  if the XHTML document cannot be started or ended
     * @throws TikaException if dwgread times out or exits with a non-zero code, or if the
     *                       JSON parser cannot be created for its output
     */
    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {

        configure(context);
        DWGParserConfig dwgc = context.get(DWGParserConfig.class);
        final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        // create unique files so we avoid overwriting out files if multithreaded
        UUID uuid = UUID.randomUUID();
        File tmpFileOut = Files.createTempFile(uuid + "dwgreadout", ".json").toFile();
        File tmpFileOutCleaned = Files.createTempFile(uuid + "dwgreadoutclean", ".json").toFile();
        File tmpFileIn = Files.createTempFile(uuid + "dwgreadin", ".dwg").toFile();
        try {
            FileUtils.copyInputStreamToFile(stream, tmpFileIn);

            List<String> command = Arrays.asList(dwgc.getDwgReadExecutable(), "-O", "JSON", "-o",
                    tmpFileOut.getCanonicalPath(), tmpFileIn.getCanonicalPath());
            ProcessBuilder pb = new ProcessBuilder().command(command);
            LOG.info("About to call DWGRead: {}", command);
            FileProcessResult fpr = ProcessUtils.execute(pb, dwgc.getDwgReadTimeout(), 10000, 10000);
            LOG.info("DWGRead Exit code is: {}", fpr.getExitValue());

            if (fpr.getExitValue() != 0) {
                if (fpr.isTimeout()) {
                    throw new TikaException(
                            "DWGRead Failed - Timeout setting exceeded current setting of " + dwgc.getDwgReadTimeout() );
                }
                throw new TikaException(
                        "DWGRead Failed - Exit Code is:" + fpr.getExitValue() + " Exe error is: " + fpr.getStderr() );
            }

            if (dwgc.isCleanDwgReadOutput()) {
                // dwgread sometimes creates strings with invalid utf-8 sequences or invalid
                // json (nan instead of NaN). replace them
                // with empty string.
                String regexToReplace = dwgc.getCleanDwgReadRegexToReplace();
                String replaceWith = dwgc.getCleanDwgReadReplaceWith();
                LOG.debug("Cleaning Json Output - Replace: {} with: {}", regexToReplace, replaceWith);
                try (BufferedReader br = new BufferedReader(
                             new InputStreamReader(
                                     Files.newInputStream(tmpFileOut.toPath()),
                                     StandardCharsets.UTF_8));
                     BufferedWriter out = new BufferedWriter(
                             new OutputStreamWriter(
                                     new FileOutputStream(tmpFileOutCleaned, true),
                                     StandardCharsets.UTF_8), 32768)) {

                    // Compile once: the output can have a very large number of lines and
                    // String.replaceAll would recompile each pattern for every one of them.
                    java.util.regex.Pattern configuredPattern = java.util.regex.Pattern.compile(regexToReplace);
                    java.util.regex.Pattern nanPattern = java.util.regex.Pattern.compile("\\bnan\\b");
                    java.util.regex.Pattern dotCommaPattern = java.util.regex.Pattern.compile("\\.,");

                    String sCurrentLine;
                    while ((sCurrentLine = br.readLine()) != null) {
                        String cleaned = configuredPattern.matcher(sCurrentLine).replaceAll(replaceWith);
                        cleaned = nanPattern.matcher(cleaned).replaceAll(" 0,");
                        cleaned = dotCommaPattern.matcher(cleaned).replaceAll(" \\. ,");
                        out.write(cleaned);
                        out.write("\n");
                    }
                } finally {
                    // the raw output and the input are no longer needed; from here on the
                    // cleaned file is the one that gets parsed
                    FileUtils.deleteQuietly(tmpFileIn);
                    FileUtils.deleteQuietly(tmpFileOut);
                    tmpFileOut = tmpFileOutCleaned;
                }
            } else {
                LOG.debug(
                        "Json wasn't cleaned, "
                        + "if json parsing fails consider reviewing dwgread json output to check it's valid");
            }

            // we can't guarantee the json output is correct so we try to ignore as many
            // errors as we can
            JsonFactory jfactory = JsonFactory.builder()
                    .enable(JsonReadFeature.ALLOW_MISSING_VALUES,
                            JsonReadFeature.ALLOW_UNESCAPED_CONTROL_CHARS,
                            JsonReadFeature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER,
                            JsonReadFeature.ALLOW_UNQUOTED_FIELD_NAMES,
                            JsonReadFeature.ALLOW_TRAILING_COMMA,
                            JsonReadFeature.ALLOW_NON_NUMERIC_NUMBERS,
                            JsonReadFeature.ALLOW_LEADING_ZEROS_FOR_NUMBERS)
                    .build();
            final JsonParser createdParser;
            try {
                createdParser = jfactory.createParser(tmpFileOut);
            } catch (JsonParseException e1) {
                // keep the original message but also preserve the cause for callers
                throw new TikaException("Failed to parse Json: " + ExceptionUtils.getStackTrace(e1), e1);
            } catch (IOException e1) {
                throw new TikaException("Failed to read json file: " + ExceptionUtils.getStackTrace(e1), e1);
            }
            // read json token in a stream using jackson, iterate over each token. We only
            // support OBJECTS, FILEHEADER and SummaryInfo
            // these are the only ones we have in either sample files or have been tested
            // with
            //
            // try-with-resources closes the parser even when iteration throws, so the
            // file handle is released before the temp files are deleted below.
            try (JsonParser jParser = createdParser) {
                DWGReadFormatRemover dwgReadFormatRemover = new DWGReadFormatRemover();
                // skip the first token (expected to be the start of the root object)
                JsonToken nextToken = jParser.nextToken();
                // nextToken() returns null once the input is exhausted; without the null
                // check an empty or non-object document would loop forever
                while ((nextToken = jParser.nextToken()) != null && nextToken != JsonToken.END_OBJECT) {
                    if (nextToken != JsonToken.FIELD_NAME) {
                        continue;
                    }
                    String nextFieldName = jParser.currentName();
                    nextToken = jParser.nextToken();
                    if (nextToken == null) {
                        // input ended right after a field name
                        break;
                    }
                    if (!nextToken.isStructStart()) {
                        continue;
                    }

                    if ("OBJECTS".equals(nextFieldName)) {
                        // Start array
                        JsonToken arrayToken;
                        while ((arrayToken = jParser.nextToken()) != null
                                && arrayToken != JsonToken.END_ARRAY) {
                            parseDwgObject(jParser, (nextTextValue) -> {
                                try {
                                    xhtml.characters(dwgReadFormatRemover.cleanupDwgString(nextTextValue));
                                    xhtml.newline();
                                } catch (SAXException e) {
                                    // best effort: keep going, but record why the write failed
                                    LOG.error("Could not write next text value {} to xhtml stream",
                                            nextTextValue, e);
                                }
                            });
                        }
                    } else if ("FILEHEADER".equals(nextFieldName)) {
                        parseHeader(jParser, metadata);
                    } else if ("SummaryInfo".equals(nextFieldName)) {
                        parseSummaryInfo(jParser, metadata);
                    } else {
                        jParser.skipChildren();
                    }
                }
            }
        } finally {
            // make sure we delete all temp files
            FileUtils.deleteQuietly(tmpFileOut);
            FileUtils.deleteQuietly(tmpFileIn);
            FileUtils.deleteQuietly(tmpFileOutCleaned);
        }

        xhtml.endDocument();
    }