in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadParser.java [98:233]
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    // Overview:
    //  1. copy the incoming stream to a temporary .dwg file,
    //  2. run the configured dwgread executable to convert it to a temporary JSON file,
    //  3. optionally rewrite that JSON line by line to repair known problems,
    //  4. stream the JSON with Jackson, sending OBJECTS text to the XHTML handler and
    //     FILEHEADER / SummaryInfo to the metadata helpers.
    // A TikaException is thrown when dwgread times out, exits with a non-zero code, or
    // when the JSON file cannot be opened. All temporary files are removed in the
    // finally block regardless of the outcome.
    configure(context);
    DWGParserConfig dwgc = context.get(DWGParserConfig.class);
    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    // create unique files so we avoid overwriting out files if multithreaded
    UUID uuid = UUID.randomUUID();
    // Declared here but created inside the try block so that a failure while creating a
    // later file still deletes the earlier ones (deleteQuietly accepts null).
    File tmpFileOut = null;
    File tmpFileOutCleaned = null;
    File tmpFileIn = null;
    try {
        tmpFileOut = Files.createTempFile(uuid + "dwgreadout", ".json").toFile();
        tmpFileOutCleaned = Files.createTempFile(uuid + "dwgreadoutclean", ".json").toFile();
        tmpFileIn = Files.createTempFile(uuid + "dwgreadin", ".dwg").toFile();

        FileUtils.copyInputStreamToFile(stream, tmpFileIn);
        List<String> command = Arrays.asList(dwgc.getDwgReadExecutable(), "-O", "JSON", "-o",
                tmpFileOut.getCanonicalPath(), tmpFileIn.getCanonicalPath());
        ProcessBuilder pb = new ProcessBuilder().command(command);
        LOG.info("About to call DWGRead: {}", command);
        FileProcessResult fpr = ProcessUtils.execute(pb, dwgc.getDwgReadTimeout(), 10000, 10000);
        LOG.info("DWGRead Exit code is: {}", fpr.getExitValue());
        if (fpr.getExitValue() == 0) {
            if (dwgc.isCleanDwgReadOutput()) {
                // dwgread sometimes creates strings with invalid utf-8 sequences or invalid
                // json (nan instead of NaN). replace them
                // with empty string.
                LOG.debug("Cleaning Json Output - Replace: {} with: {}",
                        dwgc.getCleanDwgReadRegexToReplace(), dwgc.getCleanDwgReadReplaceWith());
                try (BufferedReader br = new BufferedReader(
                             new InputStreamReader(
                                     Files.newInputStream(tmpFileOut.toPath()),
                                     StandardCharsets.UTF_8));
                     BufferedWriter out = new BufferedWriter(
                             new OutputStreamWriter(
                                     new FileOutputStream(tmpFileOutCleaned, true),
                                     StandardCharsets.UTF_8), 32768)) {
                    // Compile the patterns once instead of once per line; the output can be
                    // large. Fully qualified so that no additional import is required.
                    final String configuredReplacement = dwgc.getCleanDwgReadReplaceWith();
                    final java.util.regex.Pattern configuredPattern =
                            java.util.regex.Pattern.compile(dwgc.getCleanDwgReadRegexToReplace());
                    final java.util.regex.Pattern nanPattern =
                            java.util.regex.Pattern.compile("\\bnan\\b");
                    final java.util.regex.Pattern dotCommaPattern =
                            java.util.regex.Pattern.compile("\\.,");
                    String sCurrentLine;
                    while ((sCurrentLine = br.readLine()) != null) {
                        sCurrentLine = configuredPattern.matcher(sCurrentLine)
                                .replaceAll(configuredReplacement);
                        sCurrentLine = nanPattern.matcher(sCurrentLine).replaceAll(" 0,");
                        sCurrentLine = dotCommaPattern.matcher(sCurrentLine).replaceAll(" \\. ,");
                        out.write(sCurrentLine);
                        out.write('\n');
                    }
                } finally {
                    // The input and the raw output are no longer needed; from here on the
                    // cleaned file is the one that gets parsed.
                    FileUtils.deleteQuietly(tmpFileIn);
                    FileUtils.deleteQuietly(tmpFileOut);
                    tmpFileOut = tmpFileOutCleaned;
                }
            } else {
                LOG.debug(
                        "Json wasn't cleaned, "
                                + "if json parsing fails consider reviewing dwgread json output to check it's valid");
            }
        } else if (fpr.isTimeout()) {
            throw new TikaException(
                    "DWGRead Failed - Timeout setting exceeded current setting of " + dwgc.getDwgReadTimeout() );
        } else {
            throw new TikaException(
                    "DWGRead Failed - Exit Code is:" + fpr.getExitValue() + " Exe error is: " + fpr.getStderr() );
        }
        // we can't guarantee the json output is correct so we try to ignore as many
        // errors as we can
        JsonFactory jfactory = JsonFactory.builder()
                .enable(JsonReadFeature.ALLOW_MISSING_VALUES,
                        JsonReadFeature.ALLOW_UNESCAPED_CONTROL_CHARS,
                        JsonReadFeature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER,
                        JsonReadFeature.ALLOW_UNQUOTED_FIELD_NAMES,
                        JsonReadFeature.ALLOW_TRAILING_COMMA,
                        JsonReadFeature.ALLOW_NON_NUMERIC_NUMBERS,
                        JsonReadFeature.ALLOW_LEADING_ZEROS_FOR_NUMBERS)
                .build();
        final JsonParser jParser;
        try {
            jParser = jfactory.createParser(tmpFileOut);
        } catch (JsonParseException e1) {
            throw new TikaException("Failed to parse Json: " + ExceptionUtils.getStackTrace(e1), e1);
        } catch (IOException e1) {
            throw new TikaException("Failed to read json file: " + ExceptionUtils.getStackTrace(e1), e1);
        }
        // read json token in a stream using jackson, iterate over each token. We only
        // support OBJECTS, FILEHEADER and SummaryInfo
        // these are the only ones we have in either sample files or have been tested
        // with
        // try-with-resources: the parser (and the file it holds open) is closed even when
        // token iteration throws.
        try (JsonParser parser = jParser) {
            DWGReadFormatRemover dwgReadFormatRemover = new DWGReadFormatRemover();
            // Consume the first token of the document before walking the top-level fields.
            JsonToken nextToken = parser.nextToken();
            // nextToken() returns null once the input is exhausted (e.g. empty output or a
            // non-object root); treat that as end of document instead of looping forever.
            while ((nextToken = parser.nextToken()) != null && nextToken != JsonToken.END_OBJECT) {
                if (nextToken != JsonToken.FIELD_NAME) {
                    continue;
                }
                String nextFieldName = parser.currentName();
                nextToken = parser.nextToken();
                if (nextToken == null) {
                    break;
                }
                if (!nextToken.isStructStart()) {
                    // scalar top-level values are not of interest
                    continue;
                }
                if ("OBJECTS".equals(nextFieldName)) {
                    // Start array
                    while (parser.nextToken() != JsonToken.END_ARRAY) {
                        parseDwgObject(parser, (nextTextValue) -> {
                            try {
                                xhtml.characters(dwgReadFormatRemover.cleanupDwgString(nextTextValue));
                                xhtml.newline();
                            } catch (SAXException e) {
                                // Best effort: the failure is logged (with its cause) and
                                // parsing continues with the next value.
                                LOG.error("Could not write next text value {} to xhtml stream",
                                        nextTextValue, e);
                            }
                        });
                    }
                } else if ("FILEHEADER".equals(nextFieldName)) {
                    parseHeader(parser, metadata);
                } else if ("SummaryInfo".equals(nextFieldName)) {
                    parseSummaryInfo(parser, metadata);
                } else {
                    parser.skipChildren();
                }
            }
        }
    } finally {
        // make sure we delete all temp files
        FileUtils.deleteQuietly(tmpFileOut);
        FileUtils.deleteQuietly(tmpFileIn);
        FileUtils.deleteQuietly(tmpFileOutCleaned);
    }
    xhtml.endDocument();
}