in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-news-module/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java [458:750]
/**
 * Splits the raw body bytes into heading, title, author, source and body text and
 * stores them in {@code properties} under the keys {@code "subject"}, {@code "title"},
 * {@code "author"}, {@code "source"} and {@code "body"} (always all five, possibly empty).
 *
 * <p>The bytes are scanned once, in three consecutive stages:
 * <ol>
 *   <li>heading: the first section introduced by {@code CT}, ended by {@code LT} or an
 *       end-of-line byte;</li>
 *   <li>title: the next {@code CT} section, ended by {@code LT}, another {@code CT} or an
 *       end-of-line byte;</li>
 *   <li>metadata lines: any number of further {@code CT} sections, classified as author
 *       ("by ..."), source ("eds ...") or overflow text; the first non-whitespace byte
 *       that is not {@code CT} starts the main body, which runs to the end of the data.</li>
 * </ol>
 *
 * <p>For the Reuters and Bloomberg formats a missing {@code CT} marker is synthesised by
 * writing {@code CT} into {@code value}, so the caller's array may be modified in place.
 *
 * <p>Author and source extraction is bounds-safe: a keyword line without the expected
 * space, or with the space/keyword located after the {@code =} terminator, contributes
 * no text instead of raising {@link StringIndexOutOfBoundsException}.
 *
 * @param value      raw bytes of the body section; may be modified in place
 * @param properties map that receives the five extracted strings
 * @return {@code true} if at least one of the five extracted strings is non-empty
 */
private boolean parseBody(byte[] value, HashMap<String, String> properties) {
    boolean added = false;

    StringBuilder bdy_heading = new StringBuilder();
    StringBuilder bdy_title = new StringBuilder();
    StringBuilder bdy_source = new StringBuilder();
    StringBuilder bdy_author = new StringBuilder();
    StringBuilder bdy_body = new StringBuilder();

    int read = 0;
    boolean done = false;

    // single pass: 'done' is set at the end of the first iteration
    while (!done && (read < value.length)) {

        // pull apart the body, getting the heading (^....\x0d\x0a)
        while (read < value.length) {
            byte val_next = value[read++];
            if (val_next == CT) { // start of a new section, first is the heading
                // 0x00 is used as a sentinel once the data is exhausted
                val_next = (read < value.length) ? value[read++] : 0x00;
                // AP, NYT, and Bloomberg end with < , Reuters with EOL
                while ((val_next != LT) && (val_next != CR) &&
                        (val_next != LF)) { // less than delimiter (\x3c) and not EOL
                    bdy_heading.append((char) (val_next & 0xff)); // byte -> unsigned value
                    val_next = (read < value.length) ? value[read++] : 0x00;
                    if (read >= value.length) {
                        break; // ran out of data before finding a terminator
                    }
                }
                if (val_next == LT) {
                    // hit the delimiter, carry on
                    val_next = (read < value.length) ? value[read++] : 0x00;
                }
                while (bdy_heading.length() > 0 && ((val_next == CR) || (val_next == LF))) {
                    val_next = (read < value.length) ? value[read++] : 0x00; // skip the new lines
                    if ((val_next != CR) && (val_next != LF)) {
                        --read; // push the non-EOL byte back for the next stage
                    }
                }
            } else {
                // this will only be hit on poorly-formed files
                // for reuters, the heading does not start with the ^, so we push one back
                // into the stream
                if (FORMAT == this.FMT_IPTC_RTR) {
                    if (val_next != CT) {
                        // whitespace is simply overwritten by the caret; for data we must
                        // not destroy the byte
                        if ((val_next != SP) && (val_next != TB) && (val_next != CR) &&
                                (val_next != LF)) {
                            // if the very first byte is data, we have to shift the whole
                            // array, and stuff in a caret (from here on 'value' is a
                            // local copy, one byte longer than the caller's array)
                            if (read == 1) {
                                byte[] resize = new byte[value.length + 1];
                                System.arraycopy(value, 0, resize, 1, value.length);
                                value = resize;
                            }
                        }
                        value[--read] = CT;
                        continue; // re-read the caret we just wrote
                    }
                }
            }
            break;
        }

        // pull apart the body, getting the title (^....\x0d\x0a)
        while (read < value.length) {
            byte val_next = value[read++];
            if (val_next == CT) { // start of a new section, second is the title
                val_next = (read < value.length) ? value[read++] : 0x00;
                // AP, NYT, and Bloomberg end with < , Reuters with EOL
                while ((val_next != LT) && (val_next != CT) && (val_next != CR) &&
                        (val_next != LF)) { // less than (\x3c), or caret (\x5e) and not EOL
                    bdy_title.append((char) (val_next & 0xff)); // byte -> unsigned value
                    val_next = (read < value.length) ? value[read++] : 0x00;
                    if (read >= value.length) {
                        break; // ran out of data before finding a terminator
                    }
                }
                if (val_next == CT) {
                    // start of a new section, when first didn't finish cleanly
                    --read;
                }
                if (val_next == LT) {
                    // hit the delimiter, carry on
                    val_next = (read < value.length) ? value[read++] : 0x00;
                }
                while (bdy_title.length() > 0 && ((val_next == CR) || (val_next == LF))) {
                    val_next = (read < value.length) ? value[read++] : 0x00; // skip the new lines
                    if ((val_next != CR) && (val_next != LF)) {
                        --read; // push the non-EOL byte back for the next stage
                    }
                }
            } else {
                // this will only be hit on poorly-formed files
                // for bloomberg, the title does not start with the ^, so we push one back
                // into the stream
                if (FORMAT == this.FMT_IPTC_BLM) {
                    if (val_next == TB) {
                        value[--read] = CT;
                        continue;
                    }
                }
                // for reuters, the title does not start with the ^, so we push one back into
                // the stream
                if (FORMAT == this.FMT_IPTC_RTR) {
                    if (val_next != CT) {
                        // for any non-whitespace, we need to go back an additional step to
                        // not destroy the data
                        if ((val_next != SP) && (val_next != TB) && (val_next != CR) &&
                                (val_next != LF)) {
                            --read;
                        }
                        value[--read] = CT;
                        continue;
                    }
                }
            }
            break;
        }

        // at this point, we have a variable number of metadata lines, with various orders
        // we scan the start of each line for the special character, and run to the end
        // character
        boolean metastarted = false; // true once an author/source line has been seen
        String longline = ""; // key of a metadata entry that continues on the next line
        String longkey;
        while (read < value.length) {
            byte val_next = value[read++];

            // eat up whitespace before committing to the next section
            if ((val_next == SP) || (val_next == TB) || (val_next == CR) || (val_next == LF)) {
                continue;
            }

            if (val_next == CT) { // start of a new section, could be authors, sources, etc
                val_next = (read < value.length) ? value[read++] : 0x00;
                StringBuilder tmp_line = new StringBuilder();
                while ((val_next != LT) && (val_next != CT) && (val_next != CR) &&
                        (val_next != LF) && (val_next != 0)) {
                    // less than delimiter (\x3c), maybe also badly formed with just new line
                    tmp_line.append((char) (val_next & 0xff)); // byte -> unsigned value
                    val_next = (read < value.length) ? value[read++] : 0x00;
                    if (read >= value.length) {
                        break; // ran out of data before finding a terminator
                    }
                }
                if (val_next == CT) {
                    // start of a new section, when first didn't finish cleanly
                    --read;
                }
                if (val_next == LT) {
                    // hit the delimiter, carry on
                    val_next = (read < value.length) ? value[read++] : 0x00;
                }
                while ((val_next == CR) || (val_next == LF)) {
                    val_next = (read < value.length) ? value[read++] : 0x00; // skip the new lines
                    if ((val_next != CR) && (val_next != LF)) {
                        --read;
                    }
                }

                if (tmp_line.toString().toLowerCase(Locale.ROOT).startsWith("by") ||
                        longline.equals("bdy_author")) {
                    longkey = "bdy_author";

                    // prepend a space to subsequent line, so it gets parsed consistent with
                    // the lead line
                    tmp_line.insert(0, (longline.equals(longkey) ? " " : ""));

                    // we have an author candidate; it ends at the first '<', '=' or '\n'
                    int term = tmp_line.length();
                    term = Math.min(term,
                            (tmp_line.toString().contains("<") ? tmp_line.indexOf("<") : term));
                    term = Math.min(term,
                            (tmp_line.toString().contains("=") ? tmp_line.indexOf("=") : term));
                    term = Math.min(term,
                            (tmp_line.toString().contains("\n") ? tmp_line.indexOf("\n") : term));
                    term = (term > 0) ? term : tmp_line.length();

                    // the author text starts at the first space (kept as a separator);
                    // without a space, or with the space after the terminator, there is
                    // nothing to extract - substring() would throw in those cases
                    int start = tmp_line.indexOf(" ");
                    if ((start >= 0) && (start <= term)) {
                        bdy_author.append(tmp_line.substring(start, term));
                    }
                    metastarted = true;
                    longline =
                            ((tmp_line.toString().contains("=")) && (!longline.equals(longkey)) ?
                                    longkey : "");
                } else if (FORMAT == this.FMT_IPTC_BLM) {
                    String byline = " by ";
                    if (tmp_line.toString().toLowerCase(Locale.ROOT).contains(byline)) {
                        longkey = "bdy_author";

                        int term = tmp_line.length();
                        term = Math.min(term,
                                (tmp_line.toString().contains("<") ? tmp_line.indexOf("<") : term));
                        term = Math.min(term,
                                (tmp_line.toString().contains("=") ? tmp_line.indexOf("=") : term));
                        term = Math.min(term,
                                (tmp_line.toString().contains("\n") ? tmp_line.indexOf("\n") :
                                        term));
                        term = (term > 0) ? term : tmp_line.length();

                        // for bloomberg, the author line sits below their copyright statement
                        // clamp the start so a " by " located after the terminator yields
                        // an empty name instead of an out-of-range substring
                        int start = Math.min(term,
                                tmp_line.toString().toLowerCase(Locale.ROOT).indexOf(byline) +
                                        byline.length());
                        bdy_author.append(tmp_line.substring(start, term)).append(" ");
                        metastarted = true;
                        longline =
                                ((tmp_line.toString().contains("=")) &&
                                        (!longline.equals(longkey)) ? longkey : "");
                    } else if (tmp_line.toString().toLowerCase(Locale.ROOT).startsWith("c.")) {
                        // the author line for bloomberg is a multiline starting with c.2011
                        // Bloomberg News
                        // then containing the author info on the next line
                        if (val_next == TB) {
                            value[--read] = CT;
                            continue;
                        }
                    } else if (tmp_line.toString().toLowerCase(Locale.ROOT).trim()
                            .startsWith("(") &&
                            tmp_line.toString().toLowerCase(Locale.ROOT).trim().endsWith(")")) {
                        // the author line may have one or more comment lines between the
                        // copyright statement, and the By AUTHORNAME line
                        if (val_next == TB) {
                            value[--read] = CT;
                            continue;
                        }
                    }
                } else if (tmp_line.toString().toLowerCase(Locale.ROOT).startsWith("eds") ||
                        longline.equals("bdy_source")) {
                    longkey = "bdy_source";

                    // prepend a space to subsequent line, so it gets parsed consistent with
                    // the lead line
                    tmp_line.insert(0, (longline.equals(longkey) ? " " : ""));

                    // we have a source candidate; it ends at the first '<' or '='
                    int term = tmp_line.length();
                    term = Math.min(term,
                            (tmp_line.toString().contains("<") ? tmp_line.indexOf("<") : term));
                    term = Math.min(term,
                            (tmp_line.toString().contains("=") ? tmp_line.indexOf("=") : term));
                    term = (term > 0) ? term : tmp_line.length();

                    // text starts after the first space (or at 0 when there is none);
                    // clamp so a space located after the terminator cannot overrun it
                    int start = Math.min(term, tmp_line.indexOf(" ") + 1);
                    bdy_source.append(tmp_line.substring(start, term)).append(" ");
                    metastarted = true;
                    longline = (!longline.equals(longkey) ? longkey : "");
                } else {
                    // this has fallen all the way through. trap it as part of the subject,
                    // rather than just losing it
                    if (!metastarted) {
                        // not sure where else to put this but in the title
                        bdy_title.append(" , ").append(tmp_line);
                    } else {
                        // unrecognised line after the metadata lines started: keep it
                        // at the front of the body text
                        bdy_body.append(" ")
                                .append(tmp_line)
                                .append(" , ");
                    }
                }
            } else { // we're on to the main body
                // NOTE(review): the bounds check below fires right after the final byte
                // is fetched, so the last byte of 'value' is never appended to the body
                while ((read < value.length) && (val_next != 0)) {
                    // read until the train runs out of tracks
                    bdy_body.append((char) (val_next & 0xff)); // byte -> unsigned value
                    val_next = (read < value.length) ? value[read++] : 0x00;
                    if (read >= value.length) {
                        break;
                    }
                }
            }
            // we would normally break here, but just let this read out to the end
        }
        done = true; // don't let this run back through and start thrashing metadata
    }

    properties.put("body", bdy_body.toString());
    properties.put("title", bdy_title.toString());
    properties.put("subject", bdy_heading.toString());
    properties.put("author", bdy_author.toString());
    properties.put("source", bdy_source.toString());

    added = (bdy_body.length() + bdy_title.length() + bdy_heading.length() +
            bdy_author.length() + bdy_source.length()) > 0;
    return added;
}