private boolean parseBody()

in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-news-module/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java [458:750]


    /**
     * Splits the body section of a wire message into heading, title, author, source and
     * body text, and stores the results in {@code properties}.
     *
     * <p>The bytes are consumed in three consecutive stages sharing one read cursor:
     * <ol>
     *   <li>the heading: a caret-introduced run ending at the {@code LT} delimiter or an
     *       end of line;</li>
     *   <li>the title: same shape, but also ending at the next caret;</li>
     *   <li>a variable number of caret-introduced metadata lines (author "by ...", source
     *       "eds ...", Bloomberg copyright/comment lines), followed by the free-text body
     *       which runs to the end of the input.</li>
     * </ol>
     * For the Reuters and Bloomberg formats a caret is written back into {@code value}
     * where the feed omits it, so the same scanning code can be reused.
     *
     * <p>Author and source extraction tolerate malformed lines (no space after the
     * keyword, or an {@code =} terminator before the expected start of the text): such
     * lines contribute nothing instead of raising
     * {@link StringIndexOutOfBoundsException}.
     *
     * @param value      raw bytes of the body section; each byte is widened to a char with
     *                   {@code & 0xff}. The array contents may be modified, and a larger
     *                   copy may be used internally.
     * @param properties map that receives the keys {@code "body"}, {@code "title"},
     *                   {@code "subject"} (the heading), {@code "author"} and
     *                   {@code "source"}; all five are always written
     * @return {@code true} if at least one of the five extracted strings is non-empty
     */
    private boolean parseBody(byte[] value, HashMap<String, String> properties) {
        StringBuilder bdy_heading = new StringBuilder();
        StringBuilder bdy_title = new StringBuilder();
        StringBuilder bdy_source = new StringBuilder();
        StringBuilder bdy_author = new StringBuilder();
        StringBuilder bdy_body = new StringBuilder();

        int read = 0;
        // the outer loop is a single pass: 'done' is set at the end of the first iteration
        boolean done = false;

        while (!done && (read < value.length)) {

            // ---- stage 1: the heading (^....\x0d\x0a) ----
            while (read < value.length) {
                byte val_next = value[read++];
                if (val_next == CT) {      //  start of a new section , first is the heading
                    val_next = (read < value.length) ? value[read++] : 0x00;
                    // AP, NYT, and Bloomberg end with < , Reuters with EOL
                    while ((val_next != LT) && (val_next != CR) &&
                            (val_next != LF)) {   // less than delimiter (\x3c) and not EOL
                        bdy_heading.append((char) (val_next & 0xff));  // convert the byte to an unsigned int
                        val_next = (read < value.length) ? value[read++] : 0x00;
                        if (read >= value.length) {
                            // input exhausted; the byte just read is not appended
                            break;
                        }
                    }
                    if (val_next == LT) {
                        // hit the delimiter, carry on
                        val_next = (read < value.length) ? value[read++] : 0x00;
                    }
                    while (bdy_heading.length() > 0 && ((val_next == CR) || (val_next == LF))) {
                        val_next =
                                (read < value.length) ? value[read++] : 0x00;  // skip the new lines
                        if ((val_next != CR) && (val_next != LF)) {
                            // un-read the first byte that is not a line terminator
                            --read;
                        }
                    }
                } else {
                    // this will only be hit on poorly-formed files

                    // for reuters, the heading does not start with the ^, so we push one back
                    // into the stream
                    if (FORMAT == this.FMT_IPTC_RTR) {
                        if (val_next != CT) {
                            // for any non-whitespace, we need to go back an additional step to
                            // not destroy the data
                            if ((val_next != SP) && (val_next != TB) && (val_next != CR) &&
                                    (val_next != LF)) {
                                // if the very first byte is data, we have to shift the whole
                                // array right by one, and stuff in a caret at index 0
                                if (read == 1) {
                                    byte[] resize = new byte[value.length + 1];
                                    System.arraycopy(value, 0, resize, 1, value.length);
                                    value = resize;
                                }
                            }
                            value[--read] = CT;
                            continue;   // re-scan from the injected caret
                        }
                    }
                }
                break;
            }

            // ---- stage 2: the title (^....\x0d\x0a) ----
            while (read < value.length) {
                byte val_next = value[read++];
                if (val_next == CT) {      //  start of a new section , second is the title
                    val_next = (read < value.length) ? value[read++] : 0x00;
                    // AP, NYT, and Bloomberg end with < , Reuters with EOL
                    while ((val_next != LT) && (val_next != CT) && (val_next != CR) && (val_next !=
                            LF)) {   // less than delimiter (\x3c), or caret (\x5e) and not EOL
                        bdy_title.append((char) (val_next & 0xff));  // convert the byte to an unsigned int
                        val_next = (read < value.length) ? value[read++] : 0x00;
                        if (read >= value.length) {
                            // input exhausted; the byte just read is not appended
                            break;
                        }
                    }

                    if (val_next ==
                            CT) {      //  start of a new section , when first didn't finish cleanly
                        --read;        //  leave the caret for the metadata stage
                    }

                    if (val_next == LT) {
                        // hit the delimiter, carry on
                        val_next = (read < value.length) ? value[read++] : 0x00;
                    }

                    while (bdy_title.length() > 0 && ((val_next == CR) || (val_next == LF))) {
                        val_next =
                                (read < value.length) ? value[read++] : 0x00;  // skip the new lines
                        if ((val_next != CR) && (val_next != LF)) {
                            // un-read the first byte that is not a line terminator
                            --read;
                        }
                    }
                } else {
                    // this will only be hit on poorly-formed files

                    // for bloomberg, the title does not start with the ^, so we push one back
                    // into the stream (replacing the tab that was just read)
                    if (FORMAT == this.FMT_IPTC_BLM) {
                        if (val_next == TB) {
                            value[--read] = CT;
                            continue;
                        }
                    }

                    // for reuters, the title does not start with the ^, so we push one back into
                    // the stream
                    if (FORMAT == this.FMT_IPTC_RTR) {
                        if (val_next != CT) {
                            // for any non-whitespace, we need to go back an additional step to
                            // not destroy the data; the caret then replaces the byte before it
                            if ((val_next != SP) && (val_next != TB) && (val_next != CR) &&
                                    (val_next != LF)) {
                                --read;
                            }
                            value[--read] = CT;
                            continue;
                        }
                    }
                }
                break;
            }


            // ---- stage 3: metadata lines, then the main body ----
            // at this point, we have a variable number of metadata lines, with various orders
            // we scan the start of each line for the special character, and run to the end
            // character
            boolean metastarted = false;   // true once an author/source line has been seen
            String longline = "";          // key of a metadata field continuing on the next line
            String longkey;
            while (read < value.length) {
                byte val_next = value[read++];

                // eat up whitespace before committing to the next section
                if ((val_next == SP) || (val_next == TB) || (val_next == CR) || (val_next == LF)) {
                    continue;
                }

                if (val_next ==
                        CT) {      //  start of a new section , could be authors, sources, etc
                    val_next = (read < value.length) ? value[read++] : 0x00;
                    StringBuilder tmp_line = new StringBuilder();
                    while ((val_next != LT) && (val_next != CT) && (val_next != CR) &&
                            (val_next != LF) && (val_next != 0)) {
                        // less than delimiter (\x3c), maybe also badly formed with just new line
                        tmp_line.append((char) (val_next & 0xff));  // convert the byte to an unsigned int
                        val_next = (read < value.length) ? value[read++] : 0x00;
                        if (read >= value.length) {
                            // input exhausted; the byte just read is not appended
                            break;
                        }
                    }

                    if (val_next ==
                            CT) {      //  start of a new section , when first didn't finish cleanly
                        --read;
                    }

                    if (val_next == LT) {
                        // hit the delimiter, carry on
                        val_next = (read < value.length) ? value[read++] : 0x00;
                    }

                    while ((val_next == CR) || (val_next == LF)) {
                        val_next =
                                (read < value.length) ? value[read++] : 0x00;  // skip the new lines
                        if ((val_next != CR) && (val_next != LF)) {
                            // un-read the first byte that is not a line terminator
                            --read;
                        }
                    }
                    if (tmp_line.toString().toLowerCase(Locale.ROOT).startsWith("by") ||
                            longline.equals("bdy_author")) {
                        longkey = "bdy_author";

                        // prepend a space to subsequent line, so it gets parsed consistent with
                        // the lead line
                        tmp_line.insert(0, (longline.equals(longkey) ? " " : ""));

                        // we have an author candidate; cut it at the first '<', '=' or newline
                        int term = tmp_line.length();
                        term = Math.min(term,
                                (tmp_line.toString().contains("<") ? tmp_line.indexOf("<") : term));
                        term = Math.min(term,
                                (tmp_line.toString().contains("=") ? tmp_line.indexOf("=") : term));
                        term = Math.min(term,
                                (tmp_line.toString().contains("\n") ? tmp_line.indexOf("\n") : term));
                        term = (term > 0) ? term : tmp_line.length();

                        // The name starts at the first space (i.e. after the "by" keyword, or
                        // at the space prepended to a continuation line). A line without any
                        // space, or whose first space lies beyond the terminator, carries no
                        // usable name; skip it rather than let substring() throw
                        // StringIndexOutOfBoundsException.
                        int start = tmp_line.indexOf(" ");
                        if ((start >= 0) && (start <= term)) {
                            bdy_author.append(tmp_line.substring(start, term));
                        }
                        metastarted = true;
                        // an '=' on the lead line means the author list continues on the next line
                        longline =
                                ((tmp_line.toString().contains("=")) && (!longline.equals(longkey)) ? longkey :
                                        "");
                    } else if (FORMAT == this.FMT_IPTC_BLM) {
                        String byline = "   by ";
                        if (tmp_line.toString().toLowerCase(Locale.ROOT).contains(byline)) {
                            longkey = "bdy_author";

                            int term = tmp_line.length();
                            term = Math.min(term,
                                    (tmp_line.toString().contains("<") ? tmp_line.indexOf("<") : term));
                            term = Math.min(term,
                                    (tmp_line.toString().contains("=") ? tmp_line.indexOf("=") : term));
                            term = Math.min(term,
                                    (tmp_line.toString().contains("\n") ? tmp_line.indexOf("\n") : term));
                            term = (term > 0) ? term : tmp_line.length();
                            // for bloomberg, the author line sits below their copyright statement;
                            // the name starts right after the "   by " marker. If the terminator
                            // precedes the marker there is nothing to extract, so skip the line
                            // rather than let substring() throw.
                            int start =
                                    tmp_line.toString().toLowerCase(Locale.ROOT).indexOf(byline) +
                                    byline.length();
                            if (start <= term) {
                                bdy_author.append(tmp_line.substring(start, term)).append(" ");
                            }
                            metastarted = true;
                            longline = ((tmp_line.toString().contains("=")) && (!longline.equals(longkey)) ?
                                    longkey : "");
                        } else if (tmp_line.toString().toLowerCase(Locale.ROOT).startsWith("c.")) {
                            // the author line for bloomberg is a multiline starting with c.2011
                            // Bloomberg News
                            // then containing the author info on the next line
                            if (val_next == TB) {
                                // turn the leading tab of the next line into a caret so that
                                // line is parsed as another metadata line
                                value[--read] = CT;
                                continue;
                            }
                        } else if (tmp_line.toString().toLowerCase(Locale.ROOT).trim().startsWith("(") &&
                                   tmp_line.toString().toLowerCase(Locale.ROOT).trim().endsWith(")")) {
                            // the author line may have one or more comment lines between the
                            // copyright
                            // statement, and the By AUTHORNAME line
                            if (val_next == TB) {
                                value[--read] = CT;
                                continue;
                            }
                        }
                    } else if (tmp_line.toString().toLowerCase(Locale.ROOT).startsWith("eds") ||
                               longline.equals("bdy_source")) {
                        longkey = "bdy_source";
                        // prepend a space to subsequent line, so it gets parsed consistent with
                        // the lead line
                        tmp_line.insert(0, (longline.equals(longkey) ? " " : ""));

                        // we have a source candidate; cut it at the first '<' or '='
                        // (unlike the author branch, a newline does not terminate it)
                        int term = tmp_line.length();
                        term = Math.min(term,
                                (tmp_line.toString().contains("<") ? tmp_line.indexOf("<") : term));
                        term = Math.min(term,
                                (tmp_line.toString().contains("=") ? tmp_line.indexOf("=") : term));
                        term = (term > 0) ? term : tmp_line.length();

                        // The text starts after the first space (or at index 0 when there is
                        // none). If that lies beyond the terminator there is nothing to
                        // extract; skip rather than let substring() throw.
                        int start = tmp_line.indexOf(" ") + 1;
                        if (start <= term) {
                            bdy_source.append(tmp_line.substring(start, term)).append(" ");
                        }
                        metastarted = true;
                        // the source alternates: a lead line is always followed by one
                        // continuation line
                        longline = (!longline.equals(longkey) ? longkey : "");
                    } else {
                        // this has fallen all the way through.  trap it as part of the subject,
                        // rather than just losing it
                        if (!metastarted) {
                            //  not sure where else to put this but in the title
                            bdy_title.append(" , ").append(tmp_line);
                        } else {
                            // unrecognised lines that follow the metadata lines are kept in
                            // the body
                            bdy_body.append(" ")
                                    .append(tmp_line)
                                    .append(" , ");
                        }
                    }
                } else {  // we're on to the main body
                    while ((read < value.length) && (val_next != 0)) {
                        // read until the train runs out of tracks
                        bdy_body.append((char) (val_next & 0xff));  // convert the byte to an unsigned int
                        val_next = (read < value.length) ? value[read++] : 0x00;
                        if (read >= value.length) {
                            // input exhausted; NOTE(review): the final byte just read is
                            // not appended to the body - kept as-is for compatibility
                            break;
                        }
                    }

                }
                // we would normally break here, but just let this read out to the end
            }
            done = true; // don't let this run back through and start thrashing metadata
        }
        properties.put("body", bdy_body.toString());
        properties.put("title", bdy_title.toString());
        properties.put("subject", bdy_heading.toString());
        properties.put("author", bdy_author.toString());
        properties.put("source", bdy_source.toString());

        // report whether anything at all was extracted
        return (bdy_body.length() + bdy_title.length() + bdy_heading.length() +
                bdy_author.length() + bdy_source.length()) > 0;
    }