public static Page buildPage()

in webindex/modules/data/src/main/java/webindex/data/util/ArchiveUtil.java [38:117]


  /**
   * Builds a {@link Page} from an archive record that carries JSON metadata.
   *
   * <p>Only records whose header MIME type is {@code application/json} (compared
   * case-insensitively) are processed. The record body is parsed as JSON, and the title, the
   * {@code Server} header and the outbound links are read from
   * {@code Envelope / Payload-Metadata / HTTP-Response-Metadata}. Each of those pieces is
   * optional: when one is missing the failure is logged at debug level and the page is returned
   * without it.
   *
   * @param archiveRecord record to convert; {@code archiveRecord.available()} bytes are read
   *        from it
   * @return the populated page, or {@link Page#EMPTY} when the record is not JSON, has no
   *         content, or its header URL cannot be parsed
   * @throws IOException if reading the record content fails
   * @throws ParseException if the record content is not valid JSON; the underlying
   *         {@code JSONException} is attached as the cause
   */
  public static Page buildPage(ArchiveRecord archiveRecord) throws IOException, ParseException {
    // Constant-first comparison: a header without a MIME type is skipped instead of raising NPE.
    if (!"application/json".equalsIgnoreCase(archiveRecord.getHeader().getMimetype())) {
      return Page.EMPTY;
    }

    byte[] rawData = IOUtils.toByteArray(archiveRecord, archiveRecord.available());
    if (rawData.length == 0) {
      return Page.EMPTY;
    }

    // Decode once, with an explicit charset, so the result does not depend on the JVM default.
    String jsonString = new String(rawData, java.nio.charset.StandardCharsets.UTF_8);
    JSONObject json;
    try {
      json = new JSONObject(jsonString);
    } catch (JSONException e) {
      // ParseException has no cause-taking constructor, so attach the cause explicitly.
      ParseException parseFailure = new ParseException(e.getMessage(), 0);
      parseFailure.initCause(e);
      throw parseFailure;
    }

    String rawPageUrl = archiveRecord.getHeader().getUrl();
    URL pageUrl;
    try {
      pageUrl = URL.from(rawPageUrl);
    } catch (IllegalArgumentException e) {
      // An unparsable page URL is treated as "nothing to index" rather than as an error.
      return Page.EMPTY;
    } catch (Exception e) {
      log.error("Unexpected exception while parsing raw page URL: " + rawPageUrl, e);
      return Page.EMPTY;
    }

    Page page = new Page(pageUrl.toUri());
    page.setCrawlDate(archiveRecord.getHeader().getDate());

    JSONObject responseMeta;
    try {
      responseMeta = json.getJSONObject("Envelope").getJSONObject("Payload-Metadata")
          .getJSONObject("HTTP-Response-Metadata");
    } catch (JSONException e) {
      log.debug("Exception trying retrieve responseMeta", e);
      // Without response metadata there is nothing more to extract; return URL and crawl date.
      return page;
    }

    addOutboundLinks(page, responseMeta);

    try {
      page.setTitle(
          responseMeta.getJSONObject("HTML-Metadata").getJSONObject("Head").getString("Title"));
    } catch (JSONException e) {
      log.debug("Failed to retrieve title", e);
    }
    try {
      page.setServer(responseMeta.getJSONObject("Headers").getString("Server"));
    } catch (JSONException e) {
      log.debug("Failed to retrieve server", e);
    }
    return page;
  }

  /**
   * Adds to {@code page} every {@code A@/href} link from the response metadata whose domain
   * differs from the page's own domain. A link whose URL cannot be parsed is logged and skipped;
   * a JSON structure problem is logged at debug level and ends link extraction.
   */
  private static void addOutboundLinks(Page page, JSONObject responseMeta) {
    try {
      JSONArray links = responseMeta.getJSONObject("HTML-Metadata").getJSONArray("Links");
      for (int i = 0; i < links.length(); i++) {
        JSONObject link = links.getJSONObject(i);
        boolean isAnchorWithUrl =
            link.has("path") && "A@/href".equals(link.get("path")) && link.has("url");
        if (!isAnchorWithUrl) {
          continue;
        }

        // Prefer the visible link text; fall back to the title attribute, then to empty.
        String anchorText = "";
        if (link.has("text")) {
          anchorText = link.getString("text");
        } else if (link.has("title")) {
          anchorText = link.getString("title");
        }

        String rawLinkUrl = link.getString("url");
        try {
          URL linkUrl = URL.from(rawLinkUrl);
          if (!page.getDomain().equals(linkUrl.getDomain())) {
            page.addOutbound(Link.of(linkUrl, anchorText));
          }
        } catch (IllegalArgumentException e) {
          log.debug("Failed to parse link: " + rawLinkUrl);
        } catch (Exception e) {
          log.error("Unexpected exception while parsing link URL: " + rawLinkUrl, e);
        }
      }
    } catch (JSONException e) {
      log.debug("Exception trying retrieve links", e);
    }
  }