public void unwindLink()

in streams-contrib/streams-processor-urls/src/main/java/org/apache/streams/urls/LinkResolver.java [159:341]


    public void unwindLink(String url) {
        Objects.requireNonNull(linkDetails);
        Objects.requireNonNull(url);

        // Check url validity
        UrlValidator urlValidator = new UrlValidator();
        if (!urlValidator.isValid(url)) {
            linkDetails.setLinkStatus(LinkDetails.LinkStatus.MALFORMED_URL);
            return;
        }

        // Check to see if they wound up in a redirect loop,
        // IE: 'A' redirects to 'B', then 'B' redirects to 'A'
        if ((linkDetails.getRedirectCount() != null && linkDetails.getRedirectCount() > 0 &&
                (linkDetails.getOriginalURL().equals(url) || linkDetails.getRedirects().contains(url)))
                || (linkDetails.getRedirectCount() != null && linkDetails.getRedirectCount() > MAX_ALLOWED_REDIRECTS)) {
            linkDetails.setLinkStatus(LinkDetails.LinkStatus.LOOP);
            return;
        }

        if (!linkDetails.getOriginalURL().equals(url))
            linkDetails.getRedirects().add(url);

        HttpURLConnection connection = null;

        // Store where the redirected link will go (if there is one)
        String reDirectedLink = null;

        try {
            // Turn the string into a URL
            URL thisURL = new URL(url);

            // Be sensitive to overloading domains STREAMS-77
            try {
                String host = thisURL.getHost().toLowerCase();
                if(!domainsSensitiveTo.contains(host)) {
                    domainsSensitiveTo.add(host);
                    long domainWait = LinkResolverHelperFunctions.waitTimeForDomain(thisURL.getHost());
                    if (domainWait > 0) {
                        LOGGER.debug("Waiting for domain: {}", domainWait);
                        Thread.sleep(domainWait);
                    }
                }
            } catch(Exception e) {
                // noOp
            }

            connection = (HttpURLConnection) new URL(url).openConnection();

            // now we are going to pretend that we are a browser...
            // This is the way my mac works.
            if (!BOTS_ARE_OK.contains(thisURL.getHost())) {
                connection.addRequestProperty("Host", thisURL.getHost());

                // Bots are not 'ok', so we need to spoof the headers
                for (String k : SPOOF_HTTP_HEADERS.keySet())
                    connection.addRequestProperty(k, SPOOF_HTTP_HEADERS.get(k));

                // the test to seattlemamadoc.com prompted this change.
                // they auto detect bots by checking the referrer chain and the 'user-agent'
                // this broke the t.co test. t.co URLs are EXPLICITLY ok with bots
                // there is a list for URLS that behave this way at the top in BOTS_ARE_OK
                // smashew 2013-13-2013
                if (linkDetails.getRedirectCount() > 0 && BOTS_ARE_OK.contains(thisURL.getHost()))
                    connection.addRequestProperty("Referrer", linkDetails.getOriginalURL());
            }

            connection.setReadTimeout(DEFAULT_HTTP_TIMEOUT);
            connection.setConnectTimeout(DEFAULT_HTTP_TIMEOUT);

            // we want to follow this behavior on our own to ensure that we are getting to the
            // proper place. This is especially true with links that are wounded by special
            // link winders,
            // IE:
            connection.setInstanceFollowRedirects(false);

            if (linkDetails.getCookies() != null)
                for (String cookie : linkDetails.getCookies())
                    connection.addRequestProperty("Cookie", cookie.split(";", 1)[0]);

            connection.connect();

            linkDetails.setFinalResponseCode((long) connection.getResponseCode());

            Map<String, List<String>> headers = createCaseInsensitiveMap(connection.getHeaderFields());
            /*
             * If they want us to set cookies, well, then we will set cookies
             * Example URL:
             * http://nyti.ms/1bCpesx
             *****************************************************************/
            if (headers.containsKey(SET_COOKIE_IDENTIFIER))
                linkDetails.getCookies().add(headers.get(SET_COOKIE_IDENTIFIER).get(0));

            switch (linkDetails.getFinalResponseCode().intValue()) {
                /*
                 * W3C HTTP Response Codes:
                 * http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html
                 */
                case 200: // HTTP OK
                    linkDetails.setFinalURL(connection.getURL().toString());
                    linkDetails.setDomain(new URL(linkDetails.getFinalURL()).getHost());
                    linkDetails.setLinkStatus(LinkDetails.LinkStatus.SUCCESS);
                    break;
                case 300: // Multiple choices
                case 301: // URI has been moved permanently
                case 302: // Found
                case 303: // Primarily for a HTTP Post
                case 304: // Not Modified
                case 306: // This status code is unused but in the redirect block.
                case 307: // Temporary re-direct
                    /*
                     * Author:
                     * Smashew
                     *
                     * Date: 2013-11-15
                     *
                     * Note:
                     * It is possible that we have already found our final URL. In
                     * the event that we have found our final URL, we are going to
                     * save this URL as long as it isn't the original URL.
                     * We are still going to ask the browser to re-direct, but in the
                     * case of yet another redirect, seen with the redbull test
                     * this can be followed by a 304, a browser, by W3C standards would
                     * still render the page with it's content, but for us to assert
                     * a success, we are really hoping for a 304 message.
                     *******************************************************************/
                    if (!linkDetails.getOriginalURL().toLowerCase().equals(connection.getURL().toString().toLowerCase()))
                        linkDetails.setFinalURL(connection.getURL().toString());
                    if (!headers.containsKey(LOCATION_IDENTIFIER)) {
                        LOGGER.info("Headers: {}", headers);
                        linkDetails.setLinkStatus(LinkDetails.LinkStatus.REDIRECT_ERROR);
                    } else {
                        linkDetails.setRedirected(Boolean.TRUE);
                        linkDetails.setRedirectCount(linkDetails.getRedirectCount() + 1);
                        reDirectedLink = connection.getHeaderField(LOCATION_IDENTIFIER);
                    }
                    break;
                case 305: // User must use the specified proxy (deprecated by W3C)
                    break;
                case 401: // Unauthorized (nothing we can do here)
                    linkDetails.setLinkStatus(LinkDetails.LinkStatus.UNAUTHORIZED);
                    break;
                case 403: // HTTP Forbidden (Nothing we can do here)
                    linkDetails.setLinkStatus(LinkDetails.LinkStatus.FORBIDDEN);
                    break;
                case 404: // Not Found (Page is not found, nothing we can do with a 404)
                    linkDetails.setLinkStatus(LinkDetails.LinkStatus.NOT_FOUND);
                    break;
                case 500: // Internal Server Error
                case 501: // Not Implemented
                case 502: // Bad Gateway
                case 503: // Service Unavailable
                case 504: // Gateway Timeout
                case 505: // Version not supported
                    linkDetails.setLinkStatus(LinkDetails.LinkStatus.HTTP_ERROR_STATUS);
                    break;
                default:
                    LOGGER.info("Unrecognized HTTP Response Code: {}", linkDetails.getFinalResponseCode());
                    linkDetails.setLinkStatus(LinkDetails.LinkStatus.NOT_FOUND);
                    break;
            }
        } catch (MalformedURLException e) {
            // the URL is trash, so, it can't load it.
            linkDetails.setLinkStatus(LinkDetails.LinkStatus.MALFORMED_URL);
        } catch (IOException ex) {
            // there was an issue we are going to set to error.
            linkDetails.setLinkStatus(LinkDetails.LinkStatus.ERROR);
        } catch (Exception ex) {
            // there was an unknown issue we are going to set to exception.
            linkDetails.setLinkStatus(LinkDetails.LinkStatus.EXCEPTION);
        } finally {
            // if the connection is not null, then we need to disconnect to close any underlying resources
            if (connection != null)
                connection.disconnect();
        }

        // If there was a redirection, then we have to keep going
        // Placing this code here should help to satisfy ensuring that the connection object
        // is closed successfully.
        if (reDirectedLink != null)
            unwindLink(reDirectedLink);

    }