private String processQueryElements()

in core/src/main/java/org/apache/stormcrawler/filtering/basic/BasicURLNormalizer.java [219:291]


    /**
     * Cleans up the query part of a URL.
     *
     * <p>Steps, in order:
     *
     * <ol>
     *   <li>If the last non-empty path segment carries {@code ;}-separated parameters, they are
     *       cut out of the path and appended to the query string (with {@code ;} turned into
     *       {@code &}). The rest of the path, including empty segments and any trailing slash,
     *       is left exactly as it was.
     *   <li>Query pairs whose name is in {@code queryElementsToRemove} are dropped; when {@code
     *       removeHashes} is set, pairs whose value fully matches the {@code thirtytwobithash}
     *       pattern are dropped as well.
     *   <li>The remaining pairs are sorted with {@code comp}, re-encoded as UTF-8 and the URL is
     *       reassembled from protocol, authority, path, query and fragment.
     * </ol>
     *
     * @param urlToFilter the URL string to process
     * @return the reassembled URL; {@code urlToFilter} itself when there is no query to process;
     *     or {@code null} when {@code urlToFilter} cannot be parsed as a {@link URL}
     */
    private String processQueryElements(String urlToFilter) {
        // Parse first: anything java.net.URL rejects is reported and filtered out (null).
        final URL url;
        try {
            url = new URL(urlToFilter);
        } catch (MalformedURLException e) {
            LOG.warn("Invalid urlToFilter {}. {}", urlToFilter, e);
            return null;
        }

        String query = url.getQuery();
        String path = url.getPath();

        // Check whether the last non-empty path segment contains parameters and,
        // if so, convert them to query elements.
        if (path.contains(";")) {
            // Ignore trailing slashes when locating the segment. The path contains ';',
            // so at least one non-slash character exists and segmentEnd stays > 0.
            int segmentEnd = path.length();
            while (segmentEnd > 0 && path.charAt(segmentEnd - 1) == '/') {
                segmentEnd--;
            }
            final int segmentStart = path.lastIndexOf('/', segmentEnd - 1) + 1;
            final String last = path.substring(segmentStart, segmentEnd);

            final int semicolon = last.indexOf(';');
            if (semicolon != -1) {
                final String params = last.substring(semicolon + 1).replace(';', '&');
                if (query == null) {
                    query = params;
                } else {
                    query += "&" + params;
                }
                // Remove only the ";params" portion. Rebuilding the path from its split
                // elements would also drop empty segments and the trailing slash, turning
                // e.g. "/dir/;id=1" into "/dir", which names a different resource.
                path = path.substring(0, segmentStart + semicolon) + path.substring(segmentEnd);
            }
        }

        // Nothing to filter or sort: hand the input back untouched.
        if (StringUtils.isEmpty(query)) {
            return urlToFilter;
        }

        final List<NameValuePair> pairs = URLEncodedUtils.parse(query, StandardCharsets.UTF_8);
        pairs.removeIf(
                param ->
                        queryElementsToRemove.contains(param.getName())
                                || (removeHashes
                                        && param.getValue() != null
                                        && thirtytwobithash.matcher(param.getValue()).matches()));

        // A null query string means "emit no '?' at all" when every pair was removed.
        String newQueryString = null;
        if (!pairs.isEmpty()) {
            pairs.sort(comp);
            newQueryString = URLEncodedUtils.format(pairs, StandardCharsets.UTF_8);
        }

        // Reassemble the URL the same way URL.toExternalForm() does, but with the
        // cleaned path and query.
        final StringBuilder rebuilt = new StringBuilder(urlToFilter.length());
        rebuilt.append(url.getProtocol()).append(':');

        final String authority = url.getAuthority();
        if (authority != null && !authority.isEmpty()) {
            rebuilt.append("//").append(authority);
        }

        rebuilt.append(path);

        if (newQueryString != null) {
            rebuilt.append('?').append(newQueryString);
        }

        final String ref = url.getRef();
        if (ref != null) {
            rebuilt.append('#').append(ref);
        }

        return rebuilt.toString();
    }