in src/java/org/apache/ivy/util/url/ApacheURLLister.java [105:213]
/**
 * Retrieves the children of the given URL by downloading its HTML listing page and
 * extracting the anchors that match {@code PATTERN}.
 * <p>
 * An anchor is kept only when its displayed text agrees with its href: either the two are
 * equal (ignoring case and one optional trailing slash), or the text ends with a truncation
 * marker and the href starts with the part of the text that precedes the marker. Anchors
 * whose href points to a parent ({@code ../}), lies outside the base URL, cannot be parsed
 * as a URI, or has no path component (opaque URIs such as {@code mailto:}) are ignored.
 *
 * @param url
 *            the URL of the listing page; a trailing slash is appended when its path ends
 *            with neither {@code /} nor {@code .html}
 * @param includeFiles
 *            whether entries whose href does not end with a slash are returned
 * @param includeDirectories
 *            whether entries whose href ends with a slash are returned
 * @return the child URLs found, in page order; empty when the URL handler reports the URL
 *         as unavailable
 * @throws IOException
 *             if the page cannot be read or a URL cannot be built
 */
public List<URL> retrieveListing(URL url, boolean includeFiles, boolean includeDirectories)
        throws IOException {
    List<URL> urlList = new ArrayList<>();

    // add trailing slash for relative urls
    if (!url.getPath().endsWith("/") && !url.getPath().endsWith(".html")) {
        url = new URL(url.getProtocol(), url.getHost(), url.getPort(), url.getPath() + "/");
    }

    URLHandler urlHandler = URLHandlerRegistry.getDefault();
    URLInfo urlInfo = urlHandler.getURLInfo(url);
    if (urlInfo == URLHandler.UNAVAILABLE) {
        return urlList; // not found => return empty list
    }

    // here, urlInfo is valid
    String charset = urlInfo.getBodyCharset();
    String htmlText;
    // try-with-resources guarantees the stream is released even when the charset name is
    // rejected before any reader could take ownership of it
    try (InputStream contentStream = urlHandler.openStream(url)) {
        InputStreamReader decoder = (charset == null)
                ? new InputStreamReader(contentStream)
                : new InputStreamReader(contentStream, charset);
        htmlText = FileUtil.readEntirely(new BufferedReader(decoder));
    }

    // Markers found at the end of a truncated display text: the raw form and the
    // HTML-escaped form (ampersand, "gt", semicolon). The escaped one is assembled from two
    // pieces so that an HTML-unescaping tool run over this file cannot turn it into the raw
    // one and silently make the second check unreachable.
    final String rawMarker = "..>";
    final String escapedMarker = "..&" + "gt;";

    final String basePath = url.getPath();
    Matcher matcher = PATTERN.matcher(htmlText);
    while (matcher.find()) {
        // get the href text and the displayed text
        String href = matcher.group(1);
        String text = matcher.group(2);
        if (href == null || text == null) {
            // the groups were not found (shouldn't happen, really)
            continue;
        }
        text = text.trim();

        try {
            // URI methods decode the URL
            URI uri = new URI(href);
            href = uri.getPath();
            if (href == null) {
                // opaque URI (mailto:, javascript:, ...): no path, cannot be a child
                continue;
            }
            // handle complete URL listings
            if (uri.getScheme() != null) {
                if (!href.startsWith(basePath)) {
                    // ignore URLs which aren't children of the base URL
                    continue;
                }
                href = href.substring(basePath.length());
            }
        } catch (URISyntaxException e) {
            // incorrect URL, ignore
            continue;
        }

        if (href.startsWith("../")) {
            // we are only interested in sub-URLs, not parent URLs, so skip this one
            continue;
        }

        // absolute href: keep only the last path segment (with its trailing slash, if any)
        if (href.startsWith("/")) {
            int slashIndex = href.substring(0, href.length() - 1).lastIndexOf('/');
            href = href.substring(slashIndex + 1);
        }

        // relative to current href: convert to simple relative one
        if (href.startsWith("./")) {
            href = href.substring("./".length());
        }

        // exclude those where they do not match
        // href will never be truncated, text may be truncated by apache
        if (text.endsWith(rawMarker)) {
            // text is probably truncated, we can only check if the href starts with text
            String visible = text.substring(0, text.length() - rawMarker.length());
            if (!href.startsWith(visible)) {
                continue;
            }
        } else if (text.endsWith(escapedMarker)) {
            // same check for the escaped form of the marker
            String visible = text.substring(0, text.length() - escapedMarker.length());
            if (!href.startsWith(visible)) {
                continue;
            }
        } else {
            // text is not truncated, so it must match the url after stripping optional
            // trailing slashes
            String strippedHref = href.endsWith("/") ? href.substring(0, href.length() - 1)
                    : href;
            String strippedText = text.endsWith("/") ? text.substring(0, text.length() - 1)
                    : text;
            if (!strippedHref.equalsIgnoreCase(strippedText)) {
                continue;
            }
        }

        boolean directory = href.endsWith("/");
        if ((directory && includeDirectories) || (!directory && includeFiles)) {
            URL child = new URL(url, href);
            urlList.add(child);
            Message.debug("ApacheURLLister found URL=[" + child + "].");
        }
    }

    return urlList;
}