in src/main/java/org/apache/creadur/tentacles/NexusClient.java [127:175]
public Set<URI> crawl(final URI index) throws IOException {
log.info("Crawl {}", index);
final Set<URI> resources = new LinkedHashSet<>();
final CloseableHttpResponse response = get(index);
final InputStream content = response.getEntity().getContent();
final StreamLexer lexer = new StreamLexer(content);
final Set<URI> crawl = new LinkedHashSet<>();
// <a
// href="https://repository.apache.org/content/repositories/orgapacheopenejb-094/archetype-catalog.xml">archetype-catalog.xml</a>
while (lexer.readAndMark("<a ", "/a>")) {
try {
final String link = lexer.peek("href=\"", "\"");
final String name = lexer.peek(">", "<");
final URI uri = index.resolve(link);
if (name.equals(ONE_UP)) {
continue;
}
if (link.equals(ONE_UP)) {
continue;
}
if (name.endsWith(SLASH)) {
crawl.add(uri);
continue;
}
resources.add(uri);
} finally {
lexer.unmark();
}
}
content.close();
response.close();
for (final URI uri : crawl) {
resources.addAll(crawl(uri));
}
return resources;
}