in webindex/modules/integration/src/main/java/webindex/integration/SampleData.java [42:67]
/**
 * Writes sample page data to {@code path}, one JSON-serialized {@code Page} per line.
 *
 * <p>Records are read from the WARC archive located at {@code sourceURL}. A record is skipped
 * when the page built from it is empty or has no outbound links. Reading stops as soon as
 * {@code numPages} pages have been written or the archive has no more records, whichever
 * comes first.
 *
 * @param path file that receives the generated JSON lines
 * @param numPages maximum number of pages to write; because the limit is checked only after
 *     a page has been written, a value of zero or less never matches and every qualifying
 *     page in the archive is written
 * @throws Exception if the archive cannot be opened or read, or the output file cannot be
 *     written
 */
public static void generate(Path path, int numPages) throws Exception {
  Gson gson = new Gson();
  long count = 0;
  // Both resources are managed here so the archive stream is released on every exit path
  // (normal completion, early break, or exception), not just the writer.
  // NOTE(review): the second argument to WARCReaderFactory.get is presumably the starting
  // offset into the archive - confirm against the library documentation.
  try (BufferedWriter writer = Files.newBufferedWriter(path);
      ArchiveReader ar = WARCReaderFactory.get(new URL(sourceURL), 0)) {
    for (ArchiveRecord r : ar) {
      Page p = ArchiveUtil.buildPage(r);
      if (p.isEmpty() || p.getOutboundLinks().isEmpty()) {
        log.debug("Skipping {}", p.getUrl());
        continue;
      }
      log.debug("Found {} {}", p.getUrl(), p.getNumOutbound());
      String json = gson.toJson(p);
      writer.write(json);
      writer.newLine();
      count++;
      if (count == numPages) {
        break;
      } else if ((count % 1000) == 0) {
        // Periodic progress report for long-running generations.
        log.info("Wrote {} of {} pages to {}", count, numPages, path);
      }
    }
  }
  // Report the number of pages actually written; the archive may hold fewer qualifying
  // pages than were requested.
  log.info("Wrote {} pages to {}", count, path);
}