in core/src/main/java/org/apache/stormcrawler/protocol/Protocol.java [59:157]
/**
 * Command-line helper that fetches one or more URLs with the given protocol implementation and
 * prints a short report for each of them to standard output.
 *
 * <p>The default configuration is read from {@code crawler-default.yaml}; the following options
 * are recognised:
 *
 * <ul>
 *   <li>{@code -f <file>}: additional configuration file loaded on top of the defaults
 *   <li>{@code -b}: dump the fetched binary content to a temporary file
 * </ul>
 *
 * <p>Every remaining argument is parsed with {@code StringTabScheme} into a URL and its
 * metadata, and is fetched in its own thread. Unless {@code http.robots.file.skip} is set to
 * true, the robots rules for the URL are retrieved and summarised before the fetch. Once all
 * the fetches are done the protocol is cleaned up and the JVM exits with status 0.
 *
 * @param protocol protocol implementation to configure and use for fetching
 * @param args command line options followed by the inputs to fetch
 * @throws Exception if the configuration or the command line cannot be processed, or if the
 *     wait for the fetch threads is interrupted
 */
public static void main(Protocol protocol, String[] args) throws Exception {
    Config conf = new Config();

    // loads the default configuration file
    Map<String, Object> defaultSCConfig =
            Utils.findAndReadConfigFile("crawler-default.yaml", false);
    conf.putAll(ConfUtils.extractConfigElement(defaultSCConfig));

    Options options = new Options();
    options.addOption("f", true, "configuration file");
    options.addOption("b", false, "dump binary content to temp file");

    CommandLineParser parser = new DefaultParser();
    CommandLine cmd = parser.parse(options, args);

    String confFile = cmd.getOptionValue("f");
    if (confFile != null) {
        ConfUtils.loadConf(confFile, conf);
    }

    boolean binary = cmd.hasOption("b");

    protocol.configure(conf);

    // Tracks the fetches still in flight. The main thread adds to and polls this set while
    // every worker removes itself from it, so it has to be thread-safe: a bare HashSet could
    // lose a removal and leave the wait loop below spinning forever.
    Set<Runnable> threads = java.util.Collections.synchronizedSet(new HashSet<>());

    class Fetchable implements Runnable {
        final String url;
        final Metadata md;

        Fetchable(String line) {
            StringTabScheme scheme = new StringTabScheme();
            List<Object> tuple =
                    scheme.deserialize(ByteBuffer.wrap(line.getBytes(StandardCharsets.UTF_8)));
            this.url = (String) tuple.get(0);
            this.md = (Metadata) tuple.get(1);
        }

        @Override
        public void run() {
            final StringBuilder stringB = new StringBuilder();
            stringB.append(url).append("\n");

            // The robots lookup is part of the guarded section too: a failure there must
            // still reach the finally block, otherwise main() would wait forever.
            try {
                final boolean skipRobots =
                        ConfUtils.getBoolean(conf, "http.robots.file.skip", false);

                if (!skipRobots) {
                    BaseRobotRules rules = protocol.getRobotRules(url);
                    stringB.append("robots allowed: ")
                            .append(rules.isAllowed(url))
                            .append("\n");
                    if (rules instanceof RobotRules) {
                        stringB.append("robots requests: ")
                                .append(((RobotRules) rules).getContentLengthFetched().length)
                                .append("\n");
                    }
                    stringB.append("sitemaps identified: ")
                            .append(rules.getSitemaps().size())
                            .append("\n");
                }

                // the timer only covers the fetch of the URL itself, not the robots lookup
                long start = System.currentTimeMillis();
                ProtocolResponse response = protocol.getProtocolOutput(url, md);
                stringB.append(response.getMetadata()).append("\n");
                stringB.append("status code: ").append(response.getStatusCode()).append("\n");
                stringB.append("content length: ")
                        .append(response.getContent().length)
                        .append("\n");
                long timeFetching = System.currentTimeMillis() - start;
                stringB.append("fetched in : ").append(timeFetching).append(" msec\n");

                if (binary) {
                    Path p = Files.createTempFile("sc-protocol-", ".dump");
                    FileUtils.writeByteArrayToFile(p.toFile(), response.getContent());
                    stringB.append("dumped content to : ").append(p);
                }

                System.out.println(stringB);
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                // always signal completion, whether the fetch succeeded or not
                threads.remove(this);
            }
        }
    }

    for (String arg : cmd.getArgs()) {
        Fetchable p = new Fetchable(arg);
        // register before starting so that the worker's removal cannot precede the add
        threads.add(p);
        new Thread(p).start();
    }

    // wait until every worker has removed itself from the set
    while (!threads.isEmpty()) {
        Thread.sleep(1000);
    }

    protocol.cleanup();
    // NOTE(review): presumably forces the JVM down even if the protocol implementation left
    // non-daemon threads running - confirm before removing.
    System.exit(0);
}