in core/src/main/java/org/apache/stormcrawler/filtering/URLFilters.java [142:204]
/**
 * Command-line entry point: applies every configured URL filter, in order, to a URL and prints
 * for each filter its class name, the elapsed time in milliseconds and the resulting URL.
 *
 * <p>Usage: {@code [-f <filters config file>] <input URL> [<source URL>]}. When no source URL
 * is given, the input URL is used as its own source. The filter chain stops as soon as a filter
 * returns {@code null}.
 *
 * <p>The JVM exits with status 0 when the chain ran to completion (or was stopped by a filter
 * returning {@code null}), and with status -1 when the input URL argument is missing, the
 * filters cannot be loaded, or an exception is thrown while filtering.
 *
 * @param args command-line arguments as described above
 * @throws ParseException if the command line cannot be parsed
 */
public static void main(String[] args) throws ParseException {
    Config conf = new Config();
    // loads the default configuration file
    Map<String, Object> defaultSCConfig =
            Utils.findAndReadConfigFile("crawler-default.yaml", false);
    conf.putAll(ConfUtils.extractConfigElement(defaultSCConfig));

    String configFile = "urlfilters.json";
    Options options =
            new Options()
                    .addOption("f", true, "Filters configuration file. Default " + configFile);
    CommandLineParser parser = new DefaultParser();
    CommandLine cmd = parser.parse(options, args);
    if (cmd.hasOption("f")) {
        configFile = cmd.getOptionValue("f");
    }

    if (cmd.getArgList().isEmpty()) {
        System.err.println("Missing argument for input URL");
        System.exit(-1);
    }

    // read URL to check
    String inputURL = cmd.getArgList().get(0);

    // a source URL may be given in 2nd position; otherwise the input URL is its own source
    String sourceURL = inputURL;
    if (cmd.getArgList().size() > 1) {
        sourceURL = cmd.getArgList().get(1);
    }

    try {
        URLFilters filters = new URLFilters(conf, configFile);
        String normalizedURL = inputURL;
        try {
            // Parse the source URL once, before timing starts: it is the same for every
            // filter and a malformed value must be reported even if no filter is configured.
            URL source = new URL(sourceURL);
            for (URLFilter filter : filters.filters) {
                long start = System.currentTimeMillis();
                normalizedURL = filter.filter(source, new Metadata(), normalizedURL);
                long end = System.currentTimeMillis();
                System.out.println(
                        "\t["
                                + filter.getClass().getName()
                                + "] "
                                + (end - start)
                                + "msec => "
                                + normalizedURL);
                // a null result means the URL was rejected: no point running later filters
                if (normalizedURL == null) {
                    break;
                }
            }
        } catch (Exception e) {
            LOG.error("URL filtering threw exception", e);
            // signal the failure to the caller instead of falling through to exit(0)
            System.exit(-1);
        }
    } catch (IOException e) {
        LOG.error("Unable to load URL filters from " + configFile, e);
        System.exit(-1);
    }
    System.exit(0);
}