in src/java/org/apache/nutch/protocol/RobotRulesParser.java [289:429]
/**
 * Get the robots.txt rules for the given URL. Subclasses must provide the
 * implementation.
 *
 * @param protocol
 *          protocol to use; {@link #run(String[])} passes {@code null} when
 *          the robots.txt is read from a local file
 * @param url
 *          URL for which the rules are requested; {@link #run(String[])}
 *          passes the robots.txt URL given on the command line or the URL of
 *          the local robots.txt file
 * @param robotsTxtContent
 *          list handed over empty by {@link #run(String[])} and read after
 *          the call to log fetched robots.txt content, or {@code null} if the
 *          content is not to be kept (NOTE(review): presumably
 *          implementations append the fetched robots.txt content to this
 *          list - confirm against the implementing classes)
 * @return the robots.txt rules
 */
public abstract BaseRobotRules getRobotRulesSet(Protocol protocol, URL url,
List<Content> robotsTxtContent);
/**
 * Command-line tool: read robots.txt rules from a local file or a URL and
 * check every URL listed in a second file against these rules. The result
 * for each URL (<code>allowlisted</code>, <code>allowed</code> or
 * <code>not allowed</code>) is printed to stdout.
 *
 * @param args
 *          <code>&lt;robots-file-or-url&gt; &lt;url-file&gt; [&lt;agent-names&gt;]</code>,
 *          see the usage message printed if less than two arguments are
 *          passed
 * @return 0 on success, -1 if the arguments are insufficient, the
 *         robots.txt location cannot be resolved, no protocol is found or
 *         reading the URL file fails
 */
@Override
public int run(String[] args) {
  if (args.length < 2) {
    String[] help = {
        "Usage: RobotRulesParser [ -Dproperty=... ] <robots-file-or-url> <url-file> [<agent-names>]",
        "",
        "<robots-file-or-url>\tlocal file or URL parsed as robots.txt file",
        "\tIf <robots-file-or-url> starts with a protocol specification",
        "\t(`http', `https', `ftp' or `file'), the URL is parsed, URL path",
        "\tand query are removed and the path \"/robots.txt\" is appended.",
        "\tThe resulting URL (the canonical robots.txt location) is then",
        "\tfetched using the specified protocol.",
        "\tIf the URL does not include a protocol, a local file is assumed.",
        "",
        "<url-file>\tlocal file with URLs (one per line), for every URL",
        "\tthe path part (including the query) is checked whether",
        "\tit is allowed by the robots.txt rules. Other parts of the URLs",
        "\t(mainly the host) are ignored.",
        "",
        "<agent-names>\tuser-agent name (aka. \"product token\")",
        "\tused to select rules from the robots.txt file.",
        "\tMultiple agent names can be passed as comma-separated string.",
        "\tIf no agent name is given the properties http.agent.name",
        "\tand http.robots.agents are used.",
        "\tIf also http.agent.name and http.robots.agents are empty,",
        "\trobots.txt is checked for rules assigned to the user",
        "\tagent `*' (meaning any other).",
        "",
        "Important properties:",
        " -D fetcher.store.robotstxt=true",
        "\toutput content and HTTP meta data of fetched robots.txt (if not a local file)",
        " -D http.agent.name=...\t(primary) agent name",
        " -D http.robots.agents=...\tadditional agent names",
        " -D http.robot.rules.allowlist=..." };
    for (String s : help) {
      System.err.println(s);
    }
    return -1;
  }

  if (args.length > 2) {
    // set agent name from command-line in configuration
    // Note: when fetching via protocol this must be done
    // before the protocol is configured
    String agents = args[2];
    conf.set("http.robots.agents", agents);
    conf.set("http.agent.name", agents.split(",")[0]);
    setConf(conf);
  }

  Protocol protocol = null;
  URL robotsTxtUrl = null;
  if (args[0].matches("^(?:https?|ftp|file)://?.*")) {
    try {
      robotsTxtUrl = new URL(args[0]);
    } catch (MalformedURLException e) {
      LOG.warn("Not a valid URL, assuming local file: {}", args[0]);
    }
    // Look up the protocol only if the URL could be parsed. Otherwise fall
    // through to the local file handling below, as announced by the warning,
    // instead of passing a null URL to the protocol factory.
    if (robotsTxtUrl != null) {
      ProtocolFactory factory = new ProtocolFactory(conf);
      try {
        protocol = factory.getProtocol(robotsTxtUrl);
        LOG.debug("Using protocol {} to fetch robots.txt",
            protocol.getClass());
      } catch (ProtocolNotFound e) {
        LOG.error("No protocol found for {}: {}", args[0],
            StringUtils.stringifyException(e));
        return -1;
      }
    }
  }

  if (robotsTxtUrl == null) {
    // try as local file
    File robotsFile = new File(args[0]);
    if (!robotsFile.exists()) {
      LOG.error("File does not exist: {}", args[0]);
      return -1;
    }
    try {
      robotsTxtUrl = robotsFile.toURI().toURL();
    } catch (MalformedURLException e) {
      // without a URL there is nothing to parse: report instead of
      // continuing with a null robots.txt location
      LOG.error("Failed to convert file path into URL: {}: {}", args[0],
          StringUtils.stringifyException(e));
      return -1;
    }
  }

  File urlFile = new File(args[1]);

  // collect the robots.txt content only if requested by configuration
  List<Content> robotsTxtContent = null;
  if (getConf().getBoolean("fetcher.store.robotstxt", false)) {
    robotsTxtContent = new LinkedList<>();
  }

  BaseRobotRules rules = getRobotRulesSet(protocol, robotsTxtUrl,
      robotsTxtContent);
  LOG.debug("Robots.txt rules:\n{}", rules);

  if (robotsTxtContent != null) {
    for (Content robotsTxt : robotsTxtContent) {
      LOG.info("fetched robots.txt {}:", robotsTxt.getUrl());
      LOG.info(robotsTxt.toString());
    }
  }

  System.out.println("Testing robots.txt for agent names: "
      + (agentNames.isEmpty() ? "* (any other agent)" : agentNames));

  // try-with-resources: the reader is closed even if reading a line fails
  try (LineNumberReader testsIn = new LineNumberReader(
      new FileReader(urlFile))) {
    String testPath;
    while ((testPath = testsIn.readLine()) != null) {
      testPath = testPath.trim();
      try {
        // every line must be a complete URL, lines which cannot be parsed
        // as URL are reported and skipped
        URL url = new URL(testPath);
        String status;
        if (isAllowListed(url)) {
          status = "allowlisted";
        } else if (rules.isAllowed(testPath)) {
          status = "allowed";
        } else {
          status = "not allowed";
        }
        System.out.println(status + ":\t" + testPath);
      } catch (MalformedURLException e) {
        LOG.warn("Not a valid URL: {}", testPath);
      }
    }
  } catch (IOException e) {
    LOG.error("Failed to run: {}", StringUtils.stringifyException(e));
    return -1;
  }

  return 0;
}