in core/src/main/java/org/apache/stormcrawler/protocol/RobotRulesParser.java [100:166]
/**
 * Reads the robots-related settings from the configuration: registers the agent names used
 * when evaluating robots rules and (re)builds the two robots caches.
 *
 * <p>The advertised agent name is read from {@code http.agent.name}; additional names are read
 * from {@code http.robots.agents}. Every name is trimmed, lower-cased with {@link Locale#ROOT}
 * and passed to {@code checkAgentValue} before being added to {@code agentNames}. The
 * advertised agent name is always added first, followed by the remaining configured agents in
 * their configured order.
 *
 * <p>NOTE(review): {@code CACHE} and {@code ERRORCACHE} are named like shared constants; if
 * they are static, every call to this method replaces them for all instances - confirm.
 *
 * @param conf configuration to read the agent names and cache specifications from
 * @throws RuntimeException if {@code http.agent.name} is not configured
 */
public void setConf(Config conf) {
    // Grab the agent names we advertise to robots files.
    String agentName = ConfUtils.getString(conf, "http.agent.name");
    if (null == agentName) {
        throw new RuntimeException("Agent name not configured!");
    }
    agentName = agentName.toLowerCase(Locale.ROOT);
    checkAgentValue(agentName);

    List<String> configuredAgentNames = ConfUtils.loadListFromConf("http.robots.agents", conf);

    // Backward compatibility: a single entry may hold a comma-separated list of agents.
    // StringTokenizer silently skips empty tokens (e.g. "a,,b" yields "a" and "b").
    List<String> rawAgents = new ArrayList<>();
    if (configuredAgentNames.size() == 1) {
        StringTokenizer tok = new StringTokenizer(configuredAgentNames.get(0), ",");
        while (tok.hasMoreTokens()) {
            rawAgents.add(tok.nextToken());
        }
    } else {
        rawAgents.addAll(configuredAgentNames);
    }

    // Normalise and validate every configured agent the same way as the advertised one.
    List<String> agents = new ArrayList<>(rawAgents.size());
    for (String raw : rawAgents) {
        String agent = raw.trim().toLowerCase(Locale.ROOT);
        checkAgentValue(agent);
        agents.add(agent);
    }

    /*
     * Our own agent-string is always the first one we advertise to robots-parsing,
     * whether or not 'http.robots.agents' lists any further agents. Previously it was
     * dropped whenever 'http.robots.agents' was non-empty.
     */
    this.agentNames.add(agentName);

    if (agents.isEmpty()) {
        LOG.info(
                "No agents listed in 'http.robots.agents' property! Using http.agent.name [{}]",
                agentName);
    } else {
        int firstExtra = 0;
        if (agents.get(0).equalsIgnoreCase(agentName)) {
            // already registered above; do not add it a second time
            firstExtra = 1;
        } else {
            LOG.info(
                    "Agent we advertise ({}) not listed first in 'http.robots.agents' property!",
                    agentName);
        }
        // append the remaining agents from the http.robots.agents property
        for (int i = firstExtra; i < agents.size(); i++) {
            this.agentNames.add(agents.get(i));
        }
    }

    // Cache of successfully obtained robots rules; the spec string is configurable.
    String spec =
            ConfUtils.getString(
                    conf, cacheConfigParamName, "maximumSize=10000,expireAfterWrite=6h");
    CACHE = Caffeine.from(spec).build();

    // Separate cache configured via the error-cache parameter, with a shorter default expiry.
    spec =
            ConfUtils.getString(
                    conf, errorcacheConfigParamName, "maximumSize=10000,expireAfterWrite=1h");
    ERRORCACHE = Caffeine.from(spec).build();
}