in core/src/main/java/org/apache/stormcrawler/util/URLPartitioner.java [44:93]
/**
 * Computes the partition key of a URL for the given partition mode.
 *
 * <ul>
 *   <li>IP mode: the first non-blank {@code ip} value found in the metadata, otherwise the
 *       address obtained by resolving the host of the URL.
 *   <li>Host mode: the host of the URL.
 *   <li>Domain mode: the value returned by {@code PaidLevelDomain.getPLD} for the host.
 * </ul>
 *
 * The mode is compared case-insensitively against the {@code Constants.PARTITION_MODE_*}
 * values.
 *
 * @param url the URL to partition
 * @param metadata metadata of the URL, consulted only in IP mode; may be {@code null}
 * @param partitionMode the partition mode; must not be {@code null}
 * @return the partition key, or {@code null} if the URL is malformed, if the IP cannot be
 *     determined in IP mode, or if the mode matches none of the three handled here
 */
public static String getPartition(
        final String url, final Metadata metadata, final String partitionMode) {
    final boolean byIp = partitionMode.equalsIgnoreCase(Constants.PARTITION_MODE_IP);

    String partitionKey = null;
    String host = "";

    // IP in metadata? If so, neither URL parsing nor a DNS lookup is needed.
    if (byIp && metadata != null) {
        final String providedIp = metadata.getFirstValue("ip");
        if (StringUtils.isNotBlank(providedIp)) {
            partitionKey = providedIp;
        }
    }

    if (partitionKey == null) {
        try {
            host = new URL(url).getHost();
        } catch (MalformedURLException e) {
            LOG.warn("Invalid URL: {}", url);
            return null;
        }
    }

    if (partitionMode.equalsIgnoreCase(Constants.PARTITION_MODE_HOST)) {
        // partition by hostname
        partitionKey = host;
    } else if (partitionMode.equalsIgnoreCase(Constants.PARTITION_MODE_DOMAIN)) {
        // partition by domain : needs fixing
        partitionKey = PaidLevelDomain.getPLD(host);
    }

    // partition by IP, resolving the host when the metadata did not provide an address
    if (byIp && partitionKey == null) {
        // InetAddress.getByName returns a loopback address for an empty host name, which
        // would put every host-less URL into the same bogus partition: treat it as a
        // resolution failure instead.
        if (StringUtils.isBlank(host)) {
            LOG.warn("No host to resolve IP for: {}", url);
            return null;
        }
        try {
            final long start = System.currentTimeMillis();
            final InetAddress addr = InetAddress.getByName(host);
            partitionKey = addr.getHostAddress();
            final long end = System.currentTimeMillis();
            LOG.debug("Resolved IP {} in {} msec for : {}", partitionKey, end - start, url);
        } catch (final Exception e) {
            LOG.warn("Unable to resolve IP for: {}", host);
            return null;
        }
    }

    LOG.debug("Partition Key for: {} > {}", url, partitionKey);
    return partitionKey;
}