in connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java [5614:5849]
/**
 * Builds a filter from a document specification.
 * <p>
 * The specification's children are scanned twice. The first pass gathers the
 * URL mapping rules, the exclusion list and the canonicalization policies,
 * because the canonicalization policies are needed before any seed feed URL
 * can be turned into a document identifier. The second pass canonicalizes the
 * seed feeds and reads the access tokens, rescan intervals, feed timeout and
 * the chromed/dechromed content modes.
 *
 * @param spec the specification whose child nodes configure this filter.
 * @param warnOnBadSeed if true, a warning is logged for every feed URL that cannot be
 *        converted into a document identifier; if false such feeds are dropped silently.
 * @throws ManifoldCFException if a regular expression in the specification does not compile,
 *         if a numeric attribute is not a valid integer, or if the feed timeout is too
 *         large to be scaled by 1000 without overflowing an int.
 */
public Filter(Specification spec, boolean warnOnBadSeed)
  throws ManifoldCFException
{
  String excludes = "";
  // To save allocation, preallocate the seeds set assuming that it will require
  // 1.5x the number of nodes in the spec (and never less than room for one node).
  int initialSize = spec.getChildCount();
  if (initialSize == 0)
    initialSize = 1;
  seeds = new HashSet<String>((initialSize * 3) >> 1);

  // First pass. Find all of the rules (which are necessary to canonicalize the seeds, etc.)
  for (int i = 0; i < spec.getChildCount(); i++)
  {
    SpecificationNode node = spec.getChild(i);
    String type = node.getType();
    if (type.equals(RSSConfig.NODE_MAP))
    {
      String match = node.getAttributeValue(RSSConfig.ATTR_MATCH);
      String map = node.getAttributeValue(RSSConfig.ATTR_MAP);
      // A mapping node without a match expression is ignored.
      if (match != null && match.length() > 0)
      {
        Pattern p;
        try
        {
          p = Pattern.compile(match);
        }
        catch (java.util.regex.PatternSyntaxException e)
        {
          throw new ManifoldCFException("Regular expression '"+match+"' is illegal: "+e.getMessage(),e);
        }
        if (map == null)
          map = "";
        mappings.add(new MappingRule(p,map));
      }
    }
    else if (type.equals(RSSConfig.NODE_EXCLUDES))
    {
      // If several excludes nodes are present, the last one wins.
      excludes = node.getValue();
      if (excludes == null)
        excludes = "";
    }
    else if (type.equals(RSSConfig.NODE_URLSPEC))
    {
      String urlRegexp = node.getAttributeValue(RSSConfig.ATTR_REGEXP);
      if (urlRegexp == null)
        urlRegexp = "";

      // Each flag is true only when its attribute is present and equal to VALUE_YES.
      String reorder = node.getAttributeValue(RSSConfig.ATTR_REORDER);
      boolean reorderValue = reorder != null && reorder.equals(RSSConfig.VALUE_YES);
      String javaSession = node.getAttributeValue(RSSConfig.ATTR_JAVASESSIONREMOVAL);
      boolean javaSessionValue = javaSession != null && javaSession.equals(RSSConfig.VALUE_YES);
      String aspSession = node.getAttributeValue(RSSConfig.ATTR_ASPSESSIONREMOVAL);
      boolean aspSessionValue = aspSession != null && aspSession.equals(RSSConfig.VALUE_YES);
      String phpSession = node.getAttributeValue(RSSConfig.ATTR_PHPSESSIONREMOVAL);
      boolean phpSessionValue = phpSession != null && phpSession.equals(RSSConfig.VALUE_YES);
      String bvSession = node.getAttributeValue(RSSConfig.ATTR_BVSESSIONREMOVAL);
      boolean bvSessionValue = bvSession != null && bvSession.equals(RSSConfig.VALUE_YES);

      try
      {
        canonicalizationPolicies.addRule(new CanonicalizationPolicy(Pattern.compile(urlRegexp),reorderValue,javaSessionValue,aspSessionValue,
          phpSessionValue, bvSessionValue));
      }
      catch (java.util.regex.PatternSyntaxException e)
      {
        throw new ManifoldCFException("Canonicalization regular expression '"+urlRegexp+"' is illegal: "+e.getMessage(),e);
      }
    }
  }

  compileList(excludePatterns,stringToArray(excludes));

  // Second pass. Do the rest of the work, now that the canonicalization policies are complete.
  for (int i = 0; i < spec.getChildCount(); i++)
  {
    SpecificationNode node = spec.getChild(i);
    String type = node.getType();
    if (type.equals(RSSConfig.NODE_FEED))
    {
      String rssURL = node.getAttributeValue(RSSConfig.ATTR_URL);
      if (rssURL != null && rssURL.length() > 0)
      {
        String canonicalURL = makeDocumentIdentifier(canonicalizationPolicies,null,rssURL);
        if (canonicalURL != null)
          seeds.add(canonicalURL);
        else if (warnOnBadSeed)
          Logging.connectors.warn("RSS: Illegal seed feed '"+rssURL+"'");
      }
    }
    else if (type.equals(RSSConfig.NODE_ACCESS))
    {
      // NOTE(review): unlike the other attributes read here, the token is added without a
      // null check -- confirm that every access node is guaranteed to carry a token.
      String token = node.getAttributeValue(RSSConfig.ATTR_TOKEN);
      acls.add(token);
    }
    else if (type.equals(RSSConfig.NODE_FEEDRESCAN))
    {
      // A missing or empty value leaves the current setting untouched.
      Integer parsed = parseOptionalInteger(node.getAttributeValue(RSSConfig.ATTR_VALUE));
      if (parsed != null)
        defaultRescanInterval = parsed;
    }
    else if (type.equals(RSSConfig.NODE_MINFEEDRESCAN))
    {
      Integer parsed = parseOptionalInteger(node.getAttributeValue(RSSConfig.ATTR_VALUE));
      if (parsed != null)
        minimumRescanInterval = parsed;
    }
    else if (type.equals(RSSConfig.NODE_BADFEEDRESCAN))
    {
      Integer parsed = parseOptionalInteger(node.getAttributeValue(RSSConfig.ATTR_VALUE));
      if (parsed != null)
        badFeedRescanInterval = parsed;
    }
    else if (type.equals(RSSConfig.NODE_FEEDTIMEOUT))
    {
      String value = node.getAttributeValue(RSSConfig.ATTR_VALUE);
      if (value != null && value.length() > 0)
      {
        try
        {
          // The configured value is scaled by 1000 (presumably seconds to milliseconds --
          // TODO confirm). Do the multiplication in long arithmetic so that a large value
          // is rejected instead of silently wrapping around to a meaningless timeout.
          long scaled = Integer.parseInt(value) * 1000L;
          if (scaled > Integer.MAX_VALUE || scaled < Integer.MIN_VALUE)
            throw new NumberFormatException("Feed timeout value '"+value+"' is out of range");
          feedTimeoutValue = (int)scaled;
        }
        catch (NumberFormatException e)
        {
          throw new ManifoldCFException("Bad number: "+e.getMessage(),e);
        }
      }
    }
    else if (type.equals(RSSConfig.NODE_DECHROMEDMODE))
    {
      // An unrecognized mode string leaves the current setting untouched.
      String mode = node.getAttributeValue(RSSConfig.ATTR_MODE);
      if (mode != null && mode.length() > 0)
      {
        if (mode.equals(RSSConfig.VALUE_NONE))
          dechromedContentMode = DECHROMED_NONE;
        else if (mode.equals(RSSConfig.VALUE_DESCRIPTION))
          dechromedContentMode = DECHROMED_DESCRIPTION;
        else if (mode.equals(RSSConfig.VALUE_CONTENT))
          dechromedContentMode = DECHROMED_CONTENT;
      }
    }
    else if (type.equals(RSSConfig.NODE_CHROMEDMODE))
    {
      // An unrecognized mode string leaves the current setting untouched.
      String mode = node.getAttributeValue(RSSConfig.ATTR_MODE);
      if (mode != null && mode.length() > 0)
      {
        if (mode.equals(RSSConfig.VALUE_USE))
          chromedContentMode = CHROMED_USE;
        else if (mode.equals(RSSConfig.VALUE_SKIP))
          chromedContentMode = CHROMED_SKIP;
        else if (mode.equals(RSSConfig.VALUE_METADATA))
          chromedContentMode = CHROMED_METADATA_ONLY;
      }
    }
  }
}

/**
 * Parses the text of an optional integer attribute.
 *
 * @param value the raw attribute text; may be null or empty.
 * @return the parsed integer, or null when the text is null or empty.
 * @throws ManifoldCFException if the text is present but is not a valid integer.
 */
private Integer parseOptionalInteger(String value)
  throws ManifoldCFException
{
  if (value == null || value.length() == 0)
    return null;
  try
  {
    return Integer.valueOf(value);
  }
  catch (NumberFormatException e)
  {
    throw new ManifoldCFException("Bad number: "+e.getMessage(),e);
  }
}