in opennlp-similarity/src/main/java/opennlp/tools/apps/object_dedup/SimilarityAccessorBase.java [645:728]
/**
 * Decides whether two names are close enough to be treated as duplicates by comparing the
 * titles of the web search results obtained for each name.
 * <p>
 * Processing order:
 * <ol>
 * <li>both names are passed through {@code normalizeGenderAndOtherAttributes};</li>
 * <li>the sports rule and the special-groups/symphonies rule may reject the pair immediately;</li>
 * <li>the pre/post event-attribute checks on the lower-cased tokens may reject the pair;</li>
 * <li>a web search is run for each name and every pair of result titles whose string distance
 * exceeds the threshold adds one to the score;</li>
 * <li>the "both sides" rule and the "one sub-name of another" rule can force the score to 1.</li>
 * </ol>
 * The pair is reported as a match only when the final score is greater than 1.
 *
 * @param name1  first name to compare
 * @param name2  second name to compare
 * @param thresh minimum title similarity (exclusive) for a pair of search results to count;
 *               {@code null} or {@code 0} selects the default of {@code 0.8}
 * @param bStem  {@code true} to measure title distance with stemming, {@code false} to use the
 *               no-stemming variant
 * @return a {@code DedupResult} built from the collected reason text, the score, and the flag
 *         {@code score > 1}
 */
public DedupResult areNamesSemanticallyCloseInWebSearchSpace(String name1, String name2, Float thresh, boolean bStem)
{
    // A missing or zero threshold falls back to the default.
    final float threshold = (thresh == null || thresh == 0f) ? 0.8f : thresh;

    // normalize gender
    name1 = normalizeGenderAndOtherAttributes(name1);
    name2 = normalizeGenderAndOtherAttributes(name2);

    // Rule-based rejections that need no web search.
    if (!succeededMenWomenSportsRule(name1, name2))
    {
        return new DedupResult("Sports rule: different teams or teams of different venues", 0, false);
    }
    if (!succeededSpecialGroupsSymphoniesRule(name1, name2))
    {
        return new DedupResult("SpecialGroupsSymphoniesRule: different circus/band", 0, false);
    }

    LOG.info("\nComputing similarity between name = '" + name1 + "' and name = '" + name2 + "'");

    // Locale.ROOT keeps the lower-casing independent of the JVM's default locale.
    List<String> name1Tokens = TextProcessor.fastTokenize(name1.toLowerCase(java.util.Locale.ROOT), true);
    List<String> name2Tokens = TextProcessor.fastTokenize(name2.toLowerCase(java.util.Locale.ROOT), true);
    boolean bSameAttrib = verifyEventAttributesPost(name1Tokens, name2Tokens)
        && verifyEventAttributesPre(name1Tokens, name2Tokens);
    if (!bSameAttrib)
    {
        LOG.info("similar events but different attributes");
        return new DedupResult("similar events but different attributes", 0, false);
    }

    StringBuilder reason = new StringBuilder();
    int score = 0;
    List<HitBase> searchResult1 = webSearch.runSearch(name1);
    List<HitBase> searchResult2 = webSearch.runSearch(name2);
    if (searchResult1 != null && searchResult2 != null)
    {
        for (HitBase item1 : searchResult1)
        {
            if (item1 == null || isExcludedSearchUrl(item1.getUrl()))
            {
                continue;
            }
            // The cleaned title of item1 does not depend on item2, so compute it once.
            String lookup1 = stripSocialSiteNames(item1.getTitle());
            if (lookup1 == null)
            {
                continue; // nothing to compare for a hit without a title
            }
            for (HitBase item2 : searchResult2)
            {
                if (item2 == null)
                {
                    continue;
                }
                String lookup2 = stripSocialSiteNames(item2.getTitle());
                if (lookup2 == null)
                {
                    continue;
                }
                double d;
                if (bStem)
                {
                    d = stringDistanceMeasurer.measureStringDistance(lookup1, lookup2);
                }
                else
                {
                    d = stringDistanceMeasurer.measureStringDistanceNoStemming(lookup1, lookup2);
                }
                if (d > threshold)
                {
                    reason.append("Found common search result title for group names '").append(lookup1)
                        .append(" < > ").append(lookup2).append(" sim = ").append(d).append("\n");
                    LOG.info(("Found common search result title for group names '" + lookup1 + " < > " + lookup2
                        + " sim = " + d));
                    score++;
                }
            }
        }
    }

    // A failed both-sides rule caps the score at 1, which is below the match cut-off (score > 1).
    boolean bothSidesSuccess = applyBothSidesRule(name1, name2);
    if (!bothSidesSuccess)
    {
        score = 1;
        reason.append("Failed common words test for sports");
    }
    if (score > 0)
    {
        boolean bDifferentGroup = bDifferentGroupOneSubnameOfAnother(name1, name2);
        if (bDifferentGroup)
        {
            score = 1;
            // NOTE(review): this message duplicates the both-sides one above and looks like a
            // copy-paste; it is kept unchanged because callers may rely on the reason text.
            reason.append("Failed common words test for sports");
        }
    }
    return new DedupResult(reason.toString(), score, score > 1);
}

/**
 * Tells whether a search hit must be ignored because its URL contains "myspace" or "wiki".
 * A {@code null} URL is never excluded.
 */
private static boolean isExcludedSearchUrl(String url)
{
    return url != null && (url.contains("myspace") || url.contains("wiki"));
}

/**
 * Removes the site names "Facebook", "LinkedIn" and "MySpace" from a search result title.
 * Returns {@code null} when the title itself is {@code null}.
 */
private static String stripSocialSiteNames(String title)
{
    if (title == null)
    {
        return null;
    }
    return title.replace("Facebook", "").replace("LinkedIn", "").replace("MySpace", "");
}