in opennlp-similarity/src/main/java/opennlp/tools/apps/object_dedup/SimilarityAccessorBase.java [491:643]
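/**
 * Decides whether two event names refer to the same event by combining local string
 * heuristics (attribute checks, weighted edit distance, name merge) with web mining:
 * the common token sub-list of both titles is searched and the returned hit titles
 * are scored against it. Returns a DedupResult carrying a textual reason, an integer
 * score and the accept/reject decision. (Summary as read from the code below.)
 */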
public DedupResult areNamesSemanticallyCloseWebMineCommonPart(String name1, String name2, String venue)
{
// normalize gender
name1 = normalizeGenderAndOtherAttributes(name1);
name2 = normalizeGenderAndOtherAttributes(name2);
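// First gate: check whether the short titles already look similar in web search space;
// if so, accept without any further analysis.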
Boolean bShortTitlesSimilarInWebSpace = attemptShortTitlesSimilarityInWebSpace(name1, name2);
if (bShortTitlesSimilarInWebSpace)
return new DedupResult("Accepted as short title by web mining", 2, true);
StringBuilder reason = new StringBuilder();
List<String> venueToks = removeVenuePart(TextProcessor.fastTokenize(venue.toLowerCase(), false));
LOG.info("\nComputing similarity between name = '" + name1 + "' and name = '" + name2 + "'");
// convert titles into token lists
List<String> name1Tokens = removeVenuePart(TextProcessor.fastTokenize(name1.toLowerCase(), true));
List<String> name2Tokens = removeVenuePart(TextProcessor.fastTokenize(name2.toLowerCase(), true));
// applySubPhraseExtractionRule()
boolean bSameAttrib = verifyEventAttributesPost(name1Tokens, name2Tokens)
&& verifyEventAttributesPre(name1Tokens, name2Tokens);
if (!bSameAttrib)
{
LOG.info("similar events but different attributes");
return new DedupResult("similar events but different attributes", 0, false);
}
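// Both-sides rule (common-words test applied to sports-style titles): failure rejects the pair.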
boolean bothSidesSuccess = applyBothSidesRuleEvent(name1, name2);
if (!bothSidesSuccess)
{
return new DedupResult("Failed common words test for sports", 0, false);
}
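// First distance shortcut: a low weighted Levenshtein distance between the raw titles
// accepts the pair immediately.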
float dist = (float) LevensteinDistanceFinder.levensteinDistance(name1, name2, 1, 10, 1, 10);
if (dist < 5.1)
{
LOG.info("Found low LevensteinDistance for name1 and name2");
return new DedupResult("Found low LevensteinDistance", 2, true);
}
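// Second shortcut: a positive score from getAttemptedNameMerge() also accepts the pair
// without any web mining.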
int nameMergeScore = getAttemptedNameMerge(name1, name2);
if (nameMergeScore > 0)
{
LOG.info("Found low NameMerge Distance for name1 and name2");
return new DedupResult("Found low NameMerge Distance", 2, true);
}
// TODO: take token order into account
// form common sub-list of tokens
name1Tokens.retainAll(name2Tokens);
name1Tokens.removeAll(venueToks);
name1Tokens.removeAll(COMMON_WORDS_IN_EVENT_TITLES);
name1Tokens.removeAll(Arrays.asList(ENGLISH_PREPOSITIONS));
name1Tokens = removeDollarWordAndNonAlphaFromList(name1Tokens);
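// At this point name1Tokens holds the filtered intersection of both titles;
// its size relative to name2Tokens drives the two boundary checks below.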
// TODO: use a full-string similarity measure here
// boundary case: too many words => just do counts
float commonPortion = (float) name1Tokens.size() / (float) name2Tokens.size();
if (commonPortion > 0.8 || name1Tokens.size() >= 4)
{ // after typical title words are removed, 4 common tokens looks OK
LOG.info("Accepted since substantial common part");
return new DedupResult("Accepted since substantial common part", Math.max((int) (commonPortion * 5.0), 2),
true);
}
// boundary case: no overlap
if (name1Tokens.size() < 1)
{
LOG.info("Rejected since nothing in common");
return new DedupResult("Rejected since nothing in common", 0, false);
}
// get from list of tokens back to words to get search expression
String entityExpression = name1Tokens.toString().replace('[', ' ').replace(']', ' ').replace(',', ' ')
.replace("  ", " ").trim();
/*
 * // now try name merge on the reduced strings
 * String entityExpression1 = name1TokensC.toString().replace('[', ' ').replace(']', ' ')
 *     .replace(',', ' ').replace("  ", " ").trim();
 * String entityExpression2 = name2Tokens.toString().replace('[', ' ').replace(']', ' ')
 *     .replace(',', ' ').replace("  ", " ").trim();
 * nameMergeScore = getAttemptedNameMerge(entityExpression1, entityExpression2);
 * if (nameMergeScore > 0)
 * {
 *     LOG.info("Found low NameMerge Distance for REDUCED name1 and name2");
 *     return new DedupResult("Found low NameMerge Distance REDUCED", 2, true);
 * }
 */
// Before doing web mining, make sure overlap between titles is NOT a
// set of common english words (use the vocabulary)
// if all words are common, then NOT an entity
if (name1Tokens.size() < 2)
{
boolean bCommonEnglishWord = false;
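// NOTE: the stop-word lookup in the loop below is commented out, so bCommonEnglishWord
// always stays false and this rejection branch is currently never taken.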
for (String word : name1Tokens)
{
// if (stopList.isCommonWord(word) /*&& mostFrequent1000Words.isMostFrequent1000Word(word)*/)
// bCommonEnglishWord = true;
}
if (bCommonEnglishWord)
{
LOG.info("Rejected common entity: common word = " + entityExpression);
return new DedupResult("Rejected since common entity is common English word = " + entityExpression, 0,
false);
}
}
// accept common expression
LOG.info("Formed common entity = " + entityExpression);
reason.append("Formed common entity = ").append(entityExpression).append("\n");
// now go to the web / bing api with this common expression
List<HitBase> searchResult = webSearch.runSearch(entityExpression);
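// Score up to five hit titles: non-capitalized titles are skipped; a hit adds 1.0 when its
// stemmed title covers every token of the common expression, otherwise it adds 0.25.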
float entityScore = 0f;
if (searchResult != null)
{
int count = 0;
for (HitBase item : searchResult)
{
String lookup = item.getTitle();
LOG.info("Bing hit title = '" + lookup + "'");
reason.append("Bing hit title = '").append(lookup).append("'\n");
if (count > 4)
break;
count++;
// if occurrence is not capitalized then rejected, do not take
// into account in score
if (!isCapitalized(lookup))
{
LOG.info("Rejected hit title since not capitalized");
reason.append("Rejected hit title since not capitalized\n");
continue;
}
/*
* if (lookup.indexOf('-')>0 ){ lookup = lookup.split("-")[0]; }
*/
// now compute overlap between what found on the web for hit's
// title and the common expression between events
List<String> lookupTokens = tokenizeAndStem(lookup);
lookupTokens.retainAll(stemList(name1Tokens));
if (lookupTokens.size() >= name1Tokens.size())
{
// increment score if found hit title is acceptable
entityScore += 1.0;
}
else
{
LOG.info("Found hit title " + lookupTokens + " does not cover comonality expr = " + name1Tokens);
entityScore += 0.25;
}
}
}
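// Final decision: the integer score is the truncated entityScore, and the pair is accepted
// only when entityScore > 1.0, i.e. one fully-covering capitalized hit (1.0) is not enough
// on its own; it needs at least one more scored hit, or five partial (0.25) matches.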
return new DedupResult(reason.toString(), (int) entityScore, entityScore > 1.0);
}
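/*
 * Usage sketch with hypothetical inputs (any concrete accessor subclass; the third
 * argument is the venue, whose tokens are subtracted from both titles before matching):
 *
 *   DedupResult res = accessor.areNamesSemanticallyCloseWebMineCommonPart(
 *       "Lady Gaga Monster Ball Tour", "The Monster Ball Tour - Lady Gaga", "Madison Square Garden");
 */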