public DedupResult areNamesSemanticallyCloseInWebSearchSpace()

in opennlp-similarity/src/main/java/opennlp/tools/apps/object_dedup/SimilarityAccessorBase.java [645:728]


	/**
	 * Decides whether two names are close enough to be treated as duplicates by running a web
	 * search for each name and counting pairs of result titles whose string similarity exceeds
	 * a threshold.
	 * <p>
	 * Before searching, both names are normalized and passed through several rule checks
	 * (sports rule, special-groups/symphonies rule, event-attribute checks); a failing rule
	 * returns immediately with score 0. After searching, two further rules can cap the score
	 * at 1, which forces a negative verdict because the verdict flag is {@code score > 1}.
	 *
	 * @param name1  first name to compare
	 * @param name2  second name to compare
	 * @param thresh similarity a title pair must exceed to count as a match; {@code null} or
	 *               {@code 0} selects the default of 0.8
	 * @param bStem  if {@code true} titles are compared with {@code measureStringDistance},
	 *               otherwise with {@code measureStringDistanceNoStemming}
	 * @return a {@code DedupResult} built from the accumulated reason text, the score (number
	 *         of matching title pairs, possibly overridden to 1 by the post-search rules) and
	 *         a flag that is {@code true} only when the score is greater than 1
	 */
	public DedupResult areNamesSemanticallyCloseInWebSearchSpace(String name1, String name2, Float thresh, boolean bStem)
	{
		// A missing or zero threshold falls back to the default.
		final float minSimilarity = (thresh == null || thresh == 0f) ? 0.8f : thresh;

		// normalize gender
		name1 = normalizeGenderAndOtherAttributes(name1);
		name2 = normalizeGenderAndOtherAttributes(name2);

		// Rule checks that reject the pair outright, before any web search is issued.
		if (!succeededMenWomenSportsRule(name1, name2))
		{
			return new DedupResult("Sports rule: different teams or teams of different venues", 0, false);
		}
		if (!succeededSpecialGroupsSymphoniesRule(name1, name2))
		{
			return new DedupResult("SpecialGroupsSymphoniesRule: different circus/band", 0, false);
		}

		LOG.info("\nComputing similarity between name = '" + name1 + "' and name = '" + name2 + "'");

		List<String> name1Tokens = TextProcessor.fastTokenize(name1.toLowerCase(), true);
		List<String> name2Tokens = TextProcessor.fastTokenize(name2.toLowerCase(), true);
		boolean bSameAttrib = verifyEventAttributesPost(name1Tokens, name2Tokens)
			&& verifyEventAttributesPre(name1Tokens, name2Tokens);
		if (!bSameAttrib)
		{
			LOG.info("similar events but different attributes");
			return new DedupResult("similar events but different attributes", 0, false);
		}

		StringBuilder reason = new StringBuilder();
		List<HitBase> searchResult1 = webSearch.runSearch(name1);
		List<HitBase> searchResult2 = webSearch.runSearch(name2);
		int score = 0;
		if (searchResult1 != null && searchResult2 != null)
		{
			for (HitBase item1 : searchResult1)
			{
				// NOTE(review): this URL filter is applied only to hits for name1, not name2,
				// so the comparison is not symmetric in its arguments - confirm this is intended.
				String url1 = item1.getUrl();
				if (url1 != null && (url1.contains("myspace") || url1.contains("wiki")))
				{
					continue;
				}
				// The cleaned title of item1 does not depend on item2, so compute it once per
				// outer iteration. A hit without a title cannot be compared and is skipped.
				String lookup1 = stripSocialNetworkNames(item1.getTitle());
				if (lookup1 == null)
				{
					continue;
				}
				for (HitBase item2 : searchResult2)
				{
					String lookup2 = stripSocialNetworkNames(item2.getTitle());
					if (lookup2 == null)
					{
						continue;
					}
					double d = bStem
						? stringDistanceMeasurer.measureStringDistance(lookup1, lookup2)
						: stringDistanceMeasurer.measureStringDistanceNoStemming(lookup1, lookup2);
					if (d > minSimilarity)
					{
						reason.append("Found common search result title for group names '").append(lookup1)
							.append(" < > ").append(lookup2).append(" sim = ").append(d).append("\n");
						LOG.info(("Found common search result title for group names '" + lookup1 + " < > " + lookup2
							+ " sim = " + d));
						score++;
					}
				}
			}
		}

		// Post-search rules: either one caps the score at 1, which makes the final verdict
		// (score > 1) false regardless of how many title pairs matched.
		if (!applyBothSidesRule(name1, name2))
		{
			score = 1;
			reason.append("Failed common words test for sports");
		}
		if (score > 0 && bDifferentGroupOneSubnameOfAnother(name1, name2))
		{
			score = 1;
			// Keep this reason on its own line and name the rule that actually fired.
			if (reason.length() > 0 && reason.charAt(reason.length() - 1) != '\n')
			{
				reason.append('\n');
			}
			reason.append("Failed different-group test: one name is a subname of another");
		}
		return new DedupResult(reason.toString(), score, score > 1);
	}

	/**
	 * Removes the literal site names "Facebook", "LinkedIn" and "MySpace" from a search result
	 * title so they do not contribute to the title similarity.
	 *
	 * @param title raw search result title, may be {@code null}
	 * @return the title with those site names removed, or {@code null} if {@code title} is {@code null}
	 */
	private static String stripSocialNetworkNames(String title)
	{
		if (title == null)
		{
			return null;
		}
		return title.replace("Facebook", "").replace("LinkedIn", "").replace("MySpace", "");
	}