public DedupResult areNamesSemanticallyCloseWebMineCommonPart()

in opennlp-similarity/src/main/java/opennlp/tools/apps/object_dedup/SimilarityAccessorBase.java [491:643]


	/**
	 * Decides whether two names denote the same entity by running a sequence of checks and, if none of them
	 * settles the question, by searching the web for the words both names have in common.
	 * <p>
	 * The checks run in this order; the first one that reaches a verdict returns:
	 * <ol>
	 * <li>short-title similarity in web space ({@code attemptShortTitlesSimilarityInWebSpace}) - accept;</li>
	 * <li>event attribute verification (post and pre) on the tokenized names - reject on mismatch;</li>
	 * <li>{@code applyBothSidesRuleEvent} - reject on failure;</li>
	 * <li>Levenshtein distance below 5.1 - accept;</li>
	 * <li>positive name-merge score ({@code getAttemptedNameMerge}) - accept;</li>
	 * <li>common tokens (shared tokens minus venue tokens, common event-title words, prepositions and whatever
	 * {@code removeDollarWordAndNonAlphaFromList} drops): reject if none are left, accept if they make up more
	 * than 80% of the second name's tokens or there are at least four of them;</li>
	 * <li>otherwise the common tokens are sent to {@code webSearch} and the titles of the first five hits are
	 * scored; the pair is accepted when the accumulated score exceeds 1.0.</li>
	 * </ol>
	 * NOTE(review): the three {@code DedupResult} constructor arguments appear to be reason text, integer score
	 * and accept flag, judging from how they are used here - confirm against {@code DedupResult}.
	 *
	 * @param name1 first name; normalized with {@code normalizeGenderAndOtherAttributes} before comparison
	 * @param name2 second name; normalized the same way
	 * @param venue venue text whose tokens are excluded from the common part; {@code null} is treated as
	 *            "no venue" instead of failing
	 * @return the verdict together with the reason text and score built along the way
	 */
	public DedupResult areNamesSemanticallyCloseWebMineCommonPart(String name1, String name2, String venue)
	{
		// normalize gender
		name1 = normalizeGenderAndOtherAttributes(name1);
		name2 = normalizeGenderAndOtherAttributes(name2);

		// Boolean.TRUE.equals(..) instead of auto-unboxing: a null result must not end in a NullPointerException
		if (Boolean.TRUE.equals(attemptShortTitlesSimilarityInWebSpace(name1, name2)))
			return new DedupResult("Accepted as short title by web mining", 2, true);

		StringBuilder reason = new StringBuilder();
		// the venue is only used to subtract its tokens from the common part, so a missing venue
		// simply contributes nothing
		List<String> venueToks = venue == null
			? Arrays.<String> asList()
			: removeVenuePart(TextProcessor.fastTokenize(venue.toLowerCase(), false));

		LOG.info("\nComputing similarity between name = '" + name1 + "' and name = '" + name2 + "'");
		// convert titles into token lists
		List<String> name1Tokens = removeVenuePart(TextProcessor.fastTokenize(name1.toLowerCase(), true));
		List<String> name2Tokens = removeVenuePart(TextProcessor.fastTokenize(name2.toLowerCase(), true));

		boolean bSameAttrib = verifyEventAttributesPost(name1Tokens, name2Tokens)
			&& verifyEventAttributesPre(name1Tokens, name2Tokens);
		if (!bSameAttrib)
		{
			LOG.info("similar events but different attributes");
			return new DedupResult("similar events but different attributes", 0, false);
		}

		boolean bothSidesSuccess = applyBothSidesRuleEvent(name1, name2);
		if (!bothSidesSuccess)
		{
			return new DedupResult("Failed common words test for sports", 0, false);
		}

		float dist = (float) LevensteinDistanceFinder.levensteinDistance(name1, name2, 1, 10, 1, 10);
		if (dist < 5.1)
		{
			LOG.info("Found low LevensteinDistance for name1 and name2");
			return new DedupResult("Found low LevensteinDistance", 2, true);
		}

		int nameMergeScore = getAttemptedNameMerge(name1, name2);
		if (nameMergeScore > 0)
		{
			LOG.info("Found low NameMerge Distance for name1 and name2");
			return new DedupResult("Found low  NameMerge Distance", 2, true);
		}

		// todo take into account order
		// form common sub-list of tokens (name1Tokens is reduced in place to the shared tokens)
		name1Tokens.retainAll(name2Tokens);
		name1Tokens.removeAll(venueToks);
		name1Tokens.removeAll(COMMON_WORDS_IN_EVENT_TITLES);
		name1Tokens.removeAll(Arrays.asList(ENGLISH_PREPOSITIONS));
		List<String> commonTokens = removeDollarWordAndNonAlphaFromList(name1Tokens);

		// boundary case: no overlap. Checked before the ratio below so that an empty second name can no
		// longer produce 0/0 = NaN; the outcome for an empty overlap is the same rejection as before.
		if (commonTokens.isEmpty())
		{
			LOG.info("Rejected since nothing in common");
			return new DedupResult("Rejected since nothing in common", 0, false);
		}

		// todo : to use full string measure
		// boundary case: too many words => just do counts
		float commonPortion = (float) commonTokens.size() / (float) name2Tokens.size();
		if (commonPortion > 0.8 || commonTokens.size() >= 4)
		{
			// four or more shared words left after the typical title words are removed looks OK
			LOG.info("Accepted since substantial common part");
			return new DedupResult("Accepted since substantial common part", Math.max((int) (commonPortion * 5.0), 2),
				true);
		}

		// get from list of tokens back to words to get search expression
		String entityExpression = commonTokens.toString().replace('[', ' ').replace(']', ' ').replace(',', ' ')
			.replace("  ", " ").trim();

		// TODO: a rejection of overlaps consisting of a single common English word used to be sketched here,
		// but its stop-list lookup was commented out, so the branch could never fire and has been removed.
		// Every non-empty common expression therefore goes on to web mining.

		// accept common expression
		LOG.info("Formed common entity = " + entityExpression);
		reason.append("Formed common entity = ").append(entityExpression).append("\n");
		// now go to the web / bing api with this common expression
		List<HitBase> searchResult = webSearch.runSearch(entityExpression);
		float entityScore = scoreCommonEntityAgainstHitTitles(searchResult, commonTokens, reason);
		return new DedupResult(reason.toString(), (int) entityScore, entityScore > 1.0);
	}

	/**
	 * Scores how well the titles of the leading search hits cover the common expression: 1.0 for each
	 * capitalized title whose stemmed tokens overlap with the stemmed common tokens at least as many times as
	 * there are common tokens, 0.25 for each capitalized title that does not, nothing for titles that are not
	 * capitalized. At most five hits are looked at, and only those are logged and recorded in {@code reason}.
	 *
	 * @param searchResult hits returned for the common expression; {@code null} yields a score of 0
	 * @param commonTokens tokens shared by both names
	 * @param reason accumulates a trace of the hits considered, for the caller's result
	 * @return the accumulated score
	 */
	private float scoreCommonEntityAgainstHitTitles(List<HitBase> searchResult, List<String> commonTokens,
		StringBuilder reason)
	{
		float entityScore = 0f;
		if (searchResult == null)
			return entityScore;

		final int maxHitsToScore = 5;
		// the stems of the common tokens do not change between hits, so compute them once
		List<String> commonStems = stemList(commonTokens);
		int count = 0;
		for (HitBase item : searchResult)
		{
			// stop before touching a hit that will not be scored, so it is not reported in the reason either
			if (count >= maxHitsToScore)
				break;
			count++;

			String lookup = item.getTitle();
			LOG.info("Bing hit title = '" + lookup + "'");
			reason.append("Bing hit title = '").append(lookup).append("'\n");
			// if occurrence is not capitalized then rejected, do not take
			// into account in score
			if (!isCapitalized(lookup))
			{
				LOG.info("Rejected hit title since not capitalized");
				reason.append("Rejected hit title since not capitalized\n");
				continue;
			}

			// now compute overlap between what found on the web for hit's
			// title and the common expression between events
			List<String> lookupTokens = tokenizeAndStem(lookup);
			lookupTokens.retainAll(commonStems);
			if (lookupTokens.size() >= commonTokens.size())
			{
				// increment score if found hit title is acceptable
				entityScore += 1.0f;
			}
			else
			{
				LOG.info("Found hit title " + lookupTokens + " does not cover comonality expr = " + commonTokens);
				entityScore += 0.25f;
			}
		}
		return entityScore;
	}