opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringDistanceMeasurer.java [104:164]: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - int l1 = str1List.size(), l2 = str2List.size(); if (l1 < 2) l1 = str1Words.length; if (l2 < 2) l2 = str2Words.length; int lOverlap = 0; List strListOverlap = new ArrayList<>(str1List); strListOverlap.retainAll(str2List); for (String w : strListOverlap) { if (w.toLowerCase().equals(w)) // no special interest word lOverlap++; else lOverlap += 2; // if capitalized, or specific word => important so // double score } result = Math.pow((double) (lOverlap * lOverlap) / (double) l1 / (double) l2, 0.4); // now we try to find similar words which are long or Upper case int countSimilar = 0; str1List.removeAll(strListOverlap); str2List.removeAll(strListOverlap); for (String w1 : str1List) { for (String w2 : str2List) { if (w1.length() > MIN_STRING_LENGTH_FOR_DISTORTED_WORD || !w1.toLowerCase().equals(w1)) if (w2.length() > MIN_STRING_LENGTH_FOR_DISTORTED_WORD || !w2.toLowerCase().equals(w2)) if (LevensteinDistanceFinder.levensteinDistance(w1, w2, 1, 10, 1, 10) <= ACCEPTABLE_DEVIATION_IN_CHAR) countSimilar++; } } lOverlap += countSimilar; result = Math.pow((double) (lOverlap * lOverlap) / (double) l1 / (double) l2, 0.4); if (result > 1) result = 1.0; // double ld = LevensteinDistanceFinder. levensteinDistance(str1, str2, 1, // 10, 1, 10); // System.out.println(ld); } catch (Exception e) { e.printStackTrace(); return -1.0; } Double linguisticScore = -1.0; // to be developed - employs linguistic processor /* * if (result>MIN_SCORE_FOR_LING) { List> matchResult = * pos.matchOrigSentencesCache(str1, str2); linguisticScore = * ParseTreeChunkListScorer.getParseTreeChunkListScore(matchResult); * System.out.println(matchResult); * * // magic formula for 0.7 string match and 0.3 linguistic match result = * result*0.7 + linguisticScore/6.0* 0.3; } */ return result; - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringDistanceMeasurer.java [180:240]: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - int l1 = str1List.size(), l2 = str2List.size(); if (l1 < 2) l1 = str1Words.length; if (l2 < 2) l2 = str2Words.length; int lOverlap = 0; List strListOverlap = new ArrayList<>(str1List); strListOverlap.retainAll(str2List); for (String w : strListOverlap) { if (w.toLowerCase().equals(w)) // no special interest word lOverlap++; else lOverlap += 2; // if capitalized, or specific word => important so // double score } result = Math.pow((double) (lOverlap * lOverlap) / (double) l1 / (double) l2, 0.4); // now we try to find similar words which are long or Upper case int countSimilar = 0; str1List.removeAll(strListOverlap); str2List.removeAll(strListOverlap); for (String w1 : str1List) { for (String w2 : str2List) { if (w1.length() > MIN_STRING_LENGTH_FOR_DISTORTED_WORD || !w1.toLowerCase().equals(w1)) if (w2.length() > MIN_STRING_LENGTH_FOR_DISTORTED_WORD || !w2.toLowerCase().equals(w2)) if (LevensteinDistanceFinder.levensteinDistance(w1, w2, 1, 10, 1, 10) <= ACCEPTABLE_DEVIATION_IN_CHAR) countSimilar++; } } lOverlap += countSimilar; result = Math.pow((double) (lOverlap * lOverlap) / (double) l1 / (double) l2, 0.4); if (result > 1) result = 1.0; // double ld = LevensteinDistanceFinder. levensteinDistance(str1, str2, 1, // 10, 1, 10); // System.out.println(ld); } catch (Exception e) { e.printStackTrace(); return -1.0; } Double linguisticScore = -1.0; // to be developed - employs linguistic processor /* * if (result>MIN_SCORE_FOR_LING) { List> matchResult = * pos.matchOrigSentencesCache(str1, str2); linguisticScore = * ParseTreeChunkListScorer.getParseTreeChunkListScore(matchResult); * System.out.println(matchResult); * * // magic formula for 0.7 string match and 0.3 linguistic match result = * result*0.7 + linguisticScore/6.0* 0.3; } */ return result; - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -