in languagetool-language-modules/uk/src/main/java/org/languagetool/tokenizers/uk/UkrainianWordTokenizer.java [244:432]
/**
 * Rewrites the text before it is split into tokens by inserting marker
 * characters/placeholders and substituting characters (commas, dots, spaces,
 * slashes, braces, angle brackets) in contexts recognized by the class-level
 * patterns. The steps are applied in a fixed order; later steps operate on
 * the output of earlier ones, so the order must not be changed.
 *
 * <p>The literals {@code \uE120} and {@code \uE110} used in several
 * replacement strings are Unicode private-use characters; their exact role is
 * defined by the splitting code, which is outside this method.
 *
 * @param text the text to prepare; it is first passed through {@code cleanup}
 * @param urls output map: for every {@code URL_PATTERN} match a single
 *             substitute character (as a string) is mapped to the matched text
 * @return the adjusted text
 */
private String adjustTextForTokenizing(String text, HashMap<String, String> urls) {
  text = cleanup(text);

  // leading dash: only inspect the first character when there is one,
  // charAt(0) on an empty string throws StringIndexOutOfBoundsException
  if( ! text.isEmpty() && "\u2014\u2013-".indexOf(text.charAt(0)) >=0 ) {
    Matcher matcher = LEADING_DASH_PATTERN.matcher(text);
    if( matcher.find() ) {
      text = matcher.replaceFirst("$1"+BREAKING_PLACEHOLDER+"$2");
    }
    else {
      // second pattern is tried only if the first one did not match
      matcher = LEADING_DASH_PATTERN_2.matcher(text);
      if( matcher.find() ) {
        text = matcher.replaceFirst("$1"+BREAKING_PLACEHOLDER+"$2");
      }
    }
  }

  // cheap contains() check before running the regex
  if( text.contains(",") ) {
    text = DECIMAL_COMMA_PATTERN.matcher(text).replaceAll(DECIMAL_COMMA_REPL);
  }

  // check for urls
  if( text.contains("http") || text.contains("www") || text.contains("@") || text.contains("ftp") ) { // https?|ftp
    Matcher matcher = URL_PATTERN.matcher(text);
    int urlReplaceChar = URL_START_REPLACE_CHAR;

    // each match is replaced by its own single character (consecutive code points
    // starting at URL_START_REPLACE_CHAR) and remembered in the urls map
    while( matcher.find() ) {
      String urlGroup = matcher.group();
      String replaceChar = String.valueOf((char)urlReplaceChar);
      urls.put(replaceChar, urlGroup);
      text = matcher.replaceFirst(replaceChar);
      urlReplaceChar++;
      // text has changed, so a new matcher is needed for the next search
      matcher = URL_PATTERN.matcher(text);
    }
  }

  // em dash followed by horizontal/vertical whitespace: put a placeholder in front of the dash
  if( text.indexOf('\u2014') != -1 ) {
    text = text.replaceAll("\u2014([\\h\\v])", BREAKING_PLACEHOLDER + "\u2014$1");
  }

  boolean nDashPresent = text.indexOf('\u2013') != -1;
  if( text.indexOf('-') != -1 || nDashPresent ) {
    text = DASH_NUMBERS_PATTERN.matcher(text).replaceAll(DASH_NUMBERS_REPL);

    if( nDashPresent ) {
      text = N_DASH_SPACE_PATTERN.matcher(text).replaceAll(N_DASH_SPACE_REPL);
      text = N_DASH_SPACE_PATTERN2.matcher(text).replaceAll(N_DASH_SPACE_REPL);
    }
  }

  // fixed strings in which the slash is swapped for NON_BREAKING_SLASH_SUBST
  if( text.indexOf("с/г") != -1 ) {
    text = text.replace("с/г", "с" +NON_BREAKING_SLASH_SUBST + "г");
  }
  if( text.indexOf("Л/ДНР") != -1 ) {
    text = text.replace("Л/ДНР", "Л" +NON_BREAKING_SLASH_SUBST + "ДНР");
  }

  if( text.indexOf("р.") != -1 ) {
    Matcher matcher = YEAR_WITH_R.matcher(text);
    if( matcher.find() ) {
      text = matcher.replaceAll("$1" + BREAKING_PLACEHOLDER + "$2");
    }
  }

  // leave only potential hashtags together
  text = text.replace("#", BREAKING_PLACEHOLDER + "#");

  // leave numbers with following % together
  // (a placeholder is inserted after '%' unless the next character is '-')
  if( text.indexOf('%') >= 0 ) {
    text = text.replaceAll("%([^-])", "%" + BREAKING_PLACEHOLDER + "$1");
  }

  text = COMPOUND_WITH_QUOTES1.matcher(text).replaceAll("$1$2\uE120$3\uE120$4\uE120");
  text = COMPOUND_WITH_QUOTES2.matcher(text).replaceAll("$1\uE120$2\uE120$3\uE120$4");

  if( text.indexOf('[') != -1 ) {
    // "\\[" and "\\]" in the replacement produce literal brackets
    text = WORDS_WITH_BRACKETS_PATTERN.matcher(text).replaceAll("$1\\[\uE120$2\\]\uE120");
  }

  // if period is not the last character in the sentence
  int dotIndex = text.indexOf('.');
  // text without trailing horizontal/vertical whitespace, used only for the position checks below
  String textRtrimmed = text.replaceFirst("[\\h\\v]*$", "");
  boolean dotInsideSentence = dotIndex >= 0 && dotIndex < textRtrimmed.length()-1;

  if( dotInsideSentence
      || (dotIndex == textRtrimmed.length()-1
          && ABBR_AT_THE_END.matcher(text).find()) ) {  // ugly - special case for "тис." and initials

    text = DOTTED_NUMBERS_PATTERN3.matcher(text).replaceAll("$1.\uE120$2.\uE120$3");
    text = DOTTED_NUMBERS_PATTERN.matcher(text).replaceAll("$1.\uE120$2");

    text = ABBR_DOT_NAR_PATTERN_1.matcher(text).replaceAll("$1.\uE120\uE110");
    text = ABBR_DOT_NAR_PATTERN_2.matcher(text).replaceAll("$1.\uE120\uE110$2");
    text = ABBR_DOT_2_SMALL_LETTERS_PATTERN.matcher(text).replaceAll("$1.\uE120\uE110$2.\uE120\uE110"); //.replaceFirst("(([смкд]|мк)?м\\.[\\h\\v]*)\uE120\uE110$", "$1");
    text = ABBR_DOT_VO_PATTERN1.matcher(text).replaceAll(ABBR_DOT_2_SMALL_LETTERS_REPL);
    text = ABBR_DOT_VO_PATTERN2.matcher(text).replaceAll(ABBR_DOT_2_SMALL_LETTERS_REPL);
    text = ABBR_DOT_VO_PATTERN3.matcher(text).replaceAll(ABBR_DOT_2_SMALL_LETTERS_REPL);
    text = ABBR_DOT_ART_PATTERN.matcher(text).replaceAll(ONE_DOT_TWO_REPL);
    text = ABBR_DOT_MAN_PATTERN.matcher(text).replaceAll(ONE_DOT_TWO_REPL);
    text = ABBR_DOT_TYS_PATTERN1.matcher(text).replaceAll("$1$2" + NON_BREAKING_DOT_SUBST + BREAKING_PLACEHOLDER);
    text = ABBR_DOT_TYS_PATTERN2.matcher(text).replaceAll(ONE_DOT_TWO_REPL);
    text = ABBR_DOT_LAT_PATTERN.matcher(text).replaceAll(ONE_DOT_TWO_REPL);
    text = ABBR_DOT_PROF_PATTERN.matcher(text).replaceAll(ONE_DOT_TWO_REPL);
    text = ABBR_DOT_GUB_PATTERN.matcher(text).replaceAll("$1" + NON_BREAKING_DOT_SUBST + BREAKING_PLACEHOLDER);
    text = ABBR_DOT_DASH_PATTERN.matcher(text).replaceAll("$1" + NON_BREAKING_DOT_SUBST + "$2");

    text = INITIALS_DOT_PATTERN_SP_2.matcher(text).replaceAll(INITIALS_DOT_REPL_SP_2);
    text = INITIALS_DOT_PATTERN_SP_1.matcher(text).replaceAll(INITIALS_DOT_REPL_SP_1);
    text = INITIALS_DOT_PATTERN_RSP_2.matcher(text).replaceAll(INITIALS_DOT_REPL_RSP_2);
    text = INITIALS_DOT_PATTERN_RSP_1.matcher(text).replaceAll(INITIALS_DOT_REPL_RSP_1);

    // text = ABBR_DOT_INVALID_DOT_PATTERN.matcher(text).replaceAll(ONE_DOT_TWO_REPL);
    text = ABBR_DOT_KUB_SM_PATTERN.matcher(text).replaceAll("$1.\uE120\uE110$2");
    text = ABBR_DOT_S_G_PATTERN.matcher(text).replaceAll("$1" + NON_BREAKING_DOT_SUBST + "$2" + NON_BREAKING_DOT_SUBST + BREAKING_PLACEHOLDER);
    text = ABBR_DOT_CHL_KOR_PATTERN.matcher(text).replaceAll("$1.\uE120$2.\uE120\uE110");
    text = ABBR_DOT_PN_ZAH_PATTERN.matcher(text).replaceAll("$1.\uE120\uE110$2.\uE120\uE110");
    text = ABBR_DOT_I_T_P_PATTERN.matcher(text).replaceAll("$1\uE120\uE110$2\uE120\uE110");
    text = ABBR_DOT_I_T_CH_PATTERN.matcher(text).replaceAll("$1\uE120\uE110$2\uE120\uE110");
    text = ABBR_DOT_T_ZV_PATTERN.matcher(text).replaceAll("$1\uE120\uE110$2\uE120\uE110");
    text = ABBR_DOT_RED_AVT_PATTERN.matcher(text).replaceAll("$1.\uE120\uE110$2");
    text = ABBR_DOT_NON_ENDING_PATTERN.matcher(text).replaceAll("$1.\uE120\uE110");
    text = ABBR_DOT_NON_ENDING_PATTERN_2.matcher(text).replaceAll("$1\uE120\uE110$2");
    text = INVALID_MLN_DOT_PATTERN.matcher(text).replaceAll("$1.\uE120\uE110$2");
  }

  // web entities are handled only when the first dot is not the last character
  if( dotInsideSentence ) {
    text = WEB_ENTITIES.matcher(text).replaceAll("$1.\uE120$2");
    text = WEB_ENTITIES2.matcher(text).replaceAll(".\uE120$1.\uE120$2");
  }

  // applied unconditionally, regardless of where the dot is
  text = ABBR_DOT_ENDING_PATTERN.matcher(text).replaceAll("$1.\uE120\uE110");

  // 2 000 000
  Matcher spacedDecimalMatcher = DECIMAL_SPACE_PATTERN.matcher(text);
  if( spacedDecimalMatcher.find() ) {
    StringBuffer sb = new StringBuffer();
    do {
      // inside each match, regular space, U+00A0 and U+202F all become NON_BREAKING_SPACE_SUBST
      String splitNumber = spacedDecimalMatcher.group(0);
      String splitNumberAdjusted = splitNumber.replace(' ', NON_BREAKING_SPACE_SUBST);
      splitNumberAdjusted = splitNumberAdjusted.replace('\u00A0', NON_BREAKING_SPACE_SUBST);
      splitNumberAdjusted = splitNumberAdjusted.replace('\u202F', NON_BREAKING_SPACE_SUBST);
      // the replacement is literal text taken from the input, so it is quoted to keep
      // '$' and '\' from being interpreted as group references/escapes
      spacedDecimalMatcher.appendReplacement(sb, Matcher.quoteReplacement(splitNumberAdjusted));
    } while( spacedDecimalMatcher.find() );

    spacedDecimalMatcher.appendTail(sb);
    text = sb.toString();
  }

  // 12:25
  if( text.contains(":") ) {
    text = COLON_NUMBERS_PATTERN.matcher(text).replaceAll(COLON_NUMBERS_REPL);
  }

  // ВКПБ(о)
  if( text.contains("(") ) {
    text = BRACE_IN_WORD_PATTERN.matcher(text).replaceAll("$1" + LEFT_BRACE_SUBST + "$2" + RIGHT_BRACE_SUBST);
  }

  if( text.contains("<") ) {
    // matched tags get substitute angle brackets and are surrounded by placeholders;
    // a slash right after the opening / right before the closing substitute is then replaced too
    text = XML_TAG_PATTERN.matcher(text).replaceAll(BREAKING_PLACEHOLDER + LEFT_ANGLE_SUBST + "$1" + RIGHT_ANGLE_SUBST + BREAKING_PLACEHOLDER);
    text = text.replace(LEFT_ANGLE_SUBST+"/", "" + LEFT_ANGLE_SUBST + SLASH_SUBST);
    text = text.replace("/" + RIGHT_ANGLE_SUBST, "" + SLASH_SUBST + RIGHT_ANGLE_SUBST);
  }

  if( text.contains("-") ) {
    // placeholder between a Cyrillic letter and a run of quotes/hyphens ending with a hyphen (both directions)
    text = text.replaceAll("([а-яіїєґА-ЯІЇЄҐ])([»\"-]+-)", "$1" + BREAKING_PLACEHOLDER + "$2");
    text = text.replaceAll("([»\"-]+-)([а-яіїєґА-ЯІЇЄҐ])", "$1" + BREAKING_PLACEHOLDER + "$2");
  }

  if( text.contains(SOFT_HYPHEN_WRAP) ) {
    // only occurrences not preceded by whitespace are substituted
    text = text.replaceAll("(?<!\\s)"+SOFT_HYPHEN_WRAP, SOFT_HYPHEN_WRAP_SUBST);
  }

  if( text.indexOf('\'') >= 0 ) {
    text = APOSTROPHE_BEGIN_PATTERN.matcher(text).replaceAll("$1'" + BREAKING_PLACEHOLDER + "$2");
    text = APOSTROPHE_END_PATTER.matcher(text).replaceAll("$1" + BREAKING_PLACEHOLDER + "'$2");
  }

  if( text.contains("+") ) {
    // '+' directly followed by a Cyrillic letter or a digit gets a placeholder on both sides
    text = text.replaceAll("\\+(?=[а-яіїєґА-ЯІЇЄҐ0-9])", BREAKING_PLACEHOLDER + "+" + BREAKING_PLACEHOLDER);
  }

  // -20C
  if( text.length() > 1 && (text.contains("-") || text.contains("\u2013")) ) {
    // hyphen/en dash at the start of text or after whitespace and directly before a digit
    text = text.replaceAll("(?<=(^|[\\h\\v]))([-\u2013])(?=[0-9])", "$2" + BREAKING_PLACEHOLDER);
  }

  text = NUMBER_MISSING_SPACE.matcher(text).replaceAll("$1" + BREAKING_PLACEHOLDER + "$2");

  return text;
}