private String adjustTextForTokenizing()

in languagetool-language-modules/uk/src/main/java/org/languagetool/tokenizers/uk/UkrainianWordTokenizer.java [244:432]


  private String adjustTextForTokenizing(String text, HashMap<String, String> urls) {
    text = cleanup(text);

    if( "\u2014\u2013-".indexOf(text.charAt(0)) >=0 ) {
      Matcher matcher = LEADING_DASH_PATTERN.matcher(text);
      if( matcher.find() ) {
        text = matcher.replaceFirst("$1"+BREAKING_PLACEHOLDER+"$2");
      }
      else {
        matcher = LEADING_DASH_PATTERN_2.matcher(text);
        if( matcher.find() ) {
          text = matcher.replaceFirst("$1"+BREAKING_PLACEHOLDER+"$2");
        }
      }
    }
    
    if( text.contains(",") ) {
      text = DECIMAL_COMMA_PATTERN.matcher(text).replaceAll(DECIMAL_COMMA_REPL);
    }

    // check for urls
    if( text.contains("http") || text.contains("www") || text.contains("@") || text.contains("ftp") ) { // https?|ftp
      Matcher matcher = URL_PATTERN.matcher(text);
      int urlReplaceChar = URL_START_REPLACE_CHAR;
      
      while( matcher.find() ) {
        String urlGroup = matcher.group();
        String replaceChar = String.valueOf((char)urlReplaceChar);
        urls.put(replaceChar, urlGroup);
        text = matcher.replaceFirst(replaceChar);
        urlReplaceChar++;
        matcher = URL_PATTERN.matcher(text);
      }
    }

    if( text.indexOf('\u2014') != -1 ) {
      text = text.replaceAll("\u2014([\\h\\v])", BREAKING_PLACEHOLDER + "\u2014$1");
    }

    boolean nDashPresent = text.indexOf('\u2013') != -1;
    if( text.indexOf('-') != -1 || nDashPresent ) {
      text = DASH_NUMBERS_PATTERN.matcher(text).replaceAll(DASH_NUMBERS_REPL);
      if( nDashPresent ) {
        text = N_DASH_SPACE_PATTERN.matcher(text).replaceAll(N_DASH_SPACE_REPL);
        text = N_DASH_SPACE_PATTERN2.matcher(text).replaceAll(N_DASH_SPACE_REPL);
      }
    }

    if( text.indexOf("с/г") != -1 ) {
      text = text.replace("с/г", "с" +NON_BREAKING_SLASH_SUBST + "г");
    }

    if( text.indexOf("Л/ДНР") != -1 ) {
      text = text.replace("Л/ДНР", "Л" +NON_BREAKING_SLASH_SUBST + "ДНР");
    }

    if( text.indexOf("р.") != -1 ) {
      Matcher matcher = YEAR_WITH_R.matcher(text);
      if( matcher.find() ) {
        text = matcher.replaceAll("$1" + BREAKING_PLACEHOLDER + "$2");
      }
    }

    // leave only potential hashtags together
    text = text.replace("#", BREAKING_PLACEHOLDER + "#");
    // leave numbers with following % together
    if( text.indexOf('%') >= 0 ) {
      text = text.replaceAll("%([^-])", "%" + BREAKING_PLACEHOLDER + "$1");
    }

    
    text = COMPOUND_WITH_QUOTES1.matcher(text).replaceAll("$1$2\uE120$3\uE120$4\uE120");
    text = COMPOUND_WITH_QUOTES2.matcher(text).replaceAll("$1\uE120$2\uE120$3\uE120$4");
    if( text.indexOf('[') != -1 ) {
      text = WORDS_WITH_BRACKETS_PATTERN.matcher(text).replaceAll("$1\\[\uE120$2\\]\uE120");
    }
    
    // if period is not the last character in the sentence
    int dotIndex = text.indexOf('.');
    String textRtrimmed = text.replaceFirst("[\\h\\v]*$", "");
    boolean dotInsideSentence = dotIndex >= 0 && dotIndex < textRtrimmed.length()-1;

    if( dotInsideSentence 
        || (dotIndex == textRtrimmed.length()-1
            && ABBR_AT_THE_END.matcher(text).find()) ) {  // ugly - special case for тис. та ініціалів

      text = DOTTED_NUMBERS_PATTERN3.matcher(text).replaceAll("$1.\uE120$2.\uE120$3");
      text = DOTTED_NUMBERS_PATTERN.matcher(text).replaceAll("$1.\uE120$2");

      text = ABBR_DOT_NAR_PATTERN_1.matcher(text).replaceAll("$1.\uE120\uE110");
      text = ABBR_DOT_NAR_PATTERN_2.matcher(text).replaceAll("$1.\uE120\uE110$2");

      text = ABBR_DOT_2_SMALL_LETTERS_PATTERN.matcher(text).replaceAll("$1.\uE120\uE110$2.\uE120\uE110"); //.replaceFirst("(([смкд]|мк)?м\\.[\\h\\v]*)\uE120\uE110$", "$1");
      text = ABBR_DOT_VO_PATTERN1.matcher(text).replaceAll(ABBR_DOT_2_SMALL_LETTERS_REPL);
      text = ABBR_DOT_VO_PATTERN2.matcher(text).replaceAll(ABBR_DOT_2_SMALL_LETTERS_REPL);
      text = ABBR_DOT_VO_PATTERN3.matcher(text).replaceAll(ABBR_DOT_2_SMALL_LETTERS_REPL);
      text = ABBR_DOT_ART_PATTERN.matcher(text).replaceAll(ONE_DOT_TWO_REPL);
      text = ABBR_DOT_MAN_PATTERN.matcher(text).replaceAll(ONE_DOT_TWO_REPL);
      text = ABBR_DOT_TYS_PATTERN1.matcher(text).replaceAll("$1$2" + NON_BREAKING_DOT_SUBST + BREAKING_PLACEHOLDER);
      text = ABBR_DOT_TYS_PATTERN2.matcher(text).replaceAll(ONE_DOT_TWO_REPL);
      text = ABBR_DOT_LAT_PATTERN.matcher(text).replaceAll(ONE_DOT_TWO_REPL);
      text = ABBR_DOT_PROF_PATTERN.matcher(text).replaceAll(ONE_DOT_TWO_REPL);
      text = ABBR_DOT_GUB_PATTERN.matcher(text).replaceAll("$1" + NON_BREAKING_DOT_SUBST + BREAKING_PLACEHOLDER);
      text = ABBR_DOT_DASH_PATTERN.matcher(text).replaceAll("$1" + NON_BREAKING_DOT_SUBST + "$2");

      text = INITIALS_DOT_PATTERN_SP_2.matcher(text).replaceAll(INITIALS_DOT_REPL_SP_2);
      text = INITIALS_DOT_PATTERN_SP_1.matcher(text).replaceAll(INITIALS_DOT_REPL_SP_1);
      text = INITIALS_DOT_PATTERN_RSP_2.matcher(text).replaceAll(INITIALS_DOT_REPL_RSP_2);
      text = INITIALS_DOT_PATTERN_RSP_1.matcher(text).replaceAll(INITIALS_DOT_REPL_RSP_1);

//      text = ABBR_DOT_INVALID_DOT_PATTERN.matcher(text).replaceAll(ONE_DOT_TWO_REPL);
      text = ABBR_DOT_KUB_SM_PATTERN.matcher(text).replaceAll("$1.\uE120\uE110$2");
      text = ABBR_DOT_S_G_PATTERN.matcher(text).replaceAll("$1" + NON_BREAKING_DOT_SUBST + "$2" + NON_BREAKING_DOT_SUBST + BREAKING_PLACEHOLDER);
      text = ABBR_DOT_CHL_KOR_PATTERN.matcher(text).replaceAll("$1.\uE120$2.\uE120\uE110");
      text = ABBR_DOT_PN_ZAH_PATTERN.matcher(text).replaceAll("$1.\uE120\uE110$2.\uE120\uE110");
      text = ABBR_DOT_I_T_P_PATTERN.matcher(text).replaceAll("$1\uE120\uE110$2\uE120\uE110");
      text = ABBR_DOT_I_T_CH_PATTERN.matcher(text).replaceAll("$1\uE120\uE110$2\uE120\uE110");
      text = ABBR_DOT_T_ZV_PATTERN.matcher(text).replaceAll("$1\uE120\uE110$2\uE120\uE110");
      text = ABBR_DOT_RED_AVT_PATTERN.matcher(text).replaceAll("$1.\uE120\uE110$2");
      text = ABBR_DOT_NON_ENDING_PATTERN.matcher(text).replaceAll("$1.\uE120\uE110");
      text = ABBR_DOT_NON_ENDING_PATTERN_2.matcher(text).replaceAll("$1\uE120\uE110$2");
      text = INVALID_MLN_DOT_PATTERN.matcher(text).replaceAll("$1.\uE120\uE110$2");
    }

    if( dotInsideSentence ) {
      text = WEB_ENTITIES.matcher(text).replaceAll("$1.\uE120$2");
      text = WEB_ENTITIES2.matcher(text).replaceAll(".\uE120$1.\uE120$2");
    }

    text = ABBR_DOT_ENDING_PATTERN.matcher(text).replaceAll("$1.\uE120\uE110");

    // 2 000 000
    Matcher spacedDecimalMatcher = DECIMAL_SPACE_PATTERN.matcher(text);
    if( spacedDecimalMatcher.find() ) {
    	StringBuffer sb = new StringBuffer();
    	do {
    		String splitNumber = spacedDecimalMatcher.group(0);
    		String splitNumberAdjusted = splitNumber.replace(' ', NON_BREAKING_SPACE_SUBST);
    		splitNumberAdjusted = splitNumberAdjusted.replace('\u00A0', NON_BREAKING_SPACE_SUBST);
    		splitNumberAdjusted = splitNumberAdjusted.replace('\u202F', NON_BREAKING_SPACE_SUBST);
    		spacedDecimalMatcher.appendReplacement(sb, splitNumberAdjusted);
    	} while( spacedDecimalMatcher.find() );

    	spacedDecimalMatcher.appendTail(sb);
    	text = sb.toString();
    }

    // 12:25
    if( text.contains(":") ) {
    	text = COLON_NUMBERS_PATTERN.matcher(text).replaceAll(COLON_NUMBERS_REPL);
    }

    // ВКПБ(о)
    if( text.contains("(") ) {
      text = BRACE_IN_WORD_PATTERN.matcher(text).replaceAll("$1" + LEFT_BRACE_SUBST + "$2" + RIGHT_BRACE_SUBST);
    }

    if( text.contains("<") ) {
      text = XML_TAG_PATTERN.matcher(text).replaceAll(BREAKING_PLACEHOLDER + LEFT_ANGLE_SUBST + "$1" + RIGHT_ANGLE_SUBST + BREAKING_PLACEHOLDER);
      text = text.replace(LEFT_ANGLE_SUBST+"/", "" + LEFT_ANGLE_SUBST + SLASH_SUBST);
      text = text.replace("/" + RIGHT_ANGLE_SUBST, "" + SLASH_SUBST + RIGHT_ANGLE_SUBST);
    }

    if( text.contains("-") ) {
      text = text.replaceAll("([а-яіїєґА-ЯІЇЄҐ])([»\"-]+-)", "$1" + BREAKING_PLACEHOLDER + "$2");
      text = text.replaceAll("([»\"-]+-)([а-яіїєґА-ЯІЇЄҐ])", "$1" + BREAKING_PLACEHOLDER + "$2");
    }

    if( text.contains(SOFT_HYPHEN_WRAP) ) {
      text = text.replaceAll("(?<!\\s)"+SOFT_HYPHEN_WRAP, SOFT_HYPHEN_WRAP_SUBST);
    }

    if( text.indexOf('\'') >= 0 ) {
      text = APOSTROPHE_BEGIN_PATTERN.matcher(text).replaceAll("$1'" + BREAKING_PLACEHOLDER + "$2");
      text = APOSTROPHE_END_PATTER.matcher(text).replaceAll("$1" + BREAKING_PLACEHOLDER + "'$2");
    }

    if( text.contains("+") ) {
      text = text.replaceAll("\\+(?=[а-яіїєґА-ЯІЇЄҐ0-9])", BREAKING_PLACEHOLDER + "+" + BREAKING_PLACEHOLDER);
    }
    
    // -20C
    if( text.length() > 1 && (text.contains("-") || text.contains("\u2013")) ) {
      text = text.replaceAll("(?<=(^|[\\h\\v]))([-\u2013])(?=[0-9])", "$2" + BREAKING_PLACEHOLDER);
    }
    
    text = NUMBER_MISSING_SPACE.matcher(text).replaceAll("$1" + BREAKING_PLACEHOLDER + "$2");
    return text;
  }