public static String acceptableMinedSentence()

in opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/MinedSentenceProcessor.java [26:150]


  public static String acceptableMinedSentence(String sent) {
    // if too many commas => seo text

    String[] commas = StringUtils.split(sent, ',');
    String[] spaces = StringUtils.split(sent, ' ');
    if ((float) commas.length / (float) spaces.length > 0.7) {
      System.out.println("Rejection: too many commas");
      return null;
    }
    
    String[] otherDelimiters = StringUtils.split(sent, '/');
    if ((float) otherDelimiters.length / (float) spaces.length > 0.7) {
        System.out.println("Rejection: too many delimiters");
        return null;
    }
    
    otherDelimiters = StringUtils.split(sent, '.');
    if ((float) otherDelimiters.length / (float) spaces.length > 0.7) {
        System.out.println("Rejection: too many delimiters");
        return null;
    }
    otherDelimiters = StringUtils.split(sent, '!');
    if ((float) otherDelimiters.length / (float) spaces.length > 0.7) {
        System.out.println("Rejection: too many delimiters");
        return null;
    }
    otherDelimiters = StringUtils.split(sent, '=');
    if ((float) otherDelimiters.length / (float) spaces.length > 0.7) {
        System.out.println("Rejection: too many delimiters");
        return null;
    }
    
    if (StringUtils.split(sent, '|').length > 2
        || StringUtils.split(sent, '>').length > 2) {
      System.out.println("Rejection: too many |s or >s ");
      return null;
    }
    String sentTry = sent.toLowerCase();
    // if too many long spaces
    String sentSpaces = sentTry.replace("   ", "");
    if (sentSpaces.length() - sentTry.length() > 10) // too many spaces -
      // suspicious
      return null;

    if (sentTry.contains("click here") || sentTry.contains(" wikip")
        || sentTry.contains("copyright")
        || sentTry.contains("operating hours")
        || sentTry.contains("days per week")
        || sentTry.contains("click for") || sentTry.contains("photos")
        || sentTry.contains("find the latest")
        || sentTry.startsWith("subscribe")
        || sentTry.contains("Terms of Service")
        || sentTry.contains("clicking here")
        || sentTry.contains("skip to") || sentTry.contains("sidebar")
        || sentTry.contains("Tags:") || sentTry.startsWith("Posted by")
        || sentTry.contains("available online")
        || sentTry.contains("get online")
        || sentTry.contains("buy online")
        || sentTry.contains("not valid") || sentTry.contains("discount")
        || sentTry.contains("official site")
        || sentTry.contains("this video")
        || sentTry.contains("this book")
        || sentTry.contains("this product")
        || sentTry.contains("paperback") || sentTry.contains("hardcover")
        || sentTry.contains("audio cd")
        || sentTry.contains("related searches")
        || sentTry.contains("permission is granted")
        || sentTry.contains("[edit")
        || sentTry.contains("edit categories")
        || sentTry.contains("free license")
        || sentTry.contains("under the terms")
        || sentTry.contains("rights reserved")
        || sentTry.contains("wikipedia") || sentTry.endsWith("the")
        || sentTry.endsWith("the.") || sentTry.startsWith("below") 
        || sentTry.contains("recipient of") || sentTry.contains("this message")
        || sentTry.contains("mailing list") || sentTry.contains("purchase order")
        || sentTry.contains("mon-fri") || sentTry.contains("email us") || sentTry.contains("privacy pol") || sentTry.contains("back to top")
        || sentTry.contains("for details") || sentTry.contains("assistance?") || sentTry.contains("chat live")
        || sentTry.contains("free shipping") || sentTry.contains("company info") || sentTry.contains("satisfaction g") || sentTry.contains("contact us")
        ||sentTry.startsWith("write") || sentTry.startsWith( "email")|| sentTry.contains("conditions")
        ||sentTry.startsWith("we ") || sentTry.contains("the recipient") || sentTry.contains("day return") || sentTry.contains("days return")
        
        ||sentTry.startsWith("fax") || sentTry.contains("refund it") || sentTry.contains("your money")
        ||sentTry.startsWith("free") || sentTry.contains("purchase orders")
        ||sentTry.startsWith("exchange it ") || sentTry.contains("return it") || sentTry.contains("credit card")
        
        || sentTry.contains("storeshop") || sentTry.startsWith( "find") || sentTry.startsWith( "shop") || sentTry.startsWith( "unlimited")
        || sentTry.contains("for a limited time") || sentTry.contains("prime members") || sentTry.contains("amazon members") || sentTry.contains("unlimited free")
        || sentTry.contains("shipping") || sentTry.startsWith( "amazon")
        // not a script text
        || sentTry.contains("document.body") || sentTry.contains(" var ") || sentTry.contains("search suggestions") ||sentTry.startsWith( "Search")
        
    		)
      return null;
    
    //Millions of Amazon Prime members enjoy instant videos, free Kindle books and unlimited free two-day shipping.

    // count symbols indicating wrong parts of page to mine for text
    // if short and contains too many symbols indicating wrong area: reject
    String sentWrongSym = sentTry.replace(">", "&&&").replace("�", "&&&")
        .replace("|", "&&&").replace(":", "&&&").replace("/", "&&&")
        .replace("-", "&&&").replace("%", "&&&");
    if ((sentWrongSym.length() - sentTry.length()) >= 4
        && sentTry.length() < 200) // twice ot more
      return null;

    sent = sent.replace('[', ' ').replace(']', ' ')
        .replace("_should_find_orig_", "").replace(".   .", ". ")
        .replace("amp;", " ").replace("1.", " ").replace("2.", " ")
        .replace("3.", " ").replace("4.", " ").replace("2009", "2011")
        .replace("2008", "2011").replace("2006", "2011")
        .replace("2007", "2011").replace("VIDEO:", " ").replace("Video:", " ")
        .replace("no comments", " ").replace("  ", " ").replace("  ", " ")
        .replace("(more.)", "").replace("more.", "").replace("<more>", "")
        .replace("[more]", "").replace(".,", ".").replace("&lt;", "")
        .replace("p&gt;", "").replace("product description", "");

    // TODO .replace("a.", ".");

    int endIndex = sent.indexOf(" posted");
    if (endIndex > 0)
      sent = sent.substring(0, endIndex);

    return sent;
  }