public static void generateModel()

in modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/DefaultModelBuilderUtil.java [67:120]


  /**
   * Wires together the default, file-based components of the semi-supervised
   * model builder and runs the build for a fixed number of iterations.
   *
   * <p>All inputs are collected into a single {@link BaseModelBuilderParams}
   * instance, which is then handed to every component through
   * {@code setParameters} before the generator is invoked.
   *
   * @param sentences                registered on the params as the sentence file
   * @param knownEntities            registered on the params as the known entities file
   * @param knownEntitiesBlacklist   registered on the params as the known entity blacklist
   * @param modelOutFile             registered on the params as the model file
   * @param annotatedSentenceOutFile registered on the params as the annotated
   *                                 training data file
   * @param namedEntityType          registered on the params as the entity type
   * @param iterations               iteration count passed straight to the generator
   */
  public static void generateModel(File sentences, File knownEntities, File knownEntitiesBlacklist,
          File modelOutFile, File annotatedSentenceOutFile, String namedEntityType, int iterations) {
    // Shared configuration: fully populated before any component sees it.
    BaseModelBuilderParams builderParams = new BaseModelBuilderParams();
    builderParams.setSentenceFile(sentences);
    builderParams.setKnownEntitiesFile(knownEntities);
    builderParams.setKnownEntityBlacklist(knownEntitiesBlacklist);
    builderParams.setEntityType(namedEntityType);
    builderParams.setAnnotatedTrainingDataFile(annotatedSentenceOutFile);
    builderParams.setModelFile(modelOutFile);

    // Sentence providers feed the process with sentences derived from user
    // data; this implementation simply reads a file line by line.
    SentenceProvider sentences_ = new FileSentenceProvider();
    sentences_.setParameters(builderParams);

    // Known entity providers supply the seed list of entities (e.g. Barack
    // Obama for person, Germany for location). Seeds should be prolific,
    // non-ambiguous names.
    KnownEntityProvider seedEntities = new FileKnownEntityProvider();
    seedEntities.setParameters(builderParams);

    // Validators weed out bad hits between iterations. Because every pass
    // trains on the output of the previous one, the name finder gets greedier
    // each time if bad entities slip through; the validator is the hook for
    // rejecting obviously bad hits (for instance, requiring a location to sit
    // inside a noun phrase). Users can make this as specific as their data
    // and use case demand.
    ModelGenerationValidator hitValidator = new FileModelValidatorImpl();
    hitValidator.setParameters(builderParams);

    // The Modelable reads and writes the annotated sentences and creates and
    // writes the NER models.
    Modelable modelStore = new GenericModelableImpl();
    modelStore.setParameters(builderParams);

    // The generator drives the whole process for a fixed iteration count.
    // Stopping once the diff between runs falls under a threshold might be
    // better, but could be too costly for extremely large sentence sets.
    SemiSupervisedModelGenerator generator = new GenericModelGenerator();
    generator.build(sentences_, seedEntities, hitValidator, modelStore, iterations);
  }