in modelbuilder-addon/src/main/java/opennlp/addons/modelbuilder/DefaultModelBuilderUtil.java [67:120]
/**
 * Runs the default semi-supervised model building pipeline: collects the
 * supplied files and entity type into one {@link BaseModelBuilderParams},
 * hands that same params object to the default file-based collaborators, and
 * then asks a {@link GenericModelGenerator} to build the model.
 *
 * @param sentences                file registered on the params as the sentence file
 * @param knownEntities            file registered on the params as the known entities file
 * @param knownEntitiesBlacklist   file registered on the params as the known entity blacklist
 * @param modelOutFile             file registered on the params as the model file
 * @param annotatedSentenceOutFile file registered on the params as the annotated
 *                                 training data file
 * @param namedEntityType          entity type registered on the params
 * @param iterations               iteration count passed through to the generator's
 *                                 {@code build} call
 */
public static void generateModel(File sentences, File knownEntities, File knownEntitiesBlacklist,
    File modelOutFile, File annotatedSentenceOutFile, String namedEntityType, int iterations) {
    final SemiSupervisedModelGenerator generator = new GenericModelGenerator();

    // One shared parameter object configures every collaborator below.
    final BaseModelBuilderParams config = new BaseModelBuilderParams();
    config.setAnnotatedTrainingDataFile(annotatedSentenceOutFile);
    config.setSentenceFile(sentences);
    config.setEntityType(namedEntityType);
    config.setKnownEntitiesFile(knownEntities);
    config.setModelFile(modelOutFile);
    config.setKnownEntityBlacklist(knownEntitiesBlacklist);

    /*
     * Sentence providers feed the process with sentences derived from user
     * data; this implementation simply reads a file line by line.
     */
    final SentenceProvider sentenceSource = new FileSentenceProvider();
    sentenceSource.setParameters(config);

    /*
     * Known entity providers supply the seed list of known entities, e.g.
     * "Barack Obama" for person or "Germany" for location. Seeds should be
     * prolific, unambiguous names.
     */
    final KnownEntityProvider seedEntities = new FileKnownEntityProvider();
    seedEntities.setParameters(config);

    /*
     * Validators try to weed out bad hits produced by the name finder's
     * iterations. The process feeds on its own output, so the name finder
     * becomes greedier with every iteration if bad entities are let in; the
     * validator is the mechanism for throwing out obviously bad hits. A good
     * implementation might check that a location really sits inside a noun
     * phrase, for instance. Users can make this as specific as their data
     * and use case require.
     */
    final ModelGenerationValidator hitValidator = new FileModelValidatorImpl();
    hitValidator.setParameters(config);

    /*
     * Modelables read and write the annotated sentences, and create and
     * write the NER models.
     */
    final Modelable modelWriter = new GenericModelableImpl();
    modelWriter.setParameters(config);

    /*
     * The generator drives the whole process for a fixed number of
     * iterations. Stopping once the difference between runs drops below a
     * threshold could be better, but may be too costly for extremely large
     * sentence sets.
     */
    generator.build(sentenceSource, seedEntities, hitValidator, modelWriter, iterations);
}