in opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java [104:169]
/**
 * Creates a {@link NameSample} stream from brat annotated documents, configured
 * through the given command line style arguments.
 * <p>
 * Sentence detection uses a {@link SentenceDetectorME} if a sentence detector model
 * is given, otherwise a {@link NewlineSentenceDetector}. Tokenization uses a
 * {@link TokenizerME} if a tokenizer model is given, otherwise the rule based tokenizer
 * selected by name ({@code "simple"} or {@code "whitespace"}), and falls back to the
 * {@link WhitespaceTokenizer} if neither is specified.
 *
 * @param args The arguments which are parsed into the {@code Parameters} of this factory.
 *             Must not be {@code null}.
 * @return A {@link BratNameSampleStream} over the documents of the configured brat data directory.
 * @throws IllegalArgumentException Thrown if {@code args} is {@code null}.
 * @throws TerminateToolException Thrown if both a rule based and a statistical tokenizer
 *     are specified, if the rule based tokenizer name is unknown, or if the annotation
 *     configuration, the brat documents or one of the models could not be loaded.
 *     For IO failures, the underlying {@link IOException} is attached as cause.
 */
public ObjectStream<NameSample> create(String[] args) {
  if (args == null) {
    throw new IllegalArgumentException("Passed args must not be null!");
  }
  Parameters params = ArgumentParser.parse(args, Parameters.class);

  // The two tokenizer options are mutually exclusive.
  if (notNull(params.getRuleBasedTokenizer(), params.getTokenizerModel())) {
    throw new TerminateToolException(-1, "Either use rule based or statistical tokenizer!");
  }

  AnnotationConfiguration annConfig;
  try {
    annConfig = AnnotationConfiguration.parse(params.getAnnotationConfig());
  } catch (IOException e) {
    // Keep the cause so the actual reason of the parse failure is not lost.
    throw new TerminateToolException(1, "Failed to parse annotation.conf file!", e);
  }

  // TODO: How to handle the error here ? terminate the tool? not nice if used by API!
  ObjectStream<BratDocument> samples;
  try {
    samples = new BratDocumentStream(annConfig,
        params.getBratDataDir(), params.getRecursive(), null);
  } catch (IOException e) {
    // Keep the cause; the message alone may be null and carries no stack trace.
    throw new TerminateToolException(-1, e.getMessage(), e);
  }

  SentenceDetector sentDetector;
  if (params.getSentenceDetectorModel() != null) {
    try {
      sentDetector = new SentenceDetectorME(new SentenceModel(params.getSentenceDetectorModel()));
    } catch (IOException e) {
      throw new TerminateToolException(-1, "Failed to load sentence detector model!", e);
    }
  } else {
    // Without a model, sentences are split by the newline based detector.
    sentDetector = new NewlineSentenceDetector();
  }

  // Whitespace tokenization is the default when no tokenizer option is given.
  Tokenizer tokenizer = WhitespaceTokenizer.INSTANCE;
  if (params.getTokenizerModel() != null) {
    try {
      tokenizer = new TokenizerME(new TokenizerModel(params.getTokenizerModel()));
    } catch (IOException e) {
      throw new TerminateToolException(-1, "Failed to load tokenizer model!", e);
    }
  } else if (params.getRuleBasedTokenizer() != null) {
    String tokenizerName = params.getRuleBasedTokenizer();
    if ("simple".equals(tokenizerName)) {
      tokenizer = SimpleTokenizer.INSTANCE;
    } else if ("whitespace".equals(tokenizerName)) {
      tokenizer = WhitespaceTokenizer.INSTANCE;
    } else {
      throw new TerminateToolException(-1, "Unknown tokenizer: " + tokenizerName);
    }
  }

  // Optional comma separated list of name types; stays null if none are specified.
  Set<String> nameTypes = null;
  if (params.getNameTypes() != null) {
    String[] nameTypesArr = params.getNameTypes().split(",");
    if (nameTypesArr.length > 0) {
      nameTypes = Arrays.stream(nameTypesArr).map(String::trim).collect(Collectors.toSet());
    }
  }

  return new BratNameSampleStream(sentDetector, tokenizer, samples, nameTypes);
}