in ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java [284:400]
/**
 * Collects the note files for the requested patient sets, using the directory layout that
 * corresponds to the configured {@code xmlFormat}.
 * <ul>
 * <li>Anafora: for every directory whose name starts with a requested id, the file inside it
 * that has the same name as the directory. DeepPhe directories sit directly under the xml
 * directory; for other subcorpora they sit under one subdirectory per
 * {@code THYMEData.SECTIONS} entry.</li>
 * <li>AnaforaCoref: {@code <xmlDirectory>/<split>/<patient id>/<document>/<document>}.</li>
 * <li>I2B2: {@code <set>.xml.txt} under the "training" and/or "test" subdirectories.</li>
 * <li>Knowtator: every non-hidden file in the raw text directory whose name contains the
 * patient id.</li>
 * </ul>
 * An unknown format is logged as an error and yields an empty list.
 *
 * @param patientSets numeric identifiers of the patient sets (or documents) to collect
 * @return the matching files, in directory-listing order; never {@code null}
 * @throws FileNotFoundException if a directory that has to be listed is missing, is not a
 *                               directory or cannot be read, or if an i2b2 test xml file has
 *                               no accompanying text file
 */
private List<File> getFilesFor( List<Integer> patientSets ) throws FileNotFoundException {
   List<File> files = new ArrayList<>();
   if ( this.xmlFormat == XMLFormat.Anafora ) {
      Set<String> ids = new HashSet<>();
      for ( Integer set : patientSets ) {
         if ( this.subcorpus == Subcorpus.Colon ) {
            ids.add( String.format( "ID%03d", set ) );
         } else if ( this.subcorpus == Subcorpus.DeepPhe ) {
            ids.add( String.format( "patient%02d", set ) );
         } else {
            ids.add( String.format( "doc%04d", set ) );
         }
      }
      int filePrefixLen = 5; // Colon: "ID\d{3}"
      if ( this.subcorpus == Subcorpus.Brain ) {
         filePrefixLen = 7; // Brain: "doc\d{4}"
      } else if ( this.subcorpus == Subcorpus.DeepPhe ) {
         filePrefixLen = 9; // deepPhe: "patient\d{2}"
      }
      if ( this.subcorpus == Subcorpus.DeepPhe ) {
         // DeepPhe note directories live directly under the xml directory.
         collectAnaforaNotes( this.xmlDirectory, ids, filePrefixLen, files );
      } else {
         // Other subcorpora keep their note directories in one subdirectory per section.
         for ( String section : THYMEData.SECTIONS ) {
            collectAnaforaNotes( new File( this.xmlDirectory, section ), ids, filePrefixLen, files );
         }
      }
   } else if ( this.xmlFormat == XMLFormat.AnaforaCoref ) {
      Set<String> ids = new HashSet<>();
      for ( Integer set : patientSets ) {
         if ( this.subcorpus == Subcorpus.Colon ) {
            ids.add( String.format( "ID%03d", set ) );
         } else {
            LOGGER.warn( "No coreference annotations exist for this corpus!" );
         }
      }
      for ( File dir : listChildrenOrThrow( this.xmlDirectory, null ) ) {
         // this gets us into train/dev/test subdirectory
         if ( !dir.isDirectory() ) {
            // stray regular files next to the split directories cannot be listed; skip them
            continue;
         }
         for ( File ptDir : listChildrenOrThrow( dir, null ) ) {
            if ( !ptDir.isDirectory() || !ids.contains( ptDir.getName() ) ) {
               continue;
            }
            for ( File subDir : listChildrenOrThrow( ptDir, null ) ) {
               if ( subDir.isDirectory() ) {
                  // for document 001 for patient 001, directory is ID001/ID001_clinic_001
                  // and text file within is ID001_clinic_001
                  files.add( new File( subDir, subDir.getName() ) );
               }
            }
         }
      }
   } else if ( this.xmlFormat == XMLFormat.I2B2 ) {
      File trainDir = new File( this.xmlDirectory, "training" );
      File testDir = new File( this.xmlDirectory, "test" );
      for ( Integer pt : patientSets ) {
         File xmlTrain = new File( trainDir, pt + ".xml" );
         File train = new File( trainDir, pt + ".xml.txt" );
         if ( train.exists() ) {
            if ( xmlTrain.exists() ) {
               files.add( train );
            } else {
               LOGGER.warn( "Text file in training has no corresponding xml -- skipping: " + train );
            }
         }
         File xmlTest = new File( testDir, pt + ".xml" );
         File test = new File( testDir, pt + ".xml.txt" );
         if ( xmlTest.exists() ) {
            if ( test.exists() ) {
               files.add( test );
            } else {
               throw new FileNotFoundException( "Could not find the test text file -- for cTAKES usage you must copy the text files into the xml directory for the test set." );
            }
         }
         // a given id is expected to have its text in only one of training / test
         assert !(train.exists() && test.exists());
      }
   } else if ( xmlFormat == XMLFormat.Knowtator ) {
      LOGGER.warn( "This is an old annotation format -- please upgrade to using anafora files." );
      for ( Integer set : patientSets ) {
         final int setNum = set;
         // format the id once per set instead of once per candidate file name
         final String idToken = String.format( "ID%03d", setNum );
         FilenameFilter idFilter = new FilenameFilter() {
            @Override
            public boolean accept( File dir, String name ) {
               return name.contains( idToken );
            }
         };
         for ( File file : listChildrenOrThrow( rawTextDirectory, idFilter ) ) {
            // skip hidden files like .svn
            if ( !file.isHidden() ) {
               files.add( file );
            }
         }
      }
   } else {
      LOGGER.error( "Unknown data format -- please specify Anafora, i2b2, or Knowtator format." );
   }
   return files;
}

/**
 * Adds to {@code files} the note file of every subdirectory of {@code parentDir} whose name
 * starts with one of {@code ids}. The note file has the same name as its directory; a matching
 * directory without such a file is logged and skipped.
 *
 * @param parentDir     directory whose immediate subdirectories are inspected
 * @param ids           accepted id prefixes
 * @param filePrefixLen number of leading characters of a directory name that form its id
 * @param files         list that receives the note files found
 * @throws FileNotFoundException if {@code parentDir} cannot be listed
 */
private void collectAnaforaNotes( File parentDir, Set<String> ids, int filePrefixLen, List<File> files )
      throws FileNotFoundException {
   for ( File dir : listChildrenOrThrow( parentDir, null ) ) {
      if ( !dir.isDirectory() ) {
         continue;
      }
      String dirName = dir.getName();
      // a name shorter than the id prefix can never match and must not reach substring()
      if ( dirName.length() < filePrefixLen || !ids.contains( dirName.substring( 0, filePrefixLen ) ) ) {
         continue;
      }
      File file = new File( dir, dirName );
      if ( file.exists() ) {
         files.add( file );
      } else {
         LOGGER.warn( "Missing note: " + file );
      }
   }
}

/**
 * Lists the entries of {@code dir}, turning the {@code null} that {@link File#listFiles} returns
 * for a missing, non-directory or unreadable path into a descriptive exception.
 *
 * @param dir    directory to list
 * @param filter name filter to apply, or {@code null} to accept every entry
 * @return the (possibly empty) array of entries; never {@code null}
 * @throws FileNotFoundException if {@code dir} cannot be listed
 */
private File[] listChildrenOrThrow( File dir, FilenameFilter filter ) throws FileNotFoundException {
   File[] children = dir.listFiles( filter );
   if ( children == null ) {
      throw new FileNotFoundException( "Cannot list directory (missing, not a directory, or unreadable): " + dir );
   }
   return children;
}