in util/th_check.pl [32:93]
# Validate one thesaurus data file.
#
# The first line of the file (the encoding declaration) is skipped. Every
# following line is expected to be either
#   * a heading of the form "word|N", announcing that N entry lines follow, or
#   * an entry line, i.e. a line starting with "(", "-" or "|".
#
# Problems that are reported:
#   * a heading whose announced count differs from the number of entry lines
#     that follow it (this includes the last heading in the file),
#   * a word that is defined more than once,
#   * a line that matches neither of the two formats.
#
# Parameters:
#   $input - path of the file to check.
#
# Returns 0 when no problem was found, 1 when the file could not be opened or
# at least one problem was found. Diagnostics are printed with a bare "print"
# (the currently selected output handle), each one prefixed with "$input:".
sub processFile($) {
    my ($input) = @_;

    # Three-argument open with a lexical handle: the file name can never be
    # interpreted as an open mode, and the handle is private to this call.
    my $in;
    if (!open($in, '<', $input)) {
        print "FAIL: $input (no input found)\n";
        return 1;
    }

    # top line of thesaurus provides encoding (we ignore it)
    my $encodingLine = <$in>;
    my $line = 1;

    my $expectedEntries;        # count announced by the current heading
    my $actualEntries = 0;      # entry lines seen since the current heading
    my $word;                   # word of the current heading
    my $wordLine;               # line number of the current heading
    my %words = ();             # word => line number of its first definition
    my @errors = ();

    # Record an error when the block that just ended does not contain the
    # number of entries its heading announced. $where describes what ended
    # the block and is appended to the message in parentheses.
    my $checkEntryCount = sub {
        my ($where) = @_;
        return if !defined $expectedEntries;    # no heading seen yet
        return if $actualEntries == $expectedEntries;
        push @errors, "$wordLine: $word defined to have $expectedEntries but seems to have $actualEntries ($where)\n";
        return;
    };

    # A named lexical is used instead of $_ so the caller's $_ is untouched.
    while (my $text = <$in>) {
        $line++;
        # Drops the line terminator (LF or CRLF) and any trailing blanks.
        $text =~ s/\s+$//;

        if ($text =~ m/^([^\|]+)\|(\d+)$/) {
            my $tword = $1;
            my $texpectedEntries = $2;

            # A new heading closes the previous block: verify its count.
            $checkEntryCount->("next word ($tword) found on line $line");

            $word = $tword;
            $wordLine = $line;
            $expectedEntries = $texpectedEntries;
            if (defined $words{$word}) {
                push @errors, "$line: $word previously defined on $words{$word}\n";
            } else {
                $words{$word} = $line;
            }
            $actualEntries = 0;
        } elsif ($text =~ m/^[\(\-\|]/) {
            $actualEntries++;
        } else {
            push @errors, "$line: Unrecognised line format: $text\n";
            # These lines are reported above but still counted as entries,
            # so they do not additionally trigger a count mismatch.
            if ($text =~ m/^(interj|prep|conj)\|/) {
                $actualEntries++;
            }
        }
    }
    close($in);

    # The last block is closed by end of file rather than by a new heading,
    # so it has to be verified explicitly.
    $checkEntryCount->("end of file reached on line $line");

    if (scalar(@errors)) {
        print $input, ':', join($input.':', @errors);
        return 1;
    }
    else {
        return 0;
    }
}