sub processFile()

in util/th_check.pl [32:93]


# Validate the structure of one thesaurus data file.
#
# The first line of the file (the encoding declaration) is skipped. Every
# later line must be one of:
#   * an entry header "word|N", announcing that N meaning lines follow;
#   * a meaning line, recognised by a leading "(", "-" or "|".
# Anything else is reported as an unrecognised line; lines starting with
# "interj|", "prep|" or "conj|" are reported too, but still counted as
# meaning lines so that they do not trigger a second, misleading
# count-mismatch error.
#
# Reported problems:
#   * the file cannot be opened;
#   * a word is defined more than once;
#   * the number of meaning lines differs from the count in the header
#     (checked for every word, including the last one in the file);
#   * a line has an unrecognised format.
#
# Parameters:
#   $input - path of the file to check.
#
# Returns 0 when no problem was found and 1 otherwise. Diagnostics are
# printed to the currently selected output handle, each one prefixed with
# "$input:".
sub processFile($) {
    my ($input) = @_;

    # Three-argument open on a lexical handle: the file name is never
    # interpreted as a mode or a pipe, and no global handle is left behind.
    my $fh;
    if (!open($fh, '<', $input)) {
        print "FAIL: $input (no input found)\n";
        return 1;
    }

    # top line of thesaurus provides encoding (we ignore it)
    my $encodingLine = <$fh>;
    my $line = 1;

    my $expectedEntries;        # count announced by the current header
    my $actualEntries = 0;      # meaning lines seen since that header
    my $word;                   # word of the current header
    my $wordLine;               # line number of the current header
    my %words = ();             # word => line of its first definition
    my @errors = ();

    # A lexical line variable keeps the caller's $_ untouched.
    while (my $text = <$fh>) {
        $line++;
        # Strips the line terminator (LF or CRLF) and trailing blanks.
        $text =~ s/\s+$//;

        if ($text =~ m/^([^\|]+)\|(\d+)$/) {
            my $tword = $1;
            my $texpectedEntries = $2;

            # A new header closes the previous entry: verify its count.
            if (defined $expectedEntries && $actualEntries != $expectedEntries) {
                push @errors, "$wordLine: $word defined to have $expectedEntries but seems to have $actualEntries (next word ($tword) found on line $line)\n";
            }

            $word = $tword;
            $wordLine = $line;
            $expectedEntries = $texpectedEntries;
            if (defined $words{$word}) {
                push @errors, "$line: $word previously defined on $words{$word}\n";
            } else {
                $words{$word} = $line;
            }
            $actualEntries = 0;
        } elsif ($text =~ m/^[\(\-\|]/) {
            $actualEntries++;
        } else {
            push @errors, "$line: Unrecognised line format: $text\n";
            # Still a meaning line, just one without the usual "(pos)"
            # wrapper; count it to avoid a follow-up mismatch error.
            if ($text =~ m/^(?:interj|prep|conj)\|/) {
                $actualEntries++;
            }
        }
    }
    close($fh);

    # The last entry has no following header to trigger the check above,
    # so verify it explicitly once the end of the file is reached.
    if (defined $expectedEntries && $actualEntries != $expectedEntries) {
        push @errors, "$wordLine: $word defined to have $expectedEntries but seems to have $actualEntries (end of file reached on line $line)\n";
    }

    if (scalar(@errors)) {
        print $input, ':', join($input.':', @errors);
        return 1;
    }
    else {
        return 0;
    }
}