/* masses/perceptron.c */

/* This program uses stochastic gradient descent to learn a scoreset for
 * SpamAssassin.  You'll need to run logs-to-c from spamassassin/masses to
 * generate the stuff in tmp.
 *
 * <@LICENSE>
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to you under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * </@LICENSE>
 */

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <sys/time.h>
#include <math.h>
#include <unistd.h>

#include "tmp/scores.h"
#include "tmp/tests.h"

/* Ensure that multiple error functions have not been chosen. */
#ifdef ENTROPIC_ERROR
#ifdef LEAST_SQUARES_ERROR
#warning Both entropic error and least squares error chosen. Using least squares.
#endif
#endif

/* Choose the least squares error function by default. */
#ifndef ENTROPIC_ERROR
#ifndef LEAST_SQUARES_ERROR
#define LEAST_SQUARES_ERROR
#endif
#endif

#define OUTPUT_FILE "perceptron.scores"

/* #define IGNORE_SCORE_RANGES 1 */

void init_wheel ();
void destroy_wheel ();
int get_random_test ();
void init_weights ();
void destroy_weights ();
void write_weights (FILE * fp);
void scale_scores (double old_threshold, double new_threshold);
double evaluate_test (int test);
double evaluate_test_nogain (int test);
void train (int num_epochs, double learning_rate);
void usage ();

/* Converts a weight to a SpamAssassin score, and vice versa. */
#define weight_to_score(x) (-threshold*(x)/bias)
#define score_to_weight(x) (-(x)*bias/threshold)

int wheel_size;         /* The number of entries in the roulette wheel
                           (NOT THE SIZE OF THE ROULETTE_WHEEL ARRAY!). */
int * roulette_wheel;   /* Used for roulette wheel selection. */
double ham_preference = 2.0;

#define DEFAULT_THRESHOLD 5.0
double threshold = DEFAULT_THRESHOLD;

double * weights;       /* The weights of the single-layer perceptron. */
double bias;            /* The network bias for the single-layer perceptron. */

int num_epochs = 15;
double learning_rate = 2.0;
double weight_decay = 1.0;
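
/* An illustrative sketch of the conversion macros above: they are inverses
 * of one another whenever bias is non-zero.  For example, with
 * threshold = 5.0 and bias = -2.5, a weight of 1.0 maps to the score
 * -5.0 * 1.0 / -2.5 = 2.0, and score_to_weight(2.0) maps back to the
 * weight 1.0.  The hypothetical self-check below only demonstrates the
 * round trip (assuming bias has been initialized); nothing calls it. */
static void example_score_weight_roundtrip ()
{
  double w = 1.0;
  double s = weight_to_score(w);                /* weight space -> score space */
  assert(fabs(score_to_weight(s) - w) < 1e-9);  /* and back again */
}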
/* Initialize the roulette wheel and populate it, replicating
 * harder-to-classify hams. */
void init_wheel ()
{
  int i;
  int spam = 0, ham = 0;

  /* cache locations on the roulette wheel for fast lookups */
  roulette_wheel = (int*)calloc(num_nondup, sizeof(int));
  wheel_size = 0;
  roulette_wheel[0] = 0;
  for (i = 0; i < num_nondup - 1; i++) {
    int slot_size = 1;

    /* Hams with more tests are rare and harder to classify but are the
     * most important to classify correctly.  They are thus replicated in
     * the training set proportionally to their difficulty. */
    if ( ! is_spam[i] ) {
      slot_size += (int)(num_tests_hit[i] * ham_preference * tests_count[i]);
    }
    else {
      slot_size = tests_count[i];
    }

    /* The database is compressed with all instances mapped in the same
     * place. */
    wheel_size += slot_size;
    if ( ! is_spam[i] ) {
      ham += slot_size;
    }
    else {
      spam++;
    }
    roulette_wheel[i+1] = roulette_wheel[i] + slot_size;
  }

  printf ("Modified training set statistics: %d spam, %d ham.\n", spam, ham);
}

/* Free the resources for the roulette wheel selector. */
void destroy_wheel ()
{
  if ( roulette_wheel ) {
    free (roulette_wheel);
    roulette_wheel = 0;
    wheel_size = 0;
  }
}

/* Get a random test using roulette wheel selection.  This is not used
 * anymore. */
int get_random_test ()
{
  int r;
  int bottom, middle, top;

  r = lrand48() % wheel_size;
  bottom = 0;
  top = num_nondup - 1;
  middle = top / 2;

  /* Binary-search the cumulative slot boundaries for the slot containing r. */
  while (1) {
    if ( r < roulette_wheel[bottom+1] ) {
      return bottom;
    }
    else if ( r >= roulette_wheel[top] ) {
      return top;
    }
    else if ( r >= roulette_wheel[middle] && r < roulette_wheel[middle+1] ) {
      return middle;
    }
    else if ( r < roulette_wheel[middle] ) {
      top = middle-1;
      bottom++;
      middle = bottom + (top-bottom)/2;
    }
    else {
      bottom = middle+1;
      top--;
      middle = bottom + (top-bottom)/2;
    }
  }
}

/* Allocate the weights.  The bias is initialized over the range
 * [-0.5..0.5]; each weight is initialized uniformly within its allowed
 * range [range_lo..range_hi]. */
void init_weights ()
{
  int i;

  weights = (double*)calloc(num_scores, sizeof(double));
  bias = drand48() - 0.5;
  for (i = 0; i < num_scores; i++) {
    weights[i] = range_lo[i] + drand48() * (range_hi[i] - range_lo[i]);
  }
}

/* Free the resources allocated for the weights. */
void destroy_weights ()
{
  if ( weights ) {
    free(weights);
    weights = 0;
  }
}

/* Writes out the weights in SpamAssassin score space. */
void write_weights (FILE * fp)
{
  int i;
  int ga_nn, ga_yy, ga_ny, ga_yn;
  double nnscore, yyscore, nyscore, ynscore;

  ga_nn = ga_yy = ga_ny = ga_yn = 0;
  nnscore = yyscore = nyscore = ynscore = 0;

  /* Run through all of the instances in the training set and tally up
   * the scores. */
  for (i = 0; i < num_nondup; i++) {
    double score = weight_to_score(evaluate_test_nogain(i)) + threshold;
    if ( score >= threshold && is_spam[i] ) {
      ga_yy += tests_count[i];
      yyscore += tests_count[i] * score;
    }
    else if ( score < threshold && !is_spam[i] ) {
      ga_nn += tests_count[i];
      nnscore += tests_count[i] * score;
    }
    else if ( score >= threshold && !is_spam[i] ) {
      ga_ny += tests_count[i];
      nyscore += tests_count[i] * score;
    }
    else {
      ga_yn += tests_count[i];
      ynscore += tests_count[i] * score;
    }
  }

  /* This is copied from the dump() function in craig-evolve.c.  It
   * outputs some nice statistics about the learned classifier. */
  fprintf (fp, "\n# SUMMARY for threshold %3.1f:\n", threshold);
  fprintf (fp, "# Correctly non-spam: %6d %4.2f%%\n",
           ga_nn, (ga_nn / (float) num_ham) * 100.0);
  fprintf (fp, "# Correctly spam: %6d %4.2f%%\n",
           ga_yy, (ga_yy / (float) num_spam) * 100.0);
  fprintf (fp, "# False positives: %6d %4.2f%%\n",
           ga_ny, (ga_ny / (float) num_ham) * 100.0);
  fprintf (fp, "# False negatives: %6d %4.2f%%\n",
           ga_yn, (ga_yn / (float) num_spam) * 100.0);
  fprintf (fp, "# Average score for spam: %3.3f ham: %3.1f\n",
           (ynscore+yyscore)/((double)(ga_yn+ga_yy)),
           (nyscore+nnscore)/((double)(ga_nn+ga_ny)));
  fprintf (fp, "# Average for false-pos: %3.3f false-neg: %3.1f\n",
           (nyscore/(double)ga_ny), (ynscore/(double)ga_yn));
  fprintf (fp, "# TOTAL: %6d %3.2f%%\n\n", num_tests, 100.0);

  for (i = 0; i < num_scores; i++) {
    if ( is_mutable[i] ) {
      fprintf(fp, "score %-30s %2.3f # [%2.3f..%2.3f]\n",
              score_names[i], weight_to_score(weights[i]),
              range_lo[i], range_hi[i]);
    }
    else {
      fprintf(fp, "score %-30s %2.3f # not mutable\n",
              score_names[i], range_lo[i]);
    }
  }
}
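
/* For illustration (with a hypothetical rule name), a mutable rule whose
 * learned weight maps to a score of 2.5 inside a constraint range of
 * [0.0..4.5] would be written by the loop above as:
 *
 *   score EXAMPLE_RULE                   2.500 # [0.000..4.500]
 *
 * Immutable rules are written with their fixed range_lo score followed
 * by "# not mutable". */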
/* This is to support Daniel's threshold thing. */
void scale_scores (double old_threshold, double new_threshold)
{
  int i;

  /* No need to scale something to itself. */
  if ( old_threshold == new_threshold ) {
    return;
  }

  for (i = 0; i < num_scores; i++) {
    if ( is_mutable[i] ) {
      range_lo[i] = range_lo[i] * new_threshold / old_threshold;
      range_hi[i] = range_hi[i] * new_threshold / old_threshold;
    }
  }

  /* Maybe we don't want this bit.  This prescaling stuff makes my
   * brain hurt. */
  /*
  for (i = 0; i < num_nondup; i++) {
    scores[i] = scores[i] * new_threshold / old_threshold;
  }
  */
}

/* Computes the value of the activation function of the perceptron for
 * a given input. */
double evaluate_test (int test)
{
#ifdef LEAST_SQUARES_ERROR
  return 1/(1+exp(-evaluate_test_nogain(test)));
#else
#ifdef ENTROPIC_ERROR
  return tanh(evaluate_test_nogain(test));
#endif
#endif
}

/* Computes the value of the transfer function (in this case, linear) for
 * an input defined in tests_hit[test]. */
double evaluate_test_nogain (int test)
{
  double sum;
  int i;

  sum = bias;
  for (i = 0; i < num_tests_hit[test]; i++) {
    sum += weights[tests_hit[test][i]];
  }

  /* Translate the 'unmutable' scores to weight space. */
  sum += score_to_weight(scores[test]);

  return sum;
}

/* Trains the perceptron using stochastic gradient descent. */
void train (int num_epochs, double learning_rate)
{
  int epoch, random_test;
  int i, j;
  int * tests;
  double y_out, error, delta;

  /* Initialize and populate an array containing indices of training
   * instances.  This is shuffled on every epoch and then iterated
   * through.  I had originally planned to use roulette wheel selection,
   * but shuffled selection seems to work better. */
  tests = (int*)calloc(wheel_size, sizeof(int));
  for (i = 0, j = 0; i < num_nondup-1; i++) {
    for (; j < roulette_wheel[i+1]; j++) {
      tests[j] = i;
    }
  }
  for (; j < wheel_size; j++) {
    tests[j] = num_nondup-1;
  }

  for (epoch = 0; epoch < num_epochs; epoch++) {
    /* decay the weights on every epoch to smooth out statistical
     * anomalies */
    if ( weight_decay != 1.0 ) {
      bias *= weight_decay;
      for (i = 0; i < num_mutable; i++) {
        weights[i] *= weight_decay;
      }
    }

    /* shuffle the training instances */
    for (i = 0; i < wheel_size-1; i++) {
      int tmp;
      int r = lrand48 () % (wheel_size - i);
      tmp = tests[i];
      tests[i] = tests[r+i];
      tests[r+i] = tmp;
    }

    for (j = 0; j < wheel_size; j++) {
      /* select a random test (they have been randomized above) */
      random_test = tests[j];

      /* compute the output of the network */
      y_out = evaluate_test(random_test);

      /* compute the error gradient for the logsig node with least
       * squares error */
#ifdef LEAST_SQUARES_ERROR
      error = is_spam[random_test] - y_out;
      delta = y_out * (1-y_out) * error
              / (num_tests_hit[random_test]+1) * learning_rate;
#else
      /* compute the error gradient for the tanh node with entropic
       * error */
#ifdef ENTROPIC_ERROR
      error = (2.0*is_spam[random_test]-1) - y_out;
      delta = error / (num_tests_hit[random_test]+1) * learning_rate;
#endif
#endif

      /* adjust the weights to descend the steepest part of the error
       * gradient */
      if ( epoch + 1 < num_epochs ) {
        bias += delta;
      }
      for (i = 0; i < num_tests_hit[random_test]; i++) {
        int idx = tests_hit[random_test][i];
        weights[idx] += delta;
#ifdef IGNORE_SCORE_RANGES
        /* Constrain the weights so that nice rules are always <= 0,
         * etc. */
        if ( range_lo[idx] >= 0 && weights[idx] < 0 ) {
          weights[idx] = 0;
        }
        else if ( range_hi[idx] <= 0 && weights[idx] > 0 ) {
          weights[idx] = 0;
        }
#else
        if ( weights[idx] < score_to_weight(range_lo[idx]) ) {
          weights[idx] = score_to_weight(range_lo[idx]);
        }
        else if ( weights[idx] > score_to_weight(range_hi[idx]) ) {
          weights[idx] = score_to_weight(range_hi[idx]);
        }
#endif
      }
    }
  }

  free(tests);
}
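
/* As a worked example of the least-squares update above: the logistic
 * output has derivative y_out * (1 - y_out), so for a spam instance
 * (target 1) with y_out = 0.8, three tests hit, and the default
 * learning rate of 2.0:
 *
 *   error = 1 - 0.8                       = 0.2
 *   delta = 0.8 * 0.2 * 0.2 / (3+1) * 2.0 = 0.016
 *
 * The division by num_tests_hit+1 splits the correction evenly across
 * the bias and each weight that contributed to the output. */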
void usage ()
{
  printf ("usage: perceptron [args]\n"
          "\n"
          "  -p ham_preference = adds extra ham to training set multiplied by number of\n"
          "                      tests hit (2.0 default)\n"
          "  -e num_epochs = number of epochs to train (15 default)\n"
          "  -l learning_rate = learning rate for gradient descent (2.0 default)\n"
          "  -t threshold = minimum threshold for spam (5.0 default)\n"
          "  -w weight_decay = per-epoch decay of learned weight and bias (1.0 default)\n"
          "  -h = print this help\n"
          "\n");
  exit(30);
}

int main (int argc, char ** argv)
{
  struct timeval tv, tv_start, tv_end;
  long long int t_usec;
  FILE * fp;
  int arg;

  /* Read the command line options */
  while ((arg = getopt (argc, argv, "p:e:l:t:w:h?")) != -1) {
    switch (arg) {
      case 'p':
        ham_preference = atof(optarg);
        break;
      case 'e':
        num_epochs = atoi(optarg);
        break;
      case 'l':
        learning_rate = atof(optarg);
        break;
      case 't':
        threshold = atof(optarg);
        break;
      case 'w':
        weight_decay = atof(optarg);
        break;
      case 'h':
      case '?':
        usage();
        break;
    }
  }

  /* Seed the PRNG */
  gettimeofday (&tv, 0);
  t_usec = tv.tv_sec * 1000000 + tv.tv_usec;
  srand48 ((int)t_usec);

  /* Load the instances and score constraints generated by logs-to-c. */
  loadtests();
  loadscores();

  /* If the threshold has been changed, the ranges and scores need to be
   * scaled so that the output of the program will not be affected. */
  scale_scores (DEFAULT_THRESHOLD, threshold);

  /* Replicate instances from the training set to bias against false
   * positives. */
  init_wheel ();

  /* Allocate and initialize the weight vector. */
  init_weights ();

  /* Train the network using stochastic gradient descent. */
  gettimeofday(&tv_start, 0);
  train(num_epochs, learning_rate);
  gettimeofday(&tv_end, 0);
  t_usec = tv_end.tv_sec * 1000000 + tv_end.tv_usec
           - (tv_start.tv_sec * 1000000 + tv_start.tv_usec);
  printf ("Training time = %fs.\n", t_usec / 1000000.0f);

  /* Translate the learned weights to SA score space and output to a
   * file. */
  fp = fopen (OUTPUT_FILE, "w");
  if ( fp ) {
    write_weights(fp);
    fclose(fp);
  }
  else {
    perror (OUTPUT_FILE);
  }

  /* Free resources */
  destroy_weights ();
  destroy_wheel ();

  return 0;
}
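
/* A typical invocation, assuming logs-to-c has already populated tmp/
 * (the exact compiler flags are illustrative, not prescribed by this
 * file):
 *
 *   gcc -O2 -o perceptron perceptron.c -lm
 *   ./perceptron -e 30 -l 1.0 -p 2.0
 *
 * The learned scores are written to perceptron.scores in the current
 * directory. */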