in mailnews/extensions/bayesian-spam-filter/nsBayesianFilter.cpp [1350:1651]
void nsBayesianFilter::classifyMessage(
Tokenizer& tokenizer, const nsACString& messageURI,
nsTArray<uint32_t>& aProTraits, nsTArray<uint32_t>& aAntiTraits,
nsIJunkMailClassificationListener* listener,
nsIMsgTraitClassificationListener* aTraitListener,
nsIMsgTraitDetailListener* aDetailListener) {
if (aProTraits.Length() != aAntiTraits.Length()) {
NS_ERROR("Each Pro trait needs a matching Anti trait");
return;
}
Token* tokens = tokenizer.copyTokens();
uint32_t tokenCount;
if (!tokens) {
// This can happen when UTF conversion of the message fails.
NS_ERROR("Trying to classify a null or invalid message");
tokenCount = 0;
// Don't return yet, so that the listeners are still called.
} else {
tokenCount = tokenizer.countTokens();
}
/* This part is similar to Paul Graham's "A Plan for Spam" algorithm, with some adjustments. */
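// Overview of the classification pass below:
// 1. Collect message counts and alias traits for each pro/anti pair.
// 2. Score each token with a smoothed Graham-style probability.
// 3. Keep the strongest clues per trait and combine them with a
//    chi-squared (Fisher-style) test.
// 4. Report the combined result to the listeners.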
uint32_t traitCount = aProTraits.Length();
// pro message counts per trait index
AutoTArray<uint32_t, kTraitAutoCapacity> numProMessages;
// anti message counts per trait index
AutoTArray<uint32_t, kTraitAutoCapacity> numAntiMessages;
// array of pro aliases per trait index
AutoTArray<nsTArray<uint32_t>, kTraitAutoCapacity> proAliasArrays;
// array of anti aliases per trait index
AutoTArray<nsTArray<uint32_t>, kTraitAutoCapacity> antiAliasArrays;
// construct the outgoing listener arrays
AutoTArray<uint32_t, kTraitAutoCapacity> traits;
AutoTArray<uint32_t, kTraitAutoCapacity> percents;
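// Note: AutoTArray keeps up to kTraitAutoCapacity elements in inline
// stack storage, so heap capacity is reserved only when the caller
// requests more traits than that.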
if (traitCount > kTraitAutoCapacity) {
traits.SetCapacity(traitCount);
percents.SetCapacity(traitCount);
numProMessages.SetCapacity(traitCount);
numAntiMessages.SetCapacity(traitCount);
proAliasArrays.SetCapacity(traitCount);
antiAliasArrays.SetCapacity(traitCount);
}
nsresult rv;
nsCOMPtr<nsIMsgTraitService> traitService(
do_GetService("@mozilla.org/msg-trait-service;1", &rv));
if (NS_FAILED(rv)) {
NS_ERROR("Failed to get trait service");
MOZ_LOG(BayesianFilterLogModule, LogLevel::Error,
("Failed to get trait service"));
}
// get aliases and message counts for the pro and anti traits
for (uint32_t traitIndex = 0; traitIndex < traitCount; traitIndex++) {
nsresult rv;
// pro trait
nsTArray<uint32_t> proAliases;
uint32_t proTrait = aProTraits[traitIndex];
if (traitService) {
rv = traitService->GetAliases(proTrait, proAliases);
if (NS_FAILED(rv)) {
NS_ERROR("trait service failed to get aliases");
MOZ_LOG(BayesianFilterLogModule, LogLevel::Error,
("trait service failed to get aliases"));
}
}
proAliasArrays.AppendElement(proAliases.Clone());
uint32_t proMessageCount = mCorpus.getMessageCount(proTrait);
for (uint32_t aliasIndex = 0; aliasIndex < proAliases.Length();
aliasIndex++)
proMessageCount += mCorpus.getMessageCount(proAliases[aliasIndex]);
numProMessages.AppendElement(proMessageCount);
// anti trait
nsTArray<uint32_t> antiAliases;
uint32_t antiTrait = aAntiTraits[traitIndex];
if (traitService) {
rv = traitService->GetAliases(antiTrait, antiAliases);
if (NS_FAILED(rv)) {
NS_ERROR("trait service failed to get aliases");
MOZ_LOG(BayesianFilterLogModule, LogLevel::Error,
("trait service failed to get aliases"));
}
}
antiAliasArrays.AppendElement(antiAliases.Clone());
uint32_t antiMessageCount = mCorpus.getMessageCount(antiTrait);
for (uint32_t aliasIndex = 0; aliasIndex < antiAliases.Length();
aliasIndex++)
antiMessageCount += mCorpus.getMessageCount(antiAliases[aliasIndex]);
numAntiMessages.AppendElement(antiMessageCount);
}
for (uint32_t i = 0; i < tokenCount; ++i) {
Token& token = tokens[i];
CorpusToken* t = mCorpus.get(token.mWord);
if (!t) continue;
for (uint32_t traitIndex = 0; traitIndex < traitCount; traitIndex++) {
uint32_t iProCount = mCorpus.getTraitCount(t, aProTraits[traitIndex]);
// add in any counts for aliases to proTrait
for (uint32_t aliasIndex = 0;
aliasIndex < proAliasArrays[traitIndex].Length(); aliasIndex++)
iProCount +=
mCorpus.getTraitCount(t, proAliasArrays[traitIndex][aliasIndex]);
double proCount = static_cast<double>(iProCount);
uint32_t iAntiCount = mCorpus.getTraitCount(t, aAntiTraits[traitIndex]);
// add in any counts for aliases to antiTrait
for (uint32_t aliasIndex = 0;
aliasIndex < antiAliasArrays[traitIndex].Length(); aliasIndex++)
iAntiCount +=
mCorpus.getTraitCount(t, antiAliasArrays[traitIndex][aliasIndex]);
double antiCount = static_cast<double>(iAntiCount);
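// The raw per-token probability computed below is conceptually
//   prob = (proCount / numProMessages) /
//          ((proCount / numProMessages) + (antiCount / numAntiMessages)),
// i.e. the token's relative frequency in pro messages. It is rearranged
// as proCount * numAntiMessages / denom so only one division is needed.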
double prob, denom;
// Prevent divide-by-zero errors by setting defaults for prob.
// If the token matches neither trait at all, ignore it.
if (antiCount == 0.0 && proCount == 0.0) continue;
// If only the anti trait matches, set the probability to 0%.
if (proCount == 0.0) prob = 0.0;
// If only the pro trait matches, set the probability to 100%.
else if (antiCount == 0.0)
prob = 1.0;
// Not really needed given the checks above, but guard the denominator too.
else if ((denom = proCount * numAntiMessages[traitIndex] +
antiCount * numProMessages[traitIndex]) == 0.0)
continue;
else
prob = (proCount * numAntiMessages[traitIndex]) / denom;
double n = proCount + antiCount;
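// Shrink the raw probability toward the neutral 0.5 when it is based on
// few data points; this matches Gary Robinson's degree-of-belief
// adjustment
//   f(w) = (s * x + n * p(w)) / (s + n)
// with strength s = 0.45 and assumed prior x = 0.5 (so s * x = 0.225).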
prob = (0.225 + n * prob) / (.45 + n);
double distance = std::abs(prob - 0.5);
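// Only tokens whose probability differs from neutral by at least 0.1
// are stored as clues for the combining step below.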
if (distance >= .1) {
mozilla::DebugOnly<nsresult> rv =
setAnalysis(token, traitIndex, distance, prob);
NS_ASSERTION(NS_SUCCEEDED(rv), "Problem in setAnalysis");
}
}
}
for (uint32_t traitIndex = 0; traitIndex < traitCount; traitIndex++) {
AutoTArray<TraitAnalysis, 1024> traitAnalyses;
// copy valid tokens into an array to sort
for (uint32_t tokenIndex = 0; tokenIndex < tokenCount; tokenIndex++) {
uint32_t storeIndex = getAnalysisIndex(tokens[tokenIndex], traitIndex);
if (storeIndex) {
TraitAnalysis ta = {tokenIndex, mAnalysisStore[storeIndex].mDistance,
mAnalysisStore[storeIndex].mProbability};
traitAnalyses.AppendElement(ta);
}
}
// sort the array by the distances
traitAnalyses.Sort(compareTraitAnalysis());
uint32_t count = traitAnalyses.Length();
uint32_t first, last = count;
const uint32_t kMaxTokens = 150;
first = (count > kMaxTokens) ? count - kMaxTokens : 0;
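// Use at most the kMaxTokens strongest clues. The sort above is
// ascending by distance, so the most significant analyses are the
// entries in [first, last) at the end of the array.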
// Set up the arrays to save details if needed.
nsTArray<double> sArray;
nsTArray<double> hArray;
uint32_t usedTokenCount = (count > kMaxTokens) ? kMaxTokens : count;
if (aDetailListener) {
sArray.SetCapacity(usedTokenCount);
hArray.SetCapacity(usedTokenCount);
}
double H = 1.0, S = 1.0;
int32_t Hexp = 0, Sexp = 0;
uint32_t goodclues = 0;
int e;
// index from end to analyze most significant first
for (uint32_t ip1 = last; ip1 != first; --ip1) {
TraitAnalysis& ta = traitAnalyses[ip1 - 1];
if (ta.mDistance > 0.0) {
goodclues++;
double value = ta.mProbability;
S *= (1.0 - value);
H *= value;
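// S and H are running products of values in (0, 1) and would quickly
// underflow to zero over hundreds of clues. Once either drops below
// 1e-200, frexp() splits off its binary exponent into Sexp/Hexp; since
// ln(m * 2^e) = ln(m) + e * ln(2), the true logarithms are recovered
// later as log(S) + Sexp * M_LN2 and log(H) + Hexp * M_LN2.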
if (S < 1e-200) {
S = frexp(S, &e);
Sexp += e;
}
if (H < 1e-200) {
H = frexp(H, &e);
Hexp += e;
}
MOZ_LOG(BayesianFilterLogModule, LogLevel::Warning,
("token probability (%s) is %f", tokens[ta.mTokenIndex].mWord,
ta.mProbability));
}
if (aDetailListener) {
sArray.AppendElement(log(S) + Sexp * M_LN2);
hArray.AppendElement(log(H) + Hexp * M_LN2);
}
}
S = log(S) + Sexp * M_LN2;
H = log(H) + Hexp * M_LN2;
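// Combine the clues with a chi-squared (Fisher-style) test: under the
// hypothesis of uniformly distributed token probabilities, -2 times a
// sum of their logs follows a chi-squared distribution with
// 2 * goodclues degrees of freedom, which the helper chi2P() evaluates.
// S and H are the indicators for the pro and anti directions, and
// (S - H + 1) / 2 rescales their difference into [0, 1], with 0.5
// meaning no net evidence either way.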
double prob;
if (goodclues > 0) {
int32_t chi_error;
S = chi2P(-2.0 * S, 2.0 * goodclues, &chi_error);
if (!chi_error) H = chi2P(-2.0 * H, 2.0 * goodclues, &chi_error);
// If either chi-squared computation reported an error, toss the entire
// calculation.
if (!chi_error)
prob = (S - H + 1.0) / 2.0;
else
prob = 0.5;
} else
prob = 0.5;
if (aDetailListener) {
// Prepare output arrays
nsTArray<uint32_t> tokenPercents(usedTokenCount);
nsTArray<uint32_t> runningPercents(usedTokenCount);
nsTArray<nsString> tokenStrings(usedTokenCount);
double clueCount = 1.0;
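// Walk the clues from most to least significant, recomputing the
// combined probability after each one from the partial log sums saved
// in sArray/hArray, so the listener sees how every additional token
// shifts the running score.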
for (uint32_t tokenIndex = 0; tokenIndex < usedTokenCount; tokenIndex++) {
TraitAnalysis& ta = traitAnalyses[last - 1 - tokenIndex];
int32_t chi_error;
S = chi2P(-2.0 * sArray[tokenIndex], 2.0 * clueCount, &chi_error);
if (!chi_error)
H = chi2P(-2.0 * hArray[tokenIndex], 2.0 * clueCount, &chi_error);
clueCount += 1.0;
double runningProb;
if (!chi_error)
runningProb = (S - H + 1.0) / 2.0;
else
runningProb = 0.5;
runningPercents.AppendElement(
static_cast<uint32_t>(runningProb * 100. + .5));
tokenPercents.AppendElement(
static_cast<uint32_t>(ta.mProbability * 100. + .5));
tokenStrings.AppendElement(
NS_ConvertUTF8toUTF16(tokens[ta.mTokenIndex].mWord));
}
aDetailListener->OnMessageTraitDetails(messageURI, aProTraits[traitIndex],
tokenStrings, tokenPercents,
runningPercents);
}
uint32_t proPercent = static_cast<uint32_t>(prob * 100. + .5);
// directly classify junk to maintain backwards compatibility
if (aProTraits[traitIndex] == kJunkTrait) {
bool isJunk = (prob >= mJunkProbabilityThreshold);
MOZ_LOG(BayesianFilterLogModule, LogLevel::Info,
("%s is junk probability = (%f) HAM SCORE:%f SPAM SCORE:%f",
PromiseFlatCString(messageURI).get(), prob, H, S));
// The algorithm in "A Plan For Spam" assumes that you have a large
// good corpus and a large junk corpus.
// That won't be the case for users who first enable the junk mail
// trait, so we do certain things to encourage them to train:
//
// If there are no good tokens, assume the message is junk; this will
// "encourage" the user to train. And if there are no bad tokens,
// assume the message is not junk; this will also "encourage" the user
// to train.
// See bug #194238.
if (listener && !mCorpus.getMessageCount(kGoodTrait))
isJunk = true;
else if (listener && !mCorpus.getMessageCount(kJunkTrait))
isJunk = false;
if (listener)
listener->OnMessageClassified(
messageURI,
isJunk ? nsMsgJunkStatus(nsIJunkMailPlugin::JUNK)
: nsMsgJunkStatus(nsIJunkMailPlugin::GOOD),
proPercent);
}
if (aTraitListener) {
traits.AppendElement(aProTraits[traitIndex]);
percents.AppendElement(proPercent);
}
}
if (aTraitListener)
aTraitListener->OnMessageTraitsClassified(messageURI, traits, percents);
delete[] tokens;
// Reuse mAnalysisStore without clearing its memory. Index 0 is treated
// as "no analysis" by getAnalysisIndex(), so allocation restarts at 1.
mNextAnalysisIndex = 1;
// but shrink it back to the default size
if (mAnalysisStore.Length() > kAnalysisStoreCapacity)
mAnalysisStore.RemoveElementsAt(
kAnalysisStoreCapacity,
mAnalysisStore.Length() - kAnalysisStoreCapacity);
mAnalysisStore.Compact();
}