Server IP : 85.214.239.14 / Your IP : 18.221.165.190 Web Server : Apache/2.4.62 (Debian) System : Linux h2886529.stratoserver.net 4.9.0 #1 SMP Tue Jan 9 19:45:01 MSK 2024 x86_64 User : www-data ( 33) PHP Version : 7.4.18 Disable Function : pcntl_alarm,pcntl_fork,pcntl_waitpid,pcntl_wait,pcntl_wifexited,pcntl_wifstopped,pcntl_wifsignaled,pcntl_wifcontinued,pcntl_wexitstatus,pcntl_wtermsig,pcntl_wstopsig,pcntl_signal,pcntl_signal_get_handler,pcntl_signal_dispatch,pcntl_get_last_error,pcntl_strerror,pcntl_sigprocmask,pcntl_sigwaitinfo,pcntl_sigtimedwait,pcntl_exec,pcntl_getpriority,pcntl_setpriority,pcntl_async_signals,pcntl_unshare, MySQL : OFF | cURL : OFF | WGET : ON | Perl : ON | Python : ON | Sudo : ON | Pkexec : OFF Directory : /usr/share/perl5/Mail/SpamAssassin/Plugin/ |
Upload File : |
# <@LICENSE> # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to you under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at: # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # </@LICENSE> =head1 NAME Mail::SpamAssassin::Plugin::Bayes - determine spammishness using a Bayesian classifier =head1 DESCRIPTION This is a Bayesian-style probabilistic classifier, using an algorithm based on the one detailed in Paul Graham's I<A Plan For Spam> paper at: http://www.paulgraham.com/spam.html It also incorporates some other aspects taken from Graham Robinson's webpage on the subject at: http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html And the chi-square probability combiner as described here: http://www.linuxjournal.com/print.php?sid=6467 The results are incorporated into SpamAssassin as the BAYES_* rules. =head1 ADMINISTRATOR SETTINGS =over 4 =item bayes_stopword_languages lang (default: en) Languages enabled in bayes stopwords processing, every language have a default stopwords regexp, tokens matching this regular expression will not be considered in bayes processing. Custom regular expressions for additional languages can be defined in C<local.cf>. Custom regular expressions can be specified by using the C<bayes_stopword_lang> keyword like in the following example: bayes_stopword_languages en se bayes_stopword_en (?:you|me) bayes_stopword_se (?:du|mig) Regexps are case-insensitive will be anchored automatically at beginning and end. To disable stopwords usage, specify C<bayes_stopword_languages disable>. Only one bayes_stopword_languages or bayes_stopword_xx configuration line can be used. New configuration line will override the old one, for example the ones from SpamAssassin default ruleset (60_bayes_stopwords.cf). =back =over 4 =item bayes_max_token_length (default: 15) Configure the maximum number of character a token could contain =back =cut package Mail::SpamAssassin::Plugin::Bayes; use strict; use warnings; # use bytes; use re 'taint'; use Digest::SHA qw(sha1 sha1_hex); use Mail::SpamAssassin::Plugin; use Mail::SpamAssassin::PerMsgStatus; use Mail::SpamAssassin::Logger; use Mail::SpamAssassin::Util qw(compile_regexp untaint_var); # pick ONLY ONE of these combining implementations. use Mail::SpamAssassin::Bayes::CombineChi; # use Mail::SpamAssassin::Bayes::CombineNaiveBayes; our @ISA = qw(Mail::SpamAssassin::Plugin); # Which headers should we scan for tokens? Don't use all of them, as it's easy # to pick up spurious clues from some. What we now do is use all of them # *less* these well-known headers; that way we can pick up spammers' tracking # headers (which are obviously not well-known in advance!). # Received is handled specially our $IGNORED_HDRS = qr{(?: (?:X-)?Sender # misc noise |Delivered-To |Delivery-Date |(?:X-)?Envelope-To |X-MIME-Auto[Cc]onverted |X-Converted-To-Plain-Text |Subject # not worth a tiny gain vs. to db size increase # Date: can provide invalid cues if your spam corpus is # older/newer than ham |Date # List headers: ignore. a spamfiltering mailing list will # become a nonspam sign. |X-List|(?:X-)?Mailing-List |(?:X-)?List-(?:Archive|Help|Id|Owner|Post|Subscribe |Unsubscribe|Host|Id|Manager|Admin|Comment |Name|Url) |X-Unsub(?:scribe)? |X-Mailman-Version |X-Been[Tt]here |X-Loop |Mail-Followup-To |X-eGroups-(?:Return|From) |X-MDMailing-List |X-XEmacs-List |X-Sympa-To # gatewayed through mailing list (thanks to Allen Smith) |(?:X-)?Resent-(?:From|To|Date) |(?:X-)?Original-(?:From|To|Date) # Spamfilter/virus-scanner headers: too easy to chain from # these |X-MailScanner(?:-SpamCheck)? |X-Spam(?:-(?:Status|Level|Flag|Report|Hits|Score|Checker-Version))? |X-Antispam |X-RBL-Warning |X-Mailscanner |X-MDaemon-Deliver-To |X-Virus-Scanned |X-Mass-Check-Id |X-Pyzor |X-DCC-\S{2,25}-Metrics |X-Filtered-B[Yy] |X-Scanned-By |X-Scanner |X-AP-Spam-(?:Score|Status) |X-RIPE-Spam-Status |X-SpamCop-[^:]+ |X-SMTPD |(?:X-)?Spam-Apparently-To |SPAM |X-Perlmx-Spam |X-Bogosity # some noisy Outlook headers that add no good clues: |Content-Class |Thread-(?:Index|Topic) |X-Original[Aa]rrival[Tt]ime # Annotations from IMAP, POP, and MH: |(?:X-)?Status |X-Flags |X-Keywords |Replied |Forwarded |Lines |Content-Length |X-UIDL? |X-IMAPbase # Annotations from Bugzilla |X-Bugzilla-[^:]+ # Annotations from VM: (thanks to Allen Smith) |X-VM-(?:Bookmark|(?:POP|IMAP)-Retrieved|Labels|Last-Modified |Summary-Format|VHeader|v\d-Data|Message-Order) # Annotations from Gnus: | X-Gnus-Mail-Source | Xref )}ix; # Note only the presence of these headers, in order to reduce the # hapaxen they generate. our $MARK_PRESENCE_ONLY_HDRS = qr{(?: X-Face |X-(?:Gnu-?PG|PGP|GPG)(?:-Key)?-Fingerprint |D(?:KIM|omainKey)-Signature |X-Google-DKIM-Signature |ARC-(?:Message-Signature|Seal) |Autocrypt )}ix; # tweaks tested as of Nov 18 2002 by jm posted to -devel at # http://sourceforge.net/p/spamassassin/mailman/message/12977556/ # for results. The winners are now the default settings. use constant IGNORE_TITLE_CASE => 1; use constant TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES => 0; use constant TOKENIZE_LONG_8BIT_SEQS_AS_UTF8_CHARS => 1; use constant TOKENIZE_LONG_TOKENS_AS_SKIPS => 1; # tweaks by jm on May 12 2003, see -devel email at # http://sourceforge.net/p/spamassassin/mailman/message/14844556/ use constant PRE_CHEW_ADDR_HEADERS => 1; use constant CHEW_BODY_URIS => 1; use constant CHEW_BODY_MAILADDRS => 1; use constant HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1; use constant BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1; use constant URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 0; use constant IGNORE_MSGID_TOKENS => 0; # tweaks of 12 March 2004, see bug 2129. use constant DECOMPOSE_BODY_TOKENS => 1; use constant MAP_HEADERS_MID => 1; use constant MAP_HEADERS_FROMTOCC => 1; use constant MAP_HEADERS_USERAGENT => 1; # tweaks, see http://issues.apache.org/SpamAssassin/show_bug.cgi?id=3173#c26 use constant ADD_INVIZ_TOKENS_I_PREFIX => 1; use constant ADD_INVIZ_TOKENS_NO_PREFIX => 0; # We store header-mined tokens in the db with a "HHeaderName:val" format. # some headers may contain lots of gibberish tokens, so allow a little basic # compression by mapping the header name at least here. these are the headers # which appear with the most frequency in my db. note: this doesn't have to # be 2-way (ie. LHSes that map to the same RHS are not a problem), but mixing # tokens from multiple different headers may impact accuracy, so might as well # avoid this if possible. These are the top ones from my corpus, BTW (jm). our %HEADER_NAME_COMPRESSION = ( 'Message-Id' => '*m', 'Message-ID' => '*M', 'Received' => '*r', 'User-Agent' => '*u', 'References' => '*f', 'In-Reply-To' => '*i', 'From' => '*F', 'Reply-To' => '*R', 'Return-Path' => '*p', 'Return-path' => '*rp', 'X-Mailer' => '*x', 'X-Authentication-Warning' => '*a', 'Organization' => '*o', 'Organisation' => '*o', 'Content-Type' => '*ct', 'Content-Disposition' => '*cd', 'Content-Transfer-Encoding' => '*ce', 'x-spam-relays-trusted' => '*RT', 'x-spam-relays-untrusted' => '*RU', ); # How many seconds should the opportunistic_expire lock be valid? our $OPPORTUNISTIC_LOCK_VALID = 300; # Should we use the Robinson f(w) equation from # http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html ? # It gives better results, in that scores are more likely to distribute # into the <0.5 range for nonspam and >0.5 for spam. use constant USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS => 1; # How many of the most significant tokens should we use for the p(w) # calculation? use constant N_SIGNIFICANT_TOKENS => 150; # How many significant tokens are required for a classifier score to # be considered usable? use constant REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE => -1; # How long a token should we hold onto? (note: German speakers typically # will require a longer token than English ones.) # This is just a default value, option can be changed using # bayes_max_token_length option use constant MAX_TOKEN_LENGTH => 15; ########################################################################### sub new { my $class = shift; my ($main) = @_; $class = ref($class) || $class; my $self = $class->SUPER::new($main); bless ($self, $class); $self->{main} = $main; $self->{conf} = $main->{conf}; $self->{use_ignores} = 1; # Old default stopword list, need to have hardcoded one incase sa-update is not available $self->{bayes_stopword}{en} = qr/(?:a(?:ble|l(?:ready|l)|n[dy]|re)|b(?:ecause|oth)|c(?:an|ome)|e(?:ach|mail|ven)|f(?:ew|irst|or|rom)|give|h(?:a(?:ve|s)|ttp)|i(?:n(?:formation|to)|t\'s)|just|know|l(?:ike|o(?:ng|ok))|m(?:a(?:de|il(?:(?:ing|to))?|ke|ny)|o(?:re|st)|uch)|n(?:eed|o[tw]|umber)|o(?:ff|n(?:ly|e)|ut|wn)|p(?:eople|lace)|right|s(?:ame|ee|uch)|t(?:h(?:at|is|rough|e)|ime)|using|w(?:eb|h(?:ere|y)|ith(?:out)?|or(?:ld|k))|y(?:ears?|ou(?:(?:\'re|r))?))/; $self->set_config($self->{conf}); $self->register_eval_rule("check_bayes", $Mail::SpamAssassin::Conf::TYPE_BODY_EVALS); $self; } sub set_config { my ($self, $conf) = @_; my @cmds; push(@cmds, { setting => 'bayes_max_token_length', default => MAX_TOKEN_LENGTH, is_admin => 1, type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC, }); push(@cmds, { setting => 'bayes_stopword_languages', default => ['en'], is_admin => 1, type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRINGLIST, code => sub { my ($self, $key, $value, $line) = @_; my @langs; if ($value eq 'disable') { @{$self->{bayes_stopword_languages}} = (); } else { foreach my $lang (split(/(?:\s*,\s*|\s+)/, lc($value))) { if ($lang !~ /^([a-z]{2})$/) { return $Mail::SpamAssassin::Conf::INVALID_VALUE; } push @langs, $lang; } if (!@langs) { return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE; } @{$self->{bayes_stopword_languages}} = @langs; } } }); $conf->{parser}->register_commands(\@cmds); } sub parse_config { my ($self, $opts) = @_; # Ignore users's configuration lines return 0 if $opts->{user_config}; if ($opts->{key} =~ /^bayes_stopword_([a-z]{2})$/i) { $self->inhibit_further_callbacks(); my $lang = lc($1); foreach my $re (split(/\s+/, $opts->{value})) { my ($rec, $err) = compile_regexp('^(?i)'.$re.'$', 0); if (!$rec) { warn "bayes: invalid regexp for $opts->{key}: $err\n"; return 0; } $self->{bayes_stopword}{$lang} = $rec; } return 1; } return 0; } sub finish_parsing_end { my ($self, $opts) = @_; my $conf = $opts->{conf}; my @langs; foreach my $lang (@{$conf->{bayes_stopword_languages}}) { if (defined $self->{bayes_stopword}{$lang}) { push @langs, $lang; } else { warn "bayes: missing stopwords regexp for language '$lang'\n"; } } if (@langs) { dbg("bayes: stopwords for languages enabled: ".join(' ', @langs)); @{$conf->{bayes_stopword_languages}} = @langs; } else { dbg("bayes: no stopword languages enabled"); $conf->{bayes_stopword_languages} = []; } return 0; } sub finish { my $self = shift; if ($self->{store}) { $self->{store}->untie_db(); } %{$self} = (); } ########################################################################### # Plugin hook. # Return this implementation object, for callers that need to know # it. TODO: callers shouldn't *need* to know it! # used only in test suite to get access to {store}, internal APIs. # sub learner_get_implementation { return shift; } ########################################################################### # Plugin hook. # Called in the parent process shortly before forking off child processes. sub prefork_init { my ($self) = @_; if ($self->{store} && $self->{store}->UNIVERSAL::can('prefork_init')) { $self->{store}->prefork_init; } } ########################################################################### # Plugin hook. # Called in a child process shortly after being spawned. sub spamd_child_init { my ($self) = @_; if ($self->{store} && $self->{store}->UNIVERSAL::can('spamd_child_init')) { $self->{store}->spamd_child_init; } } ########################################################################### # Plugin hook. sub check_bayes { my ($self, $pms, $fulltext, $min, $max) = @_; return 0 if (!$self->{conf}->{use_learner}); return 0 if (!$self->{conf}->{use_bayes} || !$self->{conf}->{use_bayes_rules}); if (!exists ($pms->{bayes_score})) { my $timer = $self->{main}->time_method("check_bayes"); $pms->{bayes_score} = $self->scan($pms, $pms->{msg}); } if (defined $pms->{bayes_score} && ($min == 0 || $pms->{bayes_score} > $min) && ($max eq "undef" || $pms->{bayes_score} <= $max)) { if ($self->{conf}->{detailed_bayes_score}) { $pms->test_log(sprintf ("score: %3.4f, hits: %s", $pms->{bayes_score}, $pms->{bayes_hits})); } else { $pms->test_log(sprintf ("score: %3.4f", $pms->{bayes_score})); } return 1; } return 0; } ########################################################################### # Plugin hook. sub learner_close { my ($self, $params) = @_; my $quiet = $params->{quiet}; # do a sanity check here. Weird things happen if we remain tied # after compiling; for example, spamd will never see that the # number of messages has reached the bayes-scanning threshold. if ($self->{store}->db_readable()) { warn "bayes: oops! still tied to bayes DBs, untying\n" unless $quiet; $self->{store}->untie_db(); } } ########################################################################### # read configuration items to control bayes behaviour. Called by # BayesStore::read_db_configs(). sub read_db_configs { my ($self) = @_; # use of hapaxes. Set on bayes object, since it controls prob # computation. $self->{use_hapaxes} = $self->{conf}->{bayes_use_hapaxes}; } ########################################################################### sub ignore_message { my ($self,$PMS) = @_; return 0 unless $self->{use_ignores}; my $ig_from = $self->{main}->call_plugins ("check_wb_list", { permsgstatus => $PMS, type => 'from', list => 'bayes_ignore_from' }); my $ig_to = $self->{main}->call_plugins ("check_wb_list", { permsgstatus => $PMS, type => 'to', list => 'bayes_ignore_to' }); my $ignore = $ig_from || $ig_to; dbg("bayes: not using bayes, bayes_ignore_from or _to rule") if $ignore; return $ignore; } ########################################################################### # Plugin hook. sub learn_message { my ($self, $params) = @_; my $isspam = $params->{isspam}; my $msg = $params->{msg}; my $id = $params->{id}; if (!$self->{conf}->{use_bayes}) { return; } my $msgdata = $self->get_body_from_msg ($msg); my $ret; eval { local $SIG{'__DIE__'}; # do not run user die() traps in here my $timer = $self->{main}->time_method("b_learn"); my $ok; if ($self->{main}->{learn_to_journal}) { # If we're going to learn to journal, we'll try going r/o first... # If that fails for some reason, let's try going r/w. This happens # if the DB doesn't exist yet. $ok = $self->{store}->tie_db_readonly() || $self->{store}->tie_db_writable(); } else { $ok = $self->{store}->tie_db_writable(); } if ($ok) { $ret = $self->_learn_trapped ($isspam, $msg, $msgdata, $id); if (!$self->{main}->{learn_caller_will_untie}) { $self->{store}->untie_db(); } } 1; } or do { # if we died, untie the dbs. my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat; $self->{store}->untie_db(); die "bayes: (in learn) $eval_stat\n"; }; return $ret; } # this function is trapped by the wrapper above sub _learn_trapped { my ($self, $isspam, $msg, $msgdata, $msgid) = @_; my @msgid = ( $msgid ); if (!defined $msgid) { @msgid = ( $msg->generate_msgid(), $msg->get_msgid() ); } foreach my $msgid_t ( @msgid ) { next if !defined $msgid_t; my $seen = $self->{store}->seen_get ($msgid_t); if (defined ($seen)) { if (($seen eq 's' && $isspam) || ($seen eq 'h' && !$isspam)) { dbg("bayes: $msgid_t already learnt correctly, not learning twice"); return 0; } elsif ($seen !~ /^[hs]$/) { warn("bayes: db_seen corrupt: value='$seen' for $msgid_t, ignored"); } else { # bug 3704: If the message was already learned, don't try learning it again. # this prevents, for instance, manually learning as spam, then autolearning # as ham, or visa versa. if ($self->{main}->{learn_no_relearn}) { dbg("bayes: $msgid_t already learnt as opposite, not re-learning"); return 0; } dbg("bayes: $msgid_t already learnt as opposite, forgetting first"); # kluge so that forget() won't untie the db on us ... my $orig = $self->{main}->{learn_caller_will_untie}; $self->{main}->{learn_caller_will_untie} = 1; my $fatal = !defined $self->{main}->{bayes_scanner}->forget ($msg); # reset the value post-forget() ... $self->{main}->{learn_caller_will_untie} = $orig; # forget() gave us a fatal error, so propagate that up if ($fatal) { dbg("bayes: forget() returned a fatal error, so learn() will too"); return; } } # we're only going to have seen this once, so stop if it's been # seen already last; } } # Now that we're sure we haven't seen this message before ... $msgid = $msgid[0]; my $msgatime = $msg->receive_date(); # If the message atime comes back as being more than 1 day in the # future, something's messed up and we should revert to current time as # a safety measure. # $msgatime = time if ( $msgatime - time > 86400 ); my $tokens = $self->tokenize($msg, $msgdata); { my $timer = $self->{main}->time_method('b_count_change'); if ($isspam) { $self->{store}->nspam_nham_change(1, 0); $self->{store}->multi_tok_count_change(1, 0, $tokens, $msgatime); } else { $self->{store}->nspam_nham_change(0, 1); $self->{store}->multi_tok_count_change(0, 1, $tokens, $msgatime); } } $self->{store}->seen_put ($msgid, ($isspam ? 's' : 'h')); $self->{store}->cleanup(); $self->{main}->call_plugins("bayes_learn", { toksref => $tokens, isspam => $isspam, msgid => $msgid, msgatime => $msgatime, }); dbg("bayes: learned '$msgid', atime: $msgatime"); 1; } ########################################################################### # Plugin hook. sub forget_message { my ($self, $params) = @_; my $msg = $params->{msg}; my $id = $params->{id}; if (!$self->{conf}->{use_bayes}) { return; } my $msgdata = $self->get_body_from_msg ($msg); my $ret; # we still tie for writing here, since we write to the seen db # synchronously eval { local $SIG{'__DIE__'}; # do not run user die() traps in here my $timer = $self->{main}->time_method("b_learn"); my $ok; if ($self->{main}->{learn_to_journal}) { # If we're going to learn to journal, we'll try going r/o first... # If that fails for some reason, let's try going r/w. This happens # if the DB doesn't exist yet. $ok = $self->{store}->tie_db_readonly() || $self->{store}->tie_db_writable(); } else { $ok = $self->{store}->tie_db_writable(); } if ($ok) { $ret = $self->_forget_trapped ($msg, $msgdata, $id); if (!$self->{main}->{learn_caller_will_untie}) { $self->{store}->untie_db(); } } 1; } or do { # if we died, untie the dbs. my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat; $self->{store}->untie_db(); die "bayes: (in forget) $eval_stat\n"; }; return $ret; } # this function is trapped by the wrapper above sub _forget_trapped { my ($self, $msg, $msgdata, $msgid) = @_; my @msgid = ( $msgid ); my $isspam; if (!defined $msgid) { @msgid = ( $msg->generate_msgid(), $msg->get_msgid() ); } while( $msgid = shift @msgid ) { my $seen = $self->{store}->seen_get ($msgid); if (defined ($seen)) { if ($seen eq 's') { $isspam = 1; } elsif ($seen eq 'h') { $isspam = 0; } else { dbg("bayes: forget: msgid $msgid seen entry is neither ham nor spam, ignored"); return 0; } # messages should only be learned once, so stop if we find a msgid # which was seen before last; } else { dbg("bayes: forget: msgid $msgid not learnt, ignored"); } } # This message wasn't learnt before, so return if (!defined $isspam) { dbg("bayes: forget: no msgid from this message has been learnt, skipping message"); return 0; } elsif ($isspam) { $self->{store}->nspam_nham_change (-1, 0); } else { $self->{store}->nspam_nham_change (0, -1); } my $tokens = $self->tokenize($msg, $msgdata); if ($isspam) { $self->{store}->multi_tok_count_change (-1, 0, $tokens); } else { $self->{store}->multi_tok_count_change (0, -1, $tokens); } $self->{store}->seen_delete ($msgid); $self->{store}->cleanup(); $self->{main}->call_plugins("bayes_forget", { toksref => $tokens, isspam => $isspam, msgid => $msgid, }); 1; } ########################################################################### # Plugin hook. sub learner_sync { my ($self, $params) = @_; if (!$self->{conf}->{use_bayes}) { return 0; } dbg("bayes: bayes journal sync starting"); $self->{store}->sync($params); dbg("bayes: bayes journal sync completed"); } ########################################################################### # Plugin hook. sub learner_expire_old_training { my ($self, $params) = @_; if (!$self->{conf}->{use_bayes}) { return 0; } dbg("bayes: expiry starting"); my $timer = $self->{main}->time_method("expire_bayes"); $self->{store}->expire_old_tokens($params); dbg("bayes: expiry completed"); } ########################################################################### # Plugin hook. # Check to make sure we can tie() the DB, and we have enough entries to do a scan # if we're told the caller will untie(), go ahead and leave the db tied. sub learner_is_scan_available { my ($self, $params) = @_; return 0 unless $self->{conf}->{use_bayes}; return 0 unless $self->{store}->tie_db_readonly(); # We need the DB to stay tied, so if the journal sync occurs, don't untie! my $caller_untie = $self->{main}->{learn_caller_will_untie}; $self->{main}->{learn_caller_will_untie} = 1; # Do a journal sync if necessary. Do this before the nspam_nham_get() # call since the sync may cause an update in the number of messages # learnt. $self->_opportunistic_calls(1); # Reset the variable appropriately $self->{main}->{learn_caller_will_untie} = $caller_untie; my ($ns, $nn) = $self->{store}->nspam_nham_get(); if ($ns < $self->{conf}->{bayes_min_spam_num}) { dbg("bayes: not available for scanning, only $ns spam(s) in bayes DB < ".$self->{conf}->{bayes_min_spam_num}); if (!$self->{main}->{learn_caller_will_untie}) { $self->{store}->untie_db(); } return 0; } if ($nn < $self->{conf}->{bayes_min_ham_num}) { dbg("bayes: not available for scanning, only $nn ham(s) in bayes DB < ".$self->{conf}->{bayes_min_ham_num}); if (!$self->{main}->{learn_caller_will_untie}) { $self->{store}->untie_db(); } return 0; } return 1; } ########################################################################### sub scan { my ($self, $permsgstatus, $msg) = @_; return unless $self->{conf}->{use_learner}; # When we're doing a scan, we'll guarantee that we'll do the untie, # so override the global setting until we're done. my $caller_untie = $self->{main}->{learn_caller_will_untie}; $self->{main}->{learn_caller_will_untie} = 1; goto skip if ($self->{main}->{bayes_scanner}->ignore_message($permsgstatus)); goto skip unless $self->learner_is_scan_available(); my ($ns, $nn) = $self->{store}->nspam_nham_get(); ## if ($self->{log_raw_counts}) { # see _compute_prob_for_token() ## $self->{raw_counts} = " ns=$ns nn=$nn "; ## } dbg("bayes: corpus size: nspam = $ns, nham = $nn"); my $msgtokens; { my $timer = $self->{main}->time_method('b_tokenize'); my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus); $msgtokens = $self->tokenize($msg, $msgdata); } my $tokensdata; { my $timer = $self->{main}->time_method('b_tok_get_all'); $tokensdata = $self->{store}->tok_get_all(keys %{$msgtokens}); } my $timer_compute_prob = $self->{main}->time_method('b_comp_prob'); my $probabilities_ref = $self->_compute_prob_for_all_tokens($tokensdata, $ns, $nn); my %pw; foreach my $tokendata (@{$tokensdata}) { my $prob = shift(@$probabilities_ref); next unless defined $prob; my ($token, $tok_spam, $tok_ham, $atime) = @{$tokendata}; $pw{$token} = { prob => $prob, spam_count => $tok_spam, ham_count => $tok_ham, atime => $atime }; } my @pw_keys = keys %pw; # If none of the tokens were found in the DB, we're going to skip # this message... if (!@pw_keys) { dbg("bayes: cannot use bayes on this message; none of the tokens were found in the database"); goto skip; } my $tcount_total = keys %{$msgtokens}; my $tcount_learned = scalar @pw_keys; # Figure out the message receive time (used as atime below) # If the message atime comes back as being in the future, something's # messed up and we should revert to current time as a safety measure. # my $msgatime = $msg->receive_date(); my $now = time; $msgatime = $now if ( $msgatime > $now ); my @touch_tokens; my $tinfo_spammy = $permsgstatus->{bayes_token_info_spammy} = []; my $tinfo_hammy = $permsgstatus->{bayes_token_info_hammy} = []; my %tok_strength = map( ($_, abs($pw{$_}->{prob} - 0.5)), @pw_keys); my $log_each_token = (would_log('dbg', 'bayes') > 1); # now take the most significant tokens and calculate probs using # Robinson's formula. @pw_keys = sort { $tok_strength{$b} <=> $tok_strength{$a} } @pw_keys; if (@pw_keys > N_SIGNIFICANT_TOKENS) { $#pw_keys = N_SIGNIFICANT_TOKENS - 1 } my @sorted; my $score; foreach my $tok (@pw_keys) { next if $tok_strength{$tok} < $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH; my $pw_tok = $pw{$tok}; my $pw_prob = $pw_tok->{prob}; # What's more expensive, scanning headers for HAMMYTOKENS and # SPAMMYTOKENS tags that aren't there or collecting data that # won't be used? Just collecting the data is certainly simpler. # my $raw_token = $msgtokens->{$tok} || "(unknown)"; my $s = $pw_tok->{spam_count}; my $n = $pw_tok->{ham_count}; my $a = $pw_tok->{atime}; push( @{ $pw_prob < 0.5 ? $tinfo_hammy : $tinfo_spammy }, [$raw_token, $pw_prob, $s, $n, $a] ); push(@sorted, $pw_prob); # update the atime on this token, it proved useful push(@touch_tokens, $tok); if ($log_each_token) { dbg("bayes: token '$raw_token' => $pw_prob"); } } if (!@sorted || (REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE > 0 && $#sorted <= REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE)) { dbg("bayes: cannot use bayes on this message; not enough usable tokens found"); goto skip; } $score = Mail::SpamAssassin::Bayes::Combine::combine($ns, $nn, \@sorted); undef $timer_compute_prob; # end a timing section # Couldn't come up with a probability? goto skip unless defined $score; dbg("bayes: score = $score"); # no need to call tok_touch_all unless there were significant # tokens and a score was returned # we don't really care about the return value here { my $timer = $self->{main}->time_method('b_tok_touch_all'); $self->{store}->tok_touch_all(\@touch_tokens, $msgatime); } my $timer_finish = $self->{main}->time_method('b_finish'); $permsgstatus->{bayes_nspam} = $ns; $permsgstatus->{bayes_nham} = $nn; ## if ($self->{log_raw_counts}) { # see _compute_prob_for_token() ## print "#Bayes-Raw-Counts: $self->{raw_counts}\n"; ## } $self->{main}->call_plugins("bayes_scan", { toksref => $msgtokens, probsref => \%pw, score => $score, msgatime => $msgatime, significant_tokens => \@touch_tokens, }); skip: if (!defined $score) { dbg("bayes: not scoring message, returning undef"); } undef $timer_compute_prob; # end a timing section if still running if (!defined $timer_finish) { $timer_finish = $self->{main}->time_method('b_finish'); } # Take any opportunistic actions we can take if ($self->{main}->{opportunistic_expire_check_only}) { # we're supposed to report on expiry only -- so do the # _opportunistic_calls() run for the journal only. $self->_opportunistic_calls(1); $permsgstatus->{bayes_expiry_due} = $self->{store}->expiry_due(); } else { $self->_opportunistic_calls(); } # Do any cleanup we need to do $self->{store}->cleanup(); # Reset the value accordingly $self->{main}->{learn_caller_will_untie} = $caller_untie; # If our caller won't untie the db, we need to do it. if (!$caller_untie) { $self->{store}->untie_db(); } $permsgstatus->set_tag ('BAYESTCHAMMY', ($tinfo_hammy ? scalar @{$tinfo_hammy} : 0)); $permsgstatus->set_tag ('BAYESTCSPAMMY', ($tinfo_spammy ? scalar @{$tinfo_spammy} : 0)); $permsgstatus->set_tag ('BAYESTCLEARNED', $tcount_learned); $permsgstatus->set_tag ('BAYESTC', $tcount_total); $permsgstatus->set_tag ('HAMMYTOKENS', sub { my $pms = shift; $self->bayes_report_make_list ($pms, $pms->{bayes_token_info_hammy}, shift); }); $permsgstatus->set_tag ('SPAMMYTOKENS', sub { my $pms = shift; $self->bayes_report_make_list ($pms, $pms->{bayes_token_info_spammy}, shift); }); $permsgstatus->set_tag ('TOKENSUMMARY', sub { my $pms = shift; if ( defined $pms->{tag_data}{BAYESTC} ) { my $tcount_neutral = $pms->{tag_data}{BAYESTCLEARNED} - $pms->{tag_data}{BAYESTCSPAMMY} - $pms->{tag_data}{BAYESTCHAMMY}; my $tcount_new = $pms->{tag_data}{BAYESTC} - $pms->{tag_data}{BAYESTCLEARNED}; "Tokens: new, $tcount_new; " ."hammy, $pms->{tag_data}{BAYESTCHAMMY}; " ."neutral, $tcount_neutral; " ."spammy, $pms->{tag_data}{BAYESTCSPAMMY}." } else { "Bayes not run."; } }); return $score; } ########################################################################### # Plugin hook. sub learner_dump_database { my ($self, $params) = @_; my $magic = $params->{magic}; my $toks = $params->{toks}; my $regex = $params->{regex}; # allow dump to occur even if use_bayes disables everything else ... #return 0 unless $self->{conf}->{use_bayes}; return 0 unless $self->{store}->tie_db_readonly(); my @vars = $self->{store}->get_storage_variables(); my($sb,$ns,$nh,$nt,$le,$oa,$bv,$js,$ad,$er,$na) = @vars; my $template = '%3.3f %10u %10u %10u %s'."\n"; if ( $magic ) { printf($template, 0.0, 0, $bv, 0, 'non-token data: bayes db version') or die "Error writing: $!"; printf($template, 0.0, 0, $ns, 0, 'non-token data: nspam') or die "Error writing: $!"; printf($template, 0.0, 0, $nh, 0, 'non-token data: nham') or die "Error writing: $!"; printf($template, 0.0, 0, $nt, 0, 'non-token data: ntokens') or die "Error writing: $!"; printf($template, 0.0, 0, $oa, 0, 'non-token data: oldest atime') or die "Error writing: $!"; if ( $bv >= 2 ) { printf($template, 0.0, 0, $na, 0, 'non-token data: newest atime') or die "Error writing: $!"; } if ( $bv < 2 ) { printf($template, 0.0, 0, $sb, 0, 'non-token data: current scan-count') or die "Error writing: $!"; } if ( $bv >= 2 ) { printf($template, 0.0, 0, $js, 0, 'non-token data: last journal sync atime') or die "Error writing: $!"; } printf($template, 0.0, 0, $le, 0, 'non-token data: last expiry atime') or die "Error writing: $!"; if ( $bv >= 2 ) { printf($template, 0.0, 0, $ad, 0, 'non-token data: last expire atime delta') or die "Error writing: $!"; printf($template, 0.0, 0, $er, 0, 'non-token data: last expire reduction count') or die "Error writing: $!"; } } if ( $toks ) { # let the store sort out the db_toks $self->{store}->dump_db_toks($template, $regex, @vars); } if (!$self->{main}->{learn_caller_will_untie}) { $self->{store}->untie_db(); } return 1; } ########################################################################### # TODO: these are NOT public, but the test suite needs to call them. sub get_body_from_msg { my ($self, $msg) = @_; if (!ref $msg) { # I have no idea why this seems to happen. TODO warn "bayes: msg not a ref: '$msg'"; return { }; } my $permsgstatus = Mail::SpamAssassin::PerMsgStatus->new($self->{main}, $msg); $msg->extract_message_metadata ($permsgstatus); my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus); $permsgstatus->finish(); if (!defined $msgdata) { # why?! warn "bayes: failed to get body for ".scalar($self->{msg}->generate_msgid())."\n"; return { }; } return $msgdata; } sub _get_msgdata_from_permsgstatus { my ($self, $pms) = @_; my $t_src = $self->{conf}->{bayes_token_sources}; my $msgdata = { }; $msgdata->{bayes_token_body} = $pms->{msg}->get_visible_rendered_body_text_array() if $t_src->{visible}; $msgdata->{bayes_token_inviz} = $pms->{msg}->get_invisible_rendered_body_text_array() if $t_src->{invisible}; $msgdata->{bayes_mimepart_digests} = $pms->{msg}->get_mimepart_digests() if $t_src->{mimepart}; @{$msgdata->{bayes_token_uris}} = $pms->get_uri_list() if $t_src->{uri}; return $msgdata; } ########################################################################### # The calling functions expect a uniq'ed array of tokens ... sub tokenize { my ($self, $msg, $msgdata) = @_; my $conf = $self->{conf}; my $t_src = $conf->{bayes_token_sources}; $self->{stopword_cache} = (); # visible tokens from the body my @tokens_body; if ($msgdata->{bayes_token_body}) { foreach (@{$msgdata->{bayes_token_body}}) { push(@tokens_body, $self->_tokenize_line ($_, '', 1)); last if scalar @tokens_body >= 50000; } dbg("bayes: tokenized body: %d tokens", scalar @tokens_body); } # the URI list my @tokens_uri; if ($msgdata->{bayes_token_uris}) { foreach (@{$msgdata->{bayes_token_uris}}) { push(@tokens_uri, $self->_tokenize_line ($_, '', 2)); last if scalar @tokens_uri >= 10000; } dbg("bayes: tokenized uri: %d tokens", scalar @tokens_uri); } # add invisible tokens my @tokens_inviz; if ($msgdata->{bayes_token_inviz}) { my $tokprefix; if (ADD_INVIZ_TOKENS_I_PREFIX) { $tokprefix = 'I*:' } if (ADD_INVIZ_TOKENS_NO_PREFIX) { $tokprefix = '' } if (defined $tokprefix) { foreach (@{$msgdata->{bayes_token_inviz}}) { push(@tokens_inviz, $self->_tokenize_line ($_, $tokprefix, 1)); last if scalar @tokens_inviz >= 50000; } } dbg("bayes: tokenized invisible: %d tokens", scalar @tokens_inviz); } # add digests and Content-Type of all MIME parts my @tokens_mimepart; if ($msgdata->{bayes_mimepart_digests}) { my %shorthand = ( # some frequent MIME part contents for human readability 'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/plain'=> 'Empty-Plaintext', 'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/html' => 'Empty-HTML', 'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/xml' => 'Empty-XML', 'adc83b19e793491b1c6ea0fd8b46cd9f32e592fc:text/plain'=> 'OneNL-Plaintext', 'adc83b19e793491b1c6ea0fd8b46cd9f32e592fc:text/html' => 'OneNL-HTML', '71853c6197a6a7f222db0f1978c7cb232b87c5ee:text/plain'=> 'TwoNL-Plaintext', '71853c6197a6a7f222db0f1978c7cb232b87c5ee:text/html' => 'TwoNL-HTML', ); @tokens_mimepart = map('MIME:' . ($shorthand{$_} || $_), @{ $msgdata->{bayes_mimepart_digests} }); dbg("bayes: tokenized mime parts: %d tokens", scalar @tokens_mimepart); dbg("bayes: mime-part token %s", $_) for @tokens_mimepart; } # Tokenize the headers my @tokens_header; if ($t_src->{header}) { my %hdrs = $self->_tokenize_headers ($msg); while( my($prefix, $value) = each %hdrs ) { push(@tokens_header, $self->_tokenize_line ($value, "H$prefix:", 0)); last if scalar @tokens_header >= 10000; } dbg("bayes: tokenized header: %d tokens", scalar @tokens_header); } delete $self->{stopword_cache}; # Go ahead and uniq the array, skip null tokens (can happen sometimes) # generate an SHA1 hash and take the lower 40 bits as our token my %tokens; foreach my $token (@tokens_body, @tokens_uri, @tokens_inviz, @tokens_mimepart, @tokens_header) { # dbg("bayes: token: %s", $token); $tokens{substr(sha1($token), -5)} = $token if $token ne ''; } # return the keys == tokens ... return \%tokens; } sub _tokenize_line { my $self = $_[0]; my $tokprefix = $_[2]; my $region = $_[3]; local ($_) = $_[1]; my $conf = $self->{conf}; my @rettokens; # include quotes, .'s and -'s for URIs, and [$,]'s for Nigerian-scam strings, # and ISO-8859-15 alphas. Do not split on @'s; better results keeping it. # Some useful tokens: "$31,000,000" "www.clock-speed.net" "f*ck" "Hits!" ### (previous:) tr/-A-Za-z0-9,\@\*\!_'"\$.\241-\377 / /cs; ### (now): see Bug 7130 for rationale (slower, but makes UTF-8 chars atomic) s{ ( [A-Za-z0-9,@*!_'"\$. -]+ | [\xC0-\xDF][\x80-\xBF] | [\xE0-\xEF][\x80-\xBF]{2} | [\xF0-\xF4][\x80-\xBF]{3} | [\xA1-\xFF] ) | . } { defined $1 ? $1 : ' ' }xsge; # should we also turn NBSP ( \xC2\xA0 ) into space? # DO split on "..." or "--" or "---"; common formatting error resulting in # hapaxes. Keep the separator itself as a token, though, as long ones can # be good spamsigns. s/(\w)(\.{3,6})(\w)/$1 $2 $3/gs; s/(\w)(\-{2,6})(\w)/$1 $2 $3/gs; if (IGNORE_TITLE_CASE) { if ($region == 1 || $region == 2) { # lower-case Title Case at start of a full-stop-delimited line (as would # be seen in a Western language). s/(?:^|\.\s+)([A-Z])([^A-Z]+)(?:\s|$)/ ' '. (lc $1) . $2 . ' ' /ge; } } my $magic_re = $self->{store}->get_magic_re(); # Note that split() in scope of 'use bytes' results in words with utf8 flag # cleared, even if the source string has perl characters semantics !!! # Is this really still desirable? TOKEN: foreach my $token (split) { $token =~ s/^[-'"\.,]+//; # trim non-alphanum chars at start or end $token =~ s/[-'"\.,]+$//; # so we don't get loads of '"foo' tokens # Skip false magic tokens # TVD: we need to do a defined() check since SQL doesn't have magic # tokens, so the SQL BayesStore returns undef. I really want a way # of optimizing that out, but I haven't come up with anything yet. # next if ( defined $magic_re && $token =~ /$magic_re/o ); # *do* keep 3-byte tokens; there's some solid signs in there my $len = length($token); # but extend the stop-list. These are squarely in the gray # area, and it just slows us down to record them. # See http://wiki.apache.org/spamassassin/BayesStopList for more info. # next if $len < 3; # check stopwords regexp if not cached if (@{$conf->{bayes_stopword_languages}}) { if (!exists $self->{stopword_cache}{$token}) { foreach my $lang (@{$conf->{bayes_stopword_languages}}) { if ($token =~ $self->{bayes_stopword}{$lang}) { dbg("bayes: skipped token '$token' because it's in stopword list for language '$lang'"); $self->{stopword_cache}{$token} = 1; next TOKEN; } } $self->{stopword_cache}{$token} = 0; } else { # bail out if cached known next if $self->{stopword_cache}{$token}; } } # are we in the body? If so, apply some body-specific breakouts if ($region == 1 || $region == 2) { if (CHEW_BODY_MAILADDRS && $token =~ /\S\@\S/i) { push (@rettokens, $self->_tokenize_mail_addrs ($token)); } elsif (CHEW_BODY_URIS && $token =~ /\S\.[a-z]/i) { push (@rettokens, "UD:".$token); # the full token my $bit = $token; while ($bit =~ s/^[^\.]+\.(.+)$/$1/gs) { push (@rettokens, "UD:".$1); # UD = URL domain } } } # note: do not trim down overlong tokens if they contain '*'. This is # used as part of split tokens such as "HTo:D*net" indicating that # the domain ".net" appeared in the To header. # if ($len > $conf->{bayes_max_token_length} && index($token, '*') == -1) { if (TOKENIZE_LONG_8BIT_SEQS_AS_UTF8_CHARS && $token =~ /[\x80-\xBF]{2}/) { # Bug 7135 # collect 3- and 4-byte UTF-8 sequences, ignore 2-byte sequences my(@t) = $token =~ /( (?: [\xE0-\xEF] | [\xF0-\xF4][\x80-\xBF] ) [\x80-\xBF]{2} )/xsg; if (@t) { push (@rettokens, map($tokprefix.'u8:'.$_, @t)); next; } } if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) { # Matt sez: "Could be asian? Autrijus suggested doing character ngrams, # but I'm doing tuples to keep the dbs small(er)." Sounds like a plan # to me! (jm) while ($token =~ s/^(..?)//) { push (@rettokens, $tokprefix.'8:'.$1); } next; } if (($region == 0 && HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS) || ($region == 1 && BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS) || ($region == 2 && URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS)) { # if (TOKENIZE_LONG_TOKENS_AS_SKIPS) # Spambayes trick via Matt: Just retain 7 chars. Do not retain the # length, it does not help; see jm's mail to -devel on Nov 20 2002 at # http://sourceforge.net/p/spamassassin/mailman/message/12977605/ # "sk:" stands for "skip". # Bug 7141: retain seven UTF-8 chars (or other bytes), # if followed by at least two bytes $token =~ s{ ^ ( (?> (?: [\x00-\x7F\xF5-\xFF] | [\xC0-\xDF][\x80-\xBF] | [\xE0-\xEF][\x80-\xBF]{2} | [\xF0-\xF4][\x80-\xBF]{3} | . ){7} )) .{2,} \z }{sk:$1}xs; ## (was:) $token = "sk:".substr($token, 0, 7); # seven bytes } } # decompose tokens? do this after shortening long tokens if ($region == 1 || $region == 2) { if (DECOMPOSE_BODY_TOKENS) { if ($token =~ /[^\w:\*]/) { my $decompd = $token; # "Foo!" $decompd =~ s/[^\w:\*]//gs; push (@rettokens, $tokprefix.$decompd); # "Foo" } if ($token =~ /[A-Z]/) { my $decompd = $token; $decompd = lc $decompd; push (@rettokens, $tokprefix.$decompd); # "foo!" if ($token =~ /[^\w:\*]/) { $decompd =~ s/[^\w:\*]//gs; push (@rettokens, $tokprefix.$decompd); # "foo" } } } } push (@rettokens, $tokprefix.$token); } return @rettokens; } sub _tokenize_headers { my ($self, $msg) = @_; my %parsed; # get headers in array context my @hdrs; my @rcvdlines; for ($msg->get_all_headers()) { # first, keep a copy of Received headers, so we can strip down to last 2 if (/^Received:/i) { push(@rcvdlines, $_); next; } # and now skip lines for headers we don't want (including all Received) next if /^${IGNORED_HDRS}:/i; next if IGNORE_MSGID_TOKENS && /^Message-ID:/i; push(@hdrs, $_); } push(@hdrs, $msg->get_all_metadata()); # and re-add the last 2 received lines: usually a good source of # spamware tokens and HELO names. if ($#rcvdlines >= 0) { push(@hdrs, $rcvdlines[$#rcvdlines]); } if ($#rcvdlines >= 1) { push(@hdrs, $rcvdlines[$#rcvdlines-1]); } for (@hdrs) { next unless /\S/; my ($hdr, $val) = split(/:/, $_, 2); # remove user-specified headers here, after Received, in case they # want to ignore that too next if exists $self->{conf}->{bayes_ignore_header}->{lc $hdr}; # Prep the header value $val ||= ''; chomp($val); # special tokenization for some headers: if ($hdr =~ /^(?:|X-|Resent-)Message-Id$/i) { $val = $self->_pre_chew_message_id ($val); } elsif (PRE_CHEW_ADDR_HEADERS && $hdr =~ /^(?:|X-|Resent-) (?:Return-Path|From|To|Cc|Reply-To|Errors-To|Mail-Followup-To|Sender)$/ix) { $val = $self->_pre_chew_addr_header ($val); } elsif ($hdr eq 'Received') { $val = $self->_pre_chew_received ($val); } elsif ($hdr eq 'Content-Type') { $val = $self->_pre_chew_content_type ($val); } elsif ($hdr eq 'MIME-Version') { $val =~ s/1\.0//; # totally innocuous } elsif ($hdr =~ /^${MARK_PRESENCE_ONLY_HDRS}$/i) { $val = "1"; # just mark the presence, they create lots of hapaxen } elsif ($hdr =~ /^x-spam-relays-(?:external|internal|trusted|untrusted)$/) { # remove redundant rdns helo ident envfrom intl auth msa words $val =~ s/ [a-z]+=/ /g; } if (MAP_HEADERS_MID) { if ($hdr =~ /^(?:In-Reply-To|References|Message-ID)$/i) { if (exists $parsed{"*MI"}) { $parsed{"*MI"} .= " ".$val; } else { $parsed{"*MI"} = $val; } } } if (MAP_HEADERS_FROMTOCC) { if ($hdr =~ /^(?:From|To|Cc)$/i) { if (exists $parsed{"*Ad"}) { $parsed{"*Ad"} .= " ".$val; } else { $parsed{"*Ad"} = $val; } } } if (MAP_HEADERS_USERAGENT) { if ($hdr =~ /^(?:X-Mailer|User-Agent)$/i) { if (exists $parsed{"*UA"}) { $parsed{"*UA"} .= " ".$val; } else { $parsed{"*UA"} = $val; } } } # replace hdr name with "compressed" version if possible if (defined $HEADER_NAME_COMPRESSION{$hdr}) { $hdr = $HEADER_NAME_COMPRESSION{$hdr}; } if (exists $parsed{$hdr}) { $parsed{$hdr} .= " ".$val; } else { $parsed{$hdr} = $val; } } if (would_log('dbg', 'bayes') > 1) { foreach my $hdr (sort keys %parsed) { dbg("bayes: header tokens for $hdr = \"$parsed{$hdr}\""); } } return %parsed; } sub _pre_chew_content_type { my ($self, $val) = @_; # hopefully this will retain good bits without too many hapaxen if ($val =~ s/boundary=[\"\'](.*?)[\"\']/ /ig) { my $boundary = $1; $boundary = '' if !defined $boundary; # avoid a warning $boundary =~ s/[a-fA-F0-9]/H/gs; # break up blocks of separator chars so they become their own tokens $boundary =~ s/([-_\.=]+)/ $1 /gs; $val .= $boundary; } # stop-list words for Content-Type header: these wind up totally gray $val =~ s/\b(?:text|charset)\b/ /g; $val; } sub _pre_chew_message_id { my ($self, $val) = @_; # we can (a) get rid of a lot of hapaxen and (b) increase the token # specificity by pre-parsing some common formats. # Outlook Express format: $val =~ s/<([0-9a-f]{4})[0-9a-f]{4}[0-9a-f]{4}\$ ([0-9a-f]{4})[0-9a-f]{4}\$ ([0-9a-f]{8})\@(\S+)>/ OEA$1 OEB$2 OEC$3 $4 /gx; # Exim: $val =~ s/<[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]\@//; # Sendmail: $val =~ s/<20\d\d[01]\d[0123]\d[012]\d[012345]\d[012345]\d\. [A-F0-9]{10,12}\@//gx; # try to split Message-ID segments on probable ID boundaries. Note that # Outlook message-ids seem to contain a server identifier ID in the last # 8 bytes before the @. Make sure this becomes its own token, it's a # great spam-sign for a learning system! Be sure to split on ".". $val =~ s/[^_A-Za-z0-9]/ /g; $val; } sub _pre_chew_received { my ($self, $val) = @_; # Thanks to Dan for these. Trim out "useless" tokens; sendmail-ish IDs # and valid-format RFC-822/2822 dates $val =~ s/\swith\sSMTP\sid\sg[\dA-Z]{10,12}\s/ /gs; # Sendmail $val =~ s/\swith\sESMTP\sid\s[\dA-F]{10,12}\s/ /gs; # Sendmail $val =~ s/\bid\s[a-zA-Z0-9]{7,20}\b/ /gs; # Sendmail $val =~ s/\bid\s[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]/ /gs; # exim $val =~ s/(?:(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun),\s)? [0-3\s]?[0-9]\s (?:Jan|Feb|Ma[ry]|Apr|Ju[nl]|Aug|Sep|Oct|Nov|Dec)\s (?:19|20)?[0-9]{2}\s [0-2][0-9](?:\:[0-5][0-9]){1,2}\s (?:\s*\(|\)|\s*(?:[+-][0-9]{4})|\s*(?:UT|[A-Z]{2,3}T))* //gx; # IPs: break down to nearest /24, to reduce hapaxes -- EXCEPT for # IPs in the 10 and 192.168 ranges, they gets lots of significant tokens # (on both sides) # also make a dup with the full IP, as fodder for # bayes_dump_to_trusted_networks: "H*r:ip*aaa.bbb.ccc.ddd" $val =~ s{\b(\d{1,3}\.)(\d{1,3}\.)(\d{1,3})(\.\d{1,3})\b}{ if ($2 eq '10' || ($2 eq '192' && $3 eq '168')) { $1.$2.$3.$4. " ip*".$1.$2.$3.$4." "; } else { $1.$2.$3. " ip*".$1.$2.$3.$4." "; } }gex; # trim these: they turn out as the most common tokens, but with a # prob of about .5. waste of space! $val =~ s/\b(?:with|from|for|SMTP|ESMTP)\b/ /g; $val; } sub _pre_chew_addr_header { my ($self, $val) = @_; local ($_); my @addrs = Mail::SpamAssassin::Util::parse_header_addresses($val); my @toks; foreach my $addr (@addrs) { if (defined $addr->{phrase}) { foreach (split(/\s+/, $addr->{phrase})) { push @toks, "N*".$_; # Bug 6319 } } if (defined $addr->{address}) { push @toks, $self->_tokenize_mail_addrs($addr->{address}); } } return join (' ', @toks); } sub _tokenize_mail_addrs { my ($self, $addr) = @_; ($addr =~ /(.+)\@(.+)$/) or return (); my @toks; push(@toks, "U*".$1, "D*".$2); $_ = $2; while (s/^[^\.]+\.(.+)$/$1/gs) { push(@toks, "D*".$1); } return @toks; } ########################################################################### # compute the probability that a token is spammish for each token sub _compute_prob_for_all_tokens { my ($self, $tokensdata, $ns, $nn) = @_; my @probabilities; return if !$ns || !$nn; my $threshold = 1; # ignore low-freq tokens below this s+n threshold if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) { $threshold = 10; } if (!$self->{use_hapaxes}) { $threshold = 2; } foreach my $tokendata (@{$tokensdata}) { my $s = $tokendata->[1]; # spam count my $n = $tokendata->[2]; # ham count my $prob; no warnings 'uninitialized'; # treat undef as zero in addition if ($s + $n >= $threshold) { # ignoring low-freq tokens, also covers the (!$s && !$n) case # my $ratios = $s / $ns; # my $ration = $n / $nn; # $prob = $ratios / ($ration + $ratios); # $prob = ($s * $nn) / ($n * $ns + $s * $nn); # same thing, faster if (USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) { # use Robinson's f(x) equation for low-n tokens, instead of just # ignoring them my $robn = $s + $n; $prob = ($Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X + ($robn * $prob)) / ($Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT + $robn); } } # 'log_raw_counts' is used to log the raw data for the Bayes equations # during a mass-check, allowing the S and X constants to be optimized # quickly without requiring re-tokenization of the messages for each # attempt. There's really no need for this code to be uncommented in # normal use, however. It has never been publicly documented, so # commenting it out is fine. ;) # ## if ($self->{log_raw_counts}) { ## $self->{raw_counts} .= " s=$s,n=$n "; ## } push(@probabilities, $prob); } return \@probabilities; } # compute the probability that a token is spammish sub _compute_prob_for_token { my ($self, $token, $ns, $nn, $s, $n) = @_; # we allow the caller to give us the token information, just # to save a potentially expensive lookup if (!defined($s) || !defined($n)) { ($s, $n, undef) = $self->{store}->tok_get($token); } return if !$s && !$n; my $probabilities_ref = $self->_compute_prob_for_all_tokens([ [$token, $s, $n, 0] ], $ns, $nn); return $probabilities_ref->[0]; } ########################################################################### # If a token is neither hammy nor spammy, return 0. # For a spammy token, return the minimum number of additional ham messages # it would have had to appear in to no longer be spammy. Hammy tokens # are handled similarly. That's what the function does (at the time # of this writing, 31 July 2003, 16:02:55 CDT). It would be slightly # more useful if it returned the number of /additional/ ham messages # a spammy token would have to appear in to no longer be spammy but I # fear that might require the solution to a cubic equation, and I # just don't have the time for that now. sub _compute_declassification_distance { my ($self, $Ns, $Nn, $ns, $nn, $prob) = @_; return 0 if $ns == 0 && $nn == 0; if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {return 0 if ($ns + $nn < 10);} if (!$self->{use_hapaxes}) {return 0 if ($ns + $nn < 2);} return 0 if $Ns == 0 || $Nn == 0; return 0 if abs( $prob - 0.5 ) < $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH; my ($Na,$na,$Nb,$nb) = $prob > 0.5 ? ($Nn,$nn,$Ns,$ns) : ($Ns,$ns,$Nn,$nn); my $p = 0.5 - $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH; return int( 1.0 - 1e-6 + $nb * $Na * $p / ($Nb * ( 1 - $p )) ) - $na unless USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS; my $s = $Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT; my $sx = $Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X; my $a = $Nb * ( 1 - $p ); my $b = $Nb * ( $sx + $nb * ( 1 - $p ) - $p * $s ) - $p * $Na * $nb; my $c = $Na * $nb * ( $sx - $p * ( $s + $nb ) ); my $discrim = $b * $b - 4 * $a * $c; my $disc_max_0 = $discrim < 0 ? 0 : $discrim; my $dd_exact = ( 1.0 - 1e-6 + ( -$b + sqrt( $disc_max_0 ) ) / ( 2*$a ) ) - $na; # This shouldn't be necessary. Should not be < 1 return $dd_exact < 1 ? 1 : int($dd_exact); } ########################################################################### sub _opportunistic_calls { my($self, $journal_only) = @_; # If we're not already tied, abort. if (!$self->{store}->db_readable()) { dbg("bayes: opportunistic call attempt failed, DB not readable"); return; } # Is an expire or sync running? my $running_expire = $self->{store}->get_running_expire_tok(); if ( defined $running_expire && $running_expire+$OPPORTUNISTIC_LOCK_VALID > time() ) { dbg("bayes: opportunistic call attempt skipped, found fresh running expire magic token"); return; } # handle expiry and syncing if (!$journal_only && $self->{store}->expiry_due()) { dbg("bayes: opportunistic call found expiry due"); # sync will bring the DB R/W as necessary, and the expire will remove # the running_expire token, may untie as well. $self->{main}->{bayes_scanner}->sync(1,1); } elsif ( $self->{store}->sync_due() ) { dbg("bayes: opportunistic call found journal sync due"); # sync will bring the DB R/W as necessary, may untie as well $self->{main}->{bayes_scanner}->sync(1,0); # We can only remove the running_expire token if we're doing R/W if ($self->{store}->db_writable()) { $self->{store}->remove_running_expire_tok(); } } return; } ########################################################################### sub learner_new { my ($self) = @_; my $store; my $module = $self->{conf}->{bayes_store_module}; if (!$module) { $module = 'Mail::SpamAssassin::BayesStore::DBM'; } elsif ($module =~ /^([_A-Za-z0-9:]+)$/) { $module = untaint_var($module); } else { die "bayes: invalid module: $module\n"; } dbg("bayes: learner_new self=%s, bayes_store_module=%s", $self,$module); undef $self->{store}; # DESTROYs previous object, if any eval ' require '.$module.'; $store = '.$module.'->new($self); 1; ' or do { my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat; die "bayes: learner_new $module new() failed: $eval_stat\n"; }; dbg("bayes: learner_new: got store=%s", $store); $self->{store} = $store; $self; } ########################################################################### sub bayes_report_make_list { my ($self, $pms, $info, $param) = @_; return "Tokens not available." unless defined $info; my ($limit,$fmt_arg,$more) = split /,/, ($param || '5'); my %formats = ( short => '$t', Short => 'Token: \"$t\"', compact => '$p-$D--$t', Compact => 'Probability $p -declassification distance $D (\"+\" means > 9) --token: \"$t\"', medium => '$p-$D-$N--$t', long => '$p-$d--${h}h-${s}s--${a}d--$t', Long => 'Probability $p -declassification distance $D --in ${h} ham messages -and ${s} spam messages --${a} days old--token:\"$t\"' ); my $raw_fmt = (!$fmt_arg ? '$p-$D--$t' : $formats{$fmt_arg}); return "Invalid format, must be one of: ".join(",",keys %formats) unless defined $raw_fmt; my $fmt = '"'.$raw_fmt.'"'; my $amt = $limit < @$info ? $limit : @$info; return "" unless $amt; my $ns = $pms->{bayes_nspam}; my $nh = $pms->{bayes_nham}; my $digit = sub { $_[0] > 9 ? "+" : $_[0] }; my $now = time; join ', ', map { my($t,$prob,$s,$h,$u) = @$_; my $a = int(($now - $u)/(3600 * 24)); my $d = $self->_compute_declassification_distance($ns,$nh,$s,$h,$prob); my $p = sprintf "%.3f", $prob; my $n = $s + $h; my ($c,$o) = $prob < 0.5 ? ($h,$s) : ($s,$h); my ($D,$S,$H,$C,$O,$N) = map &$digit($_), ($d,$s,$h,$c,$o,$n); eval $fmt; ## no critic } @{$info}[0..$amt-1]; } 1;