The Bayesian module is used to initialize,
store and load the database of word probabilities and
blacklisted/whitelisted IP addresses that are used during the filtering
process. The time at which the initialization occurred is also stored. The
'database' is a serialized hash. An abridged example of such a hash is
given below. Note that the :blacklist
IP addresses are
hashified to make searching for common octets faster.
{:timestamp=>Wed Oct 01 10:19:28 -0230 2008, :word_probs=> {"shiny"=>"0.29", ... "thq.com"=>"0.99", "pint."=>"0.99"}, :blacklist=> {33=> {148=>{32=>{205=>1}}, 44=>{159=>{104=>2}}, 72=>{224=>{125=>1}}, 182=>{168=>{171=>1}}, 216=>{83=>{46=>1}}, 227=>{87=>{41=>1}}, 24=>{110=>{137=>1}, 196=>{81=>1}}, 114=>{151=>{225=>1}}, 175=>{160=>{38=>1}}}, ... 201=> {18=> {38=>{166=>2}, 126=>{16=>2}, ... 63=>{172=>1, 58=>3}, 169=>{13=>2}}, ... 92=> {38=>{237=>1}, 82=>{237=>1}, ... 208=>{41=>1}, 142=>{54=>1}}}}, :whitelist=> ["10.100.21.85", "198.165.43.133", ... "207.171.180.180", "69.136.253.7"]}
Load and return the Bayesian database. Raises a Bayesian::DatabaseError exception if there were problems.
# File buryspam.rb, line 4393
# Load, validate, memoize and return the Bayesian database hash.
# Raises DatabaseError when the database file is missing/invalid or when
# any of the required keys (:timestamp, :blacklist, :whitelist,
# :word_probs) is absent or empty.
def db
  # Memoized: subsequent calls return the already-loaded database.
  # Should really be using the 'once' trick used by date.rb stdlib.
  return @db unless @db.nil?
  Logger.debug("Loading Buryspam database.")
  # While filtering, load only the database; ignore the caches.
  @db = Hashbase.load(Config.word_file)
  if @db.nil? || @db.empty?
    raise DatabaseError, "Invalid or missing Bayesian database:\n" +
                         "\t'#{Config.word_file}'.\n" +
                         "(Initialize it with '#{$0} --init')."
  end
  # Nil-safe structure checks: a key missing from the loaded hash must
  # raise the intended DatabaseError, not a NoMethodError on nil.empty?.
  raise DatabaseError, "No timestamp." if @db[:timestamp].nil?
  raise DatabaseError, "No blacklisted IPs." if @db[:blacklist].nil? || @db[:blacklist].empty?
  raise DatabaseError, "No whitelisted IPs." if @db[:whitelist].nil? || @db[:whitelist].empty?
  raise DatabaseError, "No word probabilities." if @db[:word_probs].nil? || @db[:word_probs].empty?
  @db
end
Initialize the Bayesian database. Only one initialization can be performed at a time. Displays benchmark information after initialization.
# File buryspam.rb, line 4369
# Initialize the Bayesian database under an exclusive lock so only one
# initialization runs at a time, then report the elapsed wall-clock time.
# Reports an error instead if another initialization already holds the lock.
def init
  Lockfile.only_one { |_lockfile|
    Spam.move_missed_spam_file
    # Time the rebuild + save so we can display benchmark information.
    elapsed = Benchmark.realtime {
      init_db
      Status.print("  Saving word database: ")
      Hashbase.save(Config.word_file => @db)
      Status.puts("%.2fMiB." % (File.size(Config.word_file).to_f / (1024**2)))
    }
    whole_seconds = elapsed.to_i
    minutes = whole_seconds / 60
    min_part = minutes.zero? ? "" : "#{minutes}m"
    sec_part = whole_seconds % 60
    Status.puts("\nInitialization time: #{min_part}#{sec_part}s")
  }
rescue Lockfile::AlreadyLockedError
  Status.error($!.message)
end
Given a hash of sample words => probabilities, return the Bayesian value of the word probabilities.
# File buryspam.rb, line 4414
# Combine a hash of sample words => probabilities into a single Bayesian
# value: the product of the probabilities divided by the sum of that
# product and the product of their complements.
def value(samples)
  # Fold directly over the (word, probability) pairs; the word itself is
  # irrelevant to the combination, only the probabilities matter.
  spamminess = samples.reduce(1.0) { |acc, (_word, prob)| acc * prob }
  hamminess  = samples.reduce(1.0) { |acc, (_word, prob)| acc * (1.0 - prob) }
  spamminess / (spamminess + hamminess)
end
Given a good/bad index and a cache counts hash parameter accumulate the word counts and ip address counts. Used exclusively by the ::update_db_counters method.
# File buryspam.rb, line 4602
# Fold one cache's per-counter counts (words, IP addresses, ...) into the
# running @db_counters totals. gbi is the good/bad index (position in the
# [good, bad] count pair) and counts is a cache counts hash. Returns the
# number of messages those counts represent. Used exclusively by the
# ::update_db_counters method.
def accumulate(gbi, counts)
  Mbox::COUNTERS.each do |counter|
    totals = @db_counters[counter]
    counts[counter].each do |item, num|
      pair = totals.fetch(item, [0, 0])
      pair[gbi] += num
      totals[item] = pair
    end
  end
  counts[:num_msgs]
end
Partition the ip addresses into good and bad addresses. Create a list of good IP address (whitelist) and create a hashified list of the octets comprising the bad IP addresses (blacklist) Returns the whitelist and blacklist.
# File buryspam.rb, line 4507 def analyze_ips() Status.print("Analyzing IP Addresses: ") s = Status.new("partitioning: ") ipaddrs = @db_counters[:ipaddrs] prog = Progress.new(ipaddrs, :partition) { |ipaddr, counts| counts[Gx].zero? } bad_ips, good_ips = prog.result s.update("good IP list: ") whitelist = [] Progress.new(good_ips, :each) { |ip, counts| whitelist << ip } s.update("bad IP table: ") blacklist = {} Progress.new(bad_ips, :each) { |ip, counts| octets = ip.split(".").map{|i| i.to_i} last_idx = octets.size - 1 b = blacklist octets.each_with_index { |oct, idx| b[oct] ||= (idx == last_idx ? counts[Bx] : {}) b = b[oct] } } s.finish Status.puts("%d/%d good/bad IP addresses added." % [good_ips.size, bad_ips.size]) return whitelist, blacklist end
After filtering out words that are to be ignored or that are outside the word length specified in the configuration, calculate the Bayesian probabilities of each remaining word and store the words and corresponding probabilities in a hash.
# File buryspam.rb, line 4545 def calculate_probs Status.print(" Word probabilities: ") probs = {} num_good_msgs = @db_counters[:num_msgs][Gx] num_bad_msgs = @db_counters[:num_msgs][Bx] # Retrieve these values from Config so that we are not # doing a lot of redundant hash lookups in the loop below. ignore_words = Config.ignore_words word_length = Config.word_length min_word_num = Config.min_word_num good_init_weight = Config.good_init_weight bad_init_weight = Config.bad_init_weight ignore_probs = Config.ignore_probs bad_prob = Config.bad_prob good_prob = Config.good_prob precision = Config.precision good_words = bad_words = 0 s = Status.new("preprocessing...") words = @db_counters[:words].to_a s.update("filtering: ") Progress.new(words, :delete_if) { |word, counts| ignore_words =~ word || ! (word_length === word.length) } s.update("calculating: ") Progress.new(words, :each) { |word, counts| good, bad = counts[Gx], counts[Bx] g = good_init_weight * good b = bad_init_weight * bad next if g + b < min_word_num gf = g.to_f / num_good_msgs; gf = 1.0 if gf > 1.0 bf = b.to_f / num_bad_msgs; bf = 1.0 if bf > 1.0 br = bf / (gf + bf) next if ignore_probs === br prob = br > bad_prob ? bad_prob : br < good_prob ? good_prob : br probs[word] = "%.*f" % [precision, prob] if prob < Message::NEUTRAL good_words += 1 elsif prob > Message::NEUTRAL bad_words += 1 end } s.finish Status.puts("%d/%d good/bad word probabilities added." % [good_words, bad_words]) probs end
Low level function to setup and initialize the database hash structure.
# File buryspam.rb, line 4425
# Low-level database build: reset the counters, process every good and bad
# mbox directory, then assemble @db from the analyzed IPs and the
# calculated word probabilities. Raises when no messages were processed.
def init_db
  @db_counters = { :num_msgs => [0, 0], :ipaddrs => {}, :words => {} }
  ORDER.each do |gb|
    process(gb)
  end
  num_bad = @db_counters[:num_msgs][Bx]
  num_good = @db_counters[:num_msgs][Gx]
  total = num_good + num_bad
  raise "No good/bad words?" if total == 0
  Status.print(" Total messages: ")
  Status.puts("%d/%d good/bad messages processed (%d%% spam)" %
              [num_good, num_bad, num_bad / total.to_f * 100])
  whitelist, blacklist = analyze_ips
  @db = {
    :timestamp  => Time.now,
    :blacklist  => blacklist,
    :whitelist  => whitelist,
    :word_probs => calculate_probs
  }
end
Scan over the directories containing either the good or bad mboxes. gb is :good or :bad.
# File buryspam.rb, line 4452
# Scan the directories that hold either the good or the bad mboxes.
# gb is :good or :bad; it selects which Config accessors
# (good_dirs/bad_dirs, good_init_date_range/bad_init_date_range) are
# looked up dynamically. Raises when a configured directory is relative.
def process(gb)
  Status.puts("Processing #{gb} directories:")
  directories = Config.send("#{gb}_dirs")
  @init_date_range = Config.send("#{gb}_init_date_range")
  directories.each do |dir|
    raise "Directory not absolute: '#{dir}'" unless FileUtils.dir_absolute?(dir)
    process_directory(dir, gb)
  end
  Status.puts
end
Scan over the files in dir containing either the good or bad mboxes. gb is :good or :bad.
# File buryspam.rb, line 4469 def process_directory(dir, gb) unless File.directory?(dir) warning = "'#{dir}' directory does not exist." Status.warn(" WARNING: #{warning}") return end Status.puts(" #{dir}:") no_mboxes = true Dir[File.join(dir, '*')].sort.each { |filename| next unless File.file?(filename) Status.print(" %20.20s : " % File.basename(filename)) unless Mbox.is_valid?(filename) Status.puts("Not a valid mbox.") next end if Config.ignore_mboxes =~ filename Status.puts("Mbox ignored.") next end no_mboxes = false cache = status = nil time = Benchmark.realtime { cache = Cache.new(gb, filename, @init_date_range) status = update_db_counters(gb, cache) } Status.puts(status.nil? ? "No messages in range." : status + " %4.1fs (%.2f ms/msg)" % [time.to_f, time.to_f / cache[:num_msgs] * 1e3]) } Status.warn(" No mbox files found in #{dir}") if no_mboxes end
Update the number of words, IP addresses and messages.
# File buryspam.rb, line 4614 def update_db_counters(gb, cache) return if cache.counts.nil? status = Status.new("updating counters...") gbi = ORDER.index(gb) if cache[:count_type] == :total num_msgs = accumulate(gbi, cache.counts[:total]) else num_msgs = 0 cache.counts.each { |time, counts| next unless @init_date_range.cover?(time) num_msgs += accumulate(gbi, counts) } end some = all = "" if num_msgs == cache[:num_msgs] all = "(full cache)" else some = "/#{cache[:num_msgs]}" end @db_counters[:num_msgs][gbi] += num_msgs status.finish return if num_msgs.zero? "%5d#{some} message%s used #{all}".pluralize(num_msgs, " ") end