Project: Mailvisa
Code Location: git://repo.or.cz/mailvisa.git master
Browse
/
Download File
add_messages.rb
require 'wordlist'
require 'tokenize'

### Defaults
# Directory that wordlists given without a path component are looked up in.
confdir = ENV['HOME'] + '/settings/mailvisa'
# Wordlist file name and message file names; filled in from the command line.
filename, messages = nil, []
# Intended to control whether X-Spam header lines are left out of the analysis.
$ignorespamheaders = true
# Words counted this many times or fewer are discarded by weed.
$weed_threshold = 1
# Weeding is triggered when the wordlist size is a multiple of this value
# (0 disables weeding).
$weed_count = 100_000

### Functions
# Count the words of every line in stream into words.
#
# words::  hash mapping word => number of occurrences; updated in place
# stream:: any object whose each yields lines (an open file, $stdin, ...)
#
# Lines starting with 'X-Spam' are skipped while $ignorespamheaders is set.
# Each remaining line is split by tokenize (presumably provided by the
# 'tokenize' library required by this file); words longer than 40 characters
# are ignored. Unless $weed_count is 0, weed is called whenever a newly added
# word brings the wordlist size to a multiple of $weed_count.
#
# Returns whatever stream.each returns.
def add_stream words, stream
	stream.each do |line|
		next if $ignorespamheaders && line.start_with?('X-Spam')
		tokenize(line).each do |word|
			next if word.length > 40
			# Remember whether the list grows: only growth may trigger weeding,
			# otherwise a list sitting at a multiple of $weed_count would be
			# weeded again for every repeated word.
			is_new = !words.key?(word)
			words[word] = is_new ? 1 : words[word] + 1
			next unless is_new && $weed_count != 0
			# weed modifies the hash in place, so the caller sees the result
			weed words if words.length % $weed_count == 0
		end
	end
end

# Discard rarely seen words from words, in place.
#
# Every entry whose count is $weed_threshold or lower is removed. If fewer
# than a tenth of $weed_count entries were removed, $weed_count is doubled.
# Progress is reported on standard output.
#
# words:: hash mapping word => count; modified in place
#
# Returns the same hash object that was passed in.
def weed words
	puts 'Weeding...'
	before = words.length
	# delete_if removes entries without mutating the hash from inside each
	words.delete_if { |_word, count| count <= $weed_threshold }
	weeded = before - words.length
	puts "#{weeded} words discarded"
	$weed_count *= 2 if weeded < $weed_count / 10
	words
end

# One-line synopsis; printed for -h and after command line errors.
usage = "USAGE: #{$0} [options] <wordlist> [<message> ...]"

# Option summary; printed below the synopsis for -h.
help = <<'HELP'
Valid options are:

-c <path>	Look for configuration files in <path>
			(default: $HOME/settings/mailvisa)
-i		Include X-Spam headers in analysis
-w <num>	Weed wordlist every <num> words (default: 100000
			use 0 to disable weeding)
-t <num>	Weed words that occur <num> or fewer times (default: 1)
HELP

### Main program

## Process command line
# Options are consumed left to right. The first argument that is not an
# option names the wordlist; everything after it is taken as a message file.
args = ARGV.dup
until args.empty?
	arg = args.shift
	case arg
	when '-h'
		puts usage
		print "\n" + help
		exit
	when '-i'
		# Include X-Spam headers in analysis
		$ignorespamheaders = false
	when '-c', '-w', '-t'
		# These options take a value; refuse to run when it is missing
		value = args.shift
		if value.nil?
			$stderr.puts 'Missing argument for option: ' + arg
			$stderr.puts usage
			exit 0x80
		end
		if arg == '-c'
			confdir = value
		elsif value !~ /\A\d+\z/
			# -w and -t need a non-negative integer
			$stderr.puts "Invalid number for option #{arg}: #{value}"
			$stderr.puts usage
			exit 0x80
		elsif arg == '-w'
			$weed_count = value.to_i
		else
			$weed_threshold = value.to_i
		end
	when /^-/
		$stderr.puts 'Invalid option: ' + arg
		$stderr.puts usage
		exit 0x80
	else
		filename = arg
		messages = args
		break
	end
end

# A wordlist name is mandatory.
unless filename
	$stderr.puts 'No wordlist specified', usage
	exit 0x80
end

# A bare name (one without a slash) is looked up in the configuration directory.
filename = "#{confdir}/#{filename}" unless filename.include?('/')

## Load the existing wordlist, or start an empty one if the file is missing
begin
	# File.open never interprets the name as a command, unlike Kernel#open
	fh = File.open filename
rescue Errno::ENOENT
	# Only a missing file means "start fresh"; any other failure (such as a
	# permission error) is left to abort the run so that an existing wordlist
	# is not overwritten later on.
	$stderr.puts filename + ' not found, will create new file'
	fh = nil
end

if fh
	begin
		$stderr.print "Loading #{filename}..."
		wordlist = load_wordlist fh
	ensure
		# Close the file even when loading fails
		fh.close
	end
	words = wordlist[:words]
	message_count = wordlist[:messages]
	$stderr.puts words.length.to_s + ' words loaded'
else
	words = {}
	message_count = 0
end

## Add the messages named on the command line, or standard input if there are none
if messages.empty?
	add_stream words, $stdin
	message_count += 1
else
	messages.each do |path|
		$stderr.puts "Adding #{path}"
		# File.open never treats the name as a command (Kernel#open does for a
		# leading '|'), and the block form closes the file even if add_stream
		# raises.
		File.open(path) { |message| add_stream words, message }
		message_count += 1
	end
end

## Write the updated wordlist back to the file it was loaded from
wordlist = {
	:messages => message_count,
	:words => words
}

$stderr.print "Writing #{filename}..."
# The block form flushes and closes the file even if dump_wordlist raises;
# File.open never interprets the name as a command, unlike Kernel#open.
File.open(filename, 'w') { |out| dump_wordlist wordlist, out }
$stderr.puts 'done'