#!/usr/bin/perl -w
# sample_bayesian.pl
# Sample Bayesian classifier to filter spam and nonspam messages
# (c) 2007 Srilatha Attaluri, Sean Gilpin

use strict;
use warnings;

# Global model state shared by the subs below.
our %spam_dict;          # word => count; train() replaces counts with count/total
our %nspam_dict;         # word => count; train() replaces counts with count/total
our $prior_prob_spam;    # set by calculateSpamPriorProb()

# countFrequency($choice, @emailWords)
# Adds one occurrence of every non-empty word to %spam_dict when $choice == 1,
# otherwise to %nspam_dict.  Returns nothing.
sub countFrequency {
    my ($choice, @emailWords) = @_;
    my $dict = ($choice == 1) ? \%spam_dict : \%nspam_dict;

    for my $word (@emailWords) {
        next if $word eq "";
        $dict->{$word}++;
    }
    return;
}

# readFile($file)
# Returns the list of single-space-separated tokens of every line in $file
# (line endings removed by chomp).  Returns the empty list when the file
# cannot be opened or no name was given.
sub readFile {
    my ($file) = @_;
    return () unless defined($file) && length($file);

    # Three-argument open: the name may come straight from the user, so it must
    # never be interpreted as a mode or a pipe.
    open(my $in, '<', $file) or return ();    # unreadable file => no words
    chomp(my @lines = <$in>);
    close($in);

    my @words = ();
    push @words, split(/ /, $_) for @lines;
    return @words;
}

# cleanWords(@words)
# Returns a lower-cased copy of @words with web addresses, digits and special
# symbols removed.  The same normalisation is applied to training and to
# unknown messages.
sub cleanWords {
    my @words = map { lc($_) } @_;

    for my $word (@words) {
        # filter web sites
        $word =~ s/(http:\/\/)?(https:\/\/)?(www(\.|-))?(([a-z]|\d)+\d*\.)+([a-z]{1,3})*(\/([a-z]*\d*(\.|_|-|\?|=|&|@)*)*)*//g;
        # filter special symbols and digits.  The tab alternative must be \t+
        # rather than \t*: an alternative that can match the empty string always
        # succeeds, so none of the alternatives after it would ever be tried.
        $word =~ s/#|\d|\.|,|!|:|-|\'|'|;|\?|\"|\*|[(]|[)]|\{|\}|\[|\]|_|\^|\$|`|’|\\|®|<|>|\t+|“|\+|\/|~|\!|%|@|=|`|&|™//g;
    }
    return @words;
}

# train()
# Converts the counts in %spam_dict and %nspam_dict into relative frequencies
# (formatted with 10 decimals).  A dictionary that contains at least one zero
# count first gets 1 added to every count.  Returns the two totals used as
# denominators: (spam total, nonspam total).
sub train {
    my @total = (0, 0);
    my @dicts = (\%spam_dict, \%nspam_dict);

    #TODO:ignore the top 10
    for my $index (0 .. 1) {
        my $dict = $dicts[$index];

        # if there is at least one zero add 1 to all elements in the hash
        my $containsZero = grep { $_ == 0 } values %$dict;
        if ($containsZero) {
            $_++ for values %$dict;
        }

        # The total is needed whether or not smoothing was applied; leaving it
        # at zero would make the division below fail.
        $total[$index] += $_ for values %$dict;
    }

    #calculate the probabilities
    for my $index (0 .. 1) {
        my $dict = $dicts[$index];
        for my $word (keys %$dict) {
            $dict->{$word} = sprintf("%.10f", $dict->{$word} / $total[$index]);
        }
    }
    return @total;
}

# classify(@message_words)
# Returns "spam" when the log-odds of spam (prior plus the contribution of
# every word known to both dictionaries) is positive, "nonspam" otherwise.
sub classify {
    my @message_words = @_;

    # A prior of 0 or 1 makes the log-odds infinite; decide directly instead of
    # calling log(0), which is a fatal error in Perl.
    return "nonspam" if $prior_prob_spam <= 0;
    return "spam"    if $prior_prob_spam >= 1;

    my $crit_value = log($prior_prob_spam) - log(1 - $prior_prob_spam);
    for my $word (@message_words) {
        next unless (exists($spam_dict{$word}) and exists($nspam_dict{$word}));

        my $p_spam  = $spam_dict{$word};
        my $p_nspam = $nspam_dict{$word};
        # A frequency that rounded to 0.0000000000 cannot be passed to log().
        next unless ($p_spam > 0 and $p_nspam > 0);

        $crit_value += log($p_spam) - log($p_nspam);
    }
    return ($crit_value > 0) ? "spam" : "nonspam";
}

# classifyFile($fileName)
# Reads and normalises the words of $fileName and returns classify()'s verdict.
# Returns "" when the file yields no words (missing, unreadable or empty).
sub classifyFile {
    my ($fileName) = @_;
    my @wordsUnknown = readFile($fileName);
    return "" if scalar(@wordsUnknown) == 0;
    return classify(cleanWords(@wordsUnknown));
}

# singleFileMode()
# Prompts for one file name on STDIN and prints its classification, or
# "Bad file name." when classifyFile() returns "".
sub singleFileMode {
    print "\nEnter file name to classified\n: ";
    my $fileName = <STDIN>;
    $fileName = "" unless defined $fileName;    # end of input
    chomp $fileName;

    my $classification = classifyFile($fileName);
    if ($classification eq "") {
        print "Bad file name.\n\n";
    }
    else {
        print "\n", $fileName, " ", $classification, "\n\n";
    }
    return;
}

# batchMode()
# Classifies every file in the current directory matching unknown??.txt
# (exactly two characters between "unknown" and ".txt") and prints one
# "name classification" line per file.
sub batchMode {
    my @unknownFiles = glob('./unknown??.txt');
    print "\n";
    for my $fileName (@unknownFiles) {
        print $fileName, " ", classifyFile($fileName), "\n";
    }
    print "\n";
    return;
}

# initialize()
# Builds the model: counts the words of ./spam/*.txt and ./nonspam/*.txt,
# computes the spam prior, gives both dictionaries the same key set and
# finally calls train().
sub initialize {
    for my $filename (glob('./spam/*.txt')) {
        countFrequency(1, cleanWords(readFile($filename)));    #counts the freq of spam
    }
    for my $filename (glob('./nonspam/*.txt')) {
        countFrequency(2, cleanWords(readFile($filename)));    #counts the freq of non-spam
    }

    #delete nulls from the hashes: drop keys that hold nothing but whitespace.
    # The key list is collected first so the hash is not modified while it is
    # being iterated.
    for my $dict (\%spam_dict, \%nspam_dict) {
        my @blank = grep { !/\S/ } keys %$dict;
        delete @{$dict}{@blank};
    }

    calculateSpamPriorProb();    # Do this before merging arrays

    #merge the arrays: a word seen in only one class gets a zero count in the other
    for my $word (keys %spam_dict) {
        $nspam_dict{$word} = 0 if (not exists($nspam_dict{$word}));
    }
    for my $word (keys %nspam_dict) {
        $spam_dict{$word} = 0 if (not exists($spam_dict{$word}));
    }

    #train the model
    train();
    return;
}

# calculateSpamPriorProb()
# Sets $prior_prob_spam to the share of all counted words that came from spam
# and returns it.  Make sure to call this before merging arrays.  Dies when
# neither dictionary holds any words, because the share is undefined then.
sub calculateSpamPriorProb {
    my $spam_wordcount = 0;
    $spam_wordcount += $_ for values(%spam_dict);

    my $nspam_wordcount = 0;
    $nspam_wordcount += $_ for values(%nspam_dict);

    die "Spam and nonspam dictionaries are both empty\n"
        if ($spam_wordcount + $nspam_wordcount) == 0;

    #Set global variable
    $prior_prob_spam = $spam_wordcount / ($spam_wordcount + $nspam_wordcount);
    return $prior_prob_spam;
}

# main()
# Trains the model, then shows the menu until the user quits or STDIN ends.
sub main {
    initialize();

    while (1) {
        print "****************************************\n";
        print "* *\n";
        print "* Bayesian Classifier Menu *\n";
        print "* *\n";
        print "* Press 1 for single file mode *\n";
        print "* Press 2 for batch classify mode *\n";
        print "* Press q to quit *\n";
        print "* *\n";
        print "****************************************\n";
        print "\n: ";

        my $choice = <STDIN>;
        last unless defined $choice;    # end of input: leave instead of looping forever
        chomp($choice);

        if ($choice eq "1") {
            singleFileMode();
        }
        elsif ($choice eq "2") {
            batchMode();
        }
        elsif (lc($choice) eq "q") {
            last;
        }
        else {
            print "Please enter a valid option from menu. Press <Enter> to continue\n";
            my $ignored = <STDIN>;
        }
    }
    exit;
}

# Start the menu only when run as a script, so the subs above can be loaded
# from another file without starting the interactive loop.
main() unless caller;