#!/usr/bin/perl
# Tom Chien
# Yue Chen
# 2008-03-20
# Bayesian Classifier
#
# Trains a naive Bayes spam filter on the messages found in the "nonspam"
# and "spam" sub-directories of the data directory, prints the training
# statistics, and then classifies either one user-named file or every
# file in the data directory whose name contains "unknown".

use strict;
use warnings;

sub initialize ();
sub getfiles ();
sub train ($@);
sub pseudocount ();
sub classify ();
sub menu ();
sub batchclassify ();

our %stopwords = ("" => 1);     # words that are never counted

our $totalcount;                # nonspam + spam word occurrences

our @nonspamfile;               # every line of every nonspam message
our %nonspam;                   # word => occurrence count in nonspam
our %nonspamprob;               # word => relative frequency in nonspam
our @nonspamkey;                # keys of %nonspam
our $nonspamcount = 0;          # total counted word occurrences in nonspam
our $nonspamprior;              # prior probability of nonspam

our @spamfile;                  # every line of every spam message
our %spam;                      # word => occurrence count in spam
our %spamprob;                  # word => relative frequency in spam
our @spamkey;                   # keys of %spam
our $spamcount = 0;             # total counted word occurrences in spam
our $spamprior;                 # prior probability of spam

# Directory holding the "nonspam" and "spam" training directories and the
# "unknown" messages to classify in batch mode.
my $DATA_DIR = "/home/k2006/pw5";

# Amount added to every vocabulary word in both classes by pseudocount ().
my $PSEUDOCOUNT = 2;

getfiles ();
train (1, @nonspamfile);
train (2, @spamfile);
pseudocount ();
initialize ();
classify ();
exit;

#################################################################
# initialize: compute the priors and the per-word frequency tables from
# the (already smoothed) counts, then print the training statistics.
# Dies if either class has no counted words, because the divisions and
# the later log () calls would otherwise fail with an obscure error.
sub initialize ()
{
    unless ($nonspamcount > 0 && $spamcount > 0)
    {
        die "No training words were counted; cannot compute probabilities\n";
    }

    $totalcount   = $nonspamcount + $spamcount;     # total number of words counted
    $nonspamprior = $nonspamcount / $totalcount;    # nonspam prior probability
    $spamprior    = $spamcount / $totalcount;       # spam prior probability

    # frequency tables for the nonspam and spam dictionaries
    $nonspamprob{$_} = $nonspam{$_} / $nonspamcount foreach (@nonspamkey);
    $spamprob{$_}    = $spam{$_} / $spamcount foreach (@spamkey);

    print "Nonspam word count = ", scalar (@nonspamkey), "\n";
    print "Spam word count = ", scalar (@spamkey), "\n";
    printf "Nonspam count = %s, prior probability = %.5f\n", $nonspamcount, $nonspamprior;
    printf "Spam count = %s, prior probability = %.5f\n", $spamcount, $spamprior;
    print "Total word count = $totalcount\n";
}

####################################################################
# getfiles: load every line of every training message into @nonspamfile
# and @spamfile.  A file that cannot be opened is reported and skipped.
sub getfiles ()
{
    my %target = (nonspam => \@nonspamfile, spam => \@spamfile);

    foreach my $class ("nonspam", "spam")
    {
        my $dir = "$DATA_DIR/$class";
        foreach my $name (list_files ($dir))
        {
            my $lines = read_lines ("$dir/$name");
            unless ($lines)
            {
                warn "Cannot open $dir/$name: $!\n";
                next;
            }
            push (@{ $target{$class} }, @{$lines});
        }
    }
}

################################################################
# train: add the words of the given lines to one of the dictionaries.
#   $option - 1 when the data is nonspam, 2 when the data is spam
#             (any other value counts nothing)
#   @lines  - the message lines to count
# Stopwords and words shorter than two characters are ignored.  The key
# lists @nonspamkey and @spamkey are refreshed on every call.
sub train ($@)
{
    # Copy the arguments so the caller's arrays are not modified through
    # the @_ aliases.
    my ($option, @lines) = @_;

    my ($counts, $total) = $option == 1 ? (\%nonspam, \$nonspamcount)
                         : $option == 2 ? (\%spam, \$spamcount)
                         :                ();

    if ($counts)
    {
        foreach my $line (@lines)
        {
            foreach my $word (tokenize ($line))
            {
                $counts->{$word}++;
                ${$total}++;
            }
        }
    }

    @nonspamkey = keys (%nonspam);
    @spamkey    = keys (%spam);
}

#######################################################################
# pseudocount: smooth both dictionaries so that every word seen in either
# class has a non-zero count in both.  Each vocabulary word receives
# $PSEUDOCOUNT extra occurrences in each class, and the class totals grow
# by the same amount, so neither class is favoured by the smoothing.
sub pseudocount ()
{
    my %vocabulary = map { $_ => 1 } (keys (%nonspam), keys (%spam));

    foreach my $word (keys (%vocabulary))
    {
        $nonspam{$word} += $PSEUDOCOUNT;    # creates the entry when missing
        $spam{$word}    += $PSEUDOCOUNT;
        $nonspamcount   += $PSEUDOCOUNT;
        $spamcount      += $PSEUDOCOUNT;
    }

    @nonspamkey = keys (%nonspam);
    @spamkey    = keys (%spam);
}

#######################################################################
# classify: ask the user for a mode, then classify either a single file
# named on standard input (mode 1) or every "unknown" file (mode 2).
# Dies when standard input ends before a valid answer is given or when
# the named file cannot be opened.
sub classify ()
{
    my $choice;

    while (1)
    {
        my $answer = menu ();
        defined ($answer) or die "No mode selected (end of input)\n";
        if ($answer =~ /\A\s*([12])\s*\z/)
        {
            $choice = $1;
            last;
        }
        warn "Choose again\n";
    }

    if ($choice == 2)
    {
        batchclassify ();
        return;
    }

    print "Enter file name: ";
    my $name = <STDIN>;
    defined ($name) or die "No file name given (end of input)\n";
    chomp ($name);

    # read_lines uses three-argument open, so characters such as '|' or
    # '>' in the name cannot turn it into a command or a write.
    my $lines = read_lines ($name) or die "Cannot open $name: $!\n";
    report_verdict ($name, score_lines (@{$lines}));
}

########################################################
# menu: print the mode menu and return the user's answer without its
# newline, or undef when standard input is at end of file.
sub menu ()
{
    print "1. Single-file mode classify\n";
    print "2. Batch classify\n:";

    my $var = <STDIN>;
    chomp ($var) if (defined ($var));
    return $var;
}

#######################################################
# batchclassify: classify, in sorted name order, every plain file in the
# data directory whose name contains "unknown".  A file that cannot be
# opened is reported and skipped.
sub batchclassify ()
{
    my @newdir = sort (grep { m/unknown/ } list_files ($DATA_DIR));

    foreach my $filecount (@newdir)     # iterate through each file and classify
    {
        my $lines = read_lines ("$DATA_DIR/$filecount");
        unless ($lines)
        {
            warn "Cannot open $DATA_DIR/$filecount: $!\n";
            next;
        }
        report_verdict ($filecount, score_lines (@{$lines}));
    }
}

#######################################################
# tokenize: split a line into the lower-cased words that take part in
# training and classification (at least two characters, not a stopword).
sub tokenize
{
    my ($line) = @_;
    return grep { length ($_) > 1 && !exists ($stopwords{$_}) }
           map { lc ($_) }
           split (/\W+/, $line);
}

# score_lines: return (nonspam score, spam score) for the given lines.
# Each score is the log of the class prior plus the log frequency of
# every word that is present in the class's frequency table.
sub score_lines
{
    my @lines = @_;
    my @prob  = (log ($nonspamprior), log ($spamprior));

    foreach my $line (@lines)
    {
        foreach my $word (tokenize ($line))
        {
            $prob[0] += log ($nonspamprob{$word}) if (exists ($nonspamprob{$word}));
            $prob[1] += log ($spamprob{$word}) if (exists ($spamprob{$word}));
        }
    }
    return @prob;
}

# report_verdict: print which class scored higher for $name.  Nothing is
# printed when the two scores are exactly equal.
sub report_verdict
{
    my ($name, $nonspamscore, $spamscore) = @_;
    print "$name is nonspam\n" if ($nonspamscore > $spamscore);
    print "$name is spam\n" if ($spamscore > $nonspamscore);
}

# read_lines: return a reference to the chomped lines of $path, or an
# empty list/undef (with $! set by open) when the file cannot be opened.
sub read_lines
{
    my ($path) = @_;
    open (my $in, '<', $path) or return;
    chomp (my @lines = <$in>);
    close ($in);
    return \@lines;
}

# list_files: return the names of the plain files in $dir.  Filtering
# with -f drops "." and ".." wherever readdir happens to return them.
# Dies when the directory cannot be opened.
sub list_files
{
    my ($dir) = @_;
    opendir (my $dh, $dir) or die "Cannot open directory $dir: $!\n";
    my @names = grep { -f "$dir/$_" } readdir ($dh);
    closedir ($dh);
    return @names;
}