1. We will use the files from 'full' folder in this dataset.

  2. We will use Pandas to read the content and categorize them into spam:1 and ham:0

  3. We will clean up the content by removing line endings, tabs, and carriage-return characters. We will also remove email addresses, numbers, and punctuation from the sentences and convert the text to lowercase.

  4. We will use the NLTK library to remove stopwords like 'the', 'had', etc.

  5. We will use NLTK for stemming, which converts various forms of a root word to the root word itself, e.g. 'like' is the root word for 'liked', 'likes', 'liking', etc. We will use the NLTK SnowballStemmer, which handles languages other than English. When handling the English language alone, the Porter stemmer could be used.

  6. We will use TfidfVectorizer from Scikit Learn library to vectorize and create the train, validation samples

  7. We will train and test spam / ham classification using Support Vector Machines and NaiveBayes

import Pandas and other modules for reading data

import pandas as pd
import os
from pathlib import Path

read email data

# Read the corpus index file: each line is "<label> <relative-path>",
# space-separated, with no header row (columns come in as 0 and 1).
data=pd.read_csv("./full/index",sep=' ',header=None)
data.head()
0 1
0 spam ../data/inmail.1
1 ham ../data/inmail.2
2 spam ../data/inmail.3
3 spam ../data/inmail.4
4 spam ../data/inmail.5

create class, filepath and contents columns

# Give the two parsed columns meaningful names, and add an (initially
# empty) 'contents' column that will later hold each email's raw text.
data = data.rename(columns={0: 'class', 1: 'filepath'})
data['contents'] = None
data.head()
class filepath contents
0 spam ../data/inmail.1 None
1 ham ../data/inmail.2 None
2 spam ../data/inmail.3 None
3 spam ../data/inmail.4 None
4 spam ../data/inmail.5 None

Populate the 'contents' column with the email text

import re
import string
    
# Populate the 'contents' column by reading each email file from disk.
# Files are read as raw bytes because the corpus mixes encodings; the
# cleaning step later works on the str() representation of those bytes.
# Fixes: removed a dead `email_text` variable that was assigned but never
# used, and hoisted the cwd lookup out of the loop.
base_dir = os.getcwd()
for i, row in data.iterrows():
    # Index paths look like '../data/inmail.N'; strip the '../' so they
    # resolve relative to the current working directory.
    filepath = os.path.join(base_dir, row['filepath'].replace('../', ''))
    with open(filepath, 'rb') as f:
        email_txt = f.read()
    if i < 2:  # peek at the first two emails for a sanity check
        print(email_txt)
    data.at[i, 'contents'] = email_txt

print(data.head())
print(data.info())
b'From RickyAmes@aol.com  Sun Apr  8 13:07:32 2007\nReturn-Path: <RickyAmes@aol.com>\nReceived: from 129.97.78.23 ([211.202.101.74])\n\tby speedy.uwaterloo.ca (8.12.8/8.12.5) with SMTP id l38H7G0I003017;\n\tSun, 8 Apr 2007 13:07:21 -0400\nReceived: from 0.144.152.6 by 211.202.101.74; Sun, 08 Apr 2007 19:04:48 +0100\nMessage-ID: <WYADCKPDFWWTWTXNFVUE@yahoo.com>\nFrom: "Tomas Jacobs" <RickyAmes@aol.com>\nReply-To: "Tomas Jacobs" <RickyAmes@aol.com>\nTo: the00@speedy.uwaterloo.ca\nSubject: Generic Cialis, branded quality@ \nDate: Sun, 08 Apr 2007 21:00:48 +0300\nX-Mailer: Microsoft Outlook Express 6.00.2600.0000\nMIME-Version: 1.0\nContent-Type: multipart/alternative;\n\tboundary="--8896484051606557286"\nX-Priority: 3\nX-MSMail-Priority: Normal\nStatus: RO\nContent-Length: 988\nLines: 24\n\n----8896484051606557286\nContent-Type: text/html;\nContent-Transfer-Encoding: 7Bit\n\n<html>\n<body bgcolor="#ffffff">\n<div style="border-color: #00FFFF; border-right-width: 0px; border-bottom-width: 0px; margin-bottom: 0px;" align="center">\n<table style="border: 1px; border-style: solid; border-color:#000000;" cellpadding="5" cellspacing="0" bgcolor="#CCFFAA">\n<tr>\n<td style="border: 0px; border-bottom: 1px; border-style: solid; border-color:#000000;">\n<center>\nDo you feel the pressure to perform and not rising to the occasion??<br>\n</center>\n</td></tr><tr>\n<td bgcolor=#FFFF33 style="border: 0px; border-bottom: 1px; border-style: solid; border-color:#000000;">\n<center>\n\n<b><a href=\'http://excoriationtuh.com/?lzmfnrdkleks\'>Try <span>V</span><span>ia<span></span>gr<span>a</span>.....</a></b></center>\n</td></tr><td><center>your anxiety will be a thing of the past and you will<br>\nbe back to your old self.\n</center></td></tr></table></div></body></html>\n\n\n----8896484051606557286--\n\n'
b'From bounce-debian-mirrors=ktwarwic=speedy.uwaterloo.ca@lists.debian.org  Sun Apr  8 13:09:29 2007\nReturn-Path: <bounce-debian-mirrors=ktwarwic=speedy.uwaterloo.ca@lists.debian.org>\nReceived: from murphy.debian.org (murphy.debian.org [70.103.162.31])\n\tby speedy.uwaterloo.ca (8.12.8/8.12.5) with ESMTP id l38H9S0I003031\n\tfor <ktwarwic@speedy.uwaterloo.ca>; Sun, 8 Apr 2007 13:09:28 -0400\nReceived: from localhost (localhost [127.0.0.1])\n\tby murphy.debian.org (Postfix) with QMQP\n\tid 90C152E68E; Sun,  8 Apr 2007 12:09:05 -0500 (CDT)\nOld-Return-Path: <yan.morin@savoirfairelinux.com>\nX-Spam-Checker-Version: SpamAssassin 3.1.4 (2006-07-26) on murphy.debian.org\nX-Spam-Level: \nX-Spam-Status: No, score=-1.1 required=4.0 tests=BAYES_05 autolearn=no \n\tversion=3.1.4\nX-Original-To: debian-mirrors@lists.debian.org\nReceived: from xenon.savoirfairelinux.net (savoirfairelinux.net [199.243.85.90])\n\tby murphy.debian.org (Postfix) with ESMTP id 827432E3E5\n\tfor <debian-mirrors@lists.debian.org>; Sun,  8 Apr 2007 11:52:35 -0500 (CDT)\nReceived: from [192.168.0.101] (bas6-montreal28-1177925679.dsl.bell.ca [70.53.184.47])\n\tby xenon.savoirfairelinux.net (Postfix) with ESMTP id C1223F69B7\n\tfor <debian-mirrors@lists.debian.org>; Sun,  8 Apr 2007 12:52:34 -0400 (EDT)\nMessage-ID: <46191DCE.3020508@savoirfairelinux.com>\nDate: Sun, 08 Apr 2007 12:52:30 -0400\nFrom: Yan Morin <yan.morin@savoirfairelinux.com>\nUser-Agent: Icedove 1.5.0.10 (X11/20070329)\nMIME-Version: 1.0\nTo: debian-mirrors@lists.debian.org\nSubject: Typo in /debian/README\nX-Enigmail-Version: 0.94.2.0\nContent-Type: text/plain; charset=ISO-8859-1\nContent-Transfer-Encoding: 7bit\nX-Rc-Spam: 2007-01-18_01\nX-Rc-Virus: 2006-10-25_01\nX-Rc-Spam: 2007-01-18_01\nResent-Message-ID: <tHOiyB.A.jEC.xGSGGB@murphy>\nResent-From: debian-mirrors@lists.debian.org\nX-Mailing-List: <debian-mirrors@lists.debian.org> \nX-Loop: debian-mirrors@lists.debian.org\nList-Id: <debian-mirrors.lists.debian.org>\nList-Post: 
<mailto:debian-mirrors@lists.debian.org>\nList-Help: <mailto:debian-mirrors-request@lists.debian.org?subject=help>\nList-Subscribe: <mailto:debian-mirrors-request@lists.debian.org?subject=subscribe>\nList-Unsubscribe: <mailto:debian-mirrors-request@lists.debian.org?subject=unsubscribe>\nPrecedence: list\nResent-Sender: debian-mirrors-request@lists.debian.org\nResent-Date: Sun,  8 Apr 2007 12:09:05 -0500 (CDT)\nStatus: RO\nContent-Length: 729\nLines: 26\n\nHi, i\'ve just updated from the gulus and I check on other mirrors.\nIt seems there is a little typo in /debian/README file\n\nExample:\nhttp://gulus.usherbrooke.ca/debian/README\nftp://ftp.fr.debian.org/debian/README\n\n"Testing, or lenny.  Access this release through dists/testing.  The\ncurrent tested development snapshot is named etch.  Packages which\nhave been tested in unstable and passed automated tests propogate to\nthis release."\n\netch should be replace by lenny like in the README.html\n\n\n\n-- \nYan Morin\nConsultant en logiciel libre\nyan.morin@savoirfairelinux.com\n514-994-1556\n\n\n-- \nTo UNSUBSCRIBE, email to debian-mirrors-REQUEST@lists.debian.org\nwith a subject of "unsubscribe". Trouble? Contact listmaster@lists.debian.org\n\n'
  class          filepath                                           contents
0  spam  ../data/inmail.1  b'From RickyAmes@aol.com  Sun Apr  8 13:07:32 ...
1   ham  ../data/inmail.2  b'From bounce-debian-mirrors=ktwarwic=speedy.u...
2  spam  ../data/inmail.3  b'From 7stocknews@tractionmarketing.com  Sun A...
3  spam  ../data/inmail.4  b'From vqucsmdfgvsg@ruraltek.com  Sun Apr  8 1...
4  spam  ../data/inmail.5  b'From dcube@totalink.net  Sun Apr  8 13:19:30...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75419 entries, 0 to 75418
Data columns (total 3 columns):
class       75419 non-null object
filepath    75419 non-null object
contents    75419 non-null object
dtypes: object(3)
memory usage: 1.7+ MB
None

categorize class as spam:1 and ham:0

# Map the string labels onto the numeric classes used for training:
# spam -> 1, ham -> 0.
category = {'spam': 1, 'ham': 0}
data['class'] = [category[label] for label in data['class']]
data.head()
class filepath contents
0 1 ../data/inmail.1 b'From RickyAmes@aol.com Sun Apr 8 13:07:32 ...
1 0 ../data/inmail.2 b'From bounce-debian-mirrors=ktwarwic=speedy.u...
2 1 ../data/inmail.3 b'From 7stocknews@tractionmarketing.com Sun A...
3 1 ../data/inmail.4 b'From vqucsmdfgvsg@ruraltek.com Sun Apr 8 1...
4 1 ../data/inmail.5 b'From dcube@totalink.net Sun Apr 8 13:19:30...

create train, test data from the full set

# Hold out the first 2000 emails for training and the next 500 for testing.
# FIX: the original slices were [0:1999] and [2000:2500], which silently
# dropped row 1999 (Python slices exclude the stop index); [0:2000] closes
# that gap so the two samples partition rows 0..2499 cleanly.
X_train = data['contents'][0:2000].copy()
y_train = data['class'][0:2000]
X_test = data['contents'][2000:2500].copy()
y_test = data['class'][2000:2500]

Import NLTK stopwords

import nltk 
from nltk.corpus import stopwords
# Fetch the corpora needed below (no-ops when already downloaded):
# 'stopwords' for the filter set, 'punkt' for word_tokenize.
nltk.download ('stopwords')

nltk.download ('punkt')
# A set gives O(1) membership tests in removeStopWords.
stop_words=set(stopwords.words("english"))

    
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ravindra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/ravindra/nltk_data...
[nltk_data]   Package punkt is already up-to-date!

import nltk tokenizers, stemmers and add function definitions for processing the email text

from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer,SnowballStemmer

# Patterns compiled once at module level: raw strings avoid the
# invalid-escape-sequence warning the original '\S*@\S*\s?' literal
# triggers, and hoisting re.compile out of the per-email call avoids
# redundant work over 75k emails.
_TAG_RE = re.compile(r'<.*?>')
_EMAIL_RE = re.compile(r'\S*@\S*\s?')
# One translation table strips punctuation and digits in a single pass
# (the original made two separate translate() passes — same result).
_STRIP_TABLE = str.maketrans('', '', string.punctuation + '1234567890')

def cleanText(email_txt):
    """Normalize one raw email (bytes or str) into clean lowercase text.

    Steps, in order: flatten escaped line endings/tabs from the bytes
    repr, strip HTML-like tags, lowercase, replace email addresses with
    the token 'emailAddress', then drop punctuation and digits.
    Returns the cleaned string.
    """
    # str(bytes) yields "b'...'" with literal backslash escapes, hence
    # the '\\n'/'\\r'/'\\t' (two-character) replacements.
    text = str(email_txt).replace('\\n', ' ').replace('\\r', ' ').replace('\\t', ' ')
    text = _TAG_RE.sub('', text).lower()
    text = _EMAIL_RE.sub('emailAddress', text)
    return text.translate(_STRIP_TABLE)

def tokenizeText(text):
    """Split *text* into word tokens using NLTK's tokenizer."""
    return word_tokenize(text)
    
def removeStopWords(text, stopword_set=None):
    """Return the tokens in *text* that are not stop words.

    text: iterable of word tokens.
    stopword_set: optional set of words to filter out; defaults to the
        module-level NLTK English ``stop_words`` set, so existing callers
        are unaffected.
    Returns a list preserving the original token order.
    """
    if stopword_set is None:
        stopword_set = stop_words
    return [word for word in text if word not in stopword_set]


# One shared stemmer instance: the original constructed a SnowballStemmer
# on every call, which is pure overhead across 75k emails.
_STEMMER = SnowballStemmer('english')

def performStemming(text):
    """Stem each token and join the stems into one space-separated string.

    Preserves the original output shape exactly: every stem is followed
    by a trailing space, and an empty token list yields ''.  Built with
    str.join instead of the original quadratic ``+=`` accumulation.
    """
    return ''.join(_STEMMER.stem(word) + ' ' for word in text)

def preprocessText(text):
    """Run the full preprocessing pipeline on one raw email.

    Cleans markup/punctuation/digits, tokenizes, drops stop words, and
    stems; returns the space-joined stemmed string.
    """
    cleaned = cleanText(text)
    tokens = removeStopWords(tokenizeText(cleaned))
    return performStemming(tokens)

apply preprocessing on train , test set

# Run the cleaning/tokenizing/stemming pipeline element-wise over both
# splits (Series.map is equivalent to apply for a plain unary function).
X_train = X_train.map(preprocessText)
X_test = X_test.map(preprocessText)

Use TfidfVectorizer for feature extraction/vectorization and create the train/validation sets using the train_test_split function

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# BUG FIX: the original called TfidfVectorizer("english"), which binds
# "english" to the first positional parameter `input` — NOT `stop_words` —
# so sklearn's stop-word filtering was silently disabled.  Pass it by
# keyword to actually enable the built-in English stop-word list.
vectorizer = TfidfVectorizer(stop_words='english')
# Learn the vocabulary/IDF weights on the training emails only.
features = vectorizer.fit_transform(X_train)
# 70/30 train/validation split; fixed seed for reproducibility.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, y_train, test_size=0.3, random_state=42)

use SVM for training and then test spam / ham classification on validation set

from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# Train a sigmoid-kernel SVM on the TF-IDF features, then score it on the
# held-out validation split.
svm_classifier = SVC(kernel='sigmoid', gamma=1.0)
svm_classifier.fit(features_train, labels_train)
svm_prediction = svm_classifier.predict(features_test)
accuracy_score(labels_test, svm_prediction)
0.9916666666666667

use NaiveBayes to train and then test spam / ham classification on validation set

from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with light smoothing (alpha=0.2), scored on the
# same validation split as the SVM.  `mnb` is reused by the final cell.
mnb = MultinomialNB(alpha=0.2)
mnb.fit(features_train, labels_train)
nb_prediction = mnb.predict(features_test)
accuracy_score(labels_test, nb_prediction)
0.9716666666666667

Get the prediction of NaiveBayes classifier on the test set

# Vectorize the held-out test emails with the vocabulary learned on the
# training set (transform only — no refitting), then score the Naive
# Bayes model on them.
features3 = vectorizer.transform(X_test)
print(features3.shape)
prediction = mnb.predict(features3)
# (the next line is captured notebook output of the print above)
(500, 82430)
accuracy_score(y_test,prediction)
0.986