Spam Classifier
Classifying emails as spam / ham (not spam) using email data from the TREC 2007 Public Corpus.
We will use the files from the 'full' folder in this dataset.
We will use pandas to read the contents and encode the categories as spam: 1 and ham: 0.
We will clean up the content by removing line endings, tabs and carriage returns. We will also remove email addresses, numbers and punctuation from the sentences and convert the text to lowercase.
We will use the NLTK library to remove stopwords like 'the', 'had', etc.
We will use NLTK for stemming, which converts the various forms of a root word to the root word itself, e.g. 'like' is the root word for 'liked', 'likes' and 'liking'. We will use the NLTK SnowballStemmer, which handles languages other than English; for English alone, the Porter stemmer could also be used.
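As a quick illustration, here is a minimal sketch of the stemmer applied to the example words above (this snippet is not part of the pipeline itself):

# Demonstration: SnowballStemmer reduces all three forms to the root 'like'
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('english')
print([stemmer.stem(w) for w in ['liked', 'likes', 'liking']])  # -> ['like', 'like', 'like']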
We will use TfidfVectorizer from the scikit-learn library to vectorize the text and create the train and validation samples.
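TF-IDF scores a term higher the more often it appears in a message and lower the more messages it appears in, so words common to every email carry little weight. A minimal sketch on two toy sentences (get_feature_names_out assumes scikit-learn >= 1.0; older versions use get_feature_names):

from sklearn.feature_extraction.text import TfidfVectorizer
toy = TfidfVectorizer()
matrix = toy.fit_transform(["free prize money", "meeting agenda money"])
print(toy.get_feature_names_out())  # learned vocabulary
print(matrix.toarray())             # one row of TF-IDF weights per sentence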
We will train and test spam / ham classification using Support Vector Machines and Naive Bayes.
import pandas as pd
import os

# The index file maps each label (spam/ham) to the path of the raw email file
data = pd.read_csv("./full/index", sep=' ', header=None)
data.head()
data.columns = ['class', 'filepath']
data['contents'] = None
data.head()
import re
import string

# Read each raw email file and store its contents in the dataframe
for i, row in data.iterrows():
    filepath = os.path.join(os.getcwd(), row['filepath'].replace('../', ''))
    with open(filepath, 'rb') as f:
        email_txt = f.read()
    email_txt = str(email_txt)  # bytes -> string (keeps escaped \n, \r, \t for cleanup later)
    if i < 2:
        print(email_txt)
    data.at[i, 'contents'] = email_txt
print(data.head())
print(data.info())
# Encode the labels numerically: spam -> 1, ham -> 0
category = {'spam': 1, 'ham': 0}
data['class'] = [category[item] for item in data['class']]
data.head()
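An optional sanity check before splitting: the TREC 2007 corpus is spam-heavy, so it is worth seeing the actual counts.

# Optional sanity check: how many spam (1) vs ham (0) messages we have
print(data['class'].value_counts())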
# First 2000 messages for train/validation, the next 500 as a held-out test set
X_train = (data['contents'][0:2000]).copy()
y_train = data['class'][0:2000]
X_test = (data['contents'][2000:2500]).copy()
y_test = data['class'][2000:2500]
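One caveat: these slices follow the order of the index file, which in the TREC corpus roughly tracks arrival time, so the class mix can differ between the first 2000 and the next 500 messages. An optional sketch of a shuffled split (the rest of this walkthrough keeps the simple slices):

# Optional: shuffle the rows before slicing so the splits are not tied to file order
shuffled = data.sample(frac=1, random_state=42).reset_index(drop=True)
# X_train = shuffled['contents'][0:2000].copy()  ... and likewise for the other slices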
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer
nltk.download('stopwords')
nltk.download('punkt')  # required by word_tokenize
stop_words = set(stopwords.words("english"))
def cleanText(email_txt):
    # Strip escaped line endings, carriage returns and tabs
    email_txt = str(email_txt).replace('\\n', ' ').replace('\\r', ' ').replace('\\t', ' ')
    # Drop HTML tags and lowercase the text
    clean1 = re.compile('<.*?>')
    email_txt = re.sub(clean1, '', str(email_txt)).lower()
    # Replace email addresses with a placeholder token
    clean2 = re.compile(r'\S*@\S*\s?')
    email_txt = re.sub(clean2, 'emailAddress', email_txt)
    # Remove punctuation and digits
    email_txt = email_txt.translate(str.maketrans('', '', string.punctuation))
    email_txt = email_txt.translate(str.maketrans('', '', '1234567890'))
    return str(email_txt)
def tokenizeText(text):
    return word_tokenize(text)

def removeStopWords(text):
    # Keep only tokens that are not English stopwords
    return [word for word in text if word not in stop_words]
def performStemming(text):
    # Reduce each token to its root form and rebuild a single string
    stemr = SnowballStemmer('english')
    result = ''
    for word in text:
        result += stemr.stem(word) + ' '
    return result

def preprocessText(text):
    text = cleanText(text)
    tokens = tokenizeText(text)
    tokens = removeStopWords(tokens)
    return performStemming(tokens)
X_train = X_train.apply(preprocessText)
X_test = X_test.apply(preprocessText)
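It helps to eyeball one preprocessed message before vectorizing; a quick optional check:

# Optional: print the start of the first cleaned, stemmed message
print(X_train.iloc[0][:300])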
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
vectorizer = TfidfVectorizer(stop_words='english')
features = vectorizer.fit_transform(X_train)
features_train, features_test, labels_train, labels_test = train_test_split(features, y_train, test_size=0.3, random_state=42)
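One subtlety worth noting: the vectorizer above is fitted on all of X_train before the validation split, so validation documents influence the vocabulary and IDF weights. A stricter sketch that fits on the training fold only (variable names here are illustrative):

# Stricter alternative: split the raw text first, then fit the vectorizer on the training fold only
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=42)
strict_vec = TfidfVectorizer(stop_words='english')
features_tr = strict_vec.fit_transform(X_tr)   # learn vocabulary/IDF from the training fold
features_val = strict_vec.transform(X_val)     # apply it unchanged to the validation fold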
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
svc = SVC(kernel='sigmoid', gamma=1.0)
svc.fit(features_train, labels_train)
prediction = svc.predict(features_test)
print(accuracy_score(labels_test, prediction))
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB(alpha=0.2)
mnb.fit(features_train, labels_train)
prediction = mnb.predict(features_test)
print(accuracy_score(labels_test, prediction))
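Accuracy alone can be misleading when the classes are imbalanced; precision and recall on the spam class are more informative. A short sketch using scikit-learn's standard metrics on the Naive Bayes predictions above:

from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(labels_test, prediction))
print(classification_report(labels_test, prediction, target_names=['ham', 'spam']))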
# Vectorize the held-out test messages with the vocabulary learned on the training set
features3 = vectorizer.transform(X_test)
print(features3.shape)
prediction = mnb.predict(features3)
print(accuracy_score(y_test, prediction))