The company in question has employees receiving vast numbers of emails every day. The objective of this case study is to classify each email as spam or work-related so that inboxes can be streamlined to include only the important messages.
The methods our team applied to solve this problem are Naive Bayes classification and K-Means clustering. In preparation for modeling, the team parsed each raw email file into a structured form (To, From, Subject, and body text), then applied several natural language processing cleansing techniques before conducting exploratory data analysis. Because the input data are raw email files, this approach let us engineer features from the rich information embedded in the file structure, using the headers as well as the body. A short parsing sketch follows below.
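To illustrate the parsing step, the sketch below shows how a single raw email file can be split into headers and body text with Python's standard email package. The file path is a placeholder for illustration only; the full pipeline over the whole corpus appears in the code further down.

from email import policy
from email.parser import BytesParser

# Hypothetical path, for illustration only
with open("./easy_ham/0001.example", "rb") as fp:
    msg = BytesParser(policy=policy.default).parse(fp)

subject = msg["subject"]   # header fields are available by name
sender = msg["from"]
body = msg.get_body(preferencelist=("plain", "html"))
text = body.get_content() if body is not None else ""
print(subject, sender, text[:80])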
The metrics our team used for this project are the F1 score and the confusion matrix for the Naive Bayes model.
We initially intended to use accuracy and precision, but because the classes are imbalanced we pivoted to the F1 score.
We chose these metrics because they evaluate model performance more reliably under the disproportionate class distribution in our dataset. The F1 score is the harmonic mean of precision and recall, which gives a more holistic view of how well our Naive Bayes model performs.
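As a quick illustration of why F1 suits imbalanced data better than accuracy, the short sketch below computes both on a small made-up set of labels (not from our dataset): the classifier scores a respectable accuracy while its F1 stays low.

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Made-up imbalanced labels, for illustration only (8 ham, 2 spam)
y_true = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1]
y_pred = [0, 0, 0, 0, 0, 0, 1, 0, 1, 0]

p = precision_score(y_true, y_pred)    # TP / (TP + FP) = 0.50
r = recall_score(y_true, y_pred)       # TP / (TP + FN) = 0.50
print(accuracy_score(y_true, y_pred))  # 0.80 -- looks fine despite the misses
print(f1_score(y_true, y_pred))        # 0.50 -- harmonic mean of p and r
print(2 * p * r / (p + r))             # same value, computed by hand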
import os
import pandas as pd
import numpy as np
from email.parser import BytesParser, Parser
from email.policy import default
import email
from html.parser import HTMLParser
from bs4 import BeautifulSoup
from collections import Counter
import re
import plotly.express as px
import nltk
import seaborn as sns
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
import statsmodels.api as sm
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from yellowbrick.model_selection import FeatureImportances
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')
def strip_html(text):
    """Strip HTML tags from raw email text using BeautifulSoup."""
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def to_lowercase(words):
    """Convert all characters to lowercase in a list of tokenized words."""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    """Remove punctuation from a list of tokenized words."""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def remove_stopwords(words):
    """Remove stop words (see stop_list below) from a list of tokenized words."""
    new_words = []
    for word in words:
        if word not in stop_list:
            new_words.append(word)
    return new_words

def lemmatize_list(words):
    """Lemmatize each token as a verb using WordNet."""
    lemmatizer = WordNetLemmatizer()
    new_words = []
    for word in words:
        new_words.append(lemmatizer.lemmatize(word, pos='v'))
    return new_words

def normalize(words):
    """Apply the full cleansing pipeline and rejoin the tokens into a string."""
    words = to_lowercase(words)
    words = remove_punctuation(words)
    words = remove_stopwords(words)
    words = lemmatize_list(words)
    return ' '.join(words)
# Build the stop word list, but keep negation words so they survive cleansing
stop_words = stopwords.words('english')
customlist = ['not', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn',
              "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',
              "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn',
              "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
stop_list = list(set(stop_words) - set(customlist))
spam_list = os.listdir("./spam/")
ham_list = os.listdir("./easy_ham/")

# Read the raw body text of every ham email (Type 0 = ham)
folder = "./easy_ham"
files = os.listdir(folder)
emails = [folder + '/' + file for file in files]
dataStackText = pd.DataFrame()
my_dict_text = {"Words": [], "Type": []}
for email_path in emails:
    with open(email_path, encoding='latin-1') as f:
        blob = f.read()
    my_dict_text["Words"].append(blob)
    my_dict_text['Type'].append(0)
dataStackText = pd.DataFrame(my_dict_text)
# Parse the To, From, and Subject headers of every ham email
dataStack = pd.DataFrame()
my_dict = {"To": [], "From": [], "Subject": []}
for i in ham_list:
    with open("./easy_ham/" + i, 'rb') as fp:
        try:
            headers = BytesParser(policy=default).parse(fp)
            to_text = '{}'.format(headers['to'])
            from_text = '{}'.format(headers['from'])
            subject = '{}'.format(headers['subject'])
            my_dict["To"].append(to_text)
            my_dict["From"].append(from_text)
            my_dict["Subject"].append(subject)
        except Exception:
            continue
dataStack = pd.DataFrame(my_dict)
# Join the ham headers and body text on the positional index
EMAIL_DF = pd.merge(dataStack, dataStackText, how='left', left_index=True, right_index=True)
COMPLETE_EMAIL_DF = EMAIL_DF
COMPLETE_EMAIL_DF.shape
folder=("./spam")
files=os.listdir(folder)
emails=[folder+'/'+file for file in files]
dataStackText = pd.DataFrame()
my_dict_text = {"Words":[], "Type":[]}
words=[]
for email in emails:
f = open(email, encoding='latin-1')
blob = f.read()
my_dict_text["Words"].append(blob)
my_dict_text['Type'].append(1)
dataStackText = pd.DataFrame(my_dict_text)
# Parse the To, From, and Subject headers of every spam email
dataStack = pd.DataFrame()
my_dict = {"To": [], "From": [], "Subject": []}
for i in spam_list:
    with open("./spam/" + i, 'rb') as fp:
        try:
            headers = BytesParser(policy=default).parse(fp)
            to_text = '{}'.format(headers['to'])
            from_text = '{}'.format(headers['from'])
            subject = '{}'.format(headers['subject'])
            my_dict["To"].append(to_text)
            my_dict["From"].append(from_text)
            my_dict["Subject"].append(subject)
        except Exception:
            continue
dataStack = pd.DataFrame(my_dict)
# Join the spam headers and body text, then stack ham and spam together
EMAIL_DF = pd.merge(dataStack, dataStackText, how='left', left_index=True, right_index=True)
# DataFrame.append was removed in pandas 2.x, so concatenate instead
COMPLETE_EMAIL_DF = pd.concat([COMPLETE_EMAIL_DF, EMAIL_DF], ignore_index=True)
COMPLETE_EMAIL_DF.shape
COMPLETE_EMAIL_DF.head()
# Removing any excess html
COMPLETE_EMAIL_DF['Words'] = COMPLETE_EMAIL_DF['Words'].apply(lambda x: strip_html(x))
COMPLETE_EMAIL_DF['Subject'] = COMPLETE_EMAIL_DF['Subject'].apply(lambda x: strip_html(x))
# Tokenizing each column
COMPLETE_EMAIL_DF['Words_norm'] = COMPLETE_EMAIL_DF.apply(lambda row: nltk.word_tokenize(row['Words']), axis=1)
COMPLETE_EMAIL_DF['Subject_norm'] = COMPLETE_EMAIL_DF.apply(lambda row: nltk.word_tokenize(row['Subject']), axis=1)
COMPLETE_EMAIL_DF['To_norm'] = COMPLETE_EMAIL_DF.apply(lambda row: nltk.word_tokenize(row['To']), axis=1)
COMPLETE_EMAIL_DF['From_norm'] = COMPLETE_EMAIL_DF.apply(lambda row: nltk.word_tokenize(row['From']), axis=1)
# normalizing each column
COMPLETE_EMAIL_DF['Subject_norm'] = COMPLETE_EMAIL_DF.apply(lambda row: normalize(row['Subject_norm']), axis=1)
COMPLETE_EMAIL_DF['Words_norm'] = COMPLETE_EMAIL_DF.apply(lambda row: normalize(row['Words_norm']), axis=1)
COMPLETE_EMAIL_DF['To_norm'] = COMPLETE_EMAIL_DF.apply(lambda row: normalize(row['To_norm']), axis=1)
COMPLETE_EMAIL_DF['From_norm'] = COMPLETE_EMAIL_DF.apply(lambda row: normalize(row['From_norm']), axis=1)
COMPLETE_EMAIL_DF.head()
folder=("./spam")
files=os.listdir(folder)
emails=[folder+'/'+file for file in files]
words=[]
for email in emails:
f = open(email, encoding='latin-1')
blob = f.read()
words += blob.split(" ")
words = to_lowercase(words)
words = remove_stopwords(words)
for i in range(len(words)):
if not words[i].isalpha():
words[i]=""
word_dict = Counter(words)
del word_dict[""]
word_dict = word_dict.most_common(1000)
word_dict = [k for k,v in word_dict]
COMPLETE_EMAIL_DF.head()
spam_list_words = word_dict
# Engineer numeric features: spam-vocabulary hits, capital-letter counts,
# character and digit counts in the From header, and sender domain flags
COMPLETE_EMAIL_DF['spam_count_content'] = COMPLETE_EMAIL_DF['Words_norm'].apply(lambda x: sum(i in spam_list_words for i in x.split()))
COMPLETE_EMAIL_DF['spam_count_subject'] = COMPLETE_EMAIL_DF['Subject_norm'].apply(lambda x: sum(i in spam_list_words for i in x.split()))
COMPLETE_EMAIL_DF['subject_cl_count'] = COMPLETE_EMAIL_DF['Subject'].apply(lambda x: sum(1 for c in x if c.isupper()))
COMPLETE_EMAIL_DF['from_cl_count'] = COMPLETE_EMAIL_DF['From'].apply(lambda x: sum(1 for c in x if c.isupper()))
COMPLETE_EMAIL_DF['content_cl_count'] = COMPLETE_EMAIL_DF['Words'].apply(lambda x: sum(1 for c in x if c.isupper()))
COMPLETE_EMAIL_DF['from_ch_count'] = COMPLETE_EMAIL_DF['From'].apply(len)  # character count of the From header
COMPLETE_EMAIL_DF['from_int_count'] = COMPLETE_EMAIL_DF['From'].apply(lambda x: sum(1 for c in x if c.isdigit()))
COMPLETE_EMAIL_DF['from_dotcom'] = COMPLETE_EMAIL_DF['From_norm'].apply(lambda x: x.endswith("com"))
COMPLETE_EMAIL_DF['from_dotedu'] = COMPLETE_EMAIL_DF['From_norm'].apply(lambda x: x.endswith("edu"))
COMPLETE_EMAIL_DF['from_dotus'] = COMPLETE_EMAIL_DF['From_norm'].apply(lambda x: x.endswith("us"))
COMPLETE_EMAIL_DF['is_spam'] = COMPLETE_EMAIL_DF['Type']
data_final = COMPLETE_EMAIL_DF.drop(['To','From','Subject','Type'], axis=1)
data_final
data_final.isnull().sum()
data_final.describe()
columns = list(data_final.select_dtypes('int64'))
data_final[columns].hist(stacked=False, bins=100, figsize=(15,15), layout=(5,3));
data_final.skew()
data_final.groupby(['is_spam'])['spam_count_content'].mean()
data_final.groupby(['is_spam'])['spam_count_subject'].mean()
sns.boxplot(x="is_spam", y="spam_count_subject", data=data_final)
data_final = data_final[data_final['spam_count_subject'] < 20]
sns.boxplot(x="is_spam", y="spam_count_subject", data=data_final)
data_final = data_final[data_final['spam_count_content'] < 300]
sns.boxplot(x="is_spam", y="spam_count_content", data=data_final)
data_final.skew()
model_data = data_final.select_dtypes(exclude = 'object')
model_data.info()
fig = px.scatter_3d(model_data, x='spam_count_subject', y='from_ch_count', z='from_cl_count', color='is_spam', title="Separation of Spam")
fig.update_layout(width = 550, height = 550,margin=dict(l=0, r=0, b=0, t=0))
fig.show()
x = model_data.iloc[:]  # all rows and all columns, used as the clustering input
x
sse = {}
# Fit KMeans and record the sum of squared errors (inertia) for each k
for k in range(1, 10):
    # Initialize KMeans with k clusters
    kmeans = KMeans(n_clusters=k, random_state=1)
    # Fit KMeans on the feature matrix
    kmeans.fit(model_data)
    # Store the sum of squared distances for this k
    sse[k] = kmeans.inertia_
# Plot the elbow curve
plt.figure(figsize=(12, 8))
plt.title('The Elbow Method')
plt.xlabel('k')
plt.ylabel('Sum of squared errors')
sns.pointplot(x=list(sse.keys()), y=list(sse.values()))
plt.show()
# Cluster the emails into two groups; fit_predict fits and returns the labels in one step
# (algorithm='full' is the classic Lloyd iteration, renamed 'lloyd' in newer scikit-learn)
kmeans = KMeans(n_clusters=2, algorithm='full')
identified_clusters = kmeans.fit_predict(x)
identified_clusters
data_with_clusters = model_data.copy()
data_with_clusters['Clusters'] = identified_clusters
data_with_clusters
# Train/test split, then fit a multinomial Naive Bayes classifier
X = data_with_clusters.drop('is_spam', axis=1)
y = data_with_clusters['is_spam']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
clf = MultinomialNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('F1 Score: %.3f' % f1_score(y_test, y_pred))
# Attach the predictions to the (re-indexed) test set for inspection
predictions = pd.DataFrame(y_pred, columns=['Predictions'])
X_test = X_test.reset_index()
results = pd.merge(X_test, predictions, left_index=True, right_index=True)
results.head(25)
cnf_matrix = confusion_matrix(y_test, y_pred)
class_names=[0,1] # name of classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
# Feature importances for the fitted Naive Bayes model (yellowbrick reads the learned coefficients)
viz = FeatureImportances(clf, relative=False)
viz.fit(X_train, y_train)
viz.show()