In [3]:
# install kaggle package
! pip install -q kaggle

upload kaggle.jsonAPI key

In [4]:
# make folder for api key
! mkdir ~/.kaggle
mkdir: cannot create directory ‘/root/.kaggle’: File exists
In [5]:
# copy key into folder
! cp kaggle.json ~/.kaggle/
In [6]:
# change access permissions
! chmod 600 ~/.kaggle/kaggle.json

for getting data go to kaggle page and ... and copy API command

In [7]:
! kaggle datasets download -d wcukierski/enron-email-dataset
enron-email-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)
In [10]:
! unzip /content/enron-email-dataset.zip
Archive:  /content/enron-email-dataset.zip
replace emails.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
In [9]:
!pip install datashader -q
!pip install -qq -U gensim
In [11]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
% matplotlib inline
import email
import datetime

import nltk
import re 
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from string import punctuation

import spacy
nlp = spacy.load("en_core_web_sm")
import re
import networkx as nx

from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator


import holoviews as hv
from holoviews import opts
hv.extension('bokeh')
from bokeh.plotting import show
kwargs = dict(width=800, height=800, xaxis=None, yaxis=None)
opts.defaults(opts.Nodes(**kwargs), opts.Graph(**kwargs))


from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.models.lsimodel import LsiModel
from gensim.similarities import MatrixSimilarity
from sklearn.decomposition import PCA


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
In [12]:
#Loading the dataset. 
df_emails = pd.read_csv("emails.csv")
#And inspects it: 
df_emails.head()
Out[12]:
file message
0 allen-p/_sent_mail/1. Message-ID: <18782981.1075855378110.JavaMail.e...
1 allen-p/_sent_mail/10. Message-ID: <15464986.1075855378456.JavaMail.e...
2 allen-p/_sent_mail/100. Message-ID: <24216240.1075855687451.JavaMail.e...
3 allen-p/_sent_mail/1000. Message-ID: <13505866.1075863688222.JavaMail.e...
4 allen-p/_sent_mail/1001. Message-ID: <30922949.1075863688243.JavaMail.e...
In [13]:
# Drops rows containing messages without some specified value in the expected locations. 
def standard_format(df, Series, string, slicer):
    rows = []
    for row, message in enumerate(Series):
        message_words = message.split('\n')
        if string not in message_words[slicer]:
            rows.append(row)
    df = df.drop(df.index[rows])
    return df

# Applying the cleansing.
x = len(df_emails.index)
headers = ['Message-ID: ', 'Date: ', 'From: ', 'To: ', 'Subject: ']
for i, v in enumerate(headers):
    df_emails = standard_format(df_emails, df_emails.message, v, i)
df_emails = df_emails.reset_index()
print("Got rid of {} useless emails! That's {}% of the total number of messages in this dataset.".format(x - len(df_emails.index), np.round(((x - len(df_emails.index)) / x) * 100, decimals=2)))
Got rid of 111433 useless emails! That's 21.54% of the total number of messages in this dataset.
In [14]:
# With the function below, we can subtract the information we want from the "message" column. 
def get_field(field, messages):
    column = []
    for message in messages:
        e = email.message_from_string(message)
        column.append(e.get(field))
    return column
In [15]:
# Now using the function above. 
df_emails["date"] = get_field("Date", df_emails["message"])
df_emails["subject"] = get_field("Subject", df_emails["message"])
df_emails["From"] = get_field("From", df_emails["message"])
df_emails["To"] = get_field("To", df_emails["message"])
df_emails["X-Folder"] = get_field("X-Folder", df_emails["message"])
df_emails["X-From"] = get_field("X-From", df_emails["message"])
df_emails["X-To"] = get_field("X-To", df_emails["message"])
df_emails.head(3)
Out[15]:
index file message date subject From To X-Folder X-From X-To
0 0 allen-p/_sent_mail/1. Message-ID: <18782981.1075855378110.JavaMail.e... Mon, 14 May 2001 16:39:00 -0700 (PDT) phillip.allen@enron.com tim.belden@enron.com \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se... Phillip K Allen Tim Belden <Tim Belden/Enron@EnronXGate>
1 1 allen-p/_sent_mail/10. Message-ID: <15464986.1075855378456.JavaMail.e... Fri, 4 May 2001 13:51:00 -0700 (PDT) Re: phillip.allen@enron.com john.lavorato@enron.com \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se... Phillip K Allen John J Lavorato <John J Lavorato/ENRON@enronXg...
2 2 allen-p/_sent_mail/100. Message-ID: <24216240.1075855687451.JavaMail.e... Wed, 18 Oct 2000 03:00:00 -0700 (PDT) Re: test phillip.allen@enron.com leah.arsdall@enron.com \Phillip_Allen_Dec2000\Notes Folders\'sent mail Phillip K Allen Leah Van Arsdall
In [16]:
#Changes date column to datetime format, using the datetime package  
df_emails['date'] = pd.to_datetime(df_emails['date'], infer_datetime_format=True, utc=True)
In [17]:
#The function below extracts the body/actual mail from the "message" column. 

def body(messages):
    column = []
    for message in messages:
        e = email.message_from_string(message)
        column.append(e.get_payload())
    return column

df_emails["body"] = body(df_emails["message"])
In [18]:
#This function extracts the employee name of the sender. 

def employee(file):
    column = []
    for string in file:
        column.append(string.split("/")[0])
    return column

df_emails["employee"] = employee(df_emails["file"])
df_emails.head(3)
Out[18]:
index file message date subject From To X-Folder X-From X-To body employee
0 0 allen-p/_sent_mail/1. Message-ID: <18782981.1075855378110.JavaMail.e... 2001-05-14 23:39:00+00:00 phillip.allen@enron.com tim.belden@enron.com \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se... Phillip K Allen Tim Belden <Tim Belden/Enron@EnronXGate> Here is our forecast\n\n allen-p
1 1 allen-p/_sent_mail/10. Message-ID: <15464986.1075855378456.JavaMail.e... 2001-05-04 20:51:00+00:00 Re: phillip.allen@enron.com john.lavorato@enron.com \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se... Phillip K Allen John J Lavorato <John J Lavorato/ENRON@enronXg... Traveling to have a business meeting takes the... allen-p
2 2 allen-p/_sent_mail/100. Message-ID: <24216240.1075855687451.JavaMail.e... 2000-10-18 10:00:00+00:00 Re: test phillip.allen@enron.com leah.arsdall@enron.com \Phillip_Allen_Dec2000\Notes Folders\'sent mail Phillip K Allen Leah Van Arsdall test successful. way to go!!! allen-p

EDA

In [19]:
#First, we take a look at the number of employees, thats included in the dataset. 
print("Number of employees:",df_emails['employee'].nunique())
Number of employees: 150

We where interested in how many charcheter where used in the emails to get an understand, how there could be so many. By going through the email body we could find the lenght. the first graph show that the density all most of the emails contains few or no words, and the other scale some messages containing a lot for words.

In [20]:
#Maybe include 
df_emails['Message Length'] = df_emails['body'].apply(lambda x: len(x))

sns.distplot(df_emails['Message Length'])
/usr/local/lib/python3.7/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f95058cd7d0>

Then we wanted to see the distribution of different size of characters, by making a range for characters and set a bin size with 100 gave an good indication as the characters decreased

In [21]:
#The for loop below, plots and visualize the length of the emails in the dataset. 

for order_of_magnitude in reversed(range(2,6)):
    max_ = 10**order_of_magnitude
    print("Messages not longer than %i characters:"%max_)
    plt.hist(df_emails.query('`Message Length`<@max_')['Message Length'], bins=100)
    #histplot(email_df.query('`Message Length`<@max_'), x='Message Length')
    plt.show()
Messages not longer than 100000 characters:
Messages not longer than 10000 characters:
Messages not longer than 1000 characters:
Messages not longer than 100 characters:
In [22]:
#The function below shows in which period the emails from the dataset have been sent. 
plt.figure(figsize=(12,6))
ax = df_emails.groupby(df_emails['date'].dt.year)['body'].count().plot()
ax.set_xlabel('Year', fontsize=18)
ax.set_ylabel('N emails', fontsize=18)
ax.set_xlim(1998,2004)
Out[22]:
(1998.0, 2004.0)

From the data set we have we can clearly see on the year email that therre is a ramp up of send emails, for the month graph show the year with the most emails, year 2001. Enron changed their CEO, in february march time, so more emails where send, going of for summer vacation, the amount of emails dropped. During august and october there was a dicussion going on for the employees of enron to buy enron stock and the company was still doing fine. short after this, accorind to the stock price. Enron have lot a lost of money, resulting the decreaseing emails.

In [23]:
# Next, we illustrate the most active email-months. 
# Clearly, theres less emails being send doing the summer. 
plt.figure(figsize=(12,6))
ax = df_emails.groupby(df_emails['date'].dt.month)['body'].count().plot()
ax.set_xlabel('Most active month', fontsize=18)
ax.set_ylabel('N emails', fontsize=18)
ax.set_xticks(range(1,13))
Out[23]:
[<matplotlib.axis.XTick at 0x7f95060e3510>,
 <matplotlib.axis.XTick at 0x7f95060e3a50>,
 <matplotlib.axis.XTick at 0x7f94fbf8a2d0>,
 <matplotlib.axis.XTick at 0x7f94fbf58e50>,
 <matplotlib.axis.XTick at 0x7f95061b0f10>,
 <matplotlib.axis.XTick at 0x7f950609fd50>,
 <matplotlib.axis.XTick at 0x7f950609ffd0>,
 <matplotlib.axis.XTick at 0x7f94fbf585d0>,
 <matplotlib.axis.XTick at 0x7f95060e3690>,
 <matplotlib.axis.XTick at 0x7f95060b2210>,
 <matplotlib.axis.XTick at 0x7f95060b2090>,
 <matplotlib.axis.XTick at 0x7f9505ff8950>]

We wanted to see which of the folder where most used, and this turn out to be Kay Mann's folder this is a person in charge of the legal in Enron, so many of the emails contain information about legal issues.

In [24]:
unique_emails = pd.DataFrame(df_emails['X-Folder'].value_counts())
unique_emails.reset_index(inplace=True)


unique_emails.columns = ['folder_name', 'count']
# Top 20 folders
print(unique_emails.iloc[:10,:])

#Plots the figure 
plt.figure(figsize=(10,6))
sns.barplot(x='count', y='folder_name', data=unique_emails.iloc[:10, :], palette="Blues_d")
plt.title("Top 10 folders")
plt.xlabel("Count")
plt.ylabel("Folder_Name")
plt.show()
                                         folder_name  count
0   \Kay_Mann_June2001_1\Notes Folders\All documents   6081
1  \Vincent_Kaminski_Jun2001_1\Notes Folders\All ...   4635
2   \Tanya_Jones_Dec2000\Notes Folders\All documents   4606
3  \Sara_Shackleton_Dec2000_June2001_1\Notes Fold...   4560
4  \Kay_Mann_June2001_2\Notes Folders\Discussion ...   4405
5            \Kay_Mann_June2001_3\Notes Folders\Sent   4269
6      \Kay_Mann_June2001_4\Notes Folders\'sent mail   4089
7  \Jeff_Dasovich_June2001\Notes Folders\All docu...   3657
8  \Vincent_Kaminski_Jun2001_2\Notes Folders\Disc...   3567
9  \Mark_Taylor _Dec_2000\Notes Folders\All docum...   3378

Then we wanted to find out who is person who send the most emails.

  • Kaminski was the director of reseach, so that's why he send a lot of emails
  • Dasovich was the governmental affairs executive Some of the others ware secretaies and traders

the CEO, Lay, is at only ranked as 20, so he was sending not so much emails

In [25]:
# The plot below shows the most frequent sender. 
top_20 = pd.DataFrame(df_emails['employee'].value_counts()[:20])
top_20.reset_index(inplace=True)
top_20.columns = ["Employee_name", "Counts"]

#Plots the figure
plt.figure(figsize=(10,8))
sns.barplot(y="Employee_name", x="Counts", data=top_20, palette="Blues_d")
plt.title("Top 20 highest email sender employee")
plt.xlabel("Count")
plt.ylabel("Employee_name")
plt.show()

Network

In [26]:
#Setting up the edges
edges = df_emails[["From","To"]]

edges = edges[edges.From != edges.To]
edges.head()
Out[26]:
From To
0 phillip.allen@enron.com tim.belden@enron.com
1 phillip.allen@enron.com john.lavorato@enron.com
2 phillip.allen@enron.com leah.arsdall@enron.com
3 phillip.allen@enron.com randall.gay@enron.com
4 phillip.allen@enron.com greg.piper@enron.com
In [27]:
# Grouping to aggregate multiple co-occurences and to generate a weight: 
# Based on how many times one sender sends a mail to a reciever. 
# Lastly, reset_index makes everytging from a multi-index-series into a dataframe
edges = edges.groupby(['From', 'To']).size().reset_index()
In [28]:
#Renaming the column 0 to weight. 
edges.rename({0:'weight'}, axis = 1, inplace=True)
In [29]:
# Creates network object from pandas edgelist
G = nx.from_pandas_edgelist(edges, source='From', target='To', edge_attr='weight', create_using=nx.Graph())
In [30]:
#Then sets the node attributes. 
nx.set_node_attributes(G, {G.degree(): 'degree'})
In [31]:
# Subset the graph keeping only nodes with degree > 1
G = nx.subgraph(G, [n for n,d in G.degree() if d > 1])

# Here we can calculate different centrality indicators.
centrality_dgr = nx.degree_centrality(G)
centrality_eig = nx.eigenvector_centrality_numpy(G)
degree = G.degree()

# All these indicators can now be set as attribute of the Graph
nx.set_node_attributes(G, centrality_dgr, 'dgr')
nx.set_node_attributes(G, centrality_eig, 'eig')
nx.set_node_attributes(G, dict(degree), 'degree_basic')
In [32]:
# Turns the Graph object (NetworkX) to a Dataframe
nodes_df = pd.DataFrame.from_dict(dict(G.nodes(data=True)), orient='index')
In [33]:
# Since the dataset contains of 150 people, we'll find the top 25 most central people. 
top_10_eig = nodes_df.sort_values('eig', ascending=False)[:10]
In [34]:
#Creates nodes for plot
emails_eig = top_10_eig.eig.index

#Create subset graph
g_sub_emails = nx.subgraph(G,emails_eig)

# Create and save a layout.
g_layout = nx.layout.spring_layout(g_sub_emails) 
g_plot = hv.Graph.from_networkx(g_sub_emails, g_layout).opts(tools=['hover'], node_color='partition')
labels = hv.Labels(g_plot.nodes, ['x', 'y'])

# Makes the plot
from holoviews.operation.datashader import datashade, bundle_graph
bundled = bundle_graph(g_plot)

# Show the plot
show(hv.render(bundled * labels.opts(text_font_size='6pt', text_color='white', bgcolor='lightblue')))

print(top_10_eig[:10])
bokeh.core.validation.check - ERROR - E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name. This could either be due to a misspelling or typo, or due to an expected column being missing. : key "fill_color" value "partition" [renderer: GlyphRenderer(id='1218', ...)]
                                dgr       eig  degree_basic
tana.jones@enron.com       0.059611  0.206684           597
louise.kitchen@enron.com   0.044733  0.199194           448
sara.shackleton@enron.com  0.058612  0.191055           587
mark.taylor@enron.com      0.045931  0.177937           460
sally.beck@enron.com       0.043635  0.140461           437
mark.haedicke@enron.com    0.027659  0.139406           277
vince.kaminski@enron.com   0.060110  0.133938           602
john.lavorato@enron.com    0.029855  0.133050           299
elizabeth.sager@enron.com  0.028557  0.131349           286
richard.sanders@enron.com  0.032651  0.125268           327

NLP

Subject wordcloud

In [35]:
subject_text = " ".join(message for message in df_emails.subject)
print ("There are {} words in the subjects in combination of all mails.".format(len(subject_text)))
There are 11396608 words in the subjects in combination of all mails.
In [36]:
# Create stopword list:
stopwords = set(STOPWORDS)
stopwords.update(["RE", "FW","Fwd","Date","Hour","HourAhead","Reminder","PLEASEREAD"])

# Generate a word cloud image
wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(subject_text)

# Display the generated wordcloud:
plt.figure(figsize=(20,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

Full text

In [44]:
#Takes a sample of 10.000 emails
emails_nlp = nlp(df_emails.body.sample(n=10000,random_state=42).to_string())
In [45]:
# Most mentioned people in the e-mails
persons_entity = [(ent.text, ent.label_) for ent in emails_nlp.ents if ent.label_ == "PERSON" ]
persons_entity = pd.DataFrame(persons_entity,columns=["Name","-"])

print("Most mentioned people in mails")
print()
print(persons_entity.value_counts()[:10])
print()
print("Top 10 central people accoring to Network Analysis")
print(top_10_eig[:10])
Most mentioned people in mails

Name        -     
Jeff        PERSON    47
Steve       PERSON    40
Kay Mann/C  PERSON    37
John        PERSON    35
Mark        PERSON    33
Kay         PERSON    26
Mike        PERSON    26
Mary        PERSON    22
Vince J Ka  PERSON    21
Rick        PERSON    21
dtype: int64

Top 10 central people accoring to Network Analysis
                                dgr       eig  degree_basic
tana.jones@enron.com       0.059611  0.206684           597
louise.kitchen@enron.com   0.044733  0.199194           448
sara.shackleton@enron.com  0.058612  0.191055           587
mark.taylor@enron.com      0.045931  0.177937           460
sally.beck@enron.com       0.043635  0.140461           437
mark.haedicke@enron.com    0.027659  0.139406           277
vince.kaminski@enron.com   0.060110  0.133938           602
john.lavorato@enron.com    0.029855  0.133050           299
elizabeth.sager@enron.com  0.028557  0.131349           286
richard.sanders@enron.com  0.032651  0.125268           327
In [46]:
#Most used adjectives in the emails
adj_entity = [(token.lemma_, token.pos_) for token in emails_nlp if token.pos_ == "ADJ" and not token.is_stop]
adj_entity = pd.DataFrame(adj_entity,columns=["Adjective","-"])

print("Most used adjectives in mails")
print()
print(adj_entity.value_counts()[:10])


# Generate a word cloud image
wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(adj_entity.Adjective.to_string())

# Display the generated image:
# the matplotlib way:
plt.figure(figsize=(20,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
Most used adjectives in mails

Adjective  -  
good       ADJ    167
new        ADJ     90
dear       ADJ     86
great      ADJ     71
late       ADJ     57
sure       ADJ     52
fine       ADJ     48
sorry      ADJ     39
able       ADJ     32
glad       ADJ     27
dtype: int64
In [47]:
# The function below returns a string that is cleaned. 

def clean_text(Series):
  
    result = []
    strings = Series.str.lower()
    
    for string in strings:
        new_string = []
        words = string.split(" ")
        
        for word in words:
            word = word.strip(punctuation) 
            
            if word in stopwords:
                continue
            if re.search(r'[\W\d]',word):
                continue
                
            new_string.append(word)
                
        new_string = " ".join(new_string)
        
        result.append(new_string)
    
    return result
In [48]:
#Next, we'll generate a wordcloud on the "body" of every email.
#And firstly, we create a new dataframe. 
enorn_person_email = df_emails[["employee","body"]]

#Then cleans the data 
enorn_person_email['body'] = clean_text(enorn_person_email['body'])

#Grouping every word a person has mailed into one dataframe. 
df_grouped = enorn_person_email.groupby("employee")['body'].apply(' '.join).reset_index()
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
In [52]:
# Generate a word cloud image
stopwords.update(["CC","subject","forwarded","seeattached","please see","pleasefind","fyi"])
wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(enorn_person_email.body.to_string())

# Display the generated image:
# the matplotlib way:
plt.figure(figsize=(20,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

Unsupervised Machine Learning

In [53]:
#For the UML, we first take a sample, since our Collab cannot run the full dataset. 
uml_emails = df_emails.sample(n=4000,random_state=42)
In [54]:
#We interested to see, if theres a certain pattern in terms of whether the sender is from Enron or not. 
#Therefore, we firstly split the "From" emails by the "@", and afterwards we categorize this into a new column. 
uml_emails["Enron_or_not"] = uml_emails.From.apply(lambda t: t.split("@")[1])
In [55]:
filter = uml_emails["Enron_or_not"] == "enron.com"
In [56]:
uml_emails["Enron_or_not"] = uml_emails["Enron_or_not"].replace("enron.com","Enron")
uml_emails["Enron_or_not"] = uml_emails.Enron_or_not.where(filter,"Not_Enron")
In [57]:
uml_emails[["Enron_or_not","From"]].head()
Out[57]:
Enron_or_not From
286099 Enron joe.quenet@enron.com
328179 Enron sara.shackleton@enron.com
1550 Enron phillip.allen@enron.com
134351 Not_Enron lofeco@ev1.net
377751 Enron justin.boyd@enron.com
In [58]:
#After we've classified the different emails, we create a list of our tokens, called tokens.
#Here, we both use lemmatization and lower the tokens. Moreover, are we only interesseted in a certain types of words.  

tokens = []

for summary in nlp.pipe(uml_emails.body):
  proj_tok = [token.lemma_.lower() for token in summary if token.pos_ in ['NOUN', 'PROPN', 'ADJ', 'ADV'] and not token.is_stop] 
  tokens.append(proj_tok)
In [59]:
#The, we sets the index
uml_emails.index = range(len(uml_emails))

#And takes the tokens back into our sample of emails
uml_emails["tokens"] = tokens
In [60]:
# Create a Dictionary from the emails, called: dictionary
dictionary = Dictionary(uml_emails['tokens'])

# And based on our dictionary, we can construct our corpus. 
corpus = [dictionary.doc2bow(doc) for doc in uml_emails['tokens']]
In [61]:
# We now sets up our Tfidf model, using our corpus from above. 
# Create and fit a new TfidfModel using the corpus: tfidf
tfidf = TfidfModel(corpus)
# Now we can transform the whole corpus
tfidf_corpus = tfidf[corpus]
In [62]:
#We then trains our model
lsi = LsiModel(tfidf_corpus, id2word=dictionary, num_topics=400)

# And our trained model can then be used to transform our corpus
lsi_corpus = lsi[tfidf_corpus]
In [63]:
# Creates the email-topic-matrix
email_topic_matrix = MatrixSimilarity(lsi_corpus)
email_topic_matrix_ix = email_topic_matrix.index
In [64]:
from sklearn.cluster import KMeans
# Creates a "for loop", to determine number of clusters. 
distortions = []
K = range(1,6)
for k in K:
    kmeanModel = KMeans(n_clusters=k)
    kmeanModel.fit(email_topic_matrix_ix)
    distortions.append(kmeanModel.inertia_)


plt.figure(figsize=(16,8))
plt.plot(K, distortions, 'o-')
plt.xlabel('number of clusters, k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal clusters')
plt.show()
In [70]:
reduced = PCA(n_components = 2).fit_transform(email_topic_matrix_ix)

# Sets up the clustrs, which is 2
clusterer = KMeans(n_clusters = 2)
clusterer.fit(email_topic_matrix_ix)

# Plotting things
sns.set_style("darkgrid")

plt.rcParams.update({'font.size': 12})
plt.figure(figsize=(12,12))
g = sns.scatterplot(reduced[:,0],reduced[:,1],
                   hue=uml_emails["Enron_or_not"],
                    palette="Paired",
                   legend='full')

#Illustrating this in a crosstab as well, to get a better picture of the clustering
pd.crosstab(clusterer.labels_, uml_emails['Enron_or_not'])
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
Out[70]:
Enron_or_not Enron Not_Enron
row_0
0 3257 595
1 73 75

Supervised machine learning

In [78]:
#We import a set of fraudlent emails, to mix with our EnronEmails, to train our model
! unzip /content/fradulent_emails.txt.zip

#Reads fraudlent emails:
with open("fradulent_emails.txt", 'r',encoding="latin1") as file:
    fraudlent_mails = file.read()

#And inspects the dataset
print(fraudlent_mails[:5000]) #Clearly, every mail starts with "From r", 
# - this will therefore be used to split the emails into seperate strings.
Archive:  /content/fradulent_emails.txt.zip
replace fradulent_emails.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
From r  Wed Oct 30 21:41:56 2002
Return-Path: <james_ngola2002@maktoob.com>
X-Sieve: cmu-sieve 2.0
Return-Path: <james_ngola2002@maktoob.com>
Message-Id: <200210310241.g9V2fNm6028281@cs.CU>
From: "MR. JAMES NGOLA." <james_ngola2002@maktoob.com>
Reply-To: james_ngola2002@maktoob.com
To: webmaster@aclweb.org
Date: Thu, 31 Oct 2002 02:38:20 +0000
Subject: URGENT BUSINESS ASSISTANCE AND PARTNERSHIP
X-Mailer: Microsoft Outlook Express 5.00.2919.6900 DM
MIME-Version: 1.0
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 8bit
X-MIME-Autoconverted: from quoted-printable to 8bit by sideshowmel.si.UM id g9V2foW24311
Status: O

FROM:MR. JAMES NGOLA.
CONFIDENTIAL TEL: 233-27-587908.
E-MAIL: (james_ngola2002@maktoob.com).

URGENT BUSINESS ASSISTANCE AND PARTNERSHIP.


DEAR FRIEND,

I AM ( DR.) JAMES NGOLA, THE PERSONAL ASSISTANCE TO THE LATE CONGOLESE (PRESIDENT LAURENT KABILA) WHO WAS ASSASSINATED BY HIS BODY GUARD ON 16TH JAN. 2001.


THE INCIDENT OCCURRED IN OUR PRESENCE WHILE WE WERE HOLDING MEETING WITH HIS EXCELLENCY OVER THE FINANCIAL RETURNS FROM THE DIAMOND SALES IN THE AREAS CONTROLLED BY (D.R.C.) DEMOCRATIC REPUBLIC OF CONGO FORCES AND THEIR FOREIGN ALLIES ANGOLA AND ZIMBABWE, HAVING RECEIVED THE PREVIOUS DAY (USD$100M) ONE HUNDRED MILLION UNITED STATES DOLLARS, CASH IN THREE DIPLOMATIC BOXES ROUTED THROUGH ZIMBABWE.

MY PURPOSE OF WRITING YOU THIS LETTER IS TO SOLICIT FOR YOUR ASSISTANCE AS TO BE A COVER TO THE FUND AND ALSO COLLABORATION IN MOVING THE SAID FUND INTO YOUR BANK ACCOUNT THE SUM OF (USD$25M) TWENTY FIVE MILLION UNITED STATES DOLLARS ONLY, WHICH I DEPOSITED WITH A SECURITY COMPANY IN GHANA, IN A DIPLOMATIC BOX AS GOLDS WORTH (USD$25M) TWENTY FIVE MILLION UNITED STATES DOLLARS ONLY FOR SAFE KEEPING IN A SECURITY VAULT FOR ANY FURTHER INVESTMENT PERHAPS IN YOUR COUNTRY. 

YOU WERE INTRODUCED TO ME BY A RELIABLE FRIEND OF MINE WHO IS A TRAVELLER,AND ALSO A MEMBER OF CHAMBER OF COMMERCE AS A RELIABLE AND TRUSTWORTHY PERSON WHOM I CAN RELY ON AS FOREIGN PARTNER, EVEN THOUGH THE NATURE OF THE TRANSACTION WAS NOT REVEALED TO HIM FOR SECURITY REASONS.


THE (USD$25M) WAS PART OF A PROCEEDS FROM DIAMOND TRADE MEANT FOR THE LATE PRESIDENT LAURENT KABILA WHICH WAS DELIVERED THROUGH ZIMBABWE IN DIPLOMATIC BOXES. THE BOXES WERE KEPT UNDER MY CUSTODY BEFORE THE SAD EVENT THAT TOOK THE LIFE OF (MR. PRESIDENT).THE CONFUSION THAT ENSUED AFTER THE ASSASSINATION AND THE SPORADIC SHOOTING AMONG THE FACTIONS, I HAVE TO RUN AWAY FROM THE COUNTRY FOR MY DEAR LIFE AS I AM NOT A SOLDIER BUT A CIVIL SERVANT I CROSSED RIVER CONGO TO OTHER SIDE OF CONGO LIBREVILLE FROM THERE I MOVED TO THE THIRD COUNTRY GHANA WHERE I AM PRESENTLY TAKING REFUGE. 

AS A MATTER OF FACT, WHAT I URGENTLY NEEDED FROM YOU IS YOUR ASSISTANCE IN MOVING THIS MONEY INTO YOUR ACCOUNT IN YOUR COUNTRY FOR INVESTMENT WITHOUT RAISING EYEBROW. FOR YOUR ASSISTANCE I WILL GIVE YOU 20% OF THE TOTAL SUM AS YOUR OWN SHARE WHEN THE MONEY GETS TO YOUR ACCOUNT, WHILE 75% WILL BE FOR ME, OF WHICH WITH YOUR KIND ADVICE I HOPE TO INVEST IN PROFITABLE VENTURE IN YOUR COUNTRY IN OTHER TO SETTLE DOWN FOR MEANINGFUL LIFE, AS I AM TIRED OF LIVING IN A WAR ENVIRONMENT. 

THE REMAINING 5% WILL BE USED TO OFFSET ANY COST INCURRED IN THE CAUSE OF MOVING THE MONEY TO YOUR ACCOUNT. IF THE PROPOSAL IS ACCEPTABLE TO YOU PLEASE CONTACT ME IMMEDIATELY THROUGH THE ABOVE TELEPHONE AND E-MAIL, TO ENABLE ME ARRANGE FACE TO FACE MEETING WITH YOU IN GHANA FOR THE CLEARANCE OF THE FUNDS BEFORE TRANSFRING IT TO YOUR BANK ACCOUNT AS SEEING IS BELIEVING. 

FINALLY, IT IS IMPORTANT ALSO THAT I LET YOU UNDERSTAND THAT THERE IS NO RISK INVOLVED WHATSOEVER AS THE MONEY HAD NO RECORD IN KINSHASA FOR IT WAS MEANT FOR THE PERSONAL USE OF (MR. PRESIDEND ) BEFORE THE NEFARIOUS INCIDENT OCCURRED, AND ALSO I HAVE ALL THE NECESSARY DOCUMENTS AS REGARDS TO THE FUNDS INCLUDING THE (CERTIFICATE OF DEPOSIT), AS I AM THE DEPOSITOR OF THE CONSIGNMENT.


LOOKING FORWARD TO YOUR URGENT RESPONSE.

YOUR SINCERELY,

MR. JAMES NGOLA. 










From r  Thu Oct 31 08:11:39 2002
Return-Path: <bensul2004nng@spinfinder.com>
X-Sieve: cmu-sieve 2.0
Return-Path: <bensul2004nng@spinfinder.com>
Message-Id: <200210311310.g9VDANt24674@bloodwork.mr.itd.UM>
From: "Mr. Ben Suleman" <bensul2004nng@spinfinder.com>
Date: Thu, 31 Oct 2002 05:10:00
To: R@M
Subject: URGENT ASSISTANCE /RELATIONSHIP (P)
MIME-Version: 1.0
Content-Type: text/plain;charset="iso-8859-1"
Content-Transfer-Encoding: 7bit
Status: O

Dear Friend,

I am Mr. Ben Suleman a custom officer and work as Assistant controller of the Customs and Excise department Of the Federal Ministry of Internal Affairs stationed at the Murtala Mohammed International Airport, Ikeja, Lagos-Nigeria.

After the sudden death of the former Head of state of Nigeria General Sanni Abacha on June 8th 1998 his aides and immediate members of his family were arrested while trying to escape from Nigeria in a Chartered jet to Saudi Arabia with 6 trunk boxes Marked "Diplomatic B
In [81]:
#Then, we use our body function, to extract the body of the emails. 
fraudlent_mails_body = body(fraudlent_mails)

#And afterwards we put it into a new DataFrame.  
fraudlent_mails_body = pd.DataFrame(fraudlent_mails_body,columns={"body"})
fraudlent_mails_body.drop([0,0],inplace=True)
fraudlent_mails_body.head()
Out[81]:
body
1 FROM:MR. JAMES NGOLA.\nCONFIDENTIAL TEL: 233-2...
2 Dear Friend,\n\nI am Mr. Ben Suleman a custom ...
3 FROM HIS ROYAL MAJESTY (HRM) CROWN RULER OF EL...
4 FROM HIS ROYAL MAJESTY (HRM) CROWN RULER OF EL...
5 Dear sir, \n \nIt is with a heart full of hope...
In [82]:
# For the next cleaning part, we define a function to remove punctuation marks and other nonword characters using regex library. 

def reg_expressions(row):
    tokens = []
    try:
        for token in row:
            token = token.lower() 
            token = re.sub(r'[\W\d]', "", token)
            tokens.append(token)
    except:
        token = ""
        tokens.append(token)
    return tokens
In [83]:
def stop_word_removal(row):
    token = [token for token in row if token not in stopwords]
    return token

Bag-of-words model

For the computer to make inferences of the e-mails, it has to be able to interpret the text by making a numerical representation of it. One way to do this is by using something called a "bag-of-words" model. This model simply counts the frequency of word tokens for each email and thereby represents it as a vector of these counts.

In [84]:
#Firstly, we'll take out a random 10.000 sample of the Enron-emails body
EnronEmails = df_emails.body.sample(n=10000,random_state=42)
#Then uses word_tokenizer to tokenize the words
EnronEmails = EnronEmails.apply(lambda w: word_tokenize(w))
#Now, we remove stopwords from the mails
EnronEmails = EnronEmails.apply(lambda w: stop_word_removal(w))
#Then, we use our Reg_expres fucntion to delete punctuation marks and other nonword characters
EnronEmails = EnronEmails.apply(reg_expressions)

Next, we do all the same steps on our "Fraud/spam" mails

In [85]:
SpamEmails = fraudlent_mails_body.body.sample(n=3977).astype(str)
SpamEmails = SpamEmails.apply(lambda w: word_tokenize(w))
SpamEmails = SpamEmails.apply(lambda w: stop_word_removal(w))
SpamEmails = SpamEmails.apply(reg_expressions)
In [86]:
#Lastly, we take a sample of 1000 mails from both Enrom and Spam
nsamples = 1000

SpamEmails = SpamEmails.sample(n=nsamples,random_state=42)
EnronEmails = EnronEmails.sample(n=nsamples,random_state=42)

#Then we contat these, and name the columns: SpamEmails and EnronEmails
concat_mails = pd.concat([SpamEmails,EnronEmails], axis=0).values
#Checks the shape 
concat_mails.shape #Which is correct, since 1000*2 = 4000 ;).
Out[86]:
(2000,)
In [87]:
#The function below assembles a new dataframe containing all the unique words found in the input. 
# - it then counts the word frequency and then returns the new dataframe.

def assemble_bag(data):
    used_tokens = []
    all_tokens = []

    for item in data:
        for token in item:
            if token in all_tokens:
                if token not in used_tokens:
                    used_tokens.append(token)
            else:
                all_tokens.append(token)
    
    df = pd.DataFrame(0, index = np.arange(len(data)), columns = used_tokens)
    
    for i, item in enumerate(data):
        for token in item:
            if token in used_tokens:
                df.iloc[i][token] += 1    
    return df
In [88]:
# We then uses our funtion above, to create a bag-of-words model
EnronSpamBag = assemble_bag(concat_mails)
# This is the list of words in our bag-of-words model
predictors = [column for column in EnronSpamBag.columns]

#And lastly shows the model
EnronSpamBag
Out[88]:
mr zuma barley auditing accounting dept african development bank benin adb will i contact fund end us account republic involved person deceased immediate next kin claims reliable plane transfer customer foreign owner take national we absolute portrayed bonafide beneficiary ... epri tesla abboud rga labs razzledazzle gems uncertain cgcarmic aladdin goodbye brands auctions carnival tourister ubid warehouse ub sportslinecom warranties sta mcfarland corpus mesa flags irvine nsas lafayette minneapolis roswell ju tallahassee dhabi cali colombia lima milan bombay janeiro regency
0 75 3 3 3 3 2 2 2 2 6 6 2 16 9 2 9 4 7 4 2 2 2 7 3 9 8 2 2 2 4 2 3 2 2 2 3 3 2 2 2 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 51 0 0 0 0 0 0 0 0 0 0 0 4 4 0 1 0 2 0 1 0 1 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1995 13 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1996 7 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1997 38 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1998 109 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1 1 1 2 1 2 5 1 1 4 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1999 61 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1

2000 rows × 16508 columns

In [89]:
#Sets up the header, before we can shuffle and mix the data. 
header = ([1]*nsamples)
header.extend(([0]*nsamples))
In [90]:
#This function mixes our data, so we can split it into a training and test set.  
def shuffle_data(data, header):
    p = np.random.permutation(len(header))
    data = data[p,:]
    header = np.asarray(header)[p]
    return data, header
In [91]:
data, header = shuffle_data(EnronSpamBag.values, header)
print(header.shape)
print(data.shape)
(2000,)
(2000, 16508)
In [92]:
# Splits into independent 70% training and 30% testing sets
idx = int(0.7*data.shape[0])

# 70% of data for training
train_x = data[:idx,:]
train_y = header[:idx]
# Remaining 30% for testing
test_x = data[idx:,:]
test_y = header[idx:]
In [93]:
logreg = LogisticRegression()
logreg.fit(train_x,train_y)
/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
Out[93]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
In [94]:
#Uses the Logistic Regression to predict
y_pred = logreg.predict(test_x)

#Evaluates the score
print("The logistic regression accuracy score is:")
print(accuracy_score(test_y,y_pred))
The logistic regression accuracy score is:
0.9916666666666667
In [ ]:
!jupyter nbconvert --to html ""