# install kaggle package
! pip install -q kaggle
Upload the kaggle.json API key (downloaded from your Kaggle account page).
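Since the notebook runs in Colab, here is a minimal sketch for uploading the key through the Colab file picker (the google.colab files helper is assumed to be available; locally you would simply place the file in ~/.kaggle yourself):
# Sketch: upload kaggle.json via the Colab file picker (Colab only)
from google.colab import files
files.upload()  # choose kaggle.json from your machine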
# make folder for api key
! mkdir ~/.kaggle
# copy key into folder
! cp kaggle.json ~/.kaggle/
# change access permissions
! chmod 600 ~/.kaggle/kaggle.json
To get the data, go to the dataset's Kaggle page and copy the API command shown there.
! kaggle datasets download -d wcukierski/enron-email-dataset
! unzip /content/enron-email-dataset.zip
!pip install datashader -q
!pip install -qq -U gensim
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import email
import datetime
import nltk
import re
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from string import punctuation
import spacy
nlp = spacy.load("en_core_web_sm")
import networkx as nx
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import holoviews as hv
from holoviews import opts
hv.extension('bokeh')
from bokeh.plotting import show
kwargs = dict(width=800, height=800, xaxis=None, yaxis=None)
opts.defaults(opts.Nodes(**kwargs), opts.Graph(**kwargs))
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.models.lsimodel import LsiModel
from gensim.similarities import MatrixSimilarity
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
#Loading the dataset.
df_emails = pd.read_csv("emails.csv")
#And inspects it:
df_emails.head()
# Drops rows containing messages without some specified value in the expected locations.
def standard_format(df, Series, string, slicer):
    rows = []
    for row, message in enumerate(Series):
        message_words = message.split('\n')
        if string not in message_words[slicer]:
            rows.append(row)
    df = df.drop(df.index[rows])
    return df
# Applying the cleansing.
x = len(df_emails.index)
headers = ['Message-ID: ', 'Date: ', 'From: ', 'To: ', 'Subject: ']
for i, v in enumerate(headers):
df_emails = standard_format(df_emails, df_emails.message, v, i)
df_emails = df_emails.reset_index()
print("Got rid of {} useless emails! That's {}% of the total number of messages in this dataset.".format(x - len(df_emails.index), np.round(((x - len(df_emails.index)) / x) * 100, decimals=2)))
# With the function below, we can extract the information we want from the "message" column.
def get_field(field, messages):
    column = []
    for message in messages:
        e = email.message_from_string(message)
        column.append(e.get(field))
    return column
# Now using the function above.
df_emails["date"] = get_field("Date", df_emails["message"])
df_emails["subject"] = get_field("Subject", df_emails["message"])
df_emails["From"] = get_field("From", df_emails["message"])
df_emails["To"] = get_field("To", df_emails["message"])
df_emails["X-Folder"] = get_field("X-Folder", df_emails["message"])
df_emails["X-From"] = get_field("X-From", df_emails["message"])
df_emails["X-To"] = get_field("X-To", df_emails["message"])
df_emails.head(3)
#Changes date column to datetime format, using the datetime package
df_emails['date'] = pd.to_datetime(df_emails['date'], infer_datetime_format=True, utc=True)
#The function below extracts the body/actual mail from the "message" column.
def body(messages):
    column = []
    for message in messages:
        e = email.message_from_string(message)
        column.append(e.get_payload())
    return column
df_emails["body"] = body(df_emails["message"])
#This function extracts the employee name of the sender.
def employee(file):
    column = []
    for string in file:
        column.append(string.split("/")[0])
    return column
df_emails["employee"] = employee(df_emails["file"])
df_emails.head(3)
#First, we take a look at the number of employees that are included in the dataset.
print("Number of employees:",df_emails['employee'].nunique())
We were interested in how many characters the emails contain, to get an understanding of how there could be so many of them. By going through the email bodies we can find their lengths. The first plot shows that most of the emails contain few or no words, while at the other end of the scale some messages contain a lot of text.
df_emails['Message Length'] = df_emails['body'].apply(lambda x: len(x))
sns.distplot(df_emails['Message Length'])
Then we wanted to see the distribution of message sizes in more detail. Plotting histograms over several character ranges, each with a bin size of 100, gives a good indication of how the counts fall off as the messages get longer.
#The for loop below, plots and visualize the length of the emails in the dataset.
for order_of_magnitude in reversed(range(2,6)):
    max_ = 10**order_of_magnitude
    print("Messages not longer than %i characters:" % max_)
    plt.hist(df_emails.query('`Message Length` < @max_')['Message Length'], bins=100)
    #histplot(email_df.query('`Message Length`<@max_'), x='Message Length')
    plt.show()
#The plot below shows in which period the emails from the dataset were sent.
plt.figure(figsize=(12,6))
ax = df_emails.groupby(df_emails['date'].dt.year)['body'].count().plot()
ax.set_xlabel('Year', fontsize=18)
ax.set_ylabel('N emails', fontsize=18)
ax.set_xlim(1998,2004)
From the yearly plot we can clearly see a ramp-up in the number of emails sent. The monthly plot highlights the year with the most emails, 2001. Enron changed its CEO around February/March, so more emails were sent, and when people went on summer vacation the volume dropped. Between August and October there was an ongoing discussion about Enron employees buying Enron stock while the company still appeared to be doing fine, judging by the stock price. Shortly after this, Enron lost a lot of money, which is reflected in the decreasing number of emails.
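As a quick check of that reading, here is a minimal sketch (relying only on the date and body columns built above) that restricts the monthly counts to 2001:
# Sketch: monthly email counts for 2001 only, to inspect the spring peak and summer dip
emails_2001 = df_emails[df_emails['date'].dt.year == 2001]
ax = emails_2001.groupby(emails_2001['date'].dt.month)['body'].count().plot(kind='bar')
ax.set_xlabel('Month (2001)', fontsize=18)
ax.set_ylabel('N emails', fontsize=18)
plt.show()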
# Next, we illustrate the most active email-months.
# Clearly, there are fewer emails being sent during the summer.
plt.figure(figsize=(12,6))
ax = df_emails.groupby(df_emails['date'].dt.month)['body'].count().plot()
ax.set_xlabel('Most active month', fontsize=18)
ax.set_ylabel('N emails', fontsize=18)
ax.set_xticks(range(1,13))
We wanted to see which folders were used the most, and the most used turned out to be Kay Mann's folder. Kay Mann was in charge of legal matters at Enron, so many of these emails contain information about legal issues.
unique_emails = pd.DataFrame(df_emails['X-Folder'].value_counts())
unique_emails.reset_index(inplace=True)
unique_emails.columns = ['folder_name', 'count']
# Top 10 folders
print(unique_emails.iloc[:10,:])
#Plots the figure
plt.figure(figsize=(10,6))
sns.barplot(x='count', y='folder_name', data=unique_emails.iloc[:10, :], palette="Blues_d")
plt.title("Top 10 folders")
plt.xlabel("Count")
plt.ylabel("Folder_Name")
plt.show()
Then we wanted to find out which person sent the most emails.
The CEO, Lay, is only ranked around 20th, so he did not send that many emails himself.
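As a small, hedged sketch of how that rank could be looked up directly (the key 'lay-k' is an assumption about how Kenneth Lay's folder is named in this dataset; adjust it if the actual name differs):
# Sketch: find where a specific sender sits in the ranking ('lay-k' is an assumed folder name)
sender_counts = df_emails['employee'].value_counts()
print("Rank of 'lay-k' among senders:", sender_counts.index.get_loc('lay-k') + 1)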
# The plot below shows the most frequent sender.
top_20 = pd.DataFrame(df_emails['employee'].value_counts()[:20])
top_20.reset_index(inplace=True)
top_20.columns = ["Employee_name", "Counts"]
#Plots the figure
plt.figure(figsize=(10,8))
sns.barplot(y="Employee_name", x="Counts", data=top_20, palette="Blues_d")
plt.title("Top 20 highest email sender employee")
plt.xlabel("Count")
plt.ylabel("Employee_name")
plt.show()
#Setting up the edges
edges = df_emails[["From","To"]]
edges = edges[edges.From != edges.To]
edges.head()
# Grouping to aggregate multiple co-occurrences and to generate a weight:
# Based on how many times one sender sends a mail to a receiver.
# Lastly, reset_index turns everything from a multi-index Series into a DataFrame.
edges = edges.groupby(['From', 'To']).size().reset_index()
#Renaming the column 0 to weight.
edges.rename({0:'weight'}, axis = 1, inplace=True)
# Creates network object from pandas edgelist
G = nx.from_pandas_edgelist(edges, source='From', target='To', edge_attr='weight', create_using=nx.Graph())
#Then sets the node attributes.
nx.set_node_attributes(G, dict(G.degree()), 'degree')
# Subset the graph keeping only nodes with degree > 1
G = nx.subgraph(G, [n for n,d in G.degree() if d > 1])
# Here we can calculate different centrality indicators.
centrality_dgr = nx.degree_centrality(G)
centrality_eig = nx.eigenvector_centrality_numpy(G)
degree = G.degree()
# All these indicators can now be set as attribute of the Graph
nx.set_node_attributes(G, centrality_dgr, 'dgr')
nx.set_node_attributes(G, centrality_eig, 'eig')
nx.set_node_attributes(G, dict(degree), 'degree_basic')
# Turns the Graph object (NetworkX) to a Dataframe
nodes_df = pd.DataFrame.from_dict(dict(G.nodes(data=True)), orient='index')
# Since the dataset contains 150 employees, we'll find the top 10 most central people.
top_10_eig = nodes_df.sort_values('eig', ascending=False)[:10]
#Creates nodes for plot
emails_eig = top_10_eig.eig.index
#Create subset graph
g_sub_emails = nx.subgraph(G,emails_eig)
# Create and save a layout.
g_layout = nx.layout.spring_layout(g_sub_emails)
g_plot = hv.Graph.from_networkx(g_sub_emails, g_layout).opts(tools=['hover'], node_color='eig', cmap='Blues')  # color nodes by eigenvector centrality
labels = hv.Labels(g_plot.nodes, ['x', 'y'])
# Makes the plot
from holoviews.operation.datashader import datashade, bundle_graph
bundled = bundle_graph(g_plot)
# Show the plot
show(hv.render(bundled * labels.opts(text_font_size='6pt', text_color='white', bgcolor='lightblue')))
print(top_10_eig[:10])
subject_text = " ".join(message for message in df_emails.subject)
print ("There are {} words in the subjects in combination of all mails.".format(len(subject_text)))
# Create stopword list:
stopwords = set(STOPWORDS)
stopwords.update(["RE", "FW","Fwd","Date","Hour","HourAhead","Reminder","PLEASEREAD"])
# Generate a word cloud image
wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(subject_text)
# Display the generated wordcloud:
plt.figure(figsize=(20,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
#Takes a sample of 10.000 emails
emails_nlp = nlp(df_emails.body.sample(n=10000,random_state=42).to_string())
# Most mentioned people in the e-mails
persons_entity = [(ent.text, ent.label_) for ent in emails_nlp.ents if ent.label_ == "PERSON" ]
persons_entity = pd.DataFrame(persons_entity,columns=["Name","-"])
print("Most mentioned people in mails")
print()
print(persons_entity.value_counts()[:10])
print()
print("Top 10 central people accoring to Network Analysis")
print(top_10_eig[:10])
#Most used adjectives in the emails
adj_entity = [(token.lemma_, token.pos_) for token in emails_nlp if token.pos_ == "ADJ" and not token.is_stop]
adj_entity = pd.DataFrame(adj_entity,columns=["Adjective","-"])
print("Most used adjectives in mails")
print()
print(adj_entity.value_counts()[:10])
# Generate a word cloud image
wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(adj_entity.Adjective.to_string())
# Display the generated image:
# the matplotlib way:
plt.figure(figsize=(20,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
# The function below returns a list of cleaned strings.
def clean_text(Series):
    result = []
    strings = Series.str.lower()
    for string in strings:
        new_string = []
        words = string.split(" ")
        for word in words:
            word = word.strip(punctuation)
            if word in stopwords:
                continue
            if re.search(r'[\W\d]', word):
                continue
            new_string.append(word)
        new_string = " ".join(new_string)
        result.append(new_string)
    return result
#Next, we'll generate a wordcloud on the "body" of every email.
#And firstly, we create a new dataframe.
enorn_person_email = df_emails[["employee","body"]].copy()  # copy to avoid SettingWithCopyWarning below
#Then cleans the data
enorn_person_email['body'] = clean_text(enorn_person_email['body'])
#Grouping every word a person has mailed into one dataframe.
df_grouped = enorn_person_email.groupby("employee")['body'].apply(' '.join).reset_index()
# Generate a word cloud image
stopwords.update(["CC","subject","forwarded","seeattached","please see","pleasefind","fyi"])
wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(enorn_person_email.body.to_string())
# Display the generated image:
# the matplotlib way:
plt.figure(figsize=(20,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
#For the unsupervised ML part, we first take a sample, since our Colab session cannot handle the full dataset.
uml_emails = df_emails.sample(n=4000,random_state=42)
#We are interested to see whether there is a certain pattern in terms of whether the sender is from Enron or not.
#Therefore, we first split the "From" addresses on "@", and afterwards we categorize this into a new column.
uml_emails["Enron_or_not"] = uml_emails.From.apply(lambda t: t.split("@")[1])
filter = uml_emails["Enron_or_not"] == "enron.com"
uml_emails["Enron_or_not"] = uml_emails["Enron_or_not"].replace("enron.com","Enron")
uml_emails["Enron_or_not"] = uml_emails.Enron_or_not.where(filter,"Not_Enron")
uml_emails[["Enron_or_not","From"]].head()
#After we've classified the different emails, we create a list of our tokens, called tokens.
#Here, we both lemmatize and lowercase the tokens. Moreover, we are only interested in certain types of words (nouns, proper nouns, adjectives and adverbs).
tokens = []
for summary in nlp.pipe(uml_emails.body):
    proj_tok = [token.lemma_.lower() for token in summary if token.pos_ in ['NOUN', 'PROPN', 'ADJ', 'ADV'] and not token.is_stop]
    tokens.append(proj_tok)
#Then, we reset the index
uml_emails.index = range(len(uml_emails))
#And takes the tokens back into our sample of emails
uml_emails["tokens"] = tokens
# Create a Dictionary from the emails, called: dictionary
dictionary = Dictionary(uml_emails['tokens'])
# And based on our dictionary, we can construct our corpus.
corpus = [dictionary.doc2bow(doc) for doc in uml_emails['tokens']]
# We now set up our TF-IDF model, using our corpus from above.
# Create and fit a new TfidfModel using the corpus: tfidf
tfidf = TfidfModel(corpus)
# Now we can transform the whole corpus
tfidf_corpus = tfidf[corpus]
#We then train our LSI model
lsi = LsiModel(tfidf_corpus, id2word=dictionary, num_topics=400)
# And our trained model can then be used to transform our corpus
lsi_corpus = lsi[tfidf_corpus]
# Creates the email-topic-matrix
email_topic_matrix = MatrixSimilarity(lsi_corpus)
email_topic_matrix_ix = email_topic_matrix.index
from sklearn.cluster import KMeans
# Creates a "for loop", to determine number of clusters.
distortions = []
K = range(1,6)
for k in K:
kmeanModel = KMeans(n_clusters=k)
kmeanModel.fit(email_topic_matrix_ix)
distortions.append(kmeanModel.inertia_)
plt.figure(figsize=(16,8))
plt.plot(K, distortions, 'o-')
plt.xlabel('number of clusters, k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal clusters')
plt.show()
reduced = PCA(n_components = 2).fit_transform(email_topic_matrix_ix)
# Sets up the clustering with k = 2
clusterer = KMeans(n_clusters = 2)
clusterer.fit(email_topic_matrix_ix)
# Plotting things
sns.set_style("darkgrid")
plt.rcParams.update({'font.size': 12})
plt.figure(figsize=(12,12))
g = sns.scatterplot(x=reduced[:,0], y=reduced[:,1],
                    hue=uml_emails["Enron_or_not"],
                    palette="Paired",
                    legend='full')
#Illustrating this in a crosstab as well, to get a better picture of the clustering
pd.crosstab(clusterer.labels_, uml_emails['Enron_or_not'])
#We import a set of fraudulent emails to mix with our Enron emails, in order to train our model
! unzip /content/fradulent_emails.txt.zip
#Read the fraudulent emails:
with open("fradulent_emails.txt", 'r', encoding="latin1") as file:
    fraudlent_mails = file.read()
#And inspects the dataset
print(fraudlent_mails[:5000])  # Clearly, every mail starts with "From r",
# - this will therefore be used to split the emails into separate strings.
fraudlent_mails = fraudlent_mails.split("From r")
#Then, we use our body function to extract the body of each email.
fraudlent_mails_body = body(fraudlent_mails)
#And afterwards we put it into a new DataFrame.
fraudlent_mails_body = pd.DataFrame(fraudlent_mails_body, columns=["body"])
fraudlent_mails_body.drop(index=0, inplace=True)  # drop the empty leading split
fraudlent_mails_body.head()
# For the next cleaning part, we define a function to remove punctuation marks and other nonword characters using regex library.
def reg_expressions(row):
    tokens = []
    try:
        for token in row:
            token = token.lower()
            token = re.sub(r'[\W\d]', "", token)
            tokens.append(token)
    except:
        token = ""
        tokens.append(token)
    return tokens

def stop_word_removal(row):
    token = [token for token in row if token not in stopwords]
    return token
For the computer to make inferences about the e-mails, it has to be able to interpret the text by making a numerical representation of it. One way to do this is a so-called "bag-of-words" model. This model simply counts the frequency of word tokens in each email and thereby represents it as a vector of these counts.
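As a tiny illustration of the idea (a sketch using Python's collections.Counter with made-up example tokens, separate from the assemble_bag function defined further below):
# Toy bag-of-words sketch: two short "emails" become count vectors over a shared vocabulary
from collections import Counter
docs = [["meeting", "tomorrow", "meeting"], ["stock", "price", "tomorrow"]]
vocab = sorted({token for doc in docs for token in doc})
vectors = [[Counter(doc)[word] for word in vocab] for doc in docs]
print(vocab)    # ['meeting', 'price', 'stock', 'tomorrow']
print(vectors)  # [[2, 0, 0, 1], [0, 1, 1, 1]]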
#Firstly, we'll take out a random 10.000 sample of the Enron-emails body
EnronEmails = df_emails.body.sample(n=10000,random_state=42)
#Then uses word_tokenizer to tokenize the words
EnronEmails = EnronEmails.apply(lambda w: word_tokenize(w))
#Now, we remove stopwords from the mails
EnronEmails = EnronEmails.apply(lambda w: stop_word_removal(w))
#Then, we use our reg_expressions function to delete punctuation marks and other nonword characters
EnronEmails = EnronEmails.apply(reg_expressions)
Next, we apply the same steps to our fraud/spam mails.
SpamEmails = fraudlent_mails_body.body.sample(n=3977).astype(str)
SpamEmails = SpamEmails.apply(lambda w: word_tokenize(w))
SpamEmails = SpamEmails.apply(lambda w: stop_word_removal(w))
SpamEmails = SpamEmails.apply(reg_expressions)
#Lastly, we take a sample of 1000 mails from both Enron and Spam
nsamples = 1000
SpamEmails = SpamEmails.sample(n=nsamples,random_state=42)
EnronEmails = EnronEmails.sample(n=nsamples,random_state=42)
#Then we concatenate the Spam and Enron emails into a single array
concat_mails = pd.concat([SpamEmails,EnronEmails], axis=0).values
#Checks the shape
concat_mails.shape #Which is correct, since 1000*2 = 2000 ;).
#The function below assembles a new dataframe containing all the unique words found in the input.
# - it then counts the word frequency and then returns the new dataframe.
def assemble_bag(data):
    used_tokens = []
    all_tokens = []
    for item in data:
        for token in item:
            if token in all_tokens:
                if token not in used_tokens:
                    used_tokens.append(token)
            else:
                all_tokens.append(token)
    df = pd.DataFrame(0, index=np.arange(len(data)), columns=used_tokens)
    for i, item in enumerate(data):
        for token in item:
            if token in used_tokens:
                df.at[i, token] += 1  # .at avoids chained indexing when incrementing counts
    return df
# We then use our function above to create a bag-of-words model
EnronSpamBag = assemble_bag(concat_mails)
# This is the list of words in our bag-of-words model
predictors = [column for column in EnronSpamBag.columns]
#And lastly shows the model
EnronSpamBag
#Sets up the labels (header): 1 for spam, 0 for Enron, before we shuffle and mix the data.
header = ([1]*nsamples)
header.extend(([0]*nsamples))
#This function mixes our data, so we can split it into a training and test set.
def shuffle_data(data, header):
    p = np.random.permutation(len(header))
    data = data[p, :]
    header = np.asarray(header)[p]
    return data, header
data, header = shuffle_data(EnronSpamBag.values, header)
print(header.shape)
print(data.shape)
# Splits into independent 70% training and 30% testing sets
idx = int(0.7*data.shape[0])
# 70% of data for training
train_x = data[:idx,:]
train_y = header[:idx]
# Remaining 30% for testing
test_x = data[idx:,:]
test_y = header[idx:]
logreg = LogisticRegression()
logreg.fit(train_x,train_y)
#Uses the Logistic Regression to predict
y_pred = logreg.predict(test_x)
#Evaluates the score
print("The logistic regression accuracy score is:")
print(accuracy_score(test_y,y_pred))
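Since confusion_matrix, classification_report and plot_confusion_matrix were imported at the top but not used yet, here is a minimal sketch of how the classifier could be inspected further (assuming the test_y and y_pred arrays from the cells above, with 0 = Enron and 1 = Spam):
# Sketch: further evaluation of the spam classifier with the metrics imported earlier
cm = confusion_matrix(test_y, y_pred)
print(classification_report(test_y, y_pred, target_names=["Enron", "Spam"]))
fig, ax = plot_confusion_matrix(conf_mat=cm)  # mlxtend's confusion-matrix plot
plt.show()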
!jupyter nbconvert --to html ""