# Spam email classifier: loads labeled emails from a CSV file, builds
# bag-of-words feature vectors, and trains an NLTK Naive Bayes model.
# Load the labeled dataset: each CSV row is expected to be (email_text, label).
emails = []
# newline='' is the csv-module-recommended mode: it lets the reader handle
# embedded newlines inside quoted fields correctly.
with open('emaildataset.csv', 'r', newline='') as file:
    reader = csv.reader(file)
    for row in reader:
        # Skip blank or malformed rows instead of raising IndexError.
        if len(row) >= 2:
            emails.append((row[0].strip(), row[1].strip()))
def punctuation_removal(data_string):
    """Return *data_string* with a fixed set of punctuation removed.

    Strips the characters , . ? ! ' + ( ) in a single C-level pass via
    str.translate, instead of one .replace() scan per character.
    """
    return data_string.translate(str.maketrans("", "", ",.?!'+()"))
def stopword_removal(tokens):
    """Return the tokens that are not stopwords, preserving input order.

    The stopword list is a small hand-picked set; set membership keeps
    each test O(1) rather than scanning a list.
    """
    stopwords = {'of', 'on', 'i', 'am', 'this', 'is', 'a', 'was'}
    return [token for token in tokens if token not in stopwords]
def stemming(filtered_tokens):
    """Map each token to its base form using a small hand-built lexicon.

    Bug fix: the original appended to the output inside the loop over
    dictionary entries (for/else on every entry), so every input token
    produced one output per dictionary key (~6 tokens, mostly duplicates).
    Each input token now yields exactly one output token: its base form
    if it is a known variant, otherwise the token unchanged.
    """
    root_to_token = {
        'you have': ['youve'],
        'select': ['selected', 'selection'],
        'it is': ['its'],
        'move': ['moving'],
        'photo': ['photos'],
        'success': ['successfully', 'successful'],
    }
    # Invert to variant -> base form for a single O(1) lookup per token.
    token_to_root = {
        variant: base
        for base, variants in root_to_token.items()
        for variant in variants
    }
    return [token_to_root.get(token, token) for token in filtered_tokens]
# Build the vocabulary: every unique cleaned, stopword-free, stemmed token
# observed across all emails. `unique_words` is the feature set used for
# every feature vector below.
#
# Bug fix: the original version of this loop ended with
# `train_data.append(pair)` BEFORE `train_data = []` was ever defined
# (NameError), and its `email[1]` was a word of the rebound token list,
# not the label. Those broken leftover lines are removed here; the real
# training-data construction follows in the next loop.
tokens = []
for email in emails:
    for word in email[0].lower().split():
        tokens.append(punctuation_removal(word))
tokens = set(tokens)
filtered_tokens = stopword_removal(tokens)
base_form_tokens = stemming(filtered_tokens)
unique_words = set(base_form_tokens)
# Build the training set: one (feature_vector, label) pair per email.
# The feature vector maps every vocabulary word to True/False depending on
# whether it appears in this email after cleaning, stopword removal and
# stemming.
train_data = []
for email in emails:
    text, label = email[0], email[1]
    tokens = [punctuation_removal(word) for word in text.lower().split()]
    filtered_tokens = stopword_removal(tokens)
    # Convert to a set once so each of the len(unique_words) membership
    # tests below is O(1) instead of scanning a list.
    base_forms = set(stemming(filtered_tokens))
    feature_vec = {word: word in base_forms for word in unique_words}
    train_data.append((feature_vec, label))
from nltk import NaiveBayesClassifier

# Train the Naive Bayes spam classifier on the prepared feature vectors.
# Bug fix: the original had a stray `classifier.classify(test_features)`
# here that referenced an undefined name (NameError at import time);
# classification happens inside testing() below.
classifier = NaiveBayesClassifier.train(train_data)
def testing(email_str):
    """Classify a raw email string with the trained Naive Bayes model.

    Applies the same preprocessing pipeline used for training
    (lowercase + split, punctuation removal, stopword removal, stemming),
    builds a feature vector over the training vocabulary, and returns the
    classifier's predicted label.
    """
    tokens = [punctuation_removal(word) for word in email_str.lower().split()]
    filtered_tokens = stopword_removal(tokens)
    # Set for O(1) membership per vocabulary word.
    base_forms = set(stemming(filtered_tokens))
    test_features = {word: word in base_forms for word in unique_words}
    return classifier.classify(test_features)