email_analysis
Email data analysis
This website collects cookies to deliver better user experience
pip install pandas
if you do not have it.
import imaplib
import email
import getpass
import pandas as pd
# Ask for the account credentials; getpass reads the password without echo.
username = input("Enter the email address: ")
password = getpass.getpass("Enter password: ")

# Open an SSL IMAP session to Gmail, authenticate and select the inbox.
mail = imaplib.IMAP4_SSL('imap.gmail.com')
mail.login(username, password)
print(mail.list())
mail.select("inbox")

# NOTE(review): "[email protected]" looks like a redacted placeholder left by
# the page the snippet was copied from -- replace it with the real sender.
result, numbers = mail.search(None, '(FROM "[email protected]")')

# The matching message numbers arrive as one whitespace-separated bytes
# string; guard against a missing payload before splitting it.
uids = [number.decode("utf-8") for number in (numbers[0] or b"").split()]

if uids:
    result, messages = mail.fetch(','.join(uids), '(RFC822)')
else:
    # An empty message set is not a valid FETCH argument, so skip the
    # request and leave an empty result for the code that follows.
    messages = []
We use the walk() method in the email library to get the parts and subparts of the message, the get_content_type() method to get the email body maintype/subtype, and get_payload() to get a string or message instance of the part.
def _decode_text(raw, charset, errors):
    """Decode *raw* bytes, using UTF-8 when *charset* is missing or unknown."""
    try:
        return raw.decode(charset or "utf-8", errors=errors)
    except LookupError:
        # The declared charset is not a codec Python knows about.
        return raw.decode("utf-8", errors=errors)


# Parallel column lists: one entry is appended to each of them for every
# text/plain part found, so they always have the same length.
body_list = []
date_list = []
from_list = []
subject_list = []

# Every second item of the fetch response is unpacked as a pair whose second
# element holds the raw message bytes.
for _, message in messages[::2]:
    email_message = email.message_from_bytes(message)

    # Only the first decoded fragment of the Subject header is used.
    email_subject = email.header.decode_header(
        email_message.get('Subject', '')
    )[0]
    subject_text, subject_charset = email_subject
    if isinstance(subject_text, bytes):
        # Decode the bytes fragment itself; calling .decode on the
        # (value, charset) tuple raises AttributeError.
        subject_text = _decode_text(subject_text, subject_charset, "ignore")

    # Keep the display name in front of "<address>" and drop the quotes.
    fromlist = email_message.get('From') or ''
    fromlist = fromlist.split("<")[0].replace('"', '')

    for part in email_message.walk():
        if part.get_content_type() != "text/plain":
            continue

        # decode=True undoes the transfer encoding and yields bytes; honour
        # the charset the part declares instead of assuming UTF-8.
        body = part.get_payload(decode=True) or b""
        body = _decode_text(body, part.get_content_charset(), "replace")

        body_list.append(body)
        subject_list.append(subject_text)
        date_list.append(email_message.get('date'))
        from_list.append(fromlist)
The dates are parsed with pandas' to_datetime() method; because each time has its UTC offset attached, we slice the offset off.
date_list = pd.to_datetime(date_list)
# Render each timestamp in ISO format with a space separator and without its
# UTC offset. Dropping tzinfo (instead of slicing off the last six
# characters) also leaves timestamps that carry no offset intact.
date_list = [item.replace(tzinfo=None).isoformat(' ') for item in date_list]

# One row per collected message body.
data = pd.DataFrame(
    data={
        'Date': date_list,
        'Sender': from_list,
        'Subject': subject_list,
        'Body': body_list,
    }
)
data.to_csv('emails.csv', index=False)

# Read back the file written above. The previous path "\emails.csv" held an
# invalid "\e" escape and did not name the same file.
data = pd.read_csv('emails.csv')
data.head()
"\r\n"
# as seen in the screenshot above. This makes the text more readable.
def clean_data(data, column, i):
    """Replace the "\\r\\n" line breaks in one cell with single spaces.

    Args:
        data: DataFrame to clean; it is modified in place.
        column: Label of the column holding the text.
        i: Index label of the row to clean.

    Returns:
        The same DataFrame object, for convenience.
    """
    cell = data.at[i, column]
    # Cells that are not text (e.g. NaN for an empty body) are left as-is.
    if isinstance(cell, str):
        # A single scalar write avoids chained assignment and the
        # list-in-a-cell intermediate step.
        data.at[i, column] = " ".join(cell.split("\r\n"))
    return data
# Clean the "Body" cell of every row. clean_data returns the frame it was
# given, so new_data is bound up front and exists even when there are no rows.
new_data = data
for n in range(len(data)):
    new_data = clean_data(data, column="Body", i=n)