Analyzing emails in Python

Open EML files

Create a list of paths to .eml files in a directory and extract their file names and texts.

from pathlib import Path
import pandas as pd
from email import policy
from email.parser import BytesParser

# Collect the .eml files in the current directory. glob() does not promise any
# particular order, so sort the paths to make the row order reproducible.
path = Path('.')
eml_files = sorted(path.glob('*.eml'))

file_names = []
texts = []

for file in eml_files:
    # Parse from the binary stream so the parser decodes each part using the
    # charset declared in the message itself. The `with` block closes the
    # file; no explicit close() is needed afterwards.
    with open(file, 'rb') as fp:
        msg = BytesParser(policy=policy.default).parse(fp)

    name = file.name  # File name as a plain string (final path component)

    # preferencelist must be a sequence of subtypes, hence the one-element
    # tuple. get_body() returns None when the message has no text/plain part
    # (e.g. an HTML-only email); store an empty string in that case so one
    # such message does not abort the whole run and rows stay aligned.
    body = msg.get_body(preferencelist=('plain',))
    text = body.get_content() if body is not None else ''

    file_names.append(name)
    texts.append(text)

# One row per email, with explicitly named columns.
df_eml = pd.DataFrame({'file_name': file_names, 'text': texts})

You can save the dataframe as a Microsoft Excel file, for example with `df_eml.to_excel('emails.xlsx', index=False)`.

Microsoft Outlook format (MSG files)

from pathlib import Path
import pandas as pd
import extract_msg

# Collect the Outlook .msg files in the current directory.
path = Path('.')
msg_files = list(path.glob('*.msg'))

senders = []
dates = []
subjects = []
bodies = []

# Iterate over the .msg paths found above (not the .eml list from the
# previous example).
for file in msg_files:
    email_msg = extract_msg.openMsg(file)
    try:
        senders.append(email_msg.sender)
        dates.append(email_msg.date)
        subjects.append(email_msg.subject)
        bodies.append(email_msg.body)
    finally:
        # Release the underlying file handle even if a field cannot be read.
        email_msg.close()

# One row per message. dtype=object keeps every value exactly as extract_msg
# returned it, without pandas coercing the column types.
df_eml = pd.DataFrame(
    {
        'sender': senders,
        'date': dates,
        'subject': subjects,
        'body': bodies,
    },
    dtype=object,
)

References

Comments

Copied title and URL