Open EML files
Create a list of paths to .eml files in a directory and extract their file names and texts.
from pathlib import Path
import pandas as pd
from email import policy
from email.parser import BytesParser
def extract_plain_text(msg):
    """Return the ``text/plain`` body of a parsed e-mail, or ``None``.

    Args:
        msg: A message parsed with ``policy.default`` (an ``EmailMessage``).

    Returns:
        The decoded plain-text body as a string, or ``None`` when the
        message has no ``text/plain`` part (for example an HTML-only mail).
    """
    # preferencelist must be a real tuple: ('plain') is just the string
    # 'plain', which get_body() would then match by substring.
    body = msg.get_body(preferencelist=('plain',))
    return None if body is None else body.get_content()


# Collect every .eml file in the current directory.
path = Path('.')
eml_files = list(path.glob('*.eml'))

file_names = []
texts = []
for file in eml_files:
    # The with-statement closes the file; no explicit close() is needed.
    with open(file, 'rb') as fp:
        msg = BytesParser(policy=policy.default).parse(fp)
    # Path.name is the bare file name as a string.
    file_names.append(file.name)
    texts.append(extract_plain_text(msg))

# One row per e-mail; building from a dict names the columns directly and
# avoids the list-of-lists transpose.
df_eml = pd.DataFrame({'file_name': file_names, 'text': texts})
You can save the dataframe as a Microsoft Excel file, for example with `df_eml.to_excel('emails.xlsx', index=False)`.
Microsoft Outlook format (MSG files)
from pathlib import Path
import pandas as pd
import extract_msg
# Collect every Outlook .msg file in the current directory.
path = Path('.')
msg_files = list(path.glob('*.msg'))

senders = []
dates = []
subjects = []
bodies = []
# Iterate over the .msg files found above (not the .eml list from the
# previous section, which would read no .msg file at all).
for file in msg_files:
    email_msg = extract_msg.openMsg(file)
    try:
        senders.append(email_msg.sender)
        dates.append(email_msg.date)
        subjects.append(email_msg.subject)
        bodies.append(email_msg.body)
    finally:
        # Release the underlying file handle even if a field fails to parse.
        email_msg.close()

# One row per message. The variable name df_eml is kept for backward
# compatibility with code that already refers to it.
df_eml = pd.DataFrame({
    'sender': senders,
    'date': dates,
    'subject': subjects,
    'body': bodies,
})
Comments