Innerhalb meiner Mailbox habe ich zahlreiche CSV-Dateien mit Daten drin. Diese Daten sind leider verstreut und somit unbrauchbar. Als Lösung habe ich folgendes Verfahren gefunden:
- Filtere die gewünschten Mails mit entsprechenden Regeln und labele sie
- Verwende Google Takeout um das Label zu exportieren –> .mbox Datei
- Verwende ein Pythonscript um die Anhänge aus der mbox datei zu extrahieren, filtere nach CSV und kombiniere alle CSV mit dem gleichen Header zu einer Datei.
Das Script benötigt pandas und einige andere Libraries. Mit `pip install pandas` kann pandas ganz leicht nachinstalliert werden.
#mit diesem Script werden die Attachments aus einem mbox file von Google Takeout extrahiert und anschliessend alle CSV mit dem gleichen Header zu einem kombiniert.
#Dateinamen: Quelldatei: file.mbox; Zieldatei: merged.csv; Zwischenspeicher: Ordner mit Namen attachments
#Dank an GPT3.5 Turbo der 90% der Codingarbeit und Debugging für mich gemacht hat.
import mailbox
import os
import email
from email import policy
from email.parser import BytesParser
import pandas as pd
def extract_all_attachments_from_mbox(mbox_file_path, output_folder):
    """Extract every attachment from a Google Takeout .mbox file.

    Walks all messages (including nested multipart structures) and writes
    each part that carries a filename to *output_folder*. Saved files are
    prefixed with a running counter so identically named attachments from
    different mails do not overwrite each other.

    Parameters
    ----------
    mbox_file_path : str
        Path to the source .mbox file.
    output_folder : str
        Directory where attachments are written (created if missing).
    """
    os.makedirs(output_folder, exist_ok=True)
    mbox = mailbox.mbox(mbox_file_path)
    count = 1  # running number keeping generated filenames unique
    for message in mbox:
        # Re-parse with the modern policy so EmailMessage helpers apply.
        msg = BytesParser(policy=policy.default).parsebytes(message.as_bytes())
        # walk() visits nested parts too; iter_parts() only sees the top
        # level and would miss attachments inside multipart containers.
        for part in msg.walk():
            original_filename = part.get_filename()
            if not original_filename:
                continue  # not an attachment (container or inline body)
            payload = part.get_payload(decode=True)
            if payload is None:
                continue  # part has no decodable body; nothing to save
            # basename() strips any path components an odd or malicious
            # mail client may have embedded in the filename.
            safe_name = os.path.basename(original_filename)
            attachment_filename = f"attachment_{count}_{safe_name}"
            count += 1
            attachment_path = os.path.join(output_folder, attachment_filename)
            with open(attachment_path, "wb") as f:
                f.write(payload)
# Function to merge CSV files in a directory
def merge_csv_files(directory_path, output_file):
# Get a list of all CSV files in the directory
csv_files = [file for file in os.listdir(directory_path) if file.endswith('.csv')]
# Check if there are at least two CSV files to merge
if len(csv_files) < 2:
print("There are not enough CSV files to merge.")
return
# Read the first CSV file to get the header
first_file_path = os.path.join(directory_path, csv_files[0])
first_df = pd.read_csv(first_file_path, encoding='utf-8') # Specify the encoding
# Initialize the merged dataframe with the first dataframes
merged_df = first_df
# Iterate through the remaining CSV files and merge them
for csv_file in csv_files[1:]:
file_path = os.path.join(directory_path, csv_file)
try:
df = pd.read_csv(file_path, encoding='utf-8') # Specify the encoding
except (UnicodeDecodeError, pd.errors.ParserError) as e:
print(f"Error reading file {csv_file}: {e}")
continue
# Check if the headers match
if not first_df.columns.equals(df.columns):
print(f"Skipping file {csv_file} as it has a different header.")
continue
# Check if the number of fields is consistent
if len(first_df.columns) != len(df.columns):
print(f"Skipping file {csv_file} as it has a different number of fields.")
continue
# Merge the dataframes
merged_df = pd.concat([merged_df, df], ignore_index=True)
# Save the merged dataframe to a new CSV file
merged_df.to_csv(output_file, index=False)
print(f"Merged CSV files saved to {output_file}.")
# Example usage: extract attachments from file.mbox into ./attachments,
# then combine all CSVs with matching headers into merged.csv.
# Guarded so importing this module does not trigger the extraction.
if __name__ == "__main__":
    mbox_file_path = "file.mbox"
    attachments_output_folder = "attachments"
    merged_csv_output_file = "merged.csv"
    extract_all_attachments_from_mbox(mbox_file_path, attachments_output_folder)
    merge_csv_files(attachments_output_folder, merged_csv_output_file)
