Best Picks Labor

Destilliertes Wissen zu Schule und Technik

Attachments aus der Mailbox extrahieren

Innerhalb meiner Mailbox habe ich zahlreiche CSV-Dateien mit Daten drin. Diese Daten sind leider verstreut und somit unbrauchbar. Als Lösung habe ich folgendes Verfahren gefunden:

  • Filtere die gewünschten mails mit entsprechenden Regeln und lable sie
  • Verwende Google Takeout um das Label zu exportieren –> .mbox Datei
  • Verwende ein Pythonscript um die Anhänge aus der mbox datei zu extrahieren, filtere nach CSV und kombiniere alle CSV mit dem gleichen Header zu einer Datei.
    das Script benötigt pandas und einige andere Libraries. Mit pip install pandas kann dieser ganz leicht nachinstalliert werden
#mit diesem Script werden die Attachments aus einem mbox file von Google Takeout extrahiert und anschliessend alle CSV mit dem gleichen Header zu einem kombiniert.
#Dateinamen: Quellldatei: file.mbox; Zieldatei: merged.csv; Zwischenspeicher: Ordner mit Namen attachments 
#Dank an GPT3.5 Turbo der 90% der Codingarbeit und Debugging für mich gemacht hat. 

import mailbox
import os
import email
from email import policy
from email.parser import BytesParser
import pandas as pd

def extract_all_attachments_from_mbox(mbox_file_path, output_folder):
    # Create the output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Open the mbox file
    mbox = mailbox.mbox(mbox_file_path)

    # Counter for unique filenames
    count = 1

    # Loop through each message in the mbox file
    for message in mbox:
        # Parse the message using BytesParser
        msg = BytesParser(policy=policy.default).parsebytes(message.as_bytes())

        # Iterate through each part of the message
        for part in msg.iter_parts():
            # Check if the part has a filename
            if part.get_filename():
                # Get the original filename of the attachment
                original_filename = part.get_filename()

                # Generate a unique name for the attachment (if needed)
                attachment_filename = f"attachment_{count}_{original_filename}" if original_filename else f"attachment_{count}"
                count += 1

                # Save the attachment to the output folder with the original or generated filename
                attachment_path = os.path.join(output_folder, attachment_filename)
                with open(attachment_path, "wb") as f:
                    f.write(part.get_payload(decode=True))

# Function to merge CSV files in a directory
def merge_csv_files(directory_path, output_file):
    # Get a list of all CSV files in the directory
    csv_files = [file for file in os.listdir(directory_path) if file.endswith('.csv')]

    # Check if there are at least two CSV files to merge
    if len(csv_files) < 2:
        print("There are not enough CSV files to merge.")
        return

    # Read the first CSV file to get the header
    first_file_path = os.path.join(directory_path, csv_files[0])
    first_df = pd.read_csv(first_file_path, encoding='utf-8')  # Specify the encoding

    # Initialize the merged dataframe with the first dataframes
    merged_df = first_df

    # Iterate through the remaining CSV files and merge them
    for csv_file in csv_files[1:]:
        file_path = os.path.join(directory_path, csv_file)
        try:
            df = pd.read_csv(file_path, encoding='utf-8')  # Specify the encoding
        except (UnicodeDecodeError, pd.errors.ParserError) as e:
            print(f"Error reading file {csv_file}: {e}")
            continue

        # Check if the headers match
        if not first_df.columns.equals(df.columns):
            print(f"Skipping file {csv_file} as it has a different header.")
            continue

        # Check if the number of fields is consistent
        if len(first_df.columns) != len(df.columns):
            print(f"Skipping file {csv_file} as it has a different number of fields.")
            continue

        # Merge the dataframes
        merged_df = pd.concat([merged_df, df], ignore_index=True)

    # Save the merged dataframe to a new CSV file
    merged_df.to_csv(output_file, index=False)
    print(f"Merged CSV files saved to {output_file}.")

# Example usage
mbox_file_path = "file.mbox"
attachments_output_folder = "attachments"
merged_csv_output_file = 'merged.csv'

extract_all_attachments_from_mbox(mbox_file_path, attachments_output_folder)
merge_csv_files(attachments_output_folder, merged_csv_output_file)

Related Posts

None found