Script for extracting mailbody of mbox files

This snippet extracts the mail body from mbox files.
I used it to index my mails with Elasticsearch:

#!/usr/bin/python
# extractor for mailbody of mbox files
# you can reduce the number of output characters at the bottom --> print(body[0:10000])
# better to leave it as it is, in case something goes wrong with attachments

import mailbox                                          # class for mail extraction
import sys, getopt                              # class to get arguments from stdin
import email                                                    # class for rfc822 Messages

def getcharsets(msg):
    """Return the set of charset names declared anywhere in *msg*.

    ``msg.get_charsets()`` yields one entry per message part and uses
    ``None`` for parts that declare no charset; those entries are dropped.
    """
    return {name for name in msg.get_charsets() if name is not None}

def handleerror(errmsg, emailmsg, cs):
    """Print a diagnostic report for a body that could not be decoded.

    errmsg   -- short description of the failure
    emailmsg -- the message being processed; its 'subject' and 'From'
                headers and its declared charsets are included in the report
    cs       -- the charset that was being tried when the failure occurred
    """
    report = (
        (),  # empty line separating the report from preceding output
        (errmsg,),
        ("This error occurred while decoding with ", cs, " charset."),
        ("These charsets were found in the one email.", getcharsets(emailmsg)),
        ("This is the subject:", emailmsg['subject']),
        ("This is the sender:", emailmsg['From']),
    )
    for fields in report:
        print(*fields)

def getbodyfromemail(msg):
    """Return the text/plain body of an email message.

    The message tree is searched for ``text/plain`` parts; if there are
    several, the last one found wins (same selection as before).  The
    payload is transfer-decoded (base64 / quoted-printable) and then decoded
    with the charset declared on *that part*, instead of trying every
    charset that occurs anywhere in the message.

    msg -- an ``email.message.Message`` (e.g. an item of ``mailbox.mbox``)

    Returns the body as ``str``; the raw ``bytes`` if the declared charset
    is unknown or does not fit the data (the problem is reported through
    ``handleerror``); or ``None`` if the message has no text/plain part.
    """
    text_part = None
    # walk() already descends recursively into nested multiparts and yields
    # the message itself for non-multipart mails, so one loop covers all cases.
    for part in msg.walk():
        if part.get_content_type() == 'text/plain':
            text_part = part

    if text_part is None:
        return None

    body = text_part.get_payload(decode=True)
    if body is None:
        return None

    # RFC 2045: a text part without a charset parameter is us-ascii.
    charset = text_part.get_content_charset('us-ascii')
    try:
        return body.decode(charset)
    except UnicodeDecodeError:
        handleerror("UnicodeDecodeError: encountered.", msg, charset)
    except LookupError:
        # The mail declares a charset name Python has no codec for.
        handleerror("LookupError: encountered.", msg, charset)
    # Decoding failed: hand back the undecoded bytes rather than nothing.
    return body

def getbodyrfc822(msg):
    """Return the decoded text/plain content of a single RFC 822 message.

    msg -- one of:
           * a parsed message object (anything providing ``walk()``),
           * the raw message as ``bytes``,
           * a ``str`` naming an existing file, which is read and parsed
             (the script below passes the file name from the command line),
           * any other ``str``, which is parsed as the raw message text.

    Every text/plain part is transfer-decoded and decoded with its declared
    charset (us-ascii if none is declared); undecodable bytes are replaced
    rather than raising.  Multiple parts are joined with a newline.

    Returns a ``str``; it is empty if the message has no text/plain part.
    """
    import os  # local import: the module header does not import os

    if hasattr(msg, 'walk'):
        parsed = msg
    elif isinstance(msg, bytes):
        parsed = email.message_from_bytes(msg)
    elif os.path.isfile(msg):
        # Read in binary mode so 8-bit mails are not mangled by a text codec.
        with open(msg, 'rb') as handle:
            parsed = email.message_from_binary_file(handle)
    else:
        parsed = email.message_from_string(msg)

    texts = []
    # The message is organized like a tree; walk() visits every part of it.
    for part in parsed.walk():
        if part.get_content_type() != 'text/plain':
            continue
        payload = part.get_payload(decode=True)
        if payload is None:
            continue
        charset = part.get_content_charset('us-ascii')
        try:
            texts.append(payload.decode(charset, errors='replace'))
        except LookupError:
            # Unknown charset name in the mail: fall back to UTF-8.
            texts.append(payload.decode('utf-8', errors='replace'))
    return "\n".join(texts)

def main(argv):
    """Print the mail bodies of the file named on the command line.

    argv -- argument vector: [program, filetype, path], where filetype is
            'application/mbox' (path is an mbox file, one body is printed
            per message) or 'message/rfc822' (path is a single message).

    Returns the process exit status: 0 on success, 1 if the mbox file does
    not exist, 2 on a usage error.
    """
    if len(argv) != 3:
        program = argv[0] if argv else "extractor"
        print("usage: {} application/mbox|message/rfc822 FILE".format(program),
              file=sys.stderr)
        return 2

    filetype = argv[1]
    mboxfile = argv[2]

    if filetype == 'application/mbox':
        try:
            # create=False: a mistyped path must fail instead of silently
            # creating a new, empty mbox file.
            box = mailbox.mbox(mboxfile, create=False)
        except mailbox.NoSuchFileError:
            print("mbox file not found: {}".format(mboxfile), file=sys.stderr)
            return 1
        for thisemail in box:
            body = getbodyfromemail(thisemail)
            if body is None:
                # Message without a text/plain part: nothing to print.
                continue
            if isinstance(body, bytes):
                # Body could not be decoded; avoid printing a b'...' repr.
                body = body.decode('utf-8', errors='replace')
            print(body[0:10000])
    elif filetype == 'message/rfc822':
        body = getbodyrfc822(mboxfile)
        if body is not None:
            print(body[0:10000])
    else:
        print("unsupported file type: {}".format(filetype), file=sys.stderr)
        return 2
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))

2 Gedanken zu „Script for extracting mailbody of mbox files“

Schreibe einen Kommentar

Diese Website verwendet Akismet, um Spam zu reduzieren. Erfahre mehr darüber, wie deine Kommentardaten verarbeitet werden.