Reading mail contents in mbox file using python mailbox

I am trying to print the contents of a mail (the body of the letter) using Python's mailbox module.

# Python 2 script (print statements): open the mbox file named 'Inbox' and,
# for every message in it, print a running counter, the From header, the
# Subject header, the value looked up under '**messages**', and a separator.
import mailbox

mbox = mailbox.mbox('Inbox')
i=1  # 1-based counter of the messages printed so far
for message in mbox:
    print i
    print "from :",message['from']
    print "subject:",message['subject']
    # NOTE(review): this is the line the question is about. It uses the same
    # subscript form as the two header lookups above, so it presumably looks
    # for a header literally named '**messages**' rather than the mail body.
    print "message:",message['**messages**']
    print "**************************************"
    i+=1

But I suspect that message['**messages**'] is not the right way to print the email content here. I could not figure out the correct way from the documentation.

+10
source share
3 answers

To get the contents of a message, you want to use get_payload() . mailbox.Message is a subclass of email.message.Message . You will also want to check is_multipart() , as this will affect the return value of get_payload() . Example:

 # Collect the decoded body of `message` into `content` (bytes).
 #
 # get_payload(decode=True) undoes the Content-Transfer-Encoding and returns
 # bytes for a leaf part, but returns None for a multipart container. Looking
 # only at the direct children therefore breaks on nested multiparts (e.g.
 # multipart/mixed wrapping multipart/alternative), so walk() is used to reach
 # every leaf part at any depth.
 if message.is_multipart():
     content = b''.join(
         part.get_payload(decode=True) or b''  # a leaf without a payload adds nothing
         for part in message.walk()
         if not part.is_multipart()
     )
 else:
     content = message.get_payload(decode=True)
+13
source
 def getbody(message):
     """Return the decoded plain-text body of *message*, or None.

     The message is searched depth-first and the first ``text/plain`` leaf
     part that is not marked as an attachment is returned, so an attached
     ``.txt`` file or a footer appended as a trailing part does not replace
     the real body.

     Args:
         message: an ``email.message.Message`` (``mailbox.Message`` is a
             subclass), either single-part or multipart.

     Returns:
         The payload as bytes with the Content-Transfer-Encoding undone, or
         None when the message has no plain-text body part.
     """
     # walk() yields the message itself and then every nested part, so a
     # single pass covers single-part messages and any depth of nesting.
     for part in message.walk():
         if part.is_multipart():
             continue  # containers carry no payload of their own
         if part.get_content_type() != 'text/plain':
             continue
         if part.get_content_disposition() == 'attachment':
             continue
         return part.get_payload(decode=True)
     return None

This function returns the message body if the body is plain text.

+14
source

Here's a more complete answer:

  • Character set detection (using the declared charset, with chardet as a fallback when that fails)
  • Recursive traversal of multipart bodies (instead of a hard-coded maximum nesting level)
  • Ignores images and other attachments
  • Strips HTML with BeautifulSoup
  • Parses senders and recipients

,

 import csv
 import email
 import io
 from email import header
 from email.utils import formataddr, getaddresses


 def _decode_part(submsg, raw):
     """Decode the bytes *raw* of the leaf part *submsg* to text."""
     charset = submsg.get_content_charset('utf-8')
     try:
         return raw.decode(charset)
     except (UnicodeDecodeError, LookupError):
         # The declared charset is wrong or unknown to Python: guess from the
         # bytes instead. chardet is third-party and only needed on this path.
         import chardet
         guessed = chardet.detect(raw)['encoding'] or 'utf-8'  # may be None
         try:
             return raw.decode(guessed, errors='replace')
         except LookupError:
             # chardet can name encodings Python has no codec for
             return raw.decode('utf-8', errors='replace')


 def _html_to_text(markup):
     """Return the visible text of the HTML string *markup*."""
     # bs4 is third-party and only needed when a mail has an HTML part.
     from bs4 import BeautifulSoup
     # Name the parser explicitly; otherwise bs4 picks whichever one happens
     # to be installed and emits a warning.
     return BeautifulSoup(markup, 'html.parser').text


 def _leaf_parts(message):
     """Yield (part, decoded_bytes) for every non-multipart part, depth-first."""
     if message.is_multipart():
         for part in message.get_payload():
             yield from _leaf_parts(part)
     else:
         yield message, message.get_payload(decode=True)


 def _mail_to_row(raw_mail):
     """Parse one raw mail (text) into a [senders, recipients, body] row."""
     msg = email.message_from_string(raw_mail)

     raw_subject = msg['Subject']  # None when the header is missing
     if raw_subject is None:
         subject = ''
     else:
         subject = str(header.make_header(header.decode_header(raw_subject)))

     # The body starts with the subject line; every leaf part then adds a line
     # break followed by its text (images and other non-text parts add only
     # the line break).
     chunks = [subject, '\n']
     for submsg, raw in _leaf_parts(msg):
         content_type = submsg.get_content_type()
         content = ''
         if 'plain' in content_type:
             content = _decode_part(submsg, raw)
         elif 'html' in content_type:
             content = _html_to_text(_decode_part(submsg, raw))
         chunks.append('\n' + content)

     senders = getaddresses(msg.get_all('from', []))
     tos = msg.get_all('to', [])
     ccs = msg.get_all('cc', [])
     resent_tos = msg.get_all('resent-to', [])
     resent_ccs = msg.get_all('resent-cc', [])
     all_recipients = getaddresses(tos + ccs + resent_tos + resent_ccs)

     return [
         ', '.join(formataddr(pair) for pair in senders),
         ', '.join(formataddr(pair) for pair in all_recipients),
         ''.join(chunks),
     ]


 def mbox_to_csv(mbox):
     """Convert the mails in an mbox into CSV text.

     Args:
         mbox: an iterable of text lines (for example an mbox file opened in
             text mode). A line starting with ``'From '`` begins a new mail;
             lines before the first such separator are ignored.

     Returns:
         CSV text with a header row ``senders,recipients,body`` followed by
         one row per mail. ``recipients`` combines the To, Cc, Resent-To and
         Resent-Cc headers; ``body`` is the decoded subject followed by the
         text of every plain-text and HTML part (HTML stripped to text).
     """
     out = io.StringIO()
     writer = csv.writer(out)
     writer.writerow(['senders', 'recipients', 'body'])

     lines = None  # stays None until the first 'From ' separator is seen
     for line in mbox:
         if line.startswith('From '):
             if lines:
                 writer.writerow(_mail_to_row(''.join(lines)))
             lines = []
         if lines is not None:  # ignore text that precedes any mail
             lines.append(line)
     if lines:
         writer.writerow(_mail_to_row(''.join(lines)))

     return out.getvalue()
+3
source

Source: https://habr.com/ru/post/977222/


All Articles