From e36dca49688db7a572612ec72337cd1aaea1f5a5 Mon Sep 17 00:00:00 2001 From: Tarashish Mishra Date: Tue, 18 Sep 2018 14:54:04 +0530 Subject: [PATCH 1/5] Ingest outlook pst files using pypff --- ingestors/email/msg.py | 1 + ingestors/email/outlookpst.py | 30 ++++++++----- ingestors/support/outlookpst.py | 80 +++++++++++++++++++++++++++++++++ ingestors/util.py | 6 ++- 4 files changed, 105 insertions(+), 12 deletions(-) create mode 100644 ingestors/support/outlookpst.py diff --git a/ingestors/email/msg.py b/ingestors/email/msg.py index 30681dfce..8cb1f4089 100644 --- a/ingestors/email/msg.py +++ b/ingestors/email/msg.py @@ -16,6 +16,7 @@ class RFC822Ingestor(Ingestor, EmailSupport): MIME_TYPES = [ 'multipart/mixed', + 'multipart/alternative', 'message/rfc822' ] EXTENSIONS = [ diff --git a/ingestors/email/outlookpst.py b/ingestors/email/outlookpst.py index edb84b768..c85f8d037 100644 --- a/ingestors/email/outlookpst.py +++ b/ingestors/email/outlookpst.py @@ -1,11 +1,21 @@ +import logging +import os + +import pypff + from ingestors.base import Ingestor from ingestors.support.temp import TempFileSupport -from ingestors.support.shell import ShellSupport +from ingestors.support.outlookpst import OutlookPSTSupport from ingestors.support.ole import OLESupport from ingestors.directory import DirectoryIngestor +from ingestors.util import join_path + + +log = logging.getLogger(__name__) -class OutlookPSTIngestor(Ingestor, TempFileSupport, ShellSupport, OLESupport): +class OutlookPSTIngestor(Ingestor, TempFileSupport, + OutlookPSTSupport, OLESupport): MIME_DEFAULT = 'application/vnd.ms-outlook' MIME_TYPES = [MIME_DEFAULT] EXTENSIONS = ['pst', 'ost', 'pab'] @@ -17,15 +27,13 @@ def ingest(self, file_path): self.result.flag(self.result.FLAG_PACKAGE) temp_dir = self.make_empty_directory() try: - self.exec_command('readpst', - '-e', # make subfolders, files per message - '-D', # include deleted - '-r', # recursive structure - '-8', # utf-8 where possible - '-b', - '-q', # quiet - 
'-o', temp_dir, - file_path) + pst_file = pypff.open(file_path) + root = pst_file.get_root_folder() + root_folder_name = root.name or os.path.basename(file_path) + root_folder_path = os.path.join(temp_dir, root_folder_name) + os.makedirs(root_folder_path) + self.folder_traverse(root, root_folder_path) + self.check_for_messages(root, root_folder_path) self.manager.delegate(DirectoryIngestor, self.result, temp_dir) except Exception: # Handle partially extracted archives. diff --git a/ingestors/support/outlookpst.py b/ingestors/support/outlookpst.py new file mode 100644 index 000000000..8a0fb86f9 --- /dev/null +++ b/ingestors/support/outlookpst.py @@ -0,0 +1,80 @@ +import os +import logging +from email.message import EmailMessage +from email.parser import Parser +from email import policy + +import pypff +import magic + +from ingestors.util import join_path, safe_path + +log = logging.getLogger(__name__) + + +class OutlookPSTSupport(object): + """Provides helpers for parsing outlook data files (pst, ost etc).""" + + def folder_traverse(self, parent_folder, parent_path): + for folder in parent_folder.sub_folders: + if not folder.name: + continue + new_path = os.path.join(parent_path, safe_path(folder.name)) + os.makedirs(new_path) + if folder.number_of_sub_folders: + self.folder_traverse(folder, new_path) + self.check_for_messages(folder, new_path) + + def handle_email_message(self, message, folder_path): + file_path = os.path.join( + folder_path, safe_path(message.subject) + '.email' + ) + msg = EmailMessage() + if message.plain_text_body: + msg.set_content(message.plain_text_body) + if message.html_body: + msg.add_alternative( + message.html_body, maintype='text', subtype='html' + ) + headers = Parser(policy=policy.default).parsestr( + message.transport_headers, headersonly=True + ) + for key, val in headers.items(): + if key not in msg: + msg.add_header(key, val) + for index, attachment in enumerate(message.attachments): + name = attachment.name or 
"attachment-{0}".format(index) + attachment_buffer = attachment.read_buffer(attachment.size) + ctype = magic.from_buffer(attachment_buffer, mime=True) + maintype, subtype = ctype.split('/', 1) + msg.add_attachment( + attachment_buffer, maintype=maintype, + subtype=subtype, filename=name) + with open(file_path, 'wb') as fp: + fp.write(msg.as_bytes(policy=policy.default)) + + + def handle_text_message(self, message, folder_path): + file_path = os.path.join(folder_path, safe_path(message.subject)) + with open(file_path, 'wb') as fp: + if message.html_body: + fp.write(message.html_body) + elif message.plain_text_body: + fp.write(message.plain_text_body) + elif message.rtf_body: + fp.write(message.rtf_body) + for index, attachment in enumerate(message.attachments): + name = attachment.name or "attachment-{0}".format(index) + attachment_path = os.path.join(folder_path, safe_path(name)) + with open(attachment_path, 'wb') as fp: + fp.write(attachment.read_buffer(attachment.size)) + + + def check_for_messages(self, folder, folder_path): + for message in folder.sub_messages: + if not message.subject: + continue + if message.transport_headers: + self.handle_email_message(message, folder_path) + else: + self.handle_text_message(message, folder_path) diff --git a/ingestors/util.py b/ingestors/util.py index 70d0b8fbe..69e59802f 100644 --- a/ingestors/util.py +++ b/ingestors/util.py @@ -2,7 +2,7 @@ import shutil from banal import decode_path -from normality import stringify +from normality import stringify, safe_filename from normality.cleaning import remove_unsafe_chars @@ -66,3 +66,7 @@ def remove_directory(file_path): shutil.rmtree(file_path, True) except Exception: pass + + +def safe_path(filename): + return filename.replace('/', ':') From 3634ea7530f058de303005d380df70eed7021608 Mon Sep 17 00:00:00 2001 From: Tarashish Mishra Date: Tue, 18 Sep 2018 17:44:26 +0530 Subject: [PATCH 2/5] Remove unused import --- ingestors/util.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/ingestors/util.py b/ingestors/util.py index 69e59802f..371b04852 100644 --- a/ingestors/util.py +++ b/ingestors/util.py @@ -2,7 +2,7 @@ import shutil from banal import decode_path -from normality import stringify, safe_filename +from normality import stringify from normality.cleaning import remove_unsafe_chars From 83b3b88c483209956d355036b38139c307b0a920 Mon Sep 17 00:00:00 2001 From: Tarashish Mishra Date: Thu, 20 Sep 2018 17:42:37 +0530 Subject: [PATCH 3/5] Create only non-empty folders --- ingestors/support/outlookpst.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/ingestors/support/outlookpst.py b/ingestors/support/outlookpst.py index 8a0fb86f9..fd8bbee9f 100644 --- a/ingestors/support/outlookpst.py +++ b/ingestors/support/outlookpst.py @@ -20,7 +20,6 @@ def folder_traverse(self, parent_folder, parent_path): if not folder.name: continue new_path = os.path.join(parent_path, safe_path(folder.name)) - os.makedirs(new_path) if folder.number_of_sub_folders: self.folder_traverse(folder, new_path) self.check_for_messages(folder, new_path) @@ -31,7 +30,9 @@ def handle_email_message(self, message, folder_path): ) msg = EmailMessage() if message.plain_text_body: - msg.set_content(message.plain_text_body) + msg.set_content( + message.plain_text_body, maintype='text', subtype='plain' + ) if message.html_body: msg.add_alternative( message.html_body, maintype='text', subtype='html' @@ -50,12 +51,16 @@ def handle_email_message(self, message, folder_path): msg.add_attachment( attachment_buffer, maintype=maintype, subtype=subtype, filename=name) + if not os.path.isdir(folder_path): + os.makedirs(folder_path) with open(file_path, 'wb') as fp: fp.write(msg.as_bytes(policy=policy.default)) def handle_text_message(self, message, folder_path): file_path = os.path.join(folder_path, safe_path(message.subject)) + if not os.path.isdir(folder_path): + os.makedirs(folder_path) with open(file_path, 'wb') as fp: if message.html_body: 
fp.write(message.html_body) From 81a9c49e1849893689590b8cb924ed9945bea19a Mon Sep 17 00:00:00 2001 From: Tarashish Mishra Date: Thu, 20 Sep 2018 18:27:01 +0530 Subject: [PATCH 4/5] Install pypff from source --- Dockerfile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Dockerfile b/Dockerfile index 77a3d666a..1135a9250 100644 --- a/Dockerfile +++ b/Dockerfile @@ -23,6 +23,8 @@ RUN apt-get -qq -y update \ poppler-utils poppler-data pst-utils \ # document processing libreoffice \ + # libpff build tools + git autoconf automake autopoint libtool pkg-config \ && apt-get -qq -y autoremove \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* @@ -36,6 +38,9 @@ ENV LANG='en_US.UTF-8' \ LC_ALL='en_US.UTF-8' RUN pip3 install -q --upgrade pip setuptools six wheel +RUN curl -SL "https://github.com/sunu/libpff/archive/master.tar.gz" | tar -xz -C /tmp/ && cd /tmp/libpff-master \ + && ./synclibs.sh && ./autogen.sh && ./configure --enable-python \ + && cd /tmp/libpff-master && python3 setup.py install RUN pip3 install -q banal>=0.3.4 \ normality>=0.5.11 \ celestial>=0.2.3 \ From 4cd6ad13645082429ff51966915318eba1a23a51 Mon Sep 17 00:00:00 2001 From: Tarashish Mishra Date: Thu, 20 Sep 2018 18:28:01 +0530 Subject: [PATCH 5/5] some clean up --- ingestors/email/outlookpst.py | 4 ++-- ingestors/support/outlookpst.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/ingestors/email/outlookpst.py b/ingestors/email/outlookpst.py index c85f8d037..8d05e45ce 100644 --- a/ingestors/email/outlookpst.py +++ b/ingestors/email/outlookpst.py @@ -1,8 +1,6 @@ import logging import os -import pypff - from ingestors.base import Ingestor from ingestors.support.temp import TempFileSupport from ingestors.support.outlookpst import OutlookPSTSupport @@ -27,6 +25,8 @@ def ingest(self, file_path): self.result.flag(self.result.FLAG_PACKAGE) temp_dir = self.make_empty_directory() try: + # if installed with pip, pypff may not be available + import pypff pst_file = 
pypff.open(file_path) root = pst_file.get_root_folder() root_folder_name = root.name or os.path.basename(file_path) diff --git a/ingestors/support/outlookpst.py b/ingestors/support/outlookpst.py index fd8bbee9f..7a2ac5fa1 100644 --- a/ingestors/support/outlookpst.py +++ b/ingestors/support/outlookpst.py @@ -4,10 +4,9 @@ from email.parser import Parser from email import policy -import pypff import magic -from ingestors.util import join_path, safe_path +from ingestors.util import safe_path log = logging.getLogger(__name__)