diff --git a/Dockerfile b/Dockerfile
index 77a3d666a..1135a9250 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -23,6 +23,8 @@ RUN apt-get -qq -y update \
     poppler-utils poppler-data pst-utils \
     # document processing
     libreoffice \
+    # libpff build tools
+    git autoconf automake autopoint libtool pkg-config \
     && apt-get -qq -y autoremove \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
@@ -36,6 +38,11 @@ ENV LANG='en_US.UTF-8' \
     LC_ALL='en_US.UTF-8'
 
 RUN pip3 install -q --upgrade pip setuptools six wheel
+RUN curl -fSL "https://github.com/sunu/libpff/archive/master.tar.gz" | tar -xz -C /tmp/ \
+    && cd /tmp/libpff-master \
+    && ./synclibs.sh && ./autogen.sh && ./configure --enable-python \
+    && python3 setup.py install \
+    && cd / && rm -rf /tmp/libpff-master
 RUN pip3 install -q banal>=0.3.4 \
     normality>=0.5.11 \
     celestial>=0.2.3 \
diff --git a/ingestors/email/msg.py b/ingestors/email/msg.py
index 30681dfce..8cb1f4089 100644
--- a/ingestors/email/msg.py
+++ b/ingestors/email/msg.py
@@ -16,6 +16,7 @@ class RFC822Ingestor(Ingestor, EmailSupport):
 
     MIME_TYPES = [
         'multipart/mixed',
+        'multipart/alternative',
         'message/rfc822'
     ]
     EXTENSIONS = [
diff --git a/ingestors/email/outlookpst.py b/ingestors/email/outlookpst.py
index edb84b768..8d05e45ce 100644
--- a/ingestors/email/outlookpst.py
+++ b/ingestors/email/outlookpst.py
@@ -1,11 +1,19 @@
+import logging
+import os
+
 from ingestors.base import Ingestor
 from ingestors.support.temp import TempFileSupport
-from ingestors.support.shell import ShellSupport
+from ingestors.support.outlookpst import OutlookPSTSupport
 from ingestors.support.ole import OLESupport
 from ingestors.directory import DirectoryIngestor
+from ingestors.util import join_path, safe_path
+
+
+log = logging.getLogger(__name__)
 
 
-class OutlookPSTIngestor(Ingestor, TempFileSupport, ShellSupport, OLESupport):
+class OutlookPSTIngestor(Ingestor, TempFileSupport,
+                         OutlookPSTSupport, OLESupport):
     MIME_DEFAULT = 'application/vnd.ms-outlook'
     MIME_TYPES = [MIME_DEFAULT]
     EXTENSIONS = ['pst', 'ost', 'pab']
@@ -17,15 +25,21 @@ def ingest(self, file_path):
         self.result.flag(self.result.FLAG_PACKAGE)
         temp_dir = self.make_empty_directory()
         try:
-            self.exec_command('readpst',
-                              '-e',  # make subfolders, files per message
-                              '-D',  # include deleted
-                              '-r',  # recursive structure
-                              '-8',  # utf-8 where possible
-                              '-b',
-                              '-q',  # quiet
-                              '-o', temp_dir,
-                              file_path)
+            # Imported lazily: pypff may not be available when ingestors
+            # is installed with pip.
+            import pypff
+            pst_file = pypff.open(file_path)
+            try:
+                root = pst_file.get_root_folder()
+                root_folder_name = root.name or os.path.basename(file_path)
+                root_folder_path = os.path.join(
+                    temp_dir, safe_path(root_folder_name))
+                os.makedirs(root_folder_path)
+                self.folder_traverse(root, root_folder_path)
+                self.check_for_messages(root, root_folder_path)
+            finally:
+                # Release the file handle even when extraction fails.
+                pst_file.close()
             self.manager.delegate(DirectoryIngestor, self.result, temp_dir)
         except Exception:
             # Handle partially extracted archives.
diff --git a/ingestors/support/outlookpst.py b/ingestors/support/outlookpst.py
new file mode 100644
index 000000000..7a2ac5fa1
--- /dev/null
+++ b/ingestors/support/outlookpst.py
@@ -0,0 +1,119 @@
+import os
+import logging
+from email.message import EmailMessage
+from email.parser import Parser
+from email import policy
+
+import magic
+
+from ingestors.util import safe_path
+
+log = logging.getLogger(__name__)
+
+
+class OutlookPSTSupport(object):
+    """Provides helpers for parsing outlook data files (pst, ost etc)."""
+
+    def _unique_path(self, folder_path, stem, suffix=''):
+        """Return a path below ``folder_path`` that does not exist yet.
+
+        ``folder_path`` is created when missing. If ``stem + suffix`` is
+        already taken, a counter is appended to the stem so that messages
+        or attachments sharing a name do not overwrite each other.
+        """
+        if not os.path.isdir(folder_path):
+            os.makedirs(folder_path)
+        stem = safe_path(stem)
+        candidate = os.path.join(folder_path, stem + suffix)
+        counter = 1
+        while os.path.exists(candidate):
+            file_name = '{0}-{1}{2}'.format(stem, counter, suffix)
+            candidate = os.path.join(folder_path, file_name)
+            counter += 1
+        return candidate
+
+    def folder_traverse(self, parent_folder, parent_path):
+        """Recursively extract the named sub-folders of ``parent_folder``.
+
+        Sub-folders without a name are skipped.
+        """
+        for folder in parent_folder.sub_folders:
+            if not folder.name:
+                continue
+            new_path = os.path.join(parent_path, safe_path(folder.name))
+            if folder.number_of_sub_folders:
+                self.folder_traverse(folder, new_path)
+            self.check_for_messages(folder, new_path)
+
+    def handle_email_message(self, message, folder_path, name=None):
+        """Write ``message`` into ``folder_path`` as a ``.email`` file.
+
+        The bodies, transport headers and attachments are assembled into
+        a MIME message. ``name`` is the file name stem and defaults to
+        the message subject.
+        """
+        file_path = self._unique_path(
+            folder_path, name or message.subject, '.email'
+        )
+        msg = EmailMessage()
+        if message.plain_text_body:
+            msg.set_content(
+                message.plain_text_body, maintype='text', subtype='plain'
+            )
+        if message.html_body:
+            msg.add_alternative(
+                message.html_body, maintype='text', subtype='html'
+            )
+        headers = Parser(policy=policy.default).parsestr(
+            message.transport_headers, headersonly=True
+        )
+        for key, val in headers.items():
+            # Keep the MIME headers generated above for the bodies.
+            if key not in msg:
+                msg.add_header(key, val)
+        for index, attachment in enumerate(message.attachments):
+            label = attachment.name or "attachment-{0}".format(index)
+            attachment_buffer = attachment.read_buffer(attachment.size)
+            # Sniff the MIME type from the attachment content.
+            ctype = magic.from_buffer(attachment_buffer, mime=True)
+            maintype, subtype = ctype.split('/', 1)
+            msg.add_attachment(
+                attachment_buffer, maintype=maintype,
+                subtype=subtype, filename=label)
+        with open(file_path, 'wb') as fp:
+            fp.write(msg.as_bytes(policy=policy.default))
+
+    def handle_text_message(self, message, folder_path, name=None):
+        """Write the body and attachments of ``message`` as plain files.
+
+        The first non-empty body among HTML, plain text and RTF is written
+        to a file whose name is ``name`` (default: the message subject).
+        Attachments are stored next to it in ``folder_path``.
+        """
+        file_path = self._unique_path(folder_path, name or message.subject)
+        body = (message.html_body or message.plain_text_body
+                or message.rtf_body)
+        with open(file_path, 'wb') as fp:
+            if body:
+                fp.write(body)
+        for index, attachment in enumerate(message.attachments):
+            label = attachment.name or "attachment-{0}".format(index)
+            # Split off the extension so a de-duplicated name keeps it.
+            stem, ext = os.path.splitext(safe_path(label))
+            attachment_path = self._unique_path(folder_path, stem, ext)
+            with open(attachment_path, 'wb') as fp:
+                fp.write(attachment.read_buffer(attachment.size))
+
+    def check_for_messages(self, folder, folder_path):
+        """Extract all messages stored directly in ``folder``.
+
+        Messages with transport headers are written as emails, all others
+        as plain files. A message without a subject gets a positional
+        ``message-N`` name instead of being dropped.
+        """
+        for index, message in enumerate(folder.sub_messages):
+            name = message.subject or 'message-{0}'.format(index)
+            if message.transport_headers:
+                self.handle_email_message(message, folder_path, name)
+            else:
+                self.handle_text_message(message, folder_path, name)
diff --git a/ingestors/util.py b/ingestors/util.py
index 70d0b8fbe..371b04852 100644
--- a/ingestors/util.py
+++ b/ingestors/util.py
@@ -66,3 +66,16 @@ def remove_directory(file_path):
         shutil.rmtree(file_path, True)
     except Exception:
         pass
+
+
+def safe_path(filename):
+    """Turn ``filename`` into a name usable as one path component.
+
+    Slashes become ``:`` and NUL bytes are dropped. Empty names and the
+    special names ``.`` and ``..`` get a ``_`` prefix, so that names read
+    from untrusted files cannot address the parent directory.
+    """
+    name = filename.replace('/', ':').replace('\x00', '')
+    if name in ('', '.', '..'):
+        name = '_' + name
+    return name