Skip to content
This repository was archived by the owner on Dec 31, 2025. It is now read-only.
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ RUN apt-get -qq -y update \
poppler-utils poppler-data pst-utils \
# document processing
libreoffice \
# libpff build tools
git autoconf automake autopoint libtool pkg-config \
&& apt-get -qq -y autoremove \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
Expand All @@ -36,6 +38,9 @@ ENV LANG='en_US.UTF-8' \
LC_ALL='en_US.UTF-8'

RUN pip3 install -q --upgrade pip setuptools six wheel
RUN curl -SL "https://github.com/sunu/libpff/archive/master.tar.gz" | tar -xz -C /tmp/ && cd /tmp/libpff-master \
&& ./synclibs.sh && ./autogen.sh && ./configure --enable-python \
&& cd /tmp/libpff-master && python3 setup.py install
RUN pip3 install -q banal>=0.3.4 \
normality>=0.5.11 \
celestial>=0.2.3 \
Expand Down
1 change: 1 addition & 0 deletions ingestors/email/msg.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
class RFC822Ingestor(Ingestor, EmailSupport):
MIME_TYPES = [
'multipart/mixed',
'multipart/alternative',
'message/rfc822'
]
EXTENSIONS = [
Expand Down
30 changes: 19 additions & 11 deletions ingestors/email/outlookpst.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,19 @@
import logging
import os

from ingestors.base import Ingestor
from ingestors.support.temp import TempFileSupport
from ingestors.support.shell import ShellSupport
from ingestors.support.outlookpst import OutlookPSTSupport
from ingestors.support.ole import OLESupport
from ingestors.directory import DirectoryIngestor
from ingestors.util import join_path


log = logging.getLogger(__name__)


class OutlookPSTIngestor(Ingestor, TempFileSupport, ShellSupport, OLESupport):
class OutlookPSTIngestor(Ingestor, TempFileSupport,
OutlookPSTSupport, OLESupport):
MIME_DEFAULT = 'application/vnd.ms-outlook'
MIME_TYPES = [MIME_DEFAULT]
EXTENSIONS = ['pst', 'ost', 'pab']
Expand All @@ -17,15 +25,15 @@ def ingest(self, file_path):
self.result.flag(self.result.FLAG_PACKAGE)
temp_dir = self.make_empty_directory()
try:
self.exec_command('readpst',
'-e', # make subfolders, files per message
'-D', # include deleted
'-r', # recursive structure
'-8', # utf-8 where possible
'-b',
'-q', # quiet
'-o', temp_dir,
file_path)
# if installed with pip, pypff may not be available
import pypff
pst_file = pypff.open(file_path)
root = pst_file.get_root_folder()
root_folder_name = root.name or os.path.basename(file_path)
root_folder_path = os.path.join(temp_dir, root_folder_name)
os.makedirs(root_folder_path)
self.folder_traverse(root, root_folder_path)
self.check_for_messages(root, root_folder_path)
self.manager.delegate(DirectoryIngestor, self.result, temp_dir)
except Exception:
# Handle partially extracted archives.
Expand Down
84 changes: 84 additions & 0 deletions ingestors/support/outlookpst.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import os
import logging
from email.message import EmailMessage
from email.parser import Parser
from email import policy

import magic

from ingestors.util import safe_path

# Module-level logger, named after this module via ``__name__``.
log = logging.getLogger(__name__)


class OutlookPSTSupport(object):
    """Provides helpers for parsing outlook data files (pst, ost etc).

    The helpers walk a folder tree and write every message they find to
    disk below a target directory. Folder objects are expected to expose
    ``name``, ``sub_folders``, ``number_of_sub_folders`` and
    ``sub_messages``; message objects ``subject``, ``transport_headers``,
    ``plain_text_body``, ``html_body``, ``rtf_body`` and ``attachments``;
    attachment objects ``name``, ``size`` and ``read_buffer()``.
    NOTE(review): these look like pypff objects -- confirm against caller.
    """

    @staticmethod
    def _unique_path(folder_path, stem, extension=''):
        """Return a path inside ``folder_path`` that does not exist yet.

        The first choice is ``stem + extension``; when that name is taken
        a numeric suffix (``-1``, ``-2``, ...) is inserted before the
        extension. Messages sharing a subject, or attachments sharing a
        file name, therefore no longer overwrite each other.
        """
        candidate = os.path.join(folder_path, stem + extension)
        counter = 1
        while os.path.exists(candidate):
            file_name = '{0}-{1}{2}'.format(stem, counter, extension)
            candidate = os.path.join(folder_path, file_name)
            counter += 1
        return candidate

    @staticmethod
    def _split_content_type(ctype):
        """Split a ``maintype/subtype`` string into its two parts.

        Falls back to ``('application', 'octet-stream')`` when ``ctype`` is
        empty or not of that form, so one unrecognised attachment cannot
        abort the extraction with a ``ValueError``.
        """
        maintype, separator, subtype = (ctype or '').partition('/')
        if not (maintype and separator and subtype):
            return 'application', 'octet-stream'
        return maintype, subtype

    @staticmethod
    def _message_stem(message):
        """Return the file name stem used for ``message``.

        Messages without a subject get the placeholder ``untitled``.
        """
        return safe_path(message.subject or 'untitled')

    def folder_traverse(self, parent_folder, parent_path):
        """Recursively extract the messages of all named sub-folders.

        :param parent_folder: folder whose ``sub_folders`` are visited.
        :param parent_path: directory that mirrors ``parent_folder``.

        Sub-folders without a name are skipped, together with everything
        below them. Messages stored directly in ``parent_folder`` are not
        handled here; call :meth:`check_for_messages` for those.
        """
        for folder in parent_folder.sub_folders:
            if not folder.name:
                continue
            new_path = os.path.join(parent_path, safe_path(folder.name))
            if folder.number_of_sub_folders:
                self.folder_traverse(folder, new_path)
            self.check_for_messages(folder, new_path)

    def handle_email_message(self, message, folder_path):
        """Rebuild ``message`` as a MIME e-mail and write it to disk.

        :param message: message with non-empty ``transport_headers``.
        :param folder_path: target directory, created when missing.

        The result is stored as ``<subject>.email``; an existing file of
        that name is kept and a numeric suffix is used instead.
        """
        msg = EmailMessage()
        # The bodies are passed together with a maintype, which the email
        # content manager only accepts for bytes -- presumably the body
        # attributes are bytes; TODO confirm.
        if message.plain_text_body:
            msg.set_content(
                message.plain_text_body, maintype='text', subtype='plain'
            )
        if message.html_body:
            msg.add_alternative(
                message.html_body, maintype='text', subtype='html'
            )
        headers = Parser(policy=policy.default).parsestr(
            message.transport_headers, headersonly=True
        )
        for key, val in headers.items():
            # Headers already present win: this keeps the content headers
            # generated above, and it also means that only the first
            # occurrence of a repeated header is copied.
            if key not in msg:
                msg.add_header(key, val)
        for index, attachment in enumerate(message.attachments):
            name = attachment.name or "attachment-{0}".format(index)
            attachment_buffer = attachment.read_buffer(attachment.size)
            # Sniff the type from the content itself.
            ctype = magic.from_buffer(attachment_buffer, mime=True)
            maintype, subtype = self._split_content_type(ctype)
            msg.add_attachment(
                attachment_buffer, maintype=maintype,
                subtype=subtype, filename=name)
        os.makedirs(folder_path, exist_ok=True)
        file_path = self._unique_path(
            folder_path, self._message_stem(message), '.email'
        )
        with open(file_path, 'wb') as fp:
            fp.write(msg.as_bytes(policy=policy.default))

    def handle_text_message(self, message, folder_path):
        """Write a message without transport headers as plain files.

        :param message: message to extract.
        :param folder_path: target directory, created when missing.

        Exactly one body is written to a file named after the subject: the
        HTML body if present, else the plain text body, else the RTF body.
        The file is created even when all three are empty. Every
        attachment is written next to it under its own name. Existing
        files are never overwritten; a numeric suffix is added instead.
        """
        os.makedirs(folder_path, exist_ok=True)
        file_path = self._unique_path(folder_path, self._message_stem(message))
        with open(file_path, 'wb') as fp:
            if message.html_body:
                fp.write(message.html_body)
            elif message.plain_text_body:
                fp.write(message.plain_text_body)
            elif message.rtf_body:
                fp.write(message.rtf_body)
        for index, attachment in enumerate(message.attachments):
            name = attachment.name or "attachment-{0}".format(index)
            # Keep the extension last so the suffixed name still carries it.
            stem, extension = os.path.splitext(safe_path(name))
            attachment_path = self._unique_path(folder_path, stem, extension)
            with open(attachment_path, 'wb') as fp:
                fp.write(attachment.read_buffer(attachment.size))

    def check_for_messages(self, folder, folder_path):
        """Extract every message stored directly in ``folder``.

        :param folder: folder whose ``sub_messages`` are extracted.
        :param folder_path: directory that receives the files.

        Messages with transport headers are written as e-mails, all others
        as plain files. Messages without a subject are extracted as well,
        under a placeholder name, instead of being dropped.
        """
        for message in folder.sub_messages:
            if message.transport_headers:
                self.handle_email_message(message, folder_path)
            else:
                self.handle_text_message(message, folder_path)
4 changes: 4 additions & 0 deletions ingestors/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,7 @@ def remove_directory(file_path):
shutil.rmtree(file_path, True)
except Exception:
pass


def safe_path(filename):
    """Turn an arbitrary name into a string usable as one path component.

    :param filename: name to sanitise; ``None`` is treated as empty.
    :returns: ``filename`` with every ``/`` replaced by ``:`` and every
        NUL character removed. The special names ``.`` and ``..`` are
        returned as ``_`` and ``__``.
    """
    if filename is None:
        return ''
    # NUL can never be part of a file name; open() rejects it outright.
    cleaned = filename.replace('/', ':').replace('\x00', '')
    if cleaned in ('.', '..'):
        # Joined onto a directory these would address the directory itself
        # or its parent, letting a crafted name escape the target folder.
        return cleaned.replace('.', '_')
    return cleaned