From 48293bf9d4f63d676076092650d5348cedf947bc Mon Sep 17 00:00:00 2001 From: Eduardo Cuducos Date: Wed, 5 Jul 2017 18:58:12 -0300 Subject: [PATCH 1/6] Proposal: API for rows.utils.decompress in the form of tests --- rows/utils.py | 5 ++++ tests/tests_utils.py | 58 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/rows/utils.py b/rows/utils.py index b745afdd..86fcff7b 100644 --- a/rows/utils.py +++ b/rows/utils.py @@ -297,3 +297,8 @@ def export_to_uri(table, uri, *args, **kwargs): raise ValueError('Plugin (export) "{}" not found'.format(plugin_name)) return export_function(table, uri, *args, **kwargs) + + +def decompress(path, **kwargs): + 'Given a zip, gzip or lzma file returns a decompressed file object' + pass # TODO diff --git a/tests/tests_utils.py b/tests/tests_utils.py index 80e62efa..498b0334 100644 --- a/tests/tests_utils.py +++ b/tests/tests_utils.py @@ -17,8 +17,12 @@ from __future__ import unicode_literals +import gzip +import lzma +import os import tempfile import unittest +import zipfile import rows.utils @@ -71,3 +75,57 @@ def test_local_file_sample_size(self): # TODO: test normalize_mime_type # TODO: test plugin_name_by_mime_type # TODO: test plugin_name_by_uri + + +class UtilsDecompressTestCase(unittest.TestCase): + + def setUp(self): + self.contents = 'Ahoy' + self.temp = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp.cleanup() + + def test_decompress_with_gz(self): + compressed = os.path.join(self.tmp.name, 'test.gz') + with gzip.open(compressed) as compressed_handler: + compressed_handler.write(self.contents) + decompressed = rows.utils.decompress(compressed) + self.assertEqual(self.contents, decompressed.read()) + + def test_decompress_with_lzma(self): + compressed = os.path.join(self.tmp.name, 'test.lzma') + with lzma.open(compressed) as compressed_handler: + compressed_handler.write(self.contents) + decompressed = rows.utils.decompress(compressed) + self.assertEqual(self.contents, decompressed.read()) + + def test_decompress_with_xz(self): + compressed = os.path.join(self.tmp.name, 'test.gz') + with lzma.open(compressed) as compressed_handler: + compressed_handler.write(self.contents) + decompressed = rows.utils.decompress(compressed) + self.assertEqual(self.contents, decompressed.read()) + + def test_decompress_with_zip(self): + uncompressed = os.path.join(self.tmp.name, 'test.csv') + uncompressed_archived_path = os.path.join('test', 'test.csv') + compressed = os.path.join(self.tmp.name, 'test.zip') + + with open(uncompressed, 'w') as uncompressed_handler: + uncopressed_handler.write(self.contents) + + with zipfile.ZipFile(compressed, mode='w') as handler: + handler.write(uncompressed, arcname=uncompressed_archived_path) + + decompressed = rows.utils.decompress(compressed, + inner=uncompressed_archived_path) + self.assertEqual(self.contents, decompressed.read()) + + @unittest.skip('TODO') + def test_decompress_with_zip_without_inner(self): + pass + + @unittest.skip('TODO') + def test_decompress_with_incompatible_file(self): + pass From d62ba69200990dd9643cdcf90b0ce26493bf8413 Mon Sep 17 00:00:00 2001 From: Eduardo Cuducos Date: Thu, 14 Sep 2017 19:14:59 -0300 Subject: [PATCH 2/6] Remove proposed zip API https://github.com/turicas/rows/issues/230\#issuecomment-328364587 --- tests/tests_utils.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/tests/tests_utils.py b/tests/tests_utils.py index 498b0334..35c68b9d 100644 --- a/tests/tests_utils.py +++ b/tests/tests_utils.py @@ -107,25 +107,6 @@ def test_decompress_with_xz(self): decompressed = rows.utils.decompress(compressed) self.assertEqual(self.contents, decompressed.read()) - def test_decompress_with_zip(self): - uncompressed = os.path.join(self.tmp.name, 'test.csv') - uncompressed_archived_path = os.path.join('test', 'test.csv') - compressed = os.path.join(self.tmp.name, 'test.zip') - - with open(uncompressed, 'w') as uncompressed_handler: - uncopressed_handler.write(self.contents) - - with zipfile.ZipFile(compressed, mode='w') as handler: - handler.write(uncompressed, arcname=uncompressed_archived_path) - - decompressed = rows.utils.decompress(compressed, - inner=uncompressed_archived_path) - self.assertEqual(self.contents, decompressed.read()) - - @unittest.skip('TODO') - def test_decompress_with_zip_without_inner(self): - pass - @unittest.skip('TODO') def test_decompress_with_incompatible_file(self): pass From 78338cfa4cfdb2d19b57e26d8418c43b2bdee01e Mon Sep 17 00:00:00 2001 From: Eduardo Cuducos Date: Thu, 14 Sep 2017 19:16:07 -0300 Subject: [PATCH 3/6] Implement rows.utils.decompress for lzma and gzip --- rows/utils.py | 56 ++++++++++++++++++++++++++++++++++++-------- tests/tests_utils.py | 21 ++++++++++++----- 2 files changed, 61 insertions(+), 16 deletions(-) diff --git a/rows/utils.py b/rows/utils.py index 86fcff7b..8bf4597e 100644 --- a/rows/utils.py +++ b/rows/utils.py @@ -18,6 +18,7 @@ from __future__ import unicode_literals import cgi +import gzip import mimetypes import os import tempfile @@ -27,6 +28,11 @@ except ImportError: from urllib.parse import urlparse # Python 3 +try: + import lzma +except ImportError: + lzma = None + try: import magic except ImportError: @@ -158,14 +164,7 @@ def plugin_name_by_mime_type(mime_type, mime_name, file_extension): None) -def detect_local_source(path, content, mime_type=None, encoding=None): - - # TODO: may add sample_size - - filename = os.path.basename(path) - parts = filename.split('.') - extension = parts[-1] if len(parts) > 1 else None - +def describe_file_type(filename, content, mime_type=None, encoding=None): if magic is not None: detected = magic.detect_from_content(content) encoding = detected.encoding or encoding @@ -177,6 +176,19 @@ def detect_local_source(path, content, mime_type=None, encoding=None): mime_name = None mime_type = mime_type or mimetypes.guess_type(filename)[0] + return mime_type, encoding, mime_name + +def detect_local_source(path, content, mime_type=None, encoding=None): + # TODO: may add sample_size + + filename = os.path.basename(path) + parts = filename.split('.') + extension = parts[-1] if len(parts) > 1 else None + + args = (filename, content) + kwargs = dict(mime_type=mime_type, encoding=encoding) + mime_type, encoding, mime_name = describe_file_type(*args, **kwargs) + plugin_name = plugin_name_by_mime_type(mime_type, mime_name, extension) if encoding == 'binary': encoding = None @@ -300,5 +312,29 @@ def export_to_uri(table, uri, *args, **kwargs): def decompress(path, **kwargs): - 'Given a zip, gzip or lzma file returns a decompressed file object' - pass # TODO + """ + Given a gzip or lzma file returns a decompressed file object. All kwargs + are passed to either `gzip.open` or `lzma.open`. + :param path: (str) path to a gzip or lzma file + """ + filename = os.path.basename(path) + with open(path, 'rb') as handler: + mime_type = describe_file_type(filename, handler.read())[0] + + mapping = { + 'application/gzip': gzip.open, + 'application/gz': gzip.open, + } + if lzma: + mapping.update({ + 'application/x-xz': lzma.open, + 'application/x-lzma': lzma.open, + }) + open_compressed = mapping.get(mime_type) + + if not open_compressed: + msg = "Couldn't identify file mimetype, or lzma module isn't available" + raise RuntimeError(msg) + + with open_compressed(path, **kwargs) as handler: + return handler diff --git a/tests/tests_utils.py b/tests/tests_utils.py index 35c68b9d..570f75b4 100644 --- a/tests/tests_utils.py +++ b/tests/tests_utils.py @@ -18,12 +18,18 @@ from __future__ import unicode_literals import gzip -import lzma import os import tempfile import unittest import zipfile +try: + import lzma +except ImportError: + lzma = None + +import six + import rows.utils import tests.utils as utils @@ -80,7 +86,7 @@ def test_local_file_sample_size(self): class UtilsDecompressTestCase(unittest.TestCase): def setUp(self): - self.contents = 'Ahoy' + self.contents = six.b('Ahoy') self.temp = tempfile.TemporaryDirectory() def tearDown(self): @@ -88,11 +94,12 @@ def tearDown(self): def test_decompress_with_gz(self): compressed = os.path.join(self.tmp.name, 'test.gz') - with gzip.open(compressed) as compressed_handler: + with gzip.open(compressed, mode='wb') as compressed_handler: compressed_handler.write(self.contents) decompressed = rows.utils.decompress(compressed) self.assertEqual(self.contents, decompressed.read()) + @unittest.skipIf(not lzma, 'lzma module not available') def test_decompress_with_lzma(self): compressed = os.path.join(self.tmp.name, 'test.lzma') with lzma.open(compressed) as compressed_handler: @@ -100,13 +107,15 @@ def test_decompress_with_lzma(self): decompressed = rows.utils.decompress(compressed) self.assertEqual(self.contents, decompressed.read()) + @unittest.skipIf(not lzma, 'lzma module not available') def test_decompress_with_xz(self): compressed = os.path.join(self.tmp.name, 'test.gz') - with lzma.open(compressed) as compressed_handler: + with lzma.open(compressed) as compressed_handilsler: compressed_handler.write(self.contents) decompressed = rows.utils.decompress(compressed) self.assertEqual(self.contents, decompressed.read()) - @unittest.skip('TODO') def test_decompress_with_incompatible_file(self): - pass + with self.assertRaises(): + with tempfile.NamedTemporaryFile() as tmp: + rows.utils.decompress(tmp.name) From ac3bdbf34849b9ba9765e43e58aa0cad9fcc9153 Mon Sep 17 00:00:00 2001 From: Eduardo Cuducos Date: Thu, 14 Sep 2017 19:36:08 -0300 Subject: [PATCH 4/6] Implement rows.utils.decompress for bz2 --- rows/utils.py | 42 ++++++++++++++++++++++++++++++++---------- tests/tests_utils.py | 11 +++++++++-- 2 files changed, 41 insertions(+), 12 deletions(-) diff --git a/rows/utils.py b/rows/utils.py index 8bf4597e..9763f378 100644 --- a/rows/utils.py +++ b/rows/utils.py @@ -17,6 +17,7 @@ from __future__ import unicode_literals +import bz2 import cgi import gzip import mimetypes @@ -321,16 +322,37 @@ def decompress(path, **kwargs): with open(path, 'rb') as handler: mime_type = describe_file_type(filename, handler.read())[0] - mapping = { - 'application/gzip': gzip.open, - 'application/gz': gzip.open, - } - if lzma: - mapping.update({ - 'application/x-xz': lzma.open, - 'application/x-lzma': lzma.open, - }) - open_compressed = mapping.get(mime_type) + bz2_mime_types = ( + 'application/bzip2', + 'application/octet-stream', + 'application/x-bz2', + 'application/x-bzip', + 'application/x-compressed', + 'application/x-stuffit' + ) + gzip_mime_types = ( + 'application/gzip', + 'application/x-gzip', + 'application/x-gunzip', + 'application/gzipped', + 'application/gzip-compressed', + 'application/x-compressed', + 'application/x-compress', + 'gzip/document', + 'application/octet-stream' + ) + lzma_mime_types = ( + 'application/x-xz', + 'application/x-lzma' + ) + + open_compressed = None + if mime_type in bz2_mime_types: + open_compressed = bz2.open + if mime_type in gzip_mime_types: + open_compressed = gzip.open + if lzma and mime_type in lzma_mime_types: + open_compressed = lzma.open if not open_compressed: msg = "Couldn't identify file mimetype, or lzma module isn't available" diff --git a/tests/tests_utils.py b/tests/tests_utils.py index 570f75b4..2d6b50e2 100644 --- a/tests/tests_utils.py +++ b/tests/tests_utils.py @@ -17,11 +17,11 @@ from __future__ import unicode_literals +import bz2 import gzip import os import tempfile import unittest -import zipfile try: import lzma @@ -92,6 +92,13 @@ def setUp(self): def tearDown(self): self.temp.cleanup() + def test_decompress_with_bz2(self): + compressed = os.path.join(self.tmp.name, 'test.bz2') + with bz2.open(compressed, mode='wb') as compressed_handler: + compressed_handler.write(self.contents) + decompressed = rows.utils.decompress(compressed) + self.assertEqual(self.contents, decompressed.read()) + def test_decompress_with_gz(self): compressed = os.path.join(self.tmp.name, 'test.gz') with gzip.open(compressed, mode='wb') as compressed_handler: @@ -110,7 +117,7 @@ def test_decompress_with_lzma(self): @unittest.skipIf(not lzma, 'lzma module not available') def test_decompress_with_xz(self): compressed = os.path.join(self.tmp.name, 'test.gz') - with lzma.open(compressed) as compressed_handilsler: + with lzma.open(compressed) as compressed_handler: compressed_handler.write(self.contents) decompressed = rows.utils.decompress(compressed) self.assertEqual(self.contents, decompressed.read()) From f0b786d101a2e1af4a64860d7f43850a53a3513f Mon Sep 17 00:00:00 2001 From: Eduardo Cuducos Date: Thu, 14 Sep 2017 19:40:10 -0300 Subject: [PATCH 5/6] Properly document bz2 in docstrings --- rows/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rows/utils.py b/rows/utils.py index 9763f378..308682df 100644 --- a/rows/utils.py +++ b/rows/utils.py @@ -314,9 +314,9 @@ def export_to_uri(table, uri, *args, **kwargs): def decompress(path, **kwargs): """ - Given a gzip or lzma file returns a decompressed file object. All kwargs - are passed to either `gzip.open` or `lzma.open`. - :param path: (str) path to a gzip or lzma file + Given a bz2, gzip or lzma file returns a decompressed file object. All + kwargs are passed to either `bz2.openn`, `gzip.open` or `lzma.open`. + :param path: (str) path to a bz2, gzip or lzma file """ filename = os.path.basename(path) with open(path, 'rb') as handler: From 758c1b6f5cb20d393dbfd7d149114c8095f713aa Mon Sep 17 00:00:00 2001 From: Eduardo Cuducos Date: Sun, 15 Oct 2017 11:30:55 -0200 Subject: [PATCH 6/6] Refactor decompress function --- rows/utils.py | 67 +++++++++++---------------- tests/tests_utils.py | 108 +++++++++++++++++++++++++++++-------------- 2 files changed, 99 insertions(+), 76 deletions(-) diff --git a/rows/utils.py b/rows/utils.py index 308682df..632bbd7d 100644 --- a/rows/utils.py +++ b/rows/utils.py @@ -57,6 +57,7 @@ pass import rows +from rows.plugins.utils import get_filename_and_fobj # TODO: should get this information from the plugins TEXT_PLAIN = { @@ -312,51 +313,35 @@ def export_to_uri(table, uri, *args, **kwargs): return export_function(table, uri, *args, **kwargs) -def decompress(path, **kwargs): +def decompress(path_or_fobj, algorithm=None, **kwargs): """ Given a bz2, gzip or lzma file returns a decompressed file object. All kwargs are passed to either `bz2.openn`, `gzip.open` or `lzma.open`. - :param path: (str) path to a bz2, gzip or lzma file + :param path_or_fobj: (str) path to a bz2, gzip or lzma file + :param algorithm: (str) either bz2, gzip or lzma """ - filename = os.path.basename(path) - with open(path, 'rb') as handler: - mime_type = describe_file_type(filename, handler.read())[0] - - bz2_mime_types = ( - 'application/bzip2', - 'application/octet-stream', - 'application/x-bz2', - 'application/x-bzip', - 'application/x-compressed', - 'application/x-stuffit' - ) - gzip_mime_types = ( - 'application/gzip', - 'application/x-gzip', - 'application/x-gunzip', - 'application/gzipped', - 'application/gzip-compressed', - 'application/x-compressed', - 'application/x-compress', - 'gzip/document', - 'application/octet-stream' - ) - lzma_mime_types = ( - 'application/x-xz', - 'application/x-lzma' + filename, fobj = get_filename_and_fobj(path_or_fobj, 'rb') + + extension = None + if not algorithm and filename: + _, extension = os.path.splitext(filename) + extension = extension.replace('.', '') + + open_mapping = dict( + bz2=bz2.BZ2File, + gzip=gzip.GzipFile, + gz=gzip.GzipFile, + lzma=getattr(lzma, 'LZMAFile'), # lzma might not be available + xz=getattr(lzma, 'LZMAFile') # lzma might not be available ) - - open_compressed = None - if mime_type in bz2_mime_types: - open_compressed = bz2.open - if mime_type in gzip_mime_types: - open_compressed = gzip.open - if lzma and mime_type in lzma_mime_types: - open_compressed = lzma.open + open_compressed = open_mapping.get(algorithm or extension) if not open_compressed: - msg = "Couldn't identify file mimetype, or lzma module isn't available" - raise RuntimeError(msg) - - with open_compressed(path, **kwargs) as handler: - return handler + raise RuntimeError(( + 'Unknown extension and/or invalid algorithm: options are: bz2, ' + 'gzip, gz, lzma or xz ({})'.format(filename or fobj) + )) + + target = filename or fobj + with open_compressed(target, 'rb', **kwargs) as handler: + return handler.read() diff --git a/tests/tests_utils.py b/tests/tests_utils.py index 2d6b50e2..3ece8273 100644 --- a/tests/tests_utils.py +++ b/tests/tests_utils.py @@ -18,8 +18,10 @@ from __future__ import unicode_literals import bz2 +import contextlib import gzip import os +import shutil import tempfile import unittest @@ -83,46 +85,82 @@ def test_local_file_sample_size(self): # TODO: test plugin_name_by_uri -class UtilsDecompressTestCase(unittest.TestCase): +class TestUtilsDecompress(unittest.TestCase): def setUp(self): - self.contents = six.b('Ahoy') - self.temp = tempfile.TemporaryDirectory() + self.contents = b'I use rows and it is awesome!' + self.temp = tempfile.mkdtemp() def tearDown(self): - self.temp.cleanup() - - def test_decompress_with_bz2(self): - compressed = os.path.join(self.tmp.name, 'test.bz2') - with bz2.open(compressed, mode='wb') as compressed_handler: - compressed_handler.write(self.contents) - decompressed = rows.utils.decompress(compressed) - self.assertEqual(self.contents, decompressed.read()) - - def test_decompress_with_gz(self): - compressed = os.path.join(self.tmp.name, 'test.gz') - with gzip.open(compressed, mode='wb') as compressed_handler: - compressed_handler.write(self.contents) - decompressed = rows.utils.decompress(compressed) - self.assertEqual(self.contents, decompressed.read()) - - @unittest.skipIf(not lzma, 'lzma module not available') - def test_decompress_with_lzma(self): - compressed = os.path.join(self.tmp.name, 'test.lzma') - with lzma.open(compressed) as compressed_handler: - compressed_handler.write(self.contents) - decompressed = rows.utils.decompress(compressed) - self.assertEqual(self.contents, decompressed.read()) - - @unittest.skipIf(not lzma, 'lzma module not available') - def test_decompress_with_xz(self): - compressed = os.path.join(self.tmp.name, 'test.gz') - with lzma.open(compressed) as compressed_handler: - compressed_handler.write(self.contents) - decompressed = rows.utils.decompress(compressed) - self.assertEqual(self.contents, decompressed.read()) + shutil.rmtree(self.temp) + + @contextlib.contextmanager + def _create_file(self, algorithm, extension=True): + extension = '.{}'.format(algorithm.__name__) if extension else '' + filename = 'test{}'.format(extension) + filepath = os.path.join(self.temp, filename) + + open_mapping = { + bz2: bz2.BZ2File, + gzip: gzip.GzipFile, + lzma: getattr(lzma, 'LZMAFile') + } + open_method = open_mapping.get(algorithm) + with open_method(filepath, 'wb') as obj: + obj.write(self.contents) + + with open(filepath, 'rb') as obj: + yield filepath, obj + + def _test_decompress_with_path(self, algorithm): + with self._create_file(algorithm) as path_and_obj: + path, _ = path_and_obj + decompressed = rows.utils.decompress(path) + self.assertEqual(self.contents, decompressed) + + def _test_decompress_with_file_obj(self, algorithm,): + with self._create_file(algorithm) as path_and_obj: + _, obj = path_and_obj + decompressed = rows.utils.decompress(obj) + self.assertEqual(self.contents, decompressed) + + def _test_decompress_without_extension(self, algorithm): + with self._create_file(algorithm, False) as path_and_obj: + path, _ = path_and_obj + decompressed = rows.utils.decompress(path, algorithm.__name__) + self.assertEqual(self.contents, decompressed) + + def test_decompress_bz2_with_path(self): + self._test_decompress_with_path(bz2) + + def test_decompress_gzip_with_path(self): + self._test_decompress_with_path(gzip) + + @unittest.skipIf(not lzma, 'No lzma module available') + def test_decompress_lzma_with_path(self): + self._test_decompress_with_path(lzma) + + def test_decompress_bz2_with_file_object(self): + self._test_decompress_with_file_obj(bz2) + + def test_decompress_gzip_with_file_object(self): + self._test_decompress_with_file_obj(gzip) + + @unittest.skipIf(not lzma, 'No lzma module available') + def test_decompress_lzma_with_file_object(self): + self._test_decompress_with_file_obj(lzma) + + def test_decompress_bz2_without_extension(self): + self._test_decompress_without_extension(bz2) + + def test_decompress_gzip_without_extension(self): + self._test_decompress_without_extension(gzip) + + @unittest.skipIf(not lzma, 'No lzma module available') + def test_decompress_lzma_without_extension(self): + self._test_decompress_without_extension(lzma) def test_decompress_with_incompatible_file(self): - with self.assertRaises(): + with self.assertRaises(RuntimeError): with tempfile.NamedTemporaryFile() as tmp: rows.utils.decompress(tmp.name)