From 372007aa3466dd3d4dc2f432364ba8f220d8da32 Mon Sep 17 00:00:00 2001 From: Rhenan Bartels Date: Wed, 16 May 2018 14:55:52 -0300 Subject: [PATCH 1/4] Replace null bytes with empty spaces --- rows/plugins/plugin_csv.py | 15 ++++++++++++++- tests/data/csv_with_null_bytes.csv | Bin 0 -> 3552 bytes tests/tests_plugin_csv.py | 7 +++++++ 3 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 tests/data/csv_with_null_bytes.csv diff --git a/rows/plugins/plugin_csv.py b/rows/plugins/plugin_csv.py index bcd1904a..cd4937f4 100644 --- a/rows/plugins/plugin_csv.py +++ b/rows/plugins/plugin_csv.py @@ -17,7 +17,7 @@ from __future__ import unicode_literals -from io import BytesIO +from io import BytesIO, BufferedReader import six import unicodecsv @@ -27,6 +27,18 @@ sniffer = unicodecsv.Sniffer() + +class NotNullBytesWrapper(BufferedReader): + + def read(self, *args, **kwargs): + data = super().read(*args, **kwargs) + return data.replace(b'\x00', b'') + + def readline(self, *args, **kwargs): + data = super().readline(*args, **kwargs) + return data.replace(b'\x00', b'') + + if six.PY2: def discover_dialect(sample, encoding=None, @@ -104,6 +116,7 @@ def import_from_csv(filename_or_fobj, encoding='utf-8', dialect=None, """ filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb') + fobj = NotNullBytesWrapper(fobj) if dialect is None: dialect = discover_dialect(sample=read_sample(fobj, sample_size), diff --git a/tests/data/csv_with_null_bytes.csv b/tests/data/csv_with_null_bytes.csv new file mode 100644 index 0000000000000000000000000000000000000000..a43b69524800eec9a08beb7ecc8f70aab5c44819 GIT binary patch literal 3552 zcmeH}L5`a+6ozN(IfV~EH5lTMEIr0ak&=KQi7HiHxPyz30**|Wi}nav^Z?y;RhtkX z8Ff+3EXm4#@0r*iKffRSo#vT@KUH>?FsYA1EC>MZ1-5)@Dif)Mm8G%dF;^C?GW_K3@SW_^VXnB~Pzg7|j0^Lg`b;m%2F*FpxCw zh-dr~@cD@krh9&n>d~?F4@5Cb0-tSYK%h@x%%a`%Dxd^7HnP#b(+KI+2qcWgY)fLZ znnB{PgH4cZV?s&z6ve;=VSQ4(y1tDHQmK@NE-K*KV9ifj5&Qu4o29a>>Oz7EAZd}8 zd8OAGG{|voBU%HBAApB#K#nq(u&+PmuTpPZAFGe)XTX%hRx9@j0qcvhz|2Ea*PIEv zfJBTkn6P*s&@h2Z7Ly=>tE~p~waI-mcu@E})AhCo5J2e3Mo%UP96WR978>~)@E^f< zcFWhD%(3N+cyICIx#J_%!gVpS zyvYdm{b2DeScEJ*3brxg-Vgpuu#MBYxN+KEuy{H}mOVP{2as(%wkL~b*uo>&JFru` zx$m`gA{QjlE{2~V4c9EDK?s9&6vkWA_(1lh8yef~dj-d7oWboZ_hEF~TesCW7)F}1 zjocZC!c_p-X2~|6n-25RfZat0t=L4jl=?=Sk9db2^sotg7hdelHD0IhbRQo70+l66 Ar~m)} literal 0 HcmV?d00001 diff --git a/tests/tests_plugin_csv.py b/tests/tests_plugin_csv.py index 4c3cdc49..e501a721 100644 --- a/tests/tests_plugin_csv.py +++ b/tests/tests_plugin_csv.py @@ -325,3 +325,10 @@ def test_export_callback(self): [x[0][0] for x in myfunc.call_args_list], [3, 6, 9, 10] ) + + def test_issue_273(self): + filename = 'tests/data/csv_with_null_bytes.csv' + # Should not raise Error: line contains NULL byte + table = rows.import_from_csv(filename, encoding='latin-1') + + self.assertEqual(len(table), 9) From 7b1a93412cd945f53495ad98d693ae273ef31cab Mon Sep 17 00:00:00 2001 From: Rhenan Bartels Date: Fri, 25 May 2018 14:31:41 -0300 Subject: [PATCH 2/4] Replace null bytes in csv file for Python 2.x --- rows/plugins/plugin_csv.py | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/rows/plugins/plugin_csv.py b/rows/plugins/plugin_csv.py index cd4937f4..f37412cf 100644 --- a/rows/plugins/plugin_csv.py +++ b/rows/plugins/plugin_csv.py @@ -18,6 +18,7 @@ from __future__ import unicode_literals from io import BytesIO, BufferedReader +from io import open as io_open import six import unicodecsv @@ -28,18 +29,17 @@ sniffer = unicodecsv.Sniffer() -class NotNullBytesWrapper(BufferedReader): - - def read(self, *args, **kwargs): - data = super().read(*args, **kwargs) - return data.replace(b'\x00', b'') +if six.PY2: - def readline(self, *args, **kwargs): - data = super().readline(*args, **kwargs) - return data.replace(b'\x00', b'') + class NotNullBytesWrapper(BufferedReader): + def read(self, *args, **kwargs): + data = super(NotNullBytesWrapper, self).read(*args, **kwargs) + return data.replace(b'\x00', b'') -if six.PY2: + def readline(self, *args, **kwargs): + data = super(NotNullBytesWrapper, self).readline(*args, **kwargs) + return data.replace(b'\x00', b'') def discover_dialect(sample, encoding=None, delimiters=(b',', b';', b'\t', b'|')): @@ -61,6 +61,16 @@ def discover_dialect(sample, encoding=None, elif six.PY3: + class NotNullBytesWrapper(BufferedReader): + + def read(self, *args, **kwargs): + data = super().read(*args, **kwargs) + return data.replace(b'\x00', b'') + + def readline(self, *args, **kwargs): + data = super().readline(*args, **kwargs) + return data.replace(b'\x00', b'') + def discover_dialect(sample, encoding, delimiters=(',', ';', '\t', '|')): """Discover a CSV dialect based on a sample size @@ -116,7 +126,11 @@ def import_from_csv(filename_or_fobj, encoding='utf-8', dialect=None, """ filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb') - fobj = NotNullBytesWrapper(fobj) + + if six.PY2: + fobj = NotNullBytesWrapper(io_open(filename, mode='rb')) + elif six.PY3: + fobj = NotNullBytesWrapper(fobj) if dialect is None: dialect = discover_dialect(sample=read_sample(fobj, sample_size), From 4f5e836b92ac20475915b8aace377ab276fae47e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20=28=40turicas=29?= Date: Sat, 26 May 2018 21:21:08 -0300 Subject: [PATCH 3/4] Add first tests for get_filename_and_fobj Pair programming with @rhenanbartels --- rows/plugins/utils.py | 22 ++++++++++---- tests/tests_plugin_utils.py | 60 ++++++++++++++++++++++++++++++++++++- 2 files changed, 76 insertions(+), 6 deletions(-) diff --git a/rows/plugins/utils.py b/rows/plugins/utils.py index d5c4181d..599f1c64 100644 --- a/rows/plugins/utils.py +++ b/rows/plugins/utils.py @@ -17,6 +17,7 @@ from __future__ import unicode_literals +import io from collections import Iterator, OrderedDict from itertools import chain, islice from unicodedata import normalize @@ -82,13 +83,24 @@ def ipartition(iterable, partition_size): yield data -def get_filename_and_fobj(filename_or_fobj, mode='r', dont_open=False): - if getattr(filename_or_fobj, 'read', None) is not None: +def get_filename_and_fobj(filename_or_fobj, mode='r', dont_open=False, **kwargs): + + # TODO: what if fobj is passed, using a different mode from `mode`? + + if getattr(filename_or_fobj, 'read', None) is not None: # file-like object + filename = getattr(filename_or_fobj, 'name', None) fobj = filename_or_fobj - filename = getattr(fobj, 'name', None) - else: - fobj = open(filename_or_fobj, mode=mode) if not dont_open else None + try: + file_number = fobj.fileno() + except io.UnsupportedOperation: + # Another kind of file object, like `io.BytesIO` + fobj = io.BufferedReader(fobj, **kwargs) # TODO: pass mode + else: # Regular file + fobj = io.open(file_number, mode=mode, **kwargs) + + else: # filename filename = filename_or_fobj + fobj = io.open(filename_or_fobj, mode=mode, **kwargs) if not dont_open else None return filename, fobj diff --git a/tests/tests_plugin_utils.py b/tests/tests_plugin_utils.py index b9798861..576ca19a 100644 --- a/tests/tests_plugin_utils.py +++ b/tests/tests_plugin_utils.py @@ -17,6 +17,7 @@ from __future__ import unicode_literals +import io import itertools import random import tempfile @@ -33,6 +34,8 @@ from rows import fields +get_filename_and_fobj = plugins_utils.get_filename_and_fobj + class GenericUtilsTestCase(unittest.TestCase): def test_slug(self): @@ -339,7 +342,62 @@ def test_export_data(self): result = plugins_utils.export_data(filename_or_fobj, data) self.assertIs(result, data) + +class FilenameFObjTestCase(unittest.TestCase): + # TODO: test other features of this function (example: BytesIO should + # return filename = None) + + def setUp(self): + self.filename = 'tests/data/csv_with_null_bytes.csv' + self.encoding = 'latin1' + self.data = io.open(self.filename, mode='rb').read() + self.decoded_data = self.data.decode(self.encoding) + + def test_get_filename_and_fobj_passing_filename(self): + mode = 'rb' + _, f = get_filename_and_fobj(self.filename, mode=mode) + self.assertTrue(hasattr(f, 'readable') and f.readable()) + self.assertEqual(f.mode, mode) + self.assertEqual(f.read(), self.data) + + def test_get_filename_and_fobj_passing_text_fobj(self): + if six.PY3: + mode = 'r' + fobj = open(self.filename, encoding=self.encoding) + _, f = get_filename_and_fobj(fobj, mode=mode, encoding=self.encoding) + self.assertTrue(hasattr(f, 'readable') and f.readable()) + self.assertEqual(f.mode, mode) + self.assertEqual(f.read(), self.decoded_data) + + mode = 'r' + fobj = io.open(self.filename, encoding=self.encoding) + _, f = get_filename_and_fobj(fobj, mode=mode, encoding=self.encoding) + self.assertTrue(hasattr(f, 'readable') and f.readable()) + self.assertEqual(f.mode, mode) + self.assertEqual(f.read(), self.decoded_data) + + def test_get_filename_and_fobj_passing_bytes_fobj(self): + mode = 'rb' + fobj = open(self.filename, mode=mode) + _, f = get_filename_and_fobj(fobj, mode=mode) + self.assertTrue(hasattr(f, 'readable') and f.readable()) + self.assertEqual(f.mode, mode) + self.assertEqual(f.read(), self.data) + + fobj = io.open(self.filename, mode=mode) + _, f = get_filename_and_fobj(fobj, mode=mode) + self.assertTrue(hasattr(f, 'readable') and f.readable()) + self.assertEqual(f.mode, mode) + self.assertEqual(f.read(), self.data) + + def test_get_filename_and_fobj_passing_BytesIO(self): + mode = 'rb' + fobj = io.BytesIO(self.data) + _, f = get_filename_and_fobj(fobj, mode=mode) + self.assertTrue(hasattr(f, 'readable') and f.readable()) + self.assertEqual(f.mode, mode) + self.assertEqual(f.read(), self.data) + # TODO: test make_header # TODO: test all features of create_table # TODO: test if error is raised if len(row) != len(fields) - # TODO: test get_fobj_and_filename (BytesIO should return filename = None) From b5fa166a78f8c87e7f0fddcd39bdfe11d20763a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20=28=40turicas=29?= Date: Sat, 26 May 2018 21:21:56 -0300 Subject: [PATCH 4/4] Fix CSV's not-null byte wrapper --- rows/plugins/plugin_csv.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/rows/plugins/plugin_csv.py b/rows/plugins/plugin_csv.py index f37412cf..0a1d4228 100644 --- a/rows/plugins/plugin_csv.py +++ b/rows/plugins/plugin_csv.py @@ -17,8 +17,7 @@ from __future__ import unicode_literals -from io import BytesIO, BufferedReader -from io import open as io_open +from io import open as io_open, BytesIO, BufferedReader import six import unicodecsv @@ -163,7 +162,7 @@ def export_to_csv(table, filename_or_fobj=None, encoding='utf-8', else: fobj = BytesIO() - # TODO: may use `io.BufferedWriter` instead of `ipartition` so user can + # TODO: may use `BufferedWriter` instead of `ipartition` so user can # choose the real size (in Bytes) when to flush to the file system, instead # number of rows writer = unicodecsv.writer(fobj, encoding=encoding, dialect=dialect)