From e77fbae38b6e6e2e08a9758296ae08c71b333061 Mon Sep 17 00:00:00 2001
From: Romain Dorgueil
Date: Wed, 28 Dec 2016 12:27:23 +0100
Subject: [PATCH] Implements naive csv reader / writer (#2)

---
Amended during review. The blob ids on the "index" lines are those of the patch as submitted.

  * bonobo/io/csv.py: CsvReader.read() passes a default to next(), so an empty file (or a "skip"
    larger than the file) ends the output instead of leaking StopIteration out of the generator,
    which python 3.7+ reports as a RuntimeError (PEP 479).
  * bonobo/io/csv.py: the header fallback is a conditional expression instead of "and ... or".
  * bonobo/io/csv.py: CsvWriter keeps the keys of the first row as a list, not as a live dict view.
  * bonobo/io/csv.py: docstrings added, grammar fixed ("yield" -> "yields").
  * tests/io/test_csv.py: "json" leftovers renamed to "csv" (test name, temporary file names).

 bonobo/ext/jupyter/plugin.py |   7 +-
 bonobo/io/csv.py             | 123 +++++++++++++++++++++++++++++++++++
 tests/io/test_csv.py         |  65 ++++++++++++++++++
 3 files changed, 191 insertions(+), 4 deletions(-)
 create mode 100644 bonobo/io/csv.py
 create mode 100644 tests/io/test_csv.py

diff --git a/bonobo/ext/jupyter/plugin.py b/bonobo/ext/jupyter/plugin.py
index b19c880..63483ce 100644
--- a/bonobo/ext/jupyter/plugin.py
+++ b/bonobo/ext/jupyter/plugin.py
@@ -6,10 +6,9 @@ try:
 except ImportError as e:
     import logging
 
-    logging.exception(
-        'You must install Jupyter to use the bonobo Jupyter extension. Easiest way is to install the '
-        'optional "jupyter" dependencies with «pip install bonobo[jupyter]», but you can also install a '
-        'specific version by yourself.')
+    logging.exception('You must install Jupyter to use the bonobo Jupyter extension. Easiest way is to install the '
+                      'optional "jupyter" dependencies with «pip install bonobo[jupyter]», but you can also install a '
+                      'specific version by yourself.')
 
 
 class JupyterOutputPlugin(Plugin):
diff --git a/bonobo/io/csv.py b/bonobo/io/csv.py
new file mode 100644
index 0000000..aae58d5
--- /dev/null
+++ b/bonobo/io/csv.py
@@ -0,0 +1,123 @@
+import csv
+from copy import copy
+
+from .file import FileReader, FileWriter, FileHandler
+
+
+class CsvHandler(FileHandler):
+    """Holds the CSV settings shared by the reader and the writer, with their default values."""
+
+    delimiter = ';'
+    quotechar = '"'
+    headers = None
+
+
+class CsvReader(CsvHandler, FileReader):
+    """
+    Reads a CSV and yields the values as dicts.
+
+    .. attribute:: delimiter
+
+        The CSV delimiter.
+
+    .. attribute:: quotechar
+
+        The CSV quote character.
+
+    .. attribute:: headers
+
+        The list of column names, if the CSV does not contain it as its first line.
+
+    .. attribute:: skip
+
+        The amount of lines to skip before it actually yields output.
+
+    """
+
+    skip = 0
+
+    def __init__(self, path_or_buf, delimiter=None, quotechar=None, headers=None, skip=None):
+        super().__init__(path_or_buf)
+
+        # Falsy arguments fall back to the class-level defaults.
+        self.delimiter = str(delimiter or self.delimiter)
+        self.quotechar = quotechar or self.quotechar
+        self.headers = headers or self.headers
+        self.skip = skip or self.skip
+
+    @property
+    def has_headers(self):
+        """Whether column names were configured, in which case the first CSV line is read as data."""
+        return bool(self.headers)
+
+    def read(self, ctx):
+        """
+        Yields one dict per CSV row read from ``ctx.file``, keyed by column name.
+
+        :raises ValueError: if a row does not have as many fields as there are column names.
+        """
+        reader = csv.reader(ctx.file, delimiter=self.delimiter, quotechar=self.quotechar)
+
+        # next() gets a default so that running out of lines just ends the output: a StopIteration escaping from a
+        # generator body is turned into a RuntimeError since python 3.7 (PEP 479).
+        headers = self.headers if self.has_headers else next(reader, [])
+        field_count = len(headers)
+
+        if self.skip and self.skip > 0:
+            for i in range(0, self.skip):
+                next(reader, None)
+
+        for row in reader:
+            if len(row) != field_count:
+                raise ValueError('Got a line with %d fields, expecting %d.' % (
+                    len(row),
+                    field_count, ))
+
+            yield dict(zip(headers, row))
+
+
+class CsvWriter(CsvHandler, FileWriter):
+    """
+    Writes dict-like rows to a CSV, starting with a header line.
+
+    .. attribute:: delimiter
+
+        The CSV delimiter.
+
+    .. attribute:: quotechar
+
+        The CSV quote character.
+
+    .. attribute:: headers
+
+        The list of column names to write. Defaults to the keys of the first row received.
+
+    """
+
+    def __init__(self, path_or_buf, delimiter=None, quotechar=None, headers=None):
+        super().__init__(path_or_buf)
+
+        self.delimiter = str(delimiter or self.delimiter)
+        self.quotechar = quotechar or self.quotechar
+        self.headers = headers or self.headers
+
+    def initialize(self, ctx):
+        """Attaches the csv writer and the per-execution header state to the context."""
+        super().initialize(ctx)
+        ctx.writer = csv.writer(ctx.file, delimiter=self.delimiter, quotechar=self.quotechar)
+        ctx.headers = copy(self.headers)
+        ctx.first = True
+
+    def write(self, ctx, row):
+        """Writes one row, preceded by the header line for the first one. Keys outside the headers are ignored."""
+        if ctx.first:
+            # Freeze the key view into a list: a dict view would follow any later change made to that first row.
+            ctx.headers = ctx.headers or list(row.keys())
+            ctx.writer.writerow(ctx.headers)
+            ctx.first = False
+        ctx.writer.writerow(row[header] for header in ctx.headers)
+
+    def finalize(self, ctx):
+        """Removes what initialize() attached to the context, then lets the parent class finalize."""
+        del ctx.headers, ctx.writer, ctx.first
+        super().finalize(ctx)
diff --git a/tests/io/test_csv.py b/tests/io/test_csv.py
new file mode 100644
index 0000000..96def96
--- /dev/null
+++ b/tests/io/test_csv.py
@@ -0,0 +1,65 @@
+import pytest
+
+from bonobo import Bag
+from bonobo.core.contexts import ComponentExecutionContext
+from bonobo.io.csv import CsvReader, CsvWriter
+from bonobo.util.testing import CapturingComponentExecutionContext
+from bonobo.util.tokens import BEGIN, END
+
+
+def test_write_csv_to_file(tmpdir):
+    file = tmpdir.join('output.csv')
+    writer = CsvWriter(str(file))
+    context = ComponentExecutionContext(writer, None)
+
+    context.initialize()
+    context.recv(BEGIN, Bag({'foo': 'bar'}), Bag({'foo': 'baz', 'ignore': 'this'}), END)
+    context.step()
+    context.step()
+    context.finalize()
+
+    assert file.read() == 'foo\nbar\nbaz\n'
+
+    with pytest.raises(AttributeError):
+        getattr(context, 'file')
+
+
+def test_write_csv_without_initializer_should_not_work(tmpdir):
+    file = tmpdir.join('output.csv')
+    writer = CsvWriter(str(file))
+
+    context = ComponentExecutionContext(writer, None)
+    with pytest.raises(AttributeError):
+        writer(context, {'foo': 'bar'})
+
+
+def test_read_csv_from_file(tmpdir):
+    file = tmpdir.join('input.csv')
+    file.write('a,b,c\na foo,b foo,c foo\na bar,b bar,c bar')
+
+    reader = CsvReader(str(file), delimiter=',')
+
+    context = CapturingComponentExecutionContext(reader, None)
+
+    context.initialize()
+    context.recv(BEGIN, Bag(), END)
+    context.step()
+    context.finalize()
+
+    assert len(context.send.mock_calls) == 2
+
+    args0, kwargs0 = context.send.call_args_list[0]
+    assert len(args0) == 1 and not len(kwargs0)
+    args1, kwargs1 = context.send.call_args_list[1]
+    assert len(args1) == 1 and not len(kwargs1)
+
+    assert args0[0].args[0] == {
+        'a': 'a foo',
+        'b': 'b foo',
+        'c': 'c foo',
+    }
+    assert args1[0].args[0] == {
+        'a': 'a bar',
+        'b': 'b bar',
+        'c': 'c bar',
+    }