[core] I/O formats now allow both arg0 formatting and kwargs-based formatting. Starting with 0.4, kwargs-based will be the default (a BC break, but needed for the greater good).

2017-06-05 11:38:11 +02:00
parent c34b86872f
commit e5483de344
8 changed files with 111 additions and 82 deletions
--- a/bonobo/examples/files/csv_handlers.py
+++ b/bonobo/examples/files/csv_handlers.py
@ -2,8 +2,8 @@ import bonobo
 from bonobo.commands.run import get_default_services

 graph = bonobo.Graph(
-    bonobo.CsvReader('datasets/coffeeshops.txt'),
-    print,
+    bonobo.CsvReader('datasets/coffeeshops.txt', headers=('item',)),
+    bonobo.PrettyPrinter(),
 )

 if __name__ == '__main__':
--- a/bonobo/examples/files/json_handlers.py
+++ b/bonobo/examples/files/json_handlers.py
@ -1,15 +1,16 @@
 import bonobo
+from bonobo import Bag
 from bonobo.commands.run import get_default_services


-def get_fields(row):
-    return row['fields']
+def get_fields(**row):
+    return Bag(**row['fields'])


 graph = bonobo.Graph(
    bonobo.JsonReader('datasets/theaters.json'),
    get_fields,
-    bonobo.PrettyPrint(title_keys=('eq_nom_equipement', )),
+    bonobo.PrettyPrinter(),
 )

 if __name__ == '__main__':
--- a/bonobo/examples/files/pickle_handlers.py
+++ b/bonobo/examples/files/pickle_handlers.py
@ -1,10 +1,38 @@
+'''
+This example shows how a different file system service can be injected
+into a transformation (as compressing pickled objects often makes sense
+anyways).  The pickle itself contains a list of lists as follows:
+
+```
+[
+    ['category', 'sms'],
+    ['ham', 'Go until jurong point, crazy..'],
+    ['ham', 'Ok lar... Joking wif u oni...'],
+    ['spam', 'Free entry in 2 a wkly comp to win...'],
+    ['ham', 'U dun say so early hor... U c already then say...'],
+    ['ham', 'Nah I don't think he goes to usf, he lives around here though'],
+    ['spam', 'FreeMsg Hey there darling it's been 3 week's now...'],
+    ...
+]
+```
+
+where the first column categorizes an sms as "ham" or "spam".  The second
+column contains the sms itself.
+
+Data set taken from:
+https://www.kaggle.com/uciml/sms-spam-collection-dataset/downloads/sms-spam-collection-dataset.zip
+
+The transformation (1) reads the pickled data, (2) marks and shortens
+messages categorized as spam, and (3) prints the output.
+
+'''
+
 import bonobo
+from bonobo.commands.run import get_default_services
 from fs.tarfs import TarFS
-import os


-def cleanse_sms(row):
-
+def cleanse_sms(**row):
    if row['category'] == 'spam':
        row['sms_clean'] = '**MARKED AS SPAM** ' + row['sms'][0:50] + (
            '...' if len(row['sms']) > 50 else ''
@ -16,46 +44,21 @@ def cleanse_sms(row):


 graph = bonobo.Graph(
-    bonobo.PickleReader('spam.pkl'
-                        ),  # spam.pkl is within the gzipped tarball
+    # spam.pkl is within the gzipped tarball
+    bonobo.PickleReader('spam.pkl'),
    cleanse_sms,
-    print
+    bonobo.PrettyPrinter(),
 )

-if __name__ == '__main__':
-    '''
-    This example shows how a different file system service can be injected
-    into a transformation (as compressing pickled objects often makes sense
-    anyways).  The pickle itself contains a list of lists as follows:

-    ```
-    [
-        ['category', 'sms'],
-        ['ham', 'Go until jurong point, crazy..'],
-        ['ham', 'Ok lar... Joking wif u oni...'],
-        ['spam', 'Free entry in 2 a wkly comp to win...'],
-        ['ham', 'U dun say so early hor... U c already then say...'],
-        ['ham', 'Nah I don't think he goes to usf, he lives around here though'],
-        ['spam', 'FreeMsg Hey there darling it's been 3 week's now...'],
-        ...
-    ]
-    ```
-
-    where the first column categorizes and sms as "ham" or "spam".  The second
-    column contains the sms itself.
-
-    Data set taken from:
-    https://www.kaggle.com/uciml/sms-spam-collection-dataset/downloads/sms-spam-collection-dataset.zip
-
-    The transformation (1) reads the pickled data, (2) marks and shortens
-    messages categorized as spam, and (3) prints the output.
-    '''
-
-    services = {
+def get_services():
+    return {
        'fs':
-        TarFS(
-            os.path.
-            join(bonobo.get_examples_path(), 'datasets', 'spam.tgz')
-        )
+            TarFS(
+                bonobo.get_examples_path('datasets/spam.tgz')
+            )
    }
-    bonobo.run(graph, services=services)
+
+
+if __name__ == '__main__':
+    bonobo.run(graph, services=get_default_services(__file__))
--- a/bonobo/nodes/io/csv.py
+++ b/bonobo/nodes/io/csv.py
@ -3,10 +3,8 @@ import csv
 from bonobo.config import Option
 from bonobo.config.processors import ContextProcessor
 from bonobo.constants import NOT_MODIFIED
-from bonobo.errors import ConfigurationError, ValidationError
-from bonobo.structs import Bag
+from bonobo.nodes.io.file import FileHandler, FileReader, FileWriter
 from bonobo.util.objects import ValueHolder
-from .file import FileHandler, FileReader, FileWriter


 class CsvHandler(FileHandler):
@ -30,14 +28,6 @@ class CsvHandler(FileHandler):
    headers = Option(tuple)


-def validate_csv_output_format(v):
-    if callable(v):
-        return v
-    if v in {'dict', 'kwargs'}:
-        return v
-    raise ValidationError('Unsupported format {!r}.'.format(v))
-
-
 class CsvReader(CsvHandler, FileReader):
    """
    Reads a CSV and yield the values as dicts.
@ -49,26 +39,17 @@ class CsvReader(CsvHandler, FileReader):
    """

    skip = Option(int, default=0)
-    output_format = Option(validate_csv_output_format, default='dict')

    @ContextProcessor
    def csv_headers(self, context, fs, file):
        yield ValueHolder(self.headers)

-    def get_output_formater(self):
-        if callable(self.output_format):
-            return self.output_format
-        elif isinstance(self.output_format, str):
-            return getattr(self, '_format_as_' + self.output_format)
-        else:
-            raise ConfigurationError('Unsupported format {!r} for {}.'.format(self.output_format, type(self).__name__))
-
    def read(self, fs, file, headers):
        reader = csv.reader(file, delimiter=self.delimiter, quotechar=self.quotechar)
-        formater = self.get_output_formater()

        if not headers.get():
            headers.set(next(reader))
+        _headers = headers.get()

        field_count = len(headers)

@ -78,15 +59,9 @@ class CsvReader(CsvHandler, FileReader):

        for row in reader:
            if len(row) != field_count:
-                raise ValueError('Got a line with %d fields, expecting %d.' % (len(row), field_count, ))
+                raise ValueError('Got a line with %d fields, expecting %d.' % (len(row), field_count,))

-            yield formater(headers.get(), row)
-
-    def _format_as_dict(self, headers, values):
-        return dict(zip(headers, values))
-
-    def _format_as_kwargs(self, headers, values):
-        return Bag(**dict(zip(headers, values)))
+            yield self.get_output(dict(zip(_headers, row)))


 class CsvWriter(CsvHandler, FileWriter):
@ -96,7 +71,8 @@ class CsvWriter(CsvHandler, FileWriter):
        headers = ValueHolder(list(self.headers) if self.headers else None)
        yield writer, headers

-    def write(self, fs, file, lineno, writer, headers, row):
+    def write(self, fs, file, lineno, writer, headers, *args, **kwargs):
+        row = self.get_input(*args, **kwargs)
        if not lineno:
            headers.set(headers.value or row.keys())
            writer.writerow(headers.get())
--- a/bonobo/nodes/io/file.py
+++ b/bonobo/nodes/io/file.py
@ -1,7 +1,9 @@
+from bonobo import settings
 from bonobo.config import Option, Service
 from bonobo.config.configurables import Configurable
 from bonobo.config.processors import ContextProcessor
 from bonobo.constants import NOT_MODIFIED
+from bonobo.structs.bags import Bag
 from bonobo.util.objects import ValueHolder


@ -22,6 +24,8 @@ class FileHandler(Configurable):

    fs = Service('fs')  # type: str

+    ioformat = Option(settings.validate_io_format, default=settings.IOFORMAT)
+
    @ContextProcessor
    def file(self, context, fs):
        with self.open(fs) as file:
@ -30,15 +34,35 @@ class FileHandler(Configurable):
    def open(self, fs):
        return fs.open(self.path, self.mode, encoding=self.encoding)

+    def get_input(self, *args, **kwargs):
+        """
+        Extract the row this handler was called with, according to ``self.ioformat``.
+
+        With the ARG0 format, the row is the single positional argument and is returned as is. With the KWARGS
+        format, the row is the dict of keyword arguments.
+
+        :raises AssertionError: if the call does not match the configured format.
+        :raises NotImplementedError: if ``self.ioformat`` is neither ARG0 nor KWARGS.
+        """
+        if self.ioformat == settings.IOFORMAT_ARG0:
+            # Raised explicitly rather than through ``assert`` so the check is not stripped by ``python -O``;
+            # the exception type and message are kept so existing handlers behave the same.
+            if len(args) != 1 or kwargs:
+                raise AssertionError('ARG0 format implies one arg and no kwargs.')
+            return args[0]
+
+        if self.ioformat == settings.IOFORMAT_KWARGS:
+            # An empty kwargs dict is rejected too, exactly as the original truthiness check did.
+            if args or not kwargs:
+                raise AssertionError('KWARGS format implies no arg.')
+            return kwargs
+
+        raise NotImplementedError('Unsupported format.')
+
+    def get_output(self, row):
+        """
+        Shape a row for the next transformation, according to ``self.ioformat``.
+
+        The ARG0 format hands the row back unchanged, while the KWARGS format expands it into a ``Bag`` built from
+        keyword arguments. Any other format raises ``NotImplementedError``.
+        """
+        if self.ioformat == settings.IOFORMAT_ARG0:
+            return row
+        elif self.ioformat == settings.IOFORMAT_KWARGS:
+            return Bag(**row)
+        else:
+            raise NotImplementedError('Unsupported format.')
+

 class Reader(FileHandler):
    """Abstract component factory for readers.
    """

-    def __call__(self, *args):
-        yield from self.read(*args)
+    def __call__(self, *args, **kwargs):
+        yield from self.read(*args, **kwargs)

-    def read(self, *args):
+    def read(self, *args, **kwargs):
        raise NotImplementedError('Abstract.')


@ -46,10 +70,10 @@ class Writer(FileHandler):
    """Abstract component factory for writers.
    """

-    def __call__(self, *args):
-        return self.write(*args)
+    def __call__(self, *args, **kwargs):
+        # Forward keyword arguments as well: with the kwargs I/O format the row reaches the writer as **kwargs,
+        # so dropping them here would lose the row before write() sees it.
+        return self.write(*args, **kwargs)

-    def write(self, *args):
+    def write(self, *args, **kwargs):
        raise NotImplementedError('Abstract.')


--- a/bonobo/nodes/io/json.py
+++ b/bonobo/nodes/io/json.py
@ -14,7 +14,7 @@ class JsonReader(JsonHandler, FileReader):

    def read(self, fs, file):
        for line in self.loader(file):
-            yield line
+            yield self.get_output(line)


 class JsonWriter(JsonHandler, FileWriter):
--- a/bonobo/nodes/io/pickle.py
+++ b/bonobo/nodes/io/pickle.py
@ -53,7 +53,7 @@ class PickleReader(PickleHandler, FileReader):
            if len(i) != item_count:
                raise ValueError('Received an object with %d items, expecting %d.' % (len(i), item_count, ))

-            yield dict(zip(i)) if is_dict else dict(zip(pickle_headers.value, i))
+            yield self.get_output(dict(zip(i)) if is_dict else dict(zip(pickle_headers.value, i)))


 class PickleWriter(PickleHandler, FileWriter):
--- a/bonobo/settings.py
+++ b/bonobo/settings.py
@ -2,6 +2,8 @@ import os

 import logging

+from bonobo.errors import ValidationError
+

 def to_bool(s):
    if len(s):
@ -23,7 +25,30 @@ QUIET = to_bool(os.environ.get('QUIET', 'f'))
 # Logging level.
 LOGGING_LEVEL = logging.DEBUG if DEBUG else logging.INFO

+# Input/Output format for transformations
+IOFORMAT_ARG0 = 'arg0'
+IOFORMAT_KWARGS = 'kwargs'
+
+IOFORMATS = {
+    IOFORMAT_ARG0,
+    IOFORMAT_KWARGS,
+}
+
+IOFORMAT = os.environ.get('IOFORMAT', IOFORMAT_KWARGS)
+
+
+def validate_io_format(v):
+    """
+    Validate an input/output format value and return it unchanged.
+
+    A value is accepted when it is callable or when it is one of the names listed in ``IOFORMATS``.
+
+    :raises ValidationError: for any other value.
+    """
+    if callable(v) or v in IOFORMATS:
+        return v
+    raise ValidationError('Unsupported format {!r}.'.format(v))
+

 def check():
    if DEBUG and QUIET:
        raise RuntimeError('I cannot be verbose and quiet at the same time.')
+
+    if IOFORMAT not in IOFORMATS:
+        raise RuntimeError('Invalid default input/output format.')
+