[core] I/O formats now allow both arg0-based and kwargs-based formatting. Starting with 0.4, kwargs-based will be the default (this is a BC break, but it is needed for the greater good).

2017-06-05 11:38:11 +02:00
parent c34b86872f
commit e5483de344
8 changed files with 111 additions and 82 deletions
--- a/bonobo/examples/files/csv_handlers.py
+++ b/bonobo/examples/files/csv_handlers.py
@ -2,8 +2,8 @@ import bonobo
 from bonobo.commands.run import get_default_services
 graph = bonobo.Graph(
-    bonobo.CsvReader('datasets/coffeeshops.txt'),
+    bonobo.CsvReader('datasets/coffeeshops.txt', headers=('item',)),
-    print,
+    bonobo.PrettyPrinter(),
 )
 if __name__ == '__main__':
--- a/bonobo/examples/files/json_handlers.py
+++ b/bonobo/examples/files/json_handlers.py
@ -1,15 +1,16 @@
 import bonobo
 from bonobo import Bag
 from bonobo.commands.run import get_default_services
-def get_fields(row):
+def get_fields(**row):
-    return row['fields']
+    return Bag(**row['fields'])
 graph = bonobo.Graph(
    bonobo.JsonReader('datasets/theaters.json'),
    get_fields,
-    bonobo.PrettyPrint(title_keys=('eq_nom_equipement', )),
+    bonobo.PrettyPrinter(),
 )
 if __name__ == '__main__':
--- a/bonobo/examples/files/pickle_handlers.py
+++ b/bonobo/examples/files/pickle_handlers.py
@ -1,10 +1,38 @@
 '''
 This example shows how a different file system service can be injected
 into a transformation (as compressing pickled objects often makes sense
 anyways).  The pickle itself contains a list of lists as follows:
 ```
 [
    ['category', 'sms'],
    ['ham', 'Go until jurong point, crazy..'],
    ['ham', 'Ok lar... Joking wif u oni...'],
    ['spam', 'Free entry in 2 a wkly comp to win...'],
    ['ham', 'U dun say so early hor... U c already then say...'],
    ['ham', 'Nah I don't think he goes to usf, he lives around here though'],
    ['spam', 'FreeMsg Hey there darling it's been 3 week's now...'],
    ...
 ]
 ```
 where the first column categorizes an sms as "ham" or "spam".  The second
 column contains the sms itself.
 Data set taken from:
 https://www.kaggle.com/uciml/sms-spam-collection-dataset/downloads/sms-spam-collection-dataset.zip
 The transformation (1) reads the pickled data, (2) marks and shortens
 messages categorized as spam, and (3) prints the output.
 '''
 import bonobo
 from bonobo.commands.run import get_default_services
 from fs.tarfs import TarFS
 import os
-def cleanse_sms(row):
+def cleanse_sms(**row):
    if row['category'] == 'spam':
        row['sms_clean'] = '**MARKED AS SPAM** ' + row['sms'][0:50] + (
            '...' if len(row['sms']) > 50 else ''
@ -16,46 +44,21 @@ def cleanse_sms(row):
 graph = bonobo.Graph(
-    bonobo.PickleReader('spam.pkl'
+    # spam.pkl is within the gzipped tarball
-                        ),  # spam.pkl is within the gzipped tarball
+    bonobo.PickleReader('spam.pkl'),
    cleanse_sms,
-    print
+    bonobo.PrettyPrinter(),
 )
 if __name__ == '__main__':
    '''
    This example shows how a different file system service can be injected
    into a transformation (as compressing pickled objects often makes sense
    anyways).  The pickle itself contains a list of lists as follows:
-    ```
+def get_services():
-    [
+    return {
        ['category', 'sms'],
        ['ham', 'Go until jurong point, crazy..'],
        ['ham', 'Ok lar... Joking wif u oni...'],
        ['spam', 'Free entry in 2 a wkly comp to win...'],
        ['ham', 'U dun say so early hor... U c already then say...'],
        ['ham', 'Nah I don't think he goes to usf, he lives around here though'],
        ['spam', 'FreeMsg Hey there darling it's been 3 week's now...'],
        ...
    ]
    ```
    where the first column categorizes and sms as "ham" or "spam".  The second
    column contains the sms itself.
    Data set taken from:
    https://www.kaggle.com/uciml/sms-spam-collection-dataset/downloads/sms-spam-collection-dataset.zip
    The transformation (1) reads the pickled data, (2) marks and shortens
    messages categorized as spam, and (3) prints the output.
    '''
    services = {
        'fs':
-        TarFS(
+            TarFS(
-            os.path.
+                bonobo.get_examples_path('datasets/spam.tgz')
-            join(bonobo.get_examples_path(), 'datasets', 'spam.tgz')
+            )
        )
    }
-    bonobo.run(graph, services=services)
+
 if __name__ == '__main__':
    bonobo.run(graph, services=get_default_services(__file__))
--- a/bonobo/nodes/io/csv.py
+++ b/bonobo/nodes/io/csv.py
@ -3,10 +3,8 @@ import csv
 from bonobo.config import Option
 from bonobo.config.processors import ContextProcessor
 from bonobo.constants import NOT_MODIFIED
-from bonobo.errors import ConfigurationError, ValidationError
+from bonobo.nodes.io.file import FileHandler, FileReader, FileWriter
 from bonobo.structs import Bag
 from bonobo.util.objects import ValueHolder
 from .file import FileHandler, FileReader, FileWriter
 class CsvHandler(FileHandler):
@ -30,14 +28,6 @@ class CsvHandler(FileHandler):
    headers = Option(tuple)
 def validate_csv_output_format(v):
    """Return *v* unchanged when it is an acceptable CSV output format.

    A value is accepted when it is callable, or when it is one of the
    strings ``'dict'`` or ``'kwargs'``.

    Raises ValidationError for any other value.
    """
    if not callable(v) and v not in {'dict', 'kwargs'}:
        raise ValidationError('Unsupported format {!r}.'.format(v))
    return v
 class CsvReader(CsvHandler, FileReader):
    """
    Reads a CSV and yield the values as dicts.
@ -49,26 +39,17 @@ class CsvReader(CsvHandler, FileReader):
    """
    skip = Option(int, default=0)
    output_format = Option(validate_csv_output_format, default='dict')
    @ContextProcessor
    def csv_headers(self, context, fs, file):
        """Yield a ``ValueHolder`` wrapping the configured ``headers`` option."""
        holder = ValueHolder(self.headers)
        yield holder
    def get_output_formater(self):
        """Resolve the ``output_format`` option to a formatter.

        A callable option is returned as is. A string option selects the
        attribute named ``_format_as_<option>`` on this instance.

        Raises ConfigurationError when the option is neither callable nor a
        string.
        """
        fmt = self.output_format
        if callable(fmt):
            return fmt
        if isinstance(fmt, str):
            return getattr(self, '_format_as_' + fmt)
        raise ConfigurationError('Unsupported format {!r} for {}.'.format(fmt, type(self).__name__))
    def read(self, fs, file, headers):
        reader = csv.reader(file, delimiter=self.delimiter, quotechar=self.quotechar)
        formater = self.get_output_formater()
        if not headers.get():
            headers.set(next(reader))
        _headers = headers.get()
        field_count = len(headers)
@ -78,15 +59,9 @@ class CsvReader(CsvHandler, FileReader):
        for row in reader:
            if len(row) != field_count:
-                raise ValueError('Got a line with %d fields, expecting %d.' % (len(row), field_count, ))
+                raise ValueError('Got a line with %d fields, expecting %d.' % (len(row), field_count,))
-            yield formater(headers.get(), row)
+            yield self.get_output(dict(zip(_headers, row)))
    def _format_as_dict(self, headers, values):
        return dict(zip(headers, values))
    def _format_as_kwargs(self, headers, values):
        """Pair headers with values and pass them to ``Bag`` as keyword arguments."""
        fields = dict(zip(headers, values))
        return Bag(**fields)
 class CsvWriter(CsvHandler, FileWriter):
@ -96,7 +71,8 @@ class CsvWriter(CsvHandler, FileWriter):
        headers = ValueHolder(list(self.headers) if self.headers else None)
        yield writer, headers
-    def write(self, fs, file, lineno, writer, headers, row):
+    def write(self, fs, file, lineno, writer, headers, *args, **kwargs):
        row = self.get_input(*args, **kwargs)
        if not lineno:
            headers.set(headers.value or row.keys())
            writer.writerow(headers.get())
--- a/bonobo/nodes/io/file.py
+++ b/bonobo/nodes/io/file.py
@ -1,7 +1,9 @@
 from bonobo import settings
 from bonobo.config import Option, Service
 from bonobo.config.configurables import Configurable
 from bonobo.config.processors import ContextProcessor
 from bonobo.constants import NOT_MODIFIED
 from bonobo.structs.bags import Bag
 from bonobo.util.objects import ValueHolder
@ -22,6 +24,8 @@ class FileHandler(Configurable):
    fs = Service('fs')  # type: str
    ioformat = Option(settings.validate_io_format, default=settings.IOFORMAT)
    @ContextProcessor
    def file(self, context, fs):
        with self.open(fs) as file:
@ -30,15 +34,35 @@ class FileHandler(Configurable):
    def open(self, fs):
        """Open ``self.path`` on *fs* with the configured mode and encoding.

        Returns whatever ``fs.open`` returns.
        """
        handle = fs.open(self.path, self.mode, encoding=self.encoding)
        return handle
    def get_input(self, *args, **kwargs):
        """Extract the input row from the call arguments per ``self.ioformat``.

        With the ARG0 format the row is the single positional argument; with
        the KWARGS format the row is the keyword-argument dict itself (at
        least one keyword is expected).

        Raises NotImplementedError for any other ``ioformat`` value.
        """
        # NOTE(review): settings.validate_io_format also accepts callables,
        # which fall through to NotImplementedError here -- confirm intended.
        fmt = self.ioformat

        if fmt == settings.IOFORMAT_ARG0:
            assert len(args) == 1 and not kwargs, 'ARG0 format implies one arg and no kwargs.'
            return args[0]

        if fmt == settings.IOFORMAT_KWARGS:
            assert not args and kwargs, 'KWARGS format implies no arg.'
            return kwargs

        raise NotImplementedError('Unsupported format.')
    def get_output(self, row):
        """Shape an output *row* according to ``self.ioformat``.

        ARG0 returns the row untouched; KWARGS expands it into a ``Bag`` as
        keyword arguments.

        Raises NotImplementedError for any other ``ioformat`` value.
        """
        fmt = self.ioformat

        if fmt == settings.IOFORMAT_ARG0:
            return row

        if fmt == settings.IOFORMAT_KWARGS:
            return Bag(**row)

        raise NotImplementedError('Unsupported format.')
 class Reader(FileHandler):
    """Abstract component factory for readers.
    """
-    def __call__(self, *args):
+    def __call__(self, *args, **kwargs):
-        yield from self.read(*args)
+        yield from self.read(*args, **kwargs)
-    def read(self, *args):
+    def read(self, *args, **kwargs):
        raise NotImplementedError('Abstract.')
@ -46,10 +70,10 @@ class Writer(FileHandler):
    """Abstract component factory for writers.
    """
-    def __call__(self, *args):
-        return self.write(*args)
+    def __call__(self, *args, **kwargs):
+        # Forward keyword arguments as well: the signature accepts them, and
+        # dropping them here would keep them from ever reaching write().
+        return self.write(*args, **kwargs)
-    def write(self, *args):
+    def write(self, *args, **kwargs):
        raise NotImplementedError('Abstract.')
--- a/bonobo/nodes/io/json.py
+++ b/bonobo/nodes/io/json.py
@ -14,7 +14,7 @@ class JsonReader(JsonHandler, FileReader):
    def read(self, fs, file):
        for line in self.loader(file):
-            yield line
+            yield self.get_output(line)
 class JsonWriter(JsonHandler, FileWriter):
--- a/bonobo/nodes/io/pickle.py
+++ b/bonobo/nodes/io/pickle.py
@ -53,7 +53,7 @@ class PickleReader(PickleHandler, FileReader):
            if len(i) != item_count:
                raise ValueError('Received an object with %d items, expecting %d.' % (len(i), item_count, ))
-            yield dict(zip(i)) if is_dict else dict(zip(pickle_headers.value, i))
+            yield self.get_output(dict(zip(i)) if is_dict else dict(zip(pickle_headers.value, i)))
 class PickleWriter(PickleHandler, FileWriter):
--- a/bonobo/settings.py
+++ b/bonobo/settings.py
@ -2,6 +2,8 @@ import os
 import logging
 from bonobo.errors import ValidationError
 def to_bool(s):
    if len(s):
@ -23,7 +25,30 @@ QUIET = to_bool(os.environ.get('QUIET', 'f'))
 # Logging level.
 LOGGING_LEVEL = logging.DEBUG if DEBUG else logging.INFO
 # Input/Output format for transformations
 IOFORMAT_ARG0 = 'arg0'
 IOFORMAT_KWARGS = 'kwargs'
 IOFORMATS = {
    IOFORMAT_ARG0,
    IOFORMAT_KWARGS,
 }
 IOFORMAT = os.environ.get('IOFORMAT', IOFORMAT_KWARGS)
 def validate_io_format(v):
    """Return *v* unchanged when it is a usable input/output format.

    Callables are accepted as is; any other value must be a member of
    ``IOFORMATS``.

    Raises ValidationError for any other value.
    """
    accepted = callable(v) or v in IOFORMATS
    if accepted:
        return v
    raise ValidationError('Unsupported format {!r}.'.format(v))
 def check():
    """Sanity-check the module-level settings.

    Raises RuntimeError when both ``DEBUG`` and ``QUIET`` are set, or when
    ``IOFORMAT`` is not one of ``IOFORMATS``.
    """
    verbose_and_quiet = DEBUG and QUIET
    if verbose_and_quiet:
        raise RuntimeError('I cannot be verbose and quiet at the same time.')

    known_format = IOFORMAT in IOFORMATS
    if not known_format:
        raise RuntimeError('Invalid default input/output format.')