[core] I/O formats allowing both arg0 formating and kwargs based. Starting with 0.4, kwargs based will be default (BC break here, but needed for the greater good).

This commit is contained in:
Romain Dorgueil
2017-06-05 11:38:11 +02:00
parent c34b86872f
commit e5483de344
8 changed files with 111 additions and 82 deletions

View File

@ -2,8 +2,8 @@ import bonobo
from bonobo.commands.run import get_default_services
graph = bonobo.Graph(
bonobo.CsvReader('datasets/coffeeshops.txt'),
print,
bonobo.CsvReader('datasets/coffeeshops.txt', headers=('item',)),
bonobo.PrettyPrinter(),
)
if __name__ == '__main__':

View File

@ -1,15 +1,16 @@
import bonobo
from bonobo import Bag
from bonobo.commands.run import get_default_services
def get_fields(row):
return row['fields']
def get_fields(**row):
return Bag(**row['fields'])
graph = bonobo.Graph(
bonobo.JsonReader('datasets/theaters.json'),
get_fields,
bonobo.PrettyPrint(title_keys=('eq_nom_equipement', )),
bonobo.PrettyPrinter(),
)
if __name__ == '__main__':

View File

@ -1,10 +1,38 @@
'''
This example shows how a different file system service can be injected
into a transformation (as compressing pickled objects often makes sense
anyways). The pickle itself contains a list of lists as follows:
```
[
['category', 'sms'],
['ham', 'Go until jurong point, crazy..'],
['ham', 'Ok lar... Joking wif u oni...'],
['spam', 'Free entry in 2 a wkly comp to win...'],
['ham', 'U dun say so early hor... U c already then say...'],
['ham', 'Nah I don't think he goes to usf, he lives around here though'],
['spam', 'FreeMsg Hey there darling it's been 3 week's now...'],
...
]
```
where the first column categorizes and sms as "ham" or "spam". The second
column contains the sms itself.
Data set taken from:
https://www.kaggle.com/uciml/sms-spam-collection-dataset/downloads/sms-spam-collection-dataset.zip
The transformation (1) reads the pickled data, (2) marks and shortens
messages categorized as spam, and (3) prints the output.
'''
import bonobo
from bonobo.commands.run import get_default_services
from fs.tarfs import TarFS
import os
def cleanse_sms(row):
def cleanse_sms(**row):
if row['category'] == 'spam':
row['sms_clean'] = '**MARKED AS SPAM** ' + row['sms'][0:50] + (
'...' if len(row['sms']) > 50 else ''
@ -16,46 +44,21 @@ def cleanse_sms(row):
graph = bonobo.Graph(
bonobo.PickleReader('spam.pkl'
), # spam.pkl is within the gzipped tarball
# spam.pkl is within the gzipped tarball
bonobo.PickleReader('spam.pkl'),
cleanse_sms,
print
bonobo.PrettyPrinter(),
)
if __name__ == '__main__':
'''
This example shows how a different file system service can be injected
into a transformation (as compressing pickled objects often makes sense
anyways). The pickle itself contains a list of lists as follows:
```
[
['category', 'sms'],
['ham', 'Go until jurong point, crazy..'],
['ham', 'Ok lar... Joking wif u oni...'],
['spam', 'Free entry in 2 a wkly comp to win...'],
['ham', 'U dun say so early hor... U c already then say...'],
['ham', 'Nah I don't think he goes to usf, he lives around here though'],
['spam', 'FreeMsg Hey there darling it's been 3 week's now...'],
...
]
```
where the first column categorizes and sms as "ham" or "spam". The second
column contains the sms itself.
Data set taken from:
https://www.kaggle.com/uciml/sms-spam-collection-dataset/downloads/sms-spam-collection-dataset.zip
The transformation (1) reads the pickled data, (2) marks and shortens
messages categorized as spam, and (3) prints the output.
'''
services = {
def get_services():
return {
'fs':
TarFS(
os.path.
join(bonobo.get_examples_path(), 'datasets', 'spam.tgz')
)
TarFS(
bonobo.get_examples_path('datasets/spam.tgz')
)
}
bonobo.run(graph, services=services)
if __name__ == '__main__':
bonobo.run(graph, services=get_default_services(__file__))

View File

@ -3,10 +3,8 @@ import csv
from bonobo.config import Option
from bonobo.config.processors import ContextProcessor
from bonobo.constants import NOT_MODIFIED
from bonobo.errors import ConfigurationError, ValidationError
from bonobo.structs import Bag
from bonobo.nodes.io.file import FileHandler, FileReader, FileWriter
from bonobo.util.objects import ValueHolder
from .file import FileHandler, FileReader, FileWriter
class CsvHandler(FileHandler):
@ -30,14 +28,6 @@ class CsvHandler(FileHandler):
headers = Option(tuple)
def validate_csv_output_format(v):
if callable(v):
return v
if v in {'dict', 'kwargs'}:
return v
raise ValidationError('Unsupported format {!r}.'.format(v))
class CsvReader(CsvHandler, FileReader):
"""
Reads a CSV and yield the values as dicts.
@ -49,26 +39,17 @@ class CsvReader(CsvHandler, FileReader):
"""
skip = Option(int, default=0)
output_format = Option(validate_csv_output_format, default='dict')
@ContextProcessor
def csv_headers(self, context, fs, file):
yield ValueHolder(self.headers)
def get_output_formater(self):
if callable(self.output_format):
return self.output_format
elif isinstance(self.output_format, str):
return getattr(self, '_format_as_' + self.output_format)
else:
raise ConfigurationError('Unsupported format {!r} for {}.'.format(self.output_format, type(self).__name__))
def read(self, fs, file, headers):
reader = csv.reader(file, delimiter=self.delimiter, quotechar=self.quotechar)
formater = self.get_output_formater()
if not headers.get():
headers.set(next(reader))
_headers = headers.get()
field_count = len(headers)
@ -78,15 +59,9 @@ class CsvReader(CsvHandler, FileReader):
for row in reader:
if len(row) != field_count:
raise ValueError('Got a line with %d fields, expecting %d.' % (len(row), field_count, ))
raise ValueError('Got a line with %d fields, expecting %d.' % (len(row), field_count,))
yield formater(headers.get(), row)
def _format_as_dict(self, headers, values):
return dict(zip(headers, values))
def _format_as_kwargs(self, headers, values):
return Bag(**dict(zip(headers, values)))
yield self.get_output(dict(zip(_headers, row)))
class CsvWriter(CsvHandler, FileWriter):
@ -96,7 +71,8 @@ class CsvWriter(CsvHandler, FileWriter):
headers = ValueHolder(list(self.headers) if self.headers else None)
yield writer, headers
def write(self, fs, file, lineno, writer, headers, row):
def write(self, fs, file, lineno, writer, headers, *args, **kwargs):
row = self.get_input(*args, **kwargs)
if not lineno:
headers.set(headers.value or row.keys())
writer.writerow(headers.get())

View File

@ -1,7 +1,9 @@
from bonobo import settings
from bonobo.config import Option, Service
from bonobo.config.configurables import Configurable
from bonobo.config.processors import ContextProcessor
from bonobo.constants import NOT_MODIFIED
from bonobo.structs.bags import Bag
from bonobo.util.objects import ValueHolder
@ -22,6 +24,8 @@ class FileHandler(Configurable):
fs = Service('fs') # type: str
ioformat = Option(settings.validate_io_format, default=settings.IOFORMAT)
@ContextProcessor
def file(self, context, fs):
with self.open(fs) as file:
@ -30,15 +34,35 @@ class FileHandler(Configurable):
def open(self, fs):
return fs.open(self.path, self.mode, encoding=self.encoding)
def get_input(self, *args, **kwargs):
if self.ioformat == settings.IOFORMAT_ARG0:
assert len(args) == 1 and not len(kwargs), 'ARG0 format implies one arg and no kwargs.'
return args[0]
if self.ioformat == settings.IOFORMAT_KWARGS:
assert len(args) == 0 and len(kwargs), 'KWARGS format implies no arg.'
return kwargs
raise NotImplementedError('Unsupported format.')
def get_output(self, row):
if self.ioformat == settings.IOFORMAT_ARG0:
return row
if self.ioformat == settings.IOFORMAT_KWARGS:
return Bag(**row)
raise NotImplementedError('Unsupported format.')
class Reader(FileHandler):
"""Abstract component factory for readers.
"""
def __call__(self, *args):
yield from self.read(*args)
def __call__(self, *args, **kwargs):
yield from self.read(*args, **kwargs)
def read(self, *args):
def read(self, *args, **kwargs):
raise NotImplementedError('Abstract.')
@ -46,10 +70,10 @@ class Writer(FileHandler):
"""Abstract component factory for writers.
"""
def __call__(self, *args):
def __call__(self, *args, **kwargs):
return self.write(*args)
def write(self, *args):
def write(self, *args, **kwargs):
raise NotImplementedError('Abstract.')

View File

@ -14,7 +14,7 @@ class JsonReader(JsonHandler, FileReader):
def read(self, fs, file):
for line in self.loader(file):
yield line
yield self.get_output(line)
class JsonWriter(JsonHandler, FileWriter):

View File

@ -53,7 +53,7 @@ class PickleReader(PickleHandler, FileReader):
if len(i) != item_count:
raise ValueError('Received an object with %d items, expecting %d.' % (len(i), item_count, ))
yield dict(zip(i)) if is_dict else dict(zip(pickle_headers.value, i))
yield self.get_output(dict(zip(i)) if is_dict else dict(zip(pickle_headers.value, i)))
class PickleWriter(PickleHandler, FileWriter):

View File

@ -2,6 +2,8 @@ import os
import logging
from bonobo.errors import ValidationError
def to_bool(s):
if len(s):
@ -23,7 +25,30 @@ QUIET = to_bool(os.environ.get('QUIET', 'f'))
# Logging level.
LOGGING_LEVEL = logging.DEBUG if DEBUG else logging.INFO
# Input/Output format for transformations
IOFORMAT_ARG0 = 'arg0'
IOFORMAT_KWARGS = 'kwargs'
IOFORMATS = {
IOFORMAT_ARG0,
IOFORMAT_KWARGS,
}
IOFORMAT = os.environ.get('IOFORMAT', IOFORMAT_KWARGS)
def validate_io_format(v):
if callable(v):
return v
if v in IOFORMATS:
return v
raise ValidationError('Unsupported format {!r}.'.format(v))
def check():
if DEBUG and QUIET:
raise RuntimeError('I cannot be verbose and quiet at the same time.')
if IOFORMAT not in IOFORMATS:
raise RuntimeError('Invalid default input/output format.')