Merge branch 'master' into develop

Romain Dorgueil
2018-01-10 08:39:37 +01:00
39 changed files with 588 additions and 164 deletions

@@ -1,4 +1,4 @@
-# Generated by Medikit 0.4.5 on 2018-01-01.
+# Generated by Medikit 0.4.3 on 2018-01-10.
 # All changes will be overriden.

 PACKAGE ?= bonobo
@@ -19,8 +19,9 @@ SPHINX_SOURCEDIR ?= docs
 SPHINX_BUILDDIR ?= $(SPHINX_SOURCEDIR)/_build
 YAPF ?= $(PYTHON) -m yapf
 YAPF_OPTIONS ?= -rip
+SPHINX_AUTOBUILD ?= $(PYTHON_DIRNAME)/sphinx-autobuild

-.PHONY: $(SPHINX_SOURCEDIR) clean format install install-dev test update update-requirements
+.PHONY: $(SPHINX_SOURCEDIR) clean format install install-dev test update update-requirements watch-$(SPHINX_SOURCEDIR)

 # Installs the local project dependencies.
 install:
@@ -57,3 +58,6 @@ $(SPHINX_SOURCEDIR): install-dev
 format: install-dev
 	$(YAPF) $(YAPF_OPTIONS) .
 	$(YAPF) $(YAPF_OPTIONS) Projectfile
+
+watch-$(SPHINX_SOURCEDIR):
+	$(SPHINX_AUTOBUILD) $(SPHINX_SOURCEDIR) $(shell mktemp -d)

@@ -2,13 +2,12 @@
 from medikit import require

+make = require('make')
 pytest = require('pytest')
 python = require('python')
 sphinx = require('sphinx')
 yapf = require('yapf')

-# python.set_versions('3.5', '3.6', '3.7') --> not yet implemented in medikit
-
 python.setup(
     name='bonobo',
     description='Bonobo, a simple, modern and atomic extract-transform-load toolkit for python 3.5+.',
@@ -54,8 +53,10 @@ python.add_requirements(
     'stevedore ~=1.27',
     'whistle ~=1.0',
     dev=[
-        'pytest-sugar >=0.9,<0.10',
-        'pytest-timeout ~=1.0',
+        'cookiecutter >=1.5,<1.6',
+        'pytest-sugar >=0.8,<0.9',
+        'pytest-timeout >=1,<2',
+        'sphinx-sitemap >=0.2,<0.3',
     ],
     docker=[
         'bonobo-docker ~=0.6.0a1',
@@ -69,4 +70,11 @@ python.add_requirements(
     ],
 )

+@listen(make.on_generate)
+def on_make_generate(event):
+    event.makefile['SPHINX_AUTOBUILD'] = '$(PYTHON_DIRNAME)/sphinx-autobuild'
+    event.makefile.add_target('watch-$(SPHINX_SOURCEDIR)', '''
+        $(SPHINX_AUTOBUILD) $(SPHINX_SOURCEDIR) $(shell mktemp -d)
+    ''', phony=True)
+
 # vim: ft=python:

@@ -1,5 +1,5 @@
 from bonobo.errors import AbstractError
-from bonobo.util import isoption, iscontextprocessor, sortedlist
+from bonobo.util import isoption, iscontextprocessor, sortedlist, get_name

 __all__ = [
     'Configurable',
@@ -37,6 +37,26 @@ class ConfigurableMeta(type):
                 cls.__names.add(name)
                 cls.__options.insort((not value.positional, value._creation_counter, name, value))

+        # Docstring formatting
+        _options_doc = []
+        for _positional, _counter, _name, _value in cls.__options:
+            _param = _name
+            if _value.type:
+                _param = get_name(_value.type) + ' ' + _param
+            prefix = ':param {}: '.format(_param)
+            for lineno, line in enumerate((_value.__doc__ or '').split('\n')):
+                _options_doc.append((' ' * len(prefix) if lineno else prefix) + line)
+        cls.__doc__ = '\n\n'.join(
+            map(
+                str.strip,
+                filter(None, (
+                    cls.__doc__,
+                    '\n'.join(_options_doc)
+                ))
+            )
+        )
+
     @property
     def __options__(cls):
         return ((name, option) for _, _, name, option in cls.__options)

@@ -1,3 +1,4 @@
+import textwrap
 import types

 from bonobo.util.inspect import istype
@@ -62,7 +63,12 @@ class Option:
         self.positional = positional
         self.default = default

-        self.__doc__ = __doc__ or self.__doc__
+        # Docstring formatting
+        self.__doc__ = __doc__ or None
+        if self.__doc__:
+            self.__doc__ = textwrap.dedent(self.__doc__.strip('\n')).strip()
+            if default:
+                self.__doc__ += '\nDefault: {!r}'.format(default)

         # This hack is necessary for python3.5
         self._creation_counter = Option._creation_counter

@@ -12,12 +12,21 @@ class FileHandler(Configurable):
         encoding (str): which encoding to use when opening the file.
     """

-    path = Option(str, required=True, positional=True)  # type: str
-    eol = Option(str, default='\n')  # type: str
-    mode = Option(str)  # type: str
-    encoding = Option(str, default='utf-8')  # type: str
-
-    fs = Service('fs')  # type: str
+    path = Option(str, required=True, positional=True, __doc__='''
+        Path to use within the provided filesystem.
+    ''')  # type: str
+    eol = Option(str, default='\n', __doc__='''
+        Character to use as line separator.
+    ''')  # type: str
+    mode = Option(str, __doc__='''
+        What mode to use for open() call.
+    ''')  # type: str
+    encoding = Option(str, default='utf-8', __doc__='''
+        Encoding.
+    ''')  # type: str
+
+    fs = Service('fs', __doc__='''
+        The filesystem instance to use.
+    ''')  # type: str

     @ContextProcessor
     def file(self, context, *, fs):

@@ -55,14 +55,11 @@ class CsvHandler(FileHandler):
 class CsvReader(FileReader, CsvHandler):
     """
     Reads a CSV and yield the values as dicts.
-
-    .. attribute:: skip
-
-        The amount of lines to skip before it actually yield output.
-
     """

-    skip = Option(int, default=0)
+    skip = Option(int, default=0, __doc__='''
+        If set and greater than zero, the reader will skip this amount of lines.
+    ''')

     @Method(
         positional=False,

@@ -12,7 +12,9 @@ class FileReader(Reader, FileHandler):
     present. Extending it is usually the right way to create more specific file readers (like json, csv, etc.)
     """

-    mode = Option(str, default='r')
+    mode = Option(str, default='r', __doc__='''
+        What mode to use for open() call.
+    ''')  # type: str

     output_fields = Option(
         ensure_tuple,
@@ -70,7 +72,9 @@ class FileWriter(Writer, FileHandler):
     usually the right way to create more specific file writers (like json, csv, etc.)
     """

-    mode = Option(str, default='w+')
+    mode = Option(str, default='w+', __doc__='''
+        What mode to use for open() call.
+    ''')  # type: str

     def write(self, file, context, line, *, fs):
         """

@@ -64,7 +64,7 @@ class Graph:
             if _name in self.named:
                 raise KeyError('Duplicate name {!r} in graph.'.format(_name))
             self.named[_name] = _last
-            if not _first:
+            if _first is None:
                 _first = _last
             self.outputs_of(_input, create=True).add(_last)
             _input = _last

@@ -21,8 +21,11 @@ extensions = [
     'sphinx.ext.ifconfig',
     'sphinx.ext.viewcode',
     'sphinx.ext.graphviz',
+    'sphinx_sitemap',
 ]

+site_url = 'http://docs.bonobo-project.org/en/master/'
+
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']

docs/guide/_toc.rst (new file)

@@ -0,0 +1,11 @@
+.. toctree::
+    :maxdepth: 2
+
+    introduction
+    transformations
+    graphs
+    services
+    environment
+    purity
+    debugging
+    plugins

docs/guide/debugging.rst (new empty file)

@@ -5,6 +5,92 @@ Graphs are the glue that ties transformations together. They are the only data-
 must be acyclic, and can contain as many nodes as your system can handle. However, although in theory the number of nodes can be rather high, practical use cases usually do not exceed more than a few hundred nodes and only then in extreme cases.

+Within a graph, each node is isolated and can only communicate using its
+input and output queues. For each input row, a given node will be called with
+the row passed as arguments. Each *return* or *yield* value will be put on the
+node's output queue, and the nodes connected in the graph will then be able to
+process it.
+
+|bonobo| is a line-by-line data stream processing solution.
+
+Handling the data-flow this way brings the following properties:
+
+- **First in, first out**: unless stated otherwise, each node will receive the
+  rows from FIFO queues, and so, the order of rows will be preserved. That is
+  true for each single node, but please note that if you define "graph bubbles"
+  (where a graph diverges into different branches then converges again), the
+  convergence node will receive rows FIFO from each input queue, meaning that
+  the order existing at the divergence point won't stay true at the convergence
+  point (see the sketch after this list).
+- **Parallelism**: each node runs in parallel (by default, using independent
+  threads). This is useful as you don't have to worry about blocking calls.
+  If a thread waits for, let's say, a database, or a network service, the other
+  nodes will continue handling data, as long as they have input rows available.
+- **Independence**: the rows are independent from each other, making this way
+  of working with data flows good for line-by-line data processing, but
+  also not ideal for "grouped" computations (where an output depends on more
+  than one line of input data). You can overcome this with rolling windows if
+  the input required is adjacent rows, but if you need to work on the whole
+  dataset at once, you should consider other software.
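+Here is a minimal sketch of such a bubble; it assumes the ``_input`` and
+``_output`` keyword arguments of :meth:`Graph.add_chain`:
+
+.. code-block:: python
+
+    import bonobo
+
+    def extract():
+        yield from range(3)
+
+    def double(i):
+        return i * 2
+
+    def square(i):
+        return i * i
+
+    graph = bonobo.Graph()
+    graph.add_chain(extract)
+    # Two branches diverge from extract() ...
+    graph.add_chain(double, print, _input=extract)
+    # ... and converge again into print(): rows arrive FIFO per input
+    # queue, not in the original global order.
+    graph.add_chain(square, _input=extract, _output=print)
+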
+Graphs are defined using :class:`bonobo.Graph` instances, as seen in the
+previous tutorial step.
+
+What can be a node?
+:::::::::::::::::::
+
+**TL;DR**: … anything, as long as it's callable().
+
+Functions
+---------
+
+.. code-block:: python
+
+    def get_item(id):
+        return id, items.get(id)
+
+Each node of a graph will be executed in isolation from the other nodes, and the data is passed from one node to the
+next using FIFO queues, managed by the framework. It's transparent to the end-user, though, and you'll only use
+function arguments (for inputs) and return/yield values (for outputs).
+
+Each input row of a node will cause one call to this node's callable. Each output is cast internally as a tuple-like
+data structure (or, more precisely, a namedtuple-like data structure), and for one given node, each output row must
+have the same structure.
+
+If you return/yield something which is not a tuple, bonobo will create a tuple of one element.
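+For example (a minimal sketch of the wrapping behaviour described above):
+
+.. code-block:: python
+
+    def extract():
+        yield 'foo'           # not a tuple: becomes the one-element row ('foo',)
+        yield 'bar', 'baz'    # already a tuple: passed through as a two-field row
+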
+Properties
+----------
+
+|bonobo| assists you with defining the data-flow of your data engineering process, and then streams data through your
+callable graphs.
+
+* Each node call will process one row of data.
+* Queues that flow the data between nodes are first-in, first-out (FIFO) standard python :class:`queue.Queue`.
+* Each node will run in parallel.
+* The default execution strategy uses threading, and each node will run in a separate thread.
+
+Fault tolerance
+---------------
+
+Node execution is fault tolerant.
+
+If an exception is raised from a node call, then this node call will be aborted but bonobo will continue the execution
+with the next row (after outputting the stack trace and incrementing the "err" counter for the node context).
+
+This allows ETL jobs to ignore faulty data and try their best to process the valid rows of a dataset.
+
+Some errors are fatal, though.
+
+If you pass a 2-element tuple to a node that takes 3 args, |bonobo| will raise a :class:`bonobo.errors.UnrecoverableTypeError`, and exit the
+current graph execution as fast as it can (finishing the other node executions that are in progress first, but not
+starting new ones if there are remaining input rows).
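+For instance (a sketch of the recoverable case):
+
+.. code-block:: python
+
+    def parse_price(price):
+        # A malformed value raises ValueError: the traceback is printed, the
+        # node's "err" counter is incremented, and execution continues with
+        # the next row.
+        return float(price)
+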
 Definitions
 :::::::::::

@@ -3,13 +3,8 @@ Guides

 This section will guide you through your journey with Bonobo ETL.

-.. toctree::
-    :maxdepth: 2
-
-    introduction
-    transformations
-    graphs
-    services
-    environment
-    purity
+.. include:: _toc.rst
@@ -4,9 +4,6 @@ Bonobo API
 The Bonobo API, available directly under the :mod:`bonobo` package, contains all the tools you need to get started with
 bonobo.

-The :mod:`bonobo` package
-:::::::::::::::::::::::::
-
 .. automodule:: bonobo
     :members:
     :undoc-members:

@@ -0,0 +1,9 @@
+.. warning::
+
+    This tutorial was written for |bonobo| 0.5, while the current stable version is |bonobo| 0.6.
+
+    Please be aware that some things changed.
+
+    A summary of changes is available in the `migration guide from 0.5 to 0.6
+    <https://news.bonobo-project.org/migration-guide-for-bonobo-0-6-alpha-c1d36b0a9d35>`_.

@@ -0,0 +1,65 @@
+First steps
+===========
+
+.. include:: _outdated_note.rst
+
+What is Bonobo?
+:::::::::::::::
+
+Bonobo is an ETL (Extract-Transform-Load) framework for python 3.5. The goal is to define data-transformations, with
+python code in charge of handling similar-shaped, independent lines of data.
+
+Bonobo *is not* a statistical or data-science tool. If you're looking for a data-analysis tool in python, use Pandas.
+
+Bonobo is a lean manufacturing assembly line for data that lets you focus on the actual work instead of the plumbing
+(execution contexts, parallelism, error handling, console output, logging, ...).
+
+Bonobo uses simple python and should be quick and easy to learn.
+
+Tutorial
+::::::::
+
+.. note::
+
+    Good documentation is not easy to write. We do our best to make it better and better.
+
+    Although all content here should be accurate, you may feel a lack of completeness, for which we plead guilty and
+    apologize.
+
+    If you're stuck, please come and ask on our `slack channel <https://bonobo-slack.herokuapp.com/>`_, we'll figure
+    something out.
+
+    If you're not stuck but had trouble understanding something, please consider contributing to the docs (via GitHub
+    pull requests).
+
+.. toctree::
+    :maxdepth: 2
+
+    tut01
+    tut02
+    tut03
+    tut04
+
+What's next?
+::::::::::::
+
+Read a few examples
+-------------------
+
+* :doc:`/reference/examples`
+
+Read about best development practices
+-------------------------------------
+
+* :doc:`/guide/index`
+* :doc:`/guide/purity`
+
+Read about integrating external tools with bonobo
+-------------------------------------------------
+
+* :doc:`/extension/docker`: run transformation graphs in isolated containers.
+* :doc:`/extension/jupyter`: run transformations within jupyter notebooks.
+* :doc:`/extension/selenium`: crawl the web using a real browser and work with the gathered data.
+* :doc:`/extension/sqlalchemy`: everything you need to interact with SQL databases.
@@ -0,0 +1,13 @@
+Just enough Python for Bonobo
+=============================
+
+.. include:: _outdated_note.rst
+
+.. todo::
+
+    This is a work in progress and it is not yet available. Please come back later or, even better, help us write this
+    guide!
+
+This guide is intended to help programmers or enthusiasts grasp the python basics necessary to use Bonobo. It
+should definitely not be considered a general python introduction, nor a deep dive into details.

@@ -1,7 +1,10 @@
 Let's get started!
 ==================

-To get started with Bonobo, you need to install it in a working python 3.5+ environment:
+.. include:: _outdated_note.rst
+
+To begin with Bonobo, you need to install it in a working python 3.5+ environment, and you'll also need cookiecutter
+to bootstrap your project.

 .. code-block:: shell-session

@@ -13,24 +16,21 @@ See :doc:`/install` for more options.
 Create an empty project
 :::::::::::::::::::::::

-Your ETL code will live in standard python files and packages.
+Your ETL code will live in ETL projects, which are basically a bunch of files, including python code, that bonobo
+can run.

 .. code-block:: shell-session

-    $ bonobo create tutorial.py
+    $ bonobo init tutorial

-This will create a simple example job in a `tutorial.py` file.
+This will create a `tutorial` directory (`content description here <https://www.bonobo-project.org/with/cookiecutter>`_).

-Now, try to execute it:
+To run this project, use:

 .. code-block:: shell-session

-    $ python tutorial.py
+    $ bonobo run tutorial

-Congratulations, you just ran your first ETL job!
+.. todo:: XXX **CHANGES NEEDED BELOW THIS POINT BEFORE 0.6** XXX

 Write a first transformation
 ::::::::::::::::::::::::::::
@@ -107,9 +107,6 @@ To do this, it needs to know what data-flow you want to achieve, and you'll use
 The `if __name__ == '__main__':` section is not required, unless you want to run it directly using the python
 interpreter.

-The name of the `graph` variable is arbitrary, but this variable must be global and available unconditionally.
-Do not put it in its own function or in the `if __name__ == '__main__':` section.
-
 Execute the job
 :::::::::::::::
@@ -133,9 +130,9 @@ Rewrite it using builtins
 There is a much simpler way to describe an equivalent graph:

 .. literalinclude:: ../../bonobo/examples/tutorials/tut01e02.py
     :language: python

 The `extract()` generator has been replaced by a list, as Bonobo will interpret non-callable iterables as a no-input
 generator.

 This example is also available in :mod:`bonobo.examples.tutorials.tut01e02`, and you can also run it as a module:
@@ -177,8 +174,8 @@ strategy). Actual behavior of an execution will depend on the strategy chosen, b
 cases.

 ④ Before actually executing the `transformations`, the `ExecutorStrategy` instance will wrap each component in an
-`execution context`, whose responsibility is to hold the state of the transformation. It enables to keep the
-`transformations` stateless, while allowing to add an external state if required. We'll expand on this later.
+`execution context`, whose responsibility is to hold the state of the transformation. It enables you to keep the
+`transformations` stateless, while allowing you to add an external state if required. We'll expand on this later.

@@ -1,6 +1,8 @@
 Working with files
 ==================

+.. include:: _outdated_note.rst
+
 Bonobo would be pointless if the aim was just to uppercase small lists of strings.

 In fact, Bonobo should not be used if you don't expect any gain from parallelization/distribution of tasks.
@@ -59,7 +61,13 @@ available in **Bonobo**'s repository:

 .. code-block:: shell-session

-    $ bonobo download examples/datasets/coffeeshops.txt
+    $ curl https://raw.githubusercontent.com/python-bonobo/bonobo/master/bonobo/examples/datasets/coffeeshops.txt > `python3 -c 'import bonobo; print(bonobo.get_examples_path("datasets/coffeeshops.txt"))'`
+
+.. note::
+
+    The "example dataset download" step will be easier in the future.
+
+    https://github.com/python-bonobo/bonobo/issues/134

 .. literalinclude:: ../../bonobo/examples/tutorials/tut02e01_read.py
     :language: python

@@ -1,6 +1,8 @@
 Configurables and Services
 ==========================

+.. include:: _outdated_note.rst
+
 .. note::

     This section lacks completeness, sorry for that (but you can still read it!).
@@ -13,7 +15,7 @@ Class-based transformations and configurables
 Bonobo is a bit dumb. If something is callable, it considers it can be used as a transformation, and it's up to the
 user to provide callables that logically fits in a graph.

-You can use plain python objects with a `__call__()` method, and it ill just work.
+You can use plain python objects with a `__call__()` method, and it will just work.

 As a lot of transformations needs common machinery, there is a few tools to quickly build transformations, most of
 them requiring your class to subclass :class:`bonobo.config.Configurable`.
@@ -30,7 +32,7 @@ Configurables allows to use the following features:
     class PrefixIt(Configurable):
         prefix = Option(str, positional=True, default='>>>')

-        def __call__(self, row):
+        def call(self, row):
             return self.prefix + ' ' + row

     prefixer = PrefixIt('$')
@@ -48,7 +50,7 @@ Configurables allows to use the following features:
         url = Option(default='https://jsonplaceholder.typicode.com/users')
         http = Service('http.client')

-        def __call__(self, http):
+        def call(self, http):
             resp = http.get(self.url)

             for row in resp.json():
@@ -68,7 +70,7 @@ Configurables allows to use the following features:
     class Applier(Configurable):
         apply = Method()

-        def __call__(self, row):
+        def call(self, row):
             return self.apply(row)

     @Applier
@@ -114,7 +116,7 @@ Let's see how to use it, starting from the previous service example:
         url = Option(default='https://jsonplaceholder.typicode.com/users')
         http = Service('http.client')

-        def __call__(self, http):
+        def call(self, http):
             resp = http.get(self.url)

             for row in resp.json():

@@ -1,6 +1,8 @@
 Working with databases
 ======================

+.. include:: _outdated_note.rst
+
 Databases (and especially SQL databases here) are not the focus of Bonobo, thus support for it is not (and will never
 be) included in the main package. Instead, working with databases is done using third party, well maintained and
 specialized packages, like SQLAlchemy, or other database access libraries from the python cheese shop.

@@ -1,113 +1,149 @@
 Part 2: Writing ETL Jobs
 ========================

-What's an ETL job ?
-:::::::::::::::::::
-
-In |bonobo|, an ETL job is a single graph that can be executed on its own.
-
-Within a graph, each node are isolated and can only communicate using their
-input and output queues. For each input row, a given node will be called with
-the row passed as arguments. Each *return* or *yield* value will be put on the
-node's output queue, and the nodes connected in the graph will then be able to
-process it.
-
-|bonobo| is a line-by-line data stream processing solution.
-
-Handling the data-flow this way brings the following properties:
-
-- **First in, first out**: unless stated otherwise, each node will receeive the
-  rows from FIFO queues, and so, the order of rows will be preserved. That is
-  true for each single node, but please note that if you define "graph bubbles"
-  (where a graph diverge in different branches then converge again), the
-  convergence node will receive rows FIFO from each input queue, meaning that
-  the order existing at the divergence point wont stay true at the convergence
-  point.
-- **Parallelism**: each node run in parallel (by default, using independant
-  threads). This is useful as you don't have to worry about blocking calls.
-  If a thread waits for, let's say, a database, or a network service, the other
-  nodes will continue handling data, as long as they have input rows available.
-- **Independance**: the rows are independant from each other, making this way
-  of working with data flows good for line-by-line data processing, but
-  also not ideal for "grouped" computations (where an output depends on more
-  than one line of input data). You can overcome this with rolling windows if
-  the input required are adjacent rows, but if you need to work on the whole
-  dataset at once, you should consider other software.
-
-Graphs are defined using :class:`bonobo.Graph` instances, as seen in the
-previous tutorial step.
-
-What can be a node?
-:::::::::::::::::::
-
-**TL;DR**: … anything, as long as its callable().
-
-Functions
----------
-
-.. code-block:: python
-
-    def get_item(id):
-        return id, items.get(id)
-
-Each node of a graph will be executed in isolation from the other nodes, and the data is passed from one node to the
-next using FIFO queues, managed by the framework. It's transparent to the end-user, though, and you'll only use
-function arguments (for inputs) and return/yield values (for outputs).
-
-Each input row of a node will cause one call to this node's callable. Each output is cast internally as a tuple-like
-data structure (or more precisely, a namedtuple-like data structure), and for one given node, each output row must
-have the same structure.
-
-If you return/yield something which is not a tuple, bonobo will create a tuple of one element.
-
-Properties
-----------
-
-|bonobo| assists you with defining the data-flow of your data engineering process, and then streams data through your
-callable graphs.
-
-* Each node call will process one row of data.
-* Queues that flows the data between node are first-in, first-out (FIFO) standard python :class:`queue.Queue`.
-* Each node will run in parallel
-* Default execution strategy use threading, and each node will run in a separate thread.
-
-Fault tolerance
----------------
-
-Node execution is fault tolerant.
-
-If an exception is raised from a node call, then this node call will be aborted but bonobo will continue the execution
-with the next row (after outputing the stack trace and incrementing the "err" counter for the node context).
-
-It allows to have ETL jobs that ignore faulty data and try their best to process the valid rows of a dataset.
-
-Some errors are fatal, though.
-
-If you pass a 2 elements tuple to a node that takes 3 args, |bonobo| will raise an :class:`bonobo.errors.UnrecoverableTypeError`, and exit the
-current graph execution as fast as it can (finishing the other node executions that are in progress first, but not
-starting new ones if there are remaining input rows).
-
-Let's write a sample data integration job
-:::::::::::::::::::::::::::::::::::::::::
-
-Let's create a sample application.
-
-The goal of this application will be to extract all the fablabs in the world using an open-data API, normalize this
-data and, for now, display it. We'll then build on this foundation in the next steps to write to files, databases, etc.
+In |bonobo|, an ETL job is a graph with some logic to execute it, like the file we created in the previous section.
+
+You can learn more about the :class:`bonobo.Graph` data-structure and its properties in the
+:doc:`graphs guide </guide/graphs>`.
+
+Scenario
+::::::::
+
+Let's create a sample application, whose goal will be to integrate some data in various systems.
+
+We'll use an open-data dataset, containing all the fablabs in the world.
+
+We will normalize this data using a few different rules, then write it somewhere.
+
+In this step, we'll focus on getting this data normalized and output to the console. In the next steps, we'll extend it
+to other targets, like files, and databases.
+
+Setup
+:::::
+
+We'll change the `tutorial.py` file created in the last step to handle this new scenario.
+
+First, let's remove all boilerplate code, so it looks like this:
+
+.. code-block:: python
+
+    import bonobo
+
+    def get_graph(**options):
+        graph = bonobo.Graph()
+        return graph
+
+    def get_services(**options):
+        return {}
+
+    if __name__ == '__main__':
+        parser = bonobo.get_argument_parser()
+        with bonobo.parse_args(parser) as options:
+            bonobo.run(get_graph(**options), services=get_services(**options))
+
+Your job now contains the logic for executing an empty graph, and we'll complete this with our application logic.
+
+Reading the source data
+:::::::::::::::::::::::
+
+Let's add a simple chain to our `get_graph(...)` function, so that it reads from the fablabs open-data api.
+
+The source dataset we'll use can be found on `this site <https://public-us.opendatasoft.com/explore/dataset/fablabs/>`_.
+It's licensed under `Public Domain`, which makes it just perfect for our example.
+
+.. note::
+
+    There is a :mod:`bonobo.contrib.opendatasoft` module that makes reading from OpenDataSoft APIs easier, including
+    pagination and limits, but for our tutorial, we'll avoid that and build it manually.
+
+Let's write our extractor:
+
+.. code-block:: python
+
+    import requests
+
+    FABLABS_API_URL = 'https://public-us.opendatasoft.com/api/records/1.0/search/?dataset=fablabs&rows=1000'
+
+    def extract_fablabs():
+        yield from requests.get(FABLABS_API_URL).json().get('records')
+
+This extractor will get called once, query the API url, parse it as JSON, and yield the items from the "records" list,
+one by one.
+
+.. note::
+
+    You'll probably want to make it a bit more verbose in a real application, to handle all kinds of errors that can
+    happen here. What if the server is down? What if it returns a response which is not JSON? What if the data is not
+    in the expected format?
+
+    For simplicity's sake, we'll ignore that here, but those are the kinds of questions you should have in mind when
+    writing pipelines.
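+If you did want to be a bit more defensive, a sketch could look like this (plain `requests` API only):
+
+.. code-block:: python
+
+    def extract_fablabs():
+        response = requests.get(FABLABS_API_URL, timeout=30)
+        response.raise_for_status()              # abort on HTTP errors
+        records = response.json().get('records') or []
+        yield from records
+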
+To test our pipeline, let's use a :class:`bonobo.Limit` and a :class:`bonobo.PrettyPrinter`, and change our
+`get_graph(...)` function accordingly:
+
+.. code-block:: python
+
+    import bonobo
+
+    def get_graph(**options):
+        graph = bonobo.Graph()
+        graph.add_chain(
+            extract_fablabs,
+            bonobo.Limit(10),
+            bonobo.PrettyPrinter(),
+        )
+        return graph
+
+Running this job should output a bit of data, along with some statistics.
+
+First, let's look at the statistics:
+
+.. code-block:: shell-session
+
+    - extract_fablabs in=1 out=995 [done]
+    - Limit in=995 out=10 [done]
+    - PrettyPrinter in=10 out=10 [done]
+
+It is important to understand that we extracted everything (995 rows) before dropping 99% of the dataset.
+This is OK for debugging, but not efficient.
+
+.. note::
+
+    You should always try to limit the amount of data as early as possible, which often means not generating the data
+    you won't need in the first place. Here, we could have used the `rows=` query parameter in the API URL to not
+    request the data we would drop anyway.
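+    For instance (a sketch), fetching only what we keep:
+
+    .. code-block:: python
+
+        FABLABS_API_URL = (
+            'https://public-us.opendatasoft.com/api/records/1.0/search/'
+            '?dataset=fablabs&rows=10'
+        )
+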
+Normalize
+:::::::::
+
+.. include:: _todo.rst
+
+Output
+::::::
+
+We used :class:`bonobo.PrettyPrinter` to output the data.
+
+It's a flexible transformation provided to help you display the content of a stream, and you'll probably use it a
+lot, for various reasons.
+
 Moving forward
 ::::::::::::::

 You now know:

-* How to ...
+* How to use a reader node.
+* How to use the console output.
+* How to limit the number of elements in a stream.
+* How to pass data from one node to another.
+* How to structure a graph using chains.

-**Next: :doc:`3-files`**
+It's now time to jump to :doc:`3-files`.

@@ -1,6 +1,51 @@
 Part 3: Working with Files
 ==========================

+.. include:: _wip_note.rst
+
+Writing to the console is nice, but using files is probably more realistic.
+
+Let's see how to use a few builtin writers, on both local and remote filesystems.
+
+Filesystems
+:::::::::::
+
+In |bonobo|, files are accessed within a **filesystem** service, which must be something with the same interface as
+`fs' FileSystem objects <https://docs.pyfilesystem.org/en/latest/builtin.html>`_. By default, you'll get an instance
+of a local filesystem mapped to the current working directory as the `fs` service. You'll learn more about services in
+the next step, but for now, let's just use it.
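+For instance, here is a sketch of how the default could be overridden from `get_services(...)`, assuming the
+:func:`bonobo.open_fs` helper:
+
+.. code-block:: python
+
+    import bonobo
+
+    def get_services(**options):
+        # Map the 'fs' service to a specific directory instead of the
+        # current working directory.
+        return {'fs': bonobo.open_fs('/tmp/tutorial-output')}
+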
+Writing using the service
+:::::::::::::::::::::::::
+
+Although |bonobo| contains helpers to write to common file formats, let's start by writing it manually.
+
+.. code-block:: python
+
+    from bonobo.config import use
+    from bonobo.constants import NOT_MODIFIED
+
+    @use('fs')
+    def write_repr_to_file(*row, fs):
+        with fs.open('output.txt', 'a+') as f:
+            print(row, file=f)
+        return NOT_MODIFIED
+
+Then, update the `get_graph(...)` function by adding `write_repr_to_file` just before your `PrettyPrinter()` node.
+
+Let's try to run that and think about what happens.
+
+Each time a row comes to this node, the output file is opened in "append or create" mode, a line is written, and the
+file is closed.
+
+This is **NOT** how you want to do things. Let's rewrite it so our `open(...)` call becomes execution-wide.
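+A minimal sketch of that rewrite, using a class-based transformation and a :class:`~bonobo.config.ContextProcessor`
+(the class name here is ours, not a bonobo builtin):
+
+.. code-block:: python
+
+    from bonobo.config import Configurable, ContextProcessor, Service
+    from bonobo.constants import NOT_MODIFIED
+
+    class ReprWriter(Configurable):
+        fs = Service('fs')
+
+        @ContextProcessor
+        def file(self, context, *, fs):
+            # Opened once per execution, closed when the graph finishes.
+            with fs.open('output.txt', 'w+') as f:
+                yield f
+
+        def __call__(self, f, *row, fs):
+            print(row, file=f)
+            return NOT_MODIFIED
+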
 * Filesystems
 * Reading files
@@ -19,4 +64,4 @@ You now know:

 * How to ...

-**Next: :doc:`4-services`**
+It's now time to jump to :doc:`4-services`.

@@ -1,6 +1,7 @@
 Part 4: Services and Configurables
 ==================================

+.. include:: _wip_note.rst

 In the last section, we used a few new tools.
@@ -204,4 +205,4 @@ You now know:

 * How to ...

-**Next: :doc:`5-packaging`**
+It's now time to jump to :doc:`5-packaging`.

@@ -1,6 +1,8 @@
 Part 5: Projects and Packaging
 ==============================

+.. include:: _wip_note.rst
+
 Until then, we worked with one file managing a job.

 Real life often involves more complicated setups, with relations and imports between different files.
@@ -13,7 +15,6 @@ kind of project structure, as the targert structure will be dicated by the hosti
 sub-package would perfectly fit a django or flask project, or even a regular package, but it's up to you to chose the
 structure of your project.

-about using |bonobo| in a pyt
 is about set of jobs working together within a project.

 Let's see how to move from the current status to a package.
@@ -26,3 +27,19 @@ You now know:

 * How to ...

+That's the end of the tutorial; you should now be familiar with all the basics.
+
+A few appendixes to the tutorial explain how to integrate with other systems (we'll use the "fablabs" application
+created in this tutorial and extend it):
+
+* :doc:`notebooks`
+* :doc:`sqlalchemy`
+* :doc:`django`
+* :doc:`docker`
+
+Then, you can either jump head-first into your code, or get a better grasp of all the concepts by
+:doc:`reading the full bonobo guide </guide/index>`.
+
+Happy data flows!

docs/tutorial/_todo.rst (new file)

@@ -0,0 +1,3 @@
+.. warning::
+
+    This section is missing. Sorry, but stay tuned! It'll be added soon.

@@ -0,0 +1,12 @@
+.. warning::
+
+    This section is being rewritten for |bonobo| 0.6, and it's now in a "work in progress" state.
+
+    You can read :doc:`the tutorial for the previous version (0.5) <0.5/index>`. Please note that things changed a bit
+    since then and you'll see quirks here and there.
+
+    You can also read the `migration guide from 0.5 to 0.6
+    <https://news.bonobo-project.org/migration-guide-for-bonobo-0-6-alpha-c1d36b0a9d35>`_, which will give you a good
+    overview of the changes.
+
+    Hopefully this document will be updated soon; please accept our apologies for its state until then.

@@ -1,3 +1,24 @@
 Working with Django
 ===================

+.. warning::
+
+    This section does not exist yet, but it's in the plans to write it quite soon.
+
+    Meanwhile, you can check the source code and other links provided below.
+
+Source code
+:::::::::::
+
+https://github.com/python-bonobo/bonobo/tree/master/bonobo/contrib/django
+
+bonobo.contrib.django
+:::::::::::::::::::::
+
+.. automodule:: bonobo.contrib.django
+    :members:
+    :undoc-members:
+    :show-inheritance:

docs/tutorial/docker.rst (new file)

@@ -0,0 +1,16 @@
+Working with Docker
+===================
+
+.. warning::
+
+    This section does not exist yet, but it's in the plans to write it quite soon.
+
+    Meanwhile, you can check the source code and other links provided below.
+
+Source code
+:::::::::::
+
+https://github.com/python-bonobo/bonobo-docker

@@ -53,3 +53,4 @@ out.
 If you're not stuck but had trouble understanding something, please consider contributing to the docs (using GitHub
 pull requests).

+.. include:: _wip_note.rst

@@ -1,4 +1,13 @@
 Working with Jupyter Notebooks
 ==============================

+.. warning::
+
+    This section does not exist yet, but it's in the plans to write it quite soon.
+
+    Meanwhile, you can check the source code and other links provided below.
+
+Source code
+:::::::::::
+
+https://github.com/python-bonobo/bonobo/tree/master/bonobo/contrib/jupyter

@@ -1,4 +1,15 @@
 Working with SQL Databases
 ==========================

+.. warning::
+
+    This section does not exist yet, but it's in the plans to write it quite soon.
+
+    Meanwhile, you can check the source code and other links provided below.
+
+Source code
+:::::::::::
+
+https://github.com/python-bonobo/bonobo-sqlalchemy

@@ -6,23 +6,32 @@ dependencies:
   - wheel=0.29.0
   - pip:
     - appdirs==1.4.3
-    - certifi==2017.7.27.1
+    - certifi==2017.11.5
     - chardet==3.0.4
     - colorama==0.3.9
-    - fs==2.0.12
+    - fs==2.0.17
+    - graphviz==0.8.2
     - idna==2.6
-    - jinja2==2.9.6
+    - jinja2==2.10
     - markupsafe==1.0
-    - mondrian==0.4.0
+    - mondrian==0.6.1
     - packaging==16.8
     - pbr==3.1.1
-    - psutil==5.4.0
+    - psutil==5.4.3
     - pyparsing==2.2.0
+    - python-slugify==1.2.4
     - pytz==2017.3
     - requests==2.18.4
     - six==1.11.0
-    - stevedore==1.27.1
+    - stevedore==1.28.0
+    - unidecode==1.0.22
     - urllib3==1.22
     - whistle==1.0.0
+    # for docs
+    - alabaster==0.7.10
+    - sphinx-sitemap==0.2
+    - sphinx==1.6.5
+    - sphinxcontrib-websupport==1.0.1
     # for examples
     - pycountry ==17.9.23

@@ -1,28 +1,38 @@
 -e .[dev]
 alabaster==0.7.10
+arrow==0.12.0
 attrs==17.4.0
 babel==2.5.1
+binaryornot==0.4.4
 certifi==2017.11.5
 chardet==3.0.4
+click==6.7
+cookiecutter==1.5.1
 coverage==4.4.2
 docutils==0.14
+future==0.16.0
 idna==2.6
 imagesize==0.7.1
+jinja2-time==0.2.0
 jinja2==2.10
 markupsafe==1.0
 pluggy==0.6.0
+poyo==0.4.1
 py==1.5.2
 pygments==2.2.0
 pytest-cov==2.5.1
-pytest-sugar==0.9.0
+pytest-sugar==0.8.0
 pytest-timeout==1.2.1
-pytest==3.3.1
+pytest==3.3.2
+python-dateutil==2.6.1
 pytz==2017.3
 requests==2.18.4
 six==1.11.0
 snowballstemmer==1.2.1
-sphinx==1.6.5
+sphinx-sitemap==0.2
+sphinx==1.6.6
 sphinxcontrib-websupport==1.0.1
 termcolor==1.1.0
 urllib3==1.22
+whichcraft==0.4.1
 yapf==0.20.0

@@ -1,6 +1,6 @@
 -e .[docker]
 appdirs==1.4.3
-bonobo-docker==0.6.0a1
+bonobo-docker==0.6.0
 certifi==2017.11.5
 chardet==3.0.4
 colorama==0.3.9
@@ -22,7 +22,7 @@ requests==2.18.4
 semantic-version==2.6.0
 six==1.11.0
 stevedore==1.28.0
-unidecode==0.4.21
+unidecode==1.0.22
 urllib3==1.22
 websocket-client==0.46.0
 whistle==1.0.0

@@ -1,6 +1,5 @@
 -e .[jupyter]
 appnope==0.1.0
-attrs==17.4.0
 bleach==2.1.2
 decorator==4.1.2
 entrypoints==0.2.3
@@ -12,7 +11,7 @@ ipywidgets==6.0.1
 jedi==0.11.1
 jinja2==2.10
 jsonschema==2.6.0
-jupyter-client==5.2.0
+jupyter-client==5.2.1
 jupyter-console==5.2.0
 jupyter-core==4.4.0
 jupyter==1.0.0
@@ -25,20 +24,17 @@ pandocfilters==1.4.2
 parso==0.1.1
 pexpect==4.3.1
 pickleshare==0.7.4
-pluggy==0.6.0
 prompt-toolkit==1.0.15
 ptyprocess==0.5.2
-py==1.5.2
 pygments==2.2.0
-pytest==3.3.1
 python-dateutil==2.6.1
-pyzmq==17.0.0b3
+pyzmq==16.0.3
 qtconsole==4.3.1
 simplegeneric==0.8.1
 six==1.11.0
 terminado==0.8.1
 testpath==0.3.1
-tornado==5.0a1
+tornado==4.5.3
 traitlets==4.3.2
 wcwidth==0.1.7
 webencodings==0.5.1

@@ -1,6 +1,6 @@
 -e .[sqlalchemy]
 appdirs==1.4.3
-bonobo-sqlalchemy==0.6.0a1
+bonobo-sqlalchemy==0.6.0
 certifi==2017.11.5
 chardet==3.0.4
 colorama==0.3.9
@@ -20,6 +20,6 @@ requests==2.18.4
 six==1.11.0
 sqlalchemy==1.2.0
 stevedore==1.28.0
-unidecode==0.4.21
+unidecode==1.0.22
 urllib3==1.22
 whistle==1.0.0

@@ -18,6 +18,6 @@ pytz==2017.3
 requests==2.18.4
 six==1.11.0
 stevedore==1.28.0
-unidecode==0.4.21
+unidecode==1.0.22
 urllib3==1.22
 whistle==1.0.0

@@ -64,8 +64,9 @@ setup(
         ],
     extras_require={
         'dev': [
-            'coverage (>= 4.4, < 5.0)', 'pytest (>= 3.1, < 4.0)', 'pytest-cov (>= 2.5, < 3.0)',
-            'pytest-sugar (>= 0.9, < 0.10)', 'pytest-timeout (~= 1.0)', 'sphinx (>= 1.6, < 2.0)', 'yapf'
+            'cookiecutter (>= 1.5, < 1.6)', 'coverage (>= 4.4, < 5.0)', 'pytest (>= 3.1, < 4.0)',
+            'pytest-cov (>= 2.5, < 3.0)', 'pytest-sugar (>= 0.8, < 0.9)', 'pytest-timeout (>= 1, < 2)',
+            'sphinx (>= 1.6, < 2.0)', 'sphinx-sitemap (>= 0.2, < 0.3)', 'yapf'
         ],
         'docker': ['bonobo-docker (~= 0.6.0a1)'],
         'jupyter': ['ipywidgets (~= 6.0)', 'jupyter (~= 1.0)'],