Major update to documentation, removing deprecated docs and adding the new syntax to graph building options.

This commit is contained in:
Romain Dorgueil
2019-06-01 14:08:25 +02:00
parent c998708923
commit e84440df8c
23 changed files with 434 additions and 883 deletions

View File

@ -8,8 +8,8 @@ apidoc_root = "docs/reference/api"
class Module:
def __init__(self, name, title=None, *, automodule_options=None):
def __init__(self, name, title=None, *, automodule_options=None, append=None):
self.append = append
self.name = name
self.title = title or " ".join(map(str.title, self.name.split(".")[1:]))
self.automodule_options = automodule_options or list()
@ -18,20 +18,101 @@ class Module:
return "<{} ({})>".format(self.title, self.name)
def asdict(self):
return {"name": self.name, "title": self.title, "automodule_options": self.automodule_options}
return {
"append": self.append,
"automodule": True,
"automodule_options": self.automodule_options,
"name": self.name,
"title": self.title,
}
def get_path(self):
return os.path.join(__path__, apidoc_root, *self.name.split(".")) + ".rst"
import inspect
# Import the package whose public API is being indexed, and make sure it
# exposes a (truthy) version attribute before going any further.
bonobo = __import__("bonobo")
assert bonobo.__version__

# Maps the module (prefix) an object is defined in to the "family" under which
# it is listed. A value of None means the prefix itself is used as the family.
prefixes = {
    "bonobo.nodes": None,
    "bonobo._api": "bonobo",
    "bonobo.structs.graphs": None,
    "bonobo.execution.strategies": "bonobo",
    "bonobo.registry": "bonobo",
    "bonobo.util.environ": "bonobo",
}

# family -> list of (name, object) pairs found on the root package.
api_objects = {}

# Families in the order they are rendered, with their section titles.
display_order = [("bonobo.structs.graphs", "Graphs"), ("bonobo.nodes", "Nodes"), ("bonobo", "Other top-level APIs")]

for name in sorted(dir(bonobo)):
    # Ignore attributes starting with an underscore.
    if name.startswith("_"):
        continue

    o = getattr(bonobo, name)

    # Submodules are not listed as API objects.
    if inspect.ismodule(o):
        continue

    # Every remaining public attribute is expected to be exported explicitly.
    assert name in bonobo.__all__

    modname = inspect.getmodule(o).__name__
    family = None

    # The first prefix matching the defining module (exactly, or as a parent
    # package) decides the family.
    for prefix, target in prefixes.items():
        if modname == prefix or modname.startswith(prefix + "."):
            family = target or prefix
            break

    if family is None:
        raise Exception("Could not find family for {}".format(name))

    api_objects.setdefault(family, []).append((name, o))

# Lines of reStructuredText, joined with newlines once complete.
api_content = []
current_family = None

for family, title in display_order:
    if family != current_family:
        # Separate consecutive sections with two blank lines.
        if current_family is not None:
            api_content.append("")
            api_content.append("")
        # Section title, underlined with ":" characters.
        api_content.append(title)
        api_content.append(":" * len(title))
        api_content.append("")
        current_family = family

    # Bullet list of cross-references to every object of the family.
    for api_object in sorted(api_objects[family]):
        object_type = "func" if inspect.isfunction(api_object[1]) else "class"
        api_content.append("* :{}:`{}.{}` ".format(object_type, family, api_object[0]))

    # Only the objects listed under the root "bonobo" family additionally get
    # their own sub-section with an autofunction/autoclass directive.
    if family == "bonobo":
        for api_object in sorted(api_objects[family]):
            object_type = "function" if inspect.isfunction(api_object[1]) else "class"
            api_content.append("")
            api_content.append("")
            api_content.append(api_object[0])
            api_content.append("-" * len(api_object[0]))
            api_content.append("")
            api_content.append(".. auto{}:: {}.{}".format(object_type, family, api_object[0]))

print("\n".join(api_content))
modules = [
Module("bonobo", title="Bonobo"),
Module("bonobo", title="Bonobo", automodule_options=["no-members"], append="\n".join(api_content)),
Module("bonobo.config"),
Module("bonobo.constants", automodule_options=["no-members"]),
Module("bonobo.execution"),
Module("bonobo.execution.contexts"),
Module("bonobo.execution.events"),
Module("bonobo.execution.strategies"),
Module("bonobo.nodes"),
Module("bonobo.structs.graphs", title="Graphs"),
Module("bonobo.util"),
]
@ -50,8 +131,13 @@ env = Environment(
:Module: :mod:`{{ name }}`
{% if automodule %}
.. automodule:: {{ name }}
{% for opt in automodule_options %} :{{ opt }}:{{ "\n" }}{% endfor %}
{% endif %}
{% if append %}
{{ append }}
{% endif %}
"""[
1:-1
]

View File

@ -1,6 +1,9 @@
"""
Contains all the tools you need to get started with the framework, including (but not limited to) generic
transformations, readers, writers, and tools for writing and executing graphs and jobs.
The root :mod:`bonobo` package contains all the tools you need to get started with the framework, including (but not
limited to) generic transformations, readers, writers, and tools for writing and executing graphs and jobs.
Mostly, it exposes objects found in sub-packages, and although you can access them directly here, you should refer to
the matching documentation pages when using them.
All objects in this module are considered very safe to use, and backward compatibility when moving up from one version
to another is maximal.

View File

@ -1,3 +1,10 @@
"""
The :mod:`bonobo.nodes` module contains all builtin transformations that you can use out of the box in your ETL jobs.
Please note that all objects from this package are also available directly through the root :mod:`bonobo` package.
"""
from bonobo.nodes.basics import *
from bonobo.nodes.basics import __all__ as _all_basics
from bonobo.nodes.filter import Filter

View File

@ -1,51 +1,40 @@
Graphs
======
Graphs are the glue that ties transformations together. They are the only data-structure bonobo can execute directly. Graphs
must be acyclic, and can contain as many nodes as your system can handle. However, although in theory the number of nodes can be rather high, practical use cases usually do not exceed more than a few hundred nodes and only then in extreme cases.
Graphs are the glue that ties transformations together. They are the only data-structure bonobo can execute directly.
Graphs must be acyclic, and can contain as many nodes as your system can handle. However, although in theory the number
of nodes can be rather high, practical cases usually do not exceed a few hundred nodes and even that is a rather high
number you may not encounter so often.
Within a graph, each node are isolated and can only communicate using their
input and output queues. For each input row, a given node will be called with
the row passed as arguments. Each *return* or *yield* value will be put on the
node's output queue, and the nodes connected in the graph will then be able to
process it.
Within a graph, each node is isolated and can only communicate using its input and output queues. For each input row,
a given node will be called with the row passed as arguments. Each *return* or *yield* value will be put on the node's
output queue, and the nodes connected in the graph will then be able to process it.
|bonobo| is a line-by-line data stream processing solution.
Handling the data-flow this way brings the following properties:
- **First in, first out**: unless stated otherwise, each node will receeive the
rows from FIFO queues, and so, the order of rows will be preserved. That is
true for each single node, but please note that if you define "graph bubbles"
(where a graph diverge in different branches then converge again), the
convergence node will receive rows FIFO from each input queue, meaning that
the order existing at the divergence point wont stay true at the convergence
point.
- **First in, first out**: unless stated otherwise, each node will receive the rows from FIFO queues, and so, the order
of rows will be preserved. That is true for each single node, but please note that if you define "graph bubbles"
(where a graph diverges into different branches then converges again), the convergence node will receive rows FIFO from
each input queue, meaning that the order existing at the divergence point won't stay true at the convergence point.
- **Parallelism**: each node run in parallel (by default, using independent
threads). This is useful as you don't have to worry about blocking calls.
If a thread waits for, let's say, a database, or a network service, the other
nodes will continue handling data, as long as they have input rows available.
- **Parallelism**: each node runs in parallel (by default, using independent threads). This is useful as you don't have
to worry about blocking calls. If a thread waits for, let's say, a database, or a network service, the other nodes
will continue handling data, as long as they have input rows available.
- **Independence**: the rows are independent from each other, making this way
of working with data flows good for line-by-line data processing, but
also not ideal for "grouped" computations (where an output depends on more
than one line of input data). You can overcome this with rolling windows if
the input required are adjacent rows, but if you need to work on the whole
dataset at once, you should consider other software.
- **Independence**: the rows are independent from each other, making this way of working with data flows good for
line-by-line data processing, but also not ideal for "grouped" computations (where an output depends on more than one
line of input data). You can overcome this with rolling windows if the required inputs are adjacent rows, but if you
need to work on the whole dataset at once, you should consider other software.
Graphs are defined using :class:`bonobo.Graph` instances, as seen in the
previous tutorial step.
.. warning::
This document is currently reviewed to check for correctness after the 0.6 release.
Graphs are defined using :class:`bonobo.Graph` instances, as seen in the previous tutorial step.
What can be a node?
:::::::::::::::::::
What can be used as a node?
:::::::::::::::::::::::::::
**TL;DR**: … anything, as long as its callable().
**TL;DR**: … anything, as long as it's callable() or iterable.
Functions
---------
@ -55,7 +44,100 @@ Functions
def get_item(id):
return id, items.get(id)
When building your graph, you can simply add your function:
.. code-block:: python
graph.add_chain(..., get_item, ...)
Or using the new syntax:
.. code-block:: python
graph >> ... >> get_item >> ...
.. note::
Please note that we pass the function object, and not the result of the function being called. A common mistake is
to call the function while building the graph, which won't work and may be tedious to debug.
As a convention, we use snake_cased objects when the object can be directly passed to a graph, like this function.
Some functions are factories for closures, and thus behave differently (as you need to call them to get an actual
object usable as a transformation). When it is the case, we use CamelCase as a convention, as it behaves the same way
as a class.
Classes
-------
.. code-block:: python
class Foo:
...
def __call__(self, id):
return id, self.get(id)
When building your graph, you can add an instance of your object (or even multiple instances, possibly configured
differently):
.. code-block:: python
graph.add_chain(..., Foo(), ...)
Or using the new syntax:
.. code-block:: python
graph >> ... >> Foo() >> ...
Iterables (generators, lists, ...)
----------------------------------
As a convenience tool, we can use iterables directly within a graph. They can either be used as producer nodes (nodes that
are normally only called once and produce data) or, in case of generators, as transformations.
.. code-block:: python
def product(x):
for i in range(10):
yield x, i, x * i
Then, add it to a graph:
.. code-block:: python
graph.add_chain(range(10), product, ...)
Or using the new syntax:
.. code-block:: python
graph >> range(10) >> product >> ...
Builtins
--------
Again, as long as it is callable, you can use it as a node. It means that Python builtins work (think about `print` or
`str.upper`...)
.. code-block:: python
graph.add_chain(range(ord("a"), ord("z")+1), chr, str.upper, print)
Or using the new syntax:
.. code-block:: python
graph >> range(ord("a"), ord("z")+1) >> chr >> str.upper >> print
What happens during the graph execution?
::::::::::::::::::::::::::::::::::::::::
Each node of a graph will be executed in isolation from the other nodes, and the data is passed from one node to the
next using FIFO queues, managed by the framework. It's transparent to the end-user, though, and you'll only use
@ -90,9 +172,9 @@ It allows to have ETL jobs that ignore faulty data and try their best to process
Some errors are fatal, though.
If you pass a 2 elements tuple to a node that takes 3 args, |bonobo| will raise an :class:`bonobo.errors.UnrecoverableTypeError`, and exit the
current graph execution as fast as it can (finishing the other node executions that are in progress first, but not
starting new ones if there are remaining input rows).
If you pass a 2 elements tuple to a node that takes 3 args, |bonobo| will raise an
:class:`bonobo.errors.UnrecoverableTypeError`, and exit the current graph execution as fast as it can (finishing the
other node executions that are in progress first, but not starting new ones if there are remaining input rows).
Definitions
@ -108,12 +190,20 @@ Node
included in a graph, multiple graphs, or not at all.
Creating a graph
::::::::::::::::
Building graphs
:::::::::::::::
Graphs in |bonobo| are instances of :class:`bonobo.Graph`
Graphs should be instances of :class:`bonobo.Graph`. The :func:`bonobo.Graph.add_chain` method can take as many
positional parameters as you want.
.. note::
As of |bonobo| 0.7, a new syntax is available that we believe is more powerful and more readable than the legacy
`add_chain` method. The former API is here to stay and it's perfectly safe to use it, but if it is an option, you
should consider the new syntax. During the transition period, we'll document both.
.. code-block:: python
import bonobo
@ -121,6 +211,16 @@ positional parameters as you want.
graph = bonobo.Graph()
graph.add_chain(a, b, c)
Or using the new syntax:
.. code-block:: python
import bonobo
graph = bonobo.Graph()
graph >> a >> b >> c
Resulting graph:
.. graphviz::
@ -149,6 +249,16 @@ To create two or more divergent data streams ("forks"), you should specify the `
graph.add_chain(a, b, c)
graph.add_chain(f, g, _input=b)
Or using the new syntax:
.. code-block:: python
import bonobo
graph = bonobo.Graph()
graph >> a >> b >> c
graph.get_cursor(b) >> f >> g
Resulting graph:
@ -184,6 +294,21 @@ To merge two data streams, you can use the `_output` kwarg to `add_chain`, or us
graph.add_chain(a, b, _output=normalize)
graph.add_chain(f, g, _output=normalize)
Or using the new syntax:
.. code-block:: python
import bonobo
graph = bonobo.Graph()
# Here we set _input to None, so normalize won't start on its own but only after it receives input from the other chains.
graph.get_cursor(None) >> normalize >> store
# Add two different chains
graph >> a >> b >> normalize
graph >> f >> g >> normalize
Resulting graph:
@ -230,6 +355,9 @@ Please note that naming a chain is exactly the same thing as naming the first no
graph.add_chain(a, b, _output="load")
graph.add_chain(f, g, _output="load")
Using the new syntax, there should not be a need to name nodes. Let us know if you think otherwise by creating an issue.
Resulting graph:
.. graphviz::
@ -283,6 +411,11 @@ You may want to connect two nodes at some point. You can use `add_chain` without
# Connect them
graph.add_chain(_input=a, _output=b)
Or using the new syntax:
.. code-block:: python
graph.get_cursor(a) >> b
Inspecting graphs

View File

@ -5,6 +5,120 @@
:Module: :mod:`bonobo`
.. automodule:: bonobo
:no-members:
Graphs
::::::
* :class:`bonobo.structs.graphs.Graph`
Nodes
:::::
* :class:`bonobo.nodes.CsvReader`
* :class:`bonobo.nodes.CsvWriter`
* :class:`bonobo.nodes.FileReader`
* :class:`bonobo.nodes.FileWriter`
* :class:`bonobo.nodes.Filter`
* :class:`bonobo.nodes.FixedWindow`
* :func:`bonobo.nodes.Format`
* :class:`bonobo.nodes.JsonReader`
* :class:`bonobo.nodes.JsonWriter`
* :class:`bonobo.nodes.LdjsonReader`
* :class:`bonobo.nodes.LdjsonWriter`
* :class:`bonobo.nodes.Limit`
* :func:`bonobo.nodes.MapFields`
* :func:`bonobo.nodes.OrderFields`
* :class:`bonobo.nodes.PickleReader`
* :class:`bonobo.nodes.PickleWriter`
* :class:`bonobo.nodes.PrettyPrinter`
* :class:`bonobo.nodes.RateLimited`
* :func:`bonobo.nodes.Rename`
* :func:`bonobo.nodes.SetFields`
* :func:`bonobo.nodes.Tee`
* :func:`bonobo.nodes.UnpackItems`
* :func:`bonobo.nodes.count`
* :func:`bonobo.nodes.identity`
* :func:`bonobo.nodes.noop`
Other top-level APIs
::::::::::::::::::::
* :func:`bonobo.create_reader`
* :func:`bonobo.create_strategy`
* :func:`bonobo.create_writer`
* :func:`bonobo.get_argument_parser`
* :func:`bonobo.get_examples_path`
* :func:`bonobo.inspect`
* :func:`bonobo.open_examples_fs`
* :func:`bonobo.open_fs`
* :func:`bonobo.parse_args`
* :func:`bonobo.run`
create_reader
-------------
.. autofunction:: bonobo.create_reader
create_strategy
---------------
.. autofunction:: bonobo.create_strategy
create_writer
-------------
.. autofunction:: bonobo.create_writer
get_argument_parser
-------------------
.. autofunction:: bonobo.get_argument_parser
get_examples_path
-----------------
.. autofunction:: bonobo.get_examples_path
inspect
-------
.. autofunction:: bonobo.inspect
open_examples_fs
----------------
.. autofunction:: bonobo.open_examples_fs
open_fs
-------
.. autofunction:: bonobo.open_fs
parse_args
----------
.. autofunction:: bonobo.parse_args
run
---
.. autofunction:: bonobo.run

View File

@ -5,6 +5,9 @@
:Module: :mod:`bonobo.config`
.. automodule:: bonobo.config

View File

@ -5,7 +5,10 @@
:Module: :mod:`bonobo.constants`
.. automodule:: bonobo.constants
:no-members:

View File

@ -5,15 +5,9 @@
:Module: :mod:`bonobo.execution`
.. toctree::
:caption: Submodules
:maxdepth: 1
execution/contexts
execution/events
execution/strategies
.. automodule:: bonobo.execution

View File

@ -5,6 +5,9 @@
:Module: :mod:`bonobo.execution.contexts`
.. automodule:: bonobo.execution.contexts

View File

@ -5,6 +5,9 @@
:Module: :mod:`bonobo.execution.events`
.. automodule:: bonobo.execution.events

View File

@ -5,6 +5,9 @@
:Module: :mod:`bonobo.execution.strategies`
.. automodule:: bonobo.execution.strategies

View File

@ -0,0 +1,13 @@
:mod:`Nodes <bonobo.nodes>`
===========================
.. currentmodule:: bonobo.nodes
:Module: :mod:`bonobo.nodes`
.. automodule:: bonobo.nodes

View File

@ -0,0 +1,13 @@
:mod:`Graphs <bonobo.structs.graphs>`
=====================================
.. currentmodule:: bonobo.structs.graphs
:Module: :mod:`bonobo.structs.graphs`
.. automodule:: bonobo.structs.graphs

View File

@ -5,6 +5,9 @@
:Module: :mod:`bonobo.util`
.. automodule:: bonobo.util

View File

@ -18,7 +18,6 @@ or
.. toctree::
:maxdepth: 4

View File

@ -11,8 +11,9 @@ means that the api is not yet 1.0-proof.
api/bonobo/config
api/bonobo/constants
api/bonobo/execution
api/bonobo/nodes
api/bonobo/structs/graphs
api/bonobo/util
commands
settings
examples
private

View File

@ -1,9 +0,0 @@
.. warning::
This tutorial was written for |bonobo| 0.5, while the current stable version is |bonobo| 0.6.
Please be aware that some things changed.
A summary of changes is available in the `migration guide from 0.5 to 0.6 <https://news.bonobo-project.org/migration-guide-for-bonobo-0-6-alpha-c1d36b0a9d35>`_.

View File

@ -1,65 +0,0 @@
First steps
===========
.. include:: _outdated_note.rst
What is Bonobo?
:::::::::::::::
Bonobo is an ETL (Extract-Transform-Load) framework for python 3.5. The goal is to define data-transformations, with
python code in charge of handling similar shaped independent lines of data.
Bonobo *is not* a statistical or data-science tool. If you're looking for a data-analysis tool in python, use Pandas.
Bonobo is a lean manufacturing assembly line for data that let you focus on the actual work instead of the plumbery
(execution contexts, parallelism, error handling, console output, logging, ...).
Bonobo uses simple python and should be quick and easy to learn.
Tutorial
::::::::
.. note::
Good documentation is not easy to write. We do our best to make it better and better.
Although all content here should be accurate, you may feel a lack of completeness, for which we plead guilty and
apologize.
If you're stuck, please come and ask on our `slack channel <https://bonobo-slack.herokuapp.com/>`_, we'll figure
something out.
If you're not stuck but had trouble understanding something, please consider contributing to the docs (via GitHub
pull requests).
.. toctree::
:maxdepth: 2
tut01
tut02
tut03
tut04
What's next?
::::::::::::
Read a few examples
-------------------
* :doc:`/reference/examples`
Read about best development practices
-------------------------------------
* :doc:`/guide/index`
* :doc:`/guide/purity`
Read about integrating external tools with bonobo
-------------------------------------------------
* :doc:`/extension/docker`: run transformation graphs in isolated containers.
* :doc:`/extension/jupyter`: run transformations within jupyter notebooks.
* :doc:`/extension/selenium`: crawl the web using a real browser and work with the gathered data.
* :doc:`/extension/sqlalchemy`: everything you need to interact with SQL databases.

View File

@ -1,13 +0,0 @@
Just enough Python for Bonobo
=============================
.. include:: _outdated_note.rst
.. todo::
This is a work in progress and it is not yet available. Please come back later or even better, help us write this
guide!
This guide is intended to help programmers or enthusiasts to grasp the python basics necessary to use Bonobo. It
should definitely not be considered a general python introduction, nor a deep dive into details.

View File

@ -1,202 +0,0 @@
Let's get started!
==================
.. include:: _outdated_note.rst
To begin with Bonobo, you need to install it in a working python 3.5+ environment, and you'll also need cookiecutter
to bootstrap your project.
.. code-block:: shell-session
$ pip install bonobo cookiecutter
See :doc:`/install` for more options.
Create an empty project
:::::::::::::::::::::::
Your ETL code will live in ETL projects, which are basically a bunch of files, including python code, that bonobo
can run.
.. code-block:: shell-session
$ bonobo init tutorial
This will create a `tutorial` directory (`content description here <https://www.bonobo-project.org/with/cookiecutter>`_).
To run this project, use:
.. code-block:: shell-session
$ bonobo run tutorial
Write a first transformation
::::::::::::::::::::::::::::
Open `tutorial/main.py`, and delete all the code here.
A transformation can be whatever python can call. Simplest transformations are functions and generators.
Let's write one:
.. code-block:: python
def transform(x):
return x.upper()
Easy.
.. note::
This function is very similar to :func:`str.upper`, which you can use directly.
Let's write two more transformations for the "extract" and "load" steps. In this example, we'll generate the data from
scratch, and we'll use stdout to "simulate" data-persistence.
.. code-block:: python
def extract():
yield 'foo'
yield 'bar'
yield 'baz'
def load(x):
print(x)
Bonobo makes no difference between generators (yielding functions) and regular functions. It will, in all cases, iterate
on things returned, and a normal function will just be seen as a generator that yields only once.
.. note::
Once again, you should use the builtin :func:`print` directly instead of this `load()` function.
Create a transformation graph
:::::::::::::::::::::::::::::
Amongst other features, Bonobo will mostly help you there with the following:
* Execute the transformations in independent threads
* Pass the outputs of one thread to other(s) thread(s) inputs.
To do this, it needs to know what data-flow you want to achieve, and you'll use a :class:`bonobo.Graph` to describe it.
.. code-block:: python
import bonobo
graph = bonobo.Graph(extract, transform, load)
if __name__ == '__main__':
bonobo.run(graph)
.. graphviz::
digraph {
rankdir = LR;
stylesheet = "../_static/graphs.css";
BEGIN [shape="point"];
BEGIN -> "extract" -> "transform" -> "load";
}
.. note::
The `if __name__ == '__main__':` section is not required, unless you want to run it directly using the python
interpreter.
Execute the job
:::::::::::::::
Save `tutorial/main.py` and execute your transformation again:
.. code-block:: shell-session
$ bonobo run tutorial
This example is available in :mod:`bonobo.examples.tutorials.tut01e01`, and you can also run it as a module:
.. code-block:: shell-session
$ bonobo run -m bonobo.examples.tutorials.tut01e01
Rewrite it using builtins
:::::::::::::::::::::::::
There is a much simpler way to describe an equivalent graph:
.. literalinclude:: ../../bonobo/examples/tutorials/tut01e02.py
:language: python
The `extract()` generator has been replaced by a list, as Bonobo will interpret non-callable iterables as a no-input
generator.
This example is also available in :mod:`bonobo.examples.tutorials.tut01e02`, and you can also run it as a module:
.. code-block:: shell-session
$ bonobo run -m bonobo.examples.tutorials.tut01e02
You can now jump to the next part (:doc:`tut02`), or read a small summary of concepts and definitions introduced here
below.
Takeaways
:::::::::
① The :class:`bonobo.Graph` class is used to represent a data-processing pipeline.
It can represent simple list-like linear graphs, like here, but it can also represent much more complex graphs, with
forks and joins.
This is what the graph we defined looks like:
.. graphviz::
digraph {
rankdir = LR;
BEGIN [shape="point"];
BEGIN -> "iter(['foo', 'bar', 'baz'])" -> "str.upper" -> "print";
}
`Transformations` are simple python callables. Whatever can be called can be used as a `transformation`. Callables can
either `return` or `yield` data to send it to the next step. Regular functions (using `return`) should be preferred if
each call is guaranteed to return exactly one result, while generators (using `yield`) should be preferred if the
number of output lines for a given input varies.
③ The `Graph` instance, or `transformation graph` is executed using an `ExecutionStrategy`. You won't use it directly,
but :func:`bonobo.run` created an instance of :class:`bonobo.ThreadPoolExecutorStrategy` under the hood (the default
strategy). Actual behavior of an execution will depend on the strategy chosen, but the default should be fine for most
cases.
④ Before actually executing the `transformations`, the `ExecutorStrategy` instance will wrap each component in an
`execution context`, whose responsibility is to hold the state of the transformation. It enables you to keep the
`transformations` stateless, while allowing you to add an external state if required. We'll expand on this later.
Concepts and definitions
::::::::::::::::::::::::
* **Transformation**: a callable that takes input (as call parameters) and returns output(s), either as its return value or
by yielding values (a.k.a returning a generator).
* **Transformation graph (or Graph)**: a set of transformations tied together in a :class:`bonobo.Graph` instance, which is
a directed acyclic graph (or DAG).
* **Node**: a graph element, most probably a transformation in a graph.
* **Execution strategy (or strategy)**: a way to run a transformation graph. Its responsibility is mainly to parallelize
(or not) the transformations, on one or more process and/or computer, and to setup the right queuing mechanism for
transformations' inputs and outputs.
* **Execution context (or context)**: a wrapper around a node that holds the state for it. If the node needs state, there
are tools available in bonobo to feed it to the transformation using additional call parameters, keeping
transformations stateless.
Next
::::
Time to jump to the second part: :doc:`tut02`.

View File

@ -1,123 +0,0 @@
Working with files
==================
.. include:: _outdated_note.rst
Bonobo would be pointless if the aim was just to uppercase small lists of strings.
In fact, Bonobo should not be used if you don't expect any gain from parallelization/distribution of tasks.
Some background...
::::::::::::::::::
Let's take the following graph:
.. graphviz::
digraph {
rankdir = LR;
BEGIN [shape="point"];
BEGIN -> "A" -> "B" -> "C";
"B" -> "D";
}
When run, the execution strategy wraps every component in a thread (assuming you're using the default
:class:`bonobo.strategies.ThreadPoolExecutorStrategy`).
Bonobo will send each line of data in the input node's thread (here, `A`). Now, each time `A` *yields* or *returns*
something, it will be pushed on `B` input :class:`queue.Queue`, and will be consumed by `B`'s thread. Meanwhile, `A`
will continue to run, if it's not done.
When there is more than one node linked as the output of a node (for example, with `B`, `C`, and `D`), the same thing
happens except that each result coming out of `B` will be sent to both on `C` and `D` input :class:`queue.Queue`.
One thing to keep in mind here is that as the objects are passed from thread to thread, you need to write "pure"
transformations (see :doc:`/guide/purity`).
You generally don't have to think about it. Just be aware that your nodes will run in parallel, and don't worry
too much about nodes running blocking operations, as they will run in parallel. As soon as a line of output is ready,
the next nodes will start consuming it.
That being said, let's manipulate some files.
Reading a file
::::::::::::::
There are a few component builders available in **Bonobo** that let you read from (or write to) files.
All readers work the same way. They need a filesystem to work with, and open a "path" they will read from.
* :class:`bonobo.CsvReader`
* :class:`bonobo.FileReader`
* :class:`bonobo.JsonReader`
* :class:`bonobo.PickleReader`
We'll use a text file that was generated using Bonobo from the "liste-des-cafes-a-un-euro" dataset made available by
Mairie de Paris under the Open Database License (ODbL). You can `explore the original dataset
<https://opendata.paris.fr/explore/dataset/liste-des-cafes-a-un-euro/information/>`_.
You'll need the `"coffeeshops.txt" example dataset <https://github.com/python-bonobo/bonobo/blob/master/bonobo/examples/datasets/coffeeshops.txt>`_,
available in **Bonobo**'s repository:
.. code-block:: shell-session
$ curl https://raw.githubusercontent.com/python-bonobo/bonobo/master/bonobo/examples/datasets/coffeeshops.txt > `python3 -c 'import bonobo; print(bonobo.get_examples_path("datasets/coffeeshops.txt"))'`
.. note::
The "example dataset download" step will be easier in the future.
https://github.com/python-bonobo/bonobo/issues/134
.. literalinclude:: ../../bonobo/examples/tutorials/tut02e01_read.py
:language: python
You can also run this example as a module (but you'll still need the dataset...):
.. code-block:: shell-session
$ bonobo run -m bonobo.examples.tutorials.tut02e01_read
.. note::
Don't focus too much on the `get_services()` function for now. It is required, with this exact name, but we'll get
into that in a few minutes.
Writing to files
::::::::::::::::
Let's split this file's each lines on the first comma and store a json file mapping coffee names to their addresses.
Here are, like the readers, the classes available to write files
* :class:`bonobo.CsvWriter`
* :class:`bonobo.FileWriter`
* :class:`bonobo.JsonWriter`
* :class:`bonobo.PickleWriter`
Let's write a first implementation:
.. literalinclude:: ../../bonobo/examples/tutorials/tut02e02_write.py
:language: python
(run it with :code:`bonobo run -m bonobo.examples.tutorials.tut02e02_write` or :code:`bonobo run myfile.py`)
If you read the output file, you'll see it misses the "map" part of the problem.
Let's extend :class:`bonobo.io.JsonWriter` to finish the job:
.. literalinclude:: ../../bonobo/examples/tutorials/tut02e03_writeasmap.py
:language: python
(run it with :code:`bonobo run -m bonobo.examples.tutorials.tut02e03_writeasmap` or :code:`bonobo run myfile.py`)
It should produce a nice map.
We favored a bit hackish solution here instead of constructing a map in python then passing the whole to
:func:`json.dumps` because we want to work with streams, if you have to construct the whole data structure in python,
you'll lose a lot of bonobo's benefits.
Next
::::
Time to write some more advanced transformations, with service dependencies: :doc:`tut03`.

View File

@ -1,202 +0,0 @@
Configurables and Services
==========================
.. include:: _outdated_note.rst
.. note::
This section lacks completeness, sorry for that (but you can still read it!).
In the last section, we used a few new tools.
Class-based transformations and configurables
:::::::::::::::::::::::::::::::::::::::::::::
Bonobo is a bit dumb. If something is callable, it considers it can be used as a transformation, and it's up to the
user to provide callables that logically fit in a graph.
You can use plain python objects with a `__call__()` method, and it will just work.
As a lot of transformations need common machinery, there are a few tools to quickly build transformations, most of
them requiring your class to subclass :class:`bonobo.config.Configurable`.
Configurables allow you to use the following features:
* You can add **Options** (using the :class:`bonobo.config.Option` descriptor). Options can be positional, or keyword
based, can have a default value and will be consumed from the constructor arguments.
.. code-block:: python
from bonobo.config import Configurable, Option
class PrefixIt(Configurable):
prefix = Option(str, positional=True, default='>>>')
def call(self, row):
return self.prefix + ' ' + row
prefixer = PrefixIt('$')
* You can add **Services** (using the :class:`bonobo.config.Service` descriptor). Services are a subclass of
:class:`bonobo.config.Option`, sharing the same basics, but specialized in the definition of "named services" that
will be resolved at runtime (i.e. for which we will provide an implementation at runtime). We'll dive more into that
in the next section.
.. code-block:: python
from bonobo.config import Configurable, Option, Service
class HttpGet(Configurable):
url = Option(default='https://jsonplaceholder.typicode.com/users')
http = Service('http.client')
def call(self, http):
resp = http.get(self.url)
for row in resp.json():
yield row
http_get = HttpGet()
* You can add **Methods** (using the :class:`bonobo.config.Method` descriptor). :class:`bonobo.config.Method` is a
subclass of :class:`bonobo.config.Option` that allows passing callable parameters, either to the class constructor,
or using the class as a decorator.
.. code-block:: python
from bonobo.config import Configurable, Method
class Applier(Configurable):
apply = Method()
def call(self, row):
return self.apply(row)
@Applier
def Prefixer(self, row):
return 'Hello, ' + row
prefixer = Prefixer()
* You can add **ContextProcessors**, which are an advanced feature we won't introduce here. If you're familiar with
pytest, you can think of them as pytest fixtures, execution wise.
Services
::::::::
The motivation behind services is mostly separation of concerns, testability and deployability.
Usually, your transformations will depend on services (like a filesystem, an http client, a database, a rest api, ...).
Those services can very well be hardcoded in the transformations, but there are two main drawbacks:
* You won't be able to change the implementation depending on the current environment (development laptop versus
production servers, bug-hunting session versus execution, etc.)
* You won't be able to test your transformations without testing the associated services.
To overcome those caveats of hardcoding things, we define Services in the configurable, which are basically
string-options of the service names, and we provide an implementation at the last moment possible.
There are two ways of providing implementations:
* Either file-wide, by providing a `get_services()` function that returns a dict of named implementations (we did so
with filesystems in the previous step, :doc:`tut02`)
* Or directory-wide, by providing a `get_services()` function in a specially named `_services.py` file.
The first is simpler if you only have one transformation graph in one file, the second allows you to group coherent
transformations together in a directory and share the implementations.
Let's see how to use it, starting from the previous service example:
.. code-block:: python
from bonobo.config import Configurable, Option, Service
class HttpGet(Configurable):
url = Option(default='https://jsonplaceholder.typicode.com/users')
http = Service('http.client')
def call(self, http):
resp = http.get(self.url)
for row in resp.json():
yield row
We defined an "http.client" service, that obviously should have a `get()` method, returning responses that have a
`json()` method.
Let's provide two implementations for that. The first one will be using `requests <http://docs.python-requests.org/>`_,
that coincidentally satisfies the described interface:
.. code-block:: python
import bonobo
import requests
def get_services():
return {
'http.client': requests
}
graph = bonobo.Graph(
HttpGet(),
print,
)
If you run this code, you should see some mock data returned by the webservice we called (assuming it's up and you can
reach it).
Now, the second implementation will replace that with a mock, used for testing purposes:
.. code-block:: python
class HttpResponseStub:
def json(self):
return [
{'id': 1, 'name': 'Leanne Graham', 'username': 'Bret', 'email': 'Sincere@april.biz', 'address': {'street': 'Kulas Light', 'suite': 'Apt. 556', 'city': 'Gwenborough', 'zipcode': '92998-3874', 'geo': {'lat': '-37.3159', 'lng': '81.1496'}}, 'phone': '1-770-736-8031 x56442', 'website': 'hildegard.org', 'company': {'name': 'Romaguera-Crona', 'catchPhrase': 'Multi-layered client-server neural-net', 'bs': 'harness real-time e-markets'}},
{'id': 2, 'name': 'Ervin Howell', 'username': 'Antonette', 'email': 'Shanna@melissa.tv', 'address': {'street': 'Victor Plains', 'suite': 'Suite 879', 'city': 'Wisokyburgh', 'zipcode': '90566-7771', 'geo': {'lat': '-43.9509', 'lng': '-34.4618'}}, 'phone': '010-692-6593 x09125', 'website': 'anastasia.net', 'company': {'name': 'Deckow-Crist', 'catchPhrase': 'Proactive didactic contingency', 'bs': 'synergize scalable supply-chains'}},
]
class HttpStub:
def get(self, url):
return HttpResponseStub()
def get_services():
return {
'http.client': HttpStub()
}
graph = bonobo.Graph(
HttpGet(),
print,
)
The `Graph` definition staying the exact same, you can easily substitute the `_services.py` file depending on your
environment (the way you're doing this is out of bonobo scope and heavily depends on your usual way of managing
configuration files on different platforms).
Starting with bonobo 0.5 (not yet released), you will be able to use service injections with function-based
transformations too, using the `bonobo.config.requires` decorator to mark a dependency.
.. code-block:: python
from bonobo.config import requires
@requires('http.client')
def http_get(http):
resp = http.get('https://jsonplaceholder.typicode.com/users')
for row in resp.json():
yield row
Read more
:::::::::
* :doc:`/guide/services`
* :doc:`/reference/api_config`
Next
::::
:doc:`tut04`.

View File

@ -1,216 +0,0 @@
Working with databases
======================
.. include:: _outdated_note.rst
Databases (and especially SQL databases here) are not the focus of Bonobo, thus support for it is not (and will never
be) included in the main package. Instead, working with databases is done using third party, well maintained and
specialized packages, like SQLAlchemy, or other database access libraries from the python cheese shop.
.. note::
SQLAlchemy extension is not yet complete. Things may be not optimal, and some APIs will change. You can still try,
of course.
Consider the following document as a "preview" (yes, it should work, yes it may break in the future).
Also, note that for early development stages, we explicitly support only PostgreSQL, although it may work well
with `any other database supported by SQLAlchemy <http://docs.sqlalchemy.org/en/latest/core/engines.html#supported-databases>`_.
First, read https://www.bonobo-project.org/with/sqlalchemy for instructions on how to install. You **do need** the
bleeding edge version of `bonobo` and `bonobo-sqlalchemy` to make this work.
Requirements
::::::::::::
Once you installed `bonobo_sqlalchemy` (read https://www.bonobo-project.org/with/sqlalchemy to use bleeding edge
version), install the following additional packages:
.. code-block:: shell-session
$ pip install -U python-dotenv psycopg2 awesome-slugify
Those packages are not required by the extension, but `python-dotenv` will help us configure the database DSN, and
`psycopg2` is required by SQLAlchemy to connect to PostgreSQL databases. Also, we'll use a slugifier to create unique
identifiers for the database (maybe not what you'd do in the real world, but very much sufficient for example purposes).
Configure a database engine
:::::::::::::::::::::::::::
Open your `_services.py` file and replace the code:
.. code-block:: python
import bonobo, dotenv, logging, os
from bonobo_sqlalchemy.util import create_postgresql_engine
dotenv.load_dotenv(dotenv.find_dotenv())
logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
def get_services():
return {
'fs': bonobo.open_examples_fs('datasets'),
'fs.output': bonobo.open_fs(),
'sqlalchemy.engine': create_postgresql_engine(**{
'name': 'tutorial',
'user': 'tutorial',
'pass': 'tutorial',
})
}
The `create_postgresql_engine` is a tiny function building the DSN from reasonable defaults, that you can override
either by providing kwargs, or with system environment variables. If you want to override something, open the `.env`
file and add values for one or more of `POSTGRES_NAME`, `POSTGRES_USER`, `POSTGRES_PASS`, `POSTGRES_HOST`,
`POSTGRES_PORT`. Please note that kwargs always take precedence over the environment, but that you should prefer using
environment variables for anything that is not immutable from one platform to another.
Add database operation to the graph
:::::::::::::::::::::::::::::::::::
Let's create a `tutorial/pgdb.py` job:
.. code-block:: python
import bonobo
import bonobo_sqlalchemy
from bonobo.examples.tutorials.tut02e03_writeasmap import graph, split_one_to_map
graph = graph.copy()
graph.add_chain(
bonobo_sqlalchemy.InsertOrUpdate('coffeeshops'),
_input=split_one_to_map
)
Notes here:
* We use the code from :doc:`tut02`, which is bundled with bonobo in the `bonobo.examples.tutorials` package.
* We "fork" the graph, by creating a copy and appending a new "chain", starting at a point that exists in the other
graph.
* We use :class:`bonobo_sqlalchemy.InsertOrUpdate` (whose role, in case it is not obvious, is to create database rows if
they do not exist yet, or update the existing row, based on a "discriminant" criterion (by default, "id")).
If we run this transformation (with `bonobo run tutorial/pgdb.py`), we should get an error:
.. code-block:: text
| File ".../lib/python3.6/site-packages/psycopg2/__init__.py", line 130, in connect
| conn = _connect(dsn, connection_factory=connection_factory, **kwasync)
| sqlalchemy.exc.OperationalError: (psycopg2.OperationalError) FATAL: database "tutorial" does not exist
|
|
| The above exception was the direct cause of the following exception:
|
| Traceback (most recent call last):
| File ".../bonobo-devkit/bonobo/bonobo/strategies/executor.py", line 45, in _runner
| node_context.start()
| File ".../bonobo-devkit/bonobo/bonobo/execution/base.py", line 75, in start
| self._stack.setup(self)
| File ".../bonobo-devkit/bonobo/bonobo/config/processors.py", line 94, in setup
| _append_to_context = next(_processed)
| File ".../bonobo-devkit/bonobo-sqlalchemy/bonobo_sqlalchemy/writers.py", line 43, in create_connection
| raise UnrecoverableError('Could not create SQLAlchemy connection: {}.'.format(str(exc).replace('\n', ''))) from exc
| bonobo.errors.UnrecoverableError: Could not create SQLAlchemy connection: (psycopg2.OperationalError) FATAL: database "tutorial" does not exist.
The database we requested does not exist. It is not the role of bonobo to do database administration, and thus there is
no tool here to create either the database or the tables we want to use.
Create database and table
:::::::::::::::::::::::::
There are however tools in `sqlalchemy` to manage tables, so we'll create the database by ourselves, and ask sqlalchemy
to create the table:
.. code-block:: shell-session
$ psql -U postgres -h localhost
psql (9.6.1, server 9.6.3)
Type "help" for help.
postgres=# CREATE ROLE tutorial WITH LOGIN PASSWORD 'tutorial';
CREATE ROLE
postgres=# CREATE DATABASE tutorial WITH OWNER=tutorial TEMPLATE=template0 ENCODING='utf-8';
CREATE DATABASE
Now, let's use a little trick and add this section to `pgdb.py`:
.. code-block:: python
import sys
from sqlalchemy import Table, Column, String, Integer, MetaData
def main():
from bonobo.commands.run import get_default_services
services = get_default_services(__file__)
if len(sys.argv) == 1:
return bonobo.run(graph, services=services)
elif len(sys.argv) == 2 and sys.argv[1] == 'reset':
engine = services.get('sqlalchemy.engine')
metadata = MetaData()
coffee_table = Table(
'coffeeshops',
metadata,
Column('id', String(255), primary_key=True),
Column('name', String(255)),
Column('address', String(255)),
)
metadata.drop_all(engine)
metadata.create_all(engine)
else:
raise NotImplementedError('I do not understand.')
if __name__ == '__main__':
main()
.. note::
We're using private API of bonobo here, which is unsatisfactory, discouraged and may change. Some way to get the
service dictionary will be added to the public api in a future release of bonobo.
Now run:
.. code-block:: shell-session
$ python tutorial/pgdb.py reset
Database and table should now exist.
Format the data
:::::::::::::::
Let's prepare our data for the database, and change the `.add_chain(..)` call to do it prior to `InsertOrUpdate(...)`:
.. code-block:: python
from slugify import slugify_url
def format_for_db(row):
name, address = list(row.items())[0]
return {
'id': slugify_url(name),
'name': name,
'address': address,
}
# ...
graph = graph.copy()
graph.add_chain(
format_for_db,
bonobo_sqlalchemy.InsertOrUpdate('coffeeshops'),
_input=split_one_to_map
)
Run!
::::
You can now run the script (either with `bonobo run tutorial/pgdb.py` or directly with the python interpreter, as we
added a "main" section) and the dataset should be inserted in your database. If you run it again, no new rows are
created.
Note that as we forked the graph from :doc:`tut02`, the transformation also writes the data to `coffeeshops.json`, as
before.