Merge pull request #136 from hartym/develop

merge develop
This commit is contained in:
Romain Dorgueil
2017-07-06 12:48:49 +02:00
committed by GitHub
14 changed files with 643 additions and 180 deletions

View File

@ -5,7 +5,10 @@ graph = bonobo.Graph(
print,
)
def get_services():
return {'fs': bonobo.open_examples_fs('datasets')}
if __name__ == '__main__':
bonobo.run(
graph, services={'fs': bonobo.open_examples_fs('datasets')}
)
bonobo.run(graph, services=get_services())

View File

@ -8,10 +8,18 @@ def split_one(line):
graph = bonobo.Graph(
bonobo.FileReader('coffeeshops.txt'),
split_one,
bonobo.JsonWriter('coffeeshops.json', ioformat='arg0'),
bonobo.JsonWriter(
'coffeeshops.json', fs='fs.output', ioformat='arg0'
),
)
def get_services():
return {
'fs': bonobo.open_examples_fs('datasets'),
'fs.output': bonobo.open_fs(),
}
if __name__ == '__main__':
bonobo.run(
graph, services={'fs': bonobo.open_examples_fs('datasets')}
)
bonobo.run(graph, services=get_services())

View File

@ -1,4 +1,6 @@
import bonobo, json
import json
import bonobo
def split_one_to_map(line):
@ -18,10 +20,16 @@ class MyJsonWriter(bonobo.JsonWriter):
graph = bonobo.Graph(
bonobo.FileReader('coffeeshops.txt'),
split_one_to_map,
MyJsonWriter('coffeeshops.json'),
MyJsonWriter('coffeeshops.json', fs='fs.output', ioformat='arg0'),
)
def get_services():
return {
'fs': bonobo.open_examples_fs('datasets'),
'fs.output': bonobo.open_fs(),
}
if __name__ == '__main__':
bonobo.run(
graph, services={'fs': bonobo.open_examples_fs('datasets')}
)
bonobo.run(graph, services=get_services())

View File

@ -1,3 +1,5 @@
from copy import copy
from bonobo.constants import BEGIN
@ -62,6 +64,15 @@ class Graph:
return self
def copy(self):
g = Graph()
g.edges = copy(self.edges)
g.named = copy(self.named)
g.nodes = copy(self.nodes)
return g
@property
def topologically_sorted_indexes(self):
"""Iterate in topological order, based on networkx's topological_sort() function.

View File

@ -2,105 +2,116 @@
{% set title = _('Bonobo — Data processing for humans') %}
{% block body %}
<div style="border: 2px solid red; font-weight: bold; margin: 1em; padding: 1em">
Bonobo is <strong>ALPHA</strong> software. Some APIs will change.
</div>
<h1 style="text-align: center">
<img class="logo" src="{{ pathto('_static/bonobo.png', 1) }}" title="Bonobo" alt="Bonobo"
style=" width: 128px; height: 128px;"/>
</h1>
<h1 style="text-align: center">
<img class="logo" src="{{ pathto('_static/bonobo.png', 1) }}" title="Bonobo" alt="Bonobo"
style=" width: 128px; height: 128px;"/>
</h1>
<p>
{% trans %}
<strong>Bonobo</strong> is a line-by-line data-processing toolkit for python 3.5+ (extract-transform-load
framework) emphasizing simple and atomic data transformations defined using a directed graph of plain old
python objects (functions, iterables, generators, ...).
{% endtrans %}
</p>
<h2 style="margin-bottom: 0">{% trans %}Documentation{% endtrans %}</h2>
<table class="contentstable">
<tr>
<td>
<p class="biglink"><a class="biglink" href="{{ pathto("tutorial/index") }}">{% trans %}First steps{% endtrans %}</a><br/>
<span class="linkdescr">{% trans %}quick overview of basic features{% endtrans %}</span></p>
</td>
<td>
<p class="biglink"><a class="biglink" href="{{ pathto("search") }}">{% trans %}
Search{% endtrans %}</a><br/>
<span class="linkdescr">{% trans %}search the documentation{% endtrans %}</span></p>
</td>
</tr>
<tr>
<td>
<p class="biglink"><a class="biglink" href="{{ pathto("guide/index") }}">{% trans %}
Guides{% endtrans %}</a><br/>
<span class="linkdescr">{% trans %}for a complete overview{% endtrans %}</span>
</p>
</td>
<td>
<p class="biglink"><a class="biglink" href="{{ pathto("reference/index") }}">{% trans %}References{% endtrans %}</a>
<br/>
<span class="linkdescr">{% trans %}all functions, classes, terms{% endtrans %}</span>
</p>
</td>
</tr>
<tr>
<td>
<p class="biglink"><a class="biglink" target="_blank" href="https://github.com/python-bonobo/bonobo/tree/master/bonobo/examples">{% trans %}
Cookbook{% endtrans %}</a><br/>
<span class="linkdescr">{% trans %}examples and recipes{% endtrans %}</span></p>
</td>
<td>
<p class="biglink"><a class="biglink" href="{{ pathto("contribute/index") }}">{% trans %}
Contribute{% endtrans %}</a><br/>
<span class="linkdescr">{% trans %}contributor guide{% endtrans %}</span></p>
</td>
</tr>
</table>
<h2>Features</h2>
<ul>
<li>
{% trans %}
<b>10 minutes to get started:</b> Know some python? Writing your first data processor is an affair
of minutes.
{% endtrans %}
</li>
<li>
{% trans %}
<b>Data sources and targets:</b> HTML, JSON, XML, SQL databases, NoSQL databases, HTTP/REST APIs,
streaming APIs, python objects...
{% endtrans %}
</li>
<li>
{% trans %}
<b>Service injection:</b> Abstract the transformation dependencies to easily switch data sources and
dependant libraries. You'll be able to specify the concrete implementations or configurations at
runtime, for example to switch a database connection string or an API endpoint.
{% endtrans %}
</li>
<li>
{% trans %}
<b>Plugins:</b> Easily add features to all your transformations by using builtin plugins (Jupyter,
Console, ...) or write your own.
{% endtrans %}
</li>
<li>
{% trans %}
Bonobo is young, and the todo-list is huge. Read the <a href="https://www.bonobo-project.org/roadmap">roadmap</a>.
{% endtrans %}
</li>
</ul>
<p>{% trans %}
You can also download PDF/EPUB versions of the Bonobo documentation:
<a href="http://readthedocs.org/projects/bonobo/downloads/pdf/stable/">PDF version</a>,
<a href="http://readthedocs.org/projects/bonobo/downloads/epub/stable/">EPUB version</a>.
<p>
{% trans %}
<strong>Bonobo</strong> is a line-by-line data-processing toolkit for python 3.5+ (extract-transform-load
framework, or ETL) emphasizing simple and atomic data transformations defined using a directed graph of plain old
python objects (functions, iterables, generators, ...).
{% endtrans %}
</p>
</p>
<div style="border: 2px solid red; font-weight: bold; margin: 1em; padding: 1em">
Bonobo is <strong>ALPHA</strong> software. Some APIs will change.
</div>
<h2 style="margin-bottom: 0">{% trans %}Documentation{% endtrans %}</h2>
<table class="contentstable">
<tr>
<td>
<p class="biglink"><a class="biglink" href="{{ pathto("tutorial/index") }}">{% trans %}First steps{%
endtrans %}</a><br/>
<span class="linkdescr">{% trans %}quick overview of basic features{% endtrans %}</span></p>
</td>
<td>
<p class="biglink"><a class="biglink" href="{{ pathto("search") }}">{% trans %}
Search{% endtrans %}</a><br/>
<span class="linkdescr">{% trans %}search the documentation{% endtrans %}</span></p>
</td>
</tr>
<tr>
<td>
<p class="biglink"><a class="biglink" href="{{ pathto("guide/index") }}">{% trans %}
Guides{% endtrans %}</a><br/>
<span class="linkdescr">{% trans %}for a complete overview{% endtrans %}</span>
</p>
</td>
<td>
<p class="biglink"><a class="biglink" href="{{ pathto("reference/index") }}">{% trans %}References{%
endtrans %}</a>
<br/>
<span class="linkdescr">{% trans %}all functions, classes, terms{% endtrans %}</span>
</p>
</td>
</tr>
<tr>
<td>
<p class="biglink"><a class="biglink" target="_blank"
href="https://github.com/python-bonobo/bonobo/tree/master/bonobo/examples">{% trans %}
Cookbook{% endtrans %}</a><br/>
<span class="linkdescr">{% trans %}examples and recipes{% endtrans %}</span></p>
</td>
<td>
<p class="biglink"><a class="biglink" href="{{ pathto("contribute/index") }}">{% trans %}
Contribute{% endtrans %}</a><br/>
<span class="linkdescr">{% trans %}contributor guide{% endtrans %}</span></p>
</td>
</tr>
</table>
<h2>Features</h2>
<ul>
<li>
{% trans %}
<b>10 minutes to get started:</b> Know some python? Writing your first data processor is an affair
of minutes.
{% endtrans %}
</li>
<li>
{% trans %}
<b>Data sources and targets:</b> HTML, JSON, XML, SQL databases, NoSQL databases, HTTP/REST APIs,
streaming APIs, python objects...
{% endtrans %}
</li>
<li>
{% trans %}
<b>Service injection:</b> Abstract the transformation dependencies to easily switch data sources and
dependent libraries. You'll be able to specify the concrete implementations or configurations at
runtime, for example to switch a database connection string or an API endpoint.
{% endtrans %}
</li>
<li>
{% trans %}
<b>Plugins:</b> Easily add features to all your transformations by using builtin plugins (Jupyter,
Console, ...) or write your own.
{% endtrans %}
</li>
<li>
{% trans %}
Bonobo is young, and the todo-list is huge. Read the <a
href="https://www.bonobo-project.org/roadmap">roadmap</a>.
{% endtrans %}
</li>
</ul>
<p>{% trans %}
You can also download PDF/EPUB versions of the Bonobo documentation:
<a href="http://readthedocs.org/projects/bonobo/downloads/pdf/stable/">PDF version</a>,
<a href="http://readthedocs.org/projects/bonobo/downloads/epub/stable/">EPUB version</a>.
{% endtrans %}
</p>
<h2>Table of contents</h2>
<div>
{{ toctree(maxdepth=2, collapse=False)}}
</div>
{% endblock %}

View File

@ -4,6 +4,7 @@ Bonobo with SQLAlchemy
.. todo:: The `bonobo-sqlalchemy` package is at a very alpha stage, and things will change. This section is here to
give a brief overview but is neither complete nor definitive.
Read the introduction: https://www.bonobo-project.org/with/sqlalchemy
Installation
::::::::::::

View File

@ -8,8 +8,8 @@ Bonobo
tutorial/index
guide/index
reference/index
contribute/index
faq
contribute/index
genindex
modindex

View File

@ -4,36 +4,47 @@ Installation
Create an ETL project
:::::::::::::::::::::
If you only want to use Bonobo to code ETLs, your easiest option to get started is to use our
`cookiecutter template <https://github.com/python-bonobo/cookiecutter-bonobo>`_.
Creating a project and starting to write code should take less than a minute:
.. code-block:: shell-session
$ pip install --upgrade bonobo cookiecutter
$ bonobo init my-etl-project
$ bonobo run my-etl-project
Once you bootstrapped a project, you can start editing the default example transformation by editing
`my-etl-project/main.py`. Now, you can head to :doc:`tutorial/index`.
Other installation options
::::::::::::::::::::::::::
Install from PyPI
:::::::::::::::::
-----------------
You can also install it directly from the `Python Package Index <https://pypi.python.org/pypi/bonobo>`_.
You can install it directly from the `Python Package Index <https://pypi.python.org/pypi/bonobo>`_ (like we did above).
.. code-block:: shell-session
$ pip install bonobo
Install from source
:::::::::::::::::::
-------------------
If you want to install an unreleased version, you can use git urls with pip. This is useful when using bonobo as a
dependency of your code and you want to try a forked version of bonobo with your software. You can use the git+http
string in your `requirements.txt` file. However, the best option for development on bonobo directly is not this one,
but editable installs (see below).
dependency of your code and you want to try a forked version of bonobo with your software. You can use a `git+http`
string in your `requirements.txt` file. However, the best option for development on bonobo is an editable install (see
below).
.. code-block:: shell-session
$ pip install git+https://github.com/python-bonobo/bonobo.git@master#egg=bonobo
$ pip install git+https://github.com/python-bonobo/bonobo.git@develop#egg=bonobo
Editable install
::::::::::::::::
----------------
If you plan on making patches to Bonobo, you should install it as an "editable" package, which is a really great pip feature.
Pip will clone your repository in a source directory and create a symlink for it in the site-package directory of your
python interpreter.
If you plan on making patches to Bonobo, you should install it as an "editable" package, which is a really great pip
feature. Pip will clone your repository in a source directory and create a symlink for it in the site-package directory
of your python interpreter.
.. code-block:: shell-session
@ -63,20 +74,17 @@ I usually name the git remote for the main bonobo repository "upstream", and my
$ git remote rename origin upstream
$ git remote add origin git@github.com:hartym/bonobo.git
$ git fetch --all
Of course, replace my github username by the one you used to fork bonobo. You should be good to go!
Windows support
:::::::::::::::
There are problems on the windows platform, mostly due to the fact bonobo was not developed by experienced windows users.
There are minor issues on the windows platform, mostly due to the fact bonobo was not developed by experienced windows
users.
We're trying to look into that but energy available to provide serious support on windows is very limited.
If you have experience in this domain and you're willing to help, you're more than welcome!
.. todo::
Better install docs, especially on how to use different forks or branches, etc.

View File

@ -9,17 +9,26 @@ python code in charge of handling similar shaped independant lines of data.
Bonobo *is not* a statistical or data-science tool. If you're looking for a data-analysis tool in python, use Pandas.
Bonobo is a lean manufacturing assembly line for data that let you focus on the actual work instead of the plumbery.
Bonobo is a lean manufacturing assembly line for data that lets you focus on the actual work instead of the plumbing
(execution contexts, parallelism, error handling, console output, logging, ...).
Bonobo uses simple python and should be quick and easy to learn.
Tutorial
::::::::
Warning: the documentation is still in progress. Although all content here should be accurate, you may feel a lack of
completeness, for which we plaid guilty and apologize. If there is something blocking, please come on our
`slack channel <https://bonobo-slack.herokuapp.com/>`_ and complain, we'll figure something out. If there is something
that did not block you but can be a no-go for others, please consider contributing to the docs.
.. note::
Good documentation is not easy to write. We do our best to make it better and better.
Although all content here should be accurate, you may feel a lack of completeness, for which we plead guilty and
apologize.
If you're stuck, please come and ask on our `slack channel <https://bonobo-slack.herokuapp.com/>`_, we'll figure
something out.
If you're not stuck but had trouble understanding something, please consider contributing to the docs (via github
pull requests).
.. toctree::
:maxdepth: 2

View File

@ -19,7 +19,7 @@ can run.
.. code-block:: shell-session
bonobo init tutorial
$ bonobo init tutorial
This will create a `tutorial` directory (`content description here <https://www.bonobo-project.org/with/cookiecutter>`_).
@ -27,15 +27,15 @@ To run this project, use:
.. code-block:: shell-session
bonobo run tutorial
$ bonobo run tutorial
Write a first transformation
::::::::::::::::::::::::::::
Open `tutorial/__main__.py`, and delete all the code here.
Open `tutorial/main.py`, and delete all the code here.
A transformation can be whatever python can call, having inputs and outputs. Simplest transformations are functions.
A transformation can be whatever python can call. Simplest transformations are functions and generators.
Let's write one:
@ -48,10 +48,10 @@ Easy.
.. note::
This is about the same as :func:`str.upper`, and in the real world, you'd use it directly.
This function is very similar to :func:`str.upper`, which you can use directly.
Let's write two more transformations for the "extract" and "load" steps. In this example, we'll generate the data from
scratch, and we'll use stdout to simulate data-persistence.
scratch, and we'll use stdout to "simulate" data-persistence.
.. code-block:: python
@ -68,16 +68,16 @@ on things returned, and a normal function will just be seen as a generator that
.. note::
Once again, :func:`print` would be used directly in a real-world transformation.
Once again, you should use the builtin :func:`print` directly instead of this `load()` function.
Create a transformation graph
:::::::::::::::::::::::::::::
Bonobo main roles are two things:
Amongst other features, Bonobo will mostly help you there with the following:
* Execute the transformations in independent threads
* Pass the outputs of one thread to other(s) thread(s).
* Pass the outputs of one thread to the inputs of other thread(s).
To do this, it needs to know what data-flow you want to achieve, and you'll use a :class:`bonobo.Graph` to describe it.
@ -109,17 +109,17 @@ To do this, it needs to know what data-flow you want to achieve, and you'll use
Execute the job
:::::::::::::::
Save `tutorial/__main__.py` and execute your transformation:
Save `tutorial/main.py` and execute your transformation again:
.. code-block:: shell-session
bonobo run tutorial
$ bonobo run tutorial
This example is available in :mod:`bonobo.examples.tutorials.tut01e01`, and you can also run it as a module:
.. code-block:: shell-session
bonobo run -m bonobo.examples.tutorials.tut01e01
$ bonobo run -m bonobo.examples.tutorials.tut01e01
Rewrite it using builtins
@ -127,27 +127,17 @@ Rewrite it using builtins
There is a much simpler way to describe an equivalent graph:
.. code-block:: python
.. literalinclude:: ../../bonobo/examples/tutorials/tut01e02.py
:language: python
import bonobo
The `extract()` generator has been replaced by a list, as Bonobo will interpret non-callable iterables as a no-input
generator.
graph = bonobo.Graph(
['foo', 'bar', 'baz',],
str.upper,
print,
)
if __name__ == '__main__':
bonobo.run(graph)
We use a shortcut notation for the generator, with a list. Bonobo will wrap an iterable as a generator by itself if it
is added in a graph.
This example is available in :mod:`bonobo.examples.tutorials.tut01e02`, and you can also run it as a module:
This example is also available in :mod:`bonobo.examples.tutorials.tut01e02`, and you can also run it as a module:
.. code-block:: shell-session
bonobo run -m bonobo.examples.tutorials.tut01e02
$ bonobo run -m bonobo.examples.tutorials.tut01e02
You can now jump to the next part (:doc:`tut02`), or read a small summary of concepts and definitions introduced here
below.
@ -188,19 +178,19 @@ cases.
Concepts and definitions
::::::::::::::::::::::::
* Transformation: a callable that takes input (as call parameters) and returns output(s), either as its return value or
* **Transformation**: a callable that takes input (as call parameters) and returns output(s), either as its return value or
by yielding values (a.k.a returning a generator).
* Transformation graph (or Graph): a set of transformations tied together in a :class:`bonobo.Graph` instance, which is
* **Transformation graph (or Graph)**: a set of transformations tied together in a :class:`bonobo.Graph` instance, which is
a directed acyclic graph (or DAG).
* Node: a graph element, most probably a transformation in a graph.
* **Node**: a graph element, most probably a transformation in a graph.
* Execution strategy (or strategy): a way to run a transformation graph. It's responsibility is mainly to parallelize
* **Execution strategy (or strategy)**: a way to run a transformation graph. Its responsibility is mainly to parallelize
(or not) the transformations, on one or more process and/or computer, and to setup the right queuing mechanism for
transformations' inputs and outputs.
* Execution context (or context): a wrapper around a node that holds the state for it. If the node needs state, there
* **Execution context (or context)**: a wrapper around a node that holds the state for it. If the node needs state, there
are tools available in bonobo to feed it to the transformation using additional call parameters, keeping
transformations stateless.

View File

@ -23,16 +23,18 @@ When run, the execution strategy wraps every component in a thread (assuming you
:class:`bonobo.strategies.ThreadPoolExecutorStrategy`).
Bonobo will send each line of data in the input node's thread (here, `A`). Now, each time `A` *yields* or *returns*
something, it will be pushed on `B` input :class:`queue.Queue`, and will be consumed by `B`'s thread.
something, it will be pushed on `B` input :class:`queue.Queue`, and will be consumed by `B`'s thread. Meanwhile, `A`
will continue to run, if it's not done.
When there is more than one node linked as the output of a node (for example, with `B`, `C`, and `D`) , the same thing
When there is more than one node linked as the output of a node (for example, with `B`, `C`, and `D`), the same thing
happens except that each result coming out of `B` will be sent to both `C` and `D` input :class:`queue.Queue`.
One thing to keep in mind here is that as the objects are passed from thread to thread, you need to write "pure"
transformations (see :doc:`/guide/purity`).
You generally don't have to think about it. Just be aware that your nodes will run in parallel, and don't worry
too much about blocking nodes, as they won't block other nodes.
too much about nodes running blocking operations, as they will run in parallel. As soon as a line of output is ready,
the next nodes will start consuming it.
That being said, let's manipulate some files.
@ -52,18 +54,33 @@ We'll use a text file that was generated using Bonobo from the "liste-des-cafes-
Mairie de Paris under the Open Database License (ODbL). You can `explore the original dataset
<https://opendata.paris.fr/explore/dataset/liste-des-cafes-a-un-euro/information/>`_.
You'll need the `example dataset <https://github.com/python-bonobo/bonobo/blob/master/bonobo/examples/datasets/coffeeshops.txt>`_,
available in **Bonobo**'s repository.
You'll need the `"coffeeshops.txt" example dataset <https://github.com/python-bonobo/bonobo/blob/master/bonobo/examples/datasets/coffeeshops.txt>`_,
available in **Bonobo**'s repository:
.. code-block:: shell-session
$ curl https://raw.githubusercontent.com/python-bonobo/bonobo/master/bonobo/examples/datasets/coffeeshops.txt > `python -c 'import bonobo; print(bonobo.get_examples_path("datasets/coffeeshops.txt"))'`
.. note::
The "example dataset download" step will be easier in the future.
https://github.com/python-bonobo/bonobo/issues/134
.. literalinclude:: ../../bonobo/examples/tutorials/tut02e01_read.py
:language: python
You can run this example as a module:
You can also run this example as a module (but you'll still need the dataset...):
.. code-block:: shell-session
$ bonobo run -m bonobo.examples.tutorials.tut02e01_read
.. note::
Don't focus too much on the `get_services()` function for now. It is required, with this exact name, but we'll get
into that in a few minutes.
Writing to files
::::::::::::::::

View File

@ -1,9 +1,195 @@
Configurables and Services
==========================
This document does not exist yet, but will be available soon.
.. note::
Meanwhile, you can read the matching references:
This section lacks completeness, sorry for that (but you can still read it!).
In the last section, we used a few new tools.
Class-based transformations and configurables
:::::::::::::::::::::::::::::::::::::::::::::
Bonobo is a bit dumb. If something is callable, it considers it can be used as a transformation, and it's up to the
user to provide callables that logically fit in a graph.
You can use plain python objects with a `__call__()` method, and it will just work.
As a lot of transformations need common machinery, there are a few tools to quickly build transformations, most of
them requiring your class to subclass :class:`bonobo.config.Configurable`.
Configurables allow you to use the following features:
* You can add **Options** (using the :class:`bonobo.config.Option` descriptor). Options can be positional, or keyword
based, can have a default value and will be consumed from the constructor arguments.
.. code-block:: python
from bonobo.config import Configurable, Option
class PrefixIt(Configurable):
prefix = Option(str, positional=True, default='>>>')
def call(self, row):
return self.prefix + ' ' + row
prefixer = PrefixIt('$')
* You can add **Services** (using the :class:`bonobo.config.Service` descriptor). Services are a subclass of
:class:`bonobo.config.Option`, sharing the same basics, but specialized in the definition of "named services" that
will be resolved at runtime (a.k.a for which we will provide an implementation at runtime). We'll dive more into that
in the next section.
.. code-block:: python
from bonobo.config import Configurable, Option, Service
class HttpGet(Configurable):
url = Option(default='https://jsonplaceholder.typicode.com/users')
http = Service('http.client')
def call(self, http):
resp = http.get(self.url)
for row in resp.json():
yield row
http_get = HttpGet()
* You can add **Methods** (using the :class:`bonobo.config.Method` descriptor). :class:`bonobo.config.Method` is a
subclass of :class:`bonobo.config.Option` that allows passing callable parameters, either to the class constructor,
or using the class as a decorator.
.. code-block:: python
from bonobo.config import Configurable, Method
class Applier(Configurable):
apply = Method()
def call(self, row):
return self.apply(row)
@Applier
def Prefixer(self, row):
return 'Hello, ' + row
prefixer = Prefixer()
* You can add **ContextProcessors**, which are an advanced feature we won't introduce here. If you're familiar with
pytest, you can think of them as pytest fixtures, execution wise.
Services
::::::::
The motivation behind services is mostly separation of concerns, testability and deployability.
Usually, your transformations will depend on services (like a filesystem, an http client, a database, a rest api, ...).
Those services can very well be hardcoded in the transformations, but there are two main drawbacks:
* You won't be able to change the implementation depending on the current environment (development laptop versus
production servers, bug-hunting session versus execution, etc.)
* You won't be able to test your transformations without testing the associated services.
To overcome those caveats of hardcoding things, we define Services in the configurable, which are basically
string-options of the service names, and we provide an implementation at the last moment possible.
There are two ways of providing implementations:
* Either file-wide, by providing a `get_services()` function that returns a dict of named implementations (we did so
with filesystems in the previous step, :doc:`tut02`)
* Or directory-wide, by providing a `get_services()` function in a specially named `_services.py` file.
The first is simpler if you only have one transformation graph in one file, the second lets you group coherent
transformations together in a directory and share the implementations.
Let's see how to use it, starting from the previous service example:
.. code-block:: python
from bonobo.config import Configurable, Option, Service
class HttpGet(Configurable):
url = Option(default='https://jsonplaceholder.typicode.com/users')
http = Service('http.client')
def call(self, http):
resp = http.get(self.url)
for row in resp.json():
yield row
We defined an "http.client" service, that obviously should have a `get()` method, returning responses that have a
`json()` method.
Let's provide two implementations for that. The first one will be using `requests <http://docs.python-requests.org/>`_,
that coincidentally satisfies the described interface:
.. code-block:: python
import bonobo
import requests
def get_services():
return {
'http.client': requests
}
graph = bonobo.Graph(
HttpGet(),
print,
)
If you run this code, you should see some mock data returned by the webservice we called (assuming it's up and you can
reach it).
Now, the second implementation will replace that with a mock, used for testing purposes:
.. code-block:: python
class HttpResponseStub:
def json(self):
return [
{'id': 1, 'name': 'Leanne Graham', 'username': 'Bret', 'email': 'Sincere@april.biz', 'address': {'street': 'Kulas Light', 'suite': 'Apt. 556', 'city': 'Gwenborough', 'zipcode': '92998-3874', 'geo': {'lat': '-37.3159', 'lng': '81.1496'}}, 'phone': '1-770-736-8031 x56442', 'website': 'hildegard.org', 'company': {'name': 'Romaguera-Crona', 'catchPhrase': 'Multi-layered client-server neural-net', 'bs': 'harness real-time e-markets'}},
{'id': 2, 'name': 'Ervin Howell', 'username': 'Antonette', 'email': 'Shanna@melissa.tv', 'address': {'street': 'Victor Plains', 'suite': 'Suite 879', 'city': 'Wisokyburgh', 'zipcode': '90566-7771', 'geo': {'lat': '-43.9509', 'lng': '-34.4618'}}, 'phone': '010-692-6593 x09125', 'website': 'anastasia.net', 'company': {'name': 'Deckow-Crist', 'catchPhrase': 'Proactive didactic contingency', 'bs': 'synergize scalable supply-chains'}},
]
class HttpStub:
def get(self, url):
return HttpResponseStub()
def get_services():
return {
'http.client': HttpStub()
}
graph = bonobo.Graph(
HttpGet(),
print,
)
The `Graph` definition staying the exact same, you can easily substitute the `_services.py` file depending on your
environment (the way you're doing this is out of bonobo scope and heavily depends on your usual way of managing
configuration files on different platforms).
Starting with bonobo 0.5 (not yet released), you will be able to use service injections with function-based
transformations too, using the `bonobo.config.requires` decorator to mark a dependency.
.. code-block:: python
from bonobo.config import requires
@requires('http.client')
def http_get(http):
resp = http.get('https://jsonplaceholder.typicode.com/users')
for row in resp.json():
yield row
Read more
:::::::::
* :doc:`/guide/services`
* :doc:`/reference/api_config`

View File

@ -1,8 +1,199 @@
Working with databases
======================
This document does not exist yet, but will be available soon.
Databases (and especially SQL databases here) are not the focus of Bonobo, thus support for it is not (and will never
be) included in the main package. Instead, working with databases is done using third party, well maintained and
specialized packages, like SQLAlchemy, or other database access libraries from the python cheese shop.
Meanwhile, you can jump to bonobo-sqlalchemy development repository:
.. note::
SQLAlchemy extension is not yet complete. Things may not be optimal, and some APIs will change. You can still try,
of course.
Consider the following document as a "preview" (yes, it should work, yes it may break in the future).
Also, note that for early development stages, we explicitly support only PostgreSQL, although it may work well
with `any other database supported by SQLAlchemy <http://docs.sqlalchemy.org/en/latest/core/engines.html#supported-databases>`_.
First, read https://www.bonobo-project.org/with/sqlalchemy for instructions on how to install. You **do need** the
bleeding edge version of `bonobo` and `bonobo-sqlalchemy` to make this work.
Additional requirements
:::::::::::::::::::::::
Once you installed `bonobo_sqlalchemy` (read https://www.bonobo-project.org/with/sqlalchemy to use bleeding edge
version), install the following additional packages:
.. code-block:: shell-session
$ pip install -U python-dotenv psycopg2 awesome-slugify
Those packages are not required by the extension, but `python-dotenv` will help us configure the database DSN, and
`psycopg2` is required by SQLAlchemy to connect to PostgreSQL databases. Also, we'll use a slugifier to create unique
identifiers for the database (maybe not what you'd do in the real world, but very much sufficient for example purpose).
Configure a database engine
:::::::::::::::::::::::::::
Open your `_services.py` file and replace the code:
.. code-block:: python
import bonobo
import dotenv
from bonobo_sqlalchemy.util import create_postgresql_engine
dotenv.load_dotenv(dotenv.find_dotenv())
def get_services():
return {
'fs': bonobo.open_fs(),
'db': create_postgresql_engine(name='tutorial')
}
The `create_postgresql_engine` is a tiny function building the DSN from reasonable defaults, that you can override
either by providing kwargs, or with system environment variables. If you want to override something, open the `.env`
file and add values for one or more of `POSTGRES_NAME`, `POSTGRES_USER`, `POSTGRES_PASS`, `POSTGRES_HOST`,
`POSTGRES_PORT`. Please note that kwargs always have precedence on environment, but that you should prefer using
environment variables for anything that is not immutable from one platform to another.
Let's create a `tutorial/pgdb.py` job:
.. code-block:: python
import bonobo
import bonobo_sqlalchemy
from bonobo.examples.tutorials.tut02e03_writeasmap import graph, split_one_to_map
graph = graph.copy()
graph.add_chain(
bonobo_sqlalchemy.InsertOrUpdate('coffeeshops'),
_input=split_one_to_map
)
Notes here:
* We use the code from :doc:`tut02`, which is bundled with bonobo in the `bonobo.examples.tutorials` package.
* We "fork" the graph, by creating a copy and appending a new "chain", starting at a point that exists in the other
graph.
* We use :class:`bonobo_sqlalchemy.InsertOrUpdate` (whose role, in case it is not obvious, is to create database rows if
they do not exist yet, or update the existing row, based on a "discriminant" criterion (by default, "id")).
If we run this transformation (with `bonobo run tutorial/pgdb.py`), we should get an error:
.. code-block:: text
| File ".../lib/python3.6/site-packages/psycopg2/__init__.py", line 130, in connect
| conn = _connect(dsn, connection_factory=connection_factory, **kwasync)
| sqlalchemy.exc.OperationalError: (psycopg2.OperationalError) FATAL: database "tutorial" does not exist
|
|
| The above exception was the direct cause of the following exception:
|
| Traceback (most recent call last):
| File ".../bonobo-devkit/bonobo/bonobo/strategies/executor.py", line 45, in _runner
| node_context.start()
| File ".../bonobo-devkit/bonobo/bonobo/execution/base.py", line 75, in start
| self._stack.setup(self)
| File ".../bonobo-devkit/bonobo/bonobo/config/processors.py", line 94, in setup
| _append_to_context = next(_processed)
| File ".../bonobo-devkit/bonobo-sqlalchemy/bonobo_sqlalchemy/writers.py", line 43, in create_connection
| raise UnrecoverableError('Could not create SQLAlchemy connection: {}.'.format(str(exc).replace('\n', ''))) from exc
| bonobo.errors.UnrecoverableError: Could not create SQLAlchemy connection: (psycopg2.OperationalError) FATAL: database "tutorial" does not exist.
The database we requested does not exist. It is not the role of bonobo to do database administration, and thus there is
no tool here to create either the database or the tables we want to use.
There are however tools in `sqlalchemy` to manage tables, so we'll create the database by ourselves, and ask sqlalchemy
to create the table:
.. code-block:: shell-session
$ psql -U postgres -h localhost
psql (9.6.1, server 9.6.3)
Type "help" for help.
postgres=# CREATE ROLE tutorial WITH LOGIN PASSWORD 'tutorial';
CREATE ROLE
postgres=# CREATE DATABASE tutorial WITH OWNER=tutorial TEMPLATE=template0 ENCODING='utf-8';
CREATE DATABASE
Now, let's use a little trick and add this section to `pgdb.py`:
.. code-block:: python
import logging, sys
from bonobo.commands.run import get_default_services
from sqlalchemy import Table, Column, String, Integer, MetaData
def main():
services = get_default_services(__file__)
if len(sys.argv) == 2 and sys.argv[1] == 'reset':
engine = services.get('sqlalchemy.engine')
metadata = MetaData()
coffee_table = Table(
'coffeeshops',
metadata,
Column('id', String(255), primary_key=True),
Column('name', String(255)),
Column('address', String(255)),
)
logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
metadata.drop_all(engine)
metadata.create_all(engine)
else:
return bonobo.run(graph, services=services)
if __name__ == '__main__':
main()
.. note::
We're using private API of bonobo here, which is unsatisfactory, discouraged and may change. Some way to get the
service dictionary will be added to the public API in a future release of bonobo.
Now run:
.. code-block:: shell-session
$ python tutorial/pgdb.py reset
Database and table should now exist.
Let's prepare our data for the database, and change the `.add_chain(..)` call to do it prior to `InsertOrUpdate(...)`:
.. code-block:: python
from slugify import slugify_url
def format_for_db(row):
name, address = list(row.items())[0]
return {
'id': slugify_url(name),
'name': name,
'address': address,
}
# ...
graph = graph.copy()
graph.add_chain(
format_for_db,
bonobo_sqlalchemy.InsertOrUpdate('coffeeshops'),
_input=split_one_to_map
)
You can now run the script (either with `bonobo run tutorial/pgdb.py` or directly with the python interpreter, as we
added a "main" section) and the dataset should be inserted in your database. If you run it again, no new rows are
created.
Note that as we forked the graph from :doc:`tut02`, the transformation also writes the data to `coffeeshops.json`, as
before.
* https://github.com/hartym/bonobo-sqlalchemy

View File

@ -71,3 +71,23 @@ def test_graph_topological_sort():
assert g.topologically_sorted_indexes.index(3) < g.topologically_sorted_indexes.index(4)
assert g[3] == sentinel.b1
assert g[4] == sentinel.b2
def test_copy():
    """Check that ``Graph.copy()`` yields a graph independent of its source.

    Chains added to either graph after the copy was taken must not change
    the length of the other one.
    """
    source = Graph()
    clone = source.copy()

    # The copy is a distinct object, and both graphs start out empty.
    assert source is not clone
    assert (len(source), len(clone)) == (0, 0)

    # Growing the source graph leaves the copy untouched.
    source.add_chain([])
    assert (len(source), len(clone)) == (1, 0)

    # Growing the copy leaves the source graph untouched.
    clone.add_chain([], identity)
    assert (len(source), len(clone)) == (1, 2)