@ -5,7 +5,10 @@ graph = bonobo.Graph(
|
||||
print,
|
||||
)
|
||||
|
||||
|
||||
def get_services():
    """Build the services mapping handed to ``bonobo.run()`` for this example.

    Returns:
        dict: a single entry, ``'fs'``, bound to the filesystem returned by
        ``bonobo.open_examples_fs('datasets')``.
    """
    examples_fs = bonobo.open_examples_fs('datasets')
    return {'fs': examples_fs}
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
bonobo.run(
|
||||
graph, services={'fs': bonobo.open_examples_fs('datasets')}
|
||||
)
|
||||
bonobo.run(graph, services=get_services())
|
||||
|
||||
@ -8,10 +8,18 @@ def split_one(line):
|
||||
graph = bonobo.Graph(
|
||||
bonobo.FileReader('coffeeshops.txt'),
|
||||
split_one,
|
||||
bonobo.JsonWriter('coffeeshops.json', ioformat='arg0'),
|
||||
bonobo.JsonWriter(
|
||||
'coffeeshops.json', fs='fs.output', ioformat='arg0'
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def get_services():
    """Build the services mapping handed to ``bonobo.run()`` for this example.

    Returns:
        dict: two named filesystems -- ``'fs'`` (from
        ``bonobo.open_examples_fs('datasets')``) and ``'fs.output'`` (from
        ``bonobo.open_fs()``), the latter being the name the JSON writer in
        this file is configured with (``fs='fs.output'``).
    """
    services = {}
    services['fs'] = bonobo.open_examples_fs('datasets')
    services['fs.output'] = bonobo.open_fs()
    return services
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
bonobo.run(
|
||||
graph, services={'fs': bonobo.open_examples_fs('datasets')}
|
||||
)
|
||||
bonobo.run(graph, services=get_services())
|
||||
|
||||
@ -1,4 +1,6 @@
|
||||
import bonobo, json
|
||||
import json
|
||||
|
||||
import bonobo
|
||||
|
||||
|
||||
def split_one_to_map(line):
|
||||
@ -18,10 +20,16 @@ class MyJsonWriter(bonobo.JsonWriter):
|
||||
graph = bonobo.Graph(
|
||||
bonobo.FileReader('coffeeshops.txt'),
|
||||
split_one_to_map,
|
||||
MyJsonWriter('coffeeshops.json'),
|
||||
MyJsonWriter('coffeeshops.json', fs='fs.output', ioformat='arg0'),
|
||||
)
|
||||
|
||||
|
||||
def get_services():
    """Build the services mapping handed to ``bonobo.run()`` for this example.

    Returns:
        dict: two named filesystems -- ``'fs'`` (from
        ``bonobo.open_examples_fs('datasets')``) and ``'fs.output'`` (from
        ``bonobo.open_fs()``), matching the ``fs='fs.output'`` option given to
        ``MyJsonWriter`` in this file.
    """
    services = {}
    services['fs'] = bonobo.open_examples_fs('datasets')
    services['fs.output'] = bonobo.open_fs()
    return services
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
bonobo.run(
|
||||
graph, services={'fs': bonobo.open_examples_fs('datasets')}
|
||||
)
|
||||
bonobo.run(graph, services=get_services())
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
from copy import copy
|
||||
|
||||
from bonobo.constants import BEGIN
|
||||
|
||||
|
||||
@ -62,6 +64,15 @@ class Graph:
|
||||
|
||||
return self
|
||||
|
||||
def copy(self):
    """Return a new ``Graph`` carrying copies of this graph's structure.

    The ``edges``, ``named`` and ``nodes`` attributes are each duplicated
    with ``copy.copy`` and assigned, in that order, onto a freshly
    constructed ``Graph``.

    Returns:
        Graph: the new graph instance.
    """
    # NOTE(review): copy.copy is shallow -- if these attributes hold nested
    # mutable containers, those would be shared with the original; confirm
    # against Graph.__init__ whether that matters.
    duplicate = Graph()

    for attribute in ('edges', 'named', 'nodes'):
        setattr(duplicate, attribute, copy(getattr(self, attribute)))

    return duplicate
|
||||
|
||||
@property
|
||||
def topologically_sorted_indexes(self):
|
||||
"""Iterate in topological order, based on networkx's topological_sort() function.
|
||||
|
||||
207
docs/_templates/index.html
vendored
207
docs/_templates/index.html
vendored
@ -2,105 +2,116 @@
|
||||
{% set title = _('Bonobo — Data processing for humans') %}
|
||||
{% block body %}
|
||||
|
||||
<div style="border: 2px solid red; font-weight: bold; margin: 1em; padding: 1em">
|
||||
Bonobo is <strong>ALPHA</strong> software. Some APIs will change.
|
||||
</div>
|
||||
<h1 style="text-align: center">
|
||||
<img class="logo" src="{{ pathto('_static/bonobo.png', 1) }}" title="Bonobo" alt="Bonobo"
|
||||
style=" width: 128px; height: 128px;"/>
|
||||
</h1>
|
||||
|
||||
<h1 style="text-align: center">
|
||||
<img class="logo" src="{{ pathto('_static/bonobo.png', 1) }}" title="Bonobo" alt="Bonobo"
|
||||
style=" width: 128px; height: 128px;"/>
|
||||
</h1>
|
||||
|
||||
<p>
|
||||
{% trans %}
|
||||
<strong>Bonobo</strong> is a line-by-line data-processing toolkit for python 3.5+ (extract-transform-load
|
||||
framework) emphasizing simple and atomic data transformations defined using a directed graph of plain old
|
||||
python objects (functions, iterables, generators, ...).
|
||||
{% endtrans %}
|
||||
</p>
|
||||
|
||||
<h2 style="margin-bottom: 0">{% trans %}Documentation{% endtrans %}</h2>
|
||||
|
||||
<table class="contentstable">
|
||||
<tr>
|
||||
<td>
|
||||
<p class="biglink"><a class="biglink" href="{{ pathto("tutorial/index") }}">{% trans %}First steps{% endtrans %}</a><br/>
|
||||
<span class="linkdescr">{% trans %}quick overview of basic features{% endtrans %}</span></p>
|
||||
</td>
|
||||
<td>
|
||||
<p class="biglink"><a class="biglink" href="{{ pathto("search") }}">{% trans %}
|
||||
Search{% endtrans %}</a><br/>
|
||||
<span class="linkdescr">{% trans %}search the documentation{% endtrans %}</span></p>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<p class="biglink"><a class="biglink" href="{{ pathto("guide/index") }}">{% trans %}
|
||||
Guides{% endtrans %}</a><br/>
|
||||
<span class="linkdescr">{% trans %}for a complete overview{% endtrans %}</span>
|
||||
</p>
|
||||
</td>
|
||||
<td>
|
||||
<p class="biglink"><a class="biglink" href="{{ pathto("reference/index") }}">{% trans %}References{% endtrans %}</a>
|
||||
<br/>
|
||||
<span class="linkdescr">{% trans %}all functions, classes, terms{% endtrans %}</span>
|
||||
</p>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<p class="biglink"><a class="biglink" target="_blank" href="https://github.com/python-bonobo/bonobo/tree/master/bonobo/examples">{% trans %}
|
||||
Cookbook{% endtrans %}</a><br/>
|
||||
<span class="linkdescr">{% trans %}examples and recipes{% endtrans %}</span></p>
|
||||
</td>
|
||||
<td>
|
||||
<p class="biglink"><a class="biglink" href="{{ pathto("contribute/index") }}">{% trans %}
|
||||
Contribute{% endtrans %}</a><br/>
|
||||
<span class="linkdescr">{% trans %}contributor guide{% endtrans %}</span></p>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<h2>Features</h2>
|
||||
|
||||
<ul>
|
||||
<li>
|
||||
{% trans %}
|
||||
<b>10 minutes to get started:</b> Know some python? Writing your first data processor is an affair
|
||||
of minutes.
|
||||
{% endtrans %}
|
||||
</li>
|
||||
<li>
|
||||
{% trans %}
|
||||
<b>Data sources and targets:</b> HTML, JSON, XML, SQL databases, NoSQL databases, HTTP/REST APIs,
|
||||
streaming APIs, python objects...
|
||||
{% endtrans %}
|
||||
</li>
|
||||
<li>
|
||||
{% trans %}
|
||||
<b>Service injection:</b> Abstract the transformation dependencies to easily switch data sources and
|
||||
dependent libraries. You'll be able to specify the concrete implementations or configurations at
|
||||
runtime, for example to switch a database connection string or an API endpoint.
|
||||
{% endtrans %}
|
||||
</li>
|
||||
<li>
|
||||
{% trans %}
|
||||
<b>Plugins:</b> Easily add features to all your transformations by using builtin plugins (Jupyter,
|
||||
Console, ...) or write your own.
|
||||
{% endtrans %}
|
||||
</li>
|
||||
<li>
|
||||
{% trans %}
|
||||
Bonobo is young, and the todo-list is huge. Read the <a href="https://www.bonobo-project.org/roadmap">roadmap</a>.
|
||||
{% endtrans %}
|
||||
</li>
|
||||
</ul>
|
||||
|
||||
<p>{% trans %}
|
||||
You can also download PDF/EPUB versions of the Bonobo documentation:
|
||||
<a href="http://readthedocs.org/projects/bonobo/downloads/pdf/stable/">PDF version</a>,
|
||||
<a href="http://readthedocs.org/projects/bonobo/downloads/epub/stable/">EPUB version</a>.
|
||||
<p>
|
||||
{% trans %}
|
||||
<strong>Bonobo</strong> is a line-by-line data-processing toolkit for python 3.5+ (extract-transform-load
|
||||
framework, or ETL) emphasizing simple and atomic data transformations defined using a directed graph of plain old
|
||||
python objects (functions, iterables, generators, ...).
|
||||
{% endtrans %}
|
||||
</p>
|
||||
</p>
|
||||
|
||||
<div style="border: 2px solid red; font-weight: bold; margin: 1em; padding: 1em">
|
||||
Bonobo is <strong>ALPHA</strong> software. Some APIs will change.
|
||||
</div>
|
||||
|
||||
|
||||
<h2 style="margin-bottom: 0">{% trans %}Documentation{% endtrans %}</h2>
|
||||
|
||||
<table class="contentstable">
|
||||
<tr>
|
||||
<td>
|
||||
<p class="biglink"><a class="biglink" href="{{ pathto("tutorial/index") }}">{% trans %}First steps{%
|
||||
endtrans %}</a><br/>
|
||||
<span class="linkdescr">{% trans %}quick overview of basic features{% endtrans %}</span></p>
|
||||
</td>
|
||||
<td>
|
||||
<p class="biglink"><a class="biglink" href="{{ pathto("search") }}">{% trans %}
|
||||
Search{% endtrans %}</a><br/>
|
||||
<span class="linkdescr">{% trans %}search the documentation{% endtrans %}</span></p>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<p class="biglink"><a class="biglink" href="{{ pathto("guide/index") }}">{% trans %}
|
||||
Guides{% endtrans %}</a><br/>
|
||||
<span class="linkdescr">{% trans %}for a complete overview{% endtrans %}</span>
|
||||
</p>
|
||||
</td>
|
||||
<td>
|
||||
<p class="biglink"><a class="biglink" href="{{ pathto("reference/index") }}">{% trans %}References{%
|
||||
endtrans %}</a>
|
||||
<br/>
|
||||
<span class="linkdescr">{% trans %}all functions, classes, terms{% endtrans %}</span>
|
||||
</p>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<p class="biglink"><a class="biglink" target="_blank"
|
||||
href="https://github.com/python-bonobo/bonobo/tree/master/bonobo/examples">{% trans %}
|
||||
Cookbook{% endtrans %}</a><br/>
|
||||
<span class="linkdescr">{% trans %}examples and recipes{% endtrans %}</span></p>
|
||||
</td>
|
||||
<td>
|
||||
<p class="biglink"><a class="biglink" href="{{ pathto("contribute/index") }}">{% trans %}
|
||||
Contribute{% endtrans %}</a><br/>
|
||||
<span class="linkdescr">{% trans %}contributor guide{% endtrans %}</span></p>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<h2>Features</h2>
|
||||
|
||||
<ul>
|
||||
<li>
|
||||
{% trans %}
|
||||
<b>10 minutes to get started:</b> Know some python? Writing your first data processor is an affair
|
||||
of minutes.
|
||||
{% endtrans %}
|
||||
</li>
|
||||
<li>
|
||||
{% trans %}
|
||||
<b>Data sources and targets:</b> HTML, JSON, XML, SQL databases, NoSQL databases, HTTP/REST APIs,
|
||||
streaming APIs, python objects...
|
||||
{% endtrans %}
|
||||
</li>
|
||||
<li>
|
||||
{% trans %}
|
||||
<b>Service injection:</b> Abstract the transformation dependencies to easily switch data sources and
|
||||
dependent libraries. You'll be able to specify the concrete implementations or configurations at
|
||||
runtime, for example to switch a database connection string or an API endpoint.
|
||||
{% endtrans %}
|
||||
</li>
|
||||
<li>
|
||||
{% trans %}
|
||||
<b>Plugins:</b> Easily add features to all your transformations by using builtin plugins (Jupyter,
|
||||
Console, ...) or write your own.
|
||||
{% endtrans %}
|
||||
</li>
|
||||
<li>
|
||||
{% trans %}
|
||||
Bonobo is young, and the todo-list is huge. Read the <a
|
||||
href="https://www.bonobo-project.org/roadmap">roadmap</a>.
|
||||
{% endtrans %}
|
||||
</li>
|
||||
</ul>
|
||||
|
||||
<p>{% trans %}
|
||||
You can also download PDF/EPUB versions of the Bonobo documentation:
|
||||
<a href="http://readthedocs.org/projects/bonobo/downloads/pdf/stable/">PDF version</a>,
|
||||
<a href="http://readthedocs.org/projects/bonobo/downloads/epub/stable/">EPUB version</a>.
|
||||
{% endtrans %}
|
||||
</p>
|
||||
|
||||
<h2>Table of contents</h2>
|
||||
|
||||
|
||||
<div>
|
||||
{{ toctree(maxdepth=2, collapse=False)}}
|
||||
</div>
|
||||
{% endblock %}
|
||||
|
||||
@ -4,6 +4,7 @@ Bonobo with SQLAlchemy
|
||||
.. todo:: The `bonobo-sqlalchemy` package is at a very alpha stage, and things will change. This section is here to
|
||||
give a brief overview but is neither complete nor definitive.
|
||||
|
||||
Read the introduction: https://www.bonobo-project.org/with/sqlalchemy
|
||||
|
||||
Installation
|
||||
::::::::::::
|
||||
|
||||
@ -8,8 +8,8 @@ Bonobo
|
||||
tutorial/index
|
||||
guide/index
|
||||
reference/index
|
||||
contribute/index
|
||||
faq
|
||||
contribute/index
|
||||
genindex
|
||||
modindex
|
||||
|
||||
|
||||
@ -4,36 +4,47 @@ Installation
|
||||
Create an ETL project
|
||||
:::::::::::::::::::::
|
||||
|
||||
If you only want to use Bonobo to code ETLs, your easiest option to get started is to use our
|
||||
`cookiecutter template <https://github.com/python-bonobo/cookiecutter-bonobo>`_.
|
||||
Creating a project and starting to write code should take less than a minute:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ pip install --upgrade bonobo cookiecutter
|
||||
$ bonobo init my-etl-project
|
||||
$ bonobo run my-etl-project
|
||||
|
||||
Once you bootstrapped a project, you can start editing the default example transformation by editing
|
||||
`my-etl-project/main.py`. Now, you can head to :doc:`tutorial/index`.
|
||||
|
||||
Other installation options
|
||||
::::::::::::::::::::::::::
|
||||
|
||||
Install from PyPI
|
||||
:::::::::::::::::
|
||||
-----------------
|
||||
|
||||
You can also install it directly from the `Python Package Index <https://pypi.python.org/pypi/bonobo>`_.
|
||||
You can install it directly from the `Python Package Index <https://pypi.python.org/pypi/bonobo>`_ (like we did above).
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ pip install bonobo
|
||||
|
||||
Install from source
|
||||
:::::::::::::::::::
|
||||
-------------------
|
||||
|
||||
If you want to install an unreleased version, you can use git urls with pip. This is useful when using bonobo as a
|
||||
dependency of your code and you want to try a forked version of bonobo with your software. You can use the git+http
|
||||
string in your `requirements.txt` file. However, the best option for development on bonobo directly is not this one,
|
||||
but editable installs (see below).
|
||||
dependency of your code and you want to try a forked version of bonobo with your software. You can use a `git+http`
|
||||
string in your `requirements.txt` file. However, the best option for development on bonobo is an editable install (see
|
||||
below).
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ pip install git+https://github.com/python-bonobo/bonobo.git@master#egg=bonobo
|
||||
$ pip install git+https://github.com/python-bonobo/bonobo.git@develop#egg=bonobo
|
||||
|
||||
Editable install
|
||||
::::::::::::::::
|
||||
----------------
|
||||
|
||||
If you plan on making patches to Bonobo, you should install it as an "editable" package, which is a really great pip feature.
|
||||
Pip will clone your repository in a source directory and create a symlink for it in the site-package directory of your
|
||||
python interpreter.
|
||||
If you plan on making patches to Bonobo, you should install it as an "editable" package, which is a really great pip
|
||||
feature. Pip will clone your repository in a source directory and create a symlink for it in the site-package directory
|
||||
of your python interpreter.
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
@ -63,20 +74,17 @@ I usually name the git remote for the main bonobo repository "upstream", and my
|
||||
|
||||
$ git remote rename origin upstream
|
||||
$ git remote add origin git@github.com:hartym/bonobo.git
|
||||
$ git fetch --all
|
||||
|
||||
Of course, replace my github username by the one you used to fork bonobo. You should be good to go!
|
||||
|
||||
Windows support
|
||||
:::::::::::::::
|
||||
|
||||
There are problems on the windows platform, mostly due to the fact bonobo was not developed by experienced windows users.
|
||||
There are minor issues on the windows platform, mostly due to the fact bonobo was not developed by experienced windows
|
||||
users.
|
||||
|
||||
We're trying to look into that but energy available to provide serious support on windows is very limited.
|
||||
|
||||
If you have experience in this domain and you're willing to help, you're more than welcome!
|
||||
|
||||
|
||||
|
||||
.. todo::
|
||||
|
||||
Better install docs, especially on how to use different forks or branches, etc.
|
||||
|
||||
|
||||
@ -9,17 +9,26 @@ python code in charge of handling similar shaped independant lines of data.
|
||||
|
||||
Bonobo *is not* a statistical or data-science tool. If you're looking for a data-analysis tool in python, use Pandas.
|
||||
|
||||
Bonobo is a lean manufacturing assembly line for data that lets you focus on the actual work instead of the plumbing.
|
||||
Bonobo is a lean manufacturing assembly line for data that lets you focus on the actual work instead of the plumbing
|
||||
(execution contexts, parallelism, error handling, console output, logging, ...).
|
||||
|
||||
Bonobo uses simple python and should be quick and easy to learn.
|
||||
|
||||
Tutorial
|
||||
::::::::
|
||||
|
||||
Warning: the documentation is still in progress. Although all content here should be accurate, you may feel a lack of
|
||||
completeness, for which we plead guilty and apologize. If there is something blocking, please come to our
|
||||
`slack channel <https://bonobo-slack.herokuapp.com/>`_ and complain, we'll figure something out. If there is something
|
||||
that did not block you but can be a no-go for others, please consider contributing to the docs.
|
||||
.. note::
|
||||
|
||||
Good documentation is not easy to write. We do our best to make it better and better.
|
||||
|
||||
Although all content here should be accurate, you may feel a lack of completeness, for which we plead guilty and
|
||||
apologize.
|
||||
|
||||
If you're stuck, please come and ask on our `slack channel <https://bonobo-slack.herokuapp.com/>`_, we'll figure
|
||||
something out.
|
||||
|
||||
If you're not stuck but had trouble understanding something, please consider contributing to the docs (via github
|
||||
pull requests).
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
|
||||
@ -19,7 +19,7 @@ can run.
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
bonobo init tutorial
|
||||
$ bonobo init tutorial
|
||||
|
||||
This will create a `tutorial` directory (`content description here <https://www.bonobo-project.org/with/cookiecutter>`_).
|
||||
|
||||
@ -27,15 +27,15 @@ To run this project, use:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
bonobo run tutorial
|
||||
$ bonobo run tutorial
|
||||
|
||||
|
||||
Write a first transformation
|
||||
::::::::::::::::::::::::::::
|
||||
|
||||
Open `tutorial/__main__.py`, and delete all the code here.
|
||||
Open `tutorial/main.py`, and delete all the code here.
|
||||
|
||||
A transformation can be whatever python can call, having inputs and outputs. Simplest transformations are functions.
|
||||
A transformation can be whatever python can call. Simplest transformations are functions and generators.
|
||||
|
||||
Let's write one:
|
||||
|
||||
@ -48,10 +48,10 @@ Easy.
|
||||
|
||||
.. note::
|
||||
|
||||
This is about the same as :func:`str.upper`, and in the real world, you'd use it directly.
|
||||
This function is very similar to :func:`str.upper`, which you can use directly.
|
||||
|
||||
Let's write two more transformations for the "extract" and "load" steps. In this example, we'll generate the data from
|
||||
scratch, and we'll use stdout to simulate data-persistence.
|
||||
scratch, and we'll use stdout to "simulate" data-persistence.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
@ -68,16 +68,16 @@ on things returned, and a normal function will just be seen as a generator that
|
||||
|
||||
.. note::
|
||||
|
||||
Once again, :func:`print` would be used directly in a real-world transformation.
|
||||
Once again, you should use the builtin :func:`print` directly instead of this `load()` function.
|
||||
|
||||
|
||||
Create a transformation graph
|
||||
:::::::::::::::::::::::::::::
|
||||
|
||||
Bonobo main roles are two things:
|
||||
Amongst other features, Bonobo will mostly help you there with the following:
|
||||
|
||||
* Execute the transformations in independent threads
|
||||
* Pass the outputs of one thread to other thread(s).
|
||||
* Pass the outputs of one thread to the inputs of other thread(s).
|
||||
|
||||
To do this, it needs to know what data-flow you want to achieve, and you'll use a :class:`bonobo.Graph` to describe it.
|
||||
|
||||
@ -109,17 +109,17 @@ To do this, it needs to know what data-flow you want to achieve, and you'll use
|
||||
Execute the job
|
||||
:::::::::::::::
|
||||
|
||||
Save `tutorial/__main__.py` and execute your transformation:
|
||||
Save `tutorial/main.py` and execute your transformation again:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
bonobo run tutorial
|
||||
$ bonobo run tutorial
|
||||
|
||||
This example is available in :mod:`bonobo.examples.tutorials.tut01e01`, and you can also run it as a module:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
bonobo run -m bonobo.examples.tutorials.tut01e01
|
||||
$ bonobo run -m bonobo.examples.tutorials.tut01e01
|
||||
|
||||
|
||||
Rewrite it using builtins
|
||||
@ -127,27 +127,17 @@ Rewrite it using builtins
|
||||
|
||||
There is a much simpler way to describe an equivalent graph:
|
||||
|
||||
.. code-block:: python
|
||||
.. literalinclude:: ../../bonobo/examples/tutorials/tut01e02.py
|
||||
:language: python
|
||||
|
||||
import bonobo
|
||||
The `extract()` generator has been replaced by a list, as Bonobo will interpret non-callable iterables as a no-input
|
||||
generator.
|
||||
|
||||
graph = bonobo.Graph(
|
||||
['foo', 'bar', 'baz',],
|
||||
str.upper,
|
||||
print,
|
||||
)
|
||||
|
||||
if __name__ == '__main__':
|
||||
bonobo.run(graph)
|
||||
|
||||
We use a shortcut notation for the generator, with a list. Bonobo will wrap an iterable as a generator by itself if it
|
||||
is added in a graph.
|
||||
|
||||
This example is available in :mod:`bonobo.examples.tutorials.tut01e02`, and you can also run it as a module:
|
||||
This example is also available in :mod:`bonobo.examples.tutorials.tut01e02`, and you can also run it as a module:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
bonobo run -m bonobo.examples.tutorials.tut01e02
|
||||
$ bonobo run -m bonobo.examples.tutorials.tut01e02
|
||||
|
||||
You can now jump to the next part (:doc:`tut02`), or read a small summary of concepts and definitions introduced here
|
||||
below.
|
||||
@ -188,19 +178,19 @@ cases.
|
||||
Concepts and definitions
|
||||
::::::::::::::::::::::::
|
||||
|
||||
* Transformation: a callable that takes input (as call parameters) and returns output(s), either as its return value or
|
||||
* **Transformation**: a callable that takes input (as call parameters) and returns output(s), either as its return value or
|
||||
by yielding values (a.k.a returning a generator).
|
||||
|
||||
* Transformation graph (or Graph): a set of transformations tied together in a :class:`bonobo.Graph` instance, which is
|
||||
* **Transformation graph (or Graph)**: a set of transformations tied together in a :class:`bonobo.Graph` instance, which is
|
||||
a directed acyclic graph (or DAG).
|
||||
|
||||
* Node: a graph element, most probably a transformation in a graph.
|
||||
* **Node**: a graph element, most probably a transformation in a graph.
|
||||
|
||||
* Execution strategy (or strategy): a way to run a transformation graph. Its responsibility is mainly to parallelize
|
||||
* **Execution strategy (or strategy)**: a way to run a transformation graph. Its responsibility is mainly to parallelize
|
||||
(or not) the transformations, on one or more processes and/or computers, and to set up the right queuing mechanism for
|
||||
transformations' inputs and outputs.
|
||||
|
||||
* Execution context (or context): a wrapper around a node that holds the state for it. If the node needs state, there
|
||||
* **Execution context (or context)**: a wrapper around a node that holds the state for it. If the node needs state, there
|
||||
are tools available in bonobo to feed it to the transformation using additional call parameters, keeping
|
||||
transformations stateless.
|
||||
|
||||
|
||||
@ -23,16 +23,18 @@ When run, the execution strategy wraps every component in a thread (assuming you
|
||||
:class:`bonobo.strategies.ThreadPoolExecutorStrategy`).
|
||||
|
||||
Bonobo will send each line of data in the input node's thread (here, `A`). Now, each time `A` *yields* or *returns*
|
||||
something, it will be pushed on `B` input :class:`queue.Queue`, and will be consumed by `B`'s thread.
|
||||
something, it will be pushed on `B` input :class:`queue.Queue`, and will be consumed by `B`'s thread. Meanwhile, `A`
|
||||
will continue to run, if it's not done.
|
||||
|
||||
When there is more than one node linked as the output of a node (for example, with `B`, `C`, and `D`) , the same thing
|
||||
When there is more than one node linked as the output of a node (for example, with `B`, `C`, and `D`), the same thing
|
||||
happens except that each result coming out of `B` will be sent to both `C` and `D` input :class:`queue.Queue`.
|
||||
|
||||
One thing to keep in mind here is that as the objects are passed from thread to thread, you need to write "pure"
|
||||
transformations (see :doc:`/guide/purity`).
|
||||
|
||||
You generally don't have to think about it. Just be aware that your nodes will run in parallel, and don't worry
|
||||
too much about blocking nodes, as they won't block other nodes.
|
||||
too much about nodes running blocking operations, as they will run in parallel. As soon as a line of output is ready,
|
||||
the next nodes will start consuming it.
|
||||
|
||||
That being said, let's manipulate some files.
|
||||
|
||||
@ -52,18 +54,33 @@ We'll use a text file that was generated using Bonobo from the "liste-des-cafes-
|
||||
Mairie de Paris under the Open Database License (ODbL). You can `explore the original dataset
|
||||
<https://opendata.paris.fr/explore/dataset/liste-des-cafes-a-un-euro/information/>`_.
|
||||
|
||||
You'll need the `example dataset <https://github.com/python-bonobo/bonobo/blob/master/bonobo/examples/datasets/coffeeshops.txt>`_,
|
||||
available in **Bonobo**'s repository.
|
||||
You'll need the `"coffeeshops.txt" example dataset <https://github.com/python-bonobo/bonobo/blob/master/bonobo/examples/datasets/coffeeshops.txt>`_,
|
||||
available in **Bonobo**'s repository:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ curl https://raw.githubusercontent.com/python-bonobo/bonobo/master/bonobo/examples/datasets/coffeeshops.txt > `python -c 'import bonobo; print(bonobo.get_examples_path("datasets/coffeeshops.txt"))'`
|
||||
|
||||
.. note::
|
||||
|
||||
The "example dataset download" step will be easier in the future.
|
||||
|
||||
https://github.com/python-bonobo/bonobo/issues/134
|
||||
|
||||
.. literalinclude:: ../../bonobo/examples/tutorials/tut02e01_read.py
|
||||
:language: python
|
||||
|
||||
You can run this example as a module:
|
||||
You can also run this example as a module (but you'll still need the dataset...):
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ bonobo run -m bonobo.examples.tutorials.tut02e01_read
|
||||
|
||||
.. note::
|
||||
|
||||
Don't focus too much on the `get_services()` function for now. It is required, with this exact name, but we'll get
|
||||
into that in a few minutes.
|
||||
|
||||
Writing to files
|
||||
::::::::::::::::
|
||||
|
||||
|
||||
@ -1,9 +1,195 @@
|
||||
Configurables and Services
|
||||
==========================
|
||||
|
||||
This document does not exist yet, but will be available soon.
|
||||
.. note::
|
||||
|
||||
Meanwhile, you can read the matching references:
|
||||
This section lacks completeness, sorry for that (but you can still read it!).
|
||||
|
||||
In the last section, we used a few new tools.
|
||||
|
||||
Class-based transformations and configurables
|
||||
:::::::::::::::::::::::::::::::::::::::::::::
|
||||
|
||||
Bonobo is a bit dumb. If something is callable, it considers it can be used as a transformation, and it's up to the
|
||||
user to provide callables that logically fit in a graph.
|
||||
|
||||
You can use plain python objects with a `__call__()` method, and it will just work.
|
||||
|
||||
As a lot of transformations need common machinery, there are a few tools to quickly build transformations, most of
|
||||
them requiring your class to subclass :class:`bonobo.config.Configurable`.
|
||||
|
||||
Configurables allow you to use the following features:
|
||||
|
||||
* You can add **Options** (using the :class:`bonobo.config.Option` descriptor). Options can be positional, or keyword
|
||||
based, can have a default value and will be consumed from the constructor arguments.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from bonobo.config import Configurable, Option
|
||||
|
||||
class PrefixIt(Configurable):
|
||||
prefix = Option(str, positional=True, default='>>>')
|
||||
|
||||
def call(self, row):
|
||||
return self.prefix + ' ' + row
|
||||
|
||||
prefixer = PrefixIt('$')
|
||||
|
||||
* You can add **Services** (using the :class:`bonobo.config.Service` descriptor). Services are a subclass of
|
||||
:class:`bonobo.config.Option`, sharing the same basics, but specialized in the definition of "named services" that
|
||||
will be resolved at runtime (a.k.a for which we will provide an implementation at runtime). We'll dive more into that
|
||||
in the next section.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from bonobo.config import Configurable, Option, Service
|
||||
|
||||
class HttpGet(Configurable):
|
||||
url = Option(default='https://jsonplaceholder.typicode.com/users')
|
||||
http = Service('http.client')
|
||||
|
||||
def call(self, http):
|
||||
resp = http.get(self.url)
|
||||
|
||||
for row in resp.json():
|
||||
yield row
|
||||
|
||||
http_get = HttpGet()
|
||||
|
||||
|
||||
* You can add **Methods** (using the :class:`bonobo.config.Method` descriptor). :class:`bonobo.config.Method` is a
|
||||
subclass of :class:`bonobo.config.Option` that allows passing callable parameters, either to the class constructor,
|
||||
or using the class as a decorator.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from bonobo.config import Configurable, Method
|
||||
|
||||
class Applier(Configurable):
|
||||
apply = Method()
|
||||
|
||||
def call(self, row):
|
||||
return self.apply(row)
|
||||
|
||||
@Applier
|
||||
def Prefixer(self, row):
|
||||
return 'Hello, ' + row
|
||||
|
||||
prefixer = Prefixer()
|
||||
|
||||
* You can add **ContextProcessors**, which are an advanced feature we won't introduce here. If you're familiar with
|
||||
pytest, you can think of them as pytest fixtures, execution wise.
|
||||
|
||||
Services
|
||||
::::::::
|
||||
|
||||
The motivation behind services is mostly separation of concerns, testability and deployability.
|
||||
|
||||
Usually, your transformations will depend on services (like a filesystem, an http client, a database, a rest api, ...).
|
||||
Those services can very well be hardcoded in the transformations, but there are two main drawbacks:
|
||||
|
||||
* You won't be able to change the implementation depending on the current environment (development laptop versus
|
||||
production servers, bug-hunting session versus execution, etc.)
|
||||
* You won't be able to test your transformations without testing the associated services.
|
||||
|
||||
To overcome those caveats of hardcoding things, we define Services in the configurable, which are basically
|
||||
string-options of the service names, and we provide an implementation at the last moment possible.
|
||||
|
||||
There are two ways of providing implementations:
|
||||
|
||||
* Either file-wide, by providing a `get_services()` function that returns a dict of named implementations (we did so
|
||||
with filesystems in the previous step, :doc:`tut02`)
|
||||
* Or directory-wide, by providing a `get_services()` function in a specially named `_services.py` file.
|
||||
|
||||
The first is simpler if you only have one transformation graph in one file, the second allows grouping coherent
|
||||
transformations together in a directory and share the implementations.
|
||||
|
||||
Let's see how to use it, starting from the previous service example:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from bonobo.config import Configurable, Option, Service
|
||||
|
||||
class HttpGet(Configurable):
|
||||
url = Option(default='https://jsonplaceholder.typicode.com/users')
|
||||
http = Service('http.client')
|
||||
|
||||
def call(self, http):
|
||||
resp = http.get(self.url)
|
||||
|
||||
for row in resp.json():
|
||||
yield row
|
||||
|
||||
We defined an "http.client" service, that obviously should have a `get()` method, returning responses that have a
|
||||
`json()` method.
|
||||
|
||||
Let's provide two implementations for that. The first one will be using `requests <http://docs.python-requests.org/>`_,
|
||||
that coincidentally satisfies the described interface:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import bonobo
|
||||
import requests
|
||||
|
||||
def get_services():
|
||||
return {
|
||||
'http.client': requests
|
||||
}
|
||||
|
||||
graph = bonobo.Graph(
|
||||
HttpGet(),
|
||||
print,
|
||||
)
|
||||
|
||||
If you run this code, you should see some mock data returned by the webservice we called (assuming it's up and you can
|
||||
reach it).
|
||||
|
||||
Now, the second implementation will replace that with a mock, used for testing purposes:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
class HttpResponseStub:
|
||||
def json(self):
|
||||
return [
|
||||
{'id': 1, 'name': 'Leanne Graham', 'username': 'Bret', 'email': 'Sincere@april.biz', 'address': {'street': 'Kulas Light', 'suite': 'Apt. 556', 'city': 'Gwenborough', 'zipcode': '92998-3874', 'geo': {'lat': '-37.3159', 'lng': '81.1496'}}, 'phone': '1-770-736-8031 x56442', 'website': 'hildegard.org', 'company': {'name': 'Romaguera-Crona', 'catchPhrase': 'Multi-layered client-server neural-net', 'bs': 'harness real-time e-markets'}},
|
||||
{'id': 2, 'name': 'Ervin Howell', 'username': 'Antonette', 'email': 'Shanna@melissa.tv', 'address': {'street': 'Victor Plains', 'suite': 'Suite 879', 'city': 'Wisokyburgh', 'zipcode': '90566-7771', 'geo': {'lat': '-43.9509', 'lng': '-34.4618'}}, 'phone': '010-692-6593 x09125', 'website': 'anastasia.net', 'company': {'name': 'Deckow-Crist', 'catchPhrase': 'Proactive didactic contingency', 'bs': 'synergize scalable supply-chains'}},
|
||||
]
|
||||
|
||||
class HttpStub:
|
||||
def get(self, url):
|
||||
return HttpResponseStub()
|
||||
|
||||
def get_services():
|
||||
return {
|
||||
'http.client': HttpStub()
|
||||
}
|
||||
|
||||
graph = bonobo.Graph(
|
||||
HttpGet(),
|
||||
print,
|
||||
)
|
||||
|
||||
Since the `Graph` definition stays exactly the same, you can easily substitute the `_services.py` file depending on your
|
||||
environment (the way you're doing this is out of bonobo scope and heavily depends on your usual way of managing
|
||||
configuration files on different platforms).
|
||||
|
||||
Starting with bonobo 0.5 (not yet released), you will be able to use service injections with function-based
|
||||
transformations too, using the `bonobo.config.requires` decorator to mark a dependency.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from bonobo.config import requires
|
||||
|
||||
@requires('http.client')
|
||||
def http_get(http):
|
||||
resp = http.get('https://jsonplaceholder.typicode.com/users')
|
||||
|
||||
for row in resp.json():
|
||||
yield row
|
||||
|
||||
|
||||
Read more
|
||||
:::::::::
|
||||
|
||||
* :doc:`/guide/services`
|
||||
* :doc:`/reference/api_config`
|
||||
|
||||
@ -1,8 +1,199 @@
|
||||
Working with databases
|
||||
======================
|
||||
|
||||
This document does not exist yet, but will be available soon.
|
||||
Databases (and especially SQL databases here) are not the focus of Bonobo, thus support for them is not (and will never
|
||||
be) included in the main package. Instead, working with databases is done using third party, well maintained and
|
||||
specialized packages, like SQLAlchemy, or other database access libraries from the python cheese shop.
|
||||
|
||||
Meanwhile, you can jump to bonobo-sqlalchemy development repository:
|
||||
.. note::
|
||||
|
||||
SQLAlchemy extension is not yet complete. Things may not be optimal, and some APIs will change. You can still try,
|
||||
of course.
|
||||
|
||||
Consider the following document as a "preview" (yes, it should work, yes it may break in the future).
|
||||
|
||||
Also, note that for early development stages, we explicitly support only PostgreSQL, although it may work well
|
||||
with `any other database supported by SQLAlchemy <http://docs.sqlalchemy.org/en/latest/core/engines.html#supported-databases>`_.
|
||||
|
||||
First, read https://www.bonobo-project.org/with/sqlalchemy for instructions on how to install. You **do need** the
|
||||
bleeding edge version of `bonobo` and `bonobo-sqlalchemy` to make this work.
|
||||
|
||||
Additional requirements
|
||||
:::::::::::::::::::::::
|
||||
|
||||
Once you have installed `bonobo_sqlalchemy` (read https://www.bonobo-project.org/with/sqlalchemy to use bleeding edge
|
||||
version), install the following additional packages:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ pip install -U python-dotenv psycopg2 awesome-slugify
|
||||
|
||||
Those packages are not required by the extension, but `python-dotenv` will help us configure the database DSN, and
|
||||
`psycopg2` is required by SQLAlchemy to connect to PostgreSQL databases. Also, we'll use a slugifier to create unique
|
||||
identifiers for the database (maybe not what you'd do in the real world, but very much sufficient for example purpose).
|
||||
|
||||
Configure a database engine
|
||||
:::::::::::::::::::::::::::
|
||||
|
||||
Open your `_services.py` file and replace the code:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import bonobo
|
||||
import dotenv
|
||||
|
||||
from bonobo_sqlalchemy.util import create_postgresql_engine
|
||||
|
||||
dotenv.load_dotenv(dotenv.find_dotenv())
|
||||
|
||||
def get_services():
|
||||
return {
|
||||
'fs': bonobo.open_fs(),
|
||||
'sqlalchemy.engine': create_postgresql_engine(name='tutorial')
|
||||
}
|
||||
|
||||
The `create_postgresql_engine` is a tiny function building the DSN from reasonable defaults, that you can override
|
||||
either by providing kwargs, or with system environment variables. If you want to override something, open the `.env`
|
||||
file and add values for one or more of `POSTGRES_NAME`, `POSTGRES_USER`, `POSTGRES_PASS`, `POSTGRES_HOST`,
|
||||
`POSTGRES_PORT`. Please note that kwargs always take precedence over the environment, but that you should prefer using
|
||||
environment variables for anything that is not immutable from one platform to another.
|
||||
|
||||
Let's create a `tutorial/pgdb.py` job:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import bonobo
|
||||
import bonobo_sqlalchemy
|
||||
|
||||
from bonobo.examples.tutorials.tut02e03_writeasmap import graph, split_one_to_map
|
||||
|
||||
graph = graph.copy()
|
||||
graph.add_chain(
|
||||
bonobo_sqlalchemy.InsertOrUpdate('coffeeshops'),
|
||||
_input=split_one_to_map
|
||||
)
|
||||
|
||||
Notes here:
|
||||
|
||||
* We use the code from :doc:`tut02`, which is bundled with bonobo in the `bonobo.examples.tutorials` package.
|
||||
* We "fork" the graph, by creating a copy and appending a new "chain", starting at a point that exists in the other
|
||||
graph.
|
||||
* We use :class:`bonobo_sqlalchemy.InsertOrUpdate` (whose role, in case it is not obvious, is to create database rows if
|
||||
they do not exist yet, or update the existing row, based on a "discriminant" criteria (by default, "id")).
|
||||
|
||||
If we run this transformation (with `bonobo run tutorial/pgdb.py`), we should get an error:
|
||||
|
||||
.. code-block:: text
|
||||
|
||||
| File ".../lib/python3.6/site-packages/psycopg2/__init__.py", line 130, in connect
|
||||
| conn = _connect(dsn, connection_factory=connection_factory, **kwasync)
|
||||
| sqlalchemy.exc.OperationalError: (psycopg2.OperationalError) FATAL: database "tutorial" does not exist
|
||||
|
|
||||
|
|
||||
| The above exception was the direct cause of the following exception:
|
||||
|
|
||||
| Traceback (most recent call last):
|
||||
| File ".../bonobo-devkit/bonobo/bonobo/strategies/executor.py", line 45, in _runner
|
||||
| node_context.start()
|
||||
| File ".../bonobo-devkit/bonobo/bonobo/execution/base.py", line 75, in start
|
||||
| self._stack.setup(self)
|
||||
| File ".../bonobo-devkit/bonobo/bonobo/config/processors.py", line 94, in setup
|
||||
| _append_to_context = next(_processed)
|
||||
| File ".../bonobo-devkit/bonobo-sqlalchemy/bonobo_sqlalchemy/writers.py", line 43, in create_connection
|
||||
| raise UnrecoverableError('Could not create SQLAlchemy connection: {}.'.format(str(exc).replace('\n', ''))) from exc
|
||||
| bonobo.errors.UnrecoverableError: Could not create SQLAlchemy connection: (psycopg2.OperationalError) FATAL: database "tutorial" does not exist.
|
||||
|
||||
The database we requested does not exist. It is not the role of bonobo to do database administration, and thus there is
|
||||
no tool here to create either the database or the tables we want to use.
|
||||
|
||||
There are however tools in `sqlalchemy` to manage tables, so we'll create the database by ourselves, and ask sqlalchemy
|
||||
to create the table:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ psql -U postgres -h localhost
|
||||
|
||||
psql (9.6.1, server 9.6.3)
|
||||
Type "help" for help.
|
||||
|
||||
postgres=# CREATE ROLE tutorial WITH LOGIN PASSWORD 'tutorial';
|
||||
CREATE ROLE
|
||||
postgres=# CREATE DATABASE tutorial WITH OWNER=tutorial TEMPLATE=template0 ENCODING='utf-8';
|
||||
CREATE DATABASE
|
||||
|
||||
Now, let's use a little trick and add this section to `pgdb.py`:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import logging, sys
|
||||
|
||||
from bonobo.commands.run import get_default_services
|
||||
from sqlalchemy import Table, Column, String, Integer, MetaData
|
||||
|
||||
def main():
|
||||
services = get_default_services(__file__)
|
||||
|
||||
if len(sys.argv) == 2 and sys.argv[1] == 'reset':
|
||||
engine = services.get('sqlalchemy.engine')
|
||||
metadata = MetaData()
|
||||
|
||||
coffee_table = Table(
|
||||
'coffeeshops',
|
||||
metadata,
|
||||
Column('id', String(255), primary_key=True),
|
||||
Column('name', String(255)),
|
||||
Column('address', String(255)),
|
||||
)
|
||||
|
||||
logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
|
||||
metadata.drop_all(engine)
|
||||
metadata.create_all(engine)
|
||||
else:
|
||||
return bonobo.run(graph, services=services)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
||||
.. note::
|
||||
|
||||
We're using private API of bonobo here, which is unsatisfactory, discouraged and may change. Some way to get the
|
||||
service dictionary will be added to the public api in a future release of bonobo.
|
||||
|
||||
Now run:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ python tutorial/pgdb.py reset
|
||||
|
||||
Database and table should now exist.
|
||||
|
||||
Let's prepare our data for database, and change the `.add_chain(..)` call to do it prior to `InsertOrUpdate(...)`:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from slugify import slugify_url
|
||||
|
||||
def format_for_db(row):
|
||||
name, address = list(row.items())[0]
|
||||
return {
|
||||
'id': slugify_url(name),
|
||||
'name': name,
|
||||
'address': address,
|
||||
}
|
||||
|
||||
# ...
|
||||
|
||||
graph = graph.copy()
|
||||
graph.add_chain(
|
||||
format_for_db,
|
||||
bonobo_sqlalchemy.InsertOrUpdate('coffeeshops'),
|
||||
_input=split_one_to_map
|
||||
)
|
||||
|
||||
You can now run the script (either with `bonobo run tutorial/pgdb.py` or directly with the python interpreter, as we
|
||||
added a "main" section) and the dataset should be inserted in your database. If you run it again, no new rows are
|
||||
created.
|
||||
|
||||
Note that as we forked the graph from :doc:`tut02`, the transformation also writes the data to `coffeeshops.json`, as
|
||||
before.
|
||||
|
||||
* https://github.com/hartym/bonobo-sqlalchemy
|
||||
|
||||
@ -71,3 +71,23 @@ def test_graph_topological_sort():
|
||||
assert g.topologically_sorted_indexes.index(3) < g.topologically_sorted_indexes.index(4)
|
||||
assert g[3] == sentinel.b1
|
||||
assert g[4] == sentinel.b2
|
||||
|
||||
|
||||
def test_copy():
    """Check that ``Graph.copy()`` yields a graph independent of its source.

    Chains added to either graph after the copy must only change the length
    of the graph they were added to.
    """
    original = Graph()
    duplicate = original.copy()

    # The copy is a distinct object, and both graphs start out empty.
    assert original is not duplicate
    assert (len(original), len(duplicate)) == (0, 0)

    # Growing the original leaves the copy untouched.
    original.add_chain([])
    assert (len(original), len(duplicate)) == (1, 0)

    # Growing the copy leaves the original untouched.
    duplicate.add_chain([], identity)
    assert (len(original), len(duplicate)) == (1, 2)
|
||||
|
||||
Reference in New Issue
Block a user