[doc] Updating guides in documentation
This commit is contained in:
62
docs/_templates/base.html
vendored
Normal file
62
docs/_templates/base.html
vendored
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
{%- extends "alabaster/layout.html" %}
|
||||||
|
|
||||||
|
|
||||||
|
{%- block extrahead %}
|
||||||
|
{{ super() }}
|
||||||
|
<style>
|
||||||
|
div.related {
|
||||||
|
width: 940px;
|
||||||
|
margin: 30px auto 0 auto;
|
||||||
|
}
|
||||||
|
@media screen and (max-width: 875px) {
|
||||||
|
div.related {
|
||||||
|
visibility: hidden;
|
||||||
|
display: none;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
{% endblock %}
|
||||||
|
|
||||||
|
{%- block footer %}
|
||||||
|
{{ relbar() }}
|
||||||
|
|
||||||
|
<div class="footer">
|
||||||
|
{% if show_copyright %}©{{ copyright }}.{% endif %}
|
||||||
|
{% if theme_show_powered_by|lower == 'true' %}
|
||||||
|
{% if show_copyright %}|{% endif %}
|
||||||
|
Powered by <a href="http://sphinx-doc.org/">Sphinx {{ sphinx_version }}</a>
|
||||||
|
& <a href="https://github.com/bitprophet/alabaster">Alabaster {{ alabaster_version }}</a>
|
||||||
|
{% endif %}
|
||||||
|
{%- if show_source and has_source and sourcename %}
|
||||||
|
{% if show_copyright or theme_show_powered_by %}|{% endif %}
|
||||||
|
<a href="{{ pathto('_sources/' + sourcename, true)|e }}"
|
||||||
|
rel="nofollow">{{ _('Page source') }}</a>
|
||||||
|
{%- endif %}
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{% if theme_github_banner|lower != 'false' %}
|
||||||
|
<a href="https://github.com/{{ theme_github_user }}/{{ theme_github_repo }}" class="github">
|
||||||
|
<img style="position: absolute; top: 0; right: 0; border: 0;"
|
||||||
|
src="{{ pathto('_static/' ~ theme_github_banner, 1) if theme_github_banner|lower != 'true' else 'https://s3.amazonaws.com/github/ribbons/forkme_right_darkblue_121621.png' }}"
|
||||||
|
alt="Fork me on GitHub" class="github"/>
|
||||||
|
</a>
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
{% if theme_analytics_id %}
|
||||||
|
<script type="text/javascript">
|
||||||
|
var _gaq = _gaq || [];
|
||||||
|
_gaq.push(['_setAccount', '{{ theme_analytics_id }}']);
|
||||||
|
_gaq.push(['_setDomainName', 'none']);
|
||||||
|
_gaq.push(['_setAllowLinker', true]);
|
||||||
|
_gaq.push(['_trackPageview']);
|
||||||
|
(function () {
|
||||||
|
var ga = document.createElement('script');
|
||||||
|
ga.type = 'text/javascript';
|
||||||
|
ga.async = true;
|
||||||
|
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
|
||||||
|
var s = document.getElementsByTagName('script')[0];
|
||||||
|
s.parentNode.insertBefore(ga, s);
|
||||||
|
})();
|
||||||
|
</script>
|
||||||
|
{% endif %}
|
||||||
|
{%- endblock %}
|
||||||
9
docs/_templates/index.html
vendored
9
docs/_templates/index.html
vendored
@ -1,7 +1,8 @@
|
|||||||
{% extends "layout.html" %}
|
{% extends "base.html" %}
|
||||||
{% set title = _('Bonobo — Data processing for humans') %}
|
|
||||||
{% block body %}
|
|
||||||
|
|
||||||
|
{% set title = _('Bonobo — Data processing for humans') %}
|
||||||
|
|
||||||
|
{% block body %}
|
||||||
<h1 style="text-align: center">
|
<h1 style="text-align: center">
|
||||||
<img class="logo" src="{{ pathto('_static/bonobo.png', 1) }}" title="Bonobo" alt="Bonobo"
|
<img class="logo" src="{{ pathto('_static/bonobo.png', 1) }}" title="Bonobo" alt="Bonobo"
|
||||||
style=" width: 128px; height: 128px;"/>
|
style=" width: 128px; height: 128px;"/>
|
||||||
@ -9,7 +10,7 @@
|
|||||||
|
|
||||||
<p>
|
<p>
|
||||||
{% trans %}
|
{% trans %}
|
||||||
<b>Bonobo</b> is an Extract Transform Load framework for the Python (3.5+) language.
|
<b>Bonobo</b> is an <b>Extract Transform Load</b> (or ETL) framework for the <b>Python (3.5+)</b> language.
|
||||||
{% endtrans %}
|
{% endtrans %}
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
|
|||||||
7
docs/_templates/layout.html
vendored
Normal file
7
docs/_templates/layout.html
vendored
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
{%- extends "base.html" %}
|
||||||
|
|
||||||
|
{%- block content %}
|
||||||
|
{{ relbar() }}
|
||||||
|
{{ super() }}
|
||||||
|
{%- endblock %}
|
||||||
|
|
||||||
6
docs/_templates/sidebarlogo.html
vendored
6
docs/_templates/sidebarlogo.html
vendored
@ -1,10 +1,10 @@
|
|||||||
<a href="{{ pathto(master_doc) }}" style="border: none">
|
<a href="{{ pathto(master_doc) }}" style="border: none">
|
||||||
<h1 style="text-align: center; margin-top: 0;">
|
<h1 style="text-align: center; margin: 0;">
|
||||||
<img class="logo" src="{{ pathto('_static/bonobo.png', 1) }}" title="Bonobo" style="width: 48px; height: 48px; vertical-align: bottom"/>
|
<img class="logo" src="{{ pathto('_static/bonobo.png', 1) }}" title="Bonobo" style="width: 48px; height: 48px; vertical-align: bottom"/>
|
||||||
Bonobo
|
Bonobo
|
||||||
</h1>
|
</h1>
|
||||||
</a>
|
</a>
|
||||||
|
|
||||||
<p>
|
<p style="text-align: center">
|
||||||
Data processing for human beings.
|
Data processing for humans.
|
||||||
</p>
|
</p>
|
||||||
|
|||||||
@ -75,9 +75,9 @@ html_theme = 'alabaster'
|
|||||||
html_theme_options = {
|
html_theme_options = {
|
||||||
'github_user': 'python-bonobo',
|
'github_user': 'python-bonobo',
|
||||||
'github_repo': 'bonobo',
|
'github_repo': 'bonobo',
|
||||||
'github_button': True,
|
'github_button': 'true',
|
||||||
'show_powered_by': False,
|
'show_powered_by': 'false',
|
||||||
'show_related': True,
|
'show_related': 'true',
|
||||||
}
|
}
|
||||||
|
|
||||||
html_sidebars = {
|
html_sidebars = {
|
||||||
|
|||||||
3
docs/genindex.rst
Normal file
3
docs/genindex.rst
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
Full Index
|
||||||
|
==========
|
||||||
|
|
||||||
@ -1,11 +1,211 @@
|
|||||||
Graphs
|
Graphs
|
||||||
======
|
======
|
||||||
|
|
||||||
Writing graphs
|
Graphs are the glue that ties transformations together. It's the only data-structure bonobo can execute directly. Graphs
|
||||||
::::::::::::::
|
must be acyclic, and can contain as many nodes as your system can handle. Although this number can be rather high in
|
||||||
|
theory, extreme practical cases usually do not exceed hundreds of nodes (and this is already extreme, really).
|
||||||
|
|
||||||
Debugging graphs
|
|
||||||
|
Definitions
|
||||||
|
:::::::::::
|
||||||
|
|
||||||
|
Graph
|
||||||
|
|
||||||
|
A directed acyclic graph of transformations, that Bonobo can inspect and execute.
|
||||||
|
|
||||||
|
Node
|
||||||
|
|
||||||
|
A transformation within a graph. The transformations are stateless, and have no idea whether or not they are
|
||||||
|
included in a graph, multiple graphs, or not at all.
|
||||||
|
|
||||||
|
|
||||||
|
Creating a graph
|
||||||
::::::::::::::::
|
::::::::::::::::
|
||||||
|
|
||||||
|
Graphs should be instances of :class:`bonobo.Graph`. The :func:`bonobo.Graph.add_chain` method can take as many
|
||||||
|
positional parameters as you want.
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
import bonobo
|
||||||
|
|
||||||
|
graph = bonobo.Graph()
|
||||||
|
graph.add_chain(a, b, c)
|
||||||
|
|
||||||
|
Resulting graph:
|
||||||
|
|
||||||
|
.. graphviz::
|
||||||
|
|
||||||
|
digraph {
|
||||||
|
rankdir = LR;
|
||||||
|
stylesheet = "../_static/graphs.css";
|
||||||
|
|
||||||
|
BEGIN [shape="point"];
|
||||||
|
BEGIN -> "a" -> "b" -> "c";
|
||||||
|
}
|
||||||
|
|
||||||
|
Non-linear graphs
|
||||||
|
:::::::::::::::::
|
||||||
|
|
||||||
|
Divergences / forks
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
To create two or more divergent data streams ("fork"), you should specify `_input` kwarg to `add_chain`.
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
import bonobo
|
||||||
|
|
||||||
|
graph = bonobo.Graph()
|
||||||
|
graph.add_chain(a, b, c)
|
||||||
|
graph.add_chain(f, g, _input=b)
|
||||||
|
|
||||||
|
|
||||||
|
Resulting graph:
|
||||||
|
|
||||||
|
.. graphviz::
|
||||||
|
|
||||||
|
digraph {
|
||||||
|
rankdir = LR;
|
||||||
|
stylesheet = "../_static/graphs.css";
|
||||||
|
|
||||||
|
BEGIN [shape="point"];
|
||||||
|
BEGIN -> "a" -> "b" -> "c";
|
||||||
|
"b" -> "f" -> "g";
|
||||||
|
}
|
||||||
|
|
||||||
|
.. note:: Both branches will receive the same data, at the same time.
|
||||||
|
|
||||||
|
Convergences / merges
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
To merge two data streams ("merge"), you can use the `_output` kwarg to `add_chain`, or use named nodes (see below).
|
||||||
|
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
import bonobo
|
||||||
|
|
||||||
|
graph = bonobo.Graph()
|
||||||
|
|
||||||
|
# Here we mark _input to None, so normalize won't get the "begin" impulsion.
|
||||||
|
graph.add_chain(normalize, store, _input=None)
|
||||||
|
|
||||||
|
# Add two different chains
|
||||||
|
graph.add_chain(a, b, _output=normalize)
|
||||||
|
graph.add_chain(f, g, _output=normalize)
|
||||||
|
|
||||||
|
|
||||||
|
Resulting graph:
|
||||||
|
|
||||||
|
.. graphviz::
|
||||||
|
|
||||||
|
digraph {
|
||||||
|
rankdir = LR;
|
||||||
|
stylesheet = "../_static/graphs.css";
|
||||||
|
|
||||||
|
BEGIN [shape="point"];
|
||||||
|
BEGIN -> "a" -> "b" -> "normalize";
|
||||||
|
|
||||||
|
BEGIN2 [shape="point"];
|
||||||
|
BEGIN2 -> "f" -> "g" -> "normalize";
|
||||||
|
|
||||||
|
"normalize" -> "store"
|
||||||
|
}
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
This is not a "join" or "cartesian product". Any data that comes from `b` or `g` will go through `normalize`, one at
|
||||||
|
a time. Think of the graph edges as data flow pipes.
|
||||||
|
|
||||||
|
|
||||||
|
Named nodes
|
||||||
|
:::::::::::
|
||||||
|
|
||||||
|
Using the above code to create convergences can lead to hard-to-read code, because you have to define the "target" stream
|
||||||
|
before the streams that logically go to the beginning of the transformation graph. To overcome that, one can use
|
||||||
|
"named" nodes:
|
||||||
|
|
||||||
|
graph.add_chain(x, y, z, _name='zed')
|
||||||
|
graph.add_chain(f, g, h, _input='zed')
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
import bonobo
|
||||||
|
|
||||||
|
graph = bonobo.Graph()
|
||||||
|
|
||||||
|
# Add two different chains
|
||||||
|
graph.add_chain(a, b, _output="load")
|
||||||
|
graph.add_chain(f, g, _output="load")
|
||||||
|
|
||||||
|
# Here we mark _input to None, so normalize won't get the "begin" impulsion.
|
||||||
|
graph.add_chain(normalize, store, _input=None, _name="load")
|
||||||
|
|
||||||
|
|
||||||
|
Resulting graph:
|
||||||
|
|
||||||
|
.. graphviz::
|
||||||
|
|
||||||
|
digraph {
|
||||||
|
rankdir = LR;
|
||||||
|
stylesheet = "../_static/graphs.css";
|
||||||
|
|
||||||
|
BEGIN [shape="point"];
|
||||||
|
BEGIN -> "a" -> "b" -> "normalize (load)";
|
||||||
|
|
||||||
|
BEGIN2 [shape="point"];
|
||||||
|
BEGIN2 -> "f" -> "g" -> "normalize (load)";
|
||||||
|
|
||||||
|
"normalize (load)" -> "store"
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Inspecting graphs
|
||||||
|
:::::::::::::::::
|
||||||
|
|
||||||
|
Bonobo is bundled with an "inspector", that can use graphviz to let you visualize your graphs.
|
||||||
|
|
||||||
|
Read `How to inspect and visualize your graph <https://www.bonobo-project.org/how-to/inspect-an-etl-jobs-graph>`_.
|
||||||
|
|
||||||
|
|
||||||
Executing graphs
|
Executing graphs
|
||||||
::::::::::::::::
|
::::::::::::::::
|
||||||
|
|
||||||
|
There are two options to execute a graph (which have a similar result, but are targeting different use cases).
|
||||||
|
|
||||||
|
* You can use the bonobo command line interface, which is the highest level interface.
|
||||||
|
* You can use the python API, which is lower level but allows to use bonobo from within your own code (for example, a
|
||||||
|
django management command).
|
||||||
|
|
||||||
|
Executing a graph with the command line interface
|
||||||
|
-------------------------------------------------
|
||||||
|
|
||||||
|
If there is no good reason not to, you should use `bonobo run ...` to run transformation graphs found in your python
|
||||||
|
source code files.
|
||||||
|
|
||||||
|
.. code-block:: shell-session
|
||||||
|
|
||||||
|
$ bonobo run file.py
|
||||||
|
|
||||||
|
You can also run a python module:
|
||||||
|
|
||||||
|
.. code-block:: shell-session
|
||||||
|
|
||||||
|
$ bonobo run -m my.own.etlmod
|
||||||
|
|
||||||
|
In each case, bonobo's CLI will look for an instance of :class:`bonobo.Graph` in your file/module, create the plumbing
|
||||||
|
needed to execute it, and run it.
|
||||||
|
|
||||||
|
If you're in an interactive terminal context, it will use :class:`bonobo.ext.console.ConsoleOutputPlugin` for display.
|
||||||
|
|
||||||
|
If you're in a jupyter notebook context, it will (try to) use :class:`bonobo.ext.jupyter.JupyterOutputPlugin`.
|
||||||
|
|
||||||
|
Executing a graph using the internal API
|
||||||
|
----------------------------------------
|
||||||
|
|
||||||
|
To integrate bonobo executions in any other python code, you should use :func:`bonobo.run`. It behaves very similarly to
|
||||||
|
the CLI, and reading the source you should be able to figure out its usage quite easily.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1,13 +1,14 @@
|
|||||||
Guides
|
Guides
|
||||||
======
|
======
|
||||||
|
|
||||||
Here are a few guides and best practices to work with bonobo.
|
This section will guide you through your journey with Bonobo ETL.
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 2
|
:maxdepth: 2
|
||||||
|
|
||||||
graphs
|
introduction
|
||||||
transformations
|
transformations
|
||||||
|
graphs
|
||||||
services
|
services
|
||||||
environment
|
environment
|
||||||
purity
|
purity
|
||||||
|
|||||||
106
docs/guide/introduction.rst
Normal file
106
docs/guide/introduction.rst
Normal file
@ -0,0 +1,106 @@
|
|||||||
|
Introduction
|
||||||
|
============
|
||||||
|
|
||||||
|
The first thing you need to understand before you use Bonobo, or not, is what it does and what it does not, so you can
|
||||||
|
understand if it could be a good fit for your use cases.
|
||||||
|
|
||||||
|
How it works?
|
||||||
|
:::::::::::::
|
||||||
|
|
||||||
|
**Bonobo** is an **Extract Transform Load** framework aimed at coders, hackers, or any other person who's at ease with
|
||||||
|
terminals and source code files.
|
||||||
|
|
||||||
|
It is a **data streaming** solution, that treats datasets as ordered collections of independent rows, allowing you to process
|
||||||
|
them "first in, first out" using a set of transformations organized together in a directed graph.
|
||||||
|
|
||||||
|
Let's take a few examples:
|
||||||
|
|
||||||
|
.. graphviz::
|
||||||
|
|
||||||
|
digraph {
|
||||||
|
rankdir = LR;
|
||||||
|
stylesheet = "../_static/graphs.css";
|
||||||
|
|
||||||
|
BEGIN [shape="point"];
|
||||||
|
END [shape="none" label="..."];
|
||||||
|
BEGIN -> "A" -> "B" -> "C" -> "END";
|
||||||
|
}
|
||||||
|
|
||||||
|
One of the simplest, by the book, cases, is an extractor sending to a transformation, itself sending to a loader.
|
||||||
|
|
||||||
|
Bonobo will send an "impulsion" to all transformations linked to the little black dot on the left, here `A`.
|
||||||
|
`A`'s main topic will be to extract data from somewhere (a file, an endpoint, a database...) and generate some output.
|
||||||
|
As soon as the first row of `A`'s output is available, Bonobo will start asking `B` to process it. As soon as the first
|
||||||
|
row of `B`'s output is available, Bonobo will start asking `C` to process it.
|
||||||
|
|
||||||
|
While `B` and `C` are processing, `A` continues to generate data.
|
||||||
|
|
||||||
|
This approach can be efficient, depending on your requirements, because you may rely on a lot of services that may be
|
||||||
|
long to answer or unreliable, and you don't have to handle optimizations, parallelism or retry logic by yourself.
|
||||||
|
|
||||||
|
.. graphviz::
|
||||||
|
|
||||||
|
digraph {
|
||||||
|
rankdir = LR;
|
||||||
|
stylesheet = "../_static/graphs.css";
|
||||||
|
|
||||||
|
BEGIN [shape="point"];
|
||||||
|
END [shape="none" label="..."];
|
||||||
|
END2 [shape="none" label="..."];
|
||||||
|
BEGIN -> "A" -> "B" -> "END";
|
||||||
|
"A" -> "C" -> "END2";
|
||||||
|
}
|
||||||
|
|
||||||
|
In this case, any output row of `A`, will be **sent to both** `B` and `C` simultaneously. Again, `A` will continue its
|
||||||
|
processing while `B` and `C` are working.
|
||||||
|
|
||||||
|
|
||||||
|
.. graphviz::
|
||||||
|
|
||||||
|
digraph {
|
||||||
|
rankdir = LR;
|
||||||
|
stylesheet = "../_static/graphs.css";
|
||||||
|
|
||||||
|
BEGIN [shape="point"];
|
||||||
|
BEGIN2 [shape="point"];
|
||||||
|
END [shape="none" label="..."];
|
||||||
|
BEGIN -> "A" -> "C" -> "END";
|
||||||
|
BEGIN2 -> "B" -> "C";
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
What is it not?
|
||||||
|
:::::::::::::::
|
||||||
|
|
||||||
|
**Bonobo** is not:
|
||||||
|
|
||||||
|
* A data science, or statistical analysis tool, which needs to treat the dataset as a whole and not as a collection of
|
||||||
|
independent rows. If this is your need, you probably want to look at `pandas <https://pandas.pydata.org/>`_.
|
||||||
|
|
||||||
|
* A workflow or scheduling solution for independent data-engineering tasks. If you're looking to manage your sets of
|
||||||
|
data processing tasks as a whole, you probably want to look at `airflow <https://airflow.incubator.apache.org/>`_.
|
||||||
|
Although there is no Bonobo extension yet that handles that, it does make sense to integrate Bonobo jobs in an airflow
|
||||||
|
(or other similar tool) workflow.
|
||||||
|
|
||||||
|
* A big data solution, `as defined by wikipedia <https://en.wikipedia.org/wiki/Big_data>`_. We're aiming at "small
|
||||||
|
scale" data processing, which can be still quite huge for humans, but not for computers. If you don't know whether or
|
||||||
|
not this is sufficient for your needs, it probably means you're not in the "big data" land.
|
||||||
|
|
||||||
|
|
||||||
|
Where to jump next?
|
||||||
|
:::::::::::::::::::
|
||||||
|
|
||||||
|
If you did not run through it yet, we highly suggest that you go through the :doc:`tutorial </tutorial/index>` first.
|
||||||
|
|
||||||
|
Then, you can jump to the following guides, in no particular order:
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
|
||||||
|
transformations
|
||||||
|
graphs
|
||||||
|
services
|
||||||
|
environment
|
||||||
|
purity
|
||||||
|
|
||||||
|
|
||||||
@ -1,14 +1,10 @@
|
|||||||
Services and dependencies
|
Services and dependencies
|
||||||
=========================
|
=========================
|
||||||
|
|
||||||
:Last-Modified: 20 may 2017
|
You'll want to use external systems within your transformations, including databases, HTTP APIs, other web services,
|
||||||
|
filesystems, etc.
|
||||||
|
|
||||||
You'll probably want to use external systems within your transformations. Those systems may include databases, apis
|
Hardcoding those services is a good first step, but as your codebase grows, will show limits rather quickly.
|
||||||
(using http, for example), filesystems, etc.
|
|
||||||
|
|
||||||
You can start by hardcoding those services. That does the job, at first.
|
|
||||||
|
|
||||||
If you're going a little further than that, you'll feel limited, for a few reasons:
|
|
||||||
|
|
||||||
* Hardcoded and tightly linked dependencies make your transformations hard to test, and hard to reuse.
|
* Hardcoded and tightly linked dependencies make your transformations hard to test, and hard to reuse.
|
||||||
* Processing data on your laptop is great, but being able to do it on different target systems (or stages), in different
|
* Processing data on your laptop is great, but being able to do it on different target systems (or stages), in different
|
||||||
@ -16,70 +12,77 @@ If you're going a little further than that, you'll feel limited, for a few reaso
|
|||||||
pre-production environment, or production system. Maybe you have similar systems for different clients and want to select
|
pre-production environment, or production system. Maybe you have similar systems for different clients and want to select
|
||||||
the system at runtime. Etc.
|
the system at runtime. Etc.
|
||||||
|
|
||||||
Service injection
|
Definition of service dependencies
|
||||||
:::::::::::::::::
|
::::::::::::::::::::::::::::::::::
|
||||||
|
|
||||||
To solve this problem, we introduce a light dependency injection system. It allows to define named dependencies in
|
To solve this problem, we introduce a light dependency injection system. It allows to define **named dependencies** in
|
||||||
your transformations, and provide an implementation at runtime.
|
your transformations, and provide an implementation at runtime.
|
||||||
|
|
||||||
Class-based transformations
|
For function-based transformations, you can use the :func:`bonobo.config.use` decorator to mark the dependencies. You'll
|
||||||
---------------------------
|
still be able to call it manually, providing the implementation yourself, but in a bonobo execution context, it will
|
||||||
|
be resolved and injected automatically, as long as you provided an implementation to the executor (more on that below).
|
||||||
|
|
||||||
To define a service dependency in a class-based transformation, use :class:`bonobo.config.Service`, a special
|
.. code-block:: python
|
||||||
descriptor (and subclass of :class:`bonobo.config.Option`) that will hold the service names and act as a marker
|
|
||||||
for runtime resolution of service instances.
|
|
||||||
|
|
||||||
Let's define such a transformation:
|
from bonobo.config import use
|
||||||
|
|
||||||
|
@use('orders_database')
|
||||||
|
def select_all(database):
|
||||||
|
yield from database.query('SELECT * FROM foo;')
|
||||||
|
|
||||||
|
For class based transformations, you can use :class:`bonobo.config.Service`, a special descriptor (and subclass of
|
||||||
|
:class:`bonobo.config.Option`) that will hold the service names and act as a marker for runtime resolution of service
|
||||||
|
instances.
|
||||||
|
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
|
|
||||||
from bonobo.config import Configurable, Service
|
from bonobo.config import Configurable, Service
|
||||||
|
|
||||||
class JoinDatabaseCategories(Configurable):
|
class JoinDatabaseCategories(Configurable):
|
||||||
database = Service('primary_sql_database')
|
database = Service('orders_database')
|
||||||
|
|
||||||
def __call__(self, database, row):
|
def call(self, database, row):
|
||||||
return {
|
return {
|
||||||
**row,
|
**row,
|
||||||
'category': database.get_category_name_for_sku(row['sku'])
|
'category': database.get_category_name_for_sku(row['sku'])
|
||||||
}
|
}
|
||||||
|
|
||||||
This piece of code tells bonobo that your transformation expect a service called "primary_sql_database", that will be
|
Both pieces of code tell bonobo that your transformation expects a service called "orders_database", that will be
|
||||||
injected to your calls under the parameter name "database".
|
injected to your calls under the parameter name "database".
|
||||||
|
|
||||||
Function-based transformations
|
Providing implementations at run-time
|
||||||
------------------------------
|
-------------------------------------
|
||||||
|
|
||||||
No implementation yet, but expect something similar to CBT API, maybe using a `@Service(...)` decorator. See
|
Bonobo will expect you to provide a dictionary of all service implementations required by your graph.
|
||||||
`issue #70 <https://github.com/python-bonobo/bonobo/issues/70>`_.
|
|
||||||
|
|
||||||
Provide implementation at run time
|
|
||||||
----------------------------------
|
|
||||||
|
|
||||||
Let's see how to execute it:
|
|
||||||
|
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
|
|
||||||
import bonobo
|
import bonobo
|
||||||
|
|
||||||
graph = bonobo.graph(
|
graph = bonobo.Graph(...)
|
||||||
*before,
|
|
||||||
JoinDatabaseCategories(),
|
def get_services():
|
||||||
*after,
|
return {
|
||||||
)
|
'orders_database': my_database_service,
|
||||||
|
}
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
bonobo.run(
|
bonobo.run(graph, services=get_services())
|
||||||
graph,
|
|
||||||
services={
|
|
||||||
'primary_sql_database': my_database_service,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
A dictionary, or dictionary-like, "services" named argument can be passed to the :func:`bonobo.run` helper. The
|
|
||||||
"dictionary-like" part is the real keyword here. Bonobo is not a DIC library, and won't become one. So the implementation
|
.. note::
|
||||||
provided is pretty basic, and feature-less. But you can use much more evolved libraries instead of the provided
|
|
||||||
stub, and as long as it works the same (a.k.a implements a dictionary-like interface), the system will use it.
|
A dictionary, or dictionary-like, "services" named argument can be passed to the :func:`bonobo.run` API method.
|
||||||
|
The "dictionary-like" part is the real keyword here. Bonobo is not a DIC library, and won't become one. So the
|
||||||
|
implementation provided is pretty basic, and feature-less. But you can use much more evolved libraries instead of
|
||||||
|
the provided stub, and as long as it works the same (a.k.a implements a dictionary-like interface), the system will
|
||||||
|
use it.
|
||||||
|
|
||||||
|
Command line interface will look at services in two different places:
|
||||||
|
|
||||||
|
* A `get_services()` function present at the same level of your graph definition.
|
||||||
|
* A `get_services()` function in a `_services.py` file in the same directory as your graph's file, allowing to reuse the
|
||||||
|
same service implementations for more than one graph.
|
||||||
|
|
||||||
Solving concurrency problems
|
Solving concurrency problems
|
||||||
----------------------------
|
----------------------------
|
||||||
@ -87,7 +90,7 @@ Solving concurrency problems
|
|||||||
If a service cannot be used by more than one thread at a time, either because it's just not threadsafe, or because
|
If a service cannot be used by more than one thread at a time, either because it's just not threadsafe, or because
|
||||||
it requires to carefully order the calls made (apis that includes nonces, or work on results returned by previous
|
it requires to carefully order the calls made (apis that includes nonces, or work on results returned by previous
|
||||||
calls are usually good candidates), you can use the :class:`bonobo.config.Exclusive` context processor to lock the
|
calls are usually good candidates), you can use the :class:`bonobo.config.Exclusive` context processor to lock the
|
||||||
use of a dependency for a time period.
|
use of a dependency for the time of the context manager (`with` statement).
|
||||||
|
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
|
|
||||||
@ -101,18 +104,10 @@ use of a dependency for a time period.
|
|||||||
api.last_call()
|
api.last_call()
|
||||||
|
|
||||||
|
|
||||||
Service configuration (to be decided and implemented)
|
|
||||||
:::::::::::::::::::::::::::::::::::::::::::::::::::::
|
|
||||||
|
|
||||||
* There should be a way to configure default service implementation for a python file, a directory, a project ...
|
|
||||||
* There should be a way to override services when running a transformation.
|
|
||||||
* There should be a way to use environment for service configuration.
|
|
||||||
|
|
||||||
Future and proposals
|
Future and proposals
|
||||||
::::::::::::::::::::
|
::::::::::::::::::::
|
||||||
|
|
||||||
This is the first proposed implementation and it will evolve, but looks a lot like how we used bonobo ancestor in
|
This is the first implementation, and it will evolve. Base concepts will stay, though.
|
||||||
production.
|
|
||||||
|
|
||||||
May or may not happen, depending on discussions.
|
May or may not happen, depending on discussions.
|
||||||
|
|
||||||
|
|||||||
@ -1,8 +1,90 @@
|
|||||||
Transformations
|
Transformations
|
||||||
===============
|
===============
|
||||||
|
|
||||||
Here is some guidelines on how to write transformations, to avoid the convention-jungle that could happen without
|
Transformations are the smallest building blocks in Bonobo ETL.
|
||||||
a few rules.
|
|
||||||
|
They are written using standard python callables (or iterables, if you're writing transformations that have no input,
|
||||||
|
a.k.a extractors).
|
||||||
|
|
||||||
|
Definitions
|
||||||
|
:::::::::::
|
||||||
|
|
||||||
|
Transformation
|
||||||
|
|
||||||
|
The base building block of Bonobo, anything you would insert in a graph as a node. Mostly, a callable or an iterable.
|
||||||
|
|
||||||
|
Extractor
|
||||||
|
|
||||||
|
Special case transformation that uses no input. It will be only called once, and its purpose is to generate data,
|
||||||
|
either by itself or by requesting it from an external service.
|
||||||
|
|
||||||
|
Loader
|
||||||
|
|
||||||
|
Special case transformation that feeds an external service with data. For convenience, it can also yield the data but
|
||||||
|
a "pure" loader would have no output (although yielding things should have no bad side effect).
|
||||||
|
|
||||||
|
Callable
|
||||||
|
|
||||||
|
Anything one can call, in python. Can be a function, a python builtin, or anything that implements `__call__`
|
||||||
|
|
||||||
|
Iterable
|
||||||
|
|
||||||
|
Something we can iterate on, in python, so basically anything you'd be able to use in a `for` loop.
|
||||||
|
|
||||||
|
|
||||||
|
Function based transformations
|
||||||
|
::::::::::::::::::::::::::::::
|
||||||
|
|
||||||
|
The most basic transformations are function-based. Which means that you define a function, and it will be used directly
|
||||||
|
in a graph.
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
def get_representation(row):
|
||||||
|
return repr(row)
|
||||||
|
|
||||||
|
graph = bonobo.Graph(
|
||||||
|
[...],
|
||||||
|
get_representation,
|
||||||
|
[...],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
It does not allow any configuration, but if it's an option, prefer it as it's simpler to write.
|
||||||
|
|
||||||
|
|
||||||
|
Class based transformations
|
||||||
|
:::::::::::::::::::::::::::
|
||||||
|
|
||||||
|
For less basic use cases, you'll want to use classes to define some of your transformations. It's also a better choice
|
||||||
|
to build reusable blocks, as you'll be able to create parametrizable transformations that the end user will be able to
|
||||||
|
configure at the last minute.
|
||||||
|
|
||||||
|
|
||||||
|
Configurable
|
||||||
|
------------
|
||||||
|
|
||||||
|
.. autoclass:: bonobo.config.Configurable
|
||||||
|
|
||||||
|
Options
|
||||||
|
-------
|
||||||
|
|
||||||
|
.. autoclass:: bonobo.config.Option
|
||||||
|
|
||||||
|
Services
|
||||||
|
--------
|
||||||
|
|
||||||
|
.. autoclass:: bonobo.config.Service
|
||||||
|
|
||||||
|
Methods
|
||||||
|
-------
|
||||||
|
|
||||||
|
.. autoclass:: bonobo.config.Method
|
||||||
|
|
||||||
|
ContextProcessors
|
||||||
|
-----------------
|
||||||
|
|
||||||
|
.. autoclass:: bonobo.config.ContextProcessor
|
||||||
|
|
||||||
|
|
||||||
Naming conventions
|
Naming conventions
|
||||||
@ -44,50 +126,35 @@ can be used as a graph node, then use camelcase names:
|
|||||||
upper = Apply(str.upper)
|
upper = Apply(str.upper)
|
||||||
|
|
||||||
|
|
||||||
Function based transformations
|
Testing
|
||||||
::::::::::::::::::::::::::::::
|
:::::::
|
||||||
|
|
||||||
|
As Bonobo uses plain old python objects as transformations, it's very easy to unit test your transformations using your
|
||||||
|
favourite testing framework. We're using pytest internally for Bonobo, but it's up to you to use the one you prefer.
|
||||||
|
|
||||||
|
If you want to test a transformation with the surrounding context provided (for example, service instances injected, and
|
||||||
|
context processors applied), you can use :class:`bonobo.execution.NodeExecutionContext` as a context processor and have
|
||||||
|
bonobo send the data to your transformation.
|
||||||
|
|
||||||
The most basic transformations are function-based. Which means that you define a function, and it will be used directly
|
|
||||||
in a graph.
|
|
||||||
|
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
|
|
||||||
def get_representation(row):
|
from bonobo.constants import BEGIN, END
|
||||||
return repr(row)
|
from bonobo.execution import NodeExecutionContext
|
||||||
|
|
||||||
graph = bonobo.Graph(
|
with NodeExecutionContext(
|
||||||
[...],
|
JsonWriter(filename), services={'fs': ...}
|
||||||
get_representation,
|
) as context:
|
||||||
)
|
|
||||||
|
|
||||||
|
# Write a list of rows, including BEGIN/END control messages.
|
||||||
|
context.write(
|
||||||
|
BEGIN,
|
||||||
|
Bag({'foo': 'bar'}),
|
||||||
|
Bag({'foo': 'baz'}),
|
||||||
|
END
|
||||||
|
)
|
||||||
|
|
||||||
It does not allow any configuration, but if it's an option, prefer it as it's simpler to write.
|
# Out of the bonobo main loop, we need to call `step` explicitly.
|
||||||
|
context.step()
|
||||||
|
context.step()
|
||||||
Class based transformations
|
|
||||||
:::::::::::::::::::::::::::
|
|
||||||
|
|
||||||
A lot of logic is a bit more complex, and you'll want to use classes to define some of your transformations.
|
|
||||||
|
|
||||||
The :class:`bonobo.config.Configurable` class gives you a few toys to write configurable transformations.
|
|
||||||
|
|
||||||
Options
|
|
||||||
-------
|
|
||||||
|
|
||||||
.. autoclass:: bonobo.config.Option
|
|
||||||
|
|
||||||
Services
|
|
||||||
--------
|
|
||||||
|
|
||||||
.. autoclass:: bonobo.config.Service
|
|
||||||
|
|
||||||
Methods
|
|
||||||
-------
|
|
||||||
|
|
||||||
.. autoclass:: bonobo.config.Method
|
|
||||||
|
|
||||||
ContextProcessors
|
|
||||||
-----------------
|
|
||||||
|
|
||||||
.. autoclass:: bonobo.config.ContextProcessor
|
|
||||||
|
|
||||||
|
|||||||
@ -11,6 +11,10 @@ Bonobo
|
|||||||
reference/index
|
reference/index
|
||||||
faq
|
faq
|
||||||
contribute/index
|
contribute/index
|
||||||
|
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:hidden:
|
||||||
|
|
||||||
genindex
|
genindex
|
||||||
modindex
|
modindex
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user