From cf5b078620226a0088f282f3218638efa19e03cc Mon Sep 17 00:00:00 2001
From: Romain Dorgueil
Date: Wed, 5 Jul 2017 19:28:42 +0200
Subject: [PATCH 1/3] [doc] Documentation, my dear. Half of the work, looks you
are a little behind on quotas ...
---
bonobo/examples/tutorials/tut02e01_read.py | 9 +-
bonobo/examples/tutorials/tut02e02_write.py | 16 +-
.../examples/tutorials/tut02e03_writeasmap.py | 18 +-
docs/_templates/index.html | 207 +++++++++---------
docs/guide/ext/sqlalchemy.rst | 1 +
docs/index.rst | 2 +-
docs/install.rst | 48 ++--
docs/tutorial/index.rst | 19 +-
docs/tutorial/tut01.rst | 56 ++---
docs/tutorial/tut02.rst | 29 ++-
docs/tutorial/tut03.rst | 190 +++++++++++++++-
11 files changed, 418 insertions(+), 177 deletions(-)
diff --git a/bonobo/examples/tutorials/tut02e01_read.py b/bonobo/examples/tutorials/tut02e01_read.py
index 0eb6786..362051a 100644
--- a/bonobo/examples/tutorials/tut02e01_read.py
+++ b/bonobo/examples/tutorials/tut02e01_read.py
@@ -5,7 +5,10 @@ graph = bonobo.Graph(
print,
)
+
+def get_services():
+ return {'fs': bonobo.open_examples_fs('datasets')}
+
+
if __name__ == '__main__':
- bonobo.run(
- graph, services={'fs': bonobo.open_examples_fs('datasets')}
- )
+ bonobo.run(graph, services=get_services())
diff --git a/bonobo/examples/tutorials/tut02e02_write.py b/bonobo/examples/tutorials/tut02e02_write.py
index 1d41ac2..e5a8445 100644
--- a/bonobo/examples/tutorials/tut02e02_write.py
+++ b/bonobo/examples/tutorials/tut02e02_write.py
@@ -8,10 +8,18 @@ def split_one(line):
graph = bonobo.Graph(
bonobo.FileReader('coffeeshops.txt'),
split_one,
- bonobo.JsonWriter('coffeeshops.json'),
+ bonobo.JsonWriter(
+ 'coffeeshops.json', fs='fs.output', ioformat='arg0'
+ ),
)
+
+def get_services():
+ return {
+ 'fs': bonobo.open_examples_fs('datasets'),
+ 'fs.output': bonobo.open_fs(),
+ }
+
+
if __name__ == '__main__':
- bonobo.run(
- graph, services={'fs': bonobo.open_examples_fs('datasets')}
- )
+ bonobo.run(graph, services=get_services())
diff --git a/bonobo/examples/tutorials/tut02e03_writeasmap.py b/bonobo/examples/tutorials/tut02e03_writeasmap.py
index 3fe4c08..e234f22 100644
--- a/bonobo/examples/tutorials/tut02e03_writeasmap.py
+++ b/bonobo/examples/tutorials/tut02e03_writeasmap.py
@@ -1,4 +1,6 @@
-import bonobo, json
+import json
+
+import bonobo
def split_one_to_map(line):
@@ -18,10 +20,16 @@ class MyJsonWriter(bonobo.JsonWriter):
graph = bonobo.Graph(
bonobo.FileReader('coffeeshops.txt'),
split_one_to_map,
- MyJsonWriter('coffeeshops.json'),
+ MyJsonWriter('coffeeshops.json', fs='fs.output', ioformat='arg0'),
)
+
+def get_services():
+ return {
+ 'fs': bonobo.open_examples_fs('datasets'),
+ 'fs.output': bonobo.open_fs(),
+ }
+
+
if __name__ == '__main__':
- bonobo.run(
- graph, services={'fs': bonobo.open_examples_fs('datasets')}
- )
+ bonobo.run(graph, services=get_services())
diff --git a/docs/_templates/index.html b/docs/_templates/index.html
index b778554..8f9185a 100644
--- a/docs/_templates/index.html
+++ b/docs/_templates/index.html
@@ -2,105 +2,116 @@
{% set title = _('Bonobo — Data processing for humans') %}
{% block body %}
-
- Bonobo is ALPHA software. Some APIs will change.
-
+
+
+
-
-
-
-
-
- {% trans %}
- Bonobo is a line-by-line data-processing toolkit for python 3.5+ (extract-transform-load
- framework) emphasizing simple and atomic data transformations defined using a directed graph of plain old
- python objects (functions, iterables, generators, ...).
- {% endtrans %}
-
-
- {% trans %}Documentation{% endtrans %}
-
-
-
- |
- {% trans %}First steps{% endtrans %}
- {% trans %}quick overview of basic features{% endtrans %}
- |
-
- {% trans %}
- Search{% endtrans %}
- {% trans %}search the documentation{% endtrans %}
- |
-
-
- |
- {% trans %}
- Guides{% endtrans %}
- {% trans %}for a complete overview{% endtrans %}
-
- |
-
- {% trans %}References{% endtrans %}
-
- {% trans %}all functions, classes, terms{% endtrans %}
-
- |
-
-
- |
- {% trans %}
- Cookbook{% endtrans %}
- {% trans %}examples and recipes{% endtrans %}
- |
-
- {% trans %}
- Contribute{% endtrans %}
- {% trans %}contributor guide{% endtrans %}
- |
-
-
-
- Features
-
-
- -
- {% trans %}
- 10 minutes to get started: Know some python? Writing your first data processor is an affair
- of minutes.
- {% endtrans %}
-
- -
- {% trans %}
- Data sources and targets: HTML, JSON, XML, SQL databases, NoSQL databases, HTTP/REST APIs,
- streaming APIs, python objects...
- {% endtrans %}
-
- -
- {% trans %}
- Service injection: Abstract the transformation dependencies to easily switch data sources and
- dependant libraries. You'll be able to specify the concrete implementations or configurations at
- runtime, for example to switch a database connection string or an API endpoint.
- {% endtrans %}
-
- -
- {% trans %}
- Plugins: Easily add features to all your transformations by using builtin plugins (Jupyter,
- Console, ...) or write your own.
- {% endtrans %}
-
- -
- {% trans %}
- Bonobo is young, and the todo-list is huge. Read the roadmap.
- {% endtrans %}
-
-
-
- {% trans %}
- You can also download PDF/EPUB versions of the Bonobo documentation:
- PDF version,
- EPUB version.
+
+ {% trans %}
+ Bonobo is a line-by-line data-processing toolkit for python 3.5+ (extract-transform-load
+ framework, or ETL) emphasizing simple and atomic data transformations defined using a directed graph of plain old
+ python objects (functions, iterables, generators, ...).
{% endtrans %}
-
+
+
+ Bonobo is ALPHA software. Some APIs will change.
+
+
+
+{% trans %}Documentation{% endtrans %}
+
+
+
+ |
+ {% trans %}First steps{%
+ endtrans %}
+ {% trans %}quick overview of basic features{% endtrans %}
+ |
+
+ {% trans %}
+ Search{% endtrans %}
+ {% trans %}search the documentation{% endtrans %}
+ |
+
+
+ |
+ {% trans %}
+ Guides{% endtrans %}
+ {% trans %}for a complete overview{% endtrans %}
+
+ |
+
+ {% trans %}References{%
+ endtrans %}
+
+ {% trans %}all functions, classes, terms{% endtrans %}
+
+ |
+
+
+ |
+ {% trans %}
+ Cookbook{% endtrans %}
+ {% trans %}examples and recipes{% endtrans %}
+ |
+
+ {% trans %}
+ Contribute{% endtrans %}
+ {% trans %}contributor guide{% endtrans %}
+ |
+
+
+
+Features
+
+
+ -
+ {% trans %}
+ 10 minutes to get started: Know some python? Writing your first data processor is an affair
+ of minutes.
+ {% endtrans %}
+
+ -
+ {% trans %}
+ Data sources and targets: HTML, JSON, XML, SQL databases, NoSQL databases, HTTP/REST APIs,
+ streaming APIs, python objects...
+ {% endtrans %}
+
+ -
+ {% trans %}
+ Service injection: Abstract the transformation dependencies to easily switch data sources and
+ dependant libraries. You'll be able to specify the concrete implementations or configurations at
+ runtime, for example to switch a database connection string or an API endpoint.
+ {% endtrans %}
+
+ -
+ {% trans %}
+ Plugins: Easily add features to all your transformations by using builtin plugins (Jupyter,
+ Console, ...) or write your own.
+ {% endtrans %}
+
+ -
+ {% trans %}
+ Bonobo is young, and the todo-list is huge. Read the roadmap.
+ {% endtrans %}
+
+
+
+{% trans %}
+ You can also download PDF/EPUB versions of the Bonobo documentation:
+ PDF version,
+ EPUB version.
+ {% endtrans %}
+
+
+Table of contents
+
+
+
+ {{ toctree(maxdepth=2, collapse=False)}}
+
{% endblock %}
diff --git a/docs/guide/ext/sqlalchemy.rst b/docs/guide/ext/sqlalchemy.rst
index 0f9c549..d7da4e8 100644
--- a/docs/guide/ext/sqlalchemy.rst
+++ b/docs/guide/ext/sqlalchemy.rst
@@ -4,6 +4,7 @@ Bonobo with SQLAlchemy
.. todo:: The `bonobo-sqlalchemy` package is at a very alpha stage, and things will change. This section is here to
give a brief overview but is neither complete nor definitive.
+Read the introduction: https://www.bonobo-project.org/with/sqlalchemy
Installation
::::::::::::
diff --git a/docs/index.rst b/docs/index.rst
index c2eeb8d..8fbcd6e 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -8,8 +8,8 @@ Bonobo
tutorial/index
guide/index
reference/index
- contribute/index
faq
+ contribute/index
genindex
modindex
diff --git a/docs/install.rst b/docs/install.rst
index 41487e4..ac951e5 100644
--- a/docs/install.rst
+++ b/docs/install.rst
@@ -4,36 +4,47 @@ Installation
Create an ETL project
:::::::::::::::::::::
-If you only want to use Bonobo to code ETLs, your easiest option to get started is to use our
-`cookiecutter template `_.
+Creating a project and starting to write code should take less than a minute:
+
+.. code-block:: shell-session
+
+ $ pip install --upgrade bonobo cookiecutter
+ $ bonobo init my-etl-project
+ $ bonobo run my-etl-project
+
+Once you bootstrapped a project, you can start editing the default example transformation by editing
+`my-etl-project/main.py`.
+
+Other installation options
+::::::::::::::::::::::::::
Install from PyPI
-:::::::::::::::::
+-----------------
-You can also install it directly from the `Python Package Index `_.
+You can install it directly from the `Python Package Index `_ (like we did above).
.. code-block:: shell-session
$ pip install bonobo
Install from source
-:::::::::::::::::::
+-------------------
If you want to install an unreleased version, you can use git urls with pip. This is useful when using bonobo as a
-dependency of your code and you want to try a forked version of bonobo with your software. You can use the git+http
-string in your `requirements.txt` file. However, the best option for development on bonobo directly is not this one,
-but editable installs (see below).
+dependency of your code and you want to try a forked version of bonobo with your software. You can use a `git+http`
+string in your `requirements.txt` file. However, the best option for development on bonobo is an editable install (see
+below).
.. code-block:: shell-session
- $ pip install git+https://github.com/python-bonobo/bonobo.git@master#egg=bonobo
+ $ pip install git+https://github.com/python-bonobo/bonobo.git@develop#egg=bonobo
Editable install
-::::::::::::::::
+----------------
-If you plan on making patches to Bonobo, you should install it as an "editable" package, which is a really great pip feature.
-Pip will clone your repository in a source directory and create a symlink for it in the site-package directory of your
-python interpreter.
+If you plan on making patches to Bonobo, you should install it as an "editable" package, which is a really great pip
+feature. Pip will clone your repository in a source directory and create a symlink for it in the site-package directory
+of your python interpreter.
.. code-block:: shell-session
@@ -63,20 +74,17 @@ I usually name the git remote for the main bonobo repository "upstream", and my
$ git remote rename origin upstream
$ git remote add origin git@github.com:hartym/bonobo.git
+ $ git fetch --all
Of course, replace my github username by the one you used to fork bonobo. You should be good to go!
Windows support
:::::::::::::::
-There are problems on the windows platform, mostly due to the fact bonobo was not developed by experienced windows users.
+There are minor issues on the windows platform, mostly due to the fact bonobo was not developed by experienced windows
+users.
We're trying to look into that but energy available to provide serious support on windows is very limited.
+
If you have experience in this domain and you're willing to help, you're more than welcome!
-
-
-.. todo::
-
- Better install docs, especially on how to use different forks or branches, etc.
-
diff --git a/docs/tutorial/index.rst b/docs/tutorial/index.rst
index 342449b..d449df5 100644
--- a/docs/tutorial/index.rst
+++ b/docs/tutorial/index.rst
@@ -9,17 +9,26 @@ python code in charge of handling similar shaped independant lines of data.
Bonobo *is not* a statistical or data-science tool. If you're looking for a data-analysis tool in python, use Pandas.
-Bonobo is a lean manufacturing assembly line for data that let you focus on the actual work instead of the plumbery.
+Bonobo is a lean manufacturing assembly line for data that lets you focus on the actual work instead of the plumbing
+(execution contexts, parallelism, error handling, console output, logging, ...).
Bonobo uses simple python and should be quick and easy to learn.
Tutorial
::::::::
-Warning: the documentation is still in progress. Although all content here should be accurate, you may feel a lack of
-completeness, for which we plaid guilty and apologize. If there is something blocking, please come on our
-`slack channel `_ and complain, we'll figure something out. If there is something
-that did not block you but can be a no-go for others, please consider contributing to the docs.
+.. note::
+
+ Good documentation is not easy to write. We do our best to make it better and better.
+
+ Although all content here should be accurate, you may feel a lack of completeness, for which we plead guilty and
+ apologize.
+
+ If you're stuck, please come and ask on our `slack channel `_, we'll figure
+ something out.
+
+ If you're not stuck but had trouble understanding something, please consider contributing to the docs (via github
+ pull requests).
.. toctree::
:maxdepth: 2
diff --git a/docs/tutorial/tut01.rst b/docs/tutorial/tut01.rst
index 0357926..d6aa604 100644
--- a/docs/tutorial/tut01.rst
+++ b/docs/tutorial/tut01.rst
@@ -19,7 +19,7 @@ can run.
.. code-block:: shell-session
- bonobo init tutorial
+ $ bonobo init tutorial
This will create a `tutorial` directory (`content description here `_).
@@ -27,15 +27,15 @@ To run this project, use:
.. code-block:: shell-session
- bonobo run tutorial
+ $ bonobo run tutorial
Write a first transformation
::::::::::::::::::::::::::::
-Open `tutorial/__main__.py`, and delete all the code here.
+Open `tutorial/main.py`, and delete all the code here.
-A transformation can be whatever python can call, having inputs and outputs. Simplest transformations are functions.
+A transformation can be whatever python can call. Simplest transformations are functions and generators.
Let's write one:
@@ -48,10 +48,10 @@ Easy.
.. note::
- This is about the same as :func:`str.upper`, and in the real world, you'd use it directly.
+ This function is very similar to :func:`str.upper`, which you can use directly.
Let's write two more transformations for the "extract" and "load" steps. In this example, we'll generate the data from
-scratch, and we'll use stdout to simulate data-persistence.
+scratch, and we'll use stdout to "simulate" data-persistence.
.. code-block:: python
@@ -68,16 +68,16 @@ on things returned, and a normal function will just be seen as a generator that
.. note::
- Once again, :func:`print` would be used directly in a real-world transformation.
+ Once again, you should use the builtin :func:`print` directly instead of this `load()` function.
Create a transformation graph
:::::::::::::::::::::::::::::
-Bonobo main roles are two things:
+Amongst other features, Bonobo will mostly help you there with the following:
* Execute the transformations in independant threads
-* Pass the outputs of one thread to other(s) thread(s).
+* Pass the outputs of one thread to the inputs of other thread(s).
To do this, it needs to know what data-flow you want to achieve, and you'll use a :class:`bonobo.Graph` to describe it.
@@ -109,17 +109,17 @@ To do this, it needs to know what data-flow you want to achieve, and you'll use
Execute the job
:::::::::::::::
-Save `tutorial/__main__.py` and execute your transformation:
+Save `tutorial/main.py` and execute your transformation again:
.. code-block:: shell-session
- bonobo run tutorial
+ $ bonobo run tutorial
This example is available in :mod:`bonobo.examples.tutorials.tut01e01`, and you can also run it as a module:
.. code-block:: shell-session
- bonobo run -m bonobo.examples.tutorials.tut01e01
+ $ bonobo run -m bonobo.examples.tutorials.tut01e01
Rewrite it using builtins
@@ -127,27 +127,17 @@ Rewrite it using builtins
There is a much simpler way to describe an equivalent graph:
-.. code-block:: python
+.. literalinclude:: ../../bonobo/examples/tutorials/tut01e02.py
+ :language: python
- import bonobo
+The `extract()` generator has been replaced by a list, as Bonobo will interpret non-callable iterables as a no-input
+generator.
- graph = bonobo.Graph(
- ['foo', 'bar', 'baz',],
- str.upper,
- print,
- )
-
- if __name__ == '__main__':
- bonobo.run(graph)
-
-We use a shortcut notation for the generator, with a list. Bonobo will wrap an iterable as a generator by itself if it
-is added in a graph.
-
-This example is available in :mod:`bonobo.examples.tutorials.tut01e02`, and you can also run it as a module:
+This example is also available in :mod:`bonobo.examples.tutorials.tut01e02`, and you can run it as a module:
.. code-block:: shell-session
- bonobo run -m bonobo.examples.tutorials.tut01e02
+ $ bonobo run -m bonobo.examples.tutorials.tut01e02
You can now jump to the next part (:doc:`tut02`), or read a small summary of concepts and definitions introduced here
below.
@@ -188,19 +178,19 @@ cases.
Concepts and definitions
::::::::::::::::::::::::
-* Transformation: a callable that takes input (as call parameters) and returns output(s), either as its return value or
+* **Transformation**: a callable that takes input (as call parameters) and returns output(s), either as its return value or
by yielding values (a.k.a returning a generator).
-* Transformation graph (or Graph): a set of transformations tied together in a :class:`bonobo.Graph` instance, which is
+* **Transformation graph (or Graph)**: a set of transformations tied together in a :class:`bonobo.Graph` instance, which is
a directed acyclic graph (or DAG).
-* Node: a graph element, most probably a transformation in a graph.
+* **Node**: a graph element, most probably a transformation in a graph.
-* Execution strategy (or strategy): a way to run a transformation graph. It's responsibility is mainly to parallelize
+* **Execution strategy (or strategy)**: a way to run a transformation graph. Its responsibility is mainly to parallelize
(or not) the transformations, on one or more process and/or computer, and to setup the right queuing mechanism for
transformations' inputs and outputs.
-* Execution context (or context): a wrapper around a node that holds the state for it. If the node needs state, there
+* **Execution context (or context)**: a wrapper around a node that holds the state for it. If the node needs state, there
are tools available in bonobo to feed it to the transformation using additional call parameters, keeping
transformations stateless.
diff --git a/docs/tutorial/tut02.rst b/docs/tutorial/tut02.rst
index ff562d1..b1545f9 100644
--- a/docs/tutorial/tut02.rst
+++ b/docs/tutorial/tut02.rst
@@ -23,16 +23,18 @@ When run, the execution strategy wraps every component in a thread (assuming you
:class:`bonobo.strategies.ThreadPoolExecutorStrategy`).
Bonobo will send each line of data in the input node's thread (here, `A`). Now, each time `A` *yields* or *returns*
-something, it will be pushed on `B` input :class:`queue.Queue`, and will be consumed by `B`'s thread.
+something, it will be pushed on `B` input :class:`queue.Queue`, and will be consumed by `B`'s thread. Meanwhile, `A`
+will continue to run, if it's not done.
-When there is more than one node linked as the output of a node (for example, with `B`, `C`, and `D`) , the same thing
+When there is more than one node linked as the output of a node (for example, with `B`, `C`, and `D`), the same thing
happens except that each result coming out of `B` will be sent to both on `C` and `D` input :class:`queue.Queue`.
One thing to keep in mind here is that as the objects are passed from thread to thread, you need to write "pure"
transformations (see :doc:`/guide/purity`).
You generally don't have to think about it. Just be aware that your nodes will run in parallel, and don't worry
-too much about blocking nodes, as they won't block other nodes.
+too much about nodes running blocking operations, as they will run in parallel. As soon as a line of output is ready,
+the next nodes will start consuming it.
That being said, let's manipulate some files.
@@ -52,18 +54,33 @@ We'll use a text file that was generated using Bonobo from the "liste-des-cafes-
Mairie de Paris under the Open Database License (ODbL). You can `explore the original dataset
`_.
-You'll need the `example dataset `_,
-available in **Bonobo**'s repository.
+You'll need the `"coffeeshops.txt" example dataset `_,
+available in **Bonobo**'s repository:
+
+.. code-block:: shell-session
+
+ $ curl https://raw.githubusercontent.com/python-bonobo/bonobo/master/bonobo/examples/datasets/coffeeshops.txt > `python -c 'import bonobo; print(bonobo.get_examples_path("datasets/coffeeshops.txt"))'`
+
+.. note::
+
+ The "example dataset download" step will be easier in the future.
+
+ https://github.com/python-bonobo/bonobo/issues/134
.. literalinclude:: ../../bonobo/examples/tutorials/tut02e01_read.py
:language: python
-You can run this example as a module:
+You can also run this example as a module (but you'll still need the dataset...):
.. code-block:: shell-session
$ bonobo run -m bonobo.examples.tutorials.tut02e01_read
+.. note::
+
+ Don't focus too much on the `get_services()` function for now. It is required, with this exact name, but we'll get
+ into that in a few minutes.
+
Writing to files
::::::::::::::::
diff --git a/docs/tutorial/tut03.rst b/docs/tutorial/tut03.rst
index 2721430..325bc9d 100644
--- a/docs/tutorial/tut03.rst
+++ b/docs/tutorial/tut03.rst
@@ -1,9 +1,195 @@
Configurables and Services
==========================
-This document does not exist yet, but will be available soon.
+.. note::
-Meanwhile, you can read the matching references:
+ This section lacks completeness, sorry for that (but you can still read it!).
+
+In the last section, we used a few new tools.
+
+Class-based transformations and configurables
+:::::::::::::::::::::::::::::::::::::::::::::
+
+Bonobo is a bit dumb. If something is callable, it considers it can be used as a transformation, and it's up to the
+user to provide callables that logically fit in a graph.
+
+You can use plain python objects with a `__call__()` method, and it will just work.
+
+As a lot of transformations need common machinery, there are a few tools to quickly build transformations, most of
+them requiring your class to subclass :class:`bonobo.config.Configurable`.
+
+Configurables allow you to use the following features:
+
+* You can add **Options** (using the :class:`bonobo.config.Option` descriptor). Options can be positional, or keyword
+ based, can have a default value and will be consumed from the constructor arguments.
+
+ .. code-block:: python
+
+ from bonobo.config import Configurable, Option
+
+ class PrefixIt(Configurable):
+ prefix = Option(str, positional=True, default='>>>')
+
+ def call(self, row):
+ return self.prefix + ' ' + row
+
+ prefixer = PrefixIt('$')
+
+* You can add **Services** (using the :class:`bonobo.config.Service` descriptor). Services are a subclass of
+ :class:`bonobo.config.Option`, sharing the same basics, but specialized in the definition of "named services" that
+ will be resolved at runtime (a.k.a for which we will provide an implementation at runtime). We'll dive more into that
+ in the next section.
+
+ .. code-block:: python
+
+ from bonobo.config import Configurable, Option, Service
+
+ class HttpGet(Configurable):
+ url = Option(default='https://jsonplaceholder.typicode.com/users')
+ http = Service('http.client')
+
+ def call(self, http):
+ resp = http.get(self.url)
+
+ for row in resp.json():
+ yield row
+
+ http_get = HttpGet()
+
+
+* You can add **Methods** (using the :class:`bonobo.config.Method` descriptor). :class:`bonobo.config.Method` is a
+ subclass of :class:`bonobo.config.Option` that allows passing callable parameters, either to the class constructor,
+ or using the class as a decorator.
+
+ .. code-block:: python
+
+ from bonobo.config import Configurable, Method
+
+ class Applier(Configurable):
+ apply = Method()
+
+ def call(self, row):
+ return self.apply(row)
+
+ @Applier
+ def Prefixer(self, row):
+ return 'Hello, ' + row
+
+ prefixer = Prefixer()
+
+* You can add **ContextProcessors**, which are an advanced feature we won't introduce here. If you're familiar with
+ pytest, you can think of them as pytest fixtures, execution wise.
+
+Services
+::::::::
+
+The motivation behind services is mostly separation of concerns, testability and deployability.
+
+Usually, your transformations will depend on services (like a filesystem, an http client, a database, a rest api, ...).
+Those services can very well be hardcoded in the transformations, but there are two main drawbacks:
+
+* You won't be able to change the implementation depending on the current environment (development laptop versus
+ production servers, bug-hunting session versus execution, etc.)
+* You won't be able to test your transformations without testing the associated services.
+
+To overcome those caveats of hardcoding things, we define Services in the configurable, which are basically
+string-options of the service names, and we provide an implementation at the last moment possible.
+
+There are two ways of providing implementations:
+
+* Either file-wide, by providing a `get_services()` function that returns a dict of named implementations (we did so
+ with filesystems in the previous step, :doc:`tut02`)
+* Or directory-wide, by providing a `get_services()` function in a specially named `_services.py` file.
+
+The first is simpler if you only have one transformation graph in one file, the second allows grouping coherent
+transformations together in a directory and share the implementations.
+
+Let's see how to use it, starting from the previous service example:
+
+.. code-block:: python
+
+ from bonobo.config import Configurable, Option, Service
+
+ class HttpGet(Configurable):
+ url = Option(default='https://jsonplaceholder.typicode.com/users')
+ http = Service('http.client')
+
+ def call(self, http):
+ resp = http.get(self.url)
+
+ for row in resp.json():
+ yield row
+
+We defined an "http.client" service, that obviously should have a `get()` method, returning responses that have a
+`json()` method.
+
+Let's provide two implementations for that. The first one will be using `requests `_,
+that coincidentally satisfies the described interface:
+
+.. code-block:: python
+
+ import bonobo
+ import requests
+
+ def get_services():
+ return {
+ 'http.client': requests
+ }
+
+ graph = bonobo.Graph(
+ HttpGet(),
+ print,
+ )
+
+If you run this code, you should see some mock data returned by the webservice we called (assuming it's up and you can
+reach it).
+
+Now, the second implementation will replace that with a mock, used for testing purposes:
+
+.. code-block:: python
+
+ class HttpResponseStub:
+ def json(self):
+ return [
+ {'id': 1, 'name': 'Leanne Graham', 'username': 'Bret', 'email': 'Sincere@april.biz', 'address': {'street': 'Kulas Light', 'suite': 'Apt. 556', 'city': 'Gwenborough', 'zipcode': '92998-3874', 'geo': {'lat': '-37.3159', 'lng': '81.1496'}}, 'phone': '1-770-736-8031 x56442', 'website': 'hildegard.org', 'company': {'name': 'Romaguera-Crona', 'catchPhrase': 'Multi-layered client-server neural-net', 'bs': 'harness real-time e-markets'}},
+ {'id': 2, 'name': 'Ervin Howell', 'username': 'Antonette', 'email': 'Shanna@melissa.tv', 'address': {'street': 'Victor Plains', 'suite': 'Suite 879', 'city': 'Wisokyburgh', 'zipcode': '90566-7771', 'geo': {'lat': '-43.9509', 'lng': '-34.4618'}}, 'phone': '010-692-6593 x09125', 'website': 'anastasia.net', 'company': {'name': 'Deckow-Crist', 'catchPhrase': 'Proactive didactic contingency', 'bs': 'synergize scalable supply-chains'}},
+ ]
+
+ class HttpStub:
+ def get(self, url):
+ return HttpResponseStub()
+
+ def get_services():
+ return {
+ 'http.client': HttpStub()
+ }
+
+ graph = bonobo.Graph(
+ HttpGet(),
+ print,
+ )
+
+The `Graph` definition staying the exact same, you can easily substitute the `_services.py` file depending on your
+environment (the way you're doing this is out of bonobo scope and heavily depends on your usual way of managing
+configuration files on different platforms).
+
+Starting with bonobo 0.5 (not yet released), you will be able to use service injections with function-based
+transformations too, using the `bonobo.config.requires` decorator to mark a dependency.
+
+.. code-block:: python
+
+ from bonobo.config import requires
+
+ @requires('http.client')
+ def http_get(http):
+ resp = http.get('https://jsonplaceholder.typicode.com/users')
+
+ for row in resp.json():
+ yield row
+
+
+Read more
+:::::::::
* :doc:`/guide/services`
* :doc:`/reference/api_config`
From 5d59a72310e37ad5d691fa9896567a107c247784 Mon Sep 17 00:00:00 2001
From: Romain Dorgueil
Date: Thu, 6 Jul 2017 11:29:55 +0200
Subject: [PATCH 2/3] [core] Adds a .copy() method to graph structure.
---
bonobo/structs/graphs.py | 11 +++++++++++
tests/structs/test_graphs.py | 20 ++++++++++++++++++++
2 files changed, 31 insertions(+)
diff --git a/bonobo/structs/graphs.py b/bonobo/structs/graphs.py
index ccafb6b..fe7c1df 100644
--- a/bonobo/structs/graphs.py
+++ b/bonobo/structs/graphs.py
@@ -1,3 +1,5 @@
+from copy import copy
+
from bonobo.constants import BEGIN
@@ -62,6 +64,15 @@ class Graph:
return self
+ def copy(self):
+ g = Graph()
+
+ g.edges = copy(self.edges)
+ g.named = copy(self.named)
+ g.nodes = copy(self.nodes)
+
+ return g
+
@property
def topologically_sorted_indexes(self):
"""Iterate in topological order, based on networkx's topological_sort() function.
diff --git a/tests/structs/test_graphs.py b/tests/structs/test_graphs.py
index af1a6df..7f3a58d 100644
--- a/tests/structs/test_graphs.py
+++ b/tests/structs/test_graphs.py
@@ -71,3 +71,23 @@ def test_graph_topological_sort():
assert g.topologically_sorted_indexes.index(3) < g.topologically_sorted_indexes.index(4)
assert g[3] == sentinel.b1
assert g[4] == sentinel.b2
+
+
+def test_copy():
+ g1 = Graph()
+ g2 = g1.copy()
+
+ assert g1 is not g2
+
+ assert len(g1) == 0
+ assert len(g2) == 0
+
+ g1.add_chain([])
+
+ assert len(g1) == 1
+ assert len(g2) == 0
+
+ g2.add_chain([], identity)
+
+ assert len(g1) == 1
+ assert len(g2) == 2
From 0f23f1a940e280c30e6d988d9f107795840bff60 Mon Sep 17 00:00:00 2001
From: Romain Dorgueil
Date: Thu, 6 Jul 2017 12:40:55 +0200
Subject: [PATCH 3/3] [docs] First draft of sqlalchemy tutorial.
---
docs/install.rst | 2 +-
docs/tutorial/tut04.rst | 197 +++++++++++++++++++++++++++++++++++++++-
2 files changed, 195 insertions(+), 4 deletions(-)
diff --git a/docs/install.rst b/docs/install.rst
index ac951e5..87df3d3 100644
--- a/docs/install.rst
+++ b/docs/install.rst
@@ -13,7 +13,7 @@ Creating a project and starting to write code should take less than a minute:
$ bonobo run my-etl-project
Once you bootstrapped a project, you can start editing the default example transformation by editing
-`my-etl-project/main.py`.
+`my-etl-project/main.py`. Now, you can head to :doc:`tutorial/index`.
Other installation options
::::::::::::::::::::::::::
diff --git a/docs/tutorial/tut04.rst b/docs/tutorial/tut04.rst
index 14888d5..69e1846 100644
--- a/docs/tutorial/tut04.rst
+++ b/docs/tutorial/tut04.rst
@@ -1,8 +1,199 @@
Working with databases
======================
-This document does not exist yet, but will be available soon.
+Databases (and especially SQL databases here) are not the focus of Bonobo, thus support for it is not (and will never
+be) included in the main package. Instead, working with databases is done using third party, well maintained and
+specialized packages, like SQLAlchemy, or other database access libraries from the python cheese shop.
-Meanwhile, you can jump to bonobo-sqlalchemy development repository:
+.. note::
+
+ The SQLAlchemy extension is not yet complete. Things may not be optimal, and some APIs will change. You can still try,
+ of course.
+
+ Consider the following document as a "preview" (yes, it should work, yes it may break in the future).
+
+ Also, note that for early development stages, we explicitly support only PostgreSQL, although it may work well
+ with `any other database supported by SQLAlchemy <http://docs.sqlalchemy.org/en/latest/core/engines.html#supported-databases>`_.
+
+First, read https://www.bonobo-project.org/with/sqlalchemy for instructions on how to install. You **do need** the
+bleeding edge version of `bonobo` and `bonobo-sqlalchemy` to make this work.
+
+Additional requirements
+:::::::::::::::::::::::
+
+Once you installed `bonobo_sqlalchemy` (read https://www.bonobo-project.org/with/sqlalchemy to use bleeding edge
+version), install the following additional packages:
+
+.. code-block:: shell-session
+
+ $ pip install -U python-dotenv psycopg2 awesome-slugify
+
+Those packages are not required by the extension, but `python-dotenv` will help us configure the database DSN, and
+`psycopg2` is required by SQLAlchemy to connect to PostgreSQL databases. Also, we'll use a slugifier to create unique
+identifiers for the database (maybe not what you'd do in the real world, but very much sufficient for example purposes).
+
+Configure a database engine
+:::::::::::::::::::::::::::
+
+Open your `_services.py` file and replace the code:
+
+.. code-block:: python
+
+ import bonobo
+ import dotenv
+
+ from bonobo_sqlalchemy.util import create_postgresql_engine
+
+ dotenv.load_dotenv(dotenv.find_dotenv())
+
+ def get_services():
+ return {
+ 'fs': bonobo.open_fs(),
+ 'sqlalchemy.engine': create_postgresql_engine(name='tutorial')
+ }
+
+The `create_postgresql_engine` is a tiny function building the DSN from reasonable defaults, that you can override
+either by providing kwargs, or with system environment variables. If you want to override something, open the `.env`
+file and add values for one or more of `POSTGRES_NAME`, `POSTGRES_USER`, `POSTGRES_PASS`, `POSTGRES_HOST`,
+`POSTGRES_PORT`. Please note that kwargs always take precedence over the environment, but that you should prefer using
+environment variables for anything that is not immutable from one platform to another.
+
+Let's create a `tutorial/pgdb.py` job:
+
+.. code-block:: python
+
+ import bonobo
+ import bonobo_sqlalchemy
+
+ from bonobo.examples.tutorials.tut02e03_writeasmap import graph, split_one_to_map
+
+ graph = graph.copy()
+ graph.add_chain(
+ bonobo_sqlalchemy.InsertOrUpdate('coffeeshops'),
+ _input=split_one_to_map
+ )
+
+Notes here:
+
+* We use the code from :doc:`tut02`, which is bundled with bonobo in the `bonobo.examples.tutorials` package.
+* We "fork" the graph, by creating a copy and appending a new "chain", starting at a point that exists in the other
+ graph.
+* We use :class:`bonobo_sqlalchemy.InsertOrUpdate` (whose role, in case it is not obvious, is to create database rows if
+ they do not exist yet, or update the existing row, based on a "discriminant" criterion (by default, "id")).
+
+If we run this transformation (with `bonobo run tutorial/pgdb.py`), we should get an error:
+
+.. code-block:: text
+
+ | File ".../lib/python3.6/site-packages/psycopg2/__init__.py", line 130, in connect
+ | conn = _connect(dsn, connection_factory=connection_factory, **kwasync)
+ | sqlalchemy.exc.OperationalError: (psycopg2.OperationalError) FATAL: database "tutorial" does not exist
+ |
+ |
+ | The above exception was the direct cause of the following exception:
+ |
+ | Traceback (most recent call last):
+ | File ".../bonobo-devkit/bonobo/bonobo/strategies/executor.py", line 45, in _runner
+ | node_context.start()
+ | File ".../bonobo-devkit/bonobo/bonobo/execution/base.py", line 75, in start
+ | self._stack.setup(self)
+ | File ".../bonobo-devkit/bonobo/bonobo/config/processors.py", line 94, in setup
+ | _append_to_context = next(_processed)
+ | File ".../bonobo-devkit/bonobo-sqlalchemy/bonobo_sqlalchemy/writers.py", line 43, in create_connection
+ | raise UnrecoverableError('Could not create SQLAlchemy connection: {}.'.format(str(exc).replace('\n', ''))) from exc
+ | bonobo.errors.UnrecoverableError: Could not create SQLAlchemy connection: (psycopg2.OperationalError) FATAL: database "tutorial" does not exist.
+
+The database we requested does not exist. It is not the role of bonobo to do database administration, and thus there is
+no tool here to create either the database or the tables we want to use.
+
+There are however tools in `sqlalchemy` to manage tables, so we'll create the database by ourselves, and ask sqlalchemy
+to create the table:
+
+.. code-block:: shell-session
+
+ $ psql -U postgres -h localhost
+
+ psql (9.6.1, server 9.6.3)
+ Type "help" for help.
+
+ postgres=# CREATE ROLE tutorial WITH LOGIN PASSWORD 'tutorial';
+ CREATE ROLE
+ postgres=# CREATE DATABASE tutorial WITH OWNER=tutorial TEMPLATE=template0 ENCODING='utf-8';
+ CREATE DATABASE
+
+Now, let's use a little trick and add this section to `pgdb.py`:
+
+.. code-block:: python
+
+ import logging, sys
+
+ from bonobo.commands.run import get_default_services
+ from sqlalchemy import Table, Column, String, Integer, MetaData
+
+ def main():
+ services = get_default_services(__file__)
+
+ if len(sys.argv) == 2 and sys.argv[1] == 'reset':
+ engine = services.get('sqlalchemy.engine')
+ metadata = MetaData()
+
+ coffee_table = Table(
+ 'coffeeshops',
+ metadata,
+ Column('id', String(255), primary_key=True),
+ Column('name', String(255)),
+ Column('address', String(255)),
+ )
+
+ logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
+ metadata.drop_all(engine)
+ metadata.create_all(engine)
+ else:
+ return bonobo.run(graph, services=services)
+
+ if __name__ == '__main__':
+ main()
+
+.. note::
+
+ We're using a private API of bonobo here, which is unsatisfactory, discouraged and may change. Some way to get the
+ service dictionary will be added to the public API in a future release of bonobo.
+
+Now run:
+
+.. code-block:: shell-session
+
+ $ python tutorial/pgdb.py reset
+
+Database and table should now exist.
+
+Let's prepare our data for database, and change the `.add_chain(..)` call to do it prior to `InsertOrUpdate(...)`:
+
+.. code-block:: python
+
+ from slugify import slugify_url
+
+ def format_for_db(row):
+ name, address = list(row.items())[0]
+ return {
+ 'id': slugify_url(name),
+ 'name': name,
+ 'address': address,
+ }
+
+ # ...
+
+ graph = graph.copy()
+ graph.add_chain(
+ format_for_db,
+ bonobo_sqlalchemy.InsertOrUpdate('coffeeshops'),
+ _input=split_one_to_map
+ )
+
+You can now run the script (either with `bonobo run tutorial/pgdb.py` or directly with the python interpreter, as we
+added a "main" section) and the dataset should be inserted in your database. If you run it again, no new rows are
+created.
+
+Note that as we forked the graph from :doc:`tut02`, the transformation also writes the data to `coffeeshops.json`, as
+before.
-* https://github.com/hartym/bonobo-sqlalchemy