diff --git a/Makefile b/Makefile index 7dcf058..a6fb6f8 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # This file has been auto-generated. # All changes will be lost, see Projectfile. # -# Updated at 2017-05-28 16:50:32.109035 +# Updated at 2017-05-28 18:02:16.552433 PACKAGE ?= bonobo PYTHON ?= $(shell which python) diff --git a/Projectfile b/Projectfile index bc5443e..46f262b 100644 --- a/Projectfile +++ b/Projectfile @@ -45,6 +45,7 @@ python.add_requirements( 'stevedore >=1.21,<2.0', dev=[ 'pytest-timeout >=1,<2', + 'cookiecutter >=1.5,<1.6', ], docker=[ 'bonobo-docker', diff --git a/bonobo/examples/tutorials/tut01e01.py b/bonobo/examples/tutorials/tut01e01.py new file mode 100644 index 0000000..c524039 --- /dev/null +++ b/bonobo/examples/tutorials/tut01e01.py @@ -0,0 +1,23 @@ +import bonobo + + +def extract(): + yield 'foo' + yield 'bar' + yield 'baz' + + +def transform(x): + return x.upper() + + +def load(x): + print(x) + + +graph = bonobo.Graph(extract, transform, load) + +graph.__doc__ = 'hello' + +if __name__ == '__main__': + bonobo.run(graph) diff --git a/bonobo/examples/tutorials/tut01e02.py b/bonobo/examples/tutorials/tut01e02.py new file mode 100644 index 0000000..3784235 --- /dev/null +++ b/bonobo/examples/tutorials/tut01e02.py @@ -0,0 +1,10 @@ +import bonobo + +graph = bonobo.Graph( + ['foo', 'bar', 'baz', ], + str.upper, + print, +) + +if __name__ == '__main__': + bonobo.run(graph) diff --git a/bonobo/examples/tutorials/tut02_01_read.py b/bonobo/examples/tutorials/tut02e01_read.py similarity index 100% rename from bonobo/examples/tutorials/tut02_01_read.py rename to bonobo/examples/tutorials/tut02e01_read.py diff --git a/bonobo/examples/tutorials/tut02_02_write.py b/bonobo/examples/tutorials/tut02e02_write.py similarity index 100% rename from bonobo/examples/tutorials/tut02_02_write.py rename to bonobo/examples/tutorials/tut02e02_write.py diff --git a/bonobo/examples/tutorials/tut02_03_writeasmap.py 
b/bonobo/examples/tutorials/tut02e03_writeasmap.py similarity index 100% rename from bonobo/examples/tutorials/tut02_03_writeasmap.py rename to bonobo/examples/tutorials/tut02e03_writeasmap.py diff --git a/docs/reference/examples.rst b/docs/reference/examples.rst index e685f79..b36c414 100644 --- a/docs/reference/examples.rst +++ b/docs/reference/examples.rst @@ -1,10 +1,16 @@ Examples ======== -There are a few examples bundled with **bonobo**. You'll find them under the :mod:`bonobo.examples` package, and -you can try them in a clone of bonobo by typing:: +There are a few examples bundled with **bonobo**. - $ bonobo run bonobo/examples/.../file.py +You'll find them under the :mod:`bonobo.examples` package, and you can run them directly as modules:: + + $ bonobo run -m bonobo.examples...module + +.. toctree:: + :maxdepth: 4 + + examples/tutorials Datasets diff --git a/docs/reference/examples/tutorials.rst b/docs/reference/examples/tutorials.rst new file mode 100644 index 0000000..2a5ca4f --- /dev/null +++ b/docs/reference/examples/tutorials.rst @@ -0,0 +1,50 @@ +Examples from the tutorial +========================== + +Examples from :doc:`/tutorial/tut01` +:::::::::::::::::::::::::::::::::::: + +Example 1 +--------- + +.. automodule:: bonobo.examples.tutorials.tut01e01 + :members: + :undoc-members: + :show-inheritance: + +Example 2 +--------- + +.. automodule:: bonobo.examples.tutorials.tut01e02 + :members: + :undoc-members: + :show-inheritance: + +Examples from :doc:`/tutorial/tut02` +:::::::::::::::::::::::::::::::::::: + +Example 1: Read +--------------- + +.. automodule:: bonobo.examples.tutorials.tut02e01_read + :members: + :undoc-members: + :show-inheritance: + +Example 2: Write +---------------- + +.. automodule:: bonobo.examples.tutorials.tut02e02_write + :members: + :undoc-members: + :show-inheritance: + +Example 3: Write as map +----------------------- + +.. 
automodule:: bonobo.examples.tutorials.tut02e03_writeasmap + :members: + :undoc-members: + :show-inheritance: + + diff --git a/docs/tutorial/tut01.rst b/docs/tutorial/tut01.rst index ddff6e8..0357926 100644 --- a/docs/tutorial/tut01.rst +++ b/docs/tutorial/tut01.rst @@ -1,58 +1,91 @@ -Basic concepts -============== +Let's get started! +================== -To begin with Bonobo, you need to install it in a working python 3.5+ environment: +To begin with Bonobo, you need to install it in a working python 3.5+ environment, and you'll also need cookiecutter +to bootstrap your project. .. code-block:: shell-session - $ pip install bonobo + $ pip install bonobo cookiecutter See :doc:`/install` for more options. -Let's write a first data transformation -::::::::::::::::::::::::::::::::::::::: -We'll start with the simplest transformation possible. +Create an empty project +::::::::::::::::::::::: -In **Bonobo**, a transformation is a plain old python callable, not more, not less. Let's write one that takes a string -and uppercases it. +Your ETL code will live in ETL projects, which are basically a bunch of files, including python code, that bonobo +can run. + +.. code-block:: shell-session + + bonobo init tutorial + +This will create a `tutorial` directory (`content description here `_). + +To run this project, use: + +.. code-block:: shell-session + + bonobo run tutorial + + +Write a first transformation +:::::::::::::::::::::::::::: + +Open `tutorial/__main__.py`, and delete all the code here. + +A transformation can be whatever python can call, having inputs and outputs. Simplest transformations are functions. + +Let's write one: .. code-block:: python - def uppercase(x: str): + def transform(x): return x.upper() -Pretty straightforward. +Easy. -You could even use :func:`str.upper` directly instead of writing a wrapper, as a type's method (unbound) will take an -instance of this type as its first parameter (what you'd call `self` in your method). +.. 
note:: -The type annotations written here are not used, but can make your code much more readable, and may very well be used as -validators in the future. + This is about the same as :func:`str.upper`, and in the real world, you'd use it directly. -Let's write two more transformations: a generator to produce the data to be transformed, and something that outputs it, -because, yeah, feedback is cool. +Let's write two more transformations for the "extract" and "load" steps. In this example, we'll generate the data from +scratch, and we'll use stdout to simulate data-persistence. .. code-block:: python - def generate_data(): + def extract(): yield 'foo' yield 'bar' yield 'baz' - def output(x: str): + def load(x): print(x) -Once again, you could have skipped the pain of writing this and simply use an iterable to generate the data and the -builtin :func:`print` for the output, but we'll stick to writing our own transformations for now. +Bonobo makes no difference between generators (yielding functions) and regular functions. It will, in all cases, iterate +on things returned, and a normal function will just be seen as a generator that yields only once. -Let's chain the three transformations together and run the transformation graph: +.. note:: + + Once again, :func:`print` would be used directly in a real-world transformation. + + +Create a transformation graph +::::::::::::::::::::::::::::: + +Bonobo has two main roles: + +* Execute the transformations in independent threads +* Pass the outputs of one thread to other thread(s). + +To do this, it needs to know what data-flow you want to achieve, and you'll use a :class:`bonobo.Graph` to describe it. .. 
code-block:: python import bonobo - graph = bonobo.Graph(generate_data, uppercase, output) + graph = bonobo.Graph(extract, transform, load) if __name__ == '__main__': bonobo.run(graph) @@ -64,14 +97,60 @@ Let's chain the three transformations together and run the transformation graph: stylesheet = "../_static/graphs.css"; BEGIN [shape="point"]; - BEGIN -> "generate_data" -> "uppercase" -> "output"; + BEGIN -> "extract" -> "transform" -> "load"; } -We use the :func:`bonobo.run` helper that hides the underlying object composition necessary to actually run the -transformations in parallel, because it's simpler. +.. note:: -Depending on what you're doing, you may use the shorthand helper method, or the verbose one. Always favor the shorter, -if you don't need to tune the graph or the execution strategy (see below). + The `if __name__ == '__main__':` section is not required, unless you want to run it directly using the python + interpreter. + + +Execute the job +::::::::::::::: + +Save `tutorial/__main__.py` and execute your transformation: + +.. code-block:: shell-session + + bonobo run tutorial + +This example is available in :mod:`bonobo.examples.tutorials.tut01e01`, and you can also run it as a module: + +.. code-block:: shell-session + + bonobo run -m bonobo.examples.tutorials.tut01e01 + + +Rewrite it using builtins +::::::::::::::::::::::::: + +There is a much simpler way to describe an equivalent graph: + +.. code-block:: python + + import bonobo + + graph = bonobo.Graph( + ['foo', 'bar', 'baz',], + str.upper, + print, + ) + + if __name__ == '__main__': + bonobo.run(graph) + +We use a shortcut notation for the generator, with a list. Bonobo will wrap an iterable as a generator by itself if it +is added in a graph. + +This example is available in :mod:`bonobo.examples.tutorials.tut01e02`, and you can also run it as a module: + +.. 
code-block:: shell-session + + bonobo run -m bonobo.examples.tutorials.tut01e02 + +You can now jump to the next part (:doc:`tut02`), or read a small summary of concepts and definitions introduced here +below. Takeaways ::::::::: @@ -79,7 +158,7 @@ Takeaways ① The :class:`bonobo.Graph` class is used to represent a data-processing pipeline. It can represent simple list-like linear graphs, like here, but it can also represent much more complex graphs, with -branches and cycles. +forks and joins. This is what the graph we defined looks like: @@ -97,10 +176,10 @@ either `return` or `yield` data to send it to the next step. Regular functions ( each call is guaranteed to return exactly one result, while generators (using `yield`) should be prefered if the number of output lines for a given input varies. -③ The `Graph` instance, or `transformation graph` is then executed using an `ExecutionStrategy`. You did not use it -directly in this tutorial, but :func:`bonobo.run` created an instance of :class:`bonobo.ThreadPoolExecutorStrategy` -under the hood (which is the default strategy). Actual behavior of an execution will depend on the strategy chosen, but -the default should be fine in most of the basic cases. +③ The `Graph` instance, or `transformation graph` is executed using an `ExecutionStrategy`. You won't use it directly, +but :func:`bonobo.run` created an instance of :class:`bonobo.ThreadPoolExecutorStrategy` under the hood (the default +strategy). Actual behavior of an execution will depend on the strategy chosen, but the default should be fine for most +cases. ④ Before actually executing the `transformations`, the `ExecutorStrategy` instance will wrap each component in an `execution context`, whose responsibility is to hold the state of the transformation. 
It enables to keep the @@ -111,21 +190,22 @@ Concepts and definitions * Transformation: a callable that takes input (as call parameters) and returns output(s), either as its return value or by yielding values (a.k.a returning a generator). -* Transformation graph (or Graph): a set of transformations tied together in a :class:`bonobo.Graph` instance, which is a simple - directed acyclic graph (also refered as a DAG, sometimes). -* Node: a transformation within the context of a transformation graph. The node defines what to do with a - transformation's output, and especially what other nodes to feed with the output. + +* Transformation graph (or Graph): a set of transformations tied together in a :class:`bonobo.Graph` instance, which is + a directed acyclic graph (or DAG). + +* Node: a graph element, most probably a transformation in a graph. + * Execution strategy (or strategy): a way to run a transformation graph. It's responsibility is mainly to parallelize (or not) the transformations, on one or more process and/or computer, and to setup the right queuing mechanism for transformations' inputs and outputs. + * Execution context (or context): a wrapper around a node that holds the state for it. If the node needs state, there - are tools available in bonobo to feed it to the transformation using additional call parameters, and so every - transformation will be atomic. + are tools available in bonobo to feed it to the transformation using additional call parameters, keeping + transformations stateless. Next :::: -You now know all the basic concepts necessary to build (batch-like) data processors. - -Time to jump to the second part: :doc:`tut02` +Time to jump to the second part: :doc:`tut02`. 
diff --git a/docs/tutorial/tut02.rst b/docs/tutorial/tut02.rst index 0c96a40..685e455 100644 --- a/docs/tutorial/tut02.rst +++ b/docs/tutorial/tut02.rst @@ -1,11 +1,14 @@ Working with files ================== -Bonobo would be a bit useless if the aim was just to uppercase small lists of strings. +Bonobo would be pointless if the aim was just to uppercase small lists of strings. In fact, Bonobo should not be used if you don't expect any gain from parallelization/distribution of tasks. -Let's take the following graph as an example: +Some background... +:::::::::::::::::: + +Let's take the following graph: .. graphviz:: @@ -16,8 +19,8 @@ Let's take the following graph as an example: "B" -> "D"; } -The execution strategy does a bit of under the scene work, wrapping every component in a thread (assuming you're using -the :class:`bonobo.strategies.ThreadPoolExecutorStrategy`). +When run, the execution strategy wraps every component in a thread (assuming you're using the default +:class:`bonobo.strategies.ThreadPoolExecutorStrategy`). Bonobo will send each line of data in the input node's thread (here, `A`). Now, each time `A` *yields* or *returns* something, it will be pushed on `B` input :class:`queue.Queue`, and will be consumed by `B`'s thread. @@ -25,9 +28,11 @@ something, it will be pushed on `B` input :class:`queue.Queue`, and will be cons When there is more than one node linked as the output of a node (for example, with `B`, `C`, and `D`) , the same thing happens except that each result coming out of `B` will be sent to both on `C` and `D` input :class:`queue.Queue`. -The great thing is that you generally don't have to think about it. Just be aware that your components will be run in -parallel (with the default strategy), and don't worry too much about blocking components, as they won't block their -siblings when run in bonobo. 
+One thing to keep in mind here is that as the objects are passed from thread to thread, you need to write "pure" +transformations (see :doc:`/guide/purity`). + +You generally don't have to think about it. Just be aware that your nodes will run in parallel, and don't worry +too much about blocking nodes, as they won't block other nodes. That being said, let's manipulate some files. @@ -38,9 +43,10 @@ There are a few component builders available in **Bonobo** that let you read fro All readers work the same way. They need a filesystem to work with, and open a "path" they will read from. -* :class:`bonobo.io.FileReader` -* :class:`bonobo.io.JsonReader` -* :class:`bonobo.io.CsvReader` +* :class:`bonobo.CsvReader` +* :class:`bonobo.FileReader` +* :class:`bonobo.JsonReader` +* :class:`bonobo.PickleReader` We'll use a text file that was generated using Bonobo from the "liste-des-cafes-a-un-euro" dataset made available by Mairie de Paris under the Open Database License (ODbL). You can `explore the original dataset @@ -49,35 +55,14 @@ Mairie de Paris under the Open Database License (ODbL). You can `explore the ori You'll need the `example dataset `_, available in **Bonobo**'s repository. -.. literalinclude:: ../../bonobo/examples/tutorials/tut02_01_read.py +.. literalinclude:: ../../bonobo/examples/tutorials/tut02e01_read.py :language: python -You can run this script directly using the python interpreter: +You can run this example as a module: .. code-block:: shell-session - $ python bonobo/examples/tutorials/tut02_01_read.py - -Another option is to use the bonobo cli, which allows more flexibility: - -.. code-block:: shell-session - - $ bonobo run bonobo/examples/tutorials/tut02_01_read.py - -Using bonobo command line has a few advantages. - -It will look for one and only one :class:`bonobo.Graph` instance in the file given as argument, configure an execution -strategy, eventually plugins, and execute it. 
It has the benefit of allowing to tune the "artifacts" surrounding the -transformation graph on command line (verbosity, plugins ...), and it will also ease the transition to run -transformation graphs in containers, as the syntax will be the same. Of course, it is not required, and the -containerization capabilities are provided by an optional and separate python package. - -It also change a bit the way you can configure service dependencies. The CLI won't run the `if __name__ == '__main__'` -block, and thus it won't get the configured services passed to :func:`bonobo.run`. Instead, one option to configure -services is to define a `get_services()` function in a -`_services.py `_ file. - -There will be more options using the CLI or environment to override things soon. + $ bonobo run -m bonobo.examples.tutorials.tut02e01_read Writing to files :::::::::::::::: @@ -86,22 +71,34 @@ Let's split this file's each lines on the first comma and store a json file mapp Here are, like the readers, the classes available to write files -* :class:`bonobo.io.FileWriter` -* :class:`bonobo.io.JsonWriter` -* :class:`bonobo.io.CsvWriter` +* :class:`bonobo.CsvWriter` +* :class:`bonobo.FileWriter` +* :class:`bonobo.JsonWriter` +* :class:`bonobo.PickleWriter` Let's write a first implementation: -.. literalinclude:: ../../bonobo/examples/tutorials/tut02_02_write.py +.. literalinclude:: ../../bonobo/examples/tutorials/tut02e02_write.py :language: python -You can run it and read the output file, you'll see it misses the "map" part of the question. Let's extend -:class:`bonobo.io.JsonWriter` to finish the job: +(run it with :code:`bonobo run -m bonobo.examples.tutorials.tut02e02_write` or :code:`bonobo run myfile.py`) -.. literalinclude:: ../../bonobo/examples/tutorials/tut02_03_writeasmap.py +If you read the output file, you'll see it misses the "map" part of the problem. + +Let's extend :class:`bonobo.JsonWriter` to finish the job: + +.. 
literalinclude:: ../../bonobo/examples/tutorials/tut02e03_writeasmap.py :language: python -You can now run it again, it should produce a nice map. We favored a bit hackish solution here instead of constructing a -map in python then passing the whole to :func:`json.dumps` because we want to work with streams, if you have to -construct the whole data structure in python, you'll loose a lot of bonobo's benefits. +(run it with :code:`bonobo run -m bonobo.examples.tutorials.tut02e03_writeasmap` or :code:`bonobo run myfile.py`) +It should produce a nice map. + +We favored a slightly hackish solution here instead of constructing a map in python then passing the whole to +:func:`json.dumps` because we want to work with streams: if you have to construct the whole data structure in python, +you'll lose a lot of bonobo's benefits. + +Next +:::: + +Time to write some more advanced transformations, with service dependencies: :doc:`tut03`. diff --git a/docs/tutorial/tut03.rst b/docs/tutorial/tut03.rst new file mode 100644 index 0000000..dc1e4c2 --- /dev/null +++ b/docs/tutorial/tut03.rst @@ -0,0 +1,9 @@ +Configurables and Services +========================== + +TODO + +Next +:::: + +:doc:`tut04`. 
diff --git a/docs/tutorial/tut04.rst b/docs/tutorial/tut04.rst new file mode 100644 index 0000000..18bea48 --- /dev/null +++ b/docs/tutorial/tut04.rst @@ -0,0 +1,4 @@ +Working with databases +====================== + +TODO diff --git a/requirements-dev.txt b/requirements-dev.txt index 18f7807..c346203 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,24 +1,32 @@ -e .[dev] alabaster==0.7.10 +arrow==0.10.0 babel==2.4.0 +binaryornot==0.4.3 certifi==2017.4.17 chardet==3.0.3 +click==6.7 +cookiecutter==1.5.1 coverage==4.4.1 docutils==0.13.1 +future==0.16.0 idna==2.5 imagesize==0.7.1 +jinja2-time==0.2.0 jinja2==2.9.6 markupsafe==1.0 +poyo==0.4.1 py==1.4.33 pygments==2.2.0 pytest-cov==2.5.1 pytest-timeout==1.2.0 pytest==3.1.0 +python-dateutil==2.6.0 pytz==2017.2 requests==2.16.5 six==1.10.0 snowballstemmer==1.2.1 -sphinx==1.6.1 +sphinx==1.6.2 sphinxcontrib-websupport==1.0.1 -typing==3.6.1 urllib3==1.21.1 +whichcraft==0.4.1 diff --git a/setup.py b/setup.py index 3aa6575..618ec41 100644 --- a/setup.py +++ b/setup.py @@ -58,8 +58,8 @@ setup( ], extras_require={ 'dev': [ - 'coverage (>= 4.4, < 5.0)', 'pytest (>= 3.1, < 4.0)', 'pytest-cov (>= 2.5, < 3.0)', - 'pytest-timeout (>= 1, < 2)', 'sphinx (>= 1.6, < 2.0)' + 'cookiecutter (>= 1.5, < 1.6)', 'coverage (>= 4.4, < 5.0)', 'pytest (>= 3.1, < 4.0)', + 'pytest-cov (>= 2.5, < 3.0)', 'pytest-timeout (>= 1, < 2)', 'sphinx (>= 1.6, < 2.0)' ], 'docker': ['bonobo-docker'], 'jupyter': ['ipywidgets (>= 6.0.0.beta5)', 'jupyter (>= 1.0, < 1.1)']