adds tutorials and documentation for file readers and writers.
This commit is contained in:
@ -10,6 +10,7 @@ __all__ += ['Bag', 'Graph']
|
|||||||
|
|
||||||
# Filesystem. This is a shortcut from the excellent filesystem2 library, that we make available here for convenience.
|
# Filesystem. This is a shortcut from the excellent filesystem2 library, that we make available here for convenience.
|
||||||
from fs import open_fs as _open_fs
|
from fs import open_fs as _open_fs
|
||||||
|
|
||||||
open_fs = lambda url, *args, **kwargs: _open_fs(str(url), *args, **kwargs)
|
open_fs = lambda url, *args, **kwargs: _open_fs(str(url), *args, **kwargs)
|
||||||
__all__ += ['open_fs']
|
__all__ += ['open_fs']
|
||||||
|
|
||||||
@ -38,7 +39,11 @@ def get_examples_path(*pathsegments):
|
|||||||
return str(pathlib.Path(os.path.dirname(__file__), 'examples', *pathsegments))
|
return str(pathlib.Path(os.path.dirname(__file__), 'examples', *pathsegments))
|
||||||
|
|
||||||
|
|
||||||
|
def open_examples_fs(*pathsegments):
|
||||||
|
return open_fs(get_examples_path(*pathsegments))
|
||||||
|
|
||||||
__all__.append(get_examples_path.__name__)
|
__all__.append(get_examples_path.__name__)
|
||||||
|
__all__.append(open_examples_fs.__name__)
|
||||||
|
|
||||||
|
|
||||||
def _is_interactive_console():
|
def _is_interactive_console():
|
||||||
|
|||||||
5
bonobo/examples/tutorials/_services.py
Normal file
5
bonobo/examples/tutorials/_services.py
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
from bonobo import open_examples_fs
|
||||||
|
|
||||||
|
|
||||||
|
def get_services():
|
||||||
|
return {'fs': open_examples_fs('datasets')}
|
||||||
@ -1,15 +1,11 @@
|
|||||||
import bonobo
|
import bonobo
|
||||||
from bonobo.commands.run import get_default_services
|
|
||||||
|
|
||||||
graph = bonobo.Graph(
|
graph = bonobo.Graph(
|
||||||
bonobo.FileReader(path='datasets/coffeeshops.txt'),
|
bonobo.FileReader(path='coffeeshops.txt'),
|
||||||
print,
|
print,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_services():
|
|
||||||
return {'fs': bonobo.open_fs(bonobo.get_examples_path())}
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
bonobo.run(graph, services=get_default_services(__file__, get_services()))
|
bonobo.run(graph, services={
|
||||||
|
'fs': bonobo.open_examples_fs('datasets')
|
||||||
|
})
|
||||||
|
|||||||
17
bonobo/examples/tutorials/tut02_02_write.py
Normal file
17
bonobo/examples/tutorials/tut02_02_write.py
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
import bonobo
|
||||||
|
|
||||||
|
|
||||||
|
def split_one(line):
|
||||||
|
return line.split(', ', 1)
|
||||||
|
|
||||||
|
|
||||||
|
graph = bonobo.Graph(
|
||||||
|
bonobo.FileReader(path='coffeeshops.txt'),
|
||||||
|
split_one,
|
||||||
|
bonobo.JsonWriter(path='coffeeshops.json'),
|
||||||
|
)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
bonobo.run(graph, services={
|
||||||
|
'fs': bonobo.open_examples_fs('datasets')
|
||||||
|
})
|
||||||
27
bonobo/examples/tutorials/tut02_03_writeasmap.py
Normal file
27
bonobo/examples/tutorials/tut02_03_writeasmap.py
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
import bonobo, json
|
||||||
|
|
||||||
|
|
||||||
|
def split_one_to_map(line):
|
||||||
|
k, v = line.split(', ', 1)
|
||||||
|
return {k: v}
|
||||||
|
|
||||||
|
|
||||||
|
class MyJsonWriter(bonobo.JsonWriter):
|
||||||
|
prefix, suffix = '{', '}'
|
||||||
|
|
||||||
|
def write(self, fs, file, lineno, row):
|
||||||
|
return bonobo.FileWriter.write(
|
||||||
|
self, fs, file, lineno, json.dumps(row)[1:-1]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
graph = bonobo.Graph(
|
||||||
|
bonobo.FileReader(path='coffeeshops.txt'),
|
||||||
|
split_one_to_map,
|
||||||
|
MyJsonWriter(path='coffeeshops.json'),
|
||||||
|
)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
bonobo.run(graph, services={
|
||||||
|
'fs': bonobo.open_examples_fs('datasets')
|
||||||
|
})
|
||||||
@ -10,6 +10,7 @@ __all__ = [
|
|||||||
|
|
||||||
class JsonHandler:
|
class JsonHandler:
|
||||||
eol = ',\n'
|
eol = ',\n'
|
||||||
|
prefix, suffix = '[', ']'
|
||||||
|
|
||||||
|
|
||||||
class JsonReader(JsonHandler, FileReader):
|
class JsonReader(JsonHandler, FileReader):
|
||||||
@ -24,9 +25,9 @@ class JsonReader(JsonHandler, FileReader):
|
|||||||
class JsonWriter(JsonHandler, FileWriter):
|
class JsonWriter(JsonHandler, FileWriter):
|
||||||
@ContextProcessor
|
@ContextProcessor
|
||||||
def envelope(self, context, fs, file, lineno):
|
def envelope(self, context, fs, file, lineno):
|
||||||
file.write('[\n')
|
file.write(self.prefix)
|
||||||
yield
|
yield
|
||||||
file.write('\n]')
|
file.write(self.suffix)
|
||||||
|
|
||||||
def write(self, fs, file, lineno, row):
|
def write(self, fs, file, lineno, row):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
Bonobo with Jupyter
|
Bonobo with Jupyter
|
||||||
==================
|
===================
|
||||||
|
|
||||||
There is a builtin plugin that integrates (kind of minimalistically, for now) bonobo within jupyter notebooks, so
|
There is a builtin plugin that integrates (kind of minimalistically, for now) bonobo within jupyter notebooks, so
|
||||||
you can read the execution status of a graph within a nice (ok not so nice) html/javascript widget.
|
you can read the execution status of a graph within a nice (ok not so nice) html/javascript widget.
|
||||||
@ -9,7 +9,11 @@ See https://github.com/jupyter-widgets/widget-cookiecutter for the base template
|
|||||||
Installation
|
Installation
|
||||||
::::::::::::
|
::::::::::::
|
||||||
|
|
||||||
To install the widget::
|
Install `bonobo` with the **jupyter** extra::
|
||||||
|
|
||||||
|
pip install bonobo[jupyter]
|
||||||
|
|
||||||
|
Install the jupyter extension::
|
||||||
|
|
||||||
jupyter nbextension enable --py --sys-prefix bonobo.ext.jupyter
|
jupyter nbextension enable --py --sys-prefix bonobo.ext.jupyter
|
||||||
|
|
||||||
|
|||||||
@ -13,17 +13,18 @@ concepts you'll see everywhere in the software.
|
|||||||
If you're not familiar with python, you should first read :doc:`./python`.
|
If you're not familiar with python, you should first read :doc:`./python`.
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 2
|
:maxdepth: 2
|
||||||
|
|
||||||
tut01
|
tut01
|
||||||
tut02
|
tut02
|
||||||
|
|
||||||
Where to go next?
|
What's next?
|
||||||
:::::::::::::::::
|
::::::::::::
|
||||||
|
|
||||||
When you're done with the tutorial, you may be interested in the following next steps:
|
Read a few examples
|
||||||
|
-------------------
|
||||||
|
|
||||||
Read the :doc:`../reference/examples`
|
* :doc:`../reference/examples`
|
||||||
|
|
||||||
Read about best development practices
|
Read about best development practices
|
||||||
-------------------------------------
|
-------------------------------------
|
||||||
|
|||||||
@ -1,6 +1,11 @@
|
|||||||
Just enough Python for Bonobo
|
Just enough Python for Bonobo
|
||||||
=============================
|
=============================
|
||||||
|
|
||||||
|
.. todo::
|
||||||
|
|
||||||
|
This is a work in progress and it is not yet available. Please come back later or even better, help us write this
|
||||||
|
guide!
|
||||||
|
|
||||||
This guide is intended to help programmers or enthusiasts to grasp the python basics necessary to use Bonobo. It should
|
This guide is intended to help programmers or enthusiasts to grasp the python basics necessary to use Bonobo. It should
|
||||||
definitely not be considered as a general python introduction, nor a deep dive into details.
|
definitely not be considered as a general python introduction, nor a deep dive into details.
|
||||||
|
|
||||||
|
|||||||
@ -127,6 +127,5 @@ Next
|
|||||||
|
|
||||||
You now know all the basic concepts necessary to build (batch-like) data processors.
|
You now know all the basic concepts necessary to build (batch-like) data processors.
|
||||||
|
|
||||||
If you're confident with this part, let's get to a more real world example, using files and nice console output:
|
Time to jump to the second part: :doc:`tut02`
|
||||||
:doc:`basics2`
|
|
||||||
|
|
||||||
|
|||||||
@ -1,8 +1,9 @@
|
|||||||
Working with files
|
Working with files
|
||||||
==================
|
==================
|
||||||
|
|
||||||
Bonobo would not be of any use if the aim was to uppercase small lists of strings. In fact, Bonobo should not be used
|
Bonobo would be a bit useless if the aim was just to uppercase small lists of strings.
|
||||||
if you don't expect any gain from parallelization/distribution of tasks.
|
|
||||||
|
In fact, Bonobo should not be used if you don't expect any gain from parallelization/distribution of tasks.
|
||||||
|
|
||||||
Let's take the following graph as an example:
|
Let's take the following graph as an example:
|
||||||
|
|
||||||
@ -12,52 +13,95 @@ Let's take the following graph as an example:
|
|||||||
rankdir = LR;
|
rankdir = LR;
|
||||||
BEGIN [shape="point"];
|
BEGIN [shape="point"];
|
||||||
BEGIN -> "A" -> "B" -> "C";
|
BEGIN -> "A" -> "B" -> "C";
|
||||||
|
"B" -> "D";
|
||||||
}
|
}
|
||||||
|
|
||||||
The execution strategy does a bit of behind-the-scenes work, wrapping every component in a thread (assuming you're using
|
The execution strategy does a bit of behind-the-scenes work, wrapping every component in a thread (assuming you're using
|
||||||
the :class:`bonobo.ThreadPoolExecutorStrategy`), which allows to start running `B` as soon as `A` yielded the first line
|
the :class:`bonobo.strategies.ThreadPoolExecutorStrategy`).
|
||||||
of data, and `C` as soon as `B` yielded the first line of data, even if `A` or `B` still have data to yield.
|
|
||||||
|
Bonobo will send each line of data in the input node's thread (here, `A`). Now, each time `A` *yields* or *returns*
|
||||||
|
something, it will be pushed on `B`'s input :class:`queue.Queue`, and will be consumed by `B`'s thread.
|
||||||
|
|
||||||
|
When there is more than one node linked as the output of a node (for example, with `B`, `C`, and `D`), the same thing
|
||||||
|
happens except that each result coming out of `B` will be sent to the input :class:`queue.Queue` of both `C` and `D`.
|
||||||
|
|
||||||
The great thing is that you generally don't have to think about it. Just be aware that your components will be run in
|
The great thing is that you generally don't have to think about it. Just be aware that your components will be run in
|
||||||
parallel (with the default strategy), and don't worry too much about blocking components, as they won't block their
|
parallel (with the default strategy), and don't worry too much about blocking components, as they won't block their
|
||||||
siblings when run in bonobo.
|
siblings when run in bonobo.
|
||||||
|
|
||||||
That being said, let's try to write a more real-world like transformation.
|
That being said, let's manipulate some files.
|
||||||
|
|
||||||
Reading a file
|
Reading a file
|
||||||
::::::::::::::
|
::::::::::::::
|
||||||
|
|
||||||
There are a few component builders available in **Bonobo** that let you read files. You should at least know about the
|
There are a few component builders available in **Bonobo** that let you read from (or write to) files.
|
||||||
following:
|
|
||||||
|
|
||||||
* :class:`bonobo.io.FileReader`
|
All readers work the same way. They need a filesystem to work with, and open a "path" they will read from.
|
||||||
* :class:`bonobo.io.JsonReader`
|
|
||||||
* :class:`bonobo.io.CsvReader`
|
|
||||||
|
|
||||||
Reading a file is as simple as using one of those, and for the example, we'll use a text file that was generated using
|
* :class:`bonobo.FileReader`
|
||||||
Bonobo from the "liste-des-cafes-a-un-euro" dataset made available by Mairie de Paris under the Open Database
|
* :class:`bonobo.JsonReader`
|
||||||
License (ODbL). You can `explore the original dataset <https://opendata.paris.fr/explore/dataset/liste-des-cafes-a-un-euro/information/>`_.
|
* :class:`bonobo.CsvReader`
|
||||||
You'll need the example dataset, available in **Bonobo**'s repository.
|
|
||||||
|
|
||||||
.. literalinclude:: ../../examples/tut02_01_read.py
|
We'll use a text file that was generated using Bonobo from the "liste-des-cafes-a-un-euro" dataset made available by
|
||||||
|
Mairie de Paris under the Open Database License (ODbL). You can `explore the original dataset
|
||||||
|
<https://opendata.paris.fr/explore/dataset/liste-des-cafes-a-un-euro/information/>`_.
|
||||||
|
|
||||||
|
You'll need the `example dataset <https://github.com/python-bonobo/bonobo/blob/0.2/bonobo/examples/datasets/coffeeshops.txt>`_,
|
||||||
|
available in **Bonobo**'s repository.
|
||||||
|
|
||||||
|
.. literalinclude:: ../../bonobo/examples/tutorials/tut02_01_read.py
|
||||||
:language: python
|
:language: python
|
||||||
|
|
||||||
Until then, we ran the file directly using our python interpreter, but there are other options, one of them being
|
You can run this script directly using the python interpreter:
|
||||||
`bonobo run`. This command allows to run a graph defined by a python file, and is replacing the :func:`bonobo.run`
|
|
||||||
helper. It's the exact reason why we call :func:`bonobo.run` in the `if __name__ == '__main__'` block, to only
|
|
||||||
instantiate it if it is run directly.
|
|
||||||
|
|
||||||
Using bonobo command line has a few advantages. It will look for one and only one :class:`bonobo.Graph` instance defined
|
|
||||||
in the file given as argument, configure an execution strategy, eventually plugins, and execute it. It has the benefit
|
|
||||||
of allowing to tune the "artifacts" surrounding the transformation graph on command line (verbosity, plugins ...), and
|
|
||||||
it will also ease the transition to run transformation graphs in containers, as the syntax will be the same. Of course,
|
|
||||||
it is not required, and the containerization capabilities are provided by an optional and separate python package.
|
|
||||||
|
|
||||||
.. code-block:: shell-session
|
.. code-block:: shell-session
|
||||||
|
|
||||||
$ bonobo run examples/tut02_01_read.py
|
$ python bonobo/examples/tutorials/tut02_01_read.py
|
||||||
|
|
||||||
|
Another option is to use the bonobo cli, which allows more flexibility:
|
||||||
|
|
||||||
|
.. code-block:: shell-session
|
||||||
|
|
||||||
|
$ bonobo run bonobo/examples/tutorials/tut02_01_read.py
|
||||||
|
|
||||||
|
Using bonobo command line has a few advantages.
|
||||||
|
|
||||||
|
It will look for one and only one :class:`bonobo.Graph` instance in the file given as argument, configure an execution
|
||||||
|
strategy, optionally plugins, and execute it. It has the benefit of allowing you to tune the "artifacts" surrounding the
|
||||||
|
transformation graph on the command line (verbosity, plugins ...), and it will also ease the transition to running
|
||||||
|
transformation graphs in containers, as the syntax will be the same. Of course, it is not required, and the
|
||||||
|
containerization capabilities are provided by an optional and separate python package.
|
||||||
|
|
||||||
|
It also slightly changes the way you can configure service dependencies. The CLI won't run the `if __name__ == '__main__'`
|
||||||
|
block, and thus it won't get the configured services passed to :func:`bonobo.run`. Instead, one option to configure
|
||||||
|
services is to define a `get_services()` function in a
|
||||||
|
`_services.py <https://github.com/python-bonobo/bonobo/blob/0.2/bonobo/examples/tutorials/_services.py>`_ file.
|
||||||
|
|
||||||
|
There will be more options using the CLI or environment to override things soon.
|
||||||
|
|
||||||
|
Writing to files
|
||||||
|
::::::::::::::::
|
||||||
|
|
||||||
|
Let's split each line of this file on the first comma and store a json file mapping coffee shop names to their addresses.
|
||||||
|
|
||||||
|
As with the readers, here are the classes available to write files:
|
||||||
|
|
||||||
|
* :class:`bonobo.FileWriter`
|
||||||
|
* :class:`bonobo.JsonWriter`
|
||||||
|
* :class:`bonobo.CsvWriter`
|
||||||
|
|
||||||
|
Let's write a first implementation:
|
||||||
|
|
||||||
|
.. literalinclude:: ../../bonobo/examples/tutorials/tut02_02_write.py
|
||||||
|
:language: python
|
||||||
|
|
||||||
|
You can run it and read the output file; you'll see it misses the "map" part of the task. Let's extend
|
||||||
|
:class:`bonobo.JsonWriter` to finish the job:
|
||||||
|
|
||||||
|
.. literalinclude:: ../../bonobo/examples/tutorials/tut02_03_writeasmap.py
|
||||||
|
:language: python
|
||||||
|
|
||||||
|
You can now run it again, it should produce a nice map. We favored a slightly hackish solution here instead of constructing a
|
||||||
|
map in python then passing the whole to :func:`json.dumps` because we want to work with streams; if you have to
|
||||||
|
construct the whole data structure in python, you'll lose a lot of bonobo's benefits.
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user