From 4ee3fd3be939cf90ccb50680eb6e766391c37c42 Mon Sep 17 00:00:00 2001 From: Romain Dorgueil Date: Sun, 30 Apr 2017 11:14:34 +0200 Subject: [PATCH] adds tutorials and documentation for file readers and writers. --- bonobo/_api.py | 5 + bonobo/examples/tutorials/_services.py | 5 + bonobo/examples/tutorials/tut02_01_read.py | 12 +-- bonobo/examples/tutorials/tut02_02_write.py | 17 ++++ .../examples/tutorials/tut02_03_writeasmap.py | 27 ++++++ bonobo/io/json.py | 5 +- docs/guide/ext/jupyter.rst | 8 +- docs/tutorial/index.rst | 15 +-- docs/tutorial/python.rst | 5 + docs/tutorial/tut01.rst | 3 +- docs/tutorial/tut02.rst | 96 ++++++++++++++----- 11 files changed, 151 insertions(+), 47 deletions(-) create mode 100644 bonobo/examples/tutorials/_services.py create mode 100644 bonobo/examples/tutorials/tut02_02_write.py create mode 100644 bonobo/examples/tutorials/tut02_03_writeasmap.py diff --git a/bonobo/_api.py b/bonobo/_api.py index c57a255..70d033f 100644 --- a/bonobo/_api.py +++ b/bonobo/_api.py @@ -10,6 +10,7 @@ __all__ += ['Bag', 'Graph'] # Filesystem. This is a shortcut from the excellent filesystem2 library, that we make available there for convenience. 
from fs import open_fs as _open_fs + open_fs = lambda url, *args, **kwargs: _open_fs(str(url), *args, **kwargs) __all__ += ['open_fs'] @@ -38,7 +39,11 @@ def get_examples_path(*pathsegments): return str(pathlib.Path(os.path.dirname(__file__), 'examples', *pathsegments)) +def open_examples_fs(*pathsegments): + return open_fs(get_examples_path(*pathsegments)) + __all__.append(get_examples_path.__name__) +__all__.append(open_examples_fs.__name__) def _is_interactive_console(): diff --git a/bonobo/examples/tutorials/_services.py b/bonobo/examples/tutorials/_services.py new file mode 100644 index 0000000..25d783d --- /dev/null +++ b/bonobo/examples/tutorials/_services.py @@ -0,0 +1,5 @@ +from bonobo import open_examples_fs + + +def get_services(): + return {'fs': open_examples_fs('datasets')} diff --git a/bonobo/examples/tutorials/tut02_01_read.py b/bonobo/examples/tutorials/tut02_01_read.py index 64816ce..e934c79 100644 --- a/bonobo/examples/tutorials/tut02_01_read.py +++ b/bonobo/examples/tutorials/tut02_01_read.py @@ -1,15 +1,11 @@ import bonobo -from bonobo.commands.run import get_default_services graph = bonobo.Graph( - bonobo.FileReader(path='datasets/coffeeshops.txt'), + bonobo.FileReader(path='coffeeshops.txt'), print, ) - -def get_services(): - return {'fs': bonobo.open_fs(bonobo.get_examples_path())} - - if __name__ == '__main__': - bonobo.run(graph, services=get_default_services(__file__, get_services())) + bonobo.run(graph, services={ + 'fs': bonobo.open_examples_fs('datasets') + }) diff --git a/bonobo/examples/tutorials/tut02_02_write.py b/bonobo/examples/tutorials/tut02_02_write.py new file mode 100644 index 0000000..d63dd47 --- /dev/null +++ b/bonobo/examples/tutorials/tut02_02_write.py @@ -0,0 +1,17 @@ +import bonobo + + +def split_one(line): + return line.split(', ', 1) + + +graph = bonobo.Graph( + bonobo.FileReader(path='coffeeshops.txt'), + split_one, + bonobo.JsonWriter(path='coffeeshops.json'), +) + +if __name__ == '__main__': + bonobo.run(graph, 
services={ + 'fs': bonobo.open_examples_fs('datasets') + }) diff --git a/bonobo/examples/tutorials/tut02_03_writeasmap.py b/bonobo/examples/tutorials/tut02_03_writeasmap.py new file mode 100644 index 0000000..e131acd --- /dev/null +++ b/bonobo/examples/tutorials/tut02_03_writeasmap.py @@ -0,0 +1,27 @@ +import bonobo, json + + +def split_one_to_map(line): + k, v = line.split(', ', 1) + return {k: v} + + +class MyJsonWriter(bonobo.JsonWriter): + prefix, suffix = '{', '}' + + def write(self, fs, file, lineno, row): + return bonobo.FileWriter.write( + self, fs, file, lineno, json.dumps(row)[1:-1] + ) + + +graph = bonobo.Graph( + bonobo.FileReader(path='coffeeshops.txt'), + split_one_to_map, + MyJsonWriter(path='coffeeshops.json'), +) + +if __name__ == '__main__': + bonobo.run(graph, services={ + 'fs': bonobo.open_examples_fs('datasets') + }) diff --git a/bonobo/io/json.py b/bonobo/io/json.py index 1b9ab46..34f4f0f 100644 --- a/bonobo/io/json.py +++ b/bonobo/io/json.py @@ -10,6 +10,7 @@ __all__ = [ class JsonHandler: eol = ',\n' + prefix, suffix = '[', ']' class JsonReader(JsonHandler, FileReader): @@ -24,9 +25,9 @@ class JsonReader(JsonHandler, FileReader): class JsonWriter(JsonHandler, FileWriter): @ContextProcessor def envelope(self, context, fs, file, lineno): - file.write('[\n') + file.write(self.prefix) yield - file.write('\n]') + file.write(self.suffix) def write(self, fs, file, lineno, row): """ diff --git a/docs/guide/ext/jupyter.rst b/docs/guide/ext/jupyter.rst index 2e912c2..b58b70f 100644 --- a/docs/guide/ext/jupyter.rst +++ b/docs/guide/ext/jupyter.rst @@ -1,5 +1,5 @@ Bonobo with Jupyter -================== +=================== There is a builtin plugin that integrates (kind of minimalistically, for now) bonobo within jupyter notebooks, so you can read the execution status of a graph within a nice (ok not so nice) html/javascript widget. 
@@ -9,7 +9,11 @@ See https://github.com/jupyter-widgets/widget-cookiecutter for the base template Installation :::::::::::: -To install the widget:: +Install `bonobo` with the **jupyter** extra:: + + pip install bonobo[jupyter] + +Install the jupyter extension:: jupyter nbextension enable --py --sys-prefix bonobo.ext.jupyter diff --git a/docs/tutorial/index.rst b/docs/tutorial/index.rst index 8fccab4..324fed7 100644 --- a/docs/tutorial/index.rst +++ b/docs/tutorial/index.rst @@ -13,17 +13,18 @@ concepts you'll see everywhere in the software. If you're not familiar with python, you should first read :doc:`./python`. .. toctree:: - :maxdepth: 2 + :maxdepth: 2 - tut01 - tut02 + tut01 + tut02 -Where to go next? -::::::::::::::::: +What's next? +:::::::::::: -When you're done with the tutorial, you may be interested in the following next steps: +Read a few examples +------------------- -Read the :doc:`../reference/examples` +* :doc:`../reference/examples` Read about best development practices ------------------------------------- diff --git a/docs/tutorial/python.rst b/docs/tutorial/python.rst index 13c26a7..dae49b8 100644 --- a/docs/tutorial/python.rst +++ b/docs/tutorial/python.rst @@ -1,6 +1,11 @@ Just enough Python for Bonobo ============================= +.. todo:: + + This is a work in progress and it is not yet available. Please come back later or even better, help us write this + guide! + This guide is intended to help programmers or enthusiasts to grasp the python basics necessary to use Bonobo. It should definately not be considered as a general python introduction, neither a deep dive into details. diff --git a/docs/tutorial/tut01.rst b/docs/tutorial/tut01.rst index d61c544..ddff6e8 100644 --- a/docs/tutorial/tut01.rst +++ b/docs/tutorial/tut01.rst @@ -127,6 +127,5 @@ Next You now know all the basic concepts necessary to build (batch-like) data processors. 
-If you're confident with this part, let's get to a more real world example, using files and nice console output: -:doc:`basics2` +Time to jump to the second part: :doc:`tut02` diff --git a/docs/tutorial/tut02.rst b/docs/tutorial/tut02.rst index f053c89..cae0d5f 100644 --- a/docs/tutorial/tut02.rst +++ b/docs/tutorial/tut02.rst @@ -1,8 +1,9 @@ Working with files ================== -Bonobo would not be of any use if the aim was to uppercase small lists of strings. In fact, Bonobo should not be used -if you don't expect any gain from parallelization/distribution of tasks. +Bonobo would be a bit useless if the aim was just to uppercase small lists of strings. + +In fact, Bonobo should not be used if you don't expect any gain from parallelization/distribution of tasks. Let's take the following graph as an example: @@ -12,52 +13,95 @@ Let's take the following graph as an example: rankdir = LR; BEGIN [shape="point"]; BEGIN -> "A" -> "B" -> "C"; + "B" -> "D"; } The execution strategy does a bit of under the scene work, wrapping every component in a thread (assuming you're using -the :class:`bonobo.ThreadPoolExecutorStrategy`), which allows to start running `B` as soon as `A` yielded the first line -of data, and `C` as soon as `B` yielded the first line of data, even if `A` or `B` still have data to yield. +the :class:`bonobo.strategies.ThreadPoolExecutorStrategy`). + +Bonobo will send each line of data in the input node's thread (here, `A`). Now, each time `A` *yields* or *returns* +something, it will be pushed on `B` input :class:`queue.Queue`, and will be consumed by `B`'s thread. + +When there is more than one node linked as the output of a node (for example, with `B`, `C`, and `D`), the same thing +happens except that each result coming out of `B` will be sent to both `C` and `D` input :class:`queue.Queue`. The great thing is that you generally don't have to think about it.
Just be aware that your components will be run in parallel (with the default strategy), and don't worry too much about blocking components, as they won't block their siblings when run in bonobo. -That being said, let's try to write a more real-world like transformation. +That being said, let's manipulate some files. Reading a file :::::::::::::: -There are a few component builders available in **Bonobo** that let you read files. You should at least know about the -following: +There are a few component builders available in **Bonobo** that let you read from (or write to) files. -* :class:`bonobo.io.FileReader` -* :class:`bonobo.io.JsonReader` -* :class:`bonobo.io.CsvReader` +All readers work the same way. They need a filesystem to work with, and open a "path" they will read from. -Reading a file is as simple as using one of those, and for the example, we'll use a text file that was generated using -Bonobo from the "liste-des-cafes-a-un-euro" dataset made available by Mairie de Paris under the Open Database -License (ODbL). You can `explore the original dataset `_. -You'll need the example dataset, available in **Bonobo**'s repository. +* :class:`bonobo.FileReader` +* :class:`bonobo.JsonReader` +* :class:`bonobo.CsvReader` -.. literalinclude:: ../../examples/tut02_01_read.py +We'll use a text file that was generated using Bonobo from the "liste-des-cafes-a-un-euro" dataset made available by +Mairie de Paris under the Open Database License (ODbL). You can `explore the original dataset +`_. + +You'll need the `example dataset `_, +available in **Bonobo**'s repository. + +.. literalinclude:: ../../bonobo/examples/tutorials/tut02_01_read.py :language: python -Until then, we ran the file directly using our python interpreter, but there is other options, one of them being -`bonobo run`. This command allows to run a graph defined by a python file, and is replacing the :func:`bonobo.run` -helper. 
It's the exact reason why we call :func:`bonobo.run` in the `if __name__ == '__main__'` block, to only -instanciate it if it is run directly. - -Using bonobo command line has a few advantages. It will look for one and only one :class:`bonobo.Graph` instance defined -in the file given as argument, configure an execution strategy, eventually plugins, and execute it. It has the benefit -of allowing to tune the "artifacts" surrounding the transformation graph on command line (verbosity, plugins ...), and -it will also ease the transition to run transformation graphs in containers, as the syntax will be the same. Of course, -it is not required, and the containerization capabilities are provided by an optional and separate python package. +You can run this script directly using the python interpreter: .. code-block:: shell-session - $ bonobo run examples/tut02_01_read.py + $ python bonobo/examples/tutorials/tut02_01_read.py +Another option is to use the bonobo cli, which allows more flexibility: +.. code-block:: shell-session + $ bonobo run bonobo/examples/tutorials/tut02_01_read.py +Using bonobo command line has a few advantages. + +It will look for one and only one :class:`bonobo.Graph` instance in the file given as argument, configure an execution +strategy, optionally plugins, and execute it. It has the benefit of allowing you to tune the "artifacts" surrounding the +transformation graph on command line (verbosity, plugins ...), and it will also ease the transition to run +transformation graphs in containers, as the syntax will be the same. Of course, it is not required, and the +containerization capabilities are provided by an optional and separate python package. + +It also changes a bit the way you can configure service dependencies. The CLI won't run the `if __name__ == '__main__'` +block, and thus it won't get the configured services passed to :func:`bonobo.run`.
Instead, one option to configure +services is to define a `get_services()` function in a +`_services.py `_ file. + +There will be more options using the CLI or environment to override things soon. + +Writing to files +:::::::::::::::: + +Let's split each line of this file on the first comma and store a json file mapping coffee names to their addresses. + +As with the readers, here are the classes available to write files: + +* :class:`bonobo.FileWriter` +* :class:`bonobo.JsonWriter` +* :class:`bonobo.CsvWriter` + +Let's write a first implementation: + +.. literalinclude:: ../../bonobo/examples/tutorials/tut02_02_write.py + :language: python + +You can run it and read the output file; you'll see it misses the "map" part of the question. Let's extend +:class:`bonobo.JsonWriter` to finish the job: + +.. literalinclude:: ../../bonobo/examples/tutorials/tut02_03_writeasmap.py + :language: python + +You can now run it again; it should produce a nice map. We favored a slightly hackish solution here instead of constructing a +map in python then passing the whole to :func:`json.dumps` because we want to work with streams: if you have to +construct the whole data structure in python, you'll lose a lot of bonobo's benefits.