diff --git a/.coveragerc b/.coveragerc index ce96e75..1d76a1f 100644 --- a/.coveragerc +++ b/.coveragerc @@ -23,4 +23,4 @@ exclude_lines = ignore_errors = True [html] -directory = doc/_build/html/coverage +directory = docs/_build/html/coverage diff --git a/bonobo/__init__.py b/bonobo/__init__.py index 0d331c6..b66e2d8 100644 --- a/bonobo/__init__.py +++ b/bonobo/__init__.py @@ -36,16 +36,20 @@ with open(os.path.realpath(os.path.join(os.path.dirname(__file__), '../version.t __all__ = [ 'Bag', + 'FileWriter', 'Graph', - 'NaiveStrategy', + 'JsonFileWriter', 'NOT_MODIFIED', + 'NaiveStrategy', 'ProcessPoolExecutorStrategy', 'ThreadPoolExecutorStrategy', + 'console_run', 'head', 'inject', + 'jupyter_run', 'log', 'noop', + 'run', 'service', 'tee', - 'to_json', ] diff --git a/bonobo/compat/__init__.py b/bonobo/compat/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bonobo/compat/pandas.py b/bonobo/compat/pandas.py new file mode 100644 index 0000000..aab0dbd --- /dev/null +++ b/bonobo/compat/pandas.py @@ -0,0 +1,9 @@ +from bonobo import FileWriter, JsonFileWriter + +to_file = FileWriter +to_json = JsonFileWriter + +__all__ = [ + 'to_json', + 'to_file', +] diff --git a/bonobo/core/bags.py b/bonobo/core/bags.py index 2ec728d..c4f7c7b 100644 --- a/bonobo/core/bags.py +++ b/bonobo/core/bags.py @@ -33,8 +33,8 @@ class Bag: def flags(self): return self._flags - def apply(self, func, *args, **kwargs): - return func(*args, *self.args, **kwargs, **self.kwargs) + def apply(self, func_or_iter, *args, **kwargs): + return func_or_iter(*args, *self.args, **kwargs, **self.kwargs) def extend(self, *args, **kwargs): return type(self)(*args, _parent=self, **kwargs) diff --git a/bonobo/core/contexts.py b/bonobo/core/contexts.py index 7b0ec26..0e9d127 100644 --- a/bonobo/core/contexts.py +++ b/bonobo/core/contexts.py @@ -128,7 +128,7 @@ class ComponentExecutionContext(WithStatistics, AbstractLoopContext): @property def name(self): - return self.component.__name__ + return 
getattr(self.component, '__name__', getattr(type(self.component), '__name__', repr(self.component))) def __init__(self, component, parent): self.parent = parent diff --git a/bonobo/core/graphs.py b/bonobo/core/graphs.py index e9ffeb5..ba31930 100644 --- a/bonobo/core/graphs.py +++ b/bonobo/core/graphs.py @@ -6,9 +6,10 @@ class Graph: Represents a coherent directed acyclic graph (DAG) of components. """ - def __init__(self): + def __init__(self, *chain): self.components = [] self.graph = {BEGIN: set()} + self.add_chain(*chain) def outputs_of(self, idx, create=False): if create and not idx in self.graph: diff --git a/bonobo/ext/console/__init__.py b/bonobo/ext/console/__init__.py index 7d5c3f5..2fffb8f 100644 --- a/bonobo/ext/console/__init__.py +++ b/bonobo/ext/console/__init__.py @@ -1,7 +1,3 @@ -from .helpers import console_run from .plugin import ConsoleOutputPlugin -__all__ = [ - 'ConsoleOutputPlugin', - 'console_run', -] +__all__ = ['ConsoleOutputPlugin', ] diff --git a/bonobo/ext/console/helpers.py b/bonobo/ext/console/helpers.py deleted file mode 100644 index f55d016..0000000 --- a/bonobo/ext/console/helpers.py +++ /dev/null @@ -1,9 +0,0 @@ -from bonobo import Graph, ThreadPoolExecutorStrategy -from .plugin import ConsoleOutputPlugin - - -def console_run(*chain, output=True, plugins=None): - graph = Graph() - executor = ThreadPoolExecutorStrategy() - graph.add_chain(*chain) - return executor.execute(graph, plugins=(plugins or []) + [ConsoleOutputPlugin()] if output else []) diff --git a/bonobo/ext/jupyter/__init__.py b/bonobo/ext/jupyter/__init__.py index 2e04e8a..7dd1300 100644 --- a/bonobo/ext/jupyter/__init__.py +++ b/bonobo/ext/jupyter/__init__.py @@ -1,4 +1,4 @@ -from .helpers import jupyter_run +from bonobo.util.helpers import jupyter_run from .plugin import JupyterOutputPlugin @@ -6,7 +6,4 @@ def _jupyter_nbextension_paths(): return [{'section': 'notebook', 'src': 'static', 'dest': 'bonobo-jupyter', 'require': 'bonobo-jupyter/extension'}] -__all__ = [ 
- 'JupyterOutputPlugin', - 'jupyter_run', -] +__all__ = ['JupyterOutputPlugin', ] diff --git a/bonobo/ext/jupyter/helpers.py b/bonobo/ext/jupyter/helpers.py index c62e3dd..8b13789 100644 --- a/bonobo/ext/jupyter/helpers.py +++ b/bonobo/ext/jupyter/helpers.py @@ -1,9 +1 @@ -from bonobo import Graph, ThreadPoolExecutorStrategy -from .plugin import JupyterOutputPlugin - -def jupyter_run(*chain, plugins=None): - graph = Graph() - executor = ThreadPoolExecutorStrategy() - graph.add_chain(*chain) - return executor.execute(graph, plugins=(plugins or []) + [JupyterOutputPlugin()]) diff --git a/bonobo/ext/ods.py b/bonobo/ext/opendatasoft.py similarity index 60% rename from bonobo/ext/ods.py rename to bonobo/ext/opendatasoft.py index 1a8cf4f..9fb8d61 100644 --- a/bonobo/ext/ods.py +++ b/bonobo/ext/opendatasoft.py @@ -3,11 +3,18 @@ from urllib.parse import urlencode import requests # todo: make this a service so we can substitute it ? -def extract_ods(url, dataset, rows=100, **kwargs): +def from_opendatasoft_api(dataset=None, + endpoint='{scheme}://{netloc}{path}', + scheme='https', + netloc='data.opendatasoft.com', + path='/api/records/1.0/search/', + rows=100, + **kwargs): + path = path if path.startswith('/') else '/' + path params = ( ('dataset', dataset), ('rows', rows), ) + tuple(sorted(kwargs.items())) - base_url = url + '?' + urlencode(params) + base_url = endpoint.format(scheme=scheme, netloc=netloc, path=path) + '?' + urlencode(params) def _extract_ods(): nonlocal base_url, rows diff --git a/bonobo/io/__init__.py b/bonobo/io/__init__.py index 3338e9d..ca10d08 100644 --- a/bonobo/io/__init__.py +++ b/bonobo/io/__init__.py @@ -1,5 +1,9 @@ """ Readers and writers for common file formats. 
""" -from .json import * +from .file import FileWriter +from .json import JsonFileWriter -__all__ = ['to_json', ] +__all__ = [ + 'FileWriter', + 'JsonFileWriter', +] diff --git a/bonobo/io/file.py b/bonobo/io/file.py new file mode 100644 index 0000000..b30e515 --- /dev/null +++ b/bonobo/io/file.py @@ -0,0 +1,35 @@ +from bonobo.util.lifecycle import with_context + +__all__ = ['FileWriter', ] + + +@with_context +class FileWriter: + # XXX TODO implement @with_context like this ? Pros and cons ? + class Meta: + contextual = True + + def __init__(self, path_or_buf, eol='\n'): + self.path_or_buf = path_or_buf + self.eol = eol + + def initialize(self, ctx): + """ todo add lock file ? optional maybe ? """ + assert not hasattr(ctx, 'fp'), 'One at a time, baby.' + ctx.fp = open(self.path_or_buf, 'w+') + ctx.first = True + + def write(self, fp, line, prefix=''): + fp.write(prefix + line) + + def __call__(self, ctx, row): + if ctx.first: + prefix, ctx.first = '', False + else: + prefix = self.eol + + self.write(ctx.fp, row, prefix=prefix) + + def finalize(self, ctx): + ctx.fp.close() + del ctx.fp, ctx.first diff --git a/bonobo/io/json.py b/bonobo/io/json.py index 2df1538..98712a7 100644 --- a/bonobo/io/json.py +++ b/bonobo/io/json.py @@ -1,40 +1,23 @@ import json +from .file import FileWriter from bonobo.util.lifecycle import with_context -__all__ = [ - 'from_json', - 'to_json', -] +__all__ = ['JsonFileWriter', ] @with_context -class JsonWriter: +class JsonFileWriter(FileWriter): def __init__(self, path_or_buf): - self.path_or_buf = path_or_buf + super().__init__(path_or_buf, eol=',\n') def initialize(self, ctx): - assert not hasattr(ctx, 'fp'), 'One at a time, baby.' 
- ctx.fp = open(self.path_or_buf, 'w+') + super().initialize(ctx) ctx.fp.write('[\n') - ctx.first = True - def __call__(self, ctx, row): - if ctx.first: - prefix = '' - ctx.first = False - else: - prefix = ',\n' - ctx.fp.write(prefix + json.dumps(row)) + def write(self, fp, line, prefix=''): + fp.write(prefix + json.dumps(line)) def finalize(self, ctx): ctx.fp.write('\n]') - ctx.fp.close() - del ctx.fp, ctx.first - - -def from_json(path_or_buf): - pass - - -to_json = JsonWriter + super().finalize(ctx) diff --git a/bonobo/util/__init__.py b/bonobo/util/__init__.py index 5de330f..4a15b40 100644 --- a/bonobo/util/__init__.py +++ b/bonobo/util/__init__.py @@ -4,12 +4,16 @@ import functools import pprint from .tokens import NOT_MODIFIED +from .helpers import run, console_run, jupyter_run __all__ = [ 'NOT_MODIFIED', + 'console_run', 'head', + 'jupyter_run', 'log', 'noop', + 'run', 'tee', ] diff --git a/bonobo/util/helpers.py b/bonobo/util/helpers.py new file mode 100644 index 0000000..5e3538f --- /dev/null +++ b/bonobo/util/helpers.py @@ -0,0 +1,20 @@ +def run(*chain, plugins=None): + from bonobo import Graph, ThreadPoolExecutorStrategy + + graph = Graph() + graph.add_chain(*chain) + + executor = ThreadPoolExecutorStrategy() + return executor.execute(graph, plugins=plugins or []) + + +def console_run(*chain, output=True, plugins=None): + from bonobo.ext.console import ConsoleOutputPlugin + + return run(*chain, plugins=(plugins or []) + [ConsoleOutputPlugin()] if output else []) + + +def jupyter_run(*chain, plugins=None): + from bonobo.ext.jupyter import JupyterOutputPlugin + + return run(*chain, plugins=(plugins or []) + [JupyterOutputPlugin()]) diff --git a/docs/_templates/index.html b/docs/_templates/index.html index 4894573..9b9ae48 100644 --- a/docs/_templates/index.html +++ b/docs/_templates/index.html @@ -1,22 +1,20 @@ {% extends "layout.html" %} -{% set title = _('Overview') %} +{% set title = _('Bonobo — Data processing for humans') %} {% block body %} -
+
- {% trans %} - Bonobo is a line-by-line data-processing toolkit for python 3.5+ emphasizing simplicity and atomicity of - data transformations using a simple directed graph of python callables. + Bonobo is a line-by-line data-processing toolkit for python 3.5+ emphasizing simple and + atomic data transformations defined using a directed graph of plain old python callables. {% endtrans %}
@@ -71,9 +69,8 @@|
- {% trans %}First steps with
- Bonobo{% endtrans %} {% trans %}First steps{% endtrans %} |
{%- if hasdoc('search') %}
diff --git a/docs/conf.py b/docs/conf.py
index 604417a..16307fd 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -12,8 +12,14 @@ import bonobo
# -- General configuration ------------------------------------------------
extensions = [
- 'sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.intersphinx', 'sphinx.ext.todo', 'sphinx.ext.coverage',
- 'sphinx.ext.ifconfig', 'sphinx.ext.viewcode'
+ 'sphinx.ext.autodoc',
+ 'sphinx.ext.doctest',
+ 'sphinx.ext.intersphinx',
+ 'sphinx.ext.todo',
+ 'sphinx.ext.coverage',
+ 'sphinx.ext.ifconfig',
+ 'sphinx.ext.viewcode',
+ 'sphinx.ext.graphviz',
]
# Add any paths that contain templates here, relative to this directory.
@@ -95,6 +101,8 @@ html_additional_pages = {'index': 'index.html'}
html_static_path = ['_static']
html_show_sphinx = False
+graphviz_output_format = 'svg'
+
# -- Options for HTMLHelp output ------------------------------------------
# Output file base name for HTML help builder.
diff --git a/docs/install.rst b/docs/install.rst
new file mode 100644
index 0000000..5b8488a
--- /dev/null
+++ b/docs/install.rst
@@ -0,0 +1,34 @@
+Installation
+============
+
+
+.. todo::
+
+ better install docs, especially on how to use a different fork, etc.
+
+Install with pip
+::::::::::::::::
+
+.. code-block:: shell-session
+
+ $ pip install bonobo
+
+Install from source
+:::::::::::::::::::
+
+.. code-block:: shell-session
+
+ $ pip install git+https://github.com/python-bonobo/bonobo.git@master#egg=bonobo
+
+Editable install
+::::::::::::::::
+
+If you plan on making patches to Bonobo, you should install it as an "editable" package.
+
+
+.. code-block:: shell-session
+
+ $ pip install --editable git+https://github.com/python-bonobo/bonobo.git@master#egg=bonobo
+
+Note: `-e` is the shorthand version of `--editable`.
+
diff --git a/docs/tutorial/basics.rst b/docs/tutorial/basics.rst
new file mode 100644
index 0000000..1197d3a
--- /dev/null
+++ b/docs/tutorial/basics.rst
@@ -0,0 +1,146 @@
+First steps - Basic concepts
+============================
+
+To begin with Bonobo, you should first install it:
+
+.. code-block:: shell-session
+
+ $ pip install bonobo
+
+See :doc:`install` if you're looking for more options.
+
+Let's write a first data transformation
+:::::::::::::::::::::::::::::::::::::::
+
+We'll write a simple component that just uppercases everything. In **Bonobo**, a component is a plain old Python
+callable, no more, no less.
+
+.. code-block:: python
+
+ def uppercase(x: str):
+ return x.upper()
+
+Ok, this is kind of simple, and you can even use `str.upper` directly instead of writing a wrapper. The type annotations
+are not used, but can make your code much more readable (and may be used as validators in the future).
+
+To run this, we need two more things: a generator that feeds data, and something that outputs it.
+
+.. code-block:: python
+
+ def generate_data():
+ yield 'foo'
+ yield 'bar'
+ yield 'baz'
+
+ def output(x: str):
+ print(x)
+
+That should do the job. Now, let's chain the three callables together and run them.
+
+.. code-block:: python
+
+ from bonobo import run
+
+ run(generate_data, uppercase, output)
+
+This is the simplest data transformation possible, and we run it using the `run` helper that hides the underlying object
+composition necessary to actually run the callables in parallel. A more flexible, but a bit more verbose, way to do the
+same thing would be:
+
+.. code-block:: python
+
+ from bonobo import Graph, ThreadPoolExecutorStrategy
+
+ graph = Graph()
+ graph.add_chain(generate_data, uppercase, output)
+
+ executor = ThreadPoolExecutorStrategy()
+ executor.execute(graph)
+
+Depending on what you're doing, you may use the shorthand helper method, or the verbose one. Always favor the shorter
+one if you don't need to tune the graph or the execution strategy.
+
+Definitions
+:::::::::::
+
+* Graph
+* Component
+* Executor
+
+.. todo:: Definitions, and substitute vague terms in the page by the exact term defined here
+
+Summary
+:::::::
+
+Let's rewrite this using builtin functions and methods, then explain the few concepts available here:
+
+.. code-block:: python
+
+ from bonobo import Graph, ThreadPoolExecutorStrategy
+
+ # Represent our data processor as a simple directed graph of callables.
+ graph = Graph(
+ (x for x in ('foo', 'bar', 'baz')),
+ str.upper,
+ print,
+ )
+
+ # Use a thread pool.
+ executor = ThreadPoolExecutorStrategy()
+
+ # Run the thing.
+ executor.execute(graph)
+
+Or the shorthand version, that you should prefer if you don't need fine tuning:
+
+.. code-block:: python
+
+ from bonobo import run
+
+ run(
+ iter(['foo', 'bar', 'baz']),
+ str.upper,
+ print,
+ )
+
+Both methods are strictly equivalent (see :func:`bonobo.run`). When in doubt, favour the shorter.
+
+Takeaways
+:::::::::
+
+① The :class:`bonobo.Graph` class is used to represent a data-processing pipeline.
+
+It can represent simple list-like linear graphs, like here, but it can also represent much more complex graphs, with
+branches and cycles.
+
+This is what the graph we defined looks like:
+
+.. graphviz::
+
+ digraph {
+ rankdir = LR;
+ "iter(['foo', 'bar', 'baz'])" -> "str.upper" -> "print";
+ }
+
+
+② Transformations are simple python callables. Whatever can be called can be used as a transformation. Callables can
+either `return` or `yield` data to send it to the next step. Regular functions (using `return`) should be preferred if
+each call is guaranteed to return exactly one result, while generators (using `yield`) should be preferred if the
+number of output lines for a given input varies.
+
+③ The graph is then executed using an `ExecutionStrategy`. For now, let's focus only on
+:class:`bonobo.ThreadPoolExecutorStrategy`, which uses an underlying `concurrent.futures.ThreadPoolExecutor` to
+schedule calls in a pool of threads, but basically this strategy is what determines the actual behaviour of execution.
+
+④ Before actually executing the callables, the `ExecutorStrategy` instance will wrap each component in a `context`,
+whose responsibility is to hold the state, to keep the components stateless. We'll expand on this later.
+
+
+Next
+::::
+
+You now know all the basic concepts necessary to build (batch-like) data processors.
+
+If you're confident with this part, let's get to a more real-world example, using files and nice console output.
+
+.. todo:: link to next page
diff --git a/docs/tutorial/basics2.rst b/docs/tutorial/basics2.rst
new file mode 100644
index 0000000..8542ec2
--- /dev/null
+++ b/docs/tutorial/basics2.rst
@@ -0,0 +1,46 @@
+First steps - Working with files
+================================
+
+Bonobo would not be of any use if the aim was to uppercase small lists of strings. In fact, Bonobo should not be used
+if you don't expect any gain from parallelization of tasks.
+
+Let's take the following graph as an example:
+
+.. graphviz::
+
+ digraph {
+ rankdir = LR;
+ "A" -> "B" -> "C";
+ }
+
+The execution strategy does a bit of behind-the-scenes work, wrapping every component in a thread (assuming you're using
+the :class:`bonobo.ThreadPoolExecutorStrategy`), which allows `B` to start running as soon as `A` has yielded the first line
+of data, and `C` as soon as `B` has yielded the first line of data, even if `A` or `B` still have data to yield.
+
+The great thing is that you generally don't have to think about it. Just be aware that your components will be run in
+parallel, and don't worry too much about blocking components, as they won't block their siblings.
+
+That being said, let's try to write a more realistic transformation.
+
+Reading a file
+::::::::::::::
+
+There are a few component builders available in **Bonobo** that let you read files. You should at least know about the following:
+
+* :class:`bonobo.FileReader` (aliased as :func:`bonobo.from_file`)
+* :class:`bonobo.JsonFileReader` (aliased as :func:`bonobo.from_json`)
+* :class:`bonobo.CsvFileReader` (aliased as :func:`bonobo.from_csv`)
+
+Reading a file is as simple as using one of those, and for the example, we'll use a text file that was generated using
+Bonobo from the "liste-des-cafes-a-un-euro" dataset made available by Mairie de Paris under the Open Database
+License (ODbL). You can `explore the original dataset |