From c9987089234dd498f7212b3968e401a923387719 Mon Sep 17 00:00:00 2001 From: Romain Dorgueil Date: Sat, 1 Jun 2019 12:31:38 +0200 Subject: [PATCH 1/2] New syntax: adding test cases and syntactic sugar tools in graph api for merges and forks (related to #323 and #324, allows #328). --- bonobo/structs/graphs.py | 40 +++++++- tests/structs/test_graphs_new_syntax.py | 129 ++++++++++++++++++++++++ 2 files changed, 164 insertions(+), 5 deletions(-) create mode 100644 tests/structs/test_graphs_new_syntax.py diff --git a/bonobo/structs/graphs.py b/bonobo/structs/graphs.py index 8d256de..aaf3fd6 100644 --- a/bonobo/structs/graphs.py +++ b/bonobo/structs/graphs.py @@ -37,11 +37,20 @@ class GraphCursor: ) if len(nodes): - chain = self.graph.add_chain(*nodes, _input=self.last) + chain = self.graph.add_chain(*nodes, _input=self.last, use_existing_nodes=True) return GraphCursor(chain.graph, first=self.first, last=chain.output) return self + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + return None + + def __eq__(self, other): + return self.graph == other.graph and self.first == other.first and self.last == other.last + class PartialGraph: def __init__(self, *nodes): @@ -73,6 +82,15 @@ class Graph: def __getitem__(self, key): return self.nodes[key] + def __enter__(self): + return self.get_cursor().__enter__() + + def __exit__(self, exc_type, exc_val, exc_tb): + return None + + def __rshift__(self, other): + return self.get_cursor().__rshift__(other) + def get_cursor(self, ref=BEGIN): return GraphCursor(self, last=self.index_of(ref)) @@ -96,6 +114,9 @@ class Graph: raise ValueError("Cannot find node matching {!r}.".format(mixed)) + def indexes_of(self, *things): + return set(map(self.index_of, things)) + def outputs_of(self, idx_or_node, create=False): """Get a set of the outputs for a given node, node index or name. 
""" @@ -105,13 +126,13 @@ class Graph: self.edges[idx_or_node] = set() return self.edges[idx_or_node] - def add_node(self, c, *, _name=None): + def add_node(self, new_node, *, _name=None): """Add a node without connections in this graph and returns its index. If _name is specified, name this node (string reference for further usage). """ idx = len(self.nodes) self.edges[idx] = set() - self.nodes.append(c) + self.nodes.append(new_node) if _name: if _name in self.named: @@ -120,7 +141,14 @@ class Graph: return idx - def add_chain(self, *nodes, _input=BEGIN, _output=None, _name=None): + def get_or_add_node(self, new_node, *, _name=None): + if new_node in self.nodes: + if _name is not None: + raise RuntimeError("Cannot name a node that is already present in the graph.") + return self.index_of(new_node) + return self.add_node(new_node, _name=_name) + + def add_chain(self, *nodes, _input=BEGIN, _output=None, _name=None, use_existing_nodes=False): """Add `nodes` as a chain in this graph. **Input rules** @@ -153,6 +181,8 @@ class Graph: _first = None _last = None + get_node = self.get_or_add_node if use_existing_nodes else self.add_node + # Sanity checks. if not len(nodes): if _input is None or _output is None: @@ -164,7 +194,7 @@ class Graph: raise RuntimeError("Using add_chain(...) 
without nodes does not allow to use the _name parameter.") for i, node in enumerate(nodes): - _last = self.add_node(node, _name=_name if not i else None) + _last = get_node(node, _name=_name if not i else None) if _first is None: _first = _last diff --git a/tests/structs/test_graphs_new_syntax.py b/tests/structs/test_graphs_new_syntax.py new file mode 100644 index 0000000..570fa47 --- /dev/null +++ b/tests/structs/test_graphs_new_syntax.py @@ -0,0 +1,129 @@ +from operator import attrgetter +from unittest.mock import sentinel + +import pytest + +from bonobo.constants import BEGIN +from bonobo.structs.graphs import Graph, GraphCursor +from bonobo.util import tuplize + + +@tuplize +def get_pseudo_nodes(*names): + for name in names: + yield getattr(sentinel, name) + + +def test_get_cursor(): + g = Graph() + cursor = g.get_cursor() + + assert cursor.graph is g + assert cursor.first is BEGIN + assert cursor.last is BEGIN + + +def test_get_cursor_in_a_vacuum(): + g = Graph() + cursor = g.get_cursor(None) + + assert cursor.graph is g + assert cursor.first is None + assert cursor.last is None + + +def test_cursor_usage_to_add_a_chain(): + a, b, c = get_pseudo_nodes(*"abc") + + g = Graph() + + g.get_cursor() >> a >> b >> c + + assert len(g) == 3 + assert g.outputs_of(BEGIN) == {g.index_of(a)} + assert g.outputs_of(a) == {g.index_of(b)} + assert g.outputs_of(b) == {g.index_of(c)} + assert g.outputs_of(c) == set() + + +def test_cursor_usage_to_add_a_chain_in_a_context_manager(): + a, b, c = get_pseudo_nodes(*"abc") + + g = Graph() + with g as cur: + cur >> a >> b >> c + + assert len(g) == 3 + assert g.outputs_of(BEGIN) == {g.index_of(a)} + assert g.outputs_of(a) == {g.index_of(b)} + assert g.outputs_of(b) == {g.index_of(c)} + assert g.outputs_of(c) == set() + + +def test_implicit_cursor_usage(): + a, b, c = get_pseudo_nodes(*"abc") + + g = Graph() + g >> a >> b >> c + + assert len(g) == 3 + assert g.outputs_of(BEGIN) == {g.index_of(a)} + assert g.outputs_of(a) == 
{g.index_of(b)} + assert g.outputs_of(b) == {g.index_of(c)} + assert g.outputs_of(c) == set() + + +def test_cursor_to_fork_a_graph(): + a, b, c, d, e = get_pseudo_nodes(*"abcde") + + g = Graph() + g >> a >> b >> c + g.get_cursor(b) >> d >> e + + assert len(g) == 5 + assert g.outputs_of(BEGIN) == {g.index_of(a)} + assert g.outputs_of(a) == {g.index_of(b)} + assert g.outputs_of(b) == {g.index_of(c), g.index_of(d)} + assert g.outputs_of(c) == set() + assert g.outputs_of(d) == {g.index_of(e)} + assert g.outputs_of(e) == set() + + +def test_cursor_to_fork_at_the_end(): + a, b, c, d, e = get_pseudo_nodes(*"abcde") + + g = Graph() + c0 = g >> a >> b + c1 = c0 >> c + c2 = c0 >> d >> e + + assert len(g) == 5 + assert g.outputs_of(BEGIN) == {g.index_of(a)} + assert g.outputs_of(a) == {g.index_of(b)} + assert g.outputs_of(b) == {g.index_of(c), g.index_of(d)} + assert g.outputs_of(c) == set() + assert g.outputs_of(d) == {g.index_of(e)} + assert g.outputs_of(e) == set() + + assert c0.first == g.index_of(BEGIN) + assert c0.last == g.index_of(b) + assert c1.first == g.index_of(BEGIN) + assert c1.last == g.index_of(c) + assert c2.first == g.index_of(BEGIN) + assert c2.last == g.index_of(e) + + +def test_cursor_merge(): + a, b, c = get_pseudo_nodes(*"abc") + g = Graph() + + c1 = g >> a >> c + c2 = g >> b >> c + + assert len(g) == 3 + assert g.outputs_of(BEGIN) == g.indexes_of(a, b) + assert g.outputs_of(a) == g.indexes_of(c) + assert g.outputs_of(b) == g.indexes_of(c) + assert g.outputs_of(c) == set() + + assert c1 == c2 From e84440df8c61b9f03dc5fa13080decf8af1c5356 Mon Sep 17 00:00:00 2001 From: Romain Dorgueil Date: Sat, 1 Jun 2019 14:08:25 +0200 Subject: [PATCH 2/2] Major update to documentation, removing deprecated docs and adding the new syntax to graph building options. 
--- bin/update_apidoc.py | 94 +++++++- bonobo/_api.py | 7 +- bonobo/nodes/__init__.py | 7 + docs/guide/graphs.rst | 209 ++++++++++++++--- docs/reference/api/bonobo.rst | 114 +++++++++ docs/reference/api/bonobo/config.rst | 3 + docs/reference/api/bonobo/constants.rst | 3 + docs/reference/api/bonobo/execution.rst | 8 +- .../api/bonobo/execution/contexts.rst | 3 + .../reference/api/bonobo/execution/events.rst | 3 + .../api/bonobo/execution/strategies.rst | 3 + docs/reference/api/bonobo/nodes.rst | 13 ++ docs/reference/api/bonobo/structs/graphs.rst | 13 ++ docs/reference/api/bonobo/util.rst | 3 + docs/reference/examples.rst | 1 - docs/reference/index.rst | 3 +- docs/tutorial/0.5/_outdated_note.rst | 9 - docs/tutorial/0.5/index.rst | 65 ------ docs/tutorial/0.5/python.rst | 13 -- docs/tutorial/0.5/tut01.rst | 202 ---------------- docs/tutorial/0.5/tut02.rst | 123 ---------- docs/tutorial/0.5/tut03.rst | 202 ---------------- docs/tutorial/0.5/tut04.rst | 216 ------------------ 23 files changed, 434 insertions(+), 883 deletions(-) create mode 100644 docs/reference/api/bonobo/nodes.rst create mode 100644 docs/reference/api/bonobo/structs/graphs.rst delete mode 100644 docs/tutorial/0.5/_outdated_note.rst delete mode 100644 docs/tutorial/0.5/index.rst delete mode 100644 docs/tutorial/0.5/python.rst delete mode 100644 docs/tutorial/0.5/tut01.rst delete mode 100644 docs/tutorial/0.5/tut02.rst delete mode 100644 docs/tutorial/0.5/tut03.rst delete mode 100644 docs/tutorial/0.5/tut04.rst diff --git a/bin/update_apidoc.py b/bin/update_apidoc.py index 9a31a62..efb0563 100644 --- a/bin/update_apidoc.py +++ b/bin/update_apidoc.py @@ -8,8 +8,8 @@ apidoc_root = "docs/reference/api" class Module: - def __init__(self, name, title=None, *, automodule_options=None): - + def __init__(self, name, title=None, *, automodule_options=None, append=None): + self.append = append self.name = name self.title = title or " ".join(map(str.title, self.name.split(".")[1:])) self.automodule_options = 
automodule_options or list() @@ -18,20 +18,101 @@ class Module: return "<{} ({})>".format(self.title, self.name) def asdict(self): - return {"name": self.name, "title": self.title, "automodule_options": self.automodule_options} + return { + "append": self.append, + "automodule": True, + "automodule_options": self.automodule_options, + "name": self.name, + "title": self.title, + } def get_path(self): return os.path.join(__path__, apidoc_root, *self.name.split(".")) + ".rst" +import inspect + +bonobo = __import__("bonobo") +assert bonobo.__version__ + +prefixes = { + "bonobo.nodes": None, + "bonobo._api": "bonobo", + "bonobo.structs.graphs": None, + "bonobo.execution.strategies": "bonobo", + "bonobo.registry": "bonobo", + "bonobo.util.environ": "bonobo", +} +api_objects = {} + +display_order = [("bonobo.structs.graphs", "Graphs"), ("bonobo.nodes", "Nodes"), ("bonobo", "Other top-level APIs")] + +for name in sorted(dir(bonobo)): + # ignore attributes starting by underscores + if name.startswith("_"): + continue + attr = getattr(bonobo, name) + if inspect.ismodule(attr): + continue + + assert name in bonobo.__all__ + + o = getattr(bonobo, name) + modname = inspect.getmodule(o).__name__ + family = None + family_override = None + + for prefix, target in prefixes.items(): + if modname == prefix or modname.startswith(prefix + "."): + family = target or prefix + display_name = ".".join([family, name]) + break + + if family is None: + raise Exception("Could not find family for {}".format(name)) + + api_objects.setdefault(family, []) + api_objects[family].append((name, o)) + +api_content = [] +current_family = None +for family, title in display_order: + if family != current_family: + if current_family is not None: + api_content.append("") + api_content.append("") + api_content.append(title) + api_content.append(":" * len(title)) + api_content.append("") + current_family = family + + for api_object in sorted(api_objects[family]): + object_type = "func" if 
inspect.isfunction(api_object[1]) else "class" + api_content.append("* :{}:`{}.{}` ".format(object_type, family, api_object[0])) + + if family == "bonobo": + for api_object in sorted(api_objects[family]): + object_type = "function" if inspect.isfunction(api_object[1]) else "class" + api_content.append("") + api_content.append("") + api_content.append(api_object[0]) + api_content.append("-" * len(api_object[0])) + api_content.append("") + api_content.append(".. auto{}:: {}.{}".format(object_type, family, api_object[0])) + + +print("\n".join(api_content)) + modules = [ - Module("bonobo", title="Bonobo"), + Module("bonobo", title="Bonobo", automodule_options=["no-members"], append="\n".join(api_content)), Module("bonobo.config"), Module("bonobo.constants", automodule_options=["no-members"]), Module("bonobo.execution"), Module("bonobo.execution.contexts"), Module("bonobo.execution.events"), Module("bonobo.execution.strategies"), + Module("bonobo.nodes"), + Module("bonobo.structs.graphs", title="Graphs"), Module("bonobo.util"), ] @@ -50,8 +131,13 @@ env = Environment( :Module: :mod:`{{ name }}` +{% if automodule %} .. automodule:: {{ name }} {% for opt in automodule_options %} :{{ opt }}:{{ "\n" }}{% endfor %} +{% endif %} +{% if append %} +{{ append }} +{% endif %} """[ 1:-1 ] diff --git a/bonobo/_api.py b/bonobo/_api.py index 8432eee..5d35eeb 100644 --- a/bonobo/_api.py +++ b/bonobo/_api.py @@ -1,6 +1,9 @@ """ -Contains all the tools you need to get started with the framework, including (but not limited to) generic -transformations, readers, writers, and tools for writing and executing graphs and jobs. +The root :mod:`bonobo` package contains all the tools you need to get started with the framework, including (but not +limited to) generic transformations, readers, writers, and tools for writing and executing graphs and jobs. 
+ +Mostly, it exposes objects found in sub-packages, and although you can access them directly here, you should refer to +the matching documentation pages when using them. All objects in this module are considered very safe to use, and backward compatibility when moving up from one version to another is maximal. diff --git a/bonobo/nodes/__init__.py b/bonobo/nodes/__init__.py index 843a3bb..89370d5 100644 --- a/bonobo/nodes/__init__.py +++ b/bonobo/nodes/__init__.py @@ -1,3 +1,10 @@ +""" +The :mod:`bonobo.nodes` module contains all builtin transformations that you can use out of the box in your ETL jobs. + +Please note that all objects from this package are also available directly through the root :mod:`bonobo` package. + +""" + from bonobo.nodes.basics import * from bonobo.nodes.basics import __all__ as _all_basics from bonobo.nodes.filter import Filter diff --git a/docs/guide/graphs.rst b/docs/guide/graphs.rst index bad66f1..67f8ce9 100644 --- a/docs/guide/graphs.rst +++ b/docs/guide/graphs.rst @@ -1,51 +1,40 @@ Graphs ====== -Graphs are the glue that ties transformations together. They are the only data-structure bonobo can execute directly. Graphs -must be acyclic, and can contain as many nodes as your system can handle. However, although in theory the number of nodes can be rather high, practical use cases usually do not exceed more than a few hundred nodes and only then in extreme cases. +Graphs are the glue that ties transformations together. They are the only data-structure bonobo can execute directly. +Graphs must be acyclic, and can contain as many nodes as your system can handle. However, although in theory the number +of nodes can be rather high, practical cases usually do not exceed a few hundred nodes and even that is a rather high +number you may not encounter so often. -Within a graph, each node are isolated and can only communicate using their -input and output queues. 
For each input row, a given node will be called with -the row passed as arguments. Each *return* or *yield* value will be put on the -node's output queue, and the nodes connected in the graph will then be able to -process it. +Within a graph, each node is isolated and can only communicate using its input and output queues. For each input row, +a given node will be called with the row passed as arguments. Each *return* or *yield* value will be put on the node's +output queue, and the nodes connected in the graph will then be able to process it. |bonobo| is a line-by-line data stream processing solution. Handling the data-flow this way brings the following properties: -- **First in, first out**: unless stated otherwise, each node will receeive the - rows from FIFO queues, and so, the order of rows will be preserved. That is - true for each single node, but please note that if you define "graph bubbles" - (where a graph diverge in different branches then converge again), the - convergence node will receive rows FIFO from each input queue, meaning that - the order existing at the divergence point wont stay true at the convergence - point. +- **First in, first out**: unless stated otherwise, each node will receive the rows from FIFO queues, and so, the order + of rows will be preserved. That is true for each single node, but please note that if you define "graph bubbles" + (where a graph diverges in different branches then converges again), the convergence node will receive rows FIFO from + each input queue, meaning that the order existing at the divergence point won't stay true at the convergence point. -- **Parallelism**: each node run in parallel (by default, using independent - threads). This is useful as you don't have to worry about blocking calls. - If a thread waits for, let's say, a database, or a network service, the other - nodes will continue handling data, as long as they have input rows available. 
+- **Parallelism**: each node runs in parallel (by default, using independent threads). This is useful as you don't have + to worry about blocking calls. If a thread waits for, let's say, a database, or a network service, the other nodes + will continue handling data, as long as they have input rows available. -- **Independence**: the rows are independent from each other, making this way - of working with data flows good for line-by-line data processing, but - also not ideal for "grouped" computations (where an output depends on more - than one line of input data). You can overcome this with rolling windows if - the input required are adjacent rows, but if you need to work on the whole - dataset at once, you should consider other software. +- **Independence**: the rows are independent from each other, making this way of working with data flows good for + line-by-line data processing, but also not ideal for "grouped" computations (where an output depends on more than one + line of input data). You can overcome this with rolling windows if the input required are adjacent rows, but if you + need to work on the whole dataset at once, you should consider other software. -Graphs are defined using :class:`bonobo.Graph` instances, as seen in the -previous tutorial step. - -.. warning:: - - This document is currently reviewed to check for correctness after the 0.6 release. +Graphs are defined using :class:`bonobo.Graph` instances, as seen in the previous tutorial step. -What can be a node? -::::::::::::::::::: +What can be used as a node? +::::::::::::::::::::::::::: -**TL;DR**: … anything, as long as it’s callable(). +**TL;DR**: … anything, as long as it’s callable() or iterable. Functions --------- @@ -55,7 +44,100 @@ Functions def get_item(id): return id, items.get(id) +When building your graph, you can simply add your function: +.. code-block:: python + + graph.add_chain(..., get_item, ...) + +Or using the new syntax: + +.. code-block:: python + + graph >> ... 
>> get_item >> ... + +.. note:: + + Please note that we pass the function object, and not the result of the function being called. A common mistake is + to call the function while building the graph, which won't work and may be tedious to debug. + + As a convention, we use snake_cased objects when the object can be directly passed to a graph, like this function. + + Some functions are factories for closures, and thus behave differently (as you need to call them to get an actual + object usable as a transformation). When it is the case, we use CamelCase as a convention, as it behaves the same way + as a class. + + +Classes +------- + +.. code-block:: python + + class Foo: + ... + + def __call__(self, id): + return id, self.get(id) + +When building your graph, you can add an instance of your object (or even multiple instances, possibly configured +differently): + +.. code-block:: python + + graph.add_chain(..., Foo(), ...) + +Or using the new syntax: + +.. code-block:: python + + graph >> ... >> Foo() >> ... + + +Iterables (generators, lists, ...) +---------------------------------- + +As a convenience tool, we can use iterables directly within a graph. They can either be used as producer nodes (nodes that +are normally only called once and produce data) or, in case of generators, as transformations. + + +.. code-block:: python + + def product(x): + for i in range(10): + yield x, i, x * i + +Then, add it to a graph: + +.. code-block:: python + + graph.add_chain(range(10), product, ...) + +Or using the new syntax: + +.. code-block:: python + + graph >> range(10) >> product >> ... + + +Builtins +-------- + +Again, as long as it is callable, you can use it as a node. It means that Python builtins work (think about `print` or +`str.upper`...) + +.. code-block:: python + + graph.add_chain(range(ord("a"), ord("z")+1), chr, str.upper, print) + +Or using the new syntax: + +.. 
code-block:: python + + graph >> range(ord("a"), ord("z")+1) >> chr >> str.upper >> print + + +What happens during the graph execution? +:::::::::::::::::::::::::::::::::::::::: Each node of a graph will be executed in isolation from the other nodes, and the data is passed from one node to the next using FIFO queues, managed by the framework. It's transparent to the end-user, though, and you'll only use @@ -90,9 +172,9 @@ It allows to have ETL jobs that ignore faulty data and try their best to process Some errors are fatal, though. -If you pass a 2 elements tuple to a node that takes 3 args, |bonobo| will raise an :class:`bonobo.errors.UnrecoverableTypeError`, and exit the -current graph execution as fast as it can (finishing the other node executions that are in progress first, but not -starting new ones if there are remaining input rows). +If you pass a 2 elements tuple to a node that takes 3 args, |bonobo| will raise an +:class:`bonobo.errors.UnrecoverableTypeError`, and exit the current graph execution as fast as it can (finishing the +other node executions that are in progress first, but not starting new ones if there are remaining input rows). Definitions @@ -108,12 +190,20 @@ Node included in a graph, multiple graph, or not at all. -Creating a graph -:::::::::::::::: +Building graphs +::::::::::::::: + +Graphs in |bonobo| are instances of :class:`bonobo.Graph`. Graphs should be instances of :class:`bonobo.Graph`. The :func:`bonobo.Graph.add_chain` method can take as many positional parameters as you want. +.. note:: + + As of |bonobo| 0.7, a new syntax is available that we believe is more powerful and more readable than the legacy + `add_chain` method. The former API is here to stay and it's perfectly safe to use it, but if it is an option, you + should consider the new syntax. During the transition period, we'll document both. + .. code-block:: python import bonobo @@ -121,6 +211,16 @@ positional parameters as you want. 
graph = bonobo.Graph() graph.add_chain(a, b, c) +Or using the new syntax: + +.. code-block:: python + + import bonobo + + graph = bonobo.Graph() + graph >> a >> b >> c + + Resulting graph: .. graphviz:: @@ -149,6 +249,16 @@ To create two or more divergent data streams ("forks"), you should specify the ` graph.add_chain(a, b, c) graph.add_chain(f, g, _input=b) +Or using the new syntax: + +.. code-block:: python + + import bonobo + + graph = bonobo.Graph() + graph >> a >> b >> c + graph.get_cursor(b) >> f >> g + Resulting graph: @@ -184,6 +294,21 @@ To merge two data streams, you can use the `_output` kwarg to `add_chain`, or us graph.add_chain(a, b, _output=normalize) graph.add_chain(f, g, _output=normalize) +Or using the new syntax: + +.. code-block:: python + + import bonobo + + graph = bonobo.Graph() + + # Here we set _input to None, so normalize won't start on its own but only after it receives input from the other chains. + graph.get_cursor(None) >> normalize >> store + + # Add two different chains + graph >> a >> b >> normalize + graph >> f >> g >> normalize + Resulting graph: @@ -230,6 +355,9 @@ Please note that naming a chain is exactly the same thing as naming the first no graph.add_chain(a, b, _output="load") graph.add_chain(f, g, _output="load") +Using the new syntax, there should not be a need to name nodes. Let us know if you think otherwise by creating an issue. + + Resulting graph: .. graphviz:: @@ -283,6 +411,11 @@ You may want to connect two nodes at some point. You can use `add_chain` without # Connect them graph.add_chain(_input=a, _output=b) +Or using the new syntax: + +.. code-block:: python + + graph.get_cursor(a) >> b Inspecting graphs diff --git a/docs/reference/api/bonobo.rst b/docs/reference/api/bonobo.rst index 4a02c60..295823f 100644 --- a/docs/reference/api/bonobo.rst +++ b/docs/reference/api/bonobo.rst @@ -5,6 +5,120 @@ :Module: :mod:`bonobo` + .. 
automodule:: bonobo + :no-members: + + + +Graphs +:::::: + +* :class:`bonobo.structs.graphs.Graph` + + +Nodes +::::: + +* :class:`bonobo.nodes.CsvReader` +* :class:`bonobo.nodes.CsvWriter` +* :class:`bonobo.nodes.FileReader` +* :class:`bonobo.nodes.FileWriter` +* :class:`bonobo.nodes.Filter` +* :class:`bonobo.nodes.FixedWindow` +* :func:`bonobo.nodes.Format` +* :class:`bonobo.nodes.JsonReader` +* :class:`bonobo.nodes.JsonWriter` +* :class:`bonobo.nodes.LdjsonReader` +* :class:`bonobo.nodes.LdjsonWriter` +* :class:`bonobo.nodes.Limit` +* :func:`bonobo.nodes.MapFields` +* :func:`bonobo.nodes.OrderFields` +* :class:`bonobo.nodes.PickleReader` +* :class:`bonobo.nodes.PickleWriter` +* :class:`bonobo.nodes.PrettyPrinter` +* :class:`bonobo.nodes.RateLimited` +* :func:`bonobo.nodes.Rename` +* :func:`bonobo.nodes.SetFields` +* :func:`bonobo.nodes.Tee` +* :func:`bonobo.nodes.UnpackItems` +* :func:`bonobo.nodes.count` +* :func:`bonobo.nodes.identity` +* :func:`bonobo.nodes.noop` + + +Other top-level APIs +:::::::::::::::::::: + +* :func:`bonobo.create_reader` +* :func:`bonobo.create_strategy` +* :func:`bonobo.create_writer` +* :func:`bonobo.get_argument_parser` +* :func:`bonobo.get_examples_path` +* :func:`bonobo.inspect` +* :func:`bonobo.open_examples_fs` +* :func:`bonobo.open_fs` +* :func:`bonobo.parse_args` +* :func:`bonobo.run` + + +create_reader +------------- + +.. autofunction:: bonobo.create_reader + + +create_strategy +--------------- + +.. autofunction:: bonobo.create_strategy + + +create_writer +------------- + +.. autofunction:: bonobo.create_writer + + +get_argument_parser +------------------- + +.. autofunction:: bonobo.get_argument_parser + + +get_examples_path +----------------- + +.. autofunction:: bonobo.get_examples_path + + +inspect +------- + +.. autofunction:: bonobo.inspect + + +open_examples_fs +---------------- + +.. autofunction:: bonobo.open_examples_fs + + +open_fs +------- + +.. autofunction:: bonobo.open_fs + + +parse_args +---------- + +.. 
autofunction:: bonobo.parse_args + + +run +--- + +.. autofunction:: bonobo.run \ No newline at end of file diff --git a/docs/reference/api/bonobo/config.rst b/docs/reference/api/bonobo/config.rst index f2432df..83932a4 100644 --- a/docs/reference/api/bonobo/config.rst +++ b/docs/reference/api/bonobo/config.rst @@ -5,6 +5,9 @@ :Module: :mod:`bonobo.config` + .. automodule:: bonobo.config + + \ No newline at end of file diff --git a/docs/reference/api/bonobo/constants.rst b/docs/reference/api/bonobo/constants.rst index 5e6deda..540591a 100644 --- a/docs/reference/api/bonobo/constants.rst +++ b/docs/reference/api/bonobo/constants.rst @@ -5,7 +5,10 @@ :Module: :mod:`bonobo.constants` + .. automodule:: bonobo.constants :no-members: + + \ No newline at end of file diff --git a/docs/reference/api/bonobo/execution.rst b/docs/reference/api/bonobo/execution.rst index 28640a4..e532c98 100644 --- a/docs/reference/api/bonobo/execution.rst +++ b/docs/reference/api/bonobo/execution.rst @@ -5,15 +5,9 @@ :Module: :mod:`bonobo.execution` -.. toctree:: - :caption: Submodules - :maxdepth: 1 - - execution/contexts - execution/events - execution/strategies .. automodule:: bonobo.execution + \ No newline at end of file diff --git a/docs/reference/api/bonobo/execution/contexts.rst b/docs/reference/api/bonobo/execution/contexts.rst index d230b69..97592a3 100644 --- a/docs/reference/api/bonobo/execution/contexts.rst +++ b/docs/reference/api/bonobo/execution/contexts.rst @@ -5,6 +5,9 @@ :Module: :mod:`bonobo.execution.contexts` + .. automodule:: bonobo.execution.contexts + + \ No newline at end of file diff --git a/docs/reference/api/bonobo/execution/events.rst b/docs/reference/api/bonobo/execution/events.rst index 93087c9..8cda03e 100644 --- a/docs/reference/api/bonobo/execution/events.rst +++ b/docs/reference/api/bonobo/execution/events.rst @@ -5,6 +5,9 @@ :Module: :mod:`bonobo.execution.events` + .. 
automodule:: bonobo.execution.events + + \ No newline at end of file diff --git a/docs/reference/api/bonobo/execution/strategies.rst b/docs/reference/api/bonobo/execution/strategies.rst index 8ca0068..af27988 100644 --- a/docs/reference/api/bonobo/execution/strategies.rst +++ b/docs/reference/api/bonobo/execution/strategies.rst @@ -5,6 +5,9 @@ :Module: :mod:`bonobo.execution.strategies` + .. automodule:: bonobo.execution.strategies + + \ No newline at end of file diff --git a/docs/reference/api/bonobo/nodes.rst b/docs/reference/api/bonobo/nodes.rst new file mode 100644 index 0000000..0805068 --- /dev/null +++ b/docs/reference/api/bonobo/nodes.rst @@ -0,0 +1,13 @@ +:mod:`Nodes ` +=========================== + +.. currentmodule:: bonobo.nodes + +:Module: :mod:`bonobo.nodes` + + +.. automodule:: bonobo.nodes + + + + \ No newline at end of file diff --git a/docs/reference/api/bonobo/structs/graphs.rst b/docs/reference/api/bonobo/structs/graphs.rst new file mode 100644 index 0000000..b7cef69 --- /dev/null +++ b/docs/reference/api/bonobo/structs/graphs.rst @@ -0,0 +1,13 @@ +:mod:`Graphs ` +===================================== + +.. currentmodule:: bonobo.structs.graphs + +:Module: :mod:`bonobo.structs.graphs` + + +.. automodule:: bonobo.structs.graphs + + + + \ No newline at end of file diff --git a/docs/reference/api/bonobo/util.rst b/docs/reference/api/bonobo/util.rst index 2be9c1a..1fd14c5 100644 --- a/docs/reference/api/bonobo/util.rst +++ b/docs/reference/api/bonobo/util.rst @@ -5,6 +5,9 @@ :Module: :mod:`bonobo.util` + .. automodule:: bonobo.util + + \ No newline at end of file diff --git a/docs/reference/examples.rst b/docs/reference/examples.rst index 4edc910..3df3531 100644 --- a/docs/reference/examples.rst +++ b/docs/reference/examples.rst @@ -18,7 +18,6 @@ or - .. 
toctree:: :maxdepth: 4 diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 6f18ee5..031f69b 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -11,8 +11,9 @@ means that the api is not yet 1.0-proof. api/bonobo/config api/bonobo/constants api/bonobo/execution + api/bonobo/nodes + api/bonobo/structs/graphs api/bonobo/util commands settings examples - private diff --git a/docs/tutorial/0.5/_outdated_note.rst b/docs/tutorial/0.5/_outdated_note.rst deleted file mode 100644 index 9aeae82..0000000 --- a/docs/tutorial/0.5/_outdated_note.rst +++ /dev/null @@ -1,9 +0,0 @@ -.. warning:: - - This tutorial was written for |bonobo| 0.5, while the current stable version is |bonobo| 0.6. - - Please be aware that some things changed. - - A summary of changes is available in the `migration guide from 0.5 to 0.6 `_. - - diff --git a/docs/tutorial/0.5/index.rst b/docs/tutorial/0.5/index.rst deleted file mode 100644 index 8bac110..0000000 --- a/docs/tutorial/0.5/index.rst +++ /dev/null @@ -1,65 +0,0 @@ -First steps -=========== - -.. include:: _outdated_note.rst - -What is Bonobo? -::::::::::::::: - -Bonobo is an ETL (Extract-Transform-Load) framework for python 3.5. The goal is to define data-transformations, with -python code in charge of handling similar shaped independent lines of data. - -Bonobo *is not* a statistical or data-science tool. If you're looking for a data-analysis tool in python, use Pandas. - -Bonobo is a lean manufacturing assembly line for data that let you focus on the actual work instead of the plumbery -(execution contexts, parallelism, error handling, console output, logging, ...). - -Bonobo uses simple python and should be quick and easy to learn. - -Tutorial -:::::::: - -.. note:: - - Good documentation is not easy to write. We do our best to make it better and better. - - Although all content here should be accurate, you may feel a lack of completeness, for which we plead guilty and - apologize. 
- - If you're stuck, please come and ask on our `slack channel `_, we'll figure - something out. - - If you're not stuck but had trouble understanding something, please consider contributing to the docs (via GitHub - pull requests). - -.. toctree:: - :maxdepth: 2 - - tut01 - tut02 - tut03 - tut04 - - -What's next? -:::::::::::: - -Read a few examples -------------------- - -* :doc:`/reference/examples` - -Read about best development practices -------------------------------------- - -* :doc:`/guide/index` -* :doc:`/guide/purity` - -Read about integrating external tools with bonobo -------------------------------------------------- - -* :doc:`/extension/docker`: run transformation graphs in isolated containers. -* :doc:`/extension/jupyter`: run transformations within jupyter notebooks. -* :doc:`/extension/selenium`: crawl the web using a real browser and work with the gathered data. -* :doc:`/extension/sqlalchemy`: everything you need to interact with SQL databases. - diff --git a/docs/tutorial/0.5/python.rst b/docs/tutorial/0.5/python.rst deleted file mode 100644 index a1b5a57..0000000 --- a/docs/tutorial/0.5/python.rst +++ /dev/null @@ -1,13 +0,0 @@ -Just enough Python for Bonobo -============================= - -.. include:: _outdated_note.rst - -.. todo:: - - This is a work in progress and it is not yet available. Please come back later or even better, help us write this - guide! - - This guide is intended to help programmers or enthusiasts to grasp the python basics necessary to use Bonobo. It - should definitely not be considered as a general python introduction, nor a deep dive into details. - diff --git a/docs/tutorial/0.5/tut01.rst b/docs/tutorial/0.5/tut01.rst deleted file mode 100644 index df26a33..0000000 --- a/docs/tutorial/0.5/tut01.rst +++ /dev/null @@ -1,202 +0,0 @@ -Let's get started! -================== - -..
include:: _outdated_note.rst - -To begin with Bonobo, you need to install it in a working python 3.5+ environment, and you'll also need cookiecutter -to bootstrap your project. - -.. code-block:: shell-session - - $ pip install bonobo cookiecutter - -See :doc:`/install` for more options. - - -Create an empty project -::::::::::::::::::::::: - -Your ETL code will live in ETL projects, which are basically a bunch of files, including python code, that bonobo -can run. - -.. code-block:: shell-session - - $ bonobo init tutorial - -This will create a `tutorial` directory (`content description here `_). - -To run this project, use: - -.. code-block:: shell-session - - $ bonobo run tutorial - - -Write a first transformation -:::::::::::::::::::::::::::: - -Open `tutorial/main.py`, and delete all the code here. - -A transformation can be whatever python can call. Simplest transformations are functions and generators. - -Let's write one: - -.. code-block:: python - - def transform(x): - return x.upper() - -Easy. - -.. note:: - - This function is very similar to :func:`str.upper`, which you can use directly. - -Let's write two more transformations for the "extract" and "load" steps. In this example, we'll generate the data from -scratch, and we'll use stdout to "simulate" data-persistence. - -.. code-block:: python - - def extract(): - yield 'foo' - yield 'bar' - yield 'baz' - - def load(x): - print(x) - -Bonobo makes no difference between generators (yielding functions) and regular functions. It will, in all cases, iterate -on things returned, and a normal function will just be seen as a generator that yields only once. - -.. note:: - - Once again, you should use the builtin :func:`print` directly instead of this `load()` function. 
- - -Create a transformation graph -::::::::::::::::::::::::::::: - -Amongst other features, Bonobo will mostly help you there with the following: - -* Execute the transformations in independent threads -* Pass the outputs of one thread to other(s) thread(s) inputs. - -To do this, it needs to know what data-flow you want to achieve, and you'll use a :class:`bonobo.Graph` to describe it. - -.. code-block:: python - - import bonobo - - graph = bonobo.Graph(extract, transform, load) - - if __name__ == '__main__': - bonobo.run(graph) - -.. graphviz:: - - digraph { - rankdir = LR; - stylesheet = "../_static/graphs.css"; - - BEGIN [shape="point"]; - BEGIN -> "extract" -> "transform" -> "load"; - } - -.. note:: - - The `if __name__ == '__main__':` section is not required, unless you want to run it directly using the python - interpreter. - - -Execute the job -::::::::::::::: - -Save `tutorial/main.py` and execute your transformation again: - -.. code-block:: shell-session - - $ bonobo run tutorial - -This example is available in :mod:`bonobo.examples.tutorials.tut01e01`, and you can also run it as a module: - -.. code-block:: shell-session - - $ bonobo run -m bonobo.examples.tutorials.tut01e01 - - -Rewrite it using builtins -::::::::::::::::::::::::: - -There is a much simpler way to describe an equivalent graph: - -.. literalinclude:: ../../bonobo/examples/tutorials/tut01e02.py - :language: python - -The `extract()` generator has been replaced by a list, as Bonobo will interpret non-callable iterables as a no-input -generator. - -This example is also available in :mod:`bonobo.examples.tutorials.tut01e02`, and you can also run it as a module: - -.. code-block:: shell-session - - $ bonobo run -m bonobo.examples.tutorials.tut01e02 - -You can now jump to the next part (:doc:`tut02`), or read a small summary of concepts and definitions introduced here -below. - -Takeaways -::::::::: - -① The :class:`bonobo.Graph` class is used to represent a data-processing pipeline. 
- -It can represent simple list-like linear graphs, like here, but it can also represent much more complex graphs, with -forks and joins. - -This is what the graph we defined looks like: - -.. graphviz:: - - digraph { - rankdir = LR; - BEGIN [shape="point"]; - BEGIN -> "iter(['foo', 'bar', 'baz'])" -> "str.upper" -> "print"; - } - - -② `Transformations` are simple python callables. Whatever can be called can be used as a `transformation`. Callables can -either `return` or `yield` data to send it to the next step. Regular functions (using `return`) should be prefered if -each call is guaranteed to return exactly one result, while generators (using `yield`) should be prefered if the -number of output lines for a given input varies. - -③ The `Graph` instance, or `transformation graph` is executed using an `ExecutionStrategy`. You won't use it directly, -but :func:`bonobo.run` created an instance of :class:`bonobo.ThreadPoolExecutorStrategy` under the hood (the default -strategy). Actual behavior of an execution will depend on the strategy chosen, but the default should be fine for most -cases. - -④ Before actually executing the `transformations`, the `ExecutorStrategy` instance will wrap each component in an -`execution context`, whose responsibility is to hold the state of the transformation. It enables you to keep the -`transformations` stateless, while allowing you to add an external state if required. We'll expand on this later. - -Concepts and definitions -:::::::::::::::::::::::: - -* **Transformation**: a callable that takes input (as call parameters) and returns output(s), either as its return value or - by yielding values (a.k.a returning a generator). - -* **Transformation graph (or Graph)**: a set of transformations tied together in a :class:`bonobo.Graph` instance, which is - a directed acyclic graph (or DAG). - -* **Node**: a graph element, most probably a transformation in a graph. 
- -* **Execution strategy (or strategy)**: a way to run a transformation graph. Its responsibility is mainly to parallelize - (or not) the transformations, on one or more processes and/or computers, and to set up the right queuing mechanism for - transformations' inputs and outputs. - -* **Execution context (or context)**: a wrapper around a node that holds the state for it. If the node needs state, there - are tools available in bonobo to feed it to the transformation using additional call parameters, keeping - transformations stateless. - -Next -:::: - -Time to jump to the second part: :doc:`tut02`. diff --git a/docs/tutorial/0.5/tut02.rst b/docs/tutorial/0.5/tut02.rst deleted file mode 100644 index 3617005..0000000 --- a/docs/tutorial/0.5/tut02.rst +++ /dev/null @@ -1,123 +0,0 @@ -Working with files -================== - -.. include:: _outdated_note.rst - -Bonobo would be pointless if the aim was just to uppercase small lists of strings. - -In fact, Bonobo should not be used if you don't expect any gain from parallelization/distribution of tasks. - -Some background... -:::::::::::::::::: - -Let's take the following graph: - -.. graphviz:: - - digraph { - rankdir = LR; - BEGIN [shape="point"]; - BEGIN -> "A" -> "B" -> "C"; - "B" -> "D"; - } - -When run, the execution strategy wraps every component in a thread (assuming you're using the default -:class:`bonobo.strategies.ThreadPoolExecutorStrategy`). - -Bonobo will send each line of data in the input node's thread (here, `A`). Now, each time `A` *yields* or *returns* -something, it will be pushed on `B` input :class:`queue.Queue`, and will be consumed by `B`'s thread. Meanwhile, `A` -will continue to run, if it's not done. - -When there is more than one node linked as the output of a node (for example, with `B`, `C`, and `D`), the same thing -happens except that each result coming out of `B` will be sent to both `C` and `D` input :class:`queue.Queue`.
- -One thing to keep in mind here is that as the objects are passed from thread to thread, you need to write "pure" -transformations (see :doc:`/guide/purity`). - -You generally don't have to think about it. Just be aware that your nodes will run in parallel, and don't worry -too much about nodes running blocking operations, as they will run in parallel. As soon as a line of output is ready, -the next nodes will start consuming it. - -That being said, let's manipulate some files. - -Reading a file -:::::::::::::: - -There are a few component builders available in **Bonobo** that let you read from (or write to) files. - -All readers work the same way. They need a filesystem to work with, and open a "path" they will read from. - -* :class:`bonobo.CsvReader` -* :class:`bonobo.FileReader` -* :class:`bonobo.JsonReader` -* :class:`bonobo.PickleReader` - -We'll use a text file that was generated using Bonobo from the "liste-des-cafes-a-un-euro" dataset made available by -Mairie de Paris under the Open Database License (ODbL). You can `explore the original dataset -`_. - -You'll need the `"coffeeshops.txt" example dataset `_, -available in **Bonobo**'s repository: - -.. code-block:: shell-session - - $ curl https://raw.githubusercontent.com/python-bonobo/bonobo/master/bonobo/examples/datasets/coffeeshops.txt > `python3 -c 'import bonobo; print(bonobo.get_examples_path("datasets/coffeeshops.txt"))'` - -.. note:: - - The "example dataset download" step will be easier in the future. - - https://github.com/python-bonobo/bonobo/issues/134 - -.. literalinclude:: ../../bonobo/examples/tutorials/tut02e01_read.py - :language: python - -You can also run this example as a module (but you'll still need the dataset...): - -.. code-block:: shell-session - - $ bonobo run -m bonobo.examples.tutorials.tut02e01_read - -.. note:: - - Don't focus too much on the `get_services()` function for now. It is required, with this exact name, but we'll get - into that in a few minutes. 
- -Writing to files -:::::::::::::::: - -Let's split each line of this file on the first comma and store a json file mapping coffee names to their addresses. - -Here are, like the readers, the classes available to write files - -* :class:`bonobo.CsvWriter` -* :class:`bonobo.FileWriter` -* :class:`bonobo.JsonWriter` -* :class:`bonobo.PickleWriter` - -Let's write a first implementation: - -.. literalinclude:: ../../bonobo/examples/tutorials/tut02e02_write.py - :language: python - -(run it with :code:`bonobo run -m bonobo.examples.tutorials.tut02e02_write` or :code:`bonobo run myfile.py`) - -If you read the output file, you'll see it misses the "map" part of the problem. - -Let's extend :class:`bonobo.io.JsonWriter` to finish the job: - -.. literalinclude:: ../../bonobo/examples/tutorials/tut02e03_writeasmap.py - :language: python - -(run it with :code:`bonobo run -m bonobo.examples.tutorials.tut02e03_writeasmap` or :code:`bonobo run myfile.py`) - -It should produce a nice map. - -We favored a slightly hackish solution here instead of constructing a map in python then passing the whole to -:func:`json.dumps` because we want to work with streams; if you have to construct the whole data structure in python, -you'll lose a lot of bonobo's benefits. - -Next -:::: - -Time to write some more advanced transformations, with service dependencies: :doc:`tut03`. diff --git a/docs/tutorial/0.5/tut03.rst b/docs/tutorial/0.5/tut03.rst deleted file mode 100644 index 47bbde8..0000000 --- a/docs/tutorial/0.5/tut03.rst +++ /dev/null @@ -1,202 +0,0 @@ -Configurables and Services -========================== - -.. include:: _outdated_note.rst - -.. note:: - - This section lacks completeness, sorry for that (but you can still read it!). - -In the last section, we used a few new tools. - -Class-based transformations and configurables -::::::::::::::::::::::::::::::::::::::::::::: - -Bonobo is a bit dumb.
If something is callable, it considers it can be used as a transformation, and it's up to the -user to provide callables that logically fits in a graph. - -You can use plain python objects with a `__call__()` method, and it will just work. - -As a lot of transformations needs common machinery, there is a few tools to quickly build transformations, most of -them requiring your class to subclass :class:`bonobo.config.Configurable`. - -Configurables allows to use the following features: - -* You can add **Options** (using the :class:`bonobo.config.Option` descriptor). Options can be positional, or keyword - based, can have a default value and will be consumed from the constructor arguments. - - .. code-block:: python - - from bonobo.config import Configurable, Option - - class PrefixIt(Configurable): - prefix = Option(str, positional=True, default='>>>') - - def call(self, row): - return self.prefix + ' ' + row - - prefixer = PrefixIt('$') - -* You can add **Services** (using the :class:`bonobo.config.Service` descriptor). Services are a subclass of - :class:`bonobo.config.Option`, sharing the same basics, but specialized in the definition of "named services" that - will be resolved at runtime (a.k.a for which we will provide an implementation at runtime). We'll dive more into that - in the next section - - .. code-block:: python - - from bonobo.config import Configurable, Option, Service - - class HttpGet(Configurable): - url = Option(default='https://jsonplaceholder.typicode.com/users') - http = Service('http.client') - - def call(self, http): - resp = http.get(self.url) - - for row in resp.json(): - yield row - - http_get = HttpGet() - - -* You can add **Methods** (using the :class:`bonobo.config.Method` descriptor). :class:`bonobo.config.Method` is a - subclass of :class:`bonobo.config.Option` that allows to pass callable parameters, either to the class constructor, - or using the class as a decorator. - - .. 
code-block:: python - - from bonobo.config import Configurable, Method - - class Applier(Configurable): - apply = Method() - - def call(self, row): - return self.apply(row) - - @Applier - def Prefixer(self, row): - return 'Hello, ' + row - - prefixer = Prefixer() - -* You can add **ContextProcessors**, which are an advanced feature we won't introduce here. If you're familiar with - pytest, you can think of them as pytest fixtures, execution wise. - -Services -:::::::: - -The motivation behind services is mostly separation of concerns, testability and deployability. - -Usually, your transformations will depend on services (like a filesystem, an http client, a database, a rest api, ...). -Those services can very well be hardcoded in the transformations, but there is two main drawbacks: - -* You won't be able to change the implementation depending on the current environment (development laptop versus - production servers, bug-hunting session versus execution, etc.) -* You won't be able to test your transformations without testing the associated services. - -To overcome those caveats of hardcoding things, we define Services in the configurable, which are basically -string-options of the service names, and we provide an implementation at the last moment possible. - -There are two ways of providing implementations: - -* Either file-wide, by providing a `get_services()` function that returns a dict of named implementations (we did so - with filesystems in the previous step, :doc:`tut02`) -* Either directory-wide, by providing a `get_services()` function in a specially named `_services.py` file. - -The first is simpler if you only have one transformation graph in one file, the second allows to group coherent -transformations together in a directory and share the implementations. - -Let's see how to use it, starting from the previous service example: - -.. 
code-block:: python - - from bonobo.config import Configurable, Option, Service - - class HttpGet(Configurable): - url = Option(default='https://jsonplaceholder.typicode.com/users') - http = Service('http.client') - - def call(self, http): - resp = http.get(self.url) - - for row in resp.json(): - yield row - -We defined an "http.client" service, that obviously should have a `get()` method, returning responses that have a -`json()` method. - -Let's provide two implementations for that. The first one will be using `requests `_, -that coincidally satisfies the described interface: - -.. code-block:: python - - import bonobo - import requests - - def get_services(): - return { - 'http.client': requests - } - - graph = bonobo.Graph( - HttpGet(), - print, - ) - -If you run this code, you should see some mock data returned by the webservice we called (assuming it's up and you can -reach it). - -Now, the second implementation will replace that with a mock, used for testing purposes: - -.. code-block:: python - - class HttpResponseStub: - def json(self): - return [ - {'id': 1, 'name': 'Leanne Graham', 'username': 'Bret', 'email': 'Sincere@april.biz', 'address': {'street': 'Kulas Light', 'suite': 'Apt. 
556', 'city': 'Gwenborough', 'zipcode': '92998-3874', 'geo': {'lat': '-37.3159', 'lng': '81.1496'}}, 'phone': '1-770-736-8031 x56442', 'website': 'hildegard.org', 'company': {'name': 'Romaguera-Crona', 'catchPhrase': 'Multi-layered client-server neural-net', 'bs': 'harness real-time e-markets'}}, - {'id': 2, 'name': 'Ervin Howell', 'username': 'Antonette', 'email': 'Shanna@melissa.tv', 'address': {'street': 'Victor Plains', 'suite': 'Suite 879', 'city': 'Wisokyburgh', 'zipcode': '90566-7771', 'geo': {'lat': '-43.9509', 'lng': '-34.4618'}}, 'phone': '010-692-6593 x09125', 'website': 'anastasia.net', 'company': {'name': 'Deckow-Crist', 'catchPhrase': 'Proactive didactic contingency', 'bs': 'synergize scalable supply-chains'}}, - ] - - class HttpStub: - def get(self, url): - return HttpResponseStub() - - def get_services(): - return { - 'http.client': HttpStub() - } - - graph = bonobo.Graph( - HttpGet(), - print, - ) - -The `Graph` definition staying the exact same, you can easily substitute the `_services.py` file depending on your -environment (the way you're doing this is out of bonobo scope and heavily depends on your usual way of managing -configuration files on different platforms). - -Starting with bonobo 0.5 (not yet released), you will be able to use service injections with function-based -transformations too, using the `bonobo.config.requires` decorator to mark a dependency. - -.. code-block:: python - - from bonobo.config import requires - - @requires('http.client') - def http_get(http): - resp = http.get('https://jsonplaceholder.typicode.com/users') - - for row in resp.json(): - yield row - - -Read more -::::::::: - -* :doc:`/guide/services` -* :doc:`/reference/api_config` - -Next -:::: - -:doc:`tut04`. diff --git a/docs/tutorial/0.5/tut04.rst b/docs/tutorial/0.5/tut04.rst deleted file mode 100644 index 2ad6557..0000000 --- a/docs/tutorial/0.5/tut04.rst +++ /dev/null @@ -1,216 +0,0 @@ -Working with databases -====================== - -.. 
include:: _outdated_note.rst - -Databases (and especially SQL databases here) are not the focus of Bonobo, thus support for them is not (and will never -be) included in the main package. Instead, working with databases is done using third party, well maintained and -specialized packages, like SQLAlchemy, or other database access libraries from the python cheese shop. - -.. note:: - - SQLAlchemy extension is not yet complete. Things may not be optimal, and some APIs will change. You can still try, - of course. - - Consider the following document as a "preview" (yes, it should work, yes it may break in the future). - - Also, note that for early development stages, we explicitly support only PostgreSQL, although it may work well - with `any other database supported by SQLAlchemy `_. - -First, read https://www.bonobo-project.org/with/sqlalchemy for instructions on how to install. You **do need** the -bleeding edge version of `bonobo` and `bonobo-sqlalchemy` to make this work. - -Requirements -:::::::::::: - -Once you have installed `bonobo_sqlalchemy` (read https://www.bonobo-project.org/with/sqlalchemy to use bleeding edge -version), install the following additional packages: - -.. code-block:: shell-session - - $ pip install -U python-dotenv psycopg2 awesome-slugify - -Those packages are not required by the extension, but `python-dotenv` will help us configure the database DSN, and -`psycopg2` is required by SQLAlchemy to connect to PostgreSQL databases. Also, we'll use a slugifier to create unique -identifiers for the database (maybe not what you'd do in the real world, but very much sufficient for example purposes). - -Configure a database engine -::::::::::::::::::::::::::: - -Open your `_services.py` file and replace the code: - -..
code-block:: python - - import bonobo, dotenv, logging, os - from bonobo_sqlalchemy.util import create_postgresql_engine - - dotenv.load_dotenv(dotenv.find_dotenv()) - logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO) - - def get_services(): - return { - 'fs': bonobo.open_examples_fs('datasets'), - 'fs.output': bonobo.open_fs(), - 'sqlalchemy.engine': create_postgresql_engine(**{ - 'name': 'tutorial', - 'user': 'tutorial', - 'pass': 'tutorial', - }) - } - -The `create_postgresql_engine` is a tiny function building the DSN from reasonable defaults, that you can override -either by providing kwargs, or with system environment variables. If you want to override something, open the `.env` -file and add values for one or more of `POSTGRES_NAME`, `POSTGRES_USER`, 'POSTGRES_PASS`, `POSTGRES_HOST`, -`POSTGRES_PORT`. Please note that kwargs always have precedence on environment, but that you should prefer using -environment variables for anything that is not immutable from one platform to another. - -Add database operation to the graph -::::::::::::::::::::::::::::::::::: - -Let's create a `tutorial/pgdb.py` job: - -.. code-block:: python - - import bonobo - import bonobo_sqlalchemy - - from bonobo.examples.tutorials.tut02e03_writeasmap import graph, split_one_to_map - - graph = graph.copy() - graph.add_chain( - bonobo_sqlalchemy.InsertOrUpdate('coffeeshops'), - _input=split_one_to_map - ) - -Notes here: - -* We use the code from :doc:`tut02`, which is bundled with bonobo in the `bonobo.examples.tutorials` package. -* We "fork" the graph, by creating a copy and appending a new "chain", starting at a point that exists in the other - graph. -* We use :class:`bonobo_sqlalchemy.InsertOrUpdate` (which role, in case it is not obvious, is to create database rows if - they do not exist yet, or update the existing row, based on a "discriminant" criteria (by default, "id")). 
- -If we run this transformation (with `bonobo run tutorial/pgdb.py`), we should get an error: - -.. code-block:: text - - | File ".../lib/python3.6/site-packages/psycopg2/__init__.py", line 130, in connect - | conn = _connect(dsn, connection_factory=connection_factory, **kwasync) - | sqlalchemy.exc.OperationalError: (psycopg2.OperationalError) FATAL: database "tutorial" does not exist - | - | - | The above exception was the direct cause of the following exception: - | - | Traceback (most recent call last): - | File ".../bonobo-devkit/bonobo/bonobo/strategies/executor.py", line 45, in _runner - | node_context.start() - | File ".../bonobo-devkit/bonobo/bonobo/execution/base.py", line 75, in start - | self._stack.setup(self) - | File ".../bonobo-devkit/bonobo/bonobo/config/processors.py", line 94, in setup - | _append_to_context = next(_processed) - | File ".../bonobo-devkit/bonobo-sqlalchemy/bonobo_sqlalchemy/writers.py", line 43, in create_connection - | raise UnrecoverableError('Could not create SQLAlchemy connection: {}.'.format(str(exc).replace('\n', ''))) from exc - | bonobo.errors.UnrecoverableError: Could not create SQLAlchemy connection: (psycopg2.OperationalError) FATAL: database "tutorial" does not exist. - -The database we requested do not exist. It is not the role of bonobo to do database administration, and thus there is -no tool here to create neither the database, nor the tables we want to use. - -Create database and table -::::::::::::::::::::::::: - -There are however tools in `sqlalchemy` to manage tables, so we'll create the database by ourselves, and ask sqlalchemy -to create the table: - -.. code-block:: shell-session - - $ psql -U postgres -h localhost - - psql (9.6.1, server 9.6.3) - Type "help" for help. 
- - postgres=# CREATE ROLE tutorial WITH LOGIN PASSWORD 'tutorial'; - CREATE ROLE - postgres=# CREATE DATABASE tutorial WITH OWNER=tutorial TEMPLATE=template0 ENCODING='utf-8'; - CREATE DATABASE - -Now, let's use a little trick and add this section to `pgdb.py`: - -.. code-block:: python - - import sys - from sqlalchemy import Table, Column, String, Integer, MetaData - - def main(): - from bonobo.commands.run import get_default_services - services = get_default_services(__file__) - if len(sys.argv) == 1: - return bonobo.run(graph, services=services) - elif len(sys.argv) == 2 and sys.argv[1] == 'reset': - engine = services.get('sqlalchemy.engine') - metadata = MetaData() - - coffee_table = Table( - 'coffeeshops', - metadata, - Column('id', String(255), primary_key=True), - Column('name', String(255)), - Column('address', String(255)), - ) - - metadata.drop_all(engine) - metadata.create_all(engine) - else: - raise NotImplementedError('I do not understand.') - - if __name__ == '__main__': - main() - -.. note:: - - We're using private API of bonobo here, which is unsatisfactory, discouraged and may change. Some way to get the - service dictionnary will be added to the public api in a future release of bonobo. - -Now run: - -.. code-block:: python - - $ python tutorial/pgdb.py reset - -Database and table should now exist. - -Format the data -::::::::::::::: - -Let's prepare our data for database, and change the `.add_chain(..)` call to do it prior to `InsertOrUpdate(...)` - -.. code-block:: python - - from slugify import slugify_url - - def format_for_db(row): - name, address = list(row.items())[0] - return { - 'id': slugify_url(name), - 'name': name, - 'address': address, - } - - # ... - - graph = graph.copy() - graph.add_chain( - format_for_db, - bonobo_sqlalchemy.InsertOrUpdate('coffeeshops'), - _input=split_one_to_map - ) - -Run! 
-:::: - -You can now run the script (either with `bonobo run tutorial/pgdb.py` or directly with the python interpreter, as we -added a "main" section) and the dataset should be inserted in your database. If you run it again, no new rows are -created. - -Note that as we forked the graph from :doc:`tut02`, the transformation also writes the data to `coffeeshops.json`, as -before. -