work in progress ...

This commit is contained in:
Romain Dorgueil
2016-12-09 08:01:04 +01:00
parent 854ef4e2bf
commit 90d3b6235b
24 changed files with 822 additions and 85 deletions

26
.coveragerc Normal file
View File

@ -0,0 +1,26 @@
[run]
branch = True
[report]
# Regexes for lines to exclude from consideration
exclude_lines =
# Have to re-enable the standard pragma
pragma: no cover
# Don't complain about missing debug-only code:
def __repr__
if self\.debug
# Don't complain if tests don't hit defensive assertion code:
raise AbstractError
raise AssertionError
raise NotImplementedError
# Don't complain if non-runnable code isn't run:
if 0:
if __name__ == .__main__.:
ignore_errors = True
[html]
directory = docs/_build/html/coverage

117
.gitignore vendored
View File

@ -1,89 +1,38 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/
# Translations
*.egg
*.egg-info/
*.log
*.manifest
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# IPython Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# dotenv
*.py[cod]
*.so
*.spec
.Python
.cache
.coverage
.coverage.*
.eggs/
.env
# virtualenv
venv/
ENV/
# Spyder project settings
.spyderproject
# Rope project settings
.ropeproject
.hypothesis/
.installed.cfg
.ipynb_checkpoints
.python-version
.tox/
.webassets-cache
/.idea
/bonobo.iml
/build/
/coverage.xml
/develop-eggs/
/dist/
/docs/_build/
/downloads/
/eggs/
/htmlcov/
/sdist/
celerybeat-schedule
parts/
pip-delete-this-directory.txt
pip-log.txt

View File

@ -1 +0,0 @@
from __future__ import absolute_import, print_function, unicode_literals

0
bonobo/core/__init__.py Normal file
View File

50
bonobo/core/errors.py Normal file
View File

@ -0,0 +1,50 @@
# -*- coding: utf-8 -*-
#
# Copyright 2012-2014 Romain Dorgueil
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
class AbstractError(NotImplementedError):
    """Error raised by a method whose implementation is left to subclasses.

    The message names the class and the method that were called, e.g.
    ``Call to abstract method Foo.bar(...): missing implementation.``
    """

    def __init__(self, method):
        """Build the error message from a bound method.

        :param method: bound method (instance method or classmethod) that was
            called without being implemented.
        """
        owner = method.__self__
        # For a method bound to an instance, ``__self__`` is the instance, and
        # instances have no ``__name__``: resolve the class first. A
        # classmethod is already bound to the class itself.
        owner_type = owner if isinstance(owner, type) else type(owner)
        super().__init__(
            'Call to abstract method {class_name}.{method_name}(...): missing implementation.'.format(
                class_name=owner_type.__name__,
                method_name=method.__name__,
            )
        )
class InactiveIOError(IOError):
    """Base error for I/O attempted on an inactive channel."""


class InactiveReadableError(InactiveIOError):
    """Inactive-I/O error for the reading side (``get()``)."""


class InactiveWritableError(InactiveIOError):
    """Inactive-I/O error for the writing side (``put()``)."""
class ValidationError(RuntimeError):
    """Runtime error reporting a validation failure on a given object."""

    def __init__(self, inst, message):
        """Prefix ``message`` with the class name of ``inst``.

        :param inst: object whose validation failed; only its type name is used.
        :param message: description of the failure.
        """
        owner_name = type(inst).__name__
        text = 'Validation error in {class_name}: {message}'.format(
            class_name=owner_name,
            message=message,
        )
        super().__init__(text)
class ProhibitedOperationError(RuntimeError):
    """Runtime error type for operations that are not allowed."""

27
bonobo/core/graph.py Normal file
View File

@ -0,0 +1,27 @@
from bonobo.core.tokens import BEGIN
class Graph:
    """
    Represents a coherent directed acyclic graph (DAG) of components.

    Components are kept in a list and referred to by their index in it. Edges
    are stored as a mapping from a source (``BEGIN`` or a component index) to
    the set of component indexes it feeds.
    """

    def __init__(self):
        self.graph = {BEGIN: set()}
        self.components = []

    def outputs_of(self, idx, create=False):
        """Return the set of outputs registered for ``idx``.

        :param idx: ``BEGIN`` or a component index.
        :param create: when true, register an empty set for an unknown ``idx``.
        :raises KeyError: if ``idx`` is unknown and ``create`` is false.
        """
        if create:
            return self.graph.setdefault(idx, set())
        return self.graph[idx]

    def add_component(self, c):
        """Append ``c`` to the components and return its index."""
        position = len(self.components)
        self.components.append(c)
        return position

    def add_chain(self, *components, input=BEGIN):
        """Add ``components`` and link them one after the other.

        The first component is linked as an output of ``input``; every
        following one as an output of the component just before it.
        """
        source = input
        for component in components:
            target = self.add_component(component)
            self.outputs_of(source, create=True).add(target)
            source = target

93
bonobo/core/io.py Normal file
View File

@ -0,0 +1,93 @@
# -*- coding: utf-8 -*-
#
# Copyright 2012-2014 Romain Dorgueil
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from abc import ABCMeta, abstractmethod
from queue import Queue
from bonobo.core.errors import AbstractError, InactiveWritableError, InactiveReadableError
from bonobo.core.tokens import BEGIN, END
BUFFER_SIZE = 8192
class Readable(metaclass=ABCMeta):
    """Contract for objects that data can be read from."""

    @abstractmethod
    def get(self, block=True, timeout=None):
        """Read one item.

        ``block`` and ``timeout`` are only there to stay signature-compatible
        with ``Queue.get``.
        """
        unimplemented = self.get
        raise AbstractError(unimplemented)
class Writable(metaclass=ABCMeta):
    """Contract for objects that data can be written to."""

    @abstractmethod
    def put(self, data, block=True, timeout=None):
        """Write one item.

        ``block`` and ``timeout`` are only there to stay signature-compatible
        with ``Queue.put``.
        """
        unimplemented = self.put
        raise AbstractError(unimplemented)
class Input(Queue, Readable, Writable):
    """Queue whose lifecycle is driven by ``BEGIN`` / ``END`` tokens.

    Two counters are maintained:

    * ``_writable_runlevel`` goes up on each ``BEGIN`` passed to ``put()`` and
      down on each ``END`` passed to ``put()``; data can only be put while it
      is at least 1.
    * ``_runlevel`` goes up on each ``BEGIN`` passed to ``put()`` and down each
      time an ``END`` is taken out of the queue; the input is ``alive`` while
      it is greater than 0.

    ``BEGIN`` is never stored in the queue, ``END`` is stored but never
    returned to readers.
    """

    def __init__(self, maxsize=BUFFER_SIZE):
        Queue.__init__(self, maxsize)
        self._runlevel = 0
        self._writable_runlevel = 0

    def put(self, data, block=True, timeout=None):
        """Queue ``data``, or update the runlevels when ``data`` is a token.

        :raises InactiveWritableError: if no ``BEGIN`` is currently open.
        """
        # Tokens are sentinels, so they are compared by identity: ``==`` would
        # call the payload's own ``__eq__``, which may be costly or may not
        # even return a boolean.
        if data is BEGIN:
            # Begin token is a metadata to raise the input runlevel.
            self._runlevel += 1
            self._writable_runlevel += 1
            return

        # Check we are actually able to receive data.
        if self._writable_runlevel < 1:
            raise InactiveWritableError('Cannot put() on an inactive {}.'.format(Writable.__name__))

        if data is END:
            # END closes the writing side at once, but it is still queued so
            # that readers see it after all data put before it.
            self._writable_runlevel -= 1

        return Queue.put(self, data, block, timeout)

    def get(self, block=True, timeout=None):
        """Return the next data item, transparently consuming ``END`` tokens.

        :raises InactiveReadableError: if the input is not alive, or stops
            being alive because the item just read was the last ``END``.
        :raises queue.Empty: as ``Queue.get`` does, when nothing is available
            within the given ``block`` / ``timeout`` constraints.
        """
        if not self.alive:
            raise InactiveReadableError('Cannot get() on an inactive {}.'.format(Readable.__name__))

        data = Queue.get(self, block, timeout)

        if data is END:
            self._runlevel -= 1
            if not self.alive:
                raise InactiveReadableError(
                    'Cannot get() on an inactive {} (runlevel just reached 0).'.format(Readable.__name__))
            # Other BEGINs are still open: skip the token, read the next item.
            return self.get(block, timeout)

        return data

    def empty(self):
        """Tell whether no data is left, discarding leading ``END`` tokens."""
        # ``with`` guarantees the mutex is released even if something raises
        # while the head of the queue is being inspected.
        with self.mutex:
            while self._qsize() and self.queue[0] is END:
                self._runlevel -= 1
                Queue._get(self)
        return Queue.empty(self)

    @property
    def alive(self):
        """True while at least one ``BEGIN`` has not been matched by a consumed ``END``."""
        return self._runlevel > 0

30
bonobo/core/stats.py Normal file
View File

@ -0,0 +1,30 @@
# -*- coding: utf-8 -*-
#
# Copyright 2012-2014 Romain Dorgueil
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from abc import ABCMeta, abstractmethod
from bonobo.core.errors import AbstractError
class WithStatistics(metaclass=ABCMeta):
    """Base for objects exposing named counters through ``get_stats``."""

    @abstractmethod
    def get_stats(self, *args, **kwargs):
        """Return an iterable of ``(name, count)`` pairs; must be overridden."""
        raise AbstractError(self.get_stats)

    def get_stats_as_string(self, *args, **kwargs):
        """Render the stats as space-separated ``name=count`` items.

        Arguments are forwarded to ``get_stats``. Pairs whose count is not
        greater than zero are left out.
        """
        parts = []
        for name, cnt in self.get_stats(*args, **kwargs):
            if cnt > 0:
                parts.append('{0}={1}'.format(name, cnt))
        return ' '.join(parts)

77
bonobo/core/strategy.py Normal file
View File

@ -0,0 +1,77 @@
import time
from concurrent.futures import Executor
from queue import Queue, Empty
from bonobo.core.io import Input
from bonobo.core.tokens import BEGIN
from bonobo.util.iterators import force_iterator
class Strategy:
    """Base class for graph execution strategies."""

    def execute(self, graph, *args, **kwargs):
        """Run ``graph``; concrete strategies must override this method."""
        raise NotImplementedError
class NaiveStrategy(Strategy):
    """Run the components one after the other, in the calling thread.

    Components are treated as a single linear chain in the order they appear
    in ``graph.components``: the first one is called once without argument,
    and each following one is called once per row emitted by its predecessor.
    The edges stored in the graph are not consulted.
    """

    def execute(self, graph, *args, **kwargs):
        """Execute ``graph`` sequentially.

        ``*args`` and ``**kwargs`` are accepted for interface compatibility
        with ``Strategy.execute`` but are not used.
        """
        components = graph.components
        input_queues = [Queue() for _ in components]

        for i, component in enumerate(components):
            # The last component has no successor: its output is iterated (so
            # that generators still run to completion) but not stored, instead
            # of being put into a queue that does not exist.
            has_successor = i + 1 < len(components)

            def emit(rows, outbox=input_queues[i + 1] if has_successor else None):
                for row in force_iterator(rows):
                    if outbox is not None:
                        outbox.put(row)

            if not i:
                # The first component is the source: a single call, no input.
                emit(component())
                continue

            inbox = input_queues[i]
            while True:
                # Only the non-blocking read is guarded, so that an ``Empty``
                # raised by a component itself is not mistaken for "no more
                # input".
                try:
                    row = inbox.get(block=False)
                except Empty:
                    break
                emit(component(row))
class ExecutionContext:
    """Holds the graph that a strategy is executing."""

    def __init__(self, graph):
        """Store ``graph`` on the context."""
        self.graph = graph
class ExecutorStrategy(Strategy):
    """Strategy that submits components to a ``concurrent.futures`` executor.

    NOTE(review): this class is work in progress. ``execute`` relies on
    several names that are not defined anywhere in the visible code
    (``self.call_component``, ``self.running``, ``self.components``,
    ``self.outputs_of`` and ``idx``); see the notes inline.
    """

    # Class used by ``create_context`` to wrap the graph being executed.
    context_type = ExecutionContext
    # Class instantiated when no executor is given to ``__init__``.
    # NOTE(review): ``concurrent.futures.Executor`` is the abstract base
    # class, whose ``submit()`` raises NotImplementedError; a concrete
    # executor is presumably meant to be set by subclasses. Confirm.
    executor_type = Executor

    def __init__(self, executor=None):
        """Use ``executor`` if given (and truthy), else a new ``executor_type()``."""
        self.executor = executor or self.executor_type()

    def create_context(self, graph, *args, **kwargs):
        """Return a ``context_type`` instance wrapping ``graph``.

        ``*args`` and ``**kwargs`` are accepted but not used.
        """
        return self.context_type(graph)

    def execute(self, graph, *args, **kwargs):
        """Start the components connected to ``BEGIN`` and wait for completion.

        NOTE(review): the statements after the ``while`` loop look like the
        body of the missing ``call_component(idx, ...)`` method; their
        placement inside ``execute`` is an assumption. Confirm.
        """
        # NOTE(review): the context is created but not used afterwards.
        context = self.create_context(graph)

        for i in graph.outputs_of(BEGIN):
            # NOTE(review): ``call_component`` is not defined in this class.
            self.call_component(i, *args, **kwargs)

        # Poll until no future is left in ``self.running``.
        # NOTE(review): ``self.running`` is never initialised in this class.
        while len(self.running):
            # print(self.running)
            time.sleep(0.1)

        # NOTE(review): ``idx`` and ``self.components`` are not defined here;
        # ``components`` is an attribute of Graph, not of the strategy.
        f = self.executor.submit(self.components[idx], *args, **kwargs)
        self.running.add(f)

        @f.add_done_callback
        def on_component_done(f):
            # NOTE(review): ``nonlocal idx`` needs a binding of ``idx`` in an
            # enclosing function scope; none is visible.
            nonlocal self, idx

            # NOTE(review): ``outputs_of`` is a Graph method, not a method of
            # this class.
            outputs = self.outputs_of(idx)
            results = force_iterator(f.result())
            if results:
                # Feed every result to every downstream component.
                for result in results:
                    for output in outputs:
                        self.call_component(output, result)

            self.running.remove(f)

    def __run_component(self, component):
        """Call ``component`` with each row read from a new ``Input``.

        NOTE(review): the ``Input`` is created here and never receives a
        ``BEGIN``, so ``alive`` is false from the start and the loop body
        does not run. No caller of this method is visible.
        """
        c_in = Input()

        while c_in.alive:
            row = c_in.get()
            component(row)

12
bonobo/core/tokens.py Normal file
View File

@ -0,0 +1,12 @@
class Token:
    """Named marker object, for queue signals or other token types."""

    def __init__(self, name):
        """Store ``name`` as the token's ``__name__``."""
        self.__name__ = name

    def __repr__(self):
        """Return the name wrapped in angle brackets, e.g. ``<Begin>``."""
        return '<{name}>'.format(name=self.__name__)


# Signal tokens marking the start and the end of a stream.
BEGIN = Token('Begin')
END = Token('End')

0
bonobo/util/__init__.py Normal file
View File

7
bonobo/util/iterators.py Normal file
View File

@ -0,0 +1,7 @@
def force_iterator(x):
    """Return something iterable for ``x``, whatever ``x`` is.

    * a string is treated as a single value, not as a sequence of characters,
      and comes back as a one-item list;
    * any other iterable comes back as ``iter(x)``;
    * a non-iterable value comes back as a one-item list, or as an empty list
      when it is falsy (``None``, ``0``, ``False``...).

    Only the ``TypeError`` raised by ``iter()`` for non-iterable objects is
    handled; any other error raised while building the iterator propagates
    instead of being silently turned into a value.
    """
    if isinstance(x, str):
        return [x]
    try:
        return iter(x)
    except TypeError:
        return [x] if x else []

20
docs/Makefile Normal file
View File

@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SPHINXPROJ = Bonobo
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

23
docs/changelog.rst Normal file
View File

@ -0,0 +1,23 @@
Changelog
=========
0.9.0
:::::
* todo migrate doc
* todo migrate tests
* todo migrate transforms ?
Initial release
:::::::::::::::
* Migration from rdc.etl.
* New cool name.
* Only supports python 3.5+, aggressively (which means, we can use async, and we remove all things from python 2/six compat)
* Removes everything deprecated and/or not really convincing
* We want transforms to be simple callables, so refactoring of the harness mess
* We want to use plain python data structures, so hashes are removed. If you use python 3.6, you may even get ordered dicts.
* Input/output MUX DEMUX removed, maybe no need for that in the real world. May come back, but not in 1.0
* Change dependency policy. We need to include only the very basic, strictly required dependencies. Everything related to transforms that we may not use (bs, sqla, ...) should be optional dependencies.
* execution strategies !!!

187
docs/conf.py Normal file
View File

@ -0,0 +1,187 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Bonobo documentation build configuration file, created by
# sphinx-quickstart on Fri Dec 9 06:12:38 2016.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.doctest',
    'sphinx.ext.intersphinx',
    'sphinx.ext.todo',
    'sphinx.ext.coverage',
    'sphinx.ext.ifconfig',
    'sphinx.ext.viewcode',
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'
# The master toctree document.
master_doc = 'index'
# General information about the project.
project = 'Bonobo'
copyright = '2016, Romain Dorgueil'
author = 'Romain Dorgueil'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = ''
# The full version, including alpha/beta/rc tags.
release = ''
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# These patterns also affect html_static_path and html_extra_path.
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = True
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'alabaster'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# -- Options for HTMLHelp output ------------------------------------------
# Output file base name for HTML help builder.
htmlhelp_basename = 'Bonobodoc'
# -- Options for LaTeX output ---------------------------------------------
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#
# 'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#
# 'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#
# 'preamble': '',
# Latex figure (float) alignment
#
# 'figure_align': 'htbp',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
(master_doc, 'Bonobo.tex', 'Bonobo Documentation',
'Romain Dorgueil', 'manual'),
]
# -- Options for manual page output ---------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
(master_doc, 'bonobo', 'Bonobo Documentation',
[author], 1)
]
# -- Options for Texinfo output -------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
(master_doc, 'Bonobo', 'Bonobo Documentation',
author, 'Bonobo', 'One line description of project.',
'Miscellaneous'),
]
# -- Options for Epub output ----------------------------------------------
# Bibliographic Dublin Core info.
epub_title = project
epub_author = author
epub_publisher = author
epub_copyright = copyright
# The unique identifier of the text. This can be a ISBN number
# or the project homepage.
#
# epub_identifier = ''
# A unique identification for the text.
#
# epub_uid = ''
# A list of files that should not be packed into the epub file.
epub_exclude_files = ['search.html']
# Example configuration for intersphinx: refer to the Python standard library.
# Uses the named form ``{name: (target URL, inventory location)}``; the older
# ``{URL: inventory}`` form is a legacy format. ``None`` means the inventory is
# looked up at the default location under the target URL.
intersphinx_mapping = {'python': ('https://docs.python.org/3', None)}

20
docs/index.rst Normal file
View File

@ -0,0 +1,20 @@
.. Bonobo documentation master file, created by
sphinx-quickstart on Fri Dec 9 06:12:38 2016.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
Welcome to Bonobo's documentation!
==================================
.. toctree::
:maxdepth: 2
:caption: Contents:
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

36
docs/make.bat Normal file
View File

@ -0,0 +1,36 @@
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build
set SPHINXPROJ=Bonobo
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
:end
popd

View File

@ -0,0 +1,30 @@
from bonobo.strategy import NaiveStrategy, ExecutorStrategy
from bonobo.core.graph import Graph
def extract():
    """Yield the three sample strings fed into the example pipeline."""
    yield from ('foo', 'bar', 'baz')
def transform(s):
    """Return ``s`` converted to title case."""
    titled = s.title()
    return titled
def load(s):
    """Print ``s`` to standard output."""
    print(s)
if __name__ == '__main__':
    # Build the extract -> transform -> load chain once, then run it with
    # each strategy in turn.
    etl = Graph()
    etl.add_chain(extract, transform, load)

    for strategy_type in (NaiveStrategy, ExecutorStrategy):
        s = strategy_type()
        s.execute(etl)

43
tests/core/test_graph.py Normal file
View File

@ -0,0 +1,43 @@
import pytest
from bonobo.core.graph import Graph
from bonobo.core.tokens import BEGIN
def identity(x):
    """Return ``x`` unchanged (placeholder component for the graph tests)."""
    return x
def test_graph_outputs_of():
    """``outputs_of`` knows BEGIN, rejects unknown nodes and creates on demand."""
    graph = Graph()

    # A new graph only contains the BEGIN node, which has no output yet.
    assert len(graph.outputs_of(BEGIN)) == 0

    # Node 0 has not been registered.
    with pytest.raises(KeyError):
        graph.outputs_of(0)

    # Asking for creation registers it, empty; later plain lookups succeed.
    created = graph.outputs_of(0, create=True)
    assert len(created) == 0
    assert len(graph.outputs_of(0)) == 0
def test_graph_add_component():
    """Each ``add_component`` call grows the component list by one."""
    graph = Graph()
    assert len(graph.components) == 0

    for expected_size in (1, 2):
        graph.add_component(identity)
        assert len(graph.components) == expected_size
def test_graph_add_chain():
    """A three-component chain adds three components and one BEGIN output."""
    graph = Graph()
    assert len(graph.components) == 0

    chain = (identity, identity, identity)
    graph.add_chain(*chain)

    assert len(graph.components) == 3
    assert len(graph.outputs_of(BEGIN)) == 1

66
tests/core/test_io.py Normal file
View File

@ -0,0 +1,66 @@
# -*- coding: utf-8 -*-
#
# Copyright 2012-2014 Romain Dorgueil
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from queue import Empty
import pytest
from bonobo.core.errors import InactiveWritableError, InactiveReadableError
from bonobo.core.io import Input
from bonobo.core.tokens import BEGIN, END
def test_input_runlevels():
    """Walk an Input through BEGIN/END transitions and check each state."""
    q = Input()

    # Before BEGIN, no one should be able to write in an Input queue.
    assert not q.alive
    with pytest.raises(InactiveWritableError):
        q.put('hello, unborn queue.')

    # Begin
    q.put(BEGIN)
    assert q.alive and q._runlevel == 1
    q.put('foo')

    # Second Begin
    q.put(BEGIN)
    assert q.alive and q._runlevel == 2
    q.put('bar')
    q.put(END)

    # FIFO
    assert q.get() == 'foo'
    assert q.get() == 'bar'

    # self.assertEqual(q.alive, False) XXX queue don't know it's dead yet, but it is ...
    # Async get raises Empty (End is not returned): the queued END is consumed
    # by this get(), then the queue has nothing left to return.
    with pytest.raises(Empty):
        q.get(block=False)
    assert q.alive

    # Before killing, let's slide some data in.
    q.put('baz')

    # Now kill the queue...
    q.put(END)
    with pytest.raises(InactiveWritableError):
        q.put('foo')

    # Still can get remaining data
    assert q.get() == 'baz'
    # The next get() reads the final END and the input stops being alive.
    with pytest.raises(InactiveReadableError):
        q.get()

14
tests/core/test_stats.py Normal file
View File

@ -0,0 +1,14 @@
from bonobo.core.stats import WithStatistics
class MyThingWithStats(WithStatistics):
    """WithStatistics implementation returning two fixed counters."""

    def get_stats(self, *args, **kwargs):
        foo_stat = ('foo', 42)
        bar_stat = ('bar', 69)
        return (foo_stat, bar_stat)
def test_with_statistics():
    """The string form lists each counter as ``name=count``, space separated."""
    thing = MyThingWithStats()
    rendered = thing.get_stats_as_string()
    assert rendered == 'foo=42 bar=69'

View File

@ -0,0 +1,6 @@
from bonobo.core.tokens import Token
def test_token_repr():
    """A token's repr is its name wrapped in angle brackets."""
    token = Token('Acme')
    assert repr(token) == '<Acme>'

View File

@ -0,0 +1,22 @@
import types
from bonobo.util.iterators import force_iterator
def test_force_iterator_with_string():
    """A string is wrapped as a single item, not split into characters."""
    wrapped = force_iterator('foo')
    assert wrapped == ['foo']
def test_force_iterator_with_none():
    """``None`` gives an empty list."""
    wrapped = force_iterator(None)
    assert wrapped == []
def test_force_iterator_with_generator():
    """A generator is handed back as a generator yielding the same items."""

    def letters():
        yield from ('aaa', 'bbb', 'ccc')

    result = force_iterator(letters())

    assert isinstance(result, types.GeneratorType)
    assert list(result) == ['aaa', 'bbb', 'ccc']