basic reimplementation of what was working in rdc.etl, jupyter widget, json import, demo with opendatasoft api, etc. Lot of things are still work in progress, stay tuned.

This commit is contained in:
Romain Dorgueil
2016-12-24 10:37:53 +01:00
parent f0315936d3
commit c30048f1b0
54 changed files with 4680 additions and 256 deletions

View File

@ -1,5 +1,9 @@
from bonobo.core.graph import Graph
from bonobo.core.strategy import NaiveStrategy, ExecutorStrategy
import time
from random import randint
from bonobo.core.graphs import Graph
from bonobo.core.strategies.executor import ThreadPoolExecutorStrategy
from bonobo.ext.console import ConsoleOutputPlugin
def extract():
@ -9,19 +13,20 @@ def extract():
def transform(s):
return s.title()
wait = randint(0, 1)
time.sleep(wait)
return s.title() + ' ' + str(wait)
def load(s):
print(s)
Strategy = ThreadPoolExecutorStrategy
if __name__ == '__main__':
etl = Graph()
etl.add_chain(extract, transform, load)
s = NaiveStrategy()
s.execute(etl)
s = ExecutorStrategy()
s.execute(etl)
s = Strategy()
s.execute(etl, plugins=[ConsoleOutputPlugin()])

View File

@ -0,0 +1,60 @@
import json
from blessings import Terminal
from pycountry import countries
from bonobo.ext.console import console_run
from bonobo.ext.ods import extract_ods
from bonobo.util import tee
from bonobo.io.json import to_json
DATASET = 'fablabs-in-the-world'
SEARCH_URL = 'https://datanova.laposte.fr/api/records/1.0/search/'
URL = SEARCH_URL + '?dataset=' + DATASET
ROWS = 100
t = Terminal()
def _getlink(x):
return x.get('url', None)
def normalize(row):
result = {
**row,
'links': list(filter(None, map(_getlink, json.loads(row.get('links'))))),
'country': countries.get(alpha_2=row.get('country_code', '').upper()).name,
}
return result
def filter_france(row):
if row.get('country') == 'France':
yield row
def display(row):
print(t.bold(row.get('name')))
address = list(filter(None, (
' '.join(filter(None, (row.get('postal_code', None), row.get('city', None)))),
row.get('county', None),
row.get('country'),
)))
print(' - {}: {address}'.format(t.blue('address'), address=', '.join(address)))
print(' - {}: {links}'.format(t.blue('links'), links=', '.join(row['links'])))
print(' - {}: {geometry}'.format(t.blue('geometry'), **row))
print(' - {}: {source}'.format(t.blue('source'), source='datanova/' + DATASET))
if __name__ == '__main__':
console_run(
extract_ods(SEARCH_URL, DATASET, timezone='Europe/Paris'),
normalize,
filter_france,
tee(display),
to_json('fablabs.json'),
output=True,
)