From 2ffbc90bcb74606aca4bc6c75dd52a2ce4d6875b Mon Sep 17 00:00:00 2001 From: Michael Copeland Date: Wed, 24 May 2017 22:21:29 -0400 Subject: [PATCH] Add examples for pickle functionality --- bonobo/examples/datasets/spam.tgz | Bin 0 -> 6231 bytes bonobo/examples/files/pickle_handlers.py | 58 +++++++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 bonobo/examples/datasets/spam.tgz create mode 100644 bonobo/examples/files/pickle_handlers.py diff --git a/bonobo/examples/datasets/spam.tgz b/bonobo/examples/datasets/spam.tgz new file mode 100644 index 0000000000000000000000000000000000000000..5ffddab49eecb13a51b6a108a086d443fb33fa41 GIT binary patch literal 6231 zcmV-d7^vqTiwFqQ2qjqp|8sC*Z7y_YdI0TOXOtY-RhGTZIp^?NyVgj%qgMAMg=Kd< zni*+EnxLH-?UX=^dENb{ySA(9b){*wuwgM^3^)O{0UL~sS)u_GZNMg2Cg+@!$vNlb z@4K&hs>jFY@CP`DbGnYssJiOC`|i8(yZ2Sv(_yvO4UYUTUNu~Eb93@Pu8;YDYIbhA zexyEKuh*w)Gt*PkM{4!j-0bWTRXg&3aOG*H6Q%w~HaWVv|0mZ&w>=OaIsQQWh~wVl z@Uy8i)3(VVex!FBex_j>-)0_&A9Y-Qc61m&+OxR3XjL9%exM%A6C1Tv&-zg|shWwt zIjC0E!oKzc-3W}bQ7}+`q#AkKPgSJaiP4$r+azeIKr4e6dwV9UV$z}?8I^TRVpQ8^ zN{1GAea(0AW5C2?OC}!Yy%=|!U64pP^^)!S;G*yEsT@oCobq@~dBSn;QQQWMJbt40 zcHCY_j8P`a65(n}Y1Qw7!=?>;Dzl&vsS68gG4H88KhlB9x>>5G>S?Cd^*~M4YO__f zV{T*|h4E)j*Qe^dNlerD`$N)kZqm%R(eX5EsSKkN$o0g|vBim0>0S>D*3Qh#PMxXO zYNu!IzDeq5Cervx;P}bqCZFQfad%a<@<^pvJhjT`1amu9Hhon!l@1`!)_`J@LEQLL zLFQ>CnWuaA;BG^A)T(OPXd=U4KkBLu#cWbCpQn41{2gEiU@)M< zK66Bt&+^_{$Z|bxs}(aS^wt7@RD_>lqYVR|rd8jVE`CL}uQbR;mfCNrhHiE%YIOo; z;CBr+7aFyPoepfSD1DoNih>k_U|#Jsl*MT8hNmdeW~VKDrzUDms#_0#=-u_|Y)#bA z+>f6PW}j2;|GC~e?w0q#OgK=D#FAA-V}y~(+N9Jox(n@R9TggAxWg@psfNZ4(tayd zJ*Y9ts`2x%#PiD~Uf{hGcMEtA%Gf{xa}X0y6YC~`7r4L2Hi`xi}EO~sw;^}Q+4NA?xSgY12Vvt9>2h23j&1wA58ighyybb_LF0;00@e zX=WH#TD}%NoUPSVFY#|0JnrkHRZ%-&D$sr?TTr#LwYmD~T771&4u^NoZOf?3Ytu7$ za=#Azma0#|#PcMT;8BfV0=8c|!nX3>hT9e0avKe8!-sI|b=*Z+2ebGrG4pVcE&8LM z1&-6Jy%zMxyNt$;duXU>gOW<>OrWW;Wi67XjaNj7Gzzuu7Aq(%T6JM-c^Q1dpOMMV zo|&sn*WwBoK3ZDG?Ouqx)rk=P2Hit~*j`AJ$2_`(U-)HTN11S^gRasIn~STaFnPpC zj2uWr%n;w&9R%C%JlqEwX+qO2Y?Pjgw=G10_ztk}vJsgb^InVF?NtxHqobyCdj`A3 zh3%CJc4>h9Ti0_+_hVef?Z4u-m|i_0536?JWBYkdkx)g^>D zNrxtE!00=EkKvn&6YsA%p09}IT|#8(*$k$4BNI(C-kej5YYVIEo%B8|YAt<2 zC(RC{AMJ+gCe)Lxo;st`2)lG7HvpP#FvK7nRS}ze(1D*CSOc~V#x;VD7Vo$v%oEt6oF%z^2_#8a|}z($kK{LrM;_#}2#Eq8Xx zyPZ2zD+~88s|B^Ru(PZ-H`MmkjirTaYIk*K1%KAoDwVi~iS-dZPI+&^t><~pPuA_e zZx(~RagJdiPm2vT6VvjE76QN76-n(bTtSqDbf8rJj+kMYH^E-2+lD^vMl;Rad#~zb zS?}DbQ+?g5sv9?MIPVR)TocPxk34&v^GSjdTDo664GeZrlfiK<eS&TACLRMJ@HvAbFN(GZtq@ZRze|w8JJ9V>)~5(Nu3=71i*NV z37DZi_Kgpq0^9^8$DaV2_l&6JUU9GGs2$J@_5o#@1|i4S>Lf!C#O$YzP}}MtJ(U?9 zLI-^$7y1xX7HPs6;vv4008l%9TA8+5-nh87u(4D=+errJb{E!`wMwVBjXG2(eJ{qj-}3*HGDVJb3!b0xmbQvpAH@u9;~ zpO4RD`eHeK$(!Z$k396=TW`PhK89YZmR<;WMrWEDri+f|Kf$RF3XF`GLFz(D>Y_Kt z*lE?R*S!J=0Mq>hVXuImJ49Z6{sAj3jbLqIrZ7N1A{bi%t<{p&CC?*L*vnu50GNK- zq4x%P)a*D6=YvzwNT8Q~qyiv4wE(mTe;t=S;H4_t1QTyA?W!5v<>V50+UG$nX$+GyX8npmkC?_kc= za?btUYjC&Z3#5QdGN%p{C#jl;0*njW$v|Rf1iNqJ`Eb0rwWe#7V<8WYEf7iw02~-- zomd}%A%b{Gz&g6{Xqv$q-m3X{7rc2TZ`Zub1$-c6R>ySMyBncn0t@x)UFJ3yLyM9~BsnS((RbAfJ}&~j10pac%EqmCcA3|&xf zO6;e5Np3?qt9B@dsfs!eYJEXD;Js~~k}AL_l(7p}7D{iiKJ6I6`bh8dDsEy+t#V67 zs^QBpK+uKEfD|*yWKN0@Y&G7)ly*6#4%OHwA_&?pvqJsta4>A{iY)Aink+Z}?RR3fkVM4C5J0M{PV3RSj+ zrGab#tuG&f^@4MNm;qapHX;t;-s~v}-RcUg2bcnWjff2Qpt%BPaue5<&>4GfU{U zmhi8q? zcUJ=9C#5?`rOpX}2(y&YP%fRxw$-MQ zAp55HxxRf--J^D4KvH?K?VyW2BxczEpts?T_Yrayf1Sss@`e)*1{9?X?|`-;H+dev9<$z1&U&MF(%A@e0-2$e zbWBe*lplsUaGxM?CX>jAQhZ&F-vkn0P?GpU@1>0G+E0q6z+~w$E)v;jKK>$1dUHAH zE#BfmT7(=>G6pE zT};Vx5~>&l(kyQ@Y!Zn;Db|+TvSFWYn8%qDU9+ZR)1IH{fFrx6mxn~K&PPhWuW)vCB1h@23cU)9s-Euf`^kFE+hsvVnk370tup&Avo_a1*D-)GQnNa z)?x63B@~e~5Z}jcI~FdK8f3thTZWq~3CCXnj^0^v^e*p;0ENAQ57fj)qgDw5YtdI8 zIuzSy5rykA6XG|bPueN^2K0+wfC_ThiHg4xtGv5h<*U5g9|1YqepO-^5ig4Y|0ZL_ zP`U#7*wGFf8pP=jjiPg@o2Z^_OYB_2uaiM`;F~SQ?m-IAlWvJlhj59AojpCjuuw|P z3Cz_1257d$RXe)Z8x)$o?$GnTxqj4DE-8dCC@qpiobgJmFM4pQjUY5S-bXCn!=_rJ z4_z!oMiwqdw#y>qX#6k)_thh@_iMZxtTflRs}O?P?q&Pur0ck0`sxx))(q5=+4pJC zY{TKrlF1SapWMQ}b_Iz*HDDf+SwS6Y2JF!#iD_ocfU8l{{g_wF5zw1!%Ppfo-@dYQ zfBdyz=j%#6e!aKGP-I6jhUPg4Jg=+OHZY6sYXO340I?fp^m%v*c%xuBAOtvy+P>-QK%rJMp+=dLrbW%}yM8##qHHHm)y~e$ z&2qVlT2$M4FJaiee-fM`LEV_1RNI+Oc6E~C)?`h|m^0E(tf zml-m#51S&`NJYip1irtyl+1g*qtX$8iVG+v+_0mZFdnd=mNN5OFz;K-dEe$ehftUa zrQaFWJn#)ciV>~usKdV#=Jg>ccR(p}R?GGP;cnG635KCLv^4uGfVb2| z*LP7E!{YA-2j5e2@ByzTUHYQG(zdq6{t6~v^phear+${Zbe0l1F(!WyRKB;Q@_pX( z30E^lsvaUDw+(5zvUl~q`1>*Z2g>0e^3DtuGm3@Teg$U7J{gTBnZ)oD$&IOuMJD?c z{rUJ7sC~Gk_Jdw@aLOo6`kW`4vb-EMwzxB7*%l9AOmc7-2fs`;oT8s2H9%GSs26RL zNngXgVEMr}!(O126cKwT@ehHAA0Ba$9}zS?dNwmWlbLj2q$hYuV3=<0`>x6A0_~m! z2Jcf;Jfwk1WB)#4iQRKLUbhXP=cW(;5HykNDbI&N)#uqVLMaa^uOC&r9lzO8@>0Z+AB+)u49&dS zmcs@%jJP+BNkj&6u)LS>c!C+qwc`?oGHJgajei0h{p5%Yf69B97zD-+8C=~u*r%3r z-v9|cLWJOnpD(At%Xwm#d?pyR1Kd5S7Mk|hGEa7A>T2`g%%N5-Pq)~tURvEQ&xGq| z?wY%Ecrr+yLf};d&II(r&1AbBAX)|4iF$2Pt!>xU^une1r@{NrjPU-mlCy`c zrVhj7@MwsA^1iyhyEWi_pjC9>5-0lw6?q2@7M)txJb6m)ag zVuXm|KLUY2E(!dJx9U152!l;)0fK!JLr{g?28P-0;7G_FTDu+xbk_3Qk)I9ZJ)#uy zYF3T^6y*P`B>(5$l<+q!d$4KS<5{RwSz+GnoI4?ohBf6;+8H1iSL43`oxd#U{FO*_ ziQPehaa0lf>Otu}X2sExi9j5iiT@h&|E8S(w}Lkq>m%#TJJpa4bpxl7<{{(c@Pthx zoiqsY`@$oOlN}8Umllh((yH;_VTHdhSNI3XzaKwu$Nz}2|5T3sXDQz zw>bTm-WebQzwBWRkLa2_+EIPXA*zoh)j`ovB-VG`sQjR_FQ9_`4LN3!zQ$C6q=83R z*U&)O{*#9fYCe=_#M10zN#$MS*PQ82v_bO>GC!>*k8^xI{t#bJAYX5CFbeberB~6y zk#p?gBMnI}(!&QXpq=-KZukv=>$n59A9UaOB@-e3Qh^0fccjB!;GS+~R{9c_+_r6& zA;F+LMVXkf3pSB;4HC#6MRyVXoCg^l@o1J-?iw1Aw95q%x!gE21rxO86o7yr zZ}GDzr$H(Ik*J17>stE4dsQXRbvu0Cp&dS-JA6HN7_#LB&ZNi1bB^)}ZO;7?-+`Hk zh4th^7zxc=A2TG)!@63@6EJeMc-}BgdsFh>=EibiE<9CAhaJ^i22vT7GMfedyy{5yH zXM17sF_Q~1yPF$KV7ZoW8j8j~F_rTe>p<3QZTLM=C3&%9_a%o!`%-e-l|o~mMnL=$ z6%F76Mz|YZX?z@`c>>^;Hi*nTT?>HYcBVA^rM$ zu(OMBgR3c>fF^i8@iQVUbzyxix!o~$$00GjjJ#C@sTChNWG2Ylxsp#Ns#TWpx`9YD zpB!`ZCl1X&&iT{$!GaU9cvlX+&4GA~Q!1lNN?*cFBsSO6gXDyxc;_LCcah@zsW>0x zy6$ro`1wiNQO5?UtLGWG@xbN#hVL5QY@%VaHfbMp{@wbl#)@K0zC7Ybsl@h5Y9V%_ ztxG-xxa_Mk4o0GTh_Yc=AOe?3d^g3H2TjfIR(c6aC32C^UqAW;Y~W}z>9%$9(6*}F z*4ff~iVrT0e9AKXgp2m*@~m}TSrer0D@4>r$tlt`)g$9;e2uU1HNM8z_!?j1YkZBb z@io52*Z3M=<7<44ukkg$#@F~7U*l_hjj!=FzQ))18eijUeEs*=M*#swC!_#S004rn BC^rBA literal 0 HcmV?d00001 diff --git a/bonobo/examples/files/pickle_handlers.py b/bonobo/examples/files/pickle_handlers.py new file mode 100644 index 0000000..c00b3fa --- /dev/null +++ b/bonobo/examples/files/pickle_handlers.py @@ -0,0 +1,58 @@ +import bonobo +from fs.tarfs import TarFS +import os + + +def cleanse_sms(row): + + if row['category'] == 'spam': + row['sms_clean'] = '**MARKED AS SPAM** ' + row['sms'][0:50] + ('...' if len(row['sms']) > 50 else '') + else: + row['sms_clean'] = row['sms'] + + return row['sms_clean'] + + +graph = bonobo.Graph( + bonobo.PickleReader('spam.pkl'), # spam.pkl is within the gzipped tarball + cleanse_sms, + print +) + + +if __name__ == '__main__': + + ''' + This example shows how a different file system service can be injected + into a transformation (as compressing pickled objects often makes sense + anyways). The pickle itself contains a list of lists as follows: + + ``` + [ + ['category', 'sms'], + ['ham', 'Go until jurong point, crazy..'], + ['ham', 'Ok lar... Joking wif u oni...'], + ['spam', 'Free entry in 2 a wkly comp to win...'], + ['ham', 'U dun say so early hor... U c already then say...'], + ['ham', 'Nah I don't think he goes to usf, he lives around here though'], + ['spam', 'FreeMsg Hey there darling it's been 3 week's now...'], + ... + ] + ``` + + where the first column categorizes and sms as "ham" or "spam". The second + column contains the sms itself. + + Data set taken from: + https://www.kaggle.com/uciml/sms-spam-collection-dataset/downloads/sms-spam-collection-dataset.zip + + The transformation (1) reads the pickled data, (2) marks and shortens + messages categorized as spam, and (3) prints the output. + ''' + + services = { + 'fs': TarFS( + os.path.join(bonobo.get_examples_path(), 'datasets', 'spam.tgz') + ) + } + bonobo.run(graph, services=services)