From 167f819676437d4971f95dd7464c77b016aa5b39 Mon Sep 17 00:00:00 2001 From: Stephanie Ewelu Date: Fri, 7 Feb 2025 04:11:30 +0000 Subject: [PATCH 1/2] Added PolarsExcelDataset implementation --- .../kedro_datasets/polars/__init__.py | 3 + .../kedro_datasets/polars/data.xlsx | Bin 0 -> 5491 bytes .../polars/polars_excel_dataset.py | 76 ++++++++++++++++++ .../polars/polars_multi_sheet.xlsx | Bin 0 -> 5831 bytes kedro-datasets/kedro_datasets/polars/setup.py | 11 +++ 5 files changed, 90 insertions(+) create mode 100644 kedro-datasets/kedro_datasets/polars/data.xlsx create mode 100644 kedro-datasets/kedro_datasets/polars/polars_excel_dataset.py create mode 100644 kedro-datasets/kedro_datasets/polars/polars_multi_sheet.xlsx create mode 100644 kedro-datasets/kedro_datasets/polars/setup.py diff --git a/kedro-datasets/kedro_datasets/polars/__init__.py b/kedro-datasets/kedro_datasets/polars/__init__.py index 3ea77eec4..ce9fce0df 100644 --- a/kedro-datasets/kedro_datasets/polars/__init__.py +++ b/kedro-datasets/kedro_datasets/polars/__init__.py @@ -4,6 +4,9 @@ import lazy_loader as lazy +# Import the PolarsExcelDataset +from .polars_excel_dataset import PolarsExcelDataset + # https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901 CSVDataset: Any EagerPolarsDataset: Any diff --git a/kedro-datasets/kedro_datasets/polars/data.xlsx b/kedro-datasets/kedro_datasets/polars/data.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..07e5ef9be8905fb8cefa9ff97e434104044f995d GIT binary patch literal 5491 zcmZ`-1yodv^Il3?fd!O)G{{m*r-ZbmG}0^~v2+PYrywPrQX;~lAl)6p(jd7C2#B;2 zOZ^u8zvGktdwb8l_uO;#o4YgL%zU$>p@M-)3IG6b0M*nAhVs3tiZ4)4L#UGoby`6! zHC&<2ZrtY1&RpJ(4ytff{0<(18;Djd&)W28+QOUnC8Beh;9MSIHSCU|$ek_Bdrm$+ zeN->Gq8M0e(&YBIRvh`jy@XOzj7Z7Bf&A$Fl_wRv3qjA`Qr;@0qs8ca={Ox2#!Ddl z;d5=}#srtVjR!nM#_OG^CQIjMeSXomtaN4b)ObW|zRZ#_KoyZx4y?Zfy{0|3PTbioqp3c2!OC{a_TlZU9cHF~AFxmpK1e=x>U z9WNN8yEcBl#w;m(rEsBEsN5)G9ebD8>+)>d%enlf%n@;}B^^&w|MFl&Bi22E5j zKoJ?KRs?4CbJ{*#CtbA>m(oDW(hxl!-f5?2FHmg91a-(*y%R88kBmlo_fRz>I&HpL zr;P^Eh<;DbyUStbnsG(_hT1=D-FA$O?o|ERbLZ|iU(2II zIx1642!89fOaR)S(Mf-l#Yl7m0LWqm0D!3Icsp>r+d>>6zplJj@!2voa+?(<4cLK= zc-tI0;`ay!)l>uB>^$eH(?hq^=)#EgV;y{~5<;K0DhLt{rVwr=sy1YbV}dqLhvgL3 z)`fizwz(|U!y}XTz0E#y&rLPfaxWtEz)Zx_JQO9c-8Q_U2PW?g?Esy^{{29b&iUXN z92IZU!Kp_GBT8<@sjt;@3%a3p4*7Q!tjzpPK(G+Px2DM`$1!zD$ zX_orP>irXk?z7oNNeFyHlm;B+#6zY3*VAEW$%>j`QcgZurH zu^N;P3&zQCz=UF=+4{qteurIKxz7e>-T56U5zBkt!Zm|qG0xPO2+hWvj_mIat%cIK z)dhAVRx^jQ6$!^b#e3?s#a9qExw`H`k^LHg0I{=mQNum~>4vq9bwaQT!B~{k;YyPDWqQ4~1a*?s?exkJQHg zLO$PYf6-z4Xnee1DxAm3WlB8)>nPKN95boY7qmg+8tCJYeRg~IB#t(jY zm{7zFhVPul%KQQ zzPOD#oMIQrj~?nY%@?HVlcA;FS>k^!0P3Fde$lp9o+VBLBpFJ^zabr-%EUraDiWZo z2)y7Zn!U&xm)cF{W z(}&tM?$W}T;xfuRh&f~=Ehq#jUEh-j)wq;~ni-PxjC!k$RTG~Xyb(x}H0%2|l;ANu zm2i_d(L@OK-Qk#c;#S&lVdadimoEC4=Q$>wO6}{xgVyJD0pXK;4xqRrg^AnBE4Nrm z?rnhiu?sQ`aGOJQALqV~;Fo+w3&z9RtEZJ*_4URRO6h6Rs0e<7b~!pL)KPXE+$Qpr z%O75}Q~zBef2>xB(ch+hJjQ5I-pQ!zw1RhbnxBi!QqOg$uU8wE#6BaE+xG3;Rx)RA z3;B^L6qaGY$w(fr5>zG4RCg9LuzR-)ciqZ}8Fx^793m7=k|RUqwDItV@{8$JSUp>v zxY5c6EZIdMTK+H66*2Bwc??jCmY?~l%GXDV%Gbo={c^=jELM)*82O|yB?qrp$==?bdK=H6;jYFwRN4QmvuZAS`PSf(2UKh28 zbtIlOx6r}$jUuJK5MG<&{UuHUT#!+u0m)}oZ|GNZG<-rqb!_l+DR{IcG8g0C%TAX@&yQzEevS)kB$x`F0*K9fT^D_XNgrm*$etB^K5Fa@NfY^;5Sib zSY*XPd`|`2k1p|s#=H@U#e+&u zO0C|Tn;^73B;=<~z%t=YuMsR*QNt6Tzt|zH?C6U7v=!5OO+fT)OG|yx002D<0N~Cq z0r7&m+Pm38AntD5KW{%(BrHi&z2iz{2gYuhnFmt7<0O7IiVZ~9%pLT!B;0bU!5)V~ zjm*;T@7j4yH!fXn&oXmnl$Qh8kYlfF*vld_*w+MMQut3o2q(4+?-yW+Ogq#$5=1hP z+I26tBU0???NgZPYj_NT#kAgqhedr;vo~+C$*WZT7@v)b8jE715%F!tK4jAHF0@qFeJdL{PPpNY3r$G7%1d2wF!H{(?$ypJVKz6q|Y&#dgD zuGm8m37W6jGs)0=7-;t6k!;USjdXWCb&eW`{(~>@mFkBj z&Z%|s5QLi+4PeC6+{r5~uO_6glgbc8n>N_28BJ_pT5M>RP^!_v;;oC`npuL1MH%rM z8KG&okG!tI{l!<=|M68Ub;Vcmyx@)dw9(a{3)&HJSA6x**YMjtyW*?pCtq=H8Z#^o zeOy5=KpgvMIX$s#=7is;K(Pnj3jIalv&wH6G2}X|J2(8k9msYxhE5%R>r;OxJ@325 ztUl8wgoy{<$NUs00g__B052ea87%{GFU23Cv zb2h?oz70x0Ql_yPGb)OEHk3AtIJ4%&Tt-s?D)!)QT~QHbKJBWsB>T=&aWTE2%kqq; zu+M6>Kst}l1%O@k_v%o~mK~L66>Ybk#`n_r%rmM7#P{m;^)B@_)rw#8nuM`K92v2~ zLI|9;SwY|uNG^2;i-UE@{Z}Cd^_RFktaYd830!ecH@ACBO#h?^v8|y{ilT=c9{?cy zRZ?#5J`NDKD_$0K=c4C$h|bAA-FlUjmQmLgBuKdlnH z_`__nrmqPTuibewSM3Ii)TTHjs^_MDI6!Y~4OpcVZt6IGpR7_0#;DoXtz^`Ly!)iQ z{pR}fP#(ZMJgCT(+4m#x&7C;1b6+eicT4LJSz z!y%d6R^1tk@X4k$=P#oK<{jwEOY-<{p&eWo?4Zq+(K>F7EQQk_uy)}+D>+W}0RPvl z{&$gCkv9VtJ-(h9^oKaf37^v8SLmH}SjN8HC!27_JGv@01@r9r0&L2aLATevl&Y(H z-3=UPD-ds=ffejUB@Z1)qx42*b9h?R_oU_JhWCeB;cd^mj z#fS##Cl6n`=P>)^1ZviE7W5^qfRxnCy5nOnwM1Ri&~lHJVi*SZvtF-T1Bc9HCpqDJ z+_bR+gcQj)@17f%Z#azu_b;e5B>bT=uad@+6tj~n1Q_-j=nL3BFBH4SOM3)m&`dnv zL_FvgF$gy-+JFLkf-;ivI`vmxaGxz z4tCljjx>FJ)QhGsL#MCHL~3*+bsH*|fvCJjr8U6R72@E=&2@FnPONuAbsDLs-6J_X z*dz3je2_hI%o8--m$zm}u%hL%TVb=ae_Ztf0^Jp~^f5D3VvDl@(?qVyhLz3@aRjpj4xlV`~FkH4fTdP>uK7(c!Nvt+3D zHCjJ$vDLstWe&*FXFJ@ot@s=vZej^6rJyjOZ?0fpLMmG674 zQRPl4qO%vybY^3DSSrgga-5Om9R95=nbsQ%jx&wbvoJ3=`a48jw{s6YJt?fMp*}2< z+fi}-5dy=vU*qN*7nX8URnD4f;;XR;yq%Oblq_t4jJ$vXf(7hyAO{nV-M>1CxH<}F z3zFs$D5|y5&d_I_A2mUffWGqu?YjoAjs^2bVs3JpGc`R%EYW^m=L>_J%2Mr>j)P9u4 zm(nFc&7q6BM>Mn!(b}cx5zGAM)i+Qb&q|+}Z_u#NjPDC!$9#r5QM4fmr`&Te!1e0WUoDwvW+o4aEfE7kG^l^T*nVs%=m%1A`n=0md?Y&+du z;ETk?i$?B0V;y@0GOR|$I0_YO%3rbml~@0a_|F{cPE^GU;USWTk+=rN&5p@w>Mhi9 zn*6nX95NdEA^Gc;y+?i!S3^QYhY6!)>lv=XW6yvJHp-TUT(;5PH7nAQyG(^?sr z?28K3gjxo#;;8c(M8A1ZI5?ou(Hh2}&L*l6={=^HIh4_iJP7(B-r8O6AGJBM)W5n6 zScI!A{pqVy2~!j8)trU->yNsTEY!&z;^c0s?c)q_Grr2L?qv9tWcQ#X+v73V6A?IN z9LVvl4L@|sxZq2!1=Jn-e9kFJs=(qdNveG8*3Q~-%)0~BdAftE&38jscatB&=mj@fU;S9rI-K7V%s}8Lw7u-3?d{6Z z!J#P-C}N2#3xTz0?gOP=SK~usqv{Zt`4uF{K6v26g-tXqFqxH6_PU(duvWH?q5?i! z33J8wJ6~s$HNsqX>h4=S$m4JT^cjdma~1b@0mJv62xZ>udw?e5mML@J6V$#{SH@lAy;ZFY-poR(>Iw|14S6it3`S}S%1^NFkx~`+IZZoD?dlW>sGGMD}P%FMDswk zdXzK2r$310s&w-Yk+=@L-e!LT!Kki_dgnh|z;*ETzVREJgIaQ;!2i)juG_g@ hlfUgmqc)BIQlT0uSg2ha0Dy-&LQzfYBgxg%{{SW_jx_)P literal 0 HcmV?d00001 diff --git a/kedro-datasets/kedro_datasets/polars/polars_excel_dataset.py b/kedro-datasets/kedro_datasets/polars/polars_excel_dataset.py new file mode 100644 index 000000000..b872551f8 --- /dev/null +++ b/kedro-datasets/kedro_datasets/polars/polars_excel_dataset.py @@ -0,0 +1,76 @@ +from typing import Dict, Any +from pathlib import Path +import polars as pl +import pandas as pd +import openpyxl +from kedro.io import AbstractDataset +from kedro.io.core import DatasetError + +class PolarsExcelDataset(AbstractDataset[Dict[str, pl.DataFrame], Dict[str, pl.DataFrame]]): + """ + Kedro Dataset for reading and writing multiple Polars DataFrames to/from an Excel file. + + Example: + >>> dataset = PolarsExcelDataset("data.xlsx") + >>> data = dataset.load() # Returns a dictionary of Polars DataFrames + >>> dataset.save({"sheet1": df1, "sheet2": df2}) # Saves multiple DataFrames to Excel + """ + + def __init__(self, filepath: str): + """ + Initialize PolarsExcelDataset. + + Args: + filepath (str): Path where the dataset will be stored. + """ + self._filepath = Path(filepath) + + def _load(self) -> Dict[str, pl.DataFrame]: + """Load multiple sheets into a dictionary of Polars DataFrames.""" + if not self._filepath.exists(): + raise DatasetError(f"File not found: {self._filepath}") + + try: + # Use pandas to read all sheets + pandas_data = pd.read_excel(self._filepath, sheet_name=None) + # Convert each sheet to a Polars DataFrame + return {sheet_name: pl.DataFrame(df) for sheet_name, df in pandas_data.items()} + except Exception as e: + raise DatasetError(f"Failed to load dataset: {e}") + + def _save(self, data: Dict[str, pl.DataFrame]) -> None: + """Save multiple Polars DataFrames as different sheets.""" + if not isinstance(data, dict) or not all(isinstance(df, pl.DataFrame) for df in data.values()): + raise DatasetError("Data must be a dictionary of Polars DataFrames.") + + try: + # Convert Polars DataFrame to Pandas DataFrame and then save with openpyxl + with pd.ExcelWriter(self._filepath, engine='openpyxl') as writer: + for sheet_name, df in data.items(): + if not isinstance(sheet_name, str) or len(sheet_name) > 31: + raise DatasetError(f"Invalid sheet name: {sheet_name}. Sheet names must be strings and <= 31 characters.") + df.to_pandas().to_excel(writer, sheet_name=sheet_name, index=False) + except Exception as e: + raise DatasetError(f"Failed to save dataset: {e}") + + def _exists(self) -> bool: + """Check if the dataset exists.""" + return self._filepath.exists() + + def _describe(self) -> Dict[str, Any]: + """Return dataset metadata.""" + return { + "filepath": str(self._filepath), + "exists": self._exists(), + } +# Create sample data +df1 = pl.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]}) +df2 = pl.DataFrame({"colA": [4, 5], "colB": ["x", "y"]}) +data = {"sheet1": df1, "sheet2": df2} + +# Save and load data +dataset = PolarsExcelDataset("data.xlsx") +dataset.save(data) +loaded_data = dataset.load() + +print(loaded_data) \ No newline at end of file diff --git a/kedro-datasets/kedro_datasets/polars/polars_multi_sheet.xlsx b/kedro-datasets/kedro_datasets/polars/polars_multi_sheet.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..f613f0e325483930e19ecd25aa12c994201ff00b GIT binary patch literal 5831 zcmZ`-1yq#V_Z_-(0C6ahhM@-u=@?27q^=AHd+s^sZY^~zY$^Z%fDg#Ou51kL(@;r8e;Y;*a`dov zwbJr%b@SxE@8-tm>+Ga~)FA2+-ndSoQV;utI&BmBtno_ z?Co+x)#d~r)Xp23CjYQnLYuv7*$^V}nuDoqj-HTw-JeZ57N{C->8! z>$x>f8(#A@tv{A<#4`Tv8+z(Im;eC9KbK(T>S6O^4Z}$tnq30Kkrp^>3lbV4U2ktk60%zGAIh483vr*sRzroDs~%u}SMTJd$j&g*Y3=(g};z>rm=&z4az7HKANDc>R=N+a1$(rA7z zi4ER78&OhT-w^XV_{wLw5gDDb?+g3DKRbnJ;9poW5Mrf}6S!Vlx!X=ye8;@b*a6Tb z7B~Q;?3xRU#aH*G8k#a)G6C_kOnt7Mo!1X{a4MixwuS|ogY9MC>1rFhTHL5zV-h~5 z9@34sDZ&66$gwv?*Y2M<^_CHmab|VM}O4q>~43olhJ?B zH#(*g5;hQ+g43*uDjKK87b3YWkz+XG6M*X8&WjtI@q##mmR9zB#p;K~V%_Mmm$VVN zojKo}+Td~owM7o^tly(%DiTXx#_QZ2KMGvjdTyUh7;kcS{K3Qd@uyFEU%m4sY2CXU z)m8DNx9pwQmIFJ-Ma2X7CXKy3bY;t*lDck9cKimw(9LJ+?HE>?IoGLo3g`m^VqVU4 z*YKIECLdcyH!YxfnJ0|_KDl;T5@3`E^kc43)*(Oz?NIt9wA zl-JMOyhTJifA~0D_>#17eGlv?MQZM%54Pj6Rw35y{%6?4J{}SgUTZE84 zk&Xe=j;;pE9PWVFWvpaW z{^i16I(KjT^aE>nWtI^y3r&K0XpJ0e<9Y1hE_XMS z>hxM=6KA8O$?9fhftNHGc_L}CUOP-AtkB@0{<%J-wT@j(?c3|OZkmik#JBAvlNjnB z<}-+#q0D?%S&9X8(iIv(@~v3T%{g-tV!0WKO@!n35?s4ZHhX+Q%i1d-m9v7O4Y0=PJuuahHHv?Du7}s1Z{8E{bjSeOl?^V~ zyzbCrKrce_?{Dr@bpk2xSA}=uk*d*GBa#_P{dvg7Gc7^{&$f>84-r!L92RqkZ_^-y4VDPgjt(&N%x6I#{!nv{SG#28K)P~61~ zvo4_z$v7oy*(Yw|q8@oAEgq}{ejKT3KE_5#6;nlw!+gzSWoIYZ@nPKbDLB+QTvy0? z^4g%nlMX3i4~6%-YzGxH>_ic>%eun|P%Ar3z02PvhnM5~toejg?KLUrK~d%sV@;Ij zbN|LprSk8FQ;a7-MmWYqBjn38a<;!M6lES^zWzAAbBs&__^*e}HJfVjX`Y%xkH3nV-l zTP_|?=qfiUyCyR`L1K?0fzT)7mt_|w&^Ei^e@?!I$aoLBIpcVaXF!|R>ebqqaWoCoIhyQJSn z4fq6J|1~ifa!APQ`Hm`}Vd3jo0O(j(;FRvIWfE|iEor)W$q0fC8lj@L77JPe7%LaR zR?c1yr1PvS^X5+G>7WwUSf%K-f?s4chsk*8E3n{cDAQXkW!crx1|+fCH4*e>yN=Zj z`j-?CtbF) ze+nt|qa1td%(|fau8UM|CfP<;d1p3QR(+Hze(konRyvebBb~PS;$j(7TXt5*Id8>t zxm&};imU>&ebuCF#-?1}$u=8dhGX2A>yVF9aVV$R&_S$MyuJ}rp~@ru<*3w@$f!ng z+mYva>8EJ5sjBKHw;g%EATd)Mgm1y7JB`2cz%h~I&@cC~ju*fe9S)ko zxM94So{VFLj|%czg<=J&^}Y0*e!N0crC6b13`t?Tc|fon7v+B6akG6jOf}v;Kri^K z?NNMa)&{G5uX@4XmIRKDm2B&NUXb^-14 zb6yCk{7STBGce}$W)Ar?mz4T7jrT3(W`+6Rq%Ib^!r4&gnCsXYygqv~}Gm98`S>PU3c z^?B{`W3gN}UhAG}_$ywgv6IKcJ`r8bEbeM8_?H5iocQQP$^|wEUo=aWwe#DjN$HX{f(-anP6<@u-|1nbYKaLl zy+^9a-f0uGy>^GCK0Y!-#$p+Chs>T}9Uo*NJ>hGEM#!*m_NVI)@dBtgtGlzaO7*>^esgI-Q~%`(J2IwvyhqsN0Ebl;b+1H;Q?yk3c4Tj9Kgy zj~^bHGa&#Yt0sh#?HY4qb4=fpY&4k3G`QbT?U5MPDcaM55RCn@$MBRO zt9ibuP7cycqTbdFd$P3GNh`@wi#v}@xVVy=0uGAiw9&mkiU0s0`IVbIz5JYPJb&!Z zy#8#=tN{51^+&S5k~2iyRrj?NUYYj5q_!GsaPn4JQxYCWmr+G=GxUIvM7gZo zit!$W3a!=1xRqApXci)K+ex!xTLAxmDlwwiA zVD08{-fSvmEzB%=qramW+K7-Sa%j*lJd}0Y_v$TeFngMRr2qJxN>FuV6unEFHlRVd zU_$umhVrBJv$5*;TYb2*Z>uJr;}gVNdI71weJ^d;ILn)_)@(KYZWH2o=Up)AR4fOF zu^+){amzEA2+}8WN_pkP=~mAJ`*L(@h2SE=7-acomcQ=&Cb=RmrXmAB>l-E1M>Q5I z4oe4YL2lAijGiZ8?vd}h$gFY>OI?>O)@pX~2T4F9E;y+uHOKIaB{c(MJtgflUF9u_ zOgW9iSKz&L6m%_;n41)OiZCA^FA-f`k;=C{9gWmV9eb{#r*;HN^jb{N*4uQ@aZixb z2%TYw%X|9xcIqc89L?0hc>l)^@Ryv}4+UF_Y*f8UyKhS|edgtgd8x({T`bH=NZ8Sk z2>pv#0FRI>W|tx0Kx)u+@kPL8X~P-GpJ^gJ20`DBjyWJY{)o_T3lAG7Pkz21&zz(t z7xb-DHn`_qt^nRU=4fG?JsRv24E;i~_mns>N;z$nGsW%+MA!YKBq`?@9b~t~;PtUD zXfMd`IT%kiVyM)!J#!dIl&_coGm1ow$tuoTDO4GQrDgf9+cvaV)0BF`4Wdn_bBd*A z`R~fPhEz)2ahG>cQ5PoN9Uco7IyxX0QCd@DNi_laj;~$%EUAtb+%%-gQW_X9WGD`1 z=tCupZ&q5}to1k9IB~bvC44$(#TMsoS$ioGHbC|z2=M1ZOK~M?@1XOc1Ues*{W@en z8URaI*9Sk2Sxb_(dY3G5Uz=JJ8LUAQ@2NLGsrLxq$29v zRm$@?S;ucXPd<6#+r~D-!89e7S0)i|n^w~LSBGP;5{cLg-EMt6RloGpNV)V@T>L5c z2YZOdD`+XrLF;tZ_?6mbx9;iP$57`cr7_(IuomFWKM=WP&ZcS7 zuczp=o>;|=_z)&blD&LgE!4oK`^+k3FN6)zo^*kwehZdq3d`)SJ4A37$m6ZmeA_D= znx}f8IYWLiT;bZ=;K77v^;Pka`(5}@(BZ`gzSb+kBb+{eX+{eULJJT2CH$`-`=|Ur z1Fk1oob<5(xl(0V03ec0l>$Q#G!so^C;+y6y_U)8c1F(GelG?RJ$Pz$a$yix#Nm@f zC!#fP@6p>z$4Mhn(~6~KZpA;K2Z}l=(Je`rRMrVi^W)(OZ$EH>p@9re+3P|F1V`E>~d)xC59Bu)o3{<5547HMTH4_$es4ZguGAm3JTwT0u zT)ZrF{oHIk&3=U2a0>E=qtk0q{Mp-4D6$*l6#`)JKJ*%XJOVl_)3dO#z}PU)9t!P& zur@**oq}ch@{-oJq%m!CY8Z#!FvlF-vz!dk;grWlCQ7iDMhA~Wp?YC?9C&OfXqUq%FiFIAZug5UR<;BUKHGzC#>UJqU!`eju*Oe(;C8-3_d z`uPb#EB*g%zpLo09@{T001$%F^-uKQj@woERsZQX{1f``|HX~EN^tf5@i#$zCwbXIY*5!}-N(2~0*V*+S4kqA#Q|9U_ literal 0 HcmV?d00001 diff --git a/kedro-datasets/kedro_datasets/polars/setup.py b/kedro-datasets/kedro_datasets/polars/setup.py new file mode 100644 index 000000000..ab9b3c75f --- /dev/null +++ b/kedro-datasets/kedro_datasets/polars/setup.py @@ -0,0 +1,11 @@ +from setuptools import setup, find_packages + +setup( + name="kedro_datasets", # The name of your package + version="0.1", # Version of your package + packages=find_packages(), # Automatically find all subpackages + install_requires=[ # List any external dependencies here + "kedro>=0.18.0", # Include Kedro dependency (replace with your version) + "polars", # Include Polars package (if you're using it) + ], +) From 7eda0613d28d7be2cc68f670335402f1ac0ded19 Mon Sep 17 00:00:00 2001 From: Stephanie Ewelu <45932619+Stephanieewelu@users.noreply.github.com> Date: Fri, 7 Feb 2025 10:41:26 +0000 Subject: [PATCH 2/2] Add files via upload Implementation of Polar Dataset Signed-off-by: Stephanie Ewelu <45932619+Stephanieewelu@users.noreply.github.com> --- .../polars/eager_polars_dataset.py | 541 +++++++++++------- 1 file changed, 335 insertions(+), 206 deletions(-) diff --git a/kedro-datasets/kedro_datasets/polars/eager_polars_dataset.py b/kedro-datasets/kedro_datasets/polars/eager_polars_dataset.py index 5914ce4d7..26f922cf1 100644 --- a/kedro-datasets/kedro_datasets/polars/eager_polars_dataset.py +++ b/kedro-datasets/kedro_datasets/polars/eager_polars_dataset.py @@ -1,206 +1,335 @@ -"""``EagerPolarsDataset`` loads/saves data from/to a data file using an underlying -filesystem (e.g.: local, S3, GCS). It uses polars to handle the -type of read/write target. -""" -from __future__ import annotations - -from copy import deepcopy -from io import BytesIO -from pathlib import PurePosixPath -from typing import Any - -import fsspec -import polars as pl -from kedro.io.core import ( - AbstractVersionedDataset, - DatasetError, - Version, - get_filepath_str, - get_protocol_and_path, -) - - -class EagerPolarsDataset(AbstractVersionedDataset[pl.DataFrame, pl.DataFrame]): - """``polars.EagerPolarsDataset`` loads/saves data from/to a data file using an underlying - filesystem (e.g.: local, S3, GCS). It uses polars to handle the dynamically select the - appropriate type of read/write on a best effort basis. - - Example usage for the `YAML API `_: - - .. code-block:: yaml - - cars: - type: polars.EagerPolarsDataset - file_format: parquet - filepath: s3://data/01_raw/company/cars.parquet - load_args: - low_memory: True - save_args: - compression: "snappy" - - Example using Python API: - - .. code-block:: pycon - - >>> from kedro_datasets.polars import EagerPolarsDataset - >>> import polars as pl - >>> - >>> data = pl.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) - >>> - >>> dataset = EagerPolarsDataset(filepath=tmp_path / "test.parquet", file_format="parquet") - >>> dataset.save(data) - >>> reloaded = dataset.load() - >>> assert data.equals(reloaded) - - """ - - DEFAULT_LOAD_ARGS = {} # type: dict[str, Any] - DEFAULT_SAVE_ARGS = {} # type: dict[str, Any] - - def __init__( # noqa: PLR0913 - self, - *, - filepath: str, - file_format: str, - load_args: dict[str, Any] | None = None, - save_args: dict[str, Any] | None = None, - version: Version | None = None, - credentials: dict[str, Any] | None = None, - fs_args: dict[str, Any] | None = None, - metadata: dict[str, Any] | None = None, - ): - """Creates a new instance of ``EagerPolarsDataset`` pointing to a concrete data file - on a specific filesystem. The appropriate polars load/save methods are dynamically - identified by string matching on a best effort basis. - - Args: - filepath: Filepath in POSIX format to a file prefixed with a protocol like - `s3://`. - If prefix is not provided, `file` protocol (local filesystem) - will be used. - The prefix should be any protocol supported by ``fsspec``. - Key assumption: The first argument of either load/save method points to - a filepath/buffer/io type location. There are some read/write targets such - as 'clipboard' or 'records' that will fail since they do not take a filepath - like argument. - file_format: String which is used to match the appropriate load/save method on a - best effort basis. For example if 'csv' is passed, the `polars.read_csv` and - `polars.DataFrame.write_csv` methods will be identified. An error will - be raised unless there is at least one matching `read_` - or `write_`. - load_args: Polars options for loading CSV files. - Here you can find all available arguments: - https://pola-rs.github.io/polars/py-polars/html/reference/io.html - All defaults are preserved. - save_args: Polars options for saving files. - Here you can find all available arguments: - https://pola-rs.github.io/polars/py-polars/html/reference/io.html - All defaults are preserved. - version: If specified, should be an instance of - ``kedro.io.core.Version``. If its ``load`` attribute is - None, the latest version will be loaded. If its ``save`` - attribute is None, save version will be autogenerated. - credentials: Credentials required to get access to the underlying filesystem. - E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. - fs_args: Extra arguments to pass into underlying filesystem class constructor - (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). - metadata: Any arbitrary metadata. - This is ignored by Kedro, but may be consumed by users or external plugins. - Raises: - DatasetError: Will be raised if at least less than one appropriate read or write - methods are identified. - """ - - self._file_format = file_format.lower() - - _fs_args = deepcopy(fs_args) or {} - _fs_open_args_load = _fs_args.pop("open_args_load", {}) - _fs_open_args_save = _fs_args.pop("open_args_save", {}) - _credentials = deepcopy(credentials) or {} - - protocol, path = get_protocol_and_path(filepath) - if protocol == "file": - _fs_args.setdefault("auto_mkdir", True) - - self._protocol = protocol - self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) - self.metadata = metadata - - super().__init__( - filepath=PurePosixPath(path), - version=version, - exists_function=self._fs.exists, - glob_function=self._fs.glob, - ) - - self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args is not None: - self._load_args.update(load_args) - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) - - _fs_open_args_save.setdefault("mode", "wb") - self._fs_open_args_load = _fs_open_args_load - self._fs_open_args_save = _fs_open_args_save - - def load(self) -> pl.DataFrame: - load_path = get_filepath_str(self._get_load_path(), self._protocol) - load_method = getattr(pl, f"read_{self._file_format}", None) - - if not load_method: - raise DatasetError( - f"Unable to retrieve 'polars.read_{self._file_format}' method, please" - " ensure that your " - "'file_format' parameter has been defined correctly as per the Polars" - " API" - " https://pola-rs.github.io/polars/py-polars/html/reference/io.html" - ) - with self._fs.open(load_path, **self._fs_open_args_load) as fs_file: - return load_method(fs_file, **self._load_args) - - def save(self, data: pl.DataFrame) -> None: - save_path = get_filepath_str(self._get_save_path(), self._protocol) - save_method = getattr(data, f"write_{self._file_format}", None) - - if not save_method: - raise DatasetError( - f"Unable to retrieve 'polars.DataFrame.write_{self._file_format}' " - "method, please " - "ensure that your 'file_format' parameter has been defined correctly as" - " per the Polars API " - "https://pola-rs.github.io/polars/py-polars/html/reference/io.html" - ) - buf = BytesIO() - save_method(buf, **self._save_args) - with self._fs.open(save_path, **self._fs_open_args_save) as fs_file: - fs_file.write(buf.getvalue()) - self._invalidate_cache() - - def _exists(self) -> bool: - try: - load_path = get_filepath_str(self._get_load_path(), self._protocol) - except DatasetError: - return False - - return self._fs.exists(load_path) - - def _describe(self) -> dict[str, Any]: - return { - "file_format": self._file_format, - "filepath": self._filepath, - "protocol": self._protocol, - "load_args": self._load_args, - "save_args": self._save_args, - "version": self._version, - } - - def _release(self) -> None: - super()._release() - self._invalidate_cache() - - def _invalidate_cache(self) -> None: - """Invalidate underlying filesystem caches.""" - filepath = get_filepath_str(self._filepath, self._protocol) - self._fs.invalidate_cache(filepath) +"""``EagerPolarsDataset`` loads/saves data from/to a data file using an underlying +filesystem (e.g.: local, S3, GCS). It uses polars to handle the +type of read/write target. +""" + +from __future__ import annotations + +from copy import deepcopy +from io import BytesIO +from pathlib import PurePosixPath +from typing import Any + +import fsspec +import polars as pl +from kedro.io.core import ( + AbstractVersionedDataset, + DatasetError, + Version, + get_filepath_str, + get_protocol_and_path, +) + + +class EagerPolarsDataset(AbstractVersionedDataset[pl.DataFrame, pl.DataFrame]): + """``polars.EagerPolarsDataset`` loads/saves data from/to a data file using an underlying + filesystem (e.g.: local, S3, GCS). It uses polars to handle the dynamically select the + appropriate type of read/write on a best effort basis. + + Example usage for the `YAML API `_: + + .. code-block:: yaml + + cars: + type: polars.EagerPolarsDataset + file_format: parquet + filepath: s3://data/01_raw/company/cars.parquet + load_args: + low_memory: True + save_args: + compression: "snappy" + + Example using Python API: + + .. code-block:: pycon + + >>> from kedro_datasets.polars import EagerPolarsDataset + >>> import polars as pl + >>> + >>> data = pl.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) + >>> + >>> dataset = EagerPolarsDataset(filepath=tmp_path / "test.parquet", file_format="parquet") + >>> dataset.save(data) + >>> reloaded = dataset.load() + >>> assert data.equals(reloaded) + + """ + + DEFAULT_LOAD_ARGS = {} # type: dict[str, Any] + DEFAULT_SAVE_ARGS = {} # type: dict[str, Any] + + def __init__( # noqa: PLR0913 + self, + *, + filepath: str, + file_format: str, + load_args: dict[str, Any] | None = None, + save_args: dict[str, Any] | None = None, + version: Version | None = None, + credentials: dict[str, Any] | None = None, + fs_args: dict[str, Any] | None = None, + metadata: dict[str, Any] | None = None, + ): + """Creates a new instance of ``EagerPolarsDataset`` pointing to a concrete data file + on a specific filesystem. The appropriate polars load/save methods are dynamically + identified by string matching on a best effort basis. + + Args: + filepath: Filepath in POSIX format to a file prefixed with a protocol like + `s3://`. + If prefix is not provided, `file` protocol (local filesystem) + will be used. + The prefix should be any protocol supported by ``fsspec``. + Key assumption: The first argument of either load/save method points to + a filepath/buffer/io type location. There are some read/write targets such + as 'clipboard' or 'records' that will fail since they do not take a filepath + like argument. + file_format: String which is used to match the appropriate load/save method on a + best effort basis. For example if 'csv' is passed, the `polars.read_csv` and + `polars.DataFrame.write_csv` methods will be identified. An error will + be raised unless there is at least one matching `read_` + or `write_`. + load_args: Polars options for loading CSV files. + Here you can find all available arguments: + https://pola-rs.github.io/polars/py-polars/html/reference/io.html + All defaults are preserved. + save_args: Polars options for saving files. + Here you can find all available arguments: + https://pola-rs.github.io/polars/py-polars/html/reference/io.html + All defaults are preserved. + version: If specified, should be an instance of + ``kedro.io.core.Version``. If its ``load`` attribute is + None, the latest version will be loaded. If its ``save`` + attribute is None, save version will be autogenerated. + credentials: Credentials required to get access to the underlying filesystem. + E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. + fs_args: Extra arguments to pass into underlying filesystem class constructor + (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. + Raises: + DatasetError: Will be raised if at least less than one appropriate read or write + methods are identified. + """ + + self._file_format = file_format.lower() + + _fs_args = deepcopy(fs_args) or {} + _fs_open_args_load = _fs_args.pop("open_args_load", {}) + _fs_open_args_save = _fs_args.pop("open_args_save", {}) + _credentials = deepcopy(credentials) or {} + + protocol, path = get_protocol_and_path(filepath) + if protocol == "file": + _fs_args.setdefault("auto_mkdir", True) + + self._protocol = protocol + self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self.metadata = metadata + + super().__init__( + filepath=PurePosixPath(path), + version=version, + exists_function=self._fs.exists, + glob_function=self._fs.glob, + ) + + self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) + if load_args is not None: + self._load_args.update(load_args) + self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) + if save_args is not None: + self._save_args.update(save_args) + + _fs_open_args_save.setdefault("mode", "wb") + self._fs_open_args_load = _fs_open_args_load + self._fs_open_args_save = _fs_open_args_save + + def load(self) -> pl.DataFrame: + load_path = get_filepath_str(self._get_load_path(), self._protocol) + load_method = getattr(pl, f"read_{self._file_format}", None) + + if not load_method: + raise DatasetError( + f"Unable to retrieve 'polars.read_{self._file_format}' method, please" + " ensure that your " + "'file_format' parameter has been defined correctly as per the Polars" + " API" + " https://pola-rs.github.io/polars/py-polars/html/reference/io.html" + ) + with self._fs.open(load_path, **self._fs_open_args_load) as fs_file: + return load_method(fs_file, **self._load_args) + + def save(self, data: pl.DataFrame) -> None: + save_path = get_filepath_str(self._get_save_path(), self._protocol) + save_method = getattr(data, f"write_{self._file_format}", None) + + if not save_method: + raise DatasetError( + f"Unable to retrieve 'polars.DataFrame.write_{self._file_format}' " + "method, please " + "ensure that your 'file_format' parameter has been defined correctly as" + " per the Polars API " + "https://pola-rs.github.io/polars/py-polars/html/reference/io.html" + ) + buf = BytesIO() + save_method(buf, **self._save_args) + with self._fs.open(save_path, **self._fs_open_args_save) as fs_file: + fs_file.write(buf.getvalue()) + self._invalidate_cache() + + def _exists(self) -> bool: + try: + load_path = get_filepath_str(self._get_load_path(), self._protocol) + except DatasetError: + return False + + return self._fs.exists(load_path) + + def _describe(self) -> dict[str, Any]: + return { + "file_format": self._file_format, + "filepath": self._filepath, + "protocol": self._protocol, + "load_args": self._load_args, + "save_args": self._save_args, + "version": self._version, + } + + def _release(self) -> None: + super()._release() + self._invalidate_cache() + + def _invalidate_cache(self) -> None: + """Invalidate underlying filesystem caches.""" + filepath = get_filepath_str(self._filepath, self._protocol) + self._fs.invalidate_cache(filepath) + + +from kedro_datasets.polars import PolarsEagerDataset +import polars as pl +from pathlib import Path + + +def main(): + # Specify the path to your Excel file in the "polars" directory + filepath = Path( + r"C:\Users\Stephanie Ewelu\kedro-plugins\kedro-datasets\kedro_datasets\polars\your_excel_file.xlsx" + ) + sheet_name = "Sheet1" # Adjust the sheet name as necessary + + # Initialize the dataset to load data from the specified Excel file + dataset = PolarsEagerDataset(filepath=str(filepath), sheet_name=sheet_name) + + # Load the dataset into a Polars DataFrame + try: + df = dataset.load() + print("Data loaded successfully:") + print(df) + except Exception as e: + print(f"Error loading data: {e}") + + # Process the DataFrame (example: filtering data where a column 'A' is greater than 10) + processed_df = df.filter(df["A"] > 10) # Example filtering operation + + # Print the processed data + print("Processed DataFrame:") + print(processed_df) + + # Save the processed data back to the Excel file + try: + dataset.save(processed_df) + print("Data saved successfully.") + except Exception as e: + print(f"Error saving data: {e}") + + +if __name__ == "__main__": + main() +from typing import Dict, Any +from pathlib import Path +import polars as pl +import pandas as pd +import openpyxl +from kedro.io import AbstractDataset +from kedro.io.core import DatasetError + + +class PolarsExcelDataset( + AbstractDataset[Dict[str, pl.DataFrame], Dict[str, pl.DataFrame]] +): + """ + Kedro Dataset for reading and writing multiple Polars DataFrames to/from an Excel file. + Example: + >>> dataset = PolarsExcelDataset("data.xlsx") + >>> data = dataset.load() # Returns a dictionary of Polars DataFrames + >>> dataset.save({"sheet1": df1, "sheet2": df2}) # Saves multiple DataFrames to Excel + """ + + def __init__(self, filepath: str): + """ + Initialize PolarsExcelDataset. + + Args: + filepath (str): Path where the dataset will be stored. + """ + self._filepath = Path(filepath) + + def _load(self) -> Dict[str, pl.DataFrame]: + """Load multiple sheets into a dictionary of Polars DataFrames.""" + if not self._filepath.exists(): + raise DatasetError(f"File not found: {self._filepath}") + + try: + # Use pandas to read all sheets + pandas_data = pd.read_excel(self._filepath, sheet_name=None) + # Convert each sheet to a Polars DataFrame + return { + sheet_name: pl.DataFrame(df) for sheet_name, df in pandas_data.items() + } + except Exception as e: + raise DatasetError(f"Failed to load dataset: {e}") + + def _save(self, data: Dict[str, pl.DataFrame]) -> None: + """Save multiple Polars DataFrames as different sheets.""" + if not isinstance(data, dict) or not all( + isinstance(df, pl.DataFrame) for df in data.values() + ): + raise DatasetError("Data must be a dictionary of Polars DataFrames.") + + try: + # Convert Polars DataFrame to Pandas DataFrame and then save with openpyxl + with pd.ExcelWriter(self._filepath, engine="openpyxl") as writer: + for sheet_name, df in data.items(): + if not isinstance(sheet_name, str) or len(sheet_name) > 31: + raise DatasetError( + f"Invalid sheet name: {sheet_name}. Sheet names must be strings and <= 31 characters." + ) + df.to_pandas().to_excel(writer, sheet_name=sheet_name, index=False) + except Exception as e: + raise DatasetError(f"Failed to save dataset: {e}") + + def _exists(self) -> bool: + """Check if the dataset exists.""" + return self._filepath.exists() + + def _describe(self) -> Dict[str, Any]: + """Return dataset metadata.""" + return { + "filepath": str(self._filepath), + "exists": self._exists(), + } + + +# Create sample data +df1 = pl.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]}) +df2 = pl.DataFrame({"colA": [4, 5], "colB": ["x", "y"]}) +data = {"sheet1": df1, "sheet2": df2} + +# Save and load data +dataset = PolarsExcelDataset("data.xlsx") +dataset.save(data) +loaded_data = dataset.load() + +print(loaded_data)