Skip to content

Commit 99de0a1

Browse files
authored
Feat/datacompy (#341)
# Release 0.8.2 This release makes it easier to build apps on top of buckaroo. Post-processing functions can now hide columns. `CustomizableDataflow` (which all widgets extend) gets a new `init_sd` parameter, which is an initial summary_dict. This makes it easier to hard-code summary_dict values. More resiliency around styling columns. Previously, if a call to `style_column` failed, either an error was thrown or the column was hidden; now a default `obj` displayer is used instead. [Datacompy_app](capitalone/datacompy#372) example built utilizing this new functionality. This app compares dataframes with the [datacompy](https://github.com/capitalone/datacompy) library.
1 parent 2618922 commit 99de0a1

File tree

12 files changed

+634
-50
lines changed

12 files changed

+634
-50
lines changed

CHANGELOG.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,13 @@ It also moves the js code to `packages/buckaroo_js_core` This is a regular react
77

88
None of the end user experience should change with this release.
99

10+
## 0.8.2 2025-01-15
11+
12+
This release makes it easier to build apps on top of buckaroo.
13+
14+
Post-processing functions can now hide columns.
15+
`CustomizableDataflow` (which all widgets extend) gets a new `init_sd` parameter, which is an initial summary_dict. This makes it easier to hard-code summary_dict values.
16+
17+
More resiliency around styling columns. Previously, if a call to `style_column` failed, either an error was thrown or the column was hidden; now a default `obj` displayer is used instead.
18+
19+
[Datacompy_app](https://github.com/capitalone/datacompy/issues/372) example built utilizing this new functionality. This app compares dataframes with the [datacompy](https://github.com/capitalone/datacompy) library.

README.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,19 @@ uv venv
148148
uv sync -q
149149

150150
```
151+
### Release instructions
152+
[github release instructions](https://docs.github.com/en/repositories/releasing-projects-on-github/managing-releases-in-a-repository)
153+
154+
```bash
155+
git tag $VERSION_NUMBER #no leading v
156+
#update CHANGELOG.md
157+
#push code and tag to github
158+
```
159+
Navigate to [create new buckaroo release](https://github.com/paddymul/buckaroo/releases/new).
160+
Follow the instructions on that page.
161+
162+
163+
151164

152165
## Contributions
153166

buckaroo/dataflow/dataflow.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ def _summary_sd(self, change):
168168
result_summary_sd = self._get_summary_sd(self.processed_df)
169169
self.summary_sd = result_summary_sd
170170

171-
@observe('summary_sd')
171+
@observe('summary_sd', 'processed_result')
172172
@exception_protect('merged_sd-protector')
173173
def _merged_sd(self, change):
174174
#slightly inconsistent that processed_sd gets priority over
@@ -207,7 +207,12 @@ class CustomizableDataflow(DataFlow):
207207
def __init__(self, orig_df, debug=False,
208208
column_config_overrides=None,
209209
pinned_rows=None, extra_grid_config=None,
210-
component_config=None):
210+
component_config=None, init_sd=None):
211+
if init_sd is None:
212+
self.init_sd = {}
213+
else:
214+
self.init_sd = init_sd
215+
211216
if column_config_overrides is None:
212217
column_config_overrides = {}
213218
self.column_config_overrides = column_config_overrides
@@ -273,6 +278,16 @@ def setup_options_from_analysis(self):
273278
df_data_dict = Any({'empty':[]}).tag(sync=True)
274279

275280

281+
@observe('summary_sd', 'processed_result')
282+
@exception_protect('merged_sd-protector')
283+
def _merged_sd(self, change):
284+
#slightly inconsistent that processed_sd gets priority over
285+
#summary_sd, given that processed_df is computed first. My
286+
#thinking was that processed_sd has greater total knowledge
287+
#and should supersede summary_sd.
288+
self.merged_sd = merge_sds(self.init_sd, self.cleaned_sd, self.summary_sd, self.processed_sd)
289+
290+
276291
### start code interpreter block
277292
def add_command(self, incomingCommandKls):
278293
return self.ac_obj.add_command(incomingCommandKls)
@@ -374,6 +389,8 @@ def _handle_widget_change(self, change):
374389
self.df_display_args = temp_display_args
375390

376391
"""
392+
393+
377394
Instantiation
378395
df_data_dict starts with only 'empty'
379396
first populate df_display_args, make all data point to 'empty', make all df_viewer_configs EMPTY_DFVIEWER_CONFIG

buckaroo/dataflow/dataflow_extras.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,12 @@
11
import sys
2+
import logging
3+
4+
25
import pandas as pd
36
from buckaroo.pluggable_analysis_framework.pluggable_analysis_framework import (ColAnalysis)
47

8+
logger = logging.getLogger()
9+
510
EMPTY_DFVIEWER_CONFIG = {
611
'pinned_rows': [],
712
'column_config': []}
@@ -151,20 +156,32 @@ def style_column(kls, col, column_metadata):
151156
data_key = "main"
152157
summary_stats_key= 'all_stats'
153158

159+
@classmethod
160+
def default_styling(kls, col_name):
161+
return {'col_name': col_name, 'displayer_args': {'displayer': 'obj'}}
162+
154163
@classmethod
155164
def style_columns(kls, sd):
156165
ret_col_config = []
157-
158166
#this is necessary for polars to add an index column, which is
159167
#required so that summary_stats makes sense
160168
if 'index' not in sd:
161-
ret_col_config.append({'col_name': 'index', 'displayer_args': {'displayer': 'obj'}})
169+
ret_col_config.append(kls.default_styling('index'))
162170

163171
for col in sd.keys():
164172
col_meta = sd[col]
165-
base_style = kls.style_column(col, col_meta)
173+
if col_meta.get('merge_rule') == 'hidden':
174+
continue
175+
try:
176+
base_style = kls.style_column(col, col_meta)
177+
except Exception:
178+
logger.warn(f"Warning, styling failed from {kls} on column {col} with col_meta {col_meta} using default_styling instead")
179+
base_style = kls.default_styling(col)
166180
if 'column_config_override' in col_meta:
181+
#column_config_override, sent by the instantiation, gets set later
167182
base_style.update(col_meta['column_config_override'])
183+
if base_style.get('merge_rule') == 'hidden':
184+
continue
168185
ret_col_config.append(base_style)
169186

170187
return {
@@ -173,3 +190,4 @@ def style_columns(kls, sd):
173190
'extra_grid_config': kls.extra_grid_config,
174191
'component_config': kls.component_config
175192
}
193+
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"id": "d8f47719-21e9-4a99-bb44-73c4f8b99c3d",
7+
"metadata": {},
8+
"outputs": [],
9+
"source": [
10+
"import pandas as pd\n",
11+
"import datacompy\n",
12+
"from datacompy_app import DatacompyBuckaroo"
13+
]
14+
},
15+
{
16+
"cell_type": "code",
17+
"execution_count": null,
18+
"id": "4a405785-0456-4a78-aec8-4e8045d7462a",
19+
"metadata": {},
20+
"outputs": [],
21+
"source": [
22+
"# Create sample DataFrames\n",
23+
"df_a = pd.DataFrame({\n",
24+
" 'a': [1, 2, 3, 4, 5, 6, 7, 8],\n",
25+
" 'b': [4, 5, 6, 4, 4, 6, 7, 8],\n",
26+
" 'c': ['foo', 'foo', 'bar', None, None, 'bar', 'bar', 'foo'],\n",
27+
" 'e': [100, 10, 1, 200, 150, 140, 130, 120]})\n",
28+
"\n",
29+
"df_b = pd.DataFrame({\n",
30+
" 'a': [1, 2, 3, 4, 5, 6, 7, 8],\n",
31+
" 'b': [4, 5, 7, 4, 4, 6, 4, 4],\n",
32+
" 'd': ['foo', 'baz', 'baz', 'bar', None, None, 'bar', 'bar'],\n",
33+
" 'f': [100, 10, 1, 200, 150, 140, 130, 120]\n",
34+
"}) # Notice the difference in the last row"
35+
]
36+
},
37+
{
38+
"cell_type": "code",
39+
"execution_count": null,
40+
"id": "95d9ce1f-71b1-4fa2-bde6-8ffbc642c574",
41+
"metadata": {},
42+
"outputs": [],
43+
"source": [
44+
"DatacompyBuckaroo(df_a, df_b)"
45+
]
46+
}
47+
],
48+
"metadata": {
49+
"kernelspec": {
50+
"display_name": "Python 3 (ipykernel)",
51+
"language": "python",
52+
"name": "python3"
53+
},
54+
"language_info": {
55+
"codemirror_mode": {
56+
"name": "ipython",
57+
"version": 3
58+
},
59+
"file_extension": ".py",
60+
"mimetype": "text/x-python",
61+
"name": "python",
62+
"nbconvert_exporter": "python",
63+
"pygments_lexer": "ipython3",
64+
"version": "3.12.8"
65+
},
66+
"widgets": {
67+
"application/vnd.jupyter.widget-state+json": {
68+
"state": {},
69+
"version_major": 2,
70+
"version_minor": 0
71+
}
72+
}
73+
},
74+
"nbformat": 4,
75+
"nbformat_minor": 5
76+
}

docs/example-notebooks/Styling-Howto.ipynb

Lines changed: 73 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,20 @@
2222
},
2323
{
2424
"cell_type": "code",
25-
"execution_count": null,
25+
"execution_count": 1,
2626
"id": "1",
2727
"metadata": {
2828
"tags": []
2929
},
30-
"outputs": [],
30+
"outputs": [
31+
{
32+
"name": "stdout",
33+
"output_type": "stream",
34+
"text": [
35+
"Buckaroo has been enabled as the default DataFrame viewer. To return to default dataframe visualization use `from buckaroo import disable; disable()`\n"
36+
]
37+
}
38+
],
3139
"source": [
3240
"import pandas as pd\n",
3341
"import numpy as np\n",
@@ -38,7 +46,7 @@
3846
},
3947
{
4048
"cell_type": "code",
41-
"execution_count": null,
49+
"execution_count": 2,
4250
"id": "2",
4351
"metadata": {
4452
"tags": []
@@ -52,12 +60,28 @@
5260
},
5361
{
5462
"cell_type": "code",
55-
"execution_count": null,
63+
"execution_count": 3,
5664
"id": "3",
5765
"metadata": {
5866
"tags": []
5967
},
60-
"outputs": [],
68+
"outputs": [
69+
{
70+
"data": {
71+
"application/vnd.jupyter.widget-view+json": {
72+
"model_id": "8273a2870220428f8ba3784ad246d176",
73+
"version_major": 2,
74+
"version_minor": 1
75+
},
76+
"text/plain": [
77+
"BuckarooWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'], 'post_pr…"
78+
]
79+
},
80+
"execution_count": 3,
81+
"metadata": {},
82+
"output_type": "execute_result"
83+
}
84+
],
6185
"source": [
6286
"BuckarooWidget(typed_df)"
6387
]
@@ -73,12 +97,28 @@
7397
},
7498
{
7599
"cell_type": "code",
76-
"execution_count": null,
100+
"execution_count": 4,
77101
"id": "5",
78102
"metadata": {
79103
"tags": []
80104
},
81-
"outputs": [],
105+
"outputs": [
106+
{
107+
"data": {
108+
"application/vnd.jupyter.widget-view+json": {
109+
"model_id": "abbe10263ccb4bf1a013e500eb7e4a65",
110+
"version_major": 2,
111+
"version_minor": 1
112+
},
113+
"text/plain": [
114+
"BuckarooWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'], 'post_pr…"
115+
]
116+
},
117+
"execution_count": 4,
118+
"metadata": {},
119+
"output_type": "execute_result"
120+
}
121+
],
82122
"source": [
83123
"bw2 = BuckarooWidget(\n",
84124
" typed_df, \n",
@@ -209,12 +249,28 @@
209249
},
210250
{
211251
"cell_type": "code",
212-
"execution_count": null,
252+
"execution_count": 5,
213253
"id": "13",
214254
"metadata": {
215255
"tags": []
216256
},
217-
"outputs": [],
257+
"outputs": [
258+
{
259+
"data": {
260+
"application/vnd.jupyter.widget-view+json": {
261+
"model_id": "db49c1cad5af48c79acdf011253666dd",
262+
"version_major": 2,
263+
"version_minor": 1
264+
},
265+
"text/plain": [
266+
"BuckarooWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'], 'post_pr…"
267+
]
268+
},
269+
"execution_count": 5,
270+
"metadata": {},
271+
"output_type": "execute_result"
272+
}
273+
],
218274
"source": [
219275
"bw_ = BuckarooWidget(\n",
220276
" typed_df, \n",
@@ -529,7 +585,14 @@
529585
"name": "python",
530586
"nbconvert_exporter": "python",
531587
"pygments_lexer": "ipython3",
532-
"version": "3.9.20"
588+
"version": "3.12.8"
589+
},
590+
"widgets": {
591+
"application/vnd.jupyter.widget-state+json": {
592+
"state": {},
593+
"version_major": 2,
594+
"version_minor": 0
595+
}
533596
}
534597
},
535598
"nbformat": 4,

0 commit comments

Comments
 (0)