Skip to content

Commit 73c04b1

Browse files
authored
Handle public and private CSVs (#218)
* handle public and private CSV in CLI... but nothing downstream * coverage * More readable CLI help * fake data into separate function * warn -> error * warn -> error * add private and public component params * add explanation in UI * add cards to organize first tab * stub where the warning message will go * warning message about column mismatch * better formatting on list * linting * make the "Define analysis" button conditional * fix label in end-to-end * reformat for readability * match -> mismatch * read either public or private * move out content of simulation card * Different simulation card if public CSV * fix renaming bugs * add test to fix coverage; use "Optional" * factor mock data generation out of make_accuracy_histogram * public and private previews * start testing conditional display for public vs private * nb reads public or private * also make plot title conditional * factor out shared descriptions * missing f on f-string
1 parent 134890b commit 73c04b1

17 files changed

+445
-135
lines changed

README-PYPI.md

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,27 @@ Output options include:
1010
## Usage
1111

1212
```
13-
usage: dp-wizard [-h] [--csv CSV_PATH] [--contrib CONTRIB] [--demo]
13+
usage: dp-wizard [-h] [--public_csv CSV] [--private_csv CSV] [--contrib CONTRIB] [--demo]
14+
15+
DP Wizard makes it easier to get started with Differential Privacy.
1416
1517
options:
1618
-h, --help show this help message and exit
17-
--csv CSV_PATH Path to CSV containing private data
19+
--public_csv CSV Path to public CSV
20+
--private_csv CSV Path to private CSV
1821
--contrib CONTRIB How many rows can an individual contribute?
1922
--demo Use generated fake CSV for a quick demo
23+
24+
Use "--public_csv" if you have a public data set, and are curious how
25+
DP can be applied: The preview visualizations will use your public data.
26+
27+
Use "--private_csv" if you only have a private data set, and want to
28+
make a release from it: The preview visualizations will only use
29+
simulated data, and apart from the headers, the private CSV is not
30+
read until the release.
31+
32+
Use "--public_csv" and "--private_csv" together if you have two CSVs
33+
with the same structure. Perhaps the public CSV is older and no longer
34+
sensitive. Preview visualizations will be made with the public data,
35+
but the release will be made with private data.
2036
```

README.md

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,29 @@ Building on what we've learned from [DP Creator](https://github.com/opendp/dpcre
1313
## Usage
1414

1515
```
16-
usage: dp-wizard [-h] [--csv CSV_PATH] [--contrib CONTRIB] [--demo]
16+
usage: dp-wizard [-h] [--public_csv CSV] [--private_csv CSV] [--contrib CONTRIB] [--demo]
17+
18+
DP Wizard makes it easier to get started with Differential Privacy.
1719
1820
options:
1921
-h, --help show this help message and exit
20-
--csv CSV_PATH Path to CSV containing private data
22+
--public_csv CSV Path to public CSV
23+
--private_csv CSV Path to private CSV
2124
--contrib CONTRIB How many rows can an individual contribute?
2225
--demo Use generated fake CSV for a quick demo
26+
27+
Use "--public_csv" if you have a public data set, and are curious how
28+
DP can be applied: The preview visualizations will use your public data.
29+
30+
Use "--private_csv" if you only have a private data set, and want to
31+
make a release from it: The preview visualizations will only use
32+
simulated data, and apart from the headers, the private CSV is not
33+
read until the release.
34+
35+
Use "--public_csv" and "--private_csv" together if you have two CSVs
36+
with the same structure. Perhaps the public CSV is older and no longer
37+
sensitive. Preview visualizations will be made with the public data,
38+
but the release will be made with private data.
2339
```
2440

2541

dp_wizard/app/__init__.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,11 @@ def ctrl_c_reminder(): # pragma: no cover
2828

2929
def make_server_from_cli_info(cli_info: CLIInfo):
3030
def server(input: Inputs, output: Outputs, session: Session): # pragma: no cover
31-
cli_csv_path = cli_info.csv_path
32-
csv_path = reactive.value("" if cli_csv_path is None else cli_csv_path)
31+
public_csv_path = reactive.value( # noqa: F841 # TODO
32+
cli_info.public_csv_path or ""
33+
)
34+
private_csv_path = reactive.value(cli_info.private_csv_path or "")
35+
3336
contributions = reactive.value(cli_info.contributions)
3437

3538
lower_bounds = reactive.value({})
@@ -43,15 +46,17 @@ def server(input: Inputs, output: Outputs, session: Session): # pragma: no cove
4346
output,
4447
session,
4548
is_demo=cli_info.is_demo,
46-
csv_path=csv_path,
49+
public_csv_path=public_csv_path,
50+
private_csv_path=private_csv_path,
4751
contributions=contributions,
4852
)
4953
analysis_panel.analysis_server(
5054
input,
5155
output,
5256
session,
5357
is_demo=cli_info.is_demo,
54-
csv_path=csv_path,
58+
public_csv_path=public_csv_path,
59+
private_csv_path=private_csv_path,
5560
contributions=contributions,
5661
lower_bounds=lower_bounds,
5762
upper_bounds=upper_bounds,
@@ -63,7 +68,8 @@ def server(input: Inputs, output: Outputs, session: Session): # pragma: no cove
6368
input,
6469
output,
6570
session,
66-
csv_path=csv_path,
71+
public_csv_path=public_csv_path,
72+
private_csv_path=private_csv_path,
6773
contributions=contributions,
6874
lower_bounds=lower_bounds,
6975
upper_bounds=upper_bounds,

dp_wizard/app/analysis_panel.py

Lines changed: 59 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,16 @@
11
from math import pow
22
from typing import Iterable, Any
3+
from pathlib import Path
34

45
from shiny import ui, reactive, render, req, Inputs, Outputs, Session
56

67
from dp_wizard.app.components.inputs import log_slider
78
from dp_wizard.app.components.column_module import column_ui, column_server
8-
from dp_wizard.utils.csv_helper import read_csv_ids_labels, read_csv_ids_names
9+
from dp_wizard.utils.csv_helper import (
10+
read_csv_ids_labels,
11+
read_csv_ids_names,
12+
get_csv_row_count,
13+
)
914
from dp_wizard.app.components.outputs import output_code_sample, demo_tooltip
1015
from dp_wizard.utils.code_generators import make_privacy_loss_block
1116

@@ -42,24 +47,7 @@ def analysis_ui():
4247
),
4348
ui.card(
4449
ui.card_header("Simulation"),
45-
ui.markdown(
46-
"""
47-
This simulation will assume a normal distribution
48-
between the specified lower and upper bounds.
49-
Until you make a release, your CSV will not be
50-
read except to determine the columns.
51-
52-
What is the approximate number of rows in the dataset?
53-
This number is only used for the simulation
54-
and not the final calculation.
55-
"""
56-
),
57-
ui.input_select(
58-
"row_count",
59-
"Estimated Rows",
60-
choices=["100", "1000", "10000"],
61-
selected="100",
62-
),
50+
ui.output_ui("simulation_card_ui"),
6351
),
6452
),
6553
ui.output_ui("columns_ui"),
@@ -82,7 +70,8 @@ def analysis_server(
8270
input: Inputs,
8371
output: Outputs,
8472
session: Session,
85-
csv_path: reactive.Value[str],
73+
public_csv_path: reactive.Value[str],
74+
private_csv_path: reactive.Value[str],
8675
contributions: reactive.Value[int],
8776
is_demo: bool,
8877
lower_bounds: reactive.Value[dict[str, float]],
@@ -124,13 +113,59 @@ def columns_checkbox_group_tooltip_ui():
124113
""",
125114
)
126115

116+
@render.ui
117+
def simulation_card_ui():
118+
if public_csv_path():
119+
row_count = get_csv_row_count(Path(public_csv_path()))
120+
return [
121+
ui.markdown(
122+
f"""
123+
Because you've provided a public CSV,
124+
it *will be read* to generate previews.
125+
126+
The confidence interval depends on the number of rows.
127+
Your public CSV has {row_count} rows,
128+
but if you believe the private CSV will be
129+
much larger or smaller, please update.
130+
"""
131+
),
132+
ui.input_select(
133+
"row_count",
134+
"Estimated Rows",
135+
choices=[row_count, "100", "1000", "10000"],
136+
selected=row_count,
137+
),
138+
]
139+
else:
140+
return [
141+
ui.markdown(
142+
"""
143+
This simulation will assume a normal distribution
144+
between the specified lower and upper bounds.
145+
Until you make a release, your CSV will not be
146+
read except to determine the columns.
147+
148+
What is the approximate number of rows in the dataset?
149+
This number is only used for the simulation
150+
and not the final calculation.
151+
"""
152+
),
153+
ui.input_select(
154+
"row_count",
155+
"Estimated Rows",
156+
choices=["100", "1000", "10000"],
157+
selected="100",
158+
),
159+
]
160+
127161
@render.ui
128162
def columns_ui():
129163
column_ids = input.columns_checkbox_group()
130164
column_ids_to_names = csv_ids_names_calc()
131165
for column_id in column_ids:
132166
column_server(
133167
column_id,
168+
public_csv_path=public_csv_path(),
134169
name=column_ids_to_names[column_id],
135170
contributions=contributions(),
136171
epsilon=epsilon(),
@@ -146,11 +181,13 @@ def columns_ui():
146181

147182
@reactive.calc
148183
def csv_ids_names_calc():
149-
return read_csv_ids_names(req(csv_path()))
184+
# The previous tab validated that if both public and private are given,
185+
# the columns match, so it shouldn't matter which is read.
186+
return read_csv_ids_names(Path(req(public_csv_path() or private_csv_path())))
150187

151188
@reactive.calc
152189
def csv_ids_labels_calc():
153-
return read_csv_ids_labels(req(csv_path()))
190+
return read_csv_ids_labels(Path(req(public_csv_path() or private_csv_path())))
154191

155192
@render.ui
156193
def epsilon_tooltip_ui():

dp_wizard/app/components/column_module.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,14 @@
33
from htmltools.tags import details, summary
44
from shiny import ui, render, module, reactive, Inputs, Outputs, Session
55
from shiny.types import SilentException
6+
import polars as pl
67

78
from dp_wizard.utils.dp_helper import make_accuracy_histogram
89
from dp_wizard.utils.shared import plot_histogram
910
from dp_wizard.utils.code_generators import make_column_config_block
1011
from dp_wizard.app.components.outputs import output_code_sample, demo_tooltip, hide_if
1112
from dp_wizard.utils.dp_helper import confidence
13+
from dp_wizard.utils.mock_data import mock_data, ColumnDef
1214

1315

1416
default_weight = "2"
@@ -56,6 +58,7 @@ def column_server(
5658
input: Inputs,
5759
output: Outputs,
5860
session: Session,
61+
public_csv_path: str,
5962
name: str,
6063
contributions: int,
6164
epsilon: float,
@@ -107,7 +110,20 @@ def accuracy_histogram():
107110
# This function is triggered when column is removed;
108111
# Exit early to avoid divide-by-zero.
109112
raise SilentException("weights_sum == 0")
113+
114+
# Mock data only depends on lower and upper bounds, so it could be cached,
115+
# but I'd guess this is dominated by the DP operations,
116+
# so not worth optimizing.
117+
# TODO: Use real public data, if we have it!
118+
if public_csv_path:
119+
lf = pl.scan_csv(public_csv_path)
120+
else:
121+
lf = pl.LazyFrame(
122+
mock_data({name: ColumnDef(lower_x, upper_x)}, row_count=row_count)
123+
)
110124
return make_accuracy_histogram(
125+
lf=lf,
126+
column_name=name,
111127
row_count=row_count,
112128
lower=lower_x,
113129
upper=upper_x,
@@ -210,9 +226,11 @@ def data_frame():
210226
def histogram_preview_plot():
211227
accuracy, histogram = accuracy_histogram()
212228
s = "s" if contributions > 1 else ""
213-
title = (
214-
f"Simulated {name}: normal distribution, "
215-
f"{contributions} contribution{s} / invidual"
229+
title = ", ".join(
230+
[
231+
name if public_csv_path else f"Simulated {name}: normal distribution",
232+
f"{contributions} contribution{s} / invidual",
233+
]
216234
)
217235
return plot_histogram(
218236
histogram,

dp_wizard/app/components/outputs.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,7 @@ def demo_tooltip(is_demo: bool, text: str): # pragma: no cover
2222
def hide_if(condition: bool, el): # pragma: no cover
2323
display = "none" if condition else "block"
2424
return ui.div(el, style=f"display: {display};")
25+
26+
27+
def info_box(content): # pragma: no cover
28+
return ui.div(content, class_="alert alert-info", role="alert")

0 commit comments

Comments
 (0)