diff --git a/examples/quickstart.ipynb b/examples/quickstart.ipynb new file mode 100644 index 00000000..dd52a036 --- /dev/null +++ b/examples/quickstart.ipynb @@ -0,0 +1,405 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "PipelineDP Quick Start ", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "bW1gifIe0pUt" + }, + "source": [ + "\n", + " \n", + " \n", + "
\n", + " Run in Google Colab\n", + " \n", + " View source on GitHub\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3Pa1EeIdJyZn" + }, + "source": [ + "This is a simple example that shows how to calculate anonymized statistics using PipelineDP. The input data is a simulated dataset of visits to some restaurant during a 7-day period. Each visit is characterized by a visitor ID, the entry date, and the amount of money spent. In this Colab we use the PipelineDP\n", + "Core API to calculate the count of restaurant visits per day.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zxcPpZGuAPq8" + }, + "source": [ + "# Install dependencies and download data\n", + "\n", + "Run the code below to install the necessary dependencies, load and explore the input data.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "E8yzpKYNbHTF", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "outputId": "0e60ad12-094a-4e0d-9c44-d8377accc47c", + "cellView": "form" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_identer_timespent_minutesspent_moneyday
05809:27AM29171
112159:16AM45181
244811:55AM12161
312510:47AM27201
448411:08AM35131
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ], + "text/plain": [ + " user_id enter_time spent_minutes spent_money day\n", + "0 580 9:27AM 29 17 1\n", + "1 1215 9:16AM 45 18 1\n", + "2 448 11:55AM 12 16 1\n", + "3 125 10:47AM 27 20 1\n", + "4 484 11:08AM 35 13 1" + ] + }, + "metadata": {}, + "execution_count": 1 + } + ], + "source": [ + "#@markdown Install dependencies and download data\n", + "\n", + "# Clone PipelineDP into the Colab working directory and install its\n", + "# development requirements.\n", + "import os\n", + "os.chdir('/content')\n", + "!git clone https://github.com/OpenMined/PipelineDP.git\n", + "!pip install -r PipelineDP/requirements.dev.txt\n", + "\n", + "# Make the cloned checkout importable as pipeline_dp.\n", + "import sys\n", + "sys.path.insert(0, '/content/PipelineDP')\n", + "\n", + "# Download the restaurant dataset from GitHub.\n", + "!wget https://raw.githubusercontent.com/google/differential-privacy/main/examples/go/data/week_data.csv\n", + "\n", + "# Clear the install/download logs from the cell output.\n", + "from IPython.display import clear_output\n", + "clear_output()\n", + "\n", + "import apache_beam as beam\n", + "from apache_beam.runners.portability import fn_api_runner\n", + "from apache_beam.runners.interactive import interactive_runner\n", + "from apache_beam.runners.interactive.interactive_beam import *\n", + "import pyspark\n", + "from dataclasses import dataclass\n", + "import pipeline_dp\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Load the visits and switch to short snake_case column names.\n", + "df = pd.read_csv('week_data.csv').rename(columns={\n", + "    'VisitorId': 'user_id',\n", + "    'Time entered': 'enter_time',\n", + "    'Time spent (minutes)': 'spent_minutes',\n", + "    'Money spent (euros)': 'spent_money',\n", + "    'Day': 'day',\n", + "})\n", + "# One pandas Series per visit.\n", + "rows = [visit for _, visit in df.iterrows()]\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Run the pipeline" + ], + "metadata": { + "id": "hzPiLxByC5BJ" + } + }, + { + "cell_type": "code", + "source": [ + "# Set the backend to local backend. 
Other options (Beam or Spark)\n", + "# are possible.\n", + "backend = pipeline_dp.LocalBackend()\n", + "\n", + "# Define the total privacy budget (epsilon and delta).\n", + "budget_accountant = pipeline_dp.NaiveBudgetAccountant(\n", + "    total_epsilon=1,\n", + "    total_delta=1e-6)\n", + "\n", + "# Create the DPEngine, which will execute the logic.\n", + "dp_engine = pipeline_dp.DPEngine(budget_accountant, backend)\n", + "\n", + "# Extractors for the partition key (day), the privacy ID (visitor) and the\n", + "# aggregated value. The value extractor isn't used in this example.\n", + "data_extractors = pipeline_dp.DataExtractors(\n", + "    partition_extractor=lambda visit: visit.day,\n", + "    privacy_id_extractor=lambda visit: visit.user_id,\n", + "    value_extractor=lambda visit: 1)\n", + "\n", + "# Configure the aggregation parameters.\n", + "params = pipeline_dp.AggregateParams(\n", + "    noise_kind=pipeline_dp.NoiseKind.LAPLACE,\n", + "    # Only the count is computed here, but several metrics can be\n", + "    # requested at once.\n", + "    metrics=[pipeline_dp.Metrics.COUNT],\n", + "    # Contribution bounds: a visitor can contribute to up to 3 days ...\n", + "    max_partitions_contributed=3,\n", + "    # ... and up to 2 visits per day.\n", + "    max_contributions_per_partition=2,\n", + "    # The output partition keys are publicly known: the output should\n", + "    # include all 7 week days.\n", + "    public_partitions=list(range(1, 8)))\n", + "\n", + "# Create a computational graph for the aggregation.\n", + "# All computations are lazy: dp_result is iterable, but iterating it would\n", + "# fail until the budget is computed (below).\n", + "# It's possible to call DPEngine.aggregate multiple times with different\n", + "# metrics to compute.\n", + "dp_result = dp_engine.aggregate(rows, params, data_extractors)\n", + "\n", + "# Compute the budget for each DP operation. 
\n", + "budget_accountant.compute_budgets()\n", + "\n", + "# Here's where the lazy iterator initiates computations and gets transformed\n", + "# into actual results\n", + "dp_result = list(dp_result)\n" + ], + "metadata": { + "id": "rFj2u61qBx0r" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Inspect the result" + ], + "metadata": { + "id": "hfHqnCLcDqpU" + } + }, + { + "cell_type": "code", + "source": [ + "#@markdown ##Inspect the result\n", + "#@markdown Below you can see the DP and non-DP results.\n", + "\n", + "# Labels for the 7 week days; day number d is stored at list index d - 1.\n", + "days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']\n", + "\n", + "# Compute the non-DP result: the raw number of visits per day.\n", + "non_dp_count = [0] * len(days)\n", + "for row in rows:\n", + "    non_dp_count[row['day'] - 1] += 1\n", + "\n", + "# Copy the DP result to a list. Each result is indexed as\n", + "# (day, metrics); metrics[0] is read as the DP count.\n", + "dp_count = [0] * len(days)\n", + "for day_result in dp_result:\n", + "    dp_count[day_result[0] - 1] = day_result[1][0]\n", + "\n", + "# Plot both counts side by side, one pair of bars per day.\n", + "x = np.arange(len(days))\n", + "width = 0.35\n", + "fig, ax = plt.subplots()\n", + "ax.bar(x - width/2, non_dp_count, width, label='non-DP')\n", + "ax.bar(x + width/2, dp_count, width, label='DP')\n", + "ax.set_ylabel('Visit count')\n", + "ax.set_title('Count visits per day')\n", + "ax.set_xticks(x)\n", + "ax.set_xticklabels(days)\n", + "ax.legend()\n", + "fig.tight_layout()\n", + "plt.show()\n", + "\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 297 + }, + "id": "sTkYZ0wSbo3h", + "outputId": "80ab959d-5a2a-4901-fe10-2b99c1bd090b", + "cellView": "form" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + } + } + ] + } + ] +}