Merge pull request #86 from privacytoolsproject/MF_RefactoringUtility…

…Functions Refactoring utility functions
privacytoolsproject · Aug 16, 2019 · 1cb65f3 · 1cb65f3
2 parents e906e4b + 11344d5
commit 1cb65f3
Show file tree

Hide file tree

Showing 142 changed files with 3,341 additions and 3,112 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -21,7 +21,6 @@ Suggests:
     testthat
 Collate: 
     'CompositionTheorems.R'
-    'createJSON.R'
     'datasets.R'
     'mechanism.R'
     'mechanism-bootstrap.R'
@@ -38,5 +37,13 @@ Collate:
     'statistic-tree.R'
     'statistic-variance.R'
     'update_parameters.R'
-    'utilities.R'
+    'utilities-categorical.R'
+    'utilities-covariance.R'
+    'utilities-data-validation.R'
+    'utilities-histogram.R'
+    'utilities-mean.R'
+    'utilities-noise-generation.R'
+    'utilities-objective.R'
+    'utilities-postprocessing.R'
+    'utilities-tree.R'
 VignetteBuilder: knitr
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,13 +1,12 @@
 # Generated by roxygen2: do not edit by hand
 
-export(bootstrap.replication)
-export(censordata)
-export(check_histogram_mechanism)
-export(check_variable_type)
-export(checkepsilon)
-export(checkrange)
-export(coefficient.release)
-export(dlap)
+export(bootstrapReplication)
+export(censorData)
+export(checkEpsilon)
+export(checkRange)
+export(checkVariableType)
+export(coefficientRelease)
+export(dLap)
 export(dpCovariance)
 export(dpGLM)
 export(dpHeavyHitters)
@@ -18,28 +17,26 @@ export(dpTree)
 export(dpUnif)
 export(dpVariance)
 export(fillMissing)
-export(glm.getAccuracy)
-export(glm.getParameters)
-export(histogram.getAccuracy)
-export(histogram.getEpsilon)
-export(make_logical)
-export(mean.getAccuracy)
-export(mean.getCI)
-export(mean.getParameters)
+export(glmGetAccuracy)
+export(glmGetParameters)
+export(histogramGetAccuracy)
+export(histogramGetEpsilon)
+export(makeLogical)
+export(meanGetCI)
 export(mechanism)
 export(mechanismBootstrap)
 export(mechanismExponential)
 export(mechanismGaussian)
 export(mechanismLaplace)
 export(mechanismObjective)
 export(mechanismStability)
-export(plap)
-export(qlap)
+export(pLap)
+export(qLap)
+export(rLap)
 export(release2json)
-export(rlap)
 export(sgn)
-export(tree.getAccuracy)
-export(tree.getParameters)
+export(treeGetAccuracy)
+export(treeGetParameters)
 exportClasses(dpCovariance)
 exportClasses(dpGLM)
 exportClasses(dpHeavyHitters)

diff --git a/R/createJSON.R b/R/createJSON.R
diff --git a/R/mechanism-bootstrap.R b/R/mechanism-bootstrap.R
@@ -9,19 +9,19 @@
 #' @import stats
 #' @export
 
-bootstrap.replication <- function(x, n, sensitivity, epsilon, fun) {
+bootstrapReplication <- function(x, n, sensitivity, epsilon, fun) {
     partition <- rmultinom(n=1, size=n, prob=rep(1 / n, n))
-    max.appearances <- max(partition)
-    probs <- sapply(1:max.appearances, dbinom, size=n, prob=(1 / n))
-    stat.partitions <- vector('list', max.appearances)
-    for (i in 1:max.appearances) {
-        variance.i <- (i * probs[i] * (sensitivity^2)) / (2 * epsilon)
-        stat.i <- fun(x[partition == i])
-        noise.i <- dpNoise(n=length(stat.i), scale=sqrt(variance.i), dist='gaussian')
-        stat.partitions[[i]] <- i * stat.i + noise.i
+    maxAppearances <- max(partition)
+    probs <- sapply(1:maxAppearances, dbinom, size=n, prob=(1 / n))
+    statPartitions <- vector('list', maxAppearances)
+    for (i in 1:maxAppearances) {
+        iVariance <- (i * probs[i] * (sensitivity^2)) / (2 * epsilon)
+        iStat <- fun(x[partition == i])
+        iNoise <- dpNoise(n=length(iStat), scale=sqrt(iVariance), dist='gaussian')
+        statPartitions[[i]] <- i * iStat + iNoise
     }
-    stat.out <- do.call(rbind, stat.partitions)
-    return(apply(stat.out, 2, sum))
+    statOut <- do.call(rbind, statPartitions)
+    return(apply(statOut, 2, sum))
 }
 
 
@@ -40,31 +40,31 @@ mechanismBootstrap <- setRefClass(
 
 mechanismBootstrap$methods(
     bootStatEval = function(xi,...) {
-        fun.args <- getFuncArgs(fun, inputList=list(...), inputObject=.self)
-        input.vals = c(list(x=x), fun.args)
-        stat <- do.call(boot.fun, input.vals)
+        funArgs <- getFuncArgs(fun, inputList=list(...), inputObject=.self)
+        inputVals = c(list(x=x), funArgs)
+        stat <- do.call(bootFun, inputVals)
         return(stat)
 })
 
 mechanismBootstrap$methods(
-    bootSE = function(release, n.boot, sens) {
+    bootSE = function(release, nBoot, sens) {
         se <- sd(release)
-        c.alpha <- qchisq(0.01, df=(n.boot - 1))
-        conservative <- sqrt(max(c(se^2 - (c.alpha * sens^2 * n.boot) / (2 * epsilon * (n.boot - 1)), 0)))
-        naive <- sqrt(max(c(se^2 - (sens^2 * n.boot) / (2 * epsilon), 0)))
+        cAlpha <- qchisq(0.01, df=(nBoot - 1))
+        conservative <- sqrt(max(c(se^2 - (cAlpha * sens^2 * nBoot) / (2 * epsilon * (nBoot - 1)), 0)))
+        naive <- sqrt(max(c(se^2 - (sens^2 * nBoot) / (2 * epsilon), 0)))
         return(list('sd' = se,
                     'conservative' = conservative,
                     'naive' = naive))
 })
 
 mechanismBootstrap$methods(
     evaluate = function(fun, x, sens, postFun) {
-        x <- censordata(x, .self$var.type, .self$rng)
-        x <- fillMissing(x, .self$var.type, .self$impute.rng[0], .self$impute.rng[1])
-        epsilon.part <- epsilon / .self$n.boot
-        release <- replicate(.self$n.boot, bootstrap.replication(x, n, sens, epsilon.part, fun=.self$bootStatEval))
-        std.error <- .self$bootSE(release, .self$n.boot, sens)
-        out <- list('release' = release, 'std.error' = std.error)
+        x <- censorData(x, .self$varType, .self$rng)
+        x <- fillMissing(x, .self$varType, .self$imputeRng[0], .self$imputeRng[1])
+        epsilonPart <- epsilon / .self$nBoot
+        release <- replicate(.self$nBoot, bootstrapReplication(x, n, sens, epsilonPart, fun=.self$bootStatEval))
+        stdError <- .self$bootSE(release, .self$nBoot, sens)
+        out <- list('release' = release, 'stdError' = stdError)
         out <- postFun(out)
         return(out)
 })
diff --git a/R/mechanism-exponential.R b/R/mechanism-exponential.R
@@ -13,18 +13,18 @@ mechanismExponential <- setRefClass(
 
 mechanismExponential$methods(
     evaluate = function(fun, x, sens, postFun, ...) {
-        x <- censordata(x, .self$var.type, rng=.self$rng, levels=.self$bins)
-        x <- fillMissing(x, .self$var.type, rng=.self$rng, categories=.self$bins)
+        x <- censorData(x, .self$varType, rng=.self$rng, levels=.self$bins)
+        x <- fillMissing(x, .self$varType, rng=.self$rng, categories=.self$bins)
         fun.args <- getFuncArgs(fun, inputList=list(...), inputObject=.self)
-        input.vals = c(list(x=x), fun.args)
-        true.val <- do.call(fun, input.vals)  # Concern: are we confident that the environment this is happening in is getting erased.
-        quality <- true.val - max(true.val)
-        probs <- ifelse(true.val == 0, 0, exp((.self$epsilon * quality) / (2 * sens)))
-        gap <- as.numeric(true.val[.self$k] - true.val[.self$k + 1])
+        inputVals = c(list(x=x), fun.args)
+        trueVal <- do.call(fun, inputVals)  # Concern: are we confident that the environment this is happening in is getting erased.
+        quality <- trueVal - max(trueVal)
+        probs <- ifelse(trueVal == 0, 0, exp((.self$epsilon * quality) / (2 * sens)))
+        gap <- as.numeric(trueVal[.self$k] - trueVal[.self$k + 1])
         if (gap < (-2 / epsilon * log(delta))) {
             out <- list('release' = NULL)
         } else {
-            release <- sample(names(true.val), size=.self$k, prob=probs)
+            release <- sample(names(trueVal), size=.self$k, prob=probs)
             out <- list('release' = release)
             out <- postFun(out, gap)
         }

diff --git a/R/mechanism-gaussian.R b/R/mechanism-gaussian.R
@@ -13,13 +13,13 @@ mechanismGaussian <- setRefClass(
 
 mechanismGaussian$methods(
     evaluate = function(fun, x, sens, postFun, ...) {
-        x <- censordata(x, .self$var.type, .self$rng)
-        x <- fillMissing(x, .self$var.type, impute.rng=.self$rng, categories=.self$bins)
-        fun.args <- getFuncArgs(fun, inputList=list(...), inputObject=.self)
-        input.vals = c(list(x=x), fun.args)
-        true.val <- do.call(fun, input.vals)
+        x <- censorData(x, .self$varType, .self$rng)
+        x <- fillMissing(x, .self$varType, imputeRng=.self$rng, categories=.self$bins)
+        funArgs <- getFuncArgs(fun, inputList=list(...), inputObject=.self)
+        inputVals = c(list(x=x), funArgs)
+        trueVal <- do.call(fun, inputVals)
         scale <- sens * sqrt(2 * log(1.25 / .self$delta)) / .self$epsilon
-        release <- true.val + dpNoise(n=length(true.val), scale=scale, dist='gaussian')
+        release <- trueVal + dpNoise(n=length(trueVal), scale=scale, dist='gaussian')
         out <- list('release' = release)
         out <- postFun(out, ...)
         return(out)

diff --git a/R/mechanism-laplace.R b/R/mechanism-laplace.R
@@ -57,13 +57,13 @@ mechanismLaplace$methods(
   #' laplace_mean <- mechanismLaplace$evaluate(mean_function, data[, variable], sens, post_processing_function)
   #' 
   evaluate = function(fun, x, sens, postFun, ...) {
-    x <- censordata(x, .self$var.type, .self$rng, .self$bins)
-    x <- fillMissing(x, .self$var.type, impute.rng=.self$rng, categories=.self$impute.bins)
+    x <- censorData(x, .self$varType, .self$rng, .self$bins)
+    x <- fillMissing(x, .self$varType, imputeRng=.self$rng, categories=.self$imputeBins)
     fun.args <- getFuncArgs(fun, inputList=list(...), inputObject=.self)
-    input.vals = c(list(x=x), fun.args)
-    true.val <- do.call(fun, input.vals)  # Concern: are we confident that the environment this is happening in is getting erased.
+    inputVals = c(list(x=x), fun.args)
+    trueVal <- do.call(fun, inputVals)  # Concern: are we confident that the environment this is happening in is getting erased.
     scale <- sens / .self$epsilon
-    release <- true.val + dpNoise(n=length(true.val), scale=scale, dist='laplace')
+    release <- trueVal + dpNoise(n=length(trueVal), scale=scale, dist='laplace')
     out <- list('release' = release)
     out <- postFun(out, ...)
     return(out)

diff --git a/R/mechanism-objective.R b/R/mechanism-objective.R
@@ -19,13 +19,13 @@ mechanismObjective$methods(
         x <- x[, cols]
 
         # censor & impute missing values
-        x <- censordata(x, .self$var.type, .self$rng, .self$bins)
-        x <- fillMissing(x, .self$var.type, impute.rng=.self$impute.rng)
+        x <- censorData(x, .self$varType, .self$rng, .self$bins)
+        x <- fillMissing(x, .self$varType, imputeRng=.self$imputeRng)
 
         # extract X and y
         y <- x[, cols[1]]
         X <- x[, cols[2:length(cols)], drop=FALSE]
-        X.names <- names(X)
+        xNames <- names(X)
 
         # scale inputs s.t. max Euclidean norm <= 1
         scaler <- mapMatrixUnit(X, p=2)
@@ -34,19 +34,19 @@ mechanismObjective$methods(
         # add intercept
         if (.self$intercept) {
             X <- cbind(1, X)
-            X.names <- c('intercept', X.names)
+            xNames <- c('intercept', xNames)
         }
 
         # set start params, adjust for ols
         if (.self$name == 'ols') {
-            start.params <- rep(0, ncol(X) + 1)
-            X.names <- c(X.names, 'variance')
-            y.scaler <- mapMatrixUnit(y, p=2)
-            y <- y.scaler$matrix
-            y.max.norm <- y.scaler$max.norm
+            startParams <- rep(0, ncol(X) + 1)
+            xNames <- c(xNames, 'variance')
+            yScaler <- mapMatrixUnit(y, p=2)
+            y <- yScaler$matrix
+            yMaxNorm <- yScaler$maxNorm
         } else {
-            start.params <- rep(0, ncol(X))
-            y.max.norm <- NULL
+            startParams <- rep(0, ncol(X))
+            yMaxNorm <- NULL
         }
 
 		# Set scalar c from [CMS11]
@@ -68,35 +68,35 @@ mechanismObjective$methods(
 
 		beta <- ep/2
         # fit
-        if (is.null(.self$n.boot)) {
+        if (is.null(.self$nBoot)) {
            # old noise draw based on gamma and laplace
-           # b.norm <- dpNoise(n=1, scale=(2 / .self$epsilon), dist='gamma', shape=length(start.params))
-           # b <- dpNoise(n=length(start.params), scale=(-.self$epsilon * b.norm), dist='laplace')
+           # bNorm <- dpNoise(n=1, scale=(2 / .self$epsilon), dist='gamma', shape=length(startParams))
+           # b <- dpNoise(n=length(startParams), scale=(-.self$epsilon * bNorm), dist='laplace')
 
            # new noise draw based on exponential and uniform
-            b.norm <- dpNoise(n=1, scale=(1/beta), dist='gamma', shape=1)
-            random_vec <- dpNoise(n=length(start.params), scale=1, dist='gaussian')
-            random_vec_norm <- sqrt(sum(random_vec^2))
-            b <- random_vec*(b.norm/random_vec_norm)
+            bNorm <- dpNoise(n=1, scale=(1/beta), dist='gamma', shape=1)
+            randomVec <- dpNoise(n=length(startParams), scale=1, dist='gaussian')
+            randomVecNorm <- sqrt(sum(randomVec^2))
+            b <- randomVec*(bNorm/randomVecNorm)
 
-            estimates <- optim(par=start.params, fn=.self$objective, X=X, y=y, b=b, n=n, lambda=lambda)$par
-            release <- data.frame(scaleRelease(estimates, scaler$max.norm, y.max.norm))
+            estimates <- optim(par=startParams, fn=.self$objective, X=X, y=y, b=b, n=n, lambda=lambda)$par
+            release <- data.frame(scaleRelease(estimates, scaler$maxNorm, yMaxNorm))
             names(release) <- 'estimate'
-            rownames(release) <- X.names
+            rownames(release) <- xNames
         } else {
-            local.epsilon <- .self$epsilon / .self$n.boot
-            release <- vector('list', .self$n.boot)
-            for (i in 1:.self$n.boot) {
+            localEpsilon <- .self$epsilon / .self$nBoot
+            release <- vector('list', .self$nBoot)
+            for (i in 1:.self$nBoot) {
                 index <- sample(1:.self$n, .self$n, replace=TRUE)
-                X.star <- X[index, ]
-                y.star <- y[index]
-                b.norm <- dpNoise(n=1, scale=(2 / local.epsilon), dist='gamma', shape=length(start.params))
-                b <- dpNoise(n=length(start.params), scale=(-local.epsilon * b.norm), dist='laplace')
-                estimates <- optim(par=start.params, fn=.self$objective, X=X.star, y=y.star, b=b, n=n)$par
-                release[[i]] <- scaleRelease(estimates, scaler$max.norm, y.max.norm)
+                xStar <- X[index, ]
+                yStar <- y[index]
+                bNorm <- dpNoise(n=1, scale=(2 / localEpsilon), dist='gamma', shape=length(startParams))
+                b <- dpNoise(n=length(startParams), scale=(-localEpsilon * bNorm), dist='laplace')
+                estimates <- optim(par=startParams, fn=.self$objective, X=xStar, y=yStar, b=b, n=n)$par
+                release[[i]] <- scaleRelease(estimates, scaler$maxNorm, yMaxNorm)
             }
             release <- data.frame(do.call(rbind, release))
-            names(release) <- X.names
+            names(release) <- xNames
         }
 
         # format output

diff --git a/R/mechanism-stability.R b/R/mechanism-stability.R
@@ -78,25 +78,25 @@ mechanismStability$methods(
         numHistogramBins <- NULL
         imputationRange <- NULL
         histogramBins <- NULL
-        if (.self$var.type %in% c('numeric', 'integer')) {
+        if (.self$varType %in% c('numeric', 'integer')) {
             dataRange <- range(x)
-            numHistogramBins <- ifelse(is.null(.self$n.bins), .self$n / .self$granularity, .self$n.bins)
+            numHistogramBins <- ifelse(is.null(.self$nBins), .self$n / .self$granularity, .self$nBins)
             histogramBins <- seq(dataRange[1], dataRange[2], length.out=(numHistogramBins + 1))
             # set the imputation range to the detected data range to maintain privacy
             imputationRange <- dataRange
         }
 
-        x <- censordata(x, .self$var.type, dataRange, histogramBins)
-        x <- fillMissing(x, .self$var.type, impute.rng=imputationRange, categories=levels(x)) # levels(x) will be NULL for numeric variables, a vector of bins for character variables
+        x <- censorData(x, .self$varType, dataRange, histogramBins)
+        x <- fillMissing(x, .self$varType, imputeRng=imputationRange, categories=levels(x)) # levels(x) will be NULL for numeric variables, a vector of bins for character variables
         fun.args <- getFuncArgs(fun, inputList=list(bins=histogramBins), inputObject=.self)
-        input.vals <- c(list(x=x), fun.args)
-        true.val <- do.call(fun, input.vals)  # Concern: are we confident that the environment this is happening in is getting erased.
+        inputVals <- c(list(x=x), fun.args)
+        trueVal <- do.call(fun, inputVals)  # Concern: are we confident that the environment this is happening in is getting erased.
 
         # remove empty bins before noise is added (per definition of stability mechanism)
-        true.val <- true.val[true.val > 0]
+        trueVal <- trueVal[trueVal > 0]
 
         scale <- sens / .self$epsilon
-        release <- true.val + dpNoise(n=length(true.val), scale=scale, dist='laplace')
+        release <- trueVal + dpNoise(n=length(trueVal), scale=scale, dist='laplace')
 
         # calculate the accuracy threshold, below which histogram buckets should be removed
         accuracyThreshold <- 1+2*log(2/delta)/epsilon