From 83e85bade5fa9a1c37a13a5ce47006ce3ca173ee Mon Sep 17 00:00:00 2001 From: User Name Date: Mon, 27 Nov 2023 17:16:15 +0200 Subject: [PATCH] fixed incorrect summation variables --- DESCRIPTION | 2 +- NEWS.md | 4 ++++ R/algorithms.R | 26 +++++++++++++++++++------- R/cf_variable.R | 9 +++++++-- R/functional.R | 29 +++++++++++------------------ R/identifiable.R | 9 +++++---- R/probability.R | 15 ++++----------- README.md | 2 ++ man/Probability.Rd | 10 +--------- man/counterfactuals.Rd | 9 +++++++-- man/functional.Rd | 15 +-------------- man/identifiable.Rd | 3 ++- tests/testthat/test-probability.R | 20 +++++++++++++++++--- 13 files changed, 81 insertions(+), 72 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 56a4c85..7231e7d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: cfid Type: Package Title: Identification of Counterfactual Queries in Causal Models -Version: 0.1.6 +Version: 0.1.7 Authors@R: c( person(given = "Santtu", family = "Tikka", diff --git a/NEWS.md b/NEWS.md index fdcda81..b6320a6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# cfid 0.1.7 + + * Fixed some formulas having incorrect variables indicated as summation variables. + # cfid 0.1.6 * Summation variables are now properly distinguished from query variables in the output formulas of `identifiable()`. diff --git a/R/algorithms.R b/R/algorithms.R index baa1193..3eb9942 100644 --- a/R/algorithms.R +++ b/R/algorithms.R @@ -40,26 +40,36 @@ id_star <- function(g, gamma) { if (n_comp > 1L) { # Line 6 c_factors <- vector(mode = "list", length = n_comp) + free_vars <- vector(mode = "list", length = n_comp) + form_terms <- vector(mode = "list", length = n_comp) + nonid_factors <- rep(TRUE, n_comp) prob_zero <- FALSE for (i in seq_len(n_comp)) { s_var <- vars(comp[[i]]) s_sub <- setdiff(v_g, s_var) - for (j in seq_along(comp[[i]])) { + n_terms <- length(comp[[i]]) + sub_new <- vector(mode = "list", length = n_terms) + for (j in seq_len(n_terms)) { gamma_val <- val(comp[[i]][[j]], gamma_prime) comp[[i]][[j]]$obs <- ifelse_(is.null(gamma_val), 0L, gamma_val) sub_var <- names(comp[[i]][[j]]$sub) s_sub_j <- setdiff(s_sub, sub_var) s_len <- length(s_sub_j) if (s_len > 0) { - sub_new <- set_names(integer(s_len), s_sub_j) + sub_new[[j]] <- set_names(integer(s_len), s_sub_j) obs_ix <- which(gamma_obs_var %in% s_sub_j) if (length(obs_ix) > 0) { s_val <- unlist(evs(gamma_obs)[obs_ix]) - sub_new[names(s_val)] <- s_val + sub_new[[j]][names(s_val)] <- s_val } - comp[[i]][[j]]$sub <- c(comp[[i]][[j]]$sub, sub_new) + comp[[i]][[j]]$sub <- c(comp[[i]][[j]]$sub, sub_new[[j]]) } } + sumset <- setdiff(v_g, gamma_var) + sub_new_reduce <- names( + Reduce(function(x, y) intersect(names(x), names(y)), sub_new) + ) + free_vars[[i]] <- intersect(sumset, union(sub_new_reduce, s_var)) s_conj <- try( do.call(counterfactual_conjunction, comp[[i]]), silent = TRUE ) @@ -72,13 +82,15 @@ id_star <- function(g, gamma) { c_factors[[i]]$formula$val == 0L) { return(list(id = TRUE, formula = probability(val = 0L))) } + if (c_factors[[i]]$id) { + form_terms[[i]] <- c_factors[[i]]$formula + attr(form_terms[[i]], "free_vars") <- free_vars[[i]] + nonid_factors[i] <- FALSE + } } - nonid_factors <- !vapply(c_factors, "[[", logical(1L), "id") if (any(nonid_factors)) { return(list(id = FALSE, formula = NULL)) } - sumset <- setdiff(v_g, gamma_var) - form_terms <- lapply(c_factors, "[[", "formula") if (length(sumset) > 0L) { form_out <- functional( sumset = lapply(sumset, function(x) cf(var = x, obs = 0L)), diff --git a/R/cf_variable.R b/R/cf_variable.R index 28d7bdf..b8f9aac 100644 --- a/R/cf_variable.R +++ b/R/cf_variable.R @@ -90,8 +90,13 @@ is.counterfactual_variable <- function(x) { #' @rdname counterfactuals #' @param x A `counterfactual_variable` or a `counterfactual_conjunction` #' object. -#' @param use_primes A `logical` value indicating whether primes should be -#' used to differentiate between value assignments +#' @param use_primes A `logical` value. If `TRUE` (the default), any value +#' assignment of a counterfactual variable with `obs` will be formatted with +#' as many primes in the superscript as the value of `obs`, e.g., +#' `obs = 0` outputs `"y"`, `obs = 1` outputs `"y'"`, +#' `obs = 2` outputs `"y''"` and so forth. The alternative when `FALSE` is +#' to simply denote the `obs` value via superscript directly as +#' `"y^{(obs)}"`, where obs is evaluated. #' @export format.counterfactual_variable <- function(x, use_primes = TRUE, ...) { super_var <- character(0L) diff --git a/R/functional.R b/R/functional.R index 2e7bf37..f445c79 100644 --- a/R/functional.R +++ b/R/functional.R @@ -79,23 +79,12 @@ is.functional <- function(x) { #' @rdname functional #' @param x A `functional` object. -#' @param use_primes A `logical` value. If `TRUE` (the default), any value -#' assignment of a counterfactual variable with `obs` will be formatted with -#' as many primes in the superscript as the value of `obs`, e.g., -#' `obs = 0` outputs `"y"`, `obs = 1` outputs `"y'"`, -#' `obs = 2` outputs `"y''"` and so forth. The alternative when `FALSE` is -#' to simply denote the `obs` value via superscript directly as -#' `"y^{(obs)}"`, where obs is evaluated. -#' @param use_do A `logical` value. If `TRUE`, the explicit do-operation is -#' used to denote interventional probabilities (e.g., \eqn{P(y|do(x))}). -#' If `FALSE` (the default), the subscript notation is used instead -#' (e.g., \eqn{P_x(y)}). #' @param ... Additional arguments passed to `format`. #' @return A `character` representation of the `functional` object #' in LaTeX syntax. #' #' @export -format.functional <- function(x, use_primes = TRUE, use_do = FALSE, ...) { +format.functional <- function(x, ...) { terms <- "" sumset <- "" fraction <- "" @@ -104,12 +93,12 @@ format.functional <- function(x, use_primes = TRUE, use_do = FALSE, ...) { if (length(x$sumset) > 0) { sumset <- paste0( "\\sum_{", - comma_sep(vapply(x$sumset, format, character(1L), use_primes)), + comma_sep(vapply(x$sumset, format, character(1L), ...)), "} " ) } if (!is.null(x$terms)) { - terms <- vapply(x$terms, format, character(1L), use_primes, use_do) + terms <- vapply(x$terms, format, character(1L), ...) sums <- vapply( x$terms, function(y) { is.functional(y) && length(y$sumset) > 0 }, @@ -121,13 +110,13 @@ format.functional <- function(x, use_primes = TRUE, use_do = FALSE, ...) { terms <- collapse(terms) } else if (!is.null(x$numerator)) { if (length(x$denominator$val) > 0L && x$denominator$val == 1L) { - fraction <- format(x$numerator, use_primes, use_do) + fraction <- format(x$numerator, ...) } else { fraction <- paste0( "\\frac{", - format(x$numerator, use_primes, use_do), + format(x$numerator, ...), "}{", - format(x$denominator, use_primes, use_do), + format(x$denominator, ...), "}" ) } @@ -171,8 +160,12 @@ assign_values <- function(x, bound, v, termwise = FALSE) { if (termwise) { v_term <- unlist(c(evs(x$var), evs(x$cond), evs(x$do))) v[names(v_term)] <- v_term + bind <- bound > 0 & v_names %in% attr(x, "free_vars") + attr(x, "free_vars") <- NULL + } else { + bind <- bound > 0 } - v[bound > 0] <- -bound[bound > 0] + v[bind] <- -bound[bind] var <- vars(x$var) cond <- vars(x$cond) do <- vars(x$do) diff --git a/R/identifiable.R b/R/identifiable.R index 302d3dd..720e0b8 100644 --- a/R/identifiable.R +++ b/R/identifiable.R @@ -55,7 +55,8 @@ #' * `formula`\cr An object of class `functional` giving the identifying #' functional of the query in LaTeX syntax via `format` or `print`, #' if identifiable. This expression is given in terms of the -#' available `data`. For tautological statements, the resulting +#' available `data`. Variables bound by summation are distinguished by a +#' superscript asterisk. For tautological statements, the resulting #' probability is 1, and for inconsistent statements, the resulting #' probability is 0. For formatting options, see #' [cfid::format.functional()] and [cfid::format.probability()]. @@ -139,9 +140,6 @@ identifiable <- function(g, gamma, delta = NULL, functional(terms = list(out$formula)), out$formula ) - if (out$id && data != "interventions") { - out <- identify_terms(out$formula, data, g) - } if (out$id) { n_obs <- sum(!attr(g, "latent")) v <- set_names(integer(n_obs), attr(g, "labels")[!attr(g, "latent")]) @@ -152,6 +150,9 @@ identifiable <- function(g, gamma, delta = NULL, bound[query_vars] <- bound[query_vars] + 1L out$formula <- assign_values(out$formula, bound, v, termwise = TRUE) } + if (out$id && data != "interventions") { + out <- identify_terms(out$formula, data, g) + } out$undefined <- ifelse_(is.null(out$undefined), FALSE, out$undefined) out$counterfactual <- TRUE out$gamma <- gamma diff --git a/R/probability.R b/R/probability.R index 9f24252..14957ec 100644 --- a/R/probability.R +++ b/R/probability.R @@ -61,13 +61,6 @@ is.probability <- function(x) { #' @method format probability #' @rdname probability #' @param x A `probability` object. -#' @param use_primes A `logical` value. If `TRUE` (the default), any value -#' assignment of a counterfactual variable with `obs` will be formatted with -#' as many primes in the superscript as the value of `obs`, e.g., -#' `obs = 0` outputs `"y"`, `obs = 1` outputs `"y'"`, -#' `obs = 2` outputs `"y''"` and so forth. The alternative when `FALSE` is -#' to simply denote the `obs` value via superscript directly as -#' `"y^{(obs)}"`, where obs is evaluated. #' @param use_do A `logical` value. If `TRUE`, the explicit do-operation is #' used to denote interventional probabilities (e.g., \eqn{P(y|do(x))}). #' If `FALSE` (the default), the subscript notation is used instead @@ -98,7 +91,7 @@ is.probability <- function(x) { #' format(f, use_primes = FALSE, use_do = TRUE) #' #' @export -format.probability <- function(x, use_primes = TRUE, use_do = FALSE, ...) { +format.probability <- function(x, use_do = FALSE, ...) { if (length(x$val) > 0L) { return(as.character(x$val)) } @@ -109,7 +102,7 @@ format.probability <- function(x, use_primes = TRUE, use_do = FALSE, ...) { any_do <- length(x$do) > 0L any_cond <- length(x$cond) > 0L if (any_do) { - form_do <- comma_sep(vapply(x$do, format, character(1L), use_primes)) + form_do <- comma_sep(vapply(x$do, format, character(1L), ...)) if (!use_do) { sub <- paste0("_{", form_do, "}") } else { @@ -118,13 +111,13 @@ format.probability <- function(x, use_primes = TRUE, use_do = FALSE, ...) { } if (any_cond) { cond <- paste0( - comma_sep(vapply(x$cond, format, character(1L), use_primes)) + comma_sep(vapply(x$cond, format, character(1L), ...)) ) } if ((any_do && use_do) || any_cond) { rhs <- paste0("|", do, cond) } - var <- paste0(comma_sep(vapply(x$var, format, character(1L), use_primes))) + var <- paste0(comma_sep(vapply(x$var, format, character(1L), ...))) paste0("P", sub, "(", var, rhs, ")") } diff --git a/README.md b/README.md index 9e691eb..6392229 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,8 @@ coverage](https://codecov.io/gh/santikka/cfid/branch/main/graph/badge.svg)](http version](http://www.r-pkg.org/badges/version/cfid)](https://CRAN.R-project.org/package=cfid) + #> Warning: package 'cfid' was built under R version 4.3.2 + ## Overview This package facilitates the identification of counterfactual queries in diff --git a/man/Probability.Rd b/man/Probability.Rd index e99b6da..b315707 100644 --- a/man/Probability.Rd +++ b/man/Probability.Rd @@ -8,7 +8,7 @@ \usage{ probability(val = NULL, var = NULL, do = NULL, cond = NULL) -\method{format}{probability}(x, use_primes = TRUE, use_do = FALSE, ...) +\method{format}{probability}(x, use_do = FALSE, ...) \method{print}{probability}(x, ...) } @@ -29,14 +29,6 @@ conditioning set \eqn{z} in \eqn{P(y|do(x),z)}.} \item{x}{A \code{probability} object.} -\item{use_primes}{A \code{logical} value. If \code{TRUE} (the default), any value -assignment of a counterfactual variable with \code{obs} will be formatted with -as many primes in the superscript as the value of \code{obs}, e.g., -\code{obs = 0} outputs \code{"y"}, \code{obs = 1} outputs \code{"y'"}, -\code{obs = 2} outputs \code{"y''"} and so forth. The alternative when \code{FALSE} is -to simply denote the \code{obs} value via superscript directly as -\code{"y^{(obs)}"}, where obs is evaluated.} - \item{use_do}{A \code{logical} value. If \code{TRUE}, the explicit do-operation is used to denote interventional probabilities (e.g., \eqn{P(y|do(x))}). If \code{FALSE} (the default), the subscript notation is used instead diff --git a/man/counterfactuals.Rd b/man/counterfactuals.Rd index 326eeb5..77c52c2 100644 --- a/man/counterfactuals.Rd +++ b/man/counterfactuals.Rd @@ -63,8 +63,13 @@ the observed value of \code{var} (i.e., \eqn{Y = y})} variables intervened on (via \eqn{do(X = x)}) and values to the value assignments (their levels, e.g., \eqn{x}).} -\item{use_primes}{A \code{logical} value indicating whether primes should be -used to differentiate between value assignments} +\item{use_primes}{A \code{logical} value. If \code{TRUE} (the default), any value +assignment of a counterfactual variable with \code{obs} will be formatted with +as many primes in the superscript as the value of \code{obs}, e.g., +\code{obs = 0} outputs \code{"y"}, \code{obs = 1} outputs \code{"y'"}, +\code{obs = 2} outputs \code{"y''"} and so forth. The alternative when \code{FALSE} is +to simply denote the \code{obs} value via superscript directly as +\code{"y^{(obs)}"}, where obs is evaluated.} } \value{ \code{conj} returns an object of class \code{counterfactual_conjunction}. diff --git a/man/functional.Rd b/man/functional.Rd index aba5dd9..7761400 100644 --- a/man/functional.Rd +++ b/man/functional.Rd @@ -8,7 +8,7 @@ \usage{ functional(sumset = NULL, terms = NULL, numerator = NULL, denominator = NULL) -\method{format}{functional}(x, use_primes = TRUE, use_do = FALSE, ...) +\method{format}{functional}(x, ...) \method{print}{functional}(x, ...) } @@ -31,19 +31,6 @@ The denominator of the quotient representation.} \item{x}{A \code{functional} object.} -\item{use_primes}{A \code{logical} value. If \code{TRUE} (the default), any value -assignment of a counterfactual variable with \code{obs} will be formatted with -as many primes in the superscript as the value of \code{obs}, e.g., -\code{obs = 0} outputs \code{"y"}, \code{obs = 1} outputs \code{"y'"}, -\code{obs = 2} outputs \code{"y''"} and so forth. The alternative when \code{FALSE} is -to simply denote the \code{obs} value via superscript directly as -\code{"y^{(obs)}"}, where obs is evaluated.} - -\item{use_do}{A \code{logical} value. If \code{TRUE}, the explicit do-operation is -used to denote interventional probabilities (e.g., \eqn{P(y|do(x))}). -If \code{FALSE} (the default), the subscript notation is used instead -(e.g., \eqn{P_x(y)}).} - \item{...}{Additional arguments passed to \code{format}.} } \value{ diff --git a/man/identifiable.Rd b/man/identifiable.Rd index 9ab52da..5b0c947 100644 --- a/man/identifiable.Rd +++ b/man/identifiable.Rd @@ -42,7 +42,8 @@ inconsistent, the query will be identifiable, but with probability 0. \item \code{formula}\cr An object of class \code{functional} giving the identifying functional of the query in LaTeX syntax via \code{format} or \code{print}, if identifiable. This expression is given in terms of the -available \code{data}. For tautological statements, the resulting +available \code{data}. Variables bound by summation are distinguished by a +superscript asterisk. For tautological statements, the resulting probability is 1, and for inconsistent statements, the resulting probability is 0. For formatting options, see \code{\link[=format.functional]{format.functional()}} and \code{\link[=format.probability]{format.probability()}}. diff --git a/tests/testthat/test-probability.R b/tests/testthat/test-probability.R index 8af67f6..761023c 100644 --- a/tests/testthat/test-probability.R +++ b/tests/testthat/test-probability.R @@ -30,6 +30,12 @@ id5 <- identifiable(g2, v5, v7) id6 <- identifiable(g2, v5, v7, data = "obs") id7 <- identifiable(g1, conj(v1, v2, v3)) +g3 <- dag("X -> Z -> Y") +v8 <- cf("Z", 0, c("X" = 0)) +v9 <- cf("Y", 1) +id8 <- identifiable(g3, v8, v9) +id9 <- identifiable(g3, v8, v9, data = "obs") + # Format ------------------------------------------------------------------ test_that("probability format works", { @@ -67,15 +73,23 @@ test_that("probability format works", { ) expect_identical( format(id5$formula), - "\\frac{\\sum_{x^*} P_{x^*}(y)P(x^*)P_{x^*,y}(z)}{\\sum_{x^*,y^*} P(x^*)P_{x^*,y^*}(z)P_{x^*}(y^*)}" + "\\frac{\\sum_{x^*} P_{x}(y)P(x^*)P_{x^*,y}(z)}{\\sum_{x^*,y^*} P(x^*)P_{x^*,y^*}(z)P_{x^*}(y^*)}" ) expect_identical( format(id6$formula), - "\\frac{\\sum_{x^*} P(y|x^*)P(x^*)P(z|x^*,y)}{\\sum_{x^*,y^*} P(x^*)P(z|x^*,y^*)P(y^*|x^*)}" + "\\frac{\\sum_{x^*} P(y|x)P(x^*)P(z|x^*,y)}{\\sum_{x^*,y^*} P(x^*)P(z|x^*,y^*)P(y^*|x^*)}" ) expect_identical( format(id7$formula), - "\\sum_{w,d^*} P_{x}(w)P_{w,z}(y,x')P_{d^*}(z)P(d^*)" + "\\sum_{w,d^*} P_{x}(w)P_{w,z}(y,x')P_{d}(z)P(d^*)" + ) + expect_identical( + format(id8$formula), + "\\frac{\\sum_{x^*} P_{x}(z)P(x^*)P_{z}(y')}{\\sum_{x^*,z^*} P(x^*)P_{x^*}(z^*)P_{z^*}(y')}" + ) + expect_identical( + format(id9$formula), + "\\frac{\\sum_{x^*} P(z|x)P(x^*)P(y'|x,z)}{\\sum_{x^*,z^*} P(x^*)P(z^*|x^*)P(y'|x,z^*)}" ) })