# classify_documents.R

#' Classify tokenized documents into groups using scored walk terms
#'
#' Matches the tokens of each document against the terms in `walk_terms`,
#' sums the chosen score per document and group, and optionally normalizes
#' and thresholds the resulting scores.
#'
#' @param walk_terms Data frame of terms (per the original author's note:
#'   generated by `get_rwr_terms()`). Must contain the columns `NodeNames`,
#'   the column named by `group_name`, and the column named by
#'   `classification_measure`. A `seed_term` column is required whenever
#'   seed terms are used (cutoff with `keep_seed_terms`, `seedterm_value`,
#'   or `return_walk_terms`).
#' @param group_name Name of the column in `walk_terms` that holds the
#'   classes the documents are sorted into.
#' @param document_tokens Tokenized documents to classify: one token per row,
#'   with a document id column.
#' @param tokens_var Name of the token column in `document_tokens`
#'   (e.g. "tokens", "lemma").
#' @param doc_id Name of the document id column in `document_tokens`.
#' @param classification_measure Name of the score column in `walk_terms`
#'   used for classification. If left at its default, the first choice
#'   ("Score") is used.
#' @param cutoff Optional numeric threshold; terms whose
#'   `classification_measure` is below it are dropped. `NULL` to skip.
#' @param keep_seed_terms If `TRUE`, seed terms are kept even when their
#'   score is below `cutoff`. Only relevant when `cutoff` is set.
#' @param seedterm_value Optional numeric value that overwrites the
#'   `classification_measure` of all seed terms. `NULL` to skip.
#' @param normalize_scores "doc" to rescale scores to 0..1 within each
#'   document, "group" to rescale within each group, or `NULL` to skip.
#'   If left at its default, "doc" is used. Adds a `score_norm` column.
#' @param cut_lower_quantile_fields Optional quantile (passed to
#'   `stats::quantile()` as `probs`). Scores below that quantile, computed
#'   over all document-group scores, are set to 0; if scores are normalized,
#'   `score_norm` is thresholded the same way using its own quantile.
#'   `NULL` to skip.
#' @param cut_frequent_group_terms Drop terms that occur in many groups:
#'   "auto" keeps only terms found in at most half of the groups, a number
#'   keeps terms found in at most that many groups, `NULL` skips the step.
#'   The default expression evaluates to "auto".
#' @param return_walk_terms If `TRUE`, the processed terms used for
#'   classification are returned as well.
#' @param return_unclassified_docs If `TRUE`, the ids of documents without
#'   any matching term are returned as well.
#' @param verbose If `TRUE`, the number of unclassified documents is printed.
#'
#' @return If `return_walk_terms` or `return_unclassified_docs` is `TRUE`, a
#'   list with `classified_documents` and, as requested, `walk_terms` and
#'   `unclassified_documents`. Otherwise only the `classified_documents`
#'   data frame (one row per document-group combination with a `score`
#'   column and, if normalized, `score_norm`).
classify_documents <- function(
    walk_terms,
    group_name = "policy_field",
    document_tokens,
    tokens_var,
    doc_id,
    classification_measure = c("Score", "ScoreMean",
                               "ScoreNorm", "ScoreNormMean",
                               "ScoreNormGroup", "ScoreNormGroupMean"),
    cutoff = NULL,
    keep_seed_terms = TRUE,
    seedterm_value = NULL,
    normalize_scores = c("doc", "group", NULL),
    cut_lower_quantile_fields = NULL,
    cut_frequent_group_terms = c(NULL, numeric(), "auto"),
    return_walk_terms = TRUE,
    return_unclassified_docs = TRUE,
    verbose = TRUE
    ) {

  ## Data and Input Checks

  # arg_match() only *returns* the validated choice, so the result has to be
  # assigned; otherwise the untouched default stays a multi-element vector.
  classification_measure <- rlang::arg_match(classification_measure)

  # NULL is a documented way to skip normalization and must bypass arg_match()
  if (!is.null(normalize_scores)) {
    normalize_scores <- rlang::arg_match(normalize_scores)
  }

  if (!is.null(cut_frequent_group_terms)) {
    is_scalar <- length(cut_frequent_group_terms) == 1L
    if (is_scalar && is.na(cut_frequent_group_terms)) {
      stop("cut_frequent_group_terms must be NULL, 'auto', or a numeric value, not NA.\n")
    }
    if (!is_scalar ||
        !(is.numeric(cut_frequent_group_terms) ||
          cut_frequent_group_terms == "auto")) {
      stop(
        paste0(
          "cut_frequent_group_terms must be NULL, 'auto', or a numeric value, not '",
          toString(cut_frequent_group_terms),
          "'.\n"
        )
      )
    }
  }

  if (!(classification_measure %in% names(walk_terms))) {
    stop(paste("classification_measure", classification_measure,
               "not present in walk_terms data\n"))
  }

  if (!(doc_id %in% names(document_tokens))) {
    stop(paste("doc_id", doc_id, "not present in document_tokens data\n"))
  }

  if (!(tokens_var %in% names(document_tokens))) {
    stop(paste("tokens_var", tokens_var, "not present in document_tokens data\n"))
  }

  # seed_term is only touched by the seed-related options and the term report
  uses_seed_flag <- (!is.null(cutoff) && isTRUE(keep_seed_terms)) ||
    !is.null(seedterm_value) ||
    isTRUE(return_walk_terms)
  required_walk_cols <- c("NodeNames", group_name,
                          if (uses_seed_flag) "seed_term")
  missing_walk_cols <- setdiff(required_walk_cols, names(walk_terms))
  if (length(missing_walk_cols) > 0) {
    stop(paste("column(s)", toString(missing_walk_cols),
               "not present in walk_terms data\n"))
  }

  # symbols for the user-supplied column names, injected with !! below
  measure_sym <- as.name(classification_measure)
  group_sym <- as.name(group_name)
  tokens_sym <- as.name(tokens_var)
  doc_sym <- as.name(doc_id)

  uses_mean_measure <- grepl("Mean", classification_measure, fixed = TRUE)

  ## Data Prep

  ### apply cutoff (seed terms survive only if keep_seed_terms is TRUE)
  if (!is.null(cutoff)) {
    walk_terms <- if (isTRUE(keep_seed_terms)) {
      walk_terms %>%
        dplyr::filter(!!measure_sym >= cutoff | seed_term == TRUE)
    } else {
      walk_terms %>%
        dplyr::filter(!!measure_sym >= cutoff)
    }
  }

  ### overwrite seedterm values (if desired)
  if (!is.null(seedterm_value)) {
    walk_terms <- walk_terms %>%
      dplyr::mutate(
        !!measure_sym := dplyr::case_when(
          seed_term == TRUE ~ seedterm_value,
          .default = !!measure_sym
        )
      )
  }

  ### one score per term and group
  # "Mean" measures are used as they are; for the other measures the scores
  # are averaged per term and group here. Note that this averaging happens
  # AFTER any filtering above, so dropped values do not enter the mean.
  term_scores <- if (uses_mean_measure) {
    walk_terms %>%
      dplyr::distinct(NodeNames, !!measure_sym, !!group_sym)
  } else {
    walk_terms %>%
      dplyr::summarise(
        !!measure_sym := mean(!!measure_sym),
        .by = c(NodeNames, !!group_sym)
      )
  }

  # keep only tokens that are walk terms and attach score and group;
  # many-to-many because a term can occur in several documents and groups
  classification_terms <- document_tokens %>%
    dplyr::inner_join(
      term_scores,
      by = dplyr::join_by(!!tokens_sym == NodeNames),
      relationship = "many-to-many"
    )

  ### cut terms that appear in too many groups
  if (!is.null(cut_frequent_group_terms)) {
    max_groups <- if (is.numeric(cut_frequent_group_terms)) {
      cut_frequent_group_terms
    } else {
      # "auto": at most half of the groups present in the matched terms
      nrow(dplyr::distinct(classification_terms, !!group_sym)) / 2
    }

    retained_terms <- classification_terms %>%
      dplyr::distinct(!!tokens_sym, !!group_sym) %>%
      dplyr::count(!!tokens_sym, name = ".n_groups") %>%
      dplyr::filter(.n_groups <= max_groups) %>%
      dplyr::pull(!!tokens_sym)

    classification_terms <- classification_terms %>%
      dplyr::filter(!!tokens_sym %in% retained_terms)
  }

  ## classify

  # sum the scores per document and group; missing combinations score 0
  classified_documents <- classification_terms %>%
    dplyr::summarize(score = sum(!!measure_sym),
                     .by = c(!!doc_sym, !!group_sym)) %>%
    tidyr::complete(!!doc_sym, !!group_sym,
                    fill = list(score = 0))

  if (!is.null(normalize_scores)) {
    # rescale within each document or within each group
    rescale_within <- if (normalize_scores == "doc") doc_id else group_name
    classified_documents <- classified_documents %>%
      dplyr::mutate(
        score_norm = scales::rescale(score, to = c(0, 1)),
        .by = dplyr::all_of(rescale_within)
      )
  }

  if (!is.null(cut_lower_quantile_fields)) {
    # set scores below the requested quantile (over all rows) to 0
    score_threshold <- stats::quantile(classified_documents$score,
                                       cut_lower_quantile_fields)[[1]]
    classified_documents <- classified_documents %>%
      dplyr::mutate(score = dplyr::case_when(score < score_threshold ~ 0,
                                             .default = score))

    if (!is.null(normalize_scores)) {
      # same for the normalized scores, using their own quantile
      norm_threshold <- stats::quantile(classified_documents$score_norm,
                                        cut_lower_quantile_fields)[[1]]
      classified_documents <- classified_documents %>%
        dplyr::mutate(score_norm = dplyr::case_when(score_norm < norm_threshold ~ 0,
                                                    .default = score_norm))
    }
  }

  ## report unclassified documents (documents without any matching term)
  if (verbose || return_unclassified_docs) {
    unclassified_documents <- document_tokens %>%
      dplyr::anti_join(classified_documents, by = doc_id) %>%
      dplyr::distinct(!!doc_sym)

    if (verbose) {
      unclassified <- nrow(unclassified_documents)
      total <- nrow(dplyr::distinct(document_tokens, !!doc_sym))
      cat(paste(unclassified,
                "out of",
                total,
                "documents could not be classified",
                paste0("(", scales::percent(unclassified / total), ")"),
                "\n"))
    }
  }

  ## return the result
  if (!(return_walk_terms || return_unclassified_docs)) {
    return(classified_documents)
  }

  out <- list(classified_documents = classified_documents)

  if (return_walk_terms) {
    # the terms (with scores) that actually entered the classification
    used_terms <- classification_terms %>%
      dplyr::distinct(!!tokens_sym, !!group_sym, !!measure_sym)

    term_info <- if (uses_mean_measure) {
      # add the seed_term indicator
      walk_terms %>%
        dplyr::distinct(!!group_sym, NodeNames, seed_term)
    } else {
      # means were calculated above, so also report the unprocessed values
      walk_terms %>%
        dplyr::distinct(!!group_sym, NodeNames, seed_term, !!measure_sym) %>%
        dplyr::rename_with(function(nm) paste0(nm, "_unprocessed"),
                           dplyr::all_of(classification_measure))
    }

    out$walk_terms <- used_terms %>%
      dplyr::left_join(term_info,
                       by = dplyr::join_by(!!group_sym,
                                           !!tokens_sym == NodeNames)) %>%
      dplyr::arrange(!!group_sym, dplyr::desc(!!measure_sym)) %>%
      dplyr::select(!!tokens_sym,
                    !!group_sym,
                    seed_term,
                    !!measure_sym,
                    dplyr::everything())
  }

  if (return_unclassified_docs) {
    out$unclassified_documents <- unclassified_documents
  }

  out
}