Skip to content

Commit

Permalink
Merge pull request #200 from OscarKjell/pii-cleaning
Browse files Browse the repository at this point in the history
Pii cleaning
  • Loading branch information
OscarKjell authored Nov 30, 2024
2 parents b6f3c09 + b05c2ea commit 4cc10a6
Show file tree
Hide file tree
Showing 22 changed files with 468 additions and 341 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ export(textAssess)
export(textCentrality)
export(textCentralityPlot)
export(textClassify)
export(textClean)
export(textCleanNonASCII)
export(textDescriptives)
export(textDimName)
Expand Down
11 changes: 9 additions & 2 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
# text (development version)

# text (development versions)

<!-- README.md is generated from README.Rmd. Please edit that file -->

# text 1.3.0
- Alias function: `textPredict()`, `textAssess()` and `textClassify()`.
- LBAM integration with `textLBAM()`.
- Full support of implicit motives models.
- Text cleaning functionality with `textClean()` (removing common personal information).
- Compatibility with the topics-package, see [www.r-topics.org](https://r-topics.org/).

# text 1.2.17
- `textLBAM()` returns the library as a dataframe

Expand Down
1 change: 1 addition & 0 deletions R/0_1_globals.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ utils::globalVariables(c(
"hgTransformerGetNER", "hgTransformerGetSummarization", "hgTransformerGetQA",
"hgTransformerGetTranslation", "hgTokenizerGetTokens", "hgTransformerGetZeroShot",
"textModelsPy", "textModelsRMPy", "get_number_of_hidden_layers", "hgTransformerFineTune",
"remove_pii",


# EMBEDDINGS OBJECTS
Expand Down
71 changes: 71 additions & 0 deletions R/1_0_textDescriptives.R
Original file line number Diff line number Diff line change
Expand Up @@ -141,3 +141,74 @@ textDescriptives <- function(words,
output <- dplyr::bind_cols(variable_name, output)
output
}


#' Clean text from standard personal information
#'
#' Sources the Python script `text_cleaner.py` shipped with the package and
#' passes `text`, together with all switches below, to its `remove_pii()`
#' function. Note that the cleaning is not a guarantee for anonymization.
#' @param text (character) The text to be cleaned.
#' @param replace (boolean) Forwarded to `remove_pii()` as `replace`;
#' presumably controls whether detected information is replaced by a
#' placeholder (confirm against `text_cleaner.py`).
#' @param date (boolean) Forwarded to `remove_pii()`; presumably toggles
#' cleaning of dates.
#' @param time (boolean) Forwarded to `remove_pii()`; presumably toggles
#' cleaning of times.
#' @param phone (boolean) Forwarded to `remove_pii()`; presumably toggles
#' cleaning of phone numbers.
#' @param email (boolean) Forwarded to `remove_pii()`; presumably toggles
#' cleaning of email addresses.
#' @param ip (boolean) Forwarded to `remove_pii()`; presumably toggles
#' cleaning of IP addresses.
#' @param money (boolean) Forwarded to `remove_pii()`; presumably toggles
#' cleaning of monetary amounts.
#' @param creditcard (boolean) Forwarded to `remove_pii()`; presumably toggles
#' cleaning of credit card numbers.
#' @param bitcoin (boolean) Forwarded to `remove_pii()`; presumably toggles
#' cleaning of bitcoin addresses.
#' @param location (boolean) Forwarded to `remove_pii()`; presumably toggles
#' cleaning of locations.
#' @param ssn (boolean) Forwarded to `remove_pii()`; presumably toggles
#' cleaning of social security numbers.
#' @param at_symbol (boolean) Forwarded to `remove_pii()`; presumably toggles
#' cleaning of at-symbol handles.
#' @param url (boolean) Forwarded to `remove_pii()`; presumably toggles
#' cleaning of URLs.
#' @return Text cleaned from typical personal identifiable information
#' @importFrom reticulate source_python
#' @export
textClean <- function(text,
                      replace = TRUE,
                      date = TRUE,
                      time = TRUE,
                      phone = TRUE,
                      email = TRUE,
                      ip = TRUE,
                      money = TRUE,
                      creditcard = TRUE,
                      bitcoin = TRUE,
                      location = TRUE,
                      ssn = TRUE,
                      at_symbol = TRUE,
                      url = TRUE) {
  # Resolve the bundled Python script; mustWork = TRUE errors if it is missing.
  cleaner_script <- system.file(
    "python", "text_cleaner.py",
    package = "text",
    mustWork = TRUE
  )

  # Sourcing the script makes remove_pii() available in this function's frame.
  reticulate::source_python(cleaner_script)

  # Collect every option under its own name and hand them to remove_pii().
  pii_options <- list(
    text = text,
    replace = replace,
    date = date,
    time = time,
    phone = phone,
    email = email,
    ip = ip,
    money = money,
    creditcard = creditcard,
    bitcoin = bitcoin,
    location = location,
    ssn = ssn,
    at_symbol = at_symbol,
    url = url
  )

  do.call(remove_pii, pii_options)
}




4 changes: 3 additions & 1 deletion R/2_4_0_textPredict_Assess_Classify.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# Wrapper functions for textPredictR and textClassifyPipe


#' textPredict, textAssess and textClassify
#'
#' Trained models created by e.g., textTrain() or stored on e.g., github or huggingface
Expand Down Expand Up @@ -324,6 +323,9 @@ textPredict <- function(
)
}

# display message to user
message(colourise("Predictions are ready!", fg = "green"))
message("\n")
return(results)

}
Expand Down
3 changes: 0 additions & 3 deletions R/2_4_1_textPredictTextTrained.R
Original file line number Diff line number Diff line change
Expand Up @@ -768,9 +768,6 @@ textPredictTextTrained <- function(
predictions = list(predicted_scores2))
}

# display message to user
message(colourise("Predictions are ready!", fg = "green"))
#message("\n")
return(predicted_scores2)
}

Expand Down
3 changes: 0 additions & 3 deletions R/2_4_2_textPredictImplicitMotives.R
Original file line number Diff line number Diff line change
Expand Up @@ -853,9 +853,6 @@ textPredictImplicitMotives <- function(
colnames(predicted_scores2[[2]])[colnames(predicted_scores2[[2]]) == "participant_id"] <- "row_id"
}

# display message to user
message(colourise("Predictions are ready!", fg = "green"))
#message("\n")
return(predicted_scores2)
}

17 changes: 16 additions & 1 deletion R/5_2_textGeneration.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
# NOTE(review): these top-level assignments look like leftover interactive
# debugging values, presumably mirroring the arguments of textGeneration() --
# confirm whether they are meant to ship with the package.
x <- "The meaning of life is"
model <- "gpt2"
device <- "cpu"
tokenizer_parallelism <- FALSE
logging_level <- "warning"
force_return_results <- FALSE
# Assigned once: an earlier TRUE assignment was immediately overwritten by
# FALSE, so only the effective value is kept.
return_tensors <- FALSE
return_full_text <- FALSE
clean_up_tokenization_spaces <- FALSE
prefix <- ""
handle_long_generation <- "hole"
set_seed <- 22L


#' Text generation
#'
#' textGeneration() predicts the words that will follow a specified text prompt. (experimental)
Expand Down Expand Up @@ -84,7 +99,7 @@ textGeneration <- function(x,

# Sort output into tidy-format
if (return_tensors == FALSE) {
output1 <- dplyr::bind_rows(hg_generated)
output1 <- dplyr::bind_rows(hg_generated[[1]][[1]][[1]])
output1
}

Expand Down
4 changes: 3 additions & 1 deletion R/5_5_textQA.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@


#' Question Answering. (experimental)
#' @param question (string) A question
#' @param context (string) The context(s) where the model will look for the answer.
Expand Down Expand Up @@ -75,7 +77,7 @@ textQA <- function(question,
max_question_len = max_question_len,
handle_impossible_answer = handle_impossible_answer,
set_seed = set_seed
)[[1]]
)#[[1]]

output1 <- dplyr::bind_rows(hg_QA)

Expand Down
Loading

0 comments on commit 4cc10a6

Please sign in to comment.