diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index e6a389d..8e89297 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -32,7 +32,7 @@ jobs: steps: - uses: actions/checkout@v2 - name: Set up R ${{ matrix.config.os }} (${{ matrix.config.r }}) - uses: r-lib/actions/setup-r@v1 + uses: r-lib/actions/setup-r@v2 with: r-version: ${{ matrix.config.r }} http-user-agent: ${{ matrix.config.http-user-agent }} diff --git a/R/internal_query.R b/R/internal_query.R index 34ab477..795e6fa 100644 --- a/R/internal_query.R +++ b/R/internal_query.R @@ -21,6 +21,8 @@ #' @param collection_date_complete include only entries with complete in collection date the results. #' @param total returns the total number of sequences matching the query. #' @param fast returns all of the accession_ids that match the query. +#' @param aa_substitution returns all sequences with the amino acid mutation(s), negative selection by '-' prefix +#' @param nucl_mutation returns all sequences with the nucleotide mutation(s), negative selection by '-' prefix #' @return Dataframe. 
internal_query <- function(credentials, @@ -34,6 +36,8 @@ internal_query <- to_subm = NULL, virus_name = NULL, order_by = NULL, + aa_substitution = NULL, + nucl_mutation = NULL, order_asc = TRUE, start_index = 0, nrows = 50, @@ -170,6 +174,35 @@ internal_query <- ) } + # amino acid changes + if (!is.null(aa_substitution)) { + queue <- + append( + queue, + create_search_queue( + credentials, + credentials$aa_substitution_ceid, + aa_substitution, + 'FilterChange' + ) + ) + } + + # nucleotide changes + if (!is.null(nucl_mutation)) { + queue <- + append ( + queue, + create_search_queue( + credentials, + credentials$nucl_mutation_ceid, + nucl_mutation, + 'FilterChange' + ) + ) + } + + if (low_coverage_excl) { queue <- append( @@ -342,6 +375,8 @@ internal_query <- from_subm = from_subm, to = to, to_subm = to_subm, + aa_substitution = aa_substitution, + nucl_mutation = nucl_mutation, nrows = j$totalRecords, # set load_all to false to break the recursion load_all = FALSE, diff --git a/R/login.R b/R/login.R index 8ab4e50..6e33401 100644 --- a/R/login.R +++ b/R/login.R @@ -200,6 +200,10 @@ login <- function(username, password, database="EpiCoV") { linage_ceid <- NULL } + # AA substitution/mutation- ", " separated values + aa_substitution_ceid <- extract_search_ceid('mutation', customSearch_page_text) + # nucleotide substitution/nuc mutation, ", " separated values + nucl_mutation_ceid <- extract_search_ceid('nuc_mutation', customSearch_page_text) # Virus Name virus_name_ceid <- extract_search_ceid('covv_virus_name', customSearch_page_text) @@ -250,6 +254,12 @@ login <- function(username, password, database="EpiCoV") { # collection date complete collection_date_complete_ceid <- extract_search_ceid('quality2', customSearch_page_text) + # AA substitution + aa_substitution_ceid <- + extract_search_ceid('mutation', customSearch_page_text) + # nucleotide mutation + nucl_mutation_ceid <- + extract_search_ceid('nuc_mutation', customSearch_page_text) } # send selection command @@ 
-281,6 +291,8 @@ login <- function(username, password, database="EpiCoV") { text_ceid = text_ceid, location_ceid = location_ceid, search_cid = search_cid, + aa_substitution_ceid = aa_substitution_ceid, + nucl_mutation_ceid = nucl_mutation_ceid, linage_ceid = linage_ceid, virus_name_ceid = virus_name_ceid, from_ceid = from_ceid, diff --git a/R/query.R b/R/query.R index 268085d..be41523 100644 --- a/R/query.R +++ b/R/query.R @@ -22,6 +22,8 @@ #' @param collection_date_complete include only entries with complete in collection date the results. #' @param total returns the total number of sequences matching the query. #' @param fast returns all of the accession_ids that match the query. +#' @param aa_substitution returns all sequences with the amino acid mutation(s), negative selection by '-' prefix +#' @param nucl_mutation returns all sequences with the nucleotide mutation(s), negative selection by '-' prefix #' @return data.frame query <- function(credentials, @@ -35,6 +37,8 @@ query <- to_subm = NULL, virus_name = NULL, order_by = NULL, + aa_substitution = NULL, + nucl_mutation = NULL, order_asc = TRUE, start_index = 0, nrows = 50, @@ -69,7 +73,9 @@ query <- low_coverage_excl = low_coverage_excl, complete = complete, high_coverage = high_coverage, - collection_date_complete = collection_date_complete + collection_date_complete = collection_date_complete, + aa_substitution = aa_substitution, + nucl_mutation = nucl_mutation )) } return(results) @@ -93,6 +99,8 @@ query <- complete = complete, high_coverage = high_coverage, collection_date_complete = collection_date_complete, + aa_substitution = aa_substitution, + nucl_mutation = nucl_mutation, total = total, fast = fast ) @@ -117,7 +125,9 @@ query <- low_coverage_excl = low_coverage_excl, complete = complete, high_coverage = high_coverage, - collection_date_complete = collection_date_complete + collection_date_complete = collection_date_complete, + aa_substitution = aa_substitution, + nucl_mutation = nucl_mutation ) ) } diff --git a/README.md b/README.md index 
1411961..be367f8 100644 --- a/README.md +++ b/README.md @@ -283,12 +283,27 @@ total ## Download -To download the full data set you need a list of accession IDs (which can be obtained from `query` results). +To download the full data set you need a list of accession IDs (which can be obtained from `query` results). This will also download the sequence data for each entry. + +``` r +full_df_with_seq <- download( + credentials = credentials, + list_of_accession_ids = list_of_accession_ids, +) +full_df_with_seq$sequence +``` + +[1] "AGATCTGTTCTCTAAACGAACTTTAAAATCT... +[2] "AGATCTGTTCTCTAAACGAACTTTAAAATCT... +[3] "AGATCTGTTCTCTAAACGAACTTTAAAATCT... +... + +You can stop GISAIDR from loading the sequence data into memory by setting `get_sequence = FALSE`. Note: the sequence data will still be downloaded. ``` r df <- query(credentials = credentials) list_of_accession_ids <- df$accession_id -full_df <- download(credentials = credentials, list_of_accession_ids = list_of_accession_ids) +full_df <- download(credentials = credentials, list_of_accession_ids = list_of_accession_ids, get_sequence=FALSE) colnames(full_df) ``` @@ -302,24 +317,6 @@ colnames(full_df) Note: a maximum of 5000 results can be downloaded at a time. -### Get sequence data - -Use the `get_sequence` argument to download the sequences with the full data. - -``` r -full_df_with_seq <- download( - credentials = credentials, - list_of_accession_ids = list_of_accession_ids, - get_sequence=TRUE -) -full_df_with_seq$sequence -``` - -[1] "AGATCTGTTCTCTAAACGAACTTTAAAATCT... -[2] "AGATCTGTTCTCTAAACGAACTTTAAAATCT... -[3] "AGATCTGTTCTCTAAACGAACTTTAAAATCT... -... - ### Export to fasta file Use the export_fasta function to write sequence data to a file in fasta format. The sequence names will be [country\@pango_lineage\@accesion_id\@date](mailto:country@pango_lineage@accesion_id@date), with the date in decimal format (requires the [lubridate](https://cran.r-project.org/web/packages/lubridate/index.html) package). 
The default is to only export sequences for which a decimal date could be set. To prevent this, use the argument export_dated_only = F. @@ -417,8 +414,9 @@ quality_ceid <- extract_search_ceid("quality'", customSearch_page_text) ``` 1. Add the extracted `ceid` to the list of `credentials` e.g. `complete_ceid = complete_ceid` -2. Add the new argument and default value to the `query()` function in `query.R` e.g. `complete = FALSE`. -3. Create and append a search queue to the main queue if the `complete` argument is used. Create the command using the `create_search_queue()` function. Use the `complete_ceid` for the `ceid` and the checkbox value (identified in step 4) for the `cvalue` e.g. +2. Add the new argument and default value to **all** query functions (`query()` in `query.R` and `internal_query()` in `internal_query.R`) e.g. `complete = FALSE`. +3. Pass the new argument through the recursive call in the `(load_all && j$totalRecords > nrows)` branch of `internal_query()` so that every paginated request keeps using the argument. +4. Create and append a search queue to the main queue if the `complete` argument is used. Create the command using the `create_search_queue()` function. Use the `complete_ceid` for the `ceid` and the checkbox value (identified in step 4) for the `cvalue` e.g. 
``` r if (complete) { diff --git a/tests/testthat/test-query.R b/tests/testthat/test-query.R index 8a816a4..ceeec5c 100644 --- a/tests/testthat/test-query.R +++ b/tests/testthat/test-query.R @@ -145,6 +145,29 @@ test_that("order_by works", { expect_true(df$submission_date[1] == "2020-01-10") }) +test_that("aa_substitution works", { + df <- query(credentials = credentials, + aa_substitution = 'Spike_E484Q, Spike_H69del, -N_P13L', + to_subm = '2023-02-22', + load_all = TRUE, + order_by='submission_date') + expect_true(is.data.frame(df)) + expect_equal(df$submission_date[1], "2021-01-25") + expect_equal(nrow(df),576) + ## to test accuracy - set of 4 rarely co-existing mutations to verify Spike_H69del, Spike_A222V, Spike_G476S, -N_P13L +}) + +test_that("nucl_mutation works", { + df <- query(credentials = credentials, + nucl_mutation = '-T23599G, -C10029T, -C14408T, -A23403G, T22679C, G28881A, A24424T', + to_subm = '2023-02-22', + load_all = TRUE, + order_by='submission_date') + expect_true(is.data.frame(df)) + expect_equal(df$submission_date[1],"2021-12-29") + expect_equal(nrow(df),55) +}) + test_that("text search works", { accession_ids = c("EPI_ISL_17398411", "EPI_ISL_17199001", "EPI_ISL_17409201", "EPI_ISL_17243716") df <- query(credentials = credentials, text = paste(accession_ids, collapse = "\n"))