diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 906c0fb1..093db343 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -23,10 +23,10 @@ jobs: steps: - name: Confirm crew102 triggered the build run: | - if [ "${{ github.event.sender.login }}" == "crew102" ]; then + if [ "${{ env.GITHUB_ACTOR }}" == "crew102" ]; then echo "Actor is crew102" else - echo "Actor is ${{ github.actor }}, failing build." + echo "Actor is ${{ env.GITHUB_ACTOR }}, failing build." exit 1 fi @@ -119,4 +119,4 @@ jobs: uses: actions/upload-artifact@main with: name: ${{ runner.os }}-r${{ matrix.config.r }}-results - path: check \ No newline at end of file + path: check diff --git a/DESCRIPTION b/DESCRIPTION index cacc3d76..37ee6a1e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -19,7 +19,7 @@ LazyData: TRUE Depends: R (>= 3.1) Imports: - httr, + httr2, lifecycle, jsonlite, utils diff --git a/NAMESPACE b/NAMESPACE index 46981d70..2cff0eaa 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,6 +10,7 @@ export(cast_pv_data) export(get_endpoints) export(get_fields) export(get_ok_pk) +export(pad_patent_id) export(qry_funs) export(retrieve_linked_data) export(search_pv) diff --git a/R/cast-pv-data.R b/R/cast-pv-data.R index 57716461..e3cd59da 100644 --- a/R/cast-pv-data.R +++ b/R/cast-pv-data.R @@ -5,21 +5,23 @@ as_is <- function(x) x get_cast_fun <- function(data_type) { # Some fields aren't documented, so we don't know what their data type is. Use # string type for these. + # new version of the API: state of string vs fulltext is in flux. Latter currently unused if (length(data_type) != 1) data_type <- "string" - switch( - data_type, + switch(data_type, "string" = as_is, "date" = as.Date, - "float" = as.numeric, - "integer" = as.integer, + "number" = as_is, + "integer" = as_is, "int" = as.integer, - "fulltext" = as_is + "fulltext" = as_is, + "boolean" = as_is, + "bool" = as.logical ) } #' @noRd lookup_cast_fun <- function(name, typesdf) { - data_type <- typesdf[typesdf$field == name, "data_type"] + data_type <- typesdf[typesdf$common_name == name, "data_type"] get_cast_fun(data_type = data_type) } @@ -29,6 +31,18 @@ cast_one.character <- function(one, name, typesdf) { cast_fun(one) } +#' @noRd +cast_one.double <- function(one, name, typesdf) { + cast_fun <- lookup_cast_fun(name, typesdf) + cast_fun(one) +} + +#' @noRd +cast_one.integer <- function(one, name, typesdf) { + cast_fun <- lookup_cast_fun(name, typesdf) + cast_fun(one) +} + #' @noRd cast_one.default <- function(one, name, typesdf) NA @@ -69,7 +83,7 @@ cast_one <- function(one, name, typesdf) UseMethod("cast_one") #' \dontrun{ #' #' fields <- c("patent_date", "patent_title", "patent_year") -#' res <- search_pv(query = "{\"patent_number\":\"5116621\"}", fields = fields) +#' res <- search_pv(query = "{\"patent_id\":\"5116621\"}", fields = fields) #' cast_pv_data(data = res$data) #' } #' @@ -77,9 +91,25 @@ cast_one <- function(one, name, typesdf) UseMethod("cast_one") cast_pv_data <- function(data) { validate_pv_data(data) - endpoint <- names(data) + entity_name <- names(data) + + if (entity_name == "rel_app_texts") { + # blend the fields from both rel_app_texts entities + typesdf <- unique(fieldsdf[fieldsdf$group == entity_name, c("common_name", "data_type")]) + } else { + # need to get the endpoint from entity_name + endpoint_df <- fieldsdf[fieldsdf$group == entity_name, ] + endpoint <- unique(endpoint_df$endpoint) + + # watch out here- several endpoints return entities that are groups returned + # by the patent and publication endpoints (attorneys, inventors, assignees) + if(length(endpoint) > 1) { + endpoint <- endpoint[!endpoint %in% c("patent", "publication")] + } + + typesdf <- fieldsdf[fieldsdf$endpoint == endpoint, c("common_name", "data_type")] - typesdf <- fieldsdf[fieldsdf$endpoint == endpoint, c("field", "data_type")] + } df <- data[[1]] @@ -89,7 +119,7 @@ cast_pv_data <- function(data) { df[] <- list_out out_data <- list(x = df) - names(out_data) <- endpoint + names(out_data) <- entity_name structure( out_data, diff --git a/R/check-query.R b/R/check-query.R index c572e02e..238c8824 100644 --- a/R/check-query.R +++ b/R/check-query.R @@ -10,28 +10,32 @@ is_int <- function(x) #' @noRd is_date <- function(x) - grepl("[12][[:digit:]]{3}-[01][[:digit:]]-[0-3][[:digit:]]", x) + grepl("^[12][[:digit:]]{3}-[01][[:digit:]]-[0-3][[:digit:]]$", x) #' @noRd one_check <- function(operator, field, value, f1) { - if (nrow(f1) == 0) stop2(field, " is not a valid field to query for your endpoint") if (f1$data_type == "date" && !is_date(value)) stop2("Bad date: ", value, ". Date must be in the format of yyyy-mm-dd") - if (f1$data_type %in% c("string", "fulltext") && !is.character(value)) + if (f1$data_type %in% c("bool", "int", "string", "fulltext") && !is.character(value)) stop2(value, " must be of type character") if (f1$data_type == "integer" && !is_int(value)) stop2(value, " must be an integer") + if (f1$data_type == "boolean" && !is.logical(value)) + stop2(value, " must be a boolean") + if (f1$data_type == "number" && !is.numeric(value)) + stop2(value, " must be a number") if ( - (operator %in% c("_begins", "_contains") && !(f1$data_type == "string")) || - (operator %in% c("_text_all", "_text_any", "_text_phrase") && - !(f1$data_type == "fulltext")) || - (f1$data_type %in% c("string", "fulltext") && - operator %in% c("_gt", "_gte", "_lt", "_lte")) - ) + # The new version of the API blurrs the distinction between string/fulltext fields. + # It looks like the string/fulltext functions can be used interchangeably + (operator %in% c("_begins", "_contains", "_text_all", "_text_any", "_text_phrase") && + !(f1$data_type == "fulltext" || f1$data_type == "string")) || + (f1$data_type %in% c("string", "fulltext") && + operator %in% c("_gt", "_gte", "_lt", "_lte"))) { stop2("You cannot use the operator ", operator, " with the field ", field) + } } #' @noRd @@ -40,13 +44,16 @@ check_query <- function(query, endpoint) { num_opr <- c("_gt", "_gte", "_lt", "_lte") str_opr <- c("_begins", "_contains") fltxt_opr <- c("_text_all", "_text_any", "_text_phrase") - all_opr <- c(simp_opr, num_opr, str_opr, fltxt_opr) + all_opr <- c(simp_opr, num_opr, str_opr, fltxt_opr, "_in_range") - flds_flt <- fieldsdf[fieldsdf$endpoint == endpoint & fieldsdf$can_query == "y", ] + flds_flt <- fieldsdf[fieldsdf$endpoint == endpoint, ] apply_checks <- function(x, endpoint) { x <- swap_null_nms(x) - if (names(x) %in% c("_not", "_and", "_or") || is.na(names(x))) { + + # troublesome next line: 'length(x) = 2 > 1' in coercion to 'logical(1)' + # if (names(x) %in% c("_not", "_and", "_or") || is.na(names(x))) { + if (length(names(x)) > 1 || names(x) %in% c("_not", "_and", "_or") || is.na(names(x))) { lapply(x, FUN = apply_checks) } else if (names(x) %in% all_opr) { f1 <- flds_flt[flds_flt$field == names(x[[1]]), ] @@ -61,8 +68,8 @@ check_query <- function(query, endpoint) { ) } else { stop2( - names(x), " is either not a valid operator or not a ", - "queryable field for this endpoint" + names(x), " is not a valid operator or not a ", + "valid field for this endpoint" ) } } diff --git a/R/data.R b/R/data.R index 53cc8ecb..49ae72f3 100644 --- a/R/data.R +++ b/R/data.R @@ -3,7 +3,7 @@ #' A data frame containing the names of retrievable fields for each of the #' endpoints. You can find this data on the API's online documentation for each #' endpoint as well (e.g., the -#' \href{https://patentsview.org/apis/api-endpoints/patents}{patents endpoint +#' \href{https://search.patentsview.org/docs/docs/Search%20API/SearchAPIReference/#patent}{patent endpoint #' field list table}). #' #' @format A data frame with the following columns: diff --git a/R/get-fields.R b/R/get-fields.R index 850eff89..062c804e 100644 --- a/R/get-fields.R +++ b/R/get-fields.R @@ -1,3 +1,9 @@ +#' @noRd +get_top_level_attributes <- function(endpoint) { + fieldsdf[fieldsdf$endpoint == endpoint & !grepl("\\.", fieldsdf$field), "field"] +} + + #' Get list of retrievable fields #' #' This function returns a vector of fields that you can retrieve from a given @@ -13,15 +19,18 @@ #' endpoint's fields (i.e., do not filter the field list based on group #' membership). See the field tables located online to see which groups you #' can specify for a given endpoint (e.g., the -#' \href{https://search.patentsview.org/docs/docs/Search%20API/SearchAPIReference/#patent}{patent +#' \href{https://search.patentsview.org/docs/docs/Search%20API/SearchAPIReference/#patent}{patents #' endpoint table}), or use the \code{fieldsdf} table #' (e.g., \code{unique(fieldsdf[fieldsdf$endpoint == "patent", "group"])}). +#' @param include_pk Boolean on whether to include the endpoint's primary key, +#' defaults to FALSE. The primary key is needed if you plan on calling +#' \code{\link{unnest_pv_data}} on the results of \code{\link{search_pv}} #' #' @return A character vector with field names. #' #' @examples -#' # Get all assignee-level fields for the patent endpoint: -#' fields <- get_fields(endpoint = "patent", groups = "assignees") +#' # Get all top level (non-nested) fields for the patent endpoint: +#' fields <- get_fields(endpoint = "patent", groups = c("patents")) #' #' # ...Then pass to search_pv: #' \dontrun{ @@ -31,7 +40,7 @@ #' fields = fields #' ) #' } -#' # Get all patent and assignee-level fields for the patent endpoint: +#' # Get unnested patent and assignee-level fields for the patent endpoint: #' fields <- get_fields(endpoint = "patent", groups = c("assignees", "patents")) #' #' \dontrun{ @@ -41,15 +50,49 @@ #' fields = fields #' ) #' } +#' # Get the nested inventors fields and the primary key in order to call unnest_pv_data +#' # on the returned data. unnest_pv_data would throw an error if the primary key was +#' # not present in the results. +#' fields <- get_fields(endpoint = "patent", groups = c("inventors"), include_pk = TRUE) +#' +#' \dontrun{ +#' # ...Then pass to search_pv and unnest the results +#' results <- search_pv( +#' query = '{"_gte":{"patent_date":"2007-01-04"}}', +#' fields = fields +#' ) +#' unnest_pv_data(results$data) +#' } #' #' @export -get_fields <- function(endpoint, groups = NULL) { +get_fields <- function(endpoint, groups = NULL, include_pk = FALSE) { validate_endpoint(endpoint) + + # using API's shorthand notation, group names can be requested as fields instead of + # fully qualifying each nested field. Fully qualified, all patent endpoint's attributes + # is over 4K, too big to be sent on a GET with a modest query + + pk <- get_ok_pk(endpoint) + plural_entity <- fieldsdf[fieldsdf$endpoint == endpoint & fieldsdf$field == pk, "group"] + top_level_attributes <- get_top_level_attributes(endpoint) + if (is.null(groups)) { - fieldsdf[fieldsdf$endpoint == endpoint, "field"] + c( + top_level_attributes, + unique(fieldsdf[fieldsdf$endpoint == endpoint & fieldsdf$group != plural_entity, "group"]) + ) } else { validate_groups(endpoint, groups = groups) - fieldsdf[fieldsdf$endpoint == endpoint & fieldsdf$group %in% groups, "field"] + + # don't include pk if plural_entity group is requested (pk would be a member) + extra_field <- if (include_pk && !plural_entity %in% groups) pk else NULL + extra_fields <- if (plural_entity %in% groups) top_level_attributes else NULL + + c( + extra_field, + extra_fields, + groups[!groups == plural_entity] + ) } } diff --git a/R/print.R b/R/print.R index 65bb53fe..025e7b4f 100644 --- a/R/print.R +++ b/R/print.R @@ -24,7 +24,10 @@ print.pv_data_result <- function(x, ...) { ) utils::str( - x, vec.len = 1, max.level = 2, give.attr = FALSE, strict.width = "cut" + x, vec.len = 1, max.level = 2, give.attr = FALSE, strict.width = "cut", + formatNum = function(x, ...) { + format(x, trim = TRUE, drop0trailing = TRUE, scientific = FALSE, ...) + } ) } diff --git a/R/process-error.R b/R/process-error.R deleted file mode 100644 index 4136f5d7..00000000 --- a/R/process-error.R +++ /dev/null @@ -1,57 +0,0 @@ -#' @noRd -throw_er <- function(resp) { - throw_if_loc_error(resp) - xheader_er_or_status(resp) -} - -#' @noRd -throw_if_loc_error <- function(resp) { - if (hit_locations_ep(resp$url) && httr::status_code(resp) == 500) { - num_grps <- get_num_groups(resp$url) - if (num_grps > 2) { - stop2( - "Your request resulted in a 500 error, likely because you have ", - "requested too many fields in your request (the location endpoint ", - "currently has restrictions on the number of fields/groups you can ", - "request). Try slimming down your field list and trying again." - ) - } - } -} - -# Not sure this is still applicable -#' @noRd -hit_locations_ep <- function(url) { - grepl( - "^https://search.patentsview.org/api/v1/location/", - url, - ignore.case = TRUE - ) -} - -#' @noRd -get_num_groups <- function(url) { - prsd_json_filds <- gsub(".*&f=([^&]*).*", "\\1", utils::URLdecode(url)) - fields <- jsonlite::fromJSON(prsd_json_filds) - grps <- fieldsdf[fieldsdf$endpoint == "location" & - fieldsdf$field %in% fields, "group"] - length(unique(grps)) -} - -#' @noRd -xheader_er_or_status <- function(resp) { - - # look for the api's ultra-helpful X-Status-Reason header - xhdr <- get_x_status(resp) - - if (length(xhdr) != 1) - httr::stop_for_status(resp) - else - stop(xhdr[[1]], call. = FALSE) -} - -#' @noRd -get_x_status <- function(resp) { - headers <- httr::headers(resp) - headers[grepl("x-status-reason$", names(headers), ignore.case = TRUE)] -} diff --git a/R/process-resp.R b/R/process-resp.R index 4fb9ed3d..426f3f4b 100644 --- a/R/process-resp.R +++ b/R/process-resp.R @@ -1,23 +1,10 @@ -#' @noRd -parse_resp <- function(resp) { - j <- httr::content(resp, as = "text", encoding = "UTF-8") - jsonlite::fromJSON( - j, - simplifyVector = TRUE, simplifyDataFrame = TRUE, simplifyMatrix = TRUE - ) -} - #' @noRd get_request <- function(resp) { gp <- structure( - list(method = resp$req$method, url = resp$req$url), + list(method = resp$request$method, url = resp$request$url), class = c("list", "pv_request") ) - if (gp$method == "POST") { - gp$body <- rawToChar(resp$req$options$postfields) - } - gp } @@ -42,11 +29,10 @@ get_query_results <- function(prsd_resp) { #' @noRd process_resp <- function(resp) { - if (httr::http_error(resp)) throw_er(resp) - prsd_resp <- parse_resp(resp) request <- get_request(resp) data <- get_data(prsd_resp) + query_results <- get_query_results(prsd_resp) structure( diff --git a/R/query-dsl.R b/R/query-dsl.R index 51bf0640..f3ea00ef 100644 --- a/R/query-dsl.R +++ b/R/query-dsl.R @@ -54,12 +54,31 @@ create_not_fun <- function(fun) { } } +#' @noRd +create_in_range_fun <- function(fun) { + force(fun) + function(...) { + value_p <- list(...) + field <- names(value_p) + value <- unlist(value_p) + names(value) <- NULL + + # throw an error if the length isn't two + asrt(length(value) == 2, fun, " expects a range of exactly two arguments") + + low <- create_one_fun(field = field, value = value[1], fun = "gte") + high <- create_one_fun(field = field, value = value[2], fun = "lte") + z <- list(`_and` = list(low, high)) + + structure(z, class = c(class(z), "pv_query")) + } +} + #' List of query functions #' #' A list of functions that make it easy to write PatentsView queries. See the -#' details section below for a list of the 14 functions, as well as the -#' \href{https://docs.ropensci.org/patentsview/articles/writing-queries.html}{writing -#' queries vignette} for further details. +#' details section below for a list of the 15 functions, as well as the +#' \href{../articles/writing-queries.html}{writing queries vignette} for further details. #' #' @details #' @@ -109,6 +128,13 @@ create_not_fun <- function(fun) { #' \item \code{not} - The comparison is not true #' } #' +#' \strong{4. Convenience function} \cr +#' +#' There is 1 convenience function: +#' \itemize{ +#' \item \code{in_range} - Builds a <= x <= b query +#' } +#' #' @return An object of class \code{pv_query}. This is basically just a simple #' list with a print method attached to it. #' @@ -117,6 +143,10 @@ create_not_fun <- function(fun) { #' #' qry_funs$not(qry_funs$eq(patent_date = "2001-01-01")) #' +#' qry_funs$in_range(patent_year = c(2010, 2021)) +#' +#' qry_funs$in_range(patent_date = c("1976-01-01", "1983-02-28")) + #' @export qry_funs <- c( lapply2( @@ -126,7 +156,8 @@ qry_funs <- c( ), create_key_fun ), lapply2(c("and", "or"), create_array_fun), - lapply2("not", create_not_fun) + lapply2("not", create_not_fun), + lapply2("in_range", create_in_range_fun) ) #' With qry_funs @@ -144,26 +175,25 @@ qry_funs <- c( #' @return The result of \code{code} - i.e., your query. #' #' @examples -#' # Without with_qfuns, we have to do: #' qry_funs$and( #' qry_funs$gte(patent_date = "2007-01-01"), #' qry_funs$text_phrase(patent_abstract = c("computer program")), #' qry_funs$or( -#' qry_funs$eq(inventor_last_name = "ihaka"), -#' qry_funs$eq(inventor_first_name = "chris") +#' qry_funs$eq(inventors.inventor_name_last = "Ihaka"), +#' qry_funs$eq(inventors.inventor_name_last = "Chris") #' ) #' ) #' -#' #...With it, this becomes: +#' # ...With it, this becomes: #' with_qfuns( -#' and( -#' gte(patent_date = "2007-01-01"), -#' text_phrase(patent_abstract = c("computer program")), -#' or( -#' eq(inventor_last_name = "ihaka"), -#' eq(inventor_first_name = "chris") -#' ) -#' ) +#' and( +#' gte(patent_date = "2007-01-01"), +#' text_phrase(patent_abstract = c("computer program")), +#' or( +#' eq(inventors.inventor_name_last = "Ihaka"), +#' eq(inventors.inventor_name_last = "Chris") +#' ) +#' ) #' ) #' #' @export diff --git a/R/search-pv.R b/R/search-pv.R index 7e711c0b..0f6b32c0 100644 --- a/R/search-pv.R +++ b/R/search-pv.R @@ -11,26 +11,41 @@ tojson_2 <- function(x, ...) { } #' @noRd -to_arglist <- function(fields, page, per_page, sort) { +to_arglist <- function(fields, size, sort, after) { + opts <- list(size = size) + if (!is.null(after)) { + opts$after <- after + } + list( fields = fields, sort = list(as.list(sort)), - opts = list( - offset = (page - 1) * per_page, - size = per_page - ) + opts = opts ) } +#' @noRd +set_sort_param <- function(before) { + # Fixes former bug + # for sort = c("patent_id" = "asc", "citation_patent_id" = "asc") + # we sent [{"patent_id":"asc","citation_patent_id":"asc"}] + # API wants [{"patent_id": "asc" },{"citation_patent_id": "asc" }] + # TODO(any): brute meet force- there must be a better way... + after <- tojson_2(before, auto_unbox = TRUE) + after <- gsub('","', '"},{"', after) + after +} + #' @noRd get_get_url <- function(query, base_url, arg_list) { j <- paste0( base_url, "?q=", utils::URLencode(query, reserved = TRUE), "&f=", tojson_2(arg_list$fields), - "&o=", tojson_2(arg_list$opts, auto_unbox = TRUE), - "&s=", tojson_2(arg_list$sort, auto_unbox = TRUE) + "&s=", set_sort_param(arg_list$sort), + "&o=", tojson_2(arg_list$opts, auto_unbox = TRUE) ) + utils::URLencode(j) } @@ -40,87 +55,106 @@ get_post_body <- function(query, arg_list) { "{", '"q":', query, ",", '"f":', tojson_2(arg_list$fields), ",", - '"o":', tojson_2(arg_list$opts, auto_unbox = TRUE), ",", - '"s":', tojson_2(arg_list$sort, auto_unbox = TRUE), + '"s":', set_sort_param(arg_list$sort), ",", + '"o":', tojson_2(arg_list$opts, auto_unbox = TRUE), "}" ) - gsub('(,"[fs]":)([,}])', paste0("\\1", "{}", "\\2"), body) + # The API can now act weirdly if we pass f:{},s:{} as we did in the past. + # (Weirdly in that the post results may not equal the get results or posts error out) + # Now we'd remove "f":, and "s":, We're guaranteed to have q: and at least "size":1000 as o: + gsub('("[fs]":,)', "", body) } #' @noRd -one_request <- function(method, query, base_url, arg_list, api_key, ...) { - ua <- httr::user_agent("https://github.com/ropensci/patentsview") +patentsview_error_body <- function(resp) { + if (httr2::resp_status(resp) == 400) c(httr2::resp_header(resp, "X-Status-Reason")) else NULL +} +#' @noRd +one_request <- function(method, query, base_url, arg_list, api_key, ...) { if (method == "GET") { get_url <- get_get_url(query, base_url, arg_list) - resp <- httr::GET( - get_url, - httr::add_headers("X-Api-Key" = api_key), - ua, ... - ) + req <- httr2::request(get_url) |> + httr2::req_method("GET") } else { body <- get_post_body(query, arg_list) - resp <- httr::POST( - base_url, - httr::add_headers( - "X-Api-Key" = api_key, - "Content-Type" = "application/json" - ), - body = body, - ua, ... - ) + req <- httr2::request(base_url) |> + httr2::req_body_raw(body) |> + httr2::req_headers("Content-Type" = "application/json") |> + httr2::req_method("POST") } - # Sleep and retry on a 429 (too many requests). The Retry-After header is the - # seconds to sleep - if (httr::status_code(resp) == 429) { - num_seconds <- httr::headers(resp)[["Retry-After"]] - maybe_an_s <- if (num_seconds == "1") "" else "s" - message(paste0( - "The API's requests per minute limit has been reached. ", - "Pausing for ", num_seconds, " second", maybe_an_s, - " before continuing." - )) - Sys.sleep(num_seconds) - - one_request(method, query, base_url, arg_list, api_key, ...) - } else { - resp + resp <- req |> + httr2::req_user_agent("https://github.com/ropensci/patentsview") |> + httr2::req_options(...) |> + httr2::req_retry(max_tries = 20) |> # automatic 429 Retry-After + httr2::req_headers("X-Api-Key" = api_key, .redact = "X-Api-Key") |> + httr2::req_error(body = patentsview_error_body) |> + httr2::req_perform() + + resp +} + +#' Pad patent_id +#' +#' This function strategically pads a patent_id with zeroes to 8 characters, +#' needed only for custom paging that uses sorts by patent_id. +#' +#' @param patent_id The patent_id that needs to be padded. It can +#' be the patent_id for a utility, design, plant or reissue patent. +#' +#' @examples +#' \dontrun{ +#' padded <- pad_patent_id("RE36479") +#' +#' padded2 <- pad_patent_id("3930306") +#' } +#' +#' @export +# zero pad patent_id to 8 characters. +pad_patent_id <- function(patent_id) { + pad <- 8 - nchar(patent_id) + if (pad > 0) { + patent_id <- paste0(sprintf("%0*d", pad, 0), patent_id) + patent_id <- sub("(0+)([[:alpha:]]+)([[:digit:]]+)", "\\2\\1\\3", patent_id) } + patent_id } #' @noRd request_apply <- function(ex_res, method, query, base_url, arg_list, api_key, ...) { matched_records <- ex_res$query_results[[1]] req_pages <- ceiling(matched_records / arg_list$opts$size) - if (req_pages < 1) { - stop2("No records matched your query...Can't download multiple pages") - } - if (matched_records > 10000) { - stop2( - "The API only allows you to download 10,000 records in a single query. ", - "Your query returned ", matched_records, " records. See for ", - "how to get around this limitation." - ) - } - if (req_pages > 10) { - stop2( - "The API only allows you to download 10 pages in a single query. ", - "Your query returned ", req_pages, " pages. See for ", - "how to get around this limitation." - ) - } tmp <- lapply(seq_len(req_pages), function(i) { - arg_list$opts$offset <- (i - 1) * arg_list$opts$size x <- one_request(method, query, base_url, arg_list, api_key, ...) x <- process_resp(x) + + # now to page we need to set the "after" attribute to where we left off + # we want the last value of the primary sort field + s <- names(arg_list$sort[[1]])[[1]] + index <- nrow(x$data[[1]]) + last_value <- x$data[[1]][[s]][[index]] + + if (s == "patent_id") { + last_value <- pad_patent_id(last_value) + } + + arg_list$opts$after <<- last_value + x$data[[1]] }) do.call("rbind", c(tmp, make.row.names = FALSE)) } +#' @noRd +get_default_sort <- function(endpoint) { + default <- c("asc") + names(default) <- get_ok_pk(endpoint) + default +} + #' Search PatentsView #' #' This function makes an HTTP request to the PatentsView API for data matching @@ -137,51 +171,62 @@ request_apply <- function(ex_res, method, query, base_url, arg_list, api_key, .. #' #' \item An object of class \code{pv_query}, which you create by calling one #' of the functions found in the \code{\link{qry_funs}} list...See the -#' \href{https://docs.ropensci.org/patentsview/articles/writing-queries.html}{writing +#' \href{../articles/writing-queries.html}{writing #' queries vignette} for details.\cr #' E.g., \code{qry_funs$gte(patent_date = "2007-01-04")} #' } #' @param fields A character vector of the fields that you want returned to you. -#' A value of \code{NULL} indicates that the default fields should be -#' returned. Acceptable fields for a given endpoint can be found at the API's +#' A value of \code{NULL} indicates to the API that it should return the default fields +#' for that endpoint. Acceptable fields for a given endpoint can be found at the API's #' online documentation (e.g., check out the field list for the -#' \href{https://patentsview.org/apis/api-endpoints/patents}{patents +#' \href{https://search.patentsview.org/docs/docs/Search%20API/SearchAPIReference/#patent}{patents #' endpoint}) or by viewing the \code{fieldsdf} data frame #' (\code{View(fieldsdf)}). You can also use \code{\link{get_fields}} to list #' out the fields available for a given endpoint. +#' +#' Nested fields can be fully qualified, e.g., "application.filing_date" or the +#' group name can be used to retrieve all of its nested fields, E.g. "application". +#' The latter would be similar to passing \code{get_fields("patent", group = "application")} +#' except it's the API that decides what fields to return. #' @param endpoint The web service resource you wish to search. Use #' \code{get_endpoints()} to list the available endpoints. -#' @param subent_cnts `r lifecycle::badge("deprecated")` Non-matched subentities -#' will always be returned under the new version of the API +#' @param subent_cnts `r lifecycle::badge("deprecated")` This is always FALSE in the +#' new version of the API as the total counts of unique subentities is no longer available. #' @param mtchd_subent_only `r lifecycle::badge("deprecated")` This is always -#' FALSE in the new version of the API. -#' @param page The page number of the results that should be returned. -#' @param per_page The number of records that should be returned per page. This -#' value can be as high as 1,000 (e.g., \code{per_page = 1000}). +#' FALSE in the new version of the API as non-matched subentities +#' will always be returned. +#' @param page `r lifecycle::badge("deprecated")` The new version of the API does not use +#' \code{page} as a parameter for paging, it uses \code{after}. +#' @param per_page `r lifecycle::badge("deprecated")` The API now uses \code{size} +#' @param size The number of records that should be returned per page. This +#' value can be as high as 1,000 (e.g., \code{size = 1000}). +#' @param after A list of sort key values that defaults to NULL. This +#' exposes the API's paging parameter for users who want to implement their own +#' paging. It cannot be set when \code{all_pages = TRUE} as the R package manipulates it +#' for users automatically. See \href{../articles/result-set-paging.html}{result set paging} #' @param all_pages Do you want to download all possible pages of output? If -#' \code{all_pages = TRUE}, the values of \code{page} and \code{per_page} are -#' ignored. +#' \code{all_pages = TRUE}, the value of \code{size} is ignored. #' @param sort A named character vector where the name indicates the field to #' sort by and the value indicates the direction of sorting (direction should -#' be either "asc" or "desc"). For example, \code{sort = c("patent_number" = -#' "asc")} or \cr\code{sort = c("patent_number" = "asc", "patent_date" = +#' be either "asc" or "desc"). For example, \code{sort = c("patent_id" = +#' "asc")} or \cr\code{sort = c("patent_id" = "asc", "patent_date" = #' "desc")}. \code{sort = NULL} (the default) means do not sort the results. #' You must include any fields that you wish to sort by in \code{fields}. #' @param method The HTTP method that you want to use to send the request. #' Possible values include "GET" or "POST". Use the POST method when #' your query is very long (say, over 2,000 characters in length). #' @param error_browser `r lifecycle::badge("deprecated")` -#' @param api_key API key. See \href{https://patentsview.org/apis/keyrequest}{ -#' Here} for info on creating a key. -#' @param ... Arguments passed along to httr's \code{\link[httr]{GET}} or -#' \code{\link[httr]{POST}} function. +#' @param api_key API key, it defaults to Sys.getenv("PATENTSVIEW_API_KEY"). Request a key +#' \href{https://patentsview-support.atlassian.net/servicedesk/customer/portals}{here}. +#' @param ... Curl options passed along to httr2's \code{\link[httr2]{req_options}} +#' when we do GETs or POSTs. #' #' @return A list with the following three elements: #' \describe{ #' \item{data}{A list with one element - a named data frame containing the #' data returned by the server. Each row in the data frame corresponds to a #' single value for the primary entity. For example, if you search the -#' assignees endpoint, then the data frame will be on the assignee-level, +#' assignee endpoint, then the data frame will be on the assignee-level, #' where each row corresponds to a single assignee. Fields that are not on #' the assignee-level would be returned in list columns.} #' @@ -207,8 +252,8 @@ request_apply <- function(ex_res, method, query, base_url, arg_list, api_key, .. #' search_pv( #' query = qry_funs$gt(patent_year = 2010), #' method = "POST", -#' fields = "patent_number", -#' sort = c("patent_number" = "asc") +#' fields = "patent_id", +#' sort = c("patent_id" = "asc") #' ) #' #' search_pv( @@ -223,9 +268,14 @@ request_apply <- function(ex_res, method, query, base_url, arg_list, api_key, .. #' ) #' #' search_pv( -#' query = qry_funs$contains(inventors_at_grant.name_last = "Smith"), +#' query = qry_funs$contains(inventors.inventor_name_last = "Smith"), #' endpoint = "patent", -#' config = httr::timeout(40) +#' timeout = 40 +#' ) +#' +#' search_pv( +#' query = qry_funs$eq(patent_id = "11530080"), +#' fields = "application" #' ) #' } #' @@ -235,67 +285,187 @@ search_pv <- function(query, endpoint = "patent", subent_cnts = FALSE, mtchd_subent_only = lifecycle::deprecated(), - page = 1, - per_page = 1000, + page = lifecycle::deprecated(), + per_page = lifecycle::deprecated(), + size = 1000, + after = NULL, all_pages = FALSE, sort = NULL, method = "GET", error_browser = NULL, api_key = Sys.getenv("PATENTSVIEW_API_KEY"), ...) { - - validate_args(api_key, fields, endpoint, method, page, per_page, sort) - deprecate_warn_all(error_browser, subent_cnts, mtchd_subent_only) + validate_args(api_key, fields, endpoint, method, sort, after, size, all_pages) + deprecate_warn_all(error_browser, subent_cnts, mtchd_subent_only, page, per_page) + if (lifecycle::is_present(per_page)) size <- per_page if (is.list(query)) { - # check_query(query, endpoint) + check_query(query, endpoint) query <- jsonlite::toJSON(query, auto_unbox = TRUE) } - arg_list <- to_arglist(fields, page, per_page, sort) + + arg_list <- to_arglist(fields, size, sort, after) base_url <- get_base(endpoint) result <- one_request(method, query, base_url, arg_list, api_key, ...) result <- process_resp(result) - if (!all_pages) return(result) - full_data <- request_apply(result, method, query, base_url, arg_list, api_key, ...) - result$data[[1]] <- full_data + if (all_pages && result$query_result$total_hits == 0) { + stop2("No records matched your query...Can't download multiple pages") + } + # return if we don't need to make additional API requests + if (!all_pages || + result$query_result$total_hits == 0 || + result$query_result$total_hits == nrow(result$data[[1]])) { + return(result) + } + + # Here we ignore the user's sort and instead have the API sort by the primary + # key for the requested endpoint. + primary_sort_key <- get_default_sort(endpoint) + + # We check what fields we got back from the first call. If the user didn't + # specify fields, we'd get back the API's defaults. We may need to request + # additional fields from the API so we can apply the users sort and then remove + # the additional fields. + returned_fields <- names(result$data[[1]]) + + if (!is.null(sort)) { + sort_fields <- names(sort) + additional_fields <- sort_fields[!(sort_fields %in% returned_fields)] + if (is.null(fields)) { + fields <- returned_fields # the default fields + } else { + fields <- fields # user passed + } + fields <- append(fields, additional_fields) + } else { + additional_fields <- c() + } + + arg_list <- to_arglist(fields, size, primary_sort_key, after) + paged_data <- request_apply(result, method, query, base_url, arg_list, api_key, ...) + + # we apply the user's sort, if they supplied one, using order() + # was data.table::setorderv(paged_data, names(sort), ifelse(as.vector(sort) == "asc", 1, -1)) + if (!is.null(sort)) { + sort_order <- mapply(function(col, direction) { + if (direction == "asc") { + return(paged_data[[col]]) + } else { + return(-rank(paged_data[[col]], ties.method = "min")) # Invert for descending order + } + }, col = names(sort), direction = as.vector(sort), SIMPLIFY = FALSE) + + # Final sorting + paged_data <- paged_data[do.call(order, sort_order), , drop = FALSE] + } + + # remove the fields we added in order to do the user's sort ourselves + paged_data <- paged_data[, !names(paged_data) %in% additional_fields] + + result$data[[1]] <- paged_data result } -#' Get Linked Data +#' Retrieve Linked Data #' #' Some of the endpoints now return HATEOAS style links to get more data. E.g., -#' the inventors endpoint may return a link such as: -#' "https://search.patentsview.org/api/v1/inventor/252373/" +#' the patent endpoint may return a link such as: +#' "https://search.patentsview.org/api/v1/inventor/fl:th_ln:jefferson-1/" +#' +#' @param url A link that was returned by the API on a previous call, an example +#' in the documentation or a Request URL from the \href{https://search.patentsview.org/swagger-ui/}{API's Swagger UI page}. +#' +#' @param encoded_url boolean to indicate whether the url has been URL encoded, defaults to FALSE. +#' Set to TRUE for Request URLs from Swagger UI. +#' +#' @param ... Curl options passed along to httr2's \code{\link[httr2]{req_options}} function. +#' +#' @return A list with the following three elements: +#' \describe{ +#' \item{data}{A list with one element - a named data frame containing the +#' data returned by the server. Each row in the data frame corresponds to a +#' single value for the primary entity. For example, if you search the +#' assignee endpoint, then the data frame will be on the assignee-level, +#' where each row corresponds to a single assignee. Fields that are not on +#' the assignee-level would be returned in list columns.} +#' +#' \item{query_results}{Entity counts across all pages of output (not just +#' the page returned to you).} #' -#' @param url The link that was returned by the API on a previous call. +#' \item{request}{Details of the GET HTTP request that was sent to the server.} +#' } #' -#' @inherit search_pv return #' @inheritParams search_pv #' #' @examples #' \dontrun{ #' #' retrieve_linked_data( -#' "https://search.patentsview.org/api/v1/cpc_subgroup/G01S7:4811/" -#' ) +#' "https://search.patentsview.org/api/v1/cpc_group/G01S7:4811/" +#' ) +#' +#' endpoint_url <- "https://search.patentsview.org/api/v1/patent/" +#' q_param <- '?q={"_text_any":{"patent_title":"COBOL cotton gin"}}' +#' s_and_o_params <- '&s=[{"patent_id": "asc" }]&o={"size":50}' +#' f_param <- '&f=["inventors.inventor_name_last","patent_id","patent_date","patent_title"]' +#' # (URL broken up to avoid a long line warning in this Rd) +#' +#' retrieve_linked_data( +#' paste0(endpoint_url, q_param, s_and_o_params, f_param) +#' ) +#' +#' retrieve_linked_data( +#' "https://search.patentsview.org/api/v1/patent/?q=%7B%22patent_date%22%3A%221976-01-06%22%7D", +#' encoded_url = TRUE +#' ) #' } #' #' @export retrieve_linked_data <- function(url, + encoded_url = FALSE, api_key = Sys.getenv("PATENTSVIEW_API_KEY"), ... ) { + if (encoded_url) { + url <- utils::URLdecode(url) + } - # Don't sent the API key to any domain other than patentsview.org - if (!grepl("^https://[^/]*\\.patentsview.org/", url)) { + # There wouldn't be url parameters on a HATEOAS link but we'll also accept + # example urls from the documentation, where there could be parameters + url_peices <- httr2::url_parse(url) + + # Only send the API key to subdomains of patentsview.org + if (!grepl("^.*\\.patentsview.org$", url_peices$hostname)) { stop2("retrieve_linked_data is only for patentsview.org urls") } + params <- list() + query <- "" + + if (!is.null(url_peices$query)) { + # Need to change f to fields vector, s to sort vector and o to opts + # There is probably a whizbangy better way to do this in R + if (!is.null(url_peices$query$f)) { + params$fields <- unlist(strsplit(gsub("[\\[\\]]", "", url_peices$query$f, perl = TRUE), ",\\s*")) + } + + if (!is.null(url_peices$query$s)) { + params$sort <- jsonlite::fromJSON(sub(".*s=([^&]*).*", "\\1", url)) + } + + if (!is.null(url_peices$query$o)) { + params$opts <- jsonlite::fromJSON(sub(".*o=([^&]*).*", "\\1", url)) + } + + query <- if (!is.null(url_peices$query$q)) sub(".*q=([^&]*).*", "\\1", url) else "" + url <- paste0(url_peices$scheme, "://", url_peices$hostname, url_peices$path) + } + # Go through one_request, which handles resend on throttle errors - # The API doesn't seem to mind ?q=&f=&o=&s= appended to the URL - res <- one_request("GET", "", url, list(), api_key, ...) + # The API doesn't seem to mind ?q=&f=&o=&s= appended to HATEOAS URLs + res <- one_request("GET", query, url, params, api_key, ...) process_resp(res) } diff --git a/R/unnest-pv-data.R b/R/unnest-pv-data.R index a86ca441..19b29370 100644 --- a/R/unnest-pv-data.R +++ b/R/unnest-pv-data.R @@ -4,24 +4,50 @@ #' in \code{\link{unnest_pv_data}}, based on the endpoint you searched. #' It will return a potential unique identifier for a given entity (i.e., a #' given endpoint). For example, it will return "patent_id" when -#' \code{endpoint = "patent"}. +#' \code{endpoint_or_entity = "patent"}. It would return the same value if +#' the entity name "patents" was passed via \code{get_ok_pk(names(pv_return$data))} +#' where pv_return was returned from \code{\link{search_pv}}. #' -#' @param endpoint The endpoint which you would like to know a potential primary -#' key for. +#' @param endpoint_or_entity The endpoint or entity name for which you +#' would like to know a potential primary key for. #' #' @return The name of a primary key (\code{pk}) that you could pass to #' \code{\link{unnest_pv_data}}. #' #' @examples -#' get_ok_pk(endpoint = "inventor") -#' get_ok_pk(endpoint = "cpc_subclass") -#' get_ok_pk("publication/rel_app_text") +#' get_ok_pk(endpoint_or_entity = "inventor") # Returns "inventor_id" +#' get_ok_pk(endpoint_or_entity = "cpc_group") # Returns "cpc_group_id" #' #' @export -get_ok_pk <- function(endpoint) { +get_ok_pk <- function(endpoint_or_entity) { + endpoint_df <- fieldsdf[fieldsdf$endpoint == endpoint_or_entity, ] + if (nrow(endpoint_df) > 0) { + endpoint <- endpoint_or_entity + } else { + endpoint_df <- fieldsdf[fieldsdf$group == endpoint_or_entity, ] + endpoint <- unique(endpoint_df$endpoint) + + # watch out here- several endpoints return entities that are groups returned + # by the patent and publication endpoints (attourneys, inventors, assignees) + if(length(endpoint) > 1) { + endpoint <- endpoint[!endpoint %in% c("patent", "publication")] + endpoint_df <- fieldsdf[fieldsdf$endpoint == endpoint, ] + } + } + unnested_endpoint <- sub("^(patent|publication)/", "", endpoint) possible_pks <- c("patent_id", "document_number", paste0(unnested_endpoint, "_id")) - fieldsdf[fieldsdf$endpoint == endpoint & fieldsdf$field %in% possible_pks, "field"] + pk <- endpoint_df[endpoint_df$field %in% possible_pks, "field"] + + # we're unable to determine the pk if an entity name of rel_app_texts was passed + asrt( + length(pk) == 1, + "The primary key cannot be determined for ", endpoint_or_entity, + ". Try using the endpoint's name instead ", + paste(unique(fieldsdf[fieldsdf$group == endpoint_or_entity, "endpoint"]), collapse = ", ") + ) + + pk } #' Unnest PatentsView data @@ -39,8 +65,8 @@ get_ok_pk <- function(endpoint) { #' inside it. See examples. #' @param pk The column/field name that will link the data frames together. This #' should be the unique identifier for the primary entity. For example, if you -#' used the patents endpoint in your call to \code{search_pv}, you could -#' specify \code{pk = "patent_number"}. \strong{This identifier has to have +#' used the patent endpoint in your call to \code{search_pv}, you could +#' specify \code{pk = "patent_id"}. \strong{This identifier has to have #' been included in your \code{fields} vector when you called #' \code{search_pv}}. You can use \code{\link{get_ok_pk}} to suggest a #' potential primary key for your data. @@ -58,12 +84,20 @@ get_ok_pk <- function(endpoint) { #' } #' #' @export -unnest_pv_data <- function(data, pk = get_ok_pk(names(data))) { - +unnest_pv_data <- function(data, pk = NULL) { validate_pv_data(data) df <- data[[1]] + if (is.null(pk)) { + # now there are two endpoints that return rel_app_texts entities with different pks + if (names(data) == "rel_app_texts") { + pk <- if ("document_number" %in% names(df)) "document_number" else "patent_id" + } else { + pk = get_ok_pk(names(data)) + } + } + asrt( pk %in% colnames(df), pk, " not in primary entity data frame...Did you include it in your ", @@ -75,14 +109,12 @@ unnest_pv_data <- function(data, pk = get_ok_pk(names(data))) { sub_ent_df <- df[, !prim_ent_var, drop = FALSE] sub_ents <- colnames(sub_ent_df) - ok_pk <- get_ok_pk(names(data)) - out_sub_ent <- lapply2(sub_ents, function(x) { temp <- sub_ent_df[[x]] asrt( length(unique(df[, pk])) == length(temp), pk, " cannot act as a primary key because it is not a unique identifier.\n\n", - "Try using ", ok_pk, " instead." + "Try using ", pk, " instead." ) names(temp) <- df[, pk] xn <- do.call("rbind", temp) diff --git a/R/utils.R b/R/utils.R index cc487623..a6f50a6b 100644 --- a/R/utils.R +++ b/R/utils.R @@ -4,6 +4,16 @@ stop2 <- function(...) stop(..., call. = FALSE) #' @noRd asrt <- function(expr, ...) if (!expr) stop2(...) +#' @noRd +parse_resp <- function(resp) { + j <- resp |> httr2::resp_body_string(encoding = "UTF-8") + + jsonlite::fromJSON( + j, + simplifyVector = TRUE, simplifyDataFrame = TRUE, simplifyMatrix = TRUE + ) +} + #' @noRd format_num <- function(x) { format( diff --git a/R/validate-args.R b/R/validate-args.R index a669e182..31dd6467 100644 --- a/R/validate-args.R +++ b/R/validate-args.R @@ -8,14 +8,23 @@ validate_endpoint <- function(endpoint) { } #' @noRd -validate_args <- function(api_key, fields, endpoint, method, page, per_page, - sort) { +validate_args <- function(api_key, fields, endpoint, method, + sort, after, size, all_pages) { asrt( !identical(api_key, ""), "The new version of the API requires an API key" ) flds_flt <- fieldsdf[fieldsdf$endpoint == endpoint, "field"] + + # Now the API allows the group name to be requested as in fields to get all of + # the group's nested fields. ex.: "assignees" on the patent endpoint gets you all + # of the assignee fields. Note that "patents" can't be requested + groups <- unique(fieldsdf[fieldsdf$endpoint == endpoint, c("group")]) + pk <- get_ok_pk(endpoint) + plural_entity <- fieldsdf[fieldsdf$endpoint == endpoint & fieldsdf$field == pk, "group"] + flds_flt <- append(flds_flt, groups[!groups == plural_entity]) + asrt( all(fields %in% flds_flt), "Bad field(s): ", paste(fields[!(fields %in% flds_flt)], collapse = ", ") @@ -27,27 +36,35 @@ validate_args <- function(api_key, fields, endpoint, method, page, per_page, all(method %in% c("GET", "POST"), length(method) == 1), "method must be either 'GET' or 'POST'" ) + asrt( - all(is.numeric(page), length(page) == 1, page >= 1), - "page must be a numeric value greater than 1" - ) - asrt( - all(is.numeric(per_page), length(per_page) == 1, per_page <= 1000), - "per_page must be a numeric value less than or equal to 1,000" + all(is.numeric(size), length(size) == 1, size <= 1000), + "size must be a numeric value less than or equal to 1,000" ) - if (!is.null(sort)) + + # Removed all(names(sort) %in% fields) Was it our requirement or the API's? + # It does seem to work when we don't request fields and rely on the API to sort + # using them. + if (!is.null(sort)) { asrt( all( - all(names(sort) %in% fields), all(sort %in% c("asc", "desc")), - !is.list(sort)), - "sort has to be a named character vector and each name has to be ", - "specified in the field argument. See examples" + all(sort %in% c("asc", "desc")), + !is.list(sort) + ), + "sort has to be a named character vector. See examples" ) + } + + asrt( + any(is.null(after), !all_pages), + "'after' cannot be set when all_pages = TRUE" + ) } #' @noRd validate_groups <- function(endpoint, groups) { ok_grps <- unique(fieldsdf[fieldsdf$endpoint == endpoint, "group"]) + asrt( all(groups %in% ok_grps), "for the ", endpoint, " endpoint, group must be one of the following: ", @@ -64,14 +81,14 @@ validate_pv_data <- function(data) { } #' @noRd -deprecate_warn_all <- function(error_browser, subent_cnts, mtchd_subent_only) { +deprecate_warn_all <- function(error_browser, subent_cnts, mtchd_subent_only, page, per_page) { if (!is.null(error_browser)) { lifecycle::deprecate_warn(when = "0.2.0", what = "search_pv(error_browser)") } # Was previously defaulting to FALSE and we're still defaulting to FALSE to # mirror the fact that the API doesn't support subent_cnts. Warn only if user - # tries to set subent_cnts to TRUE. - if (isTRUE(subent_cnts)) { + # tries to set subent_cnts to anything other than FALSE (test cases try NULL and 7) + if (!(is.logical(subent_cnts) && isFALSE(subent_cnts))) { lifecycle::deprecate_warn( when = "1.0.0", what = "search_pv(subent_cnts)", @@ -88,4 +105,21 @@ deprecate_warn_all <- function(error_browser, subent_cnts, mtchd_subent_only) { version of the API" ) } -} \ No newline at end of file + + if (lifecycle::is_present(per_page)) { + lifecycle::deprecate_warn( + when = "0.3.0", + what = "search_pv(per_page)", + details = "The new version of the API uses 'size' instead of 'per_page'", + with = "search_pv(size)" + ) + } + + if (lifecycle::is_present(page)) { + lifecycle::deprecate_warn( + when = "0.3.0", + what = "search_pv(page)", + details = "The new version of the API does not support the page parameter" + ) + } +} diff --git a/_pkgdown.yml b/_pkgdown.yml index 75d58fbb..fe54d8ba 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -23,6 +23,9 @@ reference: - get_ok_pk - cast_pv_data - retrieve_linked_data + - title: Utility + contents: + - pad_patent_id navbar: components: diff --git a/docs/reference/cast_pv_data.html b/docs/reference/cast_pv_data.html index 08ef48c0..ecc3f786 100644 --- a/docs/reference/cast_pv_data.html +++ b/docs/reference/cast_pv_data.html @@ -1,70 +1,13 @@ - - - - - - - -Cast PatentsView data — cast_pv_data • patentsview - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Cast PatentsView data — cast_pv_data • patentsview + + - - - - -
-
- -
- -
+
-

This will cast the data fields returned by search_pv so that +

This will cast the data fields returned by search_pv so that they have their most appropriate data types (e.g., date, numeric, etc.).

-
cast_pv_data(data)
+
+
cast_pv_data(data)
+
-

Arguments

- - - - - - -
data

The data returned by search_pv. This is the first +

+

Arguments

+
data
+

The data returned by search_pv. This is the first element of the three-element result object you got back from search_pv. It should be a list of length 1, with one data frame -inside it. See examples.

- -

Value

+inside it. See examples.

-

The same type of object that you passed into cast_pv_data.

+
+
+

Value

+ -

Examples

-
if (FALSE) { - -fields <- c("patent_date", "patent_title", "patent_year") -res <- search_pv(query = "{\"patent_number\":\"5116621\"}", fields = fields) -cast_pv_data(data = res$data) -} +

The same type of object that you passed into cast_pv_data.

+
-
+
+

Examples

+
if (FALSE) {
+
+fields <- c("patent_date", "patent_title", "patent_year")
+res <- search_pv(query = "{\"patent_id\":\"5116621\"}", fields = fields)
+cast_pv_data(data = res$data)
+}
+
+
+
+
- - - + + diff --git a/docs/reference/fieldsdf.html b/docs/reference/fieldsdf.html index 11fc300e..19611599 100644 --- a/docs/reference/fieldsdf.html +++ b/docs/reference/fieldsdf.html @@ -1,74 +1,16 @@ - - - - - - - -Fields data frame — fieldsdf • patentsview - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Fields data frame — fieldsdf • patentsview - - + + - - -
-
- -
- -
+
-

A data frame containing the names of retrievable and queryable fields for -each of the 7 API endpoints. A yes/no flag (can_query) indicates -which fields can be included in the user's query. You can also find this -data on the API's online documentation for each endpoint as well (e.g., -the patents -endpoint field list table)

+

A data frame containing the names of retrievable fields for each of the +endpoints. You can find this data on the API's online documentation for each +endpoint as well (e.g., the +patent endpoint +field list table).

-
fieldsdf
+
+
fieldsdf
+
+
+

Format

+

A data frame with the following columns:

endpoint
+

The endpoint that this field record is for

-

Format

+
field
+

The complete name of the field, including the parent group if +applicable

-

A data frame with 992 rows and 7 variables:

-
endpoint

The endpoint that this field record is for

-
field

The name of the field

-
data_type

The field's data type (string, date, float, integer, - fulltext)

-
can_query

An indicator for whether the field can be included in - the user query for the given endpoint

-
group

The group the field belongs to

-
common_name

The field's common name

-
description

A description of the field

+
data_type
+

The field's input data type

-
+
group
+

The group the field belongs to

+
common_name
+

The field name without the parent group structure

+ + +
+
-
- - + + diff --git a/docs/reference/figures/lifecycle-archived.svg b/docs/reference/figures/lifecycle-archived.svg new file mode 100644 index 00000000..48f72a6f --- /dev/null +++ b/docs/reference/figures/lifecycle-archived.svg @@ -0,0 +1 @@ + lifecyclelifecyclearchivedarchived \ No newline at end of file diff --git a/docs/reference/figures/lifecycle-defunct.svg b/docs/reference/figures/lifecycle-defunct.svg new file mode 100644 index 00000000..01452e5f --- /dev/null +++ b/docs/reference/figures/lifecycle-defunct.svg @@ -0,0 +1 @@ +lifecyclelifecycledefunctdefunct \ No newline at end of file diff --git a/docs/reference/figures/lifecycle-deprecated.svg b/docs/reference/figures/lifecycle-deprecated.svg new file mode 100644 index 00000000..4baaee01 --- /dev/null +++ b/docs/reference/figures/lifecycle-deprecated.svg @@ -0,0 +1 @@ +lifecyclelifecycledeprecateddeprecated \ No newline at end of file diff --git a/docs/reference/figures/lifecycle-experimental.svg b/docs/reference/figures/lifecycle-experimental.svg new file mode 100644 index 00000000..d1d060e9 --- /dev/null +++ b/docs/reference/figures/lifecycle-experimental.svg @@ -0,0 +1 @@ +lifecyclelifecycleexperimentalexperimental \ No newline at end of file diff --git a/docs/reference/figures/lifecycle-maturing.svg b/docs/reference/figures/lifecycle-maturing.svg new file mode 100644 index 00000000..df713101 --- /dev/null +++ b/docs/reference/figures/lifecycle-maturing.svg @@ -0,0 +1 @@ +lifecyclelifecyclematuringmaturing \ No newline at end of file diff --git a/docs/reference/figures/lifecycle-questioning.svg b/docs/reference/figures/lifecycle-questioning.svg new file mode 100644 index 00000000..08ee0c90 --- /dev/null +++ b/docs/reference/figures/lifecycle-questioning.svg @@ -0,0 +1 @@ +lifecyclelifecyclequestioningquestioning \ No newline at end of file diff --git a/docs/reference/figures/lifecycle-stable.svg b/docs/reference/figures/lifecycle-stable.svg new file mode 100644 index 00000000..e015dc81 --- /dev/null +++ b/docs/reference/figures/lifecycle-stable.svg @@ -0,0 +1 @@ +lifecyclelifecyclestablestable \ No newline at end of file diff --git a/docs/reference/figures/lifecycle-superseded.svg b/docs/reference/figures/lifecycle-superseded.svg new file mode 100644 index 00000000..75f24f55 --- /dev/null +++ b/docs/reference/figures/lifecycle-superseded.svg @@ -0,0 +1 @@ + lifecyclelifecyclesupersededsuperseded \ No newline at end of file diff --git a/docs/reference/get_endpoints.html b/docs/reference/get_endpoints.html index e510ce14..e9eea4ab 100644 --- a/docs/reference/get_endpoints.html +++ b/docs/reference/get_endpoints.html @@ -1,70 +1,13 @@ - - - - - - - -Get endpoints — get_endpoints • patentsview - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Get endpoints — get_endpoints • patentsview + + - - - - -
-
- -
- -
+
-

This function reminds the user what the 7 possible PatentsView API endpoints +

This function reminds the user what the possible PatentsView API endpoints are.

-
get_endpoints()
- - -

Value

+
+
get_endpoints()
+
-

A character vector with the names of the 7 endpoints. Those endpoints are:

-
    -
  • assignees

  • -
  • cpc_subsections

  • -
  • inventors

  • -
  • locations

  • -
  • nber_subcategories

  • -
  • patents

  • -
  • uspc_mainclasses

  • -
+
+

Value

+ +

A character vector with the names of each endpoint.

+
-

Examples

-
get_endpoints() -
#> [1] "assignees" "cpc_subsections" "inventors" -#> [4] "locations" "nber_subcategories" "patents" -#> [7] "uspc_mainclasses"
+
-
- - + + diff --git a/docs/reference/get_fields.html b/docs/reference/get_fields.html index 3951913c..4e1bb43a 100644 --- a/docs/reference/get_fields.html +++ b/docs/reference/get_fields.html @@ -98,7 +98,7 @@

Get list of retrievable fields

-
get_fields(endpoint, groups = NULL)
+
get_fields(endpoint, groups = NULL, include_pk = FALSE)
@@ -114,10 +114,16 @@

Arguments

endpoint's fields (i.e., do not filter the field list based on group membership). See the field tables located online to see which groups you can specify for a given endpoint (e.g., the -patent +patents endpoint table), or use the fieldsdf table (e.g., unique(fieldsdf[fieldsdf$endpoint == "patent", "group"])).

+ +
include_pk
+

Boolean on whether to include the endpoint's primary key, +defaults to FALSE. The primary key is needed if you plan on calling +unnest_pv_data on the results of search_pv

+

Value

@@ -128,8 +134,8 @@

Value

Examples

-
# Get all assignee-level fields for the patent endpoint:
-fields <- get_fields(endpoint = "patent", groups = "assignees")
+    
# Get all top level (non-nested) fields for the patent endpoint:
+fields <- get_fields(endpoint = "patent", groups = c("patents"))
 
 # ...Then pass to search_pv:
 if (FALSE) {
@@ -139,9 +145,8 @@ 

Examples

fields = fields ) } -# Get all patent and assignee-level fields for the patent endpoint: +# Get unnested patent and assignee-level fields for the patent endpoint: fields <- get_fields(endpoint = "patent", groups = c("assignees", "patents")) -#> Error: group must be one of the following: , assignee_years, inventor_years, applicants, application, assignees, attorneys, botanic, cpc_at_issue, cpc_current, examiners, figures, foreign_priority, gov_interest_contract_award_numbers, gov_interest_organizations, granted_pregrant_crosswalk, inventors, ipcr, pct_data, us_related_documents, us_term_of_grant, uspc_at_issue, wipo, us_parties if (FALSE) { # ...Then pass to search_pv: @@ -150,6 +155,19 @@

Examples

fields = fields ) } +# Get the nested inventors fields and the primary key in order to call unnest_pv_data +# on the returned data. unnest_pv_data would throw an error if the primary key was +# not present in the results. +fields <- get_fields(endpoint = "patent", groups = c("inventors"), include_pk = TRUE) + +if (FALSE) { +# ...Then pass to search_pv and unnest the results +results <- search_pv( + query = '{"_gte":{"patent_date":"2007-01-04"}}', + fields = fields +) +unnest_pv_data(results$data) +}
diff --git a/docs/reference/get_ok_pk.html b/docs/reference/get_ok_pk.html index 149a8283..d91c7dad 100644 --- a/docs/reference/get_ok_pk.html +++ b/docs/reference/get_ok_pk.html @@ -3,7 +3,9 @@ in unnest_pv_data, based on the endpoint you searched. It will return a potential unique identifier for a given entity (i.e., a given endpoint). For example, it will return "patent_id" when -endpoint = "patent".'> @@ -94,18 +96,20 @@

Get OK primary key

in unnest_pv_data, based on the endpoint you searched. It will return a potential unique identifier for a given entity (i.e., a given endpoint). For example, it will return "patent_id" when -endpoint = "patent".

+endpoint_or_entity = "patent". It would return the same value if +the entity name "patents" was passed via get_ok_pk(names(pv_return$data)) +where pv_return was returned from search_pv.

-
get_ok_pk(endpoint)
+
get_ok_pk(endpoint_or_entity)

Arguments

-
endpoint
-

The endpoint which you would like to know a potential primary -key for.

+
endpoint_or_entity
+

The endpoint or entity name for which you +would like to know a potential primary key for.

@@ -118,12 +122,10 @@

Value

Examples

-
get_ok_pk(endpoint = "inventor")
+    
get_ok_pk(endpoint_or_entity = "inventor") # Returns "inventor_id"
 #> [1] "inventor_id"
-get_ok_pk(endpoint = "cpc_subclass")
-#> [1] "cpc_subclass_id"
-get_ok_pk("publication/rel_app_text")
-#> [1] "document_number"
+get_ok_pk(endpoint_or_entity = "cpc_group") # Returns "cpc_group_id"
+#> [1] "cpc_group_id"
 
 
diff --git a/docs/reference/index.html b/docs/reference/index.html index ee40061b..ab30c2ca 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -1,68 +1,12 @@ - - - - - - - -Function reference • patentsview - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Function reference • patentsview - - + + - - -
-
- -
- -
+
- - - - - - - - - - - + + + + +
-

The API client

+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+

The API client

+

search_pv()

Search PatentsView

-

Convenience objects for search_pv

+
+

Convenience objects for search_pv

+

get_endpoints()

Get endpoints

+

get_fields()

Get list of retrievable fields

+

fieldsdf

Fields data frame

-

Writing queries with the DSL

+
+

Writing queries with the DSL

+

qry_funs

List of query functions

+

with_qfuns()

With qry_funs

-

Manipulating patentsview data

+
+

Manipulating patentsview data

+

unnest_pv_data()

Unnest PatentsView data

+

get_ok_pk()

Get OK primary key

+

cast_pv_data()

Cast PatentsView data

- +
+

retrieve_linked_data()

+

Retrieve Linked Data

+

Utility

+

+
+

pad_patent_id()

+

Pad patent_id

+
-
- +
- - + + diff --git a/docs/reference/pad_patent_id.html b/docs/reference/pad_patent_id.html new file mode 100644 index 00000000..d92031de --- /dev/null +++ b/docs/reference/pad_patent_id.html @@ -0,0 +1,139 @@ + +Pad patent_id — pad_patent_id • patentsview + + +
+
+ + + +
+
+ + +
+

This function strategically pads a patent_id with zeroes to 8 characters, +needed only for custom paging that uses sorts by patent_id.

+
+ +
+
pad_patent_id(patent_id)
+
+ +
+

Arguments

+
patent_id
+

The patent_id that needs to be padded. It can +be the patent_id for a utility, design, plant or reissue patent.

+ +
+ +
+

Examples

+
if (FALSE) {
+padded <- pad_patent_id("RE36479")
+
+padded2 <- pad_patent_id("3930306")
+}
+
+
+
+
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.9.

+
+ +
+ + + + + + + + diff --git a/docs/reference/patentsview-package.html b/docs/reference/patentsview-package.html index bea2797e..1ed343ea 100644 --- a/docs/reference/patentsview-package.html +++ b/docs/reference/patentsview-package.html @@ -1,5 +1,5 @@ -patentsview: An R Client to the 'PatentsView' API — patentsview-package • patentsviewpatentsview: An R Client to the 'PatentsView' API — patentsview-package • patentsview @@ -86,7 +86,7 @@

patentsview: An R Client to the 'PatentsView' API

-

Provides functions to simplify the 'PatentsView' API (https://patentsview.org/apis/purpose) query language, send GET and POST requests to the API's seven endpoints, and parse the data that comes back.

+

Provides functions to simplify the 'PatentsView' API (https://patentsview.org/apis/purpose) query language, send GET and POST requests to the API's twenty seven endpoints, and parse the data that comes back.

diff --git a/docs/reference/qry_funs.html b/docs/reference/qry_funs.html index f332a17e..c73b1b52 100644 --- a/docs/reference/qry_funs.html +++ b/docs/reference/qry_funs.html @@ -1,72 +1,14 @@ - - - - - - - -List of query functions — qry_funs • patentsview - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -List of query functions — qry_funs • patentsview - + + - - - -
-
- -
- -
+

A list of functions that make it easy to write PatentsView queries. See the -details section below for a list of the 14 functions, as well as the -writing -queries vignette for further details.

+details section below for a list of the 15 functions, as well as the +writing queries vignette for further details.

-
qry_funs
- - -

Format

- -

An object of class list of length 14.

-

Value

+
+
qry_funs
+
-

An object of class pv_query. This is basically just a simple - list with a print method attached to it.

-

Details

+
+

Format

+

An object of class list of length 15.

+
+
+

Value

+ -

1. Comparison operator functions

+

An object of class pv_query. This is basically just a simple +list with a print method attached to it.

+
+
+

Details

+

1. Comparison operator functions

There are 6 comparison operator functions that work with fields of type -integer, float, date, or string:

    -
  • eq - Equal to

  • +integer, float, date, or string:

    • eq - Equal to

    • neq - Not equal to

    • gt - Greater than

    • gte - Greater than or equal to

    • lt - Less than

    • lte - Less than or equal to

    • -
    - -

    There are 2 comparison operator functions that only work with fields of type -string:

      -
    • begins - The string begins with the value string

    • +

    There are 2 comparison operator functions that only work with fields of type +string:

    • begins - The string begins with the value string

    • contains - The string contains the value string

    • -
    - -

    There are 3 comparison operator functions that only work with fields of type -fulltext:

      -
    • text_all - The text contains all the words in the value - string

    • +

    There are 3 comparison operator functions that only work with fields of type +fulltext:

    • text_all - The text contains all the words in the value +string

    • text_any - The text contains any of the words in the value - string

    • +string

    • text_phrase - The text contains the exact phrase of the value - string

    • -
    - -

    2. Array functions

    -

    There are 2 array functions:

      -
    • and - Both members of the array must be true

    • +string

      +

    2. Array functions

    +

    There are 2 array functions:

    • and - Both members of the array must be true

    • or - Only one member of the array must be true

    • -
    - -

    3. Negation function

    -

    There is 1 negation function:

      -
    • not - The comparison is not true

    • -
    - - -

    Examples

    -
    qry_funs$eq(patent_date = "2001-01-01") -
    #> {"_eq":{"patent_date":"2001-01-01"}}
    -qry_funs$not(qry_funs$eq(patent_date = "2001-01-01")) -
    #> {"_not":{"_eq":{"patent_date":"2001-01-01"}}}
    -
    +

3. Negation function

+

There is 1 negation function:

  • not - The comparison is not true

  • +

4. Convenience function

+

There is 1 convenience function:

  • in_range - Builds a <= x <= b query

  • +
+ +
+

Examples

+
qry_funs$eq(patent_date = "2001-01-01")
+#> {"_eq":{"patent_date":"2001-01-01"}}
+
+qry_funs$not(qry_funs$eq(patent_date = "2001-01-01"))
+#> {"_not":{"_eq":{"patent_date":"2001-01-01"}}}
+
+qry_funs$in_range(patent_year = c(2010, 2021))
+#> {"_and":[{"_gte":{"patent_year":2010}},{"_lte":{"patent_year":2021}}]}
+
+qry_funs$in_range(patent_date = c("1976-01-01", "1983-02-28"))
+#> {"_and":[{"_gte":{"patent_date":"1976-01-01"}},{"_lte":{"patent_date":"1983-02-28"}}]}
+
+
+
-
- +
- - + + diff --git a/docs/reference/retrieve_linked_data.html b/docs/reference/retrieve_linked_data.html new file mode 100644 index 00000000..d54faca9 --- /dev/null +++ b/docs/reference/retrieve_linked_data.html @@ -0,0 +1,199 @@ + +Retrieve Linked Data — retrieve_linked_data • patentsview + + +
+
+ + + +
+
+ + +
+

Some of the endpoints now return HATEOAS style links to get more data. E.g., +the patent endpoint may return a link such as: +"https://search.patentsview.org/api/v1/inventor/fl:th_ln:jefferson-1/"

+
+ +
+
retrieve_linked_data(
+  url,
+  encoded_url = FALSE,
+  api_key = Sys.getenv("PATENTSVIEW_API_KEY"),
+  ...
+)
+
+ +
+

Arguments

+
url
+

A link that was returned by the API on a previous call, an example +in the documentation or a Request URL from the API's Swagger UI page.

+ + +
encoded_url
+

boolean to indicate whether the url has been URL encoded, defaults to FALSE. +Set to TRUE for Request URLs from Swagger UI.

+ + +
api_key
+

API key, it defaults to Sys.getenv("PATENTSVIEW_API_KEY"). Request a key +here.

+ + +
...
+

Curl options passed along to httr2's req_options function.

+ +
+
+

Value

+ + +

A list with the following three elements:

data
+

A list with one element - a named data frame containing the +data returned by the server. Each row in the data frame corresponds to a +single value for the primary entity. For example, if you search the +assignee endpoint, then the data frame will be on the assignee-level, +where each row corresponds to a single assignee. Fields that are not on +the assignee-level would be returned in list columns.

+ + +
query_results
+

Entity counts across all pages of output (not just +the page returned to you).

+ + +
request
+

Details of the GET HTTP request that was sent to the server.

+ + +
+ +
+

Examples

+
if (FALSE) {
+
+retrieve_linked_data(
+  "https://search.patentsview.org/api/v1/cpc_group/G01S7:4811/"
+)
+
+endpoint_url <- "https://search.patentsview.org/api/v1/patent/"
+q_param <- '?q={"_text_any":{"patent_title":"COBOL cotton gin"}}'
+s_and_o_params <- '&s=[{"patent_id": "asc" }]&o={"size":50}'
+f_param <- '&f=["inventors.inventor_name_last","patent_id","patent_date","patent_title"]'
+# (URL broken up to avoid a long line warning in this Rd)
+
+retrieve_linked_data(
+  paste0(endpoint_url, q_param, s_and_o_params, f_param)
+)
+
+retrieve_linked_data(
+  "https://search.patentsview.org/api/v1/patent/?q=%7B%22patent_date%22%3A%221976-01-06%22%7D",
+  encoded_url = TRUE
+)
+}
+
+
+
+
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.9.

+
+ +
+ + + + + + + + diff --git a/docs/reference/search_pv.html b/docs/reference/search_pv.html index 8710e3a7..e52cbf6f 100644 --- a/docs/reference/search_pv.html +++ b/docs/reference/search_pv.html @@ -98,8 +98,10 @@

Search PatentsView

endpoint = "patent", subent_cnts = FALSE, mtchd_subent_only = lifecycle::deprecated(), - page = 1, - per_page = 1000, + page = lifecycle::deprecated(), + per_page = lifecycle::deprecated(), + size = 1000, + after = NULL, all_pages = FALSE, sort = NULL, method = "GET", @@ -119,7 +121,7 @@

Arguments

E.g., list("_gte" = list("patent_date" = "2007-01-04"))

  • An object of class pv_query, which you create by calling one of the functions found in the qry_funs list...See the -writing +writing queries vignette for details.
    E.g., qry_funs$gte(patent_date = "2007-01-04")

  • @@ -127,13 +129,17 @@

    Arguments

    fields

    A character vector of the fields that you want returned to you. -A value of NULL indicates that the default fields should be -returned. Acceptable fields for a given endpoint can be found at the API's +A value of NULL indicates to the API that it should return the default fields +for that endpoint. Acceptable fields for a given endpoint can be found at the API's online documentation (e.g., check out the field list for the -patents +patents endpoint) or by viewing the fieldsdf data frame (View(fieldsdf)). You can also use get_fields to list -out the fields available for a given endpoint.

    +out the fields available for a given endpoint.

    +

    Nested fields can be fully qualified, e.g., "application.filing_date" or the +group name can be used to retrieve all of its nested fields, E.g. "application". +The latter would be similar to passing get_fields("patent", group = "application") +except it's the API that decides what fields to return.

    endpoint
    @@ -142,35 +148,47 @@

    Arguments

    subent_cnts
    -

    [Deprecated] Non-matched subentities -will always be returned under the new version of the API

    +

    [Deprecated] This is always FALSE in the +new version of the API as the total counts of unique subentities is no longer available.

    mtchd_subent_only

    [Deprecated] This is always -FALSE in the new version of the API.

    +FALSE in the new version of the API as non-matched subentities +will always be returned.

    page
    -

    The page number of the results that should be returned.

    +

    [Deprecated] The new version of the API does not use +page as a parameter for paging, it uses after.

    per_page
    +

    [Deprecated] The API now uses size

    + + +
    size

    The number of records that should be returned per page. This -value can be as high as 1,000 (e.g., per_page = 1000).

    +value can be as high as 1,000 (e.g., size = 1000).

    + + +
    after
    +

    A list of sort key values that defaults to NULL. This +exposes the API's paging parameter for users who want to implement their own +paging. It cannot be set when all_pages = TRUE as the R package manipulates it +for users automatically. See result set paging

    all_pages

    Do you want to download all possible pages of output? If -all_pages = TRUE, the values of page and per_page are -ignored.

    +all_pages = TRUE, the value of size is ignored.

    sort

    A named character vector where the name indicates the field to sort by and the value indicates the direction of sorting (direction should -be either "asc" or "desc"). For example, sort = c("patent_number" = - "asc") or
    sort = c("patent_number" = "asc", "patent_date" = +be either "asc" or "desc"). For example, sort = c("patent_id" = + "asc") or
    sort = c("patent_id" = "asc", "patent_date" = "desc"). sort = NULL (the default) means do not sort the results. You must include any fields that you wish to sort by in fields.

    @@ -186,13 +204,13 @@

    Arguments

    api_key
    -

    API key. See -Here for info on creating a key.

    +

    API key, it defaults to Sys.getenv("PATENTSVIEW_API_KEY"). Request a key +here.

    ...
    -

    Arguments passed along to httr's GET or -POST function.

    +

    Curl options passed along to httr2's req_options +when we do GETs or POSTs.

    @@ -203,7 +221,7 @@

    Value

    A list with one element - a named data frame containing the data returned by the server. Each row in the data frame corresponds to a single value for the primary entity. For example, if you search the -assignees endpoint, then the data frame will be on the assignee-level, +assignee endpoint, then the data frame will be on the assignee-level, where each row corresponds to a single assignee. Fields that are not on the assignee-level would be returned in list columns.

    @@ -236,8 +254,8 @@

    Examples

    search_pv( query = qry_funs$gt(patent_year = 2010), method = "POST", - fields = "patent_number", - sort = c("patent_number" = "asc") + fields = "patent_id", + sort = c("patent_id" = "asc") ) search_pv( @@ -252,9 +270,14 @@

    Examples

    ) search_pv( - query = qry_funs$contains(inventors_at_grant.name_last = "Smith"), + query = qry_funs$contains(inventors.inventor_name_last = "Smith"), endpoint = "patent", - config = httr::timeout(40) + timeout = 40 +) + +search_pv( + query = qry_funs$eq(patent_id = "11530080"), + fields = "application" ) } diff --git a/docs/reference/unnest_pv_data.html b/docs/reference/unnest_pv_data.html index 5c41902b..e055c5bd 100644 --- a/docs/reference/unnest_pv_data.html +++ b/docs/reference/unnest_pv_data.html @@ -100,7 +100,7 @@

    Unnest PatentsView data

    -
    unnest_pv_data(data, pk = get_ok_pk(names(data)))
    +
    unnest_pv_data(data, pk = NULL)
    @@ -115,8 +115,8 @@

    Arguments

    pk

    The column/field name that will link the data frames together. This should be the unique identifier for the primary entity. For example, if you -used the patents endpoint in your call to search_pv, you could -specify pk = "patent_number". This identifier has to have +used the patent endpoint in your call to search_pv, you could +specify pk = "patent_id". This identifier has to have been included in your fields vector when you called search_pv. You can use get_ok_pk to suggest a potential primary key for your data.

    diff --git a/docs/reference/with_qfuns.html b/docs/reference/with_qfuns.html index d1d1564c..c1e80e77 100644 --- a/docs/reference/with_qfuns.html +++ b/docs/reference/with_qfuns.html @@ -1,73 +1,16 @@ - - - - - - - -With qry_funs — with_qfuns • patentsview - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -With qry_funs — with_qfuns • patentsview - - - - - - - - - - - + + - - -
    -
    - -
    - -
    +

    This function evaluates whatever code you pass to it in the environment of -the qry_funs list. This allows you to cut down on typing when +the qry_funs list. This allows you to cut down on typing when writing your queries. If you want to cut down on typing even more, you can -try assigning the qry_funs list into your global environment -with: list2env(qry_funs, envir = globalenv()).

    +try assigning the qry_funs list into your global environment +with: list2env(qry_funs, envir = globalenv()).

    -
    with_qfuns(code, envir = parent.frame())
    - -

    Arguments

    - - - - - - - - - - -
    code

    Code to evaluate. See example.

    envir

    Where should R look for objects present in code that -aren't present in qry_funs.

    - -

    Value

    - -

    The result of code - i.e., your query.

    - -

    Examples

    -
    # Without with_qfuns, we have to do: -qry_funs$and( - qry_funs$gte(patent_date = "2007-01-01"), - qry_funs$text_phrase(patent_abstract = c("computer program")), - qry_funs$or( - qry_funs$eq(inventor_last_name = "ihaka"), - qry_funs$eq(inventor_first_name = "chris") - ) -) -
    #> {"_and":[{"_gte":{"patent_date":"2007-01-01"}},{"_text_phrase":{"patent_abstract":"computer program"}},{"_or":[{"_eq":{"inventor_last_name":"ihaka"}},{"_eq":{"inventor_first_name":"chris"}}]}]}
    -#...With it, this becomes: -with_qfuns( - and( - gte(patent_date = "2007-01-01"), - text_phrase(patent_abstract = c("computer program")), - or( - eq(inventor_last_name = "ihaka"), - eq(inventor_first_name = "chris") - ) - ) -) -
    #> {"_and":[{"_gte":{"patent_date":"2007-01-01"}},{"_text_phrase":{"patent_abstract":"computer program"}},{"_or":[{"_eq":{"inventor_last_name":"ihaka"}},{"_eq":{"inventor_first_name":"chris"}}]}]}
    -
    +
    +
    with_qfuns(code, envir = parent.frame())
    +
    + +
    +

    Arguments

    +
    code
    +

    Code to evaluate. See example.

    + + +
    envir
    +

    Where should R look for objects present in code that +aren't present in qry_funs.

    + +
    +
    +

    Value

    + + +

    The result of code - i.e., your query.

    +
    + +
    +

    Examples

    +
    qry_funs$and(
    +  qry_funs$gte(patent_date = "2007-01-01"),
    +  qry_funs$text_phrase(patent_abstract = c("computer program")),
    +  qry_funs$or(
    +    qry_funs$eq(inventors.inventor_name_last = "Ihaka"),
    +    qry_funs$eq(inventors.inventor_name_last = "Chris")
    +  )
    +)
    +#> {"_and":[{"_gte":{"patent_date":"2007-01-01"}},{"_text_phrase":{"patent_abstract":"computer program"}},{"_or":[{"_eq":{"inventors.inventor_name_last":"Ihaka"}},{"_eq":{"inventors.inventor_name_last":"Chris"}}]}]}
    +
    +# ...With it, this becomes:
    +with_qfuns(
    +  and(
    +    gte(patent_date = "2007-01-01"),
    +    text_phrase(patent_abstract = c("computer program")),
    +    or(
    +      eq(inventors.inventor_name_last = "Ihaka"),
    +      eq(inventors.inventor_name_last = "Chris")
    +    )
    +  )
    +)
    +#> {"_and":[{"_gte":{"patent_date":"2007-01-01"}},{"_text_phrase":{"patent_abstract":"computer program"}},{"_or":[{"_eq":{"inventors.inventor_name_last":"Ihaka"}},{"_eq":{"inventors.inventor_name_last":"Chris"}}]}]}
    +
    +
    +
    +
    -
    - - + + diff --git a/man/cast_pv_data.Rd b/man/cast_pv_data.Rd index 829b3c13..3ed42d36 100644 --- a/man/cast_pv_data.Rd +++ b/man/cast_pv_data.Rd @@ -23,7 +23,7 @@ they have their most appropriate data types (e.g., date, numeric, etc.). \dontrun{ fields <- c("patent_date", "patent_title", "patent_year") -res <- search_pv(query = "{\"patent_number\":\"5116621\"}", fields = fields) +res <- search_pv(query = "{\"patent_id\":\"5116621\"}", fields = fields) cast_pv_data(data = res$data) } diff --git a/man/fieldsdf.Rd b/man/fieldsdf.Rd index b55ddee3..aa107283 100644 --- a/man/fieldsdf.Rd +++ b/man/fieldsdf.Rd @@ -22,7 +22,7 @@ fieldsdf A data frame containing the names of retrievable fields for each of the endpoints. You can find this data on the API's online documentation for each endpoint as well (e.g., the -\href{https://patentsview.org/apis/api-endpoints/patents}{patents endpoint +\href{https://search.patentsview.org/docs/docs/Search\%20API/SearchAPIReference/#patent}{patent endpoint field list table}). } \keyword{datasets} diff --git a/man/get_fields.Rd b/man/get_fields.Rd index 4dc23060..b6ce3639 100644 --- a/man/get_fields.Rd +++ b/man/get_fields.Rd @@ -4,7 +4,7 @@ \alias{get_fields} \title{Get list of retrievable fields} \usage{ -get_fields(endpoint, groups = NULL) +get_fields(endpoint, groups = NULL, include_pk = FALSE) } \arguments{ \item{endpoint}{The API endpoint whose field list you want to get. See @@ -15,9 +15,13 @@ returned. A value of \code{NULL} indicates that you want all of the endpoint's fields (i.e., do not filter the field list based on group membership). See the field tables located online to see which groups you can specify for a given endpoint (e.g., the -\href{https://search.patentsview.org/docs/docs/Search\%20API/SearchAPIReference/#patent}{patent +\href{https://search.patentsview.org/docs/docs/Search\%20API/SearchAPIReference/#patent}{patents endpoint table}), or use the \code{fieldsdf} table (e.g., \code{unique(fieldsdf[fieldsdf$endpoint == "patent", "group"])}).} + +\item{include_pk}{Boolean on whether to include the endpoint's primary key, +defaults to FALSE. The primary key is needed if you plan on calling +\code{\link{unnest_pv_data}} on the results of \code{\link{search_pv}}} } \value{ A character vector with field names. @@ -30,8 +34,8 @@ entity group(s) as well (which is recommended, given the large number of possible fields for each endpoint). } \examples{ -# Get all assignee-level fields for the patent endpoint: -fields <- get_fields(endpoint = "patent", groups = "assignees") +# Get all top level (non-nested) fields for the patent endpoint: +fields <- get_fields(endpoint = "patent", groups = c("patents")) # ...Then pass to search_pv: \dontrun{ @@ -41,7 +45,7 @@ search_pv( fields = fields ) } -# Get all patent and assignee-level fields for the patent endpoint: +# Get unnested patent and assignee-level fields for the patent endpoint: fields <- get_fields(endpoint = "patent", groups = c("assignees", "patents")) \dontrun{ @@ -51,5 +55,18 @@ search_pv( fields = fields ) } +# Get the nested inventors fields and the primary key in order to call unnest_pv_data +# on the returned data. unnest_pv_data would throw an error if the primary key was +# not present in the results. +fields <- get_fields(endpoint = "patent", groups = c("inventors"), include_pk = TRUE) + +\dontrun{ +# ...Then pass to search_pv and unnest the results +results <- search_pv( + query = '{"_gte":{"patent_date":"2007-01-04"}}', + fields = fields +) +unnest_pv_data(results$data) +} } diff --git a/man/get_ok_pk.Rd b/man/get_ok_pk.Rd index 6bd223c9..36d3dbf3 100644 --- a/man/get_ok_pk.Rd +++ b/man/get_ok_pk.Rd @@ -4,11 +4,11 @@ \alias{get_ok_pk} \title{Get OK primary key} \usage{ -get_ok_pk(endpoint) +get_ok_pk(endpoint_or_entity) } \arguments{ -\item{endpoint}{The endpoint which you would like to know a potential primary -key for.} +\item{endpoint_or_entity}{The endpoint or entity name for which you +would like to know a potential primary key for.} } \value{ The name of a primary key (\code{pk}) that you could pass to @@ -19,11 +19,12 @@ This function suggests a value that you could use for the \code{pk} argument in \code{\link{unnest_pv_data}}, based on the endpoint you searched. It will return a potential unique identifier for a given entity (i.e., a given endpoint). For example, it will return "patent_id" when -\code{endpoint = "patent"}. +\code{endpoint_or_entity = "patent"}. It would return the same value if +the entity name "patents" was passed via \code{get_ok_pk(names(pv_return$data))} +where pv_return was returned from \code{\link{search_pv}}. } \examples{ -get_ok_pk(endpoint = "inventor") -get_ok_pk(endpoint = "cpc_subclass") -get_ok_pk("publication/rel_app_text") +get_ok_pk(endpoint_or_entity = "inventor") # Returns "inventor_id" +get_ok_pk(endpoint_or_entity = "cpc_group") # Returns "cpc_group_id" } diff --git a/man/pad_patent_id.Rd b/man/pad_patent_id.Rd new file mode 100644 index 00000000..544ee243 --- /dev/null +++ b/man/pad_patent_id.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/search-pv.R +\name{pad_patent_id} +\alias{pad_patent_id} +\title{Pad patent_id} +\usage{ +pad_patent_id(patent_id) +} +\arguments{ +\item{patent_id}{The patent_id that needs to be padded. It can +be the patent_id for a utility, design, plant or reissue patent.} +} +\description{ +This function strategically pads a patent_id with zeroes to 8 characters, +needed only for custom paging that uses sorts by patent_id. +} +\examples{ +\dontrun{ +padded <- pad_patent_id("RE36479") + +padded2 <- pad_patent_id("3930306") +} + +} diff --git a/man/patentsview-package.Rd b/man/patentsview-package.Rd index 621a26f7..6b688378 100644 --- a/man/patentsview-package.Rd +++ b/man/patentsview-package.Rd @@ -6,7 +6,7 @@ \alias{patentsview-package} \title{patentsview: An R Client to the 'PatentsView' API} \description{ -Provides functions to simplify the 'PatentsView' API (\url{https://patentsview.org/apis/purpose}) query language, send GET and POST requests to the API's seven endpoints, and parse the data that comes back. +Provides functions to simplify the 'PatentsView' API (\url{https://patentsview.org/apis/purpose}) query language, send GET and POST requests to the API's twenty seven endpoints, and parse the data that comes back. } \seealso{ Useful links: diff --git a/man/qry_funs.Rd b/man/qry_funs.Rd index cff6667f..6a7c90e1 100644 --- a/man/qry_funs.Rd +++ b/man/qry_funs.Rd @@ -5,7 +5,7 @@ \alias{qry_funs} \title{List of query functions} \format{ -An object of class \code{list} of length 14. +An object of class \code{list} of length 15. } \usage{ qry_funs @@ -16,9 +16,8 @@ list with a print method attached to it. } \description{ A list of functions that make it easy to write PatentsView queries. See the -details section below for a list of the 14 functions, as well as the -\href{https://docs.ropensci.org/patentsview/articles/writing-queries.html}{writing -queries vignette} for further details. +details section below for a list of the 15 functions, as well as the +\href{../articles/writing-queries.html}{writing queries vignette} for further details. } \details{ \strong{1. Comparison operator functions} \cr @@ -66,11 +65,21 @@ There is 1 negation function: \itemize{ \item \code{not} - The comparison is not true } + +\strong{4. Convenience function} \cr + +There is 1 convenience function: +\itemize{ +\item \code{in_range} - Builds a <= x <= b query +} } \examples{ qry_funs$eq(patent_date = "2001-01-01") qry_funs$not(qry_funs$eq(patent_date = "2001-01-01")) +qry_funs$in_range(patent_year = c(2010, 2021)) + +qry_funs$in_range(patent_date = c("1976-01-01", "1983-02-28")) } \keyword{datasets} diff --git a/man/retrieve_linked_data.Rd b/man/retrieve_linked_data.Rd index 6a0e347f..838c2d7a 100644 --- a/man/retrieve_linked_data.Rd +++ b/man/retrieve_linked_data.Rd @@ -2,18 +2,26 @@ % Please edit documentation in R/search-pv.R \name{retrieve_linked_data} \alias{retrieve_linked_data} -\title{Get Linked Data} +\title{Retrieve Linked Data} \usage{ -retrieve_linked_data(url, api_key = Sys.getenv("PATENTSVIEW_API_KEY"), ...) +retrieve_linked_data( + url, + encoded_url = FALSE, + api_key = Sys.getenv("PATENTSVIEW_API_KEY"), + ... +) } \arguments{ -\item{url}{The link that was returned by the API on a previous call.} +\item{url}{A link that was returned by the API on a previous call, an example +in the documentation or a Request URL from the \href{https://search.patentsview.org/swagger-ui/}{API's Swagger UI page}.} + +\item{encoded_url}{boolean to indicate whether the url has been URL encoded, defaults to FALSE. +Set to TRUE for Request URLs from Swagger UI.} -\item{api_key}{API key. See \href{https://patentsview.org/apis/keyrequest}{ -Here} for info on creating a key.} +\item{api_key}{API key, it defaults to Sys.getenv("PATENTSVIEW_API_KEY"). Request a key +\href{https://patentsview-support.atlassian.net/servicedesk/customer/portals}{here}.} -\item{...}{Arguments passed along to httr's \code{\link[httr]{GET}} or -\code{\link[httr]{POST}} function.} +\item{...}{Curl options passed along to httr2's \code{\link[httr2]{req_options}} function.} } \value{ A list with the following three elements: @@ -21,30 +29,42 @@ A list with the following three elements: \item{data}{A list with one element - a named data frame containing the data returned by the server. Each row in the data frame corresponds to a single value for the primary entity. For example, if you search the -assignees endpoint, then the data frame will be on the assignee-level, +assignee endpoint, then the data frame will be on the assignee-level, where each row corresponds to a single assignee. Fields that are not on the assignee-level would be returned in list columns.} \item{query_results}{Entity counts across all pages of output (not just the page returned to you).} -\item{request}{Details of the HTTP request that was sent to the server. -When you set \code{all_pages = TRUE}, you will only get a sample request. -In other words, you will not be given multiple requests for the multiple -calls that were made to the server (one for each page of results).} +\item{request}{Details of the GET HTTP request that was sent to the server.} } } \description{ Some of the endpoints now return HATEOAS style links to get more data. E.g., -the inventors endpoint may return a link such as: -"https://search.patentsview.org/api/v1/inventor/252373/" +the patent endpoint may return a link such as: +"https://search.patentsview.org/api/v1/inventor/fl:th_ln:jefferson-1/" } \examples{ \dontrun{ retrieve_linked_data( - "https://search.patentsview.org/api/v1/cpc_subgroup/G01S7:4811/" - ) + "https://search.patentsview.org/api/v1/cpc_group/G01S7:4811/" +) + +endpoint_url <- "https://search.patentsview.org/api/v1/patent/" +q_param <- '?q={"_text_any":{"patent_title":"COBOL cotton gin"}}' +s_and_o_params <- '&s=[{"patent_id": "asc" }]&o={"size":50}' +f_param <- '&f=["inventors.inventor_name_last","patent_id","patent_date","patent_title"]' +# (URL broken up to avoid a long line warning in this Rd) + +retrieve_linked_data( + paste0(endpoint_url, q_param, s_and_o_params, f_param) +) + +retrieve_linked_data( + "https://search.patentsview.org/api/v1/patent/?q=\%7B\%22patent_date\%22\%3A\%221976-01-06\%22\%7D", + encoded_url = TRUE +) } } diff --git a/man/search_pv.Rd b/man/search_pv.Rd index d767abc8..094f362c 100644 --- a/man/search_pv.Rd +++ b/man/search_pv.Rd @@ -10,8 +10,10 @@ search_pv( endpoint = "patent", subent_cnts = FALSE, mtchd_subent_only = lifecycle::deprecated(), - page = 1, - per_page = 1000, + page = lifecycle::deprecated(), + per_page = lifecycle::deprecated(), + size = 1000, + after = NULL, all_pages = FALSE, sort = NULL, method = "GET", @@ -32,42 +34,55 @@ E.g., \code{list("_gte" = list("patent_date" = "2007-01-04"))} \item An object of class \code{pv_query}, which you create by calling one of the functions found in the \code{\link{qry_funs}} list...See the -\href{https://docs.ropensci.org/patentsview/articles/writing-queries.html}{writing +\href{../articles/writing-queries.html}{writing queries vignette} for details.\cr E.g., \code{qry_funs$gte(patent_date = "2007-01-04")} }} \item{fields}{A character vector of the fields that you want returned to you. -A value of \code{NULL} indicates that the default fields should be -returned. Acceptable fields for a given endpoint can be found at the API's +A value of \code{NULL} indicates to the API that it should return the default fields +for that endpoint. Acceptable fields for a given endpoint can be found at the API's online documentation (e.g., check out the field list for the -\href{https://patentsview.org/apis/api-endpoints/patents}{patents +\href{https://search.patentsview.org/docs/docs/Search\%20API/SearchAPIReference/#patent}{patents endpoint}) or by viewing the \code{fieldsdf} data frame (\code{View(fieldsdf)}). You can also use \code{\link{get_fields}} to list -out the fields available for a given endpoint.} +out the fields available for a given endpoint. + +Nested fields can be fully qualified, e.g., "application.filing_date" or the +group name can be used to retrieve all of its nested fields, E.g. "application". +The latter would be similar to passing \code{get_fields("patent", group = "application")} +except it's the API that decides what fields to return.} \item{endpoint}{The web service resource you wish to search. Use \code{get_endpoints()} to list the available endpoints.} -\item{subent_cnts}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} Non-matched subentities -will always be returned under the new version of the API} +\item{subent_cnts}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} This is always FALSE in the +new version of the API as the total counts of unique subentities is no longer available.} \item{mtchd_subent_only}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} This is always -FALSE in the new version of the API.} +FALSE in the new version of the API as non-matched subentities +will always be returned.} + +\item{page}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} The new version of the API does not use +\code{page} as a parameter for paging, it uses \code{after}.} + +\item{per_page}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} The API now uses \code{size}} -\item{page}{The page number of the results that should be returned.} +\item{size}{The number of records that should be returned per page. This +value can be as high as 1,000 (e.g., \code{size = 1000}).} -\item{per_page}{The number of records that should be returned per page. This -value can be as high as 1,000 (e.g., \code{per_page = 1000}).} +\item{after}{A list of sort key values that defaults to NULL. This +exposes the API's paging parameter for users who want to implement their own +paging. It cannot be set when \code{all_pages = TRUE} as the R package manipulates it +for users automatically. See \href{../articles/result-set-paging.html}{result set paging}} \item{all_pages}{Do you want to download all possible pages of output? If -\code{all_pages = TRUE}, the values of \code{page} and \code{per_page} are -ignored.} +\code{all_pages = TRUE}, the value of \code{size} is ignored.} \item{sort}{A named character vector where the name indicates the field to sort by and the value indicates the direction of sorting (direction should -be either "asc" or "desc"). For example, \code{sort = c("patent_number" = - "asc")} or \cr\code{sort = c("patent_number" = "asc", "patent_date" = +be either "asc" or "desc"). For example, \code{sort = c("patent_id" = + "asc")} or \cr\code{sort = c("patent_id" = "asc", "patent_date" = "desc")}. \code{sort = NULL} (the default) means do not sort the results. You must include any fields that you wish to sort by in \code{fields}.} @@ -77,11 +92,11 @@ your query is very long (say, over 2,000 characters in length).} \item{error_browser}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}}} -\item{api_key}{API key. See \href{https://patentsview.org/apis/keyrequest}{ -Here} for info on creating a key.} +\item{api_key}{API key, it defaults to Sys.getenv("PATENTSVIEW_API_KEY"). Request a key +\href{https://patentsview-support.atlassian.net/servicedesk/customer/portals}{here}.} -\item{...}{Arguments passed along to httr's \code{\link[httr]{GET}} or -\code{\link[httr]{POST}} function.} +\item{...}{Curl options passed along to httr2's \code{\link[httr2]{req_options}} +when we do GETs or POSTs.} } \value{ A list with the following three elements: @@ -89,7 +104,7 @@ A list with the following three elements: \item{data}{A list with one element - a named data frame containing the data returned by the server. Each row in the data frame corresponds to a single value for the primary entity. For example, if you search the -assignees endpoint, then the data frame will be on the assignee-level, +assignee endpoint, then the data frame will be on the assignee-level, where each row corresponds to a single assignee. Fields that are not on the assignee-level would be returned in list columns.} @@ -119,8 +134,8 @@ search_pv( search_pv( query = qry_funs$gt(patent_year = 2010), method = "POST", - fields = "patent_number", - sort = c("patent_number" = "asc") + fields = "patent_id", + sort = c("patent_id" = "asc") ) search_pv( @@ -135,9 +150,14 @@ search_pv( ) search_pv( - query = qry_funs$contains(inventors_at_grant.name_last = "Smith"), + query = qry_funs$contains(inventors.inventor_name_last = "Smith"), endpoint = "patent", - config = httr::timeout(40) + timeout = 40 +) + +search_pv( + query = qry_funs$eq(patent_id = "11530080"), + fields = "application" ) } diff --git a/man/unnest_pv_data.Rd b/man/unnest_pv_data.Rd index a2c528ef..46de5723 100644 --- a/man/unnest_pv_data.Rd +++ b/man/unnest_pv_data.Rd @@ -4,7 +4,7 @@ \alias{unnest_pv_data} \title{Unnest PatentsView data} \usage{ -unnest_pv_data(data, pk = get_ok_pk(names(data))) +unnest_pv_data(data, pk = NULL) } \arguments{ \item{data}{The data returned by \code{\link{search_pv}}. This is the first @@ -14,8 +14,8 @@ inside it. See examples.} \item{pk}{The column/field name that will link the data frames together. This should be the unique identifier for the primary entity. For example, if you -used the patents endpoint in your call to \code{search_pv}, you could -specify \code{pk = "patent_number"}. \strong{This identifier has to have +used the patent endpoint in your call to \code{search_pv}, you could +specify \code{pk = "patent_id"}. \strong{This identifier has to have been included in your \code{fields} vector when you called \code{search_pv}}. You can use \code{\link{get_ok_pk}} to suggest a potential primary key for your data.} diff --git a/man/with_qfuns.Rd b/man/with_qfuns.Rd index 40f755bb..ba847615 100644 --- a/man/with_qfuns.Rd +++ b/man/with_qfuns.Rd @@ -23,26 +23,25 @@ try assigning the \code{\link{qry_funs}} list into your global environment with: \code{list2env(qry_funs, envir = globalenv())}. } \examples{ -# Without with_qfuns, we have to do: qry_funs$and( qry_funs$gte(patent_date = "2007-01-01"), qry_funs$text_phrase(patent_abstract = c("computer program")), qry_funs$or( - qry_funs$eq(inventor_last_name = "ihaka"), - qry_funs$eq(inventor_first_name = "chris") + qry_funs$eq(inventors.inventor_name_last = "Ihaka"), + qry_funs$eq(inventors.inventor_name_last = "Chris") ) ) -#...With it, this becomes: +# ...With it, this becomes: with_qfuns( - and( - gte(patent_date = "2007-01-01"), - text_phrase(patent_abstract = c("computer program")), - or( - eq(inventor_last_name = "ihaka"), - eq(inventor_first_name = "chris") - ) - ) + and( + gte(patent_date = "2007-01-01"), + text_phrase(patent_abstract = c("computer program")), + or( + eq(inventors.inventor_name_last = "Ihaka"), + eq(inventors.inventor_name_last = "Chris") + ) + ) ) } diff --git a/tests/testthat/helpers.R b/tests/testthat/helpers.R index 3fe9dc36..d39b4e00 100644 --- a/tests/testthat/helpers.R +++ b/tests/testthat/helpers.R @@ -1,5 +1,5 @@ # Vector of queries (one for each endpoint) that are used during testing. We -# need this b/c in the new version of the api, only three of the endpoints are +# need this b/c in the new version of the api, only ten of the endpoints are # searchable by patent number (i.e., we can't use a generic patent number # search query). further, now patent_number has been patent_id @@ -32,3 +32,22 @@ TEST_QUERIES <- c( "uspc_subclass" = '{"uspc_subclass_id": "100/1"}', "wipo" = '{"wipo_id": "1"}' ) + +to_plural <- function(x) { + pk <- get_ok_pk(x) + fieldsdf[fieldsdf$endpoint == x & fieldsdf$field == pk, "group"] +} + +to_singular <- function(entity) { + endpoint_df <- fieldsdf[fieldsdf$group == entity, ] + endpoint <- unique(endpoint_df$endpoint) + + # watch out here- several endpoints return entities that are groups returned + # by the patent and publication endpoints (attorneys, inventors, assignees) + if(length(endpoint) > 1) { + endpoint <- endpoint[!endpoint %in% c("patent", "publication")] + } + + # can't distinguish rel_app_texts between patent/rel_app_text and publication/rel_app_text + endpoint +} diff --git a/tests/testthat/test-api-bugs.R b/tests/testthat/test-api-bugs.R new file mode 100644 index 00000000..55e7cb3d --- /dev/null +++ b/tests/testthat/test-api-bugs.R @@ -0,0 +1,356 @@ + +# Tests from the other files in this directory that are masking API errors +# This file was submitted to the API team as PVS-1125 + +eps <- (get_endpoints()) + +add_base_url <- function(x) { + paste0("https://search.patentsview.org/api/v1/", x) +} + +test_that("there is trouble paging", { + skip_on_cran() + + # reprex inspired by https://patentsview.org/forum/7/topic/812 + # Not all requested groups are coming back as we page, causing + # Error in rbind(deparse.level, ...) : + # numbers of columns of arguments do not match + # This query fails if any of these groups are specified + # "applicants", "cpc_at_issue", "gov_interest_contract_award_numbers", + # "uspc_at_issue") + + query <- with_qfuns( + and( + gte(application.filing_date = "2000-01-01"), + eq(cpc_current.cpc_subclass_id = "A01D") + ) + ) + + sort <- c("patent_id" = "asc") + fields <- c( + "patent_id", "applicants", "cpc_at_issue", + "gov_interest_contract_award_numbers", "uspc_at_issue" + ) + + result1 <- search_pv(query, + method = "GET", all_pages = FALSE, + fields = fields, sort = sort, size = 1000 + ) + + result2 <- search_pv(query, + method = "GET", all_pages = FALSE, + fields = fields, sort = sort, size = 1000, after = "06901731" + ) + + # result1$data$patents$applicants is sparse, mostly NULL + # there isn't a result2$data$patents$applicants + names1 <- names(result1$data$patents) + names2 <- names(result2$data$patents) + + expect_failure( + expect_setequal(names1, names2) + ) +}) + +test_that("there is case sensitivity on string equals", { + skip_on_cran() + + # reported to the API team PVS-1147 + # not sure if this is a bug or feature - original API was case insensitive + # using both forms of equals, impied and explicit + + assignee <- "Johnson & Johnson International" + query1 <- sprintf('{"assignee_organization": \"%s\"}', assignee) + a <- search_pv(query1, endpoint = "assignee") + query2 <- qry_funs$eq(assignee_organization = assignee) + b <- search_pv(query2, endpoint = "assignee") + expect_equal(a$query_results$total_hits, 1) + expect_equal(b$query_results$total_hits, 1) + + assignee <- tolower(assignee) + query1 <- sprintf('{"assignee_organization": \"%s\"}', assignee) + c <- search_pv(query1, endpoint = "assignee") + query2 <- qry_funs$eq(assignee_organization = assignee) + d <- search_pv(query2, endpoint = "assignee") + expect_equal(c$query_results$total_hits, 0) + expect_equal(d$query_results$total_hits, 0) +}) + +test_that("string vs text operators behave differently", { + skip_on_cran() + + # # reported to the API team PVS-1147 + query <- qry_funs$begins(assignee_organization = "johnson") + a <- search_pv(query, endpoint = "assignee") + + query <- qry_funs$text_any(assignee_organization = "johnson") + b <- search_pv(query, endpoint = "assignee") + + expect_failure( + expect_equal(a$query_results$total_hits, b$query_results$total_hits) + ) +}) + +test_that("API returns all requested groups", { + skip_on_cran() + + # can we traverse the return building a list of fields? + # sort both requested fields and returned ones to see if they are equal + + # TODO: remove the trickery to get this test to pass, once the API is fixed + bad_eps <- c( + "cpc_subclasses", + "location" # Error: Invalid field: location_latitude + , "uspc_subclasse" # Error: Internal Server Error + , "uspc_mainclass" # Error: Internal Server Error + , "wipo" # Error: Internal Server Error + , "claim" # Error: Invalid field: claim_dependent + , "draw_desc_text" # Error: Invalid field: description_sequence + , "cpc_subclass" # 404? check the test query + , "uspc_subclass" # 404 + , "pg_claim" # Invalid field: claim_dependent + ) + + mismatched_returns <- c( + "patent", + "publication" + ) + + # this will fail when the api is fixed + z <- lapply(bad_eps, function(x) { + print(x) + expect_error( + j <- search_pv(query = TEST_QUERIES[[x]], endpoint = x, fields = get_fields(x)) + ) + }) + + # this will fail when the API is fixed + z <- lapply(mismatched_returns, function(x) { + print(x) + res <- search_pv( + query = TEST_QUERIES[[x]], + endpoint = x, + fields = get_fields(x) + ) + + dl <- unnest_pv_data(res$data) + + actual_groups <- names(dl) + expected_groups <- unique(fieldsdf[fieldsdf$endpoint == x, "group"]) + + # we now need to unnest the endpoints for the comparison to work + expected_groups <- gsub("^(patent|publication)/", "", expected_groups) + + # the expected group for unnested attributes would be "" in actuality the come back + # in an entity matching the plural form of the unnested endpoint + expected_groups <- replace(expected_groups, expected_groups == "", to_plural(x)) + + expect_failure( + expect_setequal(actual_groups, expected_groups) + ) + }) + + # make it noticeable that all is not right with the API + skip("Skip for API bugs") # TODO: remove when the API is fixed +}) + +eps <- (get_endpoints()) + +test_that("We can call all the legitimate HATEOAS endpoints", { + skip_on_cran() + + # these currently throw Error: Internal Server Error + broken_single_item_queries <- c( + "cpc_subclass/A01B/", + "uspc_mainclass/30/", + "uspc_subclass/30:100/", + "wipo/1/" + ) + + + # TODO: remove when this is fixed + # we'll know the api is fixed when this test fails + dev_null <- lapply(broken_single_item_queries, function(q) { + expect_error( + j <- retrieve_linked_data(add_base_url(q)) + ) + }) +}) + +test_that("individual fields are still broken", { + skip_on_cran() + + # Sample fields that cause 500 errors when requested by themselves. + # Some don't throw errors when included in get_fields() but they do if + # they are the only field requested. Other individual fields at these + # same endpoints throw errors. Check fields again when these fail. + sample_bad_fields <- c( + "assignee_organization" = "assignees", + "inventor_lastknown_longitude" = "inventors", + "inventor_gender_code" = "inventors", + "location_name" = "locations", + "attorney_name_last" = "patent/attorneys", + "citation_country" = "patent/foreign_citations", + "ipc_id" = "ipcs" + ) + + dev_null <- lapply(names(sample_bad_fields), function(x) { + endpoint <- sample_bad_fields[[x]] + expect_error( + out <- search_pv(query = TEST_QUERIES[[endpoint]], endpoint = endpoint, fields = c(x)) + ) + }) +}) + +test_that("we can't sort by all fields", { + skip_on_cran() + + # PVS-1377 + sorts_to_try <- c( + assignee = "assignee_lastknown_city", + cpc_class = "cpc_class_title", + cpc_group = "cpc_group_title", + cpc_subclass = "cpc_subclass", + g_brf_sum_text = "summary_text", + g_claim = "claim_text", + g_detail_desc_text = "description_text", + g_draw_desc_text = "draw_desc_text", + inventor = "inventor_lastknown_city", + patent = "patent_id" # good pair to show that the code works + ) + + results <- lapply(names(sorts_to_try), function(endpoint) { + field <- sorts_to_try[[endpoint]] + print(paste(endpoint, field)) + + tryCatch( + { + sort <- c("asc") + names(sort) <- field + j <- search_pv( + query = TEST_QUERIES[[endpoint]], + endpoint = endpoint, sort = sort, method = "GET" + ) + NA + }, + error = function(e) { + paste(endpoint, field) + } + ) + }) + + results <- results[!is.na(results)] + expect_gt(length(results), 0) + expect_lt(length(results), length(sorts_to_try)) # assert that at least one sort worked +}) + + +test_that("withdrawn patents are still present in the database", { + skip_on_cran() + + # PVS-1342 Underlying data issues + # There are 8,000 patents that were in the bulk xml files patentsiew is based on. + # The patents were subsequently withdrawn but not removed from the database + withdrawn <- c( + "9978309", "9978406", "9978509", "9978615", "9978659", + "9978697", "9978830", "9978838", "9978886", "9978906", "9978916", + "9979255", "9979355", "9979482", "9979700", "9979841", "9979847", + "9980139", "9980711", "9980782", "9981222", "9981277", "9981423", + "9981472", "9981603", "9981760", "9981914", "9982126", "9982172", + "9982670", "9982860", "9982871", "9983588", "9983756", "9984058", + "9984899", "9984952", "9985340", "9985480", "9985987", "9986046" + ) + + query <- qry_funs$eq("patent_id" = c(withdrawn)) + results <- search_pv(query, method = "POST") + expect_equal(results$query_results$total_hits, length(withdrawn)) +}) + +test_that("missing patents are still missing", { + skip_on_cran() + + # PVS-1342 Underlying data issues + # There are around 300 patents that aren't in the bulk xml files patentsiew is based on. + missing <- c( + "4097517", "4424514", "4480077", "4487876", "4704648", "4704721", + "4705017", "4705031", "4705032", "4705036", "4705037", "4705097", "4705107", + "4705125", "4705142", "4705169", "4705170", "4705230", "4705274", "4705328", + "4705412", "4705416", "4705437", "4705455", "4705462", "5493812", "5509710", + "5697964", "5922850", "6087542", "6347059", "6680878", "6988922", "7151114", + "7200832", "7464613", "7488564", "7606803", "8309694", "8455078" + ) + query <- qry_funs$eq("patent_id" = missing) + results <- search_pv(query, method = "POST") + + # This would fail if these patents are added to the patentsview database + expect_equal(results$query_results$total_hits, 0) +}) + +test_that("we can't explicitly request assignee_ or inventor_years.num_patents", { + skip_on_cran() + + bad_eps <- c( + "assignee", # Invalid field: assignee_years.num_patents. assignee_years is not a nested field + "inventor" # Invalid field: inventor_years.num_patents. + ) + + # PVS-1437 Errors are thrown when requesting assignee_years or inventor_years + # (it works if the group name is used but fails on fully qualified nested fields) + tmp <- lapply(bad_eps, function(endpoint) { + expect_error( + pv_out <- search_pv( + query = TEST_QUERIES[[endpoint]], + endpoint = endpoint, + fields = fieldsdf[fieldsdf$endpoint == endpoint, "field"] + ), + "Invalid field: (assignee|inventor)_years.num_patents" + ) + }) +}) + +test_that("uspcs aren't right", { + skip_on_cran() + + # PVS-1615 + + endpoint <- "patent" + res <- search_pv( + query = TEST_QUERIES[[endpoint]], + endpoint = endpoint, + fields = get_fields(endpoint, groups = "uspc_at_issue") + ) + + # id fields are correct, non id fields should be HATEOAS links + uspcs <- res$data$patents$uspc_at_issue + + # these should fail when the API is fixed + expect_equal(uspcs$uspc_mainclass, uspcs$uspc_mainclass_id) + expect_equal(uspcs$uspc_subclass, uspcs$uspc_subclass_id) +}) + +test_that("endpoints are still broken", { + skip_on_cran() + # this will fail when the api is fixed + + broken_endpoints <- c( + "claim" # Error: Invalid field: claim_dependent + , "draw_desc_text" # Error: Invalid field: description_sequence + , "cpc_subclass" # 404? check the test query + , "location" # Error: Invalid field: location_latitude + , "pg_claim" # Invalid field: claim_dependent + , "uspc_subclass" # Error: Internal Server Error + , "uspc_mainclass" # Error: Internal Server Error + , "wipo" # Error: Internal Server Error + ) + + dev_null <- lapply(broken_endpoints, function(x) { + print(x) + expect_error( + search_pv( + query = TEST_QUERIES[[x]], + endpoint = x, + fields = get_fields(x) + ) + ) + }) +}) diff --git a/tests/testthat/test-arg-validation.R b/tests/testthat/test-arg-validation.R deleted file mode 100644 index 8115baf2..00000000 --- a/tests/testthat/test-arg-validation.R +++ /dev/null @@ -1,40 +0,0 @@ -context("validate_args") - -test_that("validate_args throws errors for all bad args", { - skip_on_cran() - # TODO(any): Remove: - skip("Temp skip for API redesign PR") - - expect_error( - search_pv('{"patent_date":["1976-01-06"]}', endpoint = "patent"), - "endpoint" - ) - expect_error( - search_pv('{"patent_date":["1976-01-06"]}', method = "Post"), - "method" - ) - expect_error( - search_pv('{"patent_date":["1976-01-06"]}', subent_cnts = NULL), - "subent_cnts" - ) - expect_error( - search_pv('{"patent_date":["1976-01-06"]}', mtchd_subent_only = NULL), - "mtchd_subent_only" - ) - expect_error( - search_pv('{"patent_date":["1976-01-06"]}', per_page = "50"), - "per_page" - ) - expect_error( - search_pv('{"patent_date":["1976-01-06"]}', page = NA), - "page" - ) - expect_error( - search_pv( - '{"patent_date":["1976-01-06"]}', - fields = "patent_date", - sort = c("patent_number" = "asc") - ), - "sort" - ) -}) diff --git a/tests/testthat/test-cast-pv-data.R b/tests/testthat/test-cast-pv-data.R index 91eaa7ad..27dcae04 100644 --- a/tests/testthat/test-cast-pv-data.R +++ b/tests/testthat/test-cast-pv-data.R @@ -1,19 +1,72 @@ -context("cast_pv_data") +test_that("cast_pv_data casts patent fields as expected", { + skip_on_cran() + + pv_out <- search_pv( + query = '{"patent_id":"5116621"}', fields = get_fields("patent") + ) + + dat <- cast_pv_data(data = pv_out$data) + + # patent_date was received as a string and should be cast to a date + date <- class(dat$patents$patent_date) == "Date" + + # patent_detail_desc_length was recieved as an int and should still be one + num <- is.numeric(dat$patents$patent_detail_desc_length) + + # assignee type is a string like "3" from the api and gets cast to an integer + assignee_type <- is.numeric(dat$patents$assignees[[1]]$assignee_type[[1]]) + + expect_true(num && date && assignee_type) + + # application.rule_47_flag is received as a boolean and casting should leave it alone + expect_true(is.logical(dat$patents$application[[1]]$rule_47_flag)) +}) -test_that("cast_pv_data casts data types as expected", { +test_that("cast_pv_data casts assignee fields as expected", { skip_on_cran() - # TODO(any): Remove: - skip("Temp skip for API redesign PR") + # ** Invalid field: assignee_years.num_patents. assignee_years is not a nested field pv_out <- search_pv( - query = "{\"patent_number\":\"5116621\"}", fields = get_fields("patents") + query = '{"_text_phrase":{"assignee_individual_name_last": "Clinton"}}', + endpoint = "assignee", + fields = get_fields("assignee", groups = "assignees") # ** ) dat <- cast_pv_data(data = pv_out$data) - date <- !is.character(dat$patents$patent_date) - num <- is.numeric(dat$patents$patent_num_claims) - date2 <- !is.character(dat$patents$assignees[[1]]$assignee_last_seen_date[1]) + # latitude comes from the api as numeric and is left as is by casting + lat <- is.numeric(dat$assignees$assignee_lastknown_latitude[[1]]) + + # here we have the same funky conversion mentioned above + # on the field "assigneee_type" + assignee_type <- is.numeric(dat$assignees$assignee_type[[1]]) + + # was first seen date cast properly? + cast_date <- class(dat$assignees$assignee_first_seen_date[[1]]) == "Date" + + # integer from the API should remain an integer + years_active <- is.numeric(dat$assignees$assignee_years_active[[1]]) + + expect_true(lat) + expect_true(assignee_type) + expect_true(cast_date) + expect_true(years_active) + + skip("Skip for API bugs") +}) + +test_that("we can cast a bool", { + skip_on_cran() + + # TODO(any): remove when the API returns this as a boolean + fields <- c("rule_47_flag") + endpoint <- "publication" + results <- search_pv(query = TEST_QUERIES[[endpoint]], endpoint = endpoint, fields = fields) + + # this would fail when the API is fixed + expect_true(is.character(results$data$publications$rule_47_flag)) + + cast_results <- cast_pv_data(results$data) - expect_true(date && num && date2) + expect_true(is.logical(cast_results$publications$rule_47_flag)) }) diff --git a/tests/testthat/test-check-query.R b/tests/testthat/test-check-query.R new file mode 100644 index 00000000..8f846032 --- /dev/null +++ b/tests/testthat/test-check-query.R @@ -0,0 +1,98 @@ + +test_that("errors are thrown on invalid queries", { + skip_on_cran() + + expect_error( + search_pv(qry_funs$eq("shoe_size" = 11.5)), + "^.* is not a valid field to query for your endpoint$" + ) + + expect_error( + search_pv(qry_funs$gt("patent_id" = "10000000")), + "^You cannot use the operator .* with the field .*$" + ) + + expect_error( + search_pv(qry_funs$eq("patent_date" = "10000000")), + "^Bad date: .*\\. Date must be in the format of yyyy-mm-dd$" + ) + + expect_error( + search_pv(qry_funs$gt("patent_id" = 10000000)), + "^.* must be of type character$" + ) + + expect_error( + search_pv(qry_funs$gt("patent_year" = 1980.5)), + "^.* must be an integer$" + ) + + expect_error( + search_pv(qry_funs$gt("patent_year" = "1980")), + "^.* must be an integer$" + ) + + expect_error( + search_pv(qry_funs$eq("application.rule_47_flag" = "TRUE")), + "^.* must be a boolean$" + ) + + expect_error( + search_pv(qry_funs$eq("rule_47_flag" = TRUE), endpoint = "publication"), + "^.* must be of type character$" + ) + + expect_error( + search_pv(qry_funs$gt("location_latitude" = "TRUE"), endpoint = "location"), + "^.* must be a number$" + ) + + expect_error( + search_pv(list(patent_number = "10000000")), + "is not a valid operator or not a valid field" + ) + + bogus_operator_query <- + list( + "_ends_with" = + list(patent_title = "dog") + ) + + expect_error( + search_pv(bogus_operator_query), + "is not a valid operator or not a valid field" + ) +}) + +test_that("a valid nested field can be queried", { + skip_on_cran() + + results <- search_pv(qry_funs$eq("application.rule_47_flag" = FALSE)) + + expect_gt(results$query_results$total_hits, 8000000) +}) + +test_that("the _eq message is thrown when appropriate", { + skip_on_cran() + + expect_message( + search_pv(list(patent_date = "2007-03-06")), + "^The _eq operator is a safer alternative to using field:value pairs" + ) +}) + +test_that("a query with an and operator returns results", { + skip_on_cran() + + patents_query <- + with_qfuns( + and( + text_phrase(inventors.inventor_name_first = "George"), + text_phrase(inventors.inventor_name_last = "Washington") + ) + ) + + result <- search_pv(patents_query) + + expect_gte(result$query_results$total_hits, 1) +}) diff --git a/tests/testthat/test-search-pv.R b/tests/testthat/test-search-pv.R index 29857741..494c3308 100644 --- a/tests/testthat/test-search-pv.R +++ b/tests/testthat/test-search-pv.R @@ -1,18 +1,37 @@ -context("search_pv") -# TODO: add a test to see if all the requested fields come back +add_base_url <- function(x) { + paste0("https://search.patentsview.org/api/v1/", x) +} endpoints <- get_endpoints() test_that("API returns expected df names for all endpoints", { skip_on_cran() - df_names <- vapply(endpoints, function(x) { + broken_endpoints <- c( + "cpc_subclass", + "uspc_subclass", + "uspc_mainclass", + "wipo" + ) + + # these both return rel_app_texts + overloaded_entities <- c("patent/rel_app_text", "publication/rel_app_text") + + goodendpoints <- endpoints[!endpoints %in% c(broken_endpoints, overloaded_entities)] + + df_names <- vapply(goodendpoints, function(x) { + print(x) out <- search_pv(query = TEST_QUERIES[[x]], endpoint = x) - names(out[[1]]) + + # now the endpoints are singular and most entites are plural + to_singular(names(out[[1]])) }, FUN.VALUE = character(1), USE.NAMES = FALSE) - expect_equal(endpoints, df_names) + # publication/rel_app_text's entity is rel_app_text_publications + df_names <- gsub("rel_app_text_publication", "rel_app_text", df_names) + + expect_equal(goodendpoints, df_names) }) test_that("DSL-based query returns expected results", { @@ -37,18 +56,26 @@ test_that("You can download up to 9,000+ records", { # Should return 9,000+ rows query <- with_qfuns( and( - gte(patent_date = "2021-12-13"), - lte(patent_date = "2021-12-24") + gte(patent_date = "2021-12-13"), + lte(patent_date = "2021-12-24") ) ) - out <- search_pv(query, per_page = 1000, all_pages = TRUE) + out <- search_pv(query, size = 1000, all_pages = TRUE) expect_gt(out$query_results$total_hits, 9000) }) test_that("search_pv can pull all fields for all endpoints", { skip_on_cran() - dev_null <- lapply(endpoints, function(x) { + troubled_endpoints <- c( + "cpc_subclass", "location", + "uspc_subclass", "uspc_mainclass", "wipo", "claim", "draw_desc_text", + "pg_claim" # Invalid field: claim_dependent + ) + + # We should be able to get all fields from the non troubled endpoints + dev_null <- lapply(endpoints[!(endpoints %in% troubled_endpoints)], function(x) { + print(x) search_pv( query = TEST_QUERIES[[x]], endpoint = x, @@ -62,13 +89,13 @@ test_that("Sort option works as expected", { skip_on_cran() out <- search_pv( - qry_funs$neq(assignee_id = 1), - fields = get_fields("assignees"), - endpoint = "assignees", - sort = c("lastknown_latitude" = "desc"), - per_page = 100 + qry_funs$neq(assignee_id = ""), + fields = get_fields("assignee", groups = c("assignees")), + endpoint = "assignee", + sort = c("assignee_lastknown_latitude" = "desc"), + size = 100 ) - lat <- as.numeric(out$data$assignees$lastknown_latitude) + lat <- as.numeric(out$data$assignees$assignee_lastknown_latitude) expect_true(lat[1] >= lat[100]) }) @@ -76,45 +103,30 @@ test_that("search_pv properly URL encodes queries", { skip_on_cran() # Covers https://github.com/ropensci/patentsview/issues/24 - # need to use the assignee endpoint now and the field is full_text - ampersand_query <- with_qfuns(text_phrase(organization = "Johnson & Johnson")) - dev_null <- search_pv(ampersand_query, endpoint = "assignees") - expect_true(TRUE) -}) + # need to use the assignee endpoint now + organization <- "Johnson & Johnson International" + text_query <- with_qfuns(text_phrase(assignee_organization = organization)) + phrase_search <- search_pv(text_query, endpoint = "assignee") + expect_true(phrase_search$query_results$total_hits == 1) -# Below we request the same data in built_singly and result_all, with the only -# difference being that we intentionally get throttled in built_singly by -# sending one request per patent number (instead of all requests at once). If -# the two responses match, then we've correctly handled throttling errors. -test_that("Throttled requests are automatically retried", { - skip_on_cran() + # also test that the string operator does not matter now + eq_query <- with_qfuns(eq(assignee_organization = organization)) + eq_search <- search_pv(eq_query, endpoint = "assignee") + expect_identical(eq_search$data, phrase_search$data) - res <- search_pv('{"_gte":{"patent_date":"2007-01-04"}}', per_page = 50) - patent_numbers <- res$data$patents$patent_number + # text_phrase seems to be case insensitive but equal is not + organization <- tolower(organization) - built_singly <- lapply(patent_numbers, function(patent_number) { - search_pv( - query = qry_funs$eq(patent_number = patent_number), - endpoint = "patent_citations", - fields = c("patent_number", "cited_patent_number"), - sort = c("cited_patent_number" = "asc") - )[["data"]][["patent_citations"]] - }) - built_singly <- do.call(rbind, built_singly) + text_query <- with_qfuns(text_phrase(assignee_organization = organization)) + phrase_search <- search_pv(text_query, endpoint = "assignee") + expect_true(phrase_search$query_results$total_hits == 1) - result_all <- search_pv( - query = qry_funs$eq(patent_number = patent_numbers), - endpoint = "patent_citations", - fields = c("patent_number", "cited_patent_number"), - sort = c("patent_number" = "asc", "cited_patent_number" = "asc"), - per_page = 1000, - all_pages = TRUE - ) - result_all <- result_all$data$patent_citations - - expect_identical(built_singly, result_all) + eq_query <- with_qfuns(eq(assignee_organization = organization)) + eq_search <- search_pv(eq_query, endpoint = "assignee") + expect_true(eq_search$query_results$total_hits == 0) }) + test_that("We won't expose the user's patentsview API key to random websites", { skip_on_cran() @@ -128,28 +140,339 @@ test_that("We can call all the legitimate HATEOAS endpoints", { skip_on_cran() single_item_queries <- c( - "https://search.patentsview.org/api/v1/assignee/10/", - "https://search.patentsview.org/api/v1/cpc_group/A01B/", - "https://search.patentsview.org/api/v1/cpc_subgroup/G01S7:4811/", - "https://search.patentsview.org/api/v1/cpc_subsection/A01/", - "https://search.patentsview.org/api/v1/inventor/10/", - "https://search.patentsview.org/api/v1/nber_category/1/", - "https://search.patentsview.org/api/v1/nber_subcategory/11/", - "https://search.patentsview.org/api/v1/patent/10757852/", - "https://search.patentsview.org/api/v1/uspc_mainclass/30/", - "https://search.patentsview.org/api/v1/uspc_subclass/30:100/" + "cpc_subclass/A01B/", + "cpc_class/A01/", + "cpc_group/G01S7:4811/", + "patent/10757852/", + "uspc_mainclass/30/", + "uspc_subclass/30:100/", + "wipo/1/", + "publication/20010000001/" ) + + # these currently throw Error: Internal Server Error + broken_single_item_queries <- c( + "cpc_subclass/A01B/", + "uspc_mainclass/30/", + "uspc_subclass/30:100/", + "wipo/1/" + ) + + single_item_queries <- single_item_queries[!single_item_queries %in% broken_single_item_queries] + dev_null <- lapply(single_item_queries, function(q) { - j <- retrieve_linked_data(q) + print(q) + j <- retrieve_linked_data(add_base_url(q)) expect_equal(j$query_results$total_hits, 1) }) multi_item_queries <- c( - "https://search.patentsview.org/api/v1/application_citation/10966293/", - "https://search.patentsview.org/api/v1/patent_citation/10966293/" + "patent/us_application_citation/10966293/", + "patent/us_patent_citation/10966293/" ) dev_null <- lapply(multi_item_queries, function(q) { - j <- retrieve_linked_data(q) + j <- retrieve_linked_data(add_base_url(q)) expect_true(j$query_results$total_hits > 1) }) + + + # We'll make a call to get an inventor and assignee HATEOAS link + # in case their ids are not persistent + # new weirdness: we request inventor_id and assignee_id but the + # fields come back without the _id + res <- search_pv('{"patent_id":"10000000"}', + fields = c("inventors.inventor_id", "assignees.assignee_id") + ) + + assignee <- retrieve_linked_data(res$data$patents$assignees[[1]]$assignee) + expect_true(assignee$query_results$total_hits == 1) + + inventor <- retrieve_linked_data(res$data$patents$inventors[[1]]$inventor) + expect_true(inventor$query_results$total_hits == 1) + + # Query to get a location HATEOAS link in case location_ids are not persistent + res <- search_pv('{"location_name":"Chicago"}', + fields = c("location_id"), + endpoint = "location" + ) + + location <- retrieve_linked_data(add_base_url(paste0("location/", res$data$locations$location_id, "/"))) + expect_true(location$query_results$total_hits == 1) +}) + +# Make sure gets and posts return the same data. +# Posts had issues that went undetected for a while using the new API +# (odd results with posts when either no fields or sort was passed +# see get_post_body in search-pv.R) + +test_that("posts and gets return the same data", { + skip_on_cran() + + bad_eps <- c( + "cpc_subclass" + # ,"location" # Error: Invalid field: location_latitude + , "uspc_subclass" # Error: Internal Server Error + , "uspc_mainclass" # Error: Internal Server Error + , "wipo" # Error: Internal Server Error + , "claim" # Error: Invalid field: claim_dependent + , "draw_desc_text" # Error: Invalid field: description_sequence + , "cpc_subclass" # 404? check the test query + , "uspc_subclass" # 404 + # , "pg_claim" # check this one + ) + + good_eps <- endpoints[!endpoints %in% bad_eps] + + z <- lapply(good_eps, function(endpoint) { + print(endpoint) + get_res <- search_pv( + query = TEST_QUERIES[[endpoint]], + endpoint = endpoint, + method = "GET" + ) + + g <- unnest_pv_data(get_res$data, pk = get_ok_pk(endpoint)) + + post_res <- search_pv( + query = TEST_QUERIES[[endpoint]], + endpoint = endpoint, + method = "POST" + ) + + p <- unnest_pv_data(post_res$data) + + expect_equal(g, p) + }) +}) + +test_that("nested shorthand produces the same results as fully qualified ones", { + skip_on_cran() + + # the API now allows a shorthand in the fields/f: parameter + # just the group name will retrieve all that group's attributes + # This is indirectly testing our parse of the OpenAPI object and actual API responses + fields <- fieldsdf[fieldsdf$endpoint == "patent" & fieldsdf$group == "application", "field"] + + shorthand_res <- search_pv(TEST_QUERIES[["patent"]], fields = c("application")) + qualified_res <- search_pv(TEST_QUERIES[["patent"]], fields = fields) + + # the request$urls will be different but the data should match + expect_failure(expect_equal(shorthand_res$request$url, qualified_res$request$url)) + expect_equal(shorthand_res$data, qualified_res$data) +}) + + +test_that("the 'after' parameter works properly", { + skip_on_cran() + + sort <- c("patent_id" = "asc") + big_query <- qry_funs$eq(patent_date = "2000-01-04") # 3003 total_hits + results <- search_pv(big_query, all_pages = FALSE, sort = sort) + expect_gt(results$query_results$total_hits, 1000) + + after <- results$data$patents$patent_id[[nrow(results$data$patents)]] + subsequent <- search_pv(big_query, all_pages = FALSE, after = after, sort = sort) + + # ** New API bug? should be expect_equal `actual`: 399 + expect_lt(nrow(subsequent$data$patents), 1000) + + # the first row's patent_id should be bigger than after + # now "D418273" + # expect_gt(as.integer(subsequent$data$patents$patent_id[[1]]), as.integer(after)) + + # now we'll add a descending sort to make sure that also works + sort <- c("patent_id" = "desc") + fields <- NULL # c("patent_id") + + results <- search_pv(big_query, all_pages = FALSE, fields = fields, sort = sort) + after <- results$data$patents$patent_id[[nrow(results$data$patents)]] + + subsequent <- search_pv(big_query, + all_pages = FALSE, after = after, sort = sort, + fields = fields + ) + + # now the first row's patent_id should be smaller than after + # should be expect_lt + expect_gt(as.integer(subsequent$data$patents$patent_id[[1]]), as.integer(after)) + skip("New API bug?") +}) + +test_that("the documentation and Swagger UI URLs work properly", { + skip_on_cran() + + documentation_url <- + 'https://search.patentsview.org/api/v1/patent/?q={"_text_any":{"patent_title":"COBOL cotton gin"}}&s=[{"patent_id": "asc" }]&o={"size":50}&f=["inventors.inventor_name_last","patent_id","patent_date","patent_title"]' + + results <- retrieve_linked_data(documentation_url) + + expect_gt(results$query_results$total_hits, 0) + + swagger_url <- "https://search.patentsview.org/api/v1/patent/?q=%7B%22patent_date%22%3A%221976-01-06%22%7D" + + results <- retrieve_linked_data(swagger_url, encoded = TRUE) + expect_gt(results$query_results$total_hits, 0) +}) + +test_that("an error occurs if all_pages is TRUE and there aren't any results", { + skip_on_cran() + + too_early <- qry_funs$lt(patent_date = "1976-01-01") + + results <- search_pv(too_early, all_pages = FALSE) + + # would like this test to fail! (meaning API added earlier data) + expect_equal(results$query_results$total_hits, 0) + + expect_error( + search_pv(too_early, all_pages = TRUE), + "No records matched your query" + ) +}) + +test_that("we can retrieve all_pages = TRUE without specifiying fields", { + skip_on_cran() + + query <- qry_funs$eq(patent_date = "1976-01-06") + sort <- c("patent_type" = "asc", "patent_id" = "asc") + + # here we aren't requesting fields but are requesting a sort + results <- search_pv(query, sort = sort, all_pages = TRUE) + + expect_gt(results$query_results$total_hits, 1300) +}) + +# Below we request the same data in built_singly and result_all, with the only +# difference being that we intentionally get throttled in built_singly by +# sending one request per patent number (instead of all requests at once). If +# the two responses match, then we've correctly handled throttling errors. +test_that("Throttled requests are automatically retried", { + skip_on_cran() + + res <- search_pv('{"_gte":{"patent_date":"2007-01-04"}}', size = 50) + patent_ids <- res$data$patents$patent_id + + # now we don't get message "The API's requests per minute limit has been reached. " + # so we'll testthat it takes over 60 seconds to run (since we got throttled) + # TODO(any): can we use evaluate_promise to find "Waiting 45s for retry backoff"? + + duration <- system.time( + built_singly <- lapply(patent_ids, function(patent_id) { + search_pv( + query = qry_funs$eq(patent_id = patent_id), + endpoint = "patent/us_patent_citation", + fields = c("patent_id", "citation_patent_id"), + sort = c("citation_patent_id" = "asc") + )[["data"]][["us_patent_citations"]] + }) + ) + + expect_gt(duration[["elapsed"]], 60) + + built_singly <- do.call(rbind, built_singly) + + # we'll also test that the results are the same for a post and get + # when there is a secondary sort on the bulk requests + sort <- c("patent_id" = "asc", "citation_patent_id" = "asc") + methods <- c("POST", "GET") + output <- lapply(methods, function(method) { + result_all <- search_pv( + query = qry_funs$eq(patent_id = patent_ids), + endpoint = "patent/us_patent_citation", + fields = c("patent_id", "citation_patent_id"), + sort = sort, + size = 1000, + all_pages = TRUE, + method = method + ) + result_all <- result_all$data$us_patent_citations + }) + + expect_equal(output[[1]], output[[2]]) + + # We'll do our own sort and check that it matches the API output + # We want to make sure we sent in the sort parameter correctly, where + # the API is doing the sort (since the we didn't need to page) + + second_output <- output[[2]] + + # Sorting logic using order() + sort_order <- mapply(function(col, direction) { + if (direction == "asc") { + return(second_output[[col]]) + } else { + return(-rank(second_output[[col]], ties.method = "min")) # Invert for descending order + } + }, col = names(sort), direction = as.vector(sort), SIMPLIFY = FALSE) + + # Final sorting + second_output <- second_output[do.call(order, sort_order), , drop = FALSE] + + expect_equal(output[[1]], second_output) + + # TODO(any): fix this: + # expect_equal says actual row.names are an integer vector and expected + # row.names is a character vector. Not sure why + row.names(output[[1]]) <- NULL + row.names(built_singly) <- NULL + + expect_equal(built_singly, output[[1]]) +}) + +test_that("we can sort on an unrequested field across page boundaries", { + skip_on_cran() + + # total_hits = 5,352 + query <- qry_funs$in_range(patent_date = c("1976-01-01", "1976-01-31")) + fields <- c("patent_title", "patent_date") + sort <- c("patent_date" = "desc", "patent_id" = "desc") + + r_ordered <- search_pv( + query = query, + fields = fields, + sort = sort, + all_pages = TRUE + ) + + fields <- c(fields, "patent_id") + api_ordered <- search_pv( + query = query, + fields = fields, + sort = sort, + all_pages = TRUE + ) + + # Remove patent_id before comparison. We're also indirectly testing that the + # patent_id field added by the first search_pv was removed, otherwise this + # expect equal would fail + api_ordered$data$patents[["patent_id"]] <- NULL + expect_equal(r_ordered$data, api_ordered$data) +}) + +test_that("sort works across page boundaries", { + skip_on_cran() + + sort <- c("patent_type" = "desc", "patent_id" = "desc") + results <- search_pv( + qry_funs$eq(patent_date = "1976-01-06"), + fields = c("patent_type", "patent_id"), + sort = sort, + all_pages = TRUE + ) + + double_check <- results$data$patents + + # Sorting logic using order() + sort_order <- mapply(function(col, direction) { + if (direction == "asc") { + return(double_check[[col]]) + } else { + return(-rank(double_check[[col]], ties.method = "min")) # Invert for descending order + } + }, col = names(sort), direction = as.vector(sort), SIMPLIFY = FALSE) + + # Final sorting + double_check <- double_check[do.call(order, sort_order), , drop = FALSE] + + expect_equal(results$data$patents, double_check) }) diff --git a/tests/testthat/test-unnest-pv-data.R b/tests/testthat/test-unnest-pv-data.R index eb2807cd..afe0fcbc 100644 --- a/tests/testthat/test-unnest-pv-data.R +++ b/tests/testthat/test-unnest-pv-data.R @@ -1,23 +1,78 @@ -context("unnest_pv_data") - eps <- get_endpoints() -test_that("", { +test_that("we can unnest all entities", { skip_on_cran() - # TODO(any): Remove: - skip("Temp skip for API redesign PR") - eps_no_loc <- eps[eps != "locations"] + # TODO(any): add back fields = get_fields(x) + # API throws 500s if some nested fields are included + + # locations endpoint is back but it fails this test + bad_endpoints <- c( + "location", "cpc_subclass", + "uspc_subclass", "uspc_mainclass", "wipo", + "claim", "draw_desc_text", "pg_claim" + ) + overloaded_entities <- c("patent/rel_app_text", "publication/rel_app_text") + good_eps <- eps[!eps %in% c(bad_endpoints, overloaded_entities)] + + z <- lapply(good_eps, function(x) { + print(x) - z <- lapply(eps_no_loc, function(x) { - Sys.sleep(1) pv_out <- search_pv( - "{\"patent_number\":\"5116621\"}", + query = TEST_QUERIES[[x]], endpoint = x, - fields = get_fields(x) + fields = get_fields(x, group = to_plural(x)) # requesting non-nested attributes ) + + expect_gt(pv_out$query_results$total_hits, 0) # check that the query worked unnest_pv_data(pv_out[["data"]]) }) expect_true(TRUE) + + # this will fail when the api is fixed + z <- lapply(bad_endpoints, function(x) { + print(x) + expect_error( + pv_out <- search_pv( + query = TEST_QUERIES[[x]], + endpoint = x, + fields = get_fields(x, group = to_plural(x)) # requesting non-nested attributes + ) + ) + }) + + # make it noticeable that all is not right with the API + skip("Skip for API bugs") # TODO: remove when the API is fixed/bad_endpoints removed +}) + +test_that("endpoint's pks match their entity's pks", { + skip_on_cran() + + # the overloaded_entities endpoints return the same entity, rel_app_texts, + # so we can't determine the endpoint from the entity like we can + # for the rest of the entities + overloaded_entities <- c("patent/rel_app_text", "publication/rel_app_text") + bad_endpoints <- c("uspc_subclass", "cpc_subclass", "uspc_mainclass", "wipo") + good_eps <- eps[!eps %in% c(bad_endpoints, overloaded_entities)] + + endpoint_pks <- lapply(good_eps, function(endpoint) { + print(endpoint) + get_ok_pk(endpoint) + }) + + entity_pks <- lapply(good_eps, function(endpoint) { + result <- search_pv(TEST_QUERIES[[endpoint]], endpoint = endpoint) + get_ok_pk(names(result$data)) + }) + + expect_equal(endpoint_pks, entity_pks) + + # this will fail when the api is fixed + z <- lapply(bad_endpoints, function(endpoint) { + print(endpoint) + expect_error( + result <- search_pv(TEST_QUERIES[[endpoint]], endpoint = endpoint) + ) + }) }) diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R new file mode 100644 index 00000000..f38200b9 --- /dev/null +++ b/tests/testthat/test-utils.R @@ -0,0 +1,12 @@ +test_that("we can cast the endpoints that return the same entity", { + skip_on_cran() + + endpoints <- c("patent/rel_app_text", "publication/rel_app_text") + + nul <- lapply(endpoints, function(endpoint) { + results <- search_pv(query = TEST_QUERIES[[endpoint]], endpoint = endpoint) + cast <- cast_pv_data(results$data) + }) + + expect_true(TRUE) +})