diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 906c0fb1..093db343 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -23,10 +23,10 @@ jobs: steps: - name: Confirm crew102 triggered the build run: | - if [ "${{ github.event.sender.login }}" == "crew102" ]; then + if [ "${{ env.GITHUB_ACTOR }}" == "crew102" ]; then echo "Actor is crew102" else - echo "Actor is ${{ github.actor }}, failing build." + echo "Actor is ${{ env.GITHUB_ACTOR }}, failing build." exit 1 fi @@ -119,4 +119,4 @@ jobs: uses: actions/upload-artifact@main with: name: ${{ runner.os }}-r${{ matrix.config.r }}-results - path: check \ No newline at end of file + path: check diff --git a/DESCRIPTION b/DESCRIPTION index cacc3d76..37ee6a1e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -19,7 +19,7 @@ LazyData: TRUE Depends: R (>= 3.1) Imports: - httr, + httr2, lifecycle, jsonlite, utils diff --git a/NAMESPACE b/NAMESPACE index 46981d70..2cff0eaa 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,6 +10,7 @@ export(cast_pv_data) export(get_endpoints) export(get_fields) export(get_ok_pk) +export(pad_patent_id) export(qry_funs) export(retrieve_linked_data) export(search_pv) diff --git a/R/cast-pv-data.R b/R/cast-pv-data.R index 57716461..e3cd59da 100644 --- a/R/cast-pv-data.R +++ b/R/cast-pv-data.R @@ -5,21 +5,23 @@ as_is <- function(x) x get_cast_fun <- function(data_type) { # Some fields aren't documented, so we don't know what their data type is. Use # string type for these. + # new version of the API: state of string vs fulltext is in flux. Latter currently unused if (length(data_type) != 1) data_type <- "string" - switch( - data_type, + switch(data_type, "string" = as_is, "date" = as.Date, - "float" = as.numeric, - "integer" = as.integer, + "number" = as_is, + "integer" = as_is, "int" = as.integer, - "fulltext" = as_is + "fulltext" = as_is, + "boolean" = as_is, + "bool" = as.logical ) } #' @noRd lookup_cast_fun <- function(name, typesdf) { - data_type <- typesdf[typesdf$field == name, "data_type"] + data_type <- typesdf[typesdf$common_name == name, "data_type"] get_cast_fun(data_type = data_type) } @@ -29,6 +31,18 @@ cast_one.character <- function(one, name, typesdf) { cast_fun(one) } +#' @noRd +cast_one.double <- function(one, name, typesdf) { + cast_fun <- lookup_cast_fun(name, typesdf) + cast_fun(one) +} + +#' @noRd +cast_one.integer <- function(one, name, typesdf) { + cast_fun <- lookup_cast_fun(name, typesdf) + cast_fun(one) +} + #' @noRd cast_one.default <- function(one, name, typesdf) NA @@ -69,7 +83,7 @@ cast_one <- function(one, name, typesdf) UseMethod("cast_one") #' \dontrun{ #' #' fields <- c("patent_date", "patent_title", "patent_year") -#' res <- search_pv(query = "{\"patent_number\":\"5116621\"}", fields = fields) +#' res <- search_pv(query = "{\"patent_id\":\"5116621\"}", fields = fields) #' cast_pv_data(data = res$data) #' } #' @@ -77,9 +91,25 @@ cast_one <- function(one, name, typesdf) UseMethod("cast_one") cast_pv_data <- function(data) { validate_pv_data(data) - endpoint <- names(data) + entity_name <- names(data) + + if (entity_name == "rel_app_texts") { + # blend the fields from both rel_app_texts entities + typesdf <- unique(fieldsdf[fieldsdf$group == entity_name, c("common_name", "data_type")]) + } else { + # need to get the endpoint from entity_name + endpoint_df <- fieldsdf[fieldsdf$group == entity_name, ] + endpoint <- unique(endpoint_df$endpoint) + + # watch out here- several endpoints return entities that are groups returned + # by the patent and publication endpoints (attorneys, inventors, assignees) + if(length(endpoint) > 1) { + endpoint <- endpoint[!endpoint %in% c("patent", "publication")] + } + + typesdf <- fieldsdf[fieldsdf$endpoint == endpoint, c("common_name", "data_type")] - typesdf <- fieldsdf[fieldsdf$endpoint == endpoint, c("field", "data_type")] + } df <- data[[1]] @@ -89,7 +119,7 @@ cast_pv_data <- function(data) { df[] <- list_out out_data <- list(x = df) - names(out_data) <- endpoint + names(out_data) <- entity_name structure( out_data, diff --git a/R/check-query.R b/R/check-query.R index c572e02e..238c8824 100644 --- a/R/check-query.R +++ b/R/check-query.R @@ -10,28 +10,32 @@ is_int <- function(x) #' @noRd is_date <- function(x) - grepl("[12][[:digit:]]{3}-[01][[:digit:]]-[0-3][[:digit:]]", x) + grepl("^[12][[:digit:]]{3}-[01][[:digit:]]-[0-3][[:digit:]]$", x) #' @noRd one_check <- function(operator, field, value, f1) { - if (nrow(f1) == 0) stop2(field, " is not a valid field to query for your endpoint") if (f1$data_type == "date" && !is_date(value)) stop2("Bad date: ", value, ". Date must be in the format of yyyy-mm-dd") - if (f1$data_type %in% c("string", "fulltext") && !is.character(value)) + if (f1$data_type %in% c("bool", "int", "string", "fulltext") && !is.character(value)) stop2(value, " must be of type character") if (f1$data_type == "integer" && !is_int(value)) stop2(value, " must be an integer") + if (f1$data_type == "boolean" && !is.logical(value)) + stop2(value, " must be a boolean") + if (f1$data_type == "number" && !is.numeric(value)) + stop2(value, " must be a number") if ( - (operator %in% c("_begins", "_contains") && !(f1$data_type == "string")) || - (operator %in% c("_text_all", "_text_any", "_text_phrase") && - !(f1$data_type == "fulltext")) || - (f1$data_type %in% c("string", "fulltext") && - operator %in% c("_gt", "_gte", "_lt", "_lte")) - ) + # The new version of the API blurrs the distinction between string/fulltext fields. + # It looks like the string/fulltext functions can be used interchangeably + (operator %in% c("_begins", "_contains", "_text_all", "_text_any", "_text_phrase") && + !(f1$data_type == "fulltext" || f1$data_type == "string")) || + (f1$data_type %in% c("string", "fulltext") && + operator %in% c("_gt", "_gte", "_lt", "_lte"))) { stop2("You cannot use the operator ", operator, " with the field ", field) + } } #' @noRd @@ -40,13 +44,16 @@ check_query <- function(query, endpoint) { num_opr <- c("_gt", "_gte", "_lt", "_lte") str_opr <- c("_begins", "_contains") fltxt_opr <- c("_text_all", "_text_any", "_text_phrase") - all_opr <- c(simp_opr, num_opr, str_opr, fltxt_opr) + all_opr <- c(simp_opr, num_opr, str_opr, fltxt_opr, "_in_range") - flds_flt <- fieldsdf[fieldsdf$endpoint == endpoint & fieldsdf$can_query == "y", ] + flds_flt <- fieldsdf[fieldsdf$endpoint == endpoint, ] apply_checks <- function(x, endpoint) { x <- swap_null_nms(x) - if (names(x) %in% c("_not", "_and", "_or") || is.na(names(x))) { + + # troublesome next line: 'length(x) = 2 > 1' in coercion to 'logical(1)' + # if (names(x) %in% c("_not", "_and", "_or") || is.na(names(x))) { + if (length(names(x)) > 1 || names(x) %in% c("_not", "_and", "_or") || is.na(names(x))) { lapply(x, FUN = apply_checks) } else if (names(x) %in% all_opr) { f1 <- flds_flt[flds_flt$field == names(x[[1]]), ] @@ -61,8 +68,8 @@ check_query <- function(query, endpoint) { ) } else { stop2( - names(x), " is either not a valid operator or not a ", - "queryable field for this endpoint" + names(x), " is not a valid operator or not a ", + "valid field for this endpoint" ) } } diff --git a/R/data.R b/R/data.R index 53cc8ecb..49ae72f3 100644 --- a/R/data.R +++ b/R/data.R @@ -3,7 +3,7 @@ #' A data frame containing the names of retrievable fields for each of the #' endpoints. You can find this data on the API's online documentation for each #' endpoint as well (e.g., the -#' \href{https://patentsview.org/apis/api-endpoints/patents}{patents endpoint +#' \href{https://search.patentsview.org/docs/docs/Search%20API/SearchAPIReference/#patent}{patent endpoint #' field list table}). #' #' @format A data frame with the following columns: diff --git a/R/get-fields.R b/R/get-fields.R index 850eff89..062c804e 100644 --- a/R/get-fields.R +++ b/R/get-fields.R @@ -1,3 +1,9 @@ +#' @noRd +get_top_level_attributes <- function(endpoint) { + fieldsdf[fieldsdf$endpoint == endpoint & !grepl("\\.", fieldsdf$field), "field"] +} + + #' Get list of retrievable fields #' #' This function returns a vector of fields that you can retrieve from a given @@ -13,15 +19,18 @@ #' endpoint's fields (i.e., do not filter the field list based on group #' membership). See the field tables located online to see which groups you #' can specify for a given endpoint (e.g., the -#' \href{https://search.patentsview.org/docs/docs/Search%20API/SearchAPIReference/#patent}{patent +#' \href{https://search.patentsview.org/docs/docs/Search%20API/SearchAPIReference/#patent}{patents #' endpoint table}), or use the \code{fieldsdf} table #' (e.g., \code{unique(fieldsdf[fieldsdf$endpoint == "patent", "group"])}). +#' @param include_pk Boolean on whether to include the endpoint's primary key, +#' defaults to FALSE. The primary key is needed if you plan on calling +#' \code{\link{unnest_pv_data}} on the results of \code{\link{search_pv}} #' #' @return A character vector with field names. #' #' @examples -#' # Get all assignee-level fields for the patent endpoint: -#' fields <- get_fields(endpoint = "patent", groups = "assignees") +#' # Get all top level (non-nested) fields for the patent endpoint: +#' fields <- get_fields(endpoint = "patent", groups = c("patents")) #' #' # ...Then pass to search_pv: #' \dontrun{ @@ -31,7 +40,7 @@ #' fields = fields #' ) #' } -#' # Get all patent and assignee-level fields for the patent endpoint: +#' # Get unnested patent and assignee-level fields for the patent endpoint: #' fields <- get_fields(endpoint = "patent", groups = c("assignees", "patents")) #' #' \dontrun{ @@ -41,15 +50,49 @@ #' fields = fields #' ) #' } +#' # Get the nested inventors fields and the primary key in order to call unnest_pv_data +#' # on the returned data. unnest_pv_data would throw an error if the primary key was +#' # not present in the results. +#' fields <- get_fields(endpoint = "patent", groups = c("inventors"), include_pk = TRUE) +#' +#' \dontrun{ +#' # ...Then pass to search_pv and unnest the results +#' results <- search_pv( +#' query = '{"_gte":{"patent_date":"2007-01-04"}}', +#' fields = fields +#' ) +#' unnest_pv_data(results$data) +#' } #' #' @export -get_fields <- function(endpoint, groups = NULL) { +get_fields <- function(endpoint, groups = NULL, include_pk = FALSE) { validate_endpoint(endpoint) + + # using API's shorthand notation, group names can be requested as fields instead of + # fully qualifying each nested field. Fully qualified, all patent endpoint's attributes + # is over 4K, too big to be sent on a GET with a modest query + + pk <- get_ok_pk(endpoint) + plural_entity <- fieldsdf[fieldsdf$endpoint == endpoint & fieldsdf$field == pk, "group"] + top_level_attributes <- get_top_level_attributes(endpoint) + if (is.null(groups)) { - fieldsdf[fieldsdf$endpoint == endpoint, "field"] + c( + top_level_attributes, + unique(fieldsdf[fieldsdf$endpoint == endpoint & fieldsdf$group != plural_entity, "group"]) + ) } else { validate_groups(endpoint, groups = groups) - fieldsdf[fieldsdf$endpoint == endpoint & fieldsdf$group %in% groups, "field"] + + # don't include pk if plural_entity group is requested (pk would be a member) + extra_field <- if (include_pk && !plural_entity %in% groups) pk else NULL + extra_fields <- if (plural_entity %in% groups) top_level_attributes else NULL + + c( + extra_field, + extra_fields, + groups[!groups == plural_entity] + ) } } diff --git a/R/print.R b/R/print.R index 65bb53fe..025e7b4f 100644 --- a/R/print.R +++ b/R/print.R @@ -24,7 +24,10 @@ print.pv_data_result <- function(x, ...) { ) utils::str( - x, vec.len = 1, max.level = 2, give.attr = FALSE, strict.width = "cut" + x, vec.len = 1, max.level = 2, give.attr = FALSE, strict.width = "cut", + formatNum = function(x, ...) { + format(x, trim = TRUE, drop0trailing = TRUE, scientific = FALSE, ...) + } ) } diff --git a/R/process-error.R b/R/process-error.R deleted file mode 100644 index 4136f5d7..00000000 --- a/R/process-error.R +++ /dev/null @@ -1,57 +0,0 @@ -#' @noRd -throw_er <- function(resp) { - throw_if_loc_error(resp) - xheader_er_or_status(resp) -} - -#' @noRd -throw_if_loc_error <- function(resp) { - if (hit_locations_ep(resp$url) && httr::status_code(resp) == 500) { - num_grps <- get_num_groups(resp$url) - if (num_grps > 2) { - stop2( - "Your request resulted in a 500 error, likely because you have ", - "requested too many fields in your request (the location endpoint ", - "currently has restrictions on the number of fields/groups you can ", - "request). Try slimming down your field list and trying again." - ) - } - } -} - -# Not sure this is still applicable -#' @noRd -hit_locations_ep <- function(url) { - grepl( - "^https://search.patentsview.org/api/v1/location/", - url, - ignore.case = TRUE - ) -} - -#' @noRd -get_num_groups <- function(url) { - prsd_json_filds <- gsub(".*&f=([^&]*).*", "\\1", utils::URLdecode(url)) - fields <- jsonlite::fromJSON(prsd_json_filds) - grps <- fieldsdf[fieldsdf$endpoint == "location" & - fieldsdf$field %in% fields, "group"] - length(unique(grps)) -} - -#' @noRd -xheader_er_or_status <- function(resp) { - - # look for the api's ultra-helpful X-Status-Reason header - xhdr <- get_x_status(resp) - - if (length(xhdr) != 1) - httr::stop_for_status(resp) - else - stop(xhdr[[1]], call. = FALSE) -} - -#' @noRd -get_x_status <- function(resp) { - headers <- httr::headers(resp) - headers[grepl("x-status-reason$", names(headers), ignore.case = TRUE)] -} diff --git a/R/process-resp.R b/R/process-resp.R index 4fb9ed3d..426f3f4b 100644 --- a/R/process-resp.R +++ b/R/process-resp.R @@ -1,23 +1,10 @@ -#' @noRd -parse_resp <- function(resp) { - j <- httr::content(resp, as = "text", encoding = "UTF-8") - jsonlite::fromJSON( - j, - simplifyVector = TRUE, simplifyDataFrame = TRUE, simplifyMatrix = TRUE - ) -} - #' @noRd get_request <- function(resp) { gp <- structure( - list(method = resp$req$method, url = resp$req$url), + list(method = resp$request$method, url = resp$request$url), class = c("list", "pv_request") ) - if (gp$method == "POST") { - gp$body <- rawToChar(resp$req$options$postfields) - } - gp } @@ -42,11 +29,10 @@ get_query_results <- function(prsd_resp) { #' @noRd process_resp <- function(resp) { - if (httr::http_error(resp)) throw_er(resp) - prsd_resp <- parse_resp(resp) request <- get_request(resp) data <- get_data(prsd_resp) + query_results <- get_query_results(prsd_resp) structure( diff --git a/R/query-dsl.R b/R/query-dsl.R index 51bf0640..f3ea00ef 100644 --- a/R/query-dsl.R +++ b/R/query-dsl.R @@ -54,12 +54,31 @@ create_not_fun <- function(fun) { } } +#' @noRd +create_in_range_fun <- function(fun) { + force(fun) + function(...) { + value_p <- list(...) + field <- names(value_p) + value <- unlist(value_p) + names(value) <- NULL + + # throw an error if the length isn't two + asrt(length(value) == 2, fun, " expects a range of exactly two arguments") + + low <- create_one_fun(field = field, value = value[1], fun = "gte") + high <- create_one_fun(field = field, value = value[2], fun = "lte") + z <- list(`_and` = list(low, high)) + + structure(z, class = c(class(z), "pv_query")) + } +} + #' List of query functions #' #' A list of functions that make it easy to write PatentsView queries. See the -#' details section below for a list of the 14 functions, as well as the -#' \href{https://docs.ropensci.org/patentsview/articles/writing-queries.html}{writing -#' queries vignette} for further details. +#' details section below for a list of the 15 functions, as well as the +#' \href{../articles/writing-queries.html}{writing queries vignette} for further details. #' #' @details #' @@ -109,6 +128,13 @@ create_not_fun <- function(fun) { #' \item \code{not} - The comparison is not true #' } #' +#' \strong{4. Convenience function} \cr +#' +#' There is 1 convenience function: +#' \itemize{ +#' \item \code{in_range} - Builds a <= x <= b query +#' } +#' #' @return An object of class \code{pv_query}. This is basically just a simple #' list with a print method attached to it. #' @@ -117,6 +143,10 @@ create_not_fun <- function(fun) { #' #' qry_funs$not(qry_funs$eq(patent_date = "2001-01-01")) #' +#' qry_funs$in_range(patent_year = c(2010, 2021)) +#' +#' qry_funs$in_range(patent_date = c("1976-01-01", "1983-02-28")) + #' @export qry_funs <- c( lapply2( @@ -126,7 +156,8 @@ qry_funs <- c( ), create_key_fun ), lapply2(c("and", "or"), create_array_fun), - lapply2("not", create_not_fun) + lapply2("not", create_not_fun), + lapply2("in_range", create_in_range_fun) ) #' With qry_funs @@ -144,26 +175,25 @@ qry_funs <- c( #' @return The result of \code{code} - i.e., your query. #' #' @examples -#' # Without with_qfuns, we have to do: #' qry_funs$and( #' qry_funs$gte(patent_date = "2007-01-01"), #' qry_funs$text_phrase(patent_abstract = c("computer program")), #' qry_funs$or( -#' qry_funs$eq(inventor_last_name = "ihaka"), -#' qry_funs$eq(inventor_first_name = "chris") +#' qry_funs$eq(inventors.inventor_name_last = "Ihaka"), +#' qry_funs$eq(inventors.inventor_name_last = "Chris") #' ) #' ) #' -#' #...With it, this becomes: +#' # ...With it, this becomes: #' with_qfuns( -#' and( -#' gte(patent_date = "2007-01-01"), -#' text_phrase(patent_abstract = c("computer program")), -#' or( -#' eq(inventor_last_name = "ihaka"), -#' eq(inventor_first_name = "chris") -#' ) -#' ) +#' and( +#' gte(patent_date = "2007-01-01"), +#' text_phrase(patent_abstract = c("computer program")), +#' or( +#' eq(inventors.inventor_name_last = "Ihaka"), +#' eq(inventors.inventor_name_last = "Chris") +#' ) +#' ) #' ) #' #' @export diff --git a/R/search-pv.R b/R/search-pv.R index 7e711c0b..0f6b32c0 100644 --- a/R/search-pv.R +++ b/R/search-pv.R @@ -11,26 +11,41 @@ tojson_2 <- function(x, ...) { } #' @noRd -to_arglist <- function(fields, page, per_page, sort) { +to_arglist <- function(fields, size, sort, after) { + opts <- list(size = size) + if (!is.null(after)) { + opts$after <- after + } + list( fields = fields, sort = list(as.list(sort)), - opts = list( - offset = (page - 1) * per_page, - size = per_page - ) + opts = opts ) } +#' @noRd +set_sort_param <- function(before) { + # Fixes former bug + # for sort = c("patent_id" = "asc", "citation_patent_id" = "asc") + # we sent [{"patent_id":"asc","citation_patent_id":"asc"}] + # API wants [{"patent_id": "asc" },{"citation_patent_id": "asc" }] + # TODO(any): brute meet force- there must be a better way... + after <- tojson_2(before, auto_unbox = TRUE) + after <- gsub('","', '"},{"', after) + after +} + #' @noRd get_get_url <- function(query, base_url, arg_list) { j <- paste0( base_url, "?q=", utils::URLencode(query, reserved = TRUE), "&f=", tojson_2(arg_list$fields), - "&o=", tojson_2(arg_list$opts, auto_unbox = TRUE), - "&s=", tojson_2(arg_list$sort, auto_unbox = TRUE) + "&s=", set_sort_param(arg_list$sort), + "&o=", tojson_2(arg_list$opts, auto_unbox = TRUE) ) + utils::URLencode(j) } @@ -40,87 +55,106 @@ get_post_body <- function(query, arg_list) { "{", '"q":', query, ",", '"f":', tojson_2(arg_list$fields), ",", - '"o":', tojson_2(arg_list$opts, auto_unbox = TRUE), ",", - '"s":', tojson_2(arg_list$sort, auto_unbox = TRUE), + '"s":', set_sort_param(arg_list$sort), ",", + '"o":', tojson_2(arg_list$opts, auto_unbox = TRUE), "}" ) - gsub('(,"[fs]":)([,}])', paste0("\\1", "{}", "\\2"), body) + # The API can now act weirdly if we pass f:{},s:{} as we did in the past. + # (Weirdly in that the post results may not equal the get results or posts error out) + # Now we'd remove "f":, and "s":, We're guaranteed to have q: and at least "size":1000 as o: + gsub('("[fs]":,)', "", body) } #' @noRd -one_request <- function(method, query, base_url, arg_list, api_key, ...) { - ua <- httr::user_agent("https://github.com/ropensci/patentsview") +patentsview_error_body <- function(resp) { + if (httr2::resp_status(resp) == 400) c(httr2::resp_header(resp, "X-Status-Reason")) else NULL +} +#' @noRd +one_request <- function(method, query, base_url, arg_list, api_key, ...) { if (method == "GET") { get_url <- get_get_url(query, base_url, arg_list) - resp <- httr::GET( - get_url, - httr::add_headers("X-Api-Key" = api_key), - ua, ... - ) + req <- httr2::request(get_url) |> + httr2::req_method("GET") } else { body <- get_post_body(query, arg_list) - resp <- httr::POST( - base_url, - httr::add_headers( - "X-Api-Key" = api_key, - "Content-Type" = "application/json" - ), - body = body, - ua, ... - ) + req <- httr2::request(base_url) |> + httr2::req_body_raw(body) |> + httr2::req_headers("Content-Type" = "application/json") |> + httr2::req_method("POST") } - # Sleep and retry on a 429 (too many requests). The Retry-After header is the - # seconds to sleep - if (httr::status_code(resp) == 429) { - num_seconds <- httr::headers(resp)[["Retry-After"]] - maybe_an_s <- if (num_seconds == "1") "" else "s" - message(paste0( - "The API's requests per minute limit has been reached. ", - "Pausing for ", num_seconds, " second", maybe_an_s, - " before continuing." - )) - Sys.sleep(num_seconds) - - one_request(method, query, base_url, arg_list, api_key, ...) - } else { - resp + resp <- req |> + httr2::req_user_agent("https://github.com/ropensci/patentsview") |> + httr2::req_options(...) |> + httr2::req_retry(max_tries = 20) |> # automatic 429 Retry-After + httr2::req_headers("X-Api-Key" = api_key, .redact = "X-Api-Key") |> + httr2::req_error(body = patentsview_error_body) |> + httr2::req_perform() + + resp +} + +#' Pad patent_id +#' +#' This function strategically pads a patent_id with zeroes to 8 characters, +#' needed only for custom paging that uses sorts by patent_id. +#' +#' @param patent_id The patent_id that needs to be padded. It can +#' be the patent_id for a utility, design, plant or reissue patent. +#' +#' @examples +#' \dontrun{ +#' padded <- pad_patent_id("RE36479") +#' +#' padded2 <- pad_patent_id("3930306") +#' } +#' +#' @export +# zero pad patent_id to 8 characters. +pad_patent_id <- function(patent_id) { + pad <- 8 - nchar(patent_id) + if (pad > 0) { + patent_id <- paste0(sprintf("%0*d", pad, 0), patent_id) + patent_id <- sub("(0+)([[:alpha:]]+)([[:digit:]]+)", "\\2\\1\\3", patent_id) } + patent_id } #' @noRd request_apply <- function(ex_res, method, query, base_url, arg_list, api_key, ...) { matched_records <- ex_res$query_results[[1]] req_pages <- ceiling(matched_records / arg_list$opts$size) - if (req_pages < 1) { - stop2("No records matched your query...Can't download multiple pages") - } - if (matched_records > 10000) { - stop2( - "The API only allows you to download 10,000 records in a single query. ", - "Your query returned ", matched_records, " records. See for ", - "how to get around this limitation." - ) - } - if (req_pages > 10) { - stop2( - "The API only allows you to download 10 pages in a single query. ", - "Your query returned ", req_pages, " pages. See for ", - "how to get around this limitation." - ) - } tmp <- lapply(seq_len(req_pages), function(i) { - arg_list$opts$offset <- (i - 1) * arg_list$opts$size x <- one_request(method, query, base_url, arg_list, api_key, ...) x <- process_resp(x) + + # now to page we need to set the "after" attribute to where we left off + # we want the last value of the primary sort field + s <- names(arg_list$sort[[1]])[[1]] + index <- nrow(x$data[[1]]) + last_value <- x$data[[1]][[s]][[index]] + + if (s == "patent_id") { + last_value <- pad_patent_id(last_value) + } + + arg_list$opts$after <<- last_value + x$data[[1]] }) do.call("rbind", c(tmp, make.row.names = FALSE)) } +#' @noRd +get_default_sort <- function(endpoint) { + default <- c("asc") + names(default) <- get_ok_pk(endpoint) + default +} + #' Search PatentsView #' #' This function makes an HTTP request to the PatentsView API for data matching @@ -137,51 +171,62 @@ request_apply <- function(ex_res, method, query, base_url, arg_list, api_key, .. #' #' \item An object of class \code{pv_query}, which you create by calling one #' of the functions found in the \code{\link{qry_funs}} list...See the -#' \href{https://docs.ropensci.org/patentsview/articles/writing-queries.html}{writing +#' \href{../articles/writing-queries.html}{writing #' queries vignette} for details.\cr #' E.g., \code{qry_funs$gte(patent_date = "2007-01-04")} #' } #' @param fields A character vector of the fields that you want returned to you. -#' A value of \code{NULL} indicates that the default fields should be -#' returned. Acceptable fields for a given endpoint can be found at the API's +#' A value of \code{NULL} indicates to the API that it should return the default fields +#' for that endpoint. Acceptable fields for a given endpoint can be found at the API's #' online documentation (e.g., check out the field list for the -#' \href{https://patentsview.org/apis/api-endpoints/patents}{patents +#' \href{https://search.patentsview.org/docs/docs/Search%20API/SearchAPIReference/#patent}{patents #' endpoint}) or by viewing the \code{fieldsdf} data frame #' (\code{View(fieldsdf)}). You can also use \code{\link{get_fields}} to list #' out the fields available for a given endpoint. +#' +#' Nested fields can be fully qualified, e.g., "application.filing_date" or the +#' group name can be used to retrieve all of its nested fields, E.g. "application". +#' The latter would be similar to passing \code{get_fields("patent", group = "application")} +#' except it's the API that decides what fields to return. #' @param endpoint The web service resource you wish to search. Use #' \code{get_endpoints()} to list the available endpoints. -#' @param subent_cnts `r lifecycle::badge("deprecated")` Non-matched subentities -#' will always be returned under the new version of the API +#' @param subent_cnts `r lifecycle::badge("deprecated")` This is always FALSE in the +#' new version of the API as the total counts of unique subentities is no longer available. #' @param mtchd_subent_only `r lifecycle::badge("deprecated")` This is always -#' FALSE in the new version of the API. -#' @param page The page number of the results that should be returned. -#' @param per_page The number of records that should be returned per page. This -#' value can be as high as 1,000 (e.g., \code{per_page = 1000}). +#' FALSE in the new version of the API as non-matched subentities +#' will always be returned. +#' @param page `r lifecycle::badge("deprecated")` The new version of the API does not use +#' \code{page} as a parameter for paging, it uses \code{after}. +#' @param per_page `r lifecycle::badge("deprecated")` The API now uses \code{size} +#' @param size The number of records that should be returned per page. This +#' value can be as high as 1,000 (e.g., \code{size = 1000}). +#' @param after A list of sort key values that defaults to NULL. This +#' exposes the API's paging parameter for users who want to implement their own +#' paging. It cannot be set when \code{all_pages = TRUE} as the R package manipulates it +#' for users automatically. See \href{../articles/result-set-paging.html}{result set paging} #' @param all_pages Do you want to download all possible pages of output? If -#' \code{all_pages = TRUE}, the values of \code{page} and \code{per_page} are -#' ignored. +#' \code{all_pages = TRUE}, the value of \code{size} is ignored. #' @param sort A named character vector where the name indicates the field to #' sort by and the value indicates the direction of sorting (direction should -#' be either "asc" or "desc"). For example, \code{sort = c("patent_number" = -#' "asc")} or \cr\code{sort = c("patent_number" = "asc", "patent_date" = +#' be either "asc" or "desc"). For example, \code{sort = c("patent_id" = +#' "asc")} or \cr\code{sort = c("patent_id" = "asc", "patent_date" = #' "desc")}. \code{sort = NULL} (the default) means do not sort the results. #' You must include any fields that you wish to sort by in \code{fields}. #' @param method The HTTP method that you want to use to send the request. #' Possible values include "GET" or "POST". Use the POST method when #' your query is very long (say, over 2,000 characters in length). #' @param error_browser `r lifecycle::badge("deprecated")` -#' @param api_key API key. See \href{https://patentsview.org/apis/keyrequest}{ -#' Here} for info on creating a key. -#' @param ... Arguments passed along to httr's \code{\link[httr]{GET}} or -#' \code{\link[httr]{POST}} function. +#' @param api_key API key, it defaults to Sys.getenv("PATENTSVIEW_API_KEY"). Request a key +#' \href{https://patentsview-support.atlassian.net/servicedesk/customer/portals}{here}. +#' @param ... Curl options passed along to httr2's \code{\link[httr2]{req_options}} +#' when we do GETs or POSTs. #' #' @return A list with the following three elements: #' \describe{ #' \item{data}{A list with one element - a named data frame containing the #' data returned by the server. Each row in the data frame corresponds to a #' single value for the primary entity. For example, if you search the -#' assignees endpoint, then the data frame will be on the assignee-level, +#' assignee endpoint, then the data frame will be on the assignee-level, #' where each row corresponds to a single assignee. Fields that are not on #' the assignee-level would be returned in list columns.} #' @@ -207,8 +252,8 @@ request_apply <- function(ex_res, method, query, base_url, arg_list, api_key, .. #' search_pv( #' query = qry_funs$gt(patent_year = 2010), #' method = "POST", -#' fields = "patent_number", -#' sort = c("patent_number" = "asc") +#' fields = "patent_id", +#' sort = c("patent_id" = "asc") #' ) #' #' search_pv( @@ -223,9 +268,14 @@ request_apply <- function(ex_res, method, query, base_url, arg_list, api_key, .. #' ) #' #' search_pv( -#' query = qry_funs$contains(inventors_at_grant.name_last = "Smith"), +#' query = qry_funs$contains(inventors.inventor_name_last = "Smith"), #' endpoint = "patent", -#' config = httr::timeout(40) +#' timeout = 40 +#' ) +#' +#' search_pv( +#' query = qry_funs$eq(patent_id = "11530080"), +#' fields = "application" #' ) #' } #' @@ -235,67 +285,187 @@ search_pv <- function(query, endpoint = "patent", subent_cnts = FALSE, mtchd_subent_only = lifecycle::deprecated(), - page = 1, - per_page = 1000, + page = lifecycle::deprecated(), + per_page = lifecycle::deprecated(), + size = 1000, + after = NULL, all_pages = FALSE, sort = NULL, method = "GET", error_browser = NULL, api_key = Sys.getenv("PATENTSVIEW_API_KEY"), ...) { - - validate_args(api_key, fields, endpoint, method, page, per_page, sort) - deprecate_warn_all(error_browser, subent_cnts, mtchd_subent_only) + validate_args(api_key, fields, endpoint, method, sort, after, size, all_pages) + deprecate_warn_all(error_browser, subent_cnts, mtchd_subent_only, page, per_page) + if (lifecycle::is_present(per_page)) size <- per_page if (is.list(query)) { - # check_query(query, endpoint) + check_query(query, endpoint) query <- jsonlite::toJSON(query, auto_unbox = TRUE) } - arg_list <- to_arglist(fields, page, per_page, sort) + + arg_list <- to_arglist(fields, size, sort, after) base_url <- get_base(endpoint) result <- one_request(method, query, base_url, arg_list, api_key, ...) result <- process_resp(result) - if (!all_pages) return(result) - full_data <- request_apply(result, method, query, base_url, arg_list, api_key, ...) - result$data[[1]] <- full_data + if (all_pages && result$query_result$total_hits == 0) { + stop2("No records matched your query...Can't download multiple pages") + } + # return if we don't need to make additional API requests + if (!all_pages || + result$query_result$total_hits == 0 || + result$query_result$total_hits == nrow(result$data[[1]])) { + return(result) + } + + # Here we ignore the user's sort and instead have the API sort by the primary + # key for the requested endpoint. + primary_sort_key <- get_default_sort(endpoint) + + # We check what fields we got back from the first call. If the user didn't + # specify fields, we'd get back the API's defaults. We may need to request + # additional fields from the API so we can apply the users sort and then remove + # the additional fields. + returned_fields <- names(result$data[[1]]) + + if (!is.null(sort)) { + sort_fields <- names(sort) + additional_fields <- sort_fields[!(sort_fields %in% returned_fields)] + if (is.null(fields)) { + fields <- returned_fields # the default fields + } else { + fields <- fields # user passed + } + fields <- append(fields, additional_fields) + } else { + additional_fields <- c() + } + + arg_list <- to_arglist(fields, size, primary_sort_key, after) + paged_data <- request_apply(result, method, query, base_url, arg_list, api_key, ...) + + # we apply the user's sort, if they supplied one, using order() + # was data.table::setorderv(paged_data, names(sort), ifelse(as.vector(sort) == "asc", 1, -1)) + if (!is.null(sort)) { + sort_order <- mapply(function(col, direction) { + if (direction == "asc") { + return(paged_data[[col]]) + } else { + return(-rank(paged_data[[col]], ties.method = "min")) # Invert for descending order + } + }, col = names(sort), direction = as.vector(sort), SIMPLIFY = FALSE) + + # Final sorting + paged_data <- paged_data[do.call(order, sort_order), , drop = FALSE] + } + + # remove the fields we added in order to do the user's sort ourselves + paged_data <- paged_data[, !names(paged_data) %in% additional_fields] + + result$data[[1]] <- paged_data result } -#' Get Linked Data +#' Retrieve Linked Data #' #' Some of the endpoints now return HATEOAS style links to get more data. E.g., -#' the inventors endpoint may return a link such as: -#' "https://search.patentsview.org/api/v1/inventor/252373/" +#' the patent endpoint may return a link such as: +#' "https://search.patentsview.org/api/v1/inventor/fl:th_ln:jefferson-1/" +#' +#' @param url A link that was returned by the API on a previous call, an example +#' in the documentation or a Request URL from the \href{https://search.patentsview.org/swagger-ui/}{API's Swagger UI page}. +#' +#' @param encoded_url boolean to indicate whether the url has been URL encoded, defaults to FALSE. +#' Set to TRUE for Request URLs from Swagger UI. +#' +#' @param ... Curl options passed along to httr2's \code{\link[httr2]{req_options}} function. +#' +#' @return A list with the following three elements: +#' \describe{ +#' \item{data}{A list with one element - a named data frame containing the +#' data returned by the server. Each row in the data frame corresponds to a +#' single value for the primary entity. For example, if you search the +#' assignee endpoint, then the data frame will be on the assignee-level, +#' where each row corresponds to a single assignee. Fields that are not on +#' the assignee-level would be returned in list columns.} +#' +#' \item{query_results}{Entity counts across all pages of output (not just +#' the page returned to you).} #' -#' @param url The link that was returned by the API on a previous call. +#' \item{request}{Details of the GET HTTP request that was sent to the server.} +#' } #' -#' @inherit search_pv return #' @inheritParams search_pv #' #' @examples #' \dontrun{ #' #' retrieve_linked_data( -#' "https://search.patentsview.org/api/v1/cpc_subgroup/G01S7:4811/" -#' ) +#' "https://search.patentsview.org/api/v1/cpc_group/G01S7:4811/" +#' ) +#' +#' endpoint_url <- "https://search.patentsview.org/api/v1/patent/" +#' q_param <- '?q={"_text_any":{"patent_title":"COBOL cotton gin"}}' +#' s_and_o_params <- '&s=[{"patent_id": "asc" }]&o={"size":50}' +#' f_param <- '&f=["inventors.inventor_name_last","patent_id","patent_date","patent_title"]' +#' # (URL broken up to avoid a long line warning in this Rd) +#' +#' retrieve_linked_data( +#' paste0(endpoint_url, q_param, s_and_o_params, f_param) +#' ) +#' +#' retrieve_linked_data( +#' "https://search.patentsview.org/api/v1/patent/?q=%7B%22patent_date%22%3A%221976-01-06%22%7D", +#' encoded_url = TRUE +#' ) #' } #' #' @export retrieve_linked_data <- function(url, + encoded_url = FALSE, api_key = Sys.getenv("PATENTSVIEW_API_KEY"), ... ) { + if (encoded_url) { + url <- utils::URLdecode(url) + } - # Don't sent the API key to any domain other than patentsview.org - if (!grepl("^https://[^/]*\\.patentsview.org/", url)) { + # There wouldn't be url parameters on a HATEOAS link but we'll also accept + # example urls from the documentation, where there could be parameters + url_peices <- httr2::url_parse(url) + + # Only send the API key to subdomains of patentsview.org + if (!grepl("^.*\\.patentsview.org$", url_peices$hostname)) { stop2("retrieve_linked_data is only for patentsview.org urls") } + params <- list() + query <- "" + + if (!is.null(url_peices$query)) { + # Need to change f to fields vector, s to sort vector and o to opts + # There is probably a whizbangy better way to do this in R + if (!is.null(url_peices$query$f)) { + params$fields <- unlist(strsplit(gsub("[\\[\\]]", "", url_peices$query$f, perl = TRUE), ",\\s*")) + } + + if (!is.null(url_peices$query$s)) { + params$sort <- jsonlite::fromJSON(sub(".*s=([^&]*).*", "\\1", url)) + } + + if (!is.null(url_peices$query$o)) { + params$opts <- jsonlite::fromJSON(sub(".*o=([^&]*).*", "\\1", url)) + } + + query <- if (!is.null(url_peices$query$q)) sub(".*q=([^&]*).*", "\\1", url) else "" + url <- paste0(url_peices$scheme, "://", url_peices$hostname, url_peices$path) + } + # Go through one_request, which handles resend on throttle errors - # The API doesn't seem to mind ?q=&f=&o=&s= appended to the URL - res <- one_request("GET", "", url, list(), api_key, ...) + # The API doesn't seem to mind ?q=&f=&o=&s= appended to HATEOAS URLs + res <- one_request("GET", query, url, params, api_key, ...) process_resp(res) } diff --git a/R/unnest-pv-data.R b/R/unnest-pv-data.R index a86ca441..19b29370 100644 --- a/R/unnest-pv-data.R +++ b/R/unnest-pv-data.R @@ -4,24 +4,50 @@ #' in \code{\link{unnest_pv_data}}, based on the endpoint you searched. #' It will return a potential unique identifier for a given entity (i.e., a #' given endpoint). For example, it will return "patent_id" when -#' \code{endpoint = "patent"}. +#' \code{endpoint_or_entity = "patent"}. It would return the same value if +#' the entity name "patents" was passed via \code{get_ok_pk(names(pv_return$data))} +#' where pv_return was returned from \code{\link{search_pv}}. #' -#' @param endpoint The endpoint which you would like to know a potential primary -#' key for. +#' @param endpoint_or_entity The endpoint or entity name for which you +#' would like to know a potential primary key for. #' #' @return The name of a primary key (\code{pk}) that you could pass to #' \code{\link{unnest_pv_data}}. #' #' @examples -#' get_ok_pk(endpoint = "inventor") -#' get_ok_pk(endpoint = "cpc_subclass") -#' get_ok_pk("publication/rel_app_text") +#' get_ok_pk(endpoint_or_entity = "inventor") # Returns "inventor_id" +#' get_ok_pk(endpoint_or_entity = "cpc_group") # Returns "cpc_group_id" #' #' @export -get_ok_pk <- function(endpoint) { +get_ok_pk <- function(endpoint_or_entity) { + endpoint_df <- fieldsdf[fieldsdf$endpoint == endpoint_or_entity, ] + if (nrow(endpoint_df) > 0) { + endpoint <- endpoint_or_entity + } else { + endpoint_df <- fieldsdf[fieldsdf$group == endpoint_or_entity, ] + endpoint <- unique(endpoint_df$endpoint) + + # watch out here- several endpoints return entities that are groups returned + # by the patent and publication endpoints (attourneys, inventors, assignees) + if(length(endpoint) > 1) { + endpoint <- endpoint[!endpoint %in% c("patent", "publication")] + endpoint_df <- fieldsdf[fieldsdf$endpoint == endpoint, ] + } + } + unnested_endpoint <- sub("^(patent|publication)/", "", endpoint) possible_pks <- c("patent_id", "document_number", paste0(unnested_endpoint, "_id")) - fieldsdf[fieldsdf$endpoint == endpoint & fieldsdf$field %in% possible_pks, "field"] + pk <- endpoint_df[endpoint_df$field %in% possible_pks, "field"] + + # we're unable to determine the pk if an entity name of rel_app_texts was passed + asrt( + length(pk) == 1, + "The primary key cannot be determined for ", endpoint_or_entity, + ". Try using the endpoint's name instead ", + paste(unique(fieldsdf[fieldsdf$group == endpoint_or_entity, "endpoint"]), collapse = ", ") + ) + + pk } #' Unnest PatentsView data @@ -39,8 +65,8 @@ get_ok_pk <- function(endpoint) { #' inside it. See examples. #' @param pk The column/field name that will link the data frames together. This #' should be the unique identifier for the primary entity. For example, if you -#' used the patents endpoint in your call to \code{search_pv}, you could -#' specify \code{pk = "patent_number"}. \strong{This identifier has to have +#' used the patent endpoint in your call to \code{search_pv}, you could +#' specify \code{pk = "patent_id"}. \strong{This identifier has to have #' been included in your \code{fields} vector when you called #' \code{search_pv}}. You can use \code{\link{get_ok_pk}} to suggest a #' potential primary key for your data. @@ -58,12 +84,20 @@ get_ok_pk <- function(endpoint) { #' } #' #' @export -unnest_pv_data <- function(data, pk = get_ok_pk(names(data))) { - +unnest_pv_data <- function(data, pk = NULL) { validate_pv_data(data) df <- data[[1]] + if (is.null(pk)) { + # now there are two endpoints that return rel_app_texts entities with different pks + if (names(data) == "rel_app_texts") { + pk <- if ("document_number" %in% names(df)) "document_number" else "patent_id" + } else { + pk = get_ok_pk(names(data)) + } + } + asrt( pk %in% colnames(df), pk, " not in primary entity data frame...Did you include it in your ", @@ -75,14 +109,12 @@ unnest_pv_data <- function(data, pk = get_ok_pk(names(data))) { sub_ent_df <- df[, !prim_ent_var, drop = FALSE] sub_ents <- colnames(sub_ent_df) - ok_pk <- get_ok_pk(names(data)) - out_sub_ent <- lapply2(sub_ents, function(x) { temp <- sub_ent_df[[x]] asrt( length(unique(df[, pk])) == length(temp), pk, " cannot act as a primary key because it is not a unique identifier.\n\n", - "Try using ", ok_pk, " instead." + "Try using ", pk, " instead." ) names(temp) <- df[, pk] xn <- do.call("rbind", temp) diff --git a/R/utils.R b/R/utils.R index cc487623..a6f50a6b 100644 --- a/R/utils.R +++ b/R/utils.R @@ -4,6 +4,16 @@ stop2 <- function(...) stop(..., call. = FALSE) #' @noRd asrt <- function(expr, ...) if (!expr) stop2(...) +#' @noRd +parse_resp <- function(resp) { + j <- resp |> httr2::resp_body_string(encoding = "UTF-8") + + jsonlite::fromJSON( + j, + simplifyVector = TRUE, simplifyDataFrame = TRUE, simplifyMatrix = TRUE + ) +} + #' @noRd format_num <- function(x) { format( diff --git a/R/validate-args.R b/R/validate-args.R index a669e182..31dd6467 100644 --- a/R/validate-args.R +++ b/R/validate-args.R @@ -8,14 +8,23 @@ validate_endpoint <- function(endpoint) { } #' @noRd -validate_args <- function(api_key, fields, endpoint, method, page, per_page, - sort) { +validate_args <- function(api_key, fields, endpoint, method, + sort, after, size, all_pages) { asrt( !identical(api_key, ""), "The new version of the API requires an API key" ) flds_flt <- fieldsdf[fieldsdf$endpoint == endpoint, "field"] + + # Now the API allows the group name to be requested as in fields to get all of + # the group's nested fields. ex.: "assignees" on the patent endpoint gets you all + # of the assignee fields. Note that "patents" can't be requested + groups <- unique(fieldsdf[fieldsdf$endpoint == endpoint, c("group")]) + pk <- get_ok_pk(endpoint) + plural_entity <- fieldsdf[fieldsdf$endpoint == endpoint & fieldsdf$field == pk, "group"] + flds_flt <- append(flds_flt, groups[!groups == plural_entity]) + asrt( all(fields %in% flds_flt), "Bad field(s): ", paste(fields[!(fields %in% flds_flt)], collapse = ", ") @@ -27,27 +36,35 @@ validate_args <- function(api_key, fields, endpoint, method, page, per_page, all(method %in% c("GET", "POST"), length(method) == 1), "method must be either 'GET' or 'POST'" ) + asrt( - all(is.numeric(page), length(page) == 1, page >= 1), - "page must be a numeric value greater than 1" - ) - asrt( - all(is.numeric(per_page), length(per_page) == 1, per_page <= 1000), - "per_page must be a numeric value less than or equal to 1,000" + all(is.numeric(size), length(size) == 1, size <= 1000), + "size must be a numeric value less than or equal to 1,000" ) - if (!is.null(sort)) + + # Removed all(names(sort) %in% fields) Was it our requirement or the API's? + # It does seem to work when we don't request fields and rely on the API to sort + # using them. + if (!is.null(sort)) { asrt( all( - all(names(sort) %in% fields), all(sort %in% c("asc", "desc")), - !is.list(sort)), - "sort has to be a named character vector and each name has to be ", - "specified in the field argument. See examples" + all(sort %in% c("asc", "desc")), + !is.list(sort) + ), + "sort has to be a named character vector. See examples" ) + } + + asrt( + any(is.null(after), !all_pages), + "'after' cannot be set when all_pages = TRUE" + ) } #' @noRd validate_groups <- function(endpoint, groups) { ok_grps <- unique(fieldsdf[fieldsdf$endpoint == endpoint, "group"]) + asrt( all(groups %in% ok_grps), "for the ", endpoint, " endpoint, group must be one of the following: ", @@ -64,14 +81,14 @@ validate_pv_data <- function(data) { } #' @noRd -deprecate_warn_all <- function(error_browser, subent_cnts, mtchd_subent_only) { +deprecate_warn_all <- function(error_browser, subent_cnts, mtchd_subent_only, page, per_page) { if (!is.null(error_browser)) { lifecycle::deprecate_warn(when = "0.2.0", what = "search_pv(error_browser)") } # Was previously defaulting to FALSE and we're still defaulting to FALSE to # mirror the fact that the API doesn't support subent_cnts. Warn only if user - # tries to set subent_cnts to TRUE. - if (isTRUE(subent_cnts)) { + # tries to set subent_cnts to anything other than FALSE (test cases try NULL and 7) + if (!(is.logical(subent_cnts) && isFALSE(subent_cnts))) { lifecycle::deprecate_warn( when = "1.0.0", what = "search_pv(subent_cnts)", @@ -88,4 +105,21 @@ deprecate_warn_all <- function(error_browser, subent_cnts, mtchd_subent_only) { version of the API" ) } -} \ No newline at end of file + + if (lifecycle::is_present(per_page)) { + lifecycle::deprecate_warn( + when = "0.3.0", + what = "search_pv(per_page)", + details = "The new version of the API uses 'size' instead of 'per_page'", + with = "search_pv(size)" + ) + } + + if (lifecycle::is_present(page)) { + lifecycle::deprecate_warn( + when = "0.3.0", + what = "search_pv(page)", + details = "The new version of the API does not support the page parameter" + ) + } +} diff --git a/_pkgdown.yml b/_pkgdown.yml index 75d58fbb..fe54d8ba 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -23,6 +23,9 @@ reference: - get_ok_pk - cast_pv_data - retrieve_linked_data + - title: Utility + contents: + - pad_patent_id navbar: components: diff --git a/docs/reference/cast_pv_data.html b/docs/reference/cast_pv_data.html index 08ef48c0..ecc3f786 100644 --- a/docs/reference/cast_pv_data.html +++ b/docs/reference/cast_pv_data.html @@ -1,70 +1,13 @@ - - -
- - - - -This will cast the data fields returned by search_pv
so that
+
This will cast the data fields returned by search_pv
so that
they have their most appropriate data types (e.g., date, numeric, etc.).
cast_pv_data(data)+
cast_pv_data(data)
data | -The data returned by
+ Arguments+
|
-
---|
The same type of object that you passed into cast_pv_data
.
A data frame containing the names of retrievable and queryable fields for
-each of the 7 API endpoints. A yes/no flag (can_query
) indicates
-which fields can be included in the user's query. You can also find this
-data on the API's online documentation for each endpoint as well (e.g.,
-the patents
-endpoint field list table)
A data frame containing the names of retrievable fields for each of the +endpoints. You can find this data on the API's online documentation for each +endpoint as well (e.g., the +patent endpoint +field list table).
fieldsdf
+ fieldsdf
A data frame with the following columns:
The endpoint that this field record is for
The complete name of the field, including the parent group if +applicable
A data frame with 992 rows and 7 variables:
The endpoint that this field record is for
The name of the field
The field's data type (string, date, float, integer, - fulltext)
An indicator for whether the field can be included in - the user query for the given endpoint
The group the field belongs to
The field's common name
A description of the field
The field's input data type
The group the field belongs to
The field name without the parent group structure
This function reminds the user what the 7 possible PatentsView API endpoints +
This function reminds the user what the possible PatentsView API endpoints are.
get_endpoints()- - -
get_endpoints()
A character vector with the names of the 7 endpoints. Those endpoints are:
-assignees
cpc_subsections
inventors
locations
nber_subcategories
patents
uspc_mainclasses
A character vector with the names of each endpoint.
+get_endpoints() -#> [1] "assignees" "cpc_subsections" "inventors" -#> [4] "locations" "nber_subcategories" "patents" -#> [7] "uspc_mainclasses"
get_fields(endpoint, groups = NULL)
get_fields(endpoint, groups = NULL, include_pk = FALSE)
fieldsdf
table
(e.g., unique(fieldsdf[fieldsdf$endpoint == "patent", "group"])
).
+
+Boolean on whether to include the endpoint's primary key,
+defaults to FALSE. The primary key is needed if you plan on calling
+unnest_pv_data
on the results of search_pv
# Get all assignee-level fields for the patent endpoint:
-fields <- get_fields(endpoint = "patent", groups = "assignees")
+ # Get all top level (non-nested) fields for the patent endpoint:
+fields <- get_fields(endpoint = "patent", groups = c("patents"))
# ...Then pass to search_pv:
if (FALSE) {
@@ -139,9 +145,8 @@ Examples
fields = fields
)
}
-# Get all patent and assignee-level fields for the patent endpoint:
+# Get unnested patent and assignee-level fields for the patent endpoint:
fields <- get_fields(endpoint = "patent", groups = c("assignees", "patents"))
-#> Error: group must be one of the following: , assignee_years, inventor_years, applicants, application, assignees, attorneys, botanic, cpc_at_issue, cpc_current, examiners, figures, foreign_priority, gov_interest_contract_award_numbers, gov_interest_organizations, granted_pregrant_crosswalk, inventors, ipcr, pct_data, us_related_documents, us_term_of_grant, uspc_at_issue, wipo, us_parties
if (FALSE) {
# ...Then pass to search_pv:
@@ -150,6 +155,19 @@ Examples
fields = fields
)
}
+# Get the nested inventors fields and the primary key in order to call unnest_pv_data
+# on the returned data. unnest_pv_data would throw an error if the primary key was
+# not present in the results.
+fields <- get_fields(endpoint = "patent", groups = c("inventors"), include_pk = TRUE)
+
+if (FALSE) {
+# ...Then pass to search_pv and unnest the results
+results <- search_pv(
+ query = '{"_gte":{"patent_date":"2007-01-04"}}',
+ fields = fields
+)
+unnest_pv_data(results$data)
+}
unnest_pv_data
, based on the endpoint you searched.
It will return a potential unique identifier for a given entity (i.e., a
given endpoint). For example, it will return "patent_id" when
-endpoint = "patent"
.
+endpoint_or_entity = "patent"
. It would return the same value if
+the entity name "patents" was passed via get_ok_pk(names(pv_return$data))
+where pv_return was returned from search_pv
.
get_ok_pk(endpoint)
get_ok_pk(endpoint_or_entity)
The endpoint which you would like to know a potential primary -key for.
The endpoint or entity name for which you +would like to know a potential primary key for.
get_ok_pk(endpoint = "inventor")
+ get_ok_pk(endpoint_or_entity = "inventor") # Returns "inventor_id"
#> [1] "inventor_id"
-get_ok_pk(endpoint = "cpc_subclass")
-#> [1] "cpc_subclass_id"
-get_ok_pk("publication/rel_app_text")
-#> [1] "document_number"
+get_ok_pk(endpoint_or_entity = "cpc_group") # Returns "cpc_group_id"
+#> [1] "cpc_group_id"
This function strategically pads a patent_id with zeroes to 8 characters, +needed only for custom paging that uses sorts by patent_id.
+pad_patent_id(patent_id)
The patent_id that needs to be padded. It can +be the patent_id for a utility, design, plant or reissue patent.
if (FALSE) {
+padded <- pad_patent_id("RE36479")
+
+padded2 <- pad_patent_id("3930306")
+}
+
+
Developed by Christopher Baker, Russ Allen.
+Site built with pkgdown 2.0.9.
+Provides functions to simplify the 'PatentsView' API (https://patentsview.org/apis/purpose) query language, send GET and POST requests to the API's seven endpoints, and parse the data that comes back.
+Provides functions to simplify the 'PatentsView' API (https://patentsview.org/apis/purpose) query language, send GET and POST requests to the API's twenty seven endpoints, and parse the data that comes back.
A list of functions that make it easy to write PatentsView queries. See the -details section below for a list of the 14 functions, as well as the -writing -queries vignette for further details.
+details section below for a list of the 15 functions, as well as the +writing queries vignette for further details.qry_funs
-
-
- An object of class list
of length 14.
qry_funs
An object of class pv_query
. This is basically just a simple
- list with a print method attached to it.
An object of class list
of length 15.
1. Comparison operator functions
An object of class pv_query
. This is basically just a simple
+list with a print method attached to it.
1. Comparison operator functions
There are 6 comparison operator functions that work with fields of type -integer, float, date, or string:
eq
- Equal to
eq
- Equal to
neq
- Not equal to
gt
- Greater than
gte
- Greater than or equal to
lt
- Less than
lte
- Less than or equal to
There are 2 comparison operator functions that only work with fields of type -string:
begins
- The string begins with the value string
There are 2 comparison operator functions that only work with fields of type +string:
begins
- The string begins with the value string
contains
- The string contains the value string
There are 3 comparison operator functions that only work with fields of type -fulltext:
text_all
- The text contains all the words in the value
- string
There are 3 comparison operator functions that only work with fields of type +fulltext:
text_all
- The text contains all the words in the value
+string
text_any
- The text contains any of the words in the value
- string
text_phrase
- The text contains the exact phrase of the value
- string
2. Array functions
There are 2 array functions:
and
- Both members of the array must be true
2. Array functions
There are 2 array functions:
and
- Both members of the array must be true
or
- Only one member of the array must be true
3. Negation function
There is 1 negation function:
not
- The comparison is not true
+qry_funs$eq(patent_date = "2001-01-01") -#> {"_eq":{"patent_date":"2001-01-01"}}-qry_funs$not(qry_funs$eq(patent_date = "2001-01-01")) -#> {"_not":{"_eq":{"patent_date":"2001-01-01"}}}-
3. Negation function
There is 1 negation function:
not
- The comparison is not true
4. Convenience function
There is 1 convenience function:
in_range
- Builds a <= x <= b query
qry_funs$eq(patent_date = "2001-01-01")
+#> {"_eq":{"patent_date":"2001-01-01"}}
+
+qry_funs$not(qry_funs$eq(patent_date = "2001-01-01"))
+#> {"_not":{"_eq":{"patent_date":"2001-01-01"}}}
+
+qry_funs$in_range(patent_year = c(2010, 2021))
+#> {"_and":[{"_gte":{"patent_year":2010}},{"_lte":{"patent_year":2021}}]}
+
+qry_funs$in_range(patent_date = c("1976-01-01", "1983-02-28"))
+#> {"_and":[{"_gte":{"patent_date":"1976-01-01"}},{"_lte":{"patent_date":"1983-02-28"}}]}
+
Some of the endpoints now return HATEOAS style links to get more data. E.g., +the patent endpoint may return a link such as: +"https://search.patentsview.org/api/v1/inventor/fl:th_ln:jefferson-1/"
+retrieve_linked_data(
+ url,
+ encoded_url = FALSE,
+ api_key = Sys.getenv("PATENTSVIEW_API_KEY"),
+ ...
+)
A link that was returned by the API on a previous call, an example +in the documentation or a Request URL from the API's Swagger UI page.
boolean to indicate whether the url has been URL encoded, defaults to FALSE. +Set to TRUE for Request URLs from Swagger UI.
API key, it defaults to Sys.getenv("PATENTSVIEW_API_KEY"). Request a key +here.
Curl options passed along to httr2's req_options
function.
A list with the following three elements:
A list with one element - a named data frame containing the +data returned by the server. Each row in the data frame corresponds to a +single value for the primary entity. For example, if you search the +assignee endpoint, then the data frame will be on the assignee-level, +where each row corresponds to a single assignee. Fields that are not on +the assignee-level would be returned in list columns.
Entity counts across all pages of output (not just +the page returned to you).
Details of the GET HTTP request that was sent to the server.
if (FALSE) {
+
+retrieve_linked_data(
+ "https://search.patentsview.org/api/v1/cpc_group/G01S7:4811/"
+)
+
+endpoint_url <- "https://search.patentsview.org/api/v1/patent/"
+q_param <- '?q={"_text_any":{"patent_title":"COBOL cotton gin"}}'
+s_and_o_params <- '&s=[{"patent_id": "asc" }]&o={"size":50}'
+f_param <- '&f=["inventors.inventor_name_last","patent_id","patent_date","patent_title"]'
+# (URL broken up to avoid a long line warning in this Rd)
+
+retrieve_linked_data(
+ paste0(endpoint_url, q_param, s_and_o_params, f_param)
+)
+
+retrieve_linked_data(
+ "https://search.patentsview.org/api/v1/patent/?q=%7B%22patent_date%22%3A%221976-01-06%22%7D",
+ encoded_url = TRUE
+)
+}
+
+
Developed by Christopher Baker, Russ Allen.
+Site built with pkgdown 2.0.9.
+list("_gte" = list("patent_date" = "2007-01-04"))
An object of class pv_query
, which you create by calling one
of the functions found in the qry_funs
list...See the
-writing
+writing
queries vignette for details.
E.g., qry_funs$gte(patent_date = "2007-01-04")
A character vector of the fields that you want returned to you.
-A value of NULL
indicates that the default fields should be
-returned. Acceptable fields for a given endpoint can be found at the API's
+A value of NULL
indicates to the API that it should return the default fields
+for that endpoint. Acceptable fields for a given endpoint can be found at the API's
online documentation (e.g., check out the field list for the
-patents
+patents
endpoint) or by viewing the fieldsdf
data frame
(View(fieldsdf)
). You can also use get_fields
to list
-out the fields available for a given endpoint.
Nested fields can be fully qualified, e.g., "application.filing_date" or the
+group name can be used to retrieve all of its nested fields, E.g. "application".
+The latter would be similar to passing get_fields("patent", group = "application")
+except it's the API that decides what fields to return.
Non-matched subentities
-will always be returned under the new version of the API
This is always FALSE in the
+new version of the API as the total counts of unique subentities is no longer available.
The page number of the results that should be returned.
The new version of the API does not use
+
page
as a parameter for paging, it uses after
.
The number of records that should be returned per page. This
-value can be as high as 1,000 (e.g., per_page = 1000
).
size = 1000
).
+
+
+A list of sort key values that defaults to NULL. This
+exposes the API's paging parameter for users who want to implement their own
+paging. It cannot be set when all_pages = TRUE
as the R package manipulates it
+for users automatically. See result set paging
Do you want to download all possible pages of output? If
-all_pages = TRUE
, the values of page
and per_page
are
-ignored.
all_pages = TRUE
, the value of size
is ignored.
A named character vector where the name indicates the field to
sort by and the value indicates the direction of sorting (direction should
-be either "asc" or "desc"). For example, sort = c("patent_number" =
- "asc")
or sort = c("patent_number" = "asc", "patent_date" =
+be either "asc" or "desc"). For example,
sort = c("patent_id" =
+ "asc")
or sort = c("patent_id" = "asc", "patent_date" =
"desc")
. sort = NULL
(the default) means do not sort the results.
You must include any fields that you wish to sort by in fields
.
API key. See -Here for info on creating a key.
API key, it defaults to Sys.getenv("PATENTSVIEW_API_KEY"). Request a key +here.
Curl options passed along to httr2's req_options
+when we do GETs or POSTs.
A list with one element - a named data frame containing the data returned by the server. Each row in the data frame corresponds to a single value for the primary entity. For example, if you search the -assignees endpoint, then the data frame will be on the assignee-level, +assignee endpoint, then the data frame will be on the assignee-level, where each row corresponds to a single assignee. Fields that are not on the assignee-level would be returned in list columns.
The column/field name that will link the data frames together. This
should be the unique identifier for the primary entity. For example, if you
-used the patents endpoint in your call to search_pv
, you could
-specify pk = "patent_number"
. This identifier has to have
+used the patent endpoint in your call to search_pv
, you could
+specify pk = "patent_id"
. This identifier has to have
been included in your fields
vector when you called
search_pv
. You can use get_ok_pk
to suggest a
potential primary key for your data.
This function evaluates whatever code you pass to it in the environment of
-the qry_funs
list. This allows you to cut down on typing when
+the qry_funs
list. This allows you to cut down on typing when
writing your queries. If you want to cut down on typing even more, you can
-try assigning the qry_funs
list into your global environment
-with: list2env(qry_funs, envir = globalenv())
.
qry_funs
list into your global environment
+with: list2env(qry_funs, envir = globalenv())
.
with_qfuns(code, envir = parent.frame())- -
code | -Code to evaluate. See example. |
-
---|---|
envir | -Where should R look for objects present in |
-
The result of code
- i.e., your query.
+# Without with_qfuns, we have to do: -qry_funs$and( - qry_funs$gte(patent_date = "2007-01-01"), - qry_funs$text_phrase(patent_abstract = c("computer program")), - qry_funs$or( - qry_funs$eq(inventor_last_name = "ihaka"), - qry_funs$eq(inventor_first_name = "chris") - ) -) -#> {"_and":[{"_gte":{"patent_date":"2007-01-01"}},{"_text_phrase":{"patent_abstract":"computer program"}},{"_or":[{"_eq":{"inventor_last_name":"ihaka"}},{"_eq":{"inventor_first_name":"chris"}}]}]}-#...With it, this becomes: -with_qfuns( - and( - gte(patent_date = "2007-01-01"), - text_phrase(patent_abstract = c("computer program")), - or( - eq(inventor_last_name = "ihaka"), - eq(inventor_first_name = "chris") - ) - ) -) -#> {"_and":[{"_gte":{"patent_date":"2007-01-01"}},{"_text_phrase":{"patent_abstract":"computer program"}},{"_or":[{"_eq":{"inventor_last_name":"ihaka"}},{"_eq":{"inventor_first_name":"chris"}}]}]}-
with_qfuns(code, envir = parent.frame())
Code to evaluate. See example.
Where should R look for objects present in code
that
+aren't present in qry_funs
.
The result of code
- i.e., your query.
qry_funs$and(
+ qry_funs$gte(patent_date = "2007-01-01"),
+ qry_funs$text_phrase(patent_abstract = c("computer program")),
+ qry_funs$or(
+ qry_funs$eq(inventors.inventor_name_last = "Ihaka"),
+ qry_funs$eq(inventors.inventor_name_last = "Chris")
+ )
+)
+#> {"_and":[{"_gte":{"patent_date":"2007-01-01"}},{"_text_phrase":{"patent_abstract":"computer program"}},{"_or":[{"_eq":{"inventors.inventor_name_last":"Ihaka"}},{"_eq":{"inventors.inventor_name_last":"Chris"}}]}]}
+
+# ...With it, this becomes:
+with_qfuns(
+ and(
+ gte(patent_date = "2007-01-01"),
+ text_phrase(patent_abstract = c("computer program")),
+ or(
+ eq(inventors.inventor_name_last = "Ihaka"),
+ eq(inventors.inventor_name_last = "Chris")
+ )
+ )
+)
+#> {"_and":[{"_gte":{"patent_date":"2007-01-01"}},{"_text_phrase":{"patent_abstract":"computer program"}},{"_or":[{"_eq":{"inventors.inventor_name_last":"Ihaka"}},{"_eq":{"inventors.inventor_name_last":"Chris"}}]}]}
+
+