From 1cbbf96fa69d223e0f5de1fa83f7ec8fef8abc8f Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Mon, 23 Dec 2024 14:39:29 -0600 Subject: [PATCH] Improve url manipulation tooling * Check inputs and export `url_modify()`. Fixes #464. * Check inputs to `url_build()`. Fixes #449. --- NAMESPACE | 1 + NEWS.md | 1 + R/url.R | 126 ++++++++++++++++++++++++++++------- _pkgdown.yml | 5 +- man/url_build.Rd | 21 ++++++ man/url_modify.Rd | 60 +++++++++++++++++ man/url_parse.Rd | 35 ++++------ tests/testthat/_snaps/url.md | 61 +++++++++++++++++ tests/testthat/test-url.R | 47 ++++++++++--- 9 files changed, 304 insertions(+), 53 deletions(-) create mode 100644 man/url_build.Rd create mode 100644 man/url_modify.Rd diff --git a/NAMESPACE b/NAMESPACE index 0613c62a..aba8428f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -143,6 +143,7 @@ export(secret_write_rds) export(signal_total_pages) export(throttle_status) export(url_build) +export(url_modify) export(url_parse) export(with_mock) export(with_mocked_responses) diff --git a/NEWS.md b/NEWS.md index f09f990e..9fe1299c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,6 @@ # httr2 (development version) +* New `url_modify()` makes it easier to modify an existing url (#464). * `url_parse()` now uses `curl::curl_parse_url()` which is much faster and more correct (#577). * `req_retry()` now defaults to `max_tries = 2` with a message. Set to `max_tries = 1` to disable retries. diff --git a/R/url.R b/R/url.R index 518fe2d5..866e326b 100644 --- a/R/url.R +++ b/R/url.R @@ -1,28 +1,20 @@ -#' Parse and build URLs +#' Parse a URL #' -#' `url_parse()` parses a URL into its component pieces; `url_build()` does -#' the reverse, converting a list of pieces into a string URL. See `r rfc(3986)` -#' for the details of the parsing algorithm. +#' `url_parse()` parses a URL into its component pieces, powered by +#' [curl::curl_parse_url()]. See `r rfc(3986)` for the details of the +#' parsing algorithm. 
#' -#' @param url For `url_parse()` a string to parse into a URL; -#' for `url_build()` a URL to turn back into a string. -#' @returns -#' * `url_build()` returns a string. -#' * `url_parse()` returns a URL: a S3 list with class `httr2_url` -#' and elements `scheme`, `hostname`, `port`, `path`, `fragment`, `query`, -#' `username`, `password`. +#' @param url A string to parse. +#' @returns A URL, i.e. a S3 object with class `httr2_url` and elements +#' `scheme`, `hostname`, `username`, `password`, `port`, `path`, `query`, and +#' `fragment`. #' @export +#' @family URL manipulation #' @examples #' url_parse("http://google.com/") #' url_parse("http://google.com:80/") #' url_parse("http://google.com:80/?a=1&b=2") #' url_parse("http://username@google.com:80/path;test?a=1&b=2#40") -#' -#' url <- url_parse("http://google.com/") -#' url$port <- 80 -#' url$hostname <- "example.com" -#' url$query <- list(a = 1, b = 2, c = 3) -#' url_build(url) url_parse <- function(url) { check_string(url) @@ -42,10 +34,88 @@ url_parse <- function(url) { parsed } -url_modify <- function(url, ..., error_call = caller_env()) { - url <- url_parse(url) - url <- modify_list(url, ..., error_call = error_call) - url_build(url) +#' Modify a url +#' +#' Modify components of a URL. The default value of each argument, `NULL`, +#' means leave the component as is. If you want to remove a component, +#' set it to `""`. Note that setting `scheme` or `hostname` to `""` will +#' create a relative url. +#' +#' @param url A string or [parsed URL][url_parse]. +#' @param scheme The scheme, typically either `http` or `https`. +#' @param hostname The hostname, e.g. `www.google.com` or `posit.co`. +#' @param username,password Username and password to embed in the URL. +#' Not generally recommended but needed for some legacy applications. +#' @param port An integer port number. +#' @param path The path, e.g. `/search`. Paths must start with `/`, so this +#' will be automatically added if omitted. 
+#' @param query Either a query string or a named list of query components. +#' @param fragment The fragment, e.g. `#section-1`. +#' @return An object the same type as `url`. +#' @export +#' @family URL manipulation +#' @examples +#' url_modify("http://hadley.nz", path = "about") +#' url_modify("http://hadley.nz", scheme = "https") +#' url_modify("http://hadley.nz/abc", path = "/cde") +#' url_modify("http://hadley.nz/abc", path = "") +#' url_modify("http://hadley.nz?a=1", query = "b=2") +#' url_modify("http://hadley.nz?a=1", query = list(c = 3)) +url_modify <- function(url, + scheme = NULL, + hostname = NULL, + username = NULL, + password = NULL, + port = NULL, + path = NULL, + query = NULL, + fragment = NULL) { + + if (!is_string(url) && !is_url(url)) { + stop_input_type(url, "a string or parsed URL") + } + string_url <- is_string(url) + if (string_url) { + url <- url_parse(url) + } + + check_string(scheme, allow_null = TRUE) + check_string(hostname, allow_null = TRUE) + check_string(username, allow_null = TRUE) + check_string(password, allow_null = TRUE) + check_number_whole(port, min = 1, allow_null = TRUE) + check_string(path, allow_null = TRUE) + check_string(fragment, allow_null = TRUE) + + if (is_string(query)) { + query <- query_parse(query) + } else if (is.list(query) && (is_named(query) || length(query) == 0)) { + for (nm in names(query)) { + check_query_param(query[[nm]], paste0("query$", nm)) + } + } else if (!is.null(query)) { + stop_input_type(query, "a character vector, named list, or NULL") + } + + new <- compact(list( + scheme = scheme, + hostname = hostname, + username = username, + password = password, + port = port, + path = path, + query = query, + fragment = fragment + )) + is_empty <- map_lgl(new, identical, "") + new[is_empty] <- list(NULL) + url[names(new)] <- new + + if (string_url) { + url_build(url) + } else { + url + } } is_url <- function(x) inherits(x, "httr2_url") @@ -85,9 +155,19 @@ print.httr2_url <- function(x, ...) 
{ invisible(x) } +#' Build a string from a URL object +#' +#' This is the converse of [url_parse], taking a parsed URL object and +#' turning it back into a string. +#' +#' @param url A URL object created by [url_parse]. +#' @family URL manipulation #' @export -#' @rdname url_parse url_build <- function(url) { + if (!is_url(url)) { + stop_input_type(url, "a parsed URL") + } + if (!is.null(url$query)) { query <- query_build(url$query) } else { @@ -113,7 +193,7 @@ url_build <- function(url) { authority <- NULL } - if (!is.null(url$path) && !startsWith(url$path, "/")) { + if (is.null(url$path) || !startsWith(url$path, "/")) { url$path <- paste0("/", url$path) } diff --git a/_pkgdown.yml b/_pkgdown.yml index d92798f2..a17aae11 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -70,12 +70,15 @@ reference: contents: - starts_with("resp_") +- title: URL manipulation + contents: + - starts_with("url_") + - title: Miscellaenous helpers contents: - curl_translate - secrets - obfuscate - - url_parse - title: OAuth desc: > diff --git a/man/url_build.Rd b/man/url_build.Rd new file mode 100644 index 00000000..ee8090c7 --- /dev/null +++ b/man/url_build.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/url.R +\name{url_build} +\alias{url_build} +\title{Build a string from a URL object} +\usage{ +url_build(url) +} +\arguments{ +\item{url}{A URL object created by \link{url_parse}.} +} +\description{ +This is the converse of \link{url_parse}, taking a parsed URL object and +turning it back into a string. 
+} +\seealso{ +Other URL manipulation: +\code{\link{url_modify}()}, +\code{\link{url_parse}()} +} +\concept{URL manipulation} diff --git a/man/url_modify.Rd b/man/url_modify.Rd new file mode 100644 index 00000000..c0edc105 --- /dev/null +++ b/man/url_modify.Rd @@ -0,0 +1,60 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/url.R +\name{url_modify} +\alias{url_modify} +\title{Modify a url} +\usage{ +url_modify( + url, + scheme = NULL, + hostname = NULL, + username = NULL, + password = NULL, + port = NULL, + path = NULL, + query = NULL, + fragment = NULL +) +} +\arguments{ +\item{url}{A string or \link[=url_parse]{parsed URL}.} + +\item{scheme}{The scheme, typically either \code{http} or \code{https}.} + +\item{hostname}{The hostname, e.g. \code{www.google.com} or \code{posit.co}.} + +\item{username, password}{Username and password to embed in the URL. +Not generally recommended but needed for some legacy applications.} + +\item{port}{An integer port number.} + +\item{path}{The path, e.g. \verb{/search}. Paths must start with \code{/}, so this +will be automatically added if omitted.} + +\item{query}{Either a query string or a named list of query components.} + +\item{fragment}{The fragment, e.g. \verb{#section-1}.} +} +\value{ +An object the same type as \code{url}. +} +\description{ +Modify components of a URL. The default value of each argument, \code{NULL}, +means leave the component as is. If you want to remove a component, +set it to \code{""}. Note that setting \code{scheme} or \code{hostname} to \code{""} will +create a relative url. 
+} +\examples{ +url_modify("http://hadley.nz", path = "about") +url_modify("http://hadley.nz", scheme = "https") +url_modify("http://hadley.nz/abc", path = "/cde") +url_modify("http://hadley.nz/abc", path = "") +url_modify("http://hadley.nz?a=1", query = "b=2") +url_modify("http://hadley.nz?a=1", query = list(c = 3)) +} +\seealso{ +Other URL manipulation: +\code{\link{url_build}()}, +\code{\link{url_parse}()} +} +\concept{URL manipulation} diff --git a/man/url_parse.Rd b/man/url_parse.Rd index 15a8d0aa..3d4ee179 100644 --- a/man/url_parse.Rd +++ b/man/url_parse.Rd @@ -2,39 +2,32 @@ % Please edit documentation in R/url.R \name{url_parse} \alias{url_parse} -\alias{url_build} -\title{Parse and build URLs} +\title{Parse a URL} \usage{ url_parse(url) - -url_build(url) } \arguments{ -\item{url}{For \code{url_parse()} a string to parse into a URL; -for \code{url_build()} a URL to turn back into a string.} +\item{url}{A string to parse.} } \value{ -\itemize{ -\item \code{url_build()} returns a string. -\item \code{url_parse()} returns a URL: a S3 list with class \code{httr2_url} -and elements \code{scheme}, \code{hostname}, \code{port}, \code{path}, \code{fragment}, \code{query}, -\code{username}, \code{password}. -} +A URL, i.e. a S3 object with class \code{httr2_url} and elements +\code{scheme}, \code{hostname}, \code{username}, \code{password}, \code{port}, \code{path}, \code{query}, and +\code{fragment}. } \description{ -\code{url_parse()} parses a URL into its component pieces; \code{url_build()} does -the reverse, converting a list of pieces into a string URL. See \href{https://datatracker.ietf.org/doc/html/rfc3986}{RFC 3986} -for the details of the parsing algorithm. +\code{url_parse()} parses a URL into its component pieces, powered by +\code{\link[curl:curl_parse_url]{curl::curl_parse_url()}}. See \href{https://datatracker.ietf.org/doc/html/rfc3986}{RFC 3986} for the details of the +parsing algorithm. 
} \examples{ url_parse("http://google.com/") url_parse("http://google.com:80/") url_parse("http://google.com:80/?a=1&b=2") url_parse("http://username@google.com:80/path;test?a=1&b=2#40") - -url <- url_parse("http://google.com/") -url$port <- 80 -url$hostname <- "example.com" -url$query <- list(a = 1, b = 2, c = 3) -url_build(url) } +\seealso{ +Other URL manipulation: +\code{\link{url_build}()}, +\code{\link{url_modify}()} +} +\concept{URL manipulation} diff --git a/tests/testthat/_snaps/url.md b/tests/testthat/_snaps/url.md index a80e0ec1..e6da266c 100644 --- a/tests/testthat/_snaps/url.md +++ b/tests/testthat/_snaps/url.md @@ -24,6 +24,67 @@ Error in `url_build()`: ! Cannot set url `password` without `username`. +# url_modify checks its inputs + + Code + url_modify(1) + Condition + Error in `url_modify()`: + ! `url` must be a string or parsed URL, not the number 1. + Code + url_modify(url, scheme = 1) + Condition + Error in `url_modify()`: + ! `scheme` must be a single string or `NULL`, not the number 1. + Code + url_modify(url, hostname = 1) + Condition + Error in `url_modify()`: + ! `hostname` must be a single string or `NULL`, not the number 1. + Code + url_modify(url, port = "x") + Condition + Error in `url_modify()`: + ! `port` must be a whole number or `NULL`, not the string "x". + Code + url_modify(url, username = 1) + Condition + Error in `url_modify()`: + ! `username` must be a single string or `NULL`, not the number 1. + Code + url_modify(url, password = 1) + Condition + Error in `url_modify()`: + ! `password` must be a single string or `NULL`, not the number 1. + Code + url_modify(url, path = 1) + Condition + Error in `url_modify()`: + ! `path` must be a single string or `NULL`, not the number 1. + Code + url_modify(url, fragment = 1) + Condition + Error in `url_modify()`: + ! `fragment` must be a single string or `NULL`, not the number 1. + +# checks various query formats + + Code + url_modify(url, query = 1) + Condition + Error in `url_modify()`: + ! 
`query` must be a character vector, named list, or NULL, not the number 1. + Code + url_modify(url, query = list(1)) + Condition + Error in `url_modify()`: + ! `query` must be a character vector, named list, or NULL, not a list. + Code + url_modify(url, query = list(x = 1:2)) + Condition + Error in `url_modify()`: + ! Query value `query$x` must be a length-1 atomic vector, not an integer vector. + # validates inputs Code diff --git a/tests/testthat/test-url.R b/tests/testthat/test-url.R index 885d08bc..168e2a22 100644 --- a/tests/testthat/test-url.R +++ b/tests/testthat/test-url.R @@ -6,7 +6,6 @@ test_that("can parse special cases", { test_that("can round trip urls", { urls <- list( - "file:///", "http://google.com/", "http://google.com/path", "http://google.com/path?a=1&b=2", @@ -27,18 +26,50 @@ test_that("can print all url details", { ) }) -test_that("ensures path always starts with /", { - expect_equal( - url_modify("https://example.com/abc", path = "def"), - "https://example.com/def" - ) -}) - test_that("password also requires username", { url <- url_parse("http://username:pwd@example.com") url$username <- NULL expect_snapshot(url_build(url), error = TRUE) +}) + +# modify url ------------------------------------------------------------- + +test_that("url_modify checks its inputs", { + url <- "http://example.com" + + expect_snapshot(error = TRUE, { + url_modify(1) + url_modify(url, scheme = 1) + url_modify(url, hostname = 1) + url_modify(url, port = "x") + url_modify(url, username = 1) + url_modify(url, password = 1) + url_modify(url, path = 1) + url_modify(url, fragment = 1) + }) +}) + +test_that("no arguments is idempotent", { + string <- "http://example.com/" + url <- url_parse(string) + + expect_equal(url_modify(string), string) + expect_equal(url_modify(url), url) +}) + +test_that("checks various query formats", { + url <- "http://example.com" + + expect_snapshot(error = TRUE, { + url_modify(url, query = 1) + url_modify(url, query = list(1)) + 
url_modify(url, query = list(x = 1:2)) + }) +}) +test_that("path always starts with /", { + expect_equal(url_modify("https://x.com/abc", path = "def"), "https://x.com/def") + expect_equal(url_modify("https://x.com/abc", path = ""), "https://x.com/") }) # query -------------------------------------------------------------------