From 4813184aac31853077ec4f803f71bdb56f16be75 Mon Sep 17 00:00:00 2001 From: Jacob Hoffman-Andrews Date: Wed, 1 Feb 2023 21:19:51 -0800 Subject: [PATCH 1/5] Set X-Robots-Tag: noindex for nonlatest rustdoc We want /latest/ URLs to be indexed, but not /1.2.3/ URLs, because the latter compete for pagerank with /latest/ and aren't usually what people want. This replaces the previous canonical URL header. Canonical URLs were doing some good, but did not work in all cases. In particular, if some older version of a crate's documentation was very different than the latest version, Google would not accept the canonicalization. This would sometimes result in the old version still showing up in the search results instead of /latest/. --- src/test/fakes.rs | 5 --- src/web/rustdoc.rs | 97 +++++++--------------------------------------- 2 files changed, 13 insertions(+), 89 deletions(-) diff --git a/src/test/fakes.rs b/src/test/fakes.rs index d7ae25192..4ad7b37c8 100644 --- a/src/test/fakes.rs +++ b/src/test/fakes.rs @@ -246,11 +246,6 @@ impl<'a> FakeRelease<'a> { self } - pub(crate) fn documentation_url(mut self, documentation: Option) -> Self { - self.package.documentation = documentation; - self - } - /// Returns the release_id pub(crate) fn create(mut self) -> Result { use std::fs; diff --git a/src/web/rustdoc.rs b/src/web/rustdoc.rs index 59eabbb97..30dfe438f 100644 --- a/src/web/rustdoc.rs +++ b/src/web/rustdoc.rs @@ -13,7 +13,6 @@ use crate::{ encode_url_path, error::{AxumNope, AxumResult}, file::File, - headers::CanonicalUrl, match_version_axum, metrics::RenderingTimesRecorder, page::TemplateData, @@ -25,8 +24,7 @@ use anyhow::{anyhow, Context as _}; use axum::{ extract::{Extension, Path, Query}, http::{StatusCode, Uri}, - response::{Html, IntoResponse, Response as AxumResponse}, - TypedHeader, + response::{AppendHeaders, Html, IntoResponse, Response as AxumResponse}, }; use lol_html::errors::RewritingError; use once_cell::sync::Lazy; @@ -289,7 +287,6 @@ pub(crate) async fn rustdoc_redirector_handler( #[derive(Debug, Clone, Serialize)] struct RustdocPage { latest_path: String, - canonical_url: CanonicalUrl, permalink_path: String, latest_version: String, target: String, @@ -315,7 +312,6 @@ impl RustdocPage { file_path: &str, ) -> AxumResult { let is_latest_url = self.is_latest_url; - let canonical_url = self.canonical_url.clone(); // Build the page of documentation let ctx = tera::Context::from_serialize(self).context("error creating tera context")?; @@ -336,9 +332,10 @@ impl RustdocPage { result => result.context("error rewriting HTML")?, }; + let robots = if is_latest_url { "" } else { "noindex" }; Ok(( StatusCode::OK, - TypedHeader(canonical_url), + AppendHeaders([("X-Robots-Tag", robots)]), Extension(if is_latest_url { CachePolicy::ForeverInCdn } else { @@ -640,18 +637,6 @@ pub(crate) async fn rustdoc_html_server_handler( params.name, target_redirect, query_string ); - // Set the canonical URL for search engines to the `/latest/` page on docs.rs. - // Note: The URL this points to may not exist. For instance, if we're rendering - // `struct Foo` in version 0.1.0 of a crate, and version 0.2.0 of that crate removes - // `struct Foo`, this will point at a 404. That's fine: search engines will crawl - // the target and will not canonicalize to a URL that doesn't exist. - // Don't include index.html in the canonical URL. - let canonical_url = CanonicalUrl::from_path(format!( - "/{}/latest/{}", - params.name, - inner_path.replace("index.html", ""), - )); - metrics .recently_accessed_releases .record(krate.crate_id, krate.release_id, target); @@ -671,7 +656,6 @@ pub(crate) async fn rustdoc_html_server_handler( move || { Ok(RustdocPage { latest_path, - canonical_url, permalink_path, latest_version, target, @@ -2394,90 +2378,35 @@ mod test { } #[test] - fn canonical_url() { + fn noindex_nonlatest() { wrapper(|env| { env.fake_release() - .name("dummy-dash") - .version("0.1.0") - .documentation_url(Some("http://example.com".to_string())) - .rustdoc_file("dummy_dash/index.html") - .create()?; - - let utf8_filename = "序.html"; - env.fake_release() - .name("dummy-docs") - .version("0.1.0") - .documentation_url(Some("https://docs.rs/foo".to_string())) - .rustdoc_file("dummy_docs/index.html") - .rustdoc_file(&format!("dummy_docs/{utf8_filename}")) - .create()?; - - env.fake_release() - .name("dummy-nodocs") + .name("dummy") .version("0.1.0") - .documentation_url(None) - .rustdoc_file("dummy_nodocs/index.html") - .rustdoc_file("dummy_nodocs/struct.Foo.html") + .rustdoc_file("dummy/index.html") .create()?; let web = env.frontend(); assert!(web - .get("/dummy-dash/0.1.0/dummy_dash/") + .get("/dummy/0.1.0/dummy/") .send()? .headers() - .get("link") + .get("x-robots-tag") .unwrap() .to_str() .unwrap() - .contains("rel=\"canonical\""),); + .contains("noindex")); - assert_eq!( - web.get("/dummy-docs/0.1.0/dummy_docs/") - .send()? - .headers() - .get("link") - .unwrap() - .to_str() - .unwrap(), - "; rel=\"canonical\"" - ); - - assert_eq!( - web.get(&format!("/dummy-docs/0.1.0/dummy_docs/{utf8_filename}")) - .send()? - .headers() - .get("link") - .unwrap() - .to_str() - .unwrap(), - "; rel=\"canonical\"", - ); - - assert!(web - .get("/dummy-nodocs/0.1.0/dummy_nodocs/") + assert!(!web + .get("/dummy/latest/dummy/") .send()? .headers() - .get("link") + .get("x-robots-tag") .unwrap() .to_str() .unwrap() - .contains( - "; rel=\"canonical\"" - ),); - - assert_eq!( - web - .get("/dummy-nodocs/0.1.0/dummy_nodocs/struct.Foo.html") - .send()? - .headers() - .get("link") - .unwrap() - .to_str() - .unwrap(), - "; rel=\"canonical\"", - ); - + .contains("noindex")); Ok(()) }) } From 551962f2241113f6b31e0601aec81d855892c105 Mon Sep 17 00:00:00 2001 From: Jacob Hoffman-Andrews Date: Thu, 2 Feb 2023 00:33:34 -0800 Subject: [PATCH 2/5] Use a plain array --- src/web/rustdoc.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/web/rustdoc.rs b/src/web/rustdoc.rs index 30dfe438f..cac5b9bb1 100644 --- a/src/web/rustdoc.rs +++ b/src/web/rustdoc.rs @@ -24,7 +24,7 @@ use anyhow::{anyhow, Context as _}; use axum::{ extract::{Extension, Path, Query}, http::{StatusCode, Uri}, - response::{AppendHeaders, Html, IntoResponse, Response as AxumResponse}, + response::{Html, IntoResponse, Response as AxumResponse}, }; use lol_html::errors::RewritingError; use once_cell::sync::Lazy; @@ -335,7 +335,7 @@ impl RustdocPage { let robots = if is_latest_url { "" } else { "noindex" }; Ok(( StatusCode::OK, - AppendHeaders([("X-Robots-Tag", robots)]), + [("X-Robots-Tag", robots)], Extension(if is_latest_url { CachePolicy::ForeverInCdn } else { From b43d826bb8d1d993fadc402c657063d6b4edb6f4 Mon Sep 17 00:00:00 2001 From: Jacob Hoffman-Andrews Date: Thu, 2 Feb 2023 08:19:42 -0800 Subject: [PATCH 3/5] Use Some([header]) --- src/web/rustdoc.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/web/rustdoc.rs b/src/web/rustdoc.rs index cac5b9bb1..80f48723e 100644 --- a/src/web/rustdoc.rs +++ b/src/web/rustdoc.rs @@ -332,10 +332,9 @@ impl RustdocPage { result => result.context("error rewriting HTML")?, }; - let robots = if is_latest_url { "" } else { "noindex" }; Ok(( StatusCode::OK, - [("X-Robots-Tag", robots)], + (!is_latest_url).then_some([("X-Robots-Tag", "noindex")]), Extension(if is_latest_url { CachePolicy::ForeverInCdn } else { From 84d979e06f0d6dc68d36e970d74143412a03cc45 Mon Sep 17 00:00:00 2001 From: Jacob Hoffman-Andrews Date: Thu, 2 Feb 2023 10:08:37 -0800 Subject: [PATCH 4/5] Fix test --- src/web/rustdoc.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/web/rustdoc.rs b/src/web/rustdoc.rs index 80f48723e..df19e7467 100644 --- a/src/web/rustdoc.rs +++ b/src/web/rustdoc.rs @@ -2402,7 +2402,7 @@ mod test { .send()? .headers() .get("x-robots-tag") - .unwrap() + .unwrap_or_default() .to_str() .unwrap() .contains("noindex")); From 82b1bd5f19b5ff03d92787209f87268adef7f097 Mon Sep 17 00:00:00 2001 From: Jacob Hoffman-Andrews Date: Thu, 2 Feb 2023 10:13:17 -0800 Subject: [PATCH 5/5] Fix test --- src/web/rustdoc.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/web/rustdoc.rs b/src/web/rustdoc.rs index df19e7467..30ff77f6e 100644 --- a/src/web/rustdoc.rs +++ b/src/web/rustdoc.rs @@ -2397,15 +2397,12 @@ mod test { .unwrap() .contains("noindex")); - assert!(!web + assert!(web .get("/dummy/latest/dummy/") .send()? .headers() .get("x-robots-tag") - .unwrap_or_default() - .to_str() - .unwrap() - .contains("noindex")); + .is_none()); Ok(()) }) }