Skip to content

Split single sitemap into index and sub-sitemaps by starting letter in crate-name #1222

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Dec 27, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion src/web/routes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@ pub(super) fn build_routes() -> Routes {
// https://support.google.com/webmasters/answer/183668?hl=en
routes.static_resource("/robots.txt", PermanentRedirect("/-/static/robots.txt"));
routes.static_resource("/favicon.ico", PermanentRedirect("/-/static/favicon.ico"));
routes.static_resource("/sitemap.xml", super::sitemap::sitemap_handler);
routes.static_resource("/sitemap.xml", super::sitemap::sitemapindex_handler);
routes.static_resource(
"/-/sitemap/:letter/sitemap.xml",
super::sitemap::sitemap_handler,
);

// This should not need to be served from the root as we reference the inner path in links,
// but clients might have cached the url and need to update it.
Expand Down
101 changes: 92 additions & 9 deletions src/web/sitemap.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,31 @@
use crate::{db::Pool, docbuilder::Limits, impl_webpage, web::page::WebPage};
use crate::{db::Pool, docbuilder::Limits, impl_webpage, web::error::Nope, web::page::WebPage};
use chrono::{DateTime, Utc};
use iron::{
headers::ContentType,
mime::{Mime, SubLevel, TopLevel},
IronResult, Request, Response,
};
use router::Router;
use serde::Serialize;
use serde_json::Value;

/// sitemap index
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
struct SitemapIndexXml {
sitemaps: Vec<char>,
}

impl_webpage! {
SitemapIndexXml = "core/sitemapindex.xml",
content_type = ContentType(Mime(TopLevel::Application, SubLevel::Xml, vec![])),
}

pub fn sitemapindex_handler(req: &mut Request) -> IronResult<Response> {
let sitemaps: Vec<char> = ('a'..='z').collect();

SitemapIndexXml { sitemaps }.into_response(req)
}

/// The sitemap
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
struct SitemapXml {
Expand All @@ -21,16 +39,30 @@ impl_webpage! {
}

pub fn sitemap_handler(req: &mut Request) -> IronResult<Response> {
let router = extension!(req, Router);
let letter = cexpect!(req, router.find("letter"));

if letter.len() != 1 {
return Err(Nope::ResourceNotFound.into());
} else if let Some(ch) = letter.chars().next() {
if !(ch.is_ascii_lowercase()) {
return Err(Nope::ResourceNotFound.into());
}
}

let mut conn = extension!(req, Pool).get()?;
let query = conn
.query(
"SELECT DISTINCT ON (crates.name)
crates.name,
releases.release_time
"SELECT crates.name,
MAX(releases.release_time) as release_time
FROM crates
INNER JOIN releases ON releases.crate_id = crates.id
WHERE rustdoc_status = true",
&[],
WHERE
rustdoc_status = true AND
crates.name ILIKE $1
GROUP BY crates.name
",
&[&format!("{}%", letter)],
)
.unwrap();

Expand Down Expand Up @@ -119,19 +151,70 @@ pub fn about_handler(req: &mut Request) -> IronResult<Response> {
#[cfg(test)]
mod tests {
use crate::test::{assert_success, wrapper};
use reqwest::StatusCode;

#[test]
fn sitemap() {
fn sitemap_index() {
wrapper(|env| {
let web = env.frontend();
assert_success("/sitemap.xml", web)?;
assert_success("/sitemap.xml", web)
})
}

#[test]
fn sitemap_invalid_letters() {
wrapper(|env| {
let web = env.frontend();

// everything not length=1 and ascii-lowercase should fail
for invalid_letter in &["1", "aa", "A", ""] {
println!("trying to fail letter {}", invalid_letter);
assert_eq!(
web.get(&format!("/-/sitemap/{}/sitemap.xml", invalid_letter))
.send()?
.status(),
StatusCode::NOT_FOUND
);
}
Ok(())
})
}

#[test]
fn sitemap_letter() {
wrapper(|env| {
let web = env.frontend();

// letter-sitemaps always work, even without crates & releases
for letter in 'a'..='z' {
assert_success(&format!("/-/sitemap/{}/sitemap.xml", letter), web)?;
}

env.fake_release().name("some_random_crate").create()?;
env.fake_release()
.name("some_random_crate_that_failed")
.build_result_successful(false)
.create()?;
assert_success("/sitemap.xml", web)

// these fake crates appear only in the `s` sitemap
let response = web.get("/-/sitemap/s/sitemap.xml").send()?;
assert!(response.status().is_success());

let content = response.text()?;
assert!(content.contains(&"some_random_crate"));
assert!(!(content.contains(&"some_random_crate_that_failed")));

// and not in the others
for letter in ('a'..='z').filter(|&c| c != 's') {
let response = web
.get(&format!("/-/sitemap/{}/sitemap.xml", letter))
.send()?;

assert!(response.status().is_success());
assert!(!(response.text()?.contains("some_random_crate")));
}

Ok(())
})
}

Expand Down
8 changes: 8 additions & 0 deletions templates/core/sitemapindex.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
{% for which in sitemaps -%}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Kixiron do you know if tera happens to have a "character range" operator? I found range(end=5) but I'm not sure if it works for characters instead.

<sitemap>
<loc>https://docs.rs/-/sitemap/{{ which }}/sitemap.xml</loc>
</sitemap>
{%- endfor %}
</sitemapindex>