From 9953c9380bf7cf0aac86902e895d2c56a37adaf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20Re=C3=A9?= Date: Mon, 3 Apr 2023 12:18:19 +0200 Subject: [PATCH 1/3] Generate a split sitemap `sitemap.xml.gz` remains identical. `sitemap-index.xml` can be used instead as an index file, which will link to `sitemap1.xml.gz`, `sitemap2.xml.gz`, ... The default batch size is 5000 URLs per file (`SITEMAP_BATCH_SIZE`), chosen so that each file also stays under Google's per-sitemap limits (50k URLs / 50 MB uncompressed). --- news/4638.bugfix | 1 + src/express-middleware/sitemap.js | 40 +++++++++++++++++++++++++-- src/helpers/Sitemap/Sitemap.js | 46 +++++++++++++++++++++++++++++-- 3 files changed, 82 insertions(+), 5 deletions(-) create mode 100644 news/4638.bugfix diff --git a/news/4638.bugfix b/news/4638.bugfix new file mode 100644 index 0000000000..75b0f1490d --- /dev/null +++ b/news/4638.bugfix @@ -0,0 +1 @@ +Generate a split sitemap @reebalazs diff --git a/src/express-middleware/sitemap.js b/src/express-middleware/sitemap.js index f593219456..58de415ef9 100644 --- a/src/express-middleware/sitemap.js +++ b/src/express-middleware/sitemap.js @@ -1,11 +1,35 @@ import express from 'express'; -import { generateSitemap } from '@plone/volto/helpers/Sitemap/Sitemap'; +import { + generateSitemap, + generateSitemapIndex, + SITEMAP_BATCH_SIZE, +} from '@plone/volto/helpers/Sitemap/Sitemap'; export const sitemap = function (req, res, next) { - generateSitemap(req).then((sitemap) => { + let start = 0; + let size = undefined; + const { batch: batchStr } = req.params; + if (batchStr !== undefined) { + const batch = parseInt(batchStr); + if (isNaN(batch) || batch === 0 || '' + batch !== batchStr) { + res.status(404); + // Some data, such as the internal API address, may be sensitive to be published + res.send( + `Invalid sitemap name, use sitemap.xml.gz, or batched sitemapN.xml.gz where N is a positive integer.`, + ); + return; + } + start = SITEMAP_BATCH_SIZE * (batch - 1); + size = SITEMAP_BATCH_SIZE; + } + generateSitemap(req, start, 
size).then((sitemap) => { if (Buffer.isBuffer(sitemap)) { res.set('Content-Type', 'application/x-gzip'); - res.set('Content-Disposition', 'attachment; filename="sitemap.xml.gz"'); + res.set('Content-Encoding', 'gzip'); + res.set( + 'Content-Disposition', + `attachment; filename="sitemap${batchStr || ''}.xml.gz"`, + ); res.send(sitemap); } else { // {"errno":-111, "code":"ECONNREFUSED", "host": ...} @@ -16,10 +40,20 @@ export const sitemap = function (req, res, next) { }); }; +export const sitemapIndex = function (req, res, next) { + generateSitemapIndex(req).then((sitemapIndex) => { + res.set('Content-Type', 'application/xml'); + res.set('Content-Disposition', 'attachment; filename="sitemap-index.xml"'); + res.send(sitemapIndex); + }); +}; + export default function () { const middleware = express.Router(); middleware.all('**/sitemap.xml.gz', sitemap); + middleware.all('**/sitemap:batch.xml.gz', sitemap); + middleware.all('**/sitemap-index.xml', sitemapIndex); middleware.id = 'sitemap.xml.gz'; return middleware; } diff --git a/src/helpers/Sitemap/Sitemap.js b/src/helpers/Sitemap/Sitemap.js index ac2629e61f..64cc8dadb2 100644 --- a/src/helpers/Sitemap/Sitemap.js +++ b/src/helpers/Sitemap/Sitemap.js @@ -11,19 +11,23 @@ import { addHeadersFactory } from '@plone/volto/helpers/Proxy/Proxy'; import config from '@plone/volto/registry'; +export const SITEMAP_BATCH_SIZE = 5000; + /** * Generate sitemap * @function generateSitemap * @param {Object} _req Request object * @return {string} Generated sitemap */ -export const generateSitemap = (_req) => +export const generateSitemap = (_req, start = 0, size = undefined) => new Promise((resolve) => { const { settings } = config; const APISUFIX = settings.legacyTraverse ? '' : '/++api++'; const apiPath = settings.internalApiPath ?? 
settings.apiPath; const request = superagent.get( - `${apiPath}${APISUFIX}/@search?metadata_fields=modified&b_size=100000000&use_site_search_settings=1`, + `${apiPath}${APISUFIX}/@search?metadata_fields=modified&b_start=${start}&b_size=${ + size !== undefined ? size : 100000000 + }&use_site_search_settings=1`, ); request.set('Accept', 'application/json'); request.use(addHeadersFactory(_req)); @@ -50,3 +54,41 @@ export const generateSitemap = (_req) => } }); }); + +/** + * Generate sitemap + * @function generateSitemapIndex + * @param {Object} _req Request object + * @return {string} Generated sitemap index + */ +export const generateSitemapIndex = (_req) => + new Promise((resolve) => { + const { settings } = config; + const APISUFIX = settings.legacyTraverse ? '' : '/++api++'; + const apiPath = settings.internalApiPath ?? settings.apiPath; + const request = superagent.get( + `${apiPath}${APISUFIX}/@search?metadata_fields=modified&b_size=0&use_site_search_settings=1`, + ); + request.set('Accept', 'application/json'); + const authToken = _req.universalCookies.get('auth_token'); + if (authToken) { + request.set('Authorization', `Bearer ${authToken}`); + } + request.end((error, { body } = {}) => { + if (error) { + resolve(body || error); + } else { + const items = Array.from( + { length: Math.ceil(body.items_total / SITEMAP_BATCH_SIZE) }, + (_, i) => + ` + ${toPublicURL('/sitemap' + (i + 1) + '.xml.gz')} + `, + ); + const result = ` + +${items.join('\n')}\n`; + resolve(result); + } + }); + }); From ad71c0842b4ef65e6d0609b90a16dfeb423e337e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20Re=C3=A9?= Date: Wed, 5 Apr 2023 18:32:12 +0200 Subject: [PATCH 2/3] Remove the content encoding from the sitemap.xml.gz Although it's all the same for Google with or without this, it's more correct not to add the content encoding gzip header, as we just want to transfer the gzipped file as a binary and not have the browser decode it when downloaded. 
(The other option would be to keep the content encoding header but name the file `.xml`, without the `.gz` ending. That would, however, only result in larger files when saved and would give no extra benefit. It would also be a backward-incompatible change.) --- src/express-middleware/sitemap.js | 1 - 1 file changed, 1 deletion(-) diff --git a/src/express-middleware/sitemap.js b/src/express-middleware/sitemap.js index 58de415ef9..8b05884bb3 100644 --- a/src/express-middleware/sitemap.js +++ b/src/express-middleware/sitemap.js @@ -25,7 +25,6 @@ export const sitemap = function (req, res, next) { generateSitemap(req, start, size).then((sitemap) => { if (Buffer.isBuffer(sitemap)) { res.set('Content-Type', 'application/x-gzip'); - res.set('Content-Encoding', 'gzip'); res.set( 'Content-Disposition', `attachment; filename="sitemap${batchStr || ''}.xml.gz"`, From b18e3aabf56b292f969dbabfb60bae26b2a27259 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20Re=C3=A9?= Date: Wed, 5 Apr 2023 18:35:06 +0200 Subject: [PATCH 3/3] Fix robots.txt to contain a public link Replace the internal backend URL (e.g. http://backend) in the robots.txt provided by the backend with the public-facing URL. Also, link to the sitemap index file instead of the single sitemap file that would be rejected by Google. --- src/helpers/Robots/Robots.js | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/helpers/Robots/Robots.js b/src/helpers/Robots/Robots.js index 040e4b2ea0..38a7e5527f 100644 --- a/src/helpers/Robots/Robots.js +++ b/src/helpers/Robots/Robots.js @@ -15,12 +15,9 @@ import { addHeadersFactory } from '@plone/volto/helpers/Proxy/Proxy'; */ export const generateRobots = (req) => new Promise((resolve) => { - //const url = `${req.protocol}://${req.get('Host')}`; - const request = superagent.get( - `${ - config.settings.internalApiPath ?? config.settings.apiPath - }/robots.txt`, - ); + const internalUrl = + config.settings.internalApiPath ?? 
config.settings.apiPath; + const request = superagent.get(`${internalUrl}/robots.txt`); request.set('Accept', 'text/plain'); const authToken = req.universalCookies.get('auth_token'); if (authToken) { @@ -31,6 +28,12 @@ export const generateRobots = (req) => if (error) { resolve(text || error); } else { + // Plone has probably returned the sitemap link with the internal url. + // If so, let's replace it with the current one. + const url = `${req.protocol}://${req.get('Host')}`; + text = text.replace(internalUrl, url); + // Replace the sitemap with the sitemap index. + text = text.replace('sitemap.xml.gz', 'sitemap-index.xml'); resolve(text); } });