From 73dcdddeed08bf818a36eb640584eebae2500675 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADctor=20Fern=C3=A1ndez=20de=20Alba?= Date: Fri, 14 Apr 2023 15:28:55 +0200 Subject: [PATCH] Generate a split sitemap (also fix robots.txt) (#4639) (#4701) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Balázs Reé --- news/4638.bugfix | 1 + src/express-middleware/sitemap.js | 39 ++++++++++++++++++++++++-- src/helpers/Robots/Robots.js | 15 ++++++---- src/helpers/Sitemap/Sitemap.js | 46 +++++++++++++++++++++++++++++-- 4 files changed, 90 insertions(+), 11 deletions(-) create mode 100644 news/4638.bugfix diff --git a/news/4638.bugfix b/news/4638.bugfix new file mode 100644 index 0000000000..75b0f1490d --- /dev/null +++ b/news/4638.bugfix @@ -0,0 +1 @@ +Generate a split sitemap @reebalazs diff --git a/src/express-middleware/sitemap.js b/src/express-middleware/sitemap.js index 8e9e6724f5..8b05884bb3 100644 --- a/src/express-middleware/sitemap.js +++ b/src/express-middleware/sitemap.js @@ -1,11 +1,34 @@ import express from 'express'; -import { generateSitemap } from '@plone/volto/helpers'; +import { + generateSitemap, + generateSitemapIndex, + SITEMAP_BATCH_SIZE, +} from '@plone/volto/helpers/Sitemap/Sitemap'; export const sitemap = function (req, res, next) { - generateSitemap(req).then((sitemap) => { + let start = 0; + let size = undefined; + const { batch: batchStr } = req.params; + if (batchStr !== undefined) { + const batch = parseInt(batchStr); + if (isNaN(batch) || batch === 0 || '' + batch !== batchStr) { + res.status(404); + // Some data, such as the internal API address, may be sensitive to be published + res.send( + `Invalid sitemap name, use sitemap.xml.gz, or batched sitemapN.xml.gz where N is a positive integer.`, + ); + return; + } + start = SITEMAP_BATCH_SIZE * (batch - 1); + size = SITEMAP_BATCH_SIZE; + } + generateSitemap(req, start, size).then((sitemap) => { if (Buffer.isBuffer(sitemap)) { 
res.set('Content-Type', 'application/x-gzip'); - res.set('Content-Disposition', 'attachment; filename="sitemap.xml.gz"'); + res.set( + 'Content-Disposition', + `attachment; filename="sitemap${batchStr || ''}.xml.gz"`, + ); res.send(sitemap); } else { // {"errno":-111, "code":"ECONNREFUSED", "host": ...} @@ -16,10 +39,20 @@ export const sitemap = function (req, res, next) { }); }; +export const sitemapIndex = function (req, res, next) { + generateSitemapIndex(req).then((sitemapIndex) => { + res.set('Content-Type', 'application/xml'); + res.set('Content-Disposition', 'attachment; filename="sitemap-index.xml"'); + res.send(sitemapIndex); + }); +}; + export default function () { const middleware = express.Router(); middleware.all('**/sitemap.xml.gz', sitemap); + middleware.all('**/sitemap:batch.xml.gz', sitemap); + middleware.all('**/sitemap-index.xml', sitemapIndex); middleware.id = 'sitemap.xml.gz'; return middleware; } diff --git a/src/helpers/Robots/Robots.js b/src/helpers/Robots/Robots.js index 040e4b2ea0..38a7e5527f 100644 --- a/src/helpers/Robots/Robots.js +++ b/src/helpers/Robots/Robots.js @@ -15,12 +15,9 @@ import { addHeadersFactory } from '@plone/volto/helpers/Proxy/Proxy'; */ export const generateRobots = (req) => new Promise((resolve) => { - //const url = `${req.protocol}://${req.get('Host')}`; - const request = superagent.get( - `${ - config.settings.internalApiPath ?? config.settings.apiPath - }/robots.txt`, - ); + const internalUrl = + config.settings.internalApiPath ?? config.settings.apiPath; + const request = superagent.get(`${internalUrl}/robots.txt`); request.set('Accept', 'text/plain'); const authToken = req.universalCookies.get('auth_token'); if (authToken) { @@ -31,6 +28,12 @@ export const generateRobots = (req) => if (error) { resolve(text || error); } else { + // Plone has probably returned the sitemap link with the internal url. + // If so, let's replace it with the current one. 
+ const url = `${req.protocol}://${req.get('Host')}`; + text = text.replace(internalUrl, url); + // Replace the sitemap with the sitemap index. + text = text.replace('sitemap.xml.gz', 'sitemap-index.xml'); resolve(text); } }); diff --git a/src/helpers/Sitemap/Sitemap.js b/src/helpers/Sitemap/Sitemap.js index ac2629e61f..64cc8dadb2 100644 --- a/src/helpers/Sitemap/Sitemap.js +++ b/src/helpers/Sitemap/Sitemap.js @@ -11,19 +11,23 @@ import { addHeadersFactory } from '@plone/volto/helpers/Proxy/Proxy'; import config from '@plone/volto/registry'; +export const SITEMAP_BATCH_SIZE = 5000; + /** * Generate sitemap * @function generateSitemap * @param {Object} _req Request object * @return {string} Generated sitemap */ -export const generateSitemap = (_req) => +export const generateSitemap = (_req, start = 0, size = undefined) => new Promise((resolve) => { const { settings } = config; const APISUFIX = settings.legacyTraverse ? '' : '/++api++'; const apiPath = settings.internalApiPath ?? settings.apiPath; const request = superagent.get( - `${apiPath}${APISUFIX}/@search?metadata_fields=modified&b_size=100000000&use_site_search_settings=1`, + `${apiPath}${APISUFIX}/@search?metadata_fields=modified&b_start=${start}&b_size=${ + size !== undefined ? size : 100000000 + }&use_site_search_settings=1`, ); request.set('Accept', 'application/json'); request.use(addHeadersFactory(_req)); @@ -50,3 +54,41 @@ export const generateSitemap = (_req) => } }); }); + +/** + * Generate sitemap index + * @function generateSitemapIndex + * @param {Object} _req Request object + * @return {string} Generated sitemap index + */ +export const generateSitemapIndex = (_req) => + new Promise((resolve) => { + const { settings } = config; + const APISUFIX = settings.legacyTraverse ? '' : '/++api++'; + const apiPath = settings.internalApiPath ?? 
settings.apiPath; + const request = superagent.get( + `${apiPath}${APISUFIX}/@search?metadata_fields=modified&b_size=0&use_site_search_settings=1`, + ); + request.set('Accept', 'application/json'); + const authToken = _req.universalCookies.get('auth_token'); + if (authToken) { + request.set('Authorization', `Bearer ${authToken}`); + } + request.end((error, { body } = {}) => { + if (error) { + resolve(body || error); + } else { + const items = Array.from( + { length: Math.ceil(body.items_total / SITEMAP_BATCH_SIZE) }, + (_, i) => + ` <sitemap> + <loc>${toPublicURL('/sitemap' + (i + 1) + '.xml.gz')}</loc> + </sitemap>`, + ); + const result = `<?xml version="1.0" encoding="UTF-8"?> +<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> +${items.join('\n')}\n</sitemapindex>`; + resolve(result); + } + }); + });