Skip to content

Commit

Permalink
Generate a split sitemap (also fix robots.txt) (#4639) (#4701)
Browse files Browse the repository at this point in the history
Co-authored-by: Balázs Reé <ree@greenfinity.hu>
  • Loading branch information
sneridagh and reebalazs authored Apr 14, 2023
1 parent a9c7c57 commit 73dcddd
Show file tree
Hide file tree
Showing 4 changed files with 90 additions and 11 deletions.
1 change: 1 addition & 0 deletions news/4638.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Generate a split sitemap @reebalazs
39 changes: 36 additions & 3 deletions src/express-middleware/sitemap.js
Original file line number Diff line number Diff line change
@@ -1,11 +1,34 @@
import express from 'express';
import { generateSitemap } from '@plone/volto/helpers';
import {
generateSitemap,
generateSitemapIndex,
SITEMAP_BATCH_SIZE,
} from '@plone/volto/helpers/Sitemap/Sitemap';

export const sitemap = function (req, res, next) {
generateSitemap(req).then((sitemap) => {
let start = 0;
let size = undefined;
const { batch: batchStr } = req.params;
if (batchStr !== undefined) {
const batch = parseInt(batchStr);
if (isNaN(batch) || batch === 0 || '' + batch !== batchStr) {
res.status(404);
// Some data, such as the internal API address, may be sensitive to be published
res.send(
`Invalid sitemap name, use sitemap.xml.gz, or batched sitemapN.xml.gz where N is a positive integer.`,
);
return;
}
start = SITEMAP_BATCH_SIZE * (batch - 1);
size = SITEMAP_BATCH_SIZE;
}
generateSitemap(req, start, size).then((sitemap) => {
if (Buffer.isBuffer(sitemap)) {
res.set('Content-Type', 'application/x-gzip');
res.set('Content-Disposition', 'attachment; filename="sitemap.xml.gz"');
res.set(
'Content-Disposition',
`attachment; filename="sitemap${batchStr || ''}.xml.gz"`,
);
res.send(sitemap);
} else {
// {"errno":-111, "code":"ECONNREFUSED", "host": ...}
Expand All @@ -16,10 +39,20 @@ export const sitemap = function (req, res, next) {
});
};

/**
 * Express handler that serves the sitemap index as an XML attachment.
 *
 * `generateSitemapIndex` resolves (rather than rejects) with the error or the
 * error response body when the backend request fails, so any non-string
 * result is treated as a failure and answered with a generic 500 instead of
 * being sent to the client as if it were the index.
 * @function sitemapIndex
 * @param {Object} req Request object
 * @param {Object} res Response object
 * @param {Function} next Called with the error if generation or sending throws
 * @return {Promise} Settles once the response has been handled
 */
export const sitemapIndex = function (req, res, next) {
  return generateSitemapIndex(req)
    .then((result) => {
      if (typeof result !== 'string') {
        res.status(500);
        // Only the error code and name are reported: the rest of the error
        // (such as the internal API address) may be too sensitive to publish.
        res.send(
          `Sitemap index generation error: ${result?.code ?? '-'} ${
            result?.name ?? '-'
          }`,
        );
        return;
      }
      res.set('Content-Type', 'application/xml');
      res.set(
        'Content-Disposition',
        'attachment; filename="sitemap-index.xml"',
      );
      res.send(result);
    })
    .catch(next);
};

/**
 * Build the Express router that exposes the sitemap endpoints.
 *
 * Registers, in order, the full sitemap, the batched sitemaps
 * (`sitemap<batch>.xml.gz`) and the sitemap index, then tags the router
 * with the id `sitemap.xml.gz`.
 * @return {Object} The configured Express router
 */
export default function () {
  const router = express.Router();

  // [route pattern, handler] pairs, registered in this order.
  const routes = [
    ['**/sitemap.xml.gz', sitemap],
    ['**/sitemap:batch.xml.gz', sitemap],
    ['**/sitemap-index.xml', sitemapIndex],
  ];
  for (const [pattern, handler] of routes) {
    router.all(pattern, handler);
  }

  router.id = 'sitemap.xml.gz';
  return router;
}
15 changes: 9 additions & 6 deletions src/helpers/Robots/Robots.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,9 @@ import { addHeadersFactory } from '@plone/volto/helpers/Proxy/Proxy';
*/
export const generateRobots = (req) =>
new Promise((resolve) => {
//const url = `${req.protocol}://${req.get('Host')}`;
const request = superagent.get(
`${
config.settings.internalApiPath ?? config.settings.apiPath
}/robots.txt`,
);
const internalUrl =
config.settings.internalApiPath ?? config.settings.apiPath;
const request = superagent.get(`${internalUrl}/robots.txt`);
request.set('Accept', 'text/plain');
const authToken = req.universalCookies.get('auth_token');
if (authToken) {
Expand All @@ -31,6 +28,12 @@ export const generateRobots = (req) =>
if (error) {
resolve(text || error);
} else {
// Plone has probably returned the sitemap link with the internal url.
// If so, let's replace it with the current one.
const url = `${req.protocol}://${req.get('Host')}`;
text = text.replace(internalUrl, url);
// Replace the sitemap with the sitemap index.
text = text.replace('sitemap.xml.gz', 'sitemap-index.xml');
resolve(text);
}
});
Expand Down
46 changes: 44 additions & 2 deletions src/helpers/Sitemap/Sitemap.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,23 @@ import { addHeadersFactory } from '@plone/volto/helpers/Proxy/Proxy';

import config from '@plone/volto/registry';

export const SITEMAP_BATCH_SIZE = 5000;

/**
* Generate sitemap
* @function generateSitemap
* @param {Object} _req Request object
* @return {string} Generated sitemap
*/
export const generateSitemap = (_req) =>
export const generateSitemap = (_req, start = 0, size = undefined) =>
new Promise((resolve) => {
const { settings } = config;
const APISUFIX = settings.legacyTraverse ? '' : '/++api++';
const apiPath = settings.internalApiPath ?? settings.apiPath;
const request = superagent.get(
`${apiPath}${APISUFIX}/@search?metadata_fields=modified&b_size=100000000&use_site_search_settings=1`,
`${apiPath}${APISUFIX}/@search?metadata_fields=modified&b_start=${start}&b_size=${
size !== undefined ? size : 100000000
}&use_site_search_settings=1`,
);
request.set('Accept', 'application/json');
request.use(addHeadersFactory(_req));
Expand All @@ -50,3 +54,41 @@ export const generateSitemap = (_req) =>
}
});
});

/**
 * Generate sitemap index
 *
 * Asks the backend search endpoint for the total number of items and emits
 * one `<sitemap>` entry per batch of SITEMAP_BATCH_SIZE items, pointing at
 * `/sitemap1.xml.gz`, `/sitemap2.xml.gz`, and so on.
 * @function generateSitemapIndex
 * @param {Object} _req Request object
 * @return {Promise} Resolves with the sitemap index XML string. If the
 *   backend request fails, resolves with the error response body or the
 *   error object instead (it does not reject for that case).
 */
export const generateSitemapIndex = (_req) =>
  new Promise((resolve) => {
    const { settings } = config;
    const APISUFIX = settings.legacyTraverse ? '' : '/++api++';
    const apiPath = settings.internalApiPath ?? settings.apiPath;
    // b_size=0: only `items_total` is read from the response below.
    const request = superagent.get(
      `${apiPath}${APISUFIX}/@search?metadata_fields=modified&b_size=0&use_site_search_settings=1`,
    );
    request.set('Accept', 'application/json');
    const authToken = _req.universalCookies.get('auth_token');
    if (authToken) {
      request.set('Authorization', `Bearer ${authToken}`);
    }
    request.end((error, { body } = {}) => {
      if (error) {
        resolve(body || error);
        return;
      }
      // A missing body or a non-numeric total yields an empty index instead
      // of throwing inside the callback.
      const total = Number(body?.items_total);
      const batchCount =
        Number.isFinite(total) && total > 0
          ? Math.ceil(total / SITEMAP_BATCH_SIZE)
          : 0;
      const items = Array.from(
        { length: batchCount },
        (_, i) =>
          `  <sitemap>
    <loc>${toPublicURL(`/sitemap${i + 1}.xml.gz`)}</loc>
  </sitemap>`,
      );
      const result = `<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
${items.join('\n')}\n</sitemapindex>`;
      resolve(result);
    });
  });

0 comments on commit 73dcddd

Please sign in to comment.