diff --git a/.gitignore b/.gitignore
index ac2ed4b7e..32eab6e9a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,6 +22,7 @@
 /drush/Commands/contrib/
 /web/profiles/contrib/
 /web/libraries/
+/web/robots.txt
 
 # Ignore configuration files that may contain sensitive information.
 /web/sites/*/settings*.php
diff --git a/composer.json b/composer.json
index a1a87eaf8..0586c86b6 100644
--- a/composer.json
+++ b/composer.json
@@ -66,6 +66,7 @@
         "drupal/redirect": "^1.8",
         "drupal/remove_http_headers": "^2.0",
         "drupal/rest_views": "^3.0",
+        "drupal/robotstxt": "^1.5",
         "drupal/s3fs": "^3.1",
         "drupal/samlauth": "^3.8",
         "drupal/simple_sitemap": "^4.1",
@@ -143,5 +144,13 @@
                 " composer remove drupal/core-project-message"
             ]
         }
+    },
+    "scripts": {
+        "post-install-cmd": [
+            "test -e web/robots.txt && rm web/robots.txt || echo The default robots.txt file has been deleted."
+        ],
+        "post-update-cmd": [
+            "test -e web/robots.txt && rm web/robots.txt || echo The default robots.txt file has been deleted."
+        ]
     }
 }
diff --git a/composer.lock b/composer.lock
index 873562206..66ed21159 100644
--- a/composer.lock
+++ b/composer.lock
@@ -4280,6 +4280,63 @@
                 "issues": "https://www.drupal.org/project/issues/rest_views"
             }
         },
+        {
+            "name": "drupal/robotstxt",
+            "version": "1.5.0",
+            "source": {
+                "type": "git",
+                "url": "https://git.drupalcode.org/project/robotstxt.git",
+                "reference": "8.x-1.5"
+            },
+            "dist": {
+                "type": "zip",
+                "url": "https://ftp.drupal.org/files/projects/robotstxt-8.x-1.5.zip",
+                "reference": "8.x-1.5",
+                "shasum": "08c2b8dcd7b7f7e3bf0f7b7b88f128603d3b11b1"
+            },
+            "require": {
+                "drupal/core": "^9.3 || ^10"
+            },
+            "type": "drupal-module",
+            "extra": {
+                "drupal": {
+                    "version": "8.x-1.5",
+                    "datestamp": "1671555186",
+                    "security-coverage": {
+                        "status": "covered",
+                        "message": "Covered by Drupal's security advisory policy"
+                    }
+                }
+            },
+            "notification-url": "https://packages.drupal.org/8/downloads",
+            "license": [
+                "GPL-2.0-or-later"
+            ],
+            "authors": [
+                {
+                    "name": "hass",
+                    "homepage": "https://www.drupal.org/u/hass"
+                },
+                {
+                    "name": "See other contributors",
+                    "homepage": "https://www.drupal.org/node/53579/committers"
+                },
+                {
+                    "name": "mikeegoulding",
+                    "homepage": "https://www.drupal.org/user/2867877"
+                },
+                {
+                    "name": "Todd Nienkerk",
+                    "homepage": "https://www.drupal.org/user/92096"
+                }
+            ],
+            "description": "Generates the robots.txt file dynamically and gives you the chance to edit it, on a per-site basis, from the web UI.",
+            "homepage": "https://www.drupal.org/project/robotstxt",
+            "support": {
+                "source": "https://git.drupal.org/project/robotstxt.git",
+                "issues": "https://www.drupal.org/project/issues/robotstxt"
+            }
+        },
         {
             "name": "drupal/s3fs",
             "version": "3.5.0",
diff --git a/config/production/robotstxt.settings.yml b/config/production/robotstxt.settings.yml
new file mode 100644
index 000000000..db0d5bb10
--- /dev/null
+++ b/config/production/robotstxt.settings.yml
@@ -0,0 +1,3 @@
+_core:
+  default_config_hash: ceCx5XZ_ay1Mxcv-sB95U_fBKoVkpvo8RaQiwutSZLI
+content: "#\r\n# robots.txt\r\n#\r\n# This file is to prevent the crawling and indexing of certain parts\r\n# of your site by web crawlers and spiders run by sites like Yahoo!\r\n# and Google. By telling these \"robots\" where not to go on your site,\r\n# you save bandwidth and server resources.\r\n#\r\n# This file will be ignored unless it is at the root of your host:\r\n# Used: http://example.com/robots.txt\r\n# Ignored: http://example.com/site/robots.txt\r\n#\r\n# For more information about the robots.txt standard, see:\r\n# http://www.robotstxt.org/robotstxt.html\r\n\r\nUser-agent: *\r\nCrawl-delay: 10\r\n\r\n# Sitemaps\r\nSitemap: https://vote.gov/sitemap.xml\r\n\r\n# CSS, JS, Images\r\nAllow: /*.css$\r\nAllow: /*.js$\r\nAllow: /core/*.css$\r\nAllow: /core/*.css?\r\nAllow: /core/*.js$\r\nAllow: /modules/*.css$\r\nAllow: /modules/*.css?\r\nAllow: /modules/*.js$\r\nAllow: /modules/*.js?\r\nAllow: /modules/*.gif\r\nAllow: /modules/*.jpg\r\nAllow: /modules/*.jpeg\r\nAllow: /modules/*.png\r\nAllow: /themes/*.css$\r\nAllow: /themes/*.css?\r\nAllow: /themes/*.js$\r\nAllow: /themes/*.js?\r\nAllow: /themes/*.gif\r\nAllow: /themes/*.jpg\r\nAllow: /themes/*.jpeg\r\nAllow: /themes/*.png\r\n# Directories\r\nDisallow: /core/\r\nDisallow: /modules/\r\nDisallow: /themes/\r\n"
By telling these \"robots\" where not to go on your site,\r\n# you save bandwidth and server resources.\r\n#\r\n# This file will be ignored unless it is at the root of your host:\r\n# Used: http://example.com/robots.txt\r\n# Ignored: http://example.com/site/robots.txt\r\n#\r\n# For more information about the robots.txt standard, see:\r\n# http://www.robotstxt.org/robotstxt.html\r\n\r\nUser-agent: *\r\nCrawl-delay: 10\r\n\r\n# Sitemaps\r\nSitemap: https://vote.gov/sitemap.xml\r\n\r\n# CSS, JS, Images\r\nAllow: /*.css$\r\nAllow: /*.js$\r\nAllow: /core/*.css$\r\nAllow: /core/*.css?\r\nAllow: /core/*.js$\r\nAllow: /modules/*.css$\r\nAllow: /modules/*.css?\r\nAllow: /modules/*.js$\r\nAllow: /modules/*.js?\r\nAllow: /modules/*.gif\r\nAllow: /modules/*.jpg\r\nAllow: /modules/*.jpeg\r\nAllow: /modules/*.png\r\nAllow: /themes/*.css$\r\nAllow: /themes/*.css?\r\nAllow: /themes/*.js$\r\nAllow: /themes/*.js?\r\nAllow: /themes/*.gif\r\nAllow: /themes/*.jpg\r\nAllow: /themes/*.jpeg\r\nAllow: /themes/*.png\r\n# Directories\r\nDisallow: /core/\r\nDisallow: /modules/\r\nDisallow: /themes/\r\n" diff --git a/config/sync/config_split.config_split.production.yml b/config/sync/config_split.config_split.production.yml index faa7fe327..cfe8510cd 100644 --- a/config/sync/config_split.config_split.production.yml +++ b/config/sync/config_split.config_split.production.yml @@ -16,6 +16,7 @@ theme: { } complete_list: - log_stdout.settings - new_relic_rpm.settings + - robotstxt.settings - s3fs.settings - samlauth.authentication - usagov_login.settings diff --git a/config/sync/core.extension.yml b/config/sync/core.extension.yml index e3c2f126e..4c75a305c 100644 --- a/config/sync/core.extension.yml +++ b/config/sync/core.extension.yml @@ -63,6 +63,7 @@ module: remove_http_headers: 0 rest: 0 rest_views: 0 + robotstxt: 0 serialization: 0 simple_sitemap: 0 system: 0 diff --git a/config/sync/robotstxt.settings.yml b/config/sync/robotstxt.settings.yml new file mode 100644 index 000000000..ff9526806 --- /dev/null +++ b/config/sync/robotstxt.settings.yml @@ -0,0 +1 @@ +content: "#\r\n# robots.txt\r\n#\r\n# This file is to prevent the crawling and indexing of certain parts\r\n# of your site by web crawlers and spiders run by sites like Yahoo!\r\n# and Google. 
By telling these \"robots\" where not to go on your site,\r\n# you save bandwidth and server resources.\r\n#\r\n# This file will be ignored unless it is at the root of your host:\r\n# Used: http://example.com/robots.txt\r\n# Ignored: http://example.com/site/robots.txt\r\n#\r\n# For more information about the robots.txt standard, see:\r\n# http://www.robotstxt.org/robotstxt.html\r\n\r\nUser-agent: *\r\nDisallow: /\r\n" diff --git a/config/sync/user.role.site_builder.yml b/config/sync/user.role.site_builder.yml index cc96ca45c..d426749b6 100644 --- a/config/sync/user.role.site_builder.yml +++ b/config/sync/user.role.site_builder.yml @@ -59,6 +59,7 @@ dependencies: - redirect - remove_http_headers - rest + - robotstxt - simple_sitemap - system - taxonomy @@ -110,6 +111,7 @@ permissions: - 'administer redirect settings' - 'administer redirects' - 'administer rest resources' + - 'administer robots.txt' - 'administer site configuration' - 'administer sitemap settings' - 'administer software updates' diff --git a/web/robots.txt b/web/robots.txt deleted file mode 100644 index 22e58aafe..000000000 --- a/web/robots.txt +++ /dev/null @@ -1,20 +0,0 @@ -# -# robots.txt -# -# This file is to prevent the crawling and indexing of certain parts -# of your site by web crawlers and spiders run by sites like Yahoo! -# and Google. By telling these "robots" where not to go on your site, -# you save bandwidth and server resources. -# -# This file will be ignored unless it is at the root of your host: -# Used: http://example.com/robots.txt -# Ignored: http://example.com/site/robots.txt -# -# For more information about the robots.txt standard, see: -# http://www.robotstxt.org/robotstxt.html - -User-agent: usasearch -Allow: / - -User-agent: * -Disallow: /