Skip to content

Commit

Permalink
new_audit(blocked-from-indexing): page is blocked from indexing (#3657)
Browse files Browse the repository at this point in the history
  • Loading branch information
kdzwinel authored and brendankenny committed Nov 8, 2017
1 parent 8a1af52 commit fb2cb02
Show file tree
Hide file tree
Showing 8 changed files with 360 additions and 6 deletions.
1 change: 1 addition & 0 deletions lighthouse-cli/test/fixtures/seo/seo-failure-cases.html
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
<meta charset="utf-8">
<meta name="viewport" content="invalid-content=should_have_looked_it_up">
<!-- no <meta name="description" content=""> -->
<meta name="robots" content="nofollow, NOINDEX, all">
</head>
<body>
<h1>SEO</h1>
Expand Down
23 changes: 19 additions & 4 deletions lighthouse-cli/test/fixtures/static-server.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ const path = require('path');
const fs = require('fs');
const parseQueryString = require('querystring').parse;
const parseURL = require('url').parse;
const HEADER_SAFELIST = new Set(['x-robots-tag']);

const lhRootDirPath = path.join(__dirname, '../../../');

Expand Down Expand Up @@ -51,13 +52,14 @@ function requestHandler(request, response) {
}

function sendResponse(statusCode, data) {
let headers;
const headers = {};

if (filePath.endsWith('.js')) {
headers = {'Content-Type': 'text/javascript'};
headers['Content-Type'] = 'text/javascript';
} else if (filePath.endsWith('.css')) {
headers = {'Content-Type': 'text/css'};
headers['Content-Type'] = 'text/css';
} else if (filePath.endsWith('.svg')) {
headers = {'Content-Type': 'image/svg+xml'};
headers['Content-Type'] = 'image/svg+xml';
}

let delay = 0;
Expand All @@ -72,6 +74,19 @@ function requestHandler(request, response) {
delay = parseInt(queryString.delay, 10) || 2000;
}

if (typeof queryString.extra_header !== 'undefined') {
let extraHeaders = queryString.extra_header;
extraHeaders = Array.isArray(extraHeaders) ? extraHeaders : [extraHeaders];

extraHeaders.forEach(header => {
const [headerName, ...headerValue] = header.split(':');

if (HEADER_SAFELIST.has(headerName.toLowerCase())) {
headers[headerName] = headerValue.join(':');
}
});
}

// redirect url to new url if present
if (typeof queryString.redirect !== 'undefined') {
return setTimeout(sendRedirect, delay, queryString.redirect);
Expand Down
15 changes: 13 additions & 2 deletions lighthouse-cli/test/smokehouse/seo/expectations.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,14 @@ module.exports = [
'link-text': {
score: true,
},
'is-crawlable': {
score: true,
},
},
},
{
initialUrl: 'http://localhost:10200/seo/seo-failure-cases.html?status_code=403',
url: 'http://localhost:10200/seo/seo-failure-cases.html?status_code=403',
initialUrl: 'http://localhost:10200/seo/seo-failure-cases.html?status_code=403&extra_header=x-robots-tag:none',
url: 'http://localhost:10200/seo/seo-failure-cases.html?status_code=403&extra_header=x-robots-tag:none',
audits: {
'viewport': {
score: false,
Expand Down Expand Up @@ -61,6 +64,14 @@ module.exports = [
},
},
},
'is-crawlable': {
score: false,
details: {
items: {
length: 2,
},
},
},
},
},
];
112 changes: 112 additions & 0 deletions lighthouse-core/audits/seo/is-crawlable.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
/**
* @license Copyright 2017 Google Inc. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/
'use strict';

const Audit = require('../audit');
// Directives that, on their own, tell crawlers not to index the page.
const BLOCKLIST = new Set(['noindex', 'none']);
// Name of the HTTP response header that carries robots directives (compared in lower case).
const ROBOTS_HEADER = 'x-robots-tag';
// Name of the directive that blocks indexing only once its date has passed.
const UNAVAILABLE_AFTER = 'unavailable_after';

/**
 * Tells whether a directive is an `unavailable_after:<date>` directive whose date
 * is parseable and already lies in the past.
 * The directive name is compared as-is, so the caller is expected to pass a
 * lower-cased, trimmed directive.
 * @param {string} directive
 * @returns {boolean}
 */
function isUnavailable(directive) {
  const [name, ...dateParts] = directive.split(':');

  // Without a colon there is no date, and any other name is a different directive.
  if (dateParts.length === 0 || name !== UNAVAILABLE_AFTER) {
    return false;
  }

  // The date itself may contain colons (e.g. `15:00:00`), so stitch it back together.
  const timestamp = Date.parse(dateParts.join(':'));

  if (isNaN(timestamp)) {
    return false;
  }

  return timestamp < Date.now();
}

/**
 * Checks a comma-separated list of robots directives and reports whether at least
 * one of them prevents the page from being indexed.
 * Each directive is lower-cased and trimmed before it is checked.
 * @param {string} directives
 * @returns {boolean}
 */
function hasBlockingDirective(directives) {
  for (const rawDirective of directives.split(',')) {
    const directive = rawDirective.toLowerCase().trim();

    if (BLOCKLIST.has(directive) || isUnavailable(directive)) {
      return true;
    }
  }

  return false;
}

/**
 * Tells whether a robots header value is addressed to a specific user agent
 * (e.g. `googlebot: noindex`) rather than to every crawler.
 * @param {string} directives
 * @returns {boolean}
 */
function hasUserAgent(directives) {
  // A leading `<token>:` (no comma or colon inside the token) is either a user agent
  // prefix such as `googlebot:`, `googlebot-news:`, `otherbot:` or a directive name.
  const prefixMatch = directives.match(/^([^,:]+):/);

  if (!prefixMatch) {
    return false;
  }

  // `unavailable_after:` is a valid directive, not a user agent.
  const prefix = prefixMatch[1].toLowerCase();
  return prefix !== UNAVAILABLE_AFTER;
}

/**
 * Audit that fails when the robots meta tag or an `x-robots-tag` response header
 * of the main resource contains a directive that blocks indexing.
 */
class IsCrawlable extends Audit {
  /**
   * @return {!AuditMeta}
   */
  static get meta() {
    return {
      name: 'is-crawlable',
      description: 'Page isn’t blocked from indexing',
      failureDescription: 'Page is blocked from indexing',
      helpText: 'The "Robots" directives tell crawlers how your content should be indexed. ' +
          '[Learn more](https://developers.google.com/search/reference/robots_meta_tag).',
      requiredArtifacts: ['MetaRobots'],
    };
  }

  /**
   * Collects every blocking directive found in the robots meta tag and in the
   * main resource's robots headers; the audit passes only when none is found.
   * @param {!Artifacts} artifacts
   * @return {!AuditResult} Resolved asynchronously, once the main resource is available
   */
  static audit(artifacts) {
    const devtoolsLog = artifacts.devtoolsLogs[Audit.DEFAULT_PASS];

    return artifacts.requestMainResource(devtoolsLog).then(mainResource => {
      const blockingDirectives = [];
      const metaRobotsContent = artifacts.MetaRobots;

      // Source 1: the content of the robots meta tag, when the gatherer found one.
      if (metaRobotsContent && hasBlockingDirective(metaRobotsContent)) {
        blockingDirectives.push({
          source: {
            type: 'node',
            snippet: `<meta name="robots" content="${artifacts.MetaRobots}" />`,
          },
        });
      }

      // Source 2: robots response headers. Headers addressed to a specific
      // user agent (e.g. `googlebot: noindex`) are skipped.
      mainResource.responseHeaders.forEach(header => {
        const isRobotsHeader = header.name.toLowerCase() === ROBOTS_HEADER;

        if (isRobotsHeader && !hasUserAgent(header.value) && hasBlockingDirective(header.value)) {
          blockingDirectives.push({source: `${header.name}: ${header.value}`});
        }
      });

      const headings = [
        {key: 'source', itemType: 'code', text: 'Source'},
      ];

      return {
        rawValue: blockingDirectives.length === 0,
        details: Audit.makeTableDetails(headings, blockingDirectives),
      };
    });
  }
}

module.exports = IsCrawlable;
4 changes: 4 additions & 0 deletions lighthouse-core/config/default.js
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ module.exports = {
'dobetterweb/tags-blocking-first-paint',
'dobetterweb/websql',
'seo/meta-description',
'seo/crawlable-links',
'seo/meta-robots',
],
},
{
Expand Down Expand Up @@ -148,6 +150,8 @@ module.exports = {
'dobetterweb/uses-passive-event-listeners',
'seo/meta-description',
'seo/http-status-code',
'seo/link-text',
'seo/is-crawlable',
],

groups: {
Expand Down
3 changes: 3 additions & 0 deletions lighthouse-core/config/seo.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,14 @@ module.exports = {
gatherers: [
'seo/meta-description',
'seo/crawlable-links',
'seo/meta-robots',
],
}],
audits: [
'seo/meta-description',
'seo/http-status-code',
'seo/link-text',
'seo/is-crawlable',
],
groups: {
'seo-mobile': {
Expand All @@ -44,6 +46,7 @@ module.exports = {
{id: 'meta-description', weight: 1, group: 'seo-content'},
{id: 'http-status-code', weight: 1, group: 'seo-crawl'},
{id: 'link-text', weight: 1, group: 'seo-content'},
{id: 'is-crawlable', weight: 1, group: 'seo-crawl'},
],
},
},
Expand Down
23 changes: 23 additions & 0 deletions lighthouse-core/gather/gatherers/seo/meta-robots.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/**
* @license Copyright 2017 Google Inc. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/
'use strict';

const Gatherer = require('../gatherer');

/**
 * Gatherer that looks up the robots meta tag in the document head.
 */
class MetaRobots extends Gatherer {
  /**
   * @param {{driver: !Driver}} options Run options
   * @return {!Promise<?string>} The value of the robots meta's content attribute, or the
   *     falsy value the query resolved with (presumably null) when no such tag was found
   */
  afterPass(options) {
    // The `i` flag makes the attribute value match case-insensitive (`ROBOTS`, `Robots`, ...).
    return options.driver
      .querySelector('head meta[name="robots" i]')
      .then(metaNode => {
        if (!metaNode) {
          return metaNode;
        }

        return metaNode.getAttribute('content');
      });
  }
}

module.exports = MetaRobots;
Loading

0 comments on commit fb2cb02

Please sign in to comment.