This repository has been archived by the owner on Jan 7, 2018. It is now read-only.

Really fix the rarely seen 503 Varnish errors (pretty sure this time).
Disabling the server-side keepalive connections between Varnish and the
nginx backend turned out to be the key to preventing Varnish from trying
to reuse stale connections and sometimes generating 503s after nginx was
reloaded.

This reverts some earlier attempts from 60d694d.

This solution seems to actually fix the problem, and it holds up under 2
hours of our long-running "grunt multiLongConnectionDrops" task.

See 18F/api.data.gov#207
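
In short, the fix boils down to a single VCL directive, paired with dropping
the keepalive_timeout override on the nginx side. A condensed sketch of the
Varnish half (the full, commented version is in the templates/etc/varnish.vcl.hbs
diff below):

    # Send "Connection: Close" on every backend request so nginx closes the
    # connection after responding; Varnish then opens a fresh connection for
    # each fetch instead of reusing one that an nginx reload may have killed.
    sub vcl_backend_fetch {
      set bereq.http.Connection = "Close";
    }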
GUI committed Apr 5, 2015
1 parent 4a3bb3e commit 2e9007c
Showing 6 changed files with 58 additions and 119 deletions.
22 changes: 14 additions & 8 deletions Gruntfile.js
@@ -51,23 +51,26 @@ module.exports = function(grunt) {
         exec = require('child_process').exec;
 
     async.timesSeries(20, function(index, next) {
-      process.stdout.write('Run ' + (index + 1) + ' ');
+      var runNum = index + 1;
+      process.stdout.write('Run ' + runNum + ' ');
       var progress = setInterval(function() {
         process.stdout.write('.');
       }, 5000);
 
       var startTime = process.hrtime();
-      exec('./node_modules/.bin/grunt 2>&1', function(error, stdout) {
+      exec('./node_modules/.bin/grunt 2>&1', function(error, stdout, stderr) {
         clearInterval(progress);
 
         var duration = process.hrtime(startTime);
         console.info(' ' + duration[0] + 's');
 
         if(error !== null) {
-          console.info(stdout);
+          console.info('Run ' + runNum + ' encountered an error');
+          console.info('STDOUT: ', stdout);
+          console.info('STDERR: ', stderr);
         }
 
-        next();
+        next(error);
       });
     }, function(error) {
       done(error);
@@ -81,24 +84,27 @@ module.exports = function(grunt) {
         exec = require('child_process').exec;
 
     async.timesSeries(20, function(index, next) {
-      process.stdout.write('Run ' + (index + 1) + ' ');
+      var runNum = index + 1;
+      process.stdout.write('Run ' + runNum + ' ');
       var progress = setInterval(function() {
         process.stdout.write('.');
       }, 5000);
 
       var startTime = process.hrtime();
       process.env.CONNECTION_DROPS_DURATION = 10 * 60;
-      exec('./node_modules/.bin/mocha test/integration/dns.js -g "handles ip changes without dropping any connections" 2>&1', function(error, stdout) {
+      exec('./node_modules/.bin/mocha test/integration/dns.js -g "handles ip changes without dropping any connections" 2>&1', function(error, stdout, stderr) {
         clearInterval(progress);
 
         var duration = process.hrtime(startTime);
         console.info(' ' + duration[0] + 's');
 
         if(error !== null) {
-          console.info(stdout);
+          console.info('Run ' + runNum + ' encountered an error');
+          console.info('STDOUT: ', stdout);
+          console.info('STDERR: ', stderr);
         }
 
-        next();
+        next(error);
       });
     }, function(error) {
       done(error);
107 changes: 2 additions & 105 deletions lib/config_reloader/worker.js
@@ -368,14 +368,8 @@ _.extend(Worker.prototype, {
     if(nginxConfig.changed) {
       this.emit('nginx');
       this.reloadNginx(function(error) {
-        if(error) {
-          return writeNginxConfigCallback(error);
-        }
-
-        this.reloadVarnish(function(error) {
-          writeNginxConfigCallback(error);
-        });
-      }.bind(this));
+        writeNginxConfigCallback(error);
+      });
     }
   },
 
@@ -400,103 +394,6 @@ _.extend(Worker.prototype, {
     });
   },
 
-  // Reload varnish after nginx gets reload.. This shouldn't really be
-  // necessary, but this seems to help with rare 503 errors when nginx gets
-  // reloaded due to DNS changes: https://github.com/18F/api.data.gov/issues/207
-  //
-  // While this seems to help with the 503s, I'm still not convinced it totally
-  // solves the problem, I think it may just make the 503s more rare. The issue
-  // is that Varnish holds open persistent keepalive connections to the
-  // backend. But when nginx gets reloaded, these sockets need to be re-opened.
-  // Varnish is supposed to retry once on failures if the sockets was closed,
-  // but for some reason we still randomly get 503s. In the logs these seem to
-  // be the relevant errors:
-  //
-  // http first read error: EOF
-  // Backend fetch failed
-
-  // I don't think we're the only one experiencing these problems with Varnish:
-  // http://comments.gmane.org/gmane.comp.web.varnish.misc/3621
-  // http://comments.gmane.org/gmane.comp.web.varnish.misc/8566
-  // Potential patch: http://comments.gmane.org/gmane.comp.web.varnish.dev/2352
-  //
-  // Things to still explore: the patch above, disabling keepalive connections
-  // (but maybe just between Varnish & nginx, which might lessen the impact),
-  // or perhaps this might be another plus for TrafficServer.
-  //
-  // We have a basic test to try to catch this in our integration test suite,
-  // but to really test this, the tests sometimes have to be run for much
-  // longer. See the "grunt multiLongConnectionDrops" task and let that run for
-  // 30-60 minutes to try to reproduce this.
-  reloadVarnish: function(callback) {
-    logger.info('Reloading varnish...');
-
-    var newConfigName = 'reload_' + new Date().toISOString();
-    var configPath = path.join(config.get('etc_dir'), 'varnish.vcl');
-    var varnishDir = path.join(config.get('run_dir'), 'varnish/api-umbrella');
-    var errorMessage;
-
-    execFile('varnishadm', ['-n', varnishDir, 'vcl.list'], { env: processEnv.env() }, function(error, vclList, stderr) {
-      if(error) {
-        errorMessage = 'varnishadm vcl.list failed';
-        logger.error({ err: error, stdout: vclList, stderr: stderr }, errorMessage);
-        return callback(errorMessage);
-      }
-
-      if(_.contains(vclList, newConfigName)) {
-        errorMessage = 'varnishadm vcl ' + newConfigName + ' already in use';
-        logger.error({ err: error, stdout: vclList, stderr: stderr }, errorMessage);
-        return callback(errorMessage);
-      }
-
-      execFile('varnishadm', ['-n', varnishDir, 'vcl.load', newConfigName, configPath], { env: processEnv.env() }, function(error, stdout, stderr) {
-        if(error) {
-          errorMessage = 'varnishadm vcl.load failed';
-          logger.error({ err: error, stdout: stdout, stderr: stderr }, errorMessage);
-          return callback(errorMessage);
-        }
-
-        execFile('varnishadm', ['-n', varnishDir, 'vcl.use', newConfigName], { env: processEnv.env() }, function(error, stdout, stderr) {
-          if(error) {
-            errorMessage = 'varnishadm vcl.use failed';
-            logger.error({ err: error, stdout: stdout, stderr: stderr }, errorMessage);
-            return callback(errorMessage);
-          }
-
-          var vcls = _.map(vclList.trim().split('\n'), function(vclRow) {
-            var parts = vclRow.split(/\s+/);
-            return {
-              status: parts[0],
-              threadCount: parts[1],
-              name: parts[2],
-            };
-          });
-
-          // Delete all but the last 25 VCLs.
-          var deleteVcls = vcls.slice(0, -25);
-          async.eachSeries(deleteVcls, function(vcl, eachCallback) {
-            // Only delete if it's not the active VCL and it has no threads
-            // still using it.
-            if(vcl.status === 'available' && vcl.threadCount === '0') {
-              execFile('varnishadm', ['-n', varnishDir, 'vcl.discard', vcl.name], { env: processEnv.env() }, function(error, stdout, stderr) {
-                if(error) {
-                  logger.warn({ err: error, stdout: stdout, stderr: stderr }, 'varnishadm vcl.discard ' + vcl.name + ' failed');
-                }
-
-                eachCallback();
-              });
-            } else {
-              eachCallback();
-            }
-          }, function() {
-            logger.info({ stdout: stdout, stderr: stderr }, 'varnish reload complete');
-            callback();
-          });
-        });
-      });
-    });
-  },
-
   handleWriteConfigs: function(error) {
     if(error) {
       logger.error({ err: error }, 'Error writing configs');
2 changes: 0 additions & 2 deletions templates/etc/nginx/router.conf.hbs
@@ -201,8 +201,6 @@ http {
     proxy_http_version 1.1;
     proxy_set_header Connection "";
 
-    keepalive_timeout 10s;
-
     proxy_set_header Host $host;
     proxy_set_header X-Api-Umbrella-Backend-Scheme "";
     proxy_set_header X-Api-Umbrella-Backend-Id "";
30 changes: 26 additions & 4 deletions templates/etc/varnish.vcl.hbs
@@ -6,10 +6,6 @@ backend default {
   .host = "{{varnish.backend.host}}";
   .port = "{{varnish.backend.port}}";
 
-  # Increase the connect timeout to help prevent possible 503 errors if the
-  # nginx backend is reloading.
-  .connect_timeout = 10s;
-
   # Define a timeout in Varnish longer than the 60 second timeout defined in
   # nginx. This works around the issue of Varnish sending a second request to
   # the backend when the backend times out:
@@ -82,6 +78,32 @@ sub vcl_hash {
   }
 }
 
+sub vcl_backend_fetch {
+  # Disable keep-alive between Varnish and the backend nginx server. This fixes
+  # rare, but repeatable 503 errors Varnish generates if the nginx backend is
+  # being reloaded (as happens during our DNS changes). These are caused
+  # because Varnish holds open connections to the backend servers, but then
+  # doesn't always handle things properly when the nginx backend gets reloaded.
+  #
+  # While having keep-alive would be ideal, this shouldn't be a huge loss,
+  # since it just affects the localhost connections between Varnish and the
+  # nginx backend. End-user clients can still hold keep-alive connections
+  # against the initial nginx server. And the backend nginx server holds
+  # keep-alive connections to the API backend servers.
+  #
+  # We have a basic test to try to catch this in our integration test suite,
+  # but to really test this, the tests sometimes have to be run for much
+  # longer. See the "grunt multiLongConnectionDrops" task and let that run for
+  # 30-60 minutes to reproduce this if considering a change in behavior.
+  #
+  # See:
+  # https://github.com/18F/api.data.gov/issues/207
+  # http://comments.gmane.org/gmane.comp.web.varnish.misc/8566
+  # http://comments.gmane.org/gmane.comp.web.varnish.misc/3621
+  # http://comments.gmane.org/gmane.comp.web.varnish.dev/2352
+  set bereq.http.Connection = "Close";
+}
+
 sub vcl_backend_response {
   # Don't cache error pages.
   #
12 changes: 12 additions & 0 deletions test/integration/proxying.js
@@ -5,6 +5,7 @@ require('../test_helper');
 var _ = require('lodash'),
     async = require('async'),
     Curler = require('curler').Curler,
+    execFile = require('child_process').execFile,
     Factory = require('factory-lady'),
     fs = require('fs'),
     http = require('http'),
@@ -292,6 +293,17 @@ describe('proxying', function() {
     });
   });
 
+  describe('client-side keep alive', function() {
+    it('reuses connections', function(done) {
+      var url = 'http://localhost:9080/hello/?api_key=' + this.apiKey;
+      execFile('curl', ['-v', url, url], function(error, stdout, stderr) {
+        stdout.should.eql('Hello WorldHello World');
+        stderr.should.match(/200 OK[\s\S]+Re-using existing connection[\s\S]+200 OK/);
+        done();
+      });
+    });
+  });
+
   describe('server-side keep alive', function() {
     it('keeps 10 idle keepalive connections (per nginx worker) opened to the backend', function(done) {
       this.timeout(30000);
4 changes: 4 additions & 0 deletions test/support/example_backend_app.js
@@ -31,6 +31,10 @@ app.use(multer({
   },
 }));
 
+app.get('/hello', function(req, res) {
+  res.send('Hello World');
+});
+
 app.all('/info/*', function(req, res) {
   var rawUrl = req.protocol + '://' + req.hostname + req.url;
   res.set('X-Received-Method', req.method);
