From 2e9007c983e930a13db5df6510249ad908634398 Mon Sep 17 00:00:00 2001
From: Nick Muerdter
Date: Sun, 5 Apr 2015 07:12:21 -0600
Subject: [PATCH] Really fix the rarely seen 503 Varnish errors (pretty sure this time).

Disabling the server-side keepalive connection between Varnish and the nginx
backend held the key to preventing Varnish from retrying to reuse bad
connections and sometimes generating 503s (after nginx was reloaded).

This reverts some earlier attempts from
60d694d9297f1f301cea26eae6fa36c2be22c15d

This solution seems to actually fix this problem, and holds up to 2 hours of
our long running "grunt multiLongConnectionDrops" task.

See https://github.com/18F/api.data.gov/issues/207
---
 Gruntfile.js                        |  22 +++---
 lib/config_reloader/worker.js       | 107 +---------------------------
 templates/etc/nginx/router.conf.hbs |   2 -
 templates/etc/varnish.vcl.hbs       |  30 ++++++--
 test/integration/proxying.js        |  12 ++++
 test/support/example_backend_app.js |   4 ++
 6 files changed, 58 insertions(+), 119 deletions(-)

diff --git a/Gruntfile.js b/Gruntfile.js
index 9c74de0..fd0524b 100644
--- a/Gruntfile.js
+++ b/Gruntfile.js
@@ -51,23 +51,26 @@ module.exports = function(grunt) {
         exec = require('child_process').exec;
 
     async.timesSeries(20, function(index, next) {
-      process.stdout.write('Run ' + (index + 1) + ' ');
+      var runNum = index + 1;
+      process.stdout.write('Run ' + runNum + ' ');
       var progress = setInterval(function() {
         process.stdout.write('.');
       }, 5000);
 
       var startTime = process.hrtime();
-      exec('./node_modules/.bin/grunt 2>&1', function(error, stdout) {
+      exec('./node_modules/.bin/grunt 2>&1', function(error, stdout, stderr) {
         clearInterval(progress);
 
         var duration = process.hrtime(startTime);
         console.info(' ' + duration[0] + 's');
 
         if(error !== null) {
-          console.info(stdout);
+          console.info('Run ' + runNum + ' encountered an error');
+          console.info('STDOUT: ', stdout);
+          console.info('STDERR: ', stderr);
         }
 
-        next();
+        next(error);
       });
     }, function(error) {
       done(error);
@@ -81,24 +84,27 @@ module.exports = function(grunt) {
         exec = require('child_process').exec;
 
     async.timesSeries(20, function(index, next) {
-      process.stdout.write('Run ' + (index + 1) + ' ');
+      var runNum = index + 1;
+      process.stdout.write('Run ' + runNum + ' ');
       var progress = setInterval(function() {
         process.stdout.write('.');
       }, 5000);
 
       var startTime = process.hrtime();
       process.env.CONNECTION_DROPS_DURATION = 10 * 60;
-      exec('./node_modules/.bin/mocha test/integration/dns.js -g "handles ip changes without dropping any connections" 2>&1', function(error, stdout) {
+      exec('./node_modules/.bin/mocha test/integration/dns.js -g "handles ip changes without dropping any connections" 2>&1', function(error, stdout, stderr) {
         clearInterval(progress);
 
         var duration = process.hrtime(startTime);
         console.info(' ' + duration[0] + 's');
 
         if(error !== null) {
-          console.info(stdout);
+          console.info('Run ' + runNum + ' encountered an error');
+          console.info('STDOUT: ', stdout);
+          console.info('STDERR: ', stderr);
         }
 
-        next();
+        next(error);
       });
     }, function(error) {
       done(error);
diff --git a/lib/config_reloader/worker.js b/lib/config_reloader/worker.js
index edd22a9..c6b10db 100644
--- a/lib/config_reloader/worker.js
+++ b/lib/config_reloader/worker.js
@@ -368,14 +368,8 @@ _.extend(Worker.prototype, {
     if(nginxConfig.changed) {
      this.emit('nginx');
       this.reloadNginx(function(error) {
-        if(error) {
-          return writeNginxConfigCallback(error);
-        }
-
-        this.reloadVarnish(function(error) {
-          writeNginxConfigCallback(error);
-        });
-      }.bind(this));
+        writeNginxConfigCallback(error);
+      });
     }
   },
 
@@ -400,103 +394,6 @@ _.extend(Worker.prototype, {
     });
   },
 
-  // Reload varnish after nginx gets reload.. This shouldn't really be
-  // necessary, but this seems to help with rare 503 errors when nginx gets
-  // reloaded due to DNS changes: https://github.com/18F/api.data.gov/issues/207
-  //
-  // While this seems to help with the 503s, I'm still not convinced it totally
-  // solves the problem, I think it may just make the 503s more rare. The issue
-  // is that Varnish holds open persistent keepalive connections to the
-  // backend. But when nginx gets reloaded, these sockets need to be re-opened.
-  // Varnish is supposed to retry once on failures if the sockets was closed,
-  // but for some reason we still randomly get 503s. In the logs these seem to
-  // be the relevant errors:
-  //
-  // http first read error: EOF
-  // Backend fetch failed
-  //
-  // I don't think we're the only one experiencing these problems with Varnish:
-  // http://comments.gmane.org/gmane.comp.web.varnish.misc/3621
-  // http://comments.gmane.org/gmane.comp.web.varnish.misc/8566
-  // Potential patch: http://comments.gmane.org/gmane.comp.web.varnish.dev/2352
-  //
-  // Things to still explore: the patch above, disabling keepalive connections
-  // (but maybe just between Varnish & nginx, which might lessen the impact),
-  // or perhaps this might be another plus for TrafficServer.
-  //
-  // We have a basic test to try to catch this in our integration test suite,
-  // but to really test this, the tests sometimes have to be run for much
-  // longer. See the "grunt multiLongConnectionDrops" task and let that run for
-  // 30-60 minutes to try to reproduce this.
-  reloadVarnish: function(callback) {
-    logger.info('Reloading varnish...');
-
-    var newConfigName = 'reload_' + new Date().toISOString();
-    var configPath = path.join(config.get('etc_dir'), 'varnish.vcl');
-    var varnishDir = path.join(config.get('run_dir'), 'varnish/api-umbrella');
-    var errorMessage;
-
-    execFile('varnishadm', ['-n', varnishDir, 'vcl.list'], { env: processEnv.env() }, function(error, vclList, stderr) {
-      if(error) {
-        errorMessage = 'varnishadm vcl.list failed';
-        logger.error({ err: error, stdout: vclList, stderr: stderr }, errorMessage);
-        return callback(errorMessage);
-      }
-
-      if(_.contains(vclList, newConfigName)) {
-        errorMessage = 'varnishadm vcl ' + newConfigName + ' already in use';
-        logger.error({ err: error, stdout: vclList, stderr: stderr }, errorMessage);
-        return callback(errorMessage);
-      }
-
-      execFile('varnishadm', ['-n', varnishDir, 'vcl.load', newConfigName, configPath], { env: processEnv.env() }, function(error, stdout, stderr) {
-        if(error) {
-          errorMessage = 'varnishadm vcl.load failed';
-          logger.error({ err: error, stdout: stdout, stderr: stderr }, errorMessage);
-          return callback(errorMessage);
-        }
-
-        execFile('varnishadm', ['-n', varnishDir, 'vcl.use', newConfigName], { env: processEnv.env() }, function(error, stdout, stderr) {
-          if(error) {
-            errorMessage = 'varnishadm vcl.use failed';
-            logger.error({ err: error, stdout: stdout, stderr: stderr }, errorMessage);
-            return callback(errorMessage);
-          }
-
-          var vcls = _.map(vclList.trim().split('\n'), function(vclRow) {
-            var parts = vclRow.split(/\s+/);
-            return {
-              status: parts[0],
-              threadCount: parts[1],
-              name: parts[2],
-            };
-          });
-
-          // Delete all but the last 25 VCLs.
-          var deleteVcls = vcls.slice(0, -25);
-          async.eachSeries(deleteVcls, function(vcl, eachCallback) {
-            // Only delete if it's not the active VCL and it has no threads
-            // still using it.
-            if(vcl.status === 'available' && vcl.threadCount === '0') {
-              execFile('varnishadm', ['-n', varnishDir, 'vcl.discard', vcl.name], { env: processEnv.env() }, function(error, stdout, stderr) {
-                if(error) {
-                  logger.warn({ err: error, stdout: stdout, stderr: stderr }, 'varnishadm vcl.discard ' + vcl.name + ' failed');
-                }
-
-                eachCallback();
-              });
-            } else {
-              eachCallback();
-            }
-          }, function() {
-            logger.info({ stdout: stdout, stderr: stderr }, 'varnish reload complete');
-            callback();
-          });
-        });
-      });
-    });
-  },
-
   handleWriteConfigs: function(error) {
     if(error) {
       logger.error({ err: error }, 'Error writing configs');
diff --git a/templates/etc/nginx/router.conf.hbs b/templates/etc/nginx/router.conf.hbs
index 035f365..8c824f0 100644
--- a/templates/etc/nginx/router.conf.hbs
+++ b/templates/etc/nginx/router.conf.hbs
@@ -201,8 +201,6 @@ http {
       proxy_http_version 1.1;
       proxy_set_header Connection "";
 
-      keepalive_timeout 10s;
-
       proxy_set_header Host $host;
       proxy_set_header X-Api-Umbrella-Backend-Scheme "";
       proxy_set_header X-Api-Umbrella-Backend-Id "";
diff --git a/templates/etc/varnish.vcl.hbs b/templates/etc/varnish.vcl.hbs
index f988f1b..11cf7d7 100644
--- a/templates/etc/varnish.vcl.hbs
+++ b/templates/etc/varnish.vcl.hbs
@@ -6,10 +6,6 @@ backend default {
   .host = "{{varnish.backend.host}}";
   .port = "{{varnish.backend.port}}";
 
-  # Increase the connect timeout to help prevent possible 503 errors if the
-  # nginx backend is reloading.
-  .connect_timeout = 10s;
-
   # Define a timeout in Varnish longer than the 60 second timeout defined in
   # nginx. This works around the issue of Varnish sending a second request to
   # the backend when the backend times out:
@@ -82,6 +78,32 @@ sub vcl_hash {
   }
 }
 
+sub vcl_backend_fetch {
+  # Disable keep-alive between Varnish and the backend nginx server. This fixes
+  # rare, but repeatable 503 errors Varnish generates if the nginx backend is
+  # being reloaded (as happens during our DNS changes). These are caused
+  # because Varnish holds open connections to the backend servers, but then
+  # doesn't always handle things properly when the nginx backend gets reloaded.
+  #
+  # While having keep-alive would be ideal, this shouldn't be a huge loss,
+  # since it just affects the localhost connections between Varnish and the
+  # nginx backend. End-user clients can still hold keep-alive connections
+  # against the initial nginx server. And the backend nginx server holds
+  # keep-alive connections to the API backend servers.
+  #
+  # We have a basic test to try to catch this in our integration test suite,
+  # but to really test this, the tests sometimes have to be run for much
+  # longer. See the "grunt multiLongConnectionDrops" task and let that run for
+  # 30-60 minutes to reproduce this if considering a change in behavior..
+  #
+  # See:
+  # https://github.com/18F/api.data.gov/issues/207
+  # http://comments.gmane.org/gmane.comp.web.varnish.misc/8566
+  # http://comments.gmane.org/gmane.comp.web.varnish.misc/3621
+  # http://comments.gmane.org/gmane.comp.web.varnish.dev/2352
+  set bereq.http.Connection = "Close";
+}
+
 sub vcl_backend_response {
   # Don't cache error pages.
   #
diff --git a/test/integration/proxying.js b/test/integration/proxying.js
index f607e95..34e9a4d 100644
--- a/test/integration/proxying.js
+++ b/test/integration/proxying.js
@@ -5,6 +5,7 @@ require('../test_helper');
 var _ = require('lodash'),
     async = require('async'),
     Curler = require('curler').Curler,
+    execFile = require('child_process').execFile,
     Factory = require('factory-lady'),
     fs = require('fs'),
     http = require('http'),
@@ -292,6 +293,17 @@ describe('proxying', function() {
     });
   });
 
+  describe('client-side keep alive', function() {
+    it('reuses connections', function(done) {
+      var url = 'http://localhost:9080/hello/?api_key=' + this.apiKey;
+      execFile('curl', ['-v', url, url], function(error, stdout, stderr) {
+        stdout.should.eql('Hello WorldHello World');
+        stderr.should.match(/200 OK[\s\S]+Re-using existing connection[\s\S]+200 OK/);
+        done();
+      });
+    });
+  });
+
   describe('server-side keep alive', function() {
     it('keeps 10 idle keepalive connections (per nginx worker) opened to the backend', function(done) {
       this.timeout(30000);
diff --git a/test/support/example_backend_app.js b/test/support/example_backend_app.js
index 9c76fb3..80321b7 100644
--- a/test/support/example_backend_app.js
+++ b/test/support/example_backend_app.js
@@ -31,6 +31,10 @@ app.use(multer({
   },
 }));
 
+app.get('/hello', function(req, res) {
+  res.send('Hello World');
+});
+
 app.all('/info/*', function(req, res) {
   var rawUrl = req.protocol + '://' + req.hostname + req.url;
   res.set('X-Received-Method', req.method);
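
For reference, the client-side keep alive test added above only issues two requests; the 503s described in the commit message only show up under longer soak runs while nginx is being reloaded. The following standalone sketch is illustrative only and not part of this patch: it assumes the router listens on localhost:9080 and serves the /hello endpoint added to the example backend above, and it uses a hypothetical API_KEY environment variable for the key.

// Illustrative soak-test sketch (not part of this patch). Assumptions: router
// on localhost:9080, /hello endpoint from the test backend above, API key in
// the API_KEY environment variable. Issues sequential requests and reports any
// non-200 responses, so it can be left running while nginx is reloaded.
var http = require('http');

var url = 'http://localhost:9080/hello/?api_key=' + process.env.API_KEY;
var total = 10000;
var completed = 0;
var failures = 0;

function finishRequest() {
  completed++;
  if(completed < total) {
    makeRequest();
  } else {
    console.info(failures + ' failures out of ' + total + ' requests');
  }
}

function makeRequest() {
  http.get(url, function(res) {
    if(res.statusCode !== 200) {
      failures++;
      console.error('Request ' + (completed + 1) + ': HTTP ' + res.statusCode);
    }

    // Drain the response body so the socket is released before the next request.
    res.resume();
    res.on('end', finishRequest);
  }).on('error', function(error) {
    failures++;
    console.error('Request ' + (completed + 1) + ': ' + error.message);
    finishRequest();
  });
}

makeRequest();

With the Connection: Close change in varnish.vcl.hbs in place, a run like this should stay free of 503s across repeated nginx reloads, which is what the longer "grunt multiLongConnectionDrops" runs referenced in the commit message verify.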