Skip to content

Commit 5a9efeb

Browse files
committed
Connection should not fail in case of diego issues
During the bosh update of diego-api/bbs or AZ failover the request from CC might fail with 503 - Runner is unavailable. This PR aim to prevent this bbs failover with exponential back-off.
1 parent 63408d4 commit 5a9efeb

File tree

2 files changed

+144
-29
lines changed

2 files changed

+144
-29
lines changed

lib/diego/client.rb

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ class Client
88
def initialize(url:, ca_cert_file:, client_cert_file:, client_key_file:,
99
connect_timeout:, send_timeout:, receive_timeout:)
1010
ENV['PB_IGNORE_DEPRECATIONS'] ||= 'true'
11+
@logger = Steno.logger('cc.diego.client')
1112
@client = build_client(
1213
url,
1314
ca_cert_file,
@@ -161,11 +162,25 @@ def actual_lrps_by_process_guid(process_guid)
161162
end
162163

163164
def with_request_error_handling
164-
tries ||= 3
165-
yield
166-
rescue StandardError => e
167-
retry unless (tries -= 1).zero?
168-
raise RequestError.new(e.message)
165+
delay = 0.25
166+
max_delay = 5
167+
retry_until = Time.now + 60 # retry for 1 minute from now
168+
factor = 2
169+
170+
begin
171+
yield
172+
rescue StandardError => e
173+
if Time.now > retry_until
174+
@logger.error('Unable to establish a connection to diego backend, no more retries, raising an exception.')
175+
raise RequestError.new(e.message)
176+
else
177+
sleep_time = [delay, max_delay].min
178+
@logger.info("Attempting to connect to the diego backend. Total #{(retry_until - Time.now).round(2)} seconds remaining until retrying. Next retry after #{sleep_time} seconds delay.")
179+
sleep(sleep_time)
180+
delay *= factor
181+
retry
182+
end
183+
end
169184
end
170185

171186
private

spec/diego/client_spec.rb

Lines changed: 124 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ module Diego
1111
let(:client_key_file) { File.join(Paths::FIXTURES, 'certs/bbs_client.key') }
1212
let(:timeout) { 10 }
1313
let(:request_id) { '522960b781af4039b8b91a20ff6c0394' }
14+
let(:logger) { double(Steno::Logger) }
1415

1516
subject(:client) do
1617
Client.new(url: bbs_url, ca_cert_file: ca_cert_file, client_cert_file: client_cert_file, client_key_file: client_key_file,
@@ -20,6 +21,9 @@ module Diego
2021
before do
2122
# from middleware/vcap_request_id.rb
2223
::VCAP::Request.current_id = "#{request_id}::b62be6c2-0f2c-4199-94d3-41a69e00f67d"
24+
allow(Steno).to receive(:logger).with('cc.diego.client').and_return(logger)
25+
allow(logger).to receive(:info)
26+
allow(logger).to receive(:error)
2327
end
2428

2529
describe 'configuration' do
@@ -55,11 +59,12 @@ module Diego
5559
context 'when it fails to make the request' do
5660
before do
5761
stub_request(:post, "#{bbs_url}/v1/ping").to_raise(StandardError.new('error message'))
62+
allow(subject).to receive(:sleep) { |n| Timecop.travel(n) }
5863
end
5964

6065
it 'raises' do
6166
expect { client.ping }.to raise_error(RequestError, /error message/)
62-
expect(a_request(:post, "#{bbs_url}/v1/ping")).to have_been_made.times(3)
67+
expect(a_request(:post, "#{bbs_url}/v1/ping")).to have_been_made.times(17)
6368
end
6469
end
6570

@@ -106,11 +111,12 @@ module Diego
106111
context 'when it fails to make the request' do
107112
before do
108113
stub_request(:post, "#{bbs_url}/v1/domains/upsert").to_raise(StandardError.new('error message'))
114+
allow(subject).to receive(:sleep) { |n| Timecop.travel(n) }
109115
end
110116

111117
it 'raises' do
112118
expect { client.upsert_domain(domain:, ttl:) }.to raise_error(RequestError, /error message/)
113-
expect(a_request(:post, "#{bbs_url}/v1/domains/upsert")).to have_been_made.times(3)
119+
expect(a_request(:post, "#{bbs_url}/v1/domains/upsert")).to have_been_made.times(17)
114120
end
115121
end
116122

@@ -164,11 +170,12 @@ module Diego
164170
context 'when it fails to make the request' do
165171
before do
166172
stub_request(:post, "#{bbs_url}/v1/tasks/desire.r2").to_raise(StandardError.new('error message'))
173+
allow(subject).to receive(:sleep) { |n| Timecop.travel(n) }
167174
end
168175

169176
it 'raises' do
170177
expect { client.desire_task(task_definition: task_definition, task_guid: 'task_guid', domain: 'domain') }.to raise_error(RequestError, /error message/)
171-
expect(a_request(:post, "#{bbs_url}/v1/tasks/desire.r2")).to have_been_made.times(3)
178+
expect(a_request(:post, "#{bbs_url}/v1/tasks/desire.r2")).to have_been_made.times(17)
172179
end
173180
end
174181

@@ -245,11 +252,12 @@ module Diego
245252
context 'when it fails to make the request' do
246253
before do
247254
stub_request(:post, "#{bbs_url}/v1/tasks/list.r2").to_raise(StandardError.new('error message'))
255+
allow(subject).to receive(:sleep) { |n| Timecop.travel(n) }
248256
end
249257

250258
it 'raises' do
251259
expect { client.tasks }.to raise_error(RequestError, /error message/)
252-
expect(a_request(:post, "#{bbs_url}/v1/tasks/list.r2")).to have_been_made.times(3)
260+
expect(a_request(:post, "#{bbs_url}/v1/tasks/list.r2")).to have_been_made.times(17)
253261
end
254262
end
255263

@@ -300,11 +308,12 @@ module Diego
300308
context 'when it fails to make the request' do
301309
before do
302310
stub_request(:post, "#{bbs_url}/v1/tasks/get_by_task_guid.r2").to_raise(StandardError.new('error message'))
311+
allow(subject).to receive(:sleep) { |n| Timecop.travel(n) }
303312
end
304313

305314
it 'raises' do
306315
expect { client.task_by_guid('some-guid') }.to raise_error(RequestError, /error message/)
307-
expect(a_request(:post, "#{bbs_url}/v1/tasks/get_by_task_guid.r2")).to have_been_made.times(3)
316+
expect(a_request(:post, "#{bbs_url}/v1/tasks/get_by_task_guid.r2")).to have_been_made.times(17)
308317
end
309318
end
310319

@@ -355,11 +364,12 @@ module Diego
355364
context 'when it fails to make the request' do
356365
before do
357366
stub_request(:post, "#{bbs_url}/v1/tasks/cancel").to_raise(StandardError.new('error message'))
367+
allow(subject).to receive(:sleep) { |n| Timecop.travel(n) }
358368
end
359369

360370
it 'raises' do
361371
expect { client.cancel_task('some-guid') }.to raise_error(RequestError, /error message/)
362-
expect(a_request(:post, "#{bbs_url}/v1/tasks/cancel")).to have_been_made.times(3)
372+
expect(a_request(:post, "#{bbs_url}/v1/tasks/cancel")).to have_been_made.times(17)
363373
end
364374
end
365375

@@ -411,11 +421,12 @@ module Diego
411421
context 'when it fails to make the request' do
412422
before do
413423
stub_request(:post, "#{bbs_url}/v1/desired_lrp/desire.r2").to_raise(StandardError.new('error message'))
424+
allow(subject).to receive(:sleep) { |n| Timecop.travel(n) }
414425
end
415426

416427
it 'raises' do
417428
expect { client.desire_lrp(lrp) }.to raise_error(RequestError, /error message/)
418-
expect(a_request(:post, "#{bbs_url}/v1/desired_lrp/desire.r2")).to have_been_made.times(3)
429+
expect(a_request(:post, "#{bbs_url}/v1/desired_lrp/desire.r2")).to have_been_made.times(17)
419430
end
420431
end
421432

@@ -472,11 +483,12 @@ module Diego
472483
context 'when it fails to make the request' do
473484
before do
474485
stub_request(:post, "#{bbs_url}/v1/desired_lrps/get_by_process_guid.r2").to_raise(StandardError.new('error message'))
486+
allow(subject).to receive(:sleep) { |n| Timecop.travel(n) }
475487
end
476488

477489
it 'raises' do
478490
expect { client.desired_lrp_by_process_guid(process_guid) }.to raise_error(RequestError, /error message/)
479-
expect(a_request(:post, "#{bbs_url}/v1/desired_lrps/get_by_process_guid.r2")).to have_been_made.times(3)
491+
expect(a_request(:post, "#{bbs_url}/v1/desired_lrps/get_by_process_guid.r2")).to have_been_made.times(17)
480492
end
481493
end
482494

@@ -528,11 +540,12 @@ module Diego
528540
context 'when it fails to make the request' do
529541
before do
530542
stub_request(:post, "#{bbs_url}/v1/desired_lrp/remove").to_raise(StandardError.new('error message'))
543+
allow(subject).to receive(:sleep) { |n| Timecop.travel(n) }
531544
end
532545

533546
it 'raises' do
534547
expect { client.remove_desired_lrp(process_guid) }.to raise_error(RequestError, /error message/)
535-
expect(a_request(:post, "#{bbs_url}/v1/desired_lrp/remove")).to have_been_made.times(3)
548+
expect(a_request(:post, "#{bbs_url}/v1/desired_lrp/remove")).to have_been_made.times(17)
536549
end
537550
end
538551

@@ -586,11 +599,12 @@ module Diego
586599
context 'when it fails to make the request' do
587600
before do
588601
stub_request(:post, "#{bbs_url}/v1/actual_lrps/retire").to_raise(StandardError.new('error message'))
602+
allow(subject).to receive(:sleep) { |n| Timecop.travel(n) }
589603
end
590604

591605
it 'raises' do
592606
expect { client.retire_actual_lrp(actual_lrp_key) }.to raise_error(RequestError, /error message/)
593-
expect(a_request(:post, "#{bbs_url}/v1/actual_lrps/retire")).to have_been_made.times(3)
607+
expect(a_request(:post, "#{bbs_url}/v1/actual_lrps/retire")).to have_been_made.times(17)
594608
end
595609
end
596610

@@ -644,11 +658,12 @@ module Diego
644658
context 'when it fails to make the request' do
645659
before do
646660
stub_request(:post, "#{bbs_url}/v1/desired_lrp/update").to_raise(StandardError.new('error message'))
661+
allow(subject).to receive(:sleep) { |n| Timecop.travel(n) }
647662
end
648663

649664
it 'raises' do
650665
expect { client.update_desired_lrp(process_guid, lrp_update) }.to raise_error(RequestError, /error message/)
651-
expect(a_request(:post, "#{bbs_url}/v1/desired_lrp/update")).to have_been_made.times(3)
666+
expect(a_request(:post, "#{bbs_url}/v1/desired_lrp/update")).to have_been_made.times(17)
652667
end
653668
end
654669

@@ -736,11 +751,12 @@ module Diego
736751
context 'when it fails to make the request' do
737752
before do
738753
stub_request(:post, "#{bbs_url}/v1/desired_lrp_scheduling_infos/list").to_raise(StandardError.new('error message'))
754+
allow(subject).to receive(:sleep) { |n| Timecop.travel(n) }
739755
end
740756

741757
it 'raises' do
742758
expect { client.desired_lrp_scheduling_infos(domain) }.to raise_error(RequestError, /error message/)
743-
expect(a_request(:post, "#{bbs_url}/v1/desired_lrp_scheduling_infos/list")).to have_been_made.times(3)
759+
expect(a_request(:post, "#{bbs_url}/v1/desired_lrp_scheduling_infos/list")).to have_been_made.times(17)
744760
end
745761
end
746762

@@ -760,21 +776,104 @@ module Diego
760776
end
761777

762778
describe '#with_request_error_handling' do
763-
it 'retries' do
764-
tries = 0
779+
before do
780+
allow(subject).to receive(:sleep) { |n| Timecop.travel(n) }
781+
allow(logger).to receive(:info)
782+
allow(logger).to receive(:error)
783+
end
765784

766-
client.with_request_error_handling do
767-
tries += 1
768-
raise 'error' if tries < 2
785+
context 'when the block succeeds immediately' do
786+
it 'does not sleep or raise an exception' do
787+
expect { subject.with_request_error_handling {} }.not_to raise_error
769788
end
770-
771-
expect(tries).to be > 1
772789
end
773790

774-
it 'raises an error after all retries fail' do
775-
expect do
776-
client.with_request_error_handling { raise 'error' }
777-
end.to raise_error(RequestError)
791+
context 'when the block raises an exception' do
792+
let(:successful_block) do
793+
proc {
794+
@count ||= 0
795+
@count += 1
796+
@count == 2 ? true : raise('error')
797+
}
798+
end
799+
800+
it 'retries once and eventually succeeds' do
801+
expect { subject.with_request_error_handling(&successful_block) }.not_to raise_error
802+
end
803+
804+
it 'retries and eventually raises an error when the block fails' do
805+
attempts = 0
806+
expect do
807+
subject.with_request_error_handling do
808+
attempts += 1
809+
Timecop.travel(Time.now + 50)
810+
raise 'Error!'
811+
end
812+
end.to raise_error(RequestError, 'Error!')
813+
expect(logger).to have_received(:info).with(/Initiating a new attempt to connect to the diego backend./)
814+
expect(logger).to have_received(:error).with(/Unable to establish a connection to diego backend/)
815+
expect(logger).to have_received(:error).once
816+
expect(logger).to have_received(:info).once
817+
expect(attempts).to eq(2)
818+
end
819+
820+
it 'stops retrying after 60 seconds and raises an exception' do
821+
start_time = Time.now
822+
Timecop.freeze do
823+
expect do
824+
subject.with_request_error_handling do
825+
Timecop.travel(start_time + 61)
826+
raise 'Error!'
827+
end
828+
end.to raise_error(RequestError, 'Error!')
829+
end
830+
expected_log = 'Unable to establish a connection to diego backend, no more retries, raising an exception.'
831+
expect(logger).to have_received(:error).with(expected_log)
832+
end
833+
834+
it 'raises an error after 6 attempts in approximately 1 minute when each yield call takes 10 seconds' do
835+
attempts = 0
836+
expect do
837+
subject.with_request_error_handling do
838+
attempts += 1
839+
Timecop.travel(10.seconds.from_now) # assuming each yield call take 10 seconds
840+
raise 'Error!'
841+
end
842+
end.to raise_error(RequestError, 'Error!')
843+
expect(logger).to have_received(:error).with(/Unable to establish a connection to diego/).once
844+
expect(logger).to have_received(:info).with(/Initiating a new attempt to connect to the diego backend./).exactly(5).times
845+
expect(attempts).to be_within(1).of(6)
846+
end
847+
848+
it 'raises an error after 9 attempts in approximately 1 minute when each yield call takes 5 seconds' do
849+
attempts = 0
850+
expect do
851+
subject.with_request_error_handling do
852+
attempts += 1
853+
Timecop.travel(5.seconds.from_now) # assuming each yield call take 5 seconds
854+
raise 'Error!'
855+
end
856+
end.to raise_error(RequestError, 'Error!')
857+
expect(logger).to have_received(:error).with(/Unable to establish a connection to diego/).once
858+
expect(logger).to have_received(:info).with(/Initiating a new attempt to connect to the diego backend./).exactly(8).times
859+
expect(attempts).to be_within(1).of(9)
860+
end
861+
862+
it 'raises an error after 17 attempts in approximately 1 minute when each yield call immediately' do
863+
attempts = 0
864+
start_time = Time.now
865+
expect do
866+
subject.with_request_error_handling do
867+
attempts += 1
868+
raise RequestError
869+
end
870+
end.to raise_error(RequestError)
871+
end_time = Time.now
872+
duration = end_time.to_f - start_time.to_f
873+
874+
expect(attempts).to be_within(1).of(17)
875+
expect(duration).to be_within(1).of(62)
876+
end
778877
end
779878
end
780879

@@ -784,11 +883,12 @@ module Diego
784883
before do
785884
# We don't actually communicate over the socket, we just want to check the invocation.
786885
allow(TCPSocket).to receive(:new).and_raise('done!')
886+
allow(subject).to receive(:sleep) { |n| Timecop.travel(n) }
787887
end
788888

789889
it 'sets TCPSocket:connect_timeout to HTTPClient:connect_timeout / 2' do
790890
expect { client.ping }.to raise_error('done!')
791-
expect(TCPSocket).to have_received(:new).with(bbs_host, bbs_port, { connect_timeout: timeout / 2 }).exactly(3).times
891+
expect(TCPSocket).to have_received(:new).with(bbs_host, bbs_port, { connect_timeout: timeout / 2 }).exactly(17).times
792892
end
793893
end
794894

0 commit comments

Comments
 (0)