diff --git a/CHANGELOG.md b/CHANGELOG.md index 25e8d5e7..6ac68235 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,13 @@ [//]: # (comment: Don't forget to update lib/datadog/statsd/version.rb:DogStatsd::Statsd::VERSION when releasing a new version) +## Unreleased + + * [BUGFIX] Recover from a wedged UDS socket on Datadog Agent restart. + `BadSocketError` now inherits from a new `Connection::RetryableError`, so + `Connection#write` closes the socket and reconnects instead of dropping + every subsequent metric. [#330][] by [@joshuay03][] + ## 5.7.1 / 2025.08.20 * [IMPROVEMENT] Suppress external env if origin detection is configured off. [#316][] by [@StephenWakely][] @@ -505,6 +512,7 @@ Future versions are likely to introduce backward incompatibilities with < Ruby 1 [#306]: https://github.com/DataDog/dogstatsd-ruby/issues/306 [#310]: https://github.com/DataDog/dogstatsd-ruby/issues/310 [#316]: https://github.com/DataDog/dogstatsd-ruby/pull/316 +[#330]: https://github.com/DataDog/dogstatsd-ruby/pull/330 [@AMekss]: https://github.com/AMekss [@abicky]: https://github.com/abicky [@adimitrov]: https://github.com/adimitrov @@ -527,6 +535,7 @@ Future versions are likely to introduce backward incompatibilities with < Ruby 1 [@janester]: https://github.com/janester [@jhawthorn]: https://github.com/jhawthorn [@jordan-brough]: https://github.com/jordan-brough +[@joshuay03]: https://github.com/joshuay03 [@jtzemp]: https://github.com/jtzemp [@kazu9su]: https://github.com/kazu9su [@kazwolfe]: https://github.com/kazwolfe diff --git a/lib/datadog/statsd/connection.rb b/lib/datadog/statsd/connection.rb index 2077e051..fadaddb4 100644 --- a/lib/datadog/statsd/connection.rb +++ b/lib/datadog/statsd/connection.rb @@ -3,6 +3,8 @@ module Datadog class Statsd class Connection + class RetryableError < StandardError; end + def initialize(telemetry: nil, logger: nil) @telemetry = telemetry @logger = logger @@ -25,7 +27,8 @@ def write(payload) # Try once to reconnect if the socket has been closed retries ||= 1 if retries <= 1 && - (boom.is_a?(Errno::ENOTCONN) or + (boom.is_a?(RetryableError) or + boom.is_a?(Errno::ENOTCONN) or boom.is_a?(Errno::ECONNREFUSED) or boom.is_a?(IOError) && boom.message =~ /closed stream/i) retries += 1 diff --git a/lib/datadog/statsd/uds_connection.rb b/lib/datadog/statsd/uds_connection.rb index d6e8f405..da34e079 100644 --- a/lib/datadog/statsd/uds_connection.rb +++ b/lib/datadog/statsd/uds_connection.rb @@ -5,7 +5,7 @@ module Datadog class Statsd class UDSConnection < Connection - class BadSocketError < StandardError; end + class BadSocketError < RetryableError; end # DogStatsd unix socket path attr_reader :socket_path @@ -39,9 +39,6 @@ def send_message(message) connect unless @socket @socket.sendmsg_nonblock(message) rescue Errno::ECONNREFUSED, Errno::ECONNRESET, Errno::ENOENT => e - # TODO: FIXME: This error should be considered as a retryable error in the - # Connection class. An even better solution would be to make BadSocketError inherit - # from a specific retryable error class in the Connection class. raise BadSocketError, "#{e.class}: #{e}" end end diff --git a/spec/integrations/connection_edge_case_spec.rb b/spec/integrations/connection_edge_case_spec.rb index b99332dd..5e440fc9 100644 --- a/spec/integrations/connection_edge_case_spec.rb +++ b/spec/integrations/connection_edge_case_spec.rb @@ -235,14 +235,16 @@ let(:fake_socket) do instance_double(Socket, connect: true, - sendmsg_nonblock: true + sendmsg_nonblock: true, + close: true ) end let(:fake_socket_retry) do instance_double(Socket, connect: true, - sendmsg_nonblock: true + sendmsg_nonblock: true, + close: true ) end @@ -262,26 +264,18 @@ subject.write('foobar') end - it 'retries on the second opened socket' # do - # expect(fake_socket_retry) - # .to receive(:sendmsg_nonblock) - # .with('foobar') - - # subject.write('foobar') - # end - - # FIXME: BadSocketError is not correctly caught by Connection class to retry - it 'does not correctly retry (1)' do + it 'retries on the second opened socket' do expect(fake_socket_retry) - .not_to receive(:sendmsg_nonblock) + .to receive(:sendmsg_nonblock) + .with('foobar') subject.write('foobar') end - it 'does not correctly retry (2)' do - subject.write('foobar') + it 'closes the original socket before reconnecting' do + expect(fake_socket).to receive(:close) - expect(log.string).to match 'Statsd: Datadog::Statsd::UDSConnection::BadSocketError Errno::ECONNRESET: Connection reset by peer' + subject.write('foobar') end end @@ -299,8 +293,7 @@ end.not_to raise_error end - # the mecanism to retry is broken, once it's fixed, this test should pass - it 'logs the error message', pending: true do + it 'logs the error message' do subject.write('foobar') expect(log.string).to match 'Statsd: RuntimeError yolo' end @@ -319,8 +312,7 @@ end.not_to raise_error end - # the mecanism to retry is broken, once it's fixed, this test should pass - it 'logs the error message', pending: true do + it 'logs the error message' do subject.write('foobar') expect(log.string).to match 'Statsd: SocketError yolo' end @@ -344,26 +336,18 @@ subject.write('foobar') end - it 'retries on the second opened socket' # do - # expect(fake_socket_retry) - # .to receive(:sendmsg_nonblock) - # .with('foobar') - - # subject.write('foobar') - # end - - # FIXME: BadSocketError is not correctly caught by Connection class to retry - it 'does not correctly retry (1)' do + it 'retries on the second opened socket' do expect(fake_socket_retry) - .not_to receive(:sendmsg_nonblock) + .to receive(:sendmsg_nonblock) + .with('foobar') subject.write('foobar') end - it 'does not correctly retry (2)' do - subject.write('foobar') + it 'closes the original socket before reconnecting' do + expect(fake_socket).to receive(:close) - expect(log.string).to match 'Statsd: Datadog::Statsd::UDSConnection::BadSocketError Errno::ECONNREFUSED: Connection refused - closed stream' + subject.write('foobar') end end @@ -381,8 +365,7 @@ end.not_to raise_error end - # the mecanism to retry is broken, once it's fixed, this test should pass - it 'logs the error message', pending: true do + it 'logs the error message' do subject.write('foobar') expect(log.string).to match 'Statsd: RuntimeError yolo' end @@ -401,48 +384,64 @@ end.not_to raise_error end - # the mecanism to retry is broken, once it's fixed, this test should pass - it 'logs the error message', pending: true do + it 'logs the error message' do subject.write('foobar') - expect(log.string).to match 'Errno::ECONNREFUSED Connection refused - yolo' + expect(log.string).to match 'Statsd: Datadog::Statsd::UDSConnection::BadSocketError Errno::ECONNREFUSED: Connection refused - yolo' end end end end - context 'when there is no socket (drop strategy)' do + context 'when the socket file is missing (retry strategy)' do before do allow(fake_socket) .to receive(:sendmsg_nonblock) .and_raise(Errno::ENOENT) end - it 'sends using the first socket' do - expect(fake_socket) - .to receive(:sendmsg_nonblock) - .with('foobar') + context 'when retrying is working' do + it 'tries with the initial socket' do + expect(fake_socket) + .to receive(:sendmsg_nonblock) + .with('foobar') - subject.write('foobar') - end + subject.write('foobar') + end + + it 'retries on the second opened socket' do + expect(fake_socket_retry) + .to receive(:sendmsg_nonblock) + .with('foobar') - it 'ignores the writing failure (message dropped)' do - expect do subject.write('foobar') - end.not_to raise_error - end + end - it 'does not retry to send message' do - expect(fake_socket_retry) - .not_to receive(:sendmsg_nonblock) + it 'closes the original socket before reconnecting' do + expect(fake_socket).to receive(:close) - subject.write('foobar') + subject.write('foobar') + end end - # TODO: FIXME: we got to exclude the Errno::ENOENT for the retry strategy - it 'logs the error message', pending: true do - subject.write('foobar') + context 'when retrying fails' do + context 'because the socket file is still missing' do + before do + allow(fake_socket_retry) + .to receive(:sendmsg_nonblock) + .and_raise(Errno::ENOENT) + end - expect(log.string).to match 'Statsd: Errno::ENOENT No such file or directory' + it 'ignores the connection failure' do + expect do + subject.write('foobar') + end.not_to raise_error + end + + it 'logs the error message' do + subject.write('foobar') + expect(log.string).to match 'Statsd: Datadog::Statsd::UDSConnection::BadSocketError Errno::ENOENT: No such file or directory' + end + end end end diff --git a/spec/statsd/uds_connection_spec.rb b/spec/statsd/uds_connection_spec.rb index d32752d8..1dcd2814 100644 --- a/spec/statsd/uds_connection_spec.rb +++ b/spec/statsd/uds_connection_spec.rb @@ -113,14 +113,16 @@ let(:fake_socket) do instance_double(Socket, connect: true, - sendmsg_nonblock: true + sendmsg_nonblock: true, + close: true ) end let(:fake_socket_retry) do instance_double(Socket, connect: true, - sendmsg_nonblock: true + sendmsg_nonblock: true, + close: true ) end @@ -140,32 +142,23 @@ subject.write('foobar') end - it 'retries on the second opened socket' # do - # expect(fake_socket_retry) - # .to receive(:sendmsg_nonblock) - # .with('foobar') - - # subject.write('foobar') - # end - - # FIXME: BadSocketError is not correctly caught by Connection class to retry - it 'does not correctly retry (1)' do - expect(Socket) - .to receive(:new) - .once + it 'retries on the second opened socket' do + expect(fake_socket_retry) + .to receive(:sendmsg_nonblock) + .with('foobar') subject.write('foobar') end - it 'does not correctly retry (2)' do - subject.write('foobar') + it 'closes the original socket before reconnecting' do + expect(fake_socket).to receive(:close) - expect(log.string).to match 'Statsd: Datadog::Statsd::UDSConnection::BadSocketError Errno::ECONNRESET: Connection reset by peer' + subject.write('foobar') end - it 'updates the "dropped_writer" telemetry counts' do + it 'updates the "sent" telemetry counts' do expect(telemetry) - .to receive(:dropped_writer) + .to receive(:sent) .with(bytes: 4, packets: 1) subject.write('test') @@ -186,8 +179,7 @@ end.not_to raise_error end - # the mecanism to retry is broken, once it's fixed, this test should pass - it 'logs the error message', pending: true do + it 'logs the error message' do subject.write('foobar') expect(log.string).to match 'Statsd: RuntimeError yolo' end @@ -214,8 +206,7 @@ end.not_to raise_error end - # the mecanism to retry is broken, once it's fixed, this test should pass - it 'logs the error message', pending: true do + it 'logs the error message' do subject.write('foobar') expect(log.string).to match 'Statsd: SocketError yolo' end @@ -247,31 +238,23 @@ subject.write('foobar') end - it 'retries on the second opened socket' # do - # expect(fake_socket_retry) - # .to receive(:sendmsg_nonblock) - # .with('foobar') - - # subject.write('foobar') - # end - - # FIXME: BadSocketError is not correctly caught by Connection class to retry - it 'does not correctly retry (1)' do + it 'retries on the second opened socket' do expect(fake_socket_retry) - .not_to receive(:sendmsg_nonblock) + .to receive(:sendmsg_nonblock) + .with('foobar') subject.write('foobar') end - it 'does not correctly retry (2)' do - subject.write('foobar') + it 'closes the original socket before reconnecting' do + expect(fake_socket).to receive(:close) - expect(log.string).to match 'Statsd: Datadog::Statsd::UDSConnection::BadSocketError Errno::ECONNREFUSED: Connection refused - closed stream' + subject.write('foobar') end - it 'updates the "dropped_writer" telemetry counts' do + it 'updates the "sent" telemetry counts' do expect(telemetry) - .to receive(:dropped_writer) + .to receive(:sent) .with(bytes: 4, packets: 1) subject.write('test') @@ -292,8 +275,7 @@ end.not_to raise_error end - # the mecanism to retry is broken, once it's fixed, this test should pass - it 'logs the error message', pending: true do + it 'logs the error message' do subject.write('foobar') expect(log.string).to match 'Statsd: RuntimeError yolo' end @@ -320,10 +302,9 @@ end.not_to raise_error end - # the mecanism to retry is broken, once it's fixed, this test should pass - it 'logs the error message', pending: true do + it 'logs the error message' do subject.write('foobar') - expect(log.string).to match 'Errno::ECONNREFUSED Connection refused - yolo' + expect(log.string).to match 'Statsd: Datadog::Statsd::UDSConnection::BadSocketError Errno::ECONNREFUSED: Connection refused - yolo' end it 'updates the "dropped_writer" telemetry counts' do @@ -337,47 +318,72 @@ end end - context 'when there is no socket (drop strategy)' do + context 'when the socket file is missing (retry strategy)' do before do allow(fake_socket) .to receive(:sendmsg_nonblock) .and_raise(Errno::ENOENT) end - it 'sends using the first socket' do - expect(fake_socket) - .to receive(:sendmsg_nonblock) - .with('foobar') + context 'when retrying is working' do + it 'tries with the initial socket' do + expect(fake_socket) + .to receive(:sendmsg_nonblock) + .with('foobar') - subject.write('foobar') - end + subject.write('foobar') + end + + it 'retries on the second opened socket' do + expect(fake_socket_retry) + .to receive(:sendmsg_nonblock) + .with('foobar') - it 'ignores the writing failure (message dropped)' do - expect do subject.write('foobar') - end.not_to raise_error - end + end - it 'does not retry to send message' do - expect(fake_socket_retry) - .not_to receive(:sendmsg_nonblock) + it 'closes the original socket before reconnecting' do + expect(fake_socket).to receive(:close) - subject.write('foobar') - end + subject.write('foobar') + end - # TODO: FIXME: we got to exclude the Errno::ENOENT for the retry strategy - it 'logs the error message', pending: true do - subject.write('foobar') + it 'updates the "sent" telemetry counts' do + expect(telemetry) + .to receive(:sent) + .with(bytes: 4, packets: 1) - expect(log.string).to match 'Statsd: Errno::ENOENT No such file or directory' + subject.write('test') + end end - it 'updates the "dropped_writer" telemetry counts' do - expect(telemetry) - .to receive(:dropped_writer) - .with(bytes: 4, packets: 1) + context 'when retrying fails' do + context 'because the socket file is still missing' do + before do + allow(fake_socket_retry) + .to receive(:sendmsg_nonblock) + .and_raise(Errno::ENOENT) + end - subject.write('test') + it 'ignores the connection failure' do + expect do + subject.write('foobar') + end.not_to raise_error + end + + it 'logs the error message' do + subject.write('foobar') + expect(log.string).to match 'Statsd: Datadog::Statsd::UDSConnection::BadSocketError Errno::ENOENT: No such file or directory' + end + + it 'updates the "dropped_writer" telemetry counts' do + expect(telemetry) + .to receive(:dropped_writer) + .with(bytes: 4, packets: 1) + + subject.write('test') + end + end end end