From 8c52213e047cde29a773714fc304135635d8dedd Mon Sep 17 00:00:00 2001 From: Gustavo Caso Date: Tue, 3 Oct 2023 11:02:58 +0200 Subject: [PATCH 1/6] Compress and encode schema information --- Steepfile | 1 + lib/datadog/appsec/event.rb | 14 +++++++++++++- sig/datadog/appsec/event.rbs | 2 ++ spec/datadog/appsec/event_spec.rb | 15 +++++++++++++-- 4 files changed, 29 insertions(+), 3 deletions(-) diff --git a/Steepfile b/Steepfile index 1239e9c5875..538c8a11f47 100644 --- a/Steepfile +++ b/Steepfile @@ -619,6 +619,7 @@ target :ddtrace do library 'securerandom' library 'base64' library 'digest' + library 'zlib' repo_path 'vendor/rbs' library 'cucumber' diff --git a/lib/datadog/appsec/event.rb b/lib/datadog/appsec/event.rb index 4462d4fcbf5..5547cf99fe4 100644 --- a/lib/datadog/appsec/event.rb +++ b/lib/datadog/appsec/event.rb @@ -1,4 +1,6 @@ require 'json' +require 'zlib' +require 'base64' require_relative 'rate_limiter' @@ -34,6 +36,8 @@ module Event Content-Language ].map!(&:downcase).freeze + MAX_ENCODED_SCHEMA_SIZE = 25000 + # Record events for a trace # # This is expected to be called only once per trace for the rate limiter @@ -110,7 +114,15 @@ def self.build_service_entry_tags(event_group) tags['_dd.appsec.triggers'] += waf_result.events waf_result.derivatives.each do |key, value| - tags[key] = JSON.dump(value) + data = Base64.encode64(Zlib.gzip(JSON.dump(value))) + + if data.size >= MAX_ENCODED_SCHEMA_SIZE + Datadog.logger.debug do + "Schema key: #{key} exceed max size value. 
We do not include it as part of the span tags" + end + next + end + tags[key] = data end tags diff --git a/sig/datadog/appsec/event.rbs b/sig/datadog/appsec/event.rbs index db301c40bf9..d63863c76d3 100644 --- a/sig/datadog/appsec/event.rbs +++ b/sig/datadog/appsec/event.rbs @@ -5,6 +5,8 @@ module Datadog ALLOWED_RESPONSE_HEADERS: untyped + MAX_ENCODED_SCHEMA_SIZE: Numeric + def self.record: (Datadog::Tracing::SpanOperation, *untyped events) -> (nil | untyped) def self.record_via_span: (Datadog::Tracing::SpanOperation, *untyped events) -> untyped diff --git a/spec/datadog/appsec/event_spec.rb b/spec/datadog/appsec/event_spec.rb index 99b97845cd2..0dff58fce8a 100644 --- a/spec/datadog/appsec/event_spec.rb +++ b/spec/datadog/appsec/event_spec.rb @@ -112,9 +112,20 @@ } end - it 'adds derivatives to the top level span meta' do + it 'adds derivatives after comporessing and encode to Base64 to the top level span meta' do meta = top_level_span.meta - expect(meta['_dd.appsec.s.req.headers']).to eq JSON.dump([{ 'host' => [8], 'version' => [8] }]) + result = Base64.encode64(Zlib.gzip(JSON.dump([{ 'host' => [8], 'version' => [8] }]))) + + expect(meta['_dd.appsec.s.req.headers']).to eq result + end + + context 'derivative values exceed Event::MAX_ENCODED_SCHEMA_SIZE value' do + it 'do not add derivative key to meta' do + stub_const('Datadog::AppSec::Event::MAX_ENCODED_SCHEMA_SIZE', 1) + meta = top_level_span.meta + + expect(meta['_dd.appsec.s.req.headers']).to be_nil + end end end end From f9af157f7babdd0809bdaa272e413314b858eb83 Mon Sep 17 00:00:00 2001 From: Gustavo Caso Date: Tue, 3 Oct 2023 12:50:15 +0200 Subject: [PATCH 2/6] Add support for ruby 2.2 and 2.3 --- Steepfile | 2 +- lib/datadog/appsec/event.rb | 12 +- sig/datadog/appsec/event.rbs | 2 + spec/datadog/appsec/event_spec.rb | 3 +- vendor/rbs/zlib/0/gzip_file.rbs | 228 ++++++++++++++++++++++++++++ vendor/rbs/zlib/0/gzip_writer.rbs | 237 ++++++++++++++++++++++++++++++ vendor/rbs/zlib/0/zlib.rbs | 6 + 7 files changed, 
487 insertions(+), 3 deletions(-) create mode 100644 vendor/rbs/zlib/0/gzip_file.rbs create mode 100644 vendor/rbs/zlib/0/gzip_writer.rbs create mode 100644 vendor/rbs/zlib/0/zlib.rbs diff --git a/Steepfile b/Steepfile index 538c8a11f47..09ad4472bc5 100644 --- a/Steepfile +++ b/Steepfile @@ -619,7 +619,6 @@ target :ddtrace do library 'securerandom' library 'base64' library 'digest' - library 'zlib' repo_path 'vendor/rbs' library 'cucumber' @@ -644,6 +643,7 @@ target :ddtrace do library 'opentelemetry-api' library 'passenger' library 'webmock' + library 'zlib' # TODO: gem 'libddwaf' library 'libddwaf' diff --git a/lib/datadog/appsec/event.rb b/lib/datadog/appsec/event.rb index 5547cf99fe4..685b7c6a252 100644 --- a/lib/datadog/appsec/event.rb +++ b/lib/datadog/appsec/event.rb @@ -114,7 +114,7 @@ def self.build_service_entry_tags(event_group) tags['_dd.appsec.triggers'] += waf_result.events waf_result.derivatives.each do |key, value| - data = Base64.encode64(Zlib.gzip(JSON.dump(value))) + data = Base64.encode64(gzip(JSON.dump(value))) if data.size >= MAX_ENCODED_SCHEMA_SIZE Datadog.logger.debug do @@ -128,6 +128,16 @@ def self.build_service_entry_tags(event_group) tags end end + + def self.gzip(value) + sio = StringIO.new + gz = Zlib::GzipWriter.new(sio, Zlib::DEFAULT_COMPRESSION, Zlib::DEFAULT_STRATEGY) + gz.write(value) + gz.close + sio.string + end + + private_class_method :gzip end end end diff --git a/sig/datadog/appsec/event.rbs b/sig/datadog/appsec/event.rbs index d63863c76d3..48c723959c0 100644 --- a/sig/datadog/appsec/event.rbs +++ b/sig/datadog/appsec/event.rbs @@ -12,6 +12,8 @@ module Datadog def self.record_via_span: (Datadog::Tracing::SpanOperation, *untyped events) -> untyped def self.build_service_entry_tags: (Array[Hash[::Symbol, untyped]] event_group) -> Hash[::String, untyped] + + def self.gzip: (untyped String) -> String end end end diff --git a/spec/datadog/appsec/event_spec.rb b/spec/datadog/appsec/event_spec.rb index 0dff58fce8a..b07506a058d 
100644 --- a/spec/datadog/appsec/event_spec.rb +++ b/spec/datadog/appsec/event_spec.rb @@ -114,7 +114,8 @@ it 'adds derivatives after comporessing and encode to Base64 to the top level span meta' do meta = top_level_span.meta - result = Base64.encode64(Zlib.gzip(JSON.dump([{ 'host' => [8], 'version' => [8] }]))) + gzip = described_class.send(:gzip, JSON.dump([{ 'host' => [8], 'version' => [8] }])) + result = Base64.encode64(gzip) expect(meta['_dd.appsec.s.req.headers']).to eq result end diff --git a/vendor/rbs/zlib/0/gzip_file.rbs b/vendor/rbs/zlib/0/gzip_file.rbs new file mode 100644 index 00000000000..ddb6dc9ec8c --- /dev/null +++ b/vendor/rbs/zlib/0/gzip_file.rbs @@ -0,0 +1,228 @@ +# +# This module provides access to the [zlib library](http://zlib.net). Zlib is +# designed to be a portable, free, general-purpose, legally unencumbered -- that +# is, not covered by any patents -- lossless data-compression library for use on +# virtually any computer hardware and operating system. +# +# The zlib compression library provides in-memory compression and decompression +# functions, including integrity checks of the uncompressed data. +# +# The zlib compressed data format is described in RFC 1950, which is a wrapper +# around a deflate stream which is described in RFC 1951. +# +# The library also supports reading and writing files in gzip (.gz) format with +# an interface similar to that of IO. The gzip format is described in RFC 1952 +# which is also a wrapper around a deflate stream. +# +# The zlib format was designed to be compact and fast for use in memory and on +# communications channels. The gzip format was designed for single-file +# compression on file systems, has a larger header than zlib to maintain +# directory information, and uses a different, slower check method than zlib. 
+# +# See your system's zlib.h for further information about zlib +# +# ## Sample usage +# +# Using the wrapper to compress strings with default parameters is quite simple: +# +# require "zlib" +# +# data_to_compress = File.read("don_quixote.txt") +# +# puts "Input size: #{data_to_compress.size}" +# #=> Input size: 2347740 +# +# data_compressed = Zlib::Deflate.deflate(data_to_compress) +# +# puts "Compressed size: #{data_compressed.size}" +# #=> Compressed size: 887238 +# +# uncompressed_data = Zlib::Inflate.inflate(data_compressed) +# +# puts "Uncompressed data is: #{uncompressed_data}" +# #=> Uncompressed data is: The Project Gutenberg EBook of Don Quixote... +# +# ## Class tree +# +# * Zlib::Deflate +# * Zlib::Inflate +# * Zlib::ZStream +# * Zlib::Error +# * Zlib::StreamEnd +# * Zlib::NeedDict +# * Zlib::DataError +# * Zlib::StreamError +# * Zlib::MemError +# * Zlib::BufError +# * Zlib::VersionError +# * Zlib::InProgressError +# +# +# +# (if you have GZIP_SUPPORT) +# * Zlib::GzipReader +# * Zlib::GzipWriter +# * Zlib::GzipFile +# * Zlib::GzipFile::Error +# * Zlib::GzipFile::LengthError +# * Zlib::GzipFile::CRCError +# * Zlib::GzipFile::NoFooter +# +module Zlib + # + # Zlib::GzipFile is an abstract class for handling a gzip formatted compressed + # file. The operations are defined in the subclasses, Zlib::GzipReader for + # reading, and Zlib::GzipWriter for writing. + # + # GzipReader should be used by associating an IO, or IO-like, object. + # + # ## Method Catalogue + # + # * ::wrap + # * ::open (Zlib::GzipReader::open and Zlib::GzipWriter::open) + # * #close + # * #closed? + # * #comment + # * comment= (Zlib::GzipWriter#comment=) + # * #crc + # * eof? (Zlib::GzipReader#eof?) 
+ # * #finish + # * #level + # * lineno (Zlib::GzipReader#lineno) + # * lineno= (Zlib::GzipReader#lineno=) + # * #mtime + # * mtime= (Zlib::GzipWriter#mtime=) + # * #orig_name + # * orig_name (Zlib::GzipWriter#orig_name=) + # * #os_code + # * path (when the underlying IO supports #path) + # * #sync + # * #sync= + # * #to_io + # + # + # (due to internal structure, documentation may appear under Zlib::GzipReader or + # Zlib::GzipWriter) + # + class GzipFile + # + # Creates a GzipReader or GzipWriter associated with `io`, passing in any + # necessary extra options, and executes the block with the newly created object + # just like File.open. + # + # The GzipFile object will be closed automatically after executing the block. If + # you want to keep the associated IO object open, you may call + # Zlib::GzipFile#finish method in the block. + # + def self.wrap: (IO io, *untyped) { (instance gz) -> void } -> void + + public + + # + # Closes the GzipFile object. This method calls close method of the associated + # IO object. Returns the associated IO object. + # + def close: () -> void + + # + # Same as IO#closed? + # + def closed?: () -> void + + # + # Returns comments recorded in the gzip file header, or nil if the comments is + # not present. + # + def comment: () -> String? + + # + # Returns CRC value of the uncompressed data. + # + def crc: () -> Integer + + # + # Closes the GzipFile object. Unlike Zlib::GzipFile#close, this method never + # calls the close method of the associated IO object. Returns the associated IO + # object. + # + def finish: () -> IO + + # + # Returns compression level. + # + def level: () -> Integer + + # + # Returns last modification time recorded in the gzip file header. + # + def mtime: () -> Time + + # + # Returns original filename recorded in the gzip file header, or `nil` if + # original filename is not present. + # + def orig_name: () -> String? + + # + # Returns OS code number recorded in the gzip file header. 
+ # + def os_code: () -> Integer + + # + # Same as IO#sync + # + def sync: () -> bool + + # + # Same as IO. If flag is `true`, the associated IO object must respond to the + # `flush` method. While `sync` mode is `true`, the compression ratio decreases + # sharply. + # + def sync=: (boolish) -> untyped + + # + # Same as IO. + # + def to_io: () -> IO + end +end diff --git a/vendor/rbs/zlib/0/gzip_writer.rbs b/vendor/rbs/zlib/0/gzip_writer.rbs new file mode 100644 index 00000000000..68e25a4e7e1 --- /dev/null +++ b/vendor/rbs/zlib/0/gzip_writer.rbs @@ -0,0 +1,237 @@ +# +# This module provides access to the [zlib library](http://zlib.net). Zlib is +# designed to be a portable, free, general-purpose, legally unencumbered -- that +# is, not covered by any patents -- lossless data-compression library for use on +# virtually any computer hardware and operating system. +# +# The zlib compression library provides in-memory compression and decompression +# functions, including integrity checks of the uncompressed data. +# +# The zlib compressed data format is described in RFC 1950, which is a wrapper +# around a deflate stream which is described in RFC 1951. +# +# The library also supports reading and writing files in gzip (.gz) format with +# an interface similar to that of IO. The gzip format is described in RFC 1952 +# which is also a wrapper around a deflate stream. +# +# The zlib format was designed to be compact and fast for use in memory and on +# communications channels. The gzip format was designed for single-file +# compression on file systems, has a larger header than zlib to maintain +# directory information, and uses a different, slower check method than zlib. 
+# +# See your system's zlib.h for further information about zlib +# +# ## Sample usage +# +# Using the wrapper to compress strings with default parameters is quite simple: +# +# require "zlib" +# +# data_to_compress = File.read("don_quixote.txt") +# +# puts "Input size: #{data_to_compress.size}" +# #=> Input size: 2347740 +# +# data_compressed = Zlib::Deflate.deflate(data_to_compress) +# +# puts "Compressed size: #{data_compressed.size}" +# #=> Compressed size: 887238 +# +# uncompressed_data = Zlib::Inflate.inflate(data_compressed) +# +# puts "Uncompressed data is: #{uncompressed_data}" +# #=> Uncompressed data is: The Project Gutenberg EBook of Don Quixote... +# +# ## Class tree +# +# * Zlib::Deflate +# * Zlib::Inflate +# * Zlib::ZStream +# * Zlib::Error +# * Zlib::StreamEnd +# * Zlib::NeedDict +# * Zlib::DataError +# * Zlib::StreamError +# * Zlib::MemError +# * Zlib::BufError +# * Zlib::VersionError +# * Zlib::InProgressError +# +# +# +# (if you have GZIP_SUPPORT) +# * Zlib::GzipReader +# * Zlib::GzipWriter +# * Zlib::GzipFile +# * Zlib::GzipFile::Error +# * Zlib::GzipFile::LengthError +# * Zlib::GzipFile::CRCError +# * Zlib::GzipFile::NoFooter +# +module Zlib + # + # Zlib::GzipWriter is a class for writing gzipped files. GzipWriter should be + # used with an instance of IO, or IO-like, object. + # + # Following two example generate the same result. + # + # Zlib::GzipWriter.open('hoge.gz') do |gz| + # gz.write 'jugemu jugemu gokou no surikire...' + # end + # + # File.open('hoge.gz', 'w') do |f| + # gz = Zlib::GzipWriter.new(f) + # gz.write 'jugemu jugemu gokou no surikire...' + # gz.close + # end + # + # To make like gzip(1) does, run following: + # + # orig = 'hoge.txt' + # Zlib::GzipWriter.open('hoge.gz') do |gz| + # gz.mtime = File.mtime(orig) + # gz.orig_name = orig + # gz.write IO.binread(orig) + # end + # + # NOTE: Due to the limitation of Ruby's finalizer, you must explicitly close + # GzipWriter objects by Zlib::GzipWriter#close etc. 
Otherwise, GzipWriter will + # be not able to write the gzip footer and will generate a broken gzip file. + # + class GzipWriter < Zlib::GzipFile + # + # Opens a file specified by `filename` for writing gzip compressed data, and + # returns a GzipWriter object associated with that file. Further details of + # this method are found in Zlib::GzipWriter.new and Zlib::GzipFile.wrap. + # + def self.open: (String filename) { (instance gz) -> void } -> instance + + public + + # + # Same as IO. + # + def <<: (_ToS obj) -> self + + # + # Specify the comment (`str`) in the gzip header. + # + def comment=: (String arg0) -> void + + # + # Flushes all the internal buffers of the GzipWriter object. The meaning of + # `flush` is same as in Zlib::Deflate#deflate. `Zlib::SYNC_FLUSH` is used if + # `flush` is omitted. It is no use giving flush `Zlib::NO_FLUSH`. + # + def flush: (?Integer flush) -> String + + # + # Specify the modification time (`mtime`) in the gzip header. Using an Integer. + # + # Setting the mtime in the gzip header does not effect the mtime of the file + # generated. Different utilities that expand the gzipped files may use the mtime + # header. For example the gunzip utility can use the `-N` flag which will set + # the resultant file's mtime to the value in the header. By default many tools + # will set the mtime of the expanded file to the mtime of the gzipped file, not + # the mtime in the header. + # + # If you do not set an mtime, the default value will be the time when + # compression started. Setting a value of 0 indicates no time stamp is + # available. + # + def mtime=: (string | _ToPath | IO file_name) -> Time + + # + # Specify the original name (`str`) in the gzip header. + # + def orig_name=: (String arg0) -> void + + # + # Total number of input bytes read so far. + # + def pos: () -> Integer + + # + # Same as IO. + # + def print: (*untyped arg0) -> NilClass + + # + # Same as IO. 
+ # + def printf: (String format_string, *untyped arg0) -> NilClass + + # + # Same as IO. + # + def putc: (Numeric | String arg0) -> untyped + + # + # Same as IO. + # + def puts: (*untyped arg0) -> NilClass + + # + # Total number of input bytes read so far. + # + def tell: () -> Integer + + # + # Same as IO. + # + def write: (*_ToS string) -> Integer + + private + + # + # Creates a GzipWriter object associated with `io`. `level` and `strategy` + # should be the same as the arguments of Zlib::Deflate.new. The GzipWriter + # object writes gzipped data to `io`. `io` must respond to the `write` method + # that behaves the same as IO#write. + # + # The `options` hash may be used to set the encoding of the data. + # `:external_encoding`, `:internal_encoding` and `:encoding` may be set as in + # IO::new. + # + def initialize: (_Writer io, Integer level, Integer strategy, **untyped opts) -> void + end +end diff --git a/vendor/rbs/zlib/0/zlib.rbs b/vendor/rbs/zlib/0/zlib.rbs new file mode 100644 index 00000000000..c14d6c9d129 --- /dev/null +++ b/vendor/rbs/zlib/0/zlib.rbs @@ -0,0 +1,6 @@ +Zlib::DEFAULT_COMPRESSION: Integer + +# +# Default deflate strategy which is used for normal data. 
+# +Zlib::DEFAULT_STRATEGY: Integer From b6b77a40c8744088b99898be53c7062c92380718 Mon Sep 17 00:00:00 2001 From: Gustavo Caso Date: Tue, 3 Oct 2023 16:59:27 +0200 Subject: [PATCH 3/6] refactor AppSec::Event to improve error handling and make sure to use the smaller version between the JSON string version and the compressed version for schema key values --- lib/datadog/appsec/event.rb | 152 ++++++++++++++++-------------- spec/datadog/appsec/event_spec.rb | 21 ++++- 2 files changed, 98 insertions(+), 75 deletions(-) diff --git a/lib/datadog/appsec/event.rb b/lib/datadog/appsec/event.rb index 685b7c6a252..7f7c330028d 100644 --- a/lib/datadog/appsec/event.rb +++ b/lib/datadog/appsec/event.rb @@ -42,102 +42,114 @@ module Event # # This is expected to be called only once per trace for the rate limiter # to properly apply - def self.record(span, *events) - # ensure rate limiter is called only when there are events to record - return if events.empty? || span.nil? + class << self + def record(span, *events) + # ensure rate limiter is called only when there are events to record + return if events.empty? || span.nil? - Datadog::AppSec::RateLimiter.limit(:traces) do - record_via_span(span, *events) - end - end - - def self.record_via_span(span, *events) - events.group_by { |e| e[:trace] }.each do |trace, event_group| - unless trace - Datadog.logger.debug { "{ error: 'no trace: cannot record', event_group: #{event_group.inspect}}" } - next + Datadog::AppSec::RateLimiter.limit(:traces) do + record_via_span(span, *events) end + end - trace.keep! 
- trace.set_tag( - Datadog::Tracing::Metadata::Ext::Distributed::TAG_DECISION_MAKER, - Datadog::Tracing::Sampling::Ext::Decision::ASM - ) + def record_via_span(span, *events) + events.group_by { |e| e[:trace] }.each do |trace, event_group| + unless trace + Datadog.logger.debug { "{ error: 'no trace: cannot record', event_group: #{event_group.inspect}}" } + next + end - # prepare and gather tags to apply - service_entry_tags = build_service_entry_tags(event_group) + trace.keep! + trace.set_tag( + Datadog::Tracing::Metadata::Ext::Distributed::TAG_DECISION_MAKER, + Datadog::Tracing::Sampling::Ext::Decision::ASM + ) - # complex types are unsupported, we need to serialize to a string - triggers = service_entry_tags.delete('_dd.appsec.triggers') - span.set_tag('_dd.appsec.json', JSON.dump({ triggers: triggers })) + # prepare and gather tags to apply + service_entry_tags = build_service_entry_tags(event_group) - # apply tags to service entry span - service_entry_tags.each do |key, value| - span.set_tag(key, value) + # apply tags to service entry span + service_entry_tags.each do |key, value| + span.set_tag(key, value) + end end end - end - def self.build_service_entry_tags(event_group) - event_group.each_with_object({}) do |event, tags| - # TODO: assume HTTP request context for now + def build_service_entry_tags(event_group) + waf_events = [] + entry_tags = event_group.each_with_object({ '_dd.origin' => 'appsec' }) do |event, tags| + # TODO: assume HTTP request context for now + if (request = event[:request]) + request.headers.each do |header, value| + tags["http.request.headers.#{header}"] = value if ALLOWED_REQUEST_HEADERS.include?(header.downcase) + end - if (request = event[:request]) - request_headers = request.headers.select do |k, _| - ALLOWED_REQUEST_HEADERS.include?(k.downcase) + tags['http.host'] = request.host + tags['http.useragent'] = request.user_agent + tags['network.client.ip'] = request.remote_addr end - request_headers.each do |header, value| - 
tags["http.request.headers.#{header}"] = value + if (response = event[:response]) + response.headers.each do |header, value| + tags["http.response.headers.#{header}"] = value if ALLOWED_RESPONSE_HEADERS.include?(header.downcase) + end end - tags['http.host'] = request.host - tags['http.useragent'] = request.user_agent - tags['network.client.ip'] = request.remote_addr - end + waf_result = event[:waf_result] + # accumulate triggers + waf_events += waf_result.events - if (response = event[:response]) - response_headers = response.headers.select do |k, _| - ALLOWED_RESPONSE_HEADERS.include?(k.downcase) - end + waf_result.derivatives.each do |key, value| + parsed_value = json_parse(value) + parsed_value_size = parsed_value.size + + compressed_data = compressed_and_base64_encoded(parsed_value) + compressed_data_size = compressed_data.size - response_headers.each do |header, value| - tags["http.response.headers.#{header}"] = value + if compressed_data_size >= MAX_ENCODED_SCHEMA_SIZE && parsed_value_size >= MAX_ENCODED_SCHEMA_SIZE + Datadog.logger.debug do + "Schema key: #{key} exceed max size value. We do not include it as part of the span tags" + end + next + end + + derivative_value = parsed_value_size > compressed_data_size ? compressed_data : parsed_value + + tags[key] = derivative_value end + + tags end - tags['_dd.origin'] = 'appsec' + appsec_events = json_parse({ triggers: waf_events }) + entry_tags['_dd.appsec.json'] = appsec_events if appsec_events + entry_tags + end - # accumulate triggers - waf_result = event[:waf_result] - tags['_dd.appsec.triggers'] ||= [] - tags['_dd.appsec.triggers'] += waf_result.events + private - waf_result.derivatives.each do |key, value| - data = Base64.encode64(gzip(JSON.dump(value))) + def compressed_and_base64_encoded(value) + return unless value - if data.size >= MAX_ENCODED_SCHEMA_SIZE - Datadog.logger.debug do - "Schema key: #{key} exceed max size value. 
We do not include it as part of the span tags" - end - next - end - tags[key] = data - end + Base64.encode64(gzip(value)) + rescue TypeError + nil + end - tags + def json_parse(value) + JSON.dump(value) + rescue ArgumentError + nil end - end - def self.gzip(value) - sio = StringIO.new - gz = Zlib::GzipWriter.new(sio, Zlib::DEFAULT_COMPRESSION, Zlib::DEFAULT_STRATEGY) - gz.write(value) - gz.close - sio.string + def gzip(value) + sio = StringIO.new + gz = Zlib::GzipWriter.new(sio, Zlib::DEFAULT_COMPRESSION, Zlib::DEFAULT_STRATEGY) + gz.write(value) + gz.close + sio.string + end end - - private_class_method :gzip end end end diff --git a/spec/datadog/appsec/event_spec.rb b/spec/datadog/appsec/event_spec.rb index b07506a058d..3ffd927a7a8 100644 --- a/spec/datadog/appsec/event_spec.rb +++ b/spec/datadog/appsec/event_spec.rb @@ -112,12 +112,23 @@ } end - it 'adds derivatives after comporessing and encode to Base64 to the top level span meta' do - meta = top_level_span.meta - gzip = described_class.send(:gzip, JSON.dump([{ 'host' => [8], 'version' => [8] }])) - result = Base64.encode64(gzip) + context 'JSON payload' do + it 'uses JSON string as is smaller than the compressed value' do + meta = top_level_span.meta + + expect(meta['_dd.appsec.s.req.headers']).to eq('[{"host":[8],"version":[8]}]') + end + end - expect(meta['_dd.appsec.s.req.headers']).to eq result + context 'Compressed payload' do + it 'uses compressed value when is smaller than JSON string' do + result = "H4sIAOYoHGUAA4aphwAAAA=\n" + expect(described_class).to receive(:compressed_and_base64_encoded).and_return(result) + + meta = top_level_span.meta + + expect(meta['_dd.appsec.s.req.headers']).to eq(result) + end end context 'derivative values exceed Event::MAX_ENCODED_SCHEMA_SIZE value' do From a686d5b1d0038f2226ecd59e032e2c7358dc2a39 Mon Sep 17 00:00:00 2001 From: Gustavo Caso Date: Tue, 3 Oct 2023 17:51:09 +0200 Subject: [PATCH 4/6] update RBS version --- Gemfile | 4 +- Steepfile | 2 +- 
lib/datadog/appsec/event.rb | 6 +- sig/datadog/appsec/event.rbs | 8 +- vendor/rbs/zlib/0/gzip_file.rbs | 228 ---------------------------- vendor/rbs/zlib/0/gzip_writer.rbs | 237 ------------------------------ vendor/rbs/zlib/0/zlib.rbs | 6 - 7 files changed, 14 insertions(+), 477 deletions(-) delete mode 100644 vendor/rbs/zlib/0/gzip_file.rbs delete mode 100644 vendor/rbs/zlib/0/gzip_writer.rbs delete mode 100644 vendor/rbs/zlib/0/zlib.rbs diff --git a/Gemfile b/Gemfile index 9164e8cd98c..11ac5d88b84 100644 --- a/Gemfile +++ b/Gemfile @@ -111,8 +111,8 @@ if RUBY_PLATFORM != 'java' end group :check do - if RUBY_VERSION >= '2.7.0' && RUBY_PLATFORM != 'java' - gem 'rbs', '~> 3.1.0', require: false + if RUBY_VERSION >= '3.0.0' && RUBY_PLATFORM != 'java' + gem 'rbs', '~> 3.2.0', require: false gem 'steep', '~> 1.4.0', require: false end end diff --git a/Steepfile b/Steepfile index 09ad4472bc5..538c8a11f47 100644 --- a/Steepfile +++ b/Steepfile @@ -619,6 +619,7 @@ target :ddtrace do library 'securerandom' library 'base64' library 'digest' + library 'zlib' repo_path 'vendor/rbs' library 'cucumber' @@ -643,7 +644,6 @@ target :ddtrace do library 'opentelemetry-api' library 'passenger' library 'webmock' - library 'zlib' # TODO: gem 'libddwaf' library 'libddwaf' diff --git a/lib/datadog/appsec/event.rb b/lib/datadog/appsec/event.rb index 7f7c330028d..8d590560320 100644 --- a/lib/datadog/appsec/event.rb +++ b/lib/datadog/appsec/event.rb @@ -75,6 +75,7 @@ def record_via_span(span, *events) end end + # rubocop: disable Metrics/MethodLength def build_service_entry_tags(event_group) waf_events = [] entry_tags = event_group.each_with_object({ '_dd.origin' => 'appsec' }) do |event, tags| @@ -101,6 +102,8 @@ def build_service_entry_tags(event_group) waf_result.derivatives.each do |key, value| parsed_value = json_parse(value) + next unless parsed_value + parsed_value_size = parsed_value.size compressed_data = compressed_and_base64_encoded(parsed_value) @@ -125,12 +128,11 @@ def 
build_service_entry_tags(event_group) entry_tags['_dd.appsec.json'] = appsec_events if appsec_events entry_tags end + # rubocop: enable Metrics/MethodLength private def compressed_and_base64_encoded(value) - return unless value - Base64.encode64(gzip(value)) rescue TypeError nil diff --git a/sig/datadog/appsec/event.rbs b/sig/datadog/appsec/event.rbs index 48c723959c0..9f30df5d97a 100644 --- a/sig/datadog/appsec/event.rbs +++ b/sig/datadog/appsec/event.rbs @@ -13,7 +13,13 @@ module Datadog def self.build_service_entry_tags: (Array[Hash[::Symbol, untyped]] event_group) -> Hash[::String, untyped] - def self.gzip: (untyped String) -> String + private + + def self.compressed_and_base64_encoded: (untyped value) -> untyped + + def self.json_parse: (untyped value) -> untyped + + def self.gzip: (untyped value) -> untyped end end end diff --git a/vendor/rbs/zlib/0/gzip_file.rbs b/vendor/rbs/zlib/0/gzip_file.rbs deleted file mode 100644 index ddb6dc9ec8c..00000000000 --- a/vendor/rbs/zlib/0/gzip_file.rbs +++ /dev/null @@ -1,228 +0,0 @@ -# -# This module provides access to the [zlib library](http://zlib.net). Zlib is -# designed to be a portable, free, general-purpose, legally unencumbered -- that -# is, not covered by any patents -- lossless data-compression library for use on -# virtually any computer hardware and operating system. -# -# The zlib compression library provides in-memory compression and decompression -# functions, including integrity checks of the uncompressed data. -# -# The zlib compressed data format is described in RFC 1950, which is a wrapper -# around a deflate stream which is described in RFC 1951. -# -# The library also supports reading and writing files in gzip (.gz) format with -# an interface similar to that of IO. The gzip format is described in RFC 1952 -# which is also a wrapper around a deflate stream. -# -# The zlib format was designed to be compact and fast for use in memory and on -# communications channels. 
The gzip format was designed for single-file -# compression on file systems, has a larger header than zlib to maintain -# directory information, and uses a different, slower check method than zlib. -# -# See your system's zlib.h for further information about zlib -# -# ## Sample usage -# -# Using the wrapper to compress strings with default parameters is quite simple: -# -# require "zlib" -# -# data_to_compress = File.read("don_quixote.txt") -# -# puts "Input size: #{data_to_compress.size}" -# #=> Input size: 2347740 -# -# data_compressed = Zlib::Deflate.deflate(data_to_compress) -# -# puts "Compressed size: #{data_compressed.size}" -# #=> Compressed size: 887238 -# -# uncompressed_data = Zlib::Inflate.inflate(data_compressed) -# -# puts "Uncompressed data is: #{uncompressed_data}" -# #=> Uncompressed data is: The Project Gutenberg EBook of Don Quixote... -# -# ## Class tree -# -# * Zlib::Deflate -# * Zlib::Inflate -# * Zlib::ZStream -# * Zlib::Error -# * Zlib::StreamEnd -# * Zlib::NeedDict -# * Zlib::DataError -# * Zlib::StreamError -# * Zlib::MemError -# * Zlib::BufError -# * Zlib::VersionError -# * Zlib::InProgressError -# -# -# -# (if you have GZIP_SUPPORT) -# * Zlib::GzipReader -# * Zlib::GzipWriter -# * Zlib::GzipFile -# * Zlib::GzipFile::Error -# * Zlib::GzipFile::LengthError -# * Zlib::GzipFile::CRCError -# * Zlib::GzipFile::NoFooter -# -module Zlib - # - # Zlib::GzipFile is an abstract class for handling a gzip formatted compressed - # file. The operations are defined in the subclasses, Zlib::GzipReader for - # reading, and Zlib::GzipWriter for writing. - # - # GzipReader should be used by associating an IO, or IO-like, object. - # - # ## Method Catalogue - # - # * ::wrap - # * ::open (Zlib::GzipReader::open and Zlib::GzipWriter::open) - # * #close - # * #closed? - # * #comment - # * comment= (Zlib::GzipWriter#comment=) - # * #crc - # * eof? (Zlib::GzipReader#eof?) 
- # * #finish - # * #level - # * lineno (Zlib::GzipReader#lineno) - # * lineno= (Zlib::GzipReader#lineno=) - # * #mtime - # * mtime= (Zlib::GzipWriter#mtime=) - # * #orig_name - # * orig_name (Zlib::GzipWriter#orig_name=) - # * #os_code - # * path (when the underlying IO supports #path) - # * #sync - # * #sync= - # * #to_io - # - # - # (due to internal structure, documentation may appear under Zlib::GzipReader or - # Zlib::GzipWriter) - # - class GzipFile - # - # Creates a GzipReader or GzipWriter associated with `io`, passing in any - # necessary extra options, and executes the block with the newly created object - # just like File.open. - # - # The GzipFile object will be closed automatically after executing the block. If - # you want to keep the associated IO object open, you may call - # Zlib::GzipFile#finish method in the block. - # - def self.wrap: (IO io, *untyped) { (instance gz) -> void } -> void - - public - - # - # Closes the GzipFile object. This method calls close method of the associated - # IO object. Returns the associated IO object. - # - def close: () -> void - - # - # Same as IO#closed? - # - def closed?: () -> void - - # - # Returns comments recorded in the gzip file header, or nil if the comments is - # not present. - # - def comment: () -> String? - - # - # Returns CRC value of the uncompressed data. - # - def crc: () -> Integer - - # - # Closes the GzipFile object. Unlike Zlib::GzipFile#close, this method never - # calls the close method of the associated IO object. Returns the associated IO - # object. - # - def finish: () -> IO - - # - # Returns compression level. - # - def level: () -> Integer - - # - # Returns last modification time recorded in the gzip file header. - # - def mtime: () -> Time - - # - # Returns original filename recorded in the gzip file header, or `nil` if - # original filename is not present. - # - def orig_name: () -> String? - - # - # Returns OS code number recorded in the gzip file header. 
- # - def os_code: () -> Integer - - # - # Same as IO#sync - # - def sync: () -> bool - - # - # Same as IO. If flag is `true`, the associated IO object must respond to the - # `flush` method. While `sync` mode is `true`, the compression ratio decreases - # sharply. - # - def sync=: (boolish) -> untyped - - # - # Same as IO. - # - def to_io: () -> IO - end -end diff --git a/vendor/rbs/zlib/0/gzip_writer.rbs b/vendor/rbs/zlib/0/gzip_writer.rbs deleted file mode 100644 index 68e25a4e7e1..00000000000 --- a/vendor/rbs/zlib/0/gzip_writer.rbs +++ /dev/null @@ -1,237 +0,0 @@ -# -# This module provides access to the [zlib library](http://zlib.net). Zlib is -# designed to be a portable, free, general-purpose, legally unencumbered -- that -# is, not covered by any patents -- lossless data-compression library for use on -# virtually any computer hardware and operating system. -# -# The zlib compression library provides in-memory compression and decompression -# functions, including integrity checks of the uncompressed data. -# -# The zlib compressed data format is described in RFC 1950, which is a wrapper -# around a deflate stream which is described in RFC 1951. -# -# The library also supports reading and writing files in gzip (.gz) format with -# an interface similar to that of IO. The gzip format is described in RFC 1952 -# which is also a wrapper around a deflate stream. -# -# The zlib format was designed to be compact and fast for use in memory and on -# communications channels. The gzip format was designed for single-file -# compression on file systems, has a larger header than zlib to maintain -# directory information, and uses a different, slower check method than zlib. 
-# -# See your system's zlib.h for further information about zlib -# -# ## Sample usage -# -# Using the wrapper to compress strings with default parameters is quite simple: -# -# require "zlib" -# -# data_to_compress = File.read("don_quixote.txt") -# -# puts "Input size: #{data_to_compress.size}" -# #=> Input size: 2347740 -# -# data_compressed = Zlib::Deflate.deflate(data_to_compress) -# -# puts "Compressed size: #{data_compressed.size}" -# #=> Compressed size: 887238 -# -# uncompressed_data = Zlib::Inflate.inflate(data_compressed) -# -# puts "Uncompressed data is: #{uncompressed_data}" -# #=> Uncompressed data is: The Project Gutenberg EBook of Don Quixote... -# -# ## Class tree -# -# * Zlib::Deflate -# * Zlib::Inflate -# * Zlib::ZStream -# * Zlib::Error -# * Zlib::StreamEnd -# * Zlib::NeedDict -# * Zlib::DataError -# * Zlib::StreamError -# * Zlib::MemError -# * Zlib::BufError -# * Zlib::VersionError -# * Zlib::InProgressError -# -# -# -# (if you have GZIP_SUPPORT) -# * Zlib::GzipReader -# * Zlib::GzipWriter -# * Zlib::GzipFile -# * Zlib::GzipFile::Error -# * Zlib::GzipFile::LengthError -# * Zlib::GzipFile::CRCError -# * Zlib::GzipFile::NoFooter -# -module Zlib - # - # Zlib::GzipWriter is a class for writing gzipped files. GzipWriter should be - # used with an instance of IO, or IO-like, object. - # - # Following two example generate the same result. - # - # Zlib::GzipWriter.open('hoge.gz') do |gz| - # gz.write 'jugemu jugemu gokou no surikire...' - # end - # - # File.open('hoge.gz', 'w') do |f| - # gz = Zlib::GzipWriter.new(f) - # gz.write 'jugemu jugemu gokou no surikire...' - # gz.close - # end - # - # To make like gzip(1) does, run following: - # - # orig = 'hoge.txt' - # Zlib::GzipWriter.open('hoge.gz') do |gz| - # gz.mtime = File.mtime(orig) - # gz.orig_name = orig - # gz.write IO.binread(orig) - # end - # - # NOTE: Due to the limitation of Ruby's finalizer, you must explicitly close - # GzipWriter objects by Zlib::GzipWriter#close etc. 
Otherwise, GzipWriter will - # be not able to write the gzip footer and will generate a broken gzip file. - # - class GzipWriter < Zlib::GzipFile - # - # Opens a file specified by `filename` for writing gzip compressed data, and - # returns a GzipWriter object associated with that file. Further details of - # this method are found in Zlib::GzipWriter.new and Zlib::GzipFile.wrap. - # - def self.open: (String filename) { (instance gz) -> void } -> instance - - public - - # - # Same as IO. - # - def <<: (_ToS obj) -> self - - # - # Specify the comment (`str`) in the gzip header. - # - def comment=: (String arg0) -> void - - # - # Flushes all the internal buffers of the GzipWriter object. The meaning of - # `flush` is same as in Zlib::Deflate#deflate. `Zlib::SYNC_FLUSH` is used if - # `flush` is omitted. It is no use giving flush `Zlib::NO_FLUSH`. - # - def flush: (?Integer flush) -> String - - # - # Specify the modification time (`mtime`) in the gzip header. Using an Integer. - # - # Setting the mtime in the gzip header does not effect the mtime of the file - # generated. Different utilities that expand the gzipped files may use the mtime - # header. For example the gunzip utility can use the `-N` flag which will set - # the resultant file's mtime to the value in the header. By default many tools - # will set the mtime of the expanded file to the mtime of the gzipped file, not - # the mtime in the header. - # - # If you do not set an mtime, the default value will be the time when - # compression started. Setting a value of 0 indicates no time stamp is - # available. - # - def mtime=: (string | _ToPath | IO file_name) -> Time - - # - # Specify the original name (`str`) in the gzip header. - # - def orig_name=: (String arg0) -> void - - # - # Total number of input bytes read so far. - # - def pos: () -> Integer - - # - # Same as IO. - # - def print: (*untyped arg0) -> NilClass - - # - # Same as IO. 
- # - def printf: (String format_string, *untyped arg0) -> NilClass - - # - # Same as IO. - # - def putc: (Numeric | String arg0) -> untyped - - # - # Same as IO. - # - def puts: (*untyped arg0) -> NilClass - - # - # Total number of input bytes read so far. - # - def tell: () -> Integer - - # - # Same as IO. - # - def write: (*_ToS string) -> Integer - - private - - # - # Creates a GzipWriter object associated with `io`. `level` and `strategy` - # should be the same as the arguments of Zlib::Deflate.new. The GzipWriter - # object writes gzipped data to `io`. `io` must respond to the `write` method - # that behaves the same as IO#write. - # - # The `options` hash may be used to set the encoding of the data. - # `:external_encoding`, `:internal_encoding` and `:encoding` may be set as in - # IO::new. - # - def initialize: (_Writer io, Integer level, Integer strategy, **untyped opts) -> void - end -end diff --git a/vendor/rbs/zlib/0/zlib.rbs b/vendor/rbs/zlib/0/zlib.rbs deleted file mode 100644 index c14d6c9d129..00000000000 --- a/vendor/rbs/zlib/0/zlib.rbs +++ /dev/null @@ -1,6 +0,0 @@ -Zlib::DEFAULT_COMPRESSION: Integer - -# -# Default deflate strategy which is used for normal data. -# -Zlib::DEFAULT_STRATEGY: Integer From 733d010d2a2f2aec907d29f16b32d8a9e56bd762 Mon Sep 17 00:00:00 2001 From: Gustavo Caso Date: Tue, 3 Oct 2023 17:54:57 +0200 Subject: [PATCH 5/6] improve the log message when the schema payload exceeds the maximum size --- lib/datadog/appsec/event.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/datadog/appsec/event.rb b/lib/datadog/appsec/event.rb index 8d590560320..62a2247bda8 100644 --- a/lib/datadog/appsec/event.rb +++ b/lib/datadog/appsec/event.rb @@ -111,7 +111,7 @@ def build_service_entry_tags(event_group) if compressed_data_size >= MAX_ENCODED_SCHEMA_SIZE && parsed_value_size >= MAX_ENCODED_SCHEMA_SIZE Datadog.logger.debug do - "Schema key: #{key} exceed max size value. 
We do not include it as part of the span tags" + "Schema key: #{key} exceeds the max size value. It will not be included as part of the span tags" end next end From 5b100187320e3a9f3789cec5a6b651a7250cb788 Mon Sep 17 00:00:00 2001 From: Gustavo Caso Date: Wed, 4 Oct 2023 18:30:10 +0200 Subject: [PATCH 6/6] update the Zlib algorithm and only compress the value if the JSON string exceeds a certain byte size --- lib/datadog/appsec/event.rb | 34 +++++++++++++++++++++---------- sig/datadog/appsec/event.rbs | 1 + spec/datadog/appsec/event_spec.rb | 6 ++++-- 3 files changed, 28 insertions(+), 13 deletions(-) diff --git a/lib/datadog/appsec/event.rb b/lib/datadog/appsec/event.rb index 62a2247bda8..b598241979a 100644 --- a/lib/datadog/appsec/event.rb +++ b/lib/datadog/appsec/event.rb @@ -37,6 +37,9 @@ module Event ].map!(&:downcase).freeze MAX_ENCODED_SCHEMA_SIZE = 25000 + # For more information about this number + # please check https://github.com/DataDog/dd-trace-rb/pull/3177#issuecomment-1747221082 + MIN_SCHEMA_SIZE_FOR_COMPRESSION = 260 # Record events for a trace # @@ -75,7 +78,7 @@ def record_via_span(span, *events) end end - # rubocop: disable Metrics/MethodLength + # rubocop:disable Metrics/MethodLength def build_service_entry_tags(event_group) waf_events = [] entry_tags = event_group.each_with_object({ '_dd.origin' => 'appsec' }) do |event, tags| @@ -106,19 +109,21 @@ def build_service_entry_tags(event_group) parsed_value_size = parsed_value.size - compressed_data = compressed_and_base64_encoded(parsed_value) - compressed_data_size = compressed_data.size + schema_value = if parsed_value_size >= MIN_SCHEMA_SIZE_FOR_COMPRESSION + compressed_and_base64_encoded(parsed_value) + else + parsed_value + end + next unless schema_value - if compressed_data_size >= MAX_ENCODED_SCHEMA_SIZE && parsed_value_size >= MAX_ENCODED_SCHEMA_SIZE + if schema_value.size >= MAX_ENCODED_SCHEMA_SIZE Datadog.logger.debug do "Schema key: #{key} exceeds the max size value. 
It will not be included as part of the span tags" end next end - derivative_value = parsed_value_size > compressed_data_size ? compressed_data : parsed_value - - tags[key] = derivative_value + tags[key] = schema_value end tags @@ -128,25 +133,32 @@ def build_service_entry_tags(event_group) entry_tags['_dd.appsec.json'] = appsec_events if appsec_events entry_tags end - # rubocop: enable Metrics/MethodLength + # rubocop:enable Metrics/MethodLength private def compressed_and_base64_encoded(value) Base64.encode64(gzip(value)) - rescue TypeError + rescue TypeError => e + Datadog.logger.debug do + "Failed to compress and encode value when populating AppSec::Event. Error: #{e.message}" + end nil end def json_parse(value) JSON.dump(value) - rescue ArgumentError + rescue ArgumentError => e + Datadog.logger.debug do + "Failed to parse value to JSON when populating AppSec::Event. Error: #{e.message}" + end nil end def gzip(value) sio = StringIO.new - gz = Zlib::GzipWriter.new(sio, Zlib::DEFAULT_COMPRESSION, Zlib::DEFAULT_STRATEGY) + # For an in depth comparison of Zlib options check https://github.com/DataDog/dd-trace-rb/pull/3177#issuecomment-1747215473 + gz = Zlib::GzipWriter.new(sio, Zlib::BEST_SPEED, Zlib::DEFAULT_STRATEGY) gz.write(value) gz.close sio.string diff --git a/sig/datadog/appsec/event.rbs b/sig/datadog/appsec/event.rbs index 9f30df5d97a..36d75ca6e55 100644 --- a/sig/datadog/appsec/event.rbs +++ b/sig/datadog/appsec/event.rbs @@ -6,6 +6,7 @@ module Datadog ALLOWED_RESPONSE_HEADERS: untyped MAX_ENCODED_SCHEMA_SIZE: Numeric + MIN_SCHEMA_SIZE_FOR_COMPRESSION: Numeric def self.record: (Datadog::Tracing::SpanOperation, *untyped events) -> (nil | untyped) diff --git a/spec/datadog/appsec/event_spec.rb b/spec/datadog/appsec/event_spec.rb index 3ffd927a7a8..e79ecfbde1e 100644 --- a/spec/datadog/appsec/event_spec.rb +++ b/spec/datadog/appsec/event_spec.rb @@ -113,7 +113,8 @@ end context 'JSON payload' do - it 'uses JSON string as is smaller than the compressed value' do 
+ it 'uses JSON string when do not exceeds MIN_SCHEMA_SIZE_FOR_COMPRESSION' do + stub_const('Datadog::AppSec::Event::MIN_SCHEMA_SIZE_FOR_COMPRESSION', 3000) meta = top_level_span.meta expect(meta['_dd.appsec.s.req.headers']).to eq('[{"host":[8],"version":[8]}]') @@ -121,8 +122,9 @@ end context 'Compressed payload' do - it 'uses compressed value when is smaller than JSON string' do + it 'uses compressed value when JSON string is bigger than MIN_SCHEMA_SIZE_FOR_COMPRESSION' do result = "H4sIAOYoHGUAA4aphwAAAA=\n" + stub_const('Datadog::AppSec::Event::MIN_SCHEMA_SIZE_FOR_COMPRESSION', 1) expect(described_class).to receive(:compressed_and_base64_encoded).and_return(result) meta = top_level_span.meta