diff --git a/import_logs.py b/import_logs.py index 0786974..1b9e3d3 100755 --- a/import_logs.py +++ b/import_logs.py @@ -52,6 +52,7 @@ import textwrap import collections import glob +import io # Avoid "got more than 100 headers" error http.client._MAXHEADERS = 1000 @@ -459,6 +460,8 @@ def get(self, key): _NCSA_EXTENDED_LOG_FORMAT = (_COMMON_LOG_FORMAT + r'\s+"(?P.*?)"\s+"(?P.*?)"' ) + + _S3_LOG_FORMAT = ( r'\S+\s+(?P\S+)\s+\[(?P.*?)\s+(?P.*?)\]\s+(?P[\w*.:-]+)\s+' r'(?P\S+)\s+\S+\s+\S+\s+\S+\s+"(?P\S+)\s+(?P.*?)\s+\S+"\s+(?P\d+)\s+\S+\s+(?P\S+)\s+' @@ -483,6 +486,10 @@ def get(self, key): r'.*:\ (?P[\w*.]+).*\[(?P.*)\].*\ (?P\b\d{3}\b)\ (?P\d+)\ -.*\"(?P\S+)\ (?P\S+).*' ) +_GANDI_SIMPLE_HOSTING_FORMAT = ( + r'(?P[0-9a-zA-Z-_.]+)\s+(?P[a-zA-Z0-9.]+)\s+\S+\s+(?P\S+)\s+\[(?P.+?)\s+(?P.+?)\]\s+\((?P[0-9a-zA-Z\s]*)\)\s+"(?P[A-Z]+)\s+(?P\S+)\s+(\S+)"\s+(?P[0-9]+)\s+(?P\S+)\s+"(?P\S+)"\s+"(?P[^"]+)"' +) + FORMATS = { 'common': RegexFormat('common', _COMMON_LOG_FORMAT), 'common_vhost': RegexFormat('common_vhost', _HOST_PREFIX + _COMMON_LOG_FORMAT), @@ -498,7 +505,8 @@ def get(self, key): 'elb': RegexFormat('elb', _ELB_LOG_FORMAT, '%Y-%m-%dT%H:%M:%S'), 'nginx_json': JsonFormat('nginx_json'), 'ovh': RegexFormat('ovh', _OVH_FORMAT), - 'haproxy': RegexFormat('haproxy', _HAPROXY_FORMAT, '%d/%b/%Y:%H:%M:%S.%f') + 'haproxy': RegexFormat('haproxy', _HAPROXY_FORMAT, '%d/%b/%Y:%H:%M:%S.%f'), + 'gandi': RegexFormat('gandi', _GANDI_SIMPLE_HOSTING_FORMAT, '%d/%b/%Y:%H:%M:%S') } ## @@ -1527,9 +1535,10 @@ def _call(self, path, args, headers=None, url=None, data=None): self.RedirectHandlerWithLogging(), urllib.request.HTTPSHandler(**https_handler_args)) response = opener.open(request, timeout = timeout) + encoding = response.info().get_content_charset('utf-8') result = response.read() response.close() - return result + return result.decode(encoding) def _call_api(self, method, **kwargs): """ diff --git a/tests/logs/gandi.log b/tests/logs/gandi.log new file mode 100644 index 
0000000..5da7fae --- /dev/null +++ b/tests/logs/gandi.log @@ -0,0 +1,2 @@ +www.example.com 1.2.3.4 - theuser [10/Feb/2012:16:42:07 -0500] (0 s) "GET / HTTP/1.1" 301 368 "-" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11" +www.example.com 125.125.125.125 - - [10/Feb/2012:16:42:07 -0500] (0 s) "GET / HTTP/1.1" 200 1124 "https://www.example.com/" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:90.0) Gecko/20100101 Firefox/90.0" \ No newline at end of file diff --git a/tests/test_main.py b/tests/test_main.py index fd203da..b7ae020 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -115,7 +115,7 @@ def _test_ipv6(format_name, log_file = None): for format_name in import_logs.FORMATS.keys(): # w3c extended tested by iis and netscaler log files; amazon cloudfront tested later - if format_name == 'w3c_extended' or format_name == 'amazon_cloudfront' or format_name == 'ovh' or format_name == 'haproxy' or format_name == 'incapsula_w3c': + if format_name == 'w3c_extended' or format_name == 'amazon_cloudfront' or format_name == 'ovh' or format_name == 'gandi' or format_name == 'haproxy' or format_name == 'incapsula_w3c': continue # 'Testing autodetection of format ' + format_name @@ -416,6 +416,9 @@ def check_match_groups(format_name, groups): def check_ovh_groups(groups): check_common_complete_groups(groups) +def check_gandi_groups(groups): + check_common_complete_groups(groups) + def check_haproxy_groups(groups): assert groups['ip'] == '4.3.2.1' assert groups['date'] == '25/Sep/2018:06:05:27.584' @@ -811,6 +814,70 @@ def test_ovh_parsing(): import_logs.config.options.log_hostname = 'foo' +def test_gandi_parsing(): + """test parsing of gandi logs (which needs to be forced, as it's not autodetected)""" + + file_ = 'logs/gandi.log' + + # have to override previous globals override for this test + import_logs.config.options.custom_w3c_fields = {} + Recorder.recorders = [] + import_logs.parser = 
import_logs.Parser() + import_logs.config.format = import_logs.FORMATS['gandi'] + import_logs.config.options.log_hostname = None + import_logs.config.options.enable_http_redirects = True + import_logs.config.options.enable_http_errors = True + import_logs.config.options.replay_tracking = False + import_logs.config.options.w3c_time_taken_in_millisecs = False + import_logs.parser.parse(file_) + + hits = [hit.__dict__ for hit in Recorder.recorders] + + assert hits[0]['status'] == u'301' + assert hits[0]['userid'] == 'theuser' + assert hits[0]['is_error'] == False + assert hits[0]['extension'] == '/' + assert hits[0]['is_download'] == False + assert hits[0]['referrer'] == '' + assert hits[0]['args'] == {'cvar': {1: ['HTTP-method', 'GET']}, 'uid': 'theuser'} + assert hits[0]['generation_time_milli'] == 0 + assert hits[0]['host'] == 'www.example.com' + assert hits[0]['filename'] == 'logs/gandi.log' + assert hits[0]['is_redirect'] == True + assert hits[0]['date'] == datetime.datetime(2012, 2, 10, 21, 42, 0o7) + assert hits[0]['lineno'] == 0 + assert hits[0]['ip'] == '1.2.3.4' + assert hits[0]['query_string'] == '' + assert hits[0]['path'] == '/' + assert hits[0]['is_robot'] == False + assert hits[0]['full_path'] == '/' + assert hits[0]['user_agent'] == 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11' + + assert hits[1]['status'] == u'200' + assert hits[1]['userid'] == None + assert hits[1]['is_error'] == False + assert hits[1]['extension'] == '/' + assert hits[1]['is_download'] == False + assert hits[1]['referrer'] == u'https://www.example.com/' + assert hits[1]['args'] == {'cvar': {1: ['HTTP-method', 'GET']}} + assert hits[1]['length'] == 1124 + assert hits[1]['generation_time_milli'] == 0 + assert hits[1]['host'] == 'www.example.com' + assert hits[1]['filename'] == 'logs/gandi.log' + assert hits[1]['is_redirect'] == False + assert hits[1]['date'] == datetime.datetime(2012, 2, 10, 21, 42, 0o7) + 
assert hits[1]['lineno'] == 1 + assert hits[1]['ip'] == u'125.125.125.125' + assert hits[1]['query_string'] == u'' + assert hits[1]['path'] == '/' + assert hits[1]['is_robot'] == False + assert hits[1]['full_path'] == u'/' + assert hits[1]['user_agent'] == u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:90.0) Gecko/20100101 Firefox/90.0' + + assert len(hits) == 2 + + import_logs.config.options.log_hostname = 'foo' + def test_incapsulaw3c_parsing(): """test parsing of incapsula w3c logs (which needs to be forced, as it's not autodetected)"""