Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Gandi VirtualHost log format #316

Merged
merged 5 commits into from
Jun 28, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions import_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
import textwrap
import collections
import glob
import io

# Avoid "got more than 100 headers" error
http.client._MAXHEADERS = 1000
Expand Down Expand Up @@ -459,6 +460,8 @@ def get(self, key):
# Extends the common log format with two trailing double-quoted fields:
# the referrer and the user agent.
_NCSA_EXTENDED_LOG_FORMAT = (
    _COMMON_LOG_FORMAT
    + r'\s+"(?P<referrer>.*?)"'
    + r'\s+"(?P<user_agent>.*?)"'
)


_S3_LOG_FORMAT = (
r'\S+\s+(?P<host>\S+)\s+\[(?P<date>.*?)\s+(?P<timezone>.*?)\]\s+(?P<ip>[\w*.:-]+)\s+'
r'(?P<userid>\S+)\s+\S+\s+\S+\s+\S+\s+"(?P<method>\S+)\s+(?P<path>.*?)\s+\S+"\s+(?P<status>\d+)\s+\S+\s+(?P<length>\S+)\s+'
Expand All @@ -483,6 +486,10 @@ def get(self, key):
r'.*:\ (?P<ip>[\w*.]+).*\[(?P<date>.*)\].*\ (?P<status>\b\d{3}\b)\ (?P<length>\d+)\ -.*\"(?P<method>\S+)\ (?P<path>\S+).*'
)

# Gandi Simple Hosting virtual-host access log, one field group per line:
#   <host> <ip> <ident> <userid> [<date> <timezone>] (<generation time>)
#   "<method> <path> <protocol>" <status> <length> "<referrer>" "<user agent>"
#
# The ip class accepts ':' (and the other characters the S3 format allows) so
# that IPv6 client addresses match as well as dotted IPv4 ones.
# The '-' in the host class sits last so it is unambiguously a literal hyphen.
# NOTE(review): generation_time_secs captures the unit too (e.g. "0 s");
# confirm the consumer tolerates a non-numeric value.
_GANDI_SIMPLE_HOSTING_FORMAT = (
    r'(?P<host>[0-9a-zA-Z_.-]+)\s+'
    r'(?P<ip>[\w*.:-]+)\s+\S+\s+(?P<userid>\S+)\s+'
    r'\[(?P<date>.+?)\s+(?P<timezone>.+?)\]\s+'
    r'\((?P<generation_time_secs>[0-9a-zA-Z\s]*)\)\s+'
    r'"(?P<method>[A-Z]+)\s+(?P<path>\S+)\s+(\S+)"\s+'
    r'(?P<status>[0-9]+)\s+(?P<length>\S+)\s+'
    r'"(?P<referrer>\S+)"\s+"(?P<user_agent>[^"]+)"'
)
tups marked this conversation as resolved.
Show resolved Hide resolved

FORMATS = {
'common': RegexFormat('common', _COMMON_LOG_FORMAT),
'common_vhost': RegexFormat('common_vhost', _HOST_PREFIX + _COMMON_LOG_FORMAT),
Expand All @@ -498,7 +505,8 @@ def get(self, key):
'elb': RegexFormat('elb', _ELB_LOG_FORMAT, '%Y-%m-%dT%H:%M:%S'),
'nginx_json': JsonFormat('nginx_json'),
'ovh': RegexFormat('ovh', _OVH_FORMAT),
'haproxy': RegexFormat('haproxy', _HAPROXY_FORMAT, '%d/%b/%Y:%H:%M:%S.%f')
'haproxy': RegexFormat('haproxy', _HAPROXY_FORMAT, '%d/%b/%Y:%H:%M:%S.%f'),
'gandi': RegexFormat('gandi', _GANDI_SIMPLE_HOSTING_FORMAT, '%d/%b/%Y:%H:%M:%S')
}

##
Expand Down Expand Up @@ -1527,9 +1535,10 @@ def _call(self, path, args, headers=None, url=None, data=None):
self.RedirectHandlerWithLogging(),
urllib.request.HTTPSHandler(**https_handler_args))
response = opener.open(request, timeout = timeout)
encoding = response.info().get_content_charset('utf-8')

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

result = response.read()
response.close()
return result
return result.decode(encoding)
tups marked this conversation as resolved.
Show resolved Hide resolved

def _call_api(self, method, **kwargs):
"""
Expand Down
2 changes: 2 additions & 0 deletions tests/logs/gandi.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
www.example.com 1.2.3.4 - theuser [10/Feb/2012:16:42:07 -0500] (0 s) "GET / HTTP/1.1" 301 368 "-" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
www.example.com 125.125.125.125 - - [10/Feb/2012:16:42:07 -0500] (0 s) "GET / HTTP/1.1" 200 1124 "https://www.example.com/" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:90.0) Gecko/20100101 Firefox/90.0"
69 changes: 68 additions & 1 deletion tests/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def _test_ipv6(format_name, log_file = None):

for format_name in import_logs.FORMATS.keys():
# w3c extended tested by iis and netscaler log files; amazon cloudfront tested later
if format_name == 'w3c_extended' or format_name == 'amazon_cloudfront' or format_name == 'ovh' or format_name == 'haproxy' or format_name == 'incapsula_w3c':
if format_name == 'w3c_extended' or format_name == 'amazon_cloudfront' or format_name == 'ovh' or format_name == 'gandi' or format_name == 'haproxy' or format_name == 'incapsula_w3c':
continue

# 'Testing autodetection of format ' + format_name
Expand Down Expand Up @@ -416,6 +416,9 @@ def check_match_groups(format_name, groups):
def check_ovh_groups(groups):
    """Check the groups matched for the ovh format.

    Delegates entirely to check_common_complete_groups.
    """
    check_common_complete_groups(groups)

def check_gandi_groups(groups):
    """Check the groups matched for the gandi format.

    Delegates entirely to check_common_complete_groups.
    """
    check_common_complete_groups(groups)

def check_haproxy_groups(groups):
assert groups['ip'] == '4.3.2.1'
assert groups['date'] == '25/Sep/2018:06:05:27.584'
Expand Down Expand Up @@ -811,6 +814,70 @@ def test_ovh_parsing():

import_logs.config.options.log_hostname = 'foo'

def test_gandi_parsing():
    """test parsing of gandi logs (which needs to be forced, as it's not autodetected)

    Parses logs/gandi.log with the 'gandi' format forced and checks the
    fields recorded for each of its two lines.
    """

    file_ = 'logs/gandi.log'

    # have to override previous globals override for this test
    import_logs.config.options.custom_w3c_fields = {}
    Recorder.recorders = []
    import_logs.parser = import_logs.Parser()
    import_logs.config.format = import_logs.FORMATS['gandi']
    import_logs.config.options.log_hostname = None
    import_logs.config.options.enable_http_redirects = True
    import_logs.config.options.enable_http_errors = True
    import_logs.config.options.replay_tracking = False
    import_logs.config.options.w3c_time_taken_in_millisecs = False
    import_logs.parser.parse(file_)

    hits = [hit.__dict__ for hit in Recorder.recorders]

    # Checked before indexing so a wrong hit count fails here with a clear
    # assertion instead of an IndexError further down.
    assert len(hits) == 2

    # First line: request by 'theuser', answered with a 301 redirect.
    assert hits[0]['status'] == '301'
    assert hits[0]['userid'] == 'theuser'
    assert hits[0]['is_error'] == False
    assert hits[0]['extension'] == '/'
    assert hits[0]['is_download'] == False
    assert hits[0]['referrer'] == ''
    assert hits[0]['args'] == {'cvar': {1: ['HTTP-method', 'GET']}, 'uid': 'theuser'}
    assert hits[0]['length'] == 368
    assert hits[0]['generation_time_milli'] == 0
    assert hits[0]['host'] == 'www.example.com'
    assert hits[0]['filename'] == 'logs/gandi.log'
    assert hits[0]['is_redirect'] == True
    # The log timestamp is 16:42:07 -0500; the recorded date is 21:42:07.
    assert hits[0]['date'] == datetime.datetime(2012, 2, 10, 21, 42, 7)
    assert hits[0]['lineno'] == 0
    assert hits[0]['ip'] == '1.2.3.4'
    assert hits[0]['query_string'] == ''
    assert hits[0]['path'] == '/'
    assert hits[0]['is_robot'] == False
    assert hits[0]['full_path'] == '/'
    assert hits[0]['user_agent'] == 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'

    # Second line: no user ('-'), 200 response with a referrer.
    assert hits[1]['status'] == '200'
    assert hits[1]['userid'] is None
    assert hits[1]['is_error'] == False
    assert hits[1]['extension'] == '/'
    assert hits[1]['is_download'] == False
    assert hits[1]['referrer'] == 'https://www.example.com/'
    # No userid on this line, so no 'uid' entry is expected in args.
    assert hits[1]['args'] == {'cvar': {1: ['HTTP-method', 'GET']}}
    assert hits[1]['length'] == 1124
    assert hits[1]['generation_time_milli'] == 0
    assert hits[1]['host'] == 'www.example.com'
    assert hits[1]['filename'] == 'logs/gandi.log'
    assert hits[1]['is_redirect'] == False
    assert hits[1]['date'] == datetime.datetime(2012, 2, 10, 21, 42, 7)
    assert hits[1]['lineno'] == 1
    assert hits[1]['ip'] == '125.125.125.125'
    assert hits[1]['query_string'] == ''
    assert hits[1]['path'] == '/'
    assert hits[1]['is_robot'] == False
    assert hits[1]['full_path'] == '/'
    assert hits[1]['user_agent'] == 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:90.0) Gecko/20100101 Firefox/90.0'

    # Put log_hostname back to 'foo' (it was set to None above).
    import_logs.config.options.log_hostname = 'foo'

def test_incapsulaw3c_parsing():
"""test parsing of incapsula w3c logs (which needs to be forced, as it's not autodetected)"""

Expand Down