From 8ba3e87d6a9c4d00efab14414757300599801da5 Mon Sep 17 00:00:00 2001 From: Kuba Suder Date: Wed, 28 Nov 2018 16:05:00 +0200 Subject: [PATCH 1/2] option to read a list of page titles from a file --- import_logs.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/import_logs.py b/import_logs.py index 267c182..2b56b32 100755 --- a/import_logs.py +++ b/import_logs.py @@ -768,6 +768,11 @@ def _create_parser(self): "changed General.action_title_category_delimiter in your Matomo configuration, you need to set this " "option to the same value in order to get a pretty page titles report." ) + option_parser.add_option( + '--page-titles-from', dest='page_titles_from', default=None, + help="Loads a mapping of URLs to page titles from a given file so that titles can be displayed " + "in the page titles report." + ) option_parser.add_option( '--dump-log-regex', dest='dump_log_regex', action='store_true', default=False, help="Prints out the regex string used to parse log lines and exists. Can be useful for using formats " @@ -927,6 +932,15 @@ def _parse_args(self, option_parser): else: logging.debug('Accepted hostnames: all') + self.page_titles_map = {} + + if self.options.page_titles_from: + for line in open(self.options.page_titles_from).readlines(): + separator = line.index('=') + url = line[0:separator].strip() + title = line[separator+1:-1].strip() + self.page_titles_map[url] = title + if self.options.log_format_regex: self.format = RegexFormat('custom', self.options.log_format_regex, self.options.log_date_format) elif self.options.log_format_name: @@ -1912,6 +1926,10 @@ def _get_hit_args(self, hit): urllib.quote(args['urlref'], '') ) if args['urlref'] != '' else '') ) + else: + page_title = config.page_titles_map.get(args['url']) + if page_title: + args['action_name'] = page_title if hit.generation_time_milli > 0: args['gt_ms'] = int(hit.generation_time_milli) From 8c55c0dcc75f500df22feee853005b7f29821fa9 Mon Sep 17 00:00:00 2001 From: Kuba Suder Date: Sat, 26 Jan 2019 15:20:15 +0200 Subject: [PATCH 2/2] look up the url without query part in page titles list --- import_logs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/import_logs.py b/import_logs.py index 2b56b32..23e8d39 100755 --- a/import_logs.py +++ b/import_logs.py @@ -1927,7 +1927,8 @@ def _get_hit_args(self, hit): ) if args['urlref'] != '' else '') ) else: - page_title = config.page_titles_map.get(args['url']) + url_without_query = re.sub(r'\?.*', '', args['url']) + page_title = config.page_titles_map.get(args['url']) or config.page_titles_map.get(url_without_query) if page_title: args['action_name'] = page_title