From 43d479cd698923589ff3dd4f44e45dd900f4bac6 Mon Sep 17 00:00:00 2001
From: Stephan Richter
Date: Fri, 25 Jan 2019 18:04:21 -0500
Subject: [PATCH 1/2] Use efficient database queries to combine data.

This algorithm is 25-30x faster than the original one. Given that
context-included coverage has about 50x the data of the old format (on
our code base), this means that combining is now only 2-3x slower than
before.
---
 coverage/sqldata.py | 174 ++++++++++++++++++++++++++++++++------------
 1 file changed, 129 insertions(+), 45 deletions(-)

diff --git a/coverage/sqldata.py b/coverage/sqldata.py
index 91ad3cd5e..957be245b 100644
--- a/coverage/sqldata.py
+++ b/coverage/sqldata.py
@@ -326,6 +326,12 @@ def touch_file(self, filename, plugin_name=""):
         self.add_file_tracers({filename: plugin_name})
 
     def update(self, other_data, aliases=None):
+        """Update this data with data from another `CoverageData`
+        instance.
+
+        If `aliases` is provided, it's a `PathAliases` object that is used to
+        re-map paths to match the local machine's.
+        """
         if self._has_lines and other_data._has_arcs:
             raise CoverageException("Can't combine arc data with line data")
         if self._has_arcs and other_data._has_lines:
@@ -333,57 +339,135 @@ def update(self, other_data, aliases=None):
             raise CoverageException("Can't combine line data with arc data")
 
         aliases = aliases or PathAliases()
 
-        # See what we had already measured, for accurate conflict reporting.
-        this_measured = self.measured_files()
-
-        other_files = set()
-
         # Force the database we're writing to to exist before we start nesting
         # contexts.
         self._start_using()
 
-        # Start a single transaction in each file.
-        with self._connect(), other_data._connect():
-            # lines
-            if other_data._has_lines:
-                for context in other_data.measured_contexts():
-                    self.set_context(context)
-                    for filename in other_data.measured_files():
-                        lines = set(other_data.lines(filename, context=context))
-                        if lines:
-                            other_files.add(filename)
-                            filename = aliases.map(filename)
-                            lines.update(self.lines(filename, context=context) or ())
-                            self.add_lines({filename: lines})
-
-            # arcs
-            if other_data._has_arcs:
-                for context in other_data.measured_contexts():
-                    self.set_context(context)
-                    for filename in other_data.measured_files():
-                        arcs = set(other_data.arcs(filename, context=context))
-                        if arcs:
-                            other_files.add(filename)
-                            filename = aliases.map(filename)
-                            arcs.update(self.arcs(filename, context=context) or ())
-                            self.add_arcs({filename: arcs})
-
-            # file_tracers
-            for filename in other_files:
-                other_plugin = other_data.file_tracer(filename)
-                filename = aliases.map(filename)
-                if filename in this_measured:
-                    this_plugin = self.file_tracer(filename)
-                else:
-                    this_plugin = None
-                if this_plugin is None:
-                    self.add_file_tracers({filename: other_plugin})
-                elif this_plugin != other_plugin:
+        # Collectors for all arcs, lines, and tracers.
+        files = []
+        contexts = []
+        arcs = []
+        lines = []
+        tracers = {}
+        other_data.read()
+        with other_data._connect() as conn:
+            # Get file data.
+            cur = conn.execute('select path from file')
+            files.extend(aliases.map(path) for (path,) in cur)
+            cur.close()
+
+            # Get context data.
+            cur = conn.execute('select context from context')
+            contexts.extend(context for (context,) in cur)
+            cur.close()
+
+            # Get arc data.
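+            # The inner joins translate the arc table's integer file and
+            # context ids back into path and context strings.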
+            cur = conn.execute(
+                'select file.path, context.context, arc.fromno, arc.tono '
+                'from arc '
+                'inner join file on file.id = arc.file_id '
+                'inner join context on context.id = arc.context_id ')
+            arcs.extend(
+                (aliases.map(path), context, fromno, tono)
+                for (path, context, fromno, tono) in cur)
+            cur.close()
+
+            # Get line data.
+            cur = conn.execute(
+                'select file.path, context.context, line.lineno '
+                'from line '
+                'inner join file on file.id = line.file_id '
+                'inner join context on context.id = line.context_id ')
+            lines.extend(
+                (aliases.map(path), context, lineno)
+                for (path, context, lineno) in cur)
+            cur.close()
+
+            # Get tracer data.
+            cur = conn.execute(
+                'select file.path, tracer '
+                'from tracer '
+                'inner join file on file.id = tracer.file_id')
+            tracers.update(
+                (aliases.map(path), tracer)
+                for (path, tracer) in cur)
+            cur.close()
+
+        with self._connect() as conn:
+            conn.isolation_level = 'IMMEDIATE'
+
+            # Get all tracers in the DB. Files not in the tracer table are
+            # assumed to have an empty string tracer. Since SQLite does not
+            # support full outer joins, we have to make two queries to fill
+            # the dictionary.
+            this_tracers = {
+                path: ''
+                for path, in conn.execute('select path from file')}
+            this_tracers.update({
+                aliases.map(path): tracer
+                for path, tracer in conn.execute(
+                    'select file.path, tracer from tracer '
+                    'inner join file on file.id = tracer.file_id')})
+
+            # Create all file and context rows in the DB.
+            conn.executemany(
+                'insert or ignore into file (path) values (?)',
+                ((file,) for file in files))
+            file_ids = {
+                path: id
+                for id, path in conn.execute('select id, path from file')}
+            conn.executemany(
+                'insert or ignore into context (context) values (?)',
+                ((context,) for context in contexts))
+            context_ids = {
+                context: id
+                for id, context in conn.execute(
+                    'select id, context from context')}
+
+            # Prepare tracers, and fail if a conflict is found.
+            # tracer_map collects the tracers to be inserted, keyed by the
+            # re-mapped file path.
+            tracer_map = {}
+            for path in files:
+                this_tracer = this_tracers.get(path)
+                other_tracer = tracers.get(path, '')
+                # A path not yet measured here has tracer None, which never conflicts.
+                if this_tracer is not None and this_tracer != other_tracer:
                     raise CoverageException(
                         "Conflicting file tracer name for '%s': %r vs %r" % (
-                            filename, this_plugin, other_plugin,
-                        )
-                    )
+                            path, this_tracer, other_tracer))
+                tracer_map[path] = other_tracer
+
+            # Prepare arc and line rows to be inserted by converting the file
+            # and context strings with integer ids. Then use the efficient
+            # `executemany()` to insert all rows at once.
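+            # sqlite3's executemany() prepares each statement once and
+            # re-runs it for every row, inside the single transaction
+            # opened on this connection.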
+            arc_rows = (
+                (file_ids[file], context_ids[context], fromno, tono)
+                for file, context, fromno, tono in arcs)
+            line_rows = (
+                (file_ids[file], context_ids[context], lineno)
+                for file, context, lineno in lines)
+
+            self._choose_lines_or_arcs(arcs=bool(arcs), lines=bool(lines))
+
+            conn.executemany(
+                'insert or ignore into arc '
+                '(file_id, context_id, fromno, tono) values (?, ?, ?, ?)',
+                arc_rows)
+            conn.executemany(
+                'insert or ignore into line '
+                '(file_id, context_id, lineno) values (?, ?, ?)',
+                line_rows)
+            conn.executemany(
+                'insert or ignore into tracer (file_id, tracer) values (?, ?)',
+                ((file_ids[filename], tracer)
+                 for filename, tracer in tracer_map.items()))
+
+        # Update all internal cache data.
+        self._reset()
+        self.read()
 
     def erase(self, parallel=False):
         """Erase the data in this object.

From d9ad81f02a23072d94b1b105cc4aadf75e676d9b Mon Sep 17 00:00:00 2001
From: Stephan Richter
Date: Sun, 27 Jan 2019 23:11:49 -0500
Subject: [PATCH 2/2] Clean up leftovers from the first versions, and localize
 the file path lookup, which takes off another 20%.

---
 coverage/sqldata.py | 29 +++++++++++------------------
 1 file changed, 11 insertions(+), 18 deletions(-)

diff --git a/coverage/sqldata.py b/coverage/sqldata.py
index 957be245b..0219a8a22 100644
--- a/coverage/sqldata.py
+++ b/coverage/sqldata.py
@@ -344,21 +344,16 @@ def update(self, other_data, aliases=None):
         self._start_using()
 
         # Collectors for all arcs, lines, and tracers.
-        files = []
-        contexts = []
-        arcs = []
-        lines = []
-        tracers = {}
         other_data.read()
         with other_data._connect() as conn:
             # Get file data.
             cur = conn.execute('select path from file')
-            files.extend(aliases.map(path) for (path,) in cur)
+            files = {path: aliases.map(path) for (path,) in cur}
             cur.close()
 
             # Get context data.
             cur = conn.execute('select context from context')
-            contexts.extend(context for (context,) in cur)
+            contexts = [context for (context,) in cur]
             cur.close()
 
             # Get arc data.
@@ -367,9 +362,9 @@ def update(self, other_data, aliases=None):
                 'from arc '
                 'inner join file on file.id = arc.file_id '
                 'inner join context on context.id = arc.context_id ')
-            arcs.extend(
-                (aliases.map(path), context, fromno, tono)
-                for (path, context, fromno, tono) in cur)
+            arcs = [
+                (files[path], context, fromno, tono)
+                for (path, context, fromno, tono) in cur]
             cur.close()
 
             # Get line data.
@@ -378,9 +373,9 @@ def update(self, other_data, aliases=None):
                 'from line '
                 'inner join file on file.id = line.file_id '
                 'inner join context on context.id = line.context_id ')
-            lines.extend(
-                (aliases.map(path), context, lineno)
-                for (path, context, lineno) in cur)
+            lines = [
+                (files[path], context, lineno)
+                for (path, context, lineno) in cur]
             cur.close()
 
             # Get tracer data.
@@ -388,9 +383,7 @@ def update(self, other_data, aliases=None):
                 'select file.path, tracer '
                 'from tracer '
                 'inner join file on file.id = tracer.file_id')
-            tracers.update(
-                (aliases.map(path), tracer)
-                for (path, tracer) in cur)
+            tracers = {files[path]: tracer for (path, tracer) in cur}
             cur.close()
 
         with self._connect() as conn:
@@ -412,7 +405,7 @@ def update(self, other_data, aliases=None):
             # Create all file and context rows in the DB.
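+            # 'insert or ignore' leaves existing path and context rows
+            # untouched, so previously assigned ids stay valid.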
             conn.executemany(
                 'insert or ignore into file (path) values (?)',
-                ((file,) for file in files))
+                ((file,) for file in files.values()))
             file_ids = {
                 path: id
                 for id, path in conn.execute('select id, path from file')}
             conn.executemany(
                 'insert or ignore into context (context) values (?)',
                 ((context,) for context in contexts))
             context_ids = {
                 context: id
                 for id, context in conn.execute(
                     'select id, context from context')}
@@ -428,7 +421,7 @@ def update(self, other_data, aliases=None):
             # Prepare tracers, and fail if a conflict is found.
             # tracer_map collects the tracers to be inserted, keyed by the
             # re-mapped file path.
             tracer_map = {}
-            for path in files:
+            for path in files.values():
                 this_tracer = this_tracers.get(path)
                 other_tracer = tracers.get(path, '')
                 # A path not yet measured here has tracer None, which never conflicts.
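
A note for reviewers: the method rewritten here is what `coverage combine`
calls once per parallel data file. A minimal sketch of how that code path is
exercised through the public API (using the coverage 5.x names; the data file
names and the path pattern below are hypothetical):

    from coverage.sqldata import CoverageData
    from coverage.files import PathAliases

    # Re-map paths recorded on a CI worker onto the local checkout.
    aliases = PathAliases()
    aliases.add('/jenkins/build/*/src', 'src')

    combined = CoverageData('.coverage')
    # update() reads the other data file itself, then merges its files,
    # contexts, lines, arcs, and tracers with bulk SQL statements.
    combined.update(CoverageData('.coverage.worker1'), aliases=aliases)
    combined.write()

Since combining runs update() once per data file, the speedup measured above
applies to each of the parallel workers' files in turn.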