Skip to content

Commit

Permalink
Do not use defaultdict for Query unknowns
Browse files Browse the repository at this point in the history
Do not use defaultdict for Query.unknowns_by_pos and
Query.stopwords_by_pos. Otherwise there are pernicious side effects to
add new entries in these dctionaries when querying them after their
creation.

Reported-by: Mike Rombout @mrombout
Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
  • Loading branch information
pombredanne committed Nov 25, 2021
1 parent cc30ed2 commit 97a1a57
Show file tree
Hide file tree
Showing 4 changed files with 118 additions and 22 deletions.
19 changes: 12 additions & 7 deletions src/licensedcode/match.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,7 @@ def qcontains_stopwords(self):
stopwords_pos = qspan & query.stopwords_span
stopwords_pos = (pos for pos in stopwords_pos if pos != qspe)
qry_stopxpos = query.stopwords_by_pos
return any(qry_stopxpos[pos] for pos in stopwords_pos)
return any(qry_stopxpos.get(pos, 0) for pos in stopwords_pos)

def qrange(self):
"""
Expand Down Expand Up @@ -1216,8 +1216,12 @@ def filter_low_score(matches, min_score=100):
return kept, discarded


def filter_spurious_single_token(matches, query=None, unknown_count=5,
trace=TRACE_FILTER_SPURIOUS_SINGLE_TOKEN):
def filter_spurious_single_token(
matches,
query=None,
unknown_count=5,
trace=TRACE_FILTER_SPURIOUS_SINGLE_TOKEN,
):
"""
Return a filtered list of kept LicenseMatch matches and a list of
discardable matches given a `matches` list of LicenseMatch by removing
Expand Down Expand Up @@ -1249,9 +1253,10 @@ def filter_spurious_single_token(matches, query=None, unknown_count=5,
qend = match.qend

# compute the number of unknown tokens before and after this single
# matched position note: unknowns_by_pos is a defaultdict(int),
# shorts_and_digits is a set of integers
before = unknowns_by_pos[qstart - 1]
# matched position note:
# - unknowns_by_pos is a dict,
# - shorts_and_digits is a set of ints
before = unknowns_by_pos.get(qstart - 1, 0)
for p in range(qstart - 1 - unknown_count, qstart):
if p in shorts_and_digits:
before += 1
Expand All @@ -1262,7 +1267,7 @@ def filter_spurious_single_token(matches, query=None, unknown_count=5,
kept.append(match)
continue

after = unknowns_by_pos[qstart]
after = unknowns_by_pos.get(qstart, 0)
for p in range(qend, qend + 1 + unknown_count):
if p in shorts_and_digits:
after += 1
Expand Down
35 changes: 20 additions & 15 deletions src/licensedcode/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,15 +228,15 @@ def __init__(
# index of "known positions" (yes really!) to a number of unknown tokens
# after that known position. For unknowns at the start, the position is
# using the magic -1 key
self.unknowns_by_pos = defaultdict(int)
self.unknowns_by_pos = {}

# Span of "known positions" (yes really!) followed by unknown token(s)
self.unknowns_span = None

# index of "known positions" (yes really!) to a number of stopword
# tokens after that known position. For stopwords at the start, the
# position is using the magic -1 key
self.stopwords_by_pos = defaultdict(int)
self.stopwords_by_pos = {}

# Span of "known positions" (yes really!) followed by stopwords
self.stopwords_span = None
Expand Down Expand Up @@ -355,12 +355,12 @@ def tokens_with_unknowns(self):
"""
unknowns = self.unknowns_by_pos
# yield anything at the start
for _ in range(unknowns[-1]):
for _ in range(unknowns.get(-1, 0)):
yield None

for pos, token in enumerate(self.tokens):
yield token
for _ in range(unknowns[pos]):
for _ in range(unknowns.get(pos, 0)):
yield None

def tokens_by_line(
Expand All @@ -386,11 +386,13 @@ def tokens_by_line(
# bind frequently called functions to local scope
line_by_pos_append = self.line_by_pos.append

self_unknowns_by_pos = self.unknowns_by_pos
# we use a defaultdict as a convenience at construction time
unknowns_by_pos = defaultdict(int)
unknowns_pos = set()
unknowns_pos_add = unknowns_pos.add

self_stopwords_by_pos = self.stopwords_by_pos
# we use a defaultdict as a convenience at construction time
stopwords_by_pos = defaultdict(int)
stopwords_pos = set()
stopwords_pos_add = stopwords_pos.add

Expand Down Expand Up @@ -443,11 +445,11 @@ def tokens_by_line(
# If we have not yet started globally, then all tokens
# seen so far are stopwords and we keep a count of them
# in the magic "-1" position.
self_stopwords_by_pos[-1] += 1
stopwords_by_pos[-1] += 1
else:
# here we have a new unknwon token positioned right after
# the current known_pos
self_stopwords_by_pos[known_pos] += 1
stopwords_by_pos[known_pos] += 1
stopwords_pos_add(known_pos)
# we do not track stopwords, only their position
continue
Expand All @@ -456,11 +458,11 @@ def tokens_by_line(
# If we have not yet started globally, then all tokens
# seen so far are unknowns and we keep a count of them
# in the magic "-1" position.
self_unknowns_by_pos[-1] += 1
unknowns_by_pos[-1] += 1
else:
# here we have a new unknwon token positioned right after
# the current known_pos
self_unknowns_by_pos[known_pos] += 1
unknowns_by_pos[known_pos] += 1
unknowns_pos_add(known_pos)

line_tokens_append(tid)
Expand Down Expand Up @@ -492,11 +494,14 @@ def tokens_by_line(

yield line_tokens

# finally create a Span of positions followed by unkwnons and another
# for positions followed by stopwords used for intersection with the
# query span to do the scoring matches correctly
# finally update the attributes and create a Span of positions followed
# by unkwnons and another for positions followed by stopwords used for
# intersection with the query span to do the scoring matches correctly
self.unknowns_span = Span(unknowns_pos)
self.stopwords_span = Span(stopwords_pos)
# also convert the defaultdicts back to plain discts
self.unknowns_by_pos = dict(unknowns_by_pos)
self.stopwords_by_pos = dict(stopwords_by_pos)

def tokenize_and_build_runs(self, tokens_by_line, line_threshold=4):
"""
Expand Down Expand Up @@ -760,14 +765,14 @@ def tokens_with_unknowns(self):
unknowns = self.query.unknowns_by_pos
# yield anything at the start only if this is the first query run
if self.start == 0:
for _ in range(unknowns[-1]):
for _ in range(unknowns.get(-1, 0)):
yield None

for pos, token in self.tokens_with_pos():
yield token
if pos == self.end:
break
for _ in range(unknowns[pos]):
for _ in range(unknowns.get(pos, 0)):
yield None

def tokens_with_pos(self):
Expand Down
11 changes: 11 additions & 0 deletions tests/licensedcode/data/query/unknown_positions/lz4.license.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
This repository uses 2 different licenses :
- all files in the `lib` directory use a BSD 2-Clause license
- all other files use a GPLv2 license, unless explicitly stated otherwise

Relevant license is reminded at the top of each source file,
and with presence of COPYING or LICENSE file in associated directories.

This model is selected to emphasize that
files in the `lib` directory are designed to be included into 3rd party applications,
while all other files, in `programs`, `tests` or `examples`,
receive more limited attention and support for such scenario.
75 changes: 75 additions & 0 deletions tests/licensedcode/test_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import json
import os
from collections import defaultdict

from commoncode.testcase import FileBasedTesting
from licensedcode import cache
Expand Down Expand Up @@ -395,6 +396,31 @@ def test_query_run_unknowns(self):
expected = {-1: 2, 0: 4, 1: 3}
assert dict(q.unknowns_by_pos) == expected

def test_query_unknowns_by_pos_and_stopwords_are_not_defaultdic_and_not_changed_on_query(self):
idx = index.LicenseIndex(
[Rule(stored_text='a is the binary')],
_legalese=set(['binary']),
_spdx_tokens=set()
)
q = Query(query_string='binary that was a binary', idx=idx)
list(q.tokens_by_line())
assert q.unknowns_by_pos == {0: 2}
assert q.stopwords_by_pos == {0: 1}

assert not isinstance(q.unknowns_by_pos, defaultdict)
assert not isinstance(q.stopwords_by_pos, defaultdict)

try:
q.unknowns_by_pos[1]
assert q.unknowns_by_pos == {0: 2}
except KeyError:
pass
try:
q.stopwords_by_pos[1]
assert q.stopwords_by_pos == {0: 1}
except KeyError:
pass


class TestQueryWithMultipleRuns(IndexTesting):

Expand Down Expand Up @@ -768,3 +794,52 @@ def test_query_run_for_text_with_long_lines(self):
idx = cache.get_index()
assert len(Query(location1, idx=idx).query_runs) == 17
assert len(Query(location2, idx=idx).query_runs) == 15

def test_match_does_not_change_query_unknown_positions(self):
from licensedcode.match import LicenseMatch
from licensedcode.spans import Span

location = self.get_test_loc('query/unknown_positions/lz4.license.txt')
idx = cache.get_index()
# build a query first
qry1 = Query(location, idx=idx)
# this has the side effect to populate the unknown
txt = u' '.join(f'{i}-{idx.tokens_by_tid[t]}' for i, t in enumerate(qry1.tokens))
assert txt == (
'0-this 1-repository 2-uses 3-2 4-different 5-licenses '
'6-all 7-files 8-in 9-the 10-lib 11-directory 12-use 13-bsd 14-2 15-clause 16-license '
'17-all 18-other 19-files 20-use 21-gplv2 22-license 23-unless 24-explicitly 25-stated 26-otherwise '
'27-relevant 28-license 29-is 30-reminded 31-at 32-the 33-top 34-of 35-each 36-source 37-file '
'38-and 39-with 40-presence 41-of 42-copying 43-or 44-license 45-file 46-in 47-associated 48-directories '
'49-this 50-model 51-is 52-selected 53-to 54-emphasize 55-that '
'56-files 57-in 58-the 59-lib 60-directory 61-are 62-designed 63-to 64-be 65-included 66-into 67-3rd 68-party 69-applications '
'70-while 71-all 72-other 73-files 74-in 75-programs 76-tests 77-or 78-examples '
'79-receive 80-more 81-limited 82-attention 83-and 84-support 85-for 86-such 87-scenario'
)
list(qry1.tokens_by_line())
assert qry1.unknowns_by_pos == {}

# run matching
matches = idx.match(location=location)
match = matches[0]

rule = [
r for r in idx.rules_by_rid
if r.identifier == 'bsd-simplified_and_gpl-2.0_1.RULE'
][0]

expected = LicenseMatch(
matcher='2-aho',
rule=rule,
qspan=Span(0, 48),
ispan=Span(0, 48),
)

assert match == expected

# check that query unknown by pos is the same and empty
qry2 = match.query

# this was incorrectly returned as {15: 0, 20: 0, 21: 0, 41: 0, 43: 0}
# after querying done during matching
assert qry2.unknowns_by_pos == {}

0 comments on commit 97a1a57

Please sign in to comment.