Skip to content

Commit

Permalink
Handle non-ascii characters properly in scancode-fingerprint #1690
Browse files Browse the repository at this point in the history
Signed-off-by: Steven Esser <sesser@nexb.com>
  • Loading branch information
steven-esser committed Nov 9, 2019
1 parent 79af902 commit a546574
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,11 @@
import binascii
from bitarray import bitarray
from bitarray import bitdiff
from licensedcode.tokenize import ngrams
import hashlib

from commoncode.text import toascii
from licensedcode.tokenize import ngrams

HASH_LENGTH = 128
SHINGLE_LENGTH = 3

Expand Down Expand Up @@ -103,6 +105,9 @@ def process_shingles(self, shingle, weighted_list):
"""
Modify the weighted list with respect to the given shingle.
"""
# convert other encodings to ascii. See #1690.
shingle = toascii(shingle)

hash = hashlib.md5(shingle.encode()).digest()
result = self.bitarray_from_bytes(hash)

Expand Down
12 changes: 11 additions & 1 deletion plugins/scancode-fingerprint/tests/test_fingerprint.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
Expand Down Expand Up @@ -131,10 +132,19 @@ def test_hex_digest3(self):
result = simhash.hex_digest()
assert result == '7f43e1b18f9c0e705fcf28007bc41754'

def test_hex_digest3(self):
def test_hex_digest4(self):
    """
    Check that hex_digest() yields None for a Simhash that was just
    created and never updated.
    """
    simhash = Simhash()
    # None is a singleton: compare by identity rather than equality.
    assert simhash.hex_digest() is None

def test_hex_digest_non_ascii(self):
    """
    Ensure non-ascii characters are handled properly. See #1690.
    """
    text = 'Copyright (c) Mário Morgado'
    expected = '01010040c1300a05ce41804024000001'

    fingerprint = Simhash()
    fingerprint.update(text)

    assert fingerprint.hex_digest() == expected

def test_update(self):
simhash = Simhash()
assert simhash.tokens == []
Expand Down

0 comments on commit a546574

Please sign in to comment.