diff --git a/plugins/scancode-fingerprint/src/plugin_fingerprint/fingerprint.py b/plugins/scancode-fingerprint/src/plugin_fingerprint/fingerprint.py index acfcf3cf730..5344f755efe 100644 --- a/plugins/scancode-fingerprint/src/plugin_fingerprint/fingerprint.py +++ b/plugins/scancode-fingerprint/src/plugin_fingerprint/fingerprint.py @@ -25,9 +25,11 @@ import binascii from bitarray import bitarray from bitarray import bitdiff -from licensedcode.tokenize import ngrams import hashlib +from commoncode.text import toascii +from licensedcode.tokenize import ngrams + HASH_LENGTH = 128 SHINGLE_LENGTH = 3 @@ -103,6 +105,9 @@ def process_shingles(self, shingle, weighted_list): """ Modify weighted list wrt to shingle """ + # Normalize the shingle to ASCII before it is encoded and hashed. See #1690. + shingle = toascii(shingle) + hash = hashlib.md5(shingle.encode()).digest() result = self.bitarray_from_bytes(hash) diff --git a/plugins/scancode-fingerprint/tests/test_fingerprint.py b/plugins/scancode-fingerprint/tests/test_fingerprint.py index f5fffa0f78e..54b66c641c5 100644 --- a/plugins/scancode-fingerprint/tests/test_fingerprint.py +++ b/plugins/scancode-fingerprint/tests/test_fingerprint.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # # Copyright (c) nexB Inc. and others. All rights reserved. # http://nexb.com and https://github.com/nexB/scancode-toolkit/ @@ -131,10 +132,19 @@ def test_hex_digest3(self): result = simhash.hex_digest() assert result == '7f43e1b18f9c0e705fcf28007bc41754' - def test_hex_digest3(self): + def test_hex_digest4(self): simhash = Simhash() assert simhash.hex_digest() == None + # Ensure input containing non-ASCII characters produces the expected digest. See #1690. + def test_hex_digest_non_ascii(self): + simhash = Simhash() + + simhash.update('Copyright (c) Mário Morgado') + result = simhash.hex_digest() + + assert result == '01010040c1300a05ce41804024000001' + def test_update(self): simhash = Simhash() assert simhash.tokens == []