diff --git a/frontends/concrete-python/examples/levenshtein_distance/IBAN_name_check.ipynb b/frontends/concrete-python/examples/levenshtein_distance/IBAN_name_check.ipynb new file mode 100644 index 0000000000..de368acfa6 --- /dev/null +++ b/frontends/concrete-python/examples/levenshtein_distance/IBAN_name_check.ipynb @@ -0,0 +1,335 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "bf2a3e8d-bf57-4841-a863-1eab96c91373", + "metadata": {}, + "source": [ + "# Comparing encrypted IBAN names\n", + "\n", + "In the banking industry, before completing any transfer from Bank A to Bank B, one needs to check that the name given for the target account (by the user of Bank A) and the actual name on the target account (at Bank B) match. When this is done between continents, such a check can be long and complicated. Thus, third parties are used to pre-check the IBAN names. For privacy reasons, however, this pre-check must be done over encrypted names.\n", + "\n", + "In this small tutorial, we show how to use our TFHE Levenshtein distance computations to perform such a privacy-preserving check, very simply and directly in Python. This tutorial can be easily configured, for example to change the way strings are normalized before encryption and comparison. " + ] + }, + { + "cell_type": "markdown", + "id": "dc96a80f-0b14-4e64-a33f-31a60351453d", + "metadata": {}, + "source": [ + "## Importing our FHE Levenshtein computations\n", + "\n", + "One can have a look at the `levenshtein_distance.py` file to see how the FHE computations are handled." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "56ba9e20-ca46-4aa6-a0f7-86ca13480a52", + "metadata": {}, + "outputs": [], + "source": [ + "from levenshtein_distance import *" + ] + }, + { + "cell_type": "markdown", + "id": "494cd58c-ea28-4547-92c5-80ed4ba83964", + "metadata": {}, + "source": [ + "## Define the comparison functions\n", + "\n", + "FHE computation will happen in `calculate_and_return`, if `fhe_or_simulate` is set to `fhe`." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2410b6b3-0c21-4178-b8dc-f734ec0afd40", + "metadata": {}, + "outputs": [], + "source": [ + "class Arguments:\n", + " \"\"\"Fake an argument parser, to be able to reuse the functions from levenshtein_distance.py.\"\"\"\n", + " def __init__(self, max_string_length = 5, show_mlir = False, show_optimizer = False):\n", + " self.max_string_length = max_string_length\n", + " self.show_mlir = show_mlir\n", + " self.show_optimizer = show_optimizer\n", + "\n", + "def normalized_string(st):\n", + " \"\"\"Normalize a string, to later make that the distance between non-normalized\n", + " string 'John Doe' and 'doe john' is small. 
This function can be configured depending\n", + " on the needs.\n", + " \"\"\"\n", + "\n", + " # Force lower case\n", + " st = st.lower()\n", + "\n", + " # Sort the words and join\n", + " words = st.split()\n", + " st = \"\".join(sorted(words))\n", + "\n", + " return st\n", + "\n", + "def compare_IBAN_names(string0: str, string1: str, fhe_or_simulate: str):\n", + " \"\"\"Compare two IBAN names: first, normalize the strings, then compute in FHE (look in \n", + " calculate_and_return for FHE details).\"\"\"\n", + " # Normalize strings\n", + " string0 = normalized_string(string0)\n", + " string1 = normalized_string(string1)\n", + " args = Arguments(max_string_length = max(len(string0), len(string1)))\n", + "\n", + " alphabet = Alphabet.init_by_name(\"name\")\n", + " levenshtein_distance = LevenshteinDistance(alphabet, args)\n", + " distance = levenshtein_distance.calculate_and_return(string0, string1, mode=fhe_or_simulate) \n", + "\n", + " max_len = max(len(string0), len(string1))\n", + " similarity = (max_len - distance) / max_len\n", + "\n", + " print(f\"Similarity between the two strings is {similarity:.4f}\")\n", + " return similarity" + ] + }, + { + "cell_type": "markdown", + "id": "f9006416-8240-4d8b-be8d-9011547f4719", + "metadata": {}, + "source": [ + "This is the option to set to \"fhe\" to run computations in FHE. If you set it to \"simulate\", only simulation will be done, which is sufficient to debug what happens, but should not be used in production settings." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "3b79df6b-8aff-4bfe-b119-e4aec78e04af", + "metadata": {}, + "outputs": [], + "source": [ + "fhe_or_simulate = \"simulate\"" + ] + }, + { + "cell_type": "markdown", + "id": "4062d22f-ae05-4493-a1ab-6a6a16bbc1f3", + "metadata": {}, + "source": [ + "## Make a few comparisons in a private setting\n", + "\n", + "First, with equal strings, the match is perfect." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "60ccaded-7579-4bd4-a972-7eea98d5d585", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Similarity between the two strings is 1.0000\n" + ] + } + ], + "source": [ + "string0 = \"John Doe\"\n", + "string1 = \"John Doe\"\n", + "\n", + "assert compare_IBAN_names(string0, string1, fhe_or_simulate = fhe_or_simulate) == 1.0" + ] + }, + { + "cell_type": "markdown", + "id": "f92bc91b-cd26-4ced-af4e-49811bea2353", + "metadata": {}, + "source": [ + "With reversed names, the match is also perfect, thanks to our definition of `normalized_string`. If it is a non-desired property, we can change it." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c9658e10-94dd-4e6a-8352-639493ac36f7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Similarity between the two strings is 1.0000\n" + ] + } + ], + "source": [ + "string0 = \"John Doe\"\n", + "string1 = \"Doe John\"\n", + "\n", + "assert compare_IBAN_names(string0, string1, fhe_or_simulate = fhe_or_simulate) == 1.0" + ] + }, + { + "cell_type": "markdown", + "id": "c871b320-f93c-4fdb-9a70-5d423811961e", + "metadata": {}, + "source": [ + "With a typo, similarity is smaller, but still quite high." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a822a188-a7ae-466f-8caa-15d91131fc5c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Similarity between the two strings is 0.8571\n" + ] + } + ], + "source": [ + "string0 = \"John Doe\"\n", + "string1 = \"John Do\"\n", + "\n", + "assert round(compare_IBAN_names(string0, string1, fhe_or_simulate = fhe_or_simulate), 2) == 0.86" + ] + }, + { + "cell_type": "markdown", + "id": "a7c26654-08da-4755-8eba-25aef6d49e2a", + "metadata": {}, + "source": [ + "With an added letter, it is also high." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "fba38c06-d26a-4dc8-9442-d1f128068d1b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Similarity between the two strings is 0.8750\n" + ] + } + ], + "source": [ + "string0 = \"John Doe\"\n", + "string1 = \"John W Doe\"\n", + "\n", + "assert round(compare_IBAN_names(string0, string1, fhe_or_simulate = fhe_or_simulate), 2) == 0.88" + ] + }, + { + "cell_type": "markdown", + "id": "fab3e31c-5533-4983-a854-bfb9bb360611", + "metadata": {}, + "source": [ + "Finally, with totally different names, we can see a very low similarity." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "330de097-fc30-4d46-b2bb-459ab8e00a27", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Similarity between the two strings is 0.1429\n" + ] + } + ], + "source": [ + "string0 = \"John Doe\"\n", + "string1 = \"Gill Cot\"\n", + "\n", + "assert round(compare_IBAN_names(string0, string1, fhe_or_simulate = fhe_or_simulate), 2) == 0.14" + ] + }, + { + "cell_type": "markdown", + "id": "001c7c1e-37db-4488-925f-2c46a902d962", + "metadata": {}, + "source": [ + "Note that, since we sort words in `normalized_string`, a typo in the first letter of a word can have a large impact: it may change the word order, and thus the whole normalized string. It is not easy to find a normalization which accepts word reordering while remaining robust to mistakes in the first letters of words. Banks can make their own choices here, depending on their preferences." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "c5600fde-3c42-4f52-ad0c-fa0ebda9b0cf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Similarity between the two strings is 0.1429\n", + "Similarity between the two strings is 0.8571\n" + ] + } + ], + "source": [ + "# One typo in the first letter\n", + "string0 = \"John Doe\"\n", + "string1 = \"John Poe\"\n", + "\n", + "assert round(compare_IBAN_names(string0, string1, fhe_or_simulate = fhe_or_simulate), 2) == 0.14\n", + "\n", + "# One typo in the last letter\n", + "string0 = \"John Doe\"\n", + "string1 = \"John Doy\"\n", + "\n", + "assert round(compare_IBAN_names(string0, string1, fhe_or_simulate = fhe_or_simulate), 2) == 0.86" + ] + }, + { + "cell_type": "markdown", + "id": "8a61bc76-7c38-4251-bafd-51d7843dd3c7", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "We have shown how to use Levenshtein distances in FHE to perform IBAN name checks in a private way. Since the code is open-source and written in Python, it is easy for developers to modify it and fine-tune it to their specific needs, e.g., in terms of string normalization." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bddc5621-5ab8-48cb-9dc8-3fc84b790b2c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/frontends/concrete-python/examples/levenshtein_distance/levenshtein_distance.py b/frontends/concrete-python/examples/levenshtein_distance/levenshtein_distance.py index 6174647319..cfd164559e 100644 --- a/frontends/concrete-python/examples/levenshtein_distance/levenshtein_distance.py +++ b/frontends/concrete-python/examples/levenshtein_distance/levenshtein_distance.py @@ -34,6 +34,11 @@ def anycase(): """Set any-case alphabet.""" return Alphabet.lowercase() + Alphabet.uppercase() + @staticmethod + def name(): + """Set alphabet for family names""" + return Alphabet.lowercase() + Alphabet.uppercase() + Alphabet(". 
") + @staticmethod def dna(): """Set DNA alphabet.""" @@ -52,7 +57,7 @@ def __add__(self, other: "Alphabet") -> "Alphabet": @staticmethod def return_available_alphabets() -> list: """Return available alphabets.""" - return ["string", "STRING", "StRiNg", "ACTG"] + return ["string", "STRING", "StRiNg", "ACTG", "name"] @staticmethod def init_by_name(alphabet_name: str) -> "Alphabet": @@ -67,6 +72,8 @@ def init_by_name(alphabet_name: str) -> "Alphabet": return Alphabet.uppercase() if alphabet_name == "StRiNg": return Alphabet.anycase() + if alphabet_name == "name": + return Alphabet.name() assert alphabet_name == "ACTG", f"Unknown alphabet {alphabet_name}" return Alphabet.dna() @@ -128,11 +135,19 @@ def __init__(self, alphabet: Alphabet, args): def calculate(self, a: str, b: str, mode: str, show_distance: bool = False): """Compute a distance between two strings, either in fhe or in simulate.""" if mode == "simulate": - self._compute_in_simulation([(a, b)]) + self._compute_in_simulation([(a, b)], show_distance=show_distance) else: assert mode == "fhe", "Only 'simulate' and 'fhe' mode are available" self._compute_in_fhe([(a, b)], show_distance=show_distance) + def calculate_and_return(self, a: str, b: str, mode: str) -> int: + """Return distance between two strings, either in fhe or in simulate.""" + if mode == "simulate": + return self._compute_and_return_in_simulation(a, b) + + assert mode == "fhe", "Only 'simulate' and 'fhe' mode are available" + return self._compute_and_return_in_fhe(a, b) + def calculate_list(self, pairs_to_compute_on: list, mode: str): """Compute a distance between strings of a list, either in fhe or in simulate.""" for a, b in pairs_to_compute_on: @@ -185,19 +200,41 @@ def _compile_module(self, args): min_max_strategy_preference=fhe.MinMaxStrategy.ONE_TLU_PROMOTED, ) - def _compute_in_simulation(self, list_patterns: list): + def _compute_and_return_in_simulation(self, a: str, b: str) -> int: + """Check equality between distance in simulation and 
clear distance, and return.""" + a_as_int = self.alphabet.encode(a) + b_as_int = self.alphabet.encode(b) + + l1_simulate = levenshtein_simulate(self.module, a_as_int, b_as_int) + l1_clear = levenshtein_clear(a_as_int, b_as_int) + + assert l1_simulate == l1_clear, f" {l1_simulate=} and {l1_clear=} are different" + + return int(l1_simulate) + + def _compute_in_simulation(self, list_patterns: list, show_distance: bool = False): """Check equality between distance in simulation and clear distance.""" for a, b in list_patterns: print(f" Computing Levenshtein between strings '{a}' and '{b}'", end="") - a_as_int = self.alphabet.encode(a) - b_as_int = self.alphabet.encode(b) + l1_simulate = self._compute_and_return_in_simulation(a, b) - l1_simulate = levenshtein_simulate(self.module, a_as_int, b_as_int) - l1_clear = levenshtein_clear(a_as_int, b_as_int) + if not show_distance: + print(" - OK") + else: + print(f" - distance is {l1_simulate}") - assert l1_simulate == l1_clear, f" {l1_simulate=} and {l1_clear=} are different" - print(" - OK") + def _compute_and_return_in_fhe(self, a: str, b: str): + """Check equality between distance in FHE and clear distance.""" + a_enc, b_enc = self._encode_and_encrypt_strings(a, b) + + l1_fhe_enc = levenshtein_fhe(self.module, a_enc, b_enc) + l1_fhe = self.module.mix.decrypt(l1_fhe_enc) # type: ignore + l1_clear = levenshtein_clear(a, b) + + assert l1_fhe == l1_clear, f" {l1_fhe=} and {l1_clear=} are different" + + return l1_fhe def _compute_in_fhe(self, list_patterns: list, show_distance: bool = False): """Check equality between distance in FHE and clear distance.""" @@ -207,18 +244,10 @@ def _compute_in_fhe(self, list_patterns: list, show_distance: bool = False): for a, b in list_patterns: print(f" Computing Levenshtein between strings '{a}' and '{b}'", end="") - a_enc, b_enc = self._encode_and_encrypt_strings(a, b) - time_begin = time.time() - l1_fhe_enc = levenshtein_fhe(self.module, a_enc, b_enc) + l1_fhe = 
self._compute_and_return_in_fhe(a, b) time_end = time.time() - l1_fhe = self.module.mix.decrypt(l1_fhe_enc) # type: ignore - - l1_clear = levenshtein_clear(a, b) - - assert l1_fhe == l1_clear, f" {l1_fhe=} and {l1_clear=} are different" - if not show_distance: print(f" - OK in {time_end - time_begin:.2f} seconds") else: @@ -277,6 +306,21 @@ def mix(is_equal, if_equal, case_1, case_2, case_3): ) +def normalized_string(st): + """Normalize a string, to later make that the distance between non-normalized + string 'John Doe' and 'doe john' is small. + """ + + # Force lower case + st = st.lower() + + # Sort the words and join + words = st.split() + st = "".join(sorted(words)) + + return st + + @lru_cache def levenshtein_clear(x: str, y: str): """Compute the distance in clear, for reference and comparison.""" @@ -382,6 +426,11 @@ def manage_args(): default=4, help="Setting the maximal size of strings", ) + parser.add_argument( + "--normalize_strings_before_distance", + action="store_true", + help="Normalize strings before computing their distance", + ) args = parser.parse_args() # At least one option @@ -432,21 +481,32 @@ def main(): if args.distance is not None: print( f"Running distance between strings '{args.distance[0]}' and '{args.distance[1]}' " - f"for alphabet {args.alphabet}:\n" + f"for alphabet {args.alphabet}:" ) - if max(len(args.distance[0]), len(args.distance[1])) > args.max_string_length: - args.max_string_length = max(len(args.distance[0]), len(args.distance[1])) + string0 = args.distance[0] + string1 = args.distance[1] + + if args.normalize_strings_before_distance: + string0 = normalized_string(string0) + string1 = normalized_string(string1) + print( + f"Normalized strings are '{string0}' and '{string1}' " + "(lower case, no space, sorted words)" + ) + + if max(len(string0), len(string1)) > args.max_string_length: + args.max_string_length = max(len(string0), len(string1)) print( - "Warning, --max_string_length was smaller than lengths of the input 
strings, " - "fixing it" + "Warning, --max_string_length was smaller than lengths of " + "the input strings, fixing it" ) + print() + alphabet = Alphabet.init_by_name(args.alphabet) levenshtein_distance = LevenshteinDistance(alphabet, args) - levenshtein_distance.calculate( - args.distance[0], args.distance[1], mode="fhe", show_distance=True - ) + levenshtein_distance.calculate(string0, string1, mode="fhe", show_distance=True) print("") print("Successful end\n")