docs(frontend): adding the URL-filtering example

zama-ai · Sep 13, 2024 · 0190e6b · 0190e6b
1 parent df1fd14
commit 0190e6b
Show file tree

Hide file tree

Showing 3 changed files with 55 additions and 39 deletions.
diff --git a/frontends/concrete-python/examples/pir/PIR.ipynb b/frontends/concrete-python/examples/pir/PIR.ipynb
@@ -7,7 +7,7 @@
    "source": [
     "# Private information retrieval\n",
     "\n",
-    "This notebook explains how to do PIR with Concrete, in a simple way, with an application to blocking phone numbers. The principle of PIR is that there is a large database on the server side, which we can't move to the client side for some reasons, like it's too big or we don't want to for privacy reasons or it's updated too often. With PIR, we let the user query the database, and the query (input and output) is not seen in the clear by the server."
+    "This notebook explains how to do PIR with Concrete, in a simple way, with applications to blocking spam phone numbers or bad URLs. The principle of PIR is that there is a large non-encrypted database on the server side, which we can't move to the client side for some reason: for example, it's too big, we don't want to for privacy reasons, or it's updated too often. With PIR, we let the user query the database, and the query (input and output) is not seen in the clear by the server."
    ]
   },
   {
@@ -193,7 +193,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "FHE computation done in 2.2 milliseconds -- database is 8 (2**3) elements of 5 bits\n"
+      "FHE computation done in 1.9 milliseconds -- database is 8 (2**3) elements of 5 bits\n"
      ]
     }
    ],
@@ -258,18 +258,18 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "For a database of 2** 4 elements of  8 bits, average execution time is 1.0 milliseconds\n",
-      "For a database of 2** 4 elements of 16 bits, average execution time is 1.1 milliseconds\n",
-      "For a database of 2** 8 elements of  8 bits, average execution time is 3.2 milliseconds\n",
-      "For a database of 2** 8 elements of 16 bits, average execution time is 4.3 milliseconds\n",
-      "For a database of 2** 9 elements of  8 bits, average execution time is 5.8 milliseconds\n",
-      "For a database of 2** 9 elements of 16 bits, average execution time is 8.3 milliseconds\n",
-      "For a database of 2**10 elements of  4 bits, average execution time is 8.1 milliseconds\n",
-      "For a database of 2**10 elements of  8 bits, average execution time is 9.0 milliseconds\n",
-      "For a database of 2**12 elements of  4 bits, average execution time is 30.8 milliseconds\n",
-      "For a database of 2**12 elements of  8 bits, average execution time is 42.6 milliseconds\n",
-      "For a database of 2**14 elements of  4 bits, average execution time is 139.2 milliseconds\n",
-      "For a database of 2**14 elements of  8 bits, average execution time is 199.6 milliseconds\n"
+      "For a database of 2** 4 elements of  8 bits, average execution time is 1.2 milliseconds\n",
+      "For a database of 2** 4 elements of 16 bits, average execution time is 1.3 milliseconds\n",
+      "For a database of 2** 8 elements of  8 bits, average execution time is 3.7 milliseconds\n",
+      "For a database of 2** 8 elements of 16 bits, average execution time is 5.6 milliseconds\n",
+      "For a database of 2** 9 elements of  8 bits, average execution time is 7.5 milliseconds\n",
+      "For a database of 2** 9 elements of 16 bits, average execution time is 10.3 milliseconds\n",
+      "For a database of 2**10 elements of  4 bits, average execution time is 8.7 milliseconds\n",
+      "For a database of 2**10 elements of  8 bits, average execution time is 9.7 milliseconds\n",
+      "For a database of 2**12 elements of  4 bits, average execution time is 35.3 milliseconds\n",
+      "For a database of 2**12 elements of  8 bits, average execution time is 49.3 milliseconds\n",
+      "For a database of 2**14 elements of  4 bits, average execution time is 147.5 milliseconds\n",
+      "For a database of 2**14 elements of  8 bits, average execution time is 202.6 milliseconds\n"
      ]
     }
    ],
@@ -281,11 +281,11 @@
     "\n",
     "for database_input_bits, database_output_bits in sample_list:\n",
     "\n",
-    "    # BCM: remove once I know how to reset\n",
+    "    # FIXME: remove once I know how to reset\n",
     "    @fhe.compiler({\"one_hot_vector\": \"encrypted\", \"database\": \"clear\"})\n",
     "    def get_ith_element_of_database(one_hot_vector: np.ndarray, database: np.ndarray) -> int:\n",
     "        return np.dot(one_hot_vector, database)\n",
-    "\n",
+    "        \n",
     "    # Take a random database of expected size and output_bits\n",
     "    database_length = 2**database_input_bits\n",
     "    database = np.array(\n",
@@ -323,7 +323,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "For a database of 2**14 elements of 32 bits with 4 sub-databases, average execution time is 256.2 milliseconds\n"
+      "For a database of 2**14 elements of 32 bits with 4 sub-databases, average execution time is 258.3 milliseconds\n"
      ]
     }
    ],
@@ -447,7 +447,7 @@
    "id": "76855d6d-a65d-442c-b428-f1a077c07570",
    "metadata": {},
    "source": [
-    "## Use-cases\n",
+    "## Use-cases for phone spamming\n",
     "\n",
     "Now, let see where PIR could be used. Let's imagine we want to build a spam database. In France, there are 10 ** 9 ~ 2 ** 30 phone numbers, we could have a database T[i] for i an integer of 30 bits, returning a boolean stating if the phone number is a known spam number. The database would be server side, and often updated. Phones could query the database on an number, and if the result is positive, filter the call as a spam. All of this would be done without the server knowing the calling numbers.\n",
     "\n",
@@ -482,30 +482,30 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Estimated time would be   8764.0 seconds for  8388608 DBs of (4, 8)\n",
-      "Estimated time would be   4608.0 seconds for  4194304 DBs of (4, 16)\n",
-      "Estimated time would be   1673.0 seconds for   524288 DBs of (8, 8)\n",
-      "Estimated time would be   1124.0 seconds for   262144 DBs of (8, 16)\n",
-      "Estimated time would be   1523.0 seconds for   262144 DBs of (9, 8)\n",
-      "Estimated time would be   1086.0 seconds for   131072 DBs of (9, 16)\n",
-      "Estimated time would be   2127.0 seconds for   262144 DBs of (10, 4)\n",
-      "Estimated time would be   1182.0 seconds for   131072 DBs of (10, 8)\n",
-      "Estimated time would be   2022.0 seconds for    65536 DBs of (12, 4)\n",
-      "Estimated time would be   1397.0 seconds for    32768 DBs of (12, 8)\n",
-      "Estimated time would be   2280.0 seconds for    16384 DBs of (14, 4)\n",
-      "Estimated time would be   1636.0 seconds for     8192 DBs of (14, 8)\n",
+      "Estimated time would be   9664.0 seconds for  8388608 DBs of (4, 8)\n",
+      "Estimated time would be   5495.0 seconds for  4194304 DBs of (4, 16)\n",
+      "Estimated time would be   1952.0 seconds for   524288 DBs of (8, 8)\n",
+      "Estimated time would be   1462.0 seconds for   262144 DBs of (8, 16)\n",
+      "Estimated time would be   1977.0 seconds for   262144 DBs of (9, 8)\n",
+      "Estimated time would be   1350.0 seconds for   131072 DBs of (9, 16)\n",
+      "Estimated time would be   2284.0 seconds for   262144 DBs of (10, 4)\n",
+      "Estimated time would be   1275.0 seconds for   131072 DBs of (10, 8)\n",
+      "Estimated time would be   2312.0 seconds for    65536 DBs of (12, 4)\n",
+      "Estimated time would be   1614.0 seconds for    32768 DBs of (12, 8)\n",
+      "Estimated time would be   2417.0 seconds for    16384 DBs of (14, 4)\n",
+      "Estimated time would be   1660.0 seconds for     8192 DBs of (14, 8)\n",
       "\n",
-      "Best combination: 1086.0 seconds for a DB of 30 bits\n",
+      "Best combination: 1275.0 seconds for a DB of 30 bits\n",
       "\n",
-      "Estimated time would be      9.0 seconds for     8192 DBs of (4, 8)\n",
-      "Estimated time would be      5.0 seconds for     4096 DBs of (4, 16)\n",
+      "Estimated time would be     10.0 seconds for     8192 DBs of (4, 8)\n",
+      "Estimated time would be      6.0 seconds for     4096 DBs of (4, 16)\n",
       "Estimated time would be      2.0 seconds for      512 DBs of (8, 8)\n",
       "Estimated time would be      2.0 seconds for      256 DBs of (8, 16)\n",
       "Estimated time would be      2.0 seconds for      256 DBs of (9, 8)\n",
       "Estimated time would be      2.0 seconds for      128 DBs of (9, 16)\n",
       "Estimated time would be      3.0 seconds for      256 DBs of (10, 4)\n",
       "Estimated time would be      2.0 seconds for      128 DBs of (10, 8)\n",
-      "Estimated time would be      2.0 seconds for       64 DBs of (12, 4)\n",
+      "Estimated time would be      3.0 seconds for       64 DBs of (12, 4)\n",
       "Estimated time would be      2.0 seconds for       32 DBs of (12, 8)\n",
       "Estimated time would be      3.0 seconds for       16 DBs of (14, 4)\n",
       "Estimated time would be      2.0 seconds for        8 DBs of (14, 8)\n",
@@ -536,6 +536,20 @@
     "find_best_combination(30)\n",
     "find_best_combination(20)"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ff578b6f-98b9-4073-8427-ec32aff5437a",
+   "metadata": {},
+   "source": [
+    "## Another use-case for URL checking\n",
+    "\n",
+    "It might also be tempting to keep (and refresh very often) a list of bad URLs on the server side, and to use it to protect users from clicking on bad links. Of course, there will be too many URLs to handle with the previous system: fortunately, we have a hash-based solution for this. \n",
+    "\n",
+    "The principle is to use a small non-cryptographic hash function, which maps strings to small integers, let's say 20-bit integers. Then, any time the server sees a bad URL `u`, it hashes it to `h` and stores `T[h] = 1` to record that this hash is potentially dangerous. Then, with our system, the user can use privacy-preserving PIR to know if a given URL `u'` is dangerous, by hashing it to `h'` and checking if `T[h'] == 1`. \n",
+    "\n",
+    "As we know, with such a small hash function, there will be collisions, which means that sometimes the user will receive false positives: having `T[h] == 1` doesn't mean that this given URL is dangerous, but only that there exists a URL with the same hash which is dangerous. These collisions are not a problem per se: the user may just see a \"Warning, this URL is potentially dangerous\" message but still access it if they are confident. Alternatively, we could use several different hash functions and different tables `T_i`, and check that all `T_i` return 1 to decide whether a URL is bad, which greatly reduces the probability of false positives. "
+   ]
   }
  ],
  "metadata": {

diff --git a/frontends/concrete-python/examples/pir/pir_utils.py b/frontends/concrete-python/examples/pir/pir_utils.py
@@ -2,9 +2,12 @@
 
 from concrete import fhe
 
+
+@fhe.compiler({"one_hot_vector": "encrypted", "database": "clear"})
 def get_ith_element_of_database(one_hot_vector: np.ndarray, database: np.ndarray) -> int:
     return np.dot(one_hot_vector, database)
 
+
 def make_one_hot_vector(index: int, size: int) -> np.ndarray:
 
     answer = np.zeros(shape=(size,), dtype=np.int8)
@@ -14,9 +17,7 @@ def make_one_hot_vector(index: int, size: int) -> np.ndarray:
 
 def compile_function(database, show_mlir=False, show_graph=False):
 
-    @fhe.compiler({"one_hot_vector": "encrypted", "database": "clear"})
-    def get_ith_element_of_database(one_hot_vector: np.ndarray, database: np.ndarray) -> int:
-        return np.dot(one_hot_vector, database)
+    get_ith_element_of_database.reset()
 
     database_length = database.shape[0]
     inputset_length = 100
@@ -30,11 +31,10 @@ def get_ith_element_of_database(one_hot_vector: np.ndarray, database: np.ndarray
     return circuit
 
 
-def test_encrypted_queries(database, circuit, how_many_tests=1, verbose=True):
+def test_encrypted_queries(database, circuit, how_many_tests=1):
 
     for _ in range(how_many_tests):
         database_length = database.shape[0]
-        log_database_length = np.ceil(np.log2(database_length)).astype(np.int32)
 
         # Random index in the database
         random_index = np.random.randint(database_length)

diff --git a/frontends/concrete-python/tests/execution/test_examples.py b/frontends/concrete-python/tests/execution/test_examples.py
@@ -276,6 +276,8 @@ def test_levenshtein_distance_randomly(alphabet_name, max_length, helpers):
 
 def test_pir_basics():
 
+    pir_utils.get_ith_element_of_database.reset()
+
     x = pir_utils.make_one_hot_vector(0, size=5)
     assert np.array_equal(x, np.array([1, 0, 0, 0, 0]))