Skip to content

Commit

Permalink
docs(frontend): adding the URL-filtering example
Browse files Browse the repository at this point in the history
  • Loading branch information
bcm-at-zama committed Sep 13, 2024
1 parent df1fd14 commit 0190e6b
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 39 deletions.
82 changes: 48 additions & 34 deletions frontends/concrete-python/examples/pir/PIR.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"source": [
"# Private information retrieval\n",
"\n",
"This notebook explains how to do PIR with Concrete, in a simple way, with an application to blocking phone numbers. The principle of PIR is that there is a large database on the server side, which we can't move to the client side for some reasons, like it's too big or we don't want to for privacy reasons or it's updated too often. With PIR, we let the user query the database, and the query (input and output) is not seen in the clear by the server."
"This notebook explains, in a simple way, how to do private information retrieval (PIR) with Concrete, with applications to blocking spam phone numbers or bad URLs. The principle of PIR is that there is a large, non-encrypted database on the server side, which we can't move to the client side for some reason: it is too big, we don't want to move it for privacy reasons, or it is updated too often. With PIR, we let the user query the database, and the query (input and output) is not seen in the clear by the server."
]
},
{
Expand Down Expand Up @@ -193,7 +193,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"FHE computation done in 2.2 milliseconds -- database is 8 (2**3) elements of 5 bits\n"
"FHE computation done in 1.9 milliseconds -- database is 8 (2**3) elements of 5 bits\n"
]
}
],
Expand Down Expand Up @@ -258,18 +258,18 @@
"name": "stdout",
"output_type": "stream",
"text": [
"For a database of 2** 4 elements of 8 bits, average execution time is 1.0 milliseconds\n",
"For a database of 2** 4 elements of 16 bits, average execution time is 1.1 milliseconds\n",
"For a database of 2** 8 elements of 8 bits, average execution time is 3.2 milliseconds\n",
"For a database of 2** 8 elements of 16 bits, average execution time is 4.3 milliseconds\n",
"For a database of 2** 9 elements of 8 bits, average execution time is 5.8 milliseconds\n",
"For a database of 2** 9 elements of 16 bits, average execution time is 8.3 milliseconds\n",
"For a database of 2**10 elements of 4 bits, average execution time is 8.1 milliseconds\n",
"For a database of 2**10 elements of 8 bits, average execution time is 9.0 milliseconds\n",
"For a database of 2**12 elements of 4 bits, average execution time is 30.8 milliseconds\n",
"For a database of 2**12 elements of 8 bits, average execution time is 42.6 milliseconds\n",
"For a database of 2**14 elements of 4 bits, average execution time is 139.2 milliseconds\n",
"For a database of 2**14 elements of 8 bits, average execution time is 199.6 milliseconds\n"
"For a database of 2** 4 elements of 8 bits, average execution time is 1.2 milliseconds\n",
"For a database of 2** 4 elements of 16 bits, average execution time is 1.3 milliseconds\n",
"For a database of 2** 8 elements of 8 bits, average execution time is 3.7 milliseconds\n",
"For a database of 2** 8 elements of 16 bits, average execution time is 5.6 milliseconds\n",
"For a database of 2** 9 elements of 8 bits, average execution time is 7.5 milliseconds\n",
"For a database of 2** 9 elements of 16 bits, average execution time is 10.3 milliseconds\n",
"For a database of 2**10 elements of 4 bits, average execution time is 8.7 milliseconds\n",
"For a database of 2**10 elements of 8 bits, average execution time is 9.7 milliseconds\n",
"For a database of 2**12 elements of 4 bits, average execution time is 35.3 milliseconds\n",
"For a database of 2**12 elements of 8 bits, average execution time is 49.3 milliseconds\n",
"For a database of 2**14 elements of 4 bits, average execution time is 147.5 milliseconds\n",
"For a database of 2**14 elements of 8 bits, average execution time is 202.6 milliseconds\n"
]
}
],
Expand All @@ -281,11 +281,11 @@
"\n",
"for database_input_bits, database_output_bits in sample_list:\n",
"\n",
" # BCM: remove once I know how to reset\n",
" # FIXME: remove once I know how to reset\n",
" @fhe.compiler({\"one_hot_vector\": \"encrypted\", \"database\": \"clear\"})\n",
" def get_ith_element_of_database(one_hot_vector: np.ndarray, database: np.ndarray) -> int:\n",
" return np.dot(one_hot_vector, database)\n",
"\n",
" \n",
" # Take a random database of expected size and output_bits\n",
" database_length = 2**database_input_bits\n",
" database = np.array(\n",
Expand Down Expand Up @@ -323,7 +323,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"For a database of 2**14 elements of 32 bits with 4 sub-databases, average execution time is 256.2 milliseconds\n"
"For a database of 2**14 elements of 32 bits with 4 sub-databases, average execution time is 258.3 milliseconds\n"
]
}
],
Expand Down Expand Up @@ -447,7 +447,7 @@
"id": "76855d6d-a65d-442c-b428-f1a077c07570",
"metadata": {},
"source": [
"## Use-cases\n",
"## Use-cases for phone spamming\n",
"\n",
"Now, let's see where PIR could be used. Let's imagine we want to build a spam database. In France, there are 10 ** 9 ~ 2 ** 30 phone numbers, so we could have a database T[i], for i an integer of 30 bits, returning a boolean stating whether the phone number is a known spam number. The database would be on the server side, and often updated. Phones could query the database on a number and, if the result is positive, filter the call as spam. All of this would be done without the server knowing the calling numbers.\n",
"\n",
Expand Down Expand Up @@ -482,30 +482,30 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Estimated time would be 8764.0 seconds for 8388608 DBs of (4, 8)\n",
"Estimated time would be 4608.0 seconds for 4194304 DBs of (4, 16)\n",
"Estimated time would be 1673.0 seconds for 524288 DBs of (8, 8)\n",
"Estimated time would be 1124.0 seconds for 262144 DBs of (8, 16)\n",
"Estimated time would be 1523.0 seconds for 262144 DBs of (9, 8)\n",
"Estimated time would be 1086.0 seconds for 131072 DBs of (9, 16)\n",
"Estimated time would be 2127.0 seconds for 262144 DBs of (10, 4)\n",
"Estimated time would be 1182.0 seconds for 131072 DBs of (10, 8)\n",
"Estimated time would be 2022.0 seconds for 65536 DBs of (12, 4)\n",
"Estimated time would be 1397.0 seconds for 32768 DBs of (12, 8)\n",
"Estimated time would be 2280.0 seconds for 16384 DBs of (14, 4)\n",
"Estimated time would be 1636.0 seconds for 8192 DBs of (14, 8)\n",
"Estimated time would be 9664.0 seconds for 8388608 DBs of (4, 8)\n",
"Estimated time would be 5495.0 seconds for 4194304 DBs of (4, 16)\n",
"Estimated time would be 1952.0 seconds for 524288 DBs of (8, 8)\n",
"Estimated time would be 1462.0 seconds for 262144 DBs of (8, 16)\n",
"Estimated time would be 1977.0 seconds for 262144 DBs of (9, 8)\n",
"Estimated time would be 1350.0 seconds for 131072 DBs of (9, 16)\n",
"Estimated time would be 2284.0 seconds for 262144 DBs of (10, 4)\n",
"Estimated time would be 1275.0 seconds for 131072 DBs of (10, 8)\n",
"Estimated time would be 2312.0 seconds for 65536 DBs of (12, 4)\n",
"Estimated time would be 1614.0 seconds for 32768 DBs of (12, 8)\n",
"Estimated time would be 2417.0 seconds for 16384 DBs of (14, 4)\n",
"Estimated time would be 1660.0 seconds for 8192 DBs of (14, 8)\n",
"\n",
"Best combination: 1086.0 seconds for a DB of 30 bits\n",
"Best combination: 1275.0 seconds for a DB of 30 bits\n",
"\n",
"Estimated time would be 9.0 seconds for 8192 DBs of (4, 8)\n",
"Estimated time would be 5.0 seconds for 4096 DBs of (4, 16)\n",
"Estimated time would be 10.0 seconds for 8192 DBs of (4, 8)\n",
"Estimated time would be 6.0 seconds for 4096 DBs of (4, 16)\n",
"Estimated time would be 2.0 seconds for 512 DBs of (8, 8)\n",
"Estimated time would be 2.0 seconds for 256 DBs of (8, 16)\n",
"Estimated time would be 2.0 seconds for 256 DBs of (9, 8)\n",
"Estimated time would be 2.0 seconds for 128 DBs of (9, 16)\n",
"Estimated time would be 3.0 seconds for 256 DBs of (10, 4)\n",
"Estimated time would be 2.0 seconds for 128 DBs of (10, 8)\n",
"Estimated time would be 2.0 seconds for 64 DBs of (12, 4)\n",
"Estimated time would be 3.0 seconds for 64 DBs of (12, 4)\n",
"Estimated time would be 2.0 seconds for 32 DBs of (12, 8)\n",
"Estimated time would be 3.0 seconds for 16 DBs of (14, 4)\n",
"Estimated time would be 2.0 seconds for 8 DBs of (14, 8)\n",
Expand Down Expand Up @@ -536,6 +536,20 @@
"find_best_combination(30)\n",
"find_best_combination(20)"
]
},
{
"cell_type": "markdown",
"id": "ff578b6f-98b9-4073-8427-ec32aff5437a",
"metadata": {},
"source": [
"## Another use-case for URL checking\n",
"\n",
"It might also be tempting to keep (and refresh very often) a list of bad URLs on the server side, and to use it to protect users from clicking on bad links. Of course, there would be too many URLs to handle with the previous system: fortunately, we have a hash-based solution for this. \n",
"\n",
"The principle is to use a small non-cryptographic hash function, which maps strings to small integers, let's say 20-bit integers. Then, any time the server sees a bad URL `u`, it hashes it to `h` and stores `T[h] = 1` to mark this hash as potentially dangerous. Then, with our system, the user can use privacy-preserving PIR to know if a given URL `u'` is dangerous, by hashing it to `h'` and checking if `T[h'] == 1`. \n",
"\n",
"With such a small hash function, there will be collisions, which means that the user will sometimes receive false positives: having `T[h] == 1` doesn't mean that the given URL is dangerous, only that there exists a dangerous URL with the same hash. These collisions are not a problem per se: the user may just see a \"Warning, this URL is potentially dangerous\" message but still access the URL if they are confident. Alternatively, we could use several different hash functions and different tables `T_i`, and declare a URL dangerous only if all `T_i` return 1, which greatly reduces the probability of collisions. "
]
}
],
"metadata": {
Expand Down
10 changes: 5 additions & 5 deletions frontends/concrete-python/examples/pir/pir_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@

from concrete import fhe


# PIR lookup wrapped by the Concrete compiler decorator: `one_hot_vector` is
# declared as an encrypted input and `database` as a clear input, and the
# function returns the dot product of the two.
# NOTE(review): presumably `one_hot_vector` holds a single 1 at the queried
# index, so the dot product selects that database entry -- confirm with callers.
@fhe.compiler({"one_hot_vector": "encrypted", "database": "clear"})
def get_ith_element_of_database(one_hot_vector: np.ndarray, database: np.ndarray) -> int:
return np.dot(one_hot_vector, database)


def make_one_hot_vector(index: int, size: int) -> np.ndarray:

answer = np.zeros(shape=(size,), dtype=np.int8)
Expand All @@ -14,9 +17,7 @@ def make_one_hot_vector(index: int, size: int) -> np.ndarray:

def compile_function(database, show_mlir=False, show_graph=False):

@fhe.compiler({"one_hot_vector": "encrypted", "database": "clear"})
def get_ith_element_of_database(one_hot_vector: np.ndarray, database: np.ndarray) -> int:
return np.dot(one_hot_vector, database)
get_ith_element_of_database.reset()

database_length = database.shape[0]
inputset_length = 100
Expand All @@ -30,11 +31,10 @@ def get_ith_element_of_database(one_hot_vector: np.ndarray, database: np.ndarray
return circuit


def test_encrypted_queries(database, circuit, how_many_tests=1, verbose=True):
def test_encrypted_queries(database, circuit, how_many_tests=1):

for _ in range(how_many_tests):
database_length = database.shape[0]
log_database_length = np.ceil(np.log2(database_length)).astype(np.int32)

# Random index in the database
random_index = np.random.randint(database_length)
Expand Down
2 changes: 2 additions & 0 deletions frontends/concrete-python/tests/execution/test_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,8 @@ def test_levenshtein_distance_randomly(alphabet_name, max_length, helpers):

def test_pir_basics():

pir_utils.get_ith_element_of_database.reset()

x = pir_utils.make_one_hot_vector(0, size=5)
assert np.array_equal(x, np.array([1, 0, 0, 0, 0]))

Expand Down

0 comments on commit 0190e6b

Please sign in to comment.