From bb6247bcea51ed06949c0db68acdd71bc6f7ce3d Mon Sep 17 00:00:00 2001 From: Francesco Casalegno Date: Mon, 30 Nov 2020 13:41:06 +0100 Subject: [PATCH 01/18] Fix input() and assert --- src/bbsearch/entrypoints/database_entrypoint.py | 7 ++++++- src/bbsearch/mining/attribute.py | 5 +++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/bbsearch/entrypoints/database_entrypoint.py b/src/bbsearch/entrypoints/database_entrypoint.py index eb542c5d5..5630084aa 100644 --- a/src/bbsearch/entrypoints/database_entrypoint.py +++ b/src/bbsearch/entrypoints/database_entrypoint.py @@ -5,6 +5,11 @@ from ._helper import configure_logging +try: + rinput = raw_input +except NameError: + rinput = input + parser = argparse.ArgumentParser() parser.add_argument( "--log_dir", @@ -54,7 +59,7 @@ def main(): engine = sqlalchemy.create_engine(f"sqlite:///{database_path}") elif args.db_type == "mysql": # We assume the database `cord19_v35` already exists - mysql_uri = input("MySQL URI:") + mysql_uri = rinput("MySQL URI:") password = getpass.getpass("Password:") engine = sqlalchemy.create_engine( f"mysql+pymysql://root:{password}" f"@{mysql_uri}/cord19_v47" diff --git a/src/bbsearch/mining/attribute.py b/src/bbsearch/mining/attribute.py index c6075b639..f02a5742e 100644 --- a/src/bbsearch/mining/attribute.py +++ b/src/bbsearch/mining/attribute.py @@ -580,7 +580,7 @@ def get_core_nlp_analysis(self, text): response = requests.post( self.core_nlp_url + request_params, data=request_data ) - assert response.status_code == 200 + response.raise_for_status() response_json = json.loads(response.text) except requests.exceptions.RequestException: warnings.warn("There was a problem contacting the CoreNLP server.") @@ -801,7 +801,8 @@ def __init__(self, texts, attribute_extractor, ee_model): """ super().__init__() - assert len(texts) > 0 + if not texts: + raise ValueError(f"The list of texts to be annotated shoud be nonempty, but got texts = {texts}") self.texts = texts self.idx_slider = widgets.IntSlider( From fff2d8f0a596ebb6fd3ac12226d8c64e209e326e Mon Sep 17 00:00:00 2001 From: Francesco Casalegno Date: Mon, 30 Nov 2020 14:10:32 +0100 Subject: [PATCH 02/18] Enable bandit --- src/bbsearch/mining/entity.py | 2 ++ tox.ini | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/bbsearch/mining/entity.py b/src/bbsearch/mining/entity.py index 4e5679057..f500d83d2 100644 --- a/src/bbsearch/mining/entity.py +++ b/src/bbsearch/mining/entity.py @@ -198,6 +198,8 @@ def to_jsonl(self, path, sort_by=None): Parameters ---------- + path : pathlib.Path + File where to save it. sort_by : None or list If None, then no sorting taking place. If ``list``, then the names of columns along which to sort. 
diff --git a/tox.ini b/tox.ini index 480472a7f..1e70e43a5 100644 --- a/tox.ini +++ b/tox.ini @@ -21,7 +21,7 @@ commands = [testenv:lint] skip_install = true deps = -; bandit + bandit black==20.8b1 flake8==3.8.4 isort==5.6.4 @@ -31,7 +31,7 @@ commands = isort --profile black --check setup.py {[tox]source} tests pydocstyle {[tox]source} black -q --check setup.py {[tox]source} tests -; bandit -q -r {[tox]source} + bandit -q -r {[tox]source} [testenv:format] skip_install = true From 681d6e0ca9ef701e892bd7aadca1298671503c19 Mon Sep 17 00:00:00 2001 From: Francesco Casalegno Date: Mon, 30 Nov 2020 15:18:59 +0100 Subject: [PATCH 03/18] Fix eval() --- src/bbsearch/mining/attribute.py | 4 +++- src/bbsearch/mining/entity.py | 16 +++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/bbsearch/mining/attribute.py b/src/bbsearch/mining/attribute.py index f02a5742e..8ce81c788 100644 --- a/src/bbsearch/mining/attribute.py +++ b/src/bbsearch/mining/attribute.py @@ -802,7 +802,9 @@ def __init__(self, texts, attribute_extractor, ee_model): super().__init__() if not texts: - raise ValueError(f"The list of texts to be annotated shoud be nonempty, but got texts = {texts}") + raise ValueError( + f"Texts to be annotated shoud be nonempty, got texts = {texts}" + ) self.texts = texts self.idx_slider = widgets.IntSlider( diff --git a/src/bbsearch/mining/entity.py b/src/bbsearch/mining/entity.py index f500d83d2..dc3626d80 100644 --- a/src/bbsearch/mining/entity.py +++ b/src/bbsearch/mining/entity.py @@ -1,5 +1,6 @@ """Classes and functions for entity extraction (aka named entity recognition).""" +import ast import copy import numpy as np @@ -328,11 +329,16 @@ def row2raw(row): ): raise KeyError() - value = ( - eval(f"{value_type}({value_str})") - if value_type != "str" - else value_str - ) + if value_type != "str": + try: + value = ast.literal_eval(value_str) + except ValueError as ve: + if str(ve).startswith("malformed node or string"): + raise NameError(str(ve)) + else: + raise + else: + value = value_str token_pattern = {attribute: value} if op: From da0c0ce045e5bbef1381a9445c4696393b0246f3 Mon Sep 17 00:00:00 2001 From: Emilie Delattre Date: Mon, 30 Nov 2020 15:26:04 +0100 Subject: [PATCH 04/18] Fix assert --- src/bbsearch/mining/eval.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/bbsearch/mining/eval.py b/src/bbsearch/mining/eval.py index b87f215c1..c319cbc92 100644 --- a/src/bbsearch/mining/eval.py +++ b/src/bbsearch/mining/eval.py @@ -441,7 +441,9 @@ def ner_errors( ... } """ - assert len(iob_true) == len(iob_pred) == len(tokens) + if not len(iob_true) == len(iob_pred) == len(tokens): + raise ValueError("iob_true, iob_pred and tokens " + "should have the same length") etypes = unique_etypes(iob_true) etypes_map = etypes_map if etypes_map is not None else dict() From 50bc9aad2c9aa5fe32a3192ea7ba72c94c1a575e Mon Sep 17 00:00:00 2001 From: Francesco Casalegno Date: Mon, 30 Nov 2020 15:42:23 +0100 Subject: [PATCH 05/18] Fix assert() and add test --- src/bbsearch/mining/eval.py | 8 +++++--- tests/test_mining/test_eval.py | 2 ++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/bbsearch/mining/eval.py b/src/bbsearch/mining/eval.py index c319cbc92..57c81d798 100644 --- a/src/bbsearch/mining/eval.py +++ b/src/bbsearch/mining/eval.py @@ -441,9 +441,11 @@ def ner_errors( ... 
} """ - if not len(iob_true) == len(iob_pred) == len(tokens): - raise ValueError("iob_true, iob_pred and tokens " - "should have the same length") + if not (len(iob_true) == len(iob_pred) == len(tokens)): + raise ValueError( + f"Inputs iob_true (len={len(iob_true)}), iob_pred (len={len(iob_pred)}), " + f"tokens (len={len(tokens)}) should have equal lenght." + ) etypes = unique_etypes(iob_true) etypes_map = etypes_map if etypes_map is not None else dict() diff --git a/tests/test_mining/test_eval.py b/tests/test_mining/test_eval.py index 696312dbe..ac91169b7 100644 --- a/tests/test_mining/test_eval.py +++ b/tests/test_mining/test_eval.py @@ -560,6 +560,8 @@ def test_ner_errors(ner_annotations, dataset, mode, errors_expected): ) errors_expected = OrderedDict(errors_expected) assert errors_out == errors_expected + with pytest.raises(ValueError): + ner_errors(iob_true, iob_pred[:-1], tokens) def test_remove_punctuation(punctuation_annotations): From fe66866b6d91eaa6530d8dab448e1e194dd4c117 Mon Sep 17 00:00:00 2001 From: Francesco Casalegno Date: Mon, 30 Nov 2020 15:58:48 +0100 Subject: [PATCH 06/18] Fix hardcoded password --- src/bbsearch/mining/relation.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/bbsearch/mining/relation.py b/src/bbsearch/mining/relation.py index f46a173fc..9a6707583 100644 --- a/src/bbsearch/mining/relation.py +++ b/src/bbsearch/mining/relation.py @@ -137,11 +137,11 @@ def annotate(doc, sent, ent_1, ent_2, etype_symbols): tokens = [] i = sent.start while i < sent.end: - new_token = " " # hack to keep the punctuation nice + new_tkn = " " # hack to keep the punctuation nice if ent_1.start == i: start, end = ent_1.start, ent_1.end - new_token += ( + new_tkn += ( etype_symbols[etype_1][0] + doc[start:end].text + etype_symbols[etype_1][1] @@ -149,7 +149,7 @@ def annotate(doc, sent, ent_1, ent_2, etype_symbols): elif ent_2.start == i: start, end = ent_2.start, ent_2.end - new_token += ( + new_tkn += ( etype_symbols[etype_2][0] + doc[start:end].text + etype_symbols[etype_2][1] @@ -157,9 +157,9 @@ def annotate(doc, sent, ent_1, ent_2, etype_symbols): else: start, end = i, i + 1 - new_token = doc[i].text if doc[i].is_punct else new_token + doc[i].text + new_tkn = doc[i].text if doc[i].is_punct else new_tkn + doc[i].text - tokens.append(new_token) + tokens.append(new_tkn) i += end - start return "".join(tokens).strip() From 14a45046d2af9f53c8a193834bdc5cc4cf776e61 Mon Sep 17 00:00:00 2001 From: Emilie Delattre Date: Mon, 30 Nov 2020 16:07:31 +0100 Subject: [PATCH 07/18] Fix 2 SQL issues --- src/bbsearch/mining/eval.py | 2 +- src/bbsearch/sql.py | 28 ++++++++++++++++++++-------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/bbsearch/mining/eval.py b/src/bbsearch/mining/eval.py index 57c81d798..906d2d1f9 100644 --- a/src/bbsearch/mining/eval.py +++ b/src/bbsearch/mining/eval.py @@ -444,7 +444,7 @@ def ner_errors( if not (len(iob_true) == len(iob_pred) == len(tokens)): raise ValueError( f"Inputs iob_true (len={len(iob_true)}), iob_pred (len={len(iob_pred)}), " - f"tokens (len={len(tokens)}) should have equal lenght." + f"tokens (len={len(tokens)}) should have equal length." 
) etypes = unique_etypes(iob_true) diff --git a/src/bbsearch/sql.py b/src/bbsearch/sql.py index c4fe147ac..b59a618b2 100644 --- a/src/bbsearch/sql.py +++ b/src/bbsearch/sql.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd +import sqlalchemy.sql as sql def get_titles(article_ids, engine): @@ -151,13 +152,22 @@ def retrieve_paragraph(article_id, paragraph_pos_in_article, engine): pd.DataFrame with the paragraph and its metadata: article_id, text, section_name, paragraph_pos_in_article. """ - sql_query = f"""SELECT section_name, text + sql_query = sql.text( + """SELECT section_name, text FROM sentences - WHERE article_id = {article_id} - AND paragraph_pos_in_article = {paragraph_pos_in_article} + WHERE article_id = :article_id + AND paragraph_pos_in_article = :paragraph_pos_in_article ORDER BY sentence_pos_in_paragraph ASC""" - - sentences = pd.read_sql(sql_query, engine) + ) + + sentences = pd.read_sql( + sql_query, + engine, + params={ + "article_id": article_id, + "paragraph_pos_in_article": paragraph_pos_in_article, + }, + ) if sentences.empty: paragraph = pd.DataFrame( columns=["article_id", "text", "section_name", "paragraph_pos_in_article"] @@ -199,10 +209,12 @@ def retrieve_article_metadata_from_article_id(article_id, engine): 'authors', 'journal', 'mag_id', 'who_covidence_id', 'arxiv_id', 'pdf_json_files', 'pmc_json_files', 'url', 's2_id'. """ - sql_query = f"""SELECT * + sql_query = sql.text( + """SELECT * FROM articles - WHERE article_id = {article_id}""" - article = pd.read_sql(sql_query, engine) + WHERE article_id = :article_id""" + ) + article = pd.read_sql(sql_query, engine, params={"article_id": article_id}) return article From 0355f125527c824450a7d892bed5525917e2ff43 Mon Sep 17 00:00:00 2001 From: Emilie Delattre Date: Mon, 30 Nov 2020 16:42:52 +0100 Subject: [PATCH 08/18] Fix 1 SQL issue --- src/bbsearch/sql.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/bbsearch/sql.py b/src/bbsearch/sql.py index b59a618b2..914f55420 100644 --- a/src/bbsearch/sql.py +++ b/src/bbsearch/sql.py @@ -113,19 +113,23 @@ def retrieve_paragraph_from_sentence_id(sentence_id, engine): sentence_id. If None then the `sentence_id` was not found in the sentences table. 
""" - sql_query = f"""SELECT text + sql_query = sql.text( + """SELECT text FROM sentences WHERE article_id = (SELECT article_id FROM sentences - WHERE sentence_id = {sentence_id}) + WHERE sentence_id = :sentence_id ) AND paragraph_pos_in_article = (SELECT paragraph_pos_in_article FROM sentences - WHERE sentence_id = {sentence_id}) + WHERE sentence_id = :sentence_id ) ORDER BY sentence_pos_in_paragraph ASC""" + ) - all_sentences = pd.read_sql(sql_query, engine)["text"].to_list() + all_sentences = pd.read_sql(sql_query, engine, params={"sentence_id": sentence_id})[ + "text" + ].to_list() if not all_sentences: paragraph = None else: From 1ee7b15198407a10f802c130b0f13d3625e27f66 Mon Sep 17 00:00:00 2001 From: Francesco Casalegno Date: Mon, 30 Nov 2020 16:47:12 +0100 Subject: [PATCH 09/18] Fix sql injection at line 30 --- src/bbsearch/sql.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/bbsearch/sql.py b/src/bbsearch/sql.py index 914f55420..46865af35 100644 --- a/src/bbsearch/sql.py +++ b/src/bbsearch/sql.py @@ -24,13 +24,15 @@ def get_titles(article_ids, engine): if len(article_ids) == 0: return {} - query = f"""\ + query = sql.text( + f"""\ SELECT article_id, title FROM articles - WHERE article_id IN ({",".join(map(str, article_ids))}) + WHERE article_id IN :article_ids """ + ) with engine.begin() as connection: - response = connection.execute(query).fetchall() + response = connection.execute(query, {"article_ids": article_ids}).fetchall() titles = {article_id: title for article_id, title in response} return titles From e4eb7ef524033a6dce6f37d72fe4263f43c71de0 Mon Sep 17 00:00:00 2001 From: Francesco Casalegno Date: Mon, 30 Nov 2020 16:58:48 +0100 Subject: [PATCH 10/18] Fix sql injection at line 80 --- src/bbsearch/sql.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/bbsearch/sql.py b/src/bbsearch/sql.py index 46865af35..476685c70 100644 --- a/src/bbsearch/sql.py +++ b/src/bbsearch/sql.py @@ -77,14 +77,19 @@ def retrieve_sentences_from_sentence_ids(sentence_ids, engine, keep_order=False) """ sentence_ids_s = ", ".join(str(id_) for id_ in sentence_ids) sentence_ids_s = sentence_ids_s or "NULL" - sql_query = f""" + sql_query = sql.text( + f""" SELECT article_id, sentence_id, section_name, text, paragraph_pos_in_article FROM sentences - WHERE sentence_id IN ({sentence_ids_s}) + WHERE sentence_id IN :sentence_ids_s """ + ) + sql_query = sql_query.bindparams(sql.bindparam("sentence_ids_s", expanding=True)) with engine.begin() as connection: - df_sentences = pd.read_sql(sql_query, connection) + df_sentences = pd.read_sql( + sql_query, params={"sentence_ids_s": sentence_ids_s}, con=connection + ) if keep_order: # Remove sentence IDs that were not found, otherwise df.loc will fail. 
From 22ecf044217f2975113abeb69b00dfe58a69c7ed Mon Sep 17 00:00:00 2001 From: Francesco Casalegno Date: Mon, 30 Nov 2020 18:11:35 +0100 Subject: [PATCH 11/18] Fix some pytest errors --- src/bbsearch/sql.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/bbsearch/sql.py b/src/bbsearch/sql.py index 476685c70..f84280b94 100644 --- a/src/bbsearch/sql.py +++ b/src/bbsearch/sql.py @@ -31,6 +31,9 @@ def get_titles(article_ids, engine): WHERE article_id IN :article_ids """ ) + query = query.bindparams(sql.bindparam("article_ids", expanding=True)) + + with engine.begin() as connection: response = connection.execute(query, {"article_ids": article_ids}).fetchall() titles = {article_id: title for article_id, title in response} @@ -175,8 +178,8 @@ def retrieve_paragraph(article_id, paragraph_pos_in_article, engine): sql_query, engine, params={ - "article_id": article_id, - "paragraph_pos_in_article": paragraph_pos_in_article, + "article_id": int(article_id), + "paragraph_pos_in_article": int(paragraph_pos_in_article), }, ) if sentences.empty: @@ -225,7 +228,7 @@ def retrieve_article_metadata_from_article_id(article_id, engine): FROM articles WHERE article_id = :article_id""" ) - article = pd.read_sql(sql_query, engine, params={"article_id": article_id}) + article = pd.read_sql(sql_query, engine, params={"article_id": int(article_id)}) return article From 1b0a57caec2a72a86f3633dcd4129257796031c8 Mon Sep 17 00:00:00 2001 From: Emilie Delattre Date: Tue, 1 Dec 2020 09:55:18 +0100 Subject: [PATCH 12/18] Fix pytest tests --- src/bbsearch/sql.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/bbsearch/sql.py b/src/bbsearch/sql.py index f84280b94..8e9e76339 100644 --- a/src/bbsearch/sql.py +++ b/src/bbsearch/sql.py @@ -25,15 +25,13 @@ def get_titles(article_ids, engine): return {} query = sql.text( - f"""\ - SELECT article_id, title - FROM articles - WHERE article_id IN :article_ids - """ + """SELECT article_id, title + FROM articles + WHERE article_id IN :article_ids + """ ) query = query.bindparams(sql.bindparam("article_ids", expanding=True)) - with engine.begin() as connection: response = connection.execute(query, {"article_ids": article_ids}).fetchall() titles = {article_id: title for article_id, title in response} @@ -78,20 +76,22 @@ def retrieve_sentences_from_sentence_ids(sentence_ids, engine, keep_order=False) Pandas DataFrame containing all sentences and their corresponding metadata: article_id, sentence_id, section_name, text, paragraph_pos_in_article. 
""" - sentence_ids_s = ", ".join(str(id_) for id_ in sentence_ids) - sentence_ids_s = sentence_ids_s or "NULL" + # sentence_ids_s = ", ".join(str(id_) for id_ in sentence_ids) + # sentence_ids_s = sentence_ids_s or "NULL" sql_query = sql.text( - f""" - SELECT article_id, sentence_id, section_name, text, paragraph_pos_in_article - FROM sentences - WHERE sentence_id IN :sentence_ids_s - """ + """ + SELECT article_id, sentence_id, section_name, text, paragraph_pos_in_article + FROM sentences + WHERE sentence_id IN :sentence_ids + """ ) - sql_query = sql_query.bindparams(sql.bindparam("sentence_ids_s", expanding=True)) + sql_query = sql_query.bindparams(sql.bindparam("sentence_ids", expanding=True)) with engine.begin() as connection: df_sentences = pd.read_sql( - sql_query, params={"sentence_ids_s": sentence_ids_s}, con=connection + sql_query, + params={"sentence_ids": [int(id_) for id_ in sentence_ids]}, + con=connection, ) if keep_order: From db30b0ad4da73fe266850a4eff79acbc40874d19 Mon Sep 17 00:00:00 2001 From: Emilie Delattre Date: Tue, 1 Dec 2020 10:06:42 +0100 Subject: [PATCH 13/18] Fix 1 SQL issue --- src/bbsearch/sql.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/bbsearch/sql.py b/src/bbsearch/sql.py index 8e9e76339..c9956ae10 100644 --- a/src/bbsearch/sql.py +++ b/src/bbsearch/sql.py @@ -76,8 +76,6 @@ def retrieve_sentences_from_sentence_ids(sentence_ids, engine, keep_order=False) Pandas DataFrame containing all sentences and their corresponding metadata: article_id, sentence_id, section_name, text, paragraph_pos_in_article. """ - # sentence_ids_s = ", ".join(str(id_) for id_ in sentence_ids) - # sentence_ids_s = sentence_ids_s or "NULL" sql_query = sql.text( """ SELECT article_id, sentence_id, section_name, text, paragraph_pos_in_article @@ -248,14 +246,17 @@ def retrieve_articles(article_ids, engine): DataFrame containing the articles divided into paragraphs. The columns are 'article_id', 'paragraph_pos_in_article', 'text', 'section_name'. 
""" - articles_str = ", ".join(str(id_) for id_ in article_ids) - sql_query = f"""SELECT * + article_ids = [int(id_) for id_ in article_ids] + sql_query = sql.text( + """SELECT * FROM sentences - WHERE article_id IN ({articles_str}) + WHERE article_id IN :articles_ids ORDER BY article_id ASC, paragraph_pos_in_article ASC, sentence_pos_in_paragraph ASC""" - all_sentences = pd.read_sql(sql_query, engine) + ) + sql_query = sql_query.bindparams(sql.bindparam("articles_ids", expanding=True)) + all_sentences = pd.read_sql(sql_query, engine, params={"articles_ids": article_ids}) groupby_var = all_sentences.groupby(by=["article_id", "paragraph_pos_in_article"]) paragraphs = groupby_var["text"].apply(lambda x: " ".join(x)) From e701b852548afbe6b14649dd3bbc9d73cd470091 Mon Sep 17 00:00:00 2001 From: Emilie Delattre Date: Tue, 1 Dec 2020 10:36:36 +0100 Subject: [PATCH 14/18] cast sentence_id to int --- src/bbsearch/sql.py | 6 +++--- tests/test_sql.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/bbsearch/sql.py b/src/bbsearch/sql.py index c9956ae10..65e74b59b 100644 --- a/src/bbsearch/sql.py +++ b/src/bbsearch/sql.py @@ -135,9 +135,9 @@ def retrieve_paragraph_from_sentence_id(sentence_id, engine): ORDER BY sentence_pos_in_paragraph ASC""" ) - all_sentences = pd.read_sql(sql_query, engine, params={"sentence_id": sentence_id})[ - "text" - ].to_list() + all_sentences = pd.read_sql( + sql_query, engine, params={"sentence_id": int(sentence_id)} + )["text"].to_list() if not all_sentences: paragraph = None else: diff --git a/tests/test_sql.py b/tests/test_sql.py index 9db19be3f..ec177d347 100644 --- a/tests/test_sql.py +++ b/tests/test_sql.py @@ -74,7 +74,7 @@ def test_retrieve_sentence_from_sentence_ids( ] ) - @pytest.mark.parametrize("sentence_id", [1, 2, 3, 0, -100]) + @pytest.mark.parametrize("sentence_id", [1, 2, 3, 0, -100, -1, np.int64(2)]) def test_retrieve_paragraph_from_sentence_id( self, sentence_id, fake_sqlalchemy_engine ): @@ -85,7 +85,7 @@ def test_retrieve_paragraph_from_sentence_id( sentence_text = retrieve_sentences_from_sentence_ids( sentence_ids=(sentence_id,), engine=fake_sqlalchemy_engine ) - if sentence_id == 0 or sentence_id == -100: # invalid sentence_id + if sentence_id in [0, -100, -1]: # invalid sentence_id assert paragraph is None else: assert isinstance(paragraph, str) From 7bc325d5731a2a7c1fce08b334e425a7a945e60b Mon Sep 17 00:00:00 2001 From: Emilie Delattre Date: Tue, 1 Dec 2020 10:59:16 +0100 Subject: [PATCH 15/18] Comment some bandit issues --- src/bbsearch/database/cord_19.py | 2 +- src/bbsearch/database/mining_cache.py | 2 +- src/bbsearch/sql.py | 32 +++++++++++++++++---------- 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/src/bbsearch/database/cord_19.py b/src/bbsearch/database/cord_19.py index 5b198596d..92f8fac5d 100644 --- a/src/bbsearch/database/cord_19.py +++ b/src/bbsearch/database/cord_19.py @@ -37,7 +37,7 @@ def mark_bad_sentences(engine, sentences_table_name): logger.info("Getting all sentences") with engine.begin() as connection: - query = f"SELECT sentence_id, text FROM {sentences_table_name}" + query = f"SELECT sentence_id, text FROM {sentences_table_name}" # nosec df_sentences = pd.read_sql(query, connection) logger.info("Computing text lengths") diff --git a/src/bbsearch/database/mining_cache.py b/src/bbsearch/database/mining_cache.py index f48f44969..ed6751b3f 100644 --- a/src/bbsearch/database/mining_cache.py +++ b/src/bbsearch/database/mining_cache.py @@ -285,7 +285,7 @@ def _delete_rows(self): DELETE FROM 
{self.target_table} WHERE mining_model = :mining_model - """ + """ # nosec self.engine.execute( sqlalchemy.sql.text(query), mining_model=model_schema["model_path"], diff --git a/src/bbsearch/sql.py b/src/bbsearch/sql.py index 65e74b59b..0a6dcbfa6 100644 --- a/src/bbsearch/sql.py +++ b/src/bbsearch/sql.py @@ -287,21 +287,27 @@ def retrieve_mining_cache(identifiers, model_names, engine): result : pd.DataFrame Selected rows of the `mining_cache` table. """ - model_names = tuple(set(model_names)) - if len(model_names) == 1: - model_names = f"('{model_names[0]}')" + model_names = list(set(model_names)) + identifiers_arts = [int(a) for a, p in identifiers if p == -1] - identifiers_arts = tuple(a for a, p in identifiers if p == -1) - if len(identifiers_arts) == 1: - identifiers_arts = f"({identifiers_arts[0]})" if identifiers_arts: - query_arts = f""" + query_arts = sql.text( + """ SELECT * FROM mining_cache - WHERE article_id IN {identifiers_arts} AND mining_model IN {model_names} + WHERE article_id IN :identifiers_arts AND mining_model IN :model_names ORDER BY article_id, paragraph_pos_in_article, start_char """ - df_arts = pd.read_sql(query_arts, con=engine) + ) + query_arts = query_arts.bindparams( + sql.bindparam("identifiers_arts", expanding=True), + sql.bindparam("model_names", expanding=True), + ) + df_arts = pd.read_sql( + query_arts, + con=engine, + params={"identifiers_arts": identifiers_arts, "model_names": model_names}, + ) else: df_arts = pd.DataFrame() @@ -314,6 +320,8 @@ def retrieve_mining_cache(identifiers, model_names, engine): # 3. If `len(identifiers_pars)` is too large, we may have a too long # SQL statement which overflows the max length. So we break it down. + if len(model_names) == 1: + model_names = f"('{model_names[0]}')" batch_size = 1000 dfs_pars = [] d, r = divmod(len(identifiers_pars), batch_size) @@ -323,14 +331,14 @@ def retrieve_mining_cache(identifiers, model_names, engine): SELECT * FROM mining_cache WHERE (article_id = {a} AND paragraph_pos_in_article = {p}) - """ + """ # nosec for a, p in identifiers_pars[i * batch_size : (i + 1) * batch_size] ) query_pars = f""" SELECT * FROM ({query_pars}) tt WHERE tt.mining_model IN {model_names} - """ + """ # nosec dfs_pars.append(pd.read_sql(query_pars, engine)) df_pars = pd.concat(dfs_pars) df_pars = df_pars.sort_values( @@ -580,7 +588,7 @@ def _build_query(self): FROM articles WHERE {" AND ".join(article_conditions)} ) - """.strip() + """.strip() # nosec sentence_conditions.append(article_condition_query) # Restricted sentence IDs From 88c9437fc0c0d1758b2e81e401307c7775e46251 Mon Sep 17 00:00:00 2001 From: Francesco Casalegno Date: Tue, 1 Dec 2020 16:44:00 +0100 Subject: [PATCH 16/18] Fix exception message --- src/bbsearch/mining/attribute.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/bbsearch/mining/attribute.py b/src/bbsearch/mining/attribute.py index 8ce81c788..dc3cb4454 100644 --- a/src/bbsearch/mining/attribute.py +++ b/src/bbsearch/mining/attribute.py @@ -802,9 +802,7 @@ def __init__(self, texts, attribute_extractor, ee_model): super().__init__() if not texts: - raise ValueError( - f"Texts to be annotated shoud be nonempty, got texts = {texts}" - ) + raise TypeError("texts must be a non-empty list.") self.texts = texts self.idx_slider = widgets.IntSlider( From d44fb72a29e83585de00a3fe41a22c2cbb70795d Mon Sep 17 00:00:00 2001 From: Francesco Casalegno Date: Wed, 2 Dec 2020 14:58:05 +0100 Subject: [PATCH 17/18] Ignore python2 bandit error --- .bandit | 1 + 
src/bbsearch/entrypoints/database_entrypoint.py | 7 +------ tox.ini | 2 +- 3 files changed, 3 insertions(+), 7 deletions(-) create mode 100644 .bandit diff --git a/.bandit b/.bandit new file mode 100644 index 000000000..da0f88d5b --- /dev/null +++ b/.bandit @@ -0,0 +1 @@ +skips: [B322] diff --git a/src/bbsearch/entrypoints/database_entrypoint.py b/src/bbsearch/entrypoints/database_entrypoint.py index 5630084aa..eb542c5d5 100644 --- a/src/bbsearch/entrypoints/database_entrypoint.py +++ b/src/bbsearch/entrypoints/database_entrypoint.py @@ -5,11 +5,6 @@ from ._helper import configure_logging -try: - rinput = raw_input -except NameError: - rinput = input - parser = argparse.ArgumentParser() parser.add_argument( "--log_dir", @@ -59,7 +54,7 @@ def main(): engine = sqlalchemy.create_engine(f"sqlite:///{database_path}") elif args.db_type == "mysql": # We assume the database `cord19_v35` already exists - mysql_uri = rinput("MySQL URI:") + mysql_uri = input("MySQL URI:") password = getpass.getpass("Password:") engine = sqlalchemy.create_engine( f"mysql+pymysql://root:{password}" f"@{mysql_uri}/cord19_v47" ) diff --git a/tox.ini b/tox.ini index 71bd5d14a..80b688084 100644 --- a/tox.ini +++ b/tox.ini @@ -32,7 +32,7 @@ commands = isort --profile black --check setup.py {[tox]source} tests pydocstyle {[tox]source} black -q --check setup.py {[tox]source} tests - bandit -q -r {[tox]source} + bandit -c .bandit -q -r {[tox]source} [testenv:format] skip_install = true From 563825d623cb4bd0c0bd509d3e0504f44d72bb38 Mon Sep 17 00:00:00 2001 From: Francesco Casalegno Date: Wed, 2 Dec 2020 15:22:59 +0100 Subject: [PATCH 18/18] Chain exception --- src/bbsearch/mining/entity.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bbsearch/mining/entity.py b/src/bbsearch/mining/entity.py index dc3626d80..3f41d380b 100644 --- a/src/bbsearch/mining/entity.py +++ b/src/bbsearch/mining/entity.py @@ -334,7 +334,7 @@ def row2raw(row): value = ast.literal_eval(value_str) except ValueError as ve: if str(ve).startswith("malformed node or string"): - raise NameError(str(ve)) + raise NameError(str(ve)) from ve else: raise else:
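Note: the recurring SQL change in patches 07-13 replaces f-string interpolation with bound parameters, i.e. `sqlalchemy.sql.text()` plus an expanding `bindparam` for `IN` clauses, with the values passed through `params=` to `pandas.read_sql`. Below is a minimal self-contained sketch of that pattern; the in-memory SQLite engine, the `articles` table, and the sample rows are hypothetical stand-ins added only to make the snippet runnable, not part of the patches themselves.

```python
import pandas as pd
import sqlalchemy
import sqlalchemy.sql as sql

# Hypothetical in-memory database, only so the snippet runs end to end.
engine = sqlalchemy.create_engine("sqlite://")
with engine.begin() as connection:
    connection.execute(
        sql.text("CREATE TABLE articles (article_id INTEGER, title TEXT)")
    )
    connection.execute(
        sql.text("INSERT INTO articles VALUES (1, 'Title A'), (2, 'Title B'), (3, 'Title C')")
    )

# Values are bound, never interpolated into the SQL string, so no injection
# is possible; the expanding bindparam turns the list into an IN (...) clause.
query = sql.text(
    """SELECT article_id, title
    FROM articles
    WHERE article_id IN :article_ids
    """
)
query = query.bindparams(sql.bindparam("article_ids", expanding=True))

article_ids = [1, 3]
titles = pd.read_sql(
    query,
    engine,
    # Cast to plain int, mirroring the int(...) casts added in patches 11-14
    # so numpy integer ids bind cleanly.
    params={"article_ids": [int(i) for i in article_ids]},
)
print(titles)
```

The same expanding-parameter call is what the series applies to the `sentence_ids` list in `retrieve_sentences_from_sentence_ids`, the `article_ids` list in `retrieve_articles`, and the `model_names` list in `retrieve_mining_cache`.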