From bb6247bcea51ed06949c0db68acdd71bc6f7ce3d Mon Sep 17 00:00:00 2001 From: Francesco Casalegno Date: Mon, 30 Nov 2020 13:41:06 +0100 Subject: [PATCH 01/18] Fix input() and assert --- src/bbsearch/entrypoints/database_entrypoint.py | 7 ++++++- src/bbsearch/mining/attribute.py | 5 +++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/bbsearch/entrypoints/database_entrypoint.py b/src/bbsearch/entrypoints/database_entrypoint.py index eb542c5d5..5630084aa 100644 --- a/src/bbsearch/entrypoints/database_entrypoint.py +++ b/src/bbsearch/entrypoints/database_entrypoint.py @@ -5,6 +5,11 @@ from ._helper import configure_logging +try: + rinput = raw_input +except NameError: + rinput = input + parser = argparse.ArgumentParser() parser.add_argument( "--log_dir", @@ -54,7 +59,7 @@ def main(): engine = sqlalchemy.create_engine(f"sqlite:///{database_path}") elif args.db_type == "mysql": # We assume the database `cord19_v35` already exists - mysql_uri = input("MySQL URI:") + mysql_uri = rinput("MySQL URI:") password = getpass.getpass("Password:") engine = sqlalchemy.create_engine( f"mysql+pymysql://root:{password}" f"@{mysql_uri}/cord19_v47" diff --git a/src/bbsearch/mining/attribute.py b/src/bbsearch/mining/attribute.py index c6075b639..f02a5742e 100644 --- a/src/bbsearch/mining/attribute.py +++ b/src/bbsearch/mining/attribute.py @@ -580,7 +580,7 @@ def get_core_nlp_analysis(self, text): response = requests.post( self.core_nlp_url + request_params, data=request_data ) - assert response.status_code == 200 + response.raise_for_status() response_json = json.loads(response.text) except requests.exceptions.RequestException: warnings.warn("There was a problem contacting the CoreNLP server.") @@ -801,7 +801,8 @@ def __init__(self, texts, attribute_extractor, ee_model): """ super().__init__() - assert len(texts) > 0 + if not texts: + raise ValueError(f"The list of texts to be annotated shoud be nonempty, but got texts = {texts}") self.texts = texts self.idx_slider = widgets.IntSlider( From fff2d8f0a596ebb6fd3ac12226d8c64e209e326e Mon Sep 17 00:00:00 2001 From: Francesco Casalegno Date: Mon, 30 Nov 2020 14:10:32 +0100 Subject: [PATCH 02/18] Enable bandit --- src/bbsearch/mining/entity.py | 2 ++ tox.ini | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/bbsearch/mining/entity.py b/src/bbsearch/mining/entity.py index 4e5679057..f500d83d2 100644 --- a/src/bbsearch/mining/entity.py +++ b/src/bbsearch/mining/entity.py @@ -198,6 +198,8 @@ def to_jsonl(self, path, sort_by=None): Parameters ---------- + path : pathlib.Path + File where to save it. sort_by : None or list If None, then no sorting taking place. If ``list``, then the names of columns along which to sort. 
diff --git a/tox.ini b/tox.ini index 480472a7f..1e70e43a5 100644 --- a/tox.ini +++ b/tox.ini @@ -21,7 +21,7 @@ commands = [testenv:lint] skip_install = true deps = -; bandit + bandit black==20.8b1 flake8==3.8.4 isort==5.6.4 @@ -31,7 +31,7 @@ commands = isort --profile black --check setup.py {[tox]source} tests pydocstyle {[tox]source} black -q --check setup.py {[tox]source} tests -; bandit -q -r {[tox]source} + bandit -q -r {[tox]source} [testenv:format] skip_install = true From 681d6e0ca9ef701e892bd7aadca1298671503c19 Mon Sep 17 00:00:00 2001 From: Francesco Casalegno Date: Mon, 30 Nov 2020 15:18:59 +0100 Subject: [PATCH 03/18] Fix eval() --- src/bbsearch/mining/attribute.py | 4 +++- src/bbsearch/mining/entity.py | 16 +++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/bbsearch/mining/attribute.py b/src/bbsearch/mining/attribute.py index f02a5742e..8ce81c788 100644 --- a/src/bbsearch/mining/attribute.py +++ b/src/bbsearch/mining/attribute.py @@ -802,7 +802,9 @@ def __init__(self, texts, attribute_extractor, ee_model): super().__init__() if not texts: - raise ValueError(f"The list of texts to be annotated shoud be nonempty, but got texts = {texts}") + raise ValueError( + f"Texts to be annotated shoud be nonempty, got texts = {texts}" + ) self.texts = texts self.idx_slider = widgets.IntSlider( diff --git a/src/bbsearch/mining/entity.py b/src/bbsearch/mining/entity.py index f500d83d2..dc3626d80 100644 --- a/src/bbsearch/mining/entity.py +++ b/src/bbsearch/mining/entity.py @@ -1,5 +1,6 @@ """Classes and functions for entity extraction (aka named entity recognition).""" +import ast import copy import numpy as np @@ -328,11 +329,16 @@ def row2raw(row): ): raise KeyError() - value = ( - eval(f"{value_type}({value_str})") - if value_type != "str" - else value_str - ) + if value_type != "str": + try: + value = ast.literal_eval(value_str) + except ValueError as ve: + if str(ve).startswith("malformed node or string"): + raise NameError(str(ve)) + else: + raise + else: + value = value_str token_pattern = {attribute: value} if op: From da0c0ce045e5bbef1381a9445c4696393b0246f3 Mon Sep 17 00:00:00 2001 From: Emilie Delattre Date: Mon, 30 Nov 2020 15:26:04 +0100 Subject: [PATCH 04/18] Fix assert --- src/bbsearch/mining/eval.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/bbsearch/mining/eval.py b/src/bbsearch/mining/eval.py index b87f215c1..c319cbc92 100644 --- a/src/bbsearch/mining/eval.py +++ b/src/bbsearch/mining/eval.py @@ -441,7 +441,9 @@ def ner_errors( ... } """ - assert len(iob_true) == len(iob_pred) == len(tokens) + if not len(iob_true) == len(iob_pred) == len(tokens): + raise ValueError("iob_true, iob_pred and tokens " + "should have the same length") etypes = unique_etypes(iob_true) etypes_map = etypes_map if etypes_map is not None else dict() From 50bc9aad2c9aa5fe32a3192ea7ba72c94c1a575e Mon Sep 17 00:00:00 2001 From: Francesco Casalegno Date: Mon, 30 Nov 2020 15:42:23 +0100 Subject: [PATCH 05/18] Fix assert() and add test --- src/bbsearch/mining/eval.py | 8 +++++--- tests/test_mining/test_eval.py | 2 ++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/bbsearch/mining/eval.py b/src/bbsearch/mining/eval.py index c319cbc92..57c81d798 100644 --- a/src/bbsearch/mining/eval.py +++ b/src/bbsearch/mining/eval.py @@ -441,9 +441,11 @@ def ner_errors( ... 
} """ - if not len(iob_true) == len(iob_pred) == len(tokens): - raise ValueError("iob_true, iob_pred and tokens " - "should have the same length") + if not (len(iob_true) == len(iob_pred) == len(tokens)): + raise ValueError( + f"Inputs iob_true (len={len(iob_true)}), iob_pred (len={len(iob_pred)}), " + f"tokens (len={len(tokens)}) should have equal lenght." + ) etypes = unique_etypes(iob_true) etypes_map = etypes_map if etypes_map is not None else dict() diff --git a/tests/test_mining/test_eval.py b/tests/test_mining/test_eval.py index 696312dbe..ac91169b7 100644 --- a/tests/test_mining/test_eval.py +++ b/tests/test_mining/test_eval.py @@ -560,6 +560,8 @@ def test_ner_errors(ner_annotations, dataset, mode, errors_expected): ) errors_expected = OrderedDict(errors_expected) assert errors_out == errors_expected + with pytest.raises(ValueError): + ner_errors(iob_true, iob_pred[:-1], tokens) def test_remove_punctuation(punctuation_annotations): From fe66866b6d91eaa6530d8dab448e1e194dd4c117 Mon Sep 17 00:00:00 2001 From: Francesco Casalegno Date: Mon, 30 Nov 2020 15:58:48 +0100 Subject: [PATCH 06/18] Fix hardcoded password --- src/bbsearch/mining/relation.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/bbsearch/mining/relation.py b/src/bbsearch/mining/relation.py index f46a173fc..9a6707583 100644 --- a/src/bbsearch/mining/relation.py +++ b/src/bbsearch/mining/relation.py @@ -137,11 +137,11 @@ def annotate(doc, sent, ent_1, ent_2, etype_symbols): tokens = [] i = sent.start while i < sent.end: - new_token = " " # hack to keep the punctuation nice + new_tkn = " " # hack to keep the punctuation nice if ent_1.start == i: start, end = ent_1.start, ent_1.end - new_token += ( + new_tkn += ( etype_symbols[etype_1][0] + doc[start:end].text + etype_symbols[etype_1][1] @@ -149,7 +149,7 @@ def annotate(doc, sent, ent_1, ent_2, etype_symbols): elif ent_2.start == i: start, end = ent_2.start, ent_2.end - new_token += ( + new_tkn += ( etype_symbols[etype_2][0] + doc[start:end].text + etype_symbols[etype_2][1] @@ -157,9 +157,9 @@ def annotate(doc, sent, ent_1, ent_2, etype_symbols): else: start, end = i, i + 1 - new_token = doc[i].text if doc[i].is_punct else new_token + doc[i].text + new_tkn = doc[i].text if doc[i].is_punct else new_tkn + doc[i].text - tokens.append(new_token) + tokens.append(new_tkn) i += end - start return "".join(tokens).strip() From 14a45046d2af9f53c8a193834bdc5cc4cf776e61 Mon Sep 17 00:00:00 2001 From: Emilie Delattre Date: Mon, 30 Nov 2020 16:07:31 +0100 Subject: [PATCH 07/18] Fix 2 SQL issues --- src/bbsearch/mining/eval.py | 2 +- src/bbsearch/sql.py | 28 ++++++++++++++++++++-------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/bbsearch/mining/eval.py b/src/bbsearch/mining/eval.py index 57c81d798..906d2d1f9 100644 --- a/src/bbsearch/mining/eval.py +++ b/src/bbsearch/mining/eval.py @@ -444,7 +444,7 @@ def ner_errors( if not (len(iob_true) == len(iob_pred) == len(tokens)): raise ValueError( f"Inputs iob_true (len={len(iob_true)}), iob_pred (len={len(iob_pred)}), " - f"tokens (len={len(tokens)}) should have equal lenght." + f"tokens (len={len(tokens)}) should have equal length." 
) etypes = unique_etypes(iob_true) diff --git a/src/bbsearch/sql.py b/src/bbsearch/sql.py index c4fe147ac..b59a618b2 100644 --- a/src/bbsearch/sql.py +++ b/src/bbsearch/sql.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd +import sqlalchemy.sql as sql def get_titles(article_ids, engine): @@ -151,13 +152,22 @@ def retrieve_paragraph(article_id, paragraph_pos_in_article, engine): pd.DataFrame with the paragraph and its metadata: article_id, text, section_name, paragraph_pos_in_article. """ - sql_query = f"""SELECT section_name, text + sql_query = sql.text( + """SELECT section_name, text FROM sentences - WHERE article_id = {article_id} - AND paragraph_pos_in_article = {paragraph_pos_in_article} + WHERE article_id = :article_id + AND paragraph_pos_in_article = :paragraph_pos_in_article ORDER BY sentence_pos_in_paragraph ASC""" - - sentences = pd.read_sql(sql_query, engine) + ) + + sentences = pd.read_sql( + sql_query, + engine, + params={ + "article_id": article_id, + "paragraph_pos_in_article": paragraph_pos_in_article, + }, + ) if sentences.empty: paragraph = pd.DataFrame( columns=["article_id", "text", "section_name", "paragraph_pos_in_article"] @@ -199,10 +209,12 @@ def retrieve_article_metadata_from_article_id(article_id, engine): 'authors', 'journal', 'mag_id', 'who_covidence_id', 'arxiv_id', 'pdf_json_files', 'pmc_json_files', 'url', 's2_id'. """ - sql_query = f"""SELECT * + sql_query = sql.text( + """SELECT * FROM articles - WHERE article_id = {article_id}""" - article = pd.read_sql(sql_query, engine) + WHERE article_id = :article_id""" + ) + article = pd.read_sql(sql_query, engine, params={"article_id": article_id}) return article From 0355f125527c824450a7d892bed5525917e2ff43 Mon Sep 17 00:00:00 2001 From: Emilie Delattre Date: Mon, 30 Nov 2020 16:42:52 +0100 Subject: [PATCH 08/18] Fix 1 SQL issue --- src/bbsearch/sql.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/bbsearch/sql.py b/src/bbsearch/sql.py index b59a618b2..914f55420 100644 --- a/src/bbsearch/sql.py +++ b/src/bbsearch/sql.py @@ -113,19 +113,23 @@ def retrieve_paragraph_from_sentence_id(sentence_id, engine): sentence_id. If None then the `sentence_id` was not found in the sentences table. 
""" - sql_query = f"""SELECT text + sql_query = sql.text( + """SELECT text FROM sentences WHERE article_id = (SELECT article_id FROM sentences - WHERE sentence_id = {sentence_id}) + WHERE sentence_id = :sentence_id ) AND paragraph_pos_in_article = (SELECT paragraph_pos_in_article FROM sentences - WHERE sentence_id = {sentence_id}) + WHERE sentence_id = :sentence_id ) ORDER BY sentence_pos_in_paragraph ASC""" + ) - all_sentences = pd.read_sql(sql_query, engine)["text"].to_list() + all_sentences = pd.read_sql(sql_query, engine, params={"sentence_id": sentence_id})[ + "text" + ].to_list() if not all_sentences: paragraph = None else: From 1ee7b15198407a10f802c130b0f13d3625e27f66 Mon Sep 17 00:00:00 2001 From: Francesco Casalegno Date: Mon, 30 Nov 2020 16:47:12 +0100 Subject: [PATCH 09/18] Fix sql injection at line 30 --- src/bbsearch/sql.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/bbsearch/sql.py b/src/bbsearch/sql.py index 914f55420..46865af35 100644 --- a/src/bbsearch/sql.py +++ b/src/bbsearch/sql.py @@ -24,13 +24,15 @@ def get_titles(article_ids, engine): if len(article_ids) == 0: return {} - query = f"""\ + query = sql.text( + f"""\ SELECT article_id, title FROM articles - WHERE article_id IN ({",".join(map(str, article_ids))}) + WHERE article_id IN :article_ids """ + ) with engine.begin() as connection: - response = connection.execute(query).fetchall() + response = connection.execute(query, {"article_ids": article_ids}).fetchall() titles = {article_id: title for article_id, title in response} return titles From e4eb7ef524033a6dce6f37d72fe4263f43c71de0 Mon Sep 17 00:00:00 2001 From: Francesco Casalegno Date: Mon, 30 Nov 2020 16:58:48 +0100 Subject: [PATCH 10/18] Fix sql injection at line 80 --- src/bbsearch/sql.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/bbsearch/sql.py b/src/bbsearch/sql.py index 46865af35..476685c70 100644 --- a/src/bbsearch/sql.py +++ b/src/bbsearch/sql.py @@ -77,14 +77,19 @@ def retrieve_sentences_from_sentence_ids(sentence_ids, engine, keep_order=False) """ sentence_ids_s = ", ".join(str(id_) for id_ in sentence_ids) sentence_ids_s = sentence_ids_s or "NULL" - sql_query = f""" + sql_query = sql.text( + f""" SELECT article_id, sentence_id, section_name, text, paragraph_pos_in_article FROM sentences - WHERE sentence_id IN ({sentence_ids_s}) + WHERE sentence_id IN :sentence_ids_s """ + ) + sql_query = sql_query.bindparams(sql.bindparam("sentence_ids_s", expanding=True)) with engine.begin() as connection: - df_sentences = pd.read_sql(sql_query, connection) + df_sentences = pd.read_sql( + sql_query, params={"sentence_ids_s": sentence_ids_s}, con=connection + ) if keep_order: # Remove sentence IDs that were not found, otherwise df.loc will fail. 
From 22ecf044217f2975113abeb69b00dfe58a69c7ed Mon Sep 17 00:00:00 2001 From: Francesco Casalegno Date: Mon, 30 Nov 2020 18:11:35 +0100 Subject: [PATCH 11/18] Fix some pytest errors --- src/bbsearch/sql.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/bbsearch/sql.py b/src/bbsearch/sql.py index 476685c70..f84280b94 100644 --- a/src/bbsearch/sql.py +++ b/src/bbsearch/sql.py @@ -31,6 +31,9 @@ def get_titles(article_ids, engine): WHERE article_id IN :article_ids """ ) + query = query.bindparams(sql.bindparam("article_ids", expanding=True)) + + with engine.begin() as connection: response = connection.execute(query, {"article_ids": article_ids}).fetchall() titles = {article_id: title for article_id, title in response} @@ -175,8 +178,8 @@ def retrieve_paragraph(article_id, paragraph_pos_in_article, engine): sql_query, engine, params={ - "article_id": article_id, - "paragraph_pos_in_article": paragraph_pos_in_article, + "article_id": int(article_id), + "paragraph_pos_in_article": int(paragraph_pos_in_article), }, ) if sentences.empty: @@ -225,7 +228,7 @@ def retrieve_article_metadata_from_article_id(article_id, engine): FROM articles WHERE article_id = :article_id""" ) - article = pd.read_sql(sql_query, engine, params={"article_id": article_id}) + article = pd.read_sql(sql_query, engine, params={"article_id": int(article_id)}) return article From 1b0a57caec2a72a86f3633dcd4129257796031c8 Mon Sep 17 00:00:00 2001 From: Emilie Delattre Date: Tue, 1 Dec 2020 09:55:18 +0100 Subject: [PATCH 12/18] Fix pytest tests --- src/bbsearch/sql.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/bbsearch/sql.py b/src/bbsearch/sql.py index f84280b94..8e9e76339 100644 --- a/src/bbsearch/sql.py +++ b/src/bbsearch/sql.py @@ -25,15 +25,13 @@ def get_titles(article_ids, engine): return {} query = sql.text( - f"""\ - SELECT article_id, title - FROM articles - WHERE article_id IN :article_ids - """ + """SELECT article_id, title + FROM articles + WHERE article_id IN :article_ids + """ ) query = query.bindparams(sql.bindparam("article_ids", expanding=True)) - with engine.begin() as connection: response = connection.execute(query, {"article_ids": article_ids}).fetchall() titles = {article_id: title for article_id, title in response} @@ -78,20 +76,22 @@ def retrieve_sentences_from_sentence_ids(sentence_ids, engine, keep_order=False) Pandas DataFrame containing all sentences and their corresponding metadata: article_id, sentence_id, section_name, text, paragraph_pos_in_article. 
""" - sentence_ids_s = ", ".join(str(id_) for id_ in sentence_ids) - sentence_ids_s = sentence_ids_s or "NULL" + # sentence_ids_s = ", ".join(str(id_) for id_ in sentence_ids) + # sentence_ids_s = sentence_ids_s or "NULL" sql_query = sql.text( - f""" - SELECT article_id, sentence_id, section_name, text, paragraph_pos_in_article - FROM sentences - WHERE sentence_id IN :sentence_ids_s - """ + """ + SELECT article_id, sentence_id, section_name, text, paragraph_pos_in_article + FROM sentences + WHERE sentence_id IN :sentence_ids + """ ) - sql_query = sql_query.bindparams(sql.bindparam("sentence_ids_s", expanding=True)) + sql_query = sql_query.bindparams(sql.bindparam("sentence_ids", expanding=True)) with engine.begin() as connection: df_sentences = pd.read_sql( - sql_query, params={"sentence_ids_s": sentence_ids_s}, con=connection + sql_query, + params={"sentence_ids": [int(id_) for id_ in sentence_ids]}, + con=connection, ) if keep_order: From db30b0ad4da73fe266850a4eff79acbc40874d19 Mon Sep 17 00:00:00 2001 From: Emilie Delattre Date: Tue, 1 Dec 2020 10:06:42 +0100 Subject: [PATCH 13/18] Fix 1 SQL issue --- src/bbsearch/sql.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/bbsearch/sql.py b/src/bbsearch/sql.py index 8e9e76339..c9956ae10 100644 --- a/src/bbsearch/sql.py +++ b/src/bbsearch/sql.py @@ -76,8 +76,6 @@ def retrieve_sentences_from_sentence_ids(sentence_ids, engine, keep_order=False) Pandas DataFrame containing all sentences and their corresponding metadata: article_id, sentence_id, section_name, text, paragraph_pos_in_article. """ - # sentence_ids_s = ", ".join(str(id_) for id_ in sentence_ids) - # sentence_ids_s = sentence_ids_s or "NULL" sql_query = sql.text( """ SELECT article_id, sentence_id, section_name, text, paragraph_pos_in_article @@ -248,14 +246,17 @@ def retrieve_articles(article_ids, engine): DataFrame containing the articles divided into paragraphs. The columns are 'article_id', 'paragraph_pos_in_article', 'text', 'section_name'. 
""" - articles_str = ", ".join(str(id_) for id_ in article_ids) - sql_query = f"""SELECT * + article_ids = [int(id_) for id_ in article_ids] + sql_query = sql.text( + """SELECT * FROM sentences - WHERE article_id IN ({articles_str}) + WHERE article_id IN :articles_ids ORDER BY article_id ASC, paragraph_pos_in_article ASC, sentence_pos_in_paragraph ASC""" - all_sentences = pd.read_sql(sql_query, engine) + ) + sql_query = sql_query.bindparams(sql.bindparam("articles_ids", expanding=True)) + all_sentences = pd.read_sql(sql_query, engine, params={"articles_ids": article_ids}) groupby_var = all_sentences.groupby(by=["article_id", "paragraph_pos_in_article"]) paragraphs = groupby_var["text"].apply(lambda x: " ".join(x)) From e701b852548afbe6b14649dd3bbc9d73cd470091 Mon Sep 17 00:00:00 2001 From: Emilie Delattre Date: Tue, 1 Dec 2020 10:36:36 +0100 Subject: [PATCH 14/18] cast sentence_id to int --- src/bbsearch/sql.py | 6 +++--- tests/test_sql.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/bbsearch/sql.py b/src/bbsearch/sql.py index c9956ae10..65e74b59b 100644 --- a/src/bbsearch/sql.py +++ b/src/bbsearch/sql.py @@ -135,9 +135,9 @@ def retrieve_paragraph_from_sentence_id(sentence_id, engine): ORDER BY sentence_pos_in_paragraph ASC""" ) - all_sentences = pd.read_sql(sql_query, engine, params={"sentence_id": sentence_id})[ - "text" - ].to_list() + all_sentences = pd.read_sql( + sql_query, engine, params={"sentence_id": int(sentence_id)} + )["text"].to_list() if not all_sentences: paragraph = None else: diff --git a/tests/test_sql.py b/tests/test_sql.py index 9db19be3f..ec177d347 100644 --- a/tests/test_sql.py +++ b/tests/test_sql.py @@ -74,7 +74,7 @@ def test_retrieve_sentence_from_sentence_ids( ] ) - @pytest.mark.parametrize("sentence_id", [1, 2, 3, 0, -100]) + @pytest.mark.parametrize("sentence_id", [1, 2, 3, 0, -100, -1, np.int64(2)]) def test_retrieve_paragraph_from_sentence_id( self, sentence_id, fake_sqlalchemy_engine ): @@ -85,7 +85,7 @@ def test_retrieve_paragraph_from_sentence_id( sentence_text = retrieve_sentences_from_sentence_ids( sentence_ids=(sentence_id,), engine=fake_sqlalchemy_engine ) - if sentence_id == 0 or sentence_id == -100: # invalid sentence_id + if sentence_id in [0, -100, -1]: # invalid sentence_id assert paragraph is None else: assert isinstance(paragraph, str) From 7bc325d5731a2a7c1fce08b334e425a7a945e60b Mon Sep 17 00:00:00 2001 From: Emilie Delattre Date: Tue, 1 Dec 2020 10:59:16 +0100 Subject: [PATCH 15/18] Comment some bandit issues --- src/bbsearch/database/cord_19.py | 2 +- src/bbsearch/database/mining_cache.py | 2 +- src/bbsearch/sql.py | 32 +++++++++++++++++---------- 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/src/bbsearch/database/cord_19.py b/src/bbsearch/database/cord_19.py index 5b198596d..92f8fac5d 100644 --- a/src/bbsearch/database/cord_19.py +++ b/src/bbsearch/database/cord_19.py @@ -37,7 +37,7 @@ def mark_bad_sentences(engine, sentences_table_name): logger.info("Getting all sentences") with engine.begin() as connection: - query = f"SELECT sentence_id, text FROM {sentences_table_name}" + query = f"SELECT sentence_id, text FROM {sentences_table_name}" # nosec df_sentences = pd.read_sql(query, connection) logger.info("Computing text lengths") diff --git a/src/bbsearch/database/mining_cache.py b/src/bbsearch/database/mining_cache.py index f48f44969..ed6751b3f 100644 --- a/src/bbsearch/database/mining_cache.py +++ b/src/bbsearch/database/mining_cache.py @@ -285,7 +285,7 @@ def _delete_rows(self): DELETE FROM 
{self.target_table} WHERE mining_model = :mining_model - """ + """ # nosec self.engine.execute( sqlalchemy.sql.text(query), mining_model=model_schema["model_path"], diff --git a/src/bbsearch/sql.py b/src/bbsearch/sql.py index 65e74b59b..0a6dcbfa6 100644 --- a/src/bbsearch/sql.py +++ b/src/bbsearch/sql.py @@ -287,21 +287,27 @@ def retrieve_mining_cache(identifiers, model_names, engine): result : pd.DataFrame Selected rows of the `mining_cache` table. """ - model_names = tuple(set(model_names)) - if len(model_names) == 1: - model_names = f"('{model_names[0]}')" + model_names = list(set(model_names)) + identifiers_arts = [int(a) for a, p in identifiers if p == -1] - identifiers_arts = tuple(a for a, p in identifiers if p == -1) - if len(identifiers_arts) == 1: - identifiers_arts = f"({identifiers_arts[0]})" if identifiers_arts: - query_arts = f""" + query_arts = sql.text( + """ SELECT * FROM mining_cache - WHERE article_id IN {identifiers_arts} AND mining_model IN {model_names} + WHERE article_id IN :identifiers_arts AND mining_model IN :model_names ORDER BY article_id, paragraph_pos_in_article, start_char """ - df_arts = pd.read_sql(query_arts, con=engine) + ) + query_arts = query_arts.bindparams( + sql.bindparam("identifiers_arts", expanding=True), + sql.bindparam("model_names", expanding=True), + ) + df_arts = pd.read_sql( + query_arts, + con=engine, + params={"identifiers_arts": identifiers_arts, "model_names": model_names}, + ) else: df_arts = pd.DataFrame() @@ -314,6 +320,8 @@ def retrieve_mining_cache(identifiers, model_names, engine): # 3. If `len(identifiers_pars)` is too large, we may have a too long # SQL statement which overflows the max length. So we break it down. + if len(model_names) == 1: + model_names = f"('{model_names[0]}')" batch_size = 1000 dfs_pars = [] d, r = divmod(len(identifiers_pars), batch_size) @@ -323,14 +331,14 @@ def retrieve_mining_cache(identifiers, model_names, engine): SELECT * FROM mining_cache WHERE (article_id = {a} AND paragraph_pos_in_article = {p}) - """ + """ # nosec for a, p in identifiers_pars[i * batch_size : (i + 1) * batch_size] ) query_pars = f""" SELECT * FROM ({query_pars}) tt WHERE tt.mining_model IN {model_names} - """ + """ # nosec dfs_pars.append(pd.read_sql(query_pars, engine)) df_pars = pd.concat(dfs_pars) df_pars = df_pars.sort_values( @@ -580,7 +588,7 @@ def _build_query(self): FROM articles WHERE {" AND ".join(article_conditions)} ) - """.strip() + """.strip() # nosec sentence_conditions.append(article_condition_query) # Restricted sentence IDs From 88c9437fc0c0d1758b2e81e401307c7775e46251 Mon Sep 17 00:00:00 2001 From: Francesco Casalegno Date: Tue, 1 Dec 2020 16:44:00 +0100 Subject: [PATCH 16/18] Fix exception message --- src/bbsearch/mining/attribute.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/bbsearch/mining/attribute.py b/src/bbsearch/mining/attribute.py index 8ce81c788..dc3cb4454 100644 --- a/src/bbsearch/mining/attribute.py +++ b/src/bbsearch/mining/attribute.py @@ -802,9 +802,7 @@ def __init__(self, texts, attribute_extractor, ee_model): super().__init__() if not texts: - raise ValueError( - f"Texts to be annotated shoud be nonempty, got texts = {texts}" - ) + raise TypeError("texts must be a non-empty list.") self.texts = texts self.idx_slider = widgets.IntSlider( From d44fb72a29e83585de00a3fe41a22c2cbb70795d Mon Sep 17 00:00:00 2001 From: Francesco Casalegno Date: Wed, 2 Dec 2020 14:58:05 +0100 Subject: [PATCH 17/18] Ignore python2 bandit error --- .bandit | 1 + 
src/bbsearch/entrypoints/database_entrypoint.py | 7 +------ tox.ini | 2 +- 3 files changed, 3 insertions(+), 7 deletions(-) create mode 100644 .bandit diff --git a/.bandit b/.bandit new file mode 100644 index 000000000..da0f88d5b --- /dev/null +++ b/.bandit @@ -0,0 +1 @@ +skips: [B322] diff --git a/src/bbsearch/entrypoints/database_entrypoint.py b/src/bbsearch/entrypoints/database_entrypoint.py index 5630084aa..eb542c5d5 100644 --- a/src/bbsearch/entrypoints/database_entrypoint.py +++ b/src/bbsearch/entrypoints/database_entrypoint.py @@ -5,11 +5,6 @@ from ._helper import configure_logging -try: - rinput = raw_input -except NameError: - rinput = input - parser = argparse.ArgumentParser() parser.add_argument( "--log_dir", @@ -59,7 +54,7 @@ def main(): engine = sqlalchemy.create_engine(f"sqlite:///{database_path}") elif args.db_type == "mysql": # We assume the database `cord19_v35` already exists - mysql_uri = rinput("MySQL URI:") + mysql_uri = input("MySQL URI:") password = getpass.getpass("Password:") engine = sqlalchemy.create_engine( f"mysql+pymysql://root:{password}" f"@{mysql_uri}/cord19_v47" ) diff --git a/tox.ini b/tox.ini index 71bd5d14a..80b688084 100644 --- a/tox.ini +++ b/tox.ini @@ -32,7 +32,7 @@ commands = isort --profile black --check setup.py {[tox]source} tests pydocstyle {[tox]source} black -q --check setup.py {[tox]source} tests - bandit -q -r {[tox]source} + bandit -c .bandit -q -r {[tox]source} [testenv:format] skip_install = true From 563825d623cb4bd0c0bd509d3e0504f44d72bb38 Mon Sep 17 00:00:00 2001 From: Francesco Casalegno Date: Wed, 2 Dec 2020 15:22:59 +0100 Subject: [PATCH 18/18] Chain exception --- src/bbsearch/mining/entity.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bbsearch/mining/entity.py b/src/bbsearch/mining/entity.py index dc3626d80..3f41d380b 100644 --- a/src/bbsearch/mining/entity.py +++ b/src/bbsearch/mining/entity.py @@ -334,7 +334,7 @@ def row2raw(row): value = ast.literal_eval(value_str) except ValueError as ve: if str(ve).startswith("malformed node or string"): - raise NameError(str(ve)) + raise NameError(str(ve)) from ve else: raise else:
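Note: the recurring SQL change in patches 07-13 replaces f-string interpolation with bound parameters, i.e. `sqlalchemy.sql.text()` plus an expanding `bindparam` for `IN` clauses, with the values passed through `params=` to `pandas.read_sql`. Below is a minimal self-contained sketch of that pattern; the in-memory SQLite engine, the `articles` table, and the sample rows are hypothetical stand-ins added only to make the snippet runnable, not part of the patches themselves.

```python
import pandas as pd
import sqlalchemy
import sqlalchemy.sql as sql

# Hypothetical in-memory database, only so the snippet runs end to end.
engine = sqlalchemy.create_engine("sqlite://")
with engine.begin() as connection:
    connection.execute(
        sql.text("CREATE TABLE articles (article_id INTEGER, title TEXT)")
    )
    connection.execute(
        sql.text("INSERT INTO articles VALUES (1, 'Title A'), (2, 'Title B'), (3, 'Title C')")
    )

# Values are bound, never interpolated into the SQL string, so no injection
# is possible; the expanding bindparam turns the list into an IN (...) clause.
query = sql.text(
    """SELECT article_id, title
    FROM articles
    WHERE article_id IN :article_ids
    """
)
query = query.bindparams(sql.bindparam("article_ids", expanding=True))

article_ids = [1, 3]
titles = pd.read_sql(
    query,
    engine,
    # Cast to plain int, mirroring the int(...) casts added in patches 11-14
    # so numpy integer ids bind cleanly.
    params={"article_ids": [int(i) for i in article_ids]},
)
print(titles)
```

The same expanding-parameter call is what the series applies to the `sentence_ids` list in `retrieve_sentences_from_sentence_ids`, the `article_ids` list in `retrieve_articles`, and the `model_names` list in `retrieve_mining_cache`.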