diff --git a/src/apps/slackapp/slackapp/bolt_app.py b/src/apps/slackapp/slackapp/bolt_app.py index 6d366ab9..436f14bb 100644 --- a/src/apps/slackapp/slackapp/bolt_app.py +++ b/src/apps/slackapp/slackapp/bolt_app.py @@ -337,7 +337,14 @@ def main(): logger.info( "App init: starting HTTP server on port {port}".format(port=cfg.SLACK_PORT) ) - flask_app.run(host="0.0.0.0", port=cfg.SLACK_PORT, debug=cfg.FLASK_DEBUG) + # SECURITY host "0.0.0.0" tells Flask to listen on all available IP addresses. + # This is handy for development, but unsafe in production. + # See https://bandit.readthedocs.io/en/1.7.8/plugins/b104_hardcoded_bind_all_interfaces.html. + # In production you would typically place the Flask server behind a WSGI + # server like Gunicorn and a reverse proxy, and implement other security measures. + flask_app.run( + host="0.0.0.0", port=cfg.SLACK_PORT, debug=cfg.FLASK_DEBUG # nosec B104 + ) # Start the HTTP server diff --git a/src/poetry.lock b/src/poetry.lock index fdf141a7..5b2a14e1 100644 --- a/src/poetry.lock +++ b/src/poetry.lock @@ -210,6 +210,30 @@ files = [ {file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"}, ] +[[package]] +name = "bandit" +version = "1.7.8" +description = "Security oriented static analyser for python code." +optional = false +python-versions = ">=3.8" +files = [ + {file = "bandit-1.7.8-py3-none-any.whl", hash = "sha256:509f7af645bc0cd8fd4587abc1a038fc795636671ee8204d502b933aee44f381"}, + {file = "bandit-1.7.8.tar.gz", hash = "sha256:36de50f720856ab24a24dbaa5fee2c66050ed97c1477e0a1159deab1775eab6b"}, +] + +[package.dependencies] +colorama = {version = ">=0.3.9", markers = "platform_system == \"Windows\""} +PyYAML = ">=5.3.1" +rich = "*" +stevedore = ">=1.20.0" + +[package.extras] +baseline = ["GitPython (>=3.1.30)"] +sarif = ["jschema-to-python (>=1.2.3)", "sarif-om (>=1.0.4)"] +test = ["beautifulsoup4 (>=4.8.0)", "coverage (>=4.5.4)", "fixtures (>=3.0.0)", "flake8 (>=4.0.0)", "pylint (==1.9.4)", "stestr (>=2.5.0)", "testscenarios (>=0.5.0)", "testtools (>=2.3.0)"] +toml = ["tomli (>=1.1.0)"] +yaml = ["PyYAML"] + [[package]] name = "bcrypt" version = "4.1.2" @@ -1877,6 +1901,30 @@ importlib-metadata = {version = ">=4.4", markers = "python_version < \"3.10\""} docs = ["mdx-gh-links (>=0.2)", "mkdocs (>=1.0)", "mkdocs-nature (>=0.4)"] testing = ["coverage", "pyyaml"] +[[package]] +name = "markdown-it-py" +version = "3.0.0" +description = "Python port of markdown-it. Markdown parsing, done right!" +optional = false +python-versions = ">=3.8" +files = [ + {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"}, + {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"}, +] + +[package.dependencies] +mdurl = ">=0.1,<1.0" + +[package.extras] +benchmarking = ["psutil", "pytest", "pytest-benchmark"] +code-style = ["pre-commit (>=3.0,<4.0)"] +compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "mistletoe (>=1.0,<2.0)", "mistune (>=2.0,<3.0)", "panflute (>=2.3,<3.0)"] +linkify = ["linkify-it-py (>=1,<3)"] +plugins = ["mdit-py-plugins"] +profiling = ["gprof2dot"] +rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"] +testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] + [[package]] name = "markupsafe" version = "2.1.5" @@ -1976,6 +2024,17 @@ files = [ {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, ] +[[package]] +name = "mdurl" +version = "0.1.2" +description = "Markdown URL utilities" +optional = false +python-versions = ">=3.7" +files = [ + {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, + {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, +] + [[package]] name = "mmh3" version = "4.1.0" @@ -2715,6 +2774,17 @@ files = [ {file = "pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712"}, ] +[[package]] +name = "pbr" +version = "6.0.0" +description = "Python Build Reasonableness" +optional = false +python-versions = ">=2.6" +files = [ + {file = "pbr-6.0.0-py2.py3-none-any.whl", hash = "sha256:4a7317d5e3b17a3dccb6a8cfe67dab65b20551404c52c8ed41279fa4f0cb4cda"}, + {file = "pbr-6.0.0.tar.gz", hash = "sha256:d1377122a5a00e2f940ee482999518efe16d745d423a670c27773dfbc3c9a7d9"}, +] + [[package]] name = "pinecone-client" version = "2.2.4" @@ -3063,6 +3133,21 @@ files = [ {file = "pyflakes-3.1.0.tar.gz", hash = "sha256:a0aae034c444db0071aa077972ba4768d40c830d9539fd45bf4cd3f8f6992efc"}, ] +[[package]] +name = "pygments" +version = "2.17.2" +description = "Pygments is a syntax highlighting package written in Python." +optional = false +python-versions = ">=3.7" +files = [ + {file = "pygments-2.17.2-py3-none-any.whl", hash = "sha256:b27c2826c47d0f3219f29554824c30c5e8945175d888647acd804ddd04af846c"}, + {file = "pygments-2.17.2.tar.gz", hash = "sha256:da46cec9fd2de5be3a8a784f434e4c4ab670b4ff54d605c4c2717e9d49c4c367"}, +] + +[package.extras] +plugins = ["importlib-metadata"] +windows-terminal = ["colorama (>=0.4.6)"] + [[package]] name = "pypdf" version = "3.17.4" @@ -3514,6 +3599,24 @@ requests = ">=2.0.0" [package.extras] rsa = ["oauthlib[signedtoken] (>=3.0.0)"] +[[package]] +name = "rich" +version = "13.7.1" +description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" +optional = false +python-versions = ">=3.7.0" +files = [ + {file = "rich-13.7.1-py3-none-any.whl", hash = "sha256:4edbae314f59eb482f54e9e30bf00d33350aaa94f4bfcd4e9e3110e64d0d7222"}, + {file = "rich-13.7.1.tar.gz", hash = "sha256:9be308cb1fe2f1f57d67ce99e95af38a1e2bc71ad9813b0e247cf7ffbcc3a432"}, +] + +[package.dependencies] +markdown-it-py = ">=2.2.0" +pygments = ">=2.13.0,<3.0.0" + +[package.extras] +jupyter = ["ipywidgets (>=7.5.1,<9)"] + [[package]] name = "rsa" version = "4.9" @@ -3998,6 +4101,20 @@ typing-extensions = {version = ">=3.10.0", markers = "python_version < \"3.10\"" [package.extras] full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart (>=0.0.7)", "pyyaml"] +[[package]] +name = "stevedore" +version = "5.2.0" +description = "Manage dynamic plugins for Python applications" +optional = false +python-versions = ">=3.8" +files = [ + {file = "stevedore-5.2.0-py3-none-any.whl", hash = "sha256:1c15d95766ca0569cad14cb6272d4d31dae66b011a929d7c18219c176ea1b5c9"}, + {file = "stevedore-5.2.0.tar.gz", hash = "sha256:46b93ca40e1114cea93d738a6c1e365396981bb6bb78c27045b7587c9473544d"}, +] + +[package.dependencies] +pbr = ">=2.0.0,<2.1.0 || >2.1.0" + [[package]] name = "sympy" version = "1.12" @@ -5055,4 +5172,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "80db08c02f1b519a174cde761690e23fbc46009530088f33668fa839c0978d67" +content-hash = "8c9eacd638a67be2433db2c8a7a4753dc57dfd500e2b82b0997e4cc4f72e3379" diff --git a/src/pyproject.toml b/src/pyproject.toml index 1cc16175..48c77852 100644 --- a/src/pyproject.toml +++ b/src/pyproject.toml @@ -32,6 +32,7 @@ pytest-cov = "^4.1.0" [tool.poetry.group.lint.dependencies] +bandit = "^1.7.8" black = "^23.7.0" flake8 = "^6.1.0" isort = "^5.12.0" diff --git a/src/sherpa_ai/database/user_usage_tracker.py b/src/sherpa_ai/database/user_usage_tracker.py index 738d6884..e51cb9e0 100644 --- a/src/sherpa_ai/database/user_usage_tracker.py +++ b/src/sherpa_ai/database/user_usage_tracker.py @@ -2,7 +2,6 @@ import boto3 import sqlalchemy.orm -import sqlalchemy.orm from anyio import Path from sqlalchemy import TIMESTAMP, Boolean, Column, Integer, String, create_engine from sqlalchemy.orm import declarative_base, sessionmaker diff --git a/src/sherpa_ai/scrape/extract_github_readme.py b/src/sherpa_ai/scrape/extract_github_readme.py index 6ff294ad..a402bada 100644 --- a/src/sherpa_ai/scrape/extract_github_readme.py +++ b/src/sherpa_ai/scrape/extract_github_readme.py @@ -11,6 +11,9 @@ from sherpa_ai.connectors.vectorstores import ConversationStore +GITHUB_REQUEST_TIMEOUT = 2.5 + + def get_owner_and_repo(url): """ Extracts the owner and repository name from a GitHub repository URL. @@ -49,7 +52,9 @@ def extract_github_readme(repo_url): "X-GitHub-Api-Version": "2022-11-28", } - response = requests.get(github_api_url, headers=headers) + response = requests.get( + github_api_url, headers=headers, timeout=GITHUB_REQUEST_TIMEOUT + ) files = response.json() if type(files) is dict and files["message"].lower() == "bad credentials": @@ -72,7 +77,9 @@ def extract_github_readme(repo_url): "X-GitHub-Api-Version": "2022-11-28", } - response = requests.get(github_api_url, headers=headers) + response = requests.get( + github_api_url, headers=headers, timeout=GITHUB_REQUEST_TIMEOUT + ) data = response.json() if "content" in data: content = base64.b64decode(data["content"]).decode("utf-8") diff --git a/src/sherpa_ai/scrape/file_scraper.py b/src/sherpa_ai/scrape/file_scraper.py index e850e438..21e9ce6d 100644 --- a/src/sherpa_ai/scrape/file_scraper.py +++ b/src/sherpa_ai/scrape/file_scraper.py @@ -11,6 +11,9 @@ ) +DOWNLOAD_TIMEOUT = 2.5 + + class QuestionWithFileHandler: def __init__(self, question, files, token, user_id, team_id): """ @@ -66,7 +69,9 @@ def download_file(self, file): "Authorization": f"Bearer {self.token}", "Accept": file["mimetype"], } - response = requests.get(file["url_private_download"], headers=headers) + response = requests.get( + file["url_private_download"], headers=headers, timeout=DOWNLOAD_TIMEOUT + ) destination = file["id"] + file["filetype"] # Check if the request was successful (HTTP status code 200) diff --git a/src/sherpa_ai/tools.py b/src/sherpa_ai/tools.py index 23c86ba0..21aa35c2 100644 --- a/src/sherpa_ai/tools.py +++ b/src/sherpa_ai/tools.py @@ -1,10 +1,6 @@ -import os import re -import urllib import urllib.parse -import urllib.request from typing import Any, List, Tuple, Union -from urllib.parse import urlparse import requests from bs4 import BeautifulSoup @@ -21,6 +17,9 @@ from sherpa_ai.output_parser import TaskAction +HTTP_GET_TIMEOUT = 2.5 + + def get_tools(memory, config): tools = [] @@ -56,8 +55,8 @@ def _run(self, query: str) -> str: + "&start=0&max_results=" + str(top_k) ) - data = urllib.request.urlopen(url) - xml_content = data.read().decode("utf-8") + data = requests.get(url, timeout=HTTP_GET_TIMEOUT) + xml_content = data.text summary_pattern = r"(.*?)" summaries = re.findall(summary_pattern, xml_content, re.DOTALL) diff --git a/src/sherpa_ai/utils.py b/src/sherpa_ai/utils.py index a648c8e9..5a4bae04 100644 --- a/src/sherpa_ai/utils.py +++ b/src/sherpa_ai/utils.py @@ -1,9 +1,7 @@ import json import re from typing import List, Optional, Union -from urllib.error import HTTPError, URLError from urllib.parse import urlparse -from urllib.request import urlopen import requests import spacy @@ -23,6 +21,9 @@ from sherpa_ai.models.sherpa_base_model import SherpaOpenAI +HTTP_GET_TIMEOUT = 2.5 + + def load_files(files: List[str]) -> List[Document]: documents = [] loader = None @@ -83,7 +84,7 @@ def get_link_from_slack_client_conversation(data): def scrape_with_url(url: str): - response = requests.get(url) + response = requests.get(url, timeout=HTTP_GET_TIMEOUT) soup = BeautifulSoup(response.content, "html.parser") data = soup.get_text(strip=True) status = response.status_code @@ -288,24 +289,20 @@ def extract_urls(text): def check_url(url): """ - Opens `url` to test its validity. + Performs an HTTP GET request on `url` to test its validity. - Returns True if `url` can be opened, False otherwise. + Returns True if GET succeeds, False otherwise. """ - try: - _ = urlopen(url) - - except HTTPError as e: - logger.info("HTTP error", e) - return False - - except URLError as e: - logger.info("Oops ! Page not found!", e) - return False - + if urlparse(url).scheme in ["http", "https"]: + try: + _ = requests.get(url, timeout=HTTP_GET_TIMEOUT) + return True + except Exception as e: + logger.info(f"{e} - {url}") + return False else: - return True + raise ValueError(f"URL must conform to HTTP(S) scheme: {url}") def extract_numbers_from_text(text): @@ -497,7 +494,7 @@ def json_from_text(text: str): Returns: dict: Parsed JSON data. """ - if type(text) == str: + if isinstance(text, str): text = text.replace("\n", "") json_pattern = r"\{.*\}" json_match = re.search(json_pattern, text) @@ -507,7 +504,7 @@ def json_from_text(text: str): try: parsed_json = json.loads(json_data) return parsed_json - except json.JSONDecodeError as e: + except json.JSONDecodeError: return {} else: return {} @@ -554,7 +551,7 @@ def text_similarity_by_llm( prompt = ( instruction + """ - only return {"entity_exist": true , "messages":"" } if all entities are mentioned inside the answer in + only return {"entity_exist": true , "messages":"" } if all entities are mentioned inside the answer in only return {"entity_exist": false , "messages": " Entity x hasn't been mentioned inside the answer"} if the entity is not mentioned properly . """ ) diff --git a/src/tests/integration_tests/test_entity_citation_validator.py b/src/tests/integration_tests/test_entity_citation_validator.py index 1734b6ca..42e77eea 100644 --- a/src/tests/integration_tests/test_entity_citation_validator.py +++ b/src/tests/integration_tests/test_entity_citation_validator.py @@ -50,7 +50,7 @@ ], ) def test_entity_citation_succeeds_in_qa( - get_llm, test_id, objective, input_data, expected_entities + get_llm, test_id, objective, input_data, expected_entities # noqa: F811 ): llm = get_llm( __file__, test_entity_citation_succeeds_in_qa.__name__ + f"_{str(test_id)}" diff --git a/src/tests/integration_tests/test_qa_agent_actions.py b/src/tests/integration_tests/test_qa_agent_actions.py index fae1690e..acad6f6b 100644 --- a/src/tests/integration_tests/test_qa_agent_actions.py +++ b/src/tests/integration_tests/test_qa_agent_actions.py @@ -43,7 +43,7 @@ def test_qa_agent_succeeds(get_llm): # noqa: F811 assert len(results) == 1 -def test_qa_agent_citation_validation_no_action(get_llm): +def test_qa_agent_citation_validation_no_action(get_llm): # noqa: F811 llm = get_llm(__file__, test_qa_agent_citation_validation_no_action.__name__) shared_memory = SharedMemory( @@ -69,7 +69,7 @@ def test_qa_agent_citation_validation_no_action(get_llm): assert len(results) == 1 -def test_qa_agent_citation_validation_multiple_action(get_llm): +def test_qa_agent_citation_validation_multiple_action(get_llm): # noqa: F811 # Make sure the citation validation works even when the the action providing citation is not selected llm = get_llm(__file__, test_qa_agent_citation_validation_multiple_action.__name__) diff --git a/src/tests/unit_tests/test_util.py b/src/tests/unit_tests/test_utils.py similarity index 93% rename from src/tests/unit_tests/test_util.py rename to src/tests/unit_tests/test_utils.py index fc832667..67f5398d 100644 --- a/src/tests/unit_tests/test_util.py +++ b/src/tests/unit_tests/test_utils.py @@ -4,6 +4,7 @@ from sherpa_ai.utils import ( check_if_number_exist, + check_url, extract_entities, extract_numbers_from_text, get_base_url, @@ -240,7 +241,7 @@ def test_verify_numbers_against_source_succeeds(text_to_test, source_text): (None, []), ], ) -def test_extract_numbers_from_text(text_to_test, expected_data): +def test_extract_numbers_from_text_2(text_to_test, expected_data): extracted_number = extract_numbers_from_text(text_to_test) # source data has these numbers in it numbers_in_source_data = expected_data @@ -295,7 +296,7 @@ def test_verify_numbers_against_source_fails(text_to_test, source_text): ("123something12minim jammed together $45 above 7 elit123", "45 7 12", False), ], ) -def test_extract_numbers_from_text(text_to_test, source_text, expected_result): +def test_extract_numbers_from_text_3(text_to_test, source_text, expected_result): # test against a text which don't have the same numbers as the source check_result = check_if_number_exist(text_to_test, source_text) @@ -378,7 +379,7 @@ def test_text_similarity_entities_present(): check_entity = ["apple", "banana", "orange"] source_entity = ["apple", "orange"] entity_exist, message = text_similarity(check_entity, source_entity) - assert entity_exist == True + assert entity_exist is True assert message == "" @@ -386,7 +387,7 @@ def test_text_similarity_entities_not_present(): check_entity = ["apple", "banana", "orange"] source_entity = ["grape", "kiwi", "pear"] entity_exist, message = text_similarity(check_entity, source_entity) - assert entity_exist == False + assert entity_exist is False expected_message = ( "remember to address these entities grape, kiwi, pear, in final the answer." ) @@ -401,14 +402,6 @@ def test_text_similarity_with_entities_exist(): assert message == "" -def test_text_similarity_with_entities_exist(): - check_entity = ["apple", "banana", "orange"] - source_entity = ["apples", "oranges"] - entity_exist, message = text_similarity_by_metrics(check_entity, source_entity) - assert entity_exist is True - assert message == "" - - def test_text_similarity_with_entities_not_exist(): check_entity = ["apple", "orange", "banana"] source_entity = ["pear", "grape", "kiwi"] @@ -419,3 +412,34 @@ def test_text_similarity_with_entities_not_exist(): "remember to address these entities pear, grape, kiwi, in the final answer." ) assert message.lower() == expected_message.lower() + + +@pytest.mark.parametrize( + "bad_uri", + [ + "file://something", + "s3://some-file", + "javascript:some-code", + "garbage", + "FILE://something", + ], +) +def test_check_url_raises_exception_for_unsupported_uri_scheme(bad_uri): + with pytest.raises(ValueError): + check_url(bad_uri) + + +@pytest.mark.parametrize( + "good_uri", + ["http://something.com", "https://something.com"], +) +def test_check_url_returns_true_for_valid_http_url(good_uri): + with patch("requests.get", return_value=True): + result = check_url(good_uri) + assert result is True + + +def test_check_url_returns_false_on_request_error(): + with patch("requests.get", side_effect=Exception("problem")): + result = check_url("https://anything") + assert result is False