Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix security warnings #365

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
9 changes: 8 additions & 1 deletion src/apps/slackapp/slackapp/bolt_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,14 @@ def main():
logger.info(
"App init: starting HTTP server on port {port}".format(port=cfg.SLACK_PORT)
)
flask_app.run(host="0.0.0.0", port=cfg.SLACK_PORT, debug=cfg.FLASK_DEBUG)
# SECURITY host "0.0.0.0" tells Flask to listen on all available IP addresses.
# This is handy for development, but unsafe in production.
# See https://bandit.readthedocs.io/en/1.7.8/plugins/b104_hardcoded_bind_all_interfaces.html.
# In production you would typically place the Flask server behind a WSGI
# server like Gunicorn and a reverse proxy, and implement other security measures.
flask_app.run(
host="0.0.0.0", port=cfg.SLACK_PORT, debug=cfg.FLASK_DEBUG # nosec B104
)


# Start the HTTP server
Expand Down
119 changes: 118 additions & 1 deletion src/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions src/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ pytest-cov = "^4.1.0"


[tool.poetry.group.lint.dependencies]
bandit = "^1.7.8"
black = "^23.7.0"
flake8 = "^6.1.0"
isort = "^5.12.0"
Expand Down
1 change: 0 additions & 1 deletion src/sherpa_ai/database/user_usage_tracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import boto3
import sqlalchemy.orm
import sqlalchemy.orm
from anyio import Path
from sqlalchemy import TIMESTAMP, Boolean, Column, Integer, String, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker
Expand Down
11 changes: 9 additions & 2 deletions src/sherpa_ai/scrape/extract_github_readme.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
from sherpa_ai.connectors.vectorstores import ConversationStore


GITHUB_REQUEST_TIMEOUT = 2.5


def get_owner_and_repo(url):
"""
Extracts the owner and repository name from a GitHub repository URL.
Expand Down Expand Up @@ -49,7 +52,9 @@ def extract_github_readme(repo_url):
"X-GitHub-Api-Version": "2022-11-28",
}

response = requests.get(github_api_url, headers=headers)
response = requests.get(
github_api_url, headers=headers, timeout=GITHUB_REQUEST_TIMEOUT
)

files = response.json()
if type(files) is dict and files["message"].lower() == "bad credentials":
Expand All @@ -72,7 +77,9 @@ def extract_github_readme(repo_url):
"X-GitHub-Api-Version": "2022-11-28",
}

response = requests.get(github_api_url, headers=headers)
response = requests.get(
github_api_url, headers=headers, timeout=GITHUB_REQUEST_TIMEOUT
)
data = response.json()
if "content" in data:
content = base64.b64decode(data["content"]).decode("utf-8")
Expand Down
7 changes: 6 additions & 1 deletion src/sherpa_ai/scrape/file_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
)


DOWNLOAD_TIMEOUT = 2.5


class QuestionWithFileHandler:
def __init__(self, question, files, token, user_id, team_id):
"""
Expand Down Expand Up @@ -66,7 +69,9 @@ def download_file(self, file):
"Authorization": f"Bearer {self.token}",
"Accept": file["mimetype"],
}
response = requests.get(file["url_private_download"], headers=headers)
response = requests.get(
file["url_private_download"], headers=headers, timeout=DOWNLOAD_TIMEOUT
)
destination = file["id"] + file["filetype"]

# Check if the request was successful (HTTP status code 200)
Expand Down
11 changes: 5 additions & 6 deletions src/sherpa_ai/tools.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
import os
import re
import urllib
import urllib.parse
import urllib.request
from typing import Any, List, Tuple, Union
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
Expand All @@ -21,6 +17,9 @@
from sherpa_ai.output_parser import TaskAction


HTTP_GET_TIMEOUT = 2.5


def get_tools(memory, config):
tools = []

Expand Down Expand Up @@ -56,8 +55,8 @@ def _run(self, query: str) -> str:
+ "&start=0&max_results="
+ str(top_k)
)
data = urllib.request.urlopen(url)
xml_content = data.read().decode("utf-8")
data = requests.get(url, timeout=HTTP_GET_TIMEOUT)
xml_content = data.text

summary_pattern = r"<summary>(.*?)</summary>"
summaries = re.findall(summary_pattern, xml_content, re.DOTALL)
Expand Down
37 changes: 17 additions & 20 deletions src/sherpa_ai/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
import json
import re
from typing import List, Optional, Union
from urllib.error import HTTPError, URLError
from urllib.parse import urlparse
from urllib.request import urlopen

import requests
import spacy
Expand All @@ -23,6 +21,9 @@
from sherpa_ai.models.sherpa_base_model import SherpaOpenAI


HTTP_GET_TIMEOUT = 2.5


def load_files(files: List[str]) -> List[Document]:
documents = []
loader = None
Expand Down Expand Up @@ -83,7 +84,7 @@ def get_link_from_slack_client_conversation(data):


def scrape_with_url(url: str):
response = requests.get(url)
response = requests.get(url, timeout=HTTP_GET_TIMEOUT)
soup = BeautifulSoup(response.content, "html.parser")
data = soup.get_text(strip=True)
status = response.status_code
Expand Down Expand Up @@ -288,24 +289,20 @@ def extract_urls(text):

def check_url(url):
"""
Opens `url` to test its validity.
Performs an HTTP GET request on `url` to test its validity.

Returns True if `url` can be opened, False otherwise.
Returns True if GET succeeds, False otherwise.
"""

try:
_ = urlopen(url)

except HTTPError as e:
logger.info("HTTP error", e)
return False

except URLError as e:
logger.info("Oops ! Page not found!", e)
return False

if urlparse(url).scheme in ["http", "https"]:
try:
_ = requests.get(url, timeout=HTTP_GET_TIMEOUT)
return True
except Exception as e:
logger.info(f"{e} - {url}")
return False
else:
return True
raise ValueError(f"URL must conform to HTTP(S) scheme: {url}")


def extract_numbers_from_text(text):
Expand Down Expand Up @@ -497,7 +494,7 @@ def json_from_text(text: str):
Returns:
dict: Parsed JSON data.
"""
if type(text) == str:
if isinstance(text, str):
text = text.replace("\n", "")
json_pattern = r"\{.*\}"
json_match = re.search(json_pattern, text)
Expand All @@ -507,7 +504,7 @@ def json_from_text(text: str):
try:
parsed_json = json.loads(json_data)
return parsed_json
except json.JSONDecodeError as e:
except json.JSONDecodeError:
return {}
else:
return {}
Expand Down Expand Up @@ -554,7 +551,7 @@ def text_similarity_by_llm(
prompt = (
instruction
+ """
only return {"entity_exist": true , "messages":"" } if all entities are mentioned inside the answer in
only return {"entity_exist": true , "messages":"" } if all entities are mentioned inside the answer in
only return {"entity_exist": false , "messages": " Entity x hasn't been mentioned inside the answer"} if the entity is not mentioned properly .
"""
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
],
)
def test_entity_citation_succeeds_in_qa(
get_llm, test_id, objective, input_data, expected_entities
get_llm, test_id, objective, input_data, expected_entities # noqa: F811
):
llm = get_llm(
__file__, test_entity_citation_succeeds_in_qa.__name__ + f"_{str(test_id)}"
Expand Down
Loading