diff --git a/tools/prediction_request_sme.py b/tools/prediction_request_sme.py
index 066e9ce0..039559bc 100644
--- a/tools/prediction_request_sme.py
+++ b/tools/prediction_request_sme.py
@@ -28,7 +28,6 @@
 from bs4 import BeautifulSoup
 from googleapiclient.discovery import build
-
 
 NUM_URLS_EXTRACT = 5
 DEFAULT_OPENAI_SETTINGS = {
     "max_tokens": 500,
@@ -111,7 +110,6 @@
 * Output only the JSON object. Do not include any other contents in your response.
 """
-
 
 SME_GENERATION_MARKET_PROMPT = """
 task question: "{question}"
 """
@@ -150,7 +148,6 @@
 """
-
 
 def search_google(query: str, api_key: str, engine: str, num: int = 3) -> List[str]:
     service = build("customsearch", "v1", developerKey=api_key)
     search = (
@@ -170,10 +167,10 @@ def get_urls_from_queries(queries: List[str], api_key: str, engine: str) -> List
     results = []
     for query in queries:
         for url in search_google(
-            query=query,
-            api_key=api_key,
-            engine=engine,
-            num=3,  # Number of returned results
+          query=query,
+          api_key=api_key,
+          engine=engine,
+          num=3,  # Number of returned results
         ):
             results.append(url)
     unique_results = list(set(results))
@@ -181,8 +178,8 @@ def get_urls_from_queries(queries: List[str], api_key: str, engine: str) -> List
 
 
 def extract_text(
-    html: str,
-    num_words: int = 300,  # TODO: summerise using GPT instead of limit
+  html: str,
+  num_words: int = 300,  # TODO: summarise using GPT instead of limit
 ) -> str:
     """Extract text from a single HTML document"""
     soup = BeautifulSoup(html, "html.parser")
@@ -196,15 +193,16 @@ def extract_text(
 
 
 def process_in_batches(
-    urls: List[str], window: int = 5, timeout: int = 10
+  urls: List[str], window: int = 5, timeout: int = 10
 ) -> Generator[None, None, List[Tuple[Future, str]]]:
     """Iter URLs in batches."""
     with ThreadPoolExecutor() as executor:
         for i in range(0, len(urls), window):
-            batch = urls[i : i + window]
+            batch = urls[i: i + window]
             futures = [(executor.submit(requests.get, url, timeout=timeout), url) for url in batch]
             yield futures
+
 
 def extract_texts(urls: List[str], num_words: int = 300) -> List[str]:
     """Extract texts from URLs"""
     max_allowed = 5
@@ -225,19 +223,19 @@ def extract_texts(urls: List[str], num_words: int = 300) -> List[str]:
         except requests.exceptions.ReadTimeout:
             print(f"Request timed out: {url}.")
         except Exception as e:
-            print(f"An error occurred: {e}")
+          print(f"An error occurred: {e}")
         if stop:
             break
     return extracted_texts
 
 
 def fetch_additional_information(
-    prompt: str,
-    engine: str,
-    temperature: float,
-    max_tokens: int,
-    google_api_key: str,
-    google_engine: str,
+  prompt: str,
+  engine: str,
+  temperature: float,
+  max_tokens: int,
+  google_api_key: str,
+  google_engine: str,
 ) -> str:
     """Fetch additional information."""
     url_query_prompt = URL_QUERY_PROMPT.format(user_prompt=prompt)
@@ -288,8 +286,8 @@ def get_sme_role(engine, temperature, max_tokens, prompt) -> Tuple[str, str]:
         stop=None,
     )
     generated_sme_roles = response.choices[0].message.content
-    # check whether the generated_sme_roles is valid json
-    sme = json.loads(generated_sme_roles)
+    # the model replies with a JSON list of SME roles; use the first one
+    sme = json.loads(generated_sme_roles)[0]
     return sme["sme"], sme["sme_introduction"]
@@ -353,3 +350,19 @@ def run(**kwargs) -> Tuple[str, Optional[Dict[str, Any]]]:
     return response.choices[0].message.content, None
+
+
+if __name__ == "__main__":
+    # Example request for manual testing only. The API keys below are
+    # placeholders; supply real credentials locally and never commit them.
+    example_request = {
+        "prompt": '"Will the Disney Royal collections be available for purchase after the World Princess Week by 29 August 2023?" and the `yes` option represented by `Yes` and the `no` option represented by `No`, what are the respective probabilities of `p_yes` and `p_no` occurring?',
+        "tool": "prediction-online-sme",
+        "nonce": "3a32f8ad-efd1-43b9-adc7-967065321a10",
+        "api_keys": {
+            "openai": "<openai-api-key>",
+            "google_api_key": "<google-api-key>",
+            "google_engine_id": "<google-engine-id>",
+        },
+    }
+    print(run(**example_request))
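
Note on the process_in_batches hunk: the function is a generator that yields one list of (Future, url) pairs per batch, which the caller is expected to drain. A minimal consumption sketch under that assumption (example_urls and the error handling are illustrative, not part of the patch):

    # Drain each batch of futures as the generator yields it.
    example_urls = ["https://example.com/a", "https://example.com/b"]
    texts = []
    for batch in process_in_batches(example_urls, window=5, timeout=10):
        for future, url in batch:
            try:
                response = future.result()  # requests.Response from executor.submit
                texts.append(extract_text(response.text))
            except Exception as exc:
                print(f"Failed to fetch {url}: {exc}")

This mirrors what extract_texts already does in the file: submit the GET requests through the ThreadPoolExecutor, then collect the results batch by batch.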
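
Note on the get_sme_role hunk: the [0] index assumes the SME-generation model replies with a JSON array of role objects rather than a single object. A sketch of the payload shape this parse expects (the content below is illustrative, not a real model response):

    import json

    # Hypothetical model reply: a list containing one SME role object.
    generated_sme_roles = (
        '[{"sme": "Disney merchandising analyst",'
        ' "sme_introduction": "You are an expert on Disney retail releases."}]'
    )
    sme = json.loads(generated_sme_roles)[0]
    print(sme["sme"], "-", sme["sme_introduction"])

If the model returns a bare object instead of a list, indexing with [0] raises a KeyError, so the expected reply format should be pinned down in the SME generation prompt.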