Commit e714999
Merge branch 'master' of github.com:assafelovic/gpt-researcher
assafelovic committed Oct 31, 2024
2 parents cf031ab + e1f0fc5 commit e714999
Showing 7 changed files with 85 additions and 50 deletions.
7 changes: 0 additions & 7 deletions Dockerfile
@@ -29,13 +29,6 @@ RUN pip install --no-cache-dir -r requirements.txt && \
 # Stage 3: Final stage with non-root user and app
 FROM gpt-researcher-install AS gpt-researcher
 
-# Use environment variables for API keys (defaults can be overridden at runtime)
-ARG OPENAI_API_KEY
-ARG TAVILY_API_KEY
-
-ENV OPENAI_API_KEY=${OPENAI_API_KEY}
-ENV TAVILY_API_KEY=${TAVILY_API_KEY}
-
 # Create a non-root user for security
 RUN useradd -ms /bin/bash gpt-researcher && \
     chown -R gpt-researcher:gpt-researcher /usr/src/app
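With the ARG/ENV defaults gone, API keys are no longer baked into the image at build time and must be supplied when the container starts. A minimal sketch of the runtime invocation (image name and key values are placeholders):

docker run \
  -e OPENAI_API_KEY=your-openai-key \
  -e TAVILY_API_KEY=your-tavily-key \
  gpt-researcher

This keeps secrets out of image layers and build caches, which is the usual motivation for dropping build-time ARGs for credentials.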
2 changes: 1 addition & 1 deletion gpt_researcher/llm_provider/generic/base.py
@@ -68,7 +68,7 @@ def from_provider(cls, provider: str, **kwargs: Any):
             llm = ChatFireworks(**kwargs)
         elif provider == "ollama":
             _check_pkg("langchain_community")
-            from langchain_community.chat_models import ChatOllama
+            from langchain_ollama import ChatOllama
 
             llm = ChatOllama(base_url=os.environ["OLLAMA_BASE_URL"], **kwargs)
         elif provider == "together":
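For reference, a minimal sketch of the new import path in use; the model name is illustrative, and OLLAMA_BASE_URL is assumed to point at a running Ollama server:

import os
from langchain_ollama import ChatOllama

# Assumes OLLAMA_BASE_URL is set, e.g. http://localhost:11434
llm = ChatOllama(base_url=os.environ["OLLAMA_BASE_URL"], model="llama3")
response = llm.invoke("Summarize the role of a retriever in one sentence.")
print(response.content)

The langchain-ollama package is the maintained home for these classes now that the langchain_community versions are deprecated.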
2 changes: 1 addition & 1 deletion gpt_researcher/memory/embeddings.py
@@ -17,7 +17,7 @@ def __init__(self, embedding_provider: str, model: str, **embdding_kwargs: Any):
         _embeddings = None
         match embedding_provider:
             case "ollama":
-                from langchain_community.embeddings import OllamaEmbeddings
+                from langchain_ollama import OllamaEmbeddings
 
                 _embeddings = OllamaEmbeddings(
                     model=model,
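Likewise, a short sketch of the embeddings class from its new package (the model name is illustrative and assumed to have been pulled into a local Ollama server):

from langchain_ollama import OllamaEmbeddings

embeddings = OllamaEmbeddings(model="nomic-embed-text", base_url="http://localhost:11434")
vector = embeddings.embed_query("What is task decomposition?")
print(len(vector))  # dimensionality of the embedding vector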
84 changes: 58 additions & 26 deletions gpt_researcher/retrievers/searx/searx.py
@@ -1,45 +1,77 @@
 # Tavily API Retriever
 
 # libraries
 import os
-from langchain_community.utilities import SearxSearchWrapper
+import json
+import requests
+from typing import List, Dict
+from urllib.parse import urljoin
 
 
 class SearxSearch():
     """
-    Tavily API Retriever
+    SearxNG API Retriever
     """
-    def __init__(self, query):
+    def __init__(self, query: str):
         """
-        Initializes the TavilySearch object
+        Initializes the SearxSearch object
         Args:
-            query:
+            query: Search query string
         """
         self.query = query
-        self.api_key = self.get_api_key()
+        self.base_url = self.get_searxng_url()
 
-    def get_api_key(self):
+    def get_searxng_url(self) -> str:
         """
-        Gets the Tavily API key
+        Gets the SearxNG instance URL from environment variables
         Returns:
+            str: Base URL of SearxNG instance
         """
-        # Get the API key
         try:
-            api_key = os.environ["SEARX_URL"]
-        except:
-            raise Exception("Searx URL key not found. Please set the SEARX_URL environment variable. "
-                            "You can get your key from https://searx.space/")
-        return api_key
+            base_url = os.environ["SEARX_URL"]
+            if not base_url.endswith('/'):
+                base_url += '/'
+            return base_url
+        except KeyError:
+            raise Exception(
+                "SearxNG URL not found. Please set the SEARX_URL environment variable. "
+                "You can find public instances at https://searx.space/"
+            )
 
-    def search(self, max_results=7):
+    def search(self, max_results: int = 10) -> List[Dict[str, str]]:
         """
-        Searches the query
+        Searches the query using SearxNG API
+        Args:
+            max_results: Maximum number of results to return
         Returns:
+            List of dictionaries containing search results
         """
-        searx = SearxSearchWrapper(searx_host=os.environ["SEARX_URL"])
-        results = searx.results(self.query, max_results)
-        # Normalizing results to match the format of the other search APIs
-        search_response = [{"href": obj["link"], "body": obj["snippet"]} for obj in results]
-        return search_response
+        search_url = urljoin(self.base_url, "search")
+
+        params = {
+            # The search query.
+            'q': self.query,
+            # Output format of results. Format needs to be activated in searxng config.
+            'format': 'json'
+        }
+
+        try:
+            response = requests.get(
+                search_url,
+                params=params,
+                headers={'Accept': 'application/json'}
+            )
+            response.raise_for_status()
+            results = response.json()
+
+            # Normalize results to match the expected format
+            search_response = []
+            for result in results.get('results', [])[:max_results]:
+                search_response.append({
+                    "href": result.get('url', ''),
+                    "body": result.get('content', '')
+                })
+
+            return search_response
+
+        except requests.exceptions.RequestException as e:
+            raise Exception(f"Error querying SearxNG: {str(e)}")
+        except json.JSONDecodeError:
+            raise Exception("Error parsing SearxNG response")
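A quick usage sketch of the rewritten retriever. The instance URL is a placeholder, the import path is inferred from the file location above, and the SearxNG instance is assumed to have the JSON output format enabled in its settings:

import os
os.environ["SEARX_URL"] = "https://searx.example.org"  # placeholder instance

from gpt_researcher.retrievers.searx.searx import SearxSearch

retriever = SearxSearch("open source research agents")
for result in retriever.search(max_results=5):
    print(result["href"], "-", result["body"][:80])

Note that SEARX_URL must be set before SearxSearch is constructed, since get_searxng_url() reads it in __init__.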
2 changes: 1 addition & 1 deletion gpt_researcher/scraper/arxiv/arxiv.py
@@ -18,5 +18,5 @@ def scrape(self):
         """
         query = self.link.split("/")[-1]
         retriever = ArxivRetriever(load_max_docs=2, doc_content_chars_max=None)
-        docs = retriever.get_relevant_documents(query=query)
+        docs = retriever.invoke(query=query)
         return docs[0].page_content
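get_relevant_documents is deprecated in recent LangChain releases in favor of the Runnable-style invoke. A standalone sketch of the new call (the arXiv ID is illustrative):

from langchain_community.retrievers import ArxivRetriever

retriever = ArxivRetriever(load_max_docs=2, doc_content_chars_max=None)
docs = retriever.invoke("2310.06825")  # the ID taken from the end of an arxiv.org link
print(docs[0].page_content[:200])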
1 change: 1 addition & 0 deletions requirements.txt
@@ -12,6 +12,7 @@ markdown
 langchain
 langchain_community
 langchain-openai
+langchain-ollama
 langgraph
 tiktoken
 gpt-researcher
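For an already-provisioned environment, the new dependency can be pulled in directly rather than reinstalling everything:

pip install langchain-ollama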
37 changes: 23 additions & 14 deletions tests/gptr-logs-handler.py
@@ -1,27 +1,35 @@
-from typing import Dict, Any
+import logging
+from typing import List, Dict, Any
 import asyncio
 from gpt_researcher import GPTResearcher
 
 class CustomLogsHandler:
     """A custom Logs handler class to handle JSON data."""
     def __init__(self):
-        self.logs = []  # Initialize logs to store data
+        self.logs: List[Dict[str, Any]] = []  # Initialize logs to store data
+        logging.basicConfig(level=logging.INFO)  # Set up logging configuration
 
     async def send_json(self, data: Dict[str, Any]) -> None:
-        """Send JSON data and log it."""
-        self.logs.append(data)  # Append data to logs
-        print(f"My custom Log: {data}")  # For demonstration, print the log
-
-async def run():
-    # Define the necessary parameters with sample values
-
+        """Send JSON data and log it, with error handling."""
+        try:
+            self.logs.append(data)  # Append data to logs
+            logging.info(f"My custom Log: {data}")  # Use logging instead of print
+        except Exception as e:
+            logging.error(f"Error logging data: {e}")  # Log any errors
+
+    def clear_logs(self) -> None:
+        """Clear the logs."""
+        self.logs.clear()  # Clear the logs list
+        logging.info("Logs cleared.")  # Log the clearing action
+
+async def run() -> None:
+    """Run the research process and generate a report."""
     query = "What happened in the latest burning man floods?"
-    report_type = "research_report"  # Type of report to generate
-    report_source = "online"  # Could specify source like 'online', 'books', etc.
-    tone = "informative"  # Tone of the report ('informative', 'casual', etc.)
-    config_path = None  # Path to a config file, if needed
+    report_type = "research_report"
+    report_source = "online"
+    tone = "informative"
+    config_path = None
 
     # Initialize researcher with a custom WebSocket
     custom_logs_handler = CustomLogsHandler()
 
     researcher = GPTResearcher(
@@ -35,6 +43,7 @@ async def run():
 
     await researcher.conduct_research()  # Conduct the research
     report = await researcher.write_report()  # Write the research report
+    logging.info("Report generated successfully.")  # Log report generation
 
     return report
 
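The diff stops before the end of the file; a typical entry point for running this test script directly, assuming run() returns the report as shown above, would be:

if __name__ == "__main__":
    report = asyncio.run(run())
    print(report)

asyncio is already imported at the top of the file, so no further setup is needed.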
