added all changes #942

Merged · 1 commit · Oct 23, 2024 · Changes from all commits
README.md (1 change: 1 addition & 0 deletions)

```diff
@@ -67,6 +67,7 @@ More specifically:

 ## Features
 - 📝 Generate research, outlines, resources and lessons reports with local documents and web sources
+- 🖼️ Supports smart article image scraping and filtering
 - 📜 Can generate long and detailed research reports (over 2K words)
 - 🌐 Aggregates over 20 web sources per research to form objective and factual conclusions
 - 🖥️ Includes both lightweight (HTML/CSS/JS) and production ready (NextJS + Tailwind) UX/UI
```
frontend/styles.css (87 changes: 48 additions & 39 deletions)

```diff
@@ -189,65 +189,74 @@ a:hover {
     text-decoration: underline;
 }
 
-/* Add these styles at the end of the file */
+/* Add or modify these styles at the end of the file */
 #selectedImagesContainer {
     background-color: rgba(255, 255, 255, 0.1);
     border-radius: 12px;
     padding: 15px;
     margin-bottom: 20px;
-    color: #fff; /* Ensure text is visible */
+    color: #fff;
+    display: flex;
+    flex-wrap: wrap;
+    gap: 10px;
+    justify-content: center;
 }
 
 #selectedImagesContainer h3 {
     width: 100%;
     margin-top: 0;
     margin-bottom: 10px;
-    color: #fff; /* Ensure header is visible */
+    color: #fff;
 }
 
-.image-dialog {
-    position: fixed;
-    top: 0;
-    left: 0;
-    width: 100%;
-    height: 100%;
-    background-color: rgba(0, 0, 0, 0.8);
-    display: flex;
-    flex-direction: column;
-    justify-content: center;
-    align-items: center;
-    z-index: 1000;
+#selectedImagesContainer img {
+    width: 150px;
+    height: 150px;
+    object-fit: cover;
+    cursor: pointer;
+    transition: transform 0.3s ease, box-shadow 0.3s ease;
+    border-radius: 8px;
 }
 
-.image-dialog img {
-    max-width: 90%;
-    max-height: 80%;
-    object-fit: contain;
-    border-radius: 8px;
-    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
+#selectedImagesContainer img:hover {
+    transform: scale(1.05);
+    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
 }
 
-.image-dialog button {
-    margin-top: 20px;
-    padding: 10px 20px;
-    background-color: #007bff;
-    color: white;
-    border: none;
-    border-radius: 5px;
-    cursor: pointer;
-    font-size: 16px;
-    transition: background-color 0.3s ease;
+.image-dialog {
+    position: fixed;
+    top: 0;
+    left: 0;
+    width: 100%;
+    height: 100%;
+    background-color: rgba(0, 0, 0, 0.8);
+    display: flex;
+    flex-direction: column;
+    justify-content: center;
+    align-items: center;
+    z-index: 1000;
 }
 
-.image-dialog button:hover {
-    background-color: #0056b3;
+.image-dialog img {
+    max-width: 90%;
+    max-height: 80%;
+    object-fit: contain;
+    border-radius: 8px;
+    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
 }
 
-#selectedImagesContainer img {
-    cursor: pointer;
-    transition: transform 0.3s ease, box-shadow 0.3s ease;
+.image-dialog button {
+    margin-top: 20px;
+    padding: 10px 20px;
+    background-color: #007bff;
+    color: white;
+    border: none;
+    border-radius: 5px;
+    cursor: pointer;
+    font-size: 16px;
+    transition: background-color 0.3s ease;
 }
 
-#selectedImagesContainer img:hover {
-    transform: scale(1.05);
-    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
+.image-dialog button:hover {
+    background-color: #0056b3;
 }
```
gpt_researcher/actions/web_scraping.py (2 changes: 1 addition & 1 deletion)

```diff
@@ -30,7 +30,7 @@ def scrape_urls(urls, cfg=None) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
         scraped_data = scraper.run()
         for item in scraped_data:
             if 'image_urls' in item:
-                images.extend([{'url': img_url} for img_url in item['image_urls']])
+                images.extend([img for img in item['image_urls']])
     except Exception as e:
         print(f"{Fore.RED}Error in scrape_urls: {e}{Style.RESET_ALL}")
```
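The scraper previously wrapped each scraped URL in a fresh `{'url': ...}` dict; since `get_relevant_images` (changed below in `scraper/utils.py`) now emits `{'url': ..., 'score': ...}` dicts, the items are passed through untouched so the relevance score survives scraping. A minimal sketch with a made-up scraped item:

```python
# Hypothetical scraped item; 'image_urls' now holds dicts from get_relevant_images.
item = {'image_urls': [{'url': 'https://example.com/hero.jpg', 'score': 3}]}

images = []
if 'image_urls' in item:
    images.extend([img for img in item['image_urls']])

print(images[0]['score'])  # 3 -- the score is preserved for later ranking
```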
gpt_researcher/agent.py (4 changes: 2 additions & 2 deletions)

```diff
@@ -130,8 +130,8 @@ async def get_similar_written_contents_by_draft_section_titles(
         )

     # Utility methods
-    def get_research_images(self) -> List[Dict[str, Any]]:
-        return self.research_images
+    def get_research_images(self, top_k=10) -> List[Dict[str, Any]]:
+        return self.research_images[:top_k]

     def add_research_images(self, images: List[Dict[str, Any]]) -> None:
         self.research_images.extend(images)
```
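`get_research_images` now truncates to the `top_k` highest-priority images (the list is already score-ordered by the scraper). A self-contained sketch of the accessor's behavior; the bare `Researcher` stub is illustrative, standing in for the real agent class:

```python
from typing import Any, Dict, List

class Researcher:
    """Illustrative stub exposing just the two image utility methods."""

    def __init__(self) -> None:
        self.research_images: List[Dict[str, Any]] = []

    def get_research_images(self, top_k=10) -> List[Dict[str, Any]]:
        return self.research_images[:top_k]

    def add_research_images(self, images: List[Dict[str, Any]]) -> None:
        self.research_images.extend(images)

r = Researcher()
r.add_research_images([{'url': f'img{i}.jpg', 'score': 2} for i in range(15)])
print(len(r.get_research_images()))         # 10 (default cap)
print(len(r.get_research_images(top_k=3)))  # 3
```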
gpt_researcher/scraper/scraper.py (1 change: 1 addition & 0 deletions)

```diff
@@ -49,6 +49,7 @@ def extract_data_from_url(self, link, session):

             if len(content) < 100:
                 return {"url": link, "raw_content": None, "image_urls": [], "title": ""}
+
             return {"url": link, "raw_content": content, "image_urls": image_urls, "title": title}
         except Exception as e:
             return {"url": link, "raw_content": None, "image_urls": [], "title": ""}
```
gpt_researcher/scraper/utils.py (54 changes: 43 additions & 11 deletions)

```diff
@@ -1,7 +1,7 @@
 from bs4 import BeautifulSoup
-from urllib.parse import urljoin
+from urllib.parse import urljoin, urlparse, parse_qs
 import logging
-import re
+import hashlib

 def get_relevant_images(soup: BeautifulSoup, url: str) -> list:
     """Extract relevant images from the page"""
@@ -14,24 +14,35 @@ def get_relevant_images(soup: BeautifulSoup, url: str) -> list:
         for img in all_images:
             img_src = urljoin(url, img['src'])
             if img_src.startswith(('http://', 'https://')):
+                score = 0
                 # Check for relevant classes
                 if any(cls in img.get('class', []) for cls in ['header', 'featured', 'hero', 'thumbnail', 'main', 'content']):
-                    image_urls.append((img_src, 3))  # Higher priority
+                    score = 3  # Higher score
                 # Check for size attributes
                 elif img.get('width') and img.get('height'):
                     width = parse_dimension(img['width'])
                     height = parse_dimension(img['height'])
                     if width and height:
-                        if width >= 1200 and height >= 600:
-                            image_urls.append((img_src, 2))  # Medium priority
+                        if width >= 2000 and height >= 1000:
+                            score = 2  # Medium score (very large images)
+                        elif width >= 1600 or height >= 800:
+                            score = 1  # Lower score
                         elif width >= 800 or height >= 400:
-                            image_urls.append((img_src, 1))  # Lower priority
-                        elif width >= 600 or height >= 300:
-                            image_urls.append((img_src, 0))  # Lower priority
+                            score = 0  # Lowest score
+                        else:
+                            continue  # Skip small images
+
+                image_urls.append({'url': img_src, 'score': score})

-        # Sort images by priority (highest first) and then limit to top 10
-        sorted_images = sorted(image_urls, key=lambda x: x[1], reverse=True)
-        return [img[0] for img in sorted_images[:10]]
+        # Sort images by score (highest first)
+        sorted_images = sorted(image_urls, key=lambda x: x['score'], reverse=True)
+
+        # Select all images with score 3 and 2, then add score 1 images up to a total of 10
+        high_score_images = [img for img in sorted_images if img['score'] in [3, 2]]
+        low_score_images = [img for img in sorted_images if img['score'] == 1]
+
+        result = high_score_images + low_score_images[:max(0, 10 - len(high_score_images))]
+        return result[:10]  # Ensure we don't return more than 10 images in total

     except Exception as e:
         logging.error(f"Error in get_relevant_images: {e}")
```
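The selection policy in this hunk: score-3 (class-matched) and score-2 (very large) images are always kept, score-1 images only fill whatever remains of the ten slots, and score-0 images are collected but never selected here. A worked example over hypothetical pre-scored entries:

```python
# Hypothetical pre-scored images, as built by the loop above.
image_urls = (
      [{'url': f'hero{i}.jpg', 'score': 3} for i in range(2)]  # class-matched
    + [{'url': f'big{i}.jpg', 'score': 2} for i in range(5)]   # >= 2000x1000
    + [{'url': f'mid{i}.jpg', 'score': 1} for i in range(8)]   # >= 1600w or 800h
)

sorted_images = sorted(image_urls, key=lambda x: x['score'], reverse=True)
high_score_images = [img for img in sorted_images if img['score'] in [3, 2]]
low_score_images = [img for img in sorted_images if img['score'] == 1]
result = high_score_images + low_score_images[:max(0, 10 - len(high_score_images))]

print(len(result[:10]))  # 10: all 7 high-score images + the first 3 score-1 fillers
```

The second hunk in the same file adds the `get_image_hash` helper used for deduplication: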
```diff
@@ -49,3 +60,24 @@ def parse_dimension(value: str) -> int:
 def extract_title(soup: BeautifulSoup) -> str:
     """Extract the title from the BeautifulSoup object"""
     return soup.title.string if soup.title else ""
+
+def get_image_hash(image_url: str) -> str:
+    """Calculate a simple hash based on the image filename and essential query parameters"""
+    try:
+        parsed_url = urlparse(image_url)
+
+        # Extract the filename
+        filename = parsed_url.path.split('/')[-1]
+
+        # Extract essential query parameters (e.g., 'url' for CDN-served images)
+        query_params = parse_qs(parsed_url.query)
+        essential_params = query_params.get('url', [])
+
+        # Combine filename and essential parameters
+        image_identifier = filename + ''.join(essential_params)
+
+        # Calculate hash
+        return hashlib.md5(image_identifier.encode()).hexdigest()
+    except Exception as e:
+        logging.error(f"Error calculating image hash for {image_url}: {e}")
+        return None
```
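Two URLs pointing at the same file but with different sizing or cache-busting parameters now collapse to one hash, while CDN proxy URLs that wrap the origin image in a `url` parameter stay distinguishable. A usage sketch (the URLs are made up, and this assumes `get_image_hash` is importable as shown):

```python
from gpt_researcher.scraper.utils import get_image_hash

# Same file, different resize/cache parameters -> identical hash.
a = get_image_hash("https://cdn.example.com/img/hero.jpg?w=1200&v=1")
b = get_image_hash("https://cdn.example.com/img/hero.jpg?w=640&v=2")
assert a == b  # only the filename 'hero.jpg' feeds the hash here

# Proxy-style URLs: the wrapped origin image rides in the 'url' parameter,
# so two different wrapped images still hash differently.
c = get_image_hash("https://proxy.example.com/resize?url=https://a.com/x.jpg")
d = get_image_hash("https://proxy.example.com/resize?url=https://a.com/y.jpg")
assert c != d
```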
gpt_researcher/skills/browser.py (28 changes: 14 additions & 14 deletions)

```diff
@@ -1,9 +1,8 @@
-from typing import List, Dict, Optional, Set
-import hashlib
-import re
+from typing import List, Dict

 from ..actions.utils import stream_output
 from ..actions.web_scraping import scrape_urls
+from ..scraper.utils import get_image_hash  # Add this import


 class BrowserManager:
@@ -61,25 +60,26 @@ async def browse_urls(self, urls: List[str]) -> List[Dict]:

     def select_top_images(self, images: List[Dict], k: int = 2) -> List[str]:
         """
-        Select top k images and remove duplicates.
+        Select most relevant images and remove duplicates based on image content.

         Args:
-            images (List[Dict]): List of image dictionaries with 'url' keys.
-            k (int): Number of top images to select.
+            images (List[Dict]): List of image dictionaries with 'url' and 'score' keys.
+            k (int): Number of top images to select if no high-score images are found.

         Returns:
-            List[str]: List of selected top image URLs.
+            List[str]: List of selected image URLs.
         """
-        # Remove duplicates based on image URL
         unique_images = []
-        image_hashes = set()
+        seen_hashes = set()
        current_research_images = self.researcher.get_research_images()

-        for img in images:
-            img_hash = hashlib.md5(img['url'].encode()).hexdigest()
-            if img_hash not in image_hashes and img_hash not in {hashlib.md5(existing_img.encode()).hexdigest()
-                                                                 for existing_img in current_research_images}:
-                image_hashes.add(img_hash)
+        # First, select all score 2 and 3 images
+        high_score_images = [img for img in images if img['score'] >= 2]
+
+        for img in high_score_images + images:  # Process high-score images first, then all images
+            img_hash = get_image_hash(img['url'])
+            if img_hash and img_hash not in seen_hashes and img['url'] not in current_research_images:
+                seen_hashes.add(img_hash)
                 unique_images.append(img['url'])

             if len(unique_images) == k:
```
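Because `high_score_images` is iterated before the full list, when two URLs hash to the same image the higher-scored variant wins the slot. A standalone rendition of the loop (illustrative; the real method lives on `BrowserManager` and compares against the researcher's already-collected images):

```python
from gpt_researcher.scraper.utils import get_image_hash

images = [
    {'url': 'https://a.com/dup.jpg?w=640', 'score': 1},
    {'url': 'https://a.com/dup.jpg?w=1200', 'score': 3},  # same file, higher score
    {'url': 'https://a.com/other.jpg', 'score': 2},
]
current_research_images = []  # URLs already attached to the research

unique_images, seen_hashes = [], set()
high_score_images = [img for img in images if img['score'] >= 2]
for img in high_score_images + images:  # high-score first, then everything
    img_hash = get_image_hash(img['url'])
    if img_hash and img_hash not in seen_hashes and img['url'] not in current_research_images:
        seen_hashes.add(img_hash)
        unique_images.append(img['url'])

print(unique_images)  # ['https://a.com/dup.jpg?w=1200', 'https://a.com/other.jpg']
```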
gpt_researcher/skills/writer.py (5 changes: 3 additions & 2 deletions)

```diff
@@ -40,11 +40,12 @@ async def write_report(self, existing_headers: list = [], relevant_written_contents: list = []):
             str: The generated report.
         """
         # send the selected images prior to writing report
-        if self.researcher.research_images:
+        research_images = self.researcher.get_research_images()
+        if research_images:
             await stream_output(
                 "images",
                 "selected_images",
-                json.dumps(self.researcher.research_images),
+                json.dumps(research_images),
                 self.researcher.websocket,
             )
```
Expand Down