added all changes #942

Merged · 1 commit · Oct 23, 2024 · Changes from all commits
README.md (1 change: 1 addition & 0 deletions)

```diff
@@ -67,6 +67,7 @@ More specifically:

 ## Features
 - 📝 Generate research, outlines, resources and lessons reports with local documents and web sources
+- 🖼️ Supports smart article image scraping and filtering
 - 📜 Can generate long and detailed research reports (over 2K words)
 - 🌐 Aggregates over 20 web sources per research to form objective and factual conclusions
 - 🖥️ Includes both lightweight (HTML/CSS/JS) and production ready (NextJS + Tailwind) UX/UI
```
frontend/styles.css (87 changes: 48 additions & 39 deletions)

```diff
@@ -189,65 +189,74 @@ a:hover {
     text-decoration: underline;
 }
 
-/* Add these styles at the end of the file */
+/* Add or modify these styles at the end of the file */
 #selectedImagesContainer {
     background-color: rgba(255, 255, 255, 0.1);
     border-radius: 12px;
     padding: 15px;
     margin-bottom: 20px;
-    color: #fff; /* Ensure text is visible */
+    color: #fff;
+    display: flex;
+    flex-wrap: wrap;
+    gap: 10px;
+    justify-content: center;
 }
 
 #selectedImagesContainer h3 {
     width: 100%;
     margin-top: 0;
     margin-bottom: 10px;
-    color: #fff; /* Ensure header is visible */
+    color: #fff;
 }
 
-.image-dialog {
-    position: fixed;
-    top: 0;
-    left: 0;
-    width: 100%;
-    height: 100%;
-    background-color: rgba(0, 0, 0, 0.8);
-    display: flex;
-    flex-direction: column;
-    justify-content: center;
-    align-items: center;
-    z-index: 1000;
+#selectedImagesContainer img {
+    width: 150px;
+    height: 150px;
+    object-fit: cover;
+    cursor: pointer;
+    transition: transform 0.3s ease, box-shadow 0.3s ease;
+    border-radius: 8px;
 }
 
-.image-dialog img {
-    max-width: 90%;
-    max-height: 80%;
-    object-fit: contain;
-    border-radius: 8px;
-    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
+#selectedImagesContainer img:hover {
+    transform: scale(1.05);
+    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
 }
 
-.image-dialog button {
-    margin-top: 20px;
-    padding: 10px 20px;
-    background-color: #007bff;
-    color: white;
-    border: none;
-    border-radius: 5px;
-    cursor: pointer;
-    font-size: 16px;
-    transition: background-color 0.3s ease;
+.image-dialog {
+    position: fixed;
+    top: 0;
+    left: 0;
+    width: 100%;
+    height: 100%;
+    background-color: rgba(0, 0, 0, 0.8);
+    display: flex;
+    flex-direction: column;
+    justify-content: center;
+    align-items: center;
+    z-index: 1000;
 }
 
-.image-dialog button:hover {
-    background-color: #0056b3;
+.image-dialog img {
+    max-width: 90%;
+    max-height: 80%;
+    object-fit: contain;
+    border-radius: 8px;
+    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
 }
 
-#selectedImagesContainer img {
-    cursor: pointer;
-    transition: transform 0.3s ease, box-shadow 0.3s ease;
+.image-dialog button {
+    margin-top: 20px;
+    padding: 10px 20px;
+    background-color: #007bff;
+    color: white;
+    border: none;
+    border-radius: 5px;
+    cursor: pointer;
+    font-size: 16px;
+    transition: background-color 0.3s ease;
 }
 
-#selectedImagesContainer img:hover {
-    transform: scale(1.05);
-    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
+.image-dialog button:hover {
+    background-color: #0056b3;
 }
```
gpt_researcher/actions/web_scraping.py (2 changes: 1 addition & 1 deletion)

```diff
@@ -30,7 +30,7 @@ def scrape_urls(urls, cfg=None) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
         scraped_data = scraper.run()
         for item in scraped_data:
             if 'image_urls' in item:
-                images.extend([{'url': img_url} for img_url in item['image_urls']])
+                images.extend([img for img in item['image_urls']])
     except Exception as e:
         print(f"{Fore.RED}Error in scrape_urls: {e}{Style.RESET_ALL}")
```
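The scraper previously wrapped each scraped URL in a fresh `{'url': ...}` dict; since `get_relevant_images` (changed below in `scraper/utils.py`) now emits `{'url': ..., 'score': ...}` dicts, the items are passed through untouched so the relevance score survives scraping. A minimal sketch with a made-up scraped item:

```python
# Hypothetical scraped item; 'image_urls' now holds dicts from get_relevant_images.
item = {'image_urls': [{'url': 'https://example.com/hero.jpg', 'score': 3}]}

images = []
if 'image_urls' in item:
    images.extend([img for img in item['image_urls']])

print(images[0]['score'])  # 3 -- the score is preserved for later ranking
```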
gpt_researcher/agent.py (4 changes: 2 additions & 2 deletions)

```diff
@@ -130,8 +130,8 @@ async def get_similar_written_contents_by_draft_section_titles(
         )

     # Utility methods
-    def get_research_images(self) -> List[Dict[str, Any]]:
-        return self.research_images
+    def get_research_images(self, top_k=10) -> List[Dict[str, Any]]:
+        return self.research_images[:top_k]

     def add_research_images(self, images: List[Dict[str, Any]]) -> None:
         self.research_images.extend(images)
```
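`get_research_images` now truncates to the `top_k` highest-priority images (the list is already score-ordered by the scraper). A self-contained sketch of the accessor's behavior; the bare `Researcher` stub is illustrative, standing in for the real agent class:

```python
from typing import Any, Dict, List

class Researcher:
    """Illustrative stub exposing just the two image utility methods."""

    def __init__(self) -> None:
        self.research_images: List[Dict[str, Any]] = []

    def get_research_images(self, top_k=10) -> List[Dict[str, Any]]:
        return self.research_images[:top_k]

    def add_research_images(self, images: List[Dict[str, Any]]) -> None:
        self.research_images.extend(images)

r = Researcher()
r.add_research_images([{'url': f'img{i}.jpg', 'score': 2} for i in range(15)])
print(len(r.get_research_images()))         # 10 (default cap)
print(len(r.get_research_images(top_k=3)))  # 3
```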
gpt_researcher/scraper/scraper.py (1 change: 1 addition & 0 deletions)

```diff
@@ -49,6 +49,7 @@ def extract_data_from_url(self, link, session):

             if len(content) < 100:
                 return {"url": link, "raw_content": None, "image_urls": [], "title": ""}
+
             return {"url": link, "raw_content": content, "image_urls": image_urls, "title": title}
         except Exception as e:
             return {"url": link, "raw_content": None, "image_urls": [], "title": ""}
```
gpt_researcher/scraper/utils.py (54 changes: 43 additions & 11 deletions)

```diff
@@ -1,7 +1,7 @@
 from bs4 import BeautifulSoup
-from urllib.parse import urljoin
+from urllib.parse import urljoin, urlparse, parse_qs
 import logging
-import re
+import hashlib

 def get_relevant_images(soup: BeautifulSoup, url: str) -> list:
     """Extract relevant images from the page"""
@@ -14,24 +14,35 @@ def get_relevant_images(soup: BeautifulSoup, url: str) -> list:
         for img in all_images:
             img_src = urljoin(url, img['src'])
             if img_src.startswith(('http://', 'https://')):
+                score = 0
                 # Check for relevant classes
                 if any(cls in img.get('class', []) for cls in ['header', 'featured', 'hero', 'thumbnail', 'main', 'content']):
-                    image_urls.append((img_src, 3))  # Higher priority
+                    score = 3  # Higher score
                 # Check for size attributes
                 elif img.get('width') and img.get('height'):
                     width = parse_dimension(img['width'])
                     height = parse_dimension(img['height'])
                     if width and height:
-                        if width >= 1200 and height >= 600:
-                            image_urls.append((img_src, 2))  # Medium priority
+                        if width >= 2000 and height >= 1000:
+                            score = 2  # Medium score (very large images)
+                        elif width >= 1600 or height >= 800:
+                            score = 1  # Lower score
                         elif width >= 800 or height >= 400:
-                            image_urls.append((img_src, 1))  # Lower priority
-                        elif width >= 600 or height >= 300:
-                            image_urls.append((img_src, 0))  # Lower priority
+                            score = 0  # Lowest score
+                        else:
+                            continue  # Skip small images
+
+                image_urls.append({'url': img_src, 'score': score})

-        # Sort images by priority (highest first) and then limit to top 10
-        sorted_images = sorted(image_urls, key=lambda x: x[1], reverse=True)
-        return [img[0] for img in sorted_images[:10]]
+        # Sort images by score (highest first)
+        sorted_images = sorted(image_urls, key=lambda x: x['score'], reverse=True)
+
+        # Select all images with score 3 and 2, then add score 1 images up to a total of 10
+        high_score_images = [img for img in sorted_images if img['score'] in [3, 2]]
+        low_score_images = [img for img in sorted_images if img['score'] == 1]
+
+        result = high_score_images + low_score_images[:max(0, 10 - len(high_score_images))]
+        return result[:10]  # Ensure we don't return more than 10 images in total

     except Exception as e:
         logging.error(f"Error in get_relevant_images: {e}")
```
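The selection policy in this hunk: score-3 (class-matched) and score-2 (very large) images are always kept, score-1 images only fill whatever remains of the ten slots, and score-0 images are collected but never selected here. A worked example over hypothetical pre-scored entries:

```python
# Hypothetical pre-scored images, as built by the loop above.
image_urls = (
      [{'url': f'hero{i}.jpg', 'score': 3} for i in range(2)]  # class-matched
    + [{'url': f'big{i}.jpg', 'score': 2} for i in range(5)]   # >= 2000x1000
    + [{'url': f'mid{i}.jpg', 'score': 1} for i in range(8)]   # >= 1600w or 800h
)

sorted_images = sorted(image_urls, key=lambda x: x['score'], reverse=True)
high_score_images = [img for img in sorted_images if img['score'] in [3, 2]]
low_score_images = [img for img in sorted_images if img['score'] == 1]
result = high_score_images + low_score_images[:max(0, 10 - len(high_score_images))]

print(len(result[:10]))  # 10: all 7 high-score images + the first 3 score-1 fillers
```

The second hunk in the same file adds the `get_image_hash` helper used for deduplication: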
```diff
@@ -49,3 +60,24 @@ def parse_dimension(value: str) -> int:
 def extract_title(soup: BeautifulSoup) -> str:
     """Extract the title from the BeautifulSoup object"""
     return soup.title.string if soup.title else ""
+
+def get_image_hash(image_url: str) -> str:
+    """Calculate a simple hash based on the image filename and essential query parameters"""
+    try:
+        parsed_url = urlparse(image_url)
+
+        # Extract the filename
+        filename = parsed_url.path.split('/')[-1]
+
+        # Extract essential query parameters (e.g., 'url' for CDN-served images)
+        query_params = parse_qs(parsed_url.query)
+        essential_params = query_params.get('url', [])
+
+        # Combine filename and essential parameters
+        image_identifier = filename + ''.join(essential_params)
+
+        # Calculate hash
+        return hashlib.md5(image_identifier.encode()).hexdigest()
+    except Exception as e:
+        logging.error(f"Error calculating image hash for {image_url}: {e}")
+        return None
```
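Two URLs pointing at the same file but with different sizing or cache-busting parameters now collapse to one hash, while CDN proxy URLs that wrap the origin image in a `url` parameter stay distinguishable. A usage sketch (the URLs are made up, and this assumes `get_image_hash` is importable as shown):

```python
from gpt_researcher.scraper.utils import get_image_hash

# Same file, different resize/cache parameters -> identical hash.
a = get_image_hash("https://cdn.example.com/img/hero.jpg?w=1200&v=1")
b = get_image_hash("https://cdn.example.com/img/hero.jpg?w=640&v=2")
assert a == b  # only the filename 'hero.jpg' feeds the hash here

# Proxy-style URLs: the wrapped origin image rides in the 'url' parameter,
# so two different wrapped images still hash differently.
c = get_image_hash("https://proxy.example.com/resize?url=https://a.com/x.jpg")
d = get_image_hash("https://proxy.example.com/resize?url=https://a.com/y.jpg")
assert c != d
```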
gpt_researcher/skills/browser.py (28 changes: 14 additions & 14 deletions)

```diff
@@ -1,9 +1,8 @@
-from typing import List, Dict, Optional, Set
-import hashlib
-import re
+from typing import List, Dict

 from ..actions.utils import stream_output
 from ..actions.web_scraping import scrape_urls
+from ..scraper.utils import get_image_hash  # Add this import


 class BrowserManager:
@@ -61,25 +60,26 @@ async def browse_urls(self, urls: List[str]) -> List[Dict]:

     def select_top_images(self, images: List[Dict], k: int = 2) -> List[str]:
         """
-        Select top k images and remove duplicates.
+        Select most relevant images and remove duplicates based on image content.

         Args:
-            images (List[Dict]): List of image dictionaries with 'url' keys.
-            k (int): Number of top images to select.
+            images (List[Dict]): List of image dictionaries with 'url' and 'score' keys.
+            k (int): Number of top images to select if no high-score images are found.

         Returns:
-            List[str]: List of selected top image URLs.
+            List[str]: List of selected image URLs.
         """
-        # Remove duplicates based on image URL
         unique_images = []
-        image_hashes = set()
+        seen_hashes = set()
        current_research_images = self.researcher.get_research_images()

-        for img in images:
-            img_hash = hashlib.md5(img['url'].encode()).hexdigest()
-            if img_hash not in image_hashes and img_hash not in {hashlib.md5(existing_img.encode()).hexdigest()
-                                                                 for existing_img in current_research_images}:
-                image_hashes.add(img_hash)
+        # First, select all score 2 and 3 images
+        high_score_images = [img for img in images if img['score'] >= 2]
+
+        for img in high_score_images + images:  # Process high-score images first, then all images
+            img_hash = get_image_hash(img['url'])
+            if img_hash and img_hash not in seen_hashes and img['url'] not in current_research_images:
+                seen_hashes.add(img_hash)
                 unique_images.append(img['url'])

             if len(unique_images) == k:
```
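Because `high_score_images` is iterated before the full list, when two URLs hash to the same image the higher-scored variant wins the slot. A standalone rendition of the loop (illustrative; the real method lives on `BrowserManager` and compares against the researcher's already-collected images):

```python
from gpt_researcher.scraper.utils import get_image_hash

images = [
    {'url': 'https://a.com/dup.jpg?w=640', 'score': 1},
    {'url': 'https://a.com/dup.jpg?w=1200', 'score': 3},  # same file, higher score
    {'url': 'https://a.com/other.jpg', 'score': 2},
]
current_research_images = []  # URLs already attached to the research

unique_images, seen_hashes = [], set()
high_score_images = [img for img in images if img['score'] >= 2]
for img in high_score_images + images:  # high-score first, then everything
    img_hash = get_image_hash(img['url'])
    if img_hash and img_hash not in seen_hashes and img['url'] not in current_research_images:
        seen_hashes.add(img_hash)
        unique_images.append(img['url'])

print(unique_images)  # ['https://a.com/dup.jpg?w=1200', 'https://a.com/other.jpg']
```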
gpt_researcher/skills/writer.py (5 changes: 3 additions & 2 deletions)

```diff
@@ -40,11 +40,12 @@ async def write_report(self, existing_headers: list = [], relevant_written_contents: list = []):
             str: The generated report.
         """
         # send the selected images prior to writing report
-        if self.researcher.research_images:
+        research_images = self.researcher.get_research_images()
+        if research_images:
             await stream_output(
                 "images",
                 "selected_images",
-                json.dumps(self.researcher.research_images),
+                json.dumps(research_images),
                 self.researcher.websocket,
             )
```
Expand Down