diff --git a/rag/scraper/Scraper_master/scrape_header.py b/rag/scraper/Scraper_master/scrape_header.py index e62015fa..2fb613d0 100644 --- a/rag/scraper/Scraper_master/scrape_header.py +++ b/rag/scraper/Scraper_master/scrape_header.py @@ -168,7 +168,7 @@ def download_pdf(self, url, filename): - filename(str): Name of the PDF file Returns: 1 on failure, file path on success. """ - file_path = os.path.join(os.getcwd(), filename) + file_path = os.path.join(os.getcwd(), filename + ".pdf") response = requests.get(url, headers=self.http_header) if response.status_code == 200: with open(file_path, 'wb') as f: @@ -181,7 +181,7 @@ def download_pdf(self, url, filename): # Override def content_extract(self, filename, url, **kwargs): - if url[-4] == ".pdf": + if ".pdf" in url: pdf_result = self.download_pdf(url, filename) return pdf_result else: diff --git a/rag/scraper/Scraper_master/scrape_pdf.py b/rag/scraper/Scraper_master/scrape_pdf.py index 20ee2074..a41a6bb7 100644 --- a/rag/scraper/Scraper_master/scrape_pdf.py +++ b/rag/scraper/Scraper_master/scrape_pdf.py @@ -18,13 +18,13 @@ def content_extract(self, filename, url, **kwargs): if response.status_code == 200: with open(filename, 'wb') as f: f.write(response.content) - print(f"Download completed successfully and saved as {self.root_filename}") + print(f"Download completed successfully and saved as {filename}") else: print(f"Failed to download the PDF. Status code: {response.status_code}") # Example usage: if __name__ == "__main__": - pdf_url = "http://example.com/path/to/your/pdf/file.pdf" # Replace with the actual PDF URL - pdf_saver = ScrapePdf(pdf_url) # Specify the filename to save as - pdf_saver.content_extract("name", pdf_url) # Start the download process + pdf_url = "pdflink" # Replace with the actual PDF URL + pdf_saver = ScrapePdf(pdf_url) + pdf_saver.content_extract("filename", pdf_url) # Change filename to save as and start the download process diff --git a/rag/scraper/Scraper_master/scrape_vid.py b/rag/scraper/Scraper_master/scrape_vid.py index 9ab79f8e..bd9748cc 100644 --- a/rag/scraper/Scraper_master/scrape_vid.py +++ b/rag/scraper/Scraper_master/scrape_vid.py @@ -1,7 +1,8 @@ from rag.scraper.Scraper_master.base_scraper import BaseScraper -from pytube import Playlist, YouTube +from pytubefix import Playlist, YouTube import os from utils import save_to_file + class ScrapeVid(BaseScraper): def __init__(self, url, root_filename): super().__init__(url)