augcog · FranardoHuang · Jul 18, 2024 · Jul 9, 2024 · Jul 12, 2024 · Jul 17, 2024
diff --git a/rag/scraper/Scraper_master/scrape_header.py b/rag/scraper/Scraper_master/scrape_header.py
@@ -168,7 +168,7 @@ def download_pdf(self, url, filename):
         - filename(str): Name of the PDF file
         Returns: 1 on failure, file path on success.
         """
-        file_path = os.path.join(os.getcwd(), filename)
+        file_path = os.path.join(os.getcwd(), filename + ".pdf")
         response = requests.get(url, headers=self.http_header)
         if response.status_code == 200:
             with open(file_path, 'wb') as f:
@@ -181,7 +181,7 @@ def download_pdf(self, url, filename):
 
     # Override
     def content_extract(self, filename, url, **kwargs):
-        if url[-4] == ".pdf":
+        if ".pdf" in url:
             pdf_result = self.download_pdf(url, filename)
             return pdf_result
         else:

diff --git a/rag/scraper/Scraper_master/scrape_pdf.py b/rag/scraper/Scraper_master/scrape_pdf.py
@@ -18,13 +18,13 @@ def content_extract(self, filename, url, **kwargs):
         if response.status_code == 200:
             with open(filename, 'wb') as f:
                 f.write(response.content)
-            print(f"Download completed successfully and saved as {self.root_filename}")
+            print(f"Download completed successfully and saved as {filename}")
         else:
             print(f"Failed to download the PDF. Status code: {response.status_code}")
 
 
 # Example usage:
 if __name__ == "__main__":
-    pdf_url = "http://example.com/path/to/your/pdf/file.pdf"  # Replace with the actual PDF URL
-    pdf_saver = ScrapePdf(pdf_url)  # Specify the filename to save as
-    pdf_saver.content_extract("name", pdf_url) # Start the download process
+    pdf_url = "pdflink"  # Replace with the actual PDF URL
+    pdf_saver = ScrapePdf(pdf_url)
+    pdf_saver.content_extract("filename", pdf_url)  # Replace "filename" with the path to save the PDF to; this call starts the download
diff --git a/rag/scraper/Scraper_master/scrape_vid.py b/rag/scraper/Scraper_master/scrape_vid.py
@@ -1,7 +1,8 @@
 from rag.scraper.Scraper_master.base_scraper import BaseScraper
-from pytube import Playlist, YouTube
+from pytubefix import Playlist, YouTube
 import os
 from utils import save_to_file
+
 class ScrapeVid(BaseScraper):
     def __init__(self, url, root_filename):
         super().__init__(url)