web_scraper.py
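"""Download PDF papers into a local folder.

The active code below reads a CSV export ('qsience.csv', column
'pdf_link-href') of direct PDF links and saves each file into
'arabic_papers_qs'. A commented-out alternative further down scrapes
Google Scholar search results for PDF links instead.
"""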
import os
import urllib.request

import pandas as pd

# Each row's 'pdf_link-href' column holds a direct URL to a PDF.
df = pd.read_csv('qsience.csv')

directory_name = "arabic_papers_qs"
os.makedirs(directory_name, exist_ok=True)  # create the target folder if it is missing

# Download every linked PDF, numbering the files by row index.
for idx, link in enumerate(df['pdf_link-href']):
    response = urllib.request.urlopen(link)
    filename = os.path.join(directory_name, f'qsciencefile_{idx}.pdf')
    with open(filename, 'wb') as file:
        file.write(response.read())
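
# ---------------------------------------------------------------------------
# Alternative approach (disabled): instead of reading PDF links from a CSV,
# scrape them from Google Scholar search result pages. Kept commented out
# for reference.
# ---------------------------------------------------------------------------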
# import urllib.request
# import os
# import requests
# from bs4 import BeautifulSoup
# import time
#
# def download_pdfs(search_query, base_url, search_url, download_folder):
#     if not os.path.exists(download_folder):
#         os.makedirs(download_folder)
#     # Page through the search results (10 hits per page, up to result 990).
#     for start in range(0, 991, 10):
#         search_payload = {'start': start, 'q': search_query, 'hl': 'en', 'as_sdt': '0,5'}
#         try:
#             search_response = requests.get(search_url, params=search_payload)
#             if search_response.status_code != 200:
#                 print(f"Failed to get search results: {search_response.status_code}")
#                 continue
#             soup = BeautifulSoup(search_response.text, 'html.parser')
#             # Collect every anchor whose href looks like a PDF link,
#             # resolving relative URLs against the base URL.
#             pdf_urls = []
#             for link in soup.find_all('a'):
#                 url = link.get('href')
#                 if url and 'pdf' in url:
#                     pdf_urls.append(url if url.startswith('http') else base_url + url)
#             for link in pdf_urls:
#                 try:
#                     original_filename = os.path.basename(link)
#                     print(f"Attempting to download {original_filename}...")
#                     response = urllib.request.urlopen(link)
#                     filename = os.path.join(download_folder, original_filename)
#                     with open(filename, 'wb') as file:
#                         file.write(response.read())
#                     print(f"Successfully downloaded {original_filename}")
#                 except Exception as e:
#                     print(f"Failed to download {link}: {e}")
#         except Exception as e:
#             print(f"Failed to search: {e}")
#         time.sleep(10)  # pause between result pages to avoid rate limiting
#
# # Usage
# base_url = 'https://scholar.google.com/'  # Replace with the actual base URL
# search_url = 'https://scholar.google.com/scholar'  # Modify this to the correct search URL
# download_folder = 'downloaded_pdfs'  # Folder where PDFs will be saved
# search_query = 'الادب'  # Arabic for "literature"; replace with the actual search term
# download_pdfs(search_query, base_url, search_url, download_folder)