forked from kriti21/webscrapingbs4Py
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgate_exam_papersg4g.py
109 lines (109 loc) · 4.3 KB
/
gate_exam_papersg4g.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
"""Scrape GATE CS question papers from geeksforgeeks.org.

Follows the 'GATE CS' link on the GeeksforGeeks front page, collects the
links to the individual quiz pages (titled either 'GATE-CS...' or
'GATE CS 20xx'), and writes every question — label, text, numbered answer
options, and the explanation when one is present — into a local file named
'gateprep-<n>', one file per quiz page.
"""
import re

import requests
from bs4 import BeautifulSoup

BASE_URL = "http://www.geeksforgeeks.org/"


def _dump_question(handle, question_div):
    """Write one question (label, text, options, optional explanation) to handle."""
    label = question_div.find('div', {'class': 'mtq_question_label'})
    handle.write(label.text)
    handle.write("\n")
    body = question_div.find('div', {'class': 'mtq_question_text'})
    handle.write(body.text)
    handle.write("\n")
    # Answer options, numbered (1), (2), ... in page order.
    answers = question_div.find_all('div', {'class': 'mtq_answer_text'})
    for count, ans in enumerate(answers, start=1):
        handle.write("(" + str(count) + ")")
        handle.write(ans.text)
        handle.write("\n")
    handle.write("\n\n\n")
    # The explanation block is optional on the page; if either div is
    # missing, .text on None raises AttributeError — skip silently, as
    # the original deliberately did (best-effort extraction).
    try:
        expl_label = question_div.find('div', {'class': 'mtq_explanation-label'})
        handle.write(expl_label.text)
        handle.write("\n")
        expl_text = question_div.find('div', {'class': 'mtq_explanation-text'})
        handle.write(expl_text.text)
        handle.write("\n")
    except AttributeError:
        pass
    handle.write("\n\n\n")


def _scrape_quiz_page(page_url, filename):
    """Fetch one quiz page and save all of its questions to *filename*.

    Errors while extracting individual questions are reported and the page
    is abandoned (best-effort), matching the original script's behaviour.
    """
    page = requests.get(page_url)
    soup = BeautifulSoup(page.text, 'html.parser')
    container = soup.find('div', id="mtq_question_container-1")
    inner = container.find('div')
    questions = inner.find_all('div', {'class': 'mtq_question mtq_scroll_item-1'})
    with open(filename, 'w') as handle:
        try:
            for question in questions:
                _dump_question(handle, question)
        except Exception as e:
            print(e)


def main():
    """Crawl from the front page to every GATE CS quiz and save each one."""
    front = requests.get(BASE_URL)
    soup = BeautifulSoup(front.text, 'html.parser')
    hgroups = soup.find_all('hgroup')
    # The second <hgroup> holds the subject navigation links.
    for link in hgroups[1].find_all('a', href=True, text='GATE CS'):
        gate_page = requests.get(str(link['href']))
        gate_soup = BeautifulSoup(gate_page.text, 'html.parser')
        # Quiz links come in two title styles on the GATE CS page.
        list1 = gate_soup.find_all('a', href=True, text=re.compile(r'^GATE-CS'))
        list2 = gate_soup.find_all('a', href=True, text=re.compile(r'^GATE\sCS\s20..$'))
        # BUG FIX: the original second loop fetched `link1` — the URL left
        # over from the first loop — instead of the current link, so every
        # 'GATE CS 20xx' file contained the wrong page. Processing both
        # lists through one code path fixes that and removes ~45 duplicated
        # lines. The counter now advances per link even on failure, so a
        # bad page no longer makes the next one overwrite its file.
        file_index = 1
        for quiz_link in list1 + list2:
            _scrape_quiz_page(quiz_link['href'], 'gateprep-' + str(file_index))
            file_index += 1
    print("Successfully saved all the files :)")


if __name__ == "__main__":
    main()