# neopets.py

import random
import sys
import time
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Pool of delays, in seconds, to wait between requests
wait_times = [10, 12, 13, 14, 15, 16, 17, 18, 19, 20]

neopets_list = [
    "nimmo", "scorchio", "jubjub", "grarrl",
    "skeith", "korbat", "lenny", "wocky",
    "bruce", "kiko", "kau", "usul",
    "aisha", "chia", "eyrie", "tuskaninny",
    "flotsam", "jetsam", "kacheek", "uni", "buzz", "lupe",
    "elephante", "gelert", "mynci", "kyrii",
    "peophin", "quiggle", "shoyru", "acara",
    "zafara", "blumaroo", "techo", "moehog",
    "poogle", "kougra", "grundo", "koi",
    "meerca", "chomby", "pteri", "krawk",
    "tonu", "draik", "ixi", "yurble",
    "ruki", "bori", "hissi", "lutari",
    "xweetok", "ogrin", "gnorbu", "vandagyre",
]
jub_jub = ["jubjub"]
test_list = [
    "nimmo",
    "draik", "ixi", "yurble",
]
results_list = []
test_results = []

def wait():
    # Sleep for a random number of seconds drawn from wait_times
    time.sleep(random.choice(wait_times))

def pet_loop(pet_list):
    for pet in pet_list:
        # Wait a random number of seconds before starting the search for this pet
        wait()
        for page in range(1, 3):
            # Capture results from pages 1-2 for this pet and store them in the global results list
            print(f"Capture page for pet[{pet}] at page[{page}]")
            capture_page(page_number=page, pet=pet)

def capture_page(page_number=1, pet="nimmo"):
    """
    Append a dict holding the "name", "color", "species", and adoption "link" of each
    stuck pet found on the given page to the global results_list.
    """
    global results_list
    try:
        # Wait another random number of seconds before the request
        wait()
        print(f"Results for pet: {pet}, page: {page_number}")
        # Capture the response of the Stuck Pets page
        page_res = requests.get(f"https://www.stuckpets.com/neopets/{pet}/page/{page_number}", timeout=1)
        # Determine whether the page returned anything
        if page_res:
            soup = BeautifulSoup(page_res.content, 'html.parser')
            # Find each table row with class="pet-tr"
            for row in soup.find_all("tr", attrs={"class": "pet-tr"}):
                # Split the row's title text once to capture each attribute
                title_parts = row.td.contents[0]["title"].split(" ", 3)
                name = title_parts[0]
                color = title_parts[2]
                species = title_parts[3]
                link = row.a["href"]
                # Store the result as a 'res' dict
                res = {
                    "name": name,
                    "color": color,
                    "species": species,
                    "link": link,
                }
                print("Link: ", link)
                print("name: ", name)
                print("color: ", color)
                print("species: ", species)
                if res:
                    results_list.append(res)
                else:
                    print("Results are empty")
        else:
            return f"Page result returned nothing for pet [{pet}] page number [{page_number}]"
    except requests.exceptions.ConnectionError as e:
        # The built-in ConnectionError does not catch requests' exception, so use the requests one
        print(f"There was a network problem at pet[{pet}] page number[{page_number}]: ", e)
    except requests.exceptions.Timeout as e:
        print("There was a timeout: ", e)
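
# Parsing note: both capture functions assume the row's title attribute looks roughly
# like "Somepet the Blue Kacheek", so split(" ", 3) yields [name, "the", color, species].
# The exact wording on stuckpets.com is an assumption inferred from the indices used
# above. parse_title() below is only an illustrative sketch of that split; the script
# does not call it.
def parse_title(title):
    parts = title.split(" ", 3)
    return {"name": parts[0], "color": parts[2], "species": parts[3]}
# Example: parse_title("Somepet the Blue Kacheek")
# -> {"name": "Somepet", "color": "Blue", "species": "Kacheek"}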

def capture_pages(start_page=1, end_page=6, pet="nimmo"):
    # Declare use of the global results list that holds each 'res' dict
    global results_list
    for i in range(start_page, end_page):
        try:
            # Wait another random number of seconds before the request
            wait()
            print(f"Capturing pet: {pet}, page: {i}")
            # Capture the response of the Stuck Pets page
            page_res = requests.get(f"https://www.stuckpets.com/neopets/{pet}/page/{i}", timeout=1)
            # If the page result is valid
            if page_res:
                soup = BeautifulSoup(page_res.content, 'html.parser')
                # Find each table row with class="pet-tr"
                for row in soup.find_all("tr", attrs={"class": "pet-tr"}):
                    # Split the row's title text once to capture each attribute
                    title_parts = row.td.contents[0]["title"].split(" ", 3)
                    name = title_parts[0]
                    color = title_parts[2]
                    species = title_parts[3]
                    link = row.a["href"]
                    # Store the result as a 'res' dict
                    res = {
                        "name": name,
                        "color": color,
                        "species": species,
                        "link": link,
                    }
                    print(res)
                    # If the result is valid, append it to the global results list
                    if res:
                        results_list.append(res)
                    else:
                        print("Error reading Name/Color/Species")
                        continue
            else:
                print("Response returned nothing")
                continue
        except TypeError as e:
            print("Type error occurred: ", e)
            continue
        except requests.exceptions.InvalidURL as e:
            print("The URL provided was invalid: ", e)
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
            print("A connection error or timeout occurred:", e)
        except requests.exceptions.HTTPError as e:
            print("HTTP error occurred: ", e)
        except requests.exceptions.RequestException as e:
            print("Error occurred: ", e)
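
# Usage sketch: capture_pages() is a self-contained alternative to pet_loop() +
# capture_page() that walks its own page range for a single species; main_loop() does
# not call it. For example, to grab only Draiks from pages 1-5 and write them out:
#
#   capture_pages(start_page=1, end_page=6, pet="draik")
#   create_time_stamped_df()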

def create_time_stamped_df():
    global results_list
    # Create a timestamp string to append to the neopets filename
    date = datetime.now().strftime("%Y_%m_%d-%I_%M_%S_%p")
    # Final step: create a CSV from the results list
    df = pd.DataFrame.from_dict(results_list)
    df.to_csv(f"neopets_{date}.csv")

def test_page_loop():
    for pet in test_list:
        # Wait a random number of seconds before starting the search for this pet
        wait()
        for page in range(1, 3):
            # Capture results from pages 1-2 for this pet and store them in the global results list
            print(f"Capture page for pet[{pet}] at page[{page}]")
            capture_page(page_number=page, pet=pet)

def main_loop():
    """
    Loop through every pet and every page, scraping the:
    - Name
    - Color
    - Species
    - Link where it can be adopted
    """
    print("Using default list loop")
    pet_loop(pet_list=neopets_list)
    # Store the information in a DataFrame and write it to a timestamped CSV:
    # neopets_<year>_<month>_<day>-<hour>_<minute>_<second>_<AM/PM>.csv
    create_time_stamped_df()

def command_line_loop(neopets_to_scrape):
    print("Using command line loop")
    pet_loop(pet_list=neopets_to_scrape)
    create_time_stamped_df()

if __name__ == "__main__":
    # Use command line arguments if provided, otherwise run the default main loop
    args = sys.argv[1:]
    neopets = []
    if args:
        print("Using command line arguments")
        neopets = args[0].split()
        print(neopets)
        command_line_loop(neopets)
    else:
        print("Using default main loop")
        main_loop()
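
# Invocation sketch, based on the argv handling above (the single quoted argument is
# split on whitespace into a species list):
#
#   python neopets.py                 # scrape every species in neopets_list
#   python neopets.py "draik ixi"     # scrape only the species given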