-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathhotel.py
238 lines (224 loc) · 9.94 KB
/
hotel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
import random
import concurrent.futures
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import os
import re
from bs4 import BeautifulSoup
from queue import Queue
import threading
# Serialises appends to infoList made by the worker threads.
lock = threading.Lock()

# Shared accumulators filled in by the rest of the script.
infoList = []      # "name,url" lines collected by worker()
urls_y = []        # unique "ip:port" matches of the page scraped last
resultslist = []   # every "ip:port" match gathered across all pages

# The search-result pages differ only in their page number, so they are
# generated from a template instead of being listed by hand.
_SEARCH_URL_TEMPLATE = "http://tonkiang.us/hoteliptv.php?page={page}&s={keyword}"
_SEARCH_KEYWORD = "江苏"
_SEARCH_PAGE_COUNT = 25

# Search-result pages to scan for "ip:port" endpoints (pages 1..25).
urls = [
    _SEARCH_URL_TEMPLATE.format(page=page, keyword=_SEARCH_KEYWORD)
    for page in range(1, _SEARCH_PAGE_COUNT + 1)
]
# Running counter; it starts at -1 so that the first call to
# increment_counter() returns 0.
counter = -1


def increment_counter():
    """Advance the module-level ``counter`` by one and return its new value."""
    global counter
    counter = counter + 1
    return counter
def is_odd_or_even(number: int) -> bool:
    """Report the parity of ``number``.

    Despite the name, the result is a plain boolean flag: ``True`` when
    ``number`` is even, ``False`` when it is odd.
    """
    return number % 2 == 0
# Harvest candidate "ip:port" endpoints from every search-result page.

# Dotted-quad host followed by a port, e.g. 8.8.8.8:8888.
pattern = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d+"

# The browser configuration is the same for every page, so build it once.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_experimental_option("useAutomationExtension", False)
chrome_options.add_argument("blink-settings=imagesEnabled=false")

for url in urls:
    # A fresh browser is started for each page.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.set_page_load_timeout(60)  # seconds
        driver.set_script_timeout(50)  # seconds
        driver.get(url)
        # Fixed pause before the DOM is read; presumably gives the page time
        # to finish rendering.
        time.sleep(20)
        page_content = driver.page_source
    finally:
        # Shut the browser down even when loading the page fails, so a
        # timeout does not leave a headless Chrome process behind.
        driver.quit()
    print(increment_counter())  # progress indicator: one number per page
    urls_all = re.findall(pattern, page_content)
    urls_y = set(urls_all)  # unique matches of this page
    resultslist.extend(urls_y)

# Collapse duplicates that appeared on more than one page.
resultslist = set(resultslist)
sorted_list = sorted(resultslist)

# Written in sorted order so that repeated runs produce a stable file.
with open("iplist.txt", 'w', encoding='utf-8') as file:
    for iplist in sorted_list:
        file.write(iplist + "\n")
        print(iplist)
# Ordered literal substitutions applied to a channel name before the
# "CCTV<n>台" suffix is stripped. Order matters: each rule sees the output of
# the rules before it.
# NOTE(review): the original chain also replaced "+" with "+" at this point,
# a no-op that was presumably meant for a full-width plus sign - confirm
# against the upstream file before adding such a rule.
_NAME_CLEANUP_RULES = (
    ("cctv", "CCTV"),
    ("中央", "CCTV"),
    ("央视", "CCTV"),
    ("HD", "高清"),
    ("标清", ""),
    ("频道", ""),
    ("-", ""),
    (" ", ""),
    ("PLUS", "+"),
    ("(", ""),
    (")", ""),
)

# "CCTV5台" -> "CCTV5"
_CCTV_TAI_SUFFIX = re.compile(r"CCTV(\d+)台")

# Ordered literal substitutions that collapse descriptive CCTV names onto
# their short form; applied after the suffix rule above.
_NAME_ALIAS_RULES = (
    ("CCTV1综合", "CCTV1"),
    ("CCTV2财经", "CCTV2"),
    ("CCTV3综艺", "CCTV3"),
    ("CCTV4国际", "CCTV4"),
    ("CCTV4中文国际", "CCTV4"),
    ("CCTV4欧洲", "CCTV4"),
    ("CCTV5体育", "CCTV5"),
    ("CCTV6电影", "CCTV6"),
    ("CCTV7军事", "CCTV7"),
    ("CCTV7军农", "CCTV7"),
    ("CCTV7农业", "CCTV7"),
    ("CCTV7国防军事", "CCTV7"),
    ("CCTV8电视剧", "CCTV8"),
    ("CCTV9记录", "CCTV9"),
    ("CCTV9纪录", "CCTV9"),
    ("CCTV10科教", "CCTV10"),
    ("CCTV11戏曲", "CCTV11"),
    ("CCTV12社会与法", "CCTV12"),
    ("CCTV13新闻", "CCTV13"),
    ("CCTV新闻", "CCTV13"),
    ("CCTV14少儿", "CCTV14"),
    ("CCTV15音乐", "CCTV15"),
    ("CCTV16奥林匹克", "CCTV16"),
    ("CCTV17农业农村", "CCTV17"),
    ("CCTV17农业", "CCTV17"),
    ("CCTV5+体育赛视", "CCTV5+"),
    ("CCTV5+体育赛事", "CCTV5+"),
    ("CCTV5+体育", "CCTV5+"),
    ("CMIPTV", ""),
    ("内蒙卫视", "内蒙古卫视"),
    ("CCTVCCTV", "CCTV"),
)


def _normalize_channel_name(name):
    """Return ``name`` rewritten to its canonical form, e.g. "cctv-1 综合" -> "CCTV1"."""
    for old, new in _NAME_CLEANUP_RULES:
        name = name.replace(old, new)
    name = _CCTV_TAI_SUFFIX.sub(r"CCTV\1", name)
    for old, new in _NAME_ALIAS_RULES:
        name = name.replace(old, new)
    return name


def worker(thread_url, counter_id):
    """Scrape the channel list published for one "ip:port" endpoint.

    Opens a headless Chrome, queries one of two search pages (chosen at
    random) for ``thread_url``, and appends a ``"name,url"`` line to the
    module-level ``infoList`` for every result whose stream URL contains
    "http". Appends are made while holding the module-level ``lock``.

    Args:
        thread_url: The "ip:port" string to search for.
        counter_id: Suffix of the Chrome ``user-data-dir`` ("selenium<id>").

    Returns:
        None. Any ``Exception`` raised while scraping is printed and
        swallowed so that one failing endpoint does not affect the others.
    """
    # Bound before the try block so the cleanup below can tell whether a
    # browser was ever started; otherwise a failure while creating the driver
    # would surface as an UnboundLocalError from the finally clause.
    driver = None
    try:
        chrome_options = Options()
        chrome_options.add_argument(f"user-data-dir=selenium{counter_id}")
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_experimental_option("useAutomationExtension", False)
        chrome_options.add_argument("blink-settings=imagesEnabled=false")
        driver = webdriver.Chrome(options=chrome_options)
        driver.set_page_load_timeout(60)  # seconds
        driver.set_script_timeout(50)  # seconds

        # Pick one of the two search endpoints at random (even draw -> first).
        if is_odd_or_even(random.randint(1, 200)):
            page_url = f"http://tonkiang.us/9dlist2.php?s={thread_url}"
        else:
            page_url = f"http://foodieguide.com/iptvsearch/alllist.php?s={thread_url}"
        print(page_url)
        driver.get(page_url)

        # Wait until the result container is present, then pause briefly
        # before taking the snapshot of the DOM.
        WebDriverWait(driver, 45).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.tables"))
        )
        time.sleep(1)

        soup = BeautifulSoup(driver.page_source, "html.parser")
        tables_div = soup.find("div", class_="tables")
        results = tables_div.find_all("div", class_="result") if tables_div else []
        if not any(result.find("div", class_="m3u8") for result in results):
            print("Err-------------------------------------------------------------------------------------------------------")

        for result in results:
            m3u8_div = result.find("div", class_="m3u8")
            channel_div = result.find("div", class_="channel")
            url_int = m3u8_div.text.strip() if m3u8_div else None
            # Guarded by the channel div itself: a result that has a stream
            # URL but no channel div must not raise and abort the whole page.
            url_name = channel_div.text.strip() if channel_div else None
            print(f"{url_name}\t{url_int}")

            # Missing or empty values fall back to placeholders; the
            # placeholder URL has no "http" in it and is filtered out below.
            name = _normalize_channel_name(url_name or "Err画中画")
            urlsp = (url_int or "rtp://127.0.0.1").replace(
                "http://67.211.73.118:9901", ""
            )
            if "http" in urlsp:
                with lock:
                    infoList.append(f"{name},{urlsp}")
        print(f"=========================>>> Thread {thread_url} save ok")
    except Exception as e:
        # Thread boundary: report the failure and let the other tasks go on.
        print(f"=========================>>> Thread {thread_url} caught an exception: {e}")
    finally:
        # Close the browser if one was started.
        if driver is not None:
            driver.quit()
        print(f"=========================>>> Thread {thread_url} quiting")
# Query every harvested endpoint concurrently, with at most 10 browsers
# running at a time. Leaving the ``with`` block waits for all submitted
# tasks to finish.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    for i in sorted_list:
        # Every task gets its own id: worker() uses it to name the Chrome
        # user-data-dir, and concurrently running Chrome instances cannot
        # share one profile directory. This leaves one "selenium<id>"
        # directory per task on disk.
        executor.submit(worker, i, increment_counter())

# Drop duplicate "name,url" lines collected by the workers.
infoList = set(infoList)
with open("myitv.txt", 'w', encoding='utf-8') as file:
    for info in infoList:
        file.write(info + "\n")
        print(info)