check_dd.py
import platform
import subprocess
from pathlib import Path
from urllib.parse import urlsplit
from utils import load_json, load_text, write_json, write_text
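# Helpers from this repo's utils module. Their exact signatures are not shown in this
# file, so the summary below is inferred from how they are used here (an assumption):
#   load_text(path, flag) -> list[str]   # read a text file as lines; the flag's meaning is assumed
#   write_text(lines, path)              # write a list of lines to a text file
#   load_json(path) -> dict              # read a JSON file into a dict
#   write_json(data, path)               # write a dict back to a JSON file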
# Set the linter command based on the OS (npm installs a .cmd shim on Windows)
ddl_cmd = 'dead-domains-linter'
if platform.system() == 'Windows':
    ddl_cmd += '.cmd'


def ddl_cli(urls: list[str]):
    # Write all URLs to the feed list consumed by the linter
    write_text(urls, 'ddl/feeds.txt')
    # Results can depend on the network, so dead domains are double-checked later
    try:
        subprocess.run([ddl_cmd, '-i', 'ddl/feeds.txt', '--export', 'ddl/dead_domains.txt'])
    except KeyboardInterrupt:
        raise SystemExit
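# Equivalent manual run of the same check (assuming the dead-domains-linter CLI from the
# @adguard/dead-domains-linter npm package is on PATH; the flags mirror the call above):
#   dead-domains-linter -i ddl/feeds.txt --export ddl/dead_domains.txt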
# Bare domains (entries without a path) from domain_paths.txt
domain_paths = {line for line in load_text('domain_paths.txt', True) if '/' not in line}
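# Illustrative (hypothetical) domain_paths.txt entries: 'example.com' would be kept here,
# while 'example.com/blog' contains a '/' and would be skipped.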
# Gather all feeds
feeds: dict[str, dict[str, str]] = {'feeds.json': load_json('feeds.json')}
# Collect archive feeds so their URLs can be processed separately
archive_domains = set()
archive_list = []
for file_path in Path('archive').iterdir():
    feeds[f'archive/{file_path.name}'] = (feed := load_json(file_path))
    print(f'Extracting URLs in {file_path.name} ...')
    for url in feed:
        # Skip domains that have already been processed
        if (domain := urlsplit(url).netloc.split(':')[0].removeprefix('www.')) in archive_domains:
            continue
        archive_domains.add(domain)
        # Keep the URL if any parent domain (shortest first) is listed in domain_paths
        domain_split = domain.split('.')
        for idx in reversed(range(len(domain_split) - 1)):
            subdomain = '.'.join(domain_split[idx:])
            if subdomain in domain_paths:
                archive_list.append(url)
                break
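# Example (hypothetical URL): for 'https://blog.example.com/feed.xml' the domain is
# 'blog.example.com', and the loop above tests 'example.com' first, then
# 'blog.example.com'; the first match against domain_paths keeps the URL.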
# Check the archive URLs for dead domains first
ddl_cli(archive_list)
archive_dead = set(load_text('ddl/dead_domains.txt', True))

for file_name, feed in feeds.items():
    print(f'Checking dead domains in {file_name} ...')
    # Snapshot the URLs so entries can be removed while iterating
    urls = list(feed)
    # Run the linter only for feeds.json; archive feeds reuse the archive results above
    if file_name == 'feeds.json':
        ddl_cli(urls)
        dead_domains = set(load_text('ddl/dead_domains.txt', True))
    else:
        dead_domains = archive_dead
    # Drop every URL whose host was reported as dead
    for url in urls:
        if urlsplit(url).netloc.split(':')[0] not in dead_domains:
            continue
        feeds[file_name].pop(url, None)
# Pause so the results can be reviewed before the cleaned feeds are written back
input('\nPlease check the dead domains again, then press Enter to continue...')
for file_name, feed in feeds.items():
    write_json(feed, file_name)
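# Usage sketch (assumes Node.js with the dead-domains-linter CLI installed and an
# existing ddl/ directory for the intermediate files):
#   python check_dd.py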