-
Notifications
You must be signed in to change notification settings - Fork 4
/
gfwlist2domainlist.awk
executable file
·91 lines (75 loc) · 2.28 KB
/
gfwlist2domainlist.awk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/awk -f
# Per-record rule: classify every input line by its Adblock Plus filter
# syntax and hand the interesting part to the matching extractor.
{
    # Blank lines, comments (# or !), the [...] header and @@ whitelist
    # entries carry no domain to extract.
    # https://adblockplus.org/en/filters#comments
    # https://adblockplus.org/en/filters#whitelist
    if (/^$/ || /^#/ || /^\!/ || /^\[/ || /^@@/) {
        next
    }

    # Anchored filters: drop the leading || or | and treat the rest as a
    # plain (non-regex) address.
    # https://adblockplus.org/en/filters#anchors
    if (/(^\|\|)|(^\|)/) {
        sub(/(^\|\|)|(^\|)/, "")
        extract($0)
        next
    }

    # A line wrapped in slashes, /.../, is a regular expression: strip the
    # surrounding slashes and use the regex-aware extractor.
    # https://adblockplus.org/en/filters#regexps
    if (/^\/.*\/$/) {
        gsub(/(^\/)|(\/$)/, "")
        extract_regex($0)
        next
    }

    # Anything else is a plain keyword/address filter.
    extract($0)
}
# Once all input has been consumed, emit every collected domain through
# "sort | uniq" so the final list is ordered and free of duplicates.
END {
    dedup_cmd = "sort | uniq"
    idx = 1
    while (idx <= domain_count) {
        print domains[idx] | dedup_cmd
        idx++
    }
    # Closing the pipe lets sort/uniq see end-of-input and flush their output.
    close(dedup_cmd)
}
# Extract a domain name from a non-regex filter line.
#
# The line is reduced step by step to a bare host name. If the result looks
# like a domain it is appended to the global domains[] array (counted by
# domain_count); otherwise a "Skipping line" notice is written to stderr.
#
# Parameters:
#   line - filter text with any leading || or | already removed
function extract(line) {
    # Drop the scheme: everything up to and including "://".
    sub(/.*:\/\//, "", line)
    # Drop the path: everything from the first "/".
    sub(/\/.*/, "", line)
    # Drop the port: everything from the first ":".
    sub(/:.*/, "", line)
    # Drop the first dot-separated segment that contains a "*" wildcard.
    sub(/[^\.]*\*[^\.]*/, "", line)
    # Drop a single leading dot.
    sub(/^\./, "", line)

    # Accept only real domain names. The IPv4 test comes first because a
    # dotted-quad address would otherwise satisfy the domain pattern too.
    if (line !~ /^((([0-9]{1,2})|(1[0-9]{2})|(2[0-4][0-9])|(25[0-5]))\.){3}(([0-9]{1,2})|(1[0-9]{2})|(2[0-4][0-9])|(25[0-5]))$/ &&
        line ~ /^([A-Za-z0-9\-]+\.)+(xn\-\-)?[A-Za-z0-9]+$/) {
        domains[++domain_count] = line
        return
    }

    # IPv4 address or anything that is not a well-formed domain: report it
    # on stderr and move on.
    print "Skipping line " NR ". " line | "cat >&2"
    close("cat >&2")
}
# Extract domain name(s) from a regex line.
#
# Alternation groups such as (aa|bb|cc) are expanded recursively: the first
# group is replaced by each of its alternatives in turn and the function
# calls itself on every expanded line. Once no such group is left, the regex
# syntax is stripped down to a plain host and passed to extract().
#
# Parameters:
#   line - regex body with the surrounding slashes already removed
#
# The names after the extra spaces in the parameter list are local variables
# (awk's only way to declare locals); callers pass just `line`. They must be
# local because this function is recursive: as globals, a nested call would
# overwrite the caller's loop counter and alternatives array, so a line with
# two or more groups, e.g. (a|b)\.(c|d)\.com, would lose every outer
# alternative after the first.
function extract_regex(line,    pos, in_bracket, n, arr, i, expanded_line) {
    # Expand the lines like (aa|bb|cc) into multiple records
    pos = match(line, /\([A-Za-z0-9\.\|]+\)/)
    if (pos != 0) {
        # Text between the parentheses, e.g. "aa|bb|cc"
        in_bracket = substr(line, RSTART + 1, RLENGTH - 2)
        n = split(in_bracket, arr, "|")
        for (i = 1; i <= n; i++) {
            expanded_line = line
            # Substitute the first group with the i-th alternative; the
            # alternatives contain only [A-Za-z0-9.], so they are safe to
            # use as a sub() replacement string.
            sub(/\([A-Za-z0-9\.\|]+\)/, arr[i], expanded_line)
            extract_regex(expanded_line)
        }
    } else {
        # Remove everything till :\/\/ in regex (:// in plain text, such as http:// https://)
        sub(/.*:\\\/\\\//, "", line)
        # Remove (...)*? such as ([^\/]+\.)
        gsub(/\([^\)]+\)\*?/, "", line)
        # Remove [...]+
        gsub(/\[.*\]\+/, "", line)
        # Remove everything from \/ in regex (/ in plain text)
        sub(/\\\/.*/, "", line)
        # Replace \. with .
        gsub(/\\\./, ".", line)
        extract(line)
    }
}