PSL Private Section Domains WHOIS Checker #2014
Merged
Changes from 4 commits
@@ -0,0 +1,3 @@
venv
__pycache__
data/*.csv
206 changes: 206 additions & 0 deletions
tools/private_domains_checker/PSLPrivateDomainsProcessor.py
@@ -0,0 +1,206 @@
import datetime
import json
import os
import pandas as pd
import requests
import whois


def check_dns_status(domain):
    def make_request():
        try:
            url = f"https://dns.google/resolve?name={domain}&type=NS"
            response = requests.get(url)
            json_response = response.json()
            if json_response.get("Status") == 3:
                return "NXDOMAIN"
            else:
                return "ok"
        except Exception as e:
            return "ERROR"

    dns_status = make_request()
    if dns_status == "ERROR":  # Give it another try
        dns_status = make_request()
    return dns_status


def get_whois_data(domain):
    try:
        w = whois.whois(domain)
        whois_domain_status = w.status
        whois_expiry = w.expiration_date
        if isinstance(whois_expiry, list):
            whois_expiry = whois_expiry[0]
        whois_status = "ok"
    except Exception as e:
        whois_domain_status = None
        whois_expiry = None
        whois_status = "ERROR"
    return whois_domain_status, whois_expiry, whois_status


def check_psl_txt_record(domain):
    def make_request():
        try:
            url = f"https://dns.google/resolve?name=_psl.{domain}&type=TXT"
            response = requests.get(url)
            json_response = response.json()
            txt_records = json_response.get("Answer", [])
            for record in txt_records:
                if "github.com/publicsuffix/list/pull/" in record.get("data", ""):
                    return "valid"
            return "invalid"
        except Exception as e:
            return "ERROR"

    psl_txt_status = make_request()
    if psl_txt_status == "ERROR":  # Give it another try
        psl_txt_status = make_request()
    return psl_txt_status


class PSLPrivateDomainsProcessor:
    def __init__(self):
        self.psl_url = "https://raw.githubusercontent.com/publicsuffix/list/master/public_suffix_list.dat"
        self.psl_icann_marker = "// ===BEGIN ICANN DOMAINS==="
        self.psl_private_marker = "// ===BEGIN PRIVATE DOMAINS==="
        self.columns = [
            "psl_entry",
            "top_level_domain",
            "dns_status",
            "whois_status",
            "whois_domain_expiry_date",
            "whois_domain_status",
            "psl_txt_status"
        ]
        self.df = pd.DataFrame(columns=self.columns)
        self.icann_domains = set()

    def fetch_psl_data(self):
        print("Fetching PSL data from URL...")
        response = requests.get(self.psl_url)
        psl_data = response.text
        print("PSL data fetched.")
        return psl_data

    def parse_domain(self, domain):
        # Remove any leading '*.' parts
        domain = domain.lstrip('*.')

        # Split the domain into parts
        parts = domain.split('.')

        # Traverse the domain parts from the top-level domain upwards
        for i in range(len(parts)):
            candidate = '.'.join(parts[i:])
            if candidate in self.icann_domains:
                continue
            elif '.'.join(parts[i + 1:]) in self.icann_domains:
                # convert punycode to ASCII to support IDN domains
                return candidate.encode('idna').decode('ascii')

        # If no valid domain is found, raise an error
        raise ValueError(f"No valid top-level domain found in the provided domain: {domain}")

    def parse_psl_data(self, psl_data):
        print("Parsing PSL data...")

        lines = psl_data.splitlines()
        process_icann = False
        process_private = False
        private_domains = []

        for line in lines:
            stripped_line = line.strip()
            if stripped_line == self.psl_icann_marker:
                process_icann = True
                process_private = False
                continue
            elif stripped_line == self.psl_private_marker:
                process_icann = False
                process_private = True
                continue

            if stripped_line.startswith('//') or not stripped_line:
                continue

            if process_icann:
                self.icann_domains.add(stripped_line)
            elif process_private:
                private_domains.append(stripped_line)

        print(f"Private domains to be processed: {len(private_domains)}\n"
              f"ICANN domains: {len(self.icann_domains)}")

        # Parse each domain
        private_domains = [self.parse_domain(domain) for domain in private_domains]

        # Remove duplicates
        private_domains = list(set(private_domains))
        print("Private domains in the publicly registrable name space: ", len(private_domains))

        return private_domains

    def process_domains(self, domains):
        data = []
        for domain in domains:

            whois_domain_status, whois_expiry, whois_status = get_whois_data(domain)
            dns_status = check_dns_status(domain)
            psl_txt_status = check_psl_txt_record(domain)

            print(
                f"{domain} - DNS Status: {dns_status}, Expiry: {whois_expiry}, PSL TXT Status: {psl_txt_status}")

            data.append({
                "psl_entry": domain,
                "top_level_domain": domain,
                "whois_domain_status": json.dumps(whois_domain_status),
                "whois_domain_expiry_date": whois_expiry,
                "whois_status": whois_status,
                "dns_status": dns_status,
                "psl_txt_status": psl_txt_status
            })

        self.df = pd.DataFrame(data, columns=self.columns)

    def save_results(self):
        sorted_df = self.df.sort_values(by="psl_entry")
        sorted_df.to_csv("data/all.csv", index=False)

    def save_invalid_results(self):
        # Save nxdomain.csv
        nxdomain_df = self.df[self.df["dns_status"] != "ok"].sort_values(by="psl_entry")
        nxdomain_df.to_csv("data/nxdomain.csv", index=False)

        # Save expired.csv
        today_str = datetime.datetime.utcnow().strftime("%Y-%m-%d")
        expired_df = self.df[
            self.df["whois_domain_expiry_date"].notnull() &
            (self.df["whois_domain_expiry_date"].astype(str).str[:10] < today_str)
        ].sort_values(by="psl_entry")
        expired_df.to_csv("data/expired.csv", index=False)

        # Save missing_psl_txt.csv
        missing_psl_txt_df = self.df[self.df["psl_txt_status"] == "invalid"].sort_values(by="psl_entry")
        missing_psl_txt_df.to_csv("data/missing_psl_txt.csv", index=False)

    def save_hold_results(self):
        hold_df = self.df[
            self.df["whois_domain_status"].str.contains("hold", case=False, na=False)
        ].sort_values(by="psl_entry")
        hold_df.to_csv("data/hold.csv", index=False)

    def run(self):
        psl_data = self.fetch_psl_data()
        domains = self.parse_psl_data(psl_data)
        self.process_domains(domains)
        self.save_results()
        self.save_invalid_results()
        self.save_hold_results()


if __name__ == "__main__":
    processor = PSLPrivateDomainsProcessor()
    processor.run()
@@ -0,0 +1,102 @@
# PSL Private Section Domains WHOIS Checker

## Overview

The `PSLPrivateDomainsProcessor` is a Python script designed to fetch data from the Public Suffix List (PSL) and check the domain status, expiry dates, and `_psl` TXT records of the private section domains.

It performs WHOIS checks on these domains and saves the results into CSV files for manual review.

## Requirements

- Python 3.x
- `requests`
- `pandas`
- `python-whois`

You can install the required packages using pip:

```sh
pip install -r requirements.txt
```

## Usage

`PSLPrivateDomainsProcessor.py`: The main script containing the `PSLPrivateDomainsProcessor` class and functions for DNS and WHOIS checks.

Run the script using Python:

```sh
cd private_domains_checker
mkdir data
python PSLPrivateDomainsProcessor.py
```

## Main Components

### Functions

- `check_dns_status(domain)`: Checks the DNS status of a domain using Google's DNS API. It attempts to recheck DNS status if the initial check fails.
- `get_whois_data(domain)`: Retrieves WHOIS data for a domain. Note: WHOIS data parsing handles multiple expiration dates by selecting the first one.
- `check_psl_txt_record(domain)`: Checks the `_psl` TXT record for a domain using Google's DNS API.
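
These helpers can also be imported on their own to spot-check a single domain. A minimal sketch, assuming the script's directory is the working directory and using an illustrative domain:

```python
# Spot-check one domain with the helper functions defined in the script.
from PSLPrivateDomainsProcessor import check_dns_status, check_psl_txt_record, get_whois_data

domain = "example.com"  # illustrative domain

print(check_dns_status(domain))      # "ok", "NXDOMAIN", or "ERROR"
print(check_psl_txt_record(domain))  # "valid", "invalid", or "ERROR"

whois_domain_status, whois_expiry, whois_status = get_whois_data(domain)
print(whois_domain_status, whois_expiry, whois_status)  # WHOIS status, expiry date, "ok"/"ERROR"
```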

### Class

#### PSLPrivateDomainsProcessor

- `fetch_psl_data()`: Fetches the PSL data from the specified URL.
- `parse_domain(domain)`: Parses and normalizes a domain.
- `parse_psl_data(psl_data)`: Parses the fetched PSL data and separates ICANN and private domains.
- `process_domains(domains)`: Processes each domain, performing DNS, WHOIS, and `_psl` TXT record checks.
- `save_results()`: Saves all processed domain data to `data/all.csv`.
- `save_invalid_results()`: Saves domains that fail DNS resolution, have expired WHOIS expiry dates, or lack a valid `_psl` TXT record to `data/nxdomain.csv`, `data/expired.csv`, and `data/missing_psl_txt.csv`.
- `save_hold_results()`: Saves domains whose WHOIS status contains any form of "hold" to `data/hold.csv`.
- `run()`: Executes the entire processing pipeline.

## Output

The script generates the following CSV files in the `data` directory:

- `all.csv`: Contains all processed domain data.
- `nxdomain.csv`: Contains domains that could not be resolved (`NXDOMAIN`).
- `expired.csv`: Contains domains with expired WHOIS records.
- `hold.csv`: Contains domains with WHOIS status indicating any kind of "hold".
- `missing_psl_txt.csv`: Contains domains with invalid `_psl` TXT records.
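
Since the CSVs are intended for manual review, they can be loaded back with pandas. A small sketch (file names as listed above; the cross-check shown is just one example of a review query):

```python
import pandas as pd

# Load two of the generated reports for review.
hold = pd.read_csv("data/hold.csv")
expired = pd.read_csv("data/expired.csv")

# Domains that are both on a WHOIS "hold" status and past their expiry date.
overlap = hold.merge(expired[["psl_entry"]], on="psl_entry")
print(overlap["psl_entry"].tolist())
```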

## Example

An example CSV entry:

| psl_entry | top_level_domain | dns_status | whois_status | whois_domain_expiry_date | whois_domain_status | psl_txt_status |
| --------- | ---------------- | ---------- | ------------ | ------------------------ | ------------------- | -------------- |
| example.com | example.com | ok | ok | 2024-12-31 | "clientTransferProhibited" | "valid" |

## Publicly Registrable Namespace Determination

The script determines the publicly registrable namespace from private domains by using the ICANN section.

Here's how it works:

1. **ICANN Domains Set**: ICANN domains are stored in a set for quick lookup.
2. **Domain Parsing**: For each private domain, the script splits the domain into parts. It then checks if any suffix of these parts exists in the ICANN domains set.
3. **Normalization**: The private domain is normalized to its publicly registrable form using the ICANN domains set.

Examples:

- **Input**: PSL private domain entry `"*.example.com"`
- **Process**:
  - Remove leading `'*.'`: `"example.com"`
  - Check `"com"` against the ICANN domains set: Found
- **Output**: `"example.com"`

- **Input**: PSL private domain entry `"sub.example.co.uk"`
- **Process**:
  - Check `"example.co.uk"` against the ICANN domains set: Not found
  - Check `"co.uk"` against the ICANN domains set: Found
- **Output**: `"example.co.uk"`

The output is then used for checking WHOIS data.
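
A minimal, self-contained sketch of that suffix walk (using a toy ICANN set instead of the full list; the real script builds `icann_domains` from the ICANN section of the PSL):

```python
icann_domains = {"com", "uk", "co.uk"}  # toy stand-in for the ICANN section

def registrable(domain):
    domain = domain.lstrip('*.')       # drop wildcard labels
    parts = domain.split('.')
    for i in range(len(parts)):
        candidate = '.'.join(parts[i:])
        if candidate in icann_domains:  # candidate is itself a public suffix; keep walking
            continue
        if '.'.join(parts[i + 1:]) in icann_domains:
            return candidate.encode('idna').decode('ascii')
    raise ValueError(f"No ICANN suffix found in {domain}")

print(registrable("*.example.com"))      # example.com
print(registrable("sub.example.co.uk"))  # example.co.uk
```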

## License

This tool is licensed under the MIT License.
Google is a good source for this, but I do want to mention that this places authority onto Google - they could theoretically intervene in the resolution process on a given record.
Would we perhaps want to use an array of such resolvers that are randomly selected from?
Updated - good point about relying solely on Google for DNS, since it centralizes authority and the associated intervention risk. Google also sometimes errors for incidental reasons such as network fluctuations, so randomly selecting from multiple resolvers would still not be error-proof. Instead, the updated implementation queries two resolvers (Google and Cloudflare) and only accepts a result when both return the same status. If the results are inconsistent, or if either query errors, it retries up to 5 times.
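
A minimal sketch of that consistency check, assuming the Google and Cloudflare DNS-over-HTTPS JSON endpoints; the function and parameter names here are illustrative rather than taken from the updated commit:

```python
import requests

RESOLVERS = [
    "https://dns.google/resolve",
    "https://cloudflare-dns.com/dns-query",
]

def check_dns_status(domain, max_attempts=5):
    """Return "ok"/"NXDOMAIN" only when both resolvers agree; otherwise retry."""
    for _ in range(max_attempts):
        results = []
        for resolver in RESOLVERS:
            try:
                response = requests.get(
                    resolver,
                    params={"name": domain, "type": "NS"},
                    headers={"accept": "application/dns-json"},
                )
                status = response.json().get("Status")
                results.append("NXDOMAIN" if status == 3 else "ok")
            except Exception:
                results.append("ERROR")
        # Only accept a value that both resolvers returned without error.
        if "ERROR" not in results and results[0] == results[1]:
            return results[0]
    return "ERROR"
```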