PSL Private Section Domains WHOIS Checker #2014
Merged
Changes from 4 commits
@@ -0,0 +1,3 @@
venv
__pycache__
data/*.csv
206 changes: 206 additions & 0 deletions
tools/private_domains_checker/PSLPrivateDomainsProcessor.py
@@ -0,0 +1,206 @@
import datetime
import json
import os
import pandas as pd
import requests
import whois


def check_dns_status(domain):
    def make_request():
        try:
            url = f"https://dns.google/resolve?name={domain}&type=NS"
            response = requests.get(url)
            json_response = response.json()
            if json_response.get("Status") == 3:
                return "NXDOMAIN"
            else:
                return "ok"
        except Exception as e:
            return "ERROR"

    dns_status = make_request()
    if dns_status == "ERROR":  # Give it another try
        dns_status = make_request()
    return dns_status


def get_whois_data(domain):
    try:
        w = whois.whois(domain)
        whois_domain_status = w.status
        whois_expiry = w.expiration_date
        if isinstance(whois_expiry, list):
            whois_expiry = whois_expiry[0]
        whois_status = "ok"
    except Exception as e:
        whois_domain_status = None
        whois_expiry = None
        whois_status = "ERROR"
    return whois_domain_status, whois_expiry, whois_status


def check_psl_txt_record(domain):
    def make_request():
        try:
            url = f"https://dns.google/resolve?name=_psl.{domain}&type=TXT"
            response = requests.get(url)
            json_response = response.json()
            txt_records = json_response.get("Answer", [])
            for record in txt_records:
                if "github.com/publicsuffix/list/pull/" in record.get("data", ""):
                    return "valid"
            return "invalid"
        except Exception as e:
            return "ERROR"

    psl_txt_status = make_request()
    if psl_txt_status == "ERROR":  # Give it another try
        psl_txt_status = make_request()
    return psl_txt_status


class PSLPrivateDomainsProcessor:
    def __init__(self):
        self.psl_url = "https://raw.githubusercontent.com/publicsuffix/list/master/public_suffix_list.dat"
        self.psl_icann_marker = "// ===BEGIN ICANN DOMAINS==="
        self.psl_private_marker = "// ===BEGIN PRIVATE DOMAINS==="
        self.columns = [
            "psl_entry",
            "top_level_domain",
            "dns_status",
            "whois_status",
            "whois_domain_expiry_date",
            "whois_domain_status",
            "psl_txt_status"
        ]
        self.df = pd.DataFrame(columns=self.columns)
        self.icann_domains = set()

    def fetch_psl_data(self):
        print("Fetching PSL data from URL...")
        response = requests.get(self.psl_url)
        psl_data = response.text
        print("PSL data fetched.")
        return psl_data

    def parse_domain(self, domain):
        # Remove any leading '*.' parts
        domain = domain.lstrip('*.')

        # Split the domain into parts
        parts = domain.split('.')

        # Traverse the domain parts from the top-level domain upwards
        for i in range(len(parts)):
            candidate = '.'.join(parts[i:])
            if candidate in self.icann_domains:
                continue
            elif '.'.join(parts[i + 1:]) in self.icann_domains:
                # convert punycode to ASCII to support IDN domains
                return candidate.encode('idna').decode('ascii')

        # If no valid domain is found, raise an error
        raise ValueError(f"No valid top-level domain found in the provided domain: {domain}")

    def parse_psl_data(self, psl_data):
        print("Parsing PSL data...")

        lines = psl_data.splitlines()
        process_icann = False
        process_private = False
        private_domains = []

        for line in lines:
            stripped_line = line.strip()
            if stripped_line == self.psl_icann_marker:
                process_icann = True
                process_private = False
                continue
            elif stripped_line == self.psl_private_marker:
                process_icann = False
                process_private = True
                continue

            if stripped_line.startswith('//') or not stripped_line:
                continue

            if process_icann:
                self.icann_domains.add(stripped_line)
            elif process_private:
                private_domains.append(stripped_line)

        print(f"Private domains to be processed: {len(private_domains)}\n"
              f"ICANN domains: {len(self.icann_domains)}")

        # Parse each domain
        private_domains = [self.parse_domain(domain) for domain in private_domains]

        # Remove duplicates
        private_domains = list(set(private_domains))
        print("Private domains in the publicly registrable name space: ", len(private_domains))

        return private_domains

    def process_domains(self, domains):
        data = []
        for domain in domains:

            whois_domain_status, whois_expiry, whois_status = get_whois_data(domain)
            dns_status = check_dns_status(domain)
            psl_txt_status = check_psl_txt_record(domain)

            print(
                f"{domain} - DNS Status: {dns_status}, Expiry: {whois_expiry}, PSL TXT Status: {psl_txt_status}")

            data.append({
                "psl_entry": domain,
                "top_level_domain": domain,
                "whois_domain_status": json.dumps(whois_domain_status),
                "whois_domain_expiry_date": whois_expiry,
                "whois_status": whois_status,
                "dns_status": dns_status,
                "psl_txt_status": psl_txt_status
            })

        self.df = pd.DataFrame(data, columns=self.columns)

    def save_results(self):
        sorted_df = self.df.sort_values(by="psl_entry")
        sorted_df.to_csv("data/all.csv", index=False)

    def save_invalid_results(self):
        # Save nxdomain.csv
        nxdomain_df = self.df[self.df["dns_status"] != "ok"].sort_values(by="psl_entry")
        nxdomain_df.to_csv("data/nxdomain.csv", index=False)

        # Save expired.csv
        today_str = datetime.datetime.utcnow().strftime("%Y-%m-%d")
        expired_df = self.df[
            self.df["whois_domain_expiry_date"].notnull() &
            (self.df["whois_domain_expiry_date"].astype(str).str[:10] < today_str)
        ].sort_values(by="psl_entry")
        expired_df.to_csv("data/expired.csv", index=False)

        # Save missing_psl_txt.csv
        missing_psl_txt_df = self.df[self.df["psl_txt_status"] == "invalid"].sort_values(by="psl_entry")
        missing_psl_txt_df.to_csv("data/missing_psl_txt.csv", index=False)

    def save_hold_results(self):
        hold_df = self.df[
            self.df["whois_domain_status"].str.contains("hold", case=False, na=False)
        ].sort_values(by="psl_entry")
        hold_df.to_csv("data/hold.csv", index=False)

    def run(self):
        psl_data = self.fetch_psl_data()
        domains = self.parse_psl_data(psl_data)
        self.process_domains(domains)
        self.save_results()
        self.save_invalid_results()
        self.save_hold_results()


if __name__ == "__main__":
    processor = PSLPrivateDomainsProcessor()
    processor.run()
@@ -0,0 +1,102 @@
# PSL Private Section Domains WHOIS Checker

## Overview

The `PSLPrivateDomainsProcessor` is a Python script designed to fetch data from the Public Suffix List (PSL) and check the domain status, expiry dates, and `_psl` TXT records of the private section domains.

It performs WHOIS checks on these domains and saves the results into CSV files for manual review.

## Requirements

- Python 3.x
- `requests`
- `pandas`
- `python-whois`

You can install the required packages using pip:

```sh
pip install -r requirements.txt
```

## Usage

`PSLPrivateDomainsProcessor.py`: The main script containing the `PSLPrivateDomainsProcessor` class and functions for DNS and WHOIS checks.

Run the script using Python:

```sh
cd private_domains_checker
mkdir data
python PSLPrivateDomainsProcessor.py
```

## Main Components

### Functions

- `check_dns_status(domain)`: Checks the DNS status of a domain using Google's DNS API. It attempts to recheck DNS status if the initial check fails.
- `get_whois_data(domain)`: Retrieves WHOIS data for a domain. Note: WHOIS data parsing handles multiple expiration dates by selecting the first one.
- `check_psl_txt_record(domain)`: Checks the `_psl` TXT record for a domain using Google's DNS API.
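
These helpers can also be imported on their own to spot-check a single domain. A minimal sketch, assuming the script's directory is the working directory and using an illustrative domain:

```python
# Spot-check one domain with the helper functions defined in the script.
from PSLPrivateDomainsProcessor import check_dns_status, check_psl_txt_record, get_whois_data

domain = "example.com"  # illustrative domain

print(check_dns_status(domain))      # "ok", "NXDOMAIN", or "ERROR"
print(check_psl_txt_record(domain))  # "valid", "invalid", or "ERROR"

whois_domain_status, whois_expiry, whois_status = get_whois_data(domain)
print(whois_domain_status, whois_expiry, whois_status)  # WHOIS status, expiry date, "ok"/"ERROR"
```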

### Class

#### PSLPrivateDomainsProcessor

- `fetch_psl_data()`: Fetches the PSL data from the specified URL.
- `parse_domain(domain)`: Parses and normalizes a domain.
- `parse_psl_data(psl_data)`: Parses the fetched PSL data and separates ICANN and private domains.
- `process_domains(domains)`: Processes each domain, performing DNS, WHOIS, and `_psl` TXT record checks.
- `save_results()`: Saves all processed domain data to `data/all.csv`.
- `save_invalid_results()`: Saves domains that fail DNS resolution, have expired WHOIS expiry dates, or lack a valid `_psl` TXT record to `data/nxdomain.csv`, `data/expired.csv`, and `data/missing_psl_txt.csv`.
- `save_hold_results()`: Saves domains whose WHOIS status contains any form of "hold" to `data/hold.csv`.
- `run()`: Executes the entire processing pipeline.

## Output

The script generates the following CSV files in the `data` directory:

- `all.csv`: Contains all processed domain data.
- `nxdomain.csv`: Contains domains that could not be resolved (`NXDOMAIN`).
- `expired.csv`: Contains domains with expired WHOIS records.
- `hold.csv`: Contains domains with WHOIS status indicating any kind of "hold".
- `missing_psl_txt.csv`: Contains domains with invalid `_psl` TXT records.
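
Since the CSVs are intended for manual review, they can be loaded back with pandas. A small sketch (file names as listed above; the cross-check shown is just one example of a review query):

```python
import pandas as pd

# Load two of the generated reports for review.
hold = pd.read_csv("data/hold.csv")
expired = pd.read_csv("data/expired.csv")

# Domains that are both on a WHOIS "hold" status and past their expiry date.
overlap = hold.merge(expired[["psl_entry"]], on="psl_entry")
print(overlap["psl_entry"].tolist())
```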

## Example

An example CSV entry:

| psl_entry | top_level_domain | dns_status | whois_status | whois_domain_expiry_date | whois_domain_status | psl_txt_status |
| --------- | ---------------- | ---------- | ------------ | ------------------------ | ------------------- | -------------- |
| example.com | example.com | ok | ok | 2024-12-31 | "clientTransferProhibited" | "valid" |

## Publicly Registrable Namespace Determination

The script determines the publicly registrable namespace from private domains by using the ICANN section.

Here's how it works:

1. **ICANN Domains Set**: ICANN domains are stored in a set for quick lookup.
2. **Domain Parsing**: For each private domain, the script splits the domain into parts. It then checks if any suffix of these parts exists in the ICANN domains set.
3. **Normalization**: The private domain is normalized to its publicly registrable form using the ICANN domains set.

Examples:

- **Input**: PSL private domain entry `"*.example.com"`
- **Process**:
  - Remove leading `'*.'`: `"example.com"`
  - Check `"com"` against the ICANN domains set: Found
- **Output**: `"example.com"`

- **Input**: PSL private domain entry `"sub.example.co.uk"`
- **Process**:
  - Check `"example.co.uk"` against the ICANN domains set: Not found
  - Check `"co.uk"` against the ICANN domains set: Found
- **Output**: `"example.co.uk"`

The output is then used for checking WHOIS data.
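
A minimal, self-contained sketch of that suffix walk (using a toy ICANN set instead of the full list; the real script builds `icann_domains` from the ICANN section of the PSL):

```python
icann_domains = {"com", "uk", "co.uk"}  # toy stand-in for the ICANN section

def registrable(domain):
    domain = domain.lstrip('*.')       # drop wildcard labels
    parts = domain.split('.')
    for i in range(len(parts)):
        candidate = '.'.join(parts[i:])
        if candidate in icann_domains:  # candidate is itself a public suffix; keep walking
            continue
        if '.'.join(parts[i + 1:]) in icann_domains:
            return candidate.encode('idna').decode('ascii')
    raise ValueError(f"No ICANN suffix found in {domain}")

print(registrable("*.example.com"))      # example.com
print(registrable("sub.example.co.uk"))  # example.co.uk
```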

## License

This tool is licensed under the MIT License.
Google is a good source for this, but I do want to mention that this places authority onto Google - they could theoretically intervene in the resolution process on a given record.
Would we perhaps want to use an array of such resolvers that are randomly selected from?
Updated - good point about relying solely on Google for DNS, since it centralizes authority and the associated intervention risk. Google also sometimes errors for incidental reasons such as network fluctuations, so randomly selecting from multiple resolvers would still not be error-proof. Instead, the updated implementation queries two resolvers (Google and Cloudflare) and only accepts a result when both return the same status. If the results are inconsistent, or if either query errors, it retries up to 5 times.
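
A minimal sketch of that consistency check, assuming the Google and Cloudflare DNS-over-HTTPS JSON endpoints; the function and parameter names here are illustrative rather than taken from the updated commit:

```python
import requests

RESOLVERS = [
    "https://dns.google/resolve",
    "https://cloudflare-dns.com/dns-query",
]

def check_dns_status(domain, max_attempts=5):
    """Return "ok"/"NXDOMAIN" only when both resolvers agree; otherwise retry."""
    for _ in range(max_attempts):
        results = []
        for resolver in RESOLVERS:
            try:
                response = requests.get(
                    resolver,
                    params={"name": domain, "type": "NS"},
                    headers={"accept": "application/dns-json"},
                )
                status = response.json().get("Status")
                results.append("NXDOMAIN" if status == 3 else "ok")
            except Exception:
                results.append("ERROR")
        # Only accept a value that both resolvers returned without error.
        if "ERROR" not in results and results[0] == results[1]:
            return results[0]
    return "ERROR"
```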