diff --git a/README.md b/README.md
index 1ca0075..9aeff91 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@

-Static Badge Static Badge
+Static Badge Static Badge Static Badge Static Badge

@@ -78,6 +78,11 @@ DPULSE is a software solution for conducting OSINT research in relation to a cer
    - SecurityTrails API (deep subdomains and DNS enumeration)
    - HudsonRock API (for querying a database with exposed computers which were compromised through global info-stealer campaigns)
+5. ***Web pages snapshotting:*** extended functionality which allows you to save copies of web pages in several forms:
+   - Screenshot snapshotting (saves the target domain's page as a screenshot)
+   - HTML snapshotting (saves the target domain's page as an HTML file)
+   - Wayback Machine snapshotting (saves every version of the target domain's page within a user-defined time period)
+
 Finally, DPULSE compiles all found data into an easy-to-read HTML or XLSX report by category. It also saves all information about the scan in a local report storage database, which can be restored later.
 
 # How to install and run DPULSE
@@ -166,7 +171,7 @@ If you have problems with starting installer.sh, you should try to use `dos2unix
 # Tasks to complete before new release
 - [x] Add web pages snapshoting (with screenshots)
 - [x] Add web pages snapshoting (with web pages copying as HTML objects)
-- [ ] Add web pages snapshoting (with Wayback Machine)
+- [x] Add web pages snapshoting (with Wayback Machine)
 
 # DPULSE mentions in social medias
 
@@ -176,6 +181,8 @@ If you have problems with starting installer.sh, you should try to use `dos2unix
 ### [The very first mention from cybercrime intelligence company (HudsonRock)](https://www.linkedin.com/feed/update/urn:li:share:7294336938495385600/)
 
+### [The very first mention on a cybersecurity educational website (Ethical Hackers Academy)](https://ethicalhacksacademy.com/blogs/cyber-security-tools/dpulse)
+
 ## X.com mentions:
 
 ### [by @DarkWebInformer](https://x.com/DarkWebInformer/status/1787583156775759915?t=Ak1W9ddUPpDvLAkVyQG8fQ&s=19)
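The three snapshotting modes described in the README map onto single-letter flags inside the scan pipeline. A condensed sketch of that dispatch, assuming the snapshotting modules are on `sys.path` as `data_assembler.py` arranges (the wrapper function itself is hypothetical; the helper calls are taken verbatim from the diffs below):

```python
# Condensed sketch of the snapshotting dispatch this PR wires up; the real
# logic lives inside DataProcessing.data_gathering() in data_assembler.py.
from screen_snapshotting import take_screenshot
from html_snapshotting import save_page_as_html
from archive_snapshotting import download_snapshot

def run_snapshotting(flag, installed_browser, url, short_domain, report_folder, from_date, end_date):
    if flag.lower() == 's':    # screenshot of the target's main page
        take_screenshot(installed_browser, url, report_folder + '//screensnapshot.png')
    elif flag.lower() == 'p':  # single HTML copy of the main page
        save_page_as_html(url, report_folder + '//domain_html_copy.html')
    elif flag.lower() == 'w':  # every archived version within [from_date, end_date]
        download_snapshot(short_domain, from_date, end_date, report_folder)
```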
diff --git a/datagather_modules/data_assembler.py b/datagather_modules/data_assembler.py
index 1822976..8d97c25 100644
--- a/datagather_modules/data_assembler.py
+++ b/datagather_modules/data_assembler.py
@@ -16,6 +16,7 @@
 from screen_snapshotting import take_screenshot
 from config_processing import read_config
 from html_snapshotting import save_page_as_html
+from archive_snapshotting import download_snapshot
 
 try:
     import requests
@@ -72,7 +73,7 @@ def report_preprocessing(self, short_domain, report_file_type):
         os.makedirs(report_folder, exist_ok=True)
         return casename, db_casename, db_creation_date, robots_filepath, sitemap_filepath, sitemap_links_filepath, report_file_type, report_folder, files_ctime, report_ctime
 
-    def data_gathering(self, short_domain, url, report_file_type, pagesearch_flag, keywords, keywords_flag, dorking_flag, used_api_flag, snapshotting_flag, username):
+    def data_gathering(self, short_domain, url, report_file_type, pagesearch_flag, keywords, keywords_flag, dorking_flag, used_api_flag, snapshotting_flag, username, from_date, end_date):
         casename, db_casename, db_creation_date, robots_filepath, sitemap_filepath, sitemap_links_filepath, report_file_type, report_folder, ctime, report_ctime = self.report_preprocessing(short_domain, report_file_type)
         logging.info(f'### THIS LOG PART FOR {casename} CASE, TIME: {ctime} STARTS HERE')
         print(Fore.GREEN + "Started scanning domain" + Style.RESET_ALL)
@@ -186,6 +187,8 @@ def data_gathering(self, short_domain, url, report_file_type, pagesearch_flag, k
                 take_screenshot(installed_browser, url, report_folder + '//screensnapshot.png')
             elif snapshotting_flag.lower() == 'p':
                 save_page_as_html(url, report_folder + '//domain_html_copy.html')
+            elif snapshotting_flag.lower() == 'w':
+                download_snapshot(short_domain, from_date, end_date, report_folder)
             print(Fore.LIGHTMAGENTA_EX + f"\n[EXTENDED SCAN END: PAGE SNAPSHOTTING]\n" + Style.RESET_ALL)
         else:
             pass
@@ -269,6 +272,8 @@ def data_gathering(self, short_domain, url, report_file_type, pagesearch_flag, k
                 take_screenshot(installed_browser, url, report_folder + '//screensnapshot.png')
             elif snapshotting_flag.lower() == 'p':
                 save_page_as_html(url, report_folder + '//domain_html_copy.html')
+            elif snapshotting_flag.lower() == 'w':
+                download_snapshot(short_domain, from_date, end_date, report_folder)
             print(Fore.LIGHTMAGENTA_EX + f"\n[EXTENDED SCAN END: PAGE SNAPSHOTTING]\n" + Style.RESET_ALL)
         else:
             pass
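`data_gathering()` gains two trailing parameters, `from_date` and `end_date`. They are only meaningful for the `'w'` (Wayback) mode; for the other modes `dpulse.py` passes the sentinel string `'N'`. A hypothetical direct call with illustrative values (the shape of `used_api_flag` is an assumption, it is set elsewhere in `dpulse.py`):

```python
# Hypothetical direct call; inside DPULSE these values come from the CLI prompts.
data_array, report_info_array = data_processing.data_gathering(
    'example.com',           # short_domain
    'http://example.com/',   # url
    'html',                  # report_file_type
    'n',                     # pagesearch_flag
    '',                      # keywords
    'n',                     # keywords_flag
    'n',                     # dorking_flag
    [],                      # used_api_flag (illustrative placeholder)
    'w',                     # snapshotting_flag: Wayback Machine mode
    'analyst',               # username
    '20240101',              # from_date (YYYYMMDD); 'N' when mode is not 'w'
    '20241231',              # end_date  (YYYYMMDD); 'N' when mode is not 'w'
)
```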
diff --git a/dpulse.py b/dpulse.py
index 63a8353..3760100 100644
--- a/dpulse.py
+++ b/dpulse.py
@@ -53,7 +53,7 @@
 cli = cli_init.Menu()
 cli.welcome_menu()
 
-def process_report(report_filetype, short_domain, url, case_comment, keywords_list, keywords_flag, dorking_flag, used_api_flag, pagesearch_flag, pagesearch_ui_mark, spinner_thread, snapshotting_flag, snapshotting_ui_mark, username):
+def process_report(report_filetype, short_domain, url, case_comment, keywords_list, keywords_flag, dorking_flag, used_api_flag, pagesearch_flag, pagesearch_ui_mark, spinner_thread, snapshotting_flag, snapshotting_ui_mark, username, from_date, end_date):
     import xlsx_report_creation as xlsx_rc
     import html_report_creation as html_rc
     from misc import time_processing
@@ -61,9 +61,9 @@ def process_report(report_filetype, short_domain, url, case_comment, keywords_li
     try:
         start = time()
         if pagesearch_flag in ['y', 'si']:
-            data_array, report_info_array = data_processing.data_gathering(short_domain, url, report_filetype.lower(), pagesearch_flag.lower(), keywords_list, keywords_flag, dorking_flag.lower(), used_api_flag, snapshotting_flag, username)
+            data_array, report_info_array = data_processing.data_gathering(short_domain, url, report_filetype.lower(), pagesearch_flag.lower(), keywords_list, keywords_flag, dorking_flag.lower(), used_api_flag, snapshotting_flag, username, from_date, end_date)
         else:
-            data_array, report_info_array = data_processing.data_gathering(short_domain, url, report_filetype.lower(), pagesearch_flag.lower(), '', keywords_flag, dorking_flag.lower(), used_api_flag, snapshotting_flag, username)
+            data_array, report_info_array = data_processing.data_gathering(short_domain, url, report_filetype.lower(), pagesearch_flag.lower(), '', keywords_flag, dorking_flag.lower(), used_api_flag, snapshotting_flag, username, from_date, end_date)
         end = time() - start
         endtime_string = time_processing(end)
@@ -164,7 +164,7 @@
             else:
                 print(Fore.RED + "\nInvalid API usage mode" + Style.RESET_ALL)
                 break
-        snapshotting_flag = input(Fore.YELLOW + "Select Snapshotting mode [S(creenshot)/P(age Copy)/N (for None)] >> ")
+        snapshotting_flag = input(Fore.YELLOW + "Select Snapshotting mode [S(creenshot)/P(age Copy)/W(ayback Machine)/N (for None)] >> ")
         if pagesearch_flag.lower() == 'y' or pagesearch_flag.lower() == 'n':
             if pagesearch_flag.lower() == "n":
                 pagesearch_ui_mark = 'No'
@@ -196,11 +196,16 @@
                 break
         else:
             snapshotting_ui_mark = 'No'
+            from_date = end_date = 'N'
        if snapshotting_flag.lower() == 's':
+            from_date = end_date = 'N'
            snapshotting_ui_mark = "Yes, domain's main page snapshotting as a screenshot"
        elif snapshotting_flag.lower() == 'p':
+            from_date = end_date = 'N'
            snapshotting_ui_mark = "Yes, domain's main page snapshotting as a .HTML file"
        elif snapshotting_flag.lower() == 'w': # not supported at the moment
+            from_date = str(input('Enter start date (YYYYMMDD format): '))
+            end_date = str(input('Enter end date (YYYYMMDD format): '))
            snapshotting_ui_mark = "Yes, domain's main page snapshotting using Wayback Machine"
        cli_init.print_prescan_summary(short_domain, report_filetype.upper(), pagesearch_ui_mark, dorking_ui_mark, used_api_ui, case_comment, snapshotting_ui_mark)
        print(Fore.LIGHTMAGENTA_EX + "[BASIC SCAN START]\n" + Style.RESET_ALL)
@@ -209,7 +214,7 @@
         if report_filetype.lower() in ['html', 'xlsx']:
             process_report(report_filetype, short_domain, url, case_comment, keywords_list, keywords_flag, dorking_flag, used_api_flag,
-                           pagesearch_flag, pagesearch_ui_mark, spinner_thread, snapshotting_flag, snapshotting_ui_mark, username)
+                           pagesearch_flag, pagesearch_ui_mark, spinner_thread, snapshotting_flag, snapshotting_ui_mark, username, from_date, end_date)
         else:
             print(Fore.RED + "\nUnsupported PageSearch mode. Please choose between Y or N")
diff --git a/service/cli_init.py b/service/cli_init.py
index 0431ae3..c0a48ff 100644
--- a/service/cli_init.py
+++ b/service/cli_init.py
@@ -20,7 +20,7 @@ def welcome_menu(self):
         fig = Figlet(font=wm_font)
         print('\n')
         self.console.print(fig.renderText('DPULSE'), style=preview_style)
-        print(Fore.MAGENTA + Style.BRIGHT + '[DPULSE-CLI] - [v1.2.2 stable] - [OSINT-TECHNOLOGIES]\n' + Style.RESET_ALL)
+        print(Fore.MAGENTA + Style.BRIGHT + '[DPULSE-CLI] - [v1.2.3 stable] - [OSINT-TECHNOLOGIES]\n' + Style.RESET_ALL)
         print(Fore.MAGENTA + Style.BRIGHT + '[Visit our pages]\nGitHub repository: https://github.com/OSINT-TECHNOLOGIES\nPyPi page: https://pypi.org/project/dpulse/\nDocumentation: https://dpulse.readthedocs.io' + Style.RESET_ALL)
 
     def print_main_menu(self):
diff --git a/service/config_processing.py b/service/config_processing.py
index c7a5d11..49e0fa1 100644
--- a/service/config_processing.py
+++ b/service/config_processing.py
@@ -27,10 +27,11 @@ def create_config():
     ]
 
     config = configparser.ConfigParser()
+    config['HTML_REPORTING'] = {'template': 'default'}
     config['LOGGING'] = {'log_level': 'info'}
     config['CLI VISUAL'] = {'preview_color': 'red', 'font': 'slant'}
     config['DORKING'] = {'dorking_delay (secs)': '2', 'delay_step': '5'}
-    config['SNAPSHOTTING'] = {'installed_browser': 'firefox', 'opera_browser_path': 'None'}
+    config['SNAPSHOTTING'] = {'installed_browser': 'firefox', 'opera_browser_path': 'None', 'wayback_retries': '3', 'wayback_req_pause': '2'}
     config['USER-AGENTS'] = {}
     for i, agent in enumerate(basic_user_agents):
         config['USER-AGENTS'][f'agent_{i + 1}'] = agent
@@ -56,6 +57,9 @@ def read_config():
     proxies_file_path = config.get('PROXIES', 'proxies_file_path')
     installed_browser = config.get('SNAPSHOTTING', 'installed_browser')
     opera_browser_path = config.get('SNAPSHOTTING', 'opera_browser_path')
+    wayback_retries_amount = config.get('SNAPSHOTTING', 'wayback_retries')
+    wayback_requests_pause = config.get('SNAPSHOTTING', 'wayback_req_pause')
+    html_report_template = config.get('HTML_REPORTING', 'template')
 
     config_values = {
@@ -67,7 +71,10 @@
         'user_agents': user_agents,
         'proxies_file_path': proxies_file_path,
         'installed_browser': installed_browser,
-        'opera_browser_path': opera_browser_path
+        'opera_browser_path': opera_browser_path,
+        'wayback_retries_amount': wayback_retries_amount,
+        'wayback_requests_pause': wayback_requests_pause,
+        'template': html_report_template
     }
 
     return config_values
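The two new `SNAPSHOTTING` keys bound the Wayback download loop (retry count and pause between requests), and the new `HTML_REPORTING` section selects a report template. A minimal sketch of reading them back with `configparser`, mirroring `read_config()` above (the config path is an assumption):

```python
import configparser

# Minimal sketch mirroring read_config(); 'service/config.ini' is an assumed path.
config = configparser.ConfigParser()
config.read('service/config.ini')

wayback_retries = int(config.get('SNAPSHOTTING', 'wayback_retries'))      # created with default '3'
wayback_req_pause = int(config.get('SNAPSHOTTING', 'wayback_req_pause'))  # created with default '2'
html_template = config.get('HTML_REPORTING', 'template')                  # created with default 'default'
```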
diff --git a/service/pdf_report_templates/compromise_report_template.html b/service/pdf_report_templates/compromise_report_template.html
new file mode 100644
index 0000000..0b7b26b
--- /dev/null
+++ b/service/pdf_report_templates/compromise_report_template.html
@@ -0,0 +1,120 @@
+OPEN SOURCE RESEARCH REPORT
+Organization: {{org}}
+
+TABLE OF CONTENTS
+1. General scan information
+2. WHOIS information
+3. Social media links
+4. Subdomains information
+5. DNS & SSL information
+6. Services & frameworks
+7. Basic pre-pentest information
+8. Dorking scan info
+9. PageSearch results
+10. API scan results
+
+GENERAL SCAN INFO
+Total subdomains: {{a_tsf}}
+Social media links: {{a_tsm}}
+Robots.txt: {{robots_txt_result}}
+Sitemap.xml: {{sitemap_xml_result}}
+Dorking: {{dorking_status}}
+
+WHOIS INFORMATION
+Domain: {{sh_domain}} URL: {{full_url}}
+IP: {{ip_address}}
+Registrar: {{registrar}}
+Dates: {{creation_date}} → {{expiration_date}}
+
+SOCIAL MEDIA
+Facebook:
+{% for l in fb_links %}⇒ {{ l }}{% endfor %}
+Twitter/X:
+{% for l in tw_links+xcom_links %}⇒ {{ l }}{% endfor %}
+Instagram:
+{% for l in inst_links %}⇒ {{ l }}{% endfor %}
+
+SUBDOMAINS
+Found subdomains:
+{% for sd in subdomains %}⇒ {{ sd }}{% endfor %}
+IPs:
+{% for sdip in subdomain_ip %}⇒ {{ sdip }}{% endfor %}
+
+DNS/SSL
+NS: {{name_servers}}
+MX: {{mx_records}}
+SSL Issuer: {{issuer}}
+NotBefore: {{notBefore}}
+NotAfter: {{notAfter}}
+
+SERVICES
+Web servers:
+{% for ws in web_servers %}⇒ {{ ws }}{% endfor %}
+CMS:
+{% for cm in cms %}⇒ {{ cm }}{% endfor %}
+Languages:
+{% for pl in programming_languages %}⇒ {{ pl }}{% endfor %}
+
+BASIC PRE-PENTEST
+Open ports:
+{% for op in ports %}⇒ {{ op }}{% endfor %}
+Vulnerabilities:
+{% for vuln in vulns %}⇒ {{ vuln }}{% endfor %}
+
+DORKING SCAN
+{{ add_dsi | safe }}
+
+PAGESEARCH
+Subdomains: {{ps_s}}
+Emails: {{ps_e}}
+Documents: {{ps_f}}
+…
+
+VIRUSTOTAL
+{{ virustotal_output }}
+SECURITYTRAILS
+{{ securitytrails_output }}
+HUDSONROCK
+{{ hudsonrock_output }}
+
+Created by DPULSE (OSINT-TECHNOLOGIES)
+GitHub | PyPI
\ No newline at end of file
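The report templates are Jinja2 files; their HTML markup was stripped in this diff view, leaving the text content and the `{{ }}`/`{% %}` expressions. A minimal rendering sketch under that assumption, with illustrative context values only (DPULSE's real renderer lives in `html_report_creation.py`, which this diff does not touch):

```python
# Minimal Jinja2 rendering sketch; all context values below are illustrative.
from jinja2 import Environment, FileSystemLoader

env = Environment(loader=FileSystemLoader('service/pdf_report_templates'))
template = env.get_template('compromise_report_template.html')
html = template.render(
    org='Example Org', sh_domain='example.com', full_url='http://example.com/',
    ip_address='93.184.216.34',
    subdomains=['mail.example.com'], subdomain_ip=['93.184.216.35'],
    # every {% for %} variable needs at least an empty list;
    # plain {{ }} names may be omitted (they render as empty strings)
    fb_links=[], tw_links=[], xcom_links=[], inst_links=[],
    web_servers=[], cms=[], programming_languages=[], ports=[], vulns=[],
)
with open('report.html', 'w', encoding='utf-8') as f:
    f.write(html)
```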

diff --git a/service/pdf_report_templates/monospaced_report_template.html b/service/pdf_report_templates/monospaced_report_template.html
new file mode 100644
index 0000000..e3521a4
--- /dev/null
+++ b/service/pdf_report_templates/monospaced_report_template.html
@@ -0,0 +1,174 @@
+OPEN SOURCE RESEARCH REPORT
+Organization: {{org}}
+
+TABLE OF CONTENTS
+1. General info
+2. WHOIS
+3. Social media
+4. Subdomains
+5. DNS/SSL
+6. Services
+7. Pre-pentest
+8. Dorking
+9. PageSearch
+10. APIs
+
+GENERAL SCAN INFO
+Subdomains:   {{a_tsf}}
+Social:       {{a_tsm}}
+Robots.txt:   {{robots_txt_result}}
+Sitemap.xml:  {{sitemap_xml_result}}
+Sitemap links: {{sitemap_links}}
+Dorking:      {{dorking_status}}
+PageSearch:   {{pagesearch_ui_mark}}
+Snapshotting: {{snapshotting_ui_mark}}
+Report time:  {{ctime}}
+
+WHOIS INFORMATION
+Domain:     {{sh_domain}}
+URL:        {{full_url}}
+IP:         {{ip_address}}
+Registrar:  {{registrar}}
+Created:    {{creation_date}}
+Expires:    {{expiration_date}}
+Emails:     {{mails}}
+
+SOCIAL MEDIA SEARCH RESULTS
+FACEBOOK:
+{% for link in fb_links %}⇒ {{ link }}{% endfor %}
+TWITTER/X:
+{% for link in tw_links+xcom_links %}⇒ {{ link }}{% endfor %}
+INSTAGRAM:
+{% for link in inst_links %}⇒ {{ link }}{% endfor %}
+TELEGRAM:
+{% for link in tg_links %}⇒ {{ link }}{% endfor %}
+TIKTOK:
+{% for link in tt_links %}⇒ {{ link }}{% endfor %}
+LINKEDIN:
+{% for link in li_links %}⇒ {{ link }}{% endfor %}
+VKONTAKTE:
+{% for link in vk_links %}⇒ {{ link }}{% endfor %}
+YOUTUBE:
+{% for link in yt_links %}⇒ {{ link }}{% endfor %}
+ODNOKLASSNIKI:
+{% for link in ok_links %}⇒ {{ link }}{% endfor %}
+WECHAT:
+{% for link in wc_links %}⇒ {{ link }}{% endfor %}
+
+SUBDOMAINS ANALYSIS RESULTS
+FOUND SUBDOMAINS:
+{% for sd in subdomains %}⇒ {{ sd }}{% endfor %}
+IPs:
+{% for sdip in subdomain_ip %}⇒ {{ sdip }}{% endfor %}
+Emails:
+{% for smails in subdomain_mails %}⇒ {{ smails }}{% endfor %}
+
+DNS & SSL INFORMATION
+NAME SERVERS: {{name_servers}}
+MX RECORDS:   {{mx_records}}
+SSL ISSUER:   {{issuer}}
+SUBJECT:      {{subject}}
+NOT BEFORE:   {{notBefore}}
+NOT AFTER:    {{notAfter}}
+COMMON NAME:  {{commonName}}
+SERIAL:       {{serialNumber}}
+
+SERVICES & FRAMEWORKS INFORMATION
+WEB SERVERS:
+{% for ws in web_servers %}⇒ {{ ws }}{% endfor %}
+CMS:
+{% for cm in cms %}⇒ {{ cm }}{% endfor %}
+PROGRAMMING LANGUAGES:
+{% for pl in programming_languages %}⇒ {{ pl }}{% endfor %}
+WEB FRAMEWORKS:
+{% for wf in web_frameworks %}⇒ {{ wf }}{% endfor %}
+ANALYTICS:
+{% for analytic in analytics %}⇒ {{ analytic }}{% endfor %}
+JS FRAMEWORKS:
+{% for jsf in javascript_frameworks %}⇒ {{ jsf }}{% endfor %}
+TAGS:
+{% for tag in tags %}⇒ {{ tag }}{% endfor %}
+CPE:
+{% for cpe in cpes %}⇒ {{ cpe }}{% endfor %}
+
+BASIC PRE-PENTEST INFORMATION
+OPEN PORTS:
+{% for op in ports %}⇒ {{ op }}{% endfor %}
+HOSTNAMES:
+{% for hn in hostnames %}⇒ {{ hn }}{% endfor %}
+POTENTIAL VULNERABILITIES:
+{% for vuln in vulns %}⇒ {{ vuln }}{% endfor %}
+
+DORKING SCAN INFO
+{{ add_dsi | safe }}
+
+PAGESEARCH RESULTS
+SUBDOMAINS FOUND: {{ps_s}}
+EMAILS FOUND:     {{ps_e}}
+DOCUMENTS:        {{ps_f}}
+COOKIES:          {{ps_c}}
+API KEYS:         {{ps_a}}
+WEB ELEMENTS:     {{ps_w}}
+PASSWORDS:        {{ps_p}}
+
+VIRUSTOTAL API SCAN RESULTS
+{{ virustotal_output }}
+
+SECURITYTRAILS API SCAN RESULTS
+{{ securitytrails_output }}
+
+HUDSONROCK API SCAN RESULTS
+{{ hudsonrock_output }}
+
+Created by DPULSE (OSINT-TECHNOLOGIES)
+GitHub:  https://github.com/OSINT-TECHNOLOGIES
+PyPI:    https://pypi.org/project/dpulse/
\ No newline at end of file
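Which of the three template files gets used is governed by the new `template` key in `HTML_REPORTING`. The selection logic itself lives in `html_report_creation.py` and is not part of this diff; a plausible sketch of the mapping (the fallback choice is an assumption):

```python
# Plausible sketch only; the real wiring is in html_report_creation.py (not in this diff).
TEMPLATE_FILES = {
    'compromise': 'compromise_report_template.html',
    'monospaced': 'monospaced_report_template.html',
    'paragraph': 'paragraph_report_template.html',
}

def resolve_template(html_report_template):
    # Which concrete file the generated 'default' value maps to in DPULSE
    # is an assumption; the paragraph layout is used as the fallback here.
    return TEMPLATE_FILES.get(html_report_template, 'paragraph_report_template.html')
```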

diff --git a/service/pdf_report_templates/paragraph_report_template.html b/service/pdf_report_templates/paragraph_report_template.html
new file mode 100644
index 0000000..cf966ab
--- /dev/null
+++ b/service/pdf_report_templates/paragraph_report_template.html
@@ -0,0 +1,154 @@
+Open Source Research Report
+{{org}}
+
+Table of Contents
+1. General scan information
+2. WHOIS information
+3. Social media links
+4. Subdomains information
+5. DNS & SSL information
+6. Services & frameworks
+7. Basic pre-pentest information
+8. Dorking scan info
+9. PageSearch results
+10. API scan results
+
+GENERAL SCAN INFO
+Total subdomains: {{a_tsf}}
+Total social media links: {{a_tsm}}
+Status of robots.txt: {{robots_txt_result}}
+Status of sitemap.xml: {{sitemap_xml_result}}
+Status of sitemap links: {{sitemap_links}}
+Google Dorking: {{dorking_status}}
+PageSearch: {{pagesearch_ui_mark}}
+Snapshotting: {{snapshotting_ui_mark}}
+Report time: {{ctime}}
+
+WHOIS INFORMATION
+Domain: {{sh_domain}}
+Full URL: {{full_url}}
+IP address: {{ip_address}}
+Registrar: {{registrar}}
+Creation date: {{creation_date}}
+Expiration date: {{expiration_date}}
+Organization name: {{org}}
+Contact e-mails: {{mails}}
+
+SOCIAL MEDIA SEARCH RESULTS
+FACEBOOK:
+TWITTER (+ X.com):
+INSTAGRAM:
+TELEGRAM:
+TIKTOK:
+LINKEDIN:
+VKONTAKTE:
+YOUTUBE:
+ODNOKLASSNIKI:
+WECHAT:
+
+SUBDOMAINS ANALYSIS RESULTS
+Found subdomains:
+Subdomains IP addresses:
+Subdomains e-mails:
+
+DNS & SSL INFORMATION
+(DNS) Name servers: {{name_servers}}
+(DNS) MX addresses: {{mx_records}}
+(SSL) Issuer: {{issuer}}
+(SSL) Subject: {{subject}}
+(SSL) Not before: {{notBefore}}
+(SSL) Not after: {{notAfter}}
+(SSL) Certificate name: {{commonName}}
+(SSL) Certificate serial number: {{serialNumber}}
+
+SERVICES & FRAMEWORKS INFORMATION
+Web servers:
+CMS:
+Used programming languages:
+Used web frameworks:
+Analytics services:
+Used JavaScript frameworks:
+Tags:
+Common Platform Enumeration:
+
+BASIC PRE-PENTEST INFORMATION
+Open ports:
+Hostnames:
+Potential vulnerabilities:
+
+DORKING SCAN INFO
+{{ add_dsi | safe }}
+
+PAGESEARCH RESULTS
+Amount of accessible subdomains: {{ps_s}}
+Amount of email addresses: {{ps_e}}
+Amount of found documents: {{ps_f}}
+Amount of found cookies: {{ps_c}}
+Amount of found API keys: {{ps_a}}
+Amount of web elements found: {{ps_w}}
+Amount of exposed passwords found: {{ps_p}}
+
+VIRUSTOTAL API SCAN RESULTS
+{{ virustotal_output }}
+
+SECURITYTRAILS API SCAN RESULTS
+{{ securitytrails_output }}
+
+HUDSONROCK API SCAN RESULTS
+{{ hudsonrock_output }}
+
+Created by DPULSE (OSINT-TECHNOLOGIES)
\ No newline at end of file
diff --git a/snapshotting/archive_snapshotting.py b/snapshotting/archive_snapshotting.py
new file mode 100644
index 0000000..03ee561
--- /dev/null
+++ b/snapshotting/archive_snapshotting.py
@@ -0,0 +1,70 @@
+import requests
+import os
+import time
+from colorama import Fore, Style
+import sys
+from config_processing import read_config
+
+sys.path.append('service')
+CDX_API = "http://web.archive.org/cdx/search/cdx"
+
+def get_values_from_config():
+    config_values = read_config()
+    retries = int(config_values['wayback_retries_amount'])
+    pause_between_requests = int(config_values['wayback_requests_pause'])
+    return retries, pause_between_requests
+
+def get_snapshots(url, from_date, to_date):
+    params = {
+        "url": url,
+        "from": from_date,
+        "to": to_date,
+        "output": "json",
+        "fl": "timestamp,original,mime",
+        "filter": "statuscode:200",
+        "collapse": "digest"
+    }
+    print(Fore.GREEN + f"Sending request to Wayback CDX API for {url}, period: {from_date} - {to_date}..." + Style.RESET_ALL)
+    response = requests.get(CDX_API, params=params)
+    response.raise_for_status()
+    data = response.json()
+    return data[1:]
+
+def snapshot_enum(snapshot_storage_folder, timestamp, original_url, index):
+    retries, _ = get_values_from_config()
+    archive_url = f"https://web.archive.org/web/{timestamp}id_/{original_url}"
+    for attempt in range(1, retries + 1):
+        try:
+            response = requests.get(archive_url, timeout=15)
+            response.raise_for_status()
+            filename = f"{index}_{timestamp}.html"
+            filepath = os.path.join(snapshot_storage_folder, filename)
+            with open(filepath, "w", encoding="utf-8") as f:
+                f.write(response.text)
+            print(Fore.GREEN + f"[{index}] Downloaded: " + Fore.LIGHTCYAN_EX + Style.BRIGHT + f"{archive_url}" + Style.RESET_ALL)
+            return True
+        except Exception as e:
+            print(Fore.RED + f"[{index}] Attempt {attempt}/{retries} failed for {archive_url}. Retrying..." + Style.RESET_ALL)
+            time.sleep(2)
+    print(Fore.RED + f"[{index}] Failed to download after {retries} attempts: {archive_url}" + Style.RESET_ALL)
+    return False
+
+def download_snapshot(short_domain, from_date, end_date, report_folder):
+    _, pause_between_requests = get_values_from_config()
+    snapshot_storage_folder = report_folder + '//wayback_snapshots'
+    os.makedirs(snapshot_storage_folder, exist_ok=True)
+    snapshots = get_snapshots(short_domain, from_date, end_date)
+    print(Fore.GREEN + "Total snapshots found:" + Style.RESET_ALL + Fore.LIGHTCYAN_EX + Style.BRIGHT + f" {len(snapshots)}" + Style.RESET_ALL)
+    html_snapshots = [
+        s for s in snapshots
+        if len(s) >= 2 and (
+            s[1].endswith(".html") or s[1].endswith("/") or s[1] == short_domain)
+    ]
+    print(Fore.GREEN + "HTML snapshots to download:" + Style.RESET_ALL + Fore.LIGHTCYAN_EX + Style.BRIGHT + f" {len(html_snapshots)}\n" + Style.RESET_ALL)
+    if not html_snapshots:
+        print(Fore.RED + "No HTML snapshots available for download." + Style.RESET_ALL)
+        return
+    for i, (timestamp, original_url, *_) in enumerate(html_snapshots):
+        snapshot_enum(snapshot_storage_folder, timestamp, original_url, i)
+        time.sleep(pause_between_requests)
+    print(Fore.GREEN + "\nFinished downloading HTML snapshots" + Style.RESET_ALL)
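For reference, the Wayback CDX endpoint returns a JSON array whose first row is a header (hence the `data[1:]` in `get_snapshots()`), and `collapse=digest` deduplicates identical captures. A standalone usage sketch of the new module, with illustrative domain, dates, and folder:

```python
# Standalone usage sketch; domain, dates, and folder below are illustrative.
from archive_snapshotting import get_snapshots, download_snapshot

# Each row is [timestamp, original_url, mime] per the fl= parameter, e.g.
# ['20240315120000', 'http://example.com/', 'text/html'].
rows = get_snapshots('example.com', '20240101', '20241231')
for timestamp, original_url, mime in rows[:5]:
    print(timestamp, original_url, mime)

# Downloads every deduplicated HTML capture into <report_folder>/wayback_snapshots,
# retrying wayback_retries times per snapshot and sleeping wayback_req_pause
# seconds between snapshots (both values read from config.ini).
download_snapshot('example.com', '20240101', '20241231', 'reports/example.com')
```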