def is_valid_url(url, validate_only_http_scheme=True):
    """
    Validate a URL/endpoint.

    Args:
        url (str): The URL to validate.
        validate_only_http_scheme (bool): If True, only http:// and https://
            URLs are accepted. If False, any URL accepted by
            ``validators.url`` passes.

    Returns:
        bool: True if the URL is valid, False otherwise. Empty values,
        non-string values and strings containing any whitespace are
        always rejected.
    """
    # Nothing to validate: None, '' and non-string values are never URLs.
    if not url or not isinstance(url, str):
        return False

    # Reject every kind of whitespace (space, tab, newline, ...), not only a
    # plain space, so junk input never reaches the library check below.
    if any(char.isspace() for char in url):
        return False

    # Only the truthiness of the library result matters here (pass/fail).
    if not validators.url(url):
        return False

    if validate_only_http_scheme:
        # e.g. ftp:// can be a valid URL but may not be wanted for crawling.
        # URL schemes are case-insensitive, so compare on a lowercased copy.
        return url.lower().startswith(('http://', 'https://'))
    return True