Skip to content

Commit

Permalink
skip extensions
Browse files Browse the repository at this point in the history
  • Loading branch information
belane committed Aug 20, 2022
1 parent 65a7b00 commit 43685d7
Showing 1 changed file with 11 additions and 1 deletion.
12 changes: 11 additions & 1 deletion cloudhunter.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ class HiddenGems(object):

HTTP_TIMEOUT = 5
UA = { 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36" }
skip_extensions = ['exe','bin','pdf','zip','jpg','png','svg','avi','mp3','mp4','gz','tar','rar','7z','ttf','otf','woff','woff2']
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
known_urls = []

Expand Down Expand Up @@ -264,7 +265,7 @@ def crawl(self, deep=0, active_crawl=False):

if(deep > 0):
childs = self.filter_scope(list(set(self.urls['links'] + self.urls['forms'])))
childs = [x for x in childs if self.normalize_url(x)[-3::].lower() not in ['pdf','zip','jpg','png','avi','mp3','mp4','.gz','tar','rar','.7z']]
childs = [x for x in childs if self.url_extension(x) not in self.skip_extensions]
for child in childs:
if child not in self.known_urls:
self.childs.append(HiddenGems(child, deep=deep - 1, active_crawl=active_crawl))
Expand All @@ -275,6 +276,8 @@ def crawl_raw_urls(self, urls):
if url in self.known_urls:
continue
self.known_urls.append(url)
if self.url_extension(url) in self.skip_extensions:
continue
try:
r = requests.get(url, timeout=self.HTTP_TIMEOUT, verify=False, headers=self.UA)
result += self.extract_raw_links(r.text)
Expand Down Expand Up @@ -303,6 +306,13 @@ def normalize_url(self, src, full_query=False, base_url=None):
query = ''
return '{}://{}{}{}'.format(url.scheme, url.netloc, url.path, query)

def url_extension(self, url):
    """Return the lower-cased file extension of *url*'s path, or None.

    The URL is first passed through ``self.normalize_url`` and only the
    *path* component of the result is inspected.  Looking at the path
    (rather than the whole URL string) matters for two reasons:

    * a bare-host URL such as ``http://example.zip`` has no path, so the
      host's TLD must not be reported as a file extension;
    * dots inside a query string must not be mistaken for an extension.

    Returns:
        The text after the last ``.`` of the final path segment, lower
        cased (e.g. ``'gz'`` for ``/a/archive.tar.gz``), or ``None`` when
        the final segment contains no dot at all.  A segment ending in a
        dot yields an empty string.
    """
    # Local import keeps this method self-contained regardless of how the
    # rest of the module imports urllib.
    from urllib.parse import urlparse

    normalized = self.normalize_url(url)
    try:
        path = urlparse(normalized).path
    except ValueError:
        # Malformed netloc (e.g. broken IPv6 literal): fall back to
        # treating the whole string as the path instead of crashing.
        path = normalized

    filename = path.rsplit('/', 1)[-1]
    _, dot, extension = filename.rpartition('.')
    if not dot:
        return None
    return extension.lower()

def list_urls(self, scope=False, full_query=False):
result = []
for url_list in self.urls.values():
Expand Down

0 comments on commit 43685d7

Please sign in to comment.