Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added website parsing feature #6

Merged
merged 10 commits into from
Oct 2, 2020
32 changes: 24 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,35 @@ The syntax for the script is the following:
**release0_1 \[command] [arg]**

# Commands
url: pass a url as an argument to test for. This only tests the argument link, and is not recursive.

example: **release0_1 url https://www.youtube.com/watch?v=zILpjFqlOak**
## url

**url**: pass a url as an argument to test for. This only tests the argument link, and is not recursive.

example: **release0_1 url https://www.youtube.com/watch?v=zILpjFqlOak**

file: pass a file as an argument to test links inside. If the file has multiple links, it will test those as well.
**--l**: an option for the **url** command that allows you to search through a website for dead links given the url

example: **release0_1 url --l https://www.youtube.com/watch?v=zILpjFqlOak**

example: **release0_1 file test.html**
**--s**: an option for the **url** command, used together with **--l**, that also checks whether each http link found on the website works over https

example: **release0_1 url --l --s https://www.youtube.com/watch?v=zILpjFqlOak**

version: returns you the version of this code
## file

example: **release0_1 version**
**file**: pass a file as an argument to test links inside. If the file has multiple links, it will test those as well.

--help: gives you the lists of commands for help
example: **release0_1 file test.html**

example: **release0_1 --help**
**--s**: an option for the **file** command that also checks whether each http link found in the file works over https

example: **release0_1 file --s C:\Users\user1\Documents\file.html**

**version**: shows the version of this tool

example: **release0_1 version**

**--help**: shows the list of available commands

example: **release0_1 --help**
60 changes: 46 additions & 14 deletions release0_1.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,53 @@
import click
import urllib3
import re
# check this with Click documentation
from colorama import Fore
import sys


def basic_file_read(file, *args):
def get_urls(data, *args):
    """Find every link in *data* and check each one with ``test_request``.

    Args:
        data: Text to scan for links (file contents or a decoded web page).
        *args: Optional. When the first extra argument is truthy, every plain
            ``http://`` link is additionally re-tested with its scheme
            upgraded to ``https://``. Omitting it disables the re-test.
    """
    # Default to "no https re-test" instead of raising IndexError when the
    # caller passes no flag at all.
    s = args[0] if args else False
    # NOTE: inside the character class, "+-/" is a range (+ , - . /), which is
    # how "/" ends up being accepted. The pattern is kept unchanged on purpose.
    found_links = re.findall(r'https?:[a-zA-Z0-9_.+-/#~]+', data)
    for raw_link in found_links:
        q = raw_link.strip()
        test_request(q)
        # Only real http:// links are upgraded. The previous
        # re.match('(http)') / re.sub('(http)', 'https', q) pair also fired
        # for https links and rewrote every "http" occurrence in the URL,
        # producing broken "httpss://..." requests.
        if s and q.startswith('http://'):
            test_request('https://' + q[len('http://'):])


def website_read(q, s):
try:
file_data = open(file,'r',encoding="utf-8")
pattern = re.findall(r'https?:[a-zA-Z0-9_.+-/#~]+', file_data.read())
for l in pattern:
q = l.strip()
test_request(q)
if(s):
isHttp = re.match('(http)', q)
if(isHttp):
q = re.sub('(http)','https', q)
test_request(q)
h = urllib3.PoolManager()
response = h.request('GET', q, timeout=5.0)
except:
print("This is an invalid link, please try again")
else:
try:
get_urls(response.data.decode('ISO-8859-1'), s)
except:
print("An error has occurred when retrieving the website")


def basic_file_read(file, *args):
    """Read a local file and check every link found in its text.

    Args:
        file: Path of the file to scan; it is opened as UTF-8 text.
        *args: Forwarded unchanged to ``get_urls`` (the command in this file
            passes its ``--s`` flag as the single extra argument).

    A message is printed, and nothing is raised, when the file cannot be
    opened or read.
    """
    try:
        # The context manager closes the handle even if read() fails; the
        # previous version left the file open.
        with open(file, 'r', encoding="utf-8") as file_data:
            contents = file_data.read()
    except OSError:
        print("The file cannot be opened! Make sure this file can be read and is legit.")
        return
    # Link checking happens outside the try block so that an unrelated
    # OSError is never reported as a file-open failure.
    get_urls(contents, *args)


def test_request(q):
# accepts one url
# try adding more support for other html codes
try:
h = urllib3.PoolManager()
req = h.request('HEAD', q)
if(req.status == 200):
if (req.status == 200):
# this is not working with my machine. Perhaps check Click documentation and see what is going on with this
print(Fore.GREEN + f"{q} passes with {req.status}!")
elif (req.status == 403):
print(Fore.WHITE + f"{q} looks sus, it returned {req.status}")
Expand All @@ -34,22 +56,32 @@ def test_request(q):
except:
print("Unknown Error: " + str(sys.exc_info()[0]))


# Root click command group. The 'file', 'url' and 'version' subcommands are
# attached to it through @cli.command(...), and setup.py registers it as the
# `release0_1` console script (release0_1:cli).
# Comments are used instead of a docstring because click would display a
# docstring as the group's help text.
@click.group()
def cli():
    # The group does no work itself; it only dispatches to its subcommands.
    pass


# `file` subcommand: takes one positional FILE argument plus the optional
# boolean --s flag, and hands both to basic_file_read.
@cli.command('file')
@click.argument('file')
@click.option('--s', is_flag=True, default=False, help='Optional flag to check if https can be used instead of http')
def file_reader(file, s):
    """this reads URL links from a file!"""
    # `s` is forwarded positionally; basic_file_read receives it via *args.
    basic_file_read(file, s)


@cli.command('url')
@click.argument('url')
def url_reader(url):
@click.option('--l', is_flag=True, default=False, help='look through the given website recursively and check the webpage for '
'dead links')
@click.option('--s', is_flag=True, default=False, help='Optional flag to check if https can be used instead of http')
def url_reader(url, l, s):
"""this reads a URL that you pass as an argument!"""
test_request(str(url))
if l:
website_read(url, s)
else:
test_request(str(url))


@cli.command('version')
def version_check():
Expand Down
15 changes: 8 additions & 7 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from setuptools import setup

setup(
name = 'release0_1',
version = '0.1',
install_requires = ['Click','urllib3','colorama'],
py_modules = ['release0_1'],
name='release0_1',
version='0.1',
install_requires=['Click', 'urllib3', 'Colorama'],
py_modules=['release0_1'],

entry_points={
'console_scripts':
['release0_1=release0_1:cli']}
entry_points={
'console_scripts':
['release0_1=release0_1:cli']}
)