-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'master' of https://github.com/davidjeiel/lista-links
- Loading branch information
Showing
5 changed files
with
333 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
name: lista-links | ||
channels: | ||
- defaults | ||
- conda-forge | ||
dependencies: | ||
- python=3.9 | ||
- flask | ||
- requests | ||
- beautifulsoup4 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
name: Python Package using Conda | ||
|
||
on: [push] | ||
|
||
jobs: | ||
build-linux: | ||
runs-on: ubuntu-latest | ||
strategy: | ||
max-parallel: 5 | ||
|
||
steps: | ||
- uses: actions/checkout@v3 | ||
- name: Set up Python 3.10 | ||
uses: actions/setup-python@v3 | ||
with: | ||
python-version: '3.10' | ||
- name: Add conda to system path | ||
run: | | ||
# $CONDA is an environment variable pointing to the root of the miniconda directory | ||
echo $CONDA/bin >> $GITHUB_PATH | ||
- name: Install dependencies | ||
run: | | ||
conda env update --file .github/workflows/environment.yml --name base | ||
- name: Lint with flake8 | ||
run: | | ||
conda install flake8 | ||
# stop the build if there are Python syntax errors or undefined names | ||
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics | ||
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide | ||
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics | ||
- name: Test with pytest | ||
run: | | ||
conda install pytest | ||
pytest |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,169 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": { | ||
"colab_type": "text", | ||
"id": "view-in-github" | ||
}, | ||
"source": [ | ||
"<a href=\"https://colab.research.google.com/github/davidjeiel/lista-links/blob/master/ListaLinks.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"colab": { | ||
"base_uri": "https://localhost:8080/" | ||
}, | ||
"id": "mnHTZHIZJt3u", | ||
"outputId": "c42d3971-40e4-4176-ab1b-3c7622aaf7b5" | ||
}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"Processando links: 50%|█████ | 59/117 [03:45<04:01, 4.16s/link]" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"#@title Captura de links do site escolhido {display-mode: \"form\"}\n", | ||
"# This code will be hidden when the notebook is loaded.\n", | ||
"import requests\n", | ||
"import pandas as pd\n", | ||
"from bs4 import BeautifulSoup\n", | ||
"from urllib.parse import urlparse, urljoin, urldefrag\n", | ||
"from tqdm import tqdm\n", | ||
"\n", | ||
"def check_links(url):\n", | ||
" # Faz a requisição HTTP para obter o conteúdo da página\n", | ||
" response = requests.get(url)\n", | ||
" if response.status_code != 200:\n", | ||
" print(f\"Erro ao acessar {url}. Status de requisição: {response.status_code}\")\n", | ||
" return []\n", | ||
"\n", | ||
" # Analisa o HTML da página usando o BeautifulSoup\n", | ||
" soup = BeautifulSoup(response.content, 'html.parser')\n", | ||
"\n", | ||
" # Obtém o domínio da URL base\n", | ||
" parsed_url = urlparse(url)\n", | ||
" base_domain = parsed_url.netloc\n", | ||
"\n", | ||
" # Lista para armazenar os resultados\n", | ||
" links = []\n", | ||
"\n", | ||
" # Encontra todos os elementos <a> no HTML\n", | ||
" for link in tqdm(soup.find_all('a'), desc=\"Processando links\", unit=\"link\"):\n", | ||
" href = link.get('href')\n", | ||
"\n", | ||
" if href is None:\n", | ||
" continue\n", | ||
"\n", | ||
" # Verifica o tipo de link\n", | ||
" link_type = None\n", | ||
" if href.startswith('tel:'):\n", | ||
" link_type = 'tel'\n", | ||
" elif href.startswith('mailto:'):\n", | ||
" link_type = 'mailto'\n", | ||
" elif href.startswith('#'):\n", | ||
" link_type = 'anchor'\n", | ||
" elif href.startswith('/'):\n", | ||
" link_type = 'internal'\n", | ||
" else:\n", | ||
" link_type = 'external'\n", | ||
"\n", | ||
" # Resolve links relativos para links absolutos\n", | ||
" if href.startswith('/') or href.startswith('#'):\n", | ||
" href = urljoin(url, href)\n", | ||
"\n", | ||
" # Remove fragmento da URL (parte após #)\n", | ||
" href = urldefrag(href)[0]\n", | ||
"\n", | ||
" # Verifica se o link está no mesmo domínio ou é um link externo\n", | ||
" parsed_href = urlparse(href)\n", | ||
" if parsed_href.netloc == base_domain or parsed_href.netloc == '':\n", | ||
" # Ignora links do tipo \"tel\"\n", | ||
" if link_type == 'tel':\n", | ||
" continue\n", | ||
"\n", | ||
" # Faz a requisição HTTP para verificar o status do link\n", | ||
" link_response = requests.head(href, allow_redirects=True)\n", | ||
" link_status = link_response.status_code\n", | ||
" links.append({\n", | ||
" 'page': url,\n", | ||
" 'link': href,\n", | ||
" 'status': link_status,\n", | ||
" 'type': link_type\n", | ||
" })\n", | ||
"\n", | ||
" return links\n", | ||
"\n", | ||
"\n", | ||
"# Exemplo de uso\n", | ||
"url = 'https://fgts.gov.br' #@param\n", | ||
"pages = check_links(url)\n", | ||
"\n", | ||
"def check_links_on_pages(pages):\n", | ||
" all_links = []\n", | ||
"\n", | ||
" for page in tqdm(pages, desc=\"Verificando links na página: \", unit=\"página\"):\n", | ||
" page_links = check_links(page['link'])\n", | ||
" all_links.extend(page_links)\n", | ||
"\n", | ||
" return all_links\n", | ||
"\n", | ||
"result = check_links_on_pages(pages)\n", | ||
"\n", | ||
"print('Escolha o tipo de resposta')\n", | ||
"print('1 - Exibição em tela')\n", | ||
"print('2 - Arquivo de texto')\n", | ||
"print('3 - Arquivo csv')\n", | ||
"print('4 - Arquivo xlsx')\n", | ||
"\n", | ||
"tipo_resposta = \"Arquivo XLSX\" #@param ['Arquivo de texto', 'Arquivo CSV', 'Arquivo XLSX', 'Respostas em tela']\n", | ||
"#tipo_resposta = int(input())\n", | ||
"\n", | ||
"def switch(tipo_resposta):\n", | ||
" if tipo_resposta == 'Respostas em tela':\n", | ||
" # Exibição dos resultados em tela\n", | ||
" for link in result:\n", | ||
" print(link)\n", | ||
" elif tipo_resposta == 'Arquivo de texto':\n", | ||
" # Geração de arquivo TXT\n", | ||
" with open('resultados.txt', 'w') as file:\n", | ||
" for link in result:\n", | ||
" file.write(str(link) + '\\n')\n", | ||
" elif tipo_resposta == 'Arquivo CSV':\n", | ||
" # Geração de arquivo CSV\n", | ||
" df = pd.DataFrame(result)\n", | ||
" df.to_csv('resultados.csv', index=False)\n", | ||
" elif tipo_resposta == 'Arquivo XLSX':\n", | ||
" # Geração de arquivo Excel (xlsx)\n", | ||
" df = pd.DataFrame(result)\n", | ||
" df.to_excel('resultados.xlsx', index=False)\n", | ||
"\n", | ||
"\n", | ||
"switch(tipo_resposta)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"colab": { | ||
"authorship_tag": "ABX9TyP3lmfuQTtk/BgrsJwMDPdP", | ||
"include_colab_link": true, | ||
"provenance": [] | ||
}, | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"name": "python" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 0 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
#@title Captura de links do site escolhido {display-mode: "form"} | ||
# This code will be hidden when the notebook is loaded. | ||
import requests | ||
import pandas as pd | ||
from bs4 import BeautifulSoup | ||
from urllib.parse import urlparse, urljoin, urldefrag | ||
from tqdm import tqdm | ||
|
||
def _classify_link(href):
    """Label an href by its written form: 'anchor', 'internal' or 'external'."""
    if href.startswith('#'):
        return 'anchor'
    if href.startswith('/'):
        return 'internal'
    parts = urlparse(href)
    # A path-relative reference such as "page.html" has neither scheme nor
    # host, so it always points into the current site.
    if not parts.scheme and not parts.netloc:
        return 'internal'
    return 'external'


def check_links(url, timeout=10):
    """Collect the same-domain links of a page and probe each one.

    Downloads ``url``, extracts every ``<a href>``, resolves it against the
    page URL, drops the fragment and, for links that stay on the page's own
    host, issues a HEAD request to record the HTTP status.

    Links that cannot be requested over HTTP (``tel:``, ``mailto:``,
    ``javascript:`` ...) are skipped: handing them to ``requests`` raises
    ``InvalidSchema`` and would abort the whole crawl.

    Args:
        url: Absolute http(s) URL of the page to scan.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of dicts with the keys ``page``, ``link``, ``status`` and
        ``type``. ``status`` is ``None`` when the HEAD request itself failed
        (timeout, connection error, ...). The list is empty when the page
        cannot be downloaded or does not answer with status 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        print(f"Erro ao acessar {url}: {error}")
        return []
    if response.status_code != 200:
        print(f"Erro ao acessar {url}. Status de requisição: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    base_domain = urlparse(url).netloc

    links = []
    # The same target often appears many times on one page (menus, footers);
    # probe it once and reuse the status for every occurrence.
    status_by_link = {}

    for anchor in tqdm(soup.find_all('a'), desc="Processando links", unit="link"):
        href = anchor.get('href')
        if href is None:
            continue
        href = href.strip()

        try:
            link_type = _classify_link(href)
            # Resolve every reference (not only those starting with "/" or
            # "#") and drop the fragment, which is never sent to the server.
            target = urldefrag(urljoin(url, href))[0]
            parsed_target = urlparse(target)
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal): nothing to probe.
            continue

        if parsed_target.scheme not in ('http', 'https'):
            continue
        if parsed_target.netloc != base_domain:
            continue

        if target not in status_by_link:
            try:
                head = requests.head(target, allow_redirects=True, timeout=timeout)
                status_by_link[target] = head.status_code
            except requests.RequestException:
                status_by_link[target] = None

        links.append({
            'page': url,
            'link': target,
            'status': status_by_link[target],
            'type': link_type,
        })

    return links
|
||
|
||
def check_links_on_pages(pages):
    """Run ``check_links`` on every page entry and concatenate the results.

    Args:
        pages: Iterable of dicts; the ``'link'`` key of each one holds the
            URL of the page to scan.

    Returns:
        A single flat list with the link records of all pages, in the order
        the pages were visited.
    """
    collected = []
    progress = tqdm(pages, desc="Verificando links na página: ", unit="página")
    for entry in progress:
        collected += check_links(entry['link'])
    return collected
|
||
# Example usage: scan the start page, then every same-domain page it links to.
print('Por favor informe a URL iniciando com "http://" ou "https://"')
url = input().strip()
# requests rejects URLs without a scheme, so keep asking until one is given.
while not url.startswith(('http://', 'https://')):
    print('Por favor informe a URL iniciando com "http://" ou "https://"')
    url = input().strip()

pages = check_links(url)
result = check_links_on_pages(pages)

print('Escolha o tipo de resposta')
print('1 - Exibição em tela')
print('2 - Arquivo de texto')
print('3 - Arquivo csv')
print('4 - Arquivo xlsx')

# The crawl above can take minutes; a typo here must not throw its results
# away, so invalid answers are asked again instead of raising ValueError.
while True:
    try:
        tipo_resposta = int(input())
    except ValueError:
        tipo_resposta = None
    if tipo_resposta in (1, 2, 3, 4):
        break
    print('Opção inválida. Informe um número de 1 a 4.')
|
||
def switch(tipo_resposta, links=None):
    """Deliver the collected link records in the format chosen by the user.

    Args:
        tipo_resposta: Output option: 1 prints to the screen, 2 writes
            ``resultados.txt``, 3 writes ``resultados.csv`` and 4 writes
            ``resultados.xlsx``.
        links: Records to output. When omitted, the module-level ``result``
            list is used, which is how the script calls this function.

    Returns:
        None. Any other option only prints a warning.
    """
    if links is None:
        links = result

    if tipo_resposta == 1:
        # Show the results on screen.
        for link in links:
            print(link)
    elif tipo_resposta == 2:
        # Plain-text file; explicit UTF-8 so non-ASCII URLs never depend on
        # the platform's default encoding.
        with open('resultados.txt', 'w', encoding='utf-8') as file:
            file.writelines(str(link) + '\n' for link in links)
    elif tipo_resposta == 3:
        # CSV file.
        pd.DataFrame(links).to_csv('resultados.csv', index=False)
    elif tipo_resposta == 4:
        # Excel (xlsx) file.
        pd.DataFrame(links).to_excel('resultados.xlsx', index=False)
    else:
        # Tell the user instead of finishing without any output.
        print(f'Tipo de resposta inválido: {tipo_resposta}')
|
||
|
||
switch(tipo_resposta) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
requests==2.25.1 | ||
beautifulsoup4==4.9.3 | ||
pandas==1.2.2 | ||
openpyxl==3.0.6 | ||
tqdm==4.56.0 | ||
requests==2.31.0 | ||
beautifulsoup4==4.12.2 | ||
pandas==2.0.3 | ||
openpyxl==3.1.2 | ||
tqdm==4.65.0 |