Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
davidjeiel committed Jul 9, 2023
2 parents 5caae0a + 206f20c commit 1d19768
Show file tree
Hide file tree
Showing 5 changed files with 333 additions and 5 deletions.
9 changes: 9 additions & 0 deletions .github/workflows/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Conda environment for the link-checker CI job (consumed by python-package-conda.yml).
name: lista-links
channels:
- defaults
- conda-forge
dependencies:
- python=3.9  # NOTE(review): the workflow sets up Python 3.10 — confirm which version is intended
- flask
- requests
- beautifulsoup4
# Required by VerificaLinksQuebradosRotina.py (see also requirements.txt):
- pandas
- openpyxl
- tqdm
34 changes: 34 additions & 0 deletions .github/workflows/python-package-conda.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# CI: build the conda environment, lint with flake8, then run pytest.
name: Python Package using Conda

on: [push]

jobs:
  build-linux:
    runs-on: ubuntu-latest
    strategy:
      max-parallel: 5

    steps:
    - uses: actions/checkout@v3
    - name: Set up Python 3.10
      uses: actions/setup-python@v3
      with:
        # NOTE(review): environment.yml pins python=3.9, which conda will
        # install over this interpreter — confirm the intended version.
        python-version: '3.10'
    - name: Add conda to system path
      run: |
        # $CONDA is an environment variable pointing to the root of the miniconda directory
        echo $CONDA/bin >> $GITHUB_PATH
    - name: Install dependencies
      run: |
        conda env update --file .github/workflows/environment.yml --name base
    - name: Lint with flake8
      run: |
        conda install flake8
        # stop the build if there are Python syntax errors or undefined names
        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Test with pytest
      run: |
        conda install pytest
        pytest
169 changes: 169 additions & 0 deletions ListaLinks.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "view-in-github"
},
"source": [
"<a href=\"https://colab.research.google.com/github/davidjeiel/lista-links/blob/master/ListaLinks.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mnHTZHIZJt3u",
"outputId": "c42d3971-40e4-4176-ab1b-3c7622aaf7b5"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processando links: 50%|█████ | 59/117 [03:45<04:01, 4.16s/link]"
]
}
],
"source": [
"#@title Captura de links do site escolhido {display-mode: \"form\"}\n",
"# This code will be hidden when the notebook is loaded.\n",
"import requests\n",
"import pandas as pd\n",
"from bs4 import BeautifulSoup\n",
"from urllib.parse import urlparse, urljoin, urldefrag\n",
"from tqdm import tqdm\n",
"\n",
"def check_links(url):\n",
" # Faz a requisição HTTP para obter o conteúdo da página\n",
" response = requests.get(url)\n",
" if response.status_code != 200:\n",
" print(f\"Erro ao acessar {url}. Status de requisição: {response.status_code}\")\n",
" return []\n",
"\n",
" # Analisa o HTML da página usando o BeautifulSoup\n",
" soup = BeautifulSoup(response.content, 'html.parser')\n",
"\n",
" # Obtém o domínio da URL base\n",
" parsed_url = urlparse(url)\n",
" base_domain = parsed_url.netloc\n",
"\n",
" # Lista para armazenar os resultados\n",
" links = []\n",
"\n",
" # Encontra todos os elementos <a> no HTML\n",
" for link in tqdm(soup.find_all('a'), desc=\"Processando links\", unit=\"link\"):\n",
" href = link.get('href')\n",
"\n",
" if href is None:\n",
" continue\n",
"\n",
" # Verifica o tipo de link\n",
" link_type = None\n",
" if href.startswith('tel:'):\n",
" link_type = 'tel'\n",
" elif href.startswith('mailto:'):\n",
" link_type = 'mailto'\n",
" elif href.startswith('#'):\n",
" link_type = 'anchor'\n",
" elif href.startswith('/'):\n",
" link_type = 'internal'\n",
" else:\n",
" link_type = 'external'\n",
"\n",
" # Resolve links relativos para links absolutos\n",
" if href.startswith('/') or href.startswith('#'):\n",
" href = urljoin(url, href)\n",
"\n",
" # Remove fragmento da URL (parte após #)\n",
" href = urldefrag(href)[0]\n",
"\n",
" # Verifica se o link está no mesmo domínio ou é um link externo\n",
" parsed_href = urlparse(href)\n",
" if parsed_href.netloc == base_domain or parsed_href.netloc == '':\n",
" # Ignora links do tipo \"tel\"\n",
" if link_type == 'tel':\n",
" continue\n",
"\n",
" # Faz a requisição HTTP para verificar o status do link\n",
" link_response = requests.head(href, allow_redirects=True)\n",
" link_status = link_response.status_code\n",
" links.append({\n",
" 'page': url,\n",
" 'link': href,\n",
" 'status': link_status,\n",
" 'type': link_type\n",
" })\n",
"\n",
" return links\n",
"\n",
"\n",
"# Exemplo de uso\n",
"url = 'https://fgts.gov.br' #@param\n",
"pages = check_links(url)\n",
"\n",
"def check_links_on_pages(pages):\n",
" all_links = []\n",
"\n",
" for page in tqdm(pages, desc=\"Verificando links na página: \", unit=\"página\"):\n",
" page_links = check_links(page['link'])\n",
" all_links.extend(page_links)\n",
"\n",
" return all_links\n",
"\n",
"result = check_links_on_pages(pages)\n",
"\n",
"print('Escolha o tipo de resposta')\n",
"print('1 - Exibição em tela')\n",
"print('2 - Arquivo de texto')\n",
"print('3 - Arquivo csv')\n",
"print('4 - Arquivo xlsx')\n",
"\n",
"tipo_resposta = \"Arquivo XLSX\" #@param ['Arquivo de texto', 'Arquivo CSV', 'Arquivo XLSX', 'Respostas em tela']\n",
"#tipo_resposta = int(input())\n",
"\n",
"def switch(tipo_resposta):\n",
" if tipo_resposta == 'Respostas em tela':\n",
" # Exibição dos resultados em tela\n",
" for link in result:\n",
" print(link)\n",
" elif tipo_resposta == 'Arquivo de texto':\n",
" # Geração de arquivo TXT\n",
" with open('resultados.txt', 'w') as file:\n",
" for link in result:\n",
" file.write(str(link) + '\\n')\n",
" elif tipo_resposta == 'Arquivo CSV':\n",
" # Geração de arquivo CSV\n",
" df = pd.DataFrame(result)\n",
" df.to_csv('resultados.csv', index=False)\n",
" elif tipo_resposta == 'Arquivo XLSX':\n",
" # Geração de arquivo Excel (xlsx)\n",
" df = pd.DataFrame(result)\n",
" df.to_excel('resultados.xlsx', index=False)\n",
"\n",
"\n",
"switch(tipo_resposta)"
]
}
],
"metadata": {
"colab": {
"authorship_tag": "ABX9TyP3lmfuQTtk/BgrsJwMDPdP",
"include_colab_link": true,
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
116 changes: 116 additions & 0 deletions VerificaLinksQuebradosRotina.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#@title Captura de links do site escolhido {display-mode: "form"}
# This code will be hidden when the notebook is loaded.
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin, urldefrag
from tqdm import tqdm

def check_links(url):
    """Fetch *url* and report the HTTP status of its same-domain links.

    Every ``<a href>`` in the page is classified as ``'tel'``,
    ``'mailto'``, ``'anchor'``, ``'internal'`` or ``'external'``; links
    resolving to the same domain are HEAD-checked and returned as dicts
    with keys ``'page'``, ``'link'``, ``'status'`` and ``'type'``.
    Returns ``[]`` when the page itself cannot be fetched.
    """
    # Fetch the page; a timeout keeps a dead server from hanging the crawl,
    # and connection failures are reported instead of crashing the caller.
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print(f"Erro ao acessar {url}. {exc}")
        return []
    if response.status_code != 200:
        print(f"Erro ao acessar {url}. Status de requisição: {response.status_code}")
        return []

    # Parse the page HTML.
    soup = BeautifulSoup(response.content, 'html.parser')

    # Domain of the base URL, used to tell internal from external links.
    base_domain = urlparse(url).netloc

    # Accumulated results.
    links = []

    # Walk every <a> element found in the page.
    for anchor in tqdm(soup.find_all('a'), desc="Processando links", unit="link"):
        href = anchor.get('href')
        if href is None:
            continue

        # Classify the link by its href prefix.
        if href.startswith('tel:'):
            link_type = 'tel'
        elif href.startswith('mailto:'):
            link_type = 'mailto'
        elif href.startswith('#'):
            link_type = 'anchor'
        elif href.startswith('/'):
            link_type = 'internal'
        else:
            link_type = 'external'

        # Resolve relative paths and page anchors against the page URL.
        if href.startswith(('/', '#')):
            href = urljoin(url, href)

        # Drop the URL fragment (the part after '#').
        href = urldefrag(href)[0]

        # Only check links on the same domain (or with no domain at all).
        parsed_href = urlparse(href)
        if parsed_href.netloc == base_domain or parsed_href.netloc == '':
            # BUG FIX: 'tel:' and 'mailto:' hrefs have an empty netloc and
            # previously reached requests.head(), which raises InvalidSchema
            # and aborted the whole crawl — skip them instead.
            if link_type in ('tel', 'mailto'):
                continue

            # HEAD-check the link; record None instead of crashing when the
            # target is unreachable or times out.
            try:
                link_status = requests.head(
                    href, allow_redirects=True, timeout=10
                ).status_code
            except requests.RequestException:
                link_status = None
            links.append({
                'page': url,
                'link': href,
                'status': link_status,
                'type': link_type
            })

    return links


def check_links_on_pages(pages):
    """Re-crawl every link collected by :func:`check_links`.

    *pages* is a list of dicts as returned by ``check_links`` (each with a
    ``'link'`` key).  Returns the concatenated results of crawling each
    fetchable page.
    """
    all_links = []

    for page in tqdm(pages, desc="Verificando links na página: ", unit="página"):
        target = page['link']
        # BUG FIX: only http(s) URLs can be fetched — 'mailto:'/'tel:'
        # entries previously crashed requests.get with InvalidSchema.
        if not target.startswith(('http://', 'https://')):
            continue
        all_links.extend(check_links(target))

    return all_links

# Example usage: crawl a user-supplied site, then ask how to report results.
print('Por favor informe a URL iniciando com "http://" ou "https://"')
url = input()
pages = check_links(url)
result = check_links_on_pages(pages)

# Output-format menu shown to the user.
menu = (
    'Escolha o tipo de resposta',
    '1 - Exibição em tela',
    '2 - Arquivo de texto',
    '3 - Arquivo csv',
    '4 - Arquivo xlsx',
)
for option in menu:
    print(option)

tipo_resposta = int(input())

def switch(tipo_resposta):
    """Report the crawl results (module-level ``result``) in one format.

    tipo_resposta: 1 = print to screen, 2 = resultados.txt,
    3 = resultados.csv, 4 = resultados.xlsx.
    """
    if tipo_resposta == 1:
        # Print each collected link record to the screen.
        for link in result:
            print(link)
    elif tipo_resposta == 2:
        # Plain-text file, one record per line.
        with open('resultados.txt', 'w') as file:
            for link in result:
                file.write(str(link) + '\n')
    elif tipo_resposta == 3:
        # CSV file via pandas.
        pd.DataFrame(result).to_csv('resultados.csv', index=False)
    elif tipo_resposta == 4:
        # Excel (xlsx) file via pandas/openpyxl.
        pd.DataFrame(result).to_excel('resultados.xlsx', index=False)
    else:
        # Robustness fix: an unknown option was previously ignored silently.
        print(f'Opção inválida: {tipo_resposta}')


# Emit the results in the format the user chose above.
switch(tipo_resposta)
10 changes: 5 additions & 5 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
requests==2.25.1
beautifulsoup4==4.9.3
pandas==1.2.2
openpyxl==3.0.6
tqdm==4.56.0
requests==2.31.0
beautifulsoup4==4.12.2
pandas==2.0.3
openpyxl==3.1.2
tqdm==4.65.0

0 comments on commit 1d19768

Please sign in to comment.