From 8e54b75ffe4d1190f7eb1e9fbad6a1f6fc525267 Mon Sep 17 00:00:00 2001 From: c117813 Date: Fri, 7 Jul 2023 15:06:38 -0300 Subject: [PATCH 1/8] =?UTF-8?q?Altera=C3=A7=C3=A3o=20para=20a=20captura=20?= =?UTF-8?q?de=20links=20internos?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- VerificaLinksQuebradosRotina.py | 116 ++++++++++++++++++++++++++++++++ requirements.txt | 10 +-- 2 files changed, 121 insertions(+), 5 deletions(-) create mode 100644 VerificaLinksQuebradosRotina.py diff --git a/VerificaLinksQuebradosRotina.py b/VerificaLinksQuebradosRotina.py new file mode 100644 index 0000000..619eb29 --- /dev/null +++ b/VerificaLinksQuebradosRotina.py @@ -0,0 +1,116 @@ +#@title Captura de links do site escolhido {display-mode: "form"} +# This code will be hidden when the notebook is loaded. +import requests +import pandas as pd +from bs4 import BeautifulSoup +from urllib.parse import urlparse, urljoin, urldefrag +from tqdm import tqdm + +def check_links(url): + # Faz a requisição HTTP para obter o conteúdo da página + response = requests.get(url) + if response.status_code != 200: + print(f"Erro ao acessar {url}. Status de requisição: {response.status_code}") + return [] + + # Analisa o HTML da página usando o BeautifulSoup + soup = BeautifulSoup(response.content, 'html.parser') + + # Obtém o domínio da URL base + parsed_url = urlparse(url) + base_domain = parsed_url.netloc + + # Lista para armazenar os resultados + links = [] + + # Encontra todos os elementos no HTML + for link in tqdm(soup.find_all('a'), desc="Processando links", unit="link"): + href = link.get('href') + + if href is None: + continue + + # Verifica o tipo de link + link_type = None + if href.startswith('tel:'): + link_type = 'tel' + elif href.startswith('mailto:'): + link_type = 'mailto' + elif href.startswith('#'): + link_type = 'anchor' + elif href.startswith('/'): + link_type = 'internal' + else: + link_type = 'external' + + # Resolve links relativos para links absolutos + if href.startswith('/') or href.startswith('#'): + href = urljoin(url, href) + + # Remove fragmento da URL (parte após #) + href = urldefrag(href)[0] + + # Verifica se o link está no mesmo domínio ou é um link externo + parsed_href = urlparse(href) + if parsed_href.netloc == base_domain or parsed_href.netloc == '': + # Ignora links do tipo "tel" + if link_type == 'tel': + continue + + # Faz a requisição HTTP para verificar o status do link + link_response = requests.head(href, allow_redirects=True) + link_status = link_response.status_code + links.append({ + 'page': url, + 'link': href, + 'status': link_status, + 'type': link_type + }) + + return links + + +def check_links_on_pages(pages): + all_links = [] + + for page in tqdm(pages, desc="Verificando links na página: "+ page['page'], unit="página"): + page_links = check_links(page['link']) + all_links.extend(page_links) + + return all_links + +# Exemplo de uso +print('Por favor informe a URL iniciando com "http://" ou "https://"') +url = input() +pages = check_links(url) +result = check_links_on_pages(pages) + +print('Escolha o tipo de resposta') +print('1 - Exibição em tela') +print('2 - Arquivo de texto') +print('3 - Arquivo csv') +print('4 - Arquivo xlsx') + +tipo_resposta = int(input()) + +def switch(tipo_resposta): + if tipo_resposta == 1: + # Exibição dos resultados em tela + for link in result: + print(link) + elif tipo_resposta == 2: + # Geração de arquivo TXT + with open('resultados.txt', 'w') as file: + for link in result: + file.write(str(link) + '\n') + elif tipo_resposta == 3: + # Geração de arquivo CSV + df = pd.DataFrame(result) + df.to_csv('resultados.csv', index=False) + elif tipo_resposta == 4: + # Geração de arquivo Excel (xlsx) + df = pd.DataFrame(result) + df.to_excel('resultados.xlsx', index=False) + + +switch(tipo_resposta) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 2b0e075..a3e8e47 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -requests==2.25.1 -beautifulsoup4==4.9.3 -pandas==1.2.2 -openpyxl==3.0.6 -tqdm==4.56.0 \ No newline at end of file +requests==2.31.0 +beautifulsoup4==4.12.2 +pandas==2.0.3 +openpyxl==3.1.2 +tqdm==4.65.0 \ No newline at end of file From ef19640c13f6be01285c33722fd9cea33183db8e Mon Sep 17 00:00:00 2001 From: David Jeiel Date: Fri, 7 Jul 2023 17:50:48 -0300 Subject: [PATCH 2/8] Criado usando o Colaboratory --- ListaLinks.ipynb | 169 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 169 insertions(+) create mode 100644 ListaLinks.ipynb diff --git a/ListaLinks.ipynb b/ListaLinks.ipynb new file mode 100644 index 0000000..1bde6c7 --- /dev/null +++ b/ListaLinks.ipynb @@ -0,0 +1,169 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "authorship_tag": "ABX9TyP3lmfuQTtk/BgrsJwMDPdP", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "source": [ + "#@title Captura de links do site escolhido {display-mode: \"form\"}\n", + "# This code will be hidden when the notebook is loaded.\n", + "import requests\n", + "import pandas as pd\n", + "from bs4 import BeautifulSoup\n", + "from urllib.parse import urlparse, urljoin, urldefrag\n", + "from tqdm import tqdm\n", + "\n", + "def check_links(url):\n", + " # Faz a requisição HTTP para obter o conteúdo da página\n", + " response = requests.get(url)\n", + " if response.status_code != 200:\n", + " print(f\"Erro ao acessar {url}. Status de requisição: {response.status_code}\")\n", + " return []\n", + "\n", + " # Analisa o HTML da página usando o BeautifulSoup\n", + " soup = BeautifulSoup(response.content, 'html.parser')\n", + "\n", + " # Obtém o domínio da URL base\n", + " parsed_url = urlparse(url)\n", + " base_domain = parsed_url.netloc\n", + "\n", + " # Lista para armazenar os resultados\n", + " links = []\n", + "\n", + " # Encontra todos os elementos no HTML\n", + " for link in tqdm(soup.find_all('a'), desc=\"Processando links\", unit=\"link\"):\n", + " href = link.get('href')\n", + "\n", + " if href is None:\n", + " continue\n", + "\n", + " # Verifica o tipo de link\n", + " link_type = None\n", + " if href.startswith('tel:'):\n", + " link_type = 'tel'\n", + " elif href.startswith('mailto:'):\n", + " link_type = 'mailto'\n", + " elif href.startswith('#'):\n", + " link_type = 'anchor'\n", + " elif href.startswith('/'):\n", + " link_type = 'internal'\n", + " else:\n", + " link_type = 'external'\n", + "\n", + " # Resolve links relativos para links absolutos\n", + " if href.startswith('/') or href.startswith('#'):\n", + " href = urljoin(url, href)\n", + "\n", + " # Remove fragmento da URL (parte após #)\n", + " href = urldefrag(href)[0]\n", + "\n", + " # Verifica se o link está no mesmo domínio ou é um link externo\n", + " parsed_href = urlparse(href)\n", + " if parsed_href.netloc == base_domain or parsed_href.netloc == '':\n", + " # Ignora links do tipo \"tel\"\n", + " if link_type == 'tel':\n", + " continue\n", + "\n", + " # Faz a requisição HTTP para verificar o status do link\n", + " link_response = requests.head(href, allow_redirects=True)\n", + " link_status = link_response.status_code\n", + " links.append({\n", + " 'page': url,\n", + " 'link': href,\n", + " 'status': link_status,\n", + " 'type': link_type\n", + " })\n", + "\n", + " return links\n", + "\n", + "\n", + "# Exemplo de uso\n", + "url = 'https://fgts.gov.br' #@param\n", + "pages = check_links(url)\n", + "\n", + "def check_links_on_pages(pages):\n", + " all_links = []\n", + "\n", + " for page in tqdm(pages, desc=\"Verificando links na página: \"+ pages['page'], unit=\"página\"):\n", + " page_links = check_links(page['link'])\n", + " all_links.extend(page_links)\n", + "\n", + " return all_links\n", + "\n", + "result = check_links_on_pages(pages)\n", + "\n", + "print('Escolha o tipo de resposta')\n", + "print('1 - Exibição em tela')\n", + "print('2 - Arquivo de texto')\n", + "print('3 - Arquivo csv')\n", + "print('4 - Arquivo xlsx')\n", + "\n", + "tipo_resposta = \"Arquivo XLSX\" #@param ['Arquivo de texto', 'Arquivo CSV', 'Arquivo XLSX', 'Respostas em tela']\n", + "#tipo_resposta = int(input())\n", + "\n", + "def switch(tipo_resposta):\n", + " if tipo_resposta == 'Respostas em tela':\n", + " # Exibição dos resultados em tela\n", + " for link in result:\n", + " print(link)\n", + " elif tipo_resposta == 'Arquivo de texto':\n", + " # Geração de arquivo TXT\n", + " with open('resultados.txt', 'w') as file:\n", + " for link in result:\n", + " file.write(str(link) + '\\n')\n", + " elif tipo_resposta == 'Arquivo CSV':\n", + " # Geração de arquivo CSV\n", + " df = pd.DataFrame(result)\n", + " df.to_csv('resultados.csv', index=False)\n", + " elif tipo_resposta == 'Arquivo XLSX':\n", + " # Geração de arquivo Excel (xlsx)\n", + " df = pd.DataFrame(result)\n", + " df.to_excel('resultados.xlsx', index=False)\n", + "\n", + "\n", + "switch(tipo_resposta)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "mnHTZHIZJt3u", + "outputId": "c42d3971-40e4-4176-ab1b-3c7622aaf7b5" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Processando links: 50%|█████ | 59/117 [03:45<04:01, 4.16s/link]" + ] + } + ] + } + ] +} \ No newline at end of file From d006565f4932f06b933c4ae38f3537bfb9684354 Mon Sep 17 00:00:00 2001 From: David Jeiel Date: Fri, 7 Jul 2023 17:52:33 -0300 Subject: [PATCH 3/8] Create python-package-conda.yml --- .github/workflows/python-package-conda.yml | 34 ++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 .github/workflows/python-package-conda.yml diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml new file mode 100644 index 0000000..384f9b7 --- /dev/null +++ b/.github/workflows/python-package-conda.yml @@ -0,0 +1,34 @@ +name: Python Package using Conda + +on: [push] + +jobs: + build-linux: + runs-on: ubuntu-latest + strategy: + max-parallel: 5 + + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.10 + uses: actions/setup-python@v3 + with: + python-version: '3.10' + - name: Add conda to system path + run: | + # $CONDA is an environment variable pointing to the root of the miniconda directory + echo $CONDA/bin >> $GITHUB_PATH + - name: Install dependencies + run: | + conda env update --file environment.yml --name base + - name: Lint with flake8 + run: | + conda install flake8 + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + conda install pytest + pytest From 58df90daed4d64a6de897309d828e5e12f5ba5c8 Mon Sep 17 00:00:00 2001 From: David Jeiel Date: Fri, 7 Jul 2023 18:13:41 -0300 Subject: [PATCH 4/8] Create environment.yml --- .github/workflows/environment.yml | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 .github/workflows/environment.yml diff --git a/.github/workflows/environment.yml b/.github/workflows/environment.yml new file mode 100644 index 0000000..92ef12b --- /dev/null +++ b/.github/workflows/environment.yml @@ -0,0 +1,9 @@ +name: lista-links +channels: + - defaults + - conda-forge +dependencies: + - python=3.9 + - flask + - requests + - beautifulsoup4 From 45db42c1b38a9b671ea0cab9981d014e6a2d3c8b Mon Sep 17 00:00:00 2001 From: David Jeiel Date: Fri, 7 Jul 2023 18:16:52 -0300 Subject: [PATCH 5/8] =?UTF-8?q?ajuste=20de=20endere=C3=A7amento=20do=20env?= =?UTF-8?q?ironment?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/python-package-conda.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml index 384f9b7..d4c0a99 100644 --- a/.github/workflows/python-package-conda.yml +++ b/.github/workflows/python-package-conda.yml @@ -20,7 +20,7 @@ jobs: echo $CONDA/bin >> $GITHUB_PATH - name: Install dependencies run: | - conda env update --file environment.yml --name base + conda env update --file .github/workflows/environment.yml --name base - name: Lint with flake8 run: | conda install flake8 From 08a6f8e7e2c84bc74e94640a28bbd78a6ac04cbc Mon Sep 17 00:00:00 2001 From: David Jeiel Date: Fri, 7 Jul 2023 18:29:56 -0300 Subject: [PATCH 6/8] =?UTF-8?q?Corre=C3=A7=C3=A3o=20do=20erro=20ao=20exibi?= =?UTF-8?q?r=20nome=20de=20paginas?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ListaLinks.ipynb | 76 ++++++++++++++++----------------- VerificaLinksQuebradosRotina.py | 2 +- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/ListaLinks.ipynb b/ListaLinks.ipynb index 1bde6c7..e7eeb19 100644 --- a/ListaLinks.ipynb +++ b/ListaLinks.ipynb @@ -1,26 +1,10 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [], - "authorship_tag": "ABX9TyP3lmfuQTtk/BgrsJwMDPdP", - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, "cells": [ { "cell_type": "markdown", "metadata": { - "id": "view-in-github", - "colab_type": "text" + "colab_type": "text", + "id": "view-in-github" }, "source": [ "\"Open" @@ -28,6 +12,23 @@ }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "mnHTZHIZJt3u", + "outputId": "c42d3971-40e4-4176-ab1b-3c7622aaf7b5" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processando links: 50%|█████ | 59/117 [03:45<04:01, 4.16s/link]" + ] + } + ], "source": [ "#@title Captura de links do site escolhido {display-mode: \"form\"}\n", "# This code will be hidden when the notebook is loaded.\n", @@ -108,7 +109,7 @@ "def check_links_on_pages(pages):\n", " all_links = []\n", "\n", - " for page in tqdm(pages, desc=\"Verificando links na página: \"+ pages['page'], unit=\"página\"):\n", + " for page in tqdm(pages, desc=\"Verificando links na página: \", unit=\"página\"):\n", " page_links = check_links(page['link'])\n", " all_links.extend(page_links)\n", "\n", @@ -146,24 +147,23 @@ "\n", "\n", "switch(tipo_resposta)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "mnHTZHIZJt3u", - "outputId": "c42d3971-40e4-4176-ab1b-3c7622aaf7b5" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "Processando links: 50%|█████ | 59/117 [03:45<04:01, 4.16s/link]" - ] - } ] } - ] -} \ No newline at end of file + ], + "metadata": { + "colab": { + "authorship_tag": "ABX9TyP3lmfuQTtk/BgrsJwMDPdP", + "include_colab_link": true, + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/VerificaLinksQuebradosRotina.py b/VerificaLinksQuebradosRotina.py index 619eb29..59e7b18 100644 --- a/VerificaLinksQuebradosRotina.py +++ b/VerificaLinksQuebradosRotina.py @@ -73,7 +73,7 @@ def check_links(url): def check_links_on_pages(pages): all_links = [] - for page in tqdm(pages, desc="Verificando links na página: "+ page['page'], unit="página"): + for page in tqdm(pages, desc="Verificando links na página: ", unit="página"): page_links = check_links(page['link']) all_links.extend(page_links) From 61e00deae3e3bad848dff8abc0d408d8c33ae451 Mon Sep 17 00:00:00 2001 From: David Jeiel Date: Fri, 7 Jul 2023 18:42:19 -0300 Subject: [PATCH 7/8] teste miniconda --- .github/workflows/python-package-conda.yml | 48 +++++++--------------- 1 file changed, 14 insertions(+), 34 deletions(-) diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml index d4c0a99..ec66375 100644 --- a/.github/workflows/python-package-conda.yml +++ b/.github/workflows/python-package-conda.yml @@ -1,34 +1,14 @@ -name: Python Package using Conda - -on: [push] - -jobs: - build-linux: - runs-on: ubuntu-latest - strategy: - max-parallel: 5 - - steps: - - uses: actions/checkout@v3 - - name: Set up Python 3.10 - uses: actions/setup-python@v3 - with: - python-version: '3.10' - - name: Add conda to system path - run: | - # $CONDA is an environment variable pointing to the root of the miniconda directory - echo $CONDA/bin >> $GITHUB_PATH - - name: Install dependencies - run: | - conda env update --file .github/workflows/environment.yml --name base - - name: Lint with flake8 - run: | - conda install flake8 - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - - name: Test with pytest - run: | - conda install pytest - pytest +- name: Setup Miniconda + uses: conda-incubator/setup-miniconda@v2 + with: + auto-update-conda: true + python-version: 3.9 + activate-environment: lista-links + environment-file: environment.yml + use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! +- name: Install dependencies + shell: bash -l {0} + run: | + conda info -v # mostra informações sobre o conda e o ambiente + conda list -v # mostra a lista de pacotes instalados + conda install -v -y --file requirements.txt # instala as dependências com mais detalhes From 206f20cdd489de65e1e03ac1042d892f893f01c4 Mon Sep 17 00:00:00 2001 From: David Jeiel Date: Fri, 7 Jul 2023 18:46:04 -0300 Subject: [PATCH 8/8] workflow 2 --- .github/workflows/python-package-conda.yml | 48 +++++++++++++++------- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml index ec66375..1315286 100644 --- a/.github/workflows/python-package-conda.yml +++ b/.github/workflows/python-package-conda.yml @@ -1,14 +1,34 @@ -- name: Setup Miniconda - uses: conda-incubator/setup-miniconda@v2 - with: - auto-update-conda: true - python-version: 3.9 - activate-environment: lista-links - environment-file: environment.yml - use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! -- name: Install dependencies - shell: bash -l {0} - run: | - conda info -v # mostra informações sobre o conda e o ambiente - conda list -v # mostra a lista de pacotes instalados - conda install -v -y --file requirements.txt # instala as dependências com mais detalhes +name: Python Package using Conda + +on: [push] + +jobs: + build-linux: + runs-on: ubuntu-latest + strategy: + max-parallel: 5 + + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.10 + uses: actions/setup-python@v3 + with: + python-version: '3.10' + - name: Add conda to system path + run: | + # $CONDA is an environment variable pointing to the root of the miniconda directory + echo $CONDA/bin >> $GITHUB_PATH + - name: Install dependencies + run: | + conda env update --file .github/workflows/environment.yml --name base + - name: Lint with flake8 + run: | + conda install flake8 + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + conda install pytest + pytest \ No newline at end of file