Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
davidjeiel committed Jul 9, 2023
2 parents 5caae0a + 206f20c commit 1d19768
Show file tree
Hide file tree
Showing 5 changed files with 333 additions and 5 deletions.
9 changes: 9 additions & 0 deletions .github/workflows/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Conda environment for the link-checker CI job (consumed by python-package-conda.yml).
name: lista-links
channels:
- defaults
- conda-forge
dependencies:
- python=3.9  # NOTE(review): the workflow sets up Python 3.10 — confirm which version is intended
- flask
- requests
- beautifulsoup4
# Required by VerificaLinksQuebradosRotina.py (see also requirements.txt):
- pandas
- openpyxl
- tqdm
34 changes: 34 additions & 0 deletions .github/workflows/python-package-conda.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# CI: build the conda environment, lint with flake8, then run pytest.
name: Python Package using Conda

on: [push]

jobs:
  build-linux:
    runs-on: ubuntu-latest
    strategy:
      max-parallel: 5

    steps:
    - uses: actions/checkout@v3
    - name: Set up Python 3.10
      uses: actions/setup-python@v3
      with:
        # NOTE(review): environment.yml pins python=3.9, which conda will
        # install over this interpreter — confirm the intended version.
        python-version: '3.10'
    - name: Add conda to system path
      run: |
        # $CONDA is an environment variable pointing to the root of the miniconda directory
        echo $CONDA/bin >> $GITHUB_PATH
    - name: Install dependencies
      run: |
        conda env update --file .github/workflows/environment.yml --name base
    - name: Lint with flake8
      run: |
        conda install flake8
        # stop the build if there are Python syntax errors or undefined names
        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Test with pytest
      run: |
        conda install pytest
        pytest
169 changes: 169 additions & 0 deletions ListaLinks.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "view-in-github"
},
"source": [
"<a href=\"https://colab.research.google.com/github/davidjeiel/lista-links/blob/master/ListaLinks.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mnHTZHIZJt3u",
"outputId": "c42d3971-40e4-4176-ab1b-3c7622aaf7b5"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processando links: 50%|█████ | 59/117 [03:45<04:01, 4.16s/link]"
]
}
],
"source": [
"#@title Captura de links do site escolhido {display-mode: \"form\"}\n",
"# This code will be hidden when the notebook is loaded.\n",
"import requests\n",
"import pandas as pd\n",
"from bs4 import BeautifulSoup\n",
"from urllib.parse import urlparse, urljoin, urldefrag\n",
"from tqdm import tqdm\n",
"\n",
"def check_links(url):\n",
" # Faz a requisição HTTP para obter o conteúdo da página\n",
" response = requests.get(url)\n",
" if response.status_code != 200:\n",
" print(f\"Erro ao acessar {url}. Status de requisição: {response.status_code}\")\n",
" return []\n",
"\n",
" # Analisa o HTML da página usando o BeautifulSoup\n",
" soup = BeautifulSoup(response.content, 'html.parser')\n",
"\n",
" # Obtém o domínio da URL base\n",
" parsed_url = urlparse(url)\n",
" base_domain = parsed_url.netloc\n",
"\n",
" # Lista para armazenar os resultados\n",
" links = []\n",
"\n",
" # Encontra todos os elementos <a> no HTML\n",
" for link in tqdm(soup.find_all('a'), desc=\"Processando links\", unit=\"link\"):\n",
" href = link.get('href')\n",
"\n",
" if href is None:\n",
" continue\n",
"\n",
" # Verifica o tipo de link\n",
" link_type = None\n",
" if href.startswith('tel:'):\n",
" link_type = 'tel'\n",
" elif href.startswith('mailto:'):\n",
" link_type = 'mailto'\n",
" elif href.startswith('#'):\n",
" link_type = 'anchor'\n",
" elif href.startswith('/'):\n",
" link_type = 'internal'\n",
" else:\n",
" link_type = 'external'\n",
"\n",
" # Resolve links relativos para links absolutos\n",
" if href.startswith('/') or href.startswith('#'):\n",
" href = urljoin(url, href)\n",
"\n",
" # Remove fragmento da URL (parte após #)\n",
" href = urldefrag(href)[0]\n",
"\n",
" # Verifica se o link está no mesmo domínio ou é um link externo\n",
" parsed_href = urlparse(href)\n",
" if parsed_href.netloc == base_domain or parsed_href.netloc == '':\n",
" # Ignora links do tipo \"tel\"\n",
" if link_type == 'tel':\n",
" continue\n",
"\n",
" # Faz a requisição HTTP para verificar o status do link\n",
" link_response = requests.head(href, allow_redirects=True)\n",
" link_status = link_response.status_code\n",
" links.append({\n",
" 'page': url,\n",
" 'link': href,\n",
" 'status': link_status,\n",
" 'type': link_type\n",
" })\n",
"\n",
" return links\n",
"\n",
"\n",
"# Exemplo de uso\n",
"url = 'https://fgts.gov.br' #@param\n",
"pages = check_links(url)\n",
"\n",
"def check_links_on_pages(pages):\n",
" all_links = []\n",
"\n",
" for page in tqdm(pages, desc=\"Verificando links na página: \", unit=\"página\"):\n",
" page_links = check_links(page['link'])\n",
" all_links.extend(page_links)\n",
"\n",
" return all_links\n",
"\n",
"result = check_links_on_pages(pages)\n",
"\n",
"print('Escolha o tipo de resposta')\n",
"print('1 - Exibição em tela')\n",
"print('2 - Arquivo de texto')\n",
"print('3 - Arquivo csv')\n",
"print('4 - Arquivo xlsx')\n",
"\n",
"tipo_resposta = \"Arquivo XLSX\" #@param ['Arquivo de texto', 'Arquivo CSV', 'Arquivo XLSX', 'Respostas em tela']\n",
"#tipo_resposta = int(input())\n",
"\n",
"def switch(tipo_resposta):\n",
" if tipo_resposta == 'Respostas em tela':\n",
" # Exibição dos resultados em tela\n",
" for link in result:\n",
" print(link)\n",
" elif tipo_resposta == 'Arquivo de texto':\n",
" # Geração de arquivo TXT\n",
" with open('resultados.txt', 'w') as file:\n",
" for link in result:\n",
" file.write(str(link) + '\\n')\n",
" elif tipo_resposta == 'Arquivo CSV':\n",
" # Geração de arquivo CSV\n",
" df = pd.DataFrame(result)\n",
" df.to_csv('resultados.csv', index=False)\n",
" elif tipo_resposta == 'Arquivo XLSX':\n",
" # Geração de arquivo Excel (xlsx)\n",
" df = pd.DataFrame(result)\n",
" df.to_excel('resultados.xlsx', index=False)\n",
"\n",
"\n",
"switch(tipo_resposta)"
]
}
],
"metadata": {
"colab": {
"authorship_tag": "ABX9TyP3lmfuQTtk/BgrsJwMDPdP",
"include_colab_link": true,
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
116 changes: 116 additions & 0 deletions VerificaLinksQuebradosRotina.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#@title Captura de links do site escolhido {display-mode: "form"}
# This code will be hidden when the notebook is loaded.
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin, urldefrag
from tqdm import tqdm

def check_links(url):
    """Fetch *url* and report the HTTP status of its same-domain links.

    Every ``<a href>`` in the page is classified as ``'tel'``,
    ``'mailto'``, ``'anchor'``, ``'internal'`` or ``'external'``; links
    resolving to the same domain are HEAD-checked and returned as dicts
    with keys ``'page'``, ``'link'``, ``'status'`` and ``'type'``.
    Returns ``[]`` when the page itself cannot be fetched.
    """
    # Fetch the page; a timeout keeps a dead server from hanging the crawl,
    # and connection failures are reported instead of crashing the caller.
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print(f"Erro ao acessar {url}. {exc}")
        return []
    if response.status_code != 200:
        print(f"Erro ao acessar {url}. Status de requisição: {response.status_code}")
        return []

    # Parse the page HTML.
    soup = BeautifulSoup(response.content, 'html.parser')

    # Domain of the base URL, used to tell internal from external links.
    base_domain = urlparse(url).netloc

    # Accumulated results.
    links = []

    # Walk every <a> element found in the page.
    for anchor in tqdm(soup.find_all('a'), desc="Processando links", unit="link"):
        href = anchor.get('href')
        if href is None:
            continue

        # Classify the link by its href prefix.
        if href.startswith('tel:'):
            link_type = 'tel'
        elif href.startswith('mailto:'):
            link_type = 'mailto'
        elif href.startswith('#'):
            link_type = 'anchor'
        elif href.startswith('/'):
            link_type = 'internal'
        else:
            link_type = 'external'

        # Resolve relative paths and page anchors against the page URL.
        if href.startswith(('/', '#')):
            href = urljoin(url, href)

        # Drop the URL fragment (the part after '#').
        href = urldefrag(href)[0]

        # Only check links on the same domain (or with no domain at all).
        parsed_href = urlparse(href)
        if parsed_href.netloc == base_domain or parsed_href.netloc == '':
            # BUG FIX: 'tel:' and 'mailto:' hrefs have an empty netloc and
            # previously reached requests.head(), which raises InvalidSchema
            # and aborted the whole crawl — skip them instead.
            if link_type in ('tel', 'mailto'):
                continue

            # HEAD-check the link; record None instead of crashing when the
            # target is unreachable or times out.
            try:
                link_status = requests.head(
                    href, allow_redirects=True, timeout=10
                ).status_code
            except requests.RequestException:
                link_status = None
            links.append({
                'page': url,
                'link': href,
                'status': link_status,
                'type': link_type
            })

    return links


def check_links_on_pages(pages):
    """Re-crawl every link collected by :func:`check_links`.

    *pages* is a list of dicts as returned by ``check_links`` (each with a
    ``'link'`` key).  Returns the concatenated results of crawling each
    fetchable page.
    """
    all_links = []

    for page in tqdm(pages, desc="Verificando links na página: ", unit="página"):
        target = page['link']
        # BUG FIX: only http(s) URLs can be fetched — 'mailto:'/'tel:'
        # entries previously crashed requests.get with InvalidSchema.
        if not target.startswith(('http://', 'https://')):
            continue
        all_links.extend(check_links(target))

    return all_links

# Example usage: crawl a user-supplied site, then ask how to report results.
print('Por favor informe a URL iniciando com "http://" ou "https://"')
url = input()
pages = check_links(url)
result = check_links_on_pages(pages)

# Output-format menu shown to the user.
menu = (
    'Escolha o tipo de resposta',
    '1 - Exibição em tela',
    '2 - Arquivo de texto',
    '3 - Arquivo csv',
    '4 - Arquivo xlsx',
)
for option in menu:
    print(option)

tipo_resposta = int(input())

def switch(tipo_resposta):
    """Report the crawl results (module-level ``result``) in one format.

    tipo_resposta: 1 = print to screen, 2 = resultados.txt,
    3 = resultados.csv, 4 = resultados.xlsx.
    """
    if tipo_resposta == 1:
        # Print each collected link record to the screen.
        for link in result:
            print(link)
    elif tipo_resposta == 2:
        # Plain-text file, one record per line.
        with open('resultados.txt', 'w') as file:
            for link in result:
                file.write(str(link) + '\n')
    elif tipo_resposta == 3:
        # CSV file via pandas.
        pd.DataFrame(result).to_csv('resultados.csv', index=False)
    elif tipo_resposta == 4:
        # Excel (xlsx) file via pandas/openpyxl.
        pd.DataFrame(result).to_excel('resultados.xlsx', index=False)
    else:
        # Robustness fix: an unknown option was previously ignored silently.
        print(f'Opção inválida: {tipo_resposta}')


# Emit the results in the format the user chose above.
switch(tipo_resposta)
10 changes: 5 additions & 5 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
requests==2.25.1
beautifulsoup4==4.9.3
pandas==1.2.2
openpyxl==3.0.6
tqdm==4.56.0
requests==2.31.0
beautifulsoup4==4.12.2
pandas==2.0.3
openpyxl==3.1.2
tqdm==4.65.0

0 comments on commit 1d19768

Please sign in to comment.