Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhance Scraper #64

Merged
merged 9 commits into from
Jun 12, 2024
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions .github/workflows/bitcoinops.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,17 @@ jobs:
if: github.event_name == 'scrape'
run: if [ "${{ github.event.client_payload.secret }}" != "${{ secrets.COMMON_SECRET }}" ]; then echo "Wrong password"; exit 1; fi
- uses: actions/checkout@v2
- uses: actions/setup-node@v2
- uses: actions/setup-python@v2
with:
node-version: 18
python-version: 3.9
- name: Install dependencies
run: |
cd bitcoinops && yarn
cd ../common && yarn
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Fetch data
run: |
mkdir /tmp/data
cd bitcoinops && node main.js
python bitcoinops/main.py
env:
ES_ENGINE: ${{ secrets.ES_ENGINE }}
ES_URL: ${{ secrets.ES_URL }}
Expand Down
10 changes: 5 additions & 5 deletions .github/workflows/bitcointalk.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,17 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/setup-node@v2
- uses: actions/setup-python@v2
with:
node-version: 18
python-version: 3.9
- name: Install dependencies
run: |
cd bitcointalk && yarn
cd ../common && yarn
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Fetch data
run: |
mkdir /tmp/data
cd bitcointalk && node index.js
python bitcointalk/main.py
env:
ES_ENGINE: ${{ secrets.ES_ENGINE }}
ES_URL: ${{ secrets.ES_URL }}
Expand Down
12 changes: 6 additions & 6 deletions .github/workflows/bitcointranscripts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,17 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/setup-node@v2
- uses: actions/setup-python@v2
with:
node-version: 18
python-version: 3.9
- name: Install dependencies
run: |
cd bitcointranscripts && yarn
cd ../common && yarn
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Fetch data
run: |
mkdir /tmp/data
cd bitcointranscripts && node main.js
python bitcointranscripts/main.py
env:
ES_ENGINE: ${{ secrets.ES_ENGINE }}
ES_URL: ${{ secrets.ES_URL }}
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/mailing-list-bitcoin.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name: Bitcoin Mailing list
on:
schedule:
- cron: '0 0 * * *' # every day at midnight
# schedule:
# - cron: '0 0 * * *' # every day at midnight
workflow_dispatch:
jobs:
fetch:
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/mailing-list-lightning.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name: Lightning Mailing list
on:
schedule:
- cron: '0 0 * * *' # every day at midnight
# schedule:
# - cron: '0 0 * * *' # every day at midnight
workflow_dispatch:
jobs:
fetch:
Expand Down
166 changes: 0 additions & 166 deletions bitcoinops/main.js

This file was deleted.

135 changes: 135 additions & 0 deletions bitcoinops/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
import asyncio
import os
import re
import zipfile
from datetime import datetime

import requests
import yaml
from dotenv import load_dotenv
from loguru import logger

from common.elasticsearch_utils import upsert_document

# Pull variables from a local .env file into the process environment before
# any of the environment lookups below run.
load_dotenv()

# Source archive and the directories of interest inside the extracted tree.
REPO_URL = "https://github.com/bitcoinops/bitcoinops.github.io/archive/refs/heads/master.zip"
FOLDER_NAME = "raw_data"
POST_DIR = "bitcoinops.github.io-master/_posts/en"
TOPIC_DIR = "bitcoinops.github.io-master/_topics/en"

# Runtime configuration; both values are None when the variable is unset.
INDEX_NAME = os.environ.get('INDEX')
DATA_DIR = os.environ.get('DATA_DIR')

# Paths
# DIR_PATH is the working directory for this scraper under DATA_DIR.
DIR_PATH = os.path.join(DATA_DIR, "bitcoinops_dir")
# Despite its name this is a filesystem path (DIR_PATH/raw_data), not a URL.
GLOBAL_URL_VARIABLE = os.path.join(DIR_PATH, FOLDER_NAME)


async def download_repo():
    """Download the site archive from REPO_URL and unpack it into DIR_PATH.

    DIR_PATH is created if missing. When GLOBAL_URL_VARIABLE already exists
    the download is skipped. Otherwise the archive is saved as
    ``raw_data.zip`` inside DIR_PATH and extracted there.

    Download failures (including timeouts) and corrupt archives are logged
    and swallowed; nothing is raised to the caller for those cases.
    """
    os.makedirs(DIR_PATH, exist_ok=True)

    # NOTE(review): nothing in this function creates GLOBAL_URL_VARIABLE
    # (the archive is written as raw_data.zip and extracted straight into
    # DIR_PATH), so this shortcut only fires if that path is created
    # elsewhere -- confirm that is the intended cache check.
    if os.path.exists(GLOBAL_URL_VARIABLE):
        logger.info(f"Repo already downloaded at path: {DIR_PATH}")
        return

    logger.info(f"Downloading repo at path: {DIR_PATH}")
    file_path = os.path.join(DIR_PATH, "raw_data.zip")

    try:
        # Without a timeout a stalled connection would block the job
        # forever. requests.Timeout is a RequestException, so it is handled
        # by the except clause below.
        response = requests.get(REPO_URL, timeout=60)
        response.raise_for_status()

        with open(file_path, 'wb') as file:
            file.write(response.content)
        logger.info(f"Downloaded {REPO_URL} to {file_path}")

        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            zip_ref.extractall(DIR_PATH)
        logger.info(f"Unzipped {file_path} to {DIR_PATH}")

    except requests.RequestException as e:
        logger.error(f"Failed to download the repo: {e}")

    except zipfile.BadZipFile as e:
        logger.error(f"Failed to unzip the file: {e}")


def parse_markdowns(content: str):
    """Split a markdown document into its front matter and its body.

    The document is expected to look like::

        ---
        <front matter>
        ---
        <body>

    Args:
        content: Full text of the markdown file.

    Returns:
        A ``(front_matter, body)`` tuple of whitespace-stripped strings.

    Raises:
        ValueError: If fewer than two ``---`` delimiter lines are present.
    """
    # Only the first two delimiters frame the front matter. Limiting the
    # split keeps any later '---' line (e.g. a markdown horizontal rule)
    # inside the body instead of truncating the body at it.
    sections = re.split(r'---\n', content, maxsplit=2)
    if len(sections) < 3:
        raise ValueError("Input text does not contain proper front matter delimiters '---'")
    front_matter = sections[1].strip()
    body = sections[2].strip()
    return front_matter, body


def parse_post(post_file: str, typeof: str):
    """Build an index document from one markdown file.

    Args:
        post_file: Path of the markdown file to read (UTF-8).
        typeof: ``'topic'`` for topic pages. Any other value is treated as
            a post, whose ``slug``, ``permalink`` and ``type`` are taken
            from the front matter.

    Returns:
        The document dict, or ``None`` (after logging a warning) when the
        file cannot be read, lacks valid front matter, or is missing a
        required front-matter key.
    """
    try:
        with open(post_file, 'r', encoding='utf-8') as file:
            content = file.read()

        # Drop Liquid tags. '.' does not cross newlines, so each match stays
        # on one line; the match is greedy, so it runs from the first '{%'
        # to the last '%}' on that line.
        content = re.sub(r'{%.*%}', '', content, flags=re.MULTILINE)
        front_matter, body = parse_markdowns(content)
        metadata = yaml.safe_load(front_matter)
        if not isinstance(metadata, dict):
            # Empty or scalar front matter cannot supply the fields below.
            raise ValueError("front matter is not a mapping")

        is_topic = typeof == 'topic'
        # Topics are identified by file name, posts by their declared slug.
        custom_id = os.path.basename(post_file).replace('.md', '') if is_topic else metadata['slug']
        created = metadata.get('date')
        document = {
            "id": f"bitcoinops-{custom_id}",
            "title": metadata['title'],
            "body_formatted": body,
            "body": body,
            "body_type": "markdown",
            "created_at": created.strftime('%Y-%m-%dT%H:%M:%S.000Z') if created else None,
            "domain": "https://bitcoinops.org/en/",
            "url": f"https://bitcoinops.org/en/topics/{custom_id}" if is_topic else f"https://bitcoinops.org{metadata['permalink']}",
            "type": "topic" if is_topic else metadata['type'],
            "language": metadata.get('lang', 'en'),
            "authors": ["bitcoinops"],
            "indexed_at": datetime.now().isoformat()
        }
        return document
    except (OSError, ValueError, KeyError, yaml.YAMLError) as e:
        # One unreadable or malformed file must not abort the whole crawl.
        # ValueError covers missing front-matter delimiters and undecodable
        # bytes; KeyError covers absent required front-matter keys.
        logger.warning(f"Issue while parsing the file, {post_file}: {e}")
        return None


def dir_walk(extracted_dir: str, typeof: str):
    """Parse every file under ``extracted_dir`` into an index document.

    Args:
        extracted_dir: Root directory to scan, including all nested
            subdirectories.
        typeof: Passed through to ``parse_post`` for each file.

    Returns:
        A list with one document per file that ``parse_post`` parsed
        successfully. The list is empty, and a critical message is logged,
        when ``extracted_dir`` does not exist.
    """
    if not os.path.exists(extracted_dir):
        logger.critical("Data Directory not available.")
        return []

    documents = []
    # os.walk already descends into every subdirectory. Recursing manually
    # for each entry of `dirs` as well would parse nested files more than
    # once and return duplicate documents.
    for root, _subdirs, files in os.walk(extracted_dir):
        for post_file in files:
            path = os.path.join(root, post_file)
            logger.info(f"Parsing {path}")
            document = parse_post(path, typeof)
            if document:
                documents.append(document)
    return documents


async def main():
    """Fetch the repo, parse posts and topics, and upsert them into the index.

    Each document is upserted individually via ``upsert_document``; a failure
    for one document is logged together with the document and does not stop
    the run. Totals of created and updated documents are logged at the end.
    """
    await download_repo()

    documents = dir_walk(os.path.join(DIR_PATH, POST_DIR), "posts")
    topics = dir_walk(os.path.join(DIR_PATH, TOPIC_DIR), "topic")
    documents.extend(topics)

    created = 0
    updated = 0
    for doc in documents:
        try:
            res = upsert_document(index_name=INDEX_NAME, doc_id=doc['id'], doc_body=doc)
            logger.info(f"Version-{res['_version']}, Result-{res['result']}, ID-{res['_id']}")
            outcome = res['result']
            if outcome == 'created':
                created += 1
            elif outcome == 'updated':
                updated += 1
        except Exception as e:
            # Boundary handler: report the failure and the offending document,
            # then continue with the remaining documents.
            logger.error(f"Error: {e}")
            logger.warning(doc)

    logger.info(f"Inserted {created} new documents")
    logger.info(f"Updated {updated} documents")


# Script entry point: run the async pipeline to completion when executed
# directly (not when imported).
if __name__ == "__main__":
    asyncio.run(main())
Loading