Skip to content

Commit

Permalink
Refactor repository.
Browse files Browse the repository at this point in the history
  • Loading branch information
toddbirchard committed Jan 6, 2024
1 parent f310f2a commit 3702ae5
Show file tree
Hide file tree
Showing 18 changed files with 832 additions and 344 deletions.
1 change: 1 addition & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
TARGET_URL=https://hackersandslackers.com
4 changes: 4 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[flake8]
select = E9,F63,F7,F82
exclude = .git,.github,__pycache__,.pytest_cache,.venv,logs,creds,.reports
max-line-length = 120
8 changes: 5 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,10 @@ venv.bak/
credentials.json
gcloud.json

# Etc.
.idea
.pytest_cache
# OS
.DS_Store

# IDEs
.idea
.vscode

2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2020 Hackers and Slackers
Copyright (c) 2024 Hackers and Slackers

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
81 changes: 81 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# Name of the directory make is running in; used only in the help text.
# NOTE: this must be `$(CURDIR)`. A bare `$CURDIR` is parsed by make as the
# (empty) variable `$C` followed by the literal text `URDIR`.
PROJECT_NAME := $(shell basename "$(CURDIR)")
VIRTUAL_ENV := $(CURDIR)/.venv
LOCAL_PYTHON := $(VIRTUAL_ENV)/bin/python3

define HELP
Manage $(PROJECT_NAME). Usage:

make run        - Run $(PROJECT_NAME) locally.
make install    - Create local virtualenv & install dependencies.
make deploy     - Set up project & run locally.
make update     - Update dependencies via Poetry and output resulting `requirements.txt`.
make format     - Run Python code formatter & sort dependencies.
make lint       - Check code formatting with flake8.
make clean      - Remove extraneous compiled files, caches, logs, etc.

endef
export HELP


.PHONY: all env run install deploy update format lint clean help

# Default target: print the usage text defined above.
all help:
	@echo "$$HELP"

# Convenience alias so other targets can depend on the virtualenv existing.
env: $(VIRTUAL_ENV)

# Create the virtualenv on first use and point Poetry at it.
$(VIRTUAL_ENV):
	if [ ! -d $(VIRTUAL_ENV) ]; then \
		echo "Creating Python virtual env in \`${VIRTUAL_ENV}\`"; \
		python3 -m venv $(VIRTUAL_ENV); \
	fi
	poetry config virtualenvs.path $(VIRTUAL_ENV)

.PHONY: run
run: env
	$(LOCAL_PYTHON) -m main

# The former `$(shell . .venv/bin/activate)` line was dropped: `$(shell ...)`
# runs in a throwaway subshell while make expands the recipe, so it never
# activated anything for the commands that follow.
.PHONY: install
install: env
	$(LOCAL_PYTHON) -m pip install --upgrade pip setuptools wheel && \
	poetry install --with dev --sync
	echo Installed dependencies in \`${VIRTUAL_ENV}\`;

# `$(MAKE)` (rather than a literal `make`) propagates flags such as -n / -j
# and guarantees the same make binary is used for the sub-invocations.
.PHONY: deploy
deploy:
	$(MAKE) install && \
	$(MAKE) run

.PHONY: update
update: env
	$(LOCAL_PYTHON) -m pip install --upgrade pip setuptools wheel && \
	poetry update --with dev && \
	poetry export -f requirements.txt --output requirements.txt --without-hashes && \
	echo Installed dependencies in \`${VIRTUAL_ENV}\`;

.PHONY: format
format: env
	$(LOCAL_PYTHON) -m isort --multi-line=3 . && \
	$(LOCAL_PYTHON) -m black .

.PHONY: lint
lint: env
	$(LOCAL_PYTHON) -m flake8 . --count \
		--select=E9,F63,F7,F82 \
		--exclude .git,.github,__pycache__,.pytest_cache,.venv,logs,creds,docs,.reports \
		--show-source \
		--statistics

# Notes on the find expressions below:
#  * `-name` matches the final path component. `-wholename` matches the whole
#    path, which always starts with `./`, so bare patterns such as
#    '__pycache__' could never match with it.
#  * `-prune` stops find from descending into a directory that is about to be
#    removed.
#  * Log files are regular files, hence `-type f` for ./logs/*.log.
#  * `-delete` processes depth-first, so everything below .reports (files and
#    directories) is removed while .reports itself is kept.
.PHONY: clean
clean:
	find . -name 'poetry.lock' -delete && \
	find . -name '.coverage' -delete && \
	find . -name 'Pipfile.lock' -delete && \
	find . -name '*.pyc' -delete && \
	find . -type d -name '__pycache__' -prune -exec rm -rf {} + && \
	find . -type d -wholename './.venv' -prune -exec rm -rf {} + && \
	find . -type d -name '.pytest_cache' -prune -exec rm -rf {} + && \
	find . -type f -wholename './logs/*.log' -delete && \
	find . -wholename './.reports/*' -delete
14 changes: 0 additions & 14 deletions Pipfile

This file was deleted.

135 changes: 0 additions & 135 deletions Pipfile.lock

This file was deleted.

50 changes: 17 additions & 33 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,54 +1,38 @@
# BeautifulSoup Web Scraping Tutorial

![Python](https://img.shields.io/badge/Python-v^3.8-blue.svg?logo=python&longCache=true&logoColor=white&colorB=5e81ac&style=flat-square&colorA=4c566a)
![BeautifulSoup](https://img.shields.io/badge/BeautifulSoup4-v4.9.1-blue.svg?longCache=true&logo=python&longCache=true&style=flat-square&logoColor=white&colorB=5e81ac&colorA=4c566a)
![Requests](https://img.shields.io/badge/Requests-v2.23.0-blue.svg?longCache=true&logo=python&longCache=true&style=flat-square&logoColor=white&colorB=5e81ac&colorA=4c566a)
![Python](https://img.shields.io/badge/Python-v^3.10-blue.svg?logo=python&longCache=true&logoColor=white&colorB=5e81ac&style=flat-square&colorA=4c566a)
![BeautifulSoup](https://img.shields.io/badge/BeautifulSoup4-v4.12.2-blue.svg?longCache=true&logo=python&longCache=true&style=flat-square&logoColor=white&colorB=5e81ac&colorA=4c566a)
![Requests](https://img.shields.io/badge/Requests-v2.31.0-blue.svg?longCache=true&logo=python&longCache=true&style=flat-square&logoColor=white&colorB=5e81ac&colorA=4c566a)
![GitHub Last Commit](https://img.shields.io/github/last-commit/hackersandslackers/beautifulsoup-tutorial.svg?style=flat-square&colorA=4c566a&colorB=a3be8c)
[![GitHub Issues](https://img.shields.io/github/issues/hackersandslackers/beautifulsoup-tutorial.svg?style=flat-square&colorA=4c566a&colorB=ebcb8b&logo=Github)](https://github.com/hackersandslackers/beautifulsoup-tutorial/issues)
[![GitHub Stars](https://img.shields.io/github/stars/hackersandslackers/beautifulsoup-tutorial.svg?style=flat-square&colorB=ebcb8b&colorA=4c566a&logo=Github)](https://github.com/hackersandslackers/beautifulsoup-tutorial/stargazers)
[![GitHub Forks](https://img.shields.io/github/forks/hackersandslackers/beautifulsoup-tutorial.svg?style=flat-square&colorA=4c566a&colorB=ebcb8b&logo=Github)](https://github.com/hackersandslackers/beautifulsoup-tutorial/network)

![Beautifulsoup Tutorial](https://github.com/hackersandslackers/beautifulsoup-tutorial/blob/master/.github/beautifulsoup@2x.jpg?raw=true)

A beginner's tutorial to scraping websites using Python's [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) library. This repository is the source code for the tutorial found here:
https://hackersandslackers.com/scraping-urls-with-beautifulsoup/
A beginner's tutorial to scraping websites using Python's [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) library.

## Installation
This repository is the source code for the tutorial found [here](https://hackersandslackers.com/beautifulsoup-tutorial/).

**Installation via `requirements.txt`**:
## Getting Started

```shell
$ git clone https://github.com/hackersandslackers/beautifulsoup-tutorial.git
$ cd beautifulsoup-tutorial
$ python3 -m venv myenv
$ source myenv/bin/activate
$ pip3 install -r requirements.txt
$ python3 main.py
```
Get set up locally in two steps:

**Installation via [Pipenv](https://pipenv-fork.readthedocs.io/en/latest/)**:
### Environment Variables

```shell
$ git clone https://github.com/hackersandslackers/beautifulsoup-tutorial.git
$ cd beautifulsoup-tutorial
$ pipenv shell
$ pipenv update
$ python3 main.py
```
Replace the value in **.env.example** with your value, and rename this file to **.env**:

**Installation via [Poetry](https://python-poetry.org/)**:
* `TARGET_URL`: An HTTP URL to scrape and display metadata from.

```shell
$ git clone https://github.com/hackersandslackers/beautifulsoup-tutorial.git
$ cd beautifulsoup-tutorial
$ poetry shell
$ poetry update
$ poetry run
```
### Installation

## How to Use
Get up and running with `make deploy`:

This script will output metadata scraped from whichever URL is specified in **config.py**. Simply change the value of this variable to test the script against any URL of your choice.
```shell
git clone https://github.com/hackersandslackers/beautifulsoup-tutorial.git
cd beautifulsoup-tutorial
make deploy
```

------------------

Expand Down
20 changes: 20 additions & 0 deletions beautifulsoup_tutorial/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
"""Scrape metadata from target URL."""
import pprint

from beautifulsoup_tutorial.fetch import fetch_html_from_url
from beautifulsoup_tutorial.scrape import scrape_page_metadata

from config import TARGET_URL


def init_script() -> dict:
    """
    Fetch the page at `TARGET_URL`, scrape its metadata, and pretty-print the result.

    :return: The metadata produced by `scrape_page_metadata` for `TARGET_URL`.
    """
    response = fetch_html_from_url(TARGET_URL)
    page_metadata = scrape_page_metadata(response, TARGET_URL)
    # Keep keys in the order the scraper produced them instead of sorting them.
    pprint.pprint(page_metadata, indent=4, width=120, sort_dicts=False)
    return page_metadata
28 changes: 28 additions & 0 deletions beautifulsoup_tutorial/fetch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""Fetch raw HTML from a URL."""
from typing import Optional

import requests
from requests.exceptions import HTTPError


def fetch_html_from_url(url: str, timeout: float = 10.0) -> Optional["requests.Response"]:
    """
    Fetch raw HTML from a URL.

    :param str url: URL to `GET` contents from.
    :param float timeout: Seconds to wait for the server before giving up.

    :return: The `requests.Response` for a successful request, or `None` when
        the request fails or the server answers with a 4xx/5xx status. The
        error is printed rather than raised.
    """
    # Only a browser-like User-Agent is sent. The `Access-Control-*` entries
    # that used to be here are CORS *response* headers; they have no meaning
    # on an outgoing request.
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0",
    }
    try:
        # Without an explicit timeout `requests` may wait forever on a stalled server.
        resp = requests.get(url, headers=headers, timeout=timeout)
        # `requests.get` does not raise on 4xx/5xx by itself; this call is what
        # makes the `HTTPError` handler below reachable.
        resp.raise_for_status()
        return resp
    except HTTPError as e:
        print(f"HTTP error occurred: {e}")
    except Exception as e:
        print(f"Unexpected error occurred: {e}")
    return None
Loading

0 comments on commit 3702ae5

Please sign in to comment.