From 413dfccd3470c54d6b3d7a06be22166bd80fda22 Mon Sep 17 00:00:00 2001
From: Rohan Godha
Date: Wed, 26 Oct 2022 10:18:01 +0530
Subject: [PATCH] Propose Beautiful Soup, a Python package for parsing HTML and
 XML documents. It creates a parse tree for parsed pages that can be used to
 extract data from HTML, which is useful for web scraping.

---
 beautifulSoap.md | 93 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 93 insertions(+)
 create mode 100644 beautifulSoap.md

diff --git a/beautifulSoap.md b/beautifulSoap.md
new file mode 100644
index 0000000..0389f4a
--- /dev/null
+++ b/beautifulSoap.md
@@ -0,0 +1,93 @@
+#IMPORT RESOURCES
+import requests
+from bs4 import BeautifulSoup
+
+#MAKE A SOUP OBJECT OUT OF A WEBSITE
+// 1. The HTTP request
+webpage = requests.get('URL')
+// 2. Turn the website into a soup object
+soup = BeautifulSoup(webpage.content, 'html.parser')
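+
+//Example (sketch): the two steps above put together; the URL and the
+//status-code check are illustrative assumptions, not part of the cheat sheet.
+webpage = requests.get('https://example.com')
+if webpage.status_code == 200:
+    soup = BeautifulSoup(webpage.content, 'html.parser')
+    print(soup.title)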
+
+#OBJECT TYPES
+//1. Tags correspond to HTML tags
+Example Code:
+soup = BeautifulSoup('<div id="example">An example div</div><p>An example p tag</p>', 'html.parser')
+
+print(soup.div)
+--> <div id="example">An example div</div>
+--> gets the first tag of that type on the page
+
+print(soup.div.name)
+print(soup.div.attrs)
+--> div
+--> {'id': 'example'}
+
+//2. Navigable Strings: the piece of text inside an HTML tag
+print(soup.div.string)
+--> An example div
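+
+//Sketch: checking the two object types above on the same example soup;
+//illustrative, not part of the original cheat sheet.
+print(type(soup.div).__name__)
+--> Tag
+print(type(soup.div.string).__name__)
+--> NavigableString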
+
+#NAVIGATING BY TAGS
+Example Code:
+<h1>World's Best Chocolate Chip Cookies</h1>
+<ul>
+  <li>1 cup flour</li>
+  <li>1/2 cup sugar</li>
+  ...
+</ul>
+
+//1. Get the children of a tag:
+for child in soup.ul.children:
+    print(child)
+--> <li>1 cup flour</li>
+--> <li>1/2 cup sugar</li>
+...
+
+//2. Get the parent of a tag:
+for parent in soup.li.parents:
+    print(parent)
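+
+//A self-contained sketch of the navigation above, reusing the example markup;
+//the exact tags in the string are assumptions reconstructed from the output.
+example_html = "<h1>World's Best Chocolate Chip Cookies</h1><ul><li>1 cup flour</li><li>1/2 cup sugar</li></ul>"
+example_soup = BeautifulSoup(example_html, 'html.parser')
+for child in example_soup.ul.children:
+    print(child)
+--> <li>1 cup flour</li>
+--> <li>1/2 cup sugar</li>
+for parent in example_soup.li.parents:
+    print(parent.name)
+--> ul
+--> [document]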
+
+#FIND ALL
+//1. find_all()
+print(soup.find_all("h1"))
+--> Outputs all <h1> ... </h1> tags on the website
+
+//1.1. find_all() with regex
+import re
+soup.find_all(re.compile("[ou]l"))
+--> Outputs all <ol> and <ul> tags
+soup.find_all(re.compile("h[1-9]"))
+--> Outputs all headings
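+
+//Sketch: the two regex filters run against a tiny assumed snippet, printing
+//only the names of the matching tags.
+snippet = BeautifulSoup('<h1>Title</h1><p>Text</p><ul><li>Item</li></ul>', 'html.parser')
+print([tag.name for tag in snippet.find_all(re.compile("[ou]l"))])
+--> ['ul']
+print([tag.name for tag in snippet.find_all(re.compile("h[1-9]"))])
+--> ['h1']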
+
+//1.2. find_all() with lists
+soup.find_all(['h1', 'a', 'p'])
+
+//1.3. find_all() with attributes
+soup.find_all(attrs={'class': 'banner', 'id': 'jumbotron'})
+
+//1.4. find_all() with functions
+def has_banner_class_and_hello_world(tag):
+    return 'banner' in tag.get('class', []) and tag.string == "Hello world"
+
+soup.find_all(has_banner_class_and_hello_world)
+
+#CSS SELECTORS
+//1. grab CSS classes with .select(".class_name")
+soup.select(".recipeLink")
+
+//2. grab CSS IDs with .select("#id_name")
+soup.select("#selected")
+
+//3. using a loop
+for link in soup.select(".recipeLink > a"):
+    webpage = requests.get(link['href'])
+    new_soup = BeautifulSoup(webpage.content, 'html.parser')
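+
+//Sketch: what .select() returns for the class and id selectors above, on a
+//small assumed snippet; it always gives back a list of matching tags.
+snippet = BeautifulSoup('<a class="recipeLink" href="URL">Cookies</a><p id="selected">Hi</p>', 'html.parser')
+print(snippet.select(".recipeLink"))
+--> [<a class="recipeLink" href="URL">Cookies</a>]
+print(snippet.select("#selected"))
+--> [<p id="selected">Hi</p>]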