Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Canonical URL integration batch 3 #905

Merged
merged 36 commits into main from canonical_url_intergration_batch_3
Oct 28, 2023
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
892fbe5
Adds canonical_url for M - R
jknndy Oct 11, 2023
1bd117c
New test cases
jknndy Oct 11, 2023
443cab9
Test case updates for M*
jknndy Oct 12, 2023
055c18c
marthastewart corrections
jknndy Oct 12, 2023
4d2a8a6
Updates to maangchi
jknndy Oct 12, 2023
106955f
Finalizing M scrapers
jknndy Oct 12, 2023
5da27a8
Test case updates for N*
jknndy Oct 14, 2023
e2c36a8
Onehundredonecookbooks rewrite
jknndy Oct 15, 2023
f9bc1d2
Test case updates for P*
jknndy Oct 15, 2023
c7d7593
P*, primalledgehealth & normalize string
jknndy Oct 15, 2023
baf697c
Update paninihappy.py
jknndy Oct 15, 2023
564b82b
Test case updates for R*
jknndy Oct 15, 2023
5063272
Merge branch 'canonical_url_intergration_batch_3' of https://github.c…
jknndy Oct 15, 2023
2367f8b
rezeptwelt updates
jknndy Oct 15, 2023
57cdad7
Finalizing updates
jknndy Oct 15, 2023
d836997
Tox run
jknndy Oct 15, 2023
e6f17a0
moved \u00C2 to scrapers
jknndy Oct 18, 2023
af8de41
Batch of code review changes
jknndy Oct 19, 2023
7059264
Conflict resolutions
jknndy Oct 25, 2023
3bb0dfc
Merge branch 'main' into canonical_url_intergration_batch_3
jknndy Oct 26, 2023
a0e76ec
Merge branch 'main' into canonical_url_intergration_batch_3
jknndy Oct 26, 2023
e775b3d
Conflict resolutions
jknndy Oct 26, 2023
cb0d6bb
Merge branch 'main' into canonical_url_intergration_batch_3
jknndy Oct 26, 2023
263e385
Update maangchi.py
jknndy Oct 28, 2023
6502a2c
updated canonical_url description
jknndy Oct 28, 2023
c3ae0d9
dependencies: add LatinFixer string-encoding-translation helper library
jayaddison Oct 28, 2023
506ae1e
panelinha: improve ingredient output text representation
jayaddison Oct 28, 2023
b8d74eb
panelinha: add ugly workaround for malencoded strings that begin with…
jayaddison Oct 28, 2023
b5b7450
panelinha: improve title output text representation
jayaddison Oct 28, 2023
6e104aa
panelinha: linting: module import order fixup for isort
jayaddison Oct 28, 2023
56782fe
panelinha: update testhtml
jayaddison Oct 28, 2023
af9f9df
Revert "panelinha: improve title output text representation"
jayaddison Oct 28, 2023
dd0138d
Revert "panelinha: add ugly workaround for malencoded strings that be…
jayaddison Oct 28, 2023
648c72d
Revert "panelinha: improve ingredient output text representation"
jayaddison Oct 28, 2023
74fc5e9
Revert "dependencies: add LatinFixer string-encoding-translation help…
jayaddison Oct 28, 2023
ad31f80
panelinha: fixup: remove unused (and in fact, unavailable now) import
jayaddison Oct 28, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions recipe_scrapers/maangchi.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,10 @@ def image(self):
return self.schema.image()

def ingredients(self):
before = self.soup.find("h4", string="Directions").find_all_previous("li")
after = self.soup.find("h4", string="Ingredients:").find_all_next("li")
before = self.soup.find("h2", string="Ingredients").find_all_next("li")
after = self.soup.find("h2", string="Directions").find_all_previous("li")
list_before = [normalize_string(b.get_text()) for b in before]
list_after = [normalize_string(a.get_text()) for a in after]
list_before.reverse()
return [x for x in list_before if x in list_after]

def instructions(self):
Expand All @@ -43,7 +42,12 @@ def instructions(self):
)

def ratings(self):
return self.schema.ratings()
jknndy marked this conversation as resolved.
Show resolved Hide resolved
rating_element = self.soup.find("p", {"class": "rmp-rating-widget__results"})
if rating_element:
rating_text = rating_element.find(
"span", {"class": "rmp-rating-widget__results__rating"}
).text.strip()
return int(rating_text)

def cuisine(self):
return self.schema.cuisine()
Expand Down
19 changes: 8 additions & 11 deletions recipe_scrapers/marthastewart.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,16 @@ def title(self):
return self.schema.title()

def total_time(self):
s = (
self.soup.findAll("div", {"class": "two-subcol-content-wrapper"})[0]
.find("div", {"class": "recipe-meta-item-body"})
.text.strip()
)
return get_minutes(s)
time_label = self.soup.find("div", string="Total Time:")
if time_label:
servings_value = time_label.find_next(
"div", {"class": "mntl-recipe-details__value"}
)
if servings_value:
return get_minutes(servings_value.text.strip())

def yields(self):
return (
self.soup.findAll("div", {"class": "two-subcol-content-wrapper"})[1]
.find("div", {"class": "recipe-meta-item-body"})
.text.strip()
)
return self.schema.yields()

def ingredients(self):
return self.schema.ingredients()
Expand Down
10 changes: 0 additions & 10 deletions recipe_scrapers/momswithcrockpots.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,3 @@ def instructions(self):
return "\n".join(
[normalize_string(instruction.get_text()) for instruction in instructions]
)

def ratings(self):
return round(
float(
self.soup.find(
"span", {"class": "wprm-recipe-rating-average"}
).get_text()
),
2,
)
jayaddison marked this conversation as resolved.
Show resolved Hide resolved
54 changes: 41 additions & 13 deletions recipe_scrapers/onehundredonecookbooks.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# mypy: disallow_untyped_defs=False
import re

from ._abstract import AbstractScraper
from ._utils import get_yields


class OneHundredOneCookBooks(AbstractScraper):
Expand All @@ -17,27 +16,56 @@ def author(self):
return self.schema.author()

def title(self):
return self.soup.find("h1").get_text()
return self.soup.find("h2").get_text()

def total_time(self):
return self.schema.total_time()

def yields(self):
data = self.soup.find_all("p", limit=3, recursive=False)[-1].get_text()
extraction = re.search("([0-9]+) servings", data)
return extraction.group(1) if extraction else None
header = self.soup.find("div", class_="cb101-recipe-time-header")
if header and "Serves" in header.text:
data = self.soup.find("div", class_="cb101-recipe-time").text.strip()
total_yields = data.split()[0]
return get_yields(total_yields)

def image(self):
return self.schema.image()

def ingredients(self):
ingredients = self.soup.find("blockquote").p.stripped_strings
return list(ingredients)
ingredients = []

def instructions(self):
return self.soup.find_all("p", limit=2, recursive=False)[1].get_text(
"\n", strip=True
ingredient_items = self.soup.select(
".cb101-recipe-ingredients li.cb101-recipe-ingredient"
)

def ratings(self):
return None
for item in ingredient_items:
amount_element = item.select_one(".cb101-recipe-ingredient-amount")
unit_element = item.select_one(".cb101-recipe-ingredient-unit")
name_element = item.select_one(".cb101-recipe-ingredient-name")

amount = amount_element.get_text(strip=True) if amount_element else ""
unit = unit_element.get_text(strip=True) if unit_element else ""
name = name_element.get_text(strip=True)

if amount and unit:
ingredient = f"{amount} {unit} {name}".strip()
else:
ingredient = f"{amount}{unit} {name}".strip()
jayaddison marked this conversation as resolved.
Show resolved Hide resolved

ingredients.append(ingredient)
jknndy marked this conversation as resolved.
Show resolved Hide resolved

return ingredients

def instructions(self):
instructions_div = self.soup.find(
"div", class_="cb101-recipe-header", string="Instructions"
)
instructions = []
if instructions_div:
instruction_group = instructions_div.find_next(
"div", class_="cb101-recipe-instruction-group"
)
if instruction_group:
for instruction in instruction_group.find_all("p"):
instructions.append(instruction.get_text(strip=True))
return "\n".join(instructions)
25 changes: 14 additions & 11 deletions recipe_scrapers/panelinha.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import re

from ._abstract import AbstractScraper
from ._utils import get_minutes, normalize_string
from ._utils import get_yields, normalize_string

INSTRUCTIONS_NUMBERING_REGEX = re.compile(r"^\d{1,2}\.\s*") # noqa

Expand All @@ -13,23 +13,22 @@ def host(cls):
return "panelinha.com.br"

def title(self):
return normalize_string(self.soup.find("h1").get_text())
jknndy marked this conversation as resolved.
Show resolved Hide resolved

def total_time(self):
return get_minutes(
self.soup.find("span", string="Tempo de preparo").nextSibling
)
jayaddison marked this conversation as resolved.
Show resolved Hide resolved
parent_div = self.soup.find("div", {"id": "recipe_header"})
return normalize_string(parent_div.find("h1").get_text())

def ingredients(self):
ingredients = self.soup.find("h4", string="Ingredientes").nextSibling.findAll(
ingredients = self.soup.find("h5", string="Ingredientes").nextSibling.findAll(
"li"
)

return [normalize_string(ingredient.get_text()) for ingredient in ingredients]
return [
normalize_string(ingredient.get_text().replace("\u00C2", ""))
for ingredient in ingredients
]

def instructions(self):
instructions = self.soup.find(
"h4", string="Modo de preparo"
"h5", string="Modo de preparo"
).nextSibling.findAll("li")

instructions = [
Expand All @@ -53,4 +52,8 @@ def instructions(self):
return "\n".join(instructions)

def yields(self):
return self.schema.yields()
main_element = self.soup.find("main")
yield_text = main_element.get("data-item-p-yield")
yield_number = re.search(r"\d+", yield_text)
if yield_number:
return get_yields(yield_number.group())
7 changes: 5 additions & 2 deletions recipe_scrapers/paninihappy.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,11 @@ def yields(self):
return get_yields(self.soup.find("span", {"class": "yield"}))

def image(self):
image = self.soup.find("img", {"class": "post_image", "src": True})
return image["src"] if image else None
div_hrecipe = self.soup.find("div", {"class": "hrecipe"})
if div_hrecipe:
img_tag = div_hrecipe.find("img", {"loading": "lazy"})
if img_tag and "src" in img_tag.attrs:
return img_tag["src"]

def ingredients(self):
ingredients = self.soup.findAll("li", {"class": "ingredient"})
Expand Down
18 changes: 17 additions & 1 deletion recipe_scrapers/primaledgehealth.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# mypy: disallow_untyped_defs=False
from ._abstract import AbstractScraper
from ._utils import normalize_string


class PrimalEdgeHealth(AbstractScraper):
Expand All @@ -20,7 +21,22 @@ def image(self):
return self.schema.image()

def ingredients(self):
return self.schema.ingredients()
ingredients_li = self.soup.select(".wprm-recipe-ingredient")
ingredients_list = [
normalize_string(
li.find("span", {"class": "wprm-recipe-ingredient-amount"}).get_text()
).replace("\u00C2", "")
+ " "
+ normalize_string(
li.find("span", {"class": "wprm-recipe-ingredient-unit"}).get_text()
).replace("\u00C2", "")
+ " "
+ normalize_string(
li.find("span", {"class": "wprm-recipe-ingredient-name"}).get_text()
).replace("\u00C2", "")
for li in ingredients_li
]
jayaddison marked this conversation as resolved.
Show resolved Hide resolved
return ingredients_list
jknndy marked this conversation as resolved.
Show resolved Hide resolved

def instructions(self):
return self.schema.instructions()
Expand Down
15 changes: 6 additions & 9 deletions recipe_scrapers/rezeptwelt.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,12 @@ def ingredients(self):
return self.schema.ingredients()

def instructions(self):
content = self.soup.find("ol", {"itemprop": "recipeInstructions"}).findAll(
"div", {"itemprop": "itemListElement"}
)
res = ""
for i in content:
steps = i.findAll("span", {"itemprop": "text"})
for step in steps:
res += normalize_string(step.text) + "\n"
return res
preparation_div = self.soup.find("div", id="preparationSteps")
instructions = preparation_div.find("span", itemprop="text").find_all("p")
instruction_texts = [instruction.get_text() for instruction in instructions]
normalized_texts = [normalize_string(text) for text in instruction_texts]
joined_instructions = "\n".join(filter(None, normalized_texts))
return joined_instructions
jknndy marked this conversation as resolved.
Show resolved Hide resolved

def ratings(self):
return self.schema.ratings()
Expand Down
Loading