-
Notifications
You must be signed in to change notification settings - Fork 525
/
nihhealthyeating.py
209 lines (165 loc) · 7.12 KB
/
nihhealthyeating.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
from urllib.parse import urljoin

from ._abstract import AbstractScraper
from ._exceptions import ElementNotFoundInHtml, StaticValueException
from ._grouping_utils import IngredientGroup
from ._utils import get_minutes, get_yields, normalize_string
class NIHHealthyEating(AbstractScraper):
    """Scraper for recipe pages served from healthyeating.nhlbi.nih.gov.

    Every accessor reads from ``self.soup`` (the parsed page). When an
    element a method depends on is missing, the method raises
    ``ElementNotFoundInHtml`` rather than failing with an ``AttributeError``
    or ``IndexError`` from a broken lookup chain.
    """

    @classmethod
    def host(cls):
        """Return the host name of the website this scraper targets."""
        return "healthyeating.nhlbi.nih.gov"

    def _find_time_table(self, missing_message):
        """Return the ``<table class="recipe_time_table">`` element.

        Raises:
            ElementNotFoundInHtml: with ``missing_message`` when the table is
                not present in the page.
        """
        time_table = self.soup.find("table", {"class": "recipe_time_table"})
        if time_table is None:
            raise ElementNotFoundInHtml(missing_message)
        return time_table

    def _time_table_cell(self, index, what):
        """Return the ``<td>`` at position ``index`` of the time table.

        ``what`` is a short label used only to build the error message.

        Raises:
            ElementNotFoundInHtml: when the table is missing or has fewer
                than ``index + 1`` cells.
        """
        time_table = self._find_time_table(f"Table with {what} was not found.")
        cells = time_table.find_all("td")
        if index >= len(cells):
            raise ElementNotFoundInHtml(f"Table cell with {what} was not found.")
        return cells[index]

    @staticmethod
    def _split_ingredient_lines(paragraph):
        """Split a multi-line ingredient paragraph into one string per line.

        Each line is passed through ``normalize_string`` (as the
        one-ingredient-per-paragraph path already does) and lines that are
        empty afterwards are dropped.
        """
        lines = (
            normalize_string(line)
            for line in paragraph.get_text().strip().split("\n")
        )
        return [line for line in lines if line]

    def title(self):
        """Return the normalized text of the page's first ``<h1>``.

        Raises:
            ElementNotFoundInHtml: when the page has no ``<h1>``.
        """
        # This content must be present for all recipes on this website.
        heading = self.soup.h1
        if heading is None:
            raise ElementNotFoundInHtml("Title not found.")
        return normalize_string(heading.get_text())

    def site_name(self):
        """Signal the fixed site name by raising ``StaticValueException``.

        The name is carried in the exception's ``return_value`` argument.
        NOTE(review): presumably the surrounding framework turns this
        exception into the method's result - confirm against the package's
        exception handling.
        """
        raise StaticValueException(
            return_value="National Heart, Lung and Blood Institute"
        )

    def total_time(self):
        """Return the sum of ``get_minutes`` over every cell of the time table.

        Cells for which ``get_minutes`` returns a falsy value are ignored.

        Raises:
            ElementNotFoundInHtml: when the time table is missing.
        """
        # This content must be present for all recipes on this website.
        time_table = self._find_time_table("Table with times was not found.")
        # Bind the parsed value once so get_minutes is not called twice per cell.
        return sum(
            minutes
            for cell in time_table.find_all("td")
            if (minutes := get_minutes(cell))
        )

    def yields(self):
        """Return ``get_yields`` applied to the cell under the "Yields" header.

        The cell is located by position: the index of the first ``<th>``
        whose text contains "Yields" selects the ``<td>`` with the same index.

        Raises:
            ElementNotFoundInHtml: when the time table is missing or there is
                no cell at the selected position.
        """
        # This content must be present for all recipes on this website.
        time_table = self._find_time_table(
            "Table with the number of servings that the recipe yields was not found."
        )
        headers = time_table.find_all("th")
        cells = time_table.find_all("td")

        # Match on the header's text. ``"Yields" in tag`` would only test
        # membership in the tag's child list, i.e. it matches nothing unless a
        # child string is exactly "Yields".
        index = next(
            (
                position
                for position, header in enumerate(headers)
                if "Yields" in header.get_text()
            ),
            len(headers),
        )
        if index >= len(cells):
            raise ElementNotFoundInHtml(
                "Table cells with servings that the recipe yields were not found."
            )
        return get_yields(cells[index])

    def image(self):
        """Return the absolute URL of the recipe image.

        Raises:
            ElementNotFoundInHtml: when no ``<img class="recipe_image">`` with
                a ``src`` attribute exists.
        """
        # Optional content for recipes on this website.
        img = self.soup.find("img", {"class": "recipe_image", "src": True})
        if img is None:
            raise ElementNotFoundInHtml("Image not found.")
        # The lookup above already requires ``src``, so no second None check is
        # needed. urljoin keeps already-absolute URLs intact and resolves
        # relative ones against the site root.
        return urljoin(f"https://{self.host()}", img["src"])

    def ingredient_groups(self) -> list[IngredientGroup]:
        """Return the recipe's ingredients as a list of ``IngredientGroup``.

        Three layouts are handled, selected by the number of ``<h4>`` headings
        inside ``<div id="ingredients">``:

        * two or more headings: one group per ``<tr>``, titled by the row's
          ``<h4>`` and filled from the lines of the row's ``<p>``;
        * exactly one heading: a titled group for the sub-section followed by
          an untitled group with the remaining ingredients;
        * no heading: a single untitled group, one ingredient per ``<p>``.

        Raises:
            ElementNotFoundInHtml: when the ingredients container is missing,
                or a lone sub-section heading has no following paragraph.
        """
        # This content must be present for recipes on this website.
        ingredients_div = self.soup.find("div", {"id": "ingredients"})
        if ingredients_div is None:
            raise ElementNotFoundInHtml("Ingredients not found.")

        headings = ingredients_div.find_all("h4")

        # Ingredients are broken down into sections
        # https://healthyeating.nhlbi.nih.gov/recipedetail.aspx?linkId=11&cId=1&rId=5
        if len(headings) >= 2:
            groups = []
            for row in ingredients_div.find_all("tr"):
                paragraph = row.find("p")
                if paragraph is None:
                    # A row without an ingredient paragraph has nothing to add.
                    continue
                heading = row.find("h4")
                purpose = None
                if heading is not None:
                    purpose = normalize_string(heading.get_text())
                groups.append(
                    IngredientGroup(
                        ingredients=self._split_ingredient_lines(paragraph),
                        purpose=purpose,
                    )
                )
            return groups

        # Default case: every <p> is one ingredient, except the paragraph that
        # advertises the printable recipe cards.
        paragraph_texts = (
            normalize_string(paragraph.get_text())
            for paragraph in ingredients_div.find_all("p")
        )
        ingredients_list = [
            text
            for text in paragraph_texts
            if not text.lower().startswith("recipe cards")
        ]

        # Edge case: ingredients are a mix of single main ingredients and a
        # single sub-section
        # https://healthyeating.nhlbi.nih.gov/recipedetail.aspx?linkId=0&cId=10&rId=163
        if len(headings) == 1:
            sub_heading = headings[0]
            sub_paragraph = sub_heading.find_next_sibling("p")
            if sub_paragraph is None:
                raise ElementNotFoundInHtml(
                    "Ingredients of the sub-section were not found."
                )
            return [
                IngredientGroup(
                    purpose=normalize_string(sub_heading.get_text()),
                    ingredients=self._split_ingredient_lines(sub_paragraph),
                ),
                # NOTE(review): dropping the last entry assumes the
                # sub-section paragraph is the final remaining <p>; confirm
                # against the page linked above.
                IngredientGroup(ingredients=ingredients_list[:-1]),
            ]

        return [IngredientGroup(ingredients_list)]

    def ingredients(self) -> list[str]:
        """Return all ingredients of every group as one flat list."""
        return [
            ingredient
            for group in self.ingredient_groups()
            for ingredient in group.ingredients
        ]

    def instructions(self):
        """Return the recipe steps joined with newlines.

        Steps are the ``<div class="steptext">`` elements inside
        ``<div id="recipe_directions">``.

        Raises:
            ElementNotFoundInHtml: when the directions container is missing.
        """
        # This content must be present for recipes on this website.
        directions_div = self.soup.find("div", {"id": "recipe_directions"})
        if directions_div is None:
            raise ElementNotFoundInHtml("Instructions not found.")

        steps = directions_div.find_all("div", {"class": "steptext"})
        return "\n".join(normalize_string(step.get_text()) for step in steps)

    def nutrients(self):
        """Return the nutrition table as a ``{label: value}`` dict.

        Non-empty ``<td>`` cells of the first table inside
        ``<div id="nutrition_info">`` are read in document order and paired
        up consecutively; a trailing unpaired cell is ignored.

        Raises:
            ElementNotFoundInHtml: when the container or its table is missing.
        """
        nutrition_div = self.soup.find("div", {"id": "nutrition_info"})
        table = nutrition_div.find("table") if nutrition_div is not None else None
        if table is None:
            raise ElementNotFoundInHtml("Nutrition information not found.")

        values = [
            normalize_string(cell.get_text())
            for row in table.find_all("tr")
            for cell in row.find_all("td")
            if cell.get_text().strip() != ""
        ]
        # zip stops at the shorter slice, which drops an unpaired last cell.
        return dict(zip(values[::2], values[1::2]))

    def description(self):
        """Return the normalized text of ``<p class="recipe_detail_subtext">``.

        Raises:
            ElementNotFoundInHtml: when that paragraph is missing.
        """
        subtext = self.soup.find("p", {"class": "recipe_detail_subtext"})
        if subtext is None:
            raise ElementNotFoundInHtml("Description not found.")
        return normalize_string(subtext.get_text())

    def prep_time(self):
        """Return ``get_minutes`` of the text in the first time-table cell.

        Raises:
            ElementNotFoundInHtml: when the table or the cell is missing.
        """
        return get_minutes(self._time_table_cell(0, "prep time").get_text())

    def cook_time(self):
        """Return ``get_minutes`` of the text in the second time-table cell.

        Raises:
            ElementNotFoundInHtml: when the table or the cell is missing.
        """
        return get_minutes(self._time_table_cell(1, "cook time").get_text())

    def serving_size(self):
        """Return the normalized text of the fourth time-table cell.

        NOTE(review): the position is fixed rather than looked up by header
        (unlike ``yields``); confirm the column order is stable on the site.

        Raises:
            ElementNotFoundInHtml: when the table or the cell is missing.
        """
        return normalize_string(
            self._time_table_cell(3, "serving size").get_text()
        )

    def recipe_source(self):
        """Return the text after the first ``": "`` in ``<div id="Recipe_Source">``.

        Raises:
            ElementNotFoundInHtml: when the element is missing or its text
                contains no ``": "`` separator.
        """
        source_div = self.soup.find("div", {"id": "Recipe_Source"})
        if source_div is None:
            raise ElementNotFoundInHtml("Recipe source not found.")

        # Split on the first separator only, so a source that itself contains
        # ": " is not truncated.
        _label, separator, source = source_div.get_text().partition(": ")
        if not separator:
            raise ElementNotFoundInHtml("Recipe source not found.")
        return normalize_string(source)

    def recipe_cards(self):
        """Return the printable recipe cards listed after "Recipe Cards:".

        Returns:
            ``None`` when the page has no ``<strong>Recipe Cards:</strong>``
            marker; otherwise a list with one ``{"size": ..., "url": ...}``
            dict per ``<li>`` sibling following the marker. ``url`` is
            ``None`` for an item without a link.
        """
        marker = self.soup.find("strong", string="Recipe Cards:")
        if marker is None:
            return None

        cards = []
        for sibling in marker.next_siblings:
            # Siblings include plain text nodes; only <li> elements are cards.
            if getattr(sibling, "name", None) != "li":
                continue
            link = sibling.find("a")
            cards.append(
                {
                    "size": normalize_string(sibling.get_text()),
                    "url": link.get("href") if link is not None else None,
                }
            )
        return cards