Skip to content

Commit

Permalink
feat: Use a LLM for scraped ingredient parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
TomBursch committed Aug 6, 2024
1 parent 3f74be5 commit 62380f8
Show file tree
Hide file tree
Showing 4 changed files with 142 additions and 13 deletions.
25 changes: 12 additions & 13 deletions backend/app/controller/recipe/recipe_controller.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
import re

from app.errors import NotFoundRequest, InvalidUsage
from app.models.recipe import RecipeItems, RecipeTags
from app.models import Household, RecipeItems, RecipeTags
from flask import jsonify, Blueprint
from flask_jwt_extended import jwt_required
from app.helpers import validate_args, authorize_household
from app.models import Recipe, Item, Tag
from recipe_scrapers import scrape_me
from recipe_scrapers._exceptions import SchemaOrgException, NoSchemaFoundInWildMode
from ingredient_parser import parse_ingredient

from app.service.file_has_access_or_download import file_has_access_or_download
from app.service.ingredient_parsing import parseIngredients
from .schemas import (
SearchByNameRequest,
AddRecipe,
Expand Down Expand Up @@ -194,6 +193,10 @@ def getAllFiltered(args, household_id):
@authorize_household()
@validate_args(ScrapeRecipe)
def scrapeRecipe(args, household_id):
household = Household.find_by_id(household_id)
if not household:
raise NotFoundRequest()

try:
scraper = scrape_me(args["url"], wild_mode=True)
except NoSchemaFoundInWildMode:
Expand Down Expand Up @@ -231,20 +234,16 @@ def scrapeRecipe(args, household_id):
recipe.photo = scraper.image()
recipe.source = args["url"]
items = {}
for ingredient in scraper.ingredients():
parsed = parse_ingredient(ingredient)
name = parsed.name.text if parsed.name else ingredient
item = Item.find_by_name(household_id, name)
for ingredient in parseIngredients(scraper.ingredients(), household.language):
name = ingredient.name if ingredient.name else ingredient.originalText
item = Item.find_name_starts_with(household_id, name)
if item:
description = f"{parsed.amount[0].quantity if len(parsed.amount) > 0 else ''} {parsed.amount[0].unit if len(parsed.amount) > 0 else ''}"
# description = description + (" " if description else "") + (parsed.comment.text if parsed.comment else "") # Usually cooking instructions

items[ingredient] = item.obj_to_dict() | {
"description": description,
items[ingredient.originalText] = item.obj_to_dict() | {
"description": ingredient.description,
"optional": False,
}
else:
items[ingredient] = None
items[ingredient.originalText] = None
return jsonify(
{
"recipe": recipe.obj_to_dict(),
Expand Down
8 changes: 8 additions & 0 deletions backend/app/models/item.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,14 @@ def find_by_default_key(cls, household_id: int, default_key: str) -> Self:
def find_by_id(cls, id) -> Self:
return cls.query.filter(cls.id == id).first()

@classmethod
def find_name_starts_with(cls, household_id: int, starts_with: str) -> Self:
    """Return the first item of a household whose name starts with a prefix.

    The comparison lower-cases both sides in SQL, and the prefix is matched
    literally.

    Args:
        household_id: Household whose items are searched.
        starts_with: Prefix to look for; surrounding whitespace is ignored.

    Returns:
        The first matching item, or ``None`` when nothing matches or the
        prefix is empty after stripping.
    """
    prefix = starts_with.strip()
    if not prefix:
        # An empty prefix would reduce the pattern to a bare "%" and match
        # an arbitrary item of the household.
        return None
    # Escape LIKE metacharacters so text such as "100% cocoa" or
    # "half_and_half" is not interpreted as a wildcard pattern.
    escaped = prefix.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    return cls.query.filter(
        cls.household_id == household_id,
        func.lower(cls.name).like(func.lower(escaped) + "%", escape="\\"),
    ).first()

@classmethod
def search_name(cls, name: str, household_id: int) -> list[Self]:
item_count = 11
Expand Down
97 changes: 97 additions & 0 deletions backend/app/service/ingredient_parsing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
from ingredient_parser import parse_ingredient
from litellm import completion
import json
import os

from app.config import SUPPORTED_LANGUAGES

# Model identifier handed to completion(); when it is unset (falsy),
# parseIngredients skips the LLM and only uses the NLP parser.
LLM_MODEL = os.getenv("LLM_MODEL")
# Passed to completion() as api_base.
LLM_API_URL = os.getenv("LLM_API_URL")

class IngredientParsingResult:
    """One scraped ingredient line split into a name and a description."""

    # Class-level defaults; every instance overwrites them in __init__.
    originalText: str = None
    name: str = None
    description: str = None

    def __init__(self, original_text, name, description):
        """Store the raw ingredient text with its parsed name and description."""
        self.originalText, self.name, self.description = (
            original_text,
            name,
            description,
        )

    def __str__(self):
        """Render as ``<original text> -> <name> (<description>)``."""
        return f"{self.originalText} -> {self.name} ({self.description})"


def parseNLP(ingredients: list[str]) -> list[IngredientParsingResult]:
    """Parse ingredient strings with the ``ingredient_parser`` NLP model.

    Args:
        ingredients: Raw ingredient lines, e.g. as returned by a recipe scraper.

    Returns:
        One ``IngredientParsingResult`` per input line, in input order. The
        name is ``None`` when the parser found none; the description is the
        first parsed amount as "<quantity> <unit>", or an empty string when
        no amount was parsed.
    """

    def parseNLPSingle(ingredient):
        parsed = parse_ingredient(ingredient)
        name = parsed.name.text if parsed.name else None
        if parsed.amount:
            # Only the first amount is used. strip() avoids a dangling space
            # when the unit renders as an empty string.
            firstAmount = parsed.amount[0]
            description = f"{firstAmount.quantity} {firstAmount.unit}".strip()
        else:
            description = ""
        # parsed.comment is deliberately not appended to the description:
        # it usually contains cooking instructions.
        return IngredientParsingResult(ingredient, name, description)

    return [parseNLPSingle(e) for e in ingredients]


def parseLLM(
    ingredients: list[str], targetLanguageCode: str = None
) -> list[IngredientParsingResult]:
    """Split ingredient strings into name and description using the LLM.

    Args:
        ingredients: Raw ingredient lines; sent to the model as a JSON list.
        targetLanguageCode: Key into ``SUPPORTED_LANGUAGES``. When it is
            ``None`` or not a supported code, no translation instruction is
            added to the prompt.

    Returns:
        One ``IngredientParsingResult`` per input line, in input order, or
        ``None`` when the model's reply is not a JSON list with exactly one
        entry per ingredient (callers are expected to fall back).

    Raises:
        Whatever ``completion`` or ``json.loads`` raise, plus
        ``KeyError``/``TypeError`` when a reply entry lacks the "name" or
        "description" key. ``parseIngredients`` catches these.
    """
    # Build the translation instruction once and only for supported languages.
    # Indexing SUPPORTED_LANGUAGES unconditionally raised KeyError for the
    # default targetLanguageCode=None, which disabled the LLM path entirely.
    if targetLanguageCode in SUPPORTED_LANGUAGES:
        language = SUPPORTED_LANGUAGES[targetLanguageCode]
        translationInstruction = f"Translate the response to {language}. Translate the JSON content to {language}. Your target language is {language}. Respond in {language} from the start."
    else:
        translationInstruction = ""

    systemMessage = (
        """
You are a tool that returns only JSON in the form of [{"name": name, "description": description}, ...]. Split every string from the list into these two properties. You receive recipe ingredients and fill the name field with the singular name of the ingredient and everything else is the description. Translate the response into the specified language.
For example in English:
Given: ["300g of Rice", "2 Chocolates"] you return only:
[{"name": "Rice", "description": "300g"}, {"name": "Chocolate", "description": "2"}]
Return only JSON and nothing else.
"""
        + translationInstruction
    )

    messages = [{"role": "system", "content": systemMessage}]
    if translationInstruction:
        # The instruction is repeated as a user message, as in the original prompt.
        messages.append({"role": "user", "content": translationInstruction})
    messages.append({"role": "user", "content": json.dumps(ingredients)})

    response = completion(
        model=LLM_MODEL,
        api_base=LLM_API_URL,
        # response_format={"type": "json_object"},
        messages=messages,
    )

    llmResponse = json.loads(response.choices[0].message.content)
    # Entries are paired with inputs by position, so anything other than a
    # list of matching length cannot be mapped back reliably.
    if not isinstance(llmResponse, list) or len(llmResponse) != len(ingredients):
        return None

    return [
        IngredientParsingResult(original, entry["name"], entry["description"])
        for original, entry in zip(ingredients, llmResponse)
    ]


def parseIngredients(
ingredients: list[str],
targetLanguageCode=None,
) -> list[IngredientParsingResult]:
if LLM_MODEL:
try:
return parseLLM(ingredients, targetLanguageCode) or parseNLP(ingredients)
except Exception as e:
print("Error parsing ingredients:", e)

return parseNLP(ingredients)
25 changes: 25 additions & 0 deletions backend/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
aiohappyeyeballs==2.3.4
aiohttp==3.10.0
aiosignal==1.3.1
alembic==1.13.2
amqp==5.2.0
annotated-types==0.7.0
anyio==4.4.0
apispec==6.6.1
appdirs==1.4.4
APScheduler==3.10.4
Expand All @@ -26,7 +30,9 @@ cryptography==42.0.8
cycler==0.12.1
dbscan1d==0.2.2
defusedxml==0.7.1
distro==1.9.0
extruct==0.17.0
filelock==3.15.4
flake8==7.1.0
Flask==3.0.3
Flask-APScheduler==1.13.1
Expand All @@ -39,23 +45,32 @@ Flask-SQLAlchemy==3.1.1
flexcache==0.3
flexparser==0.3.1
fonttools==4.53.1
frozenlist==1.4.1
fsspec==2024.6.1
future==1.0.0
gevent==24.2.1
greenlet==3.0.0rc3
h11==0.14.0
html5lib==1.1
html_text==0.6.2
httpcore==1.0.5
httpx==0.27.0
huggingface-hub==0.24.5
idna==3.7
importlib_metadata==8.2.0
ingredient_parser_nlp==1.0.0
iniconfig==2.0.0
isodate==0.6.1
itsdangerous==2.2.0
Jinja2==3.1.4
joblib==1.4.2
jsonschema==4.23.0
jsonschema-specifications==2023.12.1
jstyleson==0.0.2
kiwisolver==1.4.5
kombu==5.3.7
lark==1.1.9
litellm==1.42.7
lxml==5.2.2
lxml_html_clean==0.1.1
Mako==1.3.5
Expand All @@ -65,10 +80,12 @@ matplotlib==3.9.1
mccabe==0.7.0
mf2py==2.0.1
mlxtend==0.23.1
multidict==6.0.5
mypy-extensions==1.0.0
nltk==3.8.1
numpy==2.0.0
oic==1.7.0
openai==1.37.1
packaging==24.1
pandas==2.2.2
pathspec==0.12.1
Expand Down Expand Up @@ -101,21 +118,27 @@ python-engineio==4.9.1
python-socketio==5.11.3
pytz==2024.1
pytz-deprecation-shim==0.1.0.post0
PyYAML==6.0.1
rdflib==7.0.0
rdflib-jsonld==0.6.2
recipe_scrapers==14.58.0
referencing==0.35.1
regex==2024.5.15
requests==2.32.3
rpds-py==0.19.1
scikit-learn==1.5.1
scipy==1.14.0
setuptools==71.0.3
setuptools-scm==8.1.0
simple-websocket==1.0.0
six==1.16.0
sniffio==1.3.1
soupsieve==2.5
SQLAlchemy==2.0.31
sqlite-icu==1.0
threadpoolctl==3.5.0
tiktoken==0.7.0
tokenizers==0.19.1
toml==0.10.2
tomli==2.0.1
tqdm==4.66.4
Expand All @@ -136,5 +159,7 @@ wcwidth==0.2.13
webencodings==0.5.1
Werkzeug==3.0.3
wsproto==1.2.0
yarl==1.9.4
zipp==3.19.2
zope.event==5.0
zope.interface==6.4.post2

0 comments on commit 62380f8

Please sign in to comment.