Skip to content

Commit

Permalink
Merge branch 'dev' into fix/resolve-conflict
Browse files Browse the repository at this point in the history
tictactoeid committed Dec 2, 2024
2 parents 4b4cdf1 + 137701b commit d96c695
Showing 2 changed files with 28 additions and 10 deletions.
34 changes: 26 additions & 8 deletions crawlers/base_crawler.py
Original file line number Diff line number Diff line change
@@ -8,6 +8,8 @@
from bs4 import BeautifulSoup
from pytz import timezone

from slack import _send_slack_message


def text_normalizer(text, only_letters=False):
non_letters = [
@@ -120,12 +122,23 @@ def normalize(self, meal, **kwargs):


class FindPrice(MealNormalizer):
    """Pull a price out of a meal's name and remove it from the name.

    Two patterns are tried in order: first a price of 1,000 won or more
    (a number ending in "00", with an optional "," or "." thousands
    separator), then, only if that finds nothing, a one-to-three digit
    number that is explicitly followed by "원".
    """

    # Price of 1,000 won or more; the optional second group swallows any
    # trailing text up to and including the nearest "원".
    _PRICE_PATTERN = re.compile(r"([1-9]\d{0,2}[,.]?\d00)\s*(.*?원)?")
    # Fallback for prices under 1,000 won; the "원" suffix is mandatory here.
    _SMALL_PRICE_PATTERN = re.compile(r"([1-9]\d{0,2})\s*(원)")

    def _match_pattern(self, meal, pattern):
        """Apply *pattern* to ``meal.name`` and record the price if found.

        Args:
            meal: Object exposing ``name``, ``set_price`` and ``set_name``.
            pattern: Compiled regex whose first group captures the price.

        Returns:
            A ``(meal, matched)`` tuple, where ``matched`` is ``True`` when
            the pattern occurred at least once in the name.
        """
        matches = list(pattern.finditer(meal.name))
        if matches:
            # A price-like number can also appear in the middle of a menu
            # name, so the last occurrence is the one treated as the price.
            last_match = matches[-1]
            meal.set_price(last_match.group(1))
            # Cut only the matched span out of the name; earlier
            # occurrences are left untouched.
            start, end = last_match.span()
            meal.set_name(meal.name[:start] + meal.name[end:])
        return meal, bool(matches)

    def normalize(self, meal, **kwargs):
        """Extract the price from ``meal.name`` and return the meal.

        Args:
            meal: The meal to normalize.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            The same ``meal`` object, updated through its setters if a
            price was found.
        """
        meal, found = self._match_pattern(meal, self._PRICE_PATTERN)
        if not found:
            # No price of 1,000 won or more: try the under-1,000-won form.
            meal, _ = self._match_pattern(meal, self._SMALL_PRICE_PATTERN)
        return meal


@@ -199,7 +212,7 @@ class RestaurantCrawler(metaclass=ABCMeta):
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0"}
url = ""
normalizer_classes = []
not_meal = [
not_meal = [ # 메뉴이름에 이를 포함하는 경우 메뉴에서 제외
"휴무",
"휴점",
"폐점",
@@ -240,6 +253,11 @@ class RestaurantCrawler(metaclass=ABCMeta):
"하계방학", # 301동 '*하계방학 일 100식 한정*'
"2중택1", # 301동 '(1), (2) 중 택1', '(1), (2) 중 택 1'
]
not_meal_exact_match = [ # 메뉴이름이 정확히 일치하는 경우만 제외
"메뉴",
"식사", # 301동 <식사>
"천원의아침밥", # 301동 <천원의아침밥>
]

def __init__(self):
self.meals = []
@@ -266,7 +284,7 @@ async def run(self, url=None, **kwargs):
soup = BeautifulSoup(html, "html.parser")
self.crawl(soup, **kwargs)
except Exception as e:
_send_slack_message(f"Error in Run, {type(e).__name}: {str(e)}\nURL: {url}")
_send_slack_message(f"Error in Run, {type(e).__name__}: {str(e)}\nURL: {url}")

def normalize(self, meal, **kwargs):
for normalizer_cls in self.normalizer_classes:
@@ -275,7 +293,7 @@ def normalize(self, meal, **kwargs):

def is_meal_name_when_normalized(self, name):
    """Return whether *name* still counts as a meal name once normalized.

    Args:
        name: Raw menu text.

    Returns:
        ``False`` when the normalized name is empty, equals an entry of
        ``self.not_meal_exact_match``, or matches any entry of
        ``self.not_meal``; ``True`` otherwise.
    """
    # The second argument is text_normalizer's ``only_letters`` flag.
    normalized_name = text_normalizer(name, True)
    # Exact-match exclusions apply only when the whole normalized name
    # equals the entry.
    if not normalized_name or normalized_name in self.not_meal_exact_match:
        return False
    # Each ``not_meal`` entry is spliced into a regex as-is, so the name is
    # rejected when any entry matches somewhere inside it.
    return all(re.match(".*" + p + ".*", normalized_name) is None for p in self.not_meal)
4 changes: 2 additions & 2 deletions slack.py
Original file line number Diff line number Diff line change
@@ -7,15 +7,15 @@ def _send_slack_message(message: str):
slack_token = os.environ.get("SLACK_TOKEN")
slack_channel = os.environ.get("SLACK_CHANNEL")
if not slack_token:
print(f"No Slack token provided. Skip sending message: {message}")
print(f"No Slack token provided. Skip sending message:\n{message}")
return
body = {"channel": slack_channel, "text": message}
headers = {"Authorization": f"Bearer {slack_token}"}
try:
res = requests.post("https://slack.com/api/chat.postMessage", headers=headers, data=body, timeout=100)
res.raise_for_status()
except Exception as e:
print(f"Failed to send Slack message: {str(e)}")
print(f"Failed to send Slack message: \n{str(e)}")
print(f"Response: {e.response.text if e.response else 'No response'}")


0 comments on commit d96c695

Please sign in to comment.