diff --git a/README.md b/README.md index ab0f208e..ddfa523e 100644 --- a/README.md +++ b/README.md @@ -150,6 +150,7 @@ This scraper can (so far) scrape prices on products from: - [Sharkgaming.dk](https://www.sharkgaming.dk/) - [Newegg.com](https://www.newegg.com/) & [Newegg.ca](https://www.newegg.ca/) - [HifiKlubben.dk](https://www.hifiklubben.dk/) +- [Shein.com](https://www.shein.com/) ****OBS these Amazon domains should work: [.com](https://www.amazon.com/), [.ca](https://www.amazon.ca/), [.es](https://www.amazon.es/), [.fr](https://www.amazon.fr/), [.de](https://www.amazon.de/) and [.it](https://www.amazon.it/)
The listed Amazon domains are from my quick testing with one or two products from each domain.
diff --git a/scraper/constants.py b/scraper/constants.py index cd75cf58..44c913bb 100644 --- a/scraper/constants.py +++ b/scraper/constants.py @@ -18,6 +18,7 @@ "sharkgaming": "midnightblue", "newegg": "#f7c20a", "hifiklubben": "#231f20", + "shein": "#ffed24", } URL_SCHEMES = ("http://", "https://") diff --git a/scraper/domains.py b/scraper/domains.py index 2dec2f68..56a4468f 100644 --- a/scraper/domains.py +++ b/scraper/domains.py @@ -310,16 +310,11 @@ def _get_product_name(self) -> str: def _get_product_price(self) -> float: if self.soup_url.split("/")[3] == "itm": - price = float( - self.request_data.find("div", class_="x-price-primary") - .text - .replace("US $", "") - ) + price = float(self.request_data.find("div", class_="x-price-primary").text.replace("US $", "")) else: price = float( self.request_data.find("div", class_="x-price-primary") - .text - .replace("DKK ", "") + .text.replace("DKK ", "") .replace("$", "") .replace(",", "") ) @@ -514,6 +509,27 @@ def get_short_url(self) -> str: return f"{website}/{id}" +class SheinHandler(BaseWebsiteHandler): + def _get_common_data(self) -> None: + script_data_raw = self.request_data.find_all("script", type="application/ld+json")[1].text + self.script_json = json.loads(script_data_raw) + + def _get_product_name(self) -> str: + return self.script_json.get("name") + + def _get_product_price(self) -> float: + return float(self.script_json.get("offers").get("price")) + + def _get_product_currency(self) -> str: + return self.script_json.get("offers").get("priceCurrency") + + def _get_product_id(self) -> str: + return self.script_json.get("sku") + + def get_short_url(self) -> str: + return self.url + + def get_website_name(url: str, keep_tld=False, keep_http=False, keep_www=False, keep_subdomain=True) -> str: stripped_url = url if keep_http else url.removeprefix("https://").removeprefix("http://") @@ -568,4 +584,5 @@ def get_website_handler(url: str) -> BaseWebsiteHandler: "sharkgaming": SharkGamingHandler, "newegg": 
NeweggHandler, "hifiklubben": HifiKlubbenHandler, + "shein": SheinHandler, } diff --git a/tests/test_add_product.py b/tests/test_add_product.py index 14cd2f3a..1c3aea67 100644 --- a/tests/test_add_product.py +++ b/tests/test_add_product.py @@ -22,6 +22,7 @@ ("https://sharkgaming.dk/", does_not_raise()), ("https://www.newegg.com/", does_not_raise()), ("https://www.hifiklubben.dk/", does_not_raise()), + ("https://us.shein.com/", does_not_raise()), ("https://www.notsupported.com/", pytest.raises(WebsiteNotSupported)), ] diff --git a/tests/test_objects.json b/tests/test_objects.json index 5997d58b..0c65e877 100644 --- a/tests/test_objects.json +++ b/tests/test_objects.json @@ -95,6 +95,12 @@ "expected_title": "SENNHEISER MOMENTUM 4 WIRELESS", "expected_id": "senmomentum4bk", "expected_currency": "DKK" + }, + "shein": { + "link": "https://euqs.shein.com/Men-s-Letter-Print-Slim-Fit-Short-Sleeve-T-Shirt-p-28492178.html", + "expected_title": "Men's Letter Print Slim Fit Short Sleeve T-Shirt", + "expected_id": "sm2311284334246374", + "expected_currency": "EUR" } } } diff --git a/tests/test_website_handlers.py b/tests/test_website_handlers.py index 9914bc34..bcc9d92a 100644 --- a/tests/test_website_handlers.py +++ b/tests/test_website_handlers.py @@ -17,6 +17,7 @@ ProshopHandler, SharkGamingHandler, HifiKlubbenHandler, + SheinHandler, ) from scraper.models import Info @@ -47,6 +48,7 @@ def read_json(filename: str) -> dict: sharkgaming_test = test_website_handlers_json["sharkgaming"] newegg_test = test_website_handlers_json["newegg"] hifiklubben_test = test_website_handlers_json["hifiklubben"] +shein_test = test_website_handlers_json["shein"] class BaseTestWebsiteHandler(ABC): @@ -560,3 +562,33 @@ def test_get_id(self) -> None: id = self.test_handler._get_product_id() assert isinstance(id, str) assert id == hifiklubben_test["expected_id"] + + +class TestSheinHandler(BaseTestWebsiteHandler): + test_handler = SheinHandler(shein_test["link"]) + + def test_get_product_info(self, 
mocker) -> None: + mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data) + actual = self.test_handler.get_product_info() + assert isinstance(actual, Info) + assert actual.valid + + def test_get_name(self) -> None: + actual = self.test_handler._get_product_name().lower() + expected = shein_test["expected_title"].lower() + assert isinstance(actual, str) + assert actual == expected + + def test_get_price(self) -> None: + price = self.test_handler._get_product_price() + assert isinstance(price, float) + + def test_get_currency(self) -> None: + currency = self.test_handler._get_product_currency() + assert isinstance(currency, str) + assert currency == shein_test["expected_currency"] + + def test_get_id(self) -> None: + id = self.test_handler._get_product_id() + assert isinstance(id, str) + assert id == shein_test["expected_id"]