From 68cf08c500d2bbacc325fecb14834381d7541dfe Mon Sep 17 00:00:00 2001 From: Crinibus <57172157+Crinibus@users.noreply.github.com> Date: Tue, 9 Jan 2024 23:30:13 +0100 Subject: [PATCH 1/8] Add website handler 'Shein' Add website handler 'Shein' Add key 'shein' to dictionary 'SUPPORTED_DOMAINS' Add url 'https://us.shein.com/' to test_add_product --- scraper/domains.py | 22 ++++++++++++++++++++++ tests/test_add_product.py | 1 + 2 files changed, 23 insertions(+) diff --git a/scraper/domains.py b/scraper/domains.py index 68fa14bf..dd988d83 100644 --- a/scraper/domains.py +++ b/scraper/domains.py @@ -515,6 +515,27 @@ def get_short_url(self) -> str: def get_website_name(url: str, keep_tld=False, keep_http=False, keep_www=False) -> str: +class Shein(BaseWebsiteHandler): + def _get_common_data(self) -> None: + script_data_raw = self.request_data.find_all("script", type="application/ld+json")[1].text + self.script_json = json.loads(script_data_raw) + + def _get_product_name(self) -> str: + return self.script_json.get("name") + + def _get_product_price(self) -> float: + return float(self.script_json.get("offers").get("price")) + + def _get_product_currency(self) -> str: + return self.script_json.get("offers").get("priceCurrency") + + def _get_product_id(self) -> str: + return self.script_json.get("sku") + + def get_short_url(self) -> str: + return self.url + + stripped_url = url if keep_http else url.removeprefix("https://").removeprefix("http://") stripped_url = stripped_url if keep_www else stripped_url.replace("www.", "", 1) domain = "/".join(stripped_url.split("/")[0:3]) if keep_http else stripped_url.split("/")[0] @@ -553,4 +574,5 @@ def get_website_handler(url: str) -> BaseWebsiteHandler: "sharkgaming": SharkGamingHandler, "newegg": NeweggHandler, "hifiklubben": HifiKlubbenHandler, + "shein": Shein, } diff --git a/tests/test_add_product.py b/tests/test_add_product.py index 14cd2f3a..1c3aea67 100644 --- a/tests/test_add_product.py +++ b/tests/test_add_product.py @@ 
-22,6 +22,7 @@ ("https://sharkgaming.dk/", does_not_raise()), ("https://www.newegg.com/", does_not_raise()), ("https://www.hifiklubben.dk/", does_not_raise()), + ("https://us.shein.com/", does_not_raise()), ("https://www.notsupported.com/", pytest.raises(WebsiteNotSupported)), ] From 4c3227fafd0060b35054f4086b2d642013090502 Mon Sep 17 00:00:00 2001 From: Crinibus <57172157+Crinibus@users.noreply.github.com> Date: Tue, 9 Jan 2024 23:42:36 +0100 Subject: [PATCH 2/8] Fix domains.py --- scraper/domains.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scraper/domains.py b/scraper/domains.py index dd988d83..cf82e2c0 100644 --- a/scraper/domains.py +++ b/scraper/domains.py @@ -514,7 +514,6 @@ def get_short_url(self) -> str: return f"{website}/{id}" -def get_website_name(url: str, keep_tld=False, keep_http=False, keep_www=False) -> str: class Shein(BaseWebsiteHandler): def _get_common_data(self) -> None: script_data_raw = self.request_data.find_all("script", type="application/ld+json")[1].text @@ -536,6 +535,7 @@ def get_short_url(self) -> str: return self.url +def get_website_name(url: str, keep_tld=False, keep_http=False, keep_www=False) -> str: stripped_url = url if keep_http else url.removeprefix("https://").removeprefix("http://") stripped_url = stripped_url if keep_www else stripped_url.replace("www.", "", 1) domain = "/".join(stripped_url.split("/")[0:3]) if keep_http else stripped_url.split("/")[0] From f1fcf5d6fdcd667a3fc7fdc5b861ae69e010731f Mon Sep 17 00:00:00 2001 From: Crinibus <57172157+Crinibus@users.noreply.github.com> Date: Wed, 10 Jan 2024 17:45:39 +0100 Subject: [PATCH 3/8] Add Shein.com to supported websites in README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index ab0f208e..3605796e 100644 --- a/README.md +++ b/README.md @@ -150,6 +150,7 @@ This scraper can (so far) scrape prices on products from: - [Sharkgaming.dk](https://www.sharkgaming.dk/) - [Newegg.com](https://www.newegg.com/) & 
[Newegg.ca](https://www.newegg.ca/) - [HifiKlubben.dk](https://www.hifiklubben.dk/) +- [Shein.com](https://www.us.shein.com/) ****OBS these Amazon domains should work: [.com](https://www.amazon.com/), [.ca](https://www.amazon.ca/), [.es](https://www.amazon.es/), [.fr](https://www.amazon.fr/), [.de](https://www.amazon.de/) and [.it](https://www.amazon.it/)
The listed Amazon domains are from my quick testing with one or two products from each domain.
From 940b286ddc1ef528e77fe323f94a0985701af54e Mon Sep 17 00:00:00 2001 From: Crinibus <57172157+Crinibus@users.noreply.github.com> Date: Fri, 12 Jan 2024 23:38:07 +0100 Subject: [PATCH 4/8] Add shein to WEBSITE_COLORS --- scraper/constants.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scraper/constants.py b/scraper/constants.py index cd75cf58..44c913bb 100644 --- a/scraper/constants.py +++ b/scraper/constants.py @@ -18,6 +18,7 @@ "sharkgaming": "midnightblue", "newegg": "#f7c20a", "hifiklubben": "#231f20", + "shein": "#ffed24", } URL_SCHEMES = ("http://", "https://") From 33a6f562e49c5c2fad8fb8c33c4428b2e4f92750 Mon Sep 17 00:00:00 2001 From: Crinibus <57172157+Crinibus@users.noreply.github.com> Date: Fri, 12 Jan 2024 23:56:03 +0100 Subject: [PATCH 5/8] Delete duplicate Shein website handler --- scraper/domains.py | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/scraper/domains.py b/scraper/domains.py index 1ed76583..e47fbd93 100644 --- a/scraper/domains.py +++ b/scraper/domains.py @@ -530,27 +530,6 @@ def get_short_url(self) -> str: return self.url -class Shein(BaseWebsiteHandler): - def _get_common_data(self) -> None: - script_data_raw = self.request_data.find_all("script", type="application/ld+json")[1].text - self.script_json = json.loads(script_data_raw) - - def _get_product_name(self) -> str: - return self.script_json.get("name") - - def _get_product_price(self) -> float: - return float(self.script_json.get("offers").get("price")) - - def _get_product_currency(self) -> str: - return self.script_json.get("offers").get("priceCurrency") - - def _get_product_id(self) -> str: - return self.script_json.get("sku") - - def get_short_url(self) -> str: - return self.url - - def get_website_name(url: str, keep_tld=False, keep_http=False, keep_www=False, keep_subdomain=True) -> str: stripped_url = url if keep_http else url.removeprefix("https://").removeprefix("http://") From b07f4757295bfb8d9bed13ca6b861dd2f673c7b4 Mon Sep 17 00:00:00 2001 
From: Crinibus <57172157+Crinibus@users.noreply.github.com> Date: Fri, 12 Jan 2024 23:57:01 +0100 Subject: [PATCH 6/8] Update link to Shein in README Remove subdomain 'us' from Shein link --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3605796e..ddfa523e 100644 --- a/README.md +++ b/README.md @@ -150,7 +150,7 @@ This scraper can (so far) scrape prices on products from: - [Sharkgaming.dk](https://www.sharkgaming.dk/) - [Newegg.com](https://www.newegg.com/) & [Newegg.ca](https://www.newegg.ca/) - [HifiKlubben.dk](https://www.hifiklubben.dk/) -- [Shein.com](https://www.us.shein.com/) +- [Shein.com](https://www.shein.com/) ****OBS these Amazon domains should work: [.com](https://www.amazon.com/), [.ca](https://www.amazon.ca/), [.es](https://www.amazon.es/), [.fr](https://www.amazon.fr/), [.de](https://www.amazon.de/) and [.it](https://www.amazon.it/)
The listed Amazon domains are from my quick testing with one or two products from each domain.
From 9c1cb035983ba3af1a7b00679794943bfc52c27a Mon Sep 17 00:00:00 2001 From: Crinibus <57172157+Crinibus@users.noreply.github.com> Date: Sat, 27 Jan 2024 16:32:27 +0100 Subject: [PATCH 7/8] Rename class 'Shein' to 'SheinHandler' --- scraper/domains.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scraper/domains.py b/scraper/domains.py index e47fbd93..56a4468f 100644 --- a/scraper/domains.py +++ b/scraper/domains.py @@ -509,7 +509,7 @@ def get_short_url(self) -> str: return f"{website}/{id}" -class Shein(BaseWebsiteHandler): +class SheinHandler(BaseWebsiteHandler): def _get_common_data(self) -> None: script_data_raw = self.request_data.find_all("script", type="application/ld+json")[1].text self.script_json = json.loads(script_data_raw) @@ -584,5 +584,5 @@ def get_website_handler(url: str) -> BaseWebsiteHandler: "sharkgaming": SharkGamingHandler, "newegg": NeweggHandler, "hifiklubben": HifiKlubbenHandler, - "shein": Shein, + "shein": SheinHandler, } From 467a74dbc90792960be973ca059a055330c35b2c Mon Sep 17 00:00:00 2001 From: Crinibus <57172157+Crinibus@users.noreply.github.com> Date: Sat, 27 Jan 2024 16:32:45 +0100 Subject: [PATCH 8/8] Add website handler tests for SheinHandler --- tests/test_objects.json | 6 ++++++ tests/test_website_handlers.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/tests/test_objects.json b/tests/test_objects.json index 5997d58b..0c65e877 100644 --- a/tests/test_objects.json +++ b/tests/test_objects.json @@ -95,6 +95,12 @@ "expected_title": "SENNHEISER MOMENTUM 4 WIRELESS", "expected_id": "senmomentum4bk", "expected_currency": "DKK" + }, + "shein": { + "link": "https://euqs.shein.com/Men-s-Letter-Print-Slim-Fit-Short-Sleeve-T-Shirt-p-28492178.html", + "expected_title": "Men's Letter Print Slim Fit Short Sleeve T-Shirt", + "expected_id": "sm2311284334246374", + "expected_currency": "EUR" } } } diff --git a/tests/test_website_handlers.py b/tests/test_website_handlers.py index 
9914bc34..bcc9d92a 100644 --- a/tests/test_website_handlers.py +++ b/tests/test_website_handlers.py @@ -17,6 +17,7 @@ ProshopHandler, SharkGamingHandler, HifiKlubbenHandler, + SheinHandler, ) from scraper.models import Info @@ -47,6 +48,7 @@ def read_json(filename: str) -> dict: sharkgaming_test = test_website_handlers_json["sharkgaming"] newegg_test = test_website_handlers_json["newegg"] hifiklubben_test = test_website_handlers_json["hifiklubben"] +shein_test = test_website_handlers_json["shein"] class BaseTestWebsiteHandler(ABC): @@ -560,3 +562,33 @@ def test_get_id(self) -> None: id = self.test_handler._get_product_id() assert isinstance(id, str) assert id == hifiklubben_test["expected_id"] + + +class TestSheinHandler(BaseTestWebsiteHandler): + test_handler = SheinHandler(shein_test["link"]) + + def test_get_product_info(self, mocker) -> None: + mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data) + actual = self.test_handler.get_product_info() + assert isinstance(actual, Info) + assert actual.valid + + def test_get_name(self) -> None: + actual = self.test_handler._get_product_name().lower() + expected = shein_test["expected_title"].lower() + assert isinstance(actual, str) + assert actual == expected + + def test_get_price(self) -> None: + price = self.test_handler._get_product_price() + assert isinstance(price, float) + + def test_get_currency(self) -> None: + currency = self.test_handler._get_product_currency() + assert isinstance(currency, str) + assert currency == shein_test["expected_currency"] + + def test_get_id(self) -> None: + id = self.test_handler._get_product_id() + assert isinstance(id, str) + assert id == shein_test["expected_id"]