From 68cf08c500d2bbacc325fecb14834381d7541dfe Mon Sep 17 00:00:00 2001
From: Crinibus <57172157+Crinibus@users.noreply.github.com>
Date: Tue, 9 Jan 2024 23:30:13 +0100
Subject: [PATCH 1/8] Add website handler 'Shein'
Add handler class 'Shein' that reads product data from the page's JSON-LD script tag
Add key 'shein' to dictionary 'SUPPORTED_DOMAINS'
Add url 'https://us.shein.com/' to test_add_product
---
scraper/domains.py | 22 ++++++++++++++++++++++
tests/test_add_product.py | 1 +
2 files changed, 23 insertions(+)
diff --git a/scraper/domains.py b/scraper/domains.py
index 68fa14bf..dd988d83 100644
--- a/scraper/domains.py
+++ b/scraper/domains.py
@@ -515,6 +515,27 @@ def get_short_url(self) -> str:
def get_website_name(url: str, keep_tld=False, keep_http=False, keep_www=False) -> str:
+class Shein(BaseWebsiteHandler):
+ def _get_common_data(self) -> None:
+ script_data_raw = self.request_data.find_all("script", type="application/ld+json")[1].text
+ self.script_json = json.loads(script_data_raw)
+
+ def _get_product_name(self) -> str:
+ return self.script_json.get("name")
+
+ def _get_product_price(self) -> float:
+ return float(self.script_json.get("offers").get("price"))
+
+ def _get_product_currency(self) -> str:
+ return self.script_json.get("offers").get("priceCurrency")
+
+ def _get_product_id(self) -> str:
+ return self.script_json.get("sku")
+
+ def get_short_url(self) -> str:
+ return self.url
+
+
stripped_url = url if keep_http else url.removeprefix("https://").removeprefix("http://")
stripped_url = stripped_url if keep_www else stripped_url.replace("www.", "", 1)
domain = "/".join(stripped_url.split("/")[0:3]) if keep_http else stripped_url.split("/")[0]
@@ -553,4 +574,5 @@ def get_website_handler(url: str) -> BaseWebsiteHandler:
"sharkgaming": SharkGamingHandler,
"newegg": NeweggHandler,
"hifiklubben": HifiKlubbenHandler,
+ "shein": Shein,
}
diff --git a/tests/test_add_product.py b/tests/test_add_product.py
index 14cd2f3a..1c3aea67 100644
--- a/tests/test_add_product.py
+++ b/tests/test_add_product.py
@@ -22,6 +22,7 @@
("https://sharkgaming.dk/", does_not_raise()),
("https://www.newegg.com/", does_not_raise()),
("https://www.hifiklubben.dk/", does_not_raise()),
+ ("https://us.shein.com/", does_not_raise()),
("https://www.notsupported.com/", pytest.raises(WebsiteNotSupported)),
]
From 4c3227fafd0060b35054f4086b2d642013090502 Mon Sep 17 00:00:00 2001
From: Crinibus <57172157+Crinibus@users.noreply.github.com>
Date: Tue, 9 Jan 2024 23:42:36 +0100
Subject: [PATCH 2/8] Fix domains.py: move get_website_name signature below Shein class
---
scraper/domains.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/scraper/domains.py b/scraper/domains.py
index dd988d83..cf82e2c0 100644
--- a/scraper/domains.py
+++ b/scraper/domains.py
@@ -514,7 +514,6 @@ def get_short_url(self) -> str:
return f"{website}/{id}"
-def get_website_name(url: str, keep_tld=False, keep_http=False, keep_www=False) -> str:
class Shein(BaseWebsiteHandler):
def _get_common_data(self) -> None:
script_data_raw = self.request_data.find_all("script", type="application/ld+json")[1].text
@@ -536,6 +535,7 @@ def get_short_url(self) -> str:
return self.url
+def get_website_name(url: str, keep_tld=False, keep_http=False, keep_www=False) -> str:
stripped_url = url if keep_http else url.removeprefix("https://").removeprefix("http://")
stripped_url = stripped_url if keep_www else stripped_url.replace("www.", "", 1)
domain = "/".join(stripped_url.split("/")[0:3]) if keep_http else stripped_url.split("/")[0]
From f1fcf5d6fdcd667a3fc7fdc5b861ae69e010731f Mon Sep 17 00:00:00 2001
From: Crinibus <57172157+Crinibus@users.noreply.github.com>
Date: Wed, 10 Jan 2024 17:45:39 +0100
Subject: [PATCH 3/8] Add Shein.com to supported websites in README
---
README.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/README.md b/README.md
index ab0f208e..3605796e 100644
--- a/README.md
+++ b/README.md
@@ -150,6 +150,7 @@ This scraper can (so far) scrape prices on products from:
- [Sharkgaming.dk](https://www.sharkgaming.dk/)
- [Newegg.com](https://www.newegg.com/) & [Newegg.ca](https://www.newegg.ca/)
- [HifiKlubben.dk](https://www.hifiklubben.dk/)
+- [Shein.com](https://www.us.shein.com/)
****OBS these Amazon domains should work: [.com](https://www.amazon.com/), [.ca](https://www.amazon.ca/), [.es](https://www.amazon.es/), [.fr](https://www.amazon.fr/), [.de](https://www.amazon.de/) and [.it](https://www.amazon.it/)
The listed Amazon domains is from my quick testing with one or two products from each domain.
From 940b286ddc1ef528e77fe323f94a0985701af54e Mon Sep 17 00:00:00 2001
From: Crinibus <57172157+Crinibus@users.noreply.github.com>
Date: Fri, 12 Jan 2024 23:38:07 +0100
Subject: [PATCH 4/8] Add shein to WEBSITE_COLORS
---
scraper/constants.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/scraper/constants.py b/scraper/constants.py
index cd75cf58..44c913bb 100644
--- a/scraper/constants.py
+++ b/scraper/constants.py
@@ -18,6 +18,7 @@
"sharkgaming": "midnightblue",
"newegg": "#f7c20a",
"hifiklubben": "#231f20",
+ "shein": "#ffed24",
}
URL_SCHEMES = ("http://", "https://")
From 33a6f562e49c5c2fad8fb8c33c4428b2e4f92750 Mon Sep 17 00:00:00 2001
From: Crinibus <57172157+Crinibus@users.noreply.github.com>
Date: Fri, 12 Jan 2024 23:56:03 +0100
Subject: [PATCH 5/8] Delete duplicate Shein website handler
---
scraper/domains.py | 21 ---------------------
1 file changed, 21 deletions(-)
diff --git a/scraper/domains.py b/scraper/domains.py
index 1ed76583..e47fbd93 100644
--- a/scraper/domains.py
+++ b/scraper/domains.py
@@ -530,27 +530,6 @@ def get_short_url(self) -> str:
return self.url
-class Shein(BaseWebsiteHandler):
- def _get_common_data(self) -> None:
- script_data_raw = self.request_data.find_all("script", type="application/ld+json")[1].text
- self.script_json = json.loads(script_data_raw)
-
- def _get_product_name(self) -> str:
- return self.script_json.get("name")
-
- def _get_product_price(self) -> float:
- return float(self.script_json.get("offers").get("price"))
-
- def _get_product_currency(self) -> str:
- return self.script_json.get("offers").get("priceCurrency")
-
- def _get_product_id(self) -> str:
- return self.script_json.get("sku")
-
- def get_short_url(self) -> str:
- return self.url
-
-
def get_website_name(url: str, keep_tld=False, keep_http=False, keep_www=False, keep_subdomain=True) -> str:
stripped_url = url if keep_http else url.removeprefix("https://").removeprefix("http://")
From b07f4757295bfb8d9bed13ca6b861dd2f673c7b4 Mon Sep 17 00:00:00 2001
From: Crinibus <57172157+Crinibus@users.noreply.github.com>
Date: Fri, 12 Jan 2024 23:57:01 +0100
Subject: [PATCH 6/8] Update link to Shein in README
Remove subdomain 'us' from Shein link
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 3605796e..ddfa523e 100644
--- a/README.md
+++ b/README.md
@@ -150,7 +150,7 @@ This scraper can (so far) scrape prices on products from:
- [Sharkgaming.dk](https://www.sharkgaming.dk/)
- [Newegg.com](https://www.newegg.com/) & [Newegg.ca](https://www.newegg.ca/)
- [HifiKlubben.dk](https://www.hifiklubben.dk/)
-- [Shein.com](https://www.us.shein.com/)
+- [Shein.com](https://www.shein.com/)
****OBS these Amazon domains should work: [.com](https://www.amazon.com/), [.ca](https://www.amazon.ca/), [.es](https://www.amazon.es/), [.fr](https://www.amazon.fr/), [.de](https://www.amazon.de/) and [.it](https://www.amazon.it/)
The listed Amazon domains is from my quick testing with one or two products from each domain.
From 9c1cb035983ba3af1a7b00679794943bfc52c27a Mon Sep 17 00:00:00 2001
From: Crinibus <57172157+Crinibus@users.noreply.github.com>
Date: Sat, 27 Jan 2024 16:32:27 +0100
Subject: [PATCH 7/8] Rename class 'Shein' to 'SheinHandler'
---
scraper/domains.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/scraper/domains.py b/scraper/domains.py
index e47fbd93..56a4468f 100644
--- a/scraper/domains.py
+++ b/scraper/domains.py
@@ -509,7 +509,7 @@ def get_short_url(self) -> str:
return f"{website}/{id}"
-class Shein(BaseWebsiteHandler):
+class SheinHandler(BaseWebsiteHandler):
def _get_common_data(self) -> None:
script_data_raw = self.request_data.find_all("script", type="application/ld+json")[1].text
self.script_json = json.loads(script_data_raw)
@@ -584,5 +584,5 @@ def get_website_handler(url: str) -> BaseWebsiteHandler:
"sharkgaming": SharkGamingHandler,
"newegg": NeweggHandler,
"hifiklubben": HifiKlubbenHandler,
- "shein": Shein,
+ "shein": SheinHandler,
}
From 467a74dbc90792960be973ca059a055330c35b2c Mon Sep 17 00:00:00 2001
From: Crinibus <57172157+Crinibus@users.noreply.github.com>
Date: Sat, 27 Jan 2024 16:32:45 +0100
Subject: [PATCH 8/8] Add website handler tests for SheinHandler
---
tests/test_objects.json | 6 ++++++
tests/test_website_handlers.py | 32 ++++++++++++++++++++++++++++++++
2 files changed, 38 insertions(+)
diff --git a/tests/test_objects.json b/tests/test_objects.json
index 5997d58b..0c65e877 100644
--- a/tests/test_objects.json
+++ b/tests/test_objects.json
@@ -95,6 +95,12 @@
"expected_title": "SENNHEISER MOMENTUM 4 WIRELESS",
"expected_id": "senmomentum4bk",
"expected_currency": "DKK"
+ },
+ "shein": {
+ "link": "https://euqs.shein.com/Men-s-Letter-Print-Slim-Fit-Short-Sleeve-T-Shirt-p-28492178.html",
+ "expected_title": "Men's Letter Print Slim Fit Short Sleeve T-Shirt",
+ "expected_id": "sm2311284334246374",
+ "expected_currency": "EUR"
}
}
}
diff --git a/tests/test_website_handlers.py b/tests/test_website_handlers.py
index 9914bc34..bcc9d92a 100644
--- a/tests/test_website_handlers.py
+++ b/tests/test_website_handlers.py
@@ -17,6 +17,7 @@
ProshopHandler,
SharkGamingHandler,
HifiKlubbenHandler,
+ SheinHandler,
)
from scraper.models import Info
@@ -47,6 +48,7 @@ def read_json(filename: str) -> dict:
sharkgaming_test = test_website_handlers_json["sharkgaming"]
newegg_test = test_website_handlers_json["newegg"]
hifiklubben_test = test_website_handlers_json["hifiklubben"]
+shein_test = test_website_handlers_json["shein"]
class BaseTestWebsiteHandler(ABC):
@@ -560,3 +562,33 @@ def test_get_id(self) -> None:
id = self.test_handler._get_product_id()
assert isinstance(id, str)
assert id == hifiklubben_test["expected_id"]
+
+
+class TestSheinHandler(BaseTestWebsiteHandler):
+ test_handler = SheinHandler(shein_test["link"])
+
+ def test_get_product_info(self, mocker) -> None:
+ mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data)
+ actual = self.test_handler.get_product_info()
+ assert isinstance(actual, Info)
+ assert actual.valid
+
+ def test_get_name(self) -> None:
+ actual = self.test_handler._get_product_name().lower()
+ expected = shein_test["expected_title"].lower()
+ assert isinstance(actual, str)
+ assert actual == expected
+
+ def test_get_price(self) -> None:
+ price = self.test_handler._get_product_price()
+ assert isinstance(price, float)
+
+ def test_get_currency(self) -> None:
+ currency = self.test_handler._get_product_currency()
+ assert isinstance(currency, str)
+ assert currency == shein_test["expected_currency"]
+
+ def test_get_id(self) -> None:
+ id = self.test_handler._get_product_id()
+ assert isinstance(id, str)
+ assert id == shein_test["expected_id"]