diff --git a/README.md b/README.md
index ab0f208e..ddfa523e 100644
--- a/README.md
+++ b/README.md
@@ -150,6 +150,7 @@ This scraper can (so far) scrape prices on products from:
- [Sharkgaming.dk](https://www.sharkgaming.dk/)
- [Newegg.com](https://www.newegg.com/) & [Newegg.ca](https://www.newegg.ca/)
- [HifiKlubben.dk](https://www.hifiklubben.dk/)
+- [Shein.com](https://www.shein.com/)
****OBS these Amazon domains should work: [.com](https://www.amazon.com/), [.ca](https://www.amazon.ca/), [.es](https://www.amazon.es/), [.fr](https://www.amazon.fr/), [.de](https://www.amazon.de/) and [.it](https://www.amazon.it/)
The listed Amazon domains is from my quick testing with one or two products from each domain.
diff --git a/scraper/constants.py b/scraper/constants.py
index cd75cf58..44c913bb 100644
--- a/scraper/constants.py
+++ b/scraper/constants.py
@@ -18,6 +18,7 @@
"sharkgaming": "midnightblue",
"newegg": "#f7c20a",
"hifiklubben": "#231f20",
+ "shein": "#ffed24",
}
URL_SCHEMES = ("http://", "https://")
diff --git a/scraper/domains.py b/scraper/domains.py
index 2dec2f68..56a4468f 100644
--- a/scraper/domains.py
+++ b/scraper/domains.py
@@ -310,16 +310,11 @@ def _get_product_name(self) -> str:
def _get_product_price(self) -> float:
if self.soup_url.split("/")[3] == "itm":
- price = float(
- self.request_data.find("div", class_="x-price-primary")
- .text
- .replace("US $", "")
- )
+ price = float(self.request_data.find("div", class_="x-price-primary").text.replace("US $", ""))
else:
price = float(
self.request_data.find("div", class_="x-price-primary")
- .text
- .replace("DKK ", "")
+ .text.replace("DKK ", "")
.replace("$", "")
.replace(",", "")
)
@@ -514,6 +509,27 @@ def get_short_url(self) -> str:
return f"{website}/{id}"
+class SheinHandler(BaseWebsiteHandler):
+ def _get_common_data(self) -> None:
+ script_data_raw = self.request_data.find_all("script", type="application/ld+json")[1].text
+ self.script_json = json.loads(script_data_raw)
+
+ def _get_product_name(self) -> str:
+ return self.script_json.get("name")
+
+ def _get_product_price(self) -> float:
+ return float(self.script_json.get("offers").get("price"))
+
+ def _get_product_currency(self) -> str:
+ return self.script_json.get("offers").get("priceCurrency")
+
+ def _get_product_id(self) -> str:
+ return self.script_json.get("sku")
+
+ def get_short_url(self) -> str:
+ return self.url
+
+
def get_website_name(url: str, keep_tld=False, keep_http=False, keep_www=False, keep_subdomain=True) -> str:
stripped_url = url if keep_http else url.removeprefix("https://").removeprefix("http://")
@@ -568,4 +584,5 @@ def get_website_handler(url: str) -> BaseWebsiteHandler:
"sharkgaming": SharkGamingHandler,
"newegg": NeweggHandler,
"hifiklubben": HifiKlubbenHandler,
+ "shein": SheinHandler,
}
diff --git a/tests/test_add_product.py b/tests/test_add_product.py
index 14cd2f3a..1c3aea67 100644
--- a/tests/test_add_product.py
+++ b/tests/test_add_product.py
@@ -22,6 +22,7 @@
("https://sharkgaming.dk/", does_not_raise()),
("https://www.newegg.com/", does_not_raise()),
("https://www.hifiklubben.dk/", does_not_raise()),
+ ("https://us.shein.com/", does_not_raise()),
("https://www.notsupported.com/", pytest.raises(WebsiteNotSupported)),
]
diff --git a/tests/test_objects.json b/tests/test_objects.json
index 5997d58b..0c65e877 100644
--- a/tests/test_objects.json
+++ b/tests/test_objects.json
@@ -95,6 +95,12 @@
"expected_title": "SENNHEISER MOMENTUM 4 WIRELESS",
"expected_id": "senmomentum4bk",
"expected_currency": "DKK"
+ },
+ "shein": {
+ "link": "https://euqs.shein.com/Men-s-Letter-Print-Slim-Fit-Short-Sleeve-T-Shirt-p-28492178.html",
+ "expected_title": "Men's Letter Print Slim Fit Short Sleeve T-Shirt",
+ "expected_id": "sm2311284334246374",
+ "expected_currency": "EUR"
}
}
}
diff --git a/tests/test_website_handlers.py b/tests/test_website_handlers.py
index 9914bc34..bcc9d92a 100644
--- a/tests/test_website_handlers.py
+++ b/tests/test_website_handlers.py
@@ -17,6 +17,7 @@
ProshopHandler,
SharkGamingHandler,
HifiKlubbenHandler,
+ SheinHandler,
)
from scraper.models import Info
@@ -47,6 +48,7 @@ def read_json(filename: str) -> dict:
sharkgaming_test = test_website_handlers_json["sharkgaming"]
newegg_test = test_website_handlers_json["newegg"]
hifiklubben_test = test_website_handlers_json["hifiklubben"]
+shein_test = test_website_handlers_json["shein"]
class BaseTestWebsiteHandler(ABC):
@@ -560,3 +562,33 @@ def test_get_id(self) -> None:
id = self.test_handler._get_product_id()
assert isinstance(id, str)
assert id == hifiklubben_test["expected_id"]
+
+
+class TestSheinHandler(BaseTestWebsiteHandler):
+ test_handler = SheinHandler(shein_test["link"])
+
+ def test_get_product_info(self, mocker) -> None:
+ mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data)
+ actual = self.test_handler.get_product_info()
+ assert isinstance(actual, Info)
+ assert actual.valid
+
+ def test_get_name(self) -> None:
+ actual = self.test_handler._get_product_name().lower()
+ expected = shein_test["expected_title"].lower()
+ assert isinstance(actual, str)
+ assert actual == expected
+
+ def test_get_price(self) -> None:
+ price = self.test_handler._get_product_price()
+ assert isinstance(price, float)
+
+ def test_get_currency(self) -> None:
+ currency = self.test_handler._get_product_currency()
+ assert isinstance(currency, str)
+ assert currency == shein_test["expected_currency"]
+
+ def test_get_id(self) -> None:
+ id = self.test_handler._get_product_id()
+ assert isinstance(id, str)
+ assert id == shein_test["expected_id"]