Merge pull request #241 from Crinibus/add-support-to-shein

Add support for Shein
Crinibus · Jan 27, 2024 · 611fff2 · 611fff2
2 parents 88fe8be + 467a74d
commit 611fff2
Show file tree

Hide file tree

Showing 6 changed files with 65 additions and 7 deletions.
diff --git a/README.md b/README.md
@@ -150,6 +150,7 @@ This scraper can (so far) scrape prices on products from:
 - [Sharkgaming.dk](https://www.sharkgaming.dk/)
 - [Newegg.com](https://www.newegg.com/) & [Newegg.ca](https://www.newegg.ca/)
 - [HifiKlubben.dk](https://www.hifiklubben.dk/)
+- [Shein.com](https://www.shein.com/)
 
 ****OBS these Amazon domains should work: [.com](https://www.amazon.com/), [.ca](https://www.amazon.ca/), [.es](https://www.amazon.es/), [.fr](https://www.amazon.fr/), [.de](https://www.amazon.de/) and [.it](https://www.amazon.it/)<br/>
 The listed Amazon domains are from my quick testing with one or two products from each domain.<br/>

diff --git a/scraper/constants.py b/scraper/constants.py
@@ -18,6 +18,7 @@
     "sharkgaming": "midnightblue",
     "newegg": "#f7c20a",
     "hifiklubben": "#231f20",
+    "shein": "#ffed24",
 }
 
 URL_SCHEMES = ("http://", "https://")
diff --git a/scraper/domains.py b/scraper/domains.py
@@ -310,16 +310,11 @@ def _get_product_name(self) -> str:
 
     def _get_product_price(self) -> float:
         if self.soup_url.split("/")[3] == "itm":
-            price = float(
-                self.request_data.find("div", class_="x-price-primary")
-                .text
-                .replace("US $", "")
-            )
+            price = float(self.request_data.find("div", class_="x-price-primary").text.replace("US $", ""))
         else:
             price = float(
                 self.request_data.find("div", class_="x-price-primary")
-                .text
-                .replace("DKK ", "")
+                .text.replace("DKK ", "")
                 .replace("$", "")
                 .replace(",", "")
             )
@@ -514,6 +509,27 @@ def get_short_url(self) -> str:
         return f"{website}/{id}"
 
 
+class SheinHandler(BaseWebsiteHandler):
+    """Website handler for Shein product pages.
+
+    Every product field is read from a JSON-LD (``application/ld+json``)
+    script element of the page, which ``_get_common_data`` parses and stores
+    on ``self.script_json``; the ``_get_product_*`` methods read that attribute.
+    """
+
+    def _get_common_data(self) -> None:
+        """Parse the page's JSON-LD data and store it on ``self.script_json``.
+
+        Takes the second ``<script type="application/ld+json">`` element
+        (index 1) found in ``self.request_data`` and decodes its text as JSON.
+
+        Raises:
+            IndexError: If the page has fewer than two such script elements.
+            json.JSONDecodeError: If the script text is not valid JSON.
+        """
+        # NOTE(review): presumably the second ld+json block is the one that
+        # describes the product on Shein pages - confirm against a live page.
+        script_data_raw = self.request_data.find_all("script", type="application/ld+json")[1].text
+        self.script_json = json.loads(script_data_raw)
+
+    def _get_product_name(self) -> str:
+        """Return the ``"name"`` entry of the parsed JSON-LD data.
+
+        ``.get`` is used, so a missing key yields ``None`` instead of raising.
+        """
+        return self.script_json.get("name")
+
+    def _get_product_price(self) -> float:
+        """Return ``offers.price`` from the parsed JSON-LD data as a float.
+
+        Assumes ``"offers"`` is a single mapping (TODO confirm). A missing
+        ``"offers"`` entry raises ``AttributeError`` and a missing ``"price"``
+        raises ``TypeError``, because ``None`` is passed on in both cases.
+        """
+        return float(self.script_json.get("offers").get("price"))
+
+    def _get_product_currency(self) -> str:
+        """Return ``offers.priceCurrency`` from the parsed JSON-LD data.
+
+        Assumes ``"offers"`` is a single mapping (TODO confirm); a missing
+        ``"offers"`` entry raises ``AttributeError``.
+        """
+        return self.script_json.get("offers").get("priceCurrency")
+
+    def _get_product_id(self) -> str:
+        """Return the ``"sku"`` entry of the parsed JSON-LD data (``None`` if absent)."""
+        return self.script_json.get("sku")
+
+    def get_short_url(self) -> str:
+        """Return ``self.url`` unchanged; no shortening is applied for Shein."""
+        return self.url
+
+
 def get_website_name(url: str, keep_tld=False, keep_http=False, keep_www=False, keep_subdomain=True) -> str:
     stripped_url = url if keep_http else url.removeprefix("https://").removeprefix("http://")
 
@@ -568,4 +584,5 @@ def get_website_handler(url: str) -> BaseWebsiteHandler:
     "sharkgaming": SharkGamingHandler,
     "newegg": NeweggHandler,
     "hifiklubben": HifiKlubbenHandler,
+    "shein": SheinHandler,
 }
diff --git a/tests/test_add_product.py b/tests/test_add_product.py
@@ -22,6 +22,7 @@
     ("https://sharkgaming.dk/", does_not_raise()),
     ("https://www.newegg.com/", does_not_raise()),
     ("https://www.hifiklubben.dk/", does_not_raise()),
+    ("https://us.shein.com/", does_not_raise()),
     ("https://www.notsupported.com/", pytest.raises(WebsiteNotSupported)),
 ]
 

diff --git a/tests/test_objects.json b/tests/test_objects.json
@@ -95,6 +95,12 @@
       "expected_title": "SENNHEISER MOMENTUM 4 WIRELESS",
       "expected_id": "senmomentum4bk",
       "expected_currency": "DKK"
+    },
+    "shein": {
+      "link": "https://euqs.shein.com/Men-s-Letter-Print-Slim-Fit-Short-Sleeve-T-Shirt-p-28492178.html",
+      "expected_title": "Men's Letter Print Slim Fit Short Sleeve T-Shirt",
+      "expected_id": "sm2311284334246374",
+      "expected_currency": "EUR"
     }
   }
 }
diff --git a/tests/test_website_handlers.py b/tests/test_website_handlers.py
@@ -17,6 +17,7 @@
     ProshopHandler,
     SharkGamingHandler,
     HifiKlubbenHandler,
+    SheinHandler,
 )
 from scraper.models import Info
 
@@ -47,6 +48,7 @@ def read_json(filename: str) -> dict:
 sharkgaming_test = test_website_handlers_json["sharkgaming"]
 newegg_test = test_website_handlers_json["newegg"]
 hifiklubben_test = test_website_handlers_json["hifiklubben"]
+shein_test = test_website_handlers_json["shein"]
 
 
 class BaseTestWebsiteHandler(ABC):
@@ -560,3 +562,33 @@ def test_get_id(self) -> None:
         id = self.test_handler._get_product_id()
         assert isinstance(id, str)
         assert id == hifiklubben_test["expected_id"]
+
+
+class TestSheinHandler(BaseTestWebsiteHandler):
+    """Tests for ``SheinHandler`` using the ``"shein"`` entry of the test JSON data."""
+
+    # Created once, when the class body executes, and shared by all test methods.
+    # NOTE(review): the tests below call the ``_get_product_*`` methods directly,
+    # so they rely on the handler's page data already being loaded - presumably
+    # by BaseTestWebsiteHandler or the handler constructor; confirm.
+    test_handler = SheinHandler(shein_test["link"])
+
+    def test_get_product_info(self, mocker) -> None:
+        """``get_product_info`` returns an ``Info`` object that is marked valid."""
+        # Make ``_request_product_data`` return whatever ``test_handler.request_data``
+        # currently holds (presumably to avoid a second network request - confirm).
+        mocker.patch("scraper.domains.BaseWebsiteHandler._request_product_data", return_value=self.test_handler.request_data)
+        actual = self.test_handler.get_product_info()
+        assert isinstance(actual, Info)
+        assert actual.valid
+
+    def test_get_name(self) -> None:
+        """The product name matches ``expected_title``, compared case-insensitively."""
+        actual = self.test_handler._get_product_name().lower()
+        expected = shein_test["expected_title"].lower()
+        assert isinstance(actual, str)
+        assert actual == expected
+
+    def test_get_price(self) -> None:
+        """The price is a float; no specific value is asserted."""
+        price = self.test_handler._get_product_price()
+        assert isinstance(price, float)
+
+    def test_get_currency(self) -> None:
+        """The currency is a string equal to ``expected_currency``."""
+        currency = self.test_handler._get_product_currency()
+        assert isinstance(currency, str)
+        assert currency == shein_test["expected_currency"]
+
+    def test_get_id(self) -> None:
+        """The product id is a string equal to ``expected_id``."""
+        id = self.test_handler._get_product_id()
+        assert isinstance(id, str)
+        assert id == shein_test["expected_id"]