Skip to content

Commit

Permalink
Merge pull request #241 from Crinibus/add-support-to-shein
Browse files Browse the repository at this point in the history
Add support for Shein
  • Loading branch information
Crinibus authored Jan 27, 2024
2 parents 88fe8be + 467a74d commit 611fff2
Show file tree
Hide file tree
Showing 6 changed files with 65 additions and 7 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ This scraper can (so far) scrape prices on products from:
- [Sharkgaming.dk](https://www.sharkgaming.dk/)
- [Newegg.com](https://www.newegg.com/) & [Newegg.ca](https://www.newegg.ca/)
- [HifiKlubben.dk](https://www.hifiklubben.dk/)
- [Shein.com](https://www.shein.com/)

**OBS:** these Amazon domains should work: [.com](https://www.amazon.com/), [.ca](https://www.amazon.ca/), [.es](https://www.amazon.es/), [.fr](https://www.amazon.fr/), [.de](https://www.amazon.de/) and [.it](https://www.amazon.it/)<br/>
The listed Amazon domains are from my quick testing with one or two products from each domain.<br/>
Expand Down
1 change: 1 addition & 0 deletions scraper/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
"sharkgaming": "midnightblue",
"newegg": "#f7c20a",
"hifiklubben": "#231f20",
"shein": "#ffed24",
}

URL_SCHEMES = ("http://", "https://")
31 changes: 24 additions & 7 deletions scraper/domains.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,16 +310,11 @@ def _get_product_name(self) -> str:

def _get_product_price(self) -> float:
if self.soup_url.split("/")[3] == "itm":
price = float(
self.request_data.find("div", class_="x-price-primary")
.text
.replace("US $", "")
)
price = float(self.request_data.find("div", class_="x-price-primary").text.replace("US $", ""))
else:
price = float(
self.request_data.find("div", class_="x-price-primary")
.text
.replace("DKK ", "")
.text.replace("DKK ", "")
.replace("$", "")
.replace(",", "")
)
Expand Down Expand Up @@ -514,6 +509,27 @@ def get_short_url(self) -> str:
return f"{website}/{id}"


class SheinHandler(BaseWebsiteHandler):
    """Website handler that reads Shein product details from a JSON-LD script tag."""

    def _get_common_data(self) -> None:
        """Parse the second ``application/ld+json`` script and store it as ``self.script_json``."""
        ld_json_scripts = self.request_data.find_all("script", type="application/ld+json")
        # NOTE(review): index 1 is hard-coded; presumably the second JSON-LD
        # block on the page is the product entry -- confirm against a live page.
        product_script = ld_json_scripts[1]
        self.script_json = json.loads(product_script.text)

    def _get_product_name(self) -> str:
        """Return the ``name`` entry of the parsed JSON-LD data."""
        product_name = self.script_json.get("name")
        return product_name

    def _get_product_price(self) -> float:
        """Return ``offers.price`` from the parsed JSON-LD data, converted to float."""
        offers = self.script_json.get("offers")
        raw_price = offers.get("price")
        return float(raw_price)

    def _get_product_currency(self) -> str:
        """Return ``offers.priceCurrency`` from the parsed JSON-LD data."""
        offers = self.script_json.get("offers")
        return offers.get("priceCurrency")

    def _get_product_id(self) -> str:
        """Return the ``sku`` entry of the parsed JSON-LD data."""
        sku = self.script_json.get("sku")
        return sku

    def get_short_url(self) -> str:
        """Return the handler's URL unchanged; no shortening is applied."""
        return self.url


def get_website_name(url: str, keep_tld=False, keep_http=False, keep_www=False, keep_subdomain=True) -> str:
stripped_url = url if keep_http else url.removeprefix("https://").removeprefix("http://")

Expand Down Expand Up @@ -568,4 +584,5 @@ def get_website_handler(url: str) -> BaseWebsiteHandler:
"sharkgaming": SharkGamingHandler,
"newegg": NeweggHandler,
"hifiklubben": HifiKlubbenHandler,
"shein": SheinHandler,
}
1 change: 1 addition & 0 deletions tests/test_add_product.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
("https://sharkgaming.dk/", does_not_raise()),
("https://www.newegg.com/", does_not_raise()),
("https://www.hifiklubben.dk/", does_not_raise()),
("https://us.shein.com/", does_not_raise()),
("https://www.notsupported.com/", pytest.raises(WebsiteNotSupported)),
]

Expand Down
6 changes: 6 additions & 0 deletions tests/test_objects.json
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,12 @@
"expected_title": "SENNHEISER MOMENTUM 4 WIRELESS",
"expected_id": "senmomentum4bk",
"expected_currency": "DKK"
},
"shein": {
"link": "https://euqs.shein.com/Men-s-Letter-Print-Slim-Fit-Short-Sleeve-T-Shirt-p-28492178.html",
"expected_title": "Men's Letter Print Slim Fit Short Sleeve T-Shirt",
"expected_id": "sm2311284334246374",
"expected_currency": "EUR"
}
}
}
32 changes: 32 additions & 0 deletions tests/test_website_handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
ProshopHandler,
SharkGamingHandler,
HifiKlubbenHandler,
SheinHandler,
)
from scraper.models import Info

Expand Down Expand Up @@ -47,6 +48,7 @@ def read_json(filename: str) -> dict:
sharkgaming_test = test_website_handlers_json["sharkgaming"]
newegg_test = test_website_handlers_json["newegg"]
hifiklubben_test = test_website_handlers_json["hifiklubben"]
shein_test = test_website_handlers_json["shein"]


class BaseTestWebsiteHandler(ABC):
Expand Down Expand Up @@ -560,3 +562,33 @@ def test_get_id(self) -> None:
id = self.test_handler._get_product_id()
assert isinstance(id, str)
assert id == hifiklubben_test["expected_id"]


class TestSheinHandler(BaseTestWebsiteHandler):
    """Tests for ``SheinHandler``, checked against the values in ``shein_test``."""

    test_handler = SheinHandler(shein_test["link"])

    def test_get_product_info(self, mocker) -> None:
        """``get_product_info`` returns a valid ``Info`` when the request is patched."""
        # Patch the request method so it returns the value currently held in
        # ``self.test_handler.request_data``.
        mocker.patch(
            "scraper.domains.BaseWebsiteHandler._request_product_data",
            return_value=self.test_handler.request_data,
        )
        product_info = self.test_handler.get_product_info()
        assert isinstance(product_info, Info)
        assert product_info.valid

    def test_get_name(self) -> None:
        """The product name matches the expected title, compared case-insensitively."""
        name_lowered = self.test_handler._get_product_name().lower()
        expected_lowered = shein_test["expected_title"].lower()
        assert isinstance(name_lowered, str)
        assert name_lowered == expected_lowered

    def test_get_price(self) -> None:
        """The product price is a float; its value is not checked."""
        product_price = self.test_handler._get_product_price()
        assert isinstance(product_price, float)

    def test_get_currency(self) -> None:
        """The product currency is a string equal to the expected currency."""
        product_currency = self.test_handler._get_product_currency()
        assert isinstance(product_currency, str)
        assert product_currency == shein_test["expected_currency"]

    def test_get_id(self) -> None:
        """The product id is a string equal to the expected id."""
        product_id = self.test_handler._get_product_id()
        assert isinstance(product_id, str)
        assert product_id == shein_test["expected_id"]

0 comments on commit 611fff2

Please sign in to comment.