Skip to content

Commit

Permalink
Merge pull request #186 from Crinibus/update-website-handlers-how-to-get-product-ids-and-short-urls
Browse files Browse the repository at this point in the history

Update website handlers how to get product ids and short urls
  • Loading branch information
Crinibus authored Oct 11, 2022
2 parents 8a1be08 + a87aad5 commit 2e24255
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 68 deletions.
88 changes: 36 additions & 52 deletions scraper/domains.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,12 +83,11 @@ def _get_product_currency(self) -> str:
return currency

def _get_product_id(self) -> str:
return self.request_data.find("span", itemprop="sku").text
return self.url.split("/")[4]

def get_short_url(self) -> str:
if not self.info:
return None
return f"https://www.komplett.dk/product/{self.info.id}"
id = self._get_product_id()
return f"https://www.komplett.dk/product/{id}"


class ProshopHandler(BaseWebsiteHandler):
Expand All @@ -97,7 +96,7 @@ def _get_common_data(self):
self.soup_script_tag_json = json.loads(soup_script_tag)

def _get_product_name(self) -> str:
return self.request_data.find("div", class_="col-xs-12 col-sm-7").h1.text
return self.soup_script_tag_json["name"]

def _get_product_price(self) -> float:
try:
Expand Down Expand Up @@ -126,13 +125,11 @@ def _get_product_currency(self) -> str:
return currency

def _get_product_id(self) -> str:
id = self.soup_script_tag_json.get("sku")
return id
return self.url.split("/")[-1]

def get_short_url(self) -> str:
if not self.info:
return None
return f"https://www.proshop.dk/{self.info.id}"
id = self._get_product_id()
return f"https://www.proshop.dk/{id}"


class ComputerSalgHandler(BaseWebsiteHandler):
Expand All @@ -146,12 +143,11 @@ def _get_product_currency(self) -> str:
return self.request_data.find("span", itemprop="priceCurrency").get("content")

def _get_product_id(self) -> str:
return self.request_data.find("h2", class_="productIdentifierHeadline").span.text
return self.url.split("/")[4]

def get_short_url(self) -> str:
if not self.info:
return None
return f"https://www.computersalg.dk/i/{self.info.id}"
id = self._get_product_id()
return f"https://www.computersalg.dk/i/{id}"


class ElgigantenHandler(BaseWebsiteHandler):
Expand All @@ -171,16 +167,15 @@ def _get_product_id(self) -> str:
return self.url.split("/")[-1]

def _get_json_api_data(self) -> dict:
id_number = self.url.split("/")[-1]
id_number = self._get_product_id()
# API link to get price and currency
api_link = f"https://www.elgiganten.dk/cxorchestrator/dk/api?appMode=b2c&user=anonymous&operationName=getProductWithDynamicDetails&variables=%7B%22articleNumber%22%3A%22{id_number}%22%2C%22withCustomerSpecificPrices%22%3Afalse%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%229bfbc062032a2a6b924883b81508af5c77bbfc5f66cc41c7ffd7d519885ac5e4%22%7D%7D"
response = request_url(api_link)
return response.json()

def get_short_url(self) -> str:
if not self.info:
return None
return f"https://www.elgiganten.dk/product/{self.info.id}"
id = self._get_product_id()
return f"https://www.elgiganten.dk/product/{id}"


class AvXpertenHandler(BaseWebsiteHandler):
Expand Down Expand Up @@ -291,26 +286,21 @@ def _get_product_currency(self) -> str:
return currency

def _get_product_id(self) -> str:
if self.soup_url.split("/")[3] == "itm":
id = self.request_data.find("div", id="descItemNumber").text
else:
id = self.request_data.find("div", class_="item-details").a.get("data-itemid")

return id
return self.url.split("/")[4].split("?")[0]

def get_short_url(self) -> str:
if self.url.split("/")[3] != "itm":
return self.url.split("?")[0]
id = self._get_product_id()

if not self.info:
return None
return f"https://www.ebay.com/itm/{self.info.id}"
if self.url.split("/")[3] == "itm":
return f"https://www.ebay.com/itm/{id}"
else:
return f"https://www.ebay.com/p/{id}"


class PowerHandler(BaseWebsiteHandler):
def _get_common_data(self) -> None:
self.id = self.url.split("/")[-2].strip("p-")
self.api_json = request_url(f"https://www.power.dk/api/v2/products?ids={self.id}").json()
id = self._get_product_id()
self.api_json = request_url(f"https://www.power.dk/api/v2/products?ids={id}").json()

def _get_product_name(self) -> str:
return self.api_json[0].get("title")
Expand All @@ -322,19 +312,18 @@ def _get_product_currency(self) -> str:
return "DKK"

def _get_product_id(self) -> str:
return self.id
return self.url.split("/")[-2].strip("p-")

def get_short_url(self) -> str:
if not self.info:
return None
id = self._get_product_id()
url_id = self.url.split("/")[3]
return f"https://www.power.dk/{url_id}/p-{self.info.id}"
return f"https://www.power.dk/{url_id}/p-{id}"


class ExpertHandler(BaseWebsiteHandler):
def _get_common_data(self) -> None:
self.id = self.url.split("/")[-2].strip("p-")
self.api_json = request_url(f"https://www.expert.dk/api/v2/products?ids={self.id}").json()
id = self._get_product_id()
self.api_json = request_url(f"https://www.expert.dk/api/v2/products?ids={id}").json()

def _get_product_name(self) -> str:
return self.api_json[0].get("title")
Expand All @@ -346,13 +335,12 @@ def _get_product_currency(self) -> str:
return "DKK"

def _get_product_id(self) -> str:
return self.id
return self.url.split("/")[-2].strip("p-")

def get_short_url(self) -> str:
if not self.info:
return None
id = self._get_product_id()
url_id = self.url.split("/")[3]
return f"https://www.expert.dk/{url_id}/p-{self.info.id}"
return f"https://www.expert.dk/{url_id}/p-{id}"


class MMVisionHandler(BaseWebsiteHandler):
Expand All @@ -367,12 +355,10 @@ def _get_product_price(self) -> float:
return float(self.request_data.find("h3", class_="product-price text-right").text.strip("fra ").strip().strip(",-"))

def _get_product_currency(self) -> str:
currency = self.soup_script_tag_json.get("offers").get("priceCurrency")
return currency
return self.soup_script_tag_json.get("offers").get("priceCurrency")

def _get_product_id(self) -> str:
id = self.soup_script_tag_json.get("productID")
return id
return self.soup_script_tag_json.get("productID")

def get_short_url(self) -> str:
return self.url
Expand Down Expand Up @@ -428,12 +414,11 @@ def _get_product_currency(self) -> str:
return self.product_data.get("offers").get("priceCurrency")

def _get_product_id(self) -> str:
return self.product_data.get("sku")
return self.url.split("/")[5].split("?")[0]

def get_short_url(self) -> str:
if not self.info:
return None
return f"https://www.newegg.com/p/{self.info.id}"
id = self._get_product_id()
return f"https://www.newegg.com/p/{id}"


class HifiKlubbenHandler(BaseWebsiteHandler):
Expand All @@ -452,9 +437,8 @@ def _get_product_id(self) -> str:
return self.url.split("/")[4]

def get_short_url(self) -> str:
if not self.info:
return None
return f"https://www.hifiklubben.dk/{self.info.id}"
id = self._get_product_id()
return f"https://www.hifiklubben.dk/{id}"


def get_website_name(url: str) -> str:
Expand Down
35 changes: 20 additions & 15 deletions tests/test_add_product.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,30 @@
import pytest
from contextlib import nullcontext as does_not_raise

from scraper.filemanager import Filemanager
from scraper.add_product import add_product
from scraper.exceptions import WebsiteNotSupported

test_objects_json = Filemanager.read_json("./tests/test_objects.json")
test_json = test_objects_json["test_website_handlers"]

test_domains = [
("https://www.amazon.com/", does_not_raise()),
("https://www.ebay.com/", does_not_raise()),
("https://www.komplett.dk/", does_not_raise()),
("https://www.proshop.dk/", does_not_raise()),
("https://www.computersalg.dk/", does_not_raise()),
("https://www.elgiganten.dk/", does_not_raise()),
("https://www.avxperten.dk/", does_not_raise()),
("https://www.av-cables.dk/", does_not_raise()),
("https://www.power.dk/", does_not_raise()),
("https://www.expert.dk/", does_not_raise()),
("https://www.mm-vision.dk/", does_not_raise()),
("https://www.coolshop.dk/", does_not_raise()),
("https://www.sharkgaming.dk/", does_not_raise()),
("https://www.newegg.com/", does_not_raise()),
("https://www.hifiklubben.dk/", does_not_raise()),
(test_json["amazon"]["link"], does_not_raise()),
(test_json["ebay_with_itm"]["link"], does_not_raise()),
(test_json["ebay_with_p"]["link"], does_not_raise()),
(test_json["komplett"]["link"], does_not_raise()),
(test_json["proshop"]["link"], does_not_raise()),
(test_json["computersalg"]["link"], does_not_raise()),
(test_json["elgiganten"]["link"], does_not_raise()),
(test_json["avxperten"]["link"], does_not_raise()),
(test_json["av-cables"]["link"], does_not_raise()),
(test_json["power"]["link"], does_not_raise()),
(test_json["expert"]["link"], does_not_raise()),
(test_json["mm-vision"]["link"], does_not_raise()),
(test_json["coolshop"]["link"], does_not_raise()),
(test_json["sharkgaming"]["link"], does_not_raise()),
(test_json["newegg"]["link"], does_not_raise()),
(test_json["hifiklubben"]["link"], does_not_raise()),
("https://www.notsupported.com/", pytest.raises(WebsiteNotSupported)),
]

Expand Down
2 changes: 1 addition & 1 deletion tests/test_objects.json
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
"ebay_with_p": {
"link": "https://www.ebay.com/p/1248083754?iid=181677611772&rt=nc",
"expected_title": "Etude House Collagen Eye Patch Korea Cosmetics 10 Sheets",
"expected_id": "181677611772",
"expected_id": "1248083754",
"expected_currency": "USD"
},
"expert": {
Expand Down

0 comments on commit 2e24255

Please sign in to comment.