Skip to content

Commit

Permalink
Applying suggestion from 1st review
Browse files Browse the repository at this point in the history
  • Loading branch information
Cimon Lucas (LCM) committed Nov 2, 2022
1 parent 7633531 commit 655d3e5
Show file tree
Hide file tree
Showing 41 changed files with 1,285 additions and 1,524 deletions.
77 changes: 77 additions & 0 deletions src/calibre/gui2/store/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,86 @@
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'

from contextlib import closing
from time import perf_counter

from lxml import html

from calibre import browser as create_browser, prints
from calibre.constants import DEBUG
from calibre.scraper.simple import read_url
from calibre.utils.filenames import ascii_filename


def browser_get_url(url, timeout, browser=None, user_agent=None, headers=None, data=None, novisit=False, html_parser=None, save_html_to=None):
    """
    Retrieve the content at the given HTTP URL,
    and measure the time it takes to do so in DEBUG mode.
    Uses mechanize.Browser
    :param url: a URL string.
    :param timeout: a numerical timeout in seconds for the HTTP request.
    :param browser: an optional existing mechanize.Browser instance.
        If not provided, a new one will be created.
    :param user_agent: optional User-Agent to use if no "browser" parameter is provided.
    :param headers: optional list of HTTP headers to set on the request
    :param data: optional query parameters
    :param novisit: optional boolean indicating to use mechanize "novisit" method
        when fetching web pages.
    :param html_parser: an optional function to parse the HTML string.
        By default: lxml.html.fromstring
    :param save_html_to: an optional file path where to save the web page content.
    :return: a parsed HTML element/document
    """
    start_time = perf_counter()
    if browser is None:
        browser = create_browser(user_agent=user_agent)
    if headers:
        browser.addheaders.extend(headers)
    browser_open = browser.open_novisit if novisit else browser.open
    # closing() guarantees the mechanize response is released even if read() raises
    with closing(browser_open(url, data=data, timeout=timeout)) as web_page:
        html_content = web_page.read()
    if save_html_to:
        with open(save_html_to, 'wb') as html_file:
            # BUGFIX: was `raw_content`, an undefined name, which raised a
            # NameError whenever save_html_to was provided.
            html_file.write(html_content)
    if not html_parser:
        html_parser = html.fromstring
    html_parsed = html_parser(html_content)
    if DEBUG:
        duration = perf_counter() - start_time
        prints(f'browser_get_url took {duration:.2f}s for URL {url}')
    return html_parsed


def http_get_url(storage, url, timeout):
    """
    Fetch the content at the given HTTP URL, reporting the elapsed
    time when calibre runs in DEBUG mode.
    Uses qt.webengine and hence the chromium network stack.
    :param storage: scraper storage handle, forwarded to read_url.
    :param url: a URL string.
    :param timeout: a numerical timeout in seconds for the HTTP request.
    :return: the HTML content as a string
    """
    started_at = perf_counter()
    content = read_url(storage, url, timeout)
    if DEBUG:
        duration = perf_counter() - started_at
        prints(f"http_get_url took {duration:.2f}s for URL {url}")
    return content


class StorePlugin: # {{{

'''
Expand Down
3 changes: 1 addition & 2 deletions src/calibre/gui2/store/amazon_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
# vim:fileencoding=utf-8
# License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>

from qt.core import QUrl
from threading import Lock
from time import monotonic

Expand Down Expand Up @@ -45,7 +44,7 @@ class AmazonStore:

def open(self, parent=None, detail_item=None, external=False):
store_link = get_method('get_store_link_amazon')(self, detail_item)
open_url(QUrl(store_link))
open_url(store_link)

def search(self, query, max_results=10, timeout=60):
for result in get_method('search_amazon')(self, query, max_results=max_results, timeout=timeout):
Expand Down
6 changes: 3 additions & 3 deletions src/calibre/gui2/store/amazon_live.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from lxml import etree, html
from urllib.parse import urlencode

from calibre.scraper.simple import read_url
from calibre.gui2.store import http_get_url
from calibre.gui2.store.search_result import SearchResult


Expand All @@ -26,7 +26,7 @@ def asbytes(x):
url = self.SEARCH_BASE_URL + '?' + urlencode(uquery)

counter = max_results
raw = read_url(self.scraper_storage, url, timeout=timeout)
raw = http_get_url(self.scraper_storage, url, timeout=timeout)
if write_html_to is not None:
with open(write_html_to, 'w') as f:
f.write(raw)
Expand Down Expand Up @@ -85,7 +85,7 @@ def parse_details_amazon(self, idata, search_result):

def get_details_amazon(self, search_result, timeout):
url = self.DETAILS_URL + search_result.detail_item
raw = read_url(self.scraper_storage, url, timeout=timeout)
raw = http_get_url(self.scraper_storage, url, timeout=timeout)
idata = html.fromstring(raw)
return parse_details_amazon(self, idata, search_result)

Expand Down
4 changes: 1 addition & 3 deletions src/calibre/gui2/store/opensearch_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@

from contextlib import closing

from qt.core import QUrl

from calibre import (browser, guess_extension)
from calibre.gui2 import open_url
from calibre.utils.xml_parse import safe_xml_fromstring
Expand Down Expand Up @@ -88,7 +86,7 @@ def open(self, parent=None, detail_item=None, external=False):
return

if external or self.config.get('open_external', False):
open_url(QUrl(detail_item if detail_item else self.web_url))
open_url(detail_item if detail_item else self.web_url)
else:
d = WebStoreDialog(self.gui, self.web_url, parent, detail_item, create_browser=self.create_browser)
d.setWindowTitle(self.name)
Expand Down
145 changes: 65 additions & 80 deletions src/calibre/gui2/store/stores/amazon_de_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,13 @@

store_version = 15 # Needed for dynamic plugin loading

from contextlib import closing
try:
from urllib.parse import urlencode
except ImportError:
from urllib import urlencode

from lxml import html

from qt.core import QUrl

from calibre import browser
from calibre.gui2 import open_url
from calibre.gui2.store import StorePlugin
from calibre.gui2.store import browser_get_url, StorePlugin
from calibre.gui2.store.search_result import SearchResult

SEARCH_BASE_URL = 'https://www.amazon.de/s/'
Expand Down Expand Up @@ -49,102 +43,93 @@ def asbytes(x):
return x
uquery = {asbytes(k):asbytes(v) for k, v in uquery.items()}
url = base_url + '?' + urlencode(uquery)
br = browser(user_agent=get_user_agent())

doc = browser_get_url(url, timeout, user_agent=get_user_agent(), save_html_to=write_html_to)

try:
results = doc.xpath('//div[@id="atfResults" and @class]')[0]
except IndexError:
return

if 's-result-list-parent-container' in results.get('class', ''):
data_xpath = "descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-result-item ')]"
format_xpath = './/a[contains(text(), "%s")]//text()' % KINDLE_EDITION
asin_xpath = '@data-asin'
cover_xpath = "descendant-or-self::img[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-image ')]/@src"
title_xpath = "descendant-or-self::h2[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-title ')]//text()"
author_xpath = './/span[starts-with(text(), "%s ")]/following-sibling::span//text()' % BY
price_xpath = ('descendant::div[@class="a-row a-spacing-none" and'
' not(span[contains(@class, "kindle-unlimited")])]//span[contains(@class, "s-price")]//text()')
else:
return

counter = max_results
with closing(br.open(url, timeout=timeout)) as f:
raw = f.read()
if write_html_to is not None:
with open(write_html_to, 'wb') as f:
f.write(raw)
doc = html.fromstring(raw)
try:
results = doc.xpath('//div[@id="atfResults" and @class]')[0]
except IndexError:
return

if 's-result-list-parent-container' in results.get('class', ''):
data_xpath = "descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-result-item ')]"
format_xpath = './/a[contains(text(), "%s")]//text()' % KINDLE_EDITION
asin_xpath = '@data-asin'
cover_xpath = "descendant-or-self::img[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-image ')]/@src"
title_xpath = "descendant-or-self::h2[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-title ')]//text()"
author_xpath = './/span[starts-with(text(), "%s ")]/following-sibling::span//text()' % BY
price_xpath = ('descendant::div[@class="a-row a-spacing-none" and'
' not(span[contains(@class, "kindle-unlimited")])]//span[contains(@class, "s-price")]//text()')
for data in doc.xpath(data_xpath):
if counter <= 0:
break

# Even though we are searching digital-text only Amazon will still
# put in results for non Kindle books (author pages). So we need
# to explicitly check if the item is a Kindle book and ignore it
# if it isn't.
format = ''.join(data.xpath(format_xpath))
if 'kindle' not in format.lower():
continue

# We must have an asin otherwise we can't easily reference the
# book later.
asin = data.xpath(asin_xpath)
if asin:
asin = asin[0]
else:
return

for data in doc.xpath(data_xpath):
if counter <= 0:
break

# Even though we are searching digital-text only Amazon will still
# put in results for non Kindle books (author pages). So we need
# to explicitly check if the item is a Kindle book and ignore it
# if it isn't.
format = ''.join(data.xpath(format_xpath))
if 'kindle' not in format.lower():
continue

# We must have an asin otherwise we can't easily reference the
# book later.
asin = data.xpath(asin_xpath)
if asin:
asin = asin[0]
else:
continue
continue

cover_url = ''.join(data.xpath(cover_xpath))
cover_url = ''.join(data.xpath(cover_xpath))

title = ''.join(data.xpath(title_xpath))
author = ''.join(data.xpath(author_xpath))
try:
author = author.split('by ', 1)[1].split(" (")[0]
except:
pass
title = ''.join(data.xpath(title_xpath))
author = ''.join(data.xpath(author_xpath))
try:
author = author.split('by ', 1)[1].split(" (")[0]
except:
pass

price = ''.join(data.xpath(price_xpath))
price = ''.join(data.xpath(price_xpath))

counter -= 1
counter -= 1

s = SearchResult()
s.cover_url = cover_url.strip()
s.title = title.strip()
s.author = author.strip()
s.price = price.strip()
s.detail_item = asin.strip()
s.formats = 'Kindle'
s = SearchResult()
s.cover_url = cover_url.strip()
s.title = title.strip()
s.author = author.strip()
s.price = price.strip()
s.detail_item = asin.strip()
s.formats = 'Kindle'

yield s
yield s


class AmazonKindleStore(StorePlugin):

def open(self, parent=None, detail_item=None, external=False):
store_link = (DETAILS_URL + detail_item) if detail_item else STORE_LINK
open_url(QUrl(store_link))
open_url(store_link)

def search(self, query, max_results=10, timeout=60):
for result in search_amazon(query, max_results=max_results, timeout=timeout):
yield result

def get_details(self, search_result, timeout):
url = DETAILS_URL

br = browser(user_agent=get_user_agent())
with closing(br.open(url + search_result.detail_item, timeout=timeout)) as nf:
idata = html.fromstring(nf.read())
if idata.xpath('boolean(//div[@class="content"]//li/b[contains(text(), "' +
idata = browser_get_url(DETAILS_URL + search_result.detail_item, timeout, user_agent=get_user_agent())
if idata.xpath('boolean(//div[@class="content"]//li/b[contains(text(), "' +
DRM_SEARCH_TEXT + '")])'):
if idata.xpath('boolean(//div[@class="content"]//li[contains(., "' +
DRM_FREE_TEXT + '") and contains(b, "' +
DRM_SEARCH_TEXT + '")])'):
if idata.xpath('boolean(//div[@class="content"]//li[contains(., "' +
DRM_FREE_TEXT + '") and contains(b, "' +
DRM_SEARCH_TEXT + '")])'):
search_result.drm = SearchResult.DRM_UNLOCKED
else:
search_result.drm = SearchResult.DRM_UNKNOWN
search_result.drm = SearchResult.DRM_UNLOCKED
else:
search_result.drm = SearchResult.DRM_LOCKED
search_result.drm = SearchResult.DRM_UNKNOWN
else:
search_result.drm = SearchResult.DRM_LOCKED
return True


Expand Down
Loading

0 comments on commit 655d3e5

Please sign in to comment.