Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extra debug logs displaying requests duration #1781

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 77 additions & 0 deletions src/calibre/gui2/store/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,86 @@
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'

from contextlib import closing
from time import perf_counter

from lxml import html

from calibre import browser as create_browser, prints
from calibre.constants import DEBUG
from calibre.scraper.simple import read_url
from calibre.utils.filenames import ascii_filename


def browser_get_url(url, timeout, browser=None, user_agent=None, headers=None, data=None, novisit=False, html_parser=None, save_html_to=None):
    """
    Retrieve the content at the given HTTP URL and return it parsed.
    In DEBUG mode, also print how long the fetch + parse took.
    Uses mechanize.Browser

    :param url: a URL string.

    :param timeout: a numerical timeout in seconds for the HTTP request.

    :param browser: an optional existing mechanize.Browser instance.
                    If not provided, a new one will be created.

    :param user_agent: optional User-Agent to use if no "browser" parameter is provided.

    :param headers: optional list of HTTP headers to set on the request.
                    They are appended to ``browser.addheaders``, so a
                    caller-supplied browser keeps them after this call.

    :param data: optional payload forwarded as the ``data`` argument of the
                 browser's open call (presumably the request body, as in
                 mechanize -- confirm against callers).

    :param novisit: optional boolean indicating to use mechanize "novisit" method
                    when fetching web pages.

    :param html_parser: an optional function to parse the raw page content.
                        By default: lxml.html.fromstring

    :param save_html_to: an optional file path where to save the web page content
                         exactly as it was read (opened in binary mode).

    :return: whatever ``html_parser`` returns for the fetched content;
             by default a parsed HTML element/document
    """
    start_time = perf_counter()
    if browser is None:
        browser = create_browser(user_agent=user_agent)
    if headers:
        browser.addheaders.extend(headers)
    browser_open = browser.open_novisit if novisit else browser.open
    with closing(browser_open(url, data=data, timeout=timeout)) as web_page:
        html_content = web_page.read()
    if save_html_to:
        with open(save_html_to, 'wb') as html_file:
            # Write the bytes that were just read; the previous code referenced
            # an undefined name here and raised NameError whenever a path was given.
            html_file.write(html_content)
    if not html_parser:
        html_parser = html.fromstring
    html_parsed = html_parser(html_content)
    if DEBUG:
        # The measured duration covers fetch, optional save and parsing.
        duration = perf_counter() - start_time
        prints(f'browser_get_url took {duration:.2f}s for URL {url}')
    return html_parsed


def http_get_url(storage, url, timeout):
    """
    Fetch the page at ``url`` through ``read_url`` and return its content.
    When DEBUG is on, the elapsed time of the fetch is printed as well.
    Uses qt.webengine and hence the chromium network stack.

    :param storage: passed unchanged as the first argument of ``read_url``
                    (presumably the scraper's storage object -- confirm
                    against calibre.scraper.simple).

    :param url: a URL string.

    :param timeout: a numerical timeout in seconds for the HTTP request.

    :return: the HTML content as a string
    """
    began = perf_counter()
    page = read_url(storage, url, timeout)
    if not DEBUG:
        return page
    # Only reached in DEBUG mode: report how long the fetch took.
    duration = perf_counter() - began
    prints(f"http_get_url took {duration:.2f}s for URL {url}")
    return page


class StorePlugin: # {{{

'''
Expand Down
3 changes: 1 addition & 2 deletions src/calibre/gui2/store/amazon_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
# vim:fileencoding=utf-8
# License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>

from qt.core import QUrl
from threading import Lock
from time import monotonic

Expand Down Expand Up @@ -45,7 +44,7 @@ class AmazonStore:

def open(self, parent=None, detail_item=None, external=False):
store_link = get_method('get_store_link_amazon')(self, detail_item)
open_url(QUrl(store_link))
open_url(store_link)

def search(self, query, max_results=10, timeout=60):
for result in get_method('search_amazon')(self, query, max_results=max_results, timeout=timeout):
Expand Down
6 changes: 3 additions & 3 deletions src/calibre/gui2/store/amazon_live.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from lxml import etree, html
from urllib.parse import urlencode

from calibre.scraper.simple import read_url
from calibre.gui2.store import http_get_url
from calibre.gui2.store.search_result import SearchResult


Expand All @@ -26,7 +26,7 @@ def asbytes(x):
url = self.SEARCH_BASE_URL + '?' + urlencode(uquery)

counter = max_results
raw = read_url(self.scraper_storage, url, timeout=timeout)
raw = http_get_url(self.scraper_storage, url, timeout=timeout)
if write_html_to is not None:
with open(write_html_to, 'w') as f:
f.write(raw)
Expand Down Expand Up @@ -85,7 +85,7 @@ def parse_details_amazon(self, idata, search_result):

def get_details_amazon(self, search_result, timeout):
url = self.DETAILS_URL + search_result.detail_item
raw = read_url(self.scraper_storage, url, timeout=timeout)
raw = http_get_url(self.scraper_storage, url, timeout=timeout)
idata = html.fromstring(raw)
return parse_details_amazon(self, idata, search_result)

Expand Down
4 changes: 1 addition & 3 deletions src/calibre/gui2/store/opensearch_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@

from contextlib import closing

from qt.core import QUrl

from calibre import (browser, guess_extension)
from calibre.gui2 import open_url
from calibre.utils.xml_parse import safe_xml_fromstring
Expand Down Expand Up @@ -88,7 +86,7 @@ def open(self, parent=None, detail_item=None, external=False):
return

if external or self.config.get('open_external', False):
open_url(QUrl(detail_item if detail_item else self.web_url))
open_url(detail_item if detail_item else self.web_url)
else:
d = WebStoreDialog(self.gui, self.web_url, parent, detail_item, create_browser=self.create_browser)
d.setWindowTitle(self.name)
Expand Down
145 changes: 65 additions & 80 deletions src/calibre/gui2/store/stores/amazon_de_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,13 @@

store_version = 15 # Needed for dynamic plugin loading

from contextlib import closing
try:
from urllib.parse import urlencode
except ImportError:
from urllib import urlencode

from lxml import html

from qt.core import QUrl

from calibre import browser
from calibre.gui2 import open_url
from calibre.gui2.store import StorePlugin
from calibre.gui2.store import browser_get_url, StorePlugin
from calibre.gui2.store.search_result import SearchResult

SEARCH_BASE_URL = 'https://www.amazon.de/s/'
Expand Down Expand Up @@ -49,102 +43,93 @@ def asbytes(x):
return x
uquery = {asbytes(k):asbytes(v) for k, v in uquery.items()}
url = base_url + '?' + urlencode(uquery)
br = browser(user_agent=get_user_agent())

doc = browser_get_url(url, timeout, user_agent=get_user_agent(), save_html_to=write_html_to)

try:
results = doc.xpath('//div[@id="atfResults" and @class]')[0]
except IndexError:
return

if 's-result-list-parent-container' in results.get('class', ''):
data_xpath = "descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-result-item ')]"
format_xpath = './/a[contains(text(), "%s")]//text()' % KINDLE_EDITION
asin_xpath = '@data-asin'
cover_xpath = "descendant-or-self::img[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-image ')]/@src"
title_xpath = "descendant-or-self::h2[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-title ')]//text()"
author_xpath = './/span[starts-with(text(), "%s ")]/following-sibling::span//text()' % BY
price_xpath = ('descendant::div[@class="a-row a-spacing-none" and'
' not(span[contains(@class, "kindle-unlimited")])]//span[contains(@class, "s-price")]//text()')
else:
return

counter = max_results
with closing(br.open(url, timeout=timeout)) as f:
raw = f.read()
if write_html_to is not None:
with open(write_html_to, 'wb') as f:
f.write(raw)
doc = html.fromstring(raw)
try:
results = doc.xpath('//div[@id="atfResults" and @class]')[0]
except IndexError:
return

if 's-result-list-parent-container' in results.get('class', ''):
data_xpath = "descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-result-item ')]"
format_xpath = './/a[contains(text(), "%s")]//text()' % KINDLE_EDITION
asin_xpath = '@data-asin'
cover_xpath = "descendant-or-self::img[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-image ')]/@src"
title_xpath = "descendant-or-self::h2[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-title ')]//text()"
author_xpath = './/span[starts-with(text(), "%s ")]/following-sibling::span//text()' % BY
price_xpath = ('descendant::div[@class="a-row a-spacing-none" and'
' not(span[contains(@class, "kindle-unlimited")])]//span[contains(@class, "s-price")]//text()')
for data in doc.xpath(data_xpath):
if counter <= 0:
break

# Even though we are searching digital-text only Amazon will still
# put in results for non Kindle books (author pages). So we need
# to explicitly check if the item is a Kindle book and ignore it
# if it isn't.
format = ''.join(data.xpath(format_xpath))
if 'kindle' not in format.lower():
continue

# We must have an asin otherwise we can't easily reference the
# book later.
asin = data.xpath(asin_xpath)
if asin:
asin = asin[0]
else:
return

for data in doc.xpath(data_xpath):
if counter <= 0:
break

# Even though we are searching digital-text only Amazon will still
# put in results for non Kindle books (author pages). So we need
# to explicitly check if the item is a Kindle book and ignore it
# if it isn't.
format = ''.join(data.xpath(format_xpath))
if 'kindle' not in format.lower():
continue

# We must have an asin otherwise we can't easily reference the
# book later.
asin = data.xpath(asin_xpath)
if asin:
asin = asin[0]
else:
continue
continue

cover_url = ''.join(data.xpath(cover_xpath))
cover_url = ''.join(data.xpath(cover_xpath))

title = ''.join(data.xpath(title_xpath))
author = ''.join(data.xpath(author_xpath))
try:
author = author.split('by ', 1)[1].split(" (")[0]
except:
pass
title = ''.join(data.xpath(title_xpath))
author = ''.join(data.xpath(author_xpath))
try:
author = author.split('by ', 1)[1].split(" (")[0]
except:
pass

price = ''.join(data.xpath(price_xpath))
price = ''.join(data.xpath(price_xpath))

counter -= 1
counter -= 1

s = SearchResult()
s.cover_url = cover_url.strip()
s.title = title.strip()
s.author = author.strip()
s.price = price.strip()
s.detail_item = asin.strip()
s.formats = 'Kindle'
s = SearchResult()
s.cover_url = cover_url.strip()
s.title = title.strip()
s.author = author.strip()
s.price = price.strip()
s.detail_item = asin.strip()
s.formats = 'Kindle'

yield s
yield s


class AmazonKindleStore(StorePlugin):

def open(self, parent=None, detail_item=None, external=False):
store_link = (DETAILS_URL + detail_item) if detail_item else STORE_LINK
open_url(QUrl(store_link))
open_url(store_link)

def search(self, query, max_results=10, timeout=60):
for result in search_amazon(query, max_results=max_results, timeout=timeout):
yield result

def get_details(self, search_result, timeout):
url = DETAILS_URL

br = browser(user_agent=get_user_agent())
with closing(br.open(url + search_result.detail_item, timeout=timeout)) as nf:
idata = html.fromstring(nf.read())
if idata.xpath('boolean(//div[@class="content"]//li/b[contains(text(), "' +
idata = browser_get_url(DETAILS_URL + search_result.detail_item, timeout, user_agent=get_user_agent())
if idata.xpath('boolean(//div[@class="content"]//li/b[contains(text(), "' +
DRM_SEARCH_TEXT + '")])'):
if idata.xpath('boolean(//div[@class="content"]//li[contains(., "' +
DRM_FREE_TEXT + '") and contains(b, "' +
DRM_SEARCH_TEXT + '")])'):
if idata.xpath('boolean(//div[@class="content"]//li[contains(., "' +
DRM_FREE_TEXT + '") and contains(b, "' +
DRM_SEARCH_TEXT + '")])'):
search_result.drm = SearchResult.DRM_UNLOCKED
else:
search_result.drm = SearchResult.DRM_UNKNOWN
search_result.drm = SearchResult.DRM_UNLOCKED
else:
search_result.drm = SearchResult.DRM_LOCKED
search_result.drm = SearchResult.DRM_UNKNOWN
else:
search_result.drm = SearchResult.DRM_LOCKED
return True


Expand Down
Loading