Merge pull request #402 from algolia/feat/enhance_pkgdown_integration

Feat/enhance pkgdown integration
algolia · Oct 15, 2018 · 7e890f2 · 7e890f2
2 parents 76913d6 + 8813de6
commit 7e890f2
Show file tree

Hide file tree

Showing 2 changed files with 23 additions and 3 deletions.
diff --git a/deployer/src/config_creator.py b/deployer/src/config_creator.py
@@ -3,12 +3,32 @@
 import re
 from . import helpers
 from . import helpdesk_helper
+from urlparse import urlparse
 
 
 def extract_root_from_input(input_string):
  # We can't parse the URL since the user might not have entered a proper link
+
+ if input_string.endswith('/'): # We assume that the string is already the proper root
+ return input_string
+
  domain = re.match(".+?([^\/]\/(?!\/))", input_string) # extracting substring before the first isolated / (not //)
- return domain.group() if domain else input_string
+ try:
+ url_parsed = urlparse(input_string);
+ # Removing unused parameters
+ url_parsed._replace(params='', query='', fragment='')
+ path_splited = url_parsed.path.split('/')
+
+ # Path is redirecting to a page
+ if ('html' in path_splited[-1]):
+ url_parsed = url_parsed._replace(path='/'.join(path_splited[: -1]))
+ # We are fine
+ else:
+ pass
+
+ return url_parsed.geturl()
+ except ValueError:
+ return domain.group() if domain else input_string
 
 
 def to_docusaurus_config(config, urls=None):
@@ -61,7 +81,8 @@ def to_pkgdown_config(config, urls=None):
  config["selectors"]["text"] = ".contents p, .contents li, .usage, .template-article .contents .pre"
  config["selectors_exclude"] = [".dont-index"]
  config["custom_settings"] = {"separatorsToIndex": "_"}
- # config["stop_urls"] = [start_url + "index.html", "LICENSE-text.html"]
+ config["scrap_start_urls"] = False
+ config["stop_urls"] = ["LICENSE-text.html"]
  return config
 
 

diff --git a/scraper/src/custom_downloader_middleware.py b/scraper/src/custom_downloader_middleware.py
@@ -5,7 +5,6 @@
 import time
 
 from scrapy.http import Request, HtmlResponse
-from scrapy.exceptions import IgnoreRequest
 
 try:
  from urlparse import urlparse