Skip to content

Commit

Permalink
Merge pull request #402 from algolia/feat/enhance_pkgdown_integration
Browse files Browse the repository at this point in the history
Feat/enhance pkgdown integration
  • Loading branch information
Sylvain Pace committed Oct 15, 2018
2 parents 76913d6 + 8813de6 commit 7e890f2
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 3 deletions.
25 changes: 23 additions & 2 deletions deployer/src/config_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,32 @@
import re
from . import helpers
from . import helpdesk_helper
from urlparse import urlparse


def extract_root_from_input(input_string):
    """Return the root URL derived from a user-supplied string.

    The input may be a well-formed URL, a URL pointing at a specific page,
    or something that is not a proper link at all.

    * A string ending with ``/`` is assumed to already be the root and is
      returned unchanged.
    * Otherwise the string is parsed as a URL; its params, query and
      fragment are dropped, and when the last path segment contains
      ``html`` (the path points at a page) that segment is removed.
    * If the string cannot be parsed (``urlparse`` raises ``ValueError``),
      the substring up to and including the first isolated ``/`` (one not
      followed by another ``/``) is returned, or the input itself when no
      such slash exists.

    :param input_string: URL, or URL-like text, entered by the user.
    :return: The root URL as a string.
    """
    # We assume that the string is already the proper root.
    if input_string.endswith('/'):
        return input_string

    try:
        url_parsed = urlparse(input_string)
    except ValueError:
        # The user might not have entered a proper link: fall back to the
        # substring before the first isolated / (not //).
        domain = re.match(r".+?([^\/]\/(?!\/))", input_string)
        return domain.group() if domain else input_string

    # Remove unused components. _replace returns a new tuple, so the result
    # has to be rebound or the query/fragment would silently survive.
    url_parsed = url_parsed._replace(params='', query='', fragment='')
    path_splited = url_parsed.path.split('/')

    # The path is pointing at a page: keep only its parent directory.
    if 'html' in path_splited[-1]:
        url_parsed = url_parsed._replace(path='/'.join(path_splited[:-1]))

    return url_parsed.geturl()


def to_docusaurus_config(config, urls=None):
Expand Down Expand Up @@ -61,7 +81,8 @@ def to_pkgdown_config(config, urls=None):
config["selectors"]["text"] = ".contents p, .contents li, .usage, .template-article .contents .pre"
config["selectors_exclude"] = [".dont-index"]
config["custom_settings"] = {"separatorsToIndex": "_"}
# config["stop_urls"] = [start_url + "index.html", "LICENSE-text.html"]
config["scrap_start_urls"] = False
config["stop_urls"] = ["LICENSE-text.html"]
return config


Expand Down
1 change: 0 additions & 1 deletion scraper/src/custom_downloader_middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import time

from scrapy.http import Request, HtmlResponse
from scrapy.exceptions import IgnoreRequest

try:
from urlparse import urlparse
Expand Down

0 comments on commit 7e890f2

Please sign in to comment.