Merge pull request #5979 from dojutsu-user/index-more-domain-data
Index more domain data into elasticsearch
ericholscher authored Aug 27, 2019
2 parents 9fb0b69 + 4beee77 commit 00ab116
Showing 17 changed files with 279 additions and 220 deletions.
7 changes: 3 additions & 4 deletions media/css/core.css
@@ -379,12 +379,11 @@ a.cta-btn:hover, a.cta-btn:active {

/* search */

.search {
border-bottom: solid 1px #bfbfbf;
margin-bottom: 24px;
}
.search { border-bottom: solid 1px #bfbfbf; margin-bottom: 24px; }
.search input[type=text] { float: left; margin-right: 10px; padding: 8px 10px; }
.search input[type=submit] { margin-top: 0; }
/* this is same as the css class ".highlighted" */
.search-result-item span { background-color: #ee9; padding: 0 1px; margin: 0 1px; border-radius: 3px; -moz-border-radius: 3px; -webkit-border-radius: 3px; }

.filter { margin-bottom: 1em; }
.filter dd { display: inline-block; margin-right: 0.75em; }
33 changes: 18 additions & 15 deletions readthedocs/core/static-src/core/js/doc-embed/search.js
@@ -78,8 +78,8 @@ function attach_elastic_search_query(data) {

// Creating the result from elements
var link = doc.link + DOCUMENTATION_OPTIONS.FILE_SUFFIX + "?highlight=" + $.urlencode(query);

var item = $('<a>', {'href': link});

item.html(title);
item.find('span').addClass('highlighted');
list_item.append(item);
@@ -88,7 +88,6 @@ function attach_elastic_search_query(data) {
if (doc.project !== project) {
var text = " (from project " + doc.project + ")";
var extra = $('<span>', {'text': text});

list_item.append(extra);
}

@@ -103,10 +102,12 @@
var content = "";

var domain = "";
var domain_subtitle = "";
var domain_role_name = "";
var domain_subtitle_link = "";
var domain_content = "";
var domain_name = "";
var domain_subtitle = "";
var domain_content = "";
var domain_docstrings = "";

var section_template = '' +
'<div>' +
@@ -136,7 +137,7 @@ function attach_elastic_search_query(data) {
section = inner_hits[j];
section_subtitle = section._source.title;
section_subtitle_link = link + "#" + section._source.id;
section_content = [section._source.content.substring(0, MAX_SUBSTRING_LIMIT) + " ..."];
section_content = [section._source.content.substr(0, MAX_SUBSTRING_LIMIT) + " ..."];

if (section.highlight) {
if (section.highlight["sections.title"]) {
@@ -171,27 +172,29 @@
if (inner_hits[j].type === "domains") {

domain = inner_hits[j];
domain_subtitle = domain._source.role_name;
domain_role_name = domain._source.role_name;
domain_subtitle_link = link + "#" + domain._source.anchor;
domain_content = "";
domain_name = domain._source.name;
domain_subtitle = "";
domain_content = "";
domain_docstrings = "";

if (
typeof domain._source.display_name === "string" &&
domain._source.display_name.length >= 1
) {
domain_subtitle = "(" + domain._source.role_name + ") " + domain._source.display_name;
if (domain._source.docstrings !== "") {
domain_docstrings = domain._source.docstrings.substr(0, MAX_SUBSTRING_LIMIT) + " ...";
}

if (domain.highlight) {
if (domain.highlight["domains.docstrings"]) {
domain_docstrings = "... " + xss(domain.highlight["domains.docstrings"][0]) + " ...";
}

if (domain.highlight["domains.name"]) {
// domain_content = type_display -- name
domain_name = xss(domain.highlight["domains.name"][0]);
}
}

// domain_content = type_display -- name -- in doc_display
domain_content = domain._source.type_display + " -- " + domain_name + " -- in " + domain._source.doc_display;
domain_subtitle = "[" + domain_role_name + "]: " + domain_name;
domain_content = domain_docstrings;

append_html_to_contents(
contents,
2 changes: 1 addition & 1 deletion readthedocs/core/static/core/js/readthedocs-doc-embed.js

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions readthedocs/projects/models.py
@@ -1305,6 +1305,7 @@ def get_processed_json(self):
'path': file_path,
'title': '',
'sections': [],
'domain_data': {},
}

@cached_property
2 changes: 1 addition & 1 deletion readthedocs/projects/static/projects/js/tools.js

Large diffs are not rendered by default.

10 changes: 4 additions & 6 deletions readthedocs/search/documents.py
@@ -88,17 +88,15 @@ class PageDocument(RTDDocTypeMixin, DocType):
'role_name': fields.KeywordField(),

# For linking to the URL
'doc_name': fields.KeywordField(),
'anchor': fields.KeywordField(),

# For showing in the search result
'type_display': fields.TextField(),
'doc_display': fields.TextField(),
'docstrings': fields.TextField(),

# Simple analyzer breaks on `.`,
# otherwise search results are too strict for this use case
'name': fields.TextField(analyzer='simple'),
'display_name': fields.TextField(analyzer='simple'),
}
)

@@ -122,12 +120,12 @@ def prepare_domains(self, html_file):
all_domains = [
{
'role_name': domain.role_name,
'doc_name': domain.doc_name,
'anchor': domain.anchor,
'type_display': domain.type_display,
'doc_display': domain.doc_display,
'docstrings': html_file.processed_json.get(
'domain_data', {}
).get(domain.anchor, ''),
'name': domain.name,
'display_name': domain.display_name if domain.display_name != '-' else '',
}
for domain in domains_qs
]
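
A rough sketch (not the project's code; values borrowed from the support.json fixture below) of the entry that `prepare_domains` now builds for each domain, with `docstrings` looked up in the page's processed JSON by the domain's anchor:

    processed_json = {
        'domain_data': {
            'celery.utils.deprecated.warn': 'Warn of (pending) deprecation',
        },
    }

    anchor = 'celery.utils.deprecated.warn'
    domain_entry = {
        'role_name': 'py:function',
        'anchor': anchor,  # for linking to the URL
        'type_display': 'function',  # for showing in the search result
        'docstrings': processed_json.get('domain_data', {}).get(anchor, ''),
        'name': anchor,  # for a Python function, name and anchor coincide
    }
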
26 changes: 15 additions & 11 deletions readthedocs/search/faceted_search.py
@@ -104,9 +104,8 @@ class PageSearchBase(RTDFacetedSearch):
_outer_fields = ['title^4']
_section_fields = ['sections.title^3', 'sections.content']
_domain_fields = [
'domains.type_display',
'domains.name^2',
'domains.display_name',
'domains.docstrings',
]
_common_highlight_options = {
'encoder': 'html',
@@ -134,8 +133,17 @@ def query(self, search, query):
"""Manipulates query to support nested query."""
search = search.highlight_options(**self._common_highlight_options)

all_queries = []

# match query for the title (of the page) field.
match_title_query = Match(title=query)
for operator in self.operators:
all_queries.append(
SimpleQueryString(
query=query,
fields=self.fields,
default_operator=operator
)
)

# nested query for search in sections
sections_nested_query = self.generate_nested_query(
@@ -162,21 +170,17 @@
'highlight': dict(
self._common_highlight_options,
fields={
'domains.type_display': {},
'domains.name': {},
'domains.display_name': {},
'domains.docstrings': {},
}
)
}
)

final_query = Bool(should=[
match_title_query,
sections_nested_query,
domains_nested_query,
])

all_queries.extend([sections_nested_query, domains_nested_query])
final_query = Bool(should=all_queries)
search = search.query(final_query)

return search

def generate_nested_query(self, query, path, fields, inner_hits):
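
For readers unfamiliar with elasticsearch-dsl, here is a minimal sketch of the query shape assembled above. The field lists and operators are assumptions taken from this diff, and the real `generate_nested_query` also attaches `inner_hits` highlighting, which is omitted here:

    from elasticsearch_dsl import Search
    from elasticsearch_dsl.query import Bool, Nested, SimpleQueryString

    query = 'deprecated'
    operators = ['and', 'or']  # stands in for self.operators

    # One SimpleQueryString per operator over the page-level fields.
    all_queries = [
        SimpleQueryString(query=query, fields=['title^4'], default_operator=op)
        for op in operators
    ]

    # Nested queries reach into the sections and domains sub-documents.
    all_queries.append(Nested(path='sections', query=SimpleQueryString(
        query=query, fields=['sections.title^3', 'sections.content'],
    )))
    all_queries.append(Nested(path='domains', query=SimpleQueryString(
        query=query, fields=['domains.name^2', 'domains.docstrings'],
    )))

    # A document matches if any single sub-query matches, so a strong
    # domain hit can surface a page whose title does not match at all.
    search = Search().query(Bool(should=all_queries))
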
94 changes: 81 additions & 13 deletions readthedocs/search/parse_json.py
@@ -12,8 +12,23 @@
log = logging.getLogger(__name__)


def generate_sections_from_pyquery(body):
def generate_sections_from_pyquery(body, fjson_storage_path):
"""Given a pyquery object, generate section dicts for each section."""

# Removing all <dl> tags to prevent duplicate indexing with Sphinx Domains.
try:
# remove all <dl> tags which contains <dt> tags having 'id' attribute
dt_tags = body('dt[id]')
dt_tags.parents('dl').remove()
except Exception:
log.exception('Error removing <dl> tags from file: %s', fjson_storage_path)

# remove toctree elements
try:
body('.toctree-wrapper').remove()
except Exception:
log.exception('Error removing toctree elements from file: %s', fjson_storage_path)

# Capture text inside h1 before the first h2
h1_section = body('.section > h1')
if h1_section:
@@ -27,7 +42,12 @@ def generate_sections_from_pyquery(body):
if 'section' in next_p[0].attrib['class']:
break

h1_content += parse_content(next_p.text())
text = parse_content(next_p.text(), remove_first_line=True)
if h1_content:
h1_content = f'{h1_content.rstrip(".")}. {text}'
else:
h1_content = text

next_p = next_p.next()
if h1_content:
yield {
@@ -45,7 +65,7 @@ def generate_sections_from_pyquery(body):
section_id = div.attr('id')

content = div.text()
content = parse_content(content)
content = parse_content(content, remove_first_line=True)

yield {
'id': section_id,
@@ -74,6 +94,7 @@ def process_file(fjson_storage_path):
sections = []
path = ''
title = ''
domain_data = {}

if 'current_page_name' in data:
path = data['current_page_name']
@@ -82,7 +103,8 @@

if data.get('body'):
body = PyQuery(data['body'])
sections.extend(generate_sections_from_pyquery(body))
sections.extend(generate_sections_from_pyquery(body.clone(), fjson_storage_path))
domain_data = generate_domains_data_from_pyquery(body.clone(), fjson_storage_path)
else:
log.info('Unable to index content for: %s', fjson_storage_path)

@@ -96,24 +118,70 @@
'path': path,
'title': title,
'sections': sections,
'domain_data': domain_data,
}


def parse_content(content):
"""
Removes the starting text and ¶.
It removes the starting text from the content
because it contains the title of that content,
which is redundant here.
"""
def parse_content(content, remove_first_line=False):
"""Removes new line characters and ¶."""
content = content.replace('¶', '').strip()

# removing the starting text of each
content = content.split('\n')
if len(content) > 1: # there were \n
if remove_first_line and len(content) > 1:
content = content[1:]

# converting newlines to ". "
content = '. '.join([text.strip().rstrip('.') for text in content])
return content
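
A quick illustration of `parse_content` on made-up input: the pilcrow is dropped, newline-separated segments are joined with '. ', and `remove_first_line` strips the leading section title:

    text = 'Usage Questions¶\nFor help, Stack Overflow is the place.\nTag questions with read-the-docs.'
    parse_content(text, remove_first_line=True)
    # -> 'For help, Stack Overflow is the place. Tag questions with read-the-docs'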


def _get_text_for_domain_data(desc_contents):
"""Returns the text from the PyQuery object ``desc_contents``."""
# remove the 'dl', 'dt' and 'dd' tags from it
# because all the 'dd' and 'dt' tags are inside 'dl'
# and all 'dl' tags are already captured.
desc_contents.remove('dl')
desc_contents.remove('dt')
desc_contents.remove('dd')

# remove multiple spaces, new line characters and '¶' symbol.
docstrings = parse_content(desc_contents.text())
return docstrings


def generate_domains_data_from_pyquery(body, fjson_storage_path):
"""
Given a pyquery object, generate sphinx domain objects' docstrings.
Returns a dict with the generated data.
The returned dict is in the following form::
{
"domain-id-1": "docstrings for the domain-id-1",
"domain-id-2": "docstrings for the domain-id-2",
}
"""

domain_data = {}
dl_tags = body('dl')

for dl_tag in dl_tags:

dt = dl_tag.findall('dt')
dd = dl_tag.findall('dd')

# len(dt) should be equal to len(dd)
# because these tags go together.
for title, desc in zip(dt, dd):
try:
id_ = title.attrib.get('id')
if id_:
# clone the PyQuery objects so that
# the original one remains undisturbed
docstrings = _get_text_for_domain_data(PyQuery(desc).clone())
domain_data[id_] = docstrings
except Exception:
log.exception('Error parsing docstrings for domains in file %s', fjson_storage_path)

return domain_data
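
An end-to-end sketch of the new extraction, assuming the functions above are importable and run against a hand-written fragment of Sphinx's dl/dt/dd markup (identifiers mirror the support.json fixture below):

    from pyquery import PyQuery

    html = (
        '<div>'
        '<dl class="py function">'
        '<dt id="celery.utils.deprecated.warn">celery.utils.deprecated.warn(sig)</dt>'
        '<dd><p>Warn of (pending) deprecation.</p></dd>'
        '</dl>'
        '</div>'
    )
    body = PyQuery(html)
    print(generate_domains_data_from_pyquery(body, 'docs/support.fjson'))
    # {'celery.utils.deprecated.warn': 'Warn of (pending) deprecation'}

This also shows why generate_sections_from_pyquery now drops <dl> blocks containing a <dt id=...>: without that, the same docstring text would be indexed twice, once as section content and once as domain data.
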
30 changes: 14 additions & 16 deletions readthedocs/search/tests/data/docs/support.json
@@ -5,7 +5,7 @@
{
"id": "usage-questions",
"title": "Usage Questions",
"content": "If you have questions about how to use Read the Docs, or have an issue that isn’t related to a bug, Stack Overflow is the best place to ask. Tag questions with read-the-docs so other folks can find them easily.. Good questions for Stack Overflow would be:. “What is the best way to structure the table of contents across a project?”. “How do I structure translations inside of my project for easiest contribution from users?”. “How do I use Sphinx to use SVG images in HTML output but PNG in PDF output?”"
"content": "For help, Stack Overflow is the palce. Tag questions with read-the-docs so other folks can find them easily.. Good questions for Stack Overflow would be:. “What is the best way to structure the table of contents across a project?”. “How do I structure translations inside of my project for easiest contribution from users?”. “How do I use Sphinx to use SVG images in HTML output but PNG in PDF output?”"
},
{
"id": "community-support",
@@ -20,22 +20,20 @@
],
"domains": [
{
"role_name": "http:post",
"doc_name": "api/v3.html",
"anchor": "post--api-v3-projects-(string-project_slug)-versions-(string-version_slug)-builds-",
"type_display": "post",
"doc_display": "API v3",
"name": "/api/v3/projects/(string:project_slug)/versions/(string:version_slug)/builds/",
"display_name": ""
"role_name": "py:function",
"anchor": "celery.utils.deprecated.warn",
"type_display": "function",
"name": "celery.utils.deprecated.warn"
},
{
"role_name": "http:patch",
"doc_name": "api/v3.html",
"anchor": "patch--api-v3-projects-(string-project_slug)-version-(string-version_slug)-",
"type_display": "patch",
"doc_display": "API v3",
"name": "/api/v3/projects/(string:project_slug)/version/(string:version_slug)/",
"display_name": ""
"role_name": "py:function",
"anchor": "celery.utils.deprecated.Property",
"type_display": "function",
"name": "celery.utils.deprecated.Property"
}
]
],
"domain_data": {
"celery.utils.deprecated.warn": "Warn of (pending) deprecation",
"celery.utils.deprecated.Property": "Decorator for deprecated properties"
}
}