Merge pull request #5979 from dojutsu-user/index-more-domain-data
Index more domain data into elasticsearch
ericholscher authored Aug 27, 2019
2 parents 9fb0b69 + 4beee77 commit 00ab116
Showing 17 changed files with 279 additions and 220 deletions.
7 changes: 3 additions & 4 deletions media/css/core.css
@@ -379,12 +379,11 @@ a.cta-btn:hover, a.cta-btn:active {

/* search */

.search {
border-bottom: solid 1px #bfbfbf;
margin-bottom: 24px;
}
.search { border-bottom: solid 1px #bfbfbf; margin-bottom: 24px; }
.search input[type=text] { float: left; margin-right: 10px; padding: 8px 10px; }
.search input[type=submit] { margin-top: 0; }
/* this is same as the css class ".highlighted" */
.search-result-item span { background-color: #ee9; padding: 0 1px; margin: 0 1px; border-radius: 3px; -moz-border-radius: 3px; -webkit-border-radius: 3px; }

.filter { margin-bottom: 1em; }
.filter dd { display: inline-block; margin-right: 0.75em; }
33 changes: 18 additions & 15 deletions readthedocs/core/static-src/core/js/doc-embed/search.js
@@ -78,8 +78,8 @@ function attach_elastic_search_query(data) {

// Creating the result from elements
var link = doc.link + DOCUMENTATION_OPTIONS.FILE_SUFFIX + "?highlight=" + $.urlencode(query);

var item = $('<a>', {'href': link});

item.html(title);
item.find('span').addClass('highlighted');
list_item.append(item);
@@ -88,7 +88,6 @@ function attach_elastic_search_query(data) {
if (doc.project !== project) {
var text = " (from project " + doc.project + ")";
var extra = $('<span>', {'text': text});

list_item.append(extra);
}

@@ -103,10 +102,12 @@
var content = "";

var domain = "";
var domain_subtitle = "";
var domain_role_name = "";
var domain_subtitle_link = "";
var domain_content = "";
var domain_name = "";
var domain_subtitle = "";
var domain_content = "";
var domain_docstrings = "";

var section_template = '' +
'<div>' +
@@ -136,7 +137,7 @@ function attach_elastic_search_query(data) {
section = inner_hits[j];
section_subtitle = section._source.title;
section_subtitle_link = link + "#" + section._source.id;
section_content = [section._source.content.substring(0, MAX_SUBSTRING_LIMIT) + " ..."];
section_content = [section._source.content.substr(0, MAX_SUBSTRING_LIMIT) + " ..."];

if (section.highlight) {
if (section.highlight["sections.title"]) {
@@ -171,27 +172,29 @@
if (inner_hits[j].type === "domains") {

domain = inner_hits[j];
domain_subtitle = domain._source.role_name;
domain_role_name = domain._source.role_name;
domain_subtitle_link = link + "#" + domain._source.anchor;
domain_content = "";
domain_name = domain._source.name;
domain_subtitle = "";
domain_content = "";
domain_docstrings = "";

if (
typeof domain._source.display_name === "string" &&
domain._source.display_name.length >= 1
) {
domain_subtitle = "(" + domain._source.role_name + ") " + domain._source.display_name;
if (domain._source.docstrings !== "") {
domain_docstrings = domain._source.docstrings.substr(0, MAX_SUBSTRING_LIMIT) + " ...";
}

if (domain.highlight) {
if (domain.highlight["domains.docstrings"]) {
domain_docstrings = "... " + xss(domain.highlight["domains.docstrings"][0]) + " ...";
}

if (domain.highlight["domains.name"]) {
// domain_content = type_display -- name
domain_name = xss(domain.highlight["domains.name"][0]);
}
}

// domain_content = type_display -- name -- in doc_display
domain_content = domain._source.type_display + " -- " + domain_name + " -- in " + domain._source.doc_display;
domain_subtitle = "[" + domain_role_name + "]: " + domain_name;
domain_content = domain_docstrings;

append_html_to_contents(
contents,
2 changes: 1 addition & 1 deletion readthedocs/core/static/core/js/readthedocs-doc-embed.js

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions readthedocs/projects/models.py
@@ -1305,6 +1305,7 @@ def get_processed_json(self):
'path': file_path,
'title': '',
'sections': [],
'domain_data': {},
}

@cached_property
2 changes: 1 addition & 1 deletion readthedocs/projects/static/projects/js/tools.js

Large diffs are not rendered by default.

10 changes: 4 additions & 6 deletions readthedocs/search/documents.py
@@ -88,17 +88,15 @@ class PageDocument(RTDDocTypeMixin, DocType):
'role_name': fields.KeywordField(),

# For linking to the URL
'doc_name': fields.KeywordField(),
'anchor': fields.KeywordField(),

# For showing in the search result
'type_display': fields.TextField(),
'doc_display': fields.TextField(),
'docstrings': fields.TextField(),

# Simple analyzer breaks on `.`,
# otherwise search results are too strict for this use case
'name': fields.TextField(analyzer='simple'),
'display_name': fields.TextField(analyzer='simple'),
}
)

@@ -122,12 +120,12 @@ def prepare_domains(self, html_file):
all_domains = [
{
'role_name': domain.role_name,
'doc_name': domain.doc_name,
'anchor': domain.anchor,
'type_display': domain.type_display,
'doc_display': domain.doc_display,
'docstrings': html_file.processed_json.get(
'domain_data', {}
).get(domain.anchor, ''),
'name': domain.name,
'display_name': domain.display_name if domain.display_name != '-' else '',
}
for domain in domains_qs
]
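
A rough sketch (not the project's code; values borrowed from the support.json fixture below) of the entry that `prepare_domains` now builds for each domain, with `docstrings` looked up in the page's processed JSON by the domain's anchor:

    processed_json = {
        'domain_data': {
            'celery.utils.deprecated.warn': 'Warn of (pending) deprecation',
        },
    }

    anchor = 'celery.utils.deprecated.warn'
    domain_entry = {
        'role_name': 'py:function',
        'anchor': anchor,  # for linking to the URL
        'type_display': 'function',  # for showing in the search result
        'docstrings': processed_json.get('domain_data', {}).get(anchor, ''),
        'name': anchor,  # for a Python function, name and anchor coincide
    }
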
26 changes: 15 additions & 11 deletions readthedocs/search/faceted_search.py
@@ -104,9 +104,8 @@ class PageSearchBase(RTDFacetedSearch):
_outer_fields = ['title^4']
_section_fields = ['sections.title^3', 'sections.content']
_domain_fields = [
'domains.type_display',
'domains.name^2',
'domains.display_name',
'domains.docstrings',
]
_common_highlight_options = {
'encoder': 'html',
@@ -134,8 +133,17 @@ def query(self, search, query):
"""Manipulates query to support nested query."""
search = search.highlight_options(**self._common_highlight_options)

all_queries = []

# match query for the title (of the page) field.
match_title_query = Match(title=query)
for operator in self.operators:
all_queries.append(
SimpleQueryString(
query=query,
fields=self.fields,
default_operator=operator
)
)

# nested query for search in sections
sections_nested_query = self.generate_nested_query(
@@ -162,21 +170,17 @@
'highlight': dict(
self._common_highlight_options,
fields={
'domains.type_display': {},
'domains.name': {},
'domains.display_name': {},
'domains.docstrings': {},
}
)
}
)

final_query = Bool(should=[
match_title_query,
sections_nested_query,
domains_nested_query,
])

all_queries.extend([sections_nested_query, domains_nested_query])
final_query = Bool(should=all_queries)
search = search.query(final_query)

return search

def generate_nested_query(self, query, path, fields, inner_hits):
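
For readers unfamiliar with elasticsearch-dsl, here is a minimal sketch of the query shape assembled above. The field lists and operators are assumptions taken from this diff, and the real `generate_nested_query` also attaches `inner_hits` highlighting, which is omitted here:

    from elasticsearch_dsl import Search
    from elasticsearch_dsl.query import Bool, Nested, SimpleQueryString

    query = 'deprecated'
    operators = ['and', 'or']  # stands in for self.operators

    # One SimpleQueryString per operator over the page-level fields.
    all_queries = [
        SimpleQueryString(query=query, fields=['title^4'], default_operator=op)
        for op in operators
    ]

    # Nested queries reach into the sections and domains sub-documents.
    all_queries.append(Nested(path='sections', query=SimpleQueryString(
        query=query, fields=['sections.title^3', 'sections.content'],
    )))
    all_queries.append(Nested(path='domains', query=SimpleQueryString(
        query=query, fields=['domains.name^2', 'domains.docstrings'],
    )))

    # A document matches if any single sub-query matches, so a strong
    # domain hit can surface a page whose title does not match at all.
    search = Search().query(Bool(should=all_queries))
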
94 changes: 81 additions & 13 deletions readthedocs/search/parse_json.py
@@ -12,8 +12,23 @@
log = logging.getLogger(__name__)


def generate_sections_from_pyquery(body):
def generate_sections_from_pyquery(body, fjson_storage_path):
"""Given a pyquery object, generate section dicts for each section."""

# Removing all <dl> tags to prevent duplicate indexing with Sphinx Domains.
try:
# remove all <dl> tags which contains <dt> tags having 'id' attribute
dt_tags = body('dt[id]')
dt_tags.parents('dl').remove()
except Exception:
log.exception('Error removing <dl> tags from file: %s', fjson_storage_path)

# remove toctree elements
try:
body('.toctree-wrapper').remove()
except Exception:
log.exception('Error removing toctree elements from file: %s', fjson_storage_path)

# Capture text inside h1 before the first h2
h1_section = body('.section > h1')
if h1_section:
@@ -27,7 +42,12 @@ def generate_sections_from_pyquery(body):
if 'section' in next_p[0].attrib['class']:
break

h1_content += parse_content(next_p.text())
text = parse_content(next_p.text(), remove_first_line=True)
if h1_content:
h1_content = f'{h1_content.rstrip(".")}. {text}'
else:
h1_content = text

next_p = next_p.next()
if h1_content:
yield {
@@ -45,7 +65,7 @@ def generate_sections_from_pyquery(body):
section_id = div.attr('id')

content = div.text()
content = parse_content(content)
content = parse_content(content, remove_first_line=True)

yield {
'id': section_id,
@@ -74,6 +94,7 @@ def process_file(fjson_storage_path):
sections = []
path = ''
title = ''
domain_data = {}

if 'current_page_name' in data:
path = data['current_page_name']
@@ -82,7 +103,8 @@

if data.get('body'):
body = PyQuery(data['body'])
sections.extend(generate_sections_from_pyquery(body))
sections.extend(generate_sections_from_pyquery(body.clone(), fjson_storage_path))
domain_data = generate_domains_data_from_pyquery(body.clone(), fjson_storage_path)
else:
log.info('Unable to index content for: %s', fjson_storage_path)

@@ -96,24 +118,70 @@
'path': path,
'title': title,
'sections': sections,
'domain_data': domain_data,
}


def parse_content(content):
"""
Removes the starting text and ¶.
It removes the starting text from the content
because it contains the title of that content,
which is redundant here.
"""
def parse_content(content, remove_first_line=False):
"""Removes new line characters and ¶."""
content = content.replace('¶', '').strip()

# removing the starting text of each
content = content.split('\n')
if len(content) > 1: # there were \n
if remove_first_line and len(content) > 1:
content = content[1:]

# converting newlines to ". "
content = '. '.join([text.strip().rstrip('.') for text in content])
return content
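
A quick illustration of `parse_content` on made-up input: the pilcrow is dropped, newline-separated segments are joined with '. ', and `remove_first_line` strips the leading section title:

    text = 'Usage Questions¶\nFor help, Stack Overflow is the place.\nTag questions with read-the-docs.'
    parse_content(text, remove_first_line=True)
    # -> 'For help, Stack Overflow is the place. Tag questions with read-the-docs'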


def _get_text_for_domain_data(desc_contents):
"""Returns the text from the PyQuery object ``desc_contents``."""
# remove the 'dl', 'dt' and 'dd' tags from it
# because all the 'dd' and 'dt' tags are inside 'dl'
# and all 'dl' tags are already captured.
desc_contents.remove('dl')
desc_contents.remove('dt')
desc_contents.remove('dd')

# remove multiple spaces, new line characters and '¶' symbol.
docstrings = parse_content(desc_contents.text())
return docstrings


def generate_domains_data_from_pyquery(body, fjson_storage_path):
"""
Given a pyquery object, generate sphinx domain objects' docstrings.
Returns a dict with the generated data.
The returned dict is in the following form::
{
"domain-id-1": "docstrings for the domain-id-1",
"domain-id-2": "docstrings for the domain-id-2",
}
"""

domain_data = {}
dl_tags = body('dl')

for dl_tag in dl_tags:

dt = dl_tag.findall('dt')
dd = dl_tag.findall('dd')

# len(dt) should be equal to len(dd)
# because these tags go together.
for title, desc in zip(dt, dd):
try:
id_ = title.attrib.get('id')
if id_:
# clone the PyQuery objects so that
# the original one remains undisturbed
docstrings = _get_text_for_domain_data(PyQuery(desc).clone())
domain_data[id_] = docstrings
except Exception:
log.exception('Error parsing docstrings for domains in file %s', fjson_storage_path)

return domain_data
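
An end-to-end sketch of the new extraction, assuming the functions above are importable and run against a hand-written fragment of Sphinx's dl/dt/dd markup (identifiers mirror the support.json fixture below):

    from pyquery import PyQuery

    html = (
        '<div>'
        '<dl class="py function">'
        '<dt id="celery.utils.deprecated.warn">celery.utils.deprecated.warn(sig)</dt>'
        '<dd><p>Warn of (pending) deprecation.</p></dd>'
        '</dl>'
        '</div>'
    )
    body = PyQuery(html)
    print(generate_domains_data_from_pyquery(body, 'docs/support.fjson'))
    # {'celery.utils.deprecated.warn': 'Warn of (pending) deprecation'}

This also shows why generate_sections_from_pyquery now drops <dl> blocks containing a <dt id=...>: without that, the same docstring text would be indexed twice, once as section content and once as domain data.
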
30 changes: 14 additions & 16 deletions readthedocs/search/tests/data/docs/support.json
@@ -5,7 +5,7 @@
{
"id": "usage-questions",
"title": "Usage Questions",
"content": "If you have questions about how to use Read the Docs, or have an issue that isn’t related to a bug, Stack Overflow is the best place to ask. Tag questions with read-the-docs so other folks can find them easily.. Good questions for Stack Overflow would be:. “What is the best way to structure the table of contents across a project?”. “How do I structure translations inside of my project for easiest contribution from users?”. “How do I use Sphinx to use SVG images in HTML output but PNG in PDF output?”"
"content": "For help, Stack Overflow is the palce. Tag questions with read-the-docs so other folks can find them easily.. Good questions for Stack Overflow would be:. “What is the best way to structure the table of contents across a project?”. “How do I structure translations inside of my project for easiest contribution from users?”. “How do I use Sphinx to use SVG images in HTML output but PNG in PDF output?”"
},
{
"id": "community-support",
@@ -20,22 +20,20 @@
],
"domains": [
{
"role_name": "http:post",
"doc_name": "api/v3.html",
"anchor": "post--api-v3-projects-(string-project_slug)-versions-(string-version_slug)-builds-",
"type_display": "post",
"doc_display": "API v3",
"name": "/api/v3/projects/(string:project_slug)/versions/(string:version_slug)/builds/",
"display_name": ""
"role_name": "py:function",
"anchor": "celery.utils.deprecated.warn",
"type_display": "function",
"name": "celery.utils.deprecated.warn"
},
{
"role_name": "http:patch",
"doc_name": "api/v3.html",
"anchor": "patch--api-v3-projects-(string-project_slug)-version-(string-version_slug)-",
"type_display": "patch",
"doc_display": "API v3",
"name": "/api/v3/projects/(string:project_slug)/version/(string:version_slug)/",
"display_name": ""
"role_name": "py:function",
"anchor": "celery.utils.deprecated.Property",
"type_display": "function",
"name": "celery.utils.deprecated.Property"
}
]
],
"domain_data": {
"celery.utils.deprecated.warn": "Warn of (pending) deprecation",
"celery.utils.deprecated.Property": "Decorator for deprecated properties"
}
}