Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft: Add test filesection indexer #11791

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions readthedocs/filesections/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""
Module for the file sections feature.

This feature is used to store the title and path name of each page in the index.
"""

import json
import logging

from readthedocs.builds.models import Version
from readthedocs.filesections.dataclasses import FileSectionManifest
from readthedocs.projects.constants import MEDIA_TYPE_SECTIONS
from readthedocs.storage import build_media_storage

SECTION_MANIFEST_FILE_NAME = "sections_manifest.json"

log = logging.getLogger(__name__)


def get_section_manifest(version: Version) -> FileSectionManifest | None:
storage_path = version.project.get_storage_path(
type_=MEDIA_TYPE_SECTIONS,
version_slug=version.slug,
include_file=False,
version_type=version.type,
)
manifest_path = build_media_storage.join(storage_path, SECTION_MANIFEST_FILE_NAME)
try:
with build_media_storage.open(manifest_path) as manifest_file:
manifest = json.load(manifest_file)
log.info(f"Loaded section manifest from {manifest_path}")
except FileNotFoundError:
log.warning(f"Section manifest not found at {manifest_path}")
return None

return FileSectionManifest.from_dict(manifest)


def write_section_manifest(version: Version, manifest: FileSectionManifest):
storage_path = version.project.get_storage_path(
type_=MEDIA_TYPE_SECTIONS,
version_slug=version.slug,
include_file=False,
version_type=version.type,
)
manifest_path = build_media_storage.join(storage_path, SECTION_MANIFEST_FILE_NAME)
with build_media_storage.open(manifest_path, "w") as f:
manifest_dict = manifest.as_dict()
log.info(f"Writing section manifest: {manifest_dict}")
json.dump(manifest_dict, f)
38 changes: 38 additions & 0 deletions readthedocs/filesections/dataclasses.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from dataclasses import asdict, dataclass


@dataclass(slots=True)
class FileSection:
id: str
title: str


@dataclass(slots=True)
class Page:
path: str
sections: list[FileSection]


@dataclass(slots=True)
class FileSectionManifest:
build: int
pages: list[Page]

def __init__(self, build_id: int, pages: list[Page]):
self.build = build_id
self.pages = pages

@classmethod
def from_dict(cls, data: dict) -> "FileSectionManifest":
build_id = data["build"]
pages = [
Page(
path=page["path"],
sections=[FileSection(**section) for section in page["sections"]],
Comment on lines +30 to +31
Copy link
Member

@humitos humitos Nov 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It probably makes sense to save the title of the page itself as well -- so we don't show release.html only but, Releasing Addons :: release.html or similar.

)
for page in data["pages"]
]
return cls(build_id, pages)

def as_dict(self) -> dict:
return asdict(self)
2 changes: 2 additions & 0 deletions readthedocs/projects/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
MEDIA_TYPE_HTMLZIP = "htmlzip"
MEDIA_TYPE_JSON = "json"
MEDIA_TYPE_DIFF = "diff"
MEDIA_TYPE_SECTIONS = "sections"
DOWNLOADABLE_MEDIA_TYPES = (
MEDIA_TYPE_PDF,
MEDIA_TYPE_EPUB,
Expand All @@ -47,6 +48,7 @@
MEDIA_TYPE_HTMLZIP,
MEDIA_TYPE_JSON,
MEDIA_TYPE_DIFF,
MEDIA_TYPE_SECTIONS,
)

BUILD_COMMANDS_OUTPUT_PATH = "_readthedocs/"
Expand Down
39 changes: 39 additions & 0 deletions readthedocs/projects/tasks/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

from readthedocs.builds.constants import BUILD_STATE_FINISHED, INTERNAL, LATEST
from readthedocs.builds.models import Build, Version
from readthedocs.filesections import write_section_manifest
from readthedocs.filesections.dataclasses import FileSection, FileSectionManifest, Page
from readthedocs.filetreediff import write_manifest
from readthedocs.filetreediff.dataclasses import FileTreeDiffFile, FileTreeDiffManifest
from readthedocs.projects.models import HTMLFile, Project
Expand Down Expand Up @@ -142,6 +144,36 @@ def collect(self, sync_id: int):
write_manifest(self.version, manifest)


class FileSectionIndexer(Indexer):
def __init__(self, version: Version, build: Build):
self.version = version
self.build = build
self.pages = []

def process(self, html_file: HTMLFile, sync_id: int):
log.debug("Processing file for sections", path=html_file.path)
processed_json = html_file.processed_json
if processed_json:
sections = [
FileSection(
id=section["id"],
title=section["title"],
)
for section in processed_json.get("sections", [])
]
self.pages.append(Page(path=html_file.path, sections=sections))
log.debug("Finished processing file for sections", path=html_file.path)

def collect(self, sync_id: int):
log.debug("Collecting sections for manifest", build_id=self.build.id)
manifest = FileSectionManifest(
build_id=self.build.id,
pages=self.pages,
)
write_section_manifest(self.version, manifest)
log.debug("Finished collecting sections for manifest", build_id=self.build.id)


def _get_indexers(*, version: Version, build: Build, search_index_name=None):
build_config = build.config or {}
search_config = build_config.get("search", {})
Expand Down Expand Up @@ -186,6 +218,13 @@ def _get_indexers(*, version: Version, build: Build, search_index_name=None):
version=version,
)
indexers.append(index_file_indexer)

file_section_indexer = FileSectionIndexer(
version=version,
build=build,
)
indexers.append(file_section_indexer)

return indexers


Expand Down
36 changes: 36 additions & 0 deletions readthedocs/proxito/views/hosting.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from readthedocs.core.resolver import Resolver
from readthedocs.core.unresolver import UnresolverError, unresolver
from readthedocs.core.utils.extend import SettingsOverrideObject
from readthedocs.filesections import get_section_manifest
from readthedocs.filetreediff import get_diff
from readthedocs.projects.constants import (
ADDONS_FLYOUT_SORTING_CALVER,
Expand Down Expand Up @@ -531,6 +532,9 @@ def _v1(self, project, version, build, filename, url, request):
"filetreediff": {
"enabled": False,
},
"filesections": {
"enabled": False,
},
},
}

Expand All @@ -544,6 +548,13 @@ def _v1(self, project, version, build, filename, url, request):
if response:
data["addons"]["filetreediff"].update(response)

sections_response = self._get_filesections_response(
project=project,
version=version,
)
if sections_response:
data["addons"]["filesections"].update(sections_response)

# Show the subprojects filter on the parent project and subproject
# TODO: Remove these queries and try to find a way to get this data
# from the resolver, which has already done these queries.
Expand Down Expand Up @@ -707,6 +718,31 @@ def _get_filetreediff_response(self, *, request, project, version, resolver):
},
}

def _get_filesections_response(self, *, project, version):
"""
Get the file sections response for the given version.
"""
manifest = get_section_manifest(version)
if not manifest:
return None

return {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Example in dev from Addons.

{
    "enabled": true,
    "sections": [
        {
            "path": "genindex.html",
            "sections": [
                {
                    "id": "index",
                    "title": "Index"
                }
            ]
        },
        {
            "path": "index.html",
            "sections": [
                {
                    "id": "read-the-docs-addons",
                    "title": "🦸 Read the Docs Addons"
                },
                {
                    "id": "examples",
                    "title": "Examples"
                },
                {
                    "id": "contents",
                    "title": "Contents"
                }
            ]
        },
        {
            "path": "releasing.html",
            "sections": [
                {
                    "id": "releasing",
                    "title": "Releasing"
                }
            ]
        },
        {
            "path": "search.html",
            "sections": [
                {
                    "id": "",
                    "title": "Search - Read the Docs Addons documentation"
                }
            ]
        },
        {
            "path": "testing.html",
            "sections": [
                {
                    "id": "testing",
                    "title": "Testing"
                },
                {
                    "id": "localhost",
                    "title": "localhost"
                },
                {
                    "id": "read-the-docs",
                    "title": "Read the Docs"
                }
            ]
        }
    ]
}

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We shouldn't return the content of manifest here, but instead, we should return the URL where the frontend would download it and parse it on the client side. Otherwise, this response will be huge and most of the times it's not going to be used.

Copy link
Member

@humitos humitos Nov 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By the way, I think we will probably want to do the same for filetreediff as well; since there will be PRs will a ton of files changed.

Besides, that will help us to invalidate the cache for filetreediff when a new build is done, but keep cached the /_/addons/ endpoint for the whole project, as we are currently doing.

Copy link
Member Author

@ericholscher ericholscher Nov 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yea I just copied the implementation from filetreediff. I figured in production we would only pull the data when somebody pulls up the command pallet, but this made it easy to test.

In theory, the data is also already in our search index, I just wasn't 100% sure how to get it out of elastic.

"enabled": True,
"pages": [
{
"path": page.path,
"sections": [
{
"id": section.id,
"title": section.title,
}
for section in page.sections
],
}
for page in manifest.pages
],
}

def _v2(self, project, version, build, filename, url, user):
return {
"api_version": "2",
Expand Down