
Commit

Merge branch 'main' into docs-collection-sharing-edits
Shrinks99 authored Feb 12, 2025
2 parents 32e0cc6 + f7b9b73 commit 58c9ba6
Showing 59 changed files with 3,136 additions and 1,610 deletions.
42 changes: 36 additions & 6 deletions backend/btrixcloud/colls.py
@@ -3,7 +3,7 @@
"""

# pylint: disable=too-many-lines

from datetime import datetime
from collections import Counter
from uuid import UUID, uuid4
from typing import Optional, List, TYPE_CHECKING, cast, Dict, Tuple, Any, Union
@@ -20,10 +20,12 @@

from .pagination import DEFAULT_PAGE_SIZE, paginated_format
from .models import (
AnyHttpUrl,
Collection,
CollIn,
CollOut,
CollIdName,
CollectionThumbnailSource,
UpdateColl,
AddRemoveCrawlList,
BaseCrawl,
@@ -753,7 +755,7 @@ async def list_urls_in_collection(
page_size: int = DEFAULT_PAGE_SIZE,
page: int = 1,
) -> Tuple[List[PageUrlCount], int]:
"""List all URLs in collection sorted desc by snapshot count"""
"""List all URLs in collection sorted desc by snapshot count unless prefix is specified"""
# pylint: disable=duplicate-code, too-many-locals, too-many-branches, too-many-statements
# Zero-index page for query
page = page - 1
@@ -762,13 +764,15 @@
crawl_ids = await self.get_collection_crawl_ids(coll_id)

match_query: dict[str, object] = {"oid": oid, "crawl_id": {"$in": crawl_ids}}
sort_query: dict[str, int] = {"count": -1, "_id": 1}

if url_prefix:
url_prefix = urllib.parse.unquote(url_prefix)
regex_pattern = f"^{re.escape(url_prefix)}"
match_query["url"] = {"$regex": regex_pattern, "$options": "i"}
sort_query = {"_id": 1}

aggregate = [{"$match": match_query}]
aggregate: List[Dict[str, Union[int, object]]] = [{"$match": match_query}]

aggregate.extend(
[
@@ -779,7 +783,7 @@
"count": {"$sum": 1},
},
},
{"$sort": {"count": -1}},
{"$sort": sort_query},
{"$set": {"url": "$_id"}},
{
"$facet": {
@@ -843,8 +847,17 @@ async def set_home_url(

return {"updated": True}

# pylint: disable=too-many-locals
async def upload_thumbnail_stream(
self, stream, filename: str, coll_id: UUID, org: Organization, user: User
self,
stream,
filename: str,
coll_id: UUID,
org: Organization,
user: User,
source_url: Optional[AnyHttpUrl] = None,
source_ts: Optional[datetime] = None,
source_page_id: Optional[UUID] = None,
) -> Dict[str, bool]:
"""Upload file as stream to use as collection thumbnail"""
coll = await self.get_collection(coll_id)
@@ -903,6 +916,13 @@ async def stream_iter():

coll.thumbnail = thumbnail_file

if source_url and source_ts and source_page_id:
coll.thumbnailSource = CollectionThumbnailSource(
url=source_url,
urlTs=source_ts,
urlPageId=source_page_id,
)

# Update entire document to avoid bson.errors.InvalidDocument exception
await self.collections.find_one_and_update(
{"_id": coll_id, "oid": org.id},
@@ -1226,11 +1246,21 @@ async def upload_thumbnail_stream(
request: Request,
filename: str,
coll_id: UUID,
sourceUrl: Optional[AnyHttpUrl],
sourceTs: Optional[datetime],
sourcePageId: Optional[UUID],
org: Organization = Depends(org_crawl_dep),
user: User = Depends(user_dep),
):
return await colls.upload_thumbnail_stream(
request.stream(), filename, coll_id, org, user
request.stream(),
filename,
coll_id,
org,
user,
sourceUrl,
sourceTs,
sourcePageId,
)

@app.delete(
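A minimal sketch of the URL-listing change in colls.py above: when a url_prefix is supplied, the match adds a case-insensitive prefix regex and results are sorted by URL only; otherwise they stay sorted by snapshot count descending. The standalone helper below is illustrative (not part of the module), and the $group key is inferred from the $set stage shown in the diff.

import re
import urllib.parse
from typing import Dict, List, Optional
from uuid import UUID


def build_url_count_pipeline(
    oid: UUID, crawl_ids: List[str], url_prefix: Optional[str] = None
) -> List[Dict]:
    match_query: Dict = {"oid": oid, "crawl_id": {"$in": crawl_ids}}
    # default: most-snapshotted URLs first, URL as tiebreaker
    sort_query: Dict[str, int] = {"count": -1, "_id": 1}

    if url_prefix:
        url_prefix = urllib.parse.unquote(url_prefix)
        match_query["url"] = {"$regex": f"^{re.escape(url_prefix)}", "$options": "i"}
        # prefix search: plain lexicographic order by URL instead of count ordering
        sort_query = {"_id": 1}

    return [
        {"$match": match_query},
        # group by page URL (as implied by the $set stage in the diff)
        {"$group": {"_id": "$url", "count": {"$sum": 1}}},
        {"$sort": sort_query},
        {"$set": {"url": "$_id"}},
    ]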
26 changes: 19 additions & 7 deletions backend/btrixcloud/crawlconfigs.py
@@ -251,11 +251,6 @@ async def add_crawl_config(
crawlconfig.lastStartedBy = user.id
crawlconfig.lastStartedByName = user.name

# Ensure page limit is below org maxPagesPerCall if set
max_pages = org.quotas.maxPagesPerCrawl or 0
if max_pages > 0:
crawlconfig.config.limit = max_pages

# add CrawlConfig to DB here
result = await self.crawl_configs.insert_one(crawlconfig.to_dict())

@@ -286,13 +281,30 @@ async def add_crawl_config(
execMinutesQuotaReached=exec_mins_quota_reached,
)

def ensure_quota_page_limit(self, crawlconfig: CrawlConfig, org: Organization):
"""ensure page limit is set to no greater than quota page limit, if any"""
if org.quotas.maxPagesPerCrawl and org.quotas.maxPagesPerCrawl > 0:
if crawlconfig.config.limit and crawlconfig.config.limit > 0:
crawlconfig.config.limit = min(
org.quotas.maxPagesPerCrawl, crawlconfig.config.limit
)
else:
crawlconfig.config.limit = org.quotas.maxPagesPerCrawl

async def add_new_crawl(
self, crawl_id: str, crawlconfig: CrawlConfig, user: User, manual: bool
self,
crawl_id: str,
crawlconfig: CrawlConfig,
user: User,
org: Organization,
manual: bool,
) -> None:
"""increments crawl count for this config and adds new crawl"""

started = dt_now()

self.ensure_quota_page_limit(crawlconfig, org)

inc = self.inc_crawl_count(crawlconfig.id)
add = self.crawl_ops.add_new_crawl(
crawl_id, crawlconfig, user.id, started, manual
@@ -892,7 +904,7 @@ async def run_now_internal(
storage_filename=storage_filename,
profile_filename=profile_filename or "",
)
await self.add_new_crawl(crawl_id, crawlconfig, user, manual=True)
await self.add_new_crawl(crawl_id, crawlconfig, user, org, manual=True)
return crawl_id

except Exception as exc:
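The clamping rule added in ensure_quota_page_limit, shown as a standalone sketch for clarity (the real method mutates crawlconfig.config.limit in place):

def effective_page_limit(config_limit, quota_limit):
    """Page limit after applying an org maxPagesPerCrawl quota, if any."""
    if not quota_limit or quota_limit <= 0:
        return config_limit  # no quota set: keep the configured limit
    if config_limit and config_limit > 0:
        return min(config_limit, quota_limit)  # both set: take the smaller
    return quota_limit  # unlimited config: quota becomes the limit


assert effective_page_limit(500, 100) == 100
assert effective_page_limit(50, 100) == 50
assert effective_page_limit(0, 100) == 100
assert effective_page_limit(500, 0) == 500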
9 changes: 8 additions & 1 deletion backend/btrixcloud/main.py
@@ -248,7 +248,14 @@ def main() -> None:
upload_ops = init_uploads_api(*base_crawl_init)

page_ops = init_pages_api(
app, mdb, crawls, org_ops, storage_ops, background_job_ops, current_active_user
app,
mdb,
crawls,
org_ops,
storage_ops,
background_job_ops,
coll_ops,
current_active_user,
)

base_crawl_ops.set_page_ops(page_ops)
12 changes: 12 additions & 0 deletions backend/btrixcloud/models.py
@@ -1236,6 +1236,15 @@ class CollAccessType(str, Enum):
PUBLIC = "public"


# ============================================================================
class CollectionThumbnailSource(BaseModel):
"""The page source for a thumbnail"""

url: AnyHttpUrl
urlTs: datetime
urlPageId: UUID


# ============================================================================
class Collection(BaseMongoModel):
"""Org collection structure"""
@@ -1268,6 +1277,7 @@ class Collection(BaseMongoModel):
homeUrlPageId: Optional[UUID] = None

thumbnail: Optional[ImageFile] = None
thumbnailSource: Optional[CollectionThumbnailSource] = None
defaultThumbnailName: Optional[str] = None

allowPublicDownload: Optional[bool] = True
@@ -1323,6 +1333,7 @@ class CollOut(BaseMongoModel):

resources: List[CrawlFileOut] = []
thumbnail: Optional[ImageFileOut] = None
thumbnailSource: Optional[CollectionThumbnailSource] = None
defaultThumbnailName: Optional[str] = None

allowPublicDownload: bool = True
@@ -1372,6 +1383,7 @@ class UpdateColl(BaseModel):
access: Optional[CollAccessType] = None
defaultThumbnailName: Optional[str] = None
allowPublicDownload: Optional[bool] = None
thumbnailSource: Optional[CollectionThumbnailSource] = None


# ============================================================================
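For reference, a minimal sketch of constructing the new CollectionThumbnailSource model; the class mirrors the definition in the diff above, and the values are illustrative only:

from datetime import datetime, timezone
from uuid import UUID, uuid4

from pydantic import AnyHttpUrl, BaseModel


class CollectionThumbnailSource(BaseModel):
    """The page source for a thumbnail"""

    url: AnyHttpUrl
    urlTs: datetime
    urlPageId: UUID


# illustrative values, not from the codebase
source = CollectionThumbnailSource(
    url="https://example.com/some/page",
    urlTs=datetime.now(timezone.utc),
    urlPageId=uuid4(),
)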
26 changes: 10 additions & 16 deletions backend/btrixcloud/operator/crawls.py
@@ -6,6 +6,7 @@
from pprint import pprint
from typing import Optional, Any, Sequence
from datetime import datetime
from uuid import UUID

import json

@@ -29,7 +30,6 @@
CrawlFile,
CrawlCompleteIn,
StorageRef,
Organization,
)

from btrixcloud.utils import str_to_date, date_to_str, dt_now
@@ -145,11 +145,13 @@ async def sync_crawls(self, data: MCSyncData):
params["userid"] = spec.get("userid", "")

pods = data.children[POD]
org = await self.org_ops.get_org_by_id(UUID(oid))

crawl = CrawlSpec(
id=crawl_id,
cid=cid,
oid=oid,
org=org,
storage=StorageRef(spec["storageName"]),
crawler_channel=spec.get("crawlerChannel"),
proxy_id=spec.get("proxyId"),
@@ -204,8 +206,6 @@ async def sync_crawls(self, data: MCSyncData):
await self.k8s.delete_crawl_job(crawl.id)
return {"status": status.dict(exclude_none=True), "children": []}

org = None

# first, check storage quota, and fail immediately if quota reached
if status.state in (
"starting",
@@ -215,7 +215,6 @@
# only check on very first run, before any pods/pvcs created
# for now, allow if crawl has already started (pods/pvcs created)
if not pods and not data.children[PVC]:
org = await self.org_ops.get_org_by_id(crawl.oid)
if self.org_ops.storage_quota_reached(org):
await self.mark_finished(
crawl, status, "skipped_storage_quota_reached"
@@ -229,7 +228,7 @@
return self._empty_response(status)

if status.state in ("starting", "waiting_org_limit"):
if not await self.can_start_new(crawl, data, status, org):
if not await self.can_start_new(crawl, data, status):
return self._empty_response(status)

await self.set_state(
@@ -382,8 +381,9 @@ async def _load_crawl_configmap(self, crawl: CrawlSpec, children, params):

crawlconfig = await self.crawl_config_ops.get_crawl_config(crawl.cid, crawl.oid)

raw_config = crawlconfig.get_raw_config()
self.crawl_config_ops.ensure_quota_page_limit(crawlconfig, crawl.org)

raw_config = crawlconfig.get_raw_config()
raw_config["behaviors"] = self._filter_autoclick_behavior(
raw_config["behaviors"], params["crawler_image"]
)
@@ -637,14 +637,10 @@ async def can_start_new(
crawl: CrawlSpec,
data: MCSyncData,
status: CrawlStatus,
org: Optional[Organization] = None,
):
"""return true if crawl can start, otherwise set crawl to 'queued' state
until more crawls for org finish"""
if not org:
org = await self.org_ops.get_org_by_id(crawl.oid)

max_crawls = org.quotas.maxConcurrentCrawls or 0
max_crawls = crawl.org.quotas.maxConcurrentCrawls or 0
if not max_crawls:
return True

@@ -1238,15 +1234,13 @@ def get_log_line(self, message, details):
}
return json.dumps(err)

async def add_file_to_crawl(self, cc_data, crawl, redis):
async def add_file_to_crawl(self, cc_data, crawl: CrawlSpec, redis):
"""Handle finished CrawlFile to db"""

filecomplete = CrawlCompleteIn(**cc_data)

org = await self.org_ops.get_org_by_id(crawl.oid)

filename = self.storage_ops.get_org_relative_path(
org, crawl.storage, filecomplete.filename
crawl.org, crawl.storage, filecomplete.filename
)

crawl_file = CrawlFile(
@@ -1299,7 +1293,7 @@ async def is_crawl_stopping(
return "size-limit"

# gracefully stop crawl if current running crawl sizes reach storage quota
org = await self.org_ops.get_org_by_id(crawl.oid)
org = crawl.org

if org.readOnly:
return "stopped_org_readonly"
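The operator changes above load the Organization once in sync_crawls and carry it on CrawlSpec, so quota and read-only checks no longer re-fetch it. A simplified sketch of the concurrency gate after the refactor (the real can_start_new also inspects sibling crawl jobs via data and status):

# Simplified sketch; crawl.org comes from the CrawlSpec populated in sync_crawls.
def under_concurrency_quota(crawl, running_crawls: int) -> bool:
    max_crawls = crawl.org.quotas.maxConcurrentCrawls or 0
    if not max_crawls:
        return True  # no concurrency quota configured
    return running_crawls < max_crawls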
1 change: 1 addition & 0 deletions backend/btrixcloud/operator/cronjobs.py
@@ -112,6 +112,7 @@ async def make_new_crawljob(
crawl_id,
crawlconfig,
user,
org,
manual=False,
)
print("Scheduled Crawl Created: " + crawl_id)
3 changes: 2 additions & 1 deletion backend/btrixcloud/operator/models.py
@@ -5,7 +5,7 @@
from typing import Optional, DefaultDict, Literal, Annotated, Any
from pydantic import BaseModel, Field
from kubernetes.utils import parse_quantity
from btrixcloud.models import StorageRef, TYPE_ALL_CRAWL_STATES
from btrixcloud.models import StorageRef, TYPE_ALL_CRAWL_STATES, Organization


BTRIX_API = "btrix.cloud/v1"
@@ -70,6 +70,7 @@ class CrawlSpec(BaseModel):
id: str
cid: UUID
oid: UUID
org: Organization
scale: int = 1
storage: StorageRef
started: str
4 changes: 3 additions & 1 deletion backend/btrixcloud/ops.py
@@ -89,7 +89,9 @@ def init_ops() -> Tuple[

upload_ops = UploadOps(*base_crawl_init)

page_ops = PageOps(mdb, crawl_ops, org_ops, storage_ops, background_job_ops)
page_ops = PageOps(
mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops
)

base_crawl_ops.set_page_ops(page_ops)
crawl_ops.set_page_ops(page_ops)