
Commit

Merge branch 'main' into docs-collection-sharing-edits
Shrinks99 authored Feb 12, 2025
2 parents 32e0cc6 + f7b9b73 commit 58c9ba6
Showing 59 changed files with 3,136 additions and 1,610 deletions.
42 changes: 36 additions & 6 deletions backend/btrixcloud/colls.py
@@ -3,7 +3,7 @@
"""

# pylint: disable=too-many-lines

from datetime import datetime
from collections import Counter
from uuid import UUID, uuid4
from typing import Optional, List, TYPE_CHECKING, cast, Dict, Tuple, Any, Union
@@ -20,10 +20,12 @@

from .pagination import DEFAULT_PAGE_SIZE, paginated_format
from .models import (
AnyHttpUrl,
Collection,
CollIn,
CollOut,
CollIdName,
CollectionThumbnailSource,
UpdateColl,
AddRemoveCrawlList,
BaseCrawl,
@@ -753,7 +755,7 @@ async def list_urls_in_collection(
page_size: int = DEFAULT_PAGE_SIZE,
page: int = 1,
) -> Tuple[List[PageUrlCount], int]:
"""List all URLs in collection sorted desc by snapshot count"""
"""List all URLs in collection sorted desc by snapshot count unless prefix is specified"""
# pylint: disable=duplicate-code, too-many-locals, too-many-branches, too-many-statements
# Zero-index page for query
page = page - 1
@@ -762,13 +764,15 @@
crawl_ids = await self.get_collection_crawl_ids(coll_id)

match_query: dict[str, object] = {"oid": oid, "crawl_id": {"$in": crawl_ids}}
sort_query: dict[str, int] = {"count": -1, "_id": 1}

if url_prefix:
url_prefix = urllib.parse.unquote(url_prefix)
regex_pattern = f"^{re.escape(url_prefix)}"
match_query["url"] = {"$regex": regex_pattern, "$options": "i"}
sort_query = {"_id": 1}

aggregate = [{"$match": match_query}]
aggregate: List[Dict[str, Union[int, object]]] = [{"$match": match_query}]

aggregate.extend(
[
@@ -779,7 +783,7 @@
"count": {"$sum": 1},
},
},
{"$sort": {"count": -1}},
{"$sort": sort_query},
{"$set": {"url": "$_id"}},
{
"$facet": {
@@ -843,8 +847,17 @@ async def set_home_url(

return {"updated": True}

# pylint: disable=too-many-locals
async def upload_thumbnail_stream(
self, stream, filename: str, coll_id: UUID, org: Organization, user: User
self,
stream,
filename: str,
coll_id: UUID,
org: Organization,
user: User,
source_url: Optional[AnyHttpUrl] = None,
source_ts: Optional[datetime] = None,
source_page_id: Optional[UUID] = None,
) -> Dict[str, bool]:
"""Upload file as stream to use as collection thumbnail"""
coll = await self.get_collection(coll_id)
@@ -903,6 +916,13 @@ async def stream_iter():

coll.thumbnail = thumbnail_file

if source_url and source_ts and source_page_id:
coll.thumbnailSource = CollectionThumbnailSource(
url=source_url,
urlTs=source_ts,
urlPageId=source_page_id,
)

# Update entire document to avoid bson.errors.InvalidDocument exception
await self.collections.find_one_and_update(
{"_id": coll_id, "oid": org.id},
@@ -1226,11 +1246,21 @@ async def upload_thumbnail_stream(
request: Request,
filename: str,
coll_id: UUID,
sourceUrl: Optional[AnyHttpUrl],
sourceTs: Optional[datetime],
sourcePageId: Optional[UUID],
org: Organization = Depends(org_crawl_dep),
user: User = Depends(user_dep),
):
return await colls.upload_thumbnail_stream(
request.stream(), filename, coll_id, org, user
request.stream(),
filename,
coll_id,
org,
user,
sourceUrl,
sourceTs,
sourcePageId,
)

@app.delete(
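A minimal sketch of the URL-listing change in colls.py above: when a url_prefix is supplied, the match adds a case-insensitive prefix regex and results are sorted by URL only; otherwise they stay sorted by snapshot count descending. The standalone helper below is illustrative (not part of the module), and the $group key is inferred from the $set stage shown in the diff.

import re
import urllib.parse
from typing import Dict, List, Optional
from uuid import UUID


def build_url_count_pipeline(
    oid: UUID, crawl_ids: List[str], url_prefix: Optional[str] = None
) -> List[Dict]:
    match_query: Dict = {"oid": oid, "crawl_id": {"$in": crawl_ids}}
    # default: most-snapshotted URLs first, URL as tiebreaker
    sort_query: Dict[str, int] = {"count": -1, "_id": 1}

    if url_prefix:
        url_prefix = urllib.parse.unquote(url_prefix)
        match_query["url"] = {"$regex": f"^{re.escape(url_prefix)}", "$options": "i"}
        # prefix search: plain lexicographic order by URL instead of count ordering
        sort_query = {"_id": 1}

    return [
        {"$match": match_query},
        # group by page URL (as implied by the $set stage in the diff)
        {"$group": {"_id": "$url", "count": {"$sum": 1}}},
        {"$sort": sort_query},
        {"$set": {"url": "$_id"}},
    ]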
26 changes: 19 additions & 7 deletions backend/btrixcloud/crawlconfigs.py
@@ -251,11 +251,6 @@ async def add_crawl_config(
crawlconfig.lastStartedBy = user.id
crawlconfig.lastStartedByName = user.name

# Ensure page limit is below org maxPagesPerCall if set
max_pages = org.quotas.maxPagesPerCrawl or 0
if max_pages > 0:
crawlconfig.config.limit = max_pages

# add CrawlConfig to DB here
result = await self.crawl_configs.insert_one(crawlconfig.to_dict())

@@ -286,13 +281,30 @@ async def add_crawl_config(
execMinutesQuotaReached=exec_mins_quota_reached,
)

def ensure_quota_page_limit(self, crawlconfig: CrawlConfig, org: Organization):
"""ensure page limit is set to no greater than quota page limit, if any"""
if org.quotas.maxPagesPerCrawl and org.quotas.maxPagesPerCrawl > 0:
if crawlconfig.config.limit and crawlconfig.config.limit > 0:
crawlconfig.config.limit = min(
org.quotas.maxPagesPerCrawl, crawlconfig.config.limit
)
else:
crawlconfig.config.limit = org.quotas.maxPagesPerCrawl

async def add_new_crawl(
self, crawl_id: str, crawlconfig: CrawlConfig, user: User, manual: bool
self,
crawl_id: str,
crawlconfig: CrawlConfig,
user: User,
org: Organization,
manual: bool,
) -> None:
"""increments crawl count for this config and adds new crawl"""

started = dt_now()

self.ensure_quota_page_limit(crawlconfig, org)

inc = self.inc_crawl_count(crawlconfig.id)
add = self.crawl_ops.add_new_crawl(
crawl_id, crawlconfig, user.id, started, manual
@@ -892,7 +904,7 @@ async def run_now_internal(
storage_filename=storage_filename,
profile_filename=profile_filename or "",
)
await self.add_new_crawl(crawl_id, crawlconfig, user, manual=True)
await self.add_new_crawl(crawl_id, crawlconfig, user, org, manual=True)
return crawl_id

except Exception as exc:
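The clamping rule added in ensure_quota_page_limit, shown as a standalone sketch for clarity (the real method mutates crawlconfig.config.limit in place):

def effective_page_limit(config_limit, quota_limit):
    """Page limit after applying an org maxPagesPerCrawl quota, if any."""
    if not quota_limit or quota_limit <= 0:
        return config_limit  # no quota set: keep the configured limit
    if config_limit and config_limit > 0:
        return min(config_limit, quota_limit)  # both set: take the smaller
    return quota_limit  # unlimited config: quota becomes the limit


assert effective_page_limit(500, 100) == 100
assert effective_page_limit(50, 100) == 50
assert effective_page_limit(0, 100) == 100
assert effective_page_limit(500, 0) == 500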
9 changes: 8 additions & 1 deletion backend/btrixcloud/main.py
@@ -248,7 +248,14 @@ def main() -> None:
upload_ops = init_uploads_api(*base_crawl_init)

page_ops = init_pages_api(
app, mdb, crawls, org_ops, storage_ops, background_job_ops, current_active_user
app,
mdb,
crawls,
org_ops,
storage_ops,
background_job_ops,
coll_ops,
current_active_user,
)

base_crawl_ops.set_page_ops(page_ops)
12 changes: 12 additions & 0 deletions backend/btrixcloud/models.py
@@ -1236,6 +1236,15 @@ class CollAccessType(str, Enum):
PUBLIC = "public"


# ============================================================================
class CollectionThumbnailSource(BaseModel):
"""The page source for a thumbnail"""

url: AnyHttpUrl
urlTs: datetime
urlPageId: UUID


# ============================================================================
class Collection(BaseMongoModel):
"""Org collection structure"""
@@ -1268,6 +1277,7 @@ class Collection(BaseMongoModel):
homeUrlPageId: Optional[UUID] = None

thumbnail: Optional[ImageFile] = None
thumbnailSource: Optional[CollectionThumbnailSource] = None
defaultThumbnailName: Optional[str] = None

allowPublicDownload: Optional[bool] = True
@@ -1323,6 +1333,7 @@ class CollOut(BaseMongoModel):

resources: List[CrawlFileOut] = []
thumbnail: Optional[ImageFileOut] = None
thumbnailSource: Optional[CollectionThumbnailSource] = None
defaultThumbnailName: Optional[str] = None

allowPublicDownload: bool = True
@@ -1372,6 +1383,7 @@ class UpdateColl(BaseModel):
access: Optional[CollAccessType] = None
defaultThumbnailName: Optional[str] = None
allowPublicDownload: Optional[bool] = None
thumbnailSource: Optional[CollectionThumbnailSource] = None


# ============================================================================
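For reference, a minimal sketch of constructing the new CollectionThumbnailSource model; the class mirrors the definition in the diff above, and the values are illustrative only:

from datetime import datetime, timezone
from uuid import UUID, uuid4

from pydantic import AnyHttpUrl, BaseModel


class CollectionThumbnailSource(BaseModel):
    """The page source for a thumbnail"""

    url: AnyHttpUrl
    urlTs: datetime
    urlPageId: UUID


# illustrative values, not from the codebase
source = CollectionThumbnailSource(
    url="https://example.com/some/page",
    urlTs=datetime.now(timezone.utc),
    urlPageId=uuid4(),
)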
26 changes: 10 additions & 16 deletions backend/btrixcloud/operator/crawls.py
@@ -6,6 +6,7 @@
from pprint import pprint
from typing import Optional, Any, Sequence
from datetime import datetime
from uuid import UUID

import json

@@ -29,7 +30,6 @@
CrawlFile,
CrawlCompleteIn,
StorageRef,
Organization,
)

from btrixcloud.utils import str_to_date, date_to_str, dt_now
@@ -145,11 +145,13 @@ async def sync_crawls(self, data: MCSyncData):
params["userid"] = spec.get("userid", "")

pods = data.children[POD]
org = await self.org_ops.get_org_by_id(UUID(oid))

crawl = CrawlSpec(
id=crawl_id,
cid=cid,
oid=oid,
org=org,
storage=StorageRef(spec["storageName"]),
crawler_channel=spec.get("crawlerChannel"),
proxy_id=spec.get("proxyId"),
@@ -204,8 +206,6 @@ async def sync_crawls(self, data: MCSyncData):
await self.k8s.delete_crawl_job(crawl.id)
return {"status": status.dict(exclude_none=True), "children": []}

org = None

# first, check storage quota, and fail immediately if quota reached
if status.state in (
"starting",
@@ -215,7 +215,6 @@
# only check on very first run, before any pods/pvcs created
# for now, allow if crawl has already started (pods/pvcs created)
if not pods and not data.children[PVC]:
org = await self.org_ops.get_org_by_id(crawl.oid)
if self.org_ops.storage_quota_reached(org):
await self.mark_finished(
crawl, status, "skipped_storage_quota_reached"
@@ -229,7 +228,7 @@
return self._empty_response(status)

if status.state in ("starting", "waiting_org_limit"):
if not await self.can_start_new(crawl, data, status, org):
if not await self.can_start_new(crawl, data, status):
return self._empty_response(status)

await self.set_state(
@@ -382,8 +381,9 @@ async def _load_crawl_configmap(self, crawl: CrawlSpec, children, params):

crawlconfig = await self.crawl_config_ops.get_crawl_config(crawl.cid, crawl.oid)

raw_config = crawlconfig.get_raw_config()
self.crawl_config_ops.ensure_quota_page_limit(crawlconfig, crawl.org)

raw_config = crawlconfig.get_raw_config()
raw_config["behaviors"] = self._filter_autoclick_behavior(
raw_config["behaviors"], params["crawler_image"]
)
@@ -637,14 +637,10 @@ async def can_start_new(
crawl: CrawlSpec,
data: MCSyncData,
status: CrawlStatus,
org: Optional[Organization] = None,
):
"""return true if crawl can start, otherwise set crawl to 'queued' state
until more crawls for org finish"""
if not org:
org = await self.org_ops.get_org_by_id(crawl.oid)

max_crawls = org.quotas.maxConcurrentCrawls or 0
max_crawls = crawl.org.quotas.maxConcurrentCrawls or 0
if not max_crawls:
return True

@@ -1238,15 +1234,13 @@ def get_log_line(self, message, details):
}
return json.dumps(err)

async def add_file_to_crawl(self, cc_data, crawl, redis):
async def add_file_to_crawl(self, cc_data, crawl: CrawlSpec, redis):
"""Handle finished CrawlFile to db"""

filecomplete = CrawlCompleteIn(**cc_data)

org = await self.org_ops.get_org_by_id(crawl.oid)

filename = self.storage_ops.get_org_relative_path(
org, crawl.storage, filecomplete.filename
crawl.org, crawl.storage, filecomplete.filename
)

crawl_file = CrawlFile(
@@ -1299,7 +1293,7 @@ async def is_crawl_stopping(
return "size-limit"

# gracefully stop crawl if current running crawl sizes reach storage quota
org = await self.org_ops.get_org_by_id(crawl.oid)
org = crawl.org

if org.readOnly:
return "stopped_org_readonly"
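The operator changes above load the Organization once in sync_crawls and carry it on CrawlSpec, so quota and read-only checks no longer re-fetch it. A simplified sketch of the concurrency gate after the refactor (the real can_start_new also inspects sibling crawl jobs via data and status):

# Simplified sketch; crawl.org comes from the CrawlSpec populated in sync_crawls.
def under_concurrency_quota(crawl, running_crawls: int) -> bool:
    max_crawls = crawl.org.quotas.maxConcurrentCrawls or 0
    if not max_crawls:
        return True  # no concurrency quota configured
    return running_crawls < max_crawls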
1 change: 1 addition & 0 deletions backend/btrixcloud/operator/cronjobs.py
@@ -112,6 +112,7 @@ async def make_new_crawljob(
crawl_id,
crawlconfig,
user,
org,
manual=False,
)
print("Scheduled Crawl Created: " + crawl_id)
3 changes: 2 additions & 1 deletion backend/btrixcloud/operator/models.py
@@ -5,7 +5,7 @@
from typing import Optional, DefaultDict, Literal, Annotated, Any
from pydantic import BaseModel, Field
from kubernetes.utils import parse_quantity
from btrixcloud.models import StorageRef, TYPE_ALL_CRAWL_STATES
from btrixcloud.models import StorageRef, TYPE_ALL_CRAWL_STATES, Organization


BTRIX_API = "btrix.cloud/v1"
@@ -70,6 +70,7 @@ class CrawlSpec(BaseModel):
id: str
cid: UUID
oid: UUID
org: Organization
scale: int = 1
storage: StorageRef
started: str
4 changes: 3 additions & 1 deletion backend/btrixcloud/ops.py
@@ -89,7 +89,9 @@ def init_ops() -> Tuple[

upload_ops = UploadOps(*base_crawl_init)

page_ops = PageOps(mdb, crawl_ops, org_ops, storage_ops, background_job_ops)
page_ops = PageOps(
mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops
)

base_crawl_ops.set_page_ops(page_ops)
crawl_ops.set_page_ops(page_ops)