Additional operator edge case fixes #2007

Merged: 4 commits, Aug 9, 2024
74 changes: 36 additions & 38 deletions backend/btrixcloud/operator/crawls.py
@@ -305,19 +305,21 @@ async def sync_crawls(self, data: MCSyncData):
             "resyncAfterSeconds": status.resync_after,
         }
 
-    def _load_redis(self, params, status, children):
+    def _load_redis(self, params, status: CrawlStatus, children):
         name = f"redis-{params['id']}"
         has_pod = name in children[POD]
 
         pod_info = status.podStatus[name]
         params["name"] = name
         params["cpu"] = pod_info.newCpu or params.get("redis_cpu")
         params["memory"] = pod_info.newMemory or params.get("redis_memory")
-        restart = pod_info.should_restart_pod() and has_pod
-        if restart:
-            print(f"Restart {name}")
+        restart_reason = None
+        if has_pod:
+            restart_reason = pod_info.should_restart_pod()
+        if restart_reason:
+            print(f"Restarting {name}, reason: {restart_reason}")
 
-        params["init_redis"] = status.initRedis and not restart
+        params["init_redis"] = status.initRedis and not restart_reason
 
         return self.load_from_yaml("redis.yaml", params)
 
@@ -362,7 +364,7 @@ async def _load_qa_configmap(self, params, children):
         params["qa_source_replay_json"] = crawl_replay.json(include={"resources"})
         return self.load_from_yaml("qa_configmap.yaml", params)
 
-    def _load_crawler(self, params, i, status, children):
+    def _load_crawler(self, params, i, status: CrawlStatus, children):
         name = f"crawl-{params['id']}-{i}"
         has_pod = name in children[POD]
 
@@ -387,11 +389,12 @@ def _load_crawler(self, params, i, status, children):
         else:
             params["memory_limit"] = self.k8s.max_crawler_memory_size
         params["workers"] = params.get(worker_field) or 1
-        params["do_restart"] = (
-            pod_info.should_restart_pod() or params.get("force_restart")
-        ) and has_pod
-        if params.get("do_restart"):
-            print(f"Restart {name}")
+        params["do_restart"] = False
+        if has_pod:
+            restart_reason = pod_info.should_restart_pod(params.get("force_restart"))
+            if restart_reason:
+                print(f"Restarting {name}, reason: {restart_reason}")
+                params["do_restart"] = True
 
         return self.load_from_yaml("crawler.yaml", params)

@@ -523,7 +526,7 @@ async def set_state(
                 finished=finished,
                 stats=stats,
             )
-            if res:
+            if res and status.state != state:
                 print(f"Setting state: {status.state} -> {state}, {crawl.id}")
                 status.state = state
                 return True
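
The added status.state != state check means a successful but redundant update no longer logs a transition or re-assigns the state. A standalone sketch of the guard, with illustrative names rather than the actual method:

    def maybe_apply_transition(current: str, new: str, updated: bool) -> str:
        # Log and apply only when the backing update succeeded AND the state
        # actually changes, so repeated calls with the same state stay quiet.
        if updated and current != new:
            print(f"Setting state: {current} -> {new}")
            return new
        return current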
@@ -804,14 +807,6 @@ async def sync_crawl_state(
                     status.resync_after = self.fast_retry_secs
                 return status
 
-            # ensure running state is set
-            await self.set_state(
-                "running",
-                status,
-                crawl,
-                allowed_from=["starting", "waiting_capacity"],
-            )
-
             # update lastActiveTime if crawler is running
             if crawler_running:
                 status.lastActiveTime = to_k8s_date(dt_now())
@@ -874,25 +869,32 @@ def sync_pod_status(
         try:
             for name, pod in pods.items():
                 running = False
+                evicted = False
 
                 pstatus = pod["status"]
                 phase = pstatus["phase"]
                 role = pod["metadata"]["labels"]["role"]
 
                 if phase in ("Running", "Succeeded"):
                     running = True
+                elif phase == "Failed" and pstatus.get("reason") == "Evicted":
+                    evicted = True
+
+                status.podStatus[name].evicted = evicted
 
                 if "containerStatuses" in pstatus:
                     cstatus = pstatus["containerStatuses"][0]
 
-                    # consider 'ContainerCreating' as running
-                    waiting = cstatus["state"].get("waiting")
-                    if (
-                        phase == "Pending"
-                        and waiting
-                        and waiting.get("reason") == "ContainerCreating"
-                    ):
-                        running = True
+                    # don't consider 'ContainerCreating' as running for now
+                    # may be stuck in this state for other reasons
+                    #
+                    # waiting = cstatus["state"].get("waiting")
+                    # if (
+                    #     phase == "Pending"
+                    #     and waiting
+                    #     and waiting.get("reason") == "ContainerCreating"
+                    # ):
+                    #     running = True
 
                     self.handle_terminated_pod(
                         name, role, status, cstatus["state"].get("terminated")
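
An evicted pod reports phase "Failed" with reason "Evicted" in its Kubernetes status; a small sketch of the detection above, using a made-up pod dict for illustration:

    pod = {
        "metadata": {"labels": {"role": "crawler"}},
        "status": {"phase": "Failed", "reason": "Evicted"},
    }
    pstatus = pod["status"]
    evicted = pstatus["phase"] == "Failed" and pstatus.get("reason") == "Evicted"
    print(evicted)  # True
    # the flag is stored on status.podStatus[name].evicted and later becomes
    # the "evicted" restart reason returned by should_restart_pod()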
@@ -1388,24 +1390,20 @@ async def update_crawl_state(
             else:
                 await self.fail_crawl(crawl, status, pods, stats)
 
-        # check for other statuses
+        # check for other statuses, default to "running"
         else:
-            new_status: Optional[TYPE_RUNNING_STATES] = None
-            if status_count.get("running"):
-                if status.state in ("generate-wacz", "uploading-wacz", "pending-wacz"):
-                    new_status = "running"
+            new_status: TYPE_RUNNING_STATES = "running"
 
-            elif status_count.get("generate-wacz"):
+            if status_count.get("generate-wacz"):
                 new_status = "generate-wacz"
             elif status_count.get("uploading-wacz"):
                 new_status = "uploading-wacz"
             elif status_count.get("pending-wait"):
                 new_status = "pending-wait"
 
-            if new_status:
-                await self.set_state(
-                    new_status, status, crawl, allowed_from=RUNNING_STATES
-                )
+            await self.set_state(
+                new_status, status, crawl, allowed_from=RUNNING_AND_WAITING_STATES
+            )
 
         return status
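
With "running" as the default, set_state() is now called on every reconcile pass; this stays quiet thanks to the guard added earlier, which skips logging and assignment when the state is unchanged. A rough sketch of the selection with made-up redis status counts:

    status_count = {"running": 2, "generate-wacz": 1}  # illustrative values

    new_status = "running"  # default when no more specific phase is reported
    if status_count.get("generate-wacz"):
        new_status = "generate-wacz"
    elif status_count.get("uploading-wacz"):
        new_status = "uploading-wacz"
    elif status_count.get("pending-wait"):
        new_status = "pending-wait"

    print(new_status)  # "generate-wacz": one pod generating a WACZ outranks the others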

16 changes: 12 additions & 4 deletions backend/btrixcloud/operator/models.py
@@ -134,6 +134,8 @@ class PodInfo(BaseModel):
     newMemory: Optional[int] = None
     signalAtMem: Optional[int] = None
 
+    evicted: Optional[bool] = False
+
     def dict(self, *a, **kw):
         res = super().dict(*a, **kw)
         percent = {
@@ -168,15 +170,21 @@ def get_percent_storage(self) -> float:
             else 0
         )
 
-    def should_restart_pod(self):
+    def should_restart_pod(self, forced: bool = False) -> Optional[str]:
         """return true if pod should be restarted"""
         if self.newMemory and self.newMemory != self.allocated.memory:
-            return True
+            return "newMemory"
 
         if self.newCpu and self.newCpu != self.allocated.cpu:
-            return True
+            return "newCpu"
 
+        if self.evicted:
+            return "evicted"
+
+        if forced:
+            return "forced"
+
-        return False
+        return None
 
 
 # ============================================================================
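
Callers now get back a reason string (or None) rather than a bool, and the checks run in a fixed order: resource changes first, then eviction, then the forced flag, so a forced restart is reported under a more specific reason when one applies. A standalone sketch mirroring that order (not the actual class method; parameter names are illustrative):

    from typing import Optional

    def restart_reason(new_memory, allocated_memory, new_cpu, allocated_cpu,
                       evicted: bool, forced: bool = False) -> Optional[str]:
        # Mirrors the decision order of PodInfo.should_restart_pod()
        if new_memory and new_memory != allocated_memory:
            return "newMemory"
        if new_cpu and new_cpu != allocated_cpu:
            return "newCpu"
        if evicted:
            return "evicted"
        if forced:
            return "forced"
        return None

    print(restart_reason(None, None, None, None, evicted=True, forced=True))  # "evicted"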