Skip to content

Commit

Permalink
fix: drain completed page_service connections (#8632)
Browse files Browse the repository at this point in the history
We've noticed increased memory usage with the latest release. Drain the
joinset of `page_service` connection handlers to avoid leaking them
until shutdown. An alternative would be to use a TaskTracker.
TaskTracker was not discussed in original PR #8339 review, so not hot
fixing it in here either.
  • Loading branch information
koivunej authored Aug 7, 2024
1 parent 8468d51 commit 05dd1ae
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 21 deletions.
42 changes: 22 additions & 20 deletions pageserver/src/page_service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -122,16 +122,19 @@ impl Listener {
}
}
impl Connections {
pub async fn shutdown(self) {
pub(crate) async fn shutdown(self) {
let Self { cancel, mut tasks } = self;
cancel.cancel();
while let Some(res) = tasks.join_next().await {
// the logging done here mimics what was formerly done by task_mgr
match res {
Ok(Ok(())) => {}
Ok(Err(e)) => error!("error in page_service connection task: {:?}", e),
Err(e) => error!("page_service connection task panicked: {:?}", e),
}
Self::handle_connection_completion(res);
}
}

fn handle_connection_completion(res: Result<anyhow::Result<()>, tokio::task::JoinError>) {
match res {
Ok(Ok(())) => {}
Ok(Err(e)) => error!("error in page_service connection task: {:?}", e),
Err(e) => error!("page_service connection task panicked: {:?}", e),
}
}
}
Expand All @@ -155,20 +158,19 @@ pub async fn libpq_listener_main(
let connections_cancel = CancellationToken::new();
let mut connection_handler_tasks = tokio::task::JoinSet::default();

// Wait for a new connection to arrive, or for server shutdown.
while let Some(res) = tokio::select! {
biased;

_ = listener_cancel.cancelled() => {
// We were requested to shut down.
None
}
loop {
let accepted = tokio::select! {
biased;
_ = listener_cancel.cancelled() => break,
next = connection_handler_tasks.join_next(), if !connection_handler_tasks.is_empty() => {
let res = next.expect("we dont poll while empty");
Connections::handle_connection_completion(res);
continue;
}
accepted = listener.accept() => accepted,
};

res = listener.accept() => {
Some(res)
}
} {
match res {
match accepted {
Ok((socket, peer_addr)) => {
// Connection established. Spawn a new task to handle it.
debug!("accepted connection from {}", peer_addr);
Expand Down
11 changes: 10 additions & 1 deletion test_runner/regress/test_bad_connection.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,12 @@
@pytest.mark.timeout(600)
def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
env.pageserver.allowed_errors.append(".*simulated connection error.*")
env.pageserver.allowed_errors.append(".*simulated connection error.*") # this is never hit

# the real reason (Simulated Connection Error) is on the next line, and we cannot filter this out.
env.pageserver.allowed_errors.append(
".*ERROR error in page_service connection task: Postgres query error"
)

# Enable failpoint before starting everything else up so that we exercise the retry
# on fetching basebackup
Expand Down Expand Up @@ -69,3 +74,7 @@ def execute_retry_on_timeout(query):
cur.fetchall()
times_executed += 1
log.info(f"Workload executed {times_executed} times")

# do a graceful shutdown which would had caught the allowed_errors before
# https://github.com/neondatabase/neon/pull/8632
env.pageserver.stop()

1 comment on commit 05dd1ae

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

2200 tests run: 2119 passed, 0 failed, 81 skipped (full report)


Code coverage* (full report)

  • functions: 32.6% (7159 of 21946 functions)
  • lines: 50.5% (57850 of 114486 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
05dd1ae at 2024-08-07T19:18:57.168Z :recycle:

Please sign in to comment.