Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use OnceLock to store TokioRuntime #895

Merged
merged 8 commits into from
Oct 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ description = "Apache DataFusion DataFrame and SQL Query Engine"
readme = "README.md"
license = "Apache-2.0"
edition = "2021"
rust-version = "1.64"
rust-version = "1.78"
include = ["/src", "/datafusion", "/LICENSE.txt", "pyproject.toml", "Cargo.toml", "Cargo.lock"]

[features]
Expand Down
3 changes: 1 addition & 2 deletions python/datafusion/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
from .catalog import Catalog, Database, Table

# The following imports are okay to remain as opaque to the user.
from ._internal import Config, runtime
from ._internal import Config

from .record_batch import RecordBatchStream, RecordBatch

Expand Down Expand Up @@ -75,7 +75,6 @@
"literal",
"lit",
"DFSchema",
"runtime",
"Catalog",
"Database",
"Table",
Expand Down
2 changes: 1 addition & 1 deletion src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -982,7 +982,7 @@ impl PySessionContext {
) -> PyResult<PyRecordBatchStream> {
let ctx: TaskContext = TaskContext::from(&self.ctx.state());
// create a Tokio runtime to run the async code
let rt = &get_tokio_runtime(py).0;
let rt = &get_tokio_runtime().0;
let plan = plan.plan.clone();
let fut: JoinHandle<datafusion::common::Result<SendableRecordBatchStream>> =
rt.spawn(async move { plan.execute(part, Arc::new(ctx)) });
Expand Down
4 changes: 2 additions & 2 deletions src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -543,7 +543,7 @@ impl PyDataFrame {

fn execute_stream(&self, py: Python) -> PyResult<PyRecordBatchStream> {
// create a Tokio runtime to run the async code
let rt = &get_tokio_runtime(py).0;
let rt = &get_tokio_runtime().0;
let df = self.df.as_ref().clone();
let fut: JoinHandle<datafusion::common::Result<SendableRecordBatchStream>> =
rt.spawn(async move { df.execute_stream().await });
Expand All @@ -553,7 +553,7 @@ impl PyDataFrame {

fn execute_stream_partitioned(&self, py: Python) -> PyResult<Vec<PyRecordBatchStream>> {
// create a Tokio runtime to run the async code
let rt = &get_tokio_runtime(py).0;
let rt = &get_tokio_runtime().0;
let df = self.df.as_ref().clone();
let fut: JoinHandle<datafusion::common::Result<Vec<SendableRecordBatchStream>>> =
rt.spawn(async move { df.execute_stream_partitioned().await });
Expand Down
6 changes: 0 additions & 6 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,6 @@ pub mod utils;
static GLOBAL: MiMalloc = MiMalloc;

// Used to define Tokio Runtime as a Python module attribute
#[pyclass]
pub(crate) struct TokioRuntime(tokio::runtime::Runtime);

/// Low-level DataFusion internal package.
Expand All @@ -75,11 +74,6 @@ pub(crate) struct TokioRuntime(tokio::runtime::Runtime);
/// datafusion directory.
#[pymodule]
fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> {
// Register the Tokio Runtime as a module attribute so we can reuse it
m.add(
"runtime",
TokioRuntime(tokio::runtime::Runtime::new().unwrap()),
)?;
// Register the python classes
m.add_class::<catalog::PyCatalog>()?;
m.add_class::<catalog::PyDatabase>()?;
Expand Down
23 changes: 11 additions & 12 deletions src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,20 +20,19 @@ use crate::TokioRuntime;
use datafusion::logical_expr::Volatility;
use pyo3::prelude::*;
use std::future::Future;
use std::sync::OnceLock;
use tokio::runtime::Runtime;

/// Utility to get the Tokio Runtime from Python
pub(crate) fn get_tokio_runtime(py: Python) -> PyRef<TokioRuntime> {
let datafusion = py.import_bound("datafusion._internal").unwrap();
let tmp = datafusion.getattr("runtime").unwrap();
match tmp.extract::<PyRef<TokioRuntime>>() {
Ok(runtime) => runtime,
Err(_e) => {
let rt = TokioRuntime(tokio::runtime::Runtime::new().unwrap());
let obj: Bound<'_, TokioRuntime> = Py::new(py, rt).unwrap().into_bound(py);
obj.extract().unwrap()
}
}
#[inline]
pub(crate) fn get_tokio_runtime() -> &'static TokioRuntime {
// NOTE: Other pyo3 python libraries have had issues with using tokio
// behind a forking app-server like `gunicorn`
// If we run into that problem, in the future we can look to `delta-rs`
// which adds a check that disallows calls from a forked process
// https://github.com/delta-io/delta-rs/blob/87010461cfe01563d91a4b9cd6fa468e2ad5f283/python/src/utils.rs#L10-L31
static RUNTIME: OnceLock<TokioRuntime> = OnceLock::new();
Copy link

@austin362667 austin362667 Oct 7, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I'm understanding this correctly (please correct me if I'm not making sense):

So now it uses OnceLock<Runtime> rather than OnceLock<Arc<Runtime>>, to get rid of the Python heap entirely. Does that mean it will no longer be thread-safe?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I recommend looking at the PR diff to see the real change.

There is an intermediate commit that went from OnceLock<Arc<Runtime>> to OnceLock<Runtime>, but that's because the Arc was superfluous.

OnceLock is already Send + Sync and explicitly thread-safe.

So, if you discover that it is not actually thread-safe, please share!

RUNTIME.get_or_init(|| TokioRuntime(tokio::runtime::Runtime::new().unwrap()))
}

/// Utility to collect rust futures with GIL released
Expand All @@ -42,7 +41,7 @@ where
F: Future + Send,
F::Output: Send,
{
let runtime: &Runtime = &get_tokio_runtime(py).0;
let runtime: &Runtime = &get_tokio_runtime().0;
py.allow_threads(|| runtime.block_on(f))
}

Expand Down
Loading