diff --git a/api/openapi/nydus-api-v2.yaml b/api/openapi/nydus-api-v2.yaml new file mode 100644 index 00000000000..01a91d92837 --- /dev/null +++ b/api/openapi/nydus-api-v2.yaml @@ -0,0 +1,145 @@ +openapi: "3.0.2" +info: + title: Nydus Service and Management APIs, version 2. + description: + This is the second version of RESTful Nydus service and management APIs to manage the global daemon and + individual services. + license: + name: Apache 2.0 + url: http://www.apache.org/licenses/LICENSE-2.0.html + version: "0.1" +servers: + - url: https://localhost/v2 +paths: + /daemon: + summary: Returns general information about the nydus daemon + get: + operationId: describeDaemon + responses: + "200": + description: Daemon information + content: + application/json: + schema: + $ref: "#/components/schemas/DaemonInfo" + "500": + description: Internal Server Error + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorMsg" + put: + operationId: configureDaemon + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/DaemonConf" + responses: + "204": + description: "Successfully configure the daemon!" + "500": + description: "Can't configure the daemon!" + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorMsg" + /blob_objects: + summary: Manage cached blob objects + #################################################################### + get: + operationId: getBlobObject + responses: + "200": + description: Blob objects + content: + application/json: + schema: + $ref: "#/components/schemas/BlobObjectList" + "404": + description: "Blob object not found" + "500": + description: "Internal Server Error" + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorMsg" + put: + operationId: createBlobObject + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/BlobObjectConf" + responses: + "204": + description: "Successfully created the blob object!" 
+ "500": + description: "Can't create the blob object!" + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorMsg" + delete: + operationId: deleteBlobObject + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/BlobObjectParam" + responses: + "204": + description: "Successfully deleted the blob object!" + "500": + description: "Can't delete the blob object!" + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorMsg" +################################################################ +components: + schemas: + DaemonInfo: + type: object + properties: + version: + type: object + properties: + package_ver: + type: string + git_commit: + type: string + build_time: + type: string + profile: + type: string + rustc: + type: string + id: + type: string + supervisor: + type: string + state: + type: string + enum: + - INIT + - RUNNING + - UPGRADING + - INTERRUPTED + - STOPPED + - UNKNOWN + DaemonConf: + type: object + properties: + log_level: + type: string + enum: [trace, debug, info, warn, error] + ErrorMsg: + type: object + properties: + code: + description: Nydus defined error code indicating certain error type + type: string + message: + description: Details about the error + type: string diff --git a/api/src/http.rs b/api/src/http.rs index da9379de834..277e1d9ffd1 100644 --- a/api/src/http.rs +++ b/api/src/http.rs @@ -1,3 +1,4 @@ +// Copyright 2022 Alibaba Cloud. All rights reserved. // Copyright 2020 Ant Group. All rights reserved. 
// Copyright © 2019 Intel Corporation // @@ -18,7 +19,7 @@ use mio::unix::SourceFd; use mio::{Events, Interest, Poll, Token, Waker}; use nydus_utils::metrics::IoStatsError; use serde::Deserialize; -use serde_json::Error as SerdeError; +use serde_json::{Error as SerdeError, Value}; use url::Url; use crate::http_endpoint_v1::{ @@ -26,6 +27,7 @@ use crate::http_endpoint_v1::{ MetricsBlobcacheHandler, MetricsFilesHandler, MetricsHandler, MetricsInflightHandler, MetricsPatternHandler, MountHandler, SendFuseFdHandler, TakeoverHandler, HTTP_ROOT_V1, }; +use crate::http_endpoint_v2::{BlobObjectListHandlerV2, HTTP_ROOT_V2}; const EXIT_TOKEN: Token = Token(usize::MAX); const REQUEST_TOKEN: Token = Token(1); @@ -50,6 +52,65 @@ pub struct DaemonConf { pub log_level: String, } +/// Configuration information for a cached blob, corresponding to `storage::FactoryConfig`. +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BlobCacheEntryConfig { + /// Identifier for the blob cache configuration: corresponding to `FactoryConfig::id`. + #[serde(default)] + pub id: String, + /// Type of storage backend, corresponding to `FactoryConfig::BackendConfig::backend_type`. + pub backend_type: String, + /// Configuration for storage backend, corresponding to `FactoryConfig::BackendConfig::backend_config`. + /// One of `LocalFsConfig`, `CommonConfig`. + pub backend_config: Value, + /// Type of blob cache, corresponding to `FactoryConfig::CacheConfig::cache_type`. + #[serde(default)] + pub cache_type: String, + /// Configuration for blob cache, corresponding to `FactoryConfig::CacheConfig::cache_config`. + /// One of `FileCacheConfig`, `FsCacheConfig`, or empty. + #[serde(default)] + pub cache_config: Value, + /// Optional file path for metadata blobs. + #[serde(default)] + pub metadata_path: Option, +} + +/// Blob cache object type for nydus/rafs bootstrap blob. 
+pub const BLOB_CACHE_TYPE_BOOTSTRAP: &str = "bootstrap"; +/// Blob cache object type for nydus/rafs data blob. +pub const BLOB_CACHE_TYPE_DATA_BLOB: &str = "datablob"; + +/// Configuration information for a cached blob. +#[derive(Debug, Deserialize, Serialize)] +pub struct BlobCacheEntry { + /// Type of blob object, bootstrap or data blob. + #[serde(rename = "type")] + pub blob_type: String, + /// Blob id. + #[serde(rename = "id")] + pub blob_id: String, + /// Configuration information to generate blob cache object. + #[serde(rename = "config")] + pub blob_config: BlobCacheEntryConfig, + /// Domain id for the blob, which is used to group cached blobs into management domains. + #[serde(default)] + pub domain_id: String, + #[serde(default)] + pub fs_prefetch: Value, +} + +/// Configuration information for a list of cached blob objects. +#[derive(Debug, Default, Deserialize, Serialize)] +pub struct BlobCacheList { + /// List of blob configuration information. + pub blobs: Vec, +} + +#[derive(Clone, Deserialize, Debug)] +pub struct BlobObjectParam { + pub domain_id: String, +} + #[derive(Debug)] pub enum ApiRequest { // Common requests @@ -57,36 +118,39 @@ pub enum ApiRequest { // Nydus API v1 requests DaemonInfo, - Events, - Mount(String, ApiMountCmd), - Remount(String, ApiMountCmd), - Umount(String), ExportGlobalMetrics(Option), - ExportFilesMetrics(Option, bool), ExportAccessPatterns(Option), ExportBackendMetrics(Option), ExportBlobcacheMetrics(Option), - ExportInflightMetrics, + ExportFilesMetrics(Option, bool), + Exit, + Takeover, + + // Filesystem Related + Mount(String, ApiMountCmd), + Remount(String, ApiMountCmd), + Umount(String), ExportFsBackendInfo(String), + ExportInflightMetrics, SendFuseFd, - Takeover, - Exit, + + // Nydus API v2 + DaemonInfoV2, + CreateBlobObject(BlobCacheEntry), + GetBlobObject(BlobObjectParam), + DeleteBlobObject(BlobObjectParam), + ListBlobObject, } #[derive(Debug)] pub enum DaemonErrorKind { NotReady, - UpgradeManager, - 
Unsupported, - Connect(io::Error), - SendFd, - RecvFd, - Disconnect(io::Error), - Channel, + Other(String), Serde(SerdeError), UnexpectedEvent(String), - Other(String), + UpgradeManager, + Unsupported, } #[derive(Debug)] @@ -99,6 +163,7 @@ pub enum MetricsErrorKind { /// Errors generated by/related to the API service, sent back through `ApiResponse`. #[derive(Debug)] +#[allow(clippy::large_enum_variant)] pub enum ApiError { /// Daemon internal error DaemonAbnormal(DaemonErrorKind), @@ -123,20 +188,31 @@ pub type ApiResult = std::result::Result; #[derive(Serialize)] pub enum ApiResponsePayload { - /// No data is sent on the channel. - Empty, /// Daemon version, configuration and status information in json. DaemonInfo(String), + /// No data is sent on the channel. + Empty, + + /// Filesystem backend metrics. + BackendMetrics(String), + /// Blobcache metrics. + BlobcacheMetrics(String), + /// Global events. Events(String), - FsBackendInfo(String), - /// Nydus filesystem global metrics + /// Filesystem global metrics. FsGlobalMetrics(String), - /// Nydus filesystem per-file metrics + /// Filesystem per-file metrics. FsFilesMetrics(String), + /// Filesystem access pattern trace log. FsFilesPatterns(String), - BackendMetrics(String), - BlobcacheMetrics(String), + + // Filesystem Backend Information + FsBackendInfo(String), + // Filesystem Inflight Requests InflightMetrics(String), + + /// List of blob objects, v2 + BlobObjectList(String), } /// HTTP error messages sent back to the clients. @@ -145,31 +221,54 @@ pub enum ApiResponsePayload { /// So unfortunately it implicitly becomes parts of the API, please keep it stable. #[derive(Debug)] pub enum HttpError { - /// No handler registered for HTTP request URI - NoRoute, + // Daemon common related errors /// Invalid HTTP request BadRequest, - /// Query parameter is missed from the HTTP request. 
- QueryString(String), - /// Failed to parse HTTP request message body - ParseBody(SerdeError), + /// Failed to configure the daemon. + Configure(ApiError), /// Failed to query information about daemon. DaemonInfo(ApiError), + /// No handler registered for HTTP request URI + NoRoute, + /// Failed to parse HTTP request message body + ParseBody(SerdeError), + /// Query parameter is missed from the HTTP request. + QueryString(String), + + // Metrics related errors + /// Failed to query global events. Events(ApiError), - /// Could not mount resource - Mount(ApiError), + /// Failed to get backend metrics. + BackendMetrics(ApiError), + /// Failed to get blobcache metrics. + BlobcacheMetrics(ApiError), + /// Failed to get global metrics. GlobalMetrics(ApiError), + /// Failed to get filesystem per-file metrics. FsFilesMetrics(ApiError), + /// Failed to get filesystem file access trace. Pattern(ApiError), - Configure(ApiError), - Upgrade(ApiError), - BlobcacheMetrics(ApiError), - BackendMetrics(ApiError), + + // Filesystem related errors (v1) + /// Failed to get filesystem backend information FsBackendInfo(ApiError), + /// Failed to get information about inflight request InflightMetrics(ApiError), + /// Failed to mount filesystem. + Mount(ApiError), + /// Failed to remount filesystem. + Upgrade(ApiError), + + // Blob cache management related errors (v2) + /// Failed to create blob object + CreateBlobObject(ApiError), + /// Failed to delete blob object + DeleteBlobObject(ApiError), + /// Failed to list existing blob objects + GetBlobObjects(ApiError), } -/// This is the response sent by the API server through the mpsc channel. +// This is the response sent by the API server through the mpsc channel. pub type ApiResponse = std::result::Result; pub type HttpResult = std::result::Result; @@ -277,6 +376,12 @@ macro_rules! endpoint_v1 { }; } +macro_rules! endpoint_v2 { + ($path:expr) => { + format!("{}{}", HTTP_ROOT_V2, $path) + }; +} + lazy_static! 
{ /// HTTP_ROUTES contain all the nydusd HTTP routes. pub static ref HTTP_ROUTES: HttpRoutes = { @@ -284,20 +389,33 @@ lazy_static! { routes: HashMap::new(), }; - // Nydus API, v1 + // Global r.routes.insert(endpoint_v1!("/daemon"), Box::new(InfoHandler{})); - r.routes.insert(endpoint_v1!("/daemon/events"), Box::new(EventsHandler{})); + r.routes.insert(endpoint_v2!("/daemon"), Box::new(InfoHandler{})); r.routes.insert(endpoint_v1!("/daemon/backend"), Box::new(FsBackendInfo{})); + r.routes.insert(endpoint_v2!("/daemon/backend"), Box::new(FsBackendInfo{})); + r.routes.insert(endpoint_v1!("/daemon/events"), Box::new(EventsHandler{})); + r.routes.insert(endpoint_v2!("/daemon/events"), Box::new(EventsHandler{})); r.routes.insert(endpoint_v1!("/daemon/exit"), Box::new(ExitHandler{})); - r.routes.insert(endpoint_v1!("/daemon/fuse/sendfd"), Box::new(SendFuseFdHandler{})); - r.routes.insert(endpoint_v1!("/daemon/fuse/takeover"), Box::new(TakeoverHandler{})); - r.routes.insert(endpoint_v1!("/mount"), Box::new(MountHandler{})); - r.routes.insert(endpoint_v1!("/metrics"), Box::new(MetricsHandler{})); - r.routes.insert(endpoint_v1!("/metrics/files"), Box::new(MetricsFilesHandler{})); - r.routes.insert(endpoint_v1!("/metrics/pattern"), Box::new(MetricsPatternHandler{})); + r.routes.insert(endpoint_v2!("/daemon/exit"), Box::new(ExitHandler{})); r.routes.insert(endpoint_v1!("/metrics/backend"), Box::new(MetricsBackendHandler{})); + r.routes.insert(endpoint_v2!("/metrics/backend"), Box::new(MetricsBackendHandler{})); r.routes.insert(endpoint_v1!("/metrics/blobcache"), Box::new(MetricsBlobcacheHandler{})); + r.routes.insert(endpoint_v2!("/metrics/blobcache"), Box::new(MetricsBlobcacheHandler{})); + r.routes.insert(endpoint_v1!("/metrics/files"), Box::new(MetricsFilesHandler{})); + r.routes.insert(endpoint_v2!("/metrics/files"), Box::new(MetricsFilesHandler{})); + r.routes.insert(endpoint_v1!("/metrics/pattern"), Box::new(MetricsPatternHandler{})); + 
r.routes.insert(endpoint_v2!("/metrics/pattern"), Box::new(MetricsPatternHandler{})); + + // Nydus API, v1 + r.routes.insert(endpoint_v1!("/mount"), Box::new(MountHandler{})); + r.routes.insert(endpoint_v1!("/metrics"), Box::new(MetricsHandler{})); r.routes.insert(endpoint_v1!("/metrics/inflight"), Box::new(MetricsInflightHandler{})); + r.routes.insert(endpoint_v1!("/daemon/fuse/sendfd"), Box::new(SendFuseFdHandler{})); + r.routes.insert(endpoint_v1!("/daemon/fuse/takeover"), Box::new(TakeoverHandler{})); + + // Nydus API, v2 + r.routes.insert(endpoint_v2!("/blobs"), Box::new(BlobObjectListHandlerV2{})); r }; @@ -382,12 +500,7 @@ fn handle_http_request( /// Start a HTTP server parsing http requests and send to nydus API server a concrete /// request to operate nydus or fetch working status. -/// The HTTP server sends request by `to_api` channel and wait for response from `from_api` channel -/// `api_notifier` is used to notify an execution context to fetch above request and handle it. -/// We can't forward signal to native rust thread, so we rely on `exit_evtfd` to notify -/// the server to exit. Therefore, it adds the unix domain socket fd receiving http request -/// to a global epoll_fd associated with a event_fd which will be used later to notify -/// the server thread to exit. +/// The HTTP server sends request by `to_api` channel and wait for response from `from_api` channel. 
pub fn start_http_thread( path: &str, api_notifier: Option>, @@ -508,6 +621,11 @@ mod tests { assert!(HTTP_ROUTES.routes.get("/api/v1/metrics/inflight").is_some()); } + #[test] + fn test_http_api_routes_v2() { + assert!(HTTP_ROUTES.routes.get("/api/v2/daemon").is_some()); + } + #[test] fn test_kick_api_server() { let (to_api, from_route) = channel(); diff --git a/api/src/http_endpoint_v1.rs b/api/src/http_endpoint_v1.rs index e8ce0d3706f..47e99c8e02c 100644 --- a/api/src/http_endpoint_v1.rs +++ b/api/src/http_endpoint_v1.rs @@ -17,6 +17,8 @@ use crate::http::{ pub const HTTP_ROOT_V1: &str = "/api/v1"; +// Convert an ApiResponse to a HTTP response. +// // API server has successfully processed the request, but can't fulfill that. Therefore, // a `error_response` is generated whose status code is 4XX or 5XX. With error response, // it still returns Ok(error_response) to http request handling framework, which means @@ -26,16 +28,19 @@ fn convert_to_response HttpError>(api_resp: ApiResponse, Ok(r) => { use ApiResponsePayload::*; match r { + // Daemon Common Empty => success_response(None), DaemonInfo(d) => success_response(Some(d)), Events(d) => success_response(Some(d)), - FsFilesMetrics(d) => success_response(Some(d)), - FsGlobalMetrics(d) => success_response(Some(d)), - FsFilesPatterns(d) => success_response(Some(d)), BackendMetrics(d) => success_response(Some(d)), BlobcacheMetrics(d) => success_response(Some(d)), + FsFilesMetrics(d) => success_response(Some(d)), + FsFilesPatterns(d) => success_response(Some(d)), + FsGlobalMetrics(d) => success_response(Some(d)), + // Filesystem Specific FsBackendInfo(d) => success_response(Some(d)), InflightMetrics(d) => success_response(Some(d)), + _ => panic!("Unexpected response message from API service"), } } Err(e) => { @@ -84,30 +89,17 @@ impl EndpointHandler for EventsHandler { } } -pub struct MountHandler {} -impl EndpointHandler for MountHandler { +pub struct ExitHandler {} +impl EndpointHandler for ExitHandler { fn 
handle_request( &self, req: &Request, kicker: &dyn Fn(ApiRequest) -> ApiResponse, ) -> HttpResult { - let mountpoint = extract_query_part(req, "mountpoint").ok_or_else(|| { - HttpError::QueryString("'mountpoint' should be specified in query string".to_string()) - })?; match (req.method(), req.body.as_ref()) { - (Method::Post, Some(body)) => { - let cmd = parse_body(body)?; - let r = kicker(ApiRequest::Mount(mountpoint, cmd)); - Ok(convert_to_response(r, HttpError::Mount)) - } - (Method::Put, Some(body)) => { - let cmd = parse_body(body)?; - let r = kicker(ApiRequest::Remount(mountpoint, cmd)); - Ok(convert_to_response(r, HttpError::Mount)) - } - (Method::Delete, None) => { - let r = kicker(ApiRequest::Umount(mountpoint)); - Ok(convert_to_response(r, HttpError::Mount)) + (Method::Put, None) => { + let r = kicker(ApiRequest::Exit); + Ok(convert_to_response(r, HttpError::Upgrade)) } _ => Err(HttpError::BadRequest), } @@ -132,8 +124,8 @@ impl EndpointHandler for MetricsHandler { } } -pub struct MetricsFilesHandler {} -impl EndpointHandler for MetricsFilesHandler { +pub struct MetricsBackendHandler {} +impl EndpointHandler for MetricsBackendHandler { fn handle_request( &self, req: &Request, @@ -142,18 +134,16 @@ impl EndpointHandler for MetricsFilesHandler { match (req.method(), req.body.as_ref()) { (Method::Get, None) => { let id = extract_query_part(req, "id"); - let latest_read_files = extract_query_part(req, "latest") - .map_or(false, |b| b.parse::().unwrap_or(false)); - let r = kicker(ApiRequest::ExportFilesMetrics(id, latest_read_files)); - Ok(convert_to_response(r, HttpError::FsFilesMetrics)) + let r = kicker(ApiRequest::ExportBackendMetrics(id)); + Ok(convert_to_response(r, HttpError::BackendMetrics)) } _ => Err(HttpError::BadRequest), } } } -pub struct MetricsPatternHandler {} -impl EndpointHandler for MetricsPatternHandler { +pub struct MetricsBlobcacheHandler {} +impl EndpointHandler for MetricsBlobcacheHandler { fn handle_request( &self, req: &Request, @@ 
-162,16 +152,16 @@ impl EndpointHandler for MetricsPatternHandler { match (req.method(), req.body.as_ref()) { (Method::Get, None) => { let id = extract_query_part(req, "id"); - let r = kicker(ApiRequest::ExportAccessPatterns(id)); - Ok(convert_to_response(r, HttpError::Pattern)) + let r = kicker(ApiRequest::ExportBlobcacheMetrics(id)); + Ok(convert_to_response(r, HttpError::BlobcacheMetrics)) } _ => Err(HttpError::BadRequest), } } } -pub struct MetricsBackendHandler {} -impl EndpointHandler for MetricsBackendHandler { +pub struct MetricsFilesHandler {} +impl EndpointHandler for MetricsFilesHandler { fn handle_request( &self, req: &Request, @@ -180,16 +170,18 @@ impl EndpointHandler for MetricsBackendHandler { match (req.method(), req.body.as_ref()) { (Method::Get, None) => { let id = extract_query_part(req, "id"); - let r = kicker(ApiRequest::ExportBackendMetrics(id)); - Ok(convert_to_response(r, HttpError::BackendMetrics)) + let latest_read_files = extract_query_part(req, "latest") + .map_or(false, |b| b.parse::().unwrap_or(false)); + let r = kicker(ApiRequest::ExportFilesMetrics(id, latest_read_files)); + Ok(convert_to_response(r, HttpError::FsFilesMetrics)) } _ => Err(HttpError::BadRequest), } } } -pub struct MetricsBlobcacheHandler {} -impl EndpointHandler for MetricsBlobcacheHandler { +pub struct MetricsPatternHandler {} +impl EndpointHandler for MetricsPatternHandler { fn handle_request( &self, req: &Request, @@ -198,33 +190,16 @@ impl EndpointHandler for MetricsBlobcacheHandler { match (req.method(), req.body.as_ref()) { (Method::Get, None) => { let id = extract_query_part(req, "id"); - let r = kicker(ApiRequest::ExportBlobcacheMetrics(id)); - Ok(convert_to_response(r, HttpError::BlobcacheMetrics)) - } - _ => Err(HttpError::BadRequest), - } - } -} - -pub struct MetricsInflightHandler {} -impl EndpointHandler for MetricsInflightHandler { - fn handle_request( - &self, - req: &Request, - kicker: &dyn Fn(ApiRequest) -> ApiResponse, - ) -> HttpResult { - match 
(req.method(), req.body.as_ref()) { - (Method::Get, None) => { - let r = kicker(ApiRequest::ExportInflightMetrics); - Ok(convert_to_response(r, HttpError::InflightMetrics)) + let r = kicker(ApiRequest::ExportAccessPatterns(id)); + Ok(convert_to_response(r, HttpError::Pattern)) } _ => Err(HttpError::BadRequest), } } } -pub struct SendFuseFdHandler {} -impl EndpointHandler for SendFuseFdHandler { +pub struct TakeoverHandler {} +impl EndpointHandler for TakeoverHandler { fn handle_request( &self, req: &Request, @@ -232,7 +207,7 @@ impl EndpointHandler for SendFuseFdHandler { ) -> HttpResult { match (req.method(), req.body.as_ref()) { (Method::Put, None) => { - let r = kicker(ApiRequest::SendFuseFd); + let r = kicker(ApiRequest::Takeover); Ok(convert_to_response(r, HttpError::Upgrade)) } _ => Err(HttpError::BadRequest), @@ -240,34 +215,47 @@ impl EndpointHandler for SendFuseFdHandler { } } -pub struct TakeoverHandler {} -impl EndpointHandler for TakeoverHandler { +pub struct MountHandler {} +impl EndpointHandler for MountHandler { fn handle_request( &self, req: &Request, kicker: &dyn Fn(ApiRequest) -> ApiResponse, ) -> HttpResult { + let mountpoint = extract_query_part(req, "mountpoint").ok_or_else(|| { + HttpError::QueryString("'mountpoint' should be specified in query string".to_string()) + })?; match (req.method(), req.body.as_ref()) { - (Method::Put, None) => { - let r = kicker(ApiRequest::Takeover); - Ok(convert_to_response(r, HttpError::Upgrade)) + (Method::Post, Some(body)) => { + let cmd = parse_body(body)?; + let r = kicker(ApiRequest::Mount(mountpoint, cmd)); + Ok(convert_to_response(r, HttpError::Mount)) + } + (Method::Put, Some(body)) => { + let cmd = parse_body(body)?; + let r = kicker(ApiRequest::Remount(mountpoint, cmd)); + Ok(convert_to_response(r, HttpError::Mount)) + } + (Method::Delete, None) => { + let r = kicker(ApiRequest::Umount(mountpoint)); + Ok(convert_to_response(r, HttpError::Mount)) } _ => Err(HttpError::BadRequest), } } } -pub struct 
ExitHandler {} -impl EndpointHandler for ExitHandler { +pub struct MetricsInflightHandler {} +impl EndpointHandler for MetricsInflightHandler { fn handle_request( &self, req: &Request, kicker: &dyn Fn(ApiRequest) -> ApiResponse, ) -> HttpResult { match (req.method(), req.body.as_ref()) { - (Method::Put, None) => { - let r = kicker(ApiRequest::Exit); - Ok(convert_to_response(r, HttpError::Upgrade)) + (Method::Get, None) => { + let r = kicker(ApiRequest::ExportInflightMetrics); + Ok(convert_to_response(r, HttpError::InflightMetrics)) } _ => Err(HttpError::BadRequest), } @@ -275,7 +263,6 @@ impl EndpointHandler for ExitHandler { } pub struct FsBackendInfo {} - impl EndpointHandler for FsBackendInfo { fn handle_request( &self, @@ -296,3 +283,20 @@ impl EndpointHandler for FsBackendInfo { } } } + +pub struct SendFuseFdHandler {} +impl EndpointHandler for SendFuseFdHandler { + fn handle_request( + &self, + req: &Request, + kicker: &dyn Fn(ApiRequest) -> ApiResponse, + ) -> HttpResult { + match (req.method(), req.body.as_ref()) { + (Method::Put, None) => { + let r = kicker(ApiRequest::SendFuseFd); + Ok(convert_to_response(r, HttpError::Upgrade)) + } + _ => Err(HttpError::BadRequest), + } + } +} diff --git a/api/src/http_endpoint_v2.rs b/api/src/http_endpoint_v2.rs new file mode 100644 index 00000000000..ca2eb0bedd4 --- /dev/null +++ b/api/src/http_endpoint_v2.rs @@ -0,0 +1,82 @@ +// Copyright 2022 Alibaba Cloud. All rights reserved. +// Copyright 2020 Ant Group. All rights reserved. +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 + +use dbs_uhttp::{Method, Request, Response}; + +use crate::http::{ + error_response, extract_query_part, parse_body, success_response, translate_status_code, + ApiError, ApiRequest, ApiResponse, ApiResponsePayload, BlobObjectParam, EndpointHandler, + HttpError, HttpResult, +}; + +pub const HTTP_ROOT_V2: &str = "/api/v2"; + +// API server has successfully processed the request, but can't fulfill that. 
Therefore, +// a `error_response` is generated whose status code is 4XX or 5XX. With error response, +// it still returns Ok(error_response) to http request handling framework, which means +// nydusd api server receives the request and try handle it, even the request can't be fulfilled. +fn convert_to_response HttpError>(api_resp: ApiResponse, op: O) -> Response { + match api_resp { + Ok(r) => { + use ApiResponsePayload::*; + match r { + Empty => success_response(None), + DaemonInfo(d) => success_response(Some(d)), + Events(d) => success_response(Some(d)), + BackendMetrics(d) => success_response(Some(d)), + BlobcacheMetrics(d) => success_response(Some(d)), + FsFilesMetrics(d) => success_response(Some(d)), + FsFilesPatterns(d) => success_response(Some(d)), + FsGlobalMetrics(d) => success_response(Some(d)), + + // Nydus API v1 + FsBackendInfo(d) => success_response(Some(d)), + InflightMetrics(d) => success_response(Some(d)), + + // Nydus API v2 + BlobObjectList(d) => success_response(Some(d)), + } + } + Err(e) => { + let status_code = translate_status_code(&e); + error_response(op(e), status_code) + } + } +} + +pub struct BlobObjectListHandlerV2 {} +impl EndpointHandler for BlobObjectListHandlerV2 { + fn handle_request( + &self, + req: &Request, + kicker: &dyn Fn(ApiRequest) -> ApiResponse, + ) -> HttpResult { + match (req.method(), req.body.as_ref()) { + (Method::Get, None) => { + if let Some(domain_id) = extract_query_part(req, "domain_id") { + let param = BlobObjectParam { domain_id }; + let r = kicker(ApiRequest::GetBlobObject(param)); + return Ok(convert_to_response(r, HttpError::GetBlobObjects)); + } + Err(HttpError::BadRequest) + } + (Method::Put, Some(body)) => { + let conf = parse_body(body)?; + let r = kicker(ApiRequest::CreateBlobObject(conf)); + Ok(convert_to_response(r, HttpError::CreateBlobObject)) + } + (Method::Delete, None) => { + if let Some(domain_id) = extract_query_part(req, "domain_id") { + let param = BlobObjectParam { domain_id }; + let r = 
kicker(ApiRequest::DeleteBlobObject(param)); + return Ok(convert_to_response(r, HttpError::DeleteBlobObject)); + } + Err(HttpError::BadRequest) + } + _ => Err(HttpError::BadRequest), + } + } +} diff --git a/api/src/lib.rs b/api/src/lib.rs index f9c2e3b6b6d..0a51da41dc9 100644 --- a/api/src/lib.rs +++ b/api/src/lib.rs @@ -11,3 +11,4 @@ extern crate lazy_static; pub mod http; pub mod http_endpoint_v1; +pub mod http_endpoint_v2; diff --git a/docs/nydus-fscache.md b/docs/nydus-fscache.md new file mode 100644 index 00000000000..8e9e22e6a45 --- /dev/null +++ b/docs/nydus-fscache.md @@ -0,0 +1,157 @@ +# Nydus EROFS fscache user guide + +This guide shows you how to use fscache-based EROFS nydus image service to launch docker containers with the fscache-enabled in-kernel erofs on-demand download feature. + +**Please be careful**, currently, the user-space daemon only implements _the basic functionality_ and it's aimed to test the fscache on-demand kernel code as a real end-to-end workload for container use cases, so it may take more extra steps compared with existing well-done solutions. This guide can be _frequently updated_ due to the overall implementation changes, so please make sure that you're now referring to the latest document version. + +## Prepare the kernel + +Be aware of using the fscache-enabled erofs linux kernel, it can be built with the following steps: + +1. ``git clone git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git`` \ + or (mirror in china): ``git://kernel.source.codeaurora.cn/pub/scm/linux/kernel/git/xiang/erofs.git`` + +2. ``make olddefconfig`` + +3. Update _.config_ to enable the follow kernel configurations: +``` +CONFIG_FSCACHE=m +CONFIG_CACHEFILES=m +CONFIG_CACHEFILES_ONDEMAND=y +CONFIG_EROFS_FS=m +CONFIG_EROFS_FS_ONDEMAND=y +``` + +5. ``make -jX`` + +6. ``make modules_install && make install`` + +7. Reboot to the kernel just built + +8. ``modprobe cachefiles`` if cachefiles is built as module + +9. 
``[ -c /dev/cachefiles ] && echo ok`` + +## Get ctr-remote and the fscache-supported nydusd + +1. Make sure you have installed _rust 1.52.1_ version and golang. + +2. Check out the latest nydus source code with \ +``git clone https://github.com/dragonflyoss/image-service.git -b fscache`` + +3. Build nydusd with \ +``cargo build --target x86_64-unknown-linux-gnu --features=fusedev --release --target-dir target-fusedev --bin nydusd`` + +4. Build ctr-remote with + +``` bash +cd contrib/ctr-remote +make +``` + +## Run container with nydus snapshotter + +1. Make sure your containerd version is 1.4 or above. + +2. Get nydus snapshotter with erofs supported: + ```shell + # clone code + git clone https://github.com/imeoer/nydus-snapshotter.git -b erofs-with-fscache-support + # compile binary to ./bin/containerd-nydus-grpc + cd nydus-snapshotter + make + ``` + +3. Prepare a configuration json like below, named as `/path/nydus-erofs-config.json`: + +```json +{ + "type": "bootstrap", + "config": { + "backend_type": "registry", + "backend_config": { + "scheme": "https" + }, + "cache_type": "fscache" + } +} +``` + +4. Start nydus snapshotter with the command below: + +``` +./bin/containerd-nydus-grpc \ + --config-path /path/nydus-erofs-config.json \ + --daemon-mode shared \ + --daemon-backend erofs \ + --log-level info \ + --root /var/lib/containerd/io.containerd.snapshotter.v1.nydus \ + --cache-dir /var/lib/nydus/cache \ + --address /run/containerd/containerd-nydus-grpc.sock \ + --nydusd-path /path/to/nydusd \ + --log-to-stdout +``` + +5. 
Configure containerd to use `nydus-snapshotter` by editing + `/etc/containerd/config.toml` like below: + +``` toml +version = 2 + +[plugins] + [plugins."io.containerd.grpc.v1.cri"] + [plugins."io.containerd.grpc.v1.cri".cni] + bin_dir = "/usr/lib/cni" + conf_dir = "/etc/cni/net.d" + [plugins."io.containerd.internal.v1.opt"] + path = "/var/lib/containerd/opt" + +[proxy_plugins] + [proxy_plugins.nydus] + type = "snapshot" + address = "/run/containerd/containerd-nydus-grpc.sock" + +[plugins."io.containerd.grpc.v1.cri".containerd] + snapshotter = "nydus" + disable_snapshot_annotations = false +``` + +For more information on how to configure containerd to use nydus snapshotter please refer to [here](./containerd-env-setup.md). + +6. Restart containerd with + `service containerd restart` + +7. Run container with [ctr-remote](../contrib/ctr-remote) + +``` shell +# pull nydus image +contrib/ctr-remote/bin/ctr-remote images rpull docker.io/hsiangkao/ubuntu:20.04-rafs-v6-docker + +# run nydus image +ctr run --rm -t --snapshotter=nydus docker.io/hsiangkao/ubuntu:20.04-rafs-v6-docker ubuntu /bin/bash + +# remove nydus image +ctr images rm docker.io/hsiangkao/ubuntu:20.04-rafs-v6-docker +``` + +## Try to convert a new image to RAFS v6 + +1. Get nydus image conversion tool `accelctl` + +``` shell +# clone acceld code +git clone https://github.com/goharbor/acceleration-service.git + +# compile binary to ./accelctl +cd acceleration-service +make +``` + +2. Convert to nydus image + +Duplicate `./misc/config/config.yaml.nydus.tmpl` configuration file as `path/to/config.yaml`, make sure that the `rafs_version` option in `converter.driver.config` is changed to `6` and the registry auth have been configured in `provider.source`. 
+ +``` shell +# convert to nydus image +./accelctl convert --config path/to/config.yaml /ubuntu:latest +``` diff --git a/docs/samples/boostrap_blob_cache_entry.json b/docs/samples/boostrap_blob_cache_entry.json new file mode 100644 index 00000000000..3c0973bb717 --- /dev/null +++ b/docs/samples/boostrap_blob_cache_entry.json @@ -0,0 +1,17 @@ +{ + "type": "bootstrap", + "id": "bootstrap1", + "domain_id": "userid1", + "config": { + "id": "factory1", + "backend_type": "localfs", + "backend_config": { + "dir": "/tmp/nydus" + }, + "cache_type": "fscache", + "cache_config": { + "work_dir": "/tmp/nydus" + }, + "metadata_file": "/tmp/nydus/bootstrap1" + } +} diff --git a/rafs/src/fs.rs b/rafs/src/fs.rs index 0a79b84f834..1ac3a5ee851 100644 --- a/rafs/src/fs.rs +++ b/rafs/src/fs.rs @@ -58,7 +58,7 @@ fn default_threads_count() -> usize { 8 } -fn default_merging_size() -> usize { +pub fn default_merging_size() -> usize { 128 * 1024 } diff --git a/rafs/src/metadata/mod.rs b/rafs/src/metadata/mod.rs index bec857f2a29..5b39bf695d0 100644 --- a/rafs/src/metadata/mod.rs +++ b/rafs/src/metadata/mod.rs @@ -466,9 +466,16 @@ impl RafsSuper { } /// Load Rafs super block from a metadata file. - pub fn load_from_metadata(path: &Path, mode: RafsMode, validate_digest: bool) -> Result { + pub fn load_from_metadata>( + path: P, + mode: RafsMode, + validate_digest: bool, + ) -> Result { // open bootstrap file - let file = OpenOptions::new().read(true).write(false).open(path)?; + let file = OpenOptions::new() + .read(true) + .write(false) + .open(path.as_ref())?; let mut rs = RafsSuper { mode, validate_digest, diff --git a/src/bin/nydusd/api_server_glue.rs b/src/bin/nydusd/api_server_glue.rs index cac4080f244..3744be0f07a 100644 --- a/src/bin/nydusd/api_server_glue.rs +++ b/src/bin/nydusd/api_server_glue.rs @@ -1,28 +1,29 @@ // Copyright 2020 Ant Group. All rights reserved. -// Copyright (C) 2020 Alibaba Cloud. All rights reserved. +// Copyright (C) 2020-2022 Alibaba Cloud. 
All rights reserved. // // SPDX-License-Identifier: (Apache-2.0 AND BSD-3-Clause) -use mio::{Events, Poll, Token, Waker}; use std::convert::From; +use std::io::Result; use std::str::FromStr; -use std::sync::mpsc::{Receiver, Sender}; +use std::sync::mpsc::{channel, Receiver, Sender}; use std::sync::Arc; use std::thread::JoinHandle; +use mio::Waker; +use nix::sys::signal::{kill, SIGTERM}; +use nix::unistd::Pid; + use nydus::{FsBackendType, NydusError}; use nydus_api::http::{ - ApiError, ApiMountCmd, ApiRequest, ApiResponse, ApiResponsePayload, ApiResult, DaemonConf, - DaemonErrorKind, MetricsErrorKind, + start_http_thread, ApiError, ApiMountCmd, ApiRequest, ApiResponse, ApiResponsePayload, + ApiResult, BlobCacheEntry, BlobObjectParam, DaemonConf, DaemonErrorKind, MetricsErrorKind, }; use nydus_utils::metrics; -use crate::daemon::{DaemonError, FsBackendMountCmd, FsBackendUmountCmd, NydusDaemon}; -#[cfg(fusedev)] -use crate::fusedev::FusedevDaemon; - -type Result = ApiResult; -const API_WAKE_TOKEN: Token = Token(0); +use crate::daemon::{DaemonError, NydusDaemon}; +use crate::fs_service::{FsBackendMountCmd, FsBackendUmountCmd, FsService}; +use crate::DAEMON_CONTROLLER; impl From for DaemonErrorKind { fn from(e: DaemonError) -> Self { @@ -47,30 +48,22 @@ impl From for DaemonError { } } -pub struct ApiServer { +struct ApiServer { to_http: Sender, - daemon: Arc, } impl ApiServer { - pub fn new( - to_http: Sender, - daemon: Arc, - ) -> std::io::Result { - Ok(ApiServer { to_http, daemon }) + fn new(to_http: Sender) -> Result { + Ok(ApiServer { to_http }) } - fn process_request(&self, request: ApiRequest) -> std::io::Result<()> { + fn process_request(&self, request: ApiRequest) -> Result<()> { let resp = match request { - ApiRequest::DaemonInfo => self.daemon_info(), - ApiRequest::ExportFsBackendInfo(mountpoint) => self.backend_info(&mountpoint), + // Common (v1/v2) ApiRequest::ConfigureDaemon(conf) => self.configure_daemon(conf), + ApiRequest::DaemonInfo => 
self.daemon_info(true), ApiRequest::Exit => self.do_exit(), - - ApiRequest::Mount(mountpoint, info) => self.do_mount(mountpoint, info), - ApiRequest::Remount(mountpoint, info) => self.do_remount(mountpoint, info), - ApiRequest::Umount(mountpoint) => self.do_umount(mountpoint), - + ApiRequest::Takeover => self.do_takeover(), ApiRequest::Events => Self::events(), ApiRequest::ExportGlobalMetrics(id) => Self::export_global_metrics(id), ApiRequest::ExportFilesMetrics(id, latest_read_files) => { @@ -79,10 +72,21 @@ impl ApiServer { ApiRequest::ExportAccessPatterns(id) => Self::export_access_patterns(id), ApiRequest::ExportBackendMetrics(id) => Self::export_backend_metrics(id), ApiRequest::ExportBlobcacheMetrics(id) => Self::export_blobcache_metrics(id), - ApiRequest::ExportInflightMetrics => self.export_inflight_metrics(), + // Filesystem (v1) + ApiRequest::ExportFsBackendInfo(mountpoint) => self.backend_info(&mountpoint), + ApiRequest::ExportInflightMetrics => self.export_inflight_metrics(), + ApiRequest::Mount(mountpoint, info) => self.do_mount(mountpoint, info), + ApiRequest::Remount(mountpoint, info) => self.do_remount(mountpoint, info), + ApiRequest::Umount(mountpoint) => self.do_umount(mountpoint), ApiRequest::SendFuseFd => self.send_fuse_fd(), - ApiRequest::Takeover => self.do_takeover(), + + // Nydus API v2 + ApiRequest::DaemonInfoV2 => self.daemon_info(false), + ApiRequest::GetBlobObject(_param) => todo!(), + ApiRequest::CreateBlobObject(entry) => self.create_blob_cache_entry(&entry), + ApiRequest::DeleteBlobObject(param) => self.remove_blob_cache_entry(¶m), + ApiRequest::ListBlobObject => todo!(), }; self.respond(resp); @@ -90,28 +94,12 @@ impl ApiServer { Ok(()) } - fn respond(&self, resp: Result) { + fn respond(&self, resp: ApiResult) { if let Err(e) = self.to_http.send(resp) { error!("send API response failed {}", e); } } - fn daemon_info(&self) -> ApiResponse { - let d = self.daemon.as_ref(); - let info = d - .export_info() - .map_err(|e| 
ApiError::Metrics(MetricsErrorKind::Daemon(e.into())))?; - Ok(ApiResponsePayload::DaemonInfo(info)) - } - - fn backend_info(&self, mountpoint: &str) -> ApiResponse { - let d = self.daemon.as_ref(); - let info = d - .export_backend_info(mountpoint) - .map_err(|e| ApiError::Metrics(MetricsErrorKind::Daemon(e.into())))?; - Ok(ApiResponsePayload::FsBackendInfo(info)) - } - fn configure_daemon(&self, conf: DaemonConf) -> ApiResponse { conf.log_level .parse::() @@ -125,6 +113,47 @@ impl ApiServer { }) } + fn daemon_info(&self, include_fs_info: bool) -> ApiResponse { + self.get_daemon_object()? + .export_info(include_fs_info) + .map_err(|e| ApiError::Metrics(MetricsErrorKind::Daemon(e.into()))) + .map(ApiResponsePayload::DaemonInfo) + } + + /// External supervisor wants this instance to exit. But it can't just die leave + /// some pending or in-flight fuse messages un-handled. So this method guarantees + /// all fuse messages read from kernel are handled and replies are sent back. + /// Before http response are sent back, this must can ensure that current process + /// has absolutely stopped. Otherwise, multiple processes might read from single + /// fuse session simultaneously. + fn do_exit(&self) -> ApiResponse { + let d = self.get_daemon_object()?; + d.trigger_exit() + .map(|_| { + info!("exit daemon by http request"); + ApiResponsePayload::Empty + }) + .map_err(|e| ApiError::DaemonAbnormal(e.into()))?; + + // Should be reliable since this Api server works under event manager. + kill(Pid::this(), SIGTERM).unwrap_or_else(|e| error!("Send signal error. {}", e)); + + Ok(ApiResponsePayload::Empty) + } + + /// External supervisor wants this instance to fetch `/dev/fuse` fd. Before + /// invoking this method, supervisor should already listens on a Unix socket and + /// waits for connection from this instance. Then supervisor should send the *fd* + /// back. Note, the http response does not mean this process already finishes Takeover + /// procedure. 
Supervisor has to continuously query the state of Nydusd until it gets + /// to *RUNNING*, which means new Nydusd has successfully served as a fuse server. + fn do_takeover(&self) -> ApiResponse { + let d = self.get_daemon_object()?; + d.trigger_takeover() + .map(|_| ApiResponsePayload::Empty) + .map_err(|e| ApiError::DaemonAbnormal(e.into())) + } + fn events() -> ApiResponse { let events = metrics::export_events().map_err(|e| ApiError::Events(format!("{:?}", e)))?; Ok(ApiResponsePayload::Events(events)) @@ -161,6 +190,19 @@ impl ApiServer { .map_err(|e| ApiError::Metrics(MetricsErrorKind::Stats(e))) } + #[inline] + fn get_daemon_object(&self) -> std::result::Result, ApiError> { + Ok(DAEMON_CONTROLLER.get_daemon()) + } + + fn backend_info(&self, mountpoint: &str) -> ApiResponse { + let info = self + .get_default_fs_service()? + .export_backend_info(mountpoint) + .map_err(|e| ApiError::Metrics(MetricsErrorKind::Daemon(e.into())))?; + Ok(ApiResponsePayload::FsBackendInfo(info)) + } + /// Detect if there is fop being hang. /// `ApiResponsePayload::Empty` will be converted to http status code 204, which means /// there is no requests being processed right now. @@ -189,8 +231,8 @@ impl ApiServer { /// It means 3 threads are processing inflight requests. fn export_inflight_metrics(&self) -> ApiResponse { // TODO: Implement automatic error conversion between DaemonError and ApiError. - let d = self.daemon.as_ref(); - if let Some(ops) = d + let fs = self.get_default_fs_service()?; + if let Some(ops) = fs .export_inflight_ops() .map_err(|e| ApiError::Metrics(MetricsErrorKind::Daemon(e.into())))? { @@ -200,58 +242,25 @@ impl ApiServer { } } - /// External supervisor wants this instance to exit without umounting rafs. We can't - /// leave some in-flight fuse messages un-handled. So this method guarantees - /// all fuse messages read from kernel are handled and replies are sent back. 
- /// Before http response are sent back, this must can ensure that current process - /// has absolutely stopped. Otherwise, multiple processes might read from single - /// fuse session simultaneously. - fn do_exit(&self) -> ApiResponse { - let d = self.daemon.as_ref(); - d.trigger_exit() - .map(|_| { - info!("exit daemon by http request"); - ApiResponsePayload::Empty - }) - .map_err(|e| { - error!("exit fuse service failed {:}", e); - ApiError::DaemonAbnormal(e.into()) - })?; - - // Ensure both fuse and state machine threads have been terminated thus this - // nydusd won't race fuse messages when upgrading. - d.wait() - .map(|_| { - info!("fuse service exited by http request"); - ApiResponsePayload::Empty - }) - .map_err(|e| { - error!("wait for fuse service failed {:}", e); - ApiError::DaemonAbnormal(e.into()) - })?; - - Ok(ApiResponsePayload::Empty) - } - fn do_mount(&self, mountpoint: String, cmd: ApiMountCmd) -> ApiResponse { let fs_type = FsBackendType::from_str(&cmd.fs_type) .map_err(|e| ApiError::MountFilesystem(DaemonError::from(e).into()))?; - self.daemon - .mount(FsBackendMountCmd { - fs_type, - mountpoint, - config: cmd.config, - source: cmd.source, - prefetch_files: cmd.prefetch_files, - }) - .map(|_| ApiResponsePayload::Empty) - .map_err(|e| ApiError::MountFilesystem(e.into())) + let fs = self.get_default_fs_service()?; + fs.mount(FsBackendMountCmd { + fs_type, + mountpoint, + config: cmd.config, + source: cmd.source, + prefetch_files: cmd.prefetch_files, + }) + .map(|_| ApiResponsePayload::Empty) + .map_err(|e| ApiError::MountFilesystem(e.into())) } fn do_remount(&self, mountpoint: String, cmd: ApiMountCmd) -> ApiResponse { let fs_type = FsBackendType::from_str(&cmd.fs_type) .map_err(|e| ApiError::MountFilesystem(DaemonError::from(e).into()))?; - self.daemon + self.get_default_fs_service()? 
.remount(FsBackendMountCmd { fs_type, mountpoint, @@ -264,102 +273,168 @@ impl ApiServer { } fn do_umount(&self, mountpoint: String) -> ApiResponse { - self.daemon + self.get_default_fs_service()? .umount(FsBackendUmountCmd { mountpoint }) .map(|_| ApiResponsePayload::Empty) .map_err(|e| ApiError::MountFilesystem(e.into())) } fn send_fuse_fd(&self) -> ApiResponse { - let d = self.daemon.as_ref(); + let d = self.get_daemon_object()?; d.save() .map(|_| ApiResponsePayload::Empty) .map_err(|e| ApiError::DaemonAbnormal(e.into())) } - /// External supervisor wants this instance to fetch `/dev/fuse` fd. Before - /// invoking this method, supervisor should already listens on a Unix socket and - /// waits for connection from this instance. Then supervisor should send the *fd* - /// back. Note, the http response does not mean this process already finishes Takeover - /// procedure. Supervisor has to continuously query the state of Nydusd until it gets - /// to *RUNNING*, which means new Nydusd has successfully served as a fuse server. 
- fn do_takeover(&self) -> ApiResponse { - let d = self.daemon.as_ref(); - d.trigger_takeover() - .map(|_| ApiResponsePayload::Empty) - .map_err(|e| ApiError::DaemonAbnormal(e.into())) + fn get_default_fs_service(&self) -> std::result::Result, ApiError> { + DAEMON_CONTROLLER + .get_fs_service() + .ok_or(ApiError::DaemonAbnormal(DaemonErrorKind::Unsupported)) + } + + // HTTP API v2 + fn create_blob_cache_entry(&self, entry: &BlobCacheEntry) -> ApiResponse { + match DAEMON_CONTROLLER.get_blob_cache_mgr() { + None => Err(ApiError::DaemonAbnormal(DaemonErrorKind::Unsupported)), + Some(mgr) => { + if let Err(e) = mgr.add_blob_entry(entry) { + Err(ApiError::DaemonAbnormal(DaemonErrorKind::Other(format!( + "{}", + e + )))) + } else { + Ok(ApiResponsePayload::Empty) + } + } + } + } + + fn remove_blob_cache_entry(&self, param: &BlobObjectParam) -> ApiResponse { + match DAEMON_CONTROLLER.get_blob_cache_mgr() { + None => Err(ApiError::DaemonAbnormal(DaemonErrorKind::Unsupported)), + Some(mgr) => { + if let Err(e) = mgr.remove_blob_entry(param) { + Err(ApiError::DaemonAbnormal(DaemonErrorKind::Other(format!( + "{}", + e + )))) + } else { + Ok(ApiResponsePayload::Empty) + } + } + } } } -pub struct ApiSeverSubscriber { - poll: Poll, - waker: Arc, +struct ApiServerHandler { server: ApiServer, api_receiver: Receiver>, } -impl ApiSeverSubscriber { - pub fn new( - server: ApiServer, - api_receiver: Receiver>, - ) -> std::io::Result { - let poll = Poll::new()?; - let waker = Waker::new(poll.registry(), API_WAKE_TOKEN)?; +impl ApiServerHandler { + fn new(server: ApiServer, api_receiver: Receiver>) -> Result { Ok(Self { - waker: Arc::new(waker), - poll, server, api_receiver, }) } - pub fn get_waker(&self) -> Arc { - self.waker.clone() + fn handle_requests_from_router(&self) { + loop { + match self.api_receiver.recv() { + Ok(request) => { + if let Some(req) = request { + self.server.process_request(req).unwrap_or_else(|e| { + error!("HTTP handler failed to process request, {}", e) + }); 
+ } else { + debug!("Received exit notification from the HTTP router"); + return; + } + } + Err(_e) => { + error!("Failed to receive request from the HTTP router"); + return; + } + } + } + } +} + +/// HTTP API server to serve the administration socket. +pub struct ApiServerController { + http_handler_thread: Option>>, + http_router_thread: Option>>, + sock: Option, + waker: Option>, +} + +impl ApiServerController { + /// Create a new instance of `ApiServerController`. + pub fn new(sock: Option<&str>) -> Self { + ApiServerController { + sock: sock.map(|v| v.to_string()), + http_handler_thread: None, + http_router_thread: None, + waker: None, + } } - pub fn run(self) -> std::io::Result> { - std::thread::Builder::new() + /// Try to start the HTTP working thread. + pub fn start(&mut self) -> Result<()> { + if self.sock.is_none() { + return Ok(()); + } + + // Safe to unwrap() because self.sock is valid. + let apisock = self.sock.as_ref().unwrap(); + let (to_handler, from_router) = channel(); + let (to_router, from_handler) = channel(); + let api_server = ApiServer::new(to_router)?; + let api_handler = ApiServerHandler::new(api_server, from_router)?; + let (router_thread, waker) = start_http_thread(apisock, None, to_handler, from_handler)?; + let daemon_waker = DAEMON_CONTROLLER.waker.clone(); + + info!("HTTP API server running at {}", apisock); + let handler_thread = std::thread::Builder::new() .name("api-server".to_string()) .spawn(move || { - let ApiSeverSubscriber { - mut poll, - server, - api_receiver, - .. 
- } = self; - let mut events = Events::with_capacity(100); - 'wait: loop { - match poll.poll(&mut events, None) { - Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue, - Err(e) => { - error!("API server poll events failed, {}", e); - return; - } - Ok(_) => {} - } - - for event in &events { - match event.token() { - API_WAKE_TOKEN => { - if let Some(request) = api_receiver.recv().unwrap_or_else(|e| { - error!("API server recv failed, {}", e); - None - }) { - server.process_request(request).unwrap_or_else(|e| { - error!("API server process events failed, {}", e) - }); - } else { - break 'wait; - } - } - _ => { - unreachable!("unknown event token"); - } - } - } - } - info!("api-server thread exits"); + api_handler.handle_requests_from_router(); + info!("HTTP api-server handler thread exits"); + let _ = daemon_waker.wake(); + Ok(()) }) + .map_err(|_e| einval!("Failed to start work thread for HTTP handler"))?; + + self.waker = Some(waker); + self.http_handler_thread = Some(handler_thread); + self.http_router_thread = Some(router_thread); + + Ok(()) + } + + /// Stop the HTTP working thread. + pub fn stop(&mut self) { + // Signal the HTTP router thread to exit, which will then notify the HTTP handler thread. + if let Some(waker) = self.waker.take() { + let _ = waker.wake(); + } + if let Some(t) = self.http_handler_thread.take() { + if let Err(e) = t.join() { + error!( + "Failed to join the HTTP handler thread, execution error. {:?}", + e + ); + } + } + if let Some(t) = self.http_router_thread.take() { + if let Err(e) = t.join() { + error!( + "Failed to join the HTTP router thread, execution error. {:?}", + e + ); + } + } } } diff --git a/src/bin/nydusd/blob_cache.rs b/src/bin/nydusd/blob_cache.rs new file mode 100644 index 00000000000..c7ff975030a --- /dev/null +++ b/src/bin/nydusd/blob_cache.rs @@ -0,0 +1,392 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. 
+// +// SPDX-License-Identifier: (Apache-2.0 AND BSD-3-Clause) + +// Blob cache manager to manage all cached blob objects. +use std::collections::HashMap; +use std::io::{Error, ErrorKind, Result}; +use std::path::{Path, PathBuf}; +use std::sync::{Arc, Mutex, MutexGuard}; + +use nydus_api::http::{BlobCacheEntry, BlobCacheList, BlobObjectParam, BLOB_CACHE_TYPE_BOOTSTRAP}; +use rafs::metadata::{RafsMode, RafsSuper}; +use storage::cache::FsCacheConfig; +use storage::device::BlobInfo; +use storage::factory::{BackendConfig, CacheConfig, FactoryConfig}; + +#[derive(Clone)] +pub struct BlobCacheConfigBootstrap { + blob_id: String, + scoped_blob_id: String, + path: PathBuf, + factory_config: Arc, +} + +impl BlobCacheConfigBootstrap { + pub fn path(&self) -> &Path { + &self.path + } +} + +#[derive(Clone)] +pub struct BlobCacheConfigDataBlob { + blob_info: Arc, + scoped_blob_id: String, + factory_config: Arc, +} + +impl BlobCacheConfigDataBlob { + pub fn blob_info(&self) -> &Arc { + &self.blob_info + } + + pub fn factory_config(&self) -> &Arc { + &self.factory_config + } +} + +#[derive(Clone)] +pub enum BlobCacheObjectConfig { + DataBlob(Arc), + Bootstrap(Arc), +} + +impl BlobCacheObjectConfig { + fn new_data_blob( + domain_id: String, + blob_info: Arc, + factory_config: Arc, + ) -> Self { + let scoped_blob_id = if domain_id.is_empty() { + blob_info.blob_id().to_string() + } else { + domain_id + "-" + blob_info.blob_id() + }; + BlobCacheObjectConfig::DataBlob(Arc::new(BlobCacheConfigDataBlob { + blob_info, + scoped_blob_id, + factory_config, + })) + } + + fn new_bootstrap_blob( + domain_id: String, + blob_id: String, + path: PathBuf, + factory_config: Arc, + ) -> Self { + let scoped_blob_id = if domain_id.is_empty() { + blob_id.clone() + } else { + domain_id + "-" + &blob_id + }; + BlobCacheObjectConfig::Bootstrap(Arc::new(BlobCacheConfigBootstrap { + blob_id, + scoped_blob_id, + path, + factory_config, + })) + } + + fn get_key(&self) -> &str { + match self { + 
BlobCacheObjectConfig::Bootstrap(o) => &o.scoped_blob_id, + BlobCacheObjectConfig::DataBlob(o) => &o.scoped_blob_id, + } + } +} + +#[derive(Default)] +struct BlobCacheState { + id_to_config_map: HashMap, +} + +impl BlobCacheState { + fn new() -> Self { + Self { + id_to_config_map: HashMap::new(), + } + } + + fn remove(&mut self, domain_id: &str) { + let scoped_blob_prefix = format!("{}-", domain_id); + self.id_to_config_map.retain(|_k, v| match v { + BlobCacheObjectConfig::Bootstrap(o) => { + !o.scoped_blob_id.starts_with(&scoped_blob_prefix) + } + BlobCacheObjectConfig::DataBlob(o) => { + !o.scoped_blob_id.starts_with(&scoped_blob_prefix) + } + }) + } + + fn try_add(&mut self, config: BlobCacheObjectConfig) -> Result<()> { + let key = config.get_key(); + if self.id_to_config_map.contains_key(key) { + return Err(Error::new( + ErrorKind::AlreadyExists, + "blob configuration information already exists", + )); + } + self.id_to_config_map.insert(key.to_owned(), config); + Ok(()) + } + + fn get(&self, key: &str) -> Option { + self.id_to_config_map.get(key).cloned() + } +} + +/// Struct to maintain cached file objects. +#[derive(Default)] +pub struct BlobCacheMgr { + state: Mutex, +} + +impl BlobCacheMgr { + /// Create a new instance of `BlobCacheMgr`. + pub fn new() -> Self { + BlobCacheMgr { + state: Mutex::new(BlobCacheState::new()), + } + } + + /// Add a metadata blob object to be managed by the `FsCacheHandler`. + /// + /// When adding a rafs metadata blob to the manager, all data blobs referenced by it will + /// also be added to the manager. It's convenient to support rafs image filesystem. + /// + /// The `domain_id` and `id` forms a unique identifier to identify cached bootstrap objects. + /// That means `domain_id` is used to divide cached objects into groups and blobs with the + /// same `id` may exist in different groups. 
+ fn add_bootstrap_object( + &self, + domain_id: &str, + id: &str, + path: PathBuf, + factory_config: Arc, + ) -> Result<()> { + let rs = RafsSuper::load_from_metadata(&path, RafsMode::Direct, true)?; + let meta_config = BlobCacheObjectConfig::new_bootstrap_blob( + domain_id.to_string(), + id.to_string(), + path, + factory_config.clone(), + ); + + let mut state = self.get_state(); + state.try_add(meta_config)?; + + // Try to add the referenced data blob object if it doesn't exist yet. + for bi in rs.superblock.get_blob_infos() { + debug!("Found blob {} on domain {}", &bi.blob_id(), domain_id); + let blob_config = BlobCacheObjectConfig::new_data_blob( + domain_id.to_string(), + bi, + factory_config.clone(), + ); + state.try_add(blob_config)?; + } + + Ok(()) + } + + /// Add an entry of bootstrap and/or data blobs. + pub fn add_blob_entry(&self, entry: &BlobCacheEntry) -> Result<()> { + if entry.blob_type == BLOB_CACHE_TYPE_BOOTSTRAP { + let (path, factory_config) = self.get_bootstrap_info(entry)?; + if let Err(e) = + self.add_bootstrap_object(&entry.domain_id, &entry.blob_id, path, factory_config) + { + warn!("Failed to add cache entry for bootstrap blob: {:?}", entry); + return Err(e); + } + } else { + warn!("Invalid blob cache entry: {:?}", entry); + return Err(einval!("Invalid blob cache entry")); + } + + Ok(()) + } + + pub fn remove_blob_entry(&self, param: &BlobObjectParam) -> Result<()> { + let mut state = self.get_state(); + state.remove(¶m.domain_id); + Ok(()) + } + + /// Add a list of bootstrap and/or data blobs. + pub fn add_blob_list(&self, blobs: &BlobCacheList) -> Result<()> { + for entry in blobs.blobs.iter() { + self.add_blob_entry(entry)?; + } + + Ok(()) + } + + /// Get blob configuration for blob with `key`. 
+ pub fn get_config(&self, key: &str) -> Option { + self.get_state().get(key) + } + + #[inline] + fn get_state(&self) -> MutexGuard { + self.state.lock().unwrap() + } + + fn get_bootstrap_info(&self, entry: &BlobCacheEntry) -> Result<(PathBuf, Arc)> { + // Validate type of backend and cache. + let config = &entry.blob_config; + if config.cache_type != "fscache" { + return Err(einval!("`config.cache_type` for metadata blob is invalid")); + } + let cache_config = + serde_json::from_value::(entry.blob_config.cache_config.clone()) + .map_err(|_e| { + eother!("Invalid configuration of `FsCacheConfig` in blob cache entry") + })?; + + let path = config.metadata_path.clone().unwrap_or_default(); + if path.is_empty() { + return Err(einval!("`config.metadata_path` for metadata blob is empty")); + } + let path = Path::new(&path) + .canonicalize() + .map_err(|_e| einval!("`config.backend_config.blob_file` is invalid"))?; + if !path.is_file() { + return Err(einval!("`config.backend_config.blob_file` is not a file")); + } + + // Validate the working directory for fscache + let path2 = Path::new(&cache_config.work_dir); + let path2 = path2 + .canonicalize() + .map_err(|_e| eio!("`config.cache_config.work_dir` is invalid"))?; + if !path2.is_dir() { + return Err(einval!("`config.cache_config.work_dir` is not a directory")); + } + + let factory_config = Arc::new(FactoryConfig { + id: entry.blob_config.id.clone(), + backend: BackendConfig { + backend_type: entry.blob_config.backend_type.clone(), + backend_config: entry.blob_config.backend_config.clone(), + }, + cache: CacheConfig { + cache_type: entry.blob_config.cache_type.clone(), + cache_compressed: false, + cache_config: entry.blob_config.cache_config.clone(), + cache_validate: false, + prefetch_config: match serde_json::from_value(entry.fs_prefetch.clone()) { + Ok(fs_prefetch) => fs_prefetch, + Err(_e) => Default::default(), + }, + }, + }); + + Ok((path, factory_config)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use 
vmm_sys_util::tempdir::TempDir; + + #[test] + fn test_blob_cache_entry() { + let tmpdir = TempDir::new().unwrap(); + let path = tmpdir.as_path().join("bootstrap1"); + std::fs::write(&path, "metadata").unwrap(); + + let config = r#" + { + "type": "bootstrap", + "id": "bootstrap1", + "domain_id": "userid1", + "config": { + "id": "factory1", + "backend_type": "localfs", + "backend_config": { + "dir": "/tmp/nydus" + }, + "cache_type": "fscache", + "cache_config": { + "work_dir": "/tmp/nydus" + }, + "metadata_path": "/tmp/nydus/bootstrap1" + } + }"#; + let content = config.replace("/tmp/nydus", tmpdir.as_path().to_str().unwrap()); + let entry: BlobCacheEntry = serde_json::from_str(&content).unwrap(); + + assert_eq!(&entry.blob_type, "bootstrap"); + assert_eq!(&entry.blob_id, "bootstrap1"); + assert_eq!(&entry.domain_id, "userid1"); + assert_eq!(&entry.blob_config.id, "factory1"); + assert_eq!(&entry.blob_config.backend_type, "localfs"); + assert_eq!(&entry.blob_config.cache_type, "fscache"); + assert!(entry.blob_config.backend_config.is_object()); + assert!(entry.blob_config.cache_config.is_object()); + + let mgr = BlobCacheMgr::new(); + let (path, factory_config) = mgr.get_bootstrap_info(&entry).unwrap(); + assert_eq!(path, tmpdir.as_path().join("bootstrap1")); + assert_eq!(&factory_config.id, "factory1"); + assert_eq!(&factory_config.backend.backend_type, "localfs"); + assert_eq!(&factory_config.cache.cache_type, "fscache"); + } + + #[test] + fn test_blob_cache_list() { + let config = r#" + { + "blobs" : [ + { + "type": "bootstrap", + "id": "bootstrap1", + "domain_id": "userid1", + "config": { + "id": "factory1", + "backend_type": "localfs", + "backend_config": { + "dir": "/tmp/nydus" + }, + "cache_type": "fscache", + "cache_config": { + "work_dir": "/tmp/nydus" + }, + "metadata_path": "/tmp/nydus/bootstrap1" + } + }, + { + "type": "bootstrap", + "id": "bootstrap2", + "domain_id": "userid2", + "config": { + "id": "factory1", + "backend_type": "localfs", + 
"backend_config": { + "dir": "/tmp/nydus" + }, + "cache_type": "fscache", + "cache_config": { + "work_dir": "/tmp/nydus" + }, + "metadata_path": "/tmp/nydus/bootstrap2" + } + } + ] + }"#; + let list: BlobCacheList = serde_json::from_str(config).unwrap(); + + assert_eq!(list.blobs.len(), 2); + assert_eq!(&list.blobs[0].blob_type, "bootstrap"); + assert_eq!(&list.blobs[0].blob_id, "bootstrap1"); + assert_eq!(&list.blobs[0].blob_config.id, "factory1"); + assert_eq!(&list.blobs[0].blob_config.backend_type, "localfs"); + assert_eq!(&list.blobs[0].blob_config.cache_type, "fscache"); + assert_eq!(&list.blobs[1].blob_type, "bootstrap"); + assert_eq!(&list.blobs[1].blob_id, "bootstrap2"); + } +} diff --git a/src/bin/nydusd/daemon.rs b/src/bin/nydusd/daemon.rs index 3a7456441ef..bf18c800ed0 100644 --- a/src/bin/nydusd/daemon.rs +++ b/src/bin/nydusd/daemon.rs @@ -6,41 +6,28 @@ use std::any::Any; use std::cmp::PartialEq; -use std::collections::HashMap; use std::convert::From; use std::fmt::{Display, Formatter}; use std::io::Result; use std::ops::Deref; -use std::path::{Path, PathBuf}; use std::process::id; -use std::str::FromStr; -use std::sync::{ - mpsc::{Receiver, Sender}, - Arc, MutexGuard, -}; +use std::sync::mpsc::{Receiver, Sender}; +use std::sync::Arc; use std::thread::{self, JoinHandle}; use std::{error, fmt, io}; -use fuse_backend_rs::api::{vfs::VfsError, BackendFileSystem, Vfs}; -#[cfg(target_os = "linux")] -use fuse_backend_rs::passthrough::{Config, PassthroughFs}; +use fuse_backend_rs::api::vfs::VfsError; use fuse_backend_rs::transport::Error as FuseTransportError; use fuse_backend_rs::Error as FuseError; use rust_fsm::*; -use serde::{self, Deserialize, Serialize}; +use serde::{self, Serialize}; use serde_json::Error as SerdeError; -use nydus::{FsBackendDesc, FsBackendType}; +use crate::fs_service::{FsBackendCollection, FsService}; use nydus_app::BuildTimeInfo; -use rafs::{ - fs::{Rafs, RafsConfig}, - trim_backend_config, RafsError, RafsIoRead, -}; +use 
rafs::RafsError; -use crate::upgrade::{self, UpgradeManager, UpgradeMgrError}; - -//TODO: Try to public below type from fuse-rs thus no need to redefine it here. -type BackFileSystem = Box + Send + Sync>; +use crate::upgrade::UpgradeMgrError; #[allow(dead_code)] #[allow(clippy::upper_case_acronyms)] @@ -75,16 +62,53 @@ impl From for DaemonState { #[derive(Debug)] pub enum DaemonError { + /// Object already exists. + AlreadyExists, + /// Generic error message. + Common(String), /// Invalid arguments provided. InvalidArguments(String), /// Invalid config provided InvalidConfig(String), + /// Object not found. + NotFound, + /// Daemon does not reach the stable working state yet, + /// some capabilities may not be provided. + NotReady, + /// Request not supported. + Unsupported, + /// Failed to serialize/deserialize message. + Serde(SerdeError), + /// Cannot spawn a new thread + ThreadSpawn(io::Error), + /// Failed to upgrade the mount + UpgradeManager(UpgradeMgrError), + + /// State-machine related error codes if something bad happens when to communicate with state-machine + Channel(String), + /// Failed to start service. + StartService(String), + /// Failed to stop service + ServiceStop, + /// Input event to stat-machine is not expected. + UnexpectedEvent(DaemonStateMachineInput), + /// Wait daemon failure + WaitDaemon(io::Error), + + // Filesystem type mismatch. + FsTypeMismatch(String), + /// Failure occurred in the Passthrough subsystem. + PassthroughFs(io::Error), + /// Failure occurred in the Rafs subsystem. + Rafs(RafsError), + /// Failure occurred in the VFS subsystem. + Vfs(VfsError), + + // virtio-fs /// Failed to handle event other than input event. HandleEventNotEpollIn, /// Failed to handle unknown event. HandleEventUnknownEvent, - /// No memory configured. - NoMemoryConfigured, /// Fail to walk descriptor chain IterateQueue, /// Invalid Virtio descriptor chain. 
@@ -93,39 +117,11 @@ pub enum DaemonError { ProcessQueue(FuseError), /// Cannot create epoll context. Epoll(io::Error), - /// Cannot clone event fd. - EventFdClone(io::Error), - /// Cannot spawn a new thread - ThreadSpawn(io::Error), - /// Failure against Passthrough FS. - PassthroughFs(io::Error), /// Daemon related error DaemonFailure(String), - Common(String), - NotFound, - AlreadyExists, - Serde(SerdeError), - UpgradeManager(UpgradeMgrError), - Vfs(VfsError), - Rafs(RafsError), - /// Daemon does not reach the stable working state yet, - /// some capabilities may not be provided. - NotReady, - /// Daemon can't fulfill external requests. - Unsupported, - /// State-machine related error codes if something bad happens when to communicate with state-machine - Channel(String), - /// Input event to stat-machine is not expected. - UnexpectedEvent(DaemonStateMachineInput), - /// File system backend service related errors. - StartService(String), - ServiceStop, - /// Wait daemon failure - WaitDaemon(io::Error), + // Fuse session has been shutdown. SessionShutdown(FuseTransportError), - Downcast(String), - FsTypeMismatch(String), } impl fmt::Display for DaemonError { @@ -159,6 +155,7 @@ impl From for DaemonError { } } +/// Specialized version of `std::result::Result` for `NydusDaemon`. 
pub type DaemonResult = std::result::Result; /// Used to export daemon working state @@ -168,264 +165,63 @@ pub struct DaemonInfo { pub id: Option, pub supervisor: Option, pub state: DaemonState, - pub backend_collection: FsBackendCollection, + pub backend_collection: Option, } -#[derive(Clone)] -pub struct FsBackendMountCmd { - pub fs_type: FsBackendType, - pub source: String, - pub config: String, - pub mountpoint: String, - pub prefetch_files: Option>, -} - -#[derive(Clone, Deserialize, Serialize, Debug)] -pub struct FsBackendUmountCmd { - pub mountpoint: String, -} - -#[derive(Default, Serialize, Clone)] -pub struct FsBackendCollection(HashMap); - -impl FsBackendCollection { - fn add(&mut self, id: &str, cmd: &FsBackendMountCmd) -> DaemonResult<()> { - // We only wash Rafs backend now. - let fs_config = match cmd.fs_type { - FsBackendType::Rafs => { - let mut config: serde_json::Value = - serde_json::from_str(&cmd.config).map_err(DaemonError::Serde)?; - trim_backend_config!( - config, - "access_key_id", - "access_key_secret", - "auth", - "token" - ); - Some(config) - } - FsBackendType::PassthroughFs => { - // Passthrough Fs has no config ever input. - None - } - }; - - let desc = FsBackendDesc { - backend_type: cmd.fs_type.clone(), - mountpoint: cmd.mountpoint.clone(), - mounted_time: chrono::Local::now(), - config: fs_config, - }; - - self.0.insert(id.to_string(), desc); - - Ok(()) - } - - fn del(&mut self, id: &str) { - self.0.remove(id); - } -} - -pub trait NydusDaemon: DaemonStateMachineSubscriber { - fn start(&self) -> DaemonResult<()>; - fn wait(&self) -> DaemonResult<()>; - fn stop(&self) -> DaemonResult<()> { - let s = self.get_state(); - if s != DaemonState::INTERRUPTED && s != DaemonState::STOPPED { - return self.on_event(DaemonStateMachineInput::Stop); - } - Ok(()) - } - /// close the current FUSE connection to properly shutdown - /// the FUSE server daemon. 
- fn disconnect(&self) -> DaemonResult<()>; - /// close the FUSE server without closing the FUSE connection - /// so that another FUSE server daemon can take over the same - /// FUSE connection and continue to serve the incoming FUSE requests. - fn interrupt(&self) {} +pub trait NydusDaemon: DaemonStateMachineSubscriber + Send + Sync { fn as_any(&self) -> &dyn Any; + fn id(&self) -> Option; fn get_state(&self) -> DaemonState; fn set_state(&self, s: DaemonState); - fn trigger_exit(&self) -> DaemonResult<()> { - self.on_event(DaemonStateMachineInput::Exit) - } - fn trigger_takeover(&self) -> DaemonResult<()> { - self.on_event(DaemonStateMachineInput::Takeover)?; - self.on_event(DaemonStateMachineInput::Successful)?; - Ok(()) - } - fn id(&self) -> Option; - fn supervisor(&self) -> Option; - fn save(&self) -> DaemonResult<()>; - fn restore(&self) -> DaemonResult<()>; - fn get_vfs(&self) -> &Vfs; - fn upgrade_mgr(&self) -> Option>; - fn backend_collection(&self) -> MutexGuard; fn version(&self) -> BuildTimeInfo; - fn export_info(&self) -> DaemonResult { - let response = DaemonInfo { + fn export_info(&self, include_fs_info: bool) -> DaemonResult { + let mut response = DaemonInfo { version: self.version(), id: self.id(), supervisor: self.supervisor(), state: self.get_state(), - backend_collection: self.backend_collection().deref().clone(), + backend_collection: None, }; + if include_fs_info { + if let Some(fs) = self.get_default_fs_service() { + response.backend_collection = Some(fs.backend_collection().deref().clone()); + } + } serde_json::to_string(&response).map_err(DaemonError::Serde) } - fn export_backend_info(&self, mountpoint: &str) -> DaemonResult { - let fs = self - .backend_from_mountpoint(mountpoint)? 
- .ok_or(DaemonError::NotFound)?; - let any_fs = fs.deref().as_any(); - let rafs = any_fs - .downcast_ref::() - .ok_or_else(|| DaemonError::FsTypeMismatch("to rafs".to_string()))?; - let resp = serde_json::to_string(rafs.metadata()).map_err(DaemonError::Serde)?; - Ok(resp) - } - fn backend_from_mountpoint(&self, mp: &str) -> DaemonResult>> { - let r = self.get_vfs().get_rootfs(mp)?; - Ok(r) - } - fn export_inflight_ops(&self) -> DaemonResult>; - - // NOTE: This method is not thread-safe, however, it is acceptable as - // mount/umount/remount/restore_mount is invoked from single thread in FSM - fn mount(&self, cmd: FsBackendMountCmd) -> DaemonResult<()> { - if self.backend_from_mountpoint(&cmd.mountpoint)?.is_some() { - return Err(DaemonError::AlreadyExists); - } - let backend = fs_backend_factory(&cmd)?; - let index = self.get_vfs().mount(backend, &cmd.mountpoint)?; - info!("{} mounted at {}", &cmd.fs_type, &cmd.mountpoint); - self.backend_collection().add(&cmd.mountpoint, &cmd)?; - - // Add mounts opaque to UpgradeManager - if let Some(mut mgr_guard) = self.upgrade_mgr() { - upgrade::add_mounts_state(&mut mgr_guard, cmd, index)?; + fn start(&self) -> DaemonResult<()>; + fn disconnect(&self) -> DaemonResult<()>; + fn interrupt(&self) {} + fn stop(&self) -> DaemonResult<()> { + let s = self.get_state(); + if s != DaemonState::INTERRUPTED && s != DaemonState::STOPPED { + return self.on_event(DaemonStateMachineInput::Stop); } - Ok(()) } - - fn remount(&self, cmd: FsBackendMountCmd) -> DaemonResult<()> { - let rootfs = self - .backend_from_mountpoint(&cmd.mountpoint)? 
- .ok_or(DaemonError::NotFound)?; - let rafs_config = RafsConfig::from_str(&cmd.config)?; - let mut bootstrap = ::from_file(&&cmd.source)?; - let any_fs = rootfs.deref().as_any(); - let rafs = any_fs - .downcast_ref::() - .ok_or_else(|| DaemonError::FsTypeMismatch("to rafs".to_string()))?; - - rafs.update(&mut bootstrap, rafs_config) - .map_err(|e| match e { - RafsError::Unsupported => DaemonError::Unsupported, - e => DaemonError::Rafs(e), - })?; - - // To update mounted time and backend configurations. - self.backend_collection().add(&cmd.mountpoint, &cmd)?; - - // Update mounts opaque from UpgradeManager - if let Some(mut mgr_guard) = self.upgrade_mgr() { - upgrade::update_mounts_state(&mut mgr_guard, cmd)?; - } - - Ok(()) + fn wait(&self) -> DaemonResult<()>; + fn trigger_exit(&self) -> DaemonResult<()> { + self.on_event(DaemonStateMachineInput::Exit) } - fn umount(&self, cmd: FsBackendUmountCmd) -> DaemonResult<()> { - let _ = self - .backend_from_mountpoint(&cmd.mountpoint)? - .ok_or(DaemonError::NotFound)?; - self.get_vfs().umount(&cmd.mountpoint)?; - - self.backend_collection().del(&cmd.mountpoint); - - // Remove mount opaque from UpgradeManager - if let Some(mut mgr_guard) = self.upgrade_mgr() { - upgrade::remove_mounts_state(&mut mgr_guard, cmd)?; - } - + fn supervisor(&self) -> Option; + fn save(&self) -> DaemonResult<()>; + fn restore(&self) -> DaemonResult<()>; + fn trigger_takeover(&self) -> DaemonResult<()> { + self.on_event(DaemonStateMachineInput::Takeover)?; + self.on_event(DaemonStateMachineInput::Successful)?; Ok(()) } -} -/// Validate prefetch file list command line parameter. -/// -/// A string including multiple directories and regular files should be separated by white-spaces, e.g. -/// -/// And each path should be relative to rafs root, e.g. -/// /foo1/bar1 /foo2/bar2 -/// Specifying both regular file and directory simultaneously is supported. 
-fn input_prefetch_files_verify(input: &Option>) -> DaemonResult>> { - let prefetch_files: Option> = input - .as_ref() - .map(|files| files.iter().map(PathBuf::from).collect()); - - if let Some(files) = &prefetch_files { - for f in files.iter() { - if !f.starts_with(Path::new("/")) { - return Err(DaemonError::Common("Illegal prefetch list".to_string())); - } - } - } - - Ok(prefetch_files) -} - -fn fs_backend_factory(cmd: &FsBackendMountCmd) -> DaemonResult { - let prefetch_files = input_prefetch_files_verify(&cmd.prefetch_files)?; - - match cmd.fs_type { - FsBackendType::Rafs => { - let rafs_config = RafsConfig::from_str(cmd.config.as_str())?; - let mut bootstrap = ::from_file(&cmd.source)?; - let mut rafs = Rafs::new(rafs_config, &cmd.mountpoint, &mut bootstrap)?; - rafs.import(bootstrap, prefetch_files)?; - info!("Rafs imported"); - Ok(Box::new(rafs)) - } - FsBackendType::PassthroughFs => { - #[cfg(target_os = "macos")] - return Err(DaemonError::InvalidArguments(String::from( - "not support passthroughfs", - ))); - #[cfg(target_os = "linux")] - { - // Vfs by default enables no_open and writeback, passthroughfs - // needs to specify them explicitly. - // TODO(liubo): enable no_open_dir. - let fs_cfg = Config { - root_dir: cmd.source.to_string(), - do_import: false, - writeback: true, - no_open: true, - xattr: true, - ..Default::default() - }; - // TODO: Passthrough Fs needs to enlarge rlimit against host. We can exploit `MountCmd` - // `config` field to pass such a configuration into here. - let passthrough_fs = - PassthroughFs::new(fs_cfg).map_err(DaemonError::PassthroughFs)?; - passthrough_fs - .import() - .map_err(DaemonError::PassthroughFs)?; - info!("PassthroughFs imported"); - Ok(Box::new(passthrough_fs)) - } - } - } + // For backward compatibility. + fn get_default_fs_service(&self) -> Option>; } // State machine for Nydus daemon workflow. 
// -// Valid states: +// State machine for FUSE: // - `Init` means nydusd is just started and potentially configured well but not // yet negotiate with kernel the capabilities of both sides. It even does not try // to set up fuse session by mounting `/fuse/dev`(in case of `fusedev` backend). @@ -435,7 +231,7 @@ fn fs_backend_factory(cmd: &FsBackendMountCmd) -> DaemonResult { // - `Upgrading` state means the nydus daemon is being live-upgraded. There's no need // to do kernel mount again to set up a session but try to reuse a fuse fd from somewhere else. // In this state, we try to push `Successful` event to state machine to trigger state transition. -// - `Interrupt` state means nydusd has shutdown fuse server, which means no more message will +// - `Interrupted` state means nydusd has shutdown fuse server, which means no more message will // be read from kernel and handled and no pending and in-flight fuse message exists. But the // nydusd daemon should be alive and wait for coming events. // - `Die` state means the whole nydusd process is going to die. @@ -446,13 +242,13 @@ state_machine! { // FIXME: It's possible that failover does not succeed or resource is not capable to // be passed. To handle event `Stop` when being `Init`. Init => { - Mount => Running [StartService], + Start => Running [StartService], Takeover => Upgrading [Restore], Exit => Die[StopStateMachine], Stop => Die[Umount], }, Running => { - Exit => Interrupted [TerminateFuseService], + Exit => Interrupted [TerminateService], Stop => Die[Umount], }, Upgrading(Successful) => Running [StartService], @@ -460,26 +256,28 @@ state_machine! { Interrupted(Stop) => Die[StopStateMachine], } +/// Implementation of the state machine defined by `DaemonStateMachine`. 
pub struct DaemonStateMachineContext { + pid: u32, + daemon: Arc, sm: StateMachine, - daemon: Arc, - event_collector: Receiver, + request_receiver: Receiver, result_sender: Sender>, - pid: u32, } impl DaemonStateMachineContext { + /// Create a new instance of `DaemonStateMachineContext`. pub fn new( - d: Arc, - rx: Receiver, + daemon: Arc, + request_receiver: Receiver, result_sender: Sender>, ) -> Self { DaemonStateMachineContext { + pid: id(), + daemon, sm: StateMachine::new(), - daemon: d, - event_collector: rx, + request_receiver, result_sender, - pid: id(), } } @@ -490,7 +288,7 @@ impl DaemonStateMachineContext { loop { use DaemonStateMachineOutput::*; let event = self - .event_collector + .request_receiver .recv() .expect("Event channel can't be broken!"); let last = self.sm.state().clone(); @@ -522,7 +320,7 @@ impl DaemonStateMachineContext { d.set_state(DaemonState::RUNNING); r }), - TerminateFuseService => { + TerminateService => { d.interrupt(); d.set_state(DaemonState::INTERRUPTED); Ok(()) @@ -564,6 +362,7 @@ impl DaemonStateMachineContext { } } +/// Handler to process rquest from the state machine. pub trait DaemonStateMachineSubscriber { /// Event handler for state transition events. 
/// @@ -571,11 +370,10 @@ pub trait DaemonStateMachineSubscriber { fn on_event(&self, event: DaemonStateMachineInput) -> DaemonResult<()>; } -pub type Trigger = Sender; - #[cfg(test)] mod tests { use super::*; + use nydus::FsBackendType; #[test] fn it_should_convert_int_to_daemonstate() { @@ -599,81 +397,4 @@ mod tests { assert!("xxxxxxxxxxxxx".parse::().is_err()); } - - #[test] - fn it_should_add_new_backend() { - let mut col: FsBackendCollection = Default::default(); - let r = col.add( - "test", - &FsBackendMountCmd { - fs_type: FsBackendType::Rafs, - config: "{\"config\": \"test\"}".to_string(), - mountpoint: "testmonutount".to_string(), - source: "testsource".to_string(), - prefetch_files: Some(vec!["testfile".to_string()]), - }, - ); - assert!(r.is_ok(), "failed to add backend collection"); - - assert_eq!(col.0.len(), 1); - - col.del("test"); - assert_eq!(col.0.len(), 0); - } - - #[test] - fn it_should_verify_prefetch_files() { - let files = input_prefetch_files_verify(&Some(vec!["/etc/passwd".to_string()])); - assert!(files.is_ok(), "failed to verify prefetch files"); - assert_eq!(1, files.unwrap().unwrap().len()); - - assert!( - input_prefetch_files_verify(&Some(vec!["etc/passwd".to_string()])).is_err(), - "should not pass verify" - ); - } - - #[test] - fn it_should_create_rafs_backend() { - let config = r#" - { - "device": { - "backend": { - "type": "oss", - "config": { - "endpoint": "test", - "access_key_id": "test", - "access_key_secret": "test", - "bucket_name": "antsys-nydus", - "object_prefix":"nydus_v2/", - "scheme": "http" - } - } - }, - "mode": "direct", - "digest_validate": false, - "enable_xattr": true, - "fs_prefetch": { - "enable": true, - "threads_count": 10, - "merging_size": 131072, - "bandwidth_rate": 10485760 - } - }"#; - let bootstrap = "./tests/texture/bootstrap/nydusd_daemon_test_bootstrap"; - if fs_backend_factory(&FsBackendMountCmd { - fs_type: FsBackendType::Rafs, - config: config.to_string(), - mountpoint: 
"testmountpoint".to_string(), - source: bootstrap.to_string(), - prefetch_files: Some(vec!["/testfile".to_string()]), - }) - .unwrap() - .as_any() - .downcast_ref::() - .is_none() - { - panic!("failed to create rafs backend") - } - } } diff --git a/src/bin/nydusd/fs_cache.rs b/src/bin/nydusd/fs_cache.rs new file mode 100644 index 00000000000..3282d51bfcb --- /dev/null +++ b/src/bin/nydusd/fs_cache.rs @@ -0,0 +1,695 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. +// +// SPDX-License-Identifier: (Apache-2.0 AND BSD-3-Clause) + +//! Handler to cooperate with Linux fscache subsystem for blob cache. + +use std::cmp; +use std::collections::HashMap; +use std::convert::TryFrom; +use std::fs::{File, OpenOptions}; +use std::io::{Error, ErrorKind, Result, Write}; +use std::ops::Deref; +use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; +use std::ptr::read_unaligned; +use std::string::String; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::{Arc, Barrier, Mutex, MutexGuard}; + +use mio::unix::SourceFd; +use mio::{Events, Interest, Poll, Token, Waker}; +use storage::cache::BlobCache; +use storage::device::BlobPrefetchRequest; +use storage::factory::BLOB_FACTORY; + +use crate::blob_cache::{ + BlobCacheConfigBootstrap, BlobCacheConfigDataBlob, BlobCacheMgr, BlobCacheObjectConfig, +}; + +ioctl_write_int!(fscache_cread, 0x98, 1); + +/// Maximum size of fscache request message from kernel. +const MIN_DATA_BUF_SIZE: usize = 1024; +const MSG_HEADER_SIZE: usize = 16; +const MSG_OPEN_SIZE: usize = 16; +const MSG_READ_SIZE: usize = 16; + +const TOKEN_EVENT_WAKER: usize = 1; +const TOKEN_EVENT_FSCACHE: usize = 2; + +/// Command code in requests from fscache driver. 
+#[repr(u32)] +#[derive(Debug, Eq, PartialEq)] +enum FsCacheOpCode { + Open = 0, + Close = 1, + Read = 2, +} + +impl TryFrom for FsCacheOpCode { + type Error = Error; + + fn try_from(value: u32) -> std::result::Result { + match value { + 0 => Ok(FsCacheOpCode::Open), + 1 => Ok(FsCacheOpCode::Close), + 2 => Ok(FsCacheOpCode::Read), + _ => Err(einval!(format!("invalid fscache operation code {}", value))), + } + } +} + +/// Common header for request messages. +#[repr(C)] +#[derive(Debug, Eq, PartialEq)] +struct FsCacheMsgHeader { + /// Message identifier to associate reply with request by the fscache driver. + msg_id: u32, + /// Message operation code. + opcode: FsCacheOpCode, + /// Message length, including message header and message body. + len: u32, + /// A unique ID identifying the cache file operated on. + object_id: u32, +} + +impl TryFrom<&[u8]> for FsCacheMsgHeader { + type Error = Error; + + fn try_from(value: &[u8]) -> std::result::Result { + if value.len() < MSG_HEADER_SIZE { + return Err(einval!(format!( + "fscache request message size is too small, {}", + value.len() + ))); + } + + // Safe because we have verified buffer size. + let msg_id = unsafe { read_unaligned(value[0..4].as_ptr() as *const u32) }; + let opcode = unsafe { read_unaligned(value[4..8].as_ptr() as *const u32) }; + let len = unsafe { read_unaligned(value[8..12].as_ptr() as *const u32) }; + let opcode = FsCacheOpCode::try_from(opcode)?; + let object_id = unsafe { read_unaligned(value[12..16].as_ptr() as *const u32) }; + if len as usize != value.len() { + return Err(einval!(format!( + "message length {} does not match length from message header {}", + value.len(), + len + ))); + } + + Ok(FsCacheMsgHeader { + msg_id, + opcode, + len, + object_id, + }) + } +} + +/// Request message to open a file. +/// +/// The opened file should be kept valid until corresponding `CLOSE` message has been received +/// from the fscache driver. 
+#[derive(Default, Debug, Eq, PartialEq)] +struct FsCacheMsgOpen { + volume_key: String, + cookie_key: String, + fd: u32, + flags: u32, +} + +impl TryFrom<&[u8]> for FsCacheMsgOpen { + type Error = Error; + + fn try_from(value: &[u8]) -> std::result::Result { + if value.len() < MSG_OPEN_SIZE { + return Err(einval!(format!( + "fscache request message size is too small, {}", + value.len() + ))); + } + + // Safe because we have verified buffer size. + let volume_key_size = unsafe { read_unaligned(value[0..4].as_ptr() as *const u32) }; + let cookie_key_size = unsafe { read_unaligned(value[4..8].as_ptr() as *const u32) }; + let fd = unsafe { read_unaligned(value[8..12].as_ptr() as *const u32) }; + let flags = unsafe { read_unaligned(value[12..16].as_ptr() as *const u32) }; + if volume_key_size.checked_add(cookie_key_size).is_none() + || (volume_key_size + cookie_key_size) + .checked_add(MSG_OPEN_SIZE as u32) + .is_none() + { + return Err(einval!( + "invalid volume/cookie key length in fscache OPEN request" + )); + } + let total_sz = (volume_key_size + cookie_key_size) as usize + MSG_OPEN_SIZE; + if value.len() < total_sz { + return Err(einval!("invalid message length for fscache OPEN request")); + } + let pos = MSG_OPEN_SIZE + volume_key_size as usize; + let volume_key = String::from_utf8(value[MSG_OPEN_SIZE..pos].to_vec()) + .map_err(|_e| einval!("invalid volume key in fscache OPEN request"))? + .trim_end_matches('\0') + .to_string(); + let cookie_key = String::from_utf8(value[pos..pos + cookie_key_size as usize].to_vec()) + .map_err(|_e| einval!("invalid cookie key in fscache OPEN request"))?; + + Ok(FsCacheMsgOpen { + volume_key, + cookie_key, + fd, + flags, + }) + } +} + +/// Request message to feed requested data into the cache file. 
#[repr(C)]
#[derive(Default, Debug, Eq, PartialEq)]
struct FsCacheMsgRead {
    /// Offset into the cache file at which the requested range starts.
    off: u64,
    /// Number of bytes requested.
    len: u64,
}

impl TryFrom<&[u8]> for FsCacheMsgRead {
    type Error = Error;

    fn try_from(value: &[u8]) -> std::result::Result<Self, Self::Error> {
        if value.len() < MSG_READ_SIZE {
            return Err(einval!(format!(
                "fscache request message size is too small, {}",
                value.len()
            )));
        }

        // Safe because we have verified buffer size.
        let off = unsafe { read_unaligned(value[0..8].as_ptr() as *const u64) };
        let len = unsafe { read_unaligned(value[8..16].as_ptr() as *const u64) };

        Ok(FsCacheMsgRead { off, len })
    }
}

/// Pair of files backing a cached bootstrap object: the bootstrap blob itself
/// and the fscache cache file it is copied into.
struct FsCacheBootStrap {
    bootstrap_file: File,
    cache_file: File,
}

/// A cached object managed on behalf of the fscache driver: either a data
/// blob or a filesystem bootstrap (metadata) blob.
#[derive(Clone)]
enum FsCacheObject {
    DataBlob(Arc<dyn BlobCache>),
    Bootstrap(Arc<FsCacheBootStrap>),
}

/// Struct to maintain cached file objects.
// NOTE(review): the map type parameters below were reconstructed from usage
// (`get_object()` returns `Option<(FsCacheObject, u32)>` keyed by the u32
// object id; `handle_open_data_blob()` stores `Arc<BlobCacheConfigDataBlob>`).
#[derive(Default)]
struct FsCacheState {
    id_to_object_map: HashMap<u32, (FsCacheObject, u32)>,
    id_to_config_map: HashMap<u32, Arc<BlobCacheConfigDataBlob>>,
    blob_cache_mgr: Arc<BlobCacheMgr>,
}

/// Handler to cooperate with Linux fscache driver to manage cached blob objects.
///
/// The `FsCacheHandler` creates a communication channel with the Linux fscache driver,
/// configures the communication session and serves all requests from the fscache driver.
pub struct FsCacheHandler {
    /// Cleared by `stop()` to ask the event loop to exit.
    active: AtomicBool,
    /// Rendezvous between `stop()` and the event-loop thread.
    barrier: Barrier,
    /// The opened fscache character device.
    file: File,
    state: Arc<Mutex<FsCacheState>>,
    poller: Mutex<Poll>,
    waker: Arc<Waker>,
}

impl FsCacheHandler {
    /// Create a new instance of `FsCacheHandler`.
+ pub fn new( + path: &str, + dir: &str, + tag: Option<&str>, + blob_cache_mgr: Arc, + ) -> Result { + info!( + "create FsCacheHandler with dir {}, tag {}", + dir, + tag.unwrap_or("") + ); + + let mut file = OpenOptions::new() + .write(true) + .read(true) + .create(false) + .open(path)?; + let poller = + Poll::new().map_err(|_e| eother!("Failed to create poller for fscache service"))?; + let waker = Waker::new(poller.registry(), Token(TOKEN_EVENT_WAKER)) + .map_err(|_e| eother!("Failed to create waker for fscache service"))?; + poller + .registry() + .register( + &mut SourceFd(&file.as_raw_fd()), + Token(TOKEN_EVENT_FSCACHE), + Interest::READABLE, + ) + .map_err(|_e| eother!("Failed to register fd for fscache service"))?; + + // Initialize the fscache session + file.write_all(format!("dir {}", dir).as_bytes())?; + file.flush()?; + if let Some(tag) = tag { + file.write_all(format!("tag {}", tag).as_bytes())?; + file.flush()?; + } + file.write_all(b"bind ondemand")?; + file.flush()?; + + let state = FsCacheState { + id_to_object_map: Default::default(), + id_to_config_map: Default::default(), + blob_cache_mgr, + }; + + Ok(FsCacheHandler { + active: AtomicBool::new(true), + barrier: Barrier::new(2), + file, + state: Arc::new(Mutex::new(state)), + poller: Mutex::new(poller), + waker: Arc::new(waker), + }) + } + + /// Stop the fscache event loop. + pub fn stop(&self) { + self.active.store(false, Ordering::Release); + if let Err(e) = self.waker.wake() { + error!("Failed to signal fscache worker thread to exit, {}", e); + } + self.barrier.wait(); + } + + /// Run the event loop to handle all requests from kernel fscache driver. + /// + /// This method should only be invoked by a single thread, which will poll the fscache fd + /// and dispatch requests from fscache fd to other working threads. 
+ pub fn run_loop(&self) -> Result<()> { + let mut events = Events::with_capacity(64); + let mut buf = [0u8; MIN_DATA_BUF_SIZE]; + + loop { + match self.poller.lock().unwrap().poll(&mut events, None) { + Ok(_) => {} + Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue, + Err(e) => { + warn!("Failed to poll events for fscache service"); + return Err(e); + } + } + + for event in events.iter() { + if event.is_error() { + error!("Got error event for fscache poller"); + continue; + } + if event.token() == Token(TOKEN_EVENT_FSCACHE) { + if event.is_readable() { + self.handle_requests(&mut buf)?; + } + } else if event.is_readable() + && event.token() == Token(TOKEN_EVENT_WAKER) + && !self.active.load(Ordering::Acquire) + { + self.barrier.wait(); + return Ok(()); + } + } + } + } + + /// Read and process all requests from fscache driver until no data available. + fn handle_requests(&self, buf: &mut [u8]) -> Result<()> { + loop { + let ret = unsafe { + libc::read( + self.file.as_raw_fd(), + buf.as_ptr() as *mut u8 as *mut libc::c_void, + buf.len(), + ) + }; + match ret { + 0 => return Ok(()), + _i if _i > 0 => self.handle_one_request(&buf[0..ret as usize])?, + _ => { + let err = Error::last_os_error(); + match err.kind() { + ErrorKind::Interrupted => continue, + ErrorKind::WouldBlock => return Ok(()), + _ => return Err(err), + } + } + } + } + } + + fn handle_one_request(&self, buf: &[u8]) -> Result<()> { + let hdr = FsCacheMsgHeader::try_from(buf)?; + let buf = &buf[MSG_HEADER_SIZE..]; + + match hdr.opcode { + FsCacheOpCode::Open => { + let msg = FsCacheMsgOpen::try_from(buf)?; + self.handle_open_request(&hdr, &msg); + } + FsCacheOpCode::Close => { + self.handle_close_request(&hdr); + } + FsCacheOpCode::Read => { + let msg = FsCacheMsgRead::try_from(buf)?; + self.handle_read_request(&hdr, &msg); + } + } + + Ok(()) + } + + fn handle_open_request(&self, hdr: &FsCacheMsgHeader, msg: &FsCacheMsgOpen) { + // Drop the 'erofs,' prefix if any + let domain_id = match 
msg.volume_key.clone().strip_prefix("erofs,") { + None => msg.volume_key.clone(), + Some(str) => str.to_string(), + }; + let key = domain_id + "-" + &msg.cookie_key; + let msg = match self.get_config(&key) { + None => { + unsafe { + libc::close(msg.fd as i32); + } + format!("copen {},{}", hdr.msg_id, -libc::ENOENT) + } + Some(cfg) => match cfg { + BlobCacheObjectConfig::DataBlob(config) => { + self.handle_open_data_blob(hdr, msg, config) + } + BlobCacheObjectConfig::Bootstrap(config) => { + self.handle_open_bootstrap(hdr, msg, config) + } + }, + }; + self.reply(&msg); + } + + fn handle_open_data_blob( + &self, + hdr: &FsCacheMsgHeader, + msg: &FsCacheMsgOpen, + config: Arc, + ) -> String { + let mut state = self.state.lock().unwrap(); + + use std::collections::hash_map::Entry::Vacant; + if let Vacant(e) = state.id_to_object_map.entry(hdr.object_id) { + match self.create_data_blob_object(&config, msg.fd) { + Err(s) => format!("copen {},{}", hdr.msg_id, s), + Ok((blob, blob_size)) => { + e.insert((FsCacheObject::DataBlob(blob.clone()), msg.fd)); + state.id_to_config_map.insert(hdr.object_id, config.clone()); + let _ = self.do_prefetch(&config, blob); + format!("copen {},{}", hdr.msg_id, blob_size) + } + } + } else { + unsafe { + libc::close(msg.fd as i32); + }; + format!("copen {},{}", hdr.msg_id, -libc::EALREADY) + } + } + + pub fn do_prefetch(&self, config: &BlobCacheConfigDataBlob, blob: Arc) { + let blob_info = config.blob_info().deref(); + let factory_config = config.factory_config().deref(); + if !factory_config.cache.prefetch_config.enable { + return; + } + let size = match factory_config + .cache + .prefetch_config + .merging_size + .checked_next_power_of_two() + { + None => rafs::fs::default_merging_size() as u64, + Some(1) => rafs::fs::default_merging_size() as u64, + Some(s) => s as u64, + }; + let blob_size = blob_info.compressed_size(); + let count = (blob_size + size - 1) / size; + let mut blob_req = Vec::with_capacity(count as usize); + let mut 
pre_offset = 0u64; + for _i in 0..count { + blob_req.push(BlobPrefetchRequest { + blob_id: blob_info.blob_id().to_owned(), + offset: pre_offset, + len: cmp::min(size, blob_size - pre_offset), + }); + pre_offset += size; + if pre_offset > blob_size { + break; + } + } + info!("blob prefetch start"); + let _ = std::thread::spawn(move || { + let _ = blob + .prefetch(blob.clone(), &blob_req, &[]) + .map_err(|_e| eio!("failed to prefetch blob data")); + let _ = blob.stop_prefetch(); + }); + } + + /// The `fscache` factory essentially creates a namespace for blob objects cached by the + /// fscache subsystem. The data blob files will be managed the in kernel fscache driver, + /// the chunk map file will be managed by the userspace daemon. We need to figure out the + /// way to share blob/chunkamp files with filecache manager. + fn create_data_blob_object( + &self, + config: &BlobCacheConfigDataBlob, + fd: u32, + ) -> std::result::Result<(Arc, u64), i32> { + let mut blob_info = config.blob_info().deref().clone(); + // `BlobInfo` from the configuration cache should not have fscache file associated with it. + assert!(blob_info.get_fscache_file().is_none()); + + // Safe because we trust the kernel fscache driver. 
+ let file = unsafe { File::from_raw_fd(fd as RawFd) }; + blob_info.set_fscache_file(Some(Arc::new(file))); + let blob_ref = Arc::new(blob_info); + + match BLOB_FACTORY.new_blob_cache(config.factory_config(), &blob_ref) { + Err(_e) => Err(-libc::ENOENT), + Ok(blob) => { + let blob_size = match blob.blob_size() { + Err(_e) => return Err(-libc::EIO), + Ok(v) => v, + }; + Ok((blob, blob_size)) + } + } + } + + fn handle_open_bootstrap( + &self, + hdr: &FsCacheMsgHeader, + msg: &FsCacheMsgOpen, + config: Arc, + ) -> String { + let mut state = self.get_state(); + use std::collections::hash_map::Entry::Vacant; + let ret: i64 = if let Vacant(e) = state.id_to_object_map.entry(hdr.object_id) { + match OpenOptions::new().read(true).open(config.path()) { + Err(e) => { + warn!( + "Failed to open bootstrap file {}, {}", + config.path().display(), + e + ); + -libc::ENOENT as i64 + } + Ok(f) => match f.metadata() { + Err(e) => { + warn!( + "Failed to open bootstrap file {}, {}", + config.path().display(), + e + ); + -libc::ENOENT as i64 + } + Ok(md) => { + let cache_file = unsafe { File::from_raw_fd(msg.fd as RawFd) }; + let object = FsCacheObject::Bootstrap(Arc::new(FsCacheBootStrap { + bootstrap_file: f, + cache_file, + })); + e.insert((object, msg.fd)); + md.len() as i64 + } + }, + } + } else { + -libc::EALREADY as i64 + }; + + if ret < 0 { + unsafe { + libc::close(msg.fd as i32); + } + } + format!("copen {},{}", hdr.msg_id, ret) + } + + fn handle_close_request(&self, hdr: &FsCacheMsgHeader) { + let mut state = self.get_state(); + + if let Some((FsCacheObject::DataBlob(blob), _)) = + state.id_to_object_map.remove(&hdr.object_id) + { + let config = state.id_to_config_map.remove(&hdr.object_id).unwrap(); + BLOB_FACTORY.gc(Some((config.factory_config(), blob.blob_id()))); + } + } + + fn handle_read_request(&self, hdr: &FsCacheMsgHeader, msg: &FsCacheMsgRead) { + let fd: u32; + match self.get_object(hdr.object_id) { + None => { + warn!("No cached file object found for obj_id {}", 
hdr.object_id); + return; + } + Some((FsCacheObject::DataBlob(blob), u)) => { + fd = u; + match blob.get_blob_object() { + None => { + warn!( + "Internal error: blob object used by fscache is not BlobCache objects" + ) + } + Some(obj) => match obj.fetch_range_uncompressed(msg.off, msg.len) { + Ok(v) if v == msg.len as usize => {} + _ => debug!("Failed to read data from blob object"), + }, + } + } + Some((FsCacheObject::Bootstrap(bs), u)) => { + // TODO: should we feed the bootstrap at together to improve performance? + fd = u; + let base = unsafe { + libc::mmap( + std::ptr::null_mut(), + msg.len as usize, + libc::PROT_READ, + libc::MAP_SHARED, + bs.bootstrap_file.as_raw_fd(), + msg.off as libc::off_t, + ) + }; + if base == libc::MAP_FAILED { + warn!( + "Failed to mmap bootstrap file, {}", + std::io::Error::last_os_error() + ); + } else { + let ret = unsafe { + libc::pwrite( + bs.cache_file.as_raw_fd(), + base, + msg.len as usize, + msg.off as libc::off_t, + ) + }; + let _ = unsafe { libc::munmap(base, msg.len as usize) }; + if ret < 0 { + warn!( + "Failed to write bootstrap blob data to cached file, {}", + std::io::Error::last_os_error() + ); + } + } + } + } + unsafe { fscache_cread(fd as i32, hdr.msg_id as u64).unwrap() }; + } + + #[inline] + fn reply(&self, result: &str) { + // Safe because the fd and data buffer are valid. And we trust the fscache driver which + // will never return error for write operations. 
+ let ret = unsafe { + libc::write( + self.file.as_raw_fd(), + result.as_bytes().as_ptr() as *const u8 as *const libc::c_void, + result.len(), + ) + }; + if ret as usize != result.len() { + warn!( + "Failed to reply \"{}\", {}", + result, + std::io::Error::last_os_error() + ); + } + } + + #[inline] + fn get_state(&self) -> MutexGuard { + self.state.lock().unwrap() + } + + #[inline] + fn get_object(&self, object_id: u32) -> Option<(FsCacheObject, u32)> { + self.get_state().id_to_object_map.get(&object_id).cloned() + } + + #[inline] + fn get_config(&self, key: &str) -> Option { + self.get_state().blob_cache_mgr.get_config(key) + } +} + +impl AsRawFd for FsCacheHandler { + fn as_raw_fd(&self) -> RawFd { + self.file.as_raw_fd() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_op_code() { + assert_eq!(FsCacheOpCode::try_from(0).unwrap(), FsCacheOpCode::Open); + assert_eq!(FsCacheOpCode::try_from(1).unwrap(), FsCacheOpCode::Close); + assert_eq!(FsCacheOpCode::try_from(2).unwrap(), FsCacheOpCode::Read); + FsCacheOpCode::try_from(3).unwrap_err(); + } + + #[test] + fn test_msg_header() { + let hdr = FsCacheMsgHeader::try_from( + vec![1u8, 0, 0, 0, 2, 0, 0, 0, 17, 0, 0, 0, 2u8, 0, 0, 0, 0].as_slice(), + ) + .unwrap(); + assert_eq!(hdr.msg_id, 0x1); + assert_eq!(hdr.opcode, FsCacheOpCode::Read); + assert_eq!(hdr.len, 17); + assert_eq!(hdr.object_id, 0x2); + + FsCacheMsgHeader::try_from(vec![0u8, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 13, 0].as_slice()) + .unwrap_err(); + FsCacheMsgHeader::try_from(vec![0u8, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 13].as_slice()) + .unwrap_err(); + FsCacheMsgHeader::try_from(vec![0u8, 0, 0, 1, 0, 0, 0, 2, 0, 0].as_slice()).unwrap_err(); + FsCacheMsgHeader::try_from(vec![].as_slice()).unwrap_err(); + } +} diff --git a/src/bin/nydusd/fs_service.rs b/src/bin/nydusd/fs_service.rs new file mode 100644 index 00000000000..4b242d15e13 --- /dev/null +++ b/src/bin/nydusd/fs_service.rs @@ -0,0 +1,317 @@ +// Copyright (C) 2020-2022 Alibaba Cloud. 
All rights reserved. +// Copyright 2020 Ant Group. All rights reserved. +// Copyright 2019 Intel Corporation. All Rights Reserved. +// +// SPDX-License-Identifier: (Apache-2.0 AND BSD-3-Clause) + +use std::collections::HashMap; +use std::ops::Deref; +use std::path::PathBuf; +use std::str::FromStr; +use std::sync::{Arc, MutexGuard}; + +use fuse_backend_rs::api::{BackendFileSystem, Vfs}; +#[cfg(target_os = "linux")] +use fuse_backend_rs::passthrough::{Config, PassthroughFs}; +use nydus::{FsBackendDesc, FsBackendType}; +use rafs::fs::{Rafs, RafsConfig}; +use rafs::{trim_backend_config, RafsError, RafsIoRead}; +use serde::{self, Deserialize, Serialize}; + +use crate::daemon::DaemonResult; +use crate::upgrade::{self, UpgradeManager}; +use crate::DaemonError; + +//TODO: Try to public below type from fuse-rs thus no need to redefine it here. +type BackFileSystem = Box + Send + Sync>; + +/// Command to mount a filesystem. +#[derive(Clone)] +pub struct FsBackendMountCmd { + pub fs_type: FsBackendType, + pub source: String, + pub config: String, + pub mountpoint: String, + pub prefetch_files: Option>, +} + +/// Command to unmount a filesystem. +#[derive(Clone, Deserialize, Serialize, Debug)] +pub struct FsBackendUmountCmd { + pub mountpoint: String, +} + +/// List of filesystem backend information. +#[derive(Default, Serialize, Clone)] +pub struct FsBackendCollection(HashMap); + +impl FsBackendCollection { + pub fn add(&mut self, id: &str, cmd: &FsBackendMountCmd) -> DaemonResult<()> { + // We only wash Rafs backend now. + let fs_config = match cmd.fs_type { + FsBackendType::Rafs => { + let mut config: serde_json::Value = + serde_json::from_str(&cmd.config).map_err(DaemonError::Serde)?; + trim_backend_config!( + config, + "access_key_id", + "access_key_secret", + "auth", + "token" + ); + Some(config) + } + FsBackendType::PassthroughFs => { + // Passthrough Fs has no config ever input. 
+ None + } + }; + + let desc = FsBackendDesc { + backend_type: cmd.fs_type.clone(), + mountpoint: cmd.mountpoint.clone(), + mounted_time: chrono::Local::now(), + config: fs_config, + }; + + self.0.insert(id.to_string(), desc); + + Ok(()) + } + + pub fn del(&mut self, id: &str) { + self.0.remove(id); + } +} + +/// Define services provided by a filesystem provider. +pub trait FsService: Send + Sync { + fn get_vfs(&self) -> &Vfs; + fn upgrade_mgr(&self) -> Option>; + fn backend_collection(&self) -> MutexGuard; + + // NOTE: This method is not thread-safe, however, it is acceptable as + // mount/umount/remount/restore_mount is invoked from single thread in FSM + fn mount(&self, cmd: FsBackendMountCmd) -> DaemonResult<()> { + if self.backend_from_mountpoint(&cmd.mountpoint)?.is_some() { + return Err(DaemonError::AlreadyExists); + } + let backend = fs_backend_factory(&cmd)?; + let index = self.get_vfs().mount(backend, &cmd.mountpoint)?; + info!("{} mounted at {}", &cmd.fs_type, &cmd.mountpoint); + self.backend_collection().add(&cmd.mountpoint, &cmd)?; + + // Add mounts opaque to UpgradeManager + if let Some(mut mgr_guard) = self.upgrade_mgr() { + upgrade::add_mounts_state(&mut mgr_guard, cmd, index)?; + } + + Ok(()) + } + + fn remount(&self, cmd: FsBackendMountCmd) -> DaemonResult<()> { + let rootfs = self + .backend_from_mountpoint(&cmd.mountpoint)? + .ok_or(DaemonError::NotFound)?; + let rafs_config = RafsConfig::from_str(&cmd.config)?; + let mut bootstrap = ::from_file(&&cmd.source)?; + let any_fs = rootfs.deref().as_any(); + let rafs = any_fs + .downcast_ref::() + .ok_or_else(|| DaemonError::FsTypeMismatch("to rafs".to_string()))?; + + rafs.update(&mut bootstrap, rafs_config) + .map_err(|e| match e { + RafsError::Unsupported => DaemonError::Unsupported, + e => DaemonError::Rafs(e), + })?; + + // To update mounted time and backend configurations. 
+ self.backend_collection().add(&cmd.mountpoint, &cmd)?; + + // Update mounts opaque from UpgradeManager + if let Some(mut mgr_guard) = self.upgrade_mgr() { + upgrade::update_mounts_state(&mut mgr_guard, cmd)?; + } + + Ok(()) + } + + fn umount(&self, cmd: FsBackendUmountCmd) -> DaemonResult<()> { + let _ = self + .backend_from_mountpoint(&cmd.mountpoint)? + .ok_or(DaemonError::NotFound)?; + + self.get_vfs().umount(&cmd.mountpoint)?; + self.backend_collection().del(&cmd.mountpoint); + if let Some(mut mgr_guard) = self.upgrade_mgr() { + // Remove mount opaque from UpgradeManager + upgrade::remove_mounts_state(&mut mgr_guard, cmd)?; + } + + Ok(()) + } + + fn backend_from_mountpoint(&self, mp: &str) -> DaemonResult>> { + self.get_vfs().get_rootfs(mp).map_err(|e| e.into()) + } + + fn export_backend_info(&self, mountpoint: &str) -> DaemonResult { + let fs = self + .backend_from_mountpoint(mountpoint)? + .ok_or(DaemonError::NotFound)?; + let any_fs = fs.deref().as_any(); + let rafs = any_fs + .downcast_ref::() + .ok_or_else(|| DaemonError::FsTypeMismatch("to rafs".to_string()))?; + let resp = serde_json::to_string(rafs.metadata()).map_err(DaemonError::Serde)?; + Ok(resp) + } + fn export_inflight_ops(&self) -> DaemonResult>; +} + +/// Validate prefetch file list from user input. +/// +/// Validation rules: +/// - an item may be file or directroy. +/// - items must be separated by space, such as " ". +/// - each item must be absolute path, such as "/foo1/bar1 /foo2/bar2". 
+fn validate_prefetch_file_list(input: &Option>) -> DaemonResult>> { + if let Some(list) = input { + let list: Vec = list.iter().map(PathBuf::from).collect(); + for elem in list.iter() { + if !elem.is_absolute() { + return Err(DaemonError::Common("Illegal prefetch list".to_string())); + } + } + Ok(Some(list)) + } else { + Ok(None) + } +} + +fn fs_backend_factory(cmd: &FsBackendMountCmd) -> DaemonResult { + let prefetch_files = validate_prefetch_file_list(&cmd.prefetch_files)?; + + match cmd.fs_type { + FsBackendType::Rafs => { + let rafs_config = RafsConfig::from_str(cmd.config.as_str())?; + let mut bootstrap = ::from_file(&cmd.source)?; + let mut rafs = Rafs::new(rafs_config, &cmd.mountpoint, &mut bootstrap)?; + rafs.import(bootstrap, prefetch_files)?; + info!("Rafs imported"); + Ok(Box::new(rafs)) + } + FsBackendType::PassthroughFs => { + #[cfg(target_os = "macos")] + return Err(DaemonError::InvalidArguments(String::from( + "not support passthroughfs", + ))); + #[cfg(target_os = "linux")] + { + // Vfs by default enables no_open and writeback, passthroughfs + // needs to specify them explicitly. + // TODO(liubo): enable no_open_dir. + let fs_cfg = Config { + root_dir: cmd.source.to_string(), + do_import: false, + writeback: true, + no_open: true, + xattr: true, + ..Default::default() + }; + // TODO: Passthrough Fs needs to enlarge rlimit against host. We can exploit `MountCmd` + // `config` field to pass such a configuration into here. 
+ let passthrough_fs = + PassthroughFs::new(fs_cfg).map_err(DaemonError::PassthroughFs)?; + passthrough_fs + .import() + .map_err(DaemonError::PassthroughFs)?; + info!("PassthroughFs imported"); + Ok(Box::new(passthrough_fs)) + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn it_should_add_new_backend() { + let mut col: FsBackendCollection = Default::default(); + let r = col.add( + "test", + &FsBackendMountCmd { + fs_type: FsBackendType::Rafs, + config: "{\"config\": \"test\"}".to_string(), + mountpoint: "testmonutount".to_string(), + source: "testsource".to_string(), + prefetch_files: Some(vec!["testfile".to_string()]), + }, + ); + assert!(r.is_ok(), "failed to add backend collection"); + + assert_eq!(col.0.len(), 1); + + col.del("test"); + assert_eq!(col.0.len(), 0); + } + + #[test] + fn it_should_verify_prefetch_files() { + let files = validate_prefetch_file_list(&Some(vec!["/etc/passwd".to_string()])); + assert!(files.is_ok(), "failed to verify prefetch files"); + assert_eq!(1, files.unwrap().unwrap().len()); + + assert!( + validate_prefetch_file_list(&Some(vec!["etc/passwd".to_string()])).is_err(), + "should not pass verify" + ); + } + + #[test] + fn it_should_create_rafs_backend() { + let config = r#" + { + "device": { + "backend": { + "type": "oss", + "config": { + "endpoint": "test", + "access_key_id": "test", + "access_key_secret": "test", + "bucket_name": "antsys-nydus", + "object_prefix":"nydus_v2/", + "scheme": "http" + } + } + }, + "mode": "direct", + "digest_validate": false, + "enable_xattr": true, + "fs_prefetch": { + "enable": true, + "threads_count": 10, + "merging_size": 131072, + "bandwidth_rate": 10485760 + } + }"#; + let bootstrap = "./tests/texture/bootstrap/nydusd_daemon_test_bootstrap"; + if fs_backend_factory(&FsBackendMountCmd { + fs_type: FsBackendType::Rafs, + config: config.to_string(), + mountpoint: "testmountpoint".to_string(), + source: bootstrap.to_string(), + prefetch_files: 
Some(vec!["/testfile".to_string()]), + }) + .unwrap() + .as_any() + .downcast_ref::() + .is_none() + { + panic!("failed to create rafs backend") + } + } +} diff --git a/src/bin/nydusd/fusedev.rs b/src/bin/nydusd/fusedev.rs index 7957c2ae5ea..f913241b20d 100644 --- a/src/bin/nydusd/fusedev.rs +++ b/src/bin/nydusd/fusedev.rs @@ -12,14 +12,13 @@ use std::ops::Deref; use std::os::linux::fs::MetadataExt; #[cfg(target_os = "linux")] use std::os::unix::ffi::OsStrExt; - #[cfg(target_os = "macos")] use std::os::unix::fs::MetadataExt; use std::os::unix::net::UnixStream; use std::path::Path; use std::sync::{ atomic::{AtomicI32, AtomicU64, Ordering}, - mpsc::{channel, Receiver}, + mpsc::{channel, Receiver, Sender}, Arc, Mutex, MutexGuard, }; use std::thread::{self, JoinHandle}; @@ -29,8 +28,7 @@ use fuse_backend_rs::abi::fuse_abi::{InHeader, OutHeader}; use fuse_backend_rs::api::server::{MetricsHook, Server}; use fuse_backend_rs::api::Vfs; use fuse_backend_rs::transport::fusedev::{FuseChannel, FuseSession}; -#[cfg(target_os = "macos")] -use libc::statfs; +use mio::Waker; #[cfg(target_os = "linux")] use nix::sys::stat::{major, minor}; use nydus_app::BuildTimeInfo; @@ -38,10 +36,11 @@ use serde::Serialize; use crate::daemon::{ DaemonError, DaemonResult, DaemonState, DaemonStateMachineContext, DaemonStateMachineInput, - DaemonStateMachineSubscriber, FsBackendCollection, FsBackendMountCmd, NydusDaemon, Trigger, + DaemonStateMachineSubscriber, NydusDaemon, }; -use crate::exit_daemon; +use crate::fs_service::{FsBackendCollection, FsBackendMountCmd, FsService}; use crate::upgrade::{self, FailoverPolicy, UpgradeManager}; +use crate::DAEMON_CONTROLLER; #[derive(Serialize)] struct FuseOp { @@ -144,40 +143,122 @@ impl FuseServer { } } -pub struct FusedevDaemon { +pub struct FusedevFsService { /// Fuse connection ID which usually equals to `st_dev` pub conn: AtomicU64, pub failover_policy: FailoverPolicy, pub session: Mutex, - bti: BuildTimeInfo, - id: Option, - supervisor: Option, - vfs: 
Arc, - threads_cnt: u32, - - state: AtomicI32, server: Arc>>, upgrade_mgr: Option>, + vfs: Arc, backend_collection: Mutex, inflight_ops: Mutex>, +} + +impl FusedevFsService { + fn new( + vfs: Arc, + mnt: &Path, + supervisor: Option<&String>, + fp: FailoverPolicy, + readonly: bool, + ) -> Result { + let session = FuseSession::new(mnt, "rafs", "", readonly)?; + let upgrade_mgr = supervisor + .as_ref() + .map(|s| Mutex::new(UpgradeManager::new(s.to_string().into()))); + + Ok(FusedevFsService { + vfs: vfs.clone(), + conn: AtomicU64::new(0), + failover_policy: fp, + session: Mutex::new(session), + server: Arc::new(Server::new(vfs)), + upgrade_mgr, + + backend_collection: Default::default(), + inflight_ops: Mutex::new(Vec::new()), + }) + } + + fn create_fuse_server(&self) -> Result { + FuseServer::new(self.server.clone(), self.session.lock().unwrap().deref()) + } + + fn create_inflight_op(&self) -> FuseOpWrapper { + let inflight_op = FuseOpWrapper::default(); + + // "Not expected poisoned lock" + self.inflight_ops.lock().unwrap().push(inflight_op.clone()); + + inflight_op + } + + fn disconnect(&self) -> DaemonResult<()> { + let mut session = self.session.lock().expect("Not expect poisoned lock."); + session.umount().map_err(DaemonError::SessionShutdown)?; + session.wake().map_err(DaemonError::SessionShutdown)?; + Ok(()) + } +} + +impl FsService for FusedevFsService { + #[inline] + fn get_vfs(&self) -> &Vfs { + &self.vfs + } + + #[inline] + fn upgrade_mgr(&self) -> Option> { + self.upgrade_mgr.as_ref().map(|mgr| mgr.lock().unwrap()) + } + + fn backend_collection(&self) -> MutexGuard { + self.backend_collection.lock().unwrap() + } + + fn export_inflight_ops(&self) -> DaemonResult> { + let ops = self.inflight_ops.lock().unwrap(); + + let r = ops + .iter() + .filter(|w| w.op.lock().unwrap().is_some()) + .map(|w| &w.op) + .collect::>>>>(); + + if r.is_empty() { + Ok(None) + } else { + let resp = serde_json::to_string(&r).map_err(DaemonError::Serde)?; + Ok(Some(resp)) + } + } 
+} + +pub struct FusedevDaemon { + bti: BuildTimeInfo, + id: Option, + request_sender: Arc>>, result_receiver: Mutex>>, - trigger: Arc>, + service: Arc, + state: AtomicI32, + supervisor: Option, + threads_cnt: u32, threads: Mutex>>>, } impl FusedevDaemon { - fn kick_one_server(&self) -> Result<()> { - let mut s = FuseServer::new(self.server.clone(), self.session.lock().unwrap().deref())?; - - let inflight_op = self.create_inflight_op(); + fn kick_one_server(&self, waker: Arc) -> Result<()> { + let mut s = self.service.create_fuse_server()?; + let inflight_op = self.service.create_inflight_op(); let thread = thread::Builder::new() .name("fuse_server".to_string()) .spawn(move || { let _ = s.svc_loop(&inflight_op); - // quit the daemon if any fuse server thread exits - exit_daemon(); + // Notify the daemon controller that one working thread has exited. + let _ = waker.wake(); Ok(()) }) .map_err(DaemonError::ThreadSpawn)?; @@ -186,20 +267,11 @@ impl FusedevDaemon { Ok(()) } - - fn create_inflight_op(&self) -> FuseOpWrapper { - let inflight_op = FuseOpWrapper::default(); - - // "Not expected poisoned lock" - self.inflight_ops.lock().unwrap().push(inflight_op.clone()); - - inflight_op - } } impl DaemonStateMachineSubscriber for FusedevDaemon { fn on_event(&self, event: DaemonStateMachineInput) -> DaemonResult<()> { - self.trigger + self.request_sender .lock() .unwrap() .send(event) @@ -219,16 +291,52 @@ impl NydusDaemon for FusedevDaemon { self } + #[inline] + fn id(&self) -> Option { + self.id.clone() + } + + #[inline] + fn get_state(&self) -> DaemonState { + self.state.load(Ordering::Relaxed).into() + } + + #[inline] + fn set_state(&self, state: DaemonState) { + self.state.store(state as i32, Ordering::Relaxed); + } + + fn version(&self) -> BuildTimeInfo { + self.bti.clone() + } + fn start(&self) -> DaemonResult<()> { info!("start {} fuse servers", self.threads_cnt); for _ in 0..self.threads_cnt { - self.kick_one_server() + let waker = DAEMON_CONTROLLER.alloc_waker(); 
+ self.kick_one_server(waker) .map_err(|e| DaemonError::StartService(format!("{:?}", e)))?; } Ok(()) } + fn disconnect(&self) -> DaemonResult<()> { + self.service.disconnect() + } + + #[inline] + fn interrupt(&self) { + let session = self + .service + .session + .lock() + .expect("Not expect poisoned lock."); + if let Err(e) = session.wake().map_err(DaemonError::SessionShutdown) { + error!("stop fuse service thread failed: {:?}", e); + } + } + fn wait(&self) -> DaemonResult<()> { loop { let handle = self.threads.lock().unwrap().pop(); @@ -251,41 +359,11 @@ impl NydusDaemon for FusedevDaemon { Ok(()) } - fn disconnect(&self) -> DaemonResult<()> { - let mut session = self.session.lock().expect("Not expect poisoned lock."); - session.umount().map_err(DaemonError::SessionShutdown)?; - session.wake().map_err(DaemonError::SessionShutdown)?; - Ok(()) - } - - #[inline] - fn id(&self) -> Option { - self.id.clone() - } - #[inline] fn supervisor(&self) -> Option { self.supervisor.clone() } - #[inline] - fn interrupt(&self) { - let session = self.session.lock().expect("Not expect poisoned lock."); - if let Err(e) = session.wake().map_err(DaemonError::SessionShutdown) { - error!("stop fuse service thread failed: {:?}", e); - } - } - - #[inline] - fn set_state(&self, state: DaemonState) { - self.state.store(state as i32, Ordering::Relaxed); - } - - #[inline] - fn get_state(&self) -> DaemonState { - self.state.load(Ordering::Relaxed).into() - } - fn save(&self) -> DaemonResult<()> { upgrade::fusedev_upgrade::save(self) } @@ -294,39 +372,8 @@ impl NydusDaemon for FusedevDaemon { upgrade::fusedev_upgrade::restore(self) } - #[inline] - fn get_vfs(&self) -> &Vfs { - &self.vfs - } - - #[inline] - fn upgrade_mgr(&self) -> Option> { - self.upgrade_mgr.as_ref().map(|mgr| mgr.lock().unwrap()) - } - - fn backend_collection(&self) -> MutexGuard { - self.backend_collection.lock().unwrap() - } - - fn version(&self) -> BuildTimeInfo { - self.bti.clone() - } - - fn export_inflight_ops(&self) -> 
DaemonResult> { - let ops = self.inflight_ops.lock().unwrap(); - - let r = ops - .iter() - .filter(|w| w.op.lock().unwrap().is_some()) - .map(|w| &w.op) - .collect::>>>>(); - - if r.is_empty() { - Ok(None) - } else { - let resp = serde_json::to_string(&r).map_err(DaemonError::Serde)?; - Ok(Some(resp)) - } + fn get_default_fs_service(&self) -> Option> { + Some(self.service.clone()) } } @@ -337,11 +384,11 @@ fn is_mounted(mp: impl AsRef) -> Result { .to_str() .ok_or_else(|| Error::from_raw_os_error(libc::EINVAL))?; let mp = CString::new(String::from(mp)).map_err(|_| Error::from_raw_os_error(libc::EINVAL))?; - let mut mpb: Vec = Vec::new(); + let mut mpb: Vec = Vec::new(); let mut mpb_ptr = mpb.as_mut_ptr(); let mpb_ptr = &mut mpb_ptr; - let mpb: Vec = unsafe { + let mpb: Vec = unsafe { let res = libc::getmntinfo(mpb_ptr, libc::MNT_NOWAIT); if res < 0 { return Err(Error::from_raw_os_error(res)); @@ -434,7 +481,7 @@ fn calc_fuse_conn(mp: impl AsRef) -> Result { } #[allow(clippy::too_many_arguments)] -pub fn create_nydus_daemon( +pub fn create_fuse_daemon( mountpoint: &str, vfs: Arc, supervisor: Option, @@ -446,40 +493,23 @@ pub fn create_nydus_daemon( fp: FailoverPolicy, mount_cmd: Option, bti: BuildTimeInfo, -) -> Result> { +) -> Result> { let mnt = Path::new(mountpoint).canonicalize()?; - let session = FuseSession::new(&mnt, "rafs", "", readonly)?; - - // Create upgrade manager - let upgrade_mgr = supervisor - .as_ref() - .map(|s| Mutex::new(UpgradeManager::new(s.to_string().into()))); - let (trigger, events_rx) = channel::(); let (result_sender, result_receiver) = channel::>(); - + let service = FusedevFsService::new(vfs, &mnt, supervisor.as_ref(), fp, readonly)?; let daemon = Arc::new(FusedevDaemon { - conn: AtomicU64::new(0), - failover_policy: fp, - session: Mutex::new(session), - bti, id, supervisor, threads_cnt, - vfs: vfs.clone(), state: AtomicI32::new(DaemonState::INIT as i32), - server: Arc::new(Server::new(vfs)), - upgrade_mgr, - - backend_collection: 
Default::default(), - inflight_ops: Mutex::new(Vec::new()), result_receiver: Mutex::new(result_receiver), - trigger: Arc::new(Mutex::new(trigger)), + request_sender: Arc::new(Mutex::new(trigger)), + service: Arc::new(service), threads: Mutex::new(Vec::new()), }); - let machine = DaemonStateMachineContext::new(daemon.clone(), events_rx, result_sender); let machine_thread = machine.kick_state_machine()?; daemon.threads.lock().unwrap().push(machine_thread); @@ -490,13 +520,16 @@ pub fn create_nydus_daemon( || api_sock.is_none() { if let Some(cmd) = mount_cmd { - daemon.mount(cmd)?; + daemon.service.mount(cmd)?; } - daemon.session.lock().unwrap().mount()?; + daemon.service.session.lock().unwrap().mount()?; daemon - .on_event(DaemonStateMachineInput::Mount) + .on_event(DaemonStateMachineInput::Start) .map_err(|e| eother!(e))?; - daemon.conn.store(calc_fuse_conn(mnt)?, Ordering::Relaxed); + daemon + .service + .conn + .store(calc_fuse_conn(mnt)?, Ordering::Relaxed); } Ok(daemon) diff --git a/src/bin/nydusd/main.rs b/src/bin/nydusd/main.rs index edf56407bf6..0459faf61d1 100644 --- a/src/bin/nydusd/main.rs +++ b/src/bin/nydusd/main.rs @@ -4,13 +4,16 @@ // // SPDX-License-Identifier: (Apache-2.0 AND BSD-3-Clause) #![deny(warnings)] +#![allow(dead_code)] #[macro_use(crate_version)] extern crate clap; #[macro_use] extern crate log; #[macro_use] extern crate lazy_static; -extern crate serde_json; +#[cfg(target_os = "linux")] +#[macro_use] +extern crate nix; #[macro_use] extern crate nydus_error; extern crate nydus_rafs as rafs; @@ -19,35 +22,36 @@ extern crate nydus_storage as storage; #[cfg(feature = "fusedev")] use std::convert::TryInto; use std::io::{Error, ErrorKind, Result}; -use std::ops::{Deref, DerefMut}; -use std::process; -use std::sync::{mpsc::channel, Arc, Mutex}; -use std::thread; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::{Arc, Mutex}; -use clap::{App, Arg, ArgMatches}; +use clap::{App, Arg, ArgMatches, SubCommand, Values}; use 
fuse_backend_rs::api::{Vfs, VfsOptions}; -use mio::Waker; +use mio::{Events, Poll, Token, Waker}; use nix::sys::signal; use rlimit::Resource; use nydus::FsBackendType; -use nydus_api::http::start_http_thread; use nydus_app::{dump_program_info, setup_logging, BuildTimeInfo}; -use self::api_server_glue::{ApiServer, ApiSeverSubscriber}; -use self::daemon::{DaemonError, FsBackendMountCmd, NydusDaemon}; +use crate::api_server_glue::ApiServerController; +use crate::blob_cache::BlobCacheMgr; +use crate::daemon::{DaemonError, NydusDaemon}; +use crate::fs_service::{FsBackendMountCmd, FsService}; +use crate::service_controller::create_daemon; -#[cfg(feature = "virtiofs")] -mod virtiofs; -#[cfg(feature = "virtiofs")] -use self::virtiofs::create_nydus_daemon; #[cfg(feature = "fusedev")] mod fusedev; -#[cfg(feature = "fusedev")] -use self::fusedev::create_nydus_daemon; +#[cfg(feature = "virtiofs")] +mod virtiofs; mod api_server_glue; +mod blob_cache; mod daemon; +#[cfg(target_os = "linux")] +mod fs_cache; +mod fs_service; +mod service_controller; mod upgrade; /// Minimal number of file descriptors reserved for system. @@ -56,63 +60,315 @@ const RLIMIT_NOFILE_RESERVED: u64 = 16384; const RLIMIT_NOFILE_MAX: u64 = 1_000_000; lazy_static! { - static ref FUSE_DAEMON: Mutex::>> = Mutex::default(); + static ref DAEMON_CONTROLLER: DaemonController = DaemonController::new(); } -pub fn exit_daemon() { - let daemon = FUSE_DAEMON.lock().expect("Not posioned lock"); - if let Some(daemon) = daemon.deref() { - daemon - .stop() - .unwrap_or_else(|e| error!("exit daemon failed, {}", e)); +/// Controller to manage registered filesystem/blobcache/fscache services. +pub struct DaemonController { + active: AtomicBool, + singleton_mode: AtomicBool, + daemon: Mutex>>, + blob_cache_mgr: Mutex>>, + // For backward compatibility to support singleton fusedev/virtiofs server. 
+ fs_service: Mutex>>, + waker: Arc, + poller: Mutex, +} + +impl DaemonController { + fn new() -> Self { + let poller = Poll::new().expect("Failed to create `ServiceController` instance"); + let waker = Waker::new(poller.registry(), Token(1)) + .expect("Failed to create waker for ServiceController"); + + Self { + active: AtomicBool::new(true), + singleton_mode: AtomicBool::new(true), + daemon: Mutex::new(None), + blob_cache_mgr: Mutex::new(None), + fs_service: Mutex::new(None), + waker: Arc::new(waker), + poller: Mutex::new(poller), + } + } + + /// Check whether the service controller is still in active/working state. + pub fn is_active(&self) -> bool { + self.active.load(Ordering::Acquire) + } + + /// Allocate a waker to notify stop events. + pub fn alloc_waker(&self) -> Arc { + self.waker.clone() + } + + /// Enable/disable singleton mode, which will shutdown the process when any working thread exits. + pub fn set_singleton_mode(&self, enabled: bool) { + self.singleton_mode.store(enabled, Ordering::Release); + } + + /// Set the daemon service object. + pub fn set_daemon(&self, daemon: Arc) -> Option> { + self.daemon.lock().unwrap().replace(daemon) + } + + /// Get the daemon service object. + /// + /// Panic if called before `set_daemon()` has been called. + pub fn get_daemon(&self) -> Arc { + self.daemon.lock().unwrap().clone().unwrap() + } + + /// Get the optional blob cache manager. + pub fn get_blob_cache_mgr(&self) -> Option> { + self.blob_cache_mgr.lock().unwrap().clone() + } + + /// Set the optional blob cache manager. + pub fn set_blob_cache_mgr(&self, mgr: Arc) -> Option> { + self.blob_cache_mgr.lock().unwrap().replace(mgr) + } + + /// Set the default fs service object. + pub fn set_fs_service(&self, service: Arc) -> Option> { + self.fs_service.lock().unwrap().replace(service) + } + + /// Get the default fs service object. 
+ pub fn get_fs_service(&self) -> Option> { + self.fs_service.lock().unwrap().clone() + } + + fn shutdown(&self) { + // Marking exiting state. + self.active.store(false, Ordering::Release); + // Signal the `run_loop()` working thread to exit. + let _ = self.waker.wake(); + + let daemon = self.daemon.lock().unwrap().take(); + if let Some(d) = daemon { + /* + // TODO: fix the behavior + if cfg!(feature = "virtiofs") { + // In case of virtiofs, mechanism to unblock recvmsg() from VMM is lacked. + // Given the fact that we have nothing to clean up, directly exit seems fine. + process::exit(0); + } + */ + if let Err(e) = d.stop() { + error!("failed to stop daemon: {}", e); + } + if let Err(e) = d.wait() { + error!("failed to wait daemon: {}", e) + } + } + } + + fn run_loop(&self) { + let mut events = Events::with_capacity(8); + + loop { + match self.poller.lock().unwrap().poll(&mut events, None) { + Err(e) if e.kind() == ErrorKind::Interrupted => continue, + Err(e) => error!("failed to receive notification from waker: {}", e), + Ok(_) => {} + } + + for event in events.iter() { + if event.is_error() { + error!("Got error on the monitored event."); + continue; + } + + if event.is_readable() && event.token() == Token(1) { + if self.active.load(Ordering::Acquire) { + return; + } else if self.singleton_mode.load(Ordering::Acquire) { + self.active.store(false, Ordering::Relaxed); + return; + } + } + } + } } } extern "C" fn sig_exit(_sig: std::os::raw::c_int) { - if cfg!(feature = "virtiofs") { - // In case of virtiofs, mechanism to unblock recvmsg() from VMM is lacked. - // Given the fact that we have nothing to clean up, directly exit seems fine. - process::exit(0); - } else { - // Can't directly exit here since we want to umount rafs reflecting the signal. 
- exit_daemon(); - } + DAEMON_CONTROLLER.shutdown(); } -fn parse_commandline_options(bti_string: String) -> ArgMatches<'static> { - let cmd_arguments = App::new("nydusd") - .version(bti_string.as_str()) - .about("Nydus Image Service") +#[cfg(any(feature = "fusedev", feature = "virtiofs"))] +fn append_fs_options(app: App<'static, 'static>) -> App<'static, 'static> { + app.arg( + Arg::with_name("bootstrap") + .long("bootstrap") + .short("B") + .help("Bootstrap/metadata file for rafs filesystem, which also enables rafs mode") + .takes_value(true) + .requires("config") + .conflicts_with("shared-dir"), + ) + .arg( + Arg::with_name("prefetch-files") + .long("prefetch-files") + .short("P") + .help("List of file/directory to prefetch") + .takes_value(true) + .required(false) + .requires("bootstrap") + .multiple(true), + ) + .arg( + Arg::with_name("virtual-mountpoint") + .long("virtual-mountpoint") + .short("m") + .help("Path inside FUSE/virtiofs virtual filesystem to mount the rafs/passthroughfs instance") + .takes_value(true) + .default_value("/") + .required(false), + ) +} + +#[cfg(feature = "fusedev")] +fn append_fuse_options(app: App<'static, 'static>) -> App<'static, 'static> { + app.arg( + Arg::with_name("mountpoint") + .long("mountpoint") + .short("M") + .help("Path to mount the FUSE filesystem, target for `mount.fuse`") + .takes_value(true) + .required(false), + ) + .arg( + Arg::with_name("failover-policy") + .long("failover-policy") + .short("F") + .default_value("resend") + .help("FUSE server failover policy") + .possible_values(&["resend", "flush"]) + .takes_value(true) + .required(false), + ) + .arg( + Arg::with_name("threads") + .long("thread-num") + .short("T") + .default_value("1") + .help("Number of working threads to serve FUSE IO requests") + .takes_value(true) + .required(false) + .validator(|v| { + if let Ok(t) = v.parse::() { + if t > 0 && t <= 1024 { + Ok(()) + } else { + Err("Invalid working thread number {}, valid values: [1-1024]".to_string()) + } 
+ } else { + Err("Input thread number is invalid".to_string()) + } + }), + ) + .arg( + Arg::with_name("writable") + .long("writable") + .short("W") + .help("Mount FUSE filesystem in rw mode") + .takes_value(false), + ) +} + +#[cfg(feature = "fusedev")] +fn append_fuse_subcmd_options(app: App<'static, 'static>) -> App<'static, 'static> { + let subcmd = SubCommand::with_name("fuse").about("Run as a dedicated FUSE server"); + let subcmd = append_fuse_options(subcmd); + let subcmd = append_fs_options(subcmd); + app.subcommand(subcmd) +} + +#[cfg(feature = "virtiofs")] +fn append_virtiofs_options(app: App<'static, 'static>) -> App<'static, 'static> { + app.arg( + Arg::with_name("hybrid-mode") + .long("hybrid-mode") + .short("H") + .help("Enable support for both rafs and passthroughfs modes") + .required(false) + .takes_value(false), + ) + .arg( + Arg::with_name("shared-dir") + .long("shared-dir") + .short("s") + .help("Directory shared by host and guest for passthroughfs, which also enables pathroughfs mode") + .takes_value(true) + .conflicts_with("bootstrap"), + ) + .arg( + Arg::with_name("sock") + .long("sock") + .short("v") + .help("Vhost-user API socket") + .takes_value(true) + .required(false), + ) +} + +#[cfg(feature = "virtiofs")] +fn append_virtiofs_subcmd_options(app: App<'static, 'static>) -> App<'static, 'static> { + let subcmd = SubCommand::with_name("virtiofs").about("Run as a dedicated virtiofs server"); + let subcmd = append_virtiofs_options(subcmd); + let subcmd = append_fs_options(subcmd); + app.subcommand(subcmd) +} + +fn append_services_subcmd_options(app: App<'static, 'static>) -> App<'static, 'static> { + let subcmd = SubCommand::with_name("daemon") + .about("Run as a global daemon hosting multiple blobcache/fscache/virtiofs services.") + .arg( + Arg::with_name("fscache") + .long("fscache") + .short("F") + .help("Working directory fscache driver to cache files") + .takes_value(true), + ) + .arg( + Arg::with_name("fscache-tag") + .long("fscache-tag") 
+ .help("Fscache tag to identify the fs daemon instance") + .takes_value(true) + .requires("fscache"), + ); + + app.subcommand(subcmd) +} + +fn prepare_commandline_options() -> App<'static, 'static> { + let cmdline = App::new("nydusd") + .about("Nydus BlobCache/FsCache/Image Service") .arg( Arg::with_name("apisock") .long("apisock") .short("A") .help("Administration API socket") .takes_value(true) - .required(false), + .required(false) + .global(true), ) .arg( Arg::with_name("config") .long("config") .short("C") .help("Configuration file") - .takes_value(true) .required(false) - ) - .arg( - Arg::with_name("failover-policy") - .long("failover-policy") - .default_value("resend") - .help("Nydus image service failover policy") - .possible_values(&["resend", "flush"]) - .takes_value(true) - .required(false) - .global(true), + .global(true) + .takes_value(true), ) .arg( Arg::with_name("id") .long("id") - .help("Nydus image service identifier") + .short("I") + .help("Service instance identifier") .takes_value(true) .required(false) .requires("supervisor") @@ -138,18 +394,10 @@ fn parse_commandline_options(bti_string: String) -> ArgMatches<'static> { .required(false) .global(true), ) - .arg( - Arg::with_name("prefetch-files") - .long("prefetch-files") - .help("List of file/directory to prefetch") - .takes_value(true) - .required(false) - .multiple(true) - .global(true), - ) .arg( Arg::with_name("rlimit-nofile") .long("rlimit-nofile") + .short("R") .default_value("1000000") .help("Set rlimit for maximum file descriptor number (0 leaves it unchanged)") .takes_value(true) @@ -174,89 +422,21 @@ fn parse_commandline_options(bti_string: String) -> ArgMatches<'static> { .takes_value(false) .required(false) .global(true), - ) - .arg( - Arg::with_name("virtual-mountpoint") - .long("virtual-mountpoint") - .short("V") - .help("Virtual mountpoint for the filesystem") - .takes_value(true) - .default_value("/") - .required(false) - .global(true), - ).arg( - Arg::with_name("bootstrap") - 
.long("bootstrap") - .short("B") - .help("Rafs filesystem bootstrap/metadata file") - .takes_value(true) - .conflicts_with("shared-dir") - ) - .arg( - Arg::with_name("shared-dir") - .long("shared-dir") - .short("s") - .help("Directory to pass through to the guest VM") - .takes_value(true) - .conflicts_with("bootstrap"), - ) - .arg( - Arg::with_name("hybrid-mode").long("hybrid-mode") - .help("run nydusd in rafs and passthroughfs hybrid mode") - .required(false) - .takes_value(false) - .global(true) ); #[cfg(feature = "fusedev")] - let cmd_arguments = cmd_arguments - .arg( - Arg::with_name("mountpoint") - .long("mountpoint") - .short("M") - .help("Fuse mount point") - .takes_value(true) - .required(true), - ) - .arg( - Arg::with_name("threads") - .long("thread-num") - .short("T") - .default_value("1") - .help("Number of working threads to serve IO requests") - .takes_value(true) - .required(false) - .global(true) - .validator(|v| { - if let Ok(t) = v.parse::() { - if t > 0 && t <= 1024 { - Ok(()) - } else { - Err("Invalid working thread number {}, valid values: [1-1024]" - .to_string()) - } - } else { - Err("Input thread number is not legal".to_string()) - } - }), - ) - .arg( - Arg::with_name("writable") - .long("writable") - .help("set fuse mountpoint non-readonly") - .takes_value(false), - ); + let cmdline = append_fuse_subcmd_options(cmdline); + #[cfg(feature = "virtiofs")] + let cmdline = append_virtiofs_subcmd_options(cmdline); + #[cfg(feature = "fusedev")] + let cmdline = append_fuse_options(cmdline); #[cfg(feature = "virtiofs")] - let cmd_arguments = cmd_arguments.arg( - Arg::with_name("sock") - .long("sock") - .help("Vhost-user API socket") - .takes_value(true) - .required(true), - ); + let cmdline = append_virtiofs_options(cmdline); + #[cfg(any(feature = "fusedev", feature = "virtiofs"))] + let cmdline = append_fs_options(cmdline); - cmd_arguments.get_matches() + append_services_subcmd_options(cmdline) } #[cfg(target_os = "macos")] @@ -330,26 +510,49 @@ fn 
handle_rlimit_nofile_option(args: &ArgMatches, option_name: &str) -> Result<( Ok(()) } -fn main() -> Result<()> { - let (bti_string, bti) = BuildTimeInfo::dump(crate_version!()); - let args = parse_commandline_options(bti_string); - let logging_file = args.value_of("log-file").map(|l| l.into()); - // Safe to unwrap because it has default value and possible values are defined - let level = args.value_of("log-level").unwrap().parse().unwrap(); +pub struct SubCmdArgs<'a> { + args: &'a ArgMatches<'a>, + subargs: &'a ArgMatches<'a>, +} - setup_logging(logging_file, level)?; - dump_program_info(crate_version!()); - handle_rlimit_nofile_option(&args, "rlimit-nofile")?; +impl<'a> SubCmdArgs<'a> { + fn new(args: &'a ArgMatches, subargs: &'a ArgMatches) -> Self { + SubCmdArgs { args, subargs } + } + + pub fn value_of(&self, key: &str) -> Option<&str> { + if let Some(v) = self.subargs.value_of(key) { + Some(v) + } else { + self.args.value_of(key) + } + } + + pub fn values_of(&self, key: &str) -> Option { + if let Some(v) = self.subargs.values_of(key) { + Some(v) + } else { + self.args.values_of(key) + } + } + + pub fn is_present(&self, key: &str) -> bool { + self.subargs.is_present(key) || self.args.is_present(key) + } +} - // Retrieve arguments +fn process_default_fs_service( + args: SubCmdArgs, + bti: BuildTimeInfo, + apisock: Option<&str>, + is_fuse: bool, +) -> Result<()> { // shared-dir means fs passthrough let shared_dir = args.value_of("shared-dir"); // bootstrap means rafs only let bootstrap = args.value_of("bootstrap"); // safe as virtual_mountpoint default to "/" let virtual_mnt = args.value_of("virtual-mountpoint").unwrap(); - // apisock means admin api socket support - let apisock = args.value_of("apisock"); let mut opts = VfsOptions::default(); let mount_cmd = if let Some(shared_dir) = shared_dir { @@ -398,22 +601,13 @@ fn main() -> Result<()> { } let vfs = Vfs::new(opts); - let vfs = Arc::new(vfs); // Basically, below two arguments are essential for 
live-upgrade/failover/ and external management. let daemon_id = args.value_of("id").map(|id| id.to_string()); let supervisor = args.value_of("supervisor").map(|s| s.to_string()); - #[cfg(feature = "virtiofs")] - let daemon = { - // sock means vhost-user-backend only - let vu_sock = args.value_of("sock").ok_or_else(|| { - DaemonError::InvalidArguments("vhost socket must be provided!".to_string()) - })?; - create_nydus_daemon(daemon_id, supervisor, vu_sock, vfs, mount_cmd, bti)? - }; #[cfg(feature = "fusedev")] - let daemon = { + if is_fuse { // threads means number of fuse service threads let threads: u32 = args .value_of("threads") @@ -431,80 +625,125 @@ fn main() -> Result<()> { // mountpoint means fuse device only let mountpoint = args.value_of("mountpoint").ok_or_else(|| { - DaemonError::InvalidArguments("Mountpoint must be provided!".to_string()) + DaemonError::InvalidArguments( + "Mountpoint must be provided for FUSE server!".to_string(), + ) })?; - create_nydus_daemon( - mountpoint, - vfs, - supervisor, - daemon_id, - threads, - apisock, - args.is_present("upgrade"), - !args.is_present("writable"), - p, - mount_cmd, - bti, - ) - .map(|d| { - info!("Fuse daemon started!"); - d - }) - .map_err(|e| { - error!("Failed in starting daemon: {}", e); - e - })? - }; + let daemon = { + fusedev::create_fuse_daemon( + mountpoint, + vfs, + supervisor, + daemon_id, + threads, + apisock, + args.is_present("upgrade"), + !args.is_present("writable"), + p, + mount_cmd, + bti, + ) + .map(|d| { + info!("Fuse daemon started!"); + d + }) + .map_err(|e| { + error!("Failed in starting daemon: {}", e); + e + })? 
+ }; + DAEMON_CONTROLLER.set_daemon(daemon); + } - #[allow(clippy::type_complexity)] - let mut http_thread_and_waker: Option<( - thread::JoinHandle>, - thread::JoinHandle<()>, - Arc, - )> = None; - if let Some(apisock) = apisock { - let (to_api, from_http) = channel(); - let (to_http, from_api) = channel(); - - let api_server = ApiServer::new(to_http, daemon.clone())?; - - let api_server_subscriber = ApiSeverSubscriber::new(api_server, from_http)?; - let api_notifier = api_server_subscriber.get_waker(); - let api_server_thread = api_server_subscriber.run()?; - - let (ret, thread_exit_waker) = - start_http_thread(apisock, Some(api_notifier), to_api, from_api)?; - http_thread_and_waker = Some((ret, api_server_thread, thread_exit_waker)); - info!("api server running at {}", apisock); + #[cfg(feature = "virtiofs")] + if !is_fuse { + let vu_sock = args.value_of("sock").ok_or_else(|| { + DaemonError::InvalidArguments("vhost socket must be provided!".to_string()) + })?; + let _ = apisock.as_ref(); + DAEMON_CONTROLLER.set_daemon(virtiofs::create_virtiofs_daemon( + daemon_id, supervisor, vu_sock, vfs, mount_cmd, bti, + )?); } - *FUSE_DAEMON.lock().unwrap().deref_mut() = Some(daemon.clone()); - nydus_app::signal::register_signal_handler(signal::SIGINT, sig_exit); - nydus_app::signal::register_signal_handler(signal::SIGTERM, sig_exit); + Ok(()) +} + +fn process_daemon_arguments( + subargs: &SubCmdArgs, + _apisock: Option<&str>, + bti: BuildTimeInfo, +) -> Result<()> { + info!("Start Nydus in daemon mode!"); + let daemon = create_daemon(subargs, bti)?; + DAEMON_CONTROLLER.set_daemon(daemon); + Ok(()) +} - daemon - .wait() - .unwrap_or_else(|e| error!("failed to wait daemon {}", e)); +fn main() -> Result<()> { + let (bti_string, bti) = BuildTimeInfo::dump(crate_version!()); + let cmd_options = prepare_commandline_options().version(bti_string.as_str()); + let args = cmd_options.clone().get_matches(); + let logging_file = args.value_of("log-file").map(|l| l.into()); + // Safe to 
unwrap because it has default value and possible values are defined + let level = args.value_of("log-level").unwrap().parse().unwrap(); + let apisock = args.value_of("apisock"); - if let Some((http_server_thread, api_server_thread, waker)) = http_thread_and_waker { - info!("wake http_thread."); - if waker.wake().is_err() { - error!("wake http thread failed."); + setup_logging(logging_file, level)?; + dump_program_info(crate_version!()); + handle_rlimit_nofile_option(&args, "rlimit-nofile")?; + + match args.subcommand_name() { + Some("daemon") => { + // Safe to unwrap because the subcommand is `daemon`. + let subargs = args.subcommand_matches("daemon").unwrap(); + let subargs = SubCmdArgs::new(&args, subargs); + process_daemon_arguments(&subargs, apisock, bti)?; + } + Some("fuse") => { + // Safe to unwrap because the subcommand is `fuse`. + let subargs = args.subcommand_matches("fuse").unwrap(); + let subargs = SubCmdArgs::new(&args, subargs); + process_default_fs_service(subargs, bti, apisock, true)?; } - if http_server_thread - .join() - .map(|r| r.map_err(|e| error!("Http server thread execution error. {:?}", e))) - .is_err() - { - error!("Join http server thread failed."); + Some("virtiofs") => { + // Safe to unwrap because the subcommand is `virtiofs`. 
+ let subargs = args.subcommand_matches("virtiofs").unwrap(); + let subargs = SubCmdArgs::new(&args, subargs); + process_default_fs_service(subargs, bti, apisock, false)?; } - if api_server_thread.join().is_err() { - error!("Join api server thread failed."); + _ => { + let subargs = SubCmdArgs::new(&args, &args); + #[cfg(feature = "fusedev")] + process_default_fs_service(subargs, bti, apisock, true)?; + #[cfg(feature = "virtiofs")] + process_default_fs_service(subargs, bti, apisock, false)?; } } + let daemon = DAEMON_CONTROLLER.get_daemon(); + if let Some(fs) = daemon.get_default_fs_service() { + DAEMON_CONTROLLER.set_fs_service(fs); + } + + // Start the HTTP Administration API server + let mut api_controller = ApiServerController::new(apisock); + api_controller.start()?; + + // Initialize and run the daemon controller event loop. + nydus_app::signal::register_signal_handler(signal::SIGINT, sig_exit); + nydus_app::signal::register_signal_handler(signal::SIGTERM, sig_exit); + + // Run the main event loop + if DAEMON_CONTROLLER.is_active() { + DAEMON_CONTROLLER.run_loop(); + } + + // Gracefully shutdown system. info!("nydusd quits"); + api_controller.stop(); + DAEMON_CONTROLLER.shutdown(); Ok(()) } diff --git a/src/bin/nydusd/service_controller.rs b/src/bin/nydusd/service_controller.rs new file mode 100644 index 00000000000..098fa238592 --- /dev/null +++ b/src/bin/nydusd/service_controller.rs @@ -0,0 +1,249 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. 
+// +// SPDX-License-Identifier: (Apache-2.0 AND BSD-3-Clause) + +use std::any::Any; +use std::io::Result; +#[cfg(target_os = "linux")] +use std::path::Path; +use std::sync::atomic::{AtomicBool, AtomicI32, Ordering}; +use std::sync::mpsc::{channel, Receiver, Sender}; +use std::sync::{Arc, Mutex}; + +use nydus_api::http::BlobCacheList; +use nydus_app::BuildTimeInfo; + +use crate::blob_cache::BlobCacheMgr; +use crate::daemon::{ + DaemonError, DaemonResult, DaemonState, DaemonStateMachineContext, DaemonStateMachineInput, + DaemonStateMachineSubscriber, +}; +use crate::{FsService, NydusDaemon, SubCmdArgs, DAEMON_CONTROLLER}; + +pub struct ServiceContoller { + bti: BuildTimeInfo, + id: Option, + request_sender: Arc>>, + result_receiver: Mutex>>, + state: AtomicI32, + supervisor: Option, + + blob_cache_mgr: Arc, + + fscache_enabled: AtomicBool, + #[cfg(target_os = "linux")] + fscache: Mutex>>, +} + +impl ServiceContoller { + /// Start all enabled services. + fn start_services(&self) -> Result<()> { + info!("Starting all Nydus services..."); + + #[cfg(target_os = "linux")] + if self.fscache_enabled.load(Ordering::Acquire) { + if let Some(fscache) = self.fscache.lock().unwrap().clone() { + std::thread::spawn(move || { + if let Err(e) = fscache.run_loop() { + error!("Failed to run fscache service loop, {}", e); + } + // Notify the global service controller that one working thread is exiting. + if let Err(e) = crate::DAEMON_CONTROLLER.waker.wake() { + error!("Failed to notify the global service controller, {}", e); + } + }); + } + } + + Ok(()) + } + + /// Stop all enabled services. 
+ fn stop_services(&self) { + info!("Stopping all Nydus services..."); + + #[cfg(target_os = "linux")] + if self.fscache_enabled.load(Ordering::Acquire) { + if let Some(fscache) = self.fscache.lock().unwrap().take() { + fscache.stop(); + } + } + } + + fn initialize_blob_cache(&self, config: &Option) -> Result<()> { + DAEMON_CONTROLLER.set_blob_cache_mgr(self.blob_cache_mgr.clone()); + + // Create blob cache objects configured by the configuration file. + if let Some(config) = config { + if let Some(config1) = config.as_object() { + if config1.contains_key("blobs") { + if let Ok(v) = serde_json::from_value::(config.clone()) { + if let Err(e) = self.blob_cache_mgr.add_blob_list(&v) { + error!("Failed to add blob list: {}", e); + return Err(e); + } + } + } + } + } + + Ok(()) + } +} + +#[cfg(target_os = "linux")] +impl ServiceContoller { + fn initialize_fscache_service(&self, subargs: &SubCmdArgs, path: &str) -> Result<()> { + // Validate --fscache option value is an existing directory. + let p = match Path::new(&path).canonicalize() { + Err(e) => { + error!("--fscache option needs a directory to cache files"); + return Err(e); + } + Ok(v) => { + if !v.is_dir() { + error!("--fscache option needs a directory to cache files"); + return Err(einval!("--fscache option is not a directory")); + } + v + } + }; + let p = match p.to_str() { + Some(v) => v, + None => { + error!("--fscache option contains invalid characters"); + return Err(einval!("--fscache option contains invalid characters")); + } + }; + let tag = subargs.value_of("fscache-tag"); + + info!( + "Create fscache instance at {} with tag {}", + p, + tag.unwrap_or("") + ); + let fscache = crate::fs_cache::FsCacheHandler::new( + "/dev/cachefiles", + p, + tag, + self.blob_cache_mgr.clone(), + )?; + *self.fscache.lock().unwrap() = Some(Arc::new(fscache)); + self.fscache_enabled.store(true, Ordering::Release); + + Ok(()) + } +} + +impl NydusDaemon for ServiceContoller { + fn as_any(&self) -> &dyn Any { + self + } + + fn
id(&self) -> Option { + self.id.clone() + } + + fn get_state(&self) -> DaemonState { + self.state.load(Ordering::Relaxed).into() + } + + fn set_state(&self, state: DaemonState) { + self.state.store(state as i32, Ordering::Relaxed); + } + + fn version(&self) -> BuildTimeInfo { + self.bti.clone() + } + + fn start(&self) -> DaemonResult<()> { + self.start_services() + .map_err(|e| DaemonError::StartService(format!("{}", e))) + } + + fn disconnect(&self) -> DaemonResult<()> { + self.stop_services(); + Ok(()) + } + + fn wait(&self) -> DaemonResult<()> { + Ok(()) + } + + fn supervisor(&self) -> Option { + self.supervisor.clone() + } + + fn save(&self) -> DaemonResult<()> { + Err(DaemonError::Unsupported) + } + + fn restore(&self) -> DaemonResult<()> { + Err(DaemonError::Unsupported) + } + + fn get_default_fs_service(&self) -> Option> { + None + } +} + +impl DaemonStateMachineSubscriber for ServiceContoller { + fn on_event(&self, event: DaemonStateMachineInput) -> DaemonResult<()> { + self.request_sender + .lock() + .unwrap() + .send(event) + .map_err(|e| DaemonError::Channel(format!("send {:?}", e)))?; + self.result_receiver + .lock() + .expect("Not expect poisoned lock!") + .recv() + .map_err(|e| DaemonError::Channel(format!("recv {:?}", e)))? 
+ } +} + +pub fn create_daemon(subargs: &SubCmdArgs, bti: BuildTimeInfo) -> Result> { + let id = subargs.value_of("id").map(|id| id.to_string()); + let supervisor = subargs.value_of("supervisor").map(|s| s.to_string()); + let config = match subargs.value_of("config") { + None => None, + Some(path) => { + let config = std::fs::read_to_string(path)?; + let config: serde_json::Value = serde_json::from_str(&config) + .map_err(|_e| einval!("invalid configuration file"))?; + Some(config) + } + }; + + let (to_sm, from_client) = channel::(); + let (to_client, from_sm) = channel::>(); + let service_controller = ServiceContoller { + bti, + id, + request_sender: Arc::new(Mutex::new(to_sm)), + result_receiver: Mutex::new(from_sm), + state: Default::default(), + supervisor, + + blob_cache_mgr: Arc::new(BlobCacheMgr::new()), + + fscache_enabled: AtomicBool::new(false), + #[cfg(target_os = "linux")] + fscache: Mutex::new(None), + }; + + service_controller.initialize_blob_cache(&config)?; + #[cfg(target_os = "linux")] + if let Some(path) = subargs.value_of("fscache") { + service_controller.initialize_fscache_service(subargs, path)?; + } + + let daemon = Arc::new(service_controller); + let machine = DaemonStateMachineContext::new(daemon.clone(), from_client, to_client); + machine.kick_state_machine()?; + daemon + .on_event(DaemonStateMachineInput::Start) + .map_err(|e| eother!(e))?; + + Ok(daemon) +} diff --git a/src/bin/nydusd/upgrade.rs b/src/bin/nydusd/upgrade.rs index b139d267cfb..fd56fd3d3b0 100644 --- a/src/bin/nydusd/upgrade.rs +++ b/src/bin/nydusd/upgrade.rs @@ -2,7 +2,9 @@ use std::convert::TryFrom; #[cfg(feature = "fusedev")] use std::path::PathBuf; -use crate::daemon::{DaemonResult, FsBackendMountCmd, FsBackendUmountCmd}; +use crate::daemon::DaemonResult; +use crate::fs_service::FsBackendUmountCmd; +use crate::FsBackendMountCmd; #[derive(Debug)] pub enum UpgradeMgrError {} diff --git a/src/bin/nydusd/virtiofs.rs b/src/bin/nydusd/virtiofs.rs index 4edc7b5bb47..f9cdd9b0f2b 
100644 --- a/src/bin/nydusd/virtiofs.rs +++ b/src/bin/nydusd/virtiofs.rs @@ -6,18 +6,13 @@ use std::any::Any; use std::io::Result; -use std::sync::{ - atomic::{AtomicI32, Ordering}, - mpsc::{channel, Receiver}, - Arc, Mutex, MutexGuard, RwLock, -}; +use std::sync::atomic::{AtomicI32, Ordering}; +use std::sync::mpsc::{channel, Receiver, Sender}; +use std::sync::{Arc, Mutex, MutexGuard, RwLock}; use std::thread; -use libc::EFD_NONBLOCK; - use fuse_backend_rs::api::{server::Server, Vfs}; use fuse_backend_rs::transport::{FsCacheReqHandler, Reader, Writer}; - use vhost::vhost_user::{message::*, Listener, SlaveFsCacheReq}; use vhost_user_backend::{ VhostUserBackend, VhostUserBackendMut, VhostUserDaemon, VringMutex, VringState, VringT, @@ -34,9 +29,11 @@ use nydus_app::BuildTimeInfo; use crate::daemon::{ DaemonError, DaemonResult, DaemonState, DaemonStateMachineContext, DaemonStateMachineInput, - DaemonStateMachineSubscriber, FsBackendCollection, FsBackendMountCmd, NydusDaemon, Trigger, + DaemonStateMachineSubscriber, NydusDaemon, }; +use crate::fs_service::{FsBackendCollection, FsService}; use crate::upgrade::UpgradeManager; +use crate::FsBackendMountCmd; const VIRTIO_F_VERSION_1: u32 = 32; const QUEUE_SIZE: usize = 1024; @@ -49,48 +46,17 @@ const REQ_QUEUE_EVENT: u16 = 1; // The device has been dropped. 
// const KILL_EVENT: u16 = 2; -type VhostUserBackendResult = std::result::Result; - -struct VhostUserFsBackendHandler { - backend: Mutex, -} +type VhostUserBackendResult = std::io::Result; struct VhostUserFsBackend { - mem: Option>, - kill_evt: EventFd, event_idx: bool, + kill_evt: EventFd, + mem: Option>, server: Arc>>, // handle request from slave to master vu_req: Option, } -impl VhostUserFsBackendHandler { - fn new(vfs: Arc) -> Result { - let backend = VhostUserFsBackend { - mem: None, - kill_evt: EventFd::new(EFD_NONBLOCK).map_err(DaemonError::Epoll)?, - event_idx: false, - server: Arc::new(Server::new(vfs)), - vu_req: None, - }; - Ok(VhostUserFsBackendHandler { - backend: Mutex::new(backend), - }) - } -} - -impl Clone for VhostUserFsBackend { - fn clone(&self) -> Self { - VhostUserFsBackend { - mem: self.mem.clone(), - kill_evt: self.kill_evt.try_clone().unwrap(), - event_idx: self.event_idx, - server: self.server.clone(), - vu_req: self.vu_req.clone(), - } - } -} - impl VhostUserFsBackend { // There's no way to recover if error happens during processing a virtq, let the caller // to handle it. 
@@ -153,6 +119,26 @@ impl VhostUserFsBackend { } } +struct VhostUserFsBackendHandler { + backend: Mutex, +} + +impl VhostUserFsBackendHandler { + fn new(vfs: Arc) -> Result { + let backend = VhostUserFsBackend { + event_idx: false, + kill_evt: EventFd::new(libc::EFD_NONBLOCK).map_err(DaemonError::Epoll)?, + mem: None, + server: Arc::new(Server::new(vfs)), + vu_req: None, + }; + + Ok(VhostUserFsBackendHandler { + backend: Mutex::new(backend), + }) + } +} + impl VhostUserBackendMut for VhostUserFsBackendHandler { fn num_queues(&self) -> usize { NUM_QUEUES @@ -185,6 +171,16 @@ impl VhostUserBackendMut for VhostUserFsBackendHandler { Ok(()) } + fn set_slave_req_fd(&mut self, vu_req: SlaveFsCacheReq) { + self.backend.lock().unwrap().vu_req = Some(vu_req); + } + + fn exit_event(&self, _thread_index: usize) -> Option { + // FIXME: need to patch vhost-user-backend to return KILL_EVENT + // so that daemon stop event gets popped up. + Some(self.backend.lock().unwrap().kill_evt.try_clone().unwrap()) + } + fn handle_event( &mut self, device_event: u16, @@ -233,37 +229,79 @@ impl VhostUserBackendMut for VhostUserFsBackendHandler { Ok(false) } +} - fn exit_event(&self, _thread_index: usize) -> Option { - // FIXME: need to patch vhost-user-backend to return KILL_EVENT - // so that daemon stop event gets popped up. 
- Some(self.backend.lock().unwrap().kill_evt.try_clone().unwrap()) +pub struct VirtioFsService { + vfs: Arc, + upgrade_mgr: Option>, + backend_collection: Mutex, +} + +impl VirtioFsService { + fn new(vfs: Arc) -> Self { + VirtioFsService { + vfs, + upgrade_mgr: None, + backend_collection: Default::default(), + } } +} - fn set_slave_req_fd(&mut self, vu_req: SlaveFsCacheReq) { - self.backend.lock().unwrap().vu_req = Some(vu_req); +impl FsService for VirtioFsService { + fn get_vfs(&self) -> &Vfs { + &self.vfs + } + + fn upgrade_mgr(&self) -> Option> { + self.upgrade_mgr.as_ref().map(|mgr| mgr.lock().unwrap()) + } + + fn backend_collection(&self) -> MutexGuard { + self.backend_collection.lock().unwrap() + } + + fn export_inflight_ops(&self) -> DaemonResult> { + Err(DaemonError::Unsupported) } } struct VirtiofsDaemon + Clone> { - vfs: Arc, - daemon: Arc>>, - sock: String, + bti: BuildTimeInfo, id: Option, - supervisor: Option, - upgrade_mgr: Option>, - trigger: Arc>, + request_sender: Arc>>, result_receiver: Mutex>>, - backend_collection: Mutex, - bti: BuildTimeInfo, + service: Arc, state: AtomicI32, + supervisor: Option, + + daemon: Arc>>, + sock: String, } impl + Clone> NydusDaemon for VirtiofsDaemon { + fn as_any(&self) -> &dyn Any { + self + } + + fn id(&self) -> Option { + self.id.clone() + } + + fn get_state(&self) -> DaemonState { + self.state.load(Ordering::Relaxed).into() + } + + fn set_state(&self, state: DaemonState) { + self.state.store(state as i32, Ordering::Relaxed); + } + + fn version(&self) -> BuildTimeInfo { + self.bti.clone() + } + fn start(&self) -> DaemonResult<()> { let listener = Listener::new(&self.sock, true) .map_err(|e| DaemonError::StartService(format!("{:?}", e)))?; - let vu_daemon = self.daemon.clone(); let _ = thread::Builder::new() .name("vhost_user_listener".to_string()) @@ -279,6 +317,10 @@ impl + Clone> NydusDaemon for Virtiofs Ok(()) } + fn disconnect(&self) -> DaemonResult<()> { + Ok(()) + } + fn wait(&self) -> DaemonResult<()> { 
self.daemon .lock() @@ -287,56 +329,20 @@ impl + Clone> NydusDaemon for Virtiofs .map_err(|e| DaemonError::WaitDaemon(eother!(e))) } - fn disconnect(&self) -> DaemonResult<()> { - Ok(()) - } - - fn id(&self) -> Option { - self.id.clone() - } - fn supervisor(&self) -> Option { self.supervisor.clone() } - fn as_any(&self) -> &dyn Any { - self - } - - fn get_state(&self) -> DaemonState { - self.state.load(Ordering::Relaxed).into() - } - - fn set_state(&self, state: DaemonState) { - self.state.store(state as i32, Ordering::Relaxed); - } - fn save(&self) -> DaemonResult<()> { - unimplemented!(); + Err(DaemonError::Unsupported) } fn restore(&self) -> DaemonResult<()> { - unimplemented!(); - } - - fn get_vfs(&self) -> &Vfs { - &self.vfs - } - - fn upgrade_mgr(&self) -> Option> { - self.upgrade_mgr.as_ref().map(|mgr| mgr.lock().unwrap()) - } - - fn backend_collection(&self) -> MutexGuard { - self.backend_collection.lock().unwrap() + Err(DaemonError::Unsupported) } - fn version(&self) -> BuildTimeInfo { - self.bti.clone() - } - - fn export_inflight_ops(&self) -> DaemonResult> { - Err(DaemonError::Unsupported) + fn get_default_fs_service(&self) -> Option> { + Some(self.service.clone()) } } @@ -344,7 +350,7 @@ impl + Clone> DaemonStateMachineSubscr for VirtiofsDaemon { fn on_event(&self, event: DaemonStateMachineInput) -> DaemonResult<()> { - self.trigger + self.request_sender .lock() .unwrap() .send(event) @@ -358,50 +364,43 @@ impl + Clone> DaemonStateMachineSubscr } } -pub fn create_nydus_daemon( +pub fn create_virtiofs_daemon( id: Option, supervisor: Option, sock: &str, vfs: Arc, mount_cmd: Option, bti: BuildTimeInfo, -) -> Result> { +) -> Result> { let vu_daemon = VhostUserDaemon::new( String::from("vhost-user-fs-backend"), Arc::new(RwLock::new(VhostUserFsBackendHandler::new(vfs.clone())?)), GuestMemoryAtomic::new(GuestMemoryMmap::new()), ) .map_err(|e| DaemonError::DaemonFailure(format!("{:?}", e)))?; - let (trigger, events_rx) = channel::(); let (result_sender, 
result_receiver) = channel::>(); - + let service = VirtioFsService::new(vfs); let daemon = Arc::new(VirtiofsDaemon { - vfs, - daemon: Arc::new(Mutex::new(vu_daemon)), - sock: sock.to_string(), + bti, id, - supervisor, - upgrade_mgr: None, - trigger: Arc::new(Mutex::new(trigger)), + request_sender: Arc::new(Mutex::new(trigger)), result_receiver: Mutex::new(result_receiver), - bti, - backend_collection: Default::default(), + service: Arc::new(service), state: AtomicI32::new(DaemonState::INIT as i32), - }); + supervisor, + daemon: Arc::new(Mutex::new(vu_daemon)), + sock: sock.to_string(), + }); let machine = DaemonStateMachineContext::new(daemon.clone(), events_rx, result_sender); - machine.kick_state_machine()?; + machine.kick_state_machine()?; if let Some(cmd) = mount_cmd { - daemon.mount(cmd)?; + daemon.service.mount(cmd)?; } - - // TODO: In fact, for virtiofs, below event triggers virtio-queue setup and some other - // preparation/connection work. So this event name `Mount` might not be suggestive. - // I'd like to rename it someday. daemon - .on_event(DaemonStateMachineInput::Mount) + .on_event(DaemonStateMachineInput::Start) .map_err(|e| eother!(e))?; Ok(daemon) diff --git a/storage/src/backend/localfs.rs b/storage/src/backend/localfs.rs index 12d61254855..1bb49ba7c2e 100644 --- a/storage/src/backend/localfs.rs +++ b/storage/src/backend/localfs.rs @@ -58,18 +58,21 @@ fn default_readahead_sec() -> u32 { } /// Configuration information for localfs storage backend. +/// +/// This structure is externally visible through configuration file and HTTP API, please keep them +/// stable. 
#[derive(Clone, Deserialize, Serialize)] -struct LocalFsConfig { +pub struct LocalFsConfig { #[serde(default)] - readahead: bool, + pub readahead: bool, #[serde(default = "default_readahead_sec")] - readahead_sec: u32, + pub readahead_sec: u32, #[serde(default)] - blob_file: String, + pub blob_file: String, #[serde(default)] - dir: String, + pub dir: String, #[serde(default)] - alt_dirs: Vec, + pub alt_dirs: Vec, } struct LocalFsEntry { diff --git a/storage/src/backend/mod.rs b/storage/src/backend/mod.rs index 0064a2e163e..3df4cd6b438 100644 --- a/storage/src/backend/mod.rs +++ b/storage/src/backend/mod.rs @@ -25,10 +25,16 @@ use crate::StorageError; pub mod connection; #[cfg(feature = "backend-localfs")] pub mod localfs; +#[cfg(feature = "backend-localfs")] +pub use localfs::LocalFsConfig; #[cfg(feature = "backend-oss")] pub mod oss; +#[cfg(feature = "backend-oss")] +pub use oss::OssConfig; #[cfg(feature = "backend-registry")] pub mod registry; +#[cfg(feature = "backend-registry")] +pub use registry::RegistryConfig; /// Error codes related to storage backend operations. #[derive(Debug)] @@ -52,6 +58,9 @@ pub enum BackendError { pub type BackendResult = std::result::Result; /// Configuration information for network proxy. +/// +/// This structure is externally visible through configuration file and HTTP API, please keep them +/// stable. #[derive(Debug, Clone, Deserialize, Serialize)] #[serde(default)] pub struct ProxyConfig { @@ -77,6 +86,9 @@ impl Default for ProxyConfig { } /// Generic configuration for storage backends. +/// +/// This structure is externally visible through configuration file and HTTP API, please keep them +/// stable. 
#[derive(Debug, Clone, Deserialize, Serialize)] #[serde(default)] pub struct CommonConfig { diff --git a/storage/src/backend/oss.rs b/storage/src/backend/oss.rs index 3305c121475..9df29e06d17 100644 --- a/storage/src/backend/oss.rs +++ b/storage/src/backend/oss.rs @@ -41,20 +41,24 @@ impl From for BackendError { } } +/// OSS configuration information to access blobs. +/// +/// This structure is externally visible through configuration file and HTTP API, please keep them +/// stable. #[derive(Clone, Deserialize, Serialize)] -struct OssConfig { - endpoint: String, - access_key_id: String, - access_key_secret: String, - bucket_name: String, +pub struct OssConfig { + pub endpoint: String, + pub access_key_id: String, + pub access_key_secret: String, + pub bucket_name: String, #[serde(default = "default_http_scheme")] - scheme: String, + pub scheme: String, /// Prefix object_prefix to OSS object key, for example the simulation of subdirectory: /// - object_key: sha256:xxx /// - object_prefix: nydus/ /// - object_key with object_prefix: nydus/sha256:xxx #[serde(default)] - object_prefix: String, + pub object_prefix: String, } // `OssState` is almost identical to `OssConfig`, but let's keep them separated. diff --git a/storage/src/backend/registry.rs b/storage/src/backend/registry.rs index 3cd230b9b91..c4163bc6b0f 100644 --- a/storage/src/backend/registry.rs +++ b/storage/src/backend/registry.rs @@ -94,24 +94,28 @@ impl HashCache { } } +/// Container registry configuration information to access blobs. +/// +/// This structure is externally visible through configuration file and HTTP API, please keep them +/// stable. #[derive(Clone, Deserialize, Serialize)] -struct RegistryConfig { +pub struct RegistryConfig { #[serde(default = "default_http_scheme")] - scheme: String, - host: String, - repo: String, + pub scheme: String, + pub host: String, + pub repo: String, // Base64_encoded(username:password), the field should be // sent to registry auth server to get a bearer token. 
#[serde(default)] - auth: Option, + pub auth: Option, // The field is a bearer token to be sent to registry // to authorize registry requests. #[serde(default)] - registry_token: Option, + pub registry_token: Option, #[serde(default)] - blob_url_scheme: String, + pub blob_url_scheme: String, #[serde(default)] - blob_redirected_host: String, + pub blob_redirected_host: String, } #[derive(Clone, Deserialize)] diff --git a/storage/src/cache/cachedfile.rs b/storage/src/cache/cachedfile.rs index 13416362d90..c3931e29e16 100644 --- a/storage/src/cache/cachedfile.rs +++ b/storage/src/cache/cachedfile.rs @@ -60,6 +60,8 @@ pub(crate) struct FileCacheEntry { pub(crate) is_direct_chunkmap: bool, // The blob is for an stargz image. pub(crate) is_stargz: bool, + // True if direct IO is enabled for the `self.file`, supported for fscache only. + pub(crate) dio_enabled: bool, // Data from the file cache should be validated before use. pub(crate) need_validate: bool, pub(crate) prefetch_config: Arc, @@ -310,19 +312,15 @@ impl BlobObject for FileCacheEntry { fn fetch_range_compressed(&self, offset: u64, size: u64) -> Result { let meta = self.meta.as_ref().ok_or_else(|| einval!())?; - let chunks = meta.get_chunks_compressed(offset, size)?; + let chunks = meta.get_chunks_compressed(offset, size, RAFS_DEFAULT_CHUNK_SIZE * 2)?; debug_assert!(!chunks.is_empty()); - self.do_fetch_chunks(&chunks) } fn fetch_range_uncompressed(&self, offset: u64, size: u64) -> Result { let meta = self.meta.as_ref().ok_or_else(|| einval!())?; - - // TODO: read amplify the range to naturally aligned 2M? 
- let chunks = meta.get_chunks_uncompressed(offset, size)?; + let chunks = meta.get_chunks_uncompressed(offset, size, RAFS_DEFAULT_CHUNK_SIZE * 2)?; debug_assert!(!chunks.is_empty()); - self.do_fetch_chunks(&chunks) } @@ -379,7 +377,7 @@ impl FileCacheEntry { let blob_size = (blob_end - blob_offset) as usize; match self.read_chunks(blob_offset, blob_size, &chunks[start_idx..=end_idx]) { - Ok(v) => { + Ok(mut v) => { total_size += blob_size; trace!( "range persist chunk start {} {} pending {} {}", @@ -394,10 +392,14 @@ impl FileCacheEntry { } else { chunks[idx].uncompress_offset() }; + let buf = &mut v[idx - start_idx]; + if self.dio_enabled { + self.adjust_buffer_for_dio(buf) + } trace!("persist_chunk idx {}", idx); - Self::persist_chunk(&self.file, offset, &v[idx - start_idx]).map_err( - |e| eio!(format!("do_fetch_chunk failed to persist {:?}", e)), - )?; + Self::persist_chunk(&self.file, offset, buf).map_err(|e| { + eio!(format!("do_fetch_chunk failed to persist {:?}", e)) + })?; } bitmap @@ -418,6 +420,14 @@ impl FileCacheEntry { Ok(total_size) } } + + fn adjust_buffer_for_dio(&self, buf: &mut Vec) { + debug_assert!(buf.capacity() % 0x1000 == 0); + if buf.len() != buf.capacity() { + // Padding with 0 for direct IO. + buf.resize(buf.capacity(), 0); + } + } } impl FileCacheEntry { diff --git a/storage/src/cache/filecache/mod.rs b/storage/src/cache/filecache/mod.rs index ff77f54c85a..37d1aeecd34 100644 --- a/storage/src/cache/filecache/mod.rs +++ b/storage/src/cache/filecache/mod.rs @@ -25,12 +25,18 @@ fn default_work_dir() -> String { ".".to_string() } +/// Configuration information for file cache. +/// +/// This structure is externally visible through configuration file and HTTP API, please keep them +/// stable. #[derive(Clone, Debug, Deserialize, Serialize)] -struct FileCacheConfig { +pub struct FileCacheConfig { + /// Working directory to keep cached files. 
#[serde(default = "default_work_dir")] - work_dir: String, + pub work_dir: String, + /// Legacy: disable index mapping, keep it as false when possible. #[serde(default)] - disable_indexed_map: bool, + pub disable_indexed_map: bool, } impl FileCacheConfig { @@ -149,10 +155,12 @@ impl BlobCacheMgr for FileCacheMgr { self.metrics.release().unwrap_or_else(|e| error!("{:?}", e)); } - fn gc(&self) { + fn gc(&self, id: Option<&str>) { let mut reclaim = Vec::new(); - { + if let Some(blob_id) = id { + reclaim.push(blob_id.to_string()); + } else { let guard = self.blobs.write().unwrap(); for (id, entry) in guard.iter() { if Arc::strong_count(entry) == 1 { @@ -253,6 +261,7 @@ impl FileCacheEntry { is_compressed, is_direct_chunkmap, is_stargz, + dio_enabled: false, need_validate, prefetch_config, }) @@ -277,6 +286,7 @@ impl FileCacheEntry { Arc::new(BlobStateMap::from(IndexedChunkMap::new( blob_file, blob_info.chunk_count(), + true, )?)) }; diff --git a/storage/src/cache/fscache/mod.rs b/storage/src/cache/fscache/mod.rs new file mode 100644 index 00000000000..573b023e8cf --- /dev/null +++ b/storage/src/cache/fscache/mod.rs @@ -0,0 +1,246 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +use std::collections::HashMap; +use std::fs; +use std::io::Result; +use std::sync::atomic::AtomicU32; +use std::sync::{Arc, RwLock}; + +use nydus_utils::metrics::BlobcacheMetrics; +use tokio::runtime::Runtime; + +use crate::backend::BlobBackend; +use crate::cache::cachedfile::FileCacheEntry; +use crate::cache::state::{BlobStateMap, IndexedChunkMap}; +use crate::cache::worker::{AsyncPrefetchConfig, AsyncRequestState, AsyncWorkerMgr}; +use crate::cache::{BlobCache, BlobCacheMgr}; +use crate::device::{BlobFeatures, BlobInfo}; +use crate::factory::CacheConfig; +use crate::meta::BlobMetaInfo; + +fn default_work_dir() -> String { + ".".to_string() +} + +/// Configuration information for fscache. 
+/// +/// This structure is externally visible through configuration file and HTTP API, please keep them +/// stable. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct FsCacheConfig { + /// Working directory to keep cached files. + #[serde(default = "default_work_dir")] + pub work_dir: String, +} + +impl FsCacheConfig { + fn get_work_dir(&self) -> Result<&str> { + let path = fs::metadata(&self.work_dir) + .or_else(|_| { + fs::create_dir_all(&self.work_dir)?; + fs::metadata(&self.work_dir) + }) + .map_err(|e| { + last_error!(format!( + "fail to stat fscache work_dir {}: {}", + self.work_dir, e + )) + })?; + + if path.is_dir() { + Ok(&self.work_dir) + } else { + Err(enoent!(format!( + "fscache work_dir {} is not a directory", + self.work_dir + ))) + } + } +} + +/// An implementation of [BlobCacheMgr](../trait.BlobCacheMgr.html) to improve performance by +/// caching uncompressed blob with Linux fscache subsystem. +#[derive(Clone)] +pub struct FsCacheMgr { + blobs: Arc>>>, + backend: Arc, + metrics: Arc, + prefetch_config: Arc, + runtime: Arc, + worker_mgr: Arc, + work_dir: String, + validate: bool, +} + +impl FsCacheMgr { + /// Create a new instance of `FileCacheMgr`. + pub fn new( + config: CacheConfig, + backend: Arc, + runtime: Arc, + id: &str, + ) -> Result { + let blob_config: FsCacheConfig = + serde_json::from_value(config.cache_config).map_err(|e| einval!(e))?; + let work_dir = blob_config.get_work_dir()?; + let metrics = BlobcacheMetrics::new(id, work_dir); + let prefetch_config: Arc = Arc::new(config.prefetch_config.into()); + let worker_mgr = AsyncWorkerMgr::new(metrics.clone(), prefetch_config.clone())?; + + Ok(FsCacheMgr { + blobs: Arc::new(RwLock::new(HashMap::new())), + backend, + metrics, + prefetch_config, + runtime, + worker_mgr: Arc::new(worker_mgr), + work_dir: work_dir.to_owned(), + validate: config.cache_validate, + }) + } + + // Get the file cache entry for the specified blob object. 
+ fn get(&self, blob: &Arc) -> Option> { + self.blobs.read().unwrap().get(blob.blob_id()).cloned() + } + + // Create a file cache entry for the specified blob object if not present, otherwise + // return the existing one. + fn get_or_create_cache_entry(&self, blob: &Arc) -> Result> { + if let Some(entry) = self.get(blob) { + return Ok(entry); + } + + let entry = FileCacheEntry::new_fs_cache( + self, + blob.clone(), + self.prefetch_config.clone(), + self.runtime.clone(), + self.worker_mgr.clone(), + )?; + let entry = Arc::new(entry); + let mut guard = self.blobs.write().unwrap(); + if let Some(entry) = guard.get(blob.blob_id()) { + Ok(entry.clone()) + } else { + guard.insert(blob.blob_id().to_owned(), entry.clone()); + self.metrics + .underlying_files + .lock() + .unwrap() + .insert(blob.blob_id().to_string()); + Ok(entry) + } + } +} + +impl BlobCacheMgr for FsCacheMgr { + fn init(&self) -> Result<()> { + AsyncWorkerMgr::start(self.worker_mgr.clone()) + } + + fn destroy(&self) { + self.worker_mgr.stop(); + self.backend().shutdown(); + self.metrics.release().unwrap_or_else(|e| error!("{:?}", e)); + } + + fn gc(&self, id: Option<&str>) { + if let Some(blob_id) = id { + self.blobs.write().unwrap().remove(blob_id); + } else { + let mut reclaim = Vec::new(); + let guard = self.blobs.write().unwrap(); + for (id, entry) in guard.iter() { + if Arc::strong_count(entry) == 1 { + reclaim.push(id.to_owned()); + } + } + drop(guard); + + for key in reclaim.iter() { + let mut guard = self.blobs.write().unwrap(); + if let Some(entry) = guard.get(key) { + if Arc::strong_count(entry) == 1 { + guard.remove(key); + } + } + } + } + } + + fn backend(&self) -> &(dyn BlobBackend) { + self.backend.as_ref() + } + + fn get_blob_cache(&self, blob_info: &Arc) -> Result> { + self.get_or_create_cache_entry(blob_info) + .map(|v| v as Arc) + } +} + +impl FileCacheEntry { + pub fn new_fs_cache( + mgr: &FsCacheMgr, + blob_info: Arc, + prefetch_config: Arc, + runtime: Arc, + workers: Arc, + ) -> 
Result { + if blob_info.has_feature(BlobFeatures::V5_NO_EXT_BLOB_TABLE) { + return Err(einval!("fscache does not support Rafs v5 blobs")); + } + if blob_info.is_stargz() { + return Err(einval!("fscache does not support stargz blob file")); + } + let file = blob_info + .get_fscache_file() + .ok_or_else(|| einval!("No fscache file associated with the blob_info"))?; + + let blob_file_path = format!("{}/{}", mgr.work_dir, blob_info.blob_id()); + let chunk_map = Arc::new(BlobStateMap::from(IndexedChunkMap::new( + &blob_file_path, + blob_info.chunk_count(), + false, + )?)); + let reader = mgr + .backend + .get_reader(blob_info.blob_id()) + .map_err(|_e| eio!("failed to get blob reader"))?; + let blob_size = blob_info.uncompressed_size(); + let meta = if blob_info.meta_ci_is_valid() { + Some(Arc::new(BlobMetaInfo::new( + &blob_file_path, + &blob_info, + Some(&reader), + )?)) + } else { + None + }; + + Ok(FileCacheEntry { + blob_info: blob_info.clone(), + chunk_map, + file, + meta, + metrics: mgr.metrics.clone(), + prefetch_state: Arc::new(AtomicU32::new(AsyncRequestState::Init as u32)), + reader, + runtime, + workers, + + blob_size, + compressor: blob_info.compressor(), + digester: blob_info.digester(), + is_get_blob_object_supported: true, + is_compressed: false, + is_direct_chunkmap: true, + is_stargz: false, + dio_enabled: true, + need_validate: mgr.validate, + prefetch_config, + }) + } +} diff --git a/storage/src/cache/mod.rs b/storage/src/cache/mod.rs index 625d38eed44..0a203e5c681 100644 --- a/storage/src/cache/mod.rs +++ b/storage/src/cache/mod.rs @@ -23,9 +23,6 @@ use std::slice; use std::sync::Arc; use fuse_backend_rs::transport::FileVolatileSlice; - -pub use dummycache::DummyCacheMgr; -pub use filecache::FileCacheMgr; use nydus_utils::{compress, digest}; use crate::backend::{BlobBackend, BlobReader}; @@ -40,10 +37,15 @@ use crate::{StorageResult, RAFS_MAX_CHUNK_SIZE}; mod cachedfile; mod dummycache; mod filecache; +mod fscache; mod worker; pub mod state; +pub 
use dummycache::DummyCacheMgr; +pub use filecache::{FileCacheConfig, FileCacheMgr}; +pub use fscache::{FsCacheConfig, FsCacheMgr}; + /// Timeout in milli-seconds to retrieve blob data from backend storage. pub const SINGLE_INFLIGHT_WAIT_TIMEOUT: u64 = 2000; @@ -115,7 +117,7 @@ impl<'a, F: FnMut(BlobIoRange)> BlobIoMergeState<'a, F> { } /// Configuration information for blob data prefetching. -#[derive(Clone, Default, Eq, Hash, PartialEq)] +#[derive(Clone, Debug, Default, Eq, Hash, PartialEq, Deserialize, Serialize)] pub struct BlobPrefetchConfig { /// Whether to enable blob data prefetching. pub enable: bool, @@ -342,7 +344,9 @@ pub trait BlobCacheMgr: Send + Sync { fn destroy(&self); /// Garbage-collect unused resources. - fn gc(&self) {} + fn gc(&self, _id: Option<&str>) { + todo!() + } /// Get the underlying `BlobBackend` object of the blob cache object. fn backend(&self) -> &(dyn BlobBackend); diff --git a/storage/src/cache/state/blob_state_map.rs b/storage/src/cache/state/blob_state_map.rs index 3ec4a6637cd..7117ca3d065 100644 --- a/storage/src/cache/state/blob_state_map.rs +++ b/storage/src/cache/state/blob_state_map.rs @@ -456,13 +456,13 @@ pub(crate) mod tests { let skip_index = 77; let indexed_chunk_map1 = Arc::new(BlobStateMap::from( - IndexedChunkMap::new(&blob_path, chunk_count).unwrap(), + IndexedChunkMap::new(&blob_path, chunk_count, true).unwrap(), )); let indexed_chunk_map2 = Arc::new(BlobStateMap::from( - IndexedChunkMap::new(&blob_path, chunk_count).unwrap(), + IndexedChunkMap::new(&blob_path, chunk_count, true).unwrap(), )); let indexed_chunk_map3 = Arc::new(BlobStateMap::from( - IndexedChunkMap::new(&blob_path, chunk_count).unwrap(), + IndexedChunkMap::new(&blob_path, chunk_count, true).unwrap(), )); let now = Instant::now(); @@ -549,7 +549,7 @@ pub(crate) mod tests { } let indexed_chunk_map = - BlobStateMap::from(IndexedChunkMap::new(&blob_path, chunk_count).unwrap()); + BlobStateMap::from(IndexedChunkMap::new(&blob_path, chunk_count, 
true).unwrap()); let now = Instant::now(); iterate(&chunks, &indexed_chunk_map as &dyn ChunkMap, chunk_count); let elapsed1 = now.elapsed().as_millis(); @@ -582,7 +582,7 @@ pub(crate) mod tests { // indexed ChunkMap let tmp_file = TempFile::new().unwrap(); let index_map = Arc::new(BlobStateMap::from( - IndexedChunkMap::new(tmp_file.as_path().to_str().unwrap(), 10).unwrap(), + IndexedChunkMap::new(tmp_file.as_path().to_str().unwrap(), 10, true).unwrap(), )); index_map .check_ready_and_mark_pending(chunk_1.as_ref()) @@ -658,7 +658,7 @@ pub(crate) mod tests { fn test_inflight_tracer_race() { let tmp_file = TempFile::new().unwrap(); let map = Arc::new(BlobStateMap::from( - IndexedChunkMap::new(tmp_file.as_path().to_str().unwrap(), 10).unwrap(), + IndexedChunkMap::new(tmp_file.as_path().to_str().unwrap(), 10, true).unwrap(), )); let chunk_4: Arc = Arc::new({ @@ -724,7 +724,7 @@ pub(crate) mod tests { fn test_inflight_tracer_timeout() { let tmp_file = TempFile::new().unwrap(); let map = Arc::new(BlobStateMap::from( - IndexedChunkMap::new(tmp_file.as_path().to_str().unwrap(), 10).unwrap(), + IndexedChunkMap::new(tmp_file.as_path().to_str().unwrap(), 10, true).unwrap(), )); let chunk_4: Arc = Arc::new({ @@ -768,7 +768,7 @@ pub(crate) mod tests { fn test_inflight_tracer_race_range() { let tmp_file = TempFile::new().unwrap(); let map = Arc::new(BlobStateMap::from( - IndexedChunkMap::new(tmp_file.as_path().to_str().unwrap(), 10).unwrap(), + IndexedChunkMap::new(tmp_file.as_path().to_str().unwrap(), 10, true).unwrap(), )); assert!(!map.is_range_all_ready()); diff --git a/storage/src/cache/state/indexed_chunk_map.rs b/storage/src/cache/state/indexed_chunk_map.rs index 189a48d4953..379c9ceca86 100644 --- a/storage/src/cache/state/indexed_chunk_map.rs +++ b/storage/src/cache/state/indexed_chunk_map.rs @@ -34,17 +34,17 @@ pub struct IndexedChunkMap { impl IndexedChunkMap { /// Create a new instance of `IndexedChunkMap`. 
- pub fn new(blob_path: &str, chunk_count: u32) -> Result { + pub fn new(blob_path: &str, chunk_count: u32, persist: bool) -> Result { let filename = format!("{}.{}", blob_path, FILE_SUFFIX); - PersistMap::open(&filename, chunk_count, true).map(|map| IndexedChunkMap { map }) + PersistMap::open(&filename, chunk_count, true, persist).map(|map| IndexedChunkMap { map }) } /// Create a new instance of `IndexedChunkMap` from an existing chunk map file. pub fn open(blob_info: &BlobInfo, workdir: &str) -> Result { let filename = format!("{}/{}.{}", workdir, blob_info.blob_id(), FILE_SUFFIX); - PersistMap::open(&filename, blob_info.chunk_count(), false) + PersistMap::open(&filename, blob_info.chunk_count(), false, true) .map(|map| IndexedChunkMap { map }) } } @@ -159,7 +159,7 @@ mod tests { let blob_path = dir.as_path().join("blob-1"); let blob_path = blob_path.as_os_str().to_str().unwrap().to_string(); - assert!(IndexedChunkMap::new(&blob_path, 0).is_err()); + assert!(IndexedChunkMap::new(&blob_path, 0, false).is_err()); let cache_path = format!("{}.{}", blob_path, FILE_SUFFIX); let mut file = OpenOptions::new() @@ -179,7 +179,7 @@ mod tests { let chunk = MockChunkInfo::new(); assert_eq!(chunk.id(), 0); - assert!(IndexedChunkMap::new(&blob_path, 1).is_err()); + assert!(IndexedChunkMap::new(&blob_path, 1, true).is_err()); } #[test] @@ -188,7 +188,7 @@ mod tests { let blob_path = dir.as_path().join("blob-1"); let blob_path = blob_path.as_os_str().to_str().unwrap().to_string(); - assert!(IndexedChunkMap::new(&blob_path, 0).is_err()); + assert!(IndexedChunkMap::new(&blob_path, 0, true).is_err()); let cache_path = format!("{}.{}", blob_path, FILE_SUFFIX); let _file = OpenOptions::new() @@ -207,7 +207,7 @@ mod tests { let chunk = MockChunkInfo::new(); assert_eq!(chunk.id(), 0); - let map = IndexedChunkMap::new(&blob_path, 1).unwrap(); + let map = IndexedChunkMap::new(&blob_path, 1, true).unwrap(); assert_eq!(map.map.not_ready_count.load(Ordering::Acquire), 1); 
assert_eq!(map.map.count, 1); assert_eq!(map.map.size, 0x1001); @@ -223,7 +223,7 @@ mod tests { let blob_path = dir.as_path().join("blob-1"); let blob_path = blob_path.as_os_str().to_str().unwrap().to_string(); - assert!(IndexedChunkMap::new(&blob_path, 0).is_err()); + assert!(IndexedChunkMap::new(&blob_path, 0, true).is_err()); let cache_path = format!("{}.{}", blob_path, FILE_SUFFIX); let file = OpenOptions::new() @@ -243,7 +243,7 @@ mod tests { let chunk = MockChunkInfo::new(); assert_eq!(chunk.id(), 0); - let map = IndexedChunkMap::new(&blob_path, 1).unwrap(); + let map = IndexedChunkMap::new(&blob_path, 1, true).unwrap(); assert_eq!(map.map.not_ready_count.load(Ordering::Acquire), 1); assert_eq!(map.map.count, 1); assert_eq!(map.map.size, 0x1001); @@ -259,7 +259,7 @@ mod tests { let blob_path = dir.as_path().join("blob-1"); let blob_path = blob_path.as_os_str().to_str().unwrap().to_string(); - assert!(IndexedChunkMap::new(&blob_path, 0).is_err()); + assert!(IndexedChunkMap::new(&blob_path, 0, true).is_err()); let cache_path = format!("{}.{}", blob_path, FILE_SUFFIX); let mut file = OpenOptions::new() @@ -289,7 +289,7 @@ mod tests { let chunk = MockChunkInfo::new(); assert_eq!(chunk.id(), 0); - let map = IndexedChunkMap::new(&blob_path, 1).unwrap(); + let map = IndexedChunkMap::new(&blob_path, 1, true).unwrap(); assert!(map.is_range_all_ready()); assert_eq!(map.map.count, 1); assert_eq!(map.map.size, 0x1001); @@ -304,7 +304,7 @@ mod tests { let blob_path = dir.as_path().join("blob-1"); let blob_path = blob_path.as_os_str().to_str().unwrap().to_string(); - assert!(IndexedChunkMap::new(&blob_path, 0).is_err()); + assert!(IndexedChunkMap::new(&blob_path, 0, true).is_err()); let cache_path = format!("{}.{}", blob_path, FILE_SUFFIX); let mut file = OpenOptions::new() @@ -334,7 +334,7 @@ mod tests { let chunk = MockChunkInfo::new(); assert_eq!(chunk.id(), 0); - let map = IndexedChunkMap::new(&blob_path, 1).unwrap(); + let map = IndexedChunkMap::new(&blob_path, 1, 
true).unwrap(); assert_eq!(map.map.not_ready_count.load(Ordering::Acquire), 1); assert_eq!(map.map.count, 1); assert_eq!(map.map.size, 0x1001); diff --git a/storage/src/cache/state/persist_map.rs b/storage/src/cache/state/persist_map.rs index e6628c023c7..2de64674d14 100644 --- a/storage/src/cache/state/persist_map.rs +++ b/storage/src/cache/state/persist_map.rs @@ -49,7 +49,7 @@ pub(crate) struct PersistMap { } impl PersistMap { - pub fn open(filename: &str, chunk_count: u32, create: bool) -> Result { + pub fn open(filename: &str, chunk_count: u32, create: bool, persist: bool) -> Result { if chunk_count == 0 { return Err(einval!("chunk count should be greater than 0")); } @@ -58,6 +58,7 @@ impl PersistMap { .read(true) .write(create) .create(create) + .truncate(!persist) .open(filename) .map_err(|err| { einval!(format!( @@ -157,6 +158,9 @@ impl PersistMap { } readahead(fd, 0, expected_size); + if !persist { + let _ = std::fs::remove_file(filename); + } Ok(Self { count: chunk_count, diff --git a/storage/src/cache/state/range_map.rs b/storage/src/cache/state/range_map.rs index c1fed9931b5..7d4cda2f42a 100644 --- a/storage/src/cache/state/range_map.rs +++ b/storage/src/cache/state/range_map.rs @@ -28,7 +28,7 @@ impl BlobRangeMap { let filename = format!("{}.{}", blob_path, FILE_SUFFIX); debug_assert!(shift < 64); - PersistMap::open(&filename, count, true).map(|map| BlobRangeMap { shift, map }) + PersistMap::open(&filename, count, true, true).map(|map| BlobRangeMap { shift, map }) } /// Create a new instance of `BlobRangeMap` from an existing chunk map file. 
@@ -36,7 +36,7 @@ impl BlobRangeMap { let filename = format!("{}/{}.{}", workdir, blob_id, FILE_SUFFIX); debug_assert!(shift < 64); - PersistMap::open(&filename, count, false).map(|map| BlobRangeMap { shift, map }) + PersistMap::open(&filename, count, false, true).map(|map| BlobRangeMap { shift, map }) } pub(crate) fn get_range(&self, start: u64, count: u64) -> Result<(u32, u32)> { diff --git a/storage/src/device.rs b/storage/src/device.rs index b52ae35e249..71b9fb55004 100644 --- a/storage/src/device.rs +++ b/storage/src/device.rs @@ -22,6 +22,7 @@ use std::any::Any; use std::cmp; use std::fmt::{Debug, Formatter}; +use std::fs::File; use std::io::{self, Error}; use std::os::unix::io::AsRawFd; use std::sync::Arc; @@ -97,6 +98,8 @@ pub struct BlobInfo { meta_ci_compressed_size: u64, /// V6: Size of the uncompressed chunk information array. meta_ci_uncompressed_size: u64, + + fs_cache_file: Option>, } impl BlobInfo { @@ -130,6 +133,8 @@ impl BlobInfo { meta_ci_offset: 0, meta_ci_compressed_size: 0, meta_ci_uncompressed_size: 0, + + fs_cache_file: None, }; blob_info.compute_features(); @@ -320,6 +325,16 @@ impl BlobInfo { && self.meta_ci_compressed_size != 0 && self.meta_ci_uncompressed_size != 0 } + + /// Set the associated `File` object provided by Linux fscache subsystem. + pub fn set_fscache_file(&mut self, file: Option>) { + self.fs_cache_file = file; + } + + /// Get the associated `File` object provided by Linux fscache subsystem. + pub fn get_fscache_file(&self) -> Option> { + self.fs_cache_file.clone() + } } bitflags! 
{ diff --git a/storage/src/factory.rs b/storage/src/factory.rs index 1413efd9e34..3f43504f3eb 100644 --- a/storage/src/factory.rs +++ b/storage/src/factory.rs @@ -30,7 +30,9 @@ use crate::backend::oss; #[cfg(feature = "backend-registry")] use crate::backend::registry; use crate::backend::BlobBackend; -use crate::cache::{BlobCache, BlobCacheMgr, BlobPrefetchConfig, DummyCacheMgr, FileCacheMgr}; +use crate::cache::{ + BlobCache, BlobCacheMgr, BlobPrefetchConfig, DummyCacheMgr, FileCacheMgr, FsCacheMgr, +}; use crate::device::BlobInfo; lazy_static! { @@ -49,6 +51,9 @@ lazy_static! { } /// Configuration information for storage backend. +/// +/// This structure is externally visible through configuration file and HTTP API, please keep them +/// stable. #[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] pub struct BackendConfig { /// Type of storage backend. @@ -86,7 +91,10 @@ impl BackendConfig { } /// Configuration information for blob cache manager. -#[derive(Clone, Default, Deserialize, Eq, PartialEq, Serialize)] +/// +/// This structure is externally visible through configuration file and HTTP API, please keep them +/// stable. +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] pub struct CacheConfig { /// Type of blob cache. #[serde(default, rename = "type")] @@ -106,7 +114,10 @@ pub struct CacheConfig { } /// Configuration information to create blob cache manager. -#[derive(Clone, Default, Deserialize, Eq, PartialEq, Serialize)] +/// +/// This structure is externally visible through configuration file and HTTP API, please keep them +/// stable. +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] pub struct FactoryConfig { /// Id of the factory. 
#[serde(default)] @@ -126,6 +137,7 @@ struct BlobCacheMgrKey { #[allow(clippy::derive_hash_xor_eq)] impl Hash for BlobCacheMgrKey { fn hash(&self, state: &mut H) { + self.config.id.hash(state); self.config.backend.backend_type.hash(state); self.config.cache.cache_type.hash(state); self.config.cache.prefetch_config.hash(state); @@ -176,6 +188,16 @@ impl BlobFactory { mgr.init()?; Arc::new(mgr) as Arc } + "fscache" => { + let mgr = FsCacheMgr::new( + config.cache.clone(), + backend, + ASYNC_RUNTIME.clone(), + &config.id, + )?; + mgr.init()?; + Arc::new(mgr) as Arc + } _ => { let mgr = DummyCacheMgr::new(config.cache.clone(), backend, false, false)?; mgr.init()?; @@ -190,8 +212,18 @@ impl BlobFactory { } /// Garbage-collect unused blob cache managers and blob caches. - pub fn gc(&self) { - unimplemented!("TODO") + pub fn gc(&self, victim: Option<(&Arc, &str)>) { + if let Some((config, id)) = victim { + let key = BlobCacheMgrKey { + config: config.clone(), + }; + let mgr = self.mgrs.lock().unwrap().get(&key).cloned(); + if let Some(mgr) = mgr { + mgr.gc(Some(id)); + } + } else { + unimplemented!("TODO") + } } /// Create a storage backend for the blob with id `blob_id`. diff --git a/storage/src/meta/mod.rs b/storage/src/meta/mod.rs index fe2fc9085ce..c8cef54e111 100644 --- a/storage/src/meta/mod.rs +++ b/storage/src/meta/mod.rs @@ -406,7 +406,12 @@ impl BlobMetaInfo { /// - `start` is bigger than blob size. /// - some portion of the range [start, start + size) is not covered by chunks. /// - the blob metadata is invalid. 
- pub fn get_chunks_uncompressed(&self, start: u64, size: u64) -> Result> { + pub fn get_chunks_uncompressed( + &self, + start: u64, + size: u64, + batch_size: u64, + ) -> Result> { let end = start.checked_add(size).ok_or_else(|| einval!())?; if end > self.state.uncompressed_size { return Err(einval!(format!( @@ -414,6 +419,14 @@ impl BlobMetaInfo { end, self.state.uncompressed_size ))); } + let batch_end = if batch_size <= size { + end + } else { + std::cmp::min( + start.checked_add(batch_size).unwrap_or(end), + self.state.uncompressed_size, + ) + }; let infos = &*self.state.chunks; let mut index = self.state.get_chunk_index_nocheck(start, false)?; @@ -432,7 +445,7 @@ impl BlobMetaInfo { vec.push(BlobMetaChunk::new(index, &self.state)); let mut last_end = entry.aligned_uncompressed_end(); - if last_end >= end { + if last_end >= batch_end { Ok(vec) } else { while index + 1 < infos.len() { @@ -448,9 +461,14 @@ impl BlobMetaInfo { ))); } + // Avoid read amplify if next chunk is too big. + if last_end >= end && entry.aligned_uncompressed_end() > batch_end { + return Ok(vec); + } + vec.push(BlobMetaChunk::new(index, &self.state)); last_end = entry.aligned_uncompressed_end(); - if last_end >= end { + if last_end >= batch_end { return Ok(vec); } } @@ -470,7 +488,12 @@ impl BlobMetaInfo { /// - `start` is bigger than blob size. /// - some portion of the range [start, start + size) is not covered by chunks. /// - the blob metadata is invalid. 
- pub fn get_chunks_compressed(&self, start: u64, size: u64) -> Result> { + pub fn get_chunks_compressed( + &self, + start: u64, + size: u64, + batch_size: u64, + ) -> Result> { let end = start.checked_add(size).ok_or_else(|| einval!())?; if end > self.state.compressed_size { return Err(einval!(format!( @@ -478,6 +501,14 @@ impl BlobMetaInfo { end, self.state.compressed_size ))); } + let batch_end = if batch_size <= size { + end + } else { + std::cmp::min( + start.checked_add(batch_size).unwrap_or(end), + self.state.compressed_size, + ) + }; let infos = &*self.state.chunks; let mut index = self.state.get_chunk_index_nocheck(start, true)?; @@ -489,7 +520,7 @@ impl BlobMetaInfo { vec.push(BlobMetaChunk::new(index, &self.state)); let mut last_end = entry.compressed_end(); - if last_end >= end { + if last_end >= batch_end { Ok(vec) } else { while index + 1 < infos.len() { @@ -500,9 +531,14 @@ impl BlobMetaInfo { return Err(einval!()); } + // Avoid read amplify if next chunk is too big. + if last_end >= end && entry.compressed_end() > batch_end { + return Ok(vec); + } + vec.push(BlobMetaChunk::new(index, &self.state)); last_end = entry.compressed_end(); - if last_end >= end { + if last_end >= batch_end { return Ok(vec); } } @@ -826,7 +862,7 @@ mod tests { state: Arc::new(state), }; - let vec = info.get_chunks_uncompressed(0x0, 0x1001).unwrap(); + let vec = info.get_chunks_uncompressed(0x0, 0x1001, 0).unwrap(); assert_eq!(vec.len(), 1); assert_eq!(vec[0].blob_index(), 1); assert_eq!(vec[0].id(), 0); @@ -837,7 +873,7 @@ mod tests { assert!(vec[0].is_compressed()); assert!(!vec[0].is_hole()); - let vec = info.get_chunks_uncompressed(0x0, 0x4000).unwrap(); + let vec = info.get_chunks_uncompressed(0x0, 0x4000, 0).unwrap(); assert_eq!(vec.len(), 2); assert_eq!(vec[1].blob_index(), 1); assert_eq!(vec[1].id(), 1); @@ -848,24 +884,24 @@ mod tests { assert!(!vec[1].is_compressed()); assert!(!vec[1].is_hole()); - let vec = info.get_chunks_uncompressed(0x0, 0x4001).unwrap(); + let 
vec = info.get_chunks_uncompressed(0x0, 0x4001, 0).unwrap(); assert_eq!(vec.len(), 3); - let vec = info.get_chunks_uncompressed(0x100000, 0x2000).unwrap(); + let vec = info.get_chunks_uncompressed(0x100000, 0x2000, 0).unwrap(); assert_eq!(vec.len(), 1); - assert!(info.get_chunks_uncompressed(0x0, 0x6001).is_err()); - assert!(info.get_chunks_uncompressed(0x0, 0xfffff).is_err()); - assert!(info.get_chunks_uncompressed(0x0, 0x100000).is_err()); - assert!(info.get_chunks_uncompressed(0x0, 0x104000).is_err()); - assert!(info.get_chunks_uncompressed(0x0, 0x104001).is_err()); - assert!(info.get_chunks_uncompressed(0x100000, 0x2001).is_err()); - assert!(info.get_chunks_uncompressed(0x100000, 0x4000).is_err()); - assert!(info.get_chunks_uncompressed(0x100000, 0x4001).is_err()); + assert!(info.get_chunks_uncompressed(0x0, 0x6001, 0).is_err()); + assert!(info.get_chunks_uncompressed(0x0, 0xfffff, 0).is_err()); + assert!(info.get_chunks_uncompressed(0x0, 0x100000, 0).is_err()); + assert!(info.get_chunks_uncompressed(0x0, 0x104000, 0).is_err()); + assert!(info.get_chunks_uncompressed(0x0, 0x104001, 0).is_err()); + assert!(info.get_chunks_uncompressed(0x100000, 0x2001, 0).is_err()); + assert!(info.get_chunks_uncompressed(0x100000, 0x4000, 0).is_err()); + assert!(info.get_chunks_uncompressed(0x100000, 0x4001, 0).is_err()); assert!(info - .get_chunks_uncompressed(0x102000, 0xffff_ffff_ffff_ffff) + .get_chunks_uncompressed(0x102000, 0xffff_ffff_ffff_ffff, 0) .is_err()); - assert!(info.get_chunks_uncompressed(0x104000, 0x1).is_err()); + assert!(info.get_chunks_uncompressed(0x104000, 0x1, 0).is_err()); } #[test] diff --git a/storage/src/utils.rs b/storage/src/utils.rs index 4b42b66465e..f2fc9308c5b 100644 --- a/storage/src/utils.rs +++ b/storage/src/utils.rs @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 //! Utility helpers to supprt the storage subsystem. 
+use std::alloc::{alloc, Layout}; use std::cmp::{self, min}; use std::io::{ErrorKind, Result}; use std::os::unix::io::RawFd; @@ -222,13 +223,12 @@ pub fn readahead(fd: libc::c_int, mut offset: u64, end: u64) { /// A customized buf allocator that avoids zeroing pub fn alloc_buf(size: usize) -> Vec { - let mut buf = Vec::with_capacity(size); - // It's ok to provide uninitialized data buffer, the caller should take care of it. - #[allow(clippy::uninit_vec)] - unsafe { - buf.set_len(size) - }; - buf + debug_assert!(size < isize::MAX as usize); + let layout = Layout::from_size_align(size, 0x1000) + .unwrap() + .pad_to_align(); + let ptr = unsafe { alloc(layout) }; + unsafe { Vec::from_raw_parts(ptr, size, layout.size()) } } /// Check hash of data matches provided one