Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GString, StringName: add conversions from bytes and C-strings #1062

Merged
merged 1 commit into from
Mar 4, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion godot-core/src/builtin/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ pub mod __prelude_reexport {
pub use rect2i::*;
pub use rid::*;
pub use signal::*;
pub use string::{GString, NodePath, StringName};
pub use string::{Encoding, GString, NodePath, StringName};
pub use transform2d::*;
pub use transform3d::*;
pub use variant::*;
Expand Down
74 changes: 71 additions & 3 deletions godot-core/src/builtin/string/gstring.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,16 @@
*/

use std::convert::Infallible;
use std::ffi::c_char;
use std::fmt;
use std::fmt::Write;

use godot_ffi as sys;
use sys::types::OpaqueString;
use sys::{ffi_methods, interface_fn, GodotFfi};

use crate::builtin::string::Encoding;
use crate::builtin::{inner, NodePath, StringName, Variant};
use crate::meta::error::StringError;
use crate::meta::AsArg;
use crate::{impl_shared_string_api, meta};

Expand Down Expand Up @@ -77,6 +78,73 @@ impl GString {
Self::default()
}

/// Convert string from bytes with given encoding, returning `Err` on validation errors.
///
/// Intermediate `NUL` characters are not accepted in Godot and always return `Err`.
///
/// Some notes on the encodings:
/// - **Latin-1:** Since every byte is a valid Latin-1 character, no validation besides the `NUL` byte is performed.
/// It is your responsibility to ensure that the input is valid Latin-1.
/// - **ASCII**: Subset of Latin-1, which is additionally validated to be valid, non-`NUL` ASCII characters.
/// - **UTF-8**: The input is validated to be UTF-8.
///
/// Specifying incorrect encoding is safe, but may result in unintended string values.
pub fn try_from_bytes(bytes: &[u8], encoding: Encoding) -> Result<Self, StringError> {
Self::try_from_bytes_with_nul_check(bytes, encoding, true)
}

/// Convert string from C-string with given encoding, returning `Err` on validation errors.
///
/// Convenience function for [`try_from_bytes()`](Self::try_from_bytes); see its docs for more information.
pub fn try_from_cstr(cstr: &std::ffi::CStr, encoding: Encoding) -> Result<Self, StringError> {
Self::try_from_bytes_with_nul_check(cstr.to_bytes(), encoding, false)
}

pub(super) fn try_from_bytes_with_nul_check(
bytes: &[u8],
encoding: Encoding,
check_nul: bool,
) -> Result<Self, StringError> {
match encoding {
Encoding::Ascii => {
// If the bytes are ASCII, we can fall back to Latin-1, which is always valid (except for NUL).
// is_ascii() does *not* check for the NUL byte, so the check in the Latin-1 branch is still necessary.
if bytes.is_ascii() {
Self::try_from_bytes_with_nul_check(bytes, Encoding::Latin1, check_nul)
.map_err(|_e| StringError::new("intermediate NUL byte in ASCII string"))
} else {
Err(StringError::new("invalid ASCII"))
}
}
Encoding::Latin1 => {
// Intermediate NUL bytes are not accepted in Godot. Both ASCII + Latin-1 encodings need to explicitly check for this.
if check_nul && bytes.contains(&0) {
// Error overwritten when called from ASCII branch.
return Err(StringError::new("intermediate NUL byte in Latin-1 string"));
}

let s = unsafe {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

missing safety comment

Self::new_with_string_uninit(|string_ptr| {
let ctor = interface_fn!(string_new_with_latin1_chars_and_len);
ctor(
string_ptr,
bytes.as_ptr() as *const std::ffi::c_char,
bytes.len() as i64,
);
})
};
Ok(s)
}
Encoding::Utf8 => {
// from_utf8() also checks for intermediate NUL bytes.
let utf8 = std::str::from_utf8(bytes);

utf8.map(GString::from)
.map_err(|e| StringError::with_source("invalid UTF-8", e))
}
}
}

/// Number of characters in the string.
///
/// _Godot equivalent: `length`_
Expand Down Expand Up @@ -260,7 +328,7 @@ impl From<&str> for GString {
let ctor = interface_fn!(string_new_with_utf8_chars_and_len);
ctor(
string_ptr,
bytes.as_ptr() as *const c_char,
bytes.as_ptr() as *const std::ffi::c_char,
bytes.len() as i64,
);
})
Expand Down Expand Up @@ -307,7 +375,7 @@ impl From<&GString> for String {

interface_fn!(string_to_utf8_chars)(
string.string_sys(),
buf.as_mut_ptr() as *mut c_char,
buf.as_mut_ptr() as *mut std::ffi::c_char,
len,
);

Expand Down
14 changes: 14 additions & 0 deletions godot-core/src/builtin/string/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,20 @@ impl FromGodot for String {
}
}

// ----------------------------------------------------------------------------------------------------------------------------------------------
// Encoding

/// Specifies string encoding.
///
/// Used in functions such as [`GString::try_from_bytes()`][GString::try_from_bytes] to handle multiple input string encodings.
#[non_exhaustive]
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe include Default? utf8 would make sense to me as it's the default in rust elsewhere.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This would lead to code such as:

GString::try_from_bytes(bytes, Default::default())

which I find much less readable than:

GString::try_from_bytes(bytes, Encoding::Utf8)

Since UTF-8 isn't a 100% obvious default (Godot itself uses a mix of Latin-1 and UTF-32), I'd rather make this explicit.

pub enum Encoding {
Ascii,
Latin1,
Utf8,
}

// ----------------------------------------------------------------------------------------------------------------------------------------------

/// Returns a tuple of `(from, len)` from a Rust range.
Expand Down
82 changes: 80 additions & 2 deletions godot-core/src/builtin/string/string_name.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/

use std::fmt;

use godot_ffi as sys;
use godot_ffi::interface_fn;
use sys::{ffi_methods, GodotFfi};

use crate::builtin::{inner, GString, NodePath, Variant};
use crate::builtin::{inner, Encoding, GString, NodePath, Variant};
use crate::meta::error::StringError;
use crate::meta::AsArg;
use crate::{impl_shared_string_api, meta};

Expand Down Expand Up @@ -60,6 +61,83 @@ impl StringName {
Self { opaque }
}

/// Convert string from bytes with given encoding, returning `Err` on validation errors.
///
/// Intermediate `NUL` characters are not accepted in Godot and always return `Err`.
///
/// Some notes on the encodings:
/// - **Latin-1:** Since every byte is a valid Latin-1 character, no validation besides the `NUL` byte is performed.
/// It is your responsibility to ensure that the input is valid Latin-1.
/// - **ASCII**: Subset of Latin-1, which is additionally validated to be valid, non-`NUL` ASCII characters.
/// - **UTF-8**: The input is validated to be UTF-8.
///
/// Specifying incorrect encoding is safe, but may result in unintended string values.
pub fn try_from_bytes(bytes: &[u8], encoding: Encoding) -> Result<Self, StringError> {
Self::try_from_bytes_with_nul_check(bytes, encoding, true)
}

/// Convert string from bytes with given encoding, returning `Err` on validation errors.
///
/// Convenience function for [`try_from_bytes()`](Self::try_from_bytes); see its docs for more information.
///
/// When called with `Encoding::Latin1`, this can be slightly more efficient than `try_from_bytes()`.
pub fn try_from_cstr(cstr: &std::ffi::CStr, encoding: Encoding) -> Result<Self, StringError> {
// Short-circuit the direct Godot 4.2 function for Latin-1, which takes a null-terminated C string.
#[cfg(since_api = "4.2")]
if encoding == Encoding::Latin1 {
// Note: CStr guarantees no intermediate NUL bytes, so we don't need to check for them.

let is_static = sys::conv::SYS_FALSE;
let s = unsafe {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

missing safety comment.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Many such FFI accesses have nothing:

unsafe {
Self::new_with_string_uninit(|string_ptr| {
let ctor = interface_fn!(string_new_with_utf8_chars_and_len);

unsafe { Gd::from_obj_sys(object_ptr) }

unsafe {
Self::new_with_uninit(|self_ptr| {
let ctor = sys::builtin_fn!(callable_from_object_method);

unsafe {
Self::new_with_uninit(|self_ptr| {
let ctor = sys::builtin_fn!(signal_from_object_signal);
let raw = object.to_ffi();

let result = unsafe {
Variant::new_with_var_uninit(|variant_ptr| {
interface_fn!(variant_call)(
sys::SysPtr::force_mut(self.var_sys()),


Some have a comment that's not descriptive...

// SAFETY: callable_custom_create() is a valid way of creating callables.
unsafe {
Callable::new_with_uninit(|type_ptr| {

...or only mentions one particular behavior, ignoring the wider safety:

// SAFETY: A `char` value is by definition a valid Unicode code point.
unsafe {
Self::new_with_string_uninit(|string_ptr| {

// SAFETY: safe to call for non-object variants (returns 0).
let raw_id: u64 = unsafe { interface_fn!(variant_get_object_instance_id)(self.var_sys()) };

// SAFETY: Godot looks up ID in ObjectDB and returns null if not found.
unsafe { sys::interface_fn!(object_get_instance_from_id)(instance_id.to_u64()) }

(Yes, I wrote many of those 😅 )


When calling low-level Godot functions, there's often not much that can be said beyond "passed pointer is valid", "Godot function pointer is initialized" or other repetitive points that hold for all low-level constructors.

Lints that enforce safety comments are deliberately not turned on, as this would often lead to bureaucracy and likely decreased doc quality. Being more selective gives those unsafe statements with long safety comments more attention. This goes a bit into the topic of #1046.

That's at least how I see it 🙂

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i dont think i entirely agree. i do agree that just saying "godot function pointer is initialized" isn't really needed at the moment, that's something which falls under #1046 and so is already technically unsound anyway. but the other parts, mainly explaining what kind of data is expected and why it's valid to call the function with this data. that part i think is still useful.

like here and the other place in this PR, you rely on a cstr/byte-array without null always being a valid latin1 encoded string. which isn't necessarily immediately obvious.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How would you write the safety statement in this case?

Copy link
Member

@lilizoey lilizoey Mar 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

honestly i'd probably just make the Note above into the safety comment, and reword it slightly. so something like
// SAFETY: CStr guarantees no intermediate NUL bytes, and so is automatically a valid latin1 encoded string.

Copy link
Member Author

@Bromeon Bromeon Mar 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Edit: sorry I missed your intermediate response, took me a bit longer to formulate.

like here and the other place in this PR, you rely on a cstr/byte-array without null always being a valid latin1 encoded string. which isn't necessarily immediately obvious.

I agree with focusing on non-obvious cases, but here is a good example why a safety comment can be worse than none 🙂 the fact that it's valid Latin-1 isn't relevant for safety.

It's only relevant for correctness -- and that part is explained in the RustDoc already:

  • Latin-1: Since every byte is a valid Latin-1 character, no validation besides the NUL byte is performed.
    It is your responsibility to ensure that the input is valid Latin-1.

And "valid" here means "correctly encoded text". If you pass UTF-8, you'll get garbage, but no UB.

What makes the GDExtension call actually sound is the validity of the pointers and length. Since this is true for every pointer-accepting FFI call in existence, I don't see the value of repeating it every time.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What makes the GDExtension call actually sound is the validity of the pointers and length. Since this is true for every pointer-accepting FFI call in existence, I don't see the value of repeating it every time.

no? what godot decides to do with the data behind the pointers you pass it is important for safety. so when making an ffi call into godot you should (in the safety comment) explain why the data you are passing to godot wont cause UB when godot uses it. and in this case, that involves explaining that godot will treat this as a latin1 encoded string.

so the fact that we are passing godot a latin1 encoded string, and that godot will treat it as a latin1 encoded string, that is what makes the call safe (in addition to the pointer validity and length).

my safety comment may not express that as clearly, the previous paragraph is left kinda implied. my assumption was that it was already understood that the safety comment before an ffi-call explains what godot does with the input and why this input wont be UB in that case. a pointer being valid for reads only explains that godot will access the value, not what godot will subsequently do with the value.

Copy link
Member Author

@Bromeon Bromeon Mar 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no? what godot decides to do with the data behind the pointers you pass it is important for safety. so when making an ffi call into godot you should (in the safety comment) explain why the data you are passing to godot wont cause UB when godot uses it.

I don't know the implementation. I know the signature, and from that I can infer that one pointer is the to-be-constructed string instance, and that the other (ptr, len) pair points to an allocated byte range representing the characters. I need to rely on Godot doing the expected thing.

If I didn't, I'd need to study every single implementation. We don't do this for all the examples I showed above, let alone the thousands of individual class methods. We rely on the signature from Godot being a contract. I don't see why this here is different from my examples or class methods.

Latin-1 is only relevant to mention if you can cause UB when not passing in Latin-1.

Self::new_with_string_uninit(|string_ptr| {
let ctor = interface_fn!(string_name_new_with_latin1_chars);
ctor(
string_ptr,
cstr.as_ptr() as *const std::ffi::c_char,
is_static,
);
})
};
return Ok(s);
}

Self::try_from_bytes_with_nul_check(cstr.to_bytes(), encoding, false)
}

fn try_from_bytes_with_nul_check(
bytes: &[u8],
encoding: Encoding,
check_nul: bool,
) -> Result<Self, StringError> {
match encoding {
Encoding::Ascii => {
// ASCII is a subset of UTF-8, and UTF-8 has a more direct implementation than Latin-1; thus use UTF-8 via `From<&str>`.
if !bytes.is_ascii() {
Err(StringError::new("invalid ASCII"))
} else if check_nul && bytes.contains(&0) {
Err(StringError::new("intermediate NUL byte in ASCII string"))
} else {
// SAFETY: ASCII is a subset of UTF-8 and was verified above.
let ascii = unsafe { std::str::from_utf8_unchecked(bytes) };
Ok(Self::from(ascii))
}
}
Encoding::Latin1 => {
// This branch is short-circuited if invoked for CStr and Godot 4.2+, which uses `string_name_new_with_latin1_chars`
// (requires nul-termination). In general, fall back to GString conversion.
GString::try_from_bytes_with_nul_check(bytes, Encoding::Latin1, check_nul)
.map(Self::from)
}
Encoding::Utf8 => {
// from_utf8() also checks for intermediate NUL bytes.
let utf8 = std::str::from_utf8(bytes);

utf8.map(StringName::from)
.map_err(|e| StringError::with_source("invalid UTF-8", e))
}
}
}

/// Number of characters in the string.
///
/// _Godot equivalent: `length`_
Expand Down
2 changes: 2 additions & 0 deletions godot-core/src/meta/error/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
mod call_error;
mod convert_error;
mod io_error;
mod string_error;

pub use call_error::*;
pub use convert_error::*;
pub use io_error::*;
pub use string_error::*;
51 changes: 51 additions & 0 deletions godot-core/src/meta/error/string_error.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/*
* Copyright (c) godot-rust; Bromeon and contributors.
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/

use std::error::Error;
use std::fmt;

/// Error related to string encoding/decoding.
#[derive(Debug)]
pub struct StringError {
message: String,
source: Option<Box<(dyn Error + 'static)>>,
}

impl fmt::Display for StringError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if let Some(source) = self.source() {
write!(f, "{}: {}", self.message, source)
} else {
write!(f, "{}", self.message)
}
}
}

impl Error for StringError {
fn source(&self) -> Option<&(dyn Error + 'static)> {
self.source.as_deref()
}
}

impl StringError {
pub(crate) fn new(message: impl Into<String>) -> Self {
Self {
message: message.into(),
source: None,
}
}

pub(crate) fn with_source(
message: impl Into<String>,
source: impl Into<Box<(dyn Error + 'static)>>,
) -> Self {
Self {
message: message.into(),
source: Some(source.into()),
}
}
}
1 change: 1 addition & 0 deletions itest/rust/src/builtin_tests/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ mod string {
mod gstring_test;
mod node_path_test;
mod string_name_test;
mod string_test_macros;
}

mod script {
Expand Down
28 changes: 21 additions & 7 deletions itest/rust/src/builtin_tests/string/gstring_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
use std::collections::HashSet;

use crate::framework::{expect_debug_panic_or_release_ok, itest};
use godot::builtin::{GString, PackedStringArray};
use godot::builtin::{Encoding, GString, PackedStringArray};

// TODO use tests from godot-rust/gdnative

Expand Down Expand Up @@ -150,7 +150,7 @@ fn string_substr() {
}

#[itest]
fn string_find() {
fn gstring_find() {
let s = GString::from("Hello World");

assert_eq!(s.find("o"), Some(4));
Expand All @@ -171,7 +171,7 @@ fn string_find() {
}

#[itest]
fn string_split() {
fn gstring_split() {
let s = GString::from("Hello World");
assert_eq!(s.split(" "), packed(&["Hello", "World"]));
assert_eq!(
Expand Down Expand Up @@ -206,7 +206,7 @@ fn string_split() {
}

#[itest]
fn string_count() {
fn gstring_count() {
let s = GString::from("Long sentence with Sentry guns.");
assert_eq!(s.count("sent", ..), 1);
assert_eq!(s.count("en", 6..), 3);
Expand All @@ -224,7 +224,7 @@ fn string_count() {
}

#[itest]
fn string_erase() {
fn gstring_erase() {
let s = GString::from("Hello World");
assert_eq!(s.erase(..), GString::new());
assert_eq!(s.erase(4..4), s);
Expand All @@ -236,7 +236,7 @@ fn string_erase() {
}

#[itest]
fn string_insert() {
fn gstring_insert() {
let s = GString::from("H World");
assert_eq!(s.insert(1, "i"), "Hi World".into());
assert_eq!(s.insert(1, "ello"), "Hello World".into());
Expand All @@ -248,7 +248,7 @@ fn string_insert() {
}

#[itest]
fn string_pad() {
fn gstring_pad() {
let s = GString::from("123");
assert_eq!(s.lpad(5, '0'), "00123".into());
assert_eq!(s.lpad(2, ' '), "123".into());
Expand All @@ -266,7 +266,21 @@ fn string_pad() {
assert_eq!(s.pad_zeros(2), "123.456".into());
}

// Byte and C-string conversions.
crate::generate_string_bytes_and_cstr_tests!(
builtin: GString,
tests: [
gstring_from_bytes_ascii,
gstring_from_cstr_ascii,
gstring_from_bytes_latin1,
gstring_from_cstr_latin1,
gstring_from_bytes_utf8,
gstring_from_cstr_utf8,
]
);

// ----------------------------------------------------------------------------------------------------------------------------------------------
// Helpers

fn packed(strings: &[&str]) -> PackedStringArray {
strings.iter().map(|&s| GString::from(s)).collect()
Expand Down
Loading
Loading