diff --git a/CHANGELOG.md b/CHANGELOG.md index 6a87f499..0f6e8e1a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Fixed + - [589](https://github.com/thoth-pub/thoth/issues/589) - Truncation of `short_abstract` in Thoth ONIX results in Invalid UTF-8 sequences ## [[0.12.0]](https://github.com/thoth-pub/thoth/releases/tag/v0.12.0) - 2024-03-14 ### Removed diff --git a/thoth-export-server/src/xml/onix3_thoth.rs b/thoth-export-server/src/xml/onix3_thoth.rs index aa68f0b2..0f718536 100644 --- a/thoth-export-server/src/xml/onix3_thoth.rs +++ b/thoth-export-server/src/xml/onix3_thoth.rs @@ -422,8 +422,13 @@ impl XmlElementBlock for Work { })?; write_element_block("CollateralDetail", w, |w| { if let Some(mut short_abstract) = self.short_abstract.clone() { - // Short description field may not exceed 350 characters - short_abstract.truncate(350); + // Short description field may not exceed 350 characters. + // Ensure that the string is truncated at a valid UTF-8 boundary + // by finding the byte index of the 351st character (zero-based index 350) and then truncating + // the string at that index, to avoid creating invalid UTF-8 sequences. + if let Some((byte_index, _)) = short_abstract.char_indices().nth(350) { + short_abstract.truncate(byte_index); + } write_element_block("TextContent", w, |w| { // 02 Short description write_element_block("TextType", w, |w| {