Skip to content

Commit

Permalink
Support Unicode character substitutions
Browse files Browse the repository at this point in the history
  • Loading branch information
daviddrysdale committed Oct 24, 2023
1 parent 9204bcd commit 494a26f
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 5 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
## 0.1.1 - TBD

- Support conversion of HTML comments
- Add `unicode-subst` config option for Unicode character substitution.

## 0.1.0 - 2023-10-21

Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ The following configuration values can be set in `book.toml`.
| `allow-asciidoc` | boolean | false | Cope with some AsciiDoc constructs included in the Markdown source when set. |
| `heading-offset` | integer | 0 | Extra offset to apply to heading levels. |
| `skip-chapters` | string | "" | Comma-separated list of filenames to ignore when generating AsciiDoc. |
| `unicode-subst` | map<string,string> | empty | Map Unicode characters (either as-is or in U+1234 form) to replacements. |

### AsciiDoc Passthrough

Expand Down
79 changes: 74 additions & 5 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use mdbook::{
};
use pulldown_cmark as md;
use regex::Regex;
use std::collections::HashSet;
use std::collections::{HashMap, HashSet};
use std::io::Write;
use std::path::{Path, PathBuf};

Expand All @@ -21,7 +21,10 @@ lazy_static! {
/// Regexp to match the invented tag used to transmit raw AsciiDoc.
static ref ASCIIDOC_ESCAPE_RE: Regex =
Regex::new(r#"<asciidoc content='(?P<content>[^']+)'"#).unwrap();
/// Regexp to match HTML comments.
static ref HTML_COMMENT_RE : Regex = Regex::new(r#"<!--\s*(?P<comment>.*?)\s*-->"#).unwrap();
/// Regexp to match Unicode U+1234 expressions.
static ref UNICODE_CHAR_RE: Regex = Regex::new(r#"^U\+(?P<code>[0-9a-fA-F]{4})(?P<rest>.*)$"#).unwrap();
/// Tags attached to code blocks that should be ignored.
static ref IGNORED_CODE_TAGS: HashSet<&'static str> = HashSet::from([
"ignore",
Expand Down Expand Up @@ -226,6 +229,8 @@ struct AsciiDocBackend {
heading_offset: isize,
/// Chapters to skip (by name).
skip_chapters: Vec<String>,
/// Unicode character substitutions. Keys and values are a single char.
unicode_subst: HashMap<String, String>,
}

impl AsciiDocBackend {
Expand Down Expand Up @@ -256,6 +261,43 @@ impl AsciiDocBackend {
.map(|s| s.to_owned())
.collect();

let mut unicode_subst = HashMap::new();
if let Some(toml::Value::Table(table)) = ctx.config.get("output.asciidoc.unicode-subst") {
for (key, val) in table {
if let toml::Value::String(value) = val {
let key = transmute_unicode(key);
let value = transmute_unicode(value);
let mut srcs = key.chars();
let mut dests = value.chars();

let src = match (srcs.next(), srcs.next()) {
(Some(v), None) => v,
(Some(c1), Some(c2)) => {
log::error!("Expect a single Unicode char as unicode-subst source, got '{c1}' '{c2}'...");
continue;
}
(None, _) => {
log::error!("Expect a single Unicode char as unicode-subst source, got empty string");
continue;
}
};
let dest = match (dests.next(), dests.next()) {
(Some(v), None) => v,
(Some(c1), Some(c2)) => {
log::error!("Expect a single Unicode char as unicode-subst dest, got '{c1}' '{c2}'...");
continue;
}
(None, _) => {
log::error!("Expect a single Unicode char as unicode-subst dest, got empty string");
continue;
}
};
log::info!("Map unicode '{src}' to '{dest}'");
unicode_subst.insert(format!("{src}"), format!("{dest}"));
}
}
}

let dest_dir = ctx.destination.clone();
std::fs::create_dir_all(&dest_dir).map_err(|e| {
format!(
Expand All @@ -273,6 +315,7 @@ impl AsciiDocBackend {
allow_asciidoc,
heading_offset,
skip_chapters,
unicode_subst,
};
info!("configured {backend:?}");
Ok(backend)
Expand Down Expand Up @@ -610,14 +653,15 @@ impl AsciiDocBackend {
}
Event::Text(text) => {
trace!("[MD]{indent}Text({text})");
let text = self.replace_unicode(text);
indent.inc();

if self.allow_asciidoc {
self.process_potential_asciidoc(text);
self.process_potential_asciidoc(&text);
}

if escaping_needed {
out!(f, "{}", md2ad(text));
out!(f, "{}", md2ad(&text));
} else {
out!(f, "{}", text);
}
Expand All @@ -626,6 +670,7 @@ impl AsciiDocBackend {
}
Event::Code(text) => {
trace!("[MD]{indent}Code({text})");
let text = self.replace_unicode(text);
if text.contains("+") {
// - Double-backtick allows use in positions without surrounding space.
// - Passthrough macro because the text has plus signs in it.
Expand Down Expand Up @@ -782,8 +827,7 @@ impl AsciiDocBackend {
Ok(())
}

/// Process raw text from the input that might already contain some
/// AsciiDoc.
/// Process raw text from the input that might already contain some AsciiDoc.
fn process_potential_asciidoc(&self, text: &str) {
if let Some(caps) = ASCIIDOC_IMAGE_RE.captures(&text) {
if let Some(url) = caps.name("url") {
Expand All @@ -796,6 +840,18 @@ impl AsciiDocBackend {
}
}
}

/// Perform Unicode character substitution on the text.
fn replace_unicode(&self, input: &str) -> String {
let mut text = input.to_string();
for (src, dest) in &self.unicode_subst {
text = text.replace(src, dest);
}
if text != input {
log::info!("Replaced '{input}' with '{text}' due to Unicode substitution");
}
text
}
}

/// Determine if a URL refers to a local file, and if so return the local path.
Expand Down Expand Up @@ -853,3 +909,16 @@ fn md2ad(v: &str) -> String {
fn footnote_marker(v: &str) -> String {
format!("TODO-footnote-text-{v}")
}

/// Convert a "U+1234" unicode code point description into a Unicode char.
fn transmute_unicode(text: &str) -> String {
if let Some(caps) = UNICODE_CHAR_RE.captures(text) {
let code = caps.name("code").unwrap();
let code = u32::from_str_radix(code.into(), 16).unwrap();
let code = char::from_u32(code).unwrap();
let rest: &str = caps.name("rest").unwrap().into();
format!("{code}{rest}")
} else {
text.to_string()
}
}
19 changes: 19 additions & 0 deletions src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,22 @@ fn test_image_re() {
assert_eq!(got, want, "Failed for input '{input}'");
}
}

#[test]
fn test_unicode_char() {
let tests = [
("abc", "abc"),
("", ""),
("U+0020", " "),
("U+020", "U+020"),
("U+02G", "U+02G"),
("U+002G", "U+002G"),
("U+23AE", "⎮"),
("U+23ae", "⎮"),
("only first char U+0020", "only first char U+0020"),
];
for (input, want) in tests {
let got = transmute_unicode(input);
assert_eq!(got, want, "Failed for input '{input}'");
}
}

0 comments on commit 494a26f

Please sign in to comment.