diff --git a/Cargo.toml b/Cargo.toml index b448d99..eaa8463 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "docxtools" -version = "0.6.0" +version = "0.6.1-SNAPSHOT" edition = "2021" [dependencies] diff --git a/src/bin/main.rs b/src/bin/main.rs index a5671cf..816ebdc 100644 --- a/src/bin/main.rs +++ b/src/bin/main.rs @@ -28,12 +28,10 @@ enum Commands { /// List the links in the document to the console Links(LinksArgs), - /// Search the text in the document like 'grep' - #[command(name = "_grep")] + /// Search the text in the document Grep(GrepArgs), /// Search and replace in document text and tables - #[command(name = "_replace")] Replace(ReplaceArgs), /// Search and replace hyperlinks in the document diff --git a/src/test/test_tree2/word/document.xml b/src/test/test_tree2/word/document.xml index db02d7d..970feb3 100644 --- a/src/test/test_tree2/word/document.xml +++ b/src/test/test_tree2/word/document.xml @@ -56,7 +56,7 @@ - And some some more text + And some some some more text diff --git a/src/test/test_tree5/[Content_Types].xml b/src/test/test_tree5/[Content_Types].xml new file mode 100644 index 0000000..aa8ac0e --- /dev/null +++ b/src/test/test_tree5/[Content_Types].xml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/src/test/test_tree5/_rels/.rels b/src/test/test_tree5/_rels/.rels new file mode 100644 index 0000000..fdd8c4f --- /dev/null +++ b/src/test/test_tree5/_rels/.rels @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/src/test/test_tree5/docProps/app.xml b/src/test/test_tree5/docProps/app.xml new file mode 100644 index 0000000..39ef68e --- /dev/null +++ b/src/test/test_tree5/docProps/app.xml @@ -0,0 +1,2 @@ + +1125147Microsoft Office Word011falsefalse171falsefalse16.0000 \ No newline at end of file diff --git a/src/test/test_tree5/docProps/core.xml b/src/test/test_tree5/docProps/core.xml new file mode 100644 index 0000000..cb6185e --- /dev/null +++ b/src/test/test_tree5/docProps/core.xml @@ -0,0 +1,2 @@ + +David BosschaertDavid Bosschaert122024-01-10T14:44:00Z2024-01-11T15:34:00Z \ No newline at end of file diff --git a/src/test/test_tree5/word/_rels/document.xml.rels b/src/test/test_tree5/word/_rels/document.xml.rels new file mode 100644 index 0000000..0079d06 --- /dev/null +++ b/src/test/test_tree5/word/_rels/document.xml.rels @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/src/test/test_tree5/word/document.xml b/src/test/test_tree5/word/document.xml new file mode 100644 index 0000000..d1b9063 --- /dev/null +++ b/src/test/test_tree5/word/document.xml @@ -0,0 +1,86 @@ + + + + + + + + + + + + + + Notwithstanding the eventual re + + + + + + sult + + + + + + ing quotations punters were agreeable to a technocratic compromise. + + + + + + + Here’s ano + + + + + + ther line of te + + + + + + xt. + + + + + + + + + + + + + And th + + + + + + is text is in the + + + + + + next para + + + + + + graph. + + + + + + + + + + diff --git a/src/test/test_tree5/word/fontTable.xml b/src/test/test_tree5/word/fontTable.xml new file mode 100644 index 0000000..3955e9e --- /dev/null +++ b/src/test/test_tree5/word/fontTable.xml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/src/test/test_tree5/word/settings.xml b/src/test/test_tree5/word/settings.xml new file mode 100644 index 0000000..710ffc4 --- /dev/null +++ b/src/test/test_tree5/word/settings.xml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/src/test/test_tree5/word/styles.xml b/src/test/test_tree5/word/styles.xml new file mode 100644 index 0000000..bc08178 --- /dev/null +++ b/src/test/test_tree5/word/styles.xml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/src/test/test_tree5/word/theme/theme1.xml b/src/test/test_tree5/word/theme/theme1.xml new file mode 100644 index 0000000..a4458fa --- /dev/null +++ b/src/test/test_tree5/word/theme/theme1.xml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/src/test/test_tree5/word/webSettings.xml b/src/test/test_tree5/word/webSettings.xml new file mode 100644 index 0000000..71f223e --- /dev/null +++ b/src/test/test_tree5/word/webSettings.xml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/src/xml_util.rs b/src/xml_util.rs index ebafab4..05b5c5a 100644 --- a/src/xml_util.rs +++ b/src/xml_util.rs @@ -1,10 +1,10 @@ -use quick_xml::events::{Event, BytesStart}; +use quick_xml::events::{Event, BytesStart, BytesText}; use quick_xml::events::attributes::{Attr, Attribute}; use quick_xml::name::QName; use quick_xml::reader::Reader; use quick_xml::writer::Writer; use regex::Regex; -use std::collections::HashMap; +use std::collections::{BTreeMap, HashMap}; use std::fs::{File, self}; use std::io::{BufReader, BufWriter}; use std::path::Path; @@ -15,6 +15,11 @@ use walkdir::WalkDir; use crate::file_util::FileUtil; use crate::zip_util::ZipUtil; +#[cfg(windows)] +const LINE_ENDING: &'static str = "\r\n"; +#[cfg(not(windows))] +const LINE_ENDING: &'static str = "\n"; + #[derive(Clone, Debug, PartialEq, Eq)] enum Mode { AttrCondition { @@ -24,15 +29,21 @@ enum Mode { condval: String, }, Attribute, - Value + Cat, + Grep, + Replace } pub struct XMLUtil { } +/// A collection of functions for working with .docx XML files. The functions generally expect that the .docx file +/// is already unzipped and passed in as the root directory of the location where is was unzipped. impl XMLUtil { + /// Send the text content of the docx structure to stdout. `dir` is the directory containing + /// the unzipped docx file and `src_file` is the original name of the docx file. pub fn cat(dir: &str, src_file: &str) { - Self::snr_xml(Mode::Value, dir, src_file, None, None, None, None); + Self::snr_xml(Mode::Cat, dir, src_file, None, None, None, None); } pub fn cat_rel_attr(el_name: &str, attr_name: &str, cond_key: &str, cond_val: &str, @@ -47,12 +58,31 @@ impl XMLUtil { None, None, None); } - pub fn grep_xml(_dir: &str, _src_file: &str, _pattern: &str) { - panic!("The 'grep' functionality is currently disabled until issue #2 is fixed"); + /// Search for regex `pattern` in the text of the docx structure and send matches to stdout. + /// `dir` is the directory containing + /// the unzipped docx file and `src_file` is the original name of the docx file. + pub fn grep_xml(dir: &str, src_file: &str, pattern: &str) { + // TODO put the pattern in the 'Mode' enum. + Self::snr_xml(Mode::Grep, dir, src_file, None, Some(pattern), None, None); } - pub fn replace_xml(_dir: &str, _src_file: &str, _pattern: &str, _replace: &str, _output_file: &Option<&str>) { - panic!("The 'replace' functionality is currently disabled until issue #2 is fixed"); + /// Search for regex `pattern` in the text of the docx structure and replace all occurrences with `replacement`. + /// `dir` is the directory containing + /// the unzipped docx file and `src_file` is the original name of the docx file. + /// + /// `output_file` can be a .docx filename. If specified the result will be zipped and written to produce this + /// new .docx file. Otherwise the result is zipped and written to `src_file`. + pub fn replace_xml(dir: &str, src_file: &str, pattern: &str, replacement: &str, output_file: &Option<&str>) { + let out_file = match output_file { + Some(of) => of, + None => src_file + }; + + let (_, files) = Self::get_files_with_content_type(dir, + "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"); + let fref = files.iter().map(AsRef::as_ref).collect(); + + Self::snr_xml(Mode::Replace, dir, src_file, Some(fref), Some(pattern), Some(replacement), Some(out_file)) } pub fn replace_rel_attr(dir: &str, src_file: &str, pattern: &str, replace: &str, output_file: &Option<&str>) { @@ -88,7 +118,13 @@ impl XMLUtil { rels_files } - fn snr_xml(mode: Mode, dir: &str, src_file: &str, files: Option>, pattern: Option<&str>, replace: Option<&str>, output_file: Option<&str>) { + /// Iterate recursively over all files in `dir` and perform the operation specified in `mode` on each file. The original name + /// of the .docx file is provided in `src_file`. + /// + /// Optionally specify `files` as the list of files to match. If not specified, all files ending with `.xml` are matched. + /// `pattern` and `replacement` are used to search/replace operations. + /// `output_file` optionally specifies a different output file for replacement operations. + fn snr_xml(mode: Mode, dir: &str, src_file: &str, files: Option>, pattern: Option<&str>, replacement: Option<&str>, output_file: Option<&str>) { let mut base_dir = dir.to_owned(); if !dir.ends_with("/") { base_dir.push('/'); @@ -116,7 +152,7 @@ impl XMLUtil { } } - Self::snr_xml_file(&mode, entry.path(), ®ex, &replace, src_file); + Self::snr_xml_file(&mode, entry.path(), ®ex, &replacement, src_file); } if let Some(outfile) = output_file { @@ -125,40 +161,412 @@ impl XMLUtil { } fn snr_xml_file(mode: &Mode, path: &Path, regex: &Option, replace: &Option<&str>, src_file: &str) { - let reader = Reader::from_file(path).expect(&path.to_string_lossy()); - match mode { - Mode::Value => Self::snr_xml_node(reader, src_file), - Mode::Attribute => Self::snr_change_attribute(reader, regex, replace, src_file, path), - Mode::AttrCondition { .. } => Self::snr_xml_attribute(mode, reader, src_file), + Mode::Cat => Self::cat_text(path, src_file), + Mode::Attribute => Self::snr_change_attribute(path, regex, replace, src_file, path), + Mode::AttrCondition { .. } => Self::snr_xml_attribute(mode, path, src_file), + Mode::Grep => Self::grep_text(path, src_file, regex), + Mode::Replace => Self::replace_text(path, src_file, regex, replace.unwrap()) + } + } + + fn get_reader(path: &Path) -> Reader> { + Reader::from_file(path).expect(&path.to_string_lossy()) + } + + fn read_namespaces(e: &BytesStart, nslist: &mut Vec) { + for attr in e.attributes() { + if let Ok(a) = attr { + let k = str::from_utf8(a.key.as_ref()); + if let Ok(key) = k { + if key.starts_with("xmlns:") { + if a.value.as_ref() == b"http://schemas.openxmlformats.org/wordprocessingml/2006/main" { + let alt_name = (&key[6..]).to_string(); + nslist.push(alt_name); + } + } + } + } + } + } + + /// For each namespace in `nsl` produce a qname result that has the namespace and `tag` as local name. + fn nsl_to_fqnames(nsl: &Vec, tag: &str) -> Vec { + let mut fqnames = vec![]; + + for ns in nsl { + let mut fq = ns.clone(); + fq.push(':'); + fq.push_str(tag); + fqnames.push(fq); } + + fqnames + } + + /// Convert a list of qnames specified as strings in to a list of QNames. + fn qnames(fqnl: &Vec) -> Vec { + let mut qnames = vec![]; + + for fqn in fqnl { + qnames.push(QName(fqn.as_bytes())); + } + + qnames } - fn snr_xml_node(mut reader: Reader>, src_file: &str) { + /// Check if any of the namespaces specified as `nslist` with `tag` as local name contains the QName + /// specified as `qn`. + fn match_tag(qn: &QName, nslist: &Vec, tag: &str) -> bool { + let para_fqnl = Self::nsl_to_fqnames(nslist, tag); + let para_qnames = Self::qnames(¶_fqnl); + let contains = para_qnames.contains(qn); + contains + } + + /// Read the contents of `xml_file` which would typically be a `word/document.xml` file and collect + /// all paragraphs of text in the result as a `Vec`. + /// + /// In the input XML file a single paragraph and even a single word might be spread over different + /// tags. The String list returned merges these together so that the result looks like what + /// you would see in the word processor. However, in order to replace text, we need to know which text + /// originated in which tag. For this this method numbers the tags in the document and in its + /// second return value it returns a BTreeMap where the key is the number, or id, of each text element + /// and the value is a tuple where the first value is the paragraph that is relates to and the second + /// value is the character position in that paragraph that the tag with this id starts. + /// + /// `src_file` is the name of the original .docx file. + /// If the `replacements` HashMap contains data, then these will be applied and the result is used to + /// overwrite the `xml_file` input file. + /// The keys of the `replacements` map is the id of the tags that need to be replaced and the first + /// value of the value tuple of `replacements` is the new value for this tag. The second value of the + /// tuple is not used in this function. + fn get_replace_text(xml_file: &Path, src_file: &str, replacements: HashMap)>) + -> (Vec, BTreeMap) { + let mut reader = Self::get_reader(xml_file); + + // TODO Move create temp file writer to share function + let mut temp_res = xml_file.parent().unwrap().to_owned(); + temp_res.push(format!("{}.xml", Uuid::new_v4())); + let temp_file = &temp_res.to_string_lossy(); + let tf = File::create(&temp_res).expect(temp_file); + let mut writer = Writer::new(BufWriter::new(tf)); + + let mut paras = Vec::new(); + let mut cur_line = String::new(); + let mut coords = BTreeMap::new(); + let mut buf = Vec::new(); + let mut nslist = vec!["http://schemas.openxmlformats.org/wordprocessingml/2006/main".to_string()]; + + let mut first_element = true; + let mut inside_paragraph = false; + let mut inside_text = false; + let mut text_els: usize = 0; loop { - match reader.read_event_into(&mut buf) { + let ev = reader.read_event_into(&mut buf); + // println!("Read event: {:?}", ev); + match ev { Err(e) => panic!("Error reading {} at position {}: {:?}", src_file, reader.buffer_position(), e), Ok(Event::Eof) => break, + Ok(Event::Empty(e)) => { + if Self::match_tag(&e.name(), &nslist, "br") { + cur_line.push_str(LINE_ENDING); + } + writer.write_event(Event::Empty(e)).expect(temp_file); + }, + Ok(Event::Start(e)) => { + if first_element { + first_element = false; + Self::read_namespaces(&e, &mut nslist); + } + + if Self::match_tag(&e.name(), &nslist, "p") { // TODO Maybe store para_qnames eagerly + inside_paragraph = true; + } else if inside_paragraph && Self::match_tag(&e.name(), &nslist, "t") { + inside_text = true; + } else if inside_paragraph && Self::match_tag(&e.name(), &nslist, "br") { + cur_line.push_str(LINE_ENDING); + } + writer.write_event(Event::Start(e)).expect(temp_file); + }, + Ok(Event::End(e)) => { + if Self::match_tag(&e.name(), &nslist, "p") { + inside_paragraph = false; + if cur_line.len() > 0 { + paras.push(cur_line.clone()); + } + cur_line.clear(); + } else if inside_paragraph && Self::match_tag(&e.name(), &nslist, "t") { + inside_text = false; + } + writer.write_event(Event::End(e)).expect(&temp_file); + }, Ok(Event::Text(t)) => { - let val = t.unescape().expect(src_file); - let val_trimmed = val.trim(); - if val_trimmed.len() > 0 { - println!("{}: {}", src_file, val_trimmed); + let mut ct = t; + + if inside_text { + let val = ct.unescape().expect(src_file); + if val.len() > 0 { + coords.insert(text_els, (paras.len(), cur_line.len())); + + let new_text = replacements.get(&text_els); + if let Some((nt, _)) = new_text { + ct = BytesText::new(nt); + println!("{}: {}\n-> {}", src_file, val, nt); + } + + text_els += 1; + + cur_line.push_str(val.as_ref()); + } + } + writer.write_event(Event::Text(ct)).expect(&temp_file); + }, + Ok(e) => writer.write_event(e).expect(temp_file) + } + } + + drop(reader); // Close the file being read + + // This writes out the file + writer.into_inner().into_inner().unwrap(); + + if !replacements.is_empty() { + // Original file should be replaced + fs::remove_file(xml_file).unwrap(); + fs::rename(temp_res, xml_file).unwrap(); + } else { + fs::remove_file(temp_res).unwrap(); + } + + (paras, coords) + } + + fn cat_text(path: &Path, src_file: &str) { + let (paras, _) = Self::get_replace_text(path, src_file, HashMap::new()); + + for para in paras { + println!("{}: {}", src_file, para); + } + } + + fn grep_text(path: &Path, src_file: &str, regex: &Option) { + if let Some(rex) = regex { + let (paras, _) = Self::get_replace_text(path, src_file, HashMap::new()); + + for para in paras { + if rex.is_match(¶) { + println!("{}: {}", src_file, para); + } + } + } else { + panic!("Bad regex for grep: {:?}", regex); + } + } + + fn get_line_coords(cur_line: usize, coords: &BTreeMap) -> BTreeMap { + let mut res = BTreeMap::new(); + + for (id, (line, pos)) in coords { + if cur_line == *line { + res.insert(*pos, (usize::MAX, *id)); + + if *pos > 0 { + // The position is not at the start of the line, update the previous position with the endpos + let prev_id = id - 1; + let prev = coords.get(&prev_id); + + if let Some((pl, ppos)) = prev { + if *pl != cur_line { + // Shouldn't happen + break; + } + res.insert(*ppos, (*pos, prev_id)); + } + } + } + } + + res + } + + /// Apply the replacements needed for a single tag. The id (internal number) of the tag is + /// provided in `tag_id`. The original contents of the tags is provided in `tag`. The replacement text + /// is in `replace` and the location start and end of the text in the original tag to be replaced + /// is in `match_start` and `match_end`. + /// + /// `replacements` contains the currently known set of replacements, which may already contain other + /// replacements made for the current tag. The key of this HashMap is the tag id and the value contains + /// the current value of the tag, given any previous replacements and an offset mapping that contains + /// for each character position in the original tag text a positive or negative offset in case the + /// match locations must be adjusted given any previously applied replacements, as they may have + /// changed the length of text in the tag. + fn replace_within_tag(replacements: &mut HashMap)>, tag_id: usize, tag: &str, + match_start: usize, match_end: usize, replace: &str) { + let prev_repl = replacements.get(&tag_id); + + let mut replaced; + let mut corr_idxs: Vec; + if let Some((r, c)) = prev_repl { + replaced = r.clone(); + corr_idxs = c.clone(); + } else { + replaced = tag.to_string(); + corr_idxs = vec![0; replaced.len()]; + } + + let mut correction = 0; + for i in 0..match_start { + correction += corr_idxs[i as usize]; + } + + let repl_start = (match_start as i32 + correction) as usize; + let repl_end = (match_end as i32 + correction) as usize; + replaced.replace_range(repl_start..repl_end, replace); + + let delta = replace.len() as i32 - match_end as i32 + match_start as i32; + if delta < 0 { + let from_pos = (match_end as i32) + correction; + for i in from_pos+delta .. from_pos { + corr_idxs[i as usize] -= 1; + } + } else if delta > 0 { + let corr_pos = (match_end as i32) - 1; + corr_idxs[corr_pos as usize] += delta; + } + + replacements.insert(tag_id, (replaced, corr_idxs)); + } + + /// In the file pointed to by `path` replace all matching `regex`es with the `replace` value. + /// The input file will be overwritten with the result. `src_file` is the name of the original + /// .docx file + /// + /// This method works by reading the file contents first via `get_replace_text` and applying the + /// regex replacements to its result (a list of strings, representing each paragraph). + /// + /// Replacements are mapped to tags which are numbered internally. + /// Once all the replacements have been found, the `get_replace_text` method is called again + /// but now with the replacements to-be-applied. + fn replace_text(path: &Path, src_file: &str, regex: &Option, replace: &str) { + let mut replacements: HashMap)> = HashMap::new(); + + let rex = regex.as_ref().unwrap(); + let (paras, coords) = Self::get_replace_text(path, src_file, HashMap::new()); + + let mut cur_line: usize = 0; + for para in paras { + let line_coords = Self::get_line_coords(cur_line, &coords); + for m in rex.find_iter(¶) { + let mstart = m.start(); + let mend = m.end(); + + let mut start_id = 0; + let mut end_id = 0; + let mut start_idx = 0; + + let mut tags = BTreeMap::new(); + for (idx, (eidx, id)) in &line_coords { + let neidx; + if *eidx > para.len() { + neidx = para.len(); + } else { + neidx = *eidx; + } + + let t = ¶[*idx..neidx]; + tags.insert(*id, t); + + if *idx <= mstart { + start_id = *id; + end_id = *id; + start_idx = *idx; + } + if *idx < mend { + end_id = *id; + } + } + + // The match region is between start_id and end_id now + + if start_id == end_id { + // simplest case start and end are the same: + if let Some(tag) = tags.get(&start_id) { + Self::replace_within_tag(&mut replacements, start_id, tag, mstart - start_idx, mend - start_idx, replace); + } + } else { + /* + 1. get the length of the replacement + 2. get all tags + 3. Walk over tags, first one from match position, later ones from start + 4. divide up the caracters: + all up to but not including last: + replace the characters + last one: + replace the rest + */ + + let mut remaining_chars = mend as i32 - mstart as i32; + let mut cur_replacement = replace.to_string(); + for i in start_id..end_id + 1 { + if remaining_chars < 0 { remaining_chars = 0; } + + if let Some(tag) = tags.get(&i) { + if i == start_id { + let chars = tag.len() - mstart; + + let repl; + if cur_replacement.len() >= chars { + repl = &cur_replacement[0..chars]; + } else { + repl = &cur_replacement; + } + Self::replace_within_tag(&mut replacements, i, tag, mstart, mstart + chars, repl); + + remaining_chars -= chars as i32; + if cur_replacement.len() >= chars { + cur_replacement = cur_replacement[chars..].to_string(); + } else { + cur_replacement.clear(); + } + } else if i == end_id { + Self::replace_within_tag(&mut replacements, i, tag, 0, remaining_chars as usize, &cur_replacement); + } else { + let repl; + if cur_replacement.len() >= tag.len() { + repl = &cur_replacement[0..tag.len()]; + } else { + repl = &cur_replacement; + } + + Self::replace_within_tag(&mut replacements, i, tag, 0, tag.len(), repl); + + remaining_chars -= tag.len() as i32; + if cur_replacement.len() >= tag.len() { + cur_replacement = cur_replacement[tag.len()..].to_string(); + } else { + cur_replacement.clear(); + } + } + } } } - _ => (), } + cur_line += 1; + } - // buf.clear(); why is this suggested in the docs? + if !replacements.is_empty() { + Self::get_replace_text(path, src_file, replacements); } } - fn snr_change_attribute(mut reader: Reader>, regex: &Option, replace: &Option<&str>, src_file: &str, output_path: &Path) { + fn snr_change_attribute(path: &Path, regex: &Option, replace: &Option<&str>, src_file: &str, output_path: &Path) { if regex.is_none() || replace.is_none() { return; } + let mut reader = Self::get_reader(path); let rex = regex.as_ref().unwrap(); let repl = replace.unwrap(); @@ -203,7 +611,8 @@ impl XMLUtil { } fn update_attributes<'a>(bs: BytesStart<'a>, regex: &Regex, replace: &str, src_file: &str) -> (BytesStart<'a>, bool) { - let mut es = BytesStart::clone(&bs); + let mut es = bs.clone(); + es.clear_attributes(); let mut changed = false; @@ -236,7 +645,8 @@ impl XMLUtil { } } - fn snr_xml_attribute(mode: &Mode, mut reader: Reader>, src_file: &str) { + fn snr_xml_attribute(mode: &Mode, path: &Path, src_file: &str) { + let mut reader = Self::get_reader(path); let mut buf = Vec::new(); loop { @@ -391,18 +801,29 @@ mod tests { assert!(out.contains("my-file.docx: Here’s a hyperlink:")); } - /* + #[test] + #[serial] + fn test_cat2() { + let out = capture_stdout!(XMLUtil::cat("./src/test/test_tree5", "wordbreak.docx")); + let expected = + "wordbreak.docx: Notwithstanding the eventual resulting quotations punters were agreeable to a technocratic compromise.".to_string() + + super::LINE_ENDING + "Here’s another line of text."; + let idx1 = out.find(&expected).unwrap(); + let idx2 = out.find("wordbreak.docx: And this text is in the next paragraph.").unwrap(); + + assert!(idx1 < idx2); + } + #[test] #[serial] // This test has to run serially to avoid multiple tests to capture stdout fn test_grep() { let out = capture_stdout!(XMLUtil::grep_xml("./src/test/test_tree2", "doc123.docx", "[oe]re")); - assert!(out.contains("doc123.docx: And some some more text")); + assert!(out.contains("doc123.docx: And some some some more text")); assert!(out.contains("doc123.docx: Something here")); assert!(out.contains("doc123.docx: Here’s a hyperlink:")); assert!(out.contains("doc123.docx: And here’s just some text:")); assert!(!out.contains("Target")); } - */ #[test] #[serial] @@ -418,16 +839,15 @@ mod tests { assert!(!out.contains("Target=webSettings.xml")) } - /* #[test] - fn test_replace() -> io::Result<()> { + fn test_replace_shorten() -> io::Result<()> { let orgdir = "./src/test/test_tree2"; let testdir = testdir!(); copy_dir_all(orgdir, &testdir)?; let before = fs::read_to_string("./src/test/test_tree2/word/document.xml")?; - assert!(before.contains("And some some more text"), "Precondition"); + assert!(before.contains("And some some some more text"), "Precondition"); assert!(before.contains("and then some"), "Precondition"); assert!(before.contains("Something here"), "Precondition"); assert!(before.contains(">some<"), "Precondition"); @@ -440,7 +860,7 @@ mod tests { // Check that the replacement worked as expected let after = fs::read_to_string(testdir.join("word/document.xml"))?; - assert!(after.contains("And zzz zzz more text")); + assert!(after.contains("And zzz zzz zzz more text")); assert!(after.contains("and then zzz")); assert!(after.contains("zzzthing here")); assert!(after.contains(">zzz")); @@ -449,7 +869,147 @@ mod tests { Ok(()) } - */ + + #[test] + fn test_replace_make_longer() -> io::Result<()> { + let orgdir = "./src/test/test_tree2"; + let testdir = testdir!(); + + copy_dir_all(orgdir, &testdir)?; + + let before = fs::read_to_string("./src/test/test_tree2/word/document.xml")?; + assert!(before.contains("And some some some more text"), "Precondition"); + assert!(before.contains("and then some"), "Precondition"); + assert!(before.contains("Something here"), "Precondition"); + assert!(before.contains(">some<"), "Precondition"); + assert!(before.contains(">Some <"), "Precondition"); + assert!(!before.contains("zzz"), "Precondition"); + + XMLUtil::replace_xml(&testdir.to_string_lossy(), "my-source.docx", + "[Ss]ome", "ABCDEF", + &Some(&testdir.join("output.docx").to_string_lossy())); + + // Check that the replacement worked as expected + let after = fs::read_to_string(testdir.join("word/document.xml"))?; + assert!(after.contains("And ABCDEF ABCDEF ABCDEF more text")); + assert!(after.contains("and then ABCDEF")); + assert!(after.contains("ABCDEFthing here")); + assert!(after.contains(">ABCDEF")); + assert!(!after.contains("some")); + assert!(!after.contains("Some")); + + Ok(()) + } + + #[test] + fn test_replace_across_tags() -> io::Result<()> { + let orgdir = "./src/test/test_tree5"; + let testdir = testdir!(); + + copy_dir_all(orgdir, &testdir)?; + + let before = fs::read_to_string("./src/test/test_tree5/word/document.xml")?; + assert!(before.contains("re"), "Precondition"); + assert!(before.contains("sult"), "Precondition"); + assert!(before.contains("ing"), "Precondition"); + assert!(!before.contains("resulting"), "Precondition"); + + XMLUtil::replace_xml(&testdir.to_string_lossy(), "acrstags.docx", + "resulting", "1234567890", + &Some(&testdir.join("output.docx").to_string_lossy())); + + let after = fs::read_to_string(testdir.join("word/document.xml"))?; + assert!(after.contains("eventual 12<")); + assert!(after.contains(">3456<")); + assert!(after.contains(">7890 quotations")); + assert!(!after.contains("1234567890")); + + Ok(()) + } + + #[test] + fn test_replace_across_tags0() -> io::Result<()> { + let orgdir = "./src/test/test_tree5"; + let testdir = testdir!(); + + copy_dir_all(orgdir, &testdir)?; + + let before = fs::read_to_string("./src/test/test_tree5/word/document.xml")?; + assert!(before.contains("re"), "Precondition"); + assert!(before.contains("sult"), "Precondition"); + assert!(before.contains("ing"), "Precondition"); + assert!(!before.contains("resulting"), "Precondition"); + + XMLUtil::replace_xml(&testdir.to_string_lossy(), "acrstags.docx", + "resulting", "1", + &Some(&testdir.join("output.docx").to_string_lossy())); + + let after = fs::read_to_string(testdir.join("word/document.xml"))?; + assert!(after.contains("eventual 1<")); + assert!(after.contains("><")); + assert!(after.contains("> quotations")); + + Ok(()) + } + + #[test] + fn test_replace_across_tags1() -> io::Result<()> { + let orgdir = "./src/test/test_tree5"; + let testdir = testdir!(); + + copy_dir_all(orgdir, &testdir)?; + + let before = fs::read_to_string("./src/test/test_tree5/word/document.xml")?; + assert!(before.contains("re"), "Precondition"); + assert!(before.contains("sult"), "Precondition"); + assert!(before.contains("ing"), "Precondition"); + assert!(!before.contains("resulting"), "Precondition"); + + XMLUtil::replace_xml(&testdir.to_string_lossy(), "acrstags.docx", + "resulting", "123", + &Some(&testdir.join("output.docx").to_string_lossy())); + + let after = fs::read_to_string(testdir.join("word/document.xml"))?; + assert!(after.contains("eventual 12<")); + assert!(after.contains(">3<")); + assert!(after.contains("> quotations")); + + Ok(()) + } + + #[test] + fn test_replace_across_tags2() -> io::Result<()> { + let orgdir = "./src/test/test_tree2"; + let testdir = testdir!(); + + copy_dir_all(orgdir, &testdir)?; + + XMLUtil::replace_xml(&testdir.to_string_lossy(), "xyz.docx", + "(text and|then some)", "aaa", &None); + + let after = fs::read_to_string(testdir.join("word/document.xml"))?; + assert!(after.contains("some more aaa aaa<")); + + Ok(()) + } + + #[test] + fn test_replace_across_tags3() -> io::Result<()> { + let orgdir = "./src/test/test_tree2"; + let testdir = testdir!(); + + copy_dir_all(orgdir, &testdir)?; + + XMLUtil::replace_xml(&testdir.to_string_lossy(), "xyz.docx", + "(text and|then some)", "bbbbb", &None); + + let after = fs::read_to_string(testdir.join("word/document.xml"))?; + assert!(after.contains("some more bbbbb bbbbb<")); + + Ok(()) + } #[test] fn test_replace_hyperlink() -> io::Result<()> { @@ -477,7 +1037,6 @@ mod tests { Ok(()) } - /* #[test] fn test_replace_both() -> io::Result<()> { let orgdir = "./src/test/test_tree3"; @@ -520,7 +1079,6 @@ mod tests { Ok(()) } - */ fn copy_dir_all(src: impl AsRef, dst: impl AsRef) -> io::Result<()> { fs::create_dir_all(&dst)?;