diff --git a/Cargo.toml b/Cargo.toml
index b448d99..eaa8463 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "docxtools"
-version = "0.6.0"
+version = "0.6.1-SNAPSHOT"
edition = "2021"
[dependencies]
diff --git a/src/bin/main.rs b/src/bin/main.rs
index a5671cf..816ebdc 100644
--- a/src/bin/main.rs
+++ b/src/bin/main.rs
@@ -28,12 +28,10 @@ enum Commands {
/// List the links in the document to the console
Links(LinksArgs),
- /// Search the text in the document like 'grep'
- #[command(name = "_grep")]
+ /// Search the text in the document
Grep(GrepArgs),
/// Search and replace in document text and tables
- #[command(name = "_replace")]
Replace(ReplaceArgs),
/// Search and replace hyperlinks in the document
diff --git a/src/test/test_tree2/word/document.xml b/src/test/test_tree2/word/document.xml
index db02d7d..970feb3 100644
--- a/src/test/test_tree2/word/document.xml
+++ b/src/test/test_tree2/word/document.xml
@@ -56,7 +56,7 @@
- And some some more text
+ And some some some more text
diff --git a/src/test/test_tree5/[Content_Types].xml b/src/test/test_tree5/[Content_Types].xml
new file mode 100644
index 0000000..aa8ac0e
--- /dev/null
+++ b/src/test/test_tree5/[Content_Types].xml
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/src/test/test_tree5/_rels/.rels b/src/test/test_tree5/_rels/.rels
new file mode 100644
index 0000000..fdd8c4f
--- /dev/null
+++ b/src/test/test_tree5/_rels/.rels
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/src/test/test_tree5/docProps/app.xml b/src/test/test_tree5/docProps/app.xml
new file mode 100644
index 0000000..39ef68e
--- /dev/null
+++ b/src/test/test_tree5/docProps/app.xml
@@ -0,0 +1,2 @@
+
+Normal.dotm1125147Microsoft Office Word011falsefalse171falsefalse16.0000
\ No newline at end of file
diff --git a/src/test/test_tree5/docProps/core.xml b/src/test/test_tree5/docProps/core.xml
new file mode 100644
index 0000000..cb6185e
--- /dev/null
+++ b/src/test/test_tree5/docProps/core.xml
@@ -0,0 +1,2 @@
+
+David BosschaertDavid Bosschaert122024-01-10T14:44:00Z2024-01-11T15:34:00Z
\ No newline at end of file
diff --git a/src/test/test_tree5/word/_rels/document.xml.rels b/src/test/test_tree5/word/_rels/document.xml.rels
new file mode 100644
index 0000000..0079d06
--- /dev/null
+++ b/src/test/test_tree5/word/_rels/document.xml.rels
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/src/test/test_tree5/word/document.xml b/src/test/test_tree5/word/document.xml
new file mode 100644
index 0000000..d1b9063
--- /dev/null
+++ b/src/test/test_tree5/word/document.xml
@@ -0,0 +1,86 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Notwithstanding the eventual re
+
+
+
+
+
+ sult
+
+
+
+
+
+ ing quotations punters were agreeable to a technocratic compromise.
+
+
+
+
+
+
+ Here’s ano
+
+
+
+
+
+ ther line of te
+
+
+
+
+
+ xt.
+
+
+
+
+
+
+
+
+
+
+
+
+ And th
+
+
+
+
+
+ is text is in the
+
+
+
+
+
+ next para
+
+
+
+
+
+ graph.
+
+
+
+
+
+
+
+
+
+
diff --git a/src/test/test_tree5/word/fontTable.xml b/src/test/test_tree5/word/fontTable.xml
new file mode 100644
index 0000000..3955e9e
--- /dev/null
+++ b/src/test/test_tree5/word/fontTable.xml
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/src/test/test_tree5/word/settings.xml b/src/test/test_tree5/word/settings.xml
new file mode 100644
index 0000000..710ffc4
--- /dev/null
+++ b/src/test/test_tree5/word/settings.xml
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/src/test/test_tree5/word/styles.xml b/src/test/test_tree5/word/styles.xml
new file mode 100644
index 0000000..bc08178
--- /dev/null
+++ b/src/test/test_tree5/word/styles.xml
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/src/test/test_tree5/word/theme/theme1.xml b/src/test/test_tree5/word/theme/theme1.xml
new file mode 100644
index 0000000..a4458fa
--- /dev/null
+++ b/src/test/test_tree5/word/theme/theme1.xml
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/src/test/test_tree5/word/webSettings.xml b/src/test/test_tree5/word/webSettings.xml
new file mode 100644
index 0000000..71f223e
--- /dev/null
+++ b/src/test/test_tree5/word/webSettings.xml
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/src/xml_util.rs b/src/xml_util.rs
index ebafab4..05b5c5a 100644
--- a/src/xml_util.rs
+++ b/src/xml_util.rs
@@ -1,10 +1,10 @@
-use quick_xml::events::{Event, BytesStart};
+use quick_xml::events::{Event, BytesStart, BytesText};
use quick_xml::events::attributes::{Attr, Attribute};
use quick_xml::name::QName;
use quick_xml::reader::Reader;
use quick_xml::writer::Writer;
use regex::Regex;
-use std::collections::HashMap;
+use std::collections::{BTreeMap, HashMap};
use std::fs::{File, self};
use std::io::{BufReader, BufWriter};
use std::path::Path;
@@ -15,6 +15,11 @@ use walkdir::WalkDir;
use crate::file_util::FileUtil;
use crate::zip_util::ZipUtil;
+/// Line terminator appended to the paragraph text when a `br` element is encountered (Windows).
+#[cfg(windows)]
+const LINE_ENDING: &str = "\r\n";
+/// Line terminator appended to the paragraph text when a `br` element is encountered (non-Windows).
+#[cfg(not(windows))]
+const LINE_ENDING: &str = "\n";
+
#[derive(Clone, Debug, PartialEq, Eq)]
enum Mode {
AttrCondition {
@@ -24,15 +29,21 @@ enum Mode {
condval: String,
},
Attribute,
- Value
+ Cat,
+ Grep,
+ Replace
}
pub struct XMLUtil {
}
+/// A collection of functions for working with .docx XML files. The functions generally expect that the .docx file
+/// is already unzipped and passed in as the root directory of the location where it was unzipped.
impl XMLUtil {
+ /// Send the text content of the docx structure to stdout. `dir` is the directory containing
+ /// the unzipped docx file and `src_file` is the original name of the docx file.
pub fn cat(dir: &str, src_file: &str) {
- Self::snr_xml(Mode::Value, dir, src_file, None, None, None, None);
+ Self::snr_xml(Mode::Cat, dir, src_file, None, None, None, None);
}
pub fn cat_rel_attr(el_name: &str, attr_name: &str, cond_key: &str, cond_val: &str,
@@ -47,12 +58,31 @@ impl XMLUtil {
None, None, None);
}
- pub fn grep_xml(_dir: &str, _src_file: &str, _pattern: &str) {
- panic!("The 'grep' functionality is currently disabled until issue #2 is fixed");
+ /// Search for regex `pattern` in the text of the docx structure and send matches to stdout.
+ /// `dir` is the directory containing
+ /// the unzipped docx file and `src_file` is the original name of the docx file.
+ pub fn grep_xml(dir: &str, src_file: &str, pattern: &str) {
+ // TODO put the pattern in the 'Mode' enum.
+ Self::snr_xml(Mode::Grep, dir, src_file, None, Some(pattern), None, None);
}
- pub fn replace_xml(_dir: &str, _src_file: &str, _pattern: &str, _replace: &str, _output_file: &Option<&str>) {
- panic!("The 'replace' functionality is currently disabled until issue #2 is fixed");
+ /// Search for regex `pattern` in the text of the docx structure and replace all occurrences with `replacement`.
+ /// `dir` is the directory containing
+ /// the unzipped docx file and `src_file` is the original name of the docx file.
+ ///
+ /// `output_file` can be a .docx filename. If specified the result will be zipped and written to produce this
+ /// new .docx file. Otherwise the result is zipped and written to `src_file`.
+ pub fn replace_xml(dir: &str, src_file: &str, pattern: &str, replacement: &str, output_file: &Option<&str>) {
+ let out_file = match output_file {
+ Some(of) => of,
+ None => src_file
+ };
+
+ let (_, files) = Self::get_files_with_content_type(dir,
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml");
+ let fref = files.iter().map(AsRef::as_ref).collect();
+
+ Self::snr_xml(Mode::Replace, dir, src_file, Some(fref), Some(pattern), Some(replacement), Some(out_file))
}
pub fn replace_rel_attr(dir: &str, src_file: &str, pattern: &str, replace: &str, output_file: &Option<&str>) {
@@ -88,7 +118,13 @@ impl XMLUtil {
rels_files
}
- fn snr_xml(mode: Mode, dir: &str, src_file: &str, files: Option>, pattern: Option<&str>, replace: Option<&str>, output_file: Option<&str>) {
+ /// Iterate recursively over all files in `dir` and perform the operation specified in `mode` on each file. The original name
+ /// of the .docx file is provided in `src_file`.
+ ///
+ /// Optionally specify `files` as the list of files to match. If not specified, all files ending with `.xml` are matched.
+ /// `pattern` and `replacement` are used for search/replace operations.
+ /// `output_file` optionally specifies a different output file for replacement operations.
+ fn snr_xml(mode: Mode, dir: &str, src_file: &str, files: Option>, pattern: Option<&str>, replacement: Option<&str>, output_file: Option<&str>) {
let mut base_dir = dir.to_owned();
if !dir.ends_with("/") {
base_dir.push('/');
@@ -116,7 +152,7 @@ impl XMLUtil {
}
}
- Self::snr_xml_file(&mode, entry.path(), ®ex, &replace, src_file);
+ Self::snr_xml_file(&mode, entry.path(), ®ex, &replacement, src_file);
}
if let Some(outfile) = output_file {
@@ -125,40 +161,412 @@ impl XMLUtil {
}
fn snr_xml_file(mode: &Mode, path: &Path, regex: &Option, replace: &Option<&str>, src_file: &str) {
- let reader = Reader::from_file(path).expect(&path.to_string_lossy());
-
match mode {
- Mode::Value => Self::snr_xml_node(reader, src_file),
- Mode::Attribute => Self::snr_change_attribute(reader, regex, replace, src_file, path),
- Mode::AttrCondition { .. } => Self::snr_xml_attribute(mode, reader, src_file),
+ Mode::Cat => Self::cat_text(path, src_file),
+ Mode::Attribute => Self::snr_change_attribute(path, regex, replace, src_file, path),
+ Mode::AttrCondition { .. } => Self::snr_xml_attribute(mode, path, src_file),
+ Mode::Grep => Self::grep_text(path, src_file, regex),
+ Mode::Replace => Self::replace_text(path, src_file, regex, replace.unwrap())
+ }
+ }
+
+ /// Open the XML file at `path` with a buffered quick-xml reader.
+ ///
+ /// # Panics
+ /// Panics with the path and the underlying error if the file cannot be opened. The message is
+ /// the same one `expect` would produce, but the path is only stringified on failure.
+ fn get_reader(path: &Path) -> Reader<BufReader<File>> {
+     Reader::from_file(path)
+         .unwrap_or_else(|e| panic!("{}: {:?}", path.to_string_lossy(), e))
+ }
+
+ /// Collect the namespace prefixes that element `e` binds to the WordprocessingML main namespace.
+ ///
+ /// Every `xmlns:<prefix>` attribute whose value is the WordprocessingML main namespace URI has its
+ /// `<prefix>` appended to `nslist`. Attributes that fail to parse, or whose key is not valid UTF-8,
+ /// are skipped.
+ fn read_namespaces(e: &BytesStart, nslist: &mut Vec<String>) {
+     const WORDML_NS: &[u8] = b"http://schemas.openxmlformats.org/wordprocessingml/2006/main";
+
+     for attr in e.attributes().flatten() {
+         if attr.value.as_ref() != WORDML_NS {
+             continue;
+         }
+
+         // Only prefixed declarations (`xmlns:w="..."`) are of interest; keep the part after the colon.
+         let prefix = str::from_utf8(attr.key.as_ref())
+             .ok()
+             .and_then(|key| key.strip_prefix("xmlns:"));
+         if let Some(prefix) = prefix {
+             nslist.push(prefix.to_string());
+         }
+     }
+ }
+
+ /// For each namespace prefix in `nsl`, build the qualified name string `<prefix>:<tag>`; the results keep the order of `nsl`.
+ fn nsl_to_fqnames(nsl: &Vec, tag: &str) -> Vec {
+ let mut fqnames = vec![];
+
+ for ns in nsl {
+ let mut fq = ns.clone(); // start from a copy of the prefix
+ fq.push(':'); // separator between prefix and local name
+ fq.push_str(tag);
+ fqnames.push(fq);
}
+
+ fqnames
+ }
+
+ /// Convert a list of qualified names given as strings into a list of `QName`s; each `QName` wraps the bytes of the corresponding string.
+ fn qnames(fqnl: &Vec) -> Vec {
+ let mut qnames = vec![];
+
+ for fqn in fqnl {
+ qnames.push(QName(fqn.as_bytes())); // no copy: the QName refers to the string's bytes
+ }
+
+ qnames
}
- fn snr_xml_node(mut reader: Reader>, src_file: &str) {
+ /// Check whether `qn` equals `<prefix>:<tag>` for any of the namespace prefixes in `nslist`.
+ ///
+ /// Returns `true` on the first exact match and `false` when no prefix/tag combination
+ /// equals `qn`.
+ fn match_tag(qn: &QName, nslist: &Vec<String>, tag: &str) -> bool {
+     let fqnames = Self::nsl_to_fqnames(nslist, tag);
+     Self::qnames(&fqnames).contains(qn)
+ }
+
+ /// Read the contents of `xml_file` which would typically be a `word/document.xml` file and collect
+ /// all paragraphs of text in the result as a `Vec`.
+ ///
+ /// In the input XML file a single paragraph and even a single word might be spread over different
+ /// tags. The String list returned merges these together so that the result looks like what
+ /// you would see in the word processor. However, in order to replace text, we need to know which text
+ /// originated in which tag. To that end this method numbers the text elements in the document, and its
+ /// second return value is a BTreeMap where the key is the number, or id, of each text element
+ /// and the value is a tuple where the first value is the index of the paragraph that it relates to and the second
+ /// value is the byte offset in that paragraph at which the text element with this id starts.
+ ///
+ /// `src_file` is the name of the original .docx file.
+ /// If the `replacements` HashMap contains data, then these will be applied and the result is used to
+ /// overwrite the `xml_file` input file.
+ /// The keys of the `replacements` map are the ids of the text elements that need to be replaced and the first
+ /// value of each value tuple of `replacements` is the new text for that element. The second value of the
+ /// tuple is not used in this function.
+ fn get_replace_text(xml_file: &Path, src_file: &str, replacements: HashMap)>)
+ -> (Vec, BTreeMap) {
+ let mut reader = Self::get_reader(xml_file);
+
+ // TODO Move create temp file writer to share function
+ let mut temp_res = xml_file.parent().unwrap().to_owned();
+ temp_res.push(format!("{}.xml", Uuid::new_v4()));
+ let temp_file = &temp_res.to_string_lossy();
+ let tf = File::create(&temp_res).expect(temp_file);
+ let mut writer = Writer::new(BufWriter::new(tf));
+
+ let mut paras = Vec::new();
+ let mut cur_line = String::new();
+ let mut coords = BTreeMap::new();
+
let mut buf = Vec::new();
+ let mut nslist = vec!["http://schemas.openxmlformats.org/wordprocessingml/2006/main".to_string()];
+
+ let mut first_element = true;
+ let mut inside_paragraph = false;
+ let mut inside_text = false;
+ let mut text_els: usize = 0;
loop {
- match reader.read_event_into(&mut buf) {
+ let ev = reader.read_event_into(&mut buf);
+ // println!("Read event: {:?}", ev);
+ match ev {
Err(e) => panic!("Error reading {} at position {}: {:?}", src_file, reader.buffer_position(), e),
Ok(Event::Eof) => break,
+ Ok(Event::Empty(e)) => {
+ if Self::match_tag(&e.name(), &nslist, "br") {
+ cur_line.push_str(LINE_ENDING);
+ }
+ writer.write_event(Event::Empty(e)).expect(temp_file);
+ },
+ Ok(Event::Start(e)) => {
+ if first_element {
+ first_element = false;
+ Self::read_namespaces(&e, &mut nslist);
+ }
+
+ if Self::match_tag(&e.name(), &nslist, "p") { // TODO Maybe store para_qnames eagerly
+ inside_paragraph = true;
+ } else if inside_paragraph && Self::match_tag(&e.name(), &nslist, "t") {
+ inside_text = true;
+ } else if inside_paragraph && Self::match_tag(&e.name(), &nslist, "br") {
+ cur_line.push_str(LINE_ENDING);
+ }
+ writer.write_event(Event::Start(e)).expect(temp_file);
+ },
+ Ok(Event::End(e)) => {
+ if Self::match_tag(&e.name(), &nslist, "p") {
+ inside_paragraph = false;
+ if cur_line.len() > 0 {
+ paras.push(cur_line.clone());
+ }
+ cur_line.clear();
+ } else if inside_paragraph && Self::match_tag(&e.name(), &nslist, "t") {
+ inside_text = false;
+ }
+ writer.write_event(Event::End(e)).expect(&temp_file);
+ },
Ok(Event::Text(t)) => {
- let val = t.unescape().expect(src_file);
- let val_trimmed = val.trim();
- if val_trimmed.len() > 0 {
- println!("{}: {}", src_file, val_trimmed);
+ let mut ct = t;
+
+ if inside_text {
+ let val = ct.unescape().expect(src_file);
+ if val.len() > 0 {
+ coords.insert(text_els, (paras.len(), cur_line.len()));
+
+ let new_text = replacements.get(&text_els);
+ if let Some((nt, _)) = new_text {
+ ct = BytesText::new(nt);
+ println!("{}: {}\n-> {}", src_file, val, nt);
+ }
+
+ text_els += 1;
+
+ cur_line.push_str(val.as_ref());
+ }
+ }
+ writer.write_event(Event::Text(ct)).expect(&temp_file);
+ },
+ Ok(e) => writer.write_event(e).expect(temp_file)
+ }
+ }
+
+ drop(reader); // Close the file being read
+
+ // This writes out the file
+ writer.into_inner().into_inner().unwrap();
+
+ if !replacements.is_empty() {
+ // Original file should be replaced
+ fs::remove_file(xml_file).unwrap();
+ fs::rename(temp_res, xml_file).unwrap();
+ } else {
+ fs::remove_file(temp_res).unwrap();
+ }
+
+ (paras, coords)
+ }
+
+ /// Print every paragraph of the XML file at `path` to stdout, one per line, each prefixed
+ /// with `src_file` and a colon.
+ fn cat_text(path: &Path, src_file: &str) {
+     // With an empty replacement map `get_replace_text` leaves the file on disk untouched.
+     let paragraphs = Self::get_replace_text(path, src_file, HashMap::new()).0;
+
+     paragraphs
+         .iter()
+         .for_each(|text| println!("{}: {}", src_file, text));
+ }
+
+ /// Print every paragraph of the XML file at `path` that matches `regex` to stdout, prefixed
+ /// with `src_file` and a colon.
+ ///
+ /// # Panics
+ /// Panics if `regex` is `None`; the check happens before the file is read.
+ fn grep_text(path: &Path, src_file: &str, regex: &Option<Regex>) {
+     let rex = match regex {
+         Some(rex) => rex,
+         None => panic!("Bad regex for grep: {:?}", regex),
+     };
+
+     // With an empty replacement map `get_replace_text` leaves the file on disk untouched.
+     let (paras, _) = Self::get_replace_text(path, src_file, HashMap::new());
+
+     for para in paras.iter().filter(|p| rex.is_match(p)) {
+         println!("{}: {}", src_file, para);
+     }
+ }
+
+ /// Build the text-element layout of a single paragraph.
+ ///
+ /// `coords` maps each text element id to `(paragraph index, start offset within that paragraph)`.
+ /// The result only covers elements whose paragraph index equals `cur_line` and maps each
+ /// element's start offset to `(end offset, element id)`. The last element of the paragraph has
+ /// no successor, so its end offset stays `usize::MAX`; callers are expected to clamp it.
+ fn get_line_coords(cur_line: usize, coords: &BTreeMap<usize, (usize, usize)>) -> BTreeMap<usize, (usize, usize)> {
+     let mut res = BTreeMap::new();
+
+     for (id, (line, pos)) in coords {
+         if *line != cur_line {
+             continue;
+         }
+
+         // Open-ended until a later element of the same paragraph supplies the end offset.
+         res.insert(*pos, (usize::MAX, *id));
+
+         if *pos == 0 {
+             continue;
+         }
+
+         // This element does not start the paragraph text, so its start is normally the end of
+         // its predecessor. `checked_sub` guards element 0, which can start at a non-zero
+         // offset when a line break precedes it. A predecessor that belongs to another
+         // paragraph is ignored instead of aborting the scan, so the remaining elements of
+         // this paragraph are still recorded.
+         let prev = id
+             .checked_sub(1)
+             .and_then(|prev_id| coords.get(&prev_id).map(|c| (prev_id, c)));
+         if let Some((prev_id, (prev_line, prev_pos))) = prev {
+             if *prev_line == cur_line {
+                 res.insert(*prev_pos, (*pos, prev_id));
+             }
+         }
+     }
+
+     res
+ }
+
+ /// Apply one replacement inside a single text element ("tag").
+ ///
+ /// `tag_id` is the internal number of the element and `tag` its original text. The byte range
+ /// `match_start..match_end`, expressed in offsets of the ORIGINAL tag text, is replaced
+ /// with `replace`.
+ ///
+ /// `replacements` holds the replacements collected so far, keyed by tag id. Each value is the
+ /// current text of the tag (with earlier replacements applied) plus a correction table with one
+ /// entry per byte of the original text. The sum of the entries before an original offset is the
+ /// amount by which that offset has shifted because earlier replacements changed the text
+ /// length. The entry for `tag_id` is created on first use and updated in place.
+ fn replace_within_tag(replacements: &mut HashMap<usize, (String, Vec<i32>)>, tag_id: usize, tag: &str,
+         match_start: usize, match_end: usize, replace: &str) {
+     // Continue from the state left by earlier replacements in this tag, or start fresh.
+     let (mut replaced, mut corr_idxs) = replacements
+         .remove(&tag_id)
+         .unwrap_or_else(|| (tag.to_string(), vec![0; tag.len()]));
+
+     // Shift caused by everything that was replaced before this match.
+     let correction: i32 = corr_idxs[..match_start].iter().sum();
+
+     let repl_start = (match_start as i32 + correction) as usize;
+     let repl_end = (match_end as i32 + correction) as usize;
+     replaced.replace_range(repl_start..repl_end, replace);
+
+     let delta = replace.len() as i32 - match_end as i32 + match_start as i32;
+     if delta < 0 {
+         // The text shrank: record -1 for each removed byte. The table is indexed by ORIGINAL
+         // offsets, so the entries lie inside the original match range; using corrected
+         // offsets here could run past the end of the table after an earlier growing replacement.
+         let first_removed = (match_end as i32 + delta) as usize;
+         for entry in &mut corr_idxs[first_removed..match_end] {
+             *entry -= 1;
+         }
+     } else if delta > 0 {
+         // The text grew: book the whole growth on the last byte of the match. A match that
+         // ends at offset 0 has no such byte, so the first entry (if any) is used instead.
+         if let Some(entry) = corr_idxs.get_mut(match_end.saturating_sub(1)) {
+             *entry += delta;
+         }
+     }
+
+     replacements.insert(tag_id, (replaced, corr_idxs));
+ }
+
+ /// In the file pointed to by `path` replace all matches of `regex` with the `replace` value.
+ /// The input file will be overwritten with the result. `src_file` is the name of the original
+ /// .docx file.
+ ///
+ /// This method works by reading the file contents first via `get_replace_text` and applying the
+ /// regex to its result (a list of strings, representing each paragraph).
+ ///
+ /// Replacements are mapped to text elements ("tags") which are numbered internally. A match
+ /// that lies within one tag is replaced there. A match that spans several tags has its
+ /// replacement distributed: every tag but the last receives as many bytes of the replacement
+ /// as it contributed to the match (never splitting a UTF-8 character), and the last tag
+ /// receives whatever is left.
+ /// Once all the replacements have been found, the `get_replace_text` method is called again
+ /// but now with the replacements to-be-applied.
+ ///
+ /// # Panics
+ /// Panics if `regex` is `None`.
+ fn replace_text(path: &Path, src_file: &str, regex: &Option<Regex>, replace: &str) {
+     let mut replacements: HashMap<usize, (String, Vec<i32>)> = HashMap::new();
+
+     let rex = regex.as_ref().unwrap();
+     let (paras, coords) = Self::get_replace_text(path, src_file, HashMap::new());
+
+     let mut cur_line: usize = 0;
+     for para in paras {
+         let line_coords = Self::get_line_coords(cur_line, &coords);
+
+         // Tag id -> original tag text. This depends only on the paragraph, so it is built once
+         // instead of once per match. The last tag's end offset is open-ended and is clamped.
+         let tags: BTreeMap<usize, &str> = line_coords
+             .iter()
+             .map(|(idx, (eidx, id))| (*id, &para[*idx..(*eidx).min(para.len())]))
+             .collect();
+
+         for m in rex.find_iter(&para) {
+             let mstart = m.start();
+             let mend = m.end();
+
+             // Find the tags in which the match starts and ends.
+             let mut start_id = 0;
+             let mut end_id = 0;
+             let mut start_idx = 0;
+             for (idx, (_, id)) in &line_coords {
+                 if *idx <= mstart {
+                     start_id = *id;
+                     end_id = *id;
+                     start_idx = *idx;
+                 }
+                 if *idx < mend {
+                     end_id = *id;
+                 }
+             }
+
+             // Match offsets are paragraph-relative; `replace_within_tag` expects tag-relative ones.
+             let local_start = mstart - start_idx;
+
+             if start_id == end_id {
+                 // simplest case start and end are the same:
+                 if let Some(tag) = tags.get(&start_id) {
+                     Self::replace_within_tag(&mut replacements, start_id, tag, local_start, mend - start_idx, replace);
+                 }
+             } else {
+                 // Bytes of the match not yet attributed to a tag, and the part of the
+                 // replacement not yet handed out.
+                 let mut remaining_chars = mend - mstart;
+                 let mut cur_replacement = replace.to_string();
+                 for i in start_id..end_id + 1 {
+                     if let Some(tag) = tags.get(&i) {
+                         if i == end_id {
+                             // Last tag: the rest of the match is replaced by the rest of the replacement.
+                             Self::replace_within_tag(&mut replacements, i, tag, 0, remaining_chars, &cur_replacement);
+                         } else {
+                             // First tag: from the match start to the end of the tag.
+                             // Middle tags: the whole tag.
+                             let from = if i == start_id { local_start } else { 0 };
+                             let chars = tag.len() - from;
+
+                             // Hand out at most `chars` bytes, backing off to a character boundary.
+                             let mut cut = chars.min(cur_replacement.len());
+                             while !cur_replacement.is_char_boundary(cut) {
+                                 cut -= 1;
+                             }
+                             let rest = cur_replacement.split_off(cut);
+
+                             Self::replace_within_tag(&mut replacements, i, tag, from, tag.len(), &cur_replacement);
+
+                             remaining_chars = remaining_chars.saturating_sub(chars);
+                             cur_replacement = rest;
+                         }
+                     }
+                 }
}
- _ => (),
}
+         cur_line += 1;
+     }
- // buf.clear(); why is this suggested in the docs?
+     if !replacements.is_empty() {
+         Self::get_replace_text(path, src_file, replacements);
}
}
- fn snr_change_attribute(mut reader: Reader>, regex: &Option, replace: &Option<&str>, src_file: &str, output_path: &Path) {
+ fn snr_change_attribute(path: &Path, regex: &Option, replace: &Option<&str>, src_file: &str, output_path: &Path) {
if regex.is_none() || replace.is_none() {
return;
}
+ let mut reader = Self::get_reader(path);
let rex = regex.as_ref().unwrap();
let repl = replace.unwrap();
@@ -203,7 +611,8 @@ impl XMLUtil {
}
fn update_attributes<'a>(bs: BytesStart<'a>, regex: &Regex, replace: &str, src_file: &str) -> (BytesStart<'a>, bool) {
- let mut es = BytesStart::clone(&bs);
+ let mut es = bs.clone();
+
es.clear_attributes();
let mut changed = false;
@@ -236,7 +645,8 @@ impl XMLUtil {
}
}
- fn snr_xml_attribute(mode: &Mode, mut reader: Reader>, src_file: &str) {
+ fn snr_xml_attribute(mode: &Mode, path: &Path, src_file: &str) {
+ let mut reader = Self::get_reader(path);
let mut buf = Vec::new();
loop {
@@ -391,18 +801,29 @@ mod tests {
assert!(out.contains("my-file.docx: Here’s a hyperlink:"));
}
- /*
+ #[test]
+ #[serial]
+ fn test_cat2() { // `cat` must merge text that is split over several text elements and keep line breaks
+ let out = capture_stdout!(XMLUtil::cat("./src/test/test_tree5", "wordbreak.docx"));
+ let expected =
+ "wordbreak.docx: Notwithstanding the eventual resulting quotations punters were agreeable to a technocratic compromise.".to_string()
+ + super::LINE_ENDING + "Here’s another line of text."; // both lines belong to one paragraph, joined by the platform line ending
+ let idx1 = out.find(&expected).unwrap();
+ let idx2 = out.find("wordbreak.docx: And this text is in the next paragraph.").unwrap();
+
+ assert!(idx1 < idx2); // paragraphs are printed in document order
+ }
+
#[test]
#[serial] // This test has to run serially to avoid multiple tests to capture stdout
fn test_grep() {
let out = capture_stdout!(XMLUtil::grep_xml("./src/test/test_tree2", "doc123.docx", "[oe]re"));
- assert!(out.contains("doc123.docx: And some some more text"));
+ assert!(out.contains("doc123.docx: And some some some more text"));
assert!(out.contains("doc123.docx: Something here"));
assert!(out.contains("doc123.docx: Here’s a hyperlink:"));
assert!(out.contains("doc123.docx: And here’s just some text:"));
assert!(!out.contains("Target"));
}
- */
#[test]
#[serial]
@@ -418,16 +839,15 @@ mod tests {
assert!(!out.contains("Target=webSettings.xml"))
}
- /*
#[test]
- fn test_replace() -> io::Result<()> {
+ fn test_replace_shorten() -> io::Result<()> {
let orgdir = "./src/test/test_tree2";
let testdir = testdir!();
copy_dir_all(orgdir, &testdir)?;
let before = fs::read_to_string("./src/test/test_tree2/word/document.xml")?;
- assert!(before.contains("And some some more text"), "Precondition");
+ assert!(before.contains("And some some some more text"), "Precondition");
assert!(before.contains("and then some"), "Precondition");
assert!(before.contains("Something here"), "Precondition");
assert!(before.contains(">some<"), "Precondition");
@@ -440,7 +860,7 @@ mod tests {
// Check that the replacement worked as expected
let after = fs::read_to_string(testdir.join("word/document.xml"))?;
- assert!(after.contains("And zzz zzz more text"));
+ assert!(after.contains("And zzz zzz zzz more text"));
assert!(after.contains("and then zzz"));
assert!(after.contains("zzzthing here"));
assert!(after.contains(">zzz"));
@@ -449,7 +869,147 @@ mod tests {
Ok(())
}
- */
+
+ #[test]
+ fn test_replace_make_longer() -> io::Result<()> { // replacement (6 bytes) is longer than each match (4 bytes)
+ let orgdir = "./src/test/test_tree2";
+ let testdir = testdir!();
+
+ copy_dir_all(orgdir, &testdir)?; // work on a copy so the fixture itself is never modified
+
+ let before = fs::read_to_string("./src/test/test_tree2/word/document.xml")?;
+ assert!(before.contains("And some some some more text"), "Precondition");
+ assert!(before.contains("and then some"), "Precondition");
+ assert!(before.contains("Something here"), "Precondition");
+ assert!(before.contains(">some<"), "Precondition");
+ assert!(before.contains(">Some <"), "Precondition");
+ assert!(!before.contains("zzz"), "Precondition");
+
+ XMLUtil::replace_xml(&testdir.to_string_lossy(), "my-source.docx",
+ "[Ss]ome", "ABCDEF",
+ &Some(&testdir.join("output.docx").to_string_lossy()));
+
+ // Check that the replacement worked as expected
+ let after = fs::read_to_string(testdir.join("word/document.xml"))?;
+ assert!(after.contains("And ABCDEF ABCDEF ABCDEF more text")); // three matches within one text element
+ assert!(after.contains("and then ABCDEF"));
+ assert!(after.contains("ABCDEFthing here"));
+ assert!(after.contains(">ABCDEF"));
+ assert!(!after.contains("some"));
+ assert!(!after.contains("Some"));
+
+ Ok(())
+ }
+
+ #[test]
+ fn test_replace_across_tags() -> io::Result<()> { // match spans three text elements; replacement is longer than the match
+ let orgdir = "./src/test/test_tree5";
+ let testdir = testdir!();
+
+ copy_dir_all(orgdir, &testdir)?;
+
+ let before = fs::read_to_string("./src/test/test_tree5/word/document.xml")?;
+ assert!(before.contains("re"), "Precondition");
+ assert!(before.contains("sult"), "Precondition");
+ assert!(before.contains("ing"), "Precondition");
+ assert!(!before.contains("resulting"), "Precondition"); // the word never appears contiguously in the XML
+
+ XMLUtil::replace_xml(&testdir.to_string_lossy(), "acrstags.docx",
+ "resulting", "1234567890",
+ &Some(&testdir.join("output.docx").to_string_lossy()));
+
+ let after = fs::read_to_string(testdir.join("word/document.xml"))?;
+ assert!(after.contains("eventual 12<")); // first element contributed "re": gets 2 bytes of the replacement
+ assert!(after.contains(">3456<")); // middle element contributed "sult": gets the next 4 bytes
+ assert!(after.contains(">7890 quotations")); // last element gets the remainder
+ assert!(!after.contains("1234567890"));
+
+ Ok(())
+ }
+
+ #[test]
+ fn test_replace_across_tags0() -> io::Result<()> { // replacement is shorter than the part of the match in the first element
+ let orgdir = "./src/test/test_tree5";
+ let testdir = testdir!();
+
+ copy_dir_all(orgdir, &testdir)?;
+
+ let before = fs::read_to_string("./src/test/test_tree5/word/document.xml")?;
+ assert!(before.contains("re"), "Precondition");
+ assert!(before.contains("sult"), "Precondition");
+ assert!(before.contains("ing"), "Precondition");
+ assert!(!before.contains("resulting"), "Precondition");
+
+ XMLUtil::replace_xml(&testdir.to_string_lossy(), "acrstags.docx",
+ "resulting", "1",
+ &Some(&testdir.join("output.docx").to_string_lossy()));
+
+ let after = fs::read_to_string(testdir.join("word/document.xml"))?;
+ assert!(after.contains("eventual 1<")); // the whole replacement lands in the first element
+ assert!(after.contains("><")); // the middle element is left empty
+ assert!(after.contains("> quotations")); // the matched prefix of the last element is removed
+
+ Ok(())
+ }
+
+ #[test]
+ fn test_replace_across_tags1() -> io::Result<()> { // replacement runs out inside the middle element
+ let orgdir = "./src/test/test_tree5";
+ let testdir = testdir!();
+
+ copy_dir_all(orgdir, &testdir)?;
+
+ let before = fs::read_to_string("./src/test/test_tree5/word/document.xml")?;
+ assert!(before.contains("re"), "Precondition");
+ assert!(before.contains("sult"), "Precondition");
+ assert!(before.contains("ing"), "Precondition");
+ assert!(!before.contains("resulting"), "Precondition");
+
+ XMLUtil::replace_xml(&testdir.to_string_lossy(), "acrstags.docx",
+ "resulting", "123",
+ &Some(&testdir.join("output.docx").to_string_lossy()));
+
+ let after = fs::read_to_string(testdir.join("word/document.xml"))?;
+ assert!(after.contains("eventual 12<")); // first element gets 2 bytes
+ assert!(after.contains(">3<")); // middle element gets the single remaining byte
+ assert!(after.contains("> quotations")); // nothing is left for the last element
+
+ Ok(())
+ }
+
+ #[test]
+ fn test_replace_across_tags2() -> io::Result<()> { // alternation with matches of different lengths, replacement shorter than both
+ let orgdir = "./src/test/test_tree2";
+ let testdir = testdir!();
+
+ copy_dir_all(orgdir, &testdir)?;
+
+ XMLUtil::replace_xml(&testdir.to_string_lossy(), "xyz.docx",
+ "(text and|then some)", "aaa", &None); // NOTE(review): with `&None` the output name falls back to "xyz.docx", presumably relative to the working directory - confirm
+
+ let after = fs::read_to_string(testdir.join("word/document.xml"))?;
+ assert!(after.contains("some more aaa aaa<"));
+
+ Ok(())
+ }
+
+ #[test]
+ fn test_replace_across_tags3() -> io::Result<()> { // same pattern as test_replace_across_tags2 with a 5-byte replacement
+ let orgdir = "./src/test/test_tree2";
+ let testdir = testdir!();
+
+ copy_dir_all(orgdir, &testdir)?;
+
+ XMLUtil::replace_xml(&testdir.to_string_lossy(), "xyz.docx",
+ "(text and|then some)", "bbbbb", &None); // NOTE(review): with `&None` the output name falls back to "xyz.docx", presumably relative to the working directory - confirm
+
+ let after = fs::read_to_string(testdir.join("word/document.xml"))?;
+ assert!(after.contains("some more bbbbb bbbbb<"));
+
+ Ok(())
+ }
#[test]
fn test_replace_hyperlink() -> io::Result<()> {
@@ -477,7 +1037,6 @@ mod tests {
Ok(())
}
- /*
#[test]
fn test_replace_both() -> io::Result<()> {
let orgdir = "./src/test/test_tree3";
@@ -520,7 +1079,6 @@ mod tests {
Ok(())
}
- */
fn copy_dir_all(src: impl AsRef, dst: impl AsRef) -> io::Result<()> {
fs::create_dir_all(&dst)?;