diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index b29c593..aaf3fcc 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -3,5 +3,16 @@ FROM mcr.microsoft.com/devcontainers/rust:1-1-bullseye # [Optional] Uncomment this section to install additional packages. -# RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \ -# && apt-get -y install --no-install-recommends +RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \ + && apt-get -y install --no-install-recommends python3-lxml + +USER vscode + +# Install nightly rust +RUN rustup toolchain install nightly + +# Install pip +ENV PATH="${PATH}:/home/vscode/.local/bin" +RUN curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \ + && python3 /tmp/get-pip.py \ + && rm /tmp/get-pip.py diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 6c5728e..2d6e42d 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -5,32 +5,35 @@ "build": { "dockerfile": "Dockerfile" }, - "runArgs": [ "--cap-add=SYS_PTRACE", "--security-opt", "seccomp=unconfined" ], - - // Set *default* container specific settings.json values on container create. - "settings": { - "lldb.executable": "/usr/bin/lldb", - // VS Code don't watch files under ./target - "files.watcherExclude": { - "**/target/**": true + "runArgs": [ + "--cap-add=SYS_PTRACE", + "--security-opt", + "seccomp=unconfined" + ], + "customizations": { + "vscode": { + // Set *default* container specific settings.json values on container create. + "settings": { + "lldb.executable": "/usr/bin/lldb", + // VS Code don't watch files under ./target + "files.watcherExclude": { + "**/target/**": true + } + }, + // Add the IDs of extensions you want installed when the container is created. + "extensions": [ + "matklad.rust-analyzer", + "bungcip.better-toml", + "vadimcn.vscode-lldb", + "mutantdino.resourcemonitor", + "ms-azuretools.vscode-docker" + ] } }, - - // Add the IDs of extensions you want installed when the container is created. - "extensions": [ - "matklad.rust-analyzer", - "bungcip.better-toml", - "vadimcn.vscode-lldb", - "mutantdino.resourcemonitor", - "ms-azuretools.vscode-docker" - ], - // Use 'forwardPorts' to make a list of ports inside the container available locally. // "forwardPorts": [], - // Use 'postCreateCommand' to run commands after the container is created. - // "postCreateCommand": "rustc --version", - + "postCreateCommand": "pip install -r tests/lxml_tests/requirements.txt", // Comment out connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root. "remoteUser": "vscode" -} +} \ No newline at end of file diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 1d47d4c..e532d04 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -11,14 +11,17 @@ env: CARGO_TERM_COLOR: always jobs: - build: + build: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - name: Build - run: cargo build --verbose - - name: Run tests - run: cargo test --verbose + - name: Checkout (GitHub) + uses: actions/checkout@v3 + - name: Run tests in devcontainer + uses: devcontainers/ci@v0.3 + with: + push: never + runCmd: | + cargo test stack_overflow_tests: runs-on: windows-latest diff --git a/Cargo.toml b/Cargo.toml index d57b2ca..71038c3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "skyscraper" -version = "0.6.1" +version = "0.6.2" authors = ["James La Novara-Gsell "] edition = "2021" description = "XPath for HTML web scraping" @@ -29,6 +29,8 @@ mockall = "0.12.0" indoc = "2" proptest = "1.3.1" regex = "1.10.3" +serde_json = "1.0.113" +serde = "1.0.196" [[bench]] name = "benchmarks" diff --git a/src/html/mod.rs b/src/html/mod.rs index c859c0a..e7fa9d1 100644 --- a/src/html/mod.rs +++ b/src/html/mod.rs @@ -139,21 +139,70 @@ pub struct HtmlText { impl HtmlText { /// Creates a new [HtmlText] from the given string. pub fn from_str(value: &str) -> HtmlText { - // If the text has non-whitespace characters, trim it. - let trimmed_text = value.trim(); - let value = if trimmed_text.is_empty() { - value - } else { - trimmed_text - }; - + let text = unescape_characters(value); HtmlText { - value: value.to_string(), - only_whitespace: trimmed_text.is_empty(), + value: text.to_string(), + only_whitespace: text.trim().is_empty(), } } } +/// Unescapes commonly escaped characters in HTML text. +/// +/// - `&` becomes `&` +/// - `<` becomes `<` +/// - `>` becomes `>` +/// - `"` becomes `"` +/// - `'` becomes `'` +pub fn unescape_characters(text: &str) -> String { + text.replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace(""", r#"""#) + .replace("'", "'") +} + +/// Escapes commonly escaped characters in HTML text. +/// +/// - `&` becomes `&` +/// - `<` becomes `<` +/// - `>` becomes `>` +/// - `"` becomes `"` +/// - `'` becomes `'` +pub fn escape_characters(text: &str) -> String { + text.replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace(r#"""#, """) + .replace("'", "'") +} + +/// Trims internal whitespace from the given text such that only a single space separates words. +/// This is used to emulate the behaviour of Chromium browsers. +/// +/// # Example +/// ```rust +/// use skyscraper::html::trim_internal_whitespace; +/// let text = " hello \n world "; +/// let result = trim_internal_whitespace(text); +/// assert_eq!("hello world", result); +/// ``` +pub fn trim_internal_whitespace(text: &str) -> String { + let mut result = String::new(); + let mut last_char = ' '; + for c in text.chars() { + if c.is_whitespace() { + if !last_char.is_whitespace() { + result.push(' '); + } + } else { + result.push(c); + } + last_char = c; + } + result.trim_end().to_string() +} + /// An HTML node can be either a tag or raw text. #[derive(Clone, Debug, EnumExtract)] pub enum HtmlNode { @@ -249,6 +298,14 @@ impl HtmlDocument { display_node(0, self, &self.root_node, format_type).expect("failed to display node"); format!("{}", text) } + + /// Get an iterator over all nodes in this document. + pub fn iter(&self) -> impl Iterator + '_ { + self.arena.iter().map(|node| { + let id = self.arena.get_node_id(node).unwrap(); + DocumentNode::new(id) + }) + } } impl fmt::Display for HtmlDocument { @@ -325,14 +382,15 @@ fn display_node( } } HtmlNode::Text(text) => { + let output_text = escape_characters(text.value.as_str()); match format_type { DocumentFormatType::Standard => { - write!(&mut str, "{}", text.value)?; + write!(&mut str, "{}", output_text)?; } DocumentFormatType::IgnoreWhitespace => { // If ignoring whitespace texts, only display if this text is not solely whitespace. if !text.only_whitespace { - write!(&mut str, "{}", text.value)?; + write!(&mut str, "{}", output_text)?; } } DocumentFormatType::Indented => { @@ -341,7 +399,7 @@ fn display_node( display_indent(indent, &mut str)?; // Trim the text incase there's leading or trailing whitespace. - writeln!(&mut str, "{}", text.value.trim())?; + writeln!(&mut str, "{}", output_text.trim())?; } } } @@ -809,4 +867,30 @@ mod tests { // assert assert_eq!(html_output, text); } + + #[test] + fn html_document_display_should_escape_text() { + // arrange + let text = indoc!( + r#" + + < + + "#, + ); + + let document = parse(&text).unwrap(); + + // act + let html_output = document.to_formatted_string(DocumentFormatType::Indented); + + // assert + // assert that the text retrieved from the tag was unescaped + let root_text = document.root_node.get_text(&document).unwrap(); + let trimmed = root_text.trim(); + assert_eq!("<", trimmed); + + // asser that the display output was escaped + assert_eq!(html_output, text); + } } diff --git a/src/html/parse/mod.rs b/src/html/parse/mod.rs index 082bd2e..fb985b5 100644 --- a/src/html/parse/mod.rs +++ b/src/html/parse/mod.rs @@ -409,7 +409,7 @@ fn get_mut_tree_node(key: Option, arena: &mut Arena) -> &mut N pub mod test_helpers { use std::collections::HashMap; - use crate::html::{DocumentNode, HtmlDocument, HtmlNode, HtmlText}; + use crate::html::{DocumentNode, HtmlDocument, HtmlNode}; pub fn assert_tag( document: &HtmlDocument, @@ -445,7 +445,7 @@ pub mod test_helpers { let html_node = document.get_html_node(&key).unwrap(); let node_text = html_node.extract_as_text(); - assert_eq!(&HtmlText::from_str(text), node_text); + assert_eq!(text, node_text.value.trim()); } } @@ -910,23 +910,12 @@ mod tests { // -> ->
->
->
->
->
->

-> text() { - let key = children.next().unwrap(); - assert_text( - &result, - key, - "Rust is blazingly fast and memory-efficient: with no runtime or"); - - let key = children.next().unwrap(); - assert_text( - &result, - key, - "garbage collector, it can power performance-critical services, run on"); + let mut t = String::from("Rust is blazingly fast and memory-efficient: with no runtime or"); + t = format!("{}\n garbage collector, it can power performance-critical services, run on", t); + t = format!("{}\n embedded devices, and easily integrate with other languages.", t); let key = children.next().unwrap(); - assert_text( - &result, - key, - "embedded devices, and easily integrate with other languages."); + assert_text(&result, key, &t); } } } diff --git a/src/html/tokenizer/helpers.rs b/src/html/tokenizer/helpers.rs index f2a83e2..e39f881 100644 --- a/src/html/tokenizer/helpers.rs +++ b/src/html/tokenizer/helpers.rs @@ -275,10 +275,10 @@ pub fn is_text( pointer.index = pointer_index; buffer.push('<'); } - Some('\n') => { - // Text is allowed to start with a new line, but not allowed to contain one mid-sequence. - break; - } + // Some('\n') => { + // // Text is allowed to start with a new line, but not allowed to contain one mid-sequence. + // break; + // } Some(c) => { buffer.push(*c); } @@ -683,7 +683,7 @@ mod tests { } #[test] - fn is_text_should_terminate_on_newline() { + fn is_text_should_not_terminate_on_newline() { // arrange let chars: Vec = "foo\nbar".chars().collect(); let mut pointer = VecPointerRef::new(&chars); @@ -692,8 +692,8 @@ mod tests { let result = is_text(&mut pointer, false, true).unwrap(); // assert - assert_eq!(result, Token::Text(String::from("foo"))); - assert_eq!(pointer.index, 3); + assert_eq!(result, Token::Text(String::from("foo\nbar"))); + assert_eq!(pointer.index, 7); } #[test] @@ -709,4 +709,18 @@ mod tests { assert_eq!(result, Token::Text(String::from("\n\t\t"))); assert_eq!(pointer.index, 3); } + + #[test] + fn is_text_should_capture_multiple_lines_of_whitespace() { + // arrange + let chars: Vec = "\n\t\n\t".chars().collect(); + let mut pointer = VecPointerRef::new(&chars); + + // act + let result = is_text(&mut pointer, false, true).unwrap(); + + // assert + assert_eq!(result, Token::Text(String::from("\n\t\n\t"))); + assert_eq!(pointer.index, 4); + } } diff --git a/src/html/tokenizer/mod.rs b/src/html/tokenizer/mod.rs index 23197de..9e68e3b 100644 --- a/src/html/tokenizer/mod.rs +++ b/src/html/tokenizer/mod.rs @@ -60,6 +60,9 @@ pub fn lex(text: &str) -> Result, LexError> { symbols.push(s); } else { if let Some(c) = pointer.current() { + if *c != ' ' { + println!("Unknown symbol {}", c); + } if !c.is_whitespace() { // Unknown symbol, move on ¯\_(ツ)_/¯ error!("Unknown HTML symbol {}", c); @@ -183,207 +186,4 @@ mod tests { assert_eq!(expected, result); } - - #[test] - fn lex_should_work_with_html() { - // arrange - let html = r###" - - - - Rust Programming Language - - - - - - -

-
-
-
-

- Why Rust? -

-
-
-
-

Performance

-

- Rust is blazingly fast and memory-efficient: with no runtime or - garbage collector, it can power performance-critical services, run on - embedded devices, and easily integrate with other languages. -

-
-
-
-
-
-