Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(xpath): Adjust text methods to match lxml #30

Merged
merged 1 commit into from
Feb 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,16 @@
FROM mcr.microsoft.com/devcontainers/rust:1-1-bullseye

# [Optional] Uncomment this section to install additional packages.
# RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \
# && apt-get -y install --no-install-recommends <your-package-list-here>
RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \
&& apt-get -y install --no-install-recommends python3-lxml

USER vscode

# Install nightly rust
RUN rustup toolchain install nightly

# Install pip
ENV PATH="${PATH}:/home/vscode/.local/bin"
RUN curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \
&& python3 /tmp/get-pip.py \
&& rm /tmp/get-pip.py
47 changes: 25 additions & 22 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,32 +5,35 @@
"build": {
"dockerfile": "Dockerfile"
},
"runArgs": [ "--cap-add=SYS_PTRACE", "--security-opt", "seccomp=unconfined" ],

// Set *default* container specific settings.json values on container create.
"settings": {
"lldb.executable": "/usr/bin/lldb",
// VS Code don't watch files under ./target
"files.watcherExclude": {
"**/target/**": true
"runArgs": [
"--cap-add=SYS_PTRACE",
"--security-opt",
"seccomp=unconfined"
],
"customizations": {
"vscode": {
// Set *default* container specific settings.json values on container create.
"settings": {
"lldb.executable": "/usr/bin/lldb",
// VS Code don't watch files under ./target
"files.watcherExclude": {
"**/target/**": true
}
},
// Add the IDs of extensions you want installed when the container is created.
"extensions": [
"matklad.rust-analyzer",
"bungcip.better-toml",
"vadimcn.vscode-lldb",
"mutantdino.resourcemonitor",
"ms-azuretools.vscode-docker"
]
}
},

// Add the IDs of extensions you want installed when the container is created.
"extensions": [
"matklad.rust-analyzer",
"bungcip.better-toml",
"vadimcn.vscode-lldb",
"mutantdino.resourcemonitor",
"ms-azuretools.vscode-docker"
],

// Use 'forwardPorts' to make a list of ports inside the container available locally.
// "forwardPorts": [],

// Use 'postCreateCommand' to run commands after the container is created.
// "postCreateCommand": "rustc --version",

"postCreateCommand": "pip install -r tests/lxml_tests/requirements.txt",
// Comment out connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root.
"remoteUser": "vscode"
}
}
15 changes: 9 additions & 6 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,17 @@ env:
CARGO_TERM_COLOR: always

jobs:
build:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Build
run: cargo build --verbose
- name: Run tests
run: cargo test --verbose
- name: Checkout (GitHub)
uses: actions/checkout@v3
- name: Run tests in devcontainer
uses: devcontainers/ci@v0.3
with:
push: never
runCmd: |
cargo test

stack_overflow_tests:
runs-on: windows-latest
Expand Down
4 changes: 3 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "skyscraper"
version = "0.6.1"
version = "0.6.2"
authors = ["James La Novara-Gsell <james.lanovara.gsell@gmail.com>"]
edition = "2021"
description = "XPath for HTML web scraping"
Expand Down Expand Up @@ -29,6 +29,8 @@ mockall = "0.12.0"
indoc = "2"
proptest = "1.3.1"
regex = "1.10.3"
serde_json = "1.0.113"
serde = "1.0.196"

[[bench]]
name = "benchmarks"
Expand Down
110 changes: 97 additions & 13 deletions src/html/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -139,21 +139,70 @@ pub struct HtmlText {
impl HtmlText {
/// Creates a new [HtmlText] from the given string.
pub fn from_str(value: &str) -> HtmlText {
// If the text has non-whitespace characters, trim it.
let trimmed_text = value.trim();
let value = if trimmed_text.is_empty() {
value
} else {
trimmed_text
};

let text = unescape_characters(value);
HtmlText {
value: value.to_string(),
only_whitespace: trimmed_text.is_empty(),
value: text.to_string(),
only_whitespace: text.trim().is_empty(),
}
}
}

/// Unescapes commonly escaped characters in HTML text.
///
/// - `&amp;` becomes `&`
/// - `&lt;` becomes `<`
/// - `&gt;` becomes `>`
/// - `&quot;` becomes `"`
/// - `&#39;` becomes `'`
pub fn unescape_characters(text: &str) -> String {
text.replace("&amp;", "&")
.replace("&lt;", "<")
.replace("&gt;", ">")
.replace("&quot;", r#"""#)
.replace("&#39;", "'")
}

/// Escapes commonly escaped characters in HTML text.
///
/// - `&` becomes `&amp;`
/// - `<` becomes `&lt;`
/// - `>` becomes `&gt;`
/// - `"` becomes `&quot;`
/// - `'` becomes `&#39;`
pub fn escape_characters(text: &str) -> String {
text.replace("&", "&amp;")
.replace("<", "&lt;")
.replace(">", "&gt;")
.replace(r#"""#, "&quot;")
.replace("'", "&#39;")
}

/// Trims internal whitespace from the given text such that only a single space separates words.
/// This is used to emulate the behaviour of Chromium browsers.
///
/// # Example
/// ```rust
/// use skyscraper::html::trim_internal_whitespace;
/// let text = " hello \n world ";
/// let result = trim_internal_whitespace(text);
/// assert_eq!("hello world", result);
/// ```
pub fn trim_internal_whitespace(text: &str) -> String {
let mut result = String::new();
let mut last_char = ' ';
for c in text.chars() {
if c.is_whitespace() {
if !last_char.is_whitespace() {
result.push(' ');
}
} else {
result.push(c);
}
last_char = c;
}
result.trim_end().to_string()
}

/// An HTML node can be either a tag or raw text.
#[derive(Clone, Debug, EnumExtract)]
pub enum HtmlNode {
Expand Down Expand Up @@ -249,6 +298,14 @@ impl HtmlDocument {
display_node(0, self, &self.root_node, format_type).expect("failed to display node");
format!("{}", text)
}

/// Get an iterator over all nodes in this document.
pub fn iter(&self) -> impl Iterator<Item = DocumentNode> + '_ {
self.arena.iter().map(|node| {
let id = self.arena.get_node_id(node).unwrap();
DocumentNode::new(id)
})
}
}

impl fmt::Display for HtmlDocument {
Expand Down Expand Up @@ -325,14 +382,15 @@ fn display_node(
}
}
HtmlNode::Text(text) => {
let output_text = escape_characters(text.value.as_str());
match format_type {
DocumentFormatType::Standard => {
write!(&mut str, "{}", text.value)?;
write!(&mut str, "{}", output_text)?;
}
DocumentFormatType::IgnoreWhitespace => {
// If ignoring whitespace texts, only display if this text is not solely whitespace.
if !text.only_whitespace {
write!(&mut str, "{}", text.value)?;
write!(&mut str, "{}", output_text)?;
}
}
DocumentFormatType::Indented => {
Expand All @@ -341,7 +399,7 @@ fn display_node(
display_indent(indent, &mut str)?;

// Trim the text incase there's leading or trailing whitespace.
writeln!(&mut str, "{}", text.value.trim())?;
writeln!(&mut str, "{}", output_text.trim())?;
}
}
}
Expand Down Expand Up @@ -809,4 +867,30 @@ mod tests {
// assert
assert_eq!(html_output, text);
}

#[test]
fn html_document_display_should_escape_text() {
// arrange
let text = indoc!(
r#"
<html>
&lt;
</html>
"#,
);

let document = parse(&text).unwrap();

// act
let html_output = document.to_formatted_string(DocumentFormatType::Indented);

// assert
// assert that the text retrieved from the tag was unescaped
let root_text = document.root_node.get_text(&document).unwrap();
let trimmed = root_text.trim();
assert_eq!("<", trimmed);

// asser that the display output was escaped
assert_eq!(html_output, text);
}
}
23 changes: 6 additions & 17 deletions src/html/parse/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -409,7 +409,7 @@ fn get_mut_tree_node(key: Option<NodeId>, arena: &mut Arena<HtmlNode>) -> &mut N
pub mod test_helpers {
use std::collections::HashMap;

use crate::html::{DocumentNode, HtmlDocument, HtmlNode, HtmlText};
use crate::html::{DocumentNode, HtmlDocument, HtmlNode};

pub fn assert_tag(
document: &HtmlDocument,
Expand Down Expand Up @@ -445,7 +445,7 @@ pub mod test_helpers {
let html_node = document.get_html_node(&key).unwrap();

let node_text = html_node.extract_as_text();
assert_eq!(&HtmlText::from_str(text), node_text);
assert_eq!(text, node_text.value.trim());
}
}

Expand Down Expand Up @@ -910,23 +910,12 @@ mod tests {

// <html> -> <body> -> <main> -> <section> -> <div> -> <div> -> <section> -> <p> -> text()
{
let key = children.next().unwrap();
assert_text(
&result,
key,
"Rust is blazingly fast and memory-efficient: with no runtime or");

let key = children.next().unwrap();
assert_text(
&result,
key,
"garbage collector, it can power performance-critical services, run on");
let mut t = String::from("Rust is blazingly fast and memory-efficient: with no runtime or");
t = format!("{}\n garbage collector, it can power performance-critical services, run on", t);
t = format!("{}\n embedded devices, and easily integrate with other languages.", t);

let key = children.next().unwrap();
assert_text(
&result,
key,
"embedded devices, and easily integrate with other languages.");
assert_text(&result, key, &t);
}
}
}
Expand Down
28 changes: 21 additions & 7 deletions src/html/tokenizer/helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -275,10 +275,10 @@ pub fn is_text(
pointer.index = pointer_index;
buffer.push('<');
}
Some('\n') => {
// Text is allowed to start with a new line, but not allowed to contain one mid-sequence.
break;
}
// Some('\n') => {
// // Text is allowed to start with a new line, but not allowed to contain one mid-sequence.
// break;
// }
Some(c) => {
buffer.push(*c);
}
Expand Down Expand Up @@ -683,7 +683,7 @@ mod tests {
}

#[test]
fn is_text_should_terminate_on_newline() {
fn is_text_should_not_terminate_on_newline() {
// arrange
let chars: Vec<char> = "foo\nbar".chars().collect();
let mut pointer = VecPointerRef::new(&chars);
Expand All @@ -692,8 +692,8 @@ mod tests {
let result = is_text(&mut pointer, false, true).unwrap();

// assert
assert_eq!(result, Token::Text(String::from("foo")));
assert_eq!(pointer.index, 3);
assert_eq!(result, Token::Text(String::from("foo\nbar")));
assert_eq!(pointer.index, 7);
}

#[test]
Expand All @@ -709,4 +709,18 @@ mod tests {
assert_eq!(result, Token::Text(String::from("\n\t\t")));
assert_eq!(pointer.index, 3);
}

#[test]
fn is_text_should_capture_multiple_lines_of_whitespace() {
// arrange
let chars: Vec<char> = "\n\t\n\t".chars().collect();
let mut pointer = VecPointerRef::new(&chars);

// act
let result = is_text(&mut pointer, false, true).unwrap();

// assert
assert_eq!(result, Token::Text(String::from("\n\t\n\t")));
assert_eq!(pointer.index, 4);
}
}
Loading
Loading