Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(html): Handle mismatched tags #14

Merged
merged 1 commit into from
Jul 20, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
feat(html): Handle mismatched tags
- Adds an extension point for handling mismatched tags in malformed HTML
- Adds several mismatched tag handlers
- Implements Display on HtmlDocument
James-LG committed Jul 20, 2023

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
commit 2fc5295be5858ea9409d0aa06014aee001a05819
5 changes: 5 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"rust-analyzer.linkedProjects": [
"./Cargo.toml"
]
}
55 changes: 29 additions & 26 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,26 +1,29 @@
[package]
name = "skyscraper"
version = "0.4.0"
authors = ["James La Novara-Gsell <james.lanovara.gsell@gmail.com>"]
edition = "2018"
description = "XPath for HTML web scraping"
license = "MIT"
readme = "README.md"
homepage = "https://github.com/James-LG/Skyscraper"
repository = "https://github.com/James-LG/Skyscraper"
categories = ["parsing"]

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
indextree = "4.3.1"
lazy_static = "1.4.0"
thiserror = "1.0.30"
indexmap = "1.8.2"

[dev-dependencies]
criterion = "0.3"

[[bench]]
name = "benchmarks"
harness = false
[package]
name = "skyscraper"
version = "0.4.0"
authors = ["James La Novara-Gsell <james.lanovara.gsell@gmail.com>"]
edition = "2018"
description = "XPath for HTML web scraping"
license = "MIT"
readme = "README.md"
homepage = "https://github.com/James-LG/Skyscraper"
repository = "https://github.com/James-LG/Skyscraper"
categories = ["parsing"]

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
indextree = "4.3.1"
lazy_static = "1.4.0"
thiserror = "1.0.30"
indexmap = "2.0.0"
log = "0.4.19"

[dev-dependencies]
criterion = "0.5.1"
mockall = "0.11.4"
indoc = "2"

[[bench]]
name = "benchmarks"
harness = false
255 changes: 222 additions & 33 deletions src/html/mod.rs

Large diffs are not rendered by default.

476 changes: 476 additions & 0 deletions src/html/parse/malformed_html_handlers.rs

Large diffs are not rendered by default.

369 changes: 212 additions & 157 deletions src/html/parse.rs → src/html/parse/mod.rs

Large diffs are not rendered by default.

116 changes: 116 additions & 0 deletions src/html/parse/parse_options.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
//! Create options for [parse_opts](super::parse_opts).
use super::malformed_html_handlers::{ErrorMismatchedTagHandler, MismatchedTagHandler};

/// Options for [parse_opts](super::parse_opts).
///
/// Construct with [ParseOptions::new] (or [Default::default]) to get the default values,
/// or use [ParseOptionsBuilder] to override individual settings.
pub struct ParseOptions {
/// Defines the method for handling an end tag that doesn't match the currently opened tag.
///
/// [ParseOptions::new] sets this to an [ErrorMismatchedTagHandler].
pub mismatched_tag_handler: Box<dyn MismatchedTagHandler>,
}

impl ParseOptions {
/// Create a new [ParseOptions] with default values.
pub fn new() -> Self {
Self {
mismatched_tag_handler: Box::new(ErrorMismatchedTagHandler::new()),
}
}
}

impl Default for ParseOptions {
fn default() -> Self {
ParseOptions::new()
}
}

/// Builds [ParseOptions] for the [Parser](crate::html::parse::Parser).
///
/// See [ParseOptions] for the default values used if not set by the builder.
///
/// Each `with_*` call queues a change; the queued changes are applied in call order,
/// starting from [ParseOptions::new], when [build](ParseOptionsBuilder::build) is called.
///
/// Example usage:
/// ```rust
/// # use std::error::Error;
/// # fn main() -> Result<(), Box<dyn Error>> {
/// use skyscraper::html::parse::{Parser, ParseOptionsBuilder, malformed_html_handlers::VoidMismatchedTagHandler};
///
/// let options = ParseOptionsBuilder::new()
/// .with_mismatched_tag_handler(Box::new(VoidMismatchedTagHandler::new(None)))
/// .build();
///
/// let parser = Parser::new(options);
/// # Ok(())
/// # }
/// ```
pub struct ParseOptionsBuilder {
// Queued option changes, one per builder call, in the order the calls were made.
// Each one consumes the current options and returns the updated options.
reducers: Vec<Box<dyn FnOnce(ParseOptions) -> ParseOptions>>,
}

impl ParseOptionsBuilder {
    /// Creates a new [ParseOptionsBuilder] with no changes queued.
    pub fn new() -> Self {
        let reducers = Vec::new();
        Self { reducers }
    }

    /// Set the type of [MismatchedTagHandler] the parser should use.
    ///
    /// The change is queued and only takes effect when [build](Self::build) runs.
    pub fn with_mismatched_tag_handler(mut self, handler: Box<dyn MismatchedTagHandler>) -> Self {
        // The closure takes ownership of `handler` and swaps it into the options it receives.
        self.reducers
            .push(Box::new(move |base: ParseOptions| ParseOptions {
                mismatched_tag_handler: handler,
                ..base
            }));
        self
    }

    /// Build the [ParseOptions].
    ///
    /// Starts from [ParseOptions::new] and applies every queued change in the order
    /// the builder methods were called.
    pub fn build(self) -> ParseOptions {
        let mut options = ParseOptions::new();
        for apply in self.reducers {
            options = apply(options);
        }
        options
    }
}
Comment on lines +44 to +72
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Order of builder method calls is maintained with self.reducers.


impl Default for ParseOptionsBuilder {
fn default() -> Self {
ParseOptionsBuilder::new()
}
}

#[cfg(test)]
mod tests {
    use crate::html::parse::{
        malformed_html_handlers::{MismatchedTagHandlerContext, MockMismatchedTagHandler},
        ParserState,
    };

    use super::*;

    /// The handler passed to the builder must be the one stored in the built options:
    /// invoking the stored handler has to reach the mock exactly once.
    #[test]
    fn with_mismatched_tag_handler_should_set_handler() {
        // arrange
        let mut mock_handler = MockMismatchedTagHandler::new();
        mock_handler.expect_invoke().times(1).returning(|_| Ok(()));

        let mut parser_state = ParserState::default();
        let handler_context = MismatchedTagHandlerContext {
            open_tag_name: "hi",
            close_tag_name: "bye",
            parser_state: &mut parser_state,
        };

        // act
        let options = ParseOptionsBuilder::new()
            .with_mismatched_tag_handler(Box::new(mock_handler))
            .build();

        // assert
        let outcome = options.mismatched_tag_handler.invoke(handler_context);
        assert!(matches!(outcome, Ok(())));
    }
}
23 changes: 14 additions & 9 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -48,42 +48,47 @@
//! # Ok(())
//! # }
//! ```
//!
//!
//! # Example: use lazy_static if Xpath expressions are static
//!
//!
//! If your Xpath expressions are static, and you have a function that
//! parses and applies the expression every time the function is called,
//! consider using [mod@lazy_static] to prevent the expression from being
//! repeatedly parsed.
//!
//!
//! ```rust
//! #[macro_use]
//! extern crate lazy_static;
//!
//!
//! use std::error::Error;
//! use skyscraper::{html::{self, HtmlDocument}, xpath::{self, Xpath}};
//!
//!
//! lazy_static! {
//! static ref SPAN_XPATH: Xpath = xpath::parse("/div/span").unwrap();
//! }
//!
//!
//! fn my_func(document: &HtmlDocument) -> Result<Option<String>, Box<dyn Error>> {
//! let xpath_results = SPAN_XPATH.apply(document)?;
//! Ok(xpath_results[0].get_text(document))
//! }
//!
//!
//! fn main() -> Result<(), Box<dyn Error>> {
//! let doc1 = html::parse("<div><span>foo</span></div>")?;
//! let text1 = my_func(&doc1)?.expect("text missing");
//! assert_eq!("foo", text1);
//!
//!
//! let doc2 = html::parse("<div><span>bar</span></div>")?;
//! let text2 = my_func(&doc2)?.expect("text missing");
//! assert_eq!("bar", text2);
//!
//!
//! Ok(())
//! }
//! ```
//!
//! For more information on HTML documents and nodes, including how to get text or attributes from nodes,
//! see the [html] module documentation.
//!
//! For more information on XPath expressions, see the [xpath] module documentation.
#![warn(missing_docs)]

166 changes: 68 additions & 98 deletions src/xpath/mod.rs

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions src/xpath/tokenizer/helpers.rs
Original file line number Diff line number Diff line change
@@ -168,10 +168,10 @@ pub fn is_double_colon(pointer: &mut VecPointerRef<char>) -> Option<Token> {
/// If true it will move the text pointer to the next symbol, otherwise it will not change the pointer.
pub fn is_number(pointer: &mut VecPointerRef<char>) -> Option<Token> {
if let Some(c) = pointer.current() {
if c.is_digit(10) {
if c.is_ascii_digit() {
let mut num = c.to_string();
while let Some(c) = pointer.next() {
if c.is_digit(10) {
if c.is_ascii_digit() {
num.push(*c);
} else {
break;
@@ -182,7 +182,7 @@ pub fn is_number(pointer: &mut VecPointerRef<char>) -> Option<Token> {
if let Some('.') = pointer.current() {
num.push('.');
while let Some(c) = pointer.next() {
if c.is_digit(10) {
if c.is_ascii_digit() {
num.push(*c);
} else {
break;