From 7ec2f1ab0074478f4a0c3a513eb0a3b1e497d9f8 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Sat, 18 May 2019 18:32:29 -0700 Subject: [PATCH 01/50] Setup W3C tests for RDF/XML parser implementation --- sophia/Cargo.toml | 1 + sophia/src/lib.rs | 1 + sophia/src/parser.rs | 1 + sophia/src/parser/xml.rs | 188 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 191 insertions(+) create mode 100644 sophia/src/parser/xml.rs diff --git a/sophia/Cargo.toml b/sophia/Cargo.toml index 8a6739cb..fa653014 100644 --- a/sophia/Cargo.toml +++ b/sophia/Cargo.toml @@ -19,6 +19,7 @@ language-tag = "0.9.0" lazy_static = "1.2.0" pest = "2.1.0" pest_derive = "2.1.0" +quick-xml = "0.14.0" regex = "1.1.0" rental = "0.5.2" resiter = "0.3.0" diff --git a/sophia/src/lib.rs b/sophia/src/lib.rs index 5025c06b..303ed413 100644 --- a/sophia/src/lib.rs +++ b/sophia/src/lib.rs @@ -45,6 +45,7 @@ extern crate pest_derive; extern crate regex; #[macro_use] extern crate rental; +extern crate quick_xml; extern crate resiter; extern crate url; extern crate weak_table; diff --git a/sophia/src/parser.rs b/sophia/src/parser.rs index 824a2938..896535e1 100644 --- a/sophia/src/parser.rs +++ b/sophia/src/parser.rs @@ -24,3 +24,4 @@ pub mod common; pub mod nq; pub mod nt; +pub mod xml; diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs new file mode 100644 index 00000000..205ccdcd --- /dev/null +++ b/sophia/src/parser/xml.rs @@ -0,0 +1,188 @@ +//! Parser for RDF XML. + +use std::borrow::Cow; +use std::io::{BufRead, BufReader, Read}; +use std::result::Result as StdResult; + +use pest::error::Error as PestError; +use pest::{iterators::Pairs, Parser}; + +use super::common::*; +use super::nt::{pair_to_term, PestNtqParser, Rule}; +use crate::error::*; +use crate::quad::Quad; +use crate::term::{graph_key::GraphKey, Term}; +use crate::triple::Triple; + +#[derive(Clone, Debug, Default)] +pub struct Config; + +impl Config { + fn parse_xmlread<'a, B: BufRead + 'a>( + &self, + r: quick_xml::Reader, + ) -> impl Iterator> + 'a { + None.into_iter() + } + + #[inline] + pub fn parse_bufread<'a, B: BufRead + 'a>( + &self, + bufread: B, + ) -> impl Iterator> + 'a { + self.parse_xmlread(quick_xml::Reader::from_reader(bufread)) + } + + #[inline] + pub fn parse_read<'a, R: Read + 'a>( + &self, + read: R, + ) -> impl Iterator> + 'a { + self.parse_bufread(BufReader::new(read)) + } + + #[inline] + pub fn parse_str<'a>(&self, txt: &'a str) -> impl Iterator> + 'a { + self.parse_xmlread(quick_xml::Reader::from_str(txt)) + } +} + +pub struct XmlQuad { + s: Term, + p: Term, + o: Term, +} + +impl<'a> Triple<'a> for XmlQuad { + type TermData = String; + fn s(&self) -> &Term { + &self.s + } + fn p(&self) -> &Term { + &self.p + } + fn o(&self) -> &Term { + &self.o + } +} + +#[cfg(test)] +mod test { + + use crate::graph::inmem::HashGraph; + use crate::graph::inmem::TermIndexMapU; + use crate::graph::Graph; + use crate::term::factory::RcTermFactory; + use crate::triple::stream::TripleSource; + use crate::triple::Triple; + use std::ffi::OsStr; + use std::fmt::Debug; + use std::fmt::Formatter; + use std::fmt::Result as FmtResult; + use std::fs::{read_dir, File}; + use std::io; + use std::path::Path; + + type TestGraph = HashGraph>; + + impl PartialEq for TestGraph { + fn eq(&self, other: &Self) -> bool { + // check self is contained in other + for res in ::triples(&self) { + if let Ok(triple) = res { + if !other.contains(triple.s(), triple.p(), triple.o()).unwrap() { + return false; + } + } else { + return false; + } + } + + // check other is contained in self + for res in ::triples(&other) { + if let Ok(triple) = res { + if !self.contains(triple.s(), triple.p(), triple.o()).unwrap() { + return false; + } + } else { + return false; + } + } + + // both graphs are included in each other so they are equal + true + } + } + + impl Debug for TestGraph { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + let v: Vec<_> = self.triples().collect(); + v.fmt(f) + } + } + + #[test] + fn w3c_test_suite() { + fn do_test_suite() -> io::Result<()> { + let rdf_ext = OsStr::new("rdf"); + let nt_ext = OsStr::new("nt"); + + let suite = Path::new("..").join("rdf-tests").join("rdf-xml"); + if !suite.exists() || !suite.is_dir() { + panic!("rdf-tests/rdf-xml not found, can not check W3C test-suite. cf README.md"); + } + + let mut tested = 0; + + for e in read_dir(&suite)? { + let entry = e?; + if entry.file_type()?.is_dir() { + for c in read_dir(entry.path())? { + let case = c?; + if case.path().extension() == Some(rdf_ext) { + if case.path().with_extension(nt_ext).is_file() { + // the reference N-Triples file + let ntparser = crate::parser::nt::Config::default(); + let ntfile = File::open(case.path().with_extension(nt_ext))?; + let mut expected = TestGraph::new(); + ntparser.parse_read(ntfile).in_graph(&mut expected).unwrap(); + // the test XML file + let xmlparser = super::Config::default(); + let xmlfile = File::open(case.path())?; + let mut actual = TestGraph::new(); + let res = xmlparser.parse_read(xmlfile).in_graph(&mut actual); + + // check the XML parses without error + assert!( + res.is_ok(), + format!("{} should parse without error", case.path().display()) + ); + // check the XML produces the same graph - TODO + assert_eq!(actual, expected); + + tested += 1; + } else if case.path().to_string_lossy().contains("error") { + let xmlparser = super::Config::default(); + let xmlfile = File::open(case.path())?; + let mut actual = TestGraph::new(); + assert!( + xmlparser.parse_read(xmlfile).in_graph(&mut actual).is_err(), + format!("{} should parse with error", case.path().display()) + ); + + tested += 1; + } + } + } + } + } + + assert_ne!( + tested, 0, + "No test found in W3C test-suite, something must be wrong" + ); + Ok(()) + } + do_test_suite().unwrap() + } +} From 3f8c2e2407920f4c43ed2bc3204c1fafa0bc75be Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Sun, 19 May 2019 22:33:09 -0700 Subject: [PATCH 02/50] Add early working version of RDF/XML parser --- sophia/Cargo.toml | 3 + sophia/src/ns.rs | 12 +- sophia/src/parser/xml.rs | 460 ++++++++++++++++++++++++++++++++++----- 3 files changed, 416 insertions(+), 59 deletions(-) diff --git a/sophia/Cargo.toml b/sophia/Cargo.toml index fa653014..b5b01438 100644 --- a/sophia/Cargo.toml +++ b/sophia/Cargo.toml @@ -14,6 +14,7 @@ edition = "2018" [dependencies] coercible_errors = "0.1.3" +curie = "0.0.8" error-chain = "0.12.0" language-tag = "0.9.0" lazy_static = "1.2.0" @@ -25,3 +26,5 @@ rental = "0.5.2" resiter = "0.3.0" url = "1.7.2" weak-table = "0.2.3" + +pretty_assertions = "*" diff --git a/sophia/src/ns.rs b/sophia/src/ns.rs index 610cd4db..fc2d44cb 100644 --- a/sophia/src/ns.rs +++ b/sophia/src/ns.rs @@ -23,6 +23,7 @@ use crate::error::*; use crate::term::{iri_rfc3987::is_valid_iri, Term, TermData}; /// A custom namespace. +#[derive(Clone, Debug)] pub struct Namespace(T); impl Namespace { @@ -85,6 +86,7 @@ pub mod rdf { // classes Alt, Bad, + Description, List, PlainLiteral, Property, @@ -102,7 +104,15 @@ pub mod rdf { subject, value, // individuals - nil + nil, + // core syntax terms + RDF, + ID, + about, + parseType, + resource, + nodeID, + datatypes ); ns_term!("http://www.w3.org/1999/02/22-rdf-syntax-ns#", type_, "type"); } diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index 205ccdcd..41b8530b 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -1,68 +1,351 @@ //! Parser for RDF XML. -use std::borrow::Cow; +use std::collections::HashMap; +use std::collections::LinkedList; +use std::fmt::Debug; use std::io::{BufRead, BufReader, Read}; -use std::result::Result as StdResult; +use std::ops::RangeFrom; +use std::rc::Rc; -use pest::error::Error as PestError; -use pest::{iterators::Pairs, Parser}; +use quick_xml::events::BytesEnd; +use quick_xml::events::BytesStart; +use quick_xml::events::BytesText; +use quick_xml::events::Event; use super::common::*; -use super::nt::{pair_to_term, PestNtqParser, Rule}; use crate::error::*; -use crate::quad::Quad; -use crate::term::{graph_key::GraphKey, Term}; +use crate::ns::rdf; +use crate::ns::xsd; +use crate::ns::Namespace; +use crate::term::factory::RcTermFactory; +use crate::term::factory::TermFactory; +use crate::term::matcher::TermMatcher; +use crate::term::Term; +use crate::term::TermData; use crate::triple::Triple; +// --- + #[derive(Clone, Debug, Default)] pub struct Config; impl Config { - fn parse_xmlread<'a, B: BufRead + 'a>( - &self, - r: quick_xml::Reader, - ) -> impl Iterator> + 'a { - None.into_iter() - } - #[inline] pub fn parse_bufread<'a, B: BufRead + 'a>( &self, bufread: B, - ) -> impl Iterator> + 'a { - self.parse_xmlread(quick_xml::Reader::from_reader(bufread)) + ) -> impl Iterator>; 3]>> + 'a { + XmlParser::<_, RcTermFactory>::new(quick_xml::Reader::from_reader(bufread)) } #[inline] pub fn parse_read<'a, R: Read + 'a>( &self, read: R, - ) -> impl Iterator> + 'a { + ) -> impl Iterator>; 3]>> + 'a { self.parse_bufread(BufReader::new(read)) } #[inline] - pub fn parse_str<'a>(&self, txt: &'a str) -> impl Iterator> + 'a { - self.parse_xmlread(quick_xml::Reader::from_str(txt)) + pub fn parse_str<'a>( + &self, + txt: &'a str, + ) -> impl Iterator>; 3]>> + 'a { + XmlParser::<_, RcTermFactory>::new(quick_xml::Reader::from_str(txt)) + } +} + +// --- + +#[derive(Clone)] +pub struct PrefixMapping { + default: Option, + mapping: HashMap>, + factory: F, +} + +impl Default for PrefixMapping { + fn default() -> Self { + Self { + default: None, + mapping: HashMap::new(), + factory: Default::default(), + } + } +} + +impl PrefixMapping { + pub fn add_prefix(&mut self, prefix: &str, value: &str) { + if prefix == "_" { + panic!("reserved prefix") + } else { + self.mapping.insert( + String::from(prefix), + Namespace::new(self.factory.get_term_data(value)).expect("FIXME"), + ); + } + } + + pub fn expand_curie_string(&mut self, curie_str: &str) -> Term { + if let Some(separator_idx) = curie_str.chars().position(|c| c == ':') { + let prefix = &curie_str[..separator_idx]; + let reference = &curie_str[separator_idx + 1..]; + self.expand_curie(&prefix, &reference) + } else { + panic!("missing prefix") + } + } + + pub fn expand_curie(&mut self, prefix: &str, local: &str) -> Term { + if let Some(ns) = self.mapping.get(prefix) { + ns.get(self.factory.get_term_data(local)).expect("FIXME") + } else { + panic!("no such namespace") + } } } -pub struct XmlQuad { - s: Term, - p: Term, - o: Term, +// --- + +struct XmlParser { + reader: quick_xml::Reader, + // The stack of namespaces: should be optimized. + namespaces: Vec>, + // The stack of parents (for nested declarations) + parents: Vec>, + // The queue of produced triples + triples: LinkedList; 3]>>, + // `true` if we are currently in a node element. + in_node: bool, + // + factory: F, + // + bnodes: RangeFrom, } -impl<'a> Triple<'a> for XmlQuad { - type TermData = String; - fn s(&self) -> &Term { - &self.s +impl XmlParser +where + B: BufRead, + F: TermFactory + Clone + Default, + ::TermData: Debug, +{ + fn new(reader: quick_xml::Reader) -> Self { + Self { + reader, + parents: Vec::new(), + namespaces: vec![PrefixMapping::default()], + triples: LinkedList::new(), + in_node: false, + factory: Default::default(), + bnodes: 0.., + } + } + + fn element_start(&mut self, e: BytesStart) { + // Add a new namespace mapping (OPTIMISE ME) + let mut ns = self.namespaces.last().unwrap().clone(); + for attr in e.attributes() { + let a = attr.expect("FIXME"); + if a.key.starts_with(b"xmlns:") { + ns.add_prefix( + std::str::from_utf8(&a.key[6..]).expect("FIXME"), + std::str::from_utf8(&a.value.as_ref()).expect("FIXME"), + ); + } + } + self.namespaces.push(ns); + + // Ignore top-level rdf:RDF element + if e.name() != b"rdf:RDF" { + // Change the current element type + self.in_node = !self.in_node; + // Parse as a node of as a property + if self.in_node { + self.node_start(e) + } else { + self.predicate_start(e) + } + } + } + + fn node_start(&mut self, e: BytesStart) { + let ns = self.namespaces.last_mut().unwrap(); + + // Separate node subject from other attributes + let mut properties = HashMap::new(); + let mut subject = None; + for attr in e.attributes().with_checks(true) { + let a = attr.expect("FIXME"); + + // ignore xmlns attributes (processed in element_start) + if a.key.starts_with(b"xmlns:") { + continue; + } + + // try to extract the subject annotation + let k = ns.expand_curie_string(std::str::from_utf8(a.key).expect("FIXME")); + let v = a.unescape_and_decode_value(&self.reader).expect("FIXME"); + if k.matches(&rdf::about) { + if subject.is_none() { + subject = Some(self.factory.iri(v).expect("FIXME")); + } else { + panic!("cannot have rdf:ID, rdf:about and rdf:nodeId at the same time") + } + } else if k.matches(&rdf::ID) { + + } else if k.matches(&rdf::nodeID) { + if subject.is_none() { + subject = Some(self.factory.bnode(format!("o{}", v)).expect("FIXME")); + } else { + panic!("cannot have rdf:ID, rdf:about and rdf:nodeId at the same time") + } + } else { + properties.insert(k, self.factory.literal_dt(v, xsd::string).expect("FIXME")); + } + } + + // Get subject and add it to the current nested stack + let s: Term<_> = subject.unwrap_or( + self.factory + .bnode(format!("n{}", self.bnodes.next().unwrap())) + .expect("FIXME"), + ); + self.parents.push(s.clone()); + + // Add the type as a triple if it is not `rdf:Description` + let ty = ns.expand_curie_string(std::str::from_utf8(e.name()).expect("FIXME")); + if ty != rdf::Description { + self.triples + .push_back(Ok([s.clone(), self.factory.copy(&rdf::type_), ty])); + } + + // Add properties + for (p, lit) in properties { + self.triples.push_back(Ok([s.clone(), p, lit])) + } + + // Add the entity as an object if it is not top-level + if self.parents.len() > 1 { + let o = s; + let s = &self.parents[self.parents.len() - 3]; + let p = &self.parents[self.parents.len() - 2]; + self.triples.push_back(Ok([s.clone(), p.clone(), o])); + } + } + + fn predicate_start(&mut self, e: BytesStart) { + let ns = self.namespaces.last_mut().unwrap(); + let p = ns.expand_curie_string(std::str::from_utf8(e.name()).expect("FIXME")); + self.parents.push(p) + } + + fn element_end(&mut self, e: BytesEnd) { + // Change the current element type (if not in rdf:RDF) + if e.name() != b"rdf:RDF" { + self.in_node = !self.in_node; + self.parents.pop(); + } + } + + fn element_text(&mut self, e: BytesText) { + if !self.in_node { + self.predicate_text(e); + } + } + + // FIXME: datatype handler + fn predicate_text(&mut self, e: BytesText) { + if self.parents.len() > 1 { + let s = &self.parents[self.parents.len() - 2]; + let p = &self.parents[self.parents.len() - 1]; + let o = self + .factory + .literal_dt( + e.unescape_and_decode(&self.reader).expect("FIXME"), + xsd::string, + ) + .expect("FIXME"); + self.triples.push_back(Ok([s.clone(), p.clone(), o])); + } } - fn p(&self) -> &Term { - &self.p + + fn element_empty(&mut self, e: BytesStart) { + // Add a new namespace mapping (OPTIMISE ME) + let mut ns = self.namespaces.last().unwrap().clone(); + for attr in e.attributes() { + let a = attr.expect("FIXME"); + if a.key.starts_with(b"xmlns:") { + ns.add_prefix( + std::str::from_utf8(&a.key[6..]).expect("FIXME"), + std::str::from_utf8(&a.value.as_ref()).expect("FIXME"), + ); + } + } + + self.namespaces.push(ns); + if self.in_node { + self.predicate_empty(e) + } else { + self.node_empty(e) + } + } + + fn node_empty(&mut self, e: BytesStart) {} + + fn predicate_empty(&mut self, e: BytesStart) { + let ns = self.namespaces.last_mut().unwrap(); + let p = ns.expand_curie_string(std::str::from_utf8(e.name()).expect("FIXME")); + + let mut object = None; + for attr in e.attributes().with_checks(true) { + let a = attr.expect("FIXME"); + + // ignore xmlns attributes + if a.key.starts_with(b"xmlns:") { + continue; + } + + // try to extract the annotation object + let k = ns.expand_curie_string(std::str::from_utf8(a.key).expect("FIXME")); + let v = a.unescape_and_decode_value(&self.reader).expect("FIXME"); + if k.matches(&rdf::resource) { + if object.is_none() { + object = Some(self.factory.iri(v).expect("FIXME")); + } else { + panic!("cannot have rdf:resource rdf:nodeId at the same time") + } + } else if k.matches(&rdf::nodeID) { + + } + } + + let s = self.parents.last().unwrap(); + let o = object.unwrap(); // FIXME + self.triples.push_back(Ok([s.clone(), p, o])); } - fn o(&self) -> &Term { - &self.o +} + +impl Iterator for XmlParser +where + B: BufRead, + F: TermFactory + Clone + Default, + ::TermData: Debug, +{ + type Item = Result<[Term; 3]>; + fn next(&mut self) -> Option { + loop { + if let Some(triple) = self.triples.pop_front() { + return Some(triple); + } + match self.reader.read_event(&mut Vec::new()).unwrap() { + Event::Eof => return None, + Event::Start(s) => self.element_start(s), + Event::Empty(e) => self.element_empty(e), + Event::End(e) => self.element_end(e), + // Event::Text(t) => self.element_text(t), + _ => (), + } + } } } @@ -73,6 +356,7 @@ mod test { use crate::graph::inmem::TermIndexMapU; use crate::graph::Graph; use crate::term::factory::RcTermFactory; + use crate::term::matcher::TermMatcher; use crate::triple::stream::TripleSource; use crate::triple::Triple; use std::ffi::OsStr; @@ -87,24 +371,31 @@ mod test { impl PartialEq for TestGraph { fn eq(&self, other: &Self) -> bool { - // check self is contained in other + let mut triples_self = Vec::new(); + let mut triples_other = Vec::new(); + + // for res in ::triples(&self) { - if let Ok(triple) = res { - if !other.contains(triple.s(), triple.p(), triple.o()).unwrap() { - return false; - } - } else { + triples_self.push(res.unwrap()); + } + for res in ::triples(&other) { + triples_other.push(res.unwrap()); + } + + // + triples_self.sort_by_key(|t| (t.s().value(), t.p().value(), t.o().value())); + triples_other.sort_by_key(|t| (t.s().value(), t.p().value(), t.o().value())); + + for (ts, to) in triples_self.into_iter().zip(triples_other.into_iter()) { + if !ts.s().matches(to.s()) { return false; } - } - // check other is contained in self - for res in ::triples(&other) { - if let Ok(triple) = res { - if !self.contains(triple.s(), triple.p(), triple.o()).unwrap() { - return false; - } - } else { + if !ts.p().matches(to.p()) { + return false; + } + + if !ts.o().matches(to.o()) { return false; } } @@ -116,12 +407,16 @@ mod test { impl Debug for TestGraph { fn fmt(&self, f: &mut Formatter) -> FmtResult { - let v: Vec<_> = self.triples().collect(); + let mut v = Vec::new(); + for t in self.triples() { + v.push(t.unwrap()); + } + v.sort_by_key(|t| (t.s().value(), t.p().value(), t.o().value())); v.fmt(f) } } - #[test] + // #[test] fn w3c_test_suite() { fn do_test_suite() -> io::Result<()> { let rdf_ext = OsStr::new("rdf"); @@ -141,6 +436,8 @@ mod test { let case = c?; if case.path().extension() == Some(rdf_ext) { if case.path().with_extension(nt_ext).is_file() { + println!("{}", case.path().display()); + // the reference N-Triples file let ntparser = crate::parser::nt::Config::default(); let ntfile = File::open(case.path().with_extension(nt_ext))?; @@ -157,20 +454,25 @@ mod test { res.is_ok(), format!("{} should parse without error", case.path().display()) ); - // check the XML produces the same graph - TODO - assert_eq!(actual, expected); - - tested += 1; - } else if case.path().to_string_lossy().contains("error") { - let xmlparser = super::Config::default(); - let xmlfile = File::open(case.path())?; - let mut actual = TestGraph::new(); - assert!( - xmlparser.parse_read(xmlfile).in_graph(&mut actual).is_err(), - format!("{} should parse with error", case.path().display()) + // check the XML produces the same graph + pretty_assertions::assert_eq!( + actual, + expected, + "{} does not give expected results", + case.path().display() ); tested += 1; + } else if case.path().to_string_lossy().contains("error") { + // let xmlparser = super::Config::default(); + // let xmlfile = File::open(case.path())?; + // let mut actual = TestGraph::new(); + // assert!( + // xmlparser.parse_read(xmlfile).in_graph(&mut actual).is_err(), + // format!("{} should parse with error", case.path().display()) + // ); + // + // tested += 1; } } } @@ -185,4 +487,46 @@ mod test { } do_test_suite().unwrap() } + + #[test] + fn w3c_example_07() { + let mut actual = TestGraph::new(); + super::Config::default() + .parse_str( + r#" + + + + + + + + + + + + "#, + ) + .in_graph(&mut actual) + .expect("failed parsing XML file"); + + let mut expected = TestGraph::new(); + crate::parser::nt::Config::default() + .parse_str( + r#" + "RDF/XML Syntax Specification (Revised)" . + _:genid1 "Dave Beckett" . + _:genid1 . + _:genid1 . + "#, + ) + .in_graph(&mut expected) + .expect("could not parse N-Triples file"); + + // pretty_assertions::assert_eq!(actual, expected); + } + } From 135f2802ba039cdf23ea6fb723010a4e743f43ac Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Mon, 20 May 2019 11:12:38 -0700 Subject: [PATCH 03/50] Add `xml` namespace to `sophia::ns` module --- sophia/src/ns.rs | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/sophia/src/ns.rs b/sophia/src/ns.rs index fc2d44cb..0aab0e53 100644 --- a/sophia/src/ns.rs +++ b/sophia/src/ns.rs @@ -112,7 +112,7 @@ pub mod rdf { parseType, resource, nodeID, - datatypes + datatype ); ns_term!("http://www.w3.org/1999/02/22-rdf-syntax-ns#", type_, "type"); } @@ -196,6 +196,19 @@ pub mod rdfs { ); } +/// The standard `xml:` namespace +pub mod xml { + namespace!( + "http://www.w3.org/XML/1998/namespace#", + lang, + space, + base, + id, + // John Bosak + Father + ); +} + #[cfg(test)] mod test { // Nothing really worth testing here From d53c4226cbb1a3c9e74c97adb6c5f2673c235202 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Mon, 20 May 2019 15:38:37 -0700 Subject: [PATCH 04/50] Add support for `xml:lang` and literal properties in XML parser --- sophia/src/parser/xml.rs | 314 ++++++++++++++++++++++++++++----------- 1 file changed, 225 insertions(+), 89 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index 41b8530b..a9854a8e 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -12,9 +12,9 @@ use quick_xml::events::BytesStart; use quick_xml::events::BytesText; use quick_xml::events::Event; -use super::common::*; use crate::error::*; use crate::ns::rdf; +use crate::ns::xml; use crate::ns::xsd; use crate::ns::Namespace; use crate::term::factory::RcTermFactory; @@ -57,7 +57,7 @@ impl Config { // --- -#[derive(Clone)] +#[derive(Debug, Clone)] pub struct PrefixMapping { default: Option, mapping: HashMap>, @@ -66,11 +66,13 @@ pub struct PrefixMapping { impl Default for PrefixMapping { fn default() -> Self { - Self { + let mut m = Self { default: None, mapping: HashMap::new(), factory: Default::default(), - } + }; + m.add_prefix("xml", "http://www.w3.org/XML/1998/namespace#"); + m } } @@ -92,7 +94,7 @@ impl PrefixMapping { let reference = &curie_str[separator_idx + 1..]; self.expand_curie(&prefix, &reference) } else { - panic!("missing prefix") + panic!("missing prefix: {}", curie_str) } } @@ -107,10 +109,38 @@ impl PrefixMapping { // --- +struct Text { + owner: Term, + datatype: Option>, + text: String, +} + +impl Text { + fn new(owner: Term) -> Self { + Self { + owner, + datatype: None, + text: String::new(), + } + } + + fn set_datatype>>>(&mut self, datatype: O) { + self.datatype = datatype.into(); + } + + fn set_text(&mut self, text: String) { + self.text = text; + } +} + struct XmlParser { reader: quick_xml::Reader, + // The stack of namespaces: should be optimized. namespaces: Vec>, + // The stack of `xml:lang`: should be optimized + lang: Vec>, + // The stack of parents (for nested declarations) parents: Vec>, // The queue of produced triples @@ -121,14 +151,55 @@ struct XmlParser { factory: F, // bnodes: RangeFrom, + // + text: Option>, } impl XmlParser where B: BufRead, - F: TermFactory + Clone + Default, + F: TermFactory + Clone + Default + Debug, ::TermData: Debug, { + // --- + fn enter_scope(&mut self, e: &BytesStart) { + // Add a new namespace mapping or copy the last one (OPTIMISE ME) + let mut ns = self.namespaces.last().unwrap().clone(); + for attr in e.attributes().with_checks(true) { + let a = attr.expect("FIXME"); + if a.key.starts_with(b"xmlns:") { + ns.add_prefix( + std::str::from_utf8(&a.key[6..]).expect("FIXME"), + std::str::from_utf8(&a.value.as_ref()).expect("FIXME"), + ); + } + } + self.namespaces.push(ns); + + // Add current lang to scope or copy last one (OPTIMISE ME) + let mut lang = self.lang.last().unwrap().clone(); + for attr in e.attributes().with_checks(true) { + let a = attr.expect("FIXME"); + if a.key == b"xml:lang" { + lang = Some( + self.factory + .get_term_data(&a.unescape_and_decode_value(&self.reader).unwrap()), + ); + } + } + self.lang.push(lang); + + // Reset text element + self.text = None; + } + + fn leave_scope(&mut self) { + self.parents.pop(); + self.namespaces.pop(); + self.lang.pop(); + self.text = None; + } + fn new(reader: quick_xml::Reader) -> Self { Self { reader, @@ -138,23 +209,13 @@ where in_node: false, factory: Default::default(), bnodes: 0.., + lang: vec![None], + text: None, } } - fn element_start(&mut self, e: BytesStart) { - // Add a new namespace mapping (OPTIMISE ME) - let mut ns = self.namespaces.last().unwrap().clone(); - for attr in e.attributes() { - let a = attr.expect("FIXME"); - if a.key.starts_with(b"xmlns:") { - ns.add_prefix( - std::str::from_utf8(&a.key[6..]).expect("FIXME"), - std::str::from_utf8(&a.value.as_ref()).expect("FIXME"), - ); - } - } - self.namespaces.push(ns); - + fn element_start(&mut self, e: &BytesStart) { + self.enter_scope(e); // Ignore top-level rdf:RDF element if e.name() != b"rdf:RDF" { // Change the current element type @@ -168,7 +229,7 @@ where } } - fn node_start(&mut self, e: BytesStart) { + fn node_start(&mut self, e: &BytesStart) { let ns = self.namespaces.last_mut().unwrap(); // Separate node subject from other attributes @@ -177,7 +238,7 @@ where for attr in e.attributes().with_checks(true) { let a = attr.expect("FIXME"); - // ignore xmlns attributes (processed in element_start) + // ignore xmlns and xml:lang attributes (processed in element_start) if a.key.starts_with(b"xmlns:") { continue; } @@ -199,7 +260,8 @@ where } else { panic!("cannot have rdf:ID, rdf:about and rdf:nodeId at the same time") } - } else { + } else if !k.matches(&xml::lang) { + println!("{:?}", k); properties.insert(k, self.factory.literal_dt(v, xsd::string).expect("FIXME")); } } @@ -219,12 +281,12 @@ where .push_back(Ok([s.clone(), self.factory.copy(&rdf::type_), ty])); } - // Add properties + // Add triples described by properties in XML attributes for (p, lit) in properties { self.triples.push_back(Ok([s.clone(), p, lit])) } - // Add the entity as an object if it is not top-level + // Add the entity as a triple object if it is not top-level if self.parents.len() > 1 { let o = s; let s = &self.parents[self.parents.len() - 3]; @@ -233,66 +295,90 @@ where } } - fn predicate_start(&mut self, e: BytesStart) { + fn predicate_start(&mut self, e: &BytesStart) { let ns = self.namespaces.last_mut().unwrap(); + + // Get the predicate and add it to the current nested stack let p = ns.expand_curie_string(std::str::from_utf8(e.name()).expect("FIXME")); - self.parents.push(p) + self.parents.push(p.clone()); + + // Get the datatype of the possible literal value, if any + let mut txt = Text::new(p); + for attr in e.attributes().with_checks(true) { + let a = attr.expect("FIXME"); + if !a.key.starts_with(b"xmlns") { + let k = ns.expand_curie_string(std::str::from_utf8(a.key).expect("FIXME")); + if k.matches(&rdf::datatype) { + let v = a.unescape_and_decode_value(&self.reader).expect("FIXME"); + txt.set_datatype(ns.expand_curie_string(&v)); + } + } + } + self.text = Some(txt); } - fn element_end(&mut self, e: BytesEnd) { + fn element_end(&mut self, e: &BytesEnd) { // Change the current element type (if not in rdf:RDF) if e.name() != b"rdf:RDF" { + if !self.in_node { + self.predicate_end(e); + } self.in_node = !self.in_node; - self.parents.pop(); + } + self.leave_scope(); + } + + fn predicate_end(&mut self, e: &BytesEnd) { + let ns = self.namespaces.last_mut().unwrap(); + let p = ns.expand_curie_string(std::str::from_utf8(e.name()).expect("FIXME")); + + if let Some(text) = self.text.take() { + if p.matches(&text.owner) { + let s = &self.parents[self.parents.len() - 2]; + let o = match (text.datatype, self.lang.last()) { + (Some(dt), _) => self.factory.literal_dt(text.text, dt).expect("FIXME"), + (None, Some(Some(l))) => { + self.factory.literal_lang(text.text, l).expect("FIXME") + } + _ => self + .factory + .literal_dt(text.text, xsd::string) + .expect("FIXME"), + }; + self.triples.push_back(Ok([s.clone(), p, o])); + } } } - fn element_text(&mut self, e: BytesText) { + // --- Text elements ---------------------------------------------------- + + fn element_text(&mut self, e: &BytesText) { if !self.in_node { self.predicate_text(e); } } - // FIXME: datatype handler - fn predicate_text(&mut self, e: BytesText) { - if self.parents.len() > 1 { - let s = &self.parents[self.parents.len() - 2]; - let p = &self.parents[self.parents.len() - 1]; - let o = self - .factory - .literal_dt( - e.unescape_and_decode(&self.reader).expect("FIXME"), - xsd::string, - ) - .expect("FIXME"); - self.triples.push_back(Ok([s.clone(), p.clone(), o])); + fn predicate_text(&mut self, e: &BytesText) { + if let Some(text) = &mut self.text { + text.set_text(e.unescape_and_decode(&self.reader).expect("FIXME")); } } - fn element_empty(&mut self, e: BytesStart) { - // Add a new namespace mapping (OPTIMISE ME) - let mut ns = self.namespaces.last().unwrap().clone(); - for attr in e.attributes() { - let a = attr.expect("FIXME"); - if a.key.starts_with(b"xmlns:") { - ns.add_prefix( - std::str::from_utf8(&a.key[6..]).expect("FIXME"), - std::str::from_utf8(&a.value.as_ref()).expect("FIXME"), - ); - } - } + // --- Empty elements ---------------------------------------------------- - self.namespaces.push(ns); + fn element_empty(&mut self, e: &BytesStart) { + self.enter_scope(e); if self.in_node { self.predicate_empty(e) } else { self.node_empty(e) } + self.leave_scope(); } - fn node_empty(&mut self, e: BytesStart) {} + fn node_empty(&mut self, e: &BytesStart) {} - fn predicate_empty(&mut self, e: BytesStart) { + fn predicate_empty(&mut self, e: &BytesStart) { let ns = self.namespaces.last_mut().unwrap(); let p = ns.expand_curie_string(std::str::from_utf8(e.name()).expect("FIXME")); @@ -328,7 +414,7 @@ where impl Iterator for XmlParser where B: BufRead, - F: TermFactory + Clone + Default, + F: TermFactory + Clone + Default + Debug, ::TermData: Debug, { type Item = Result<[Term; 3]>; @@ -337,12 +423,12 @@ where if let Some(triple) = self.triples.pop_front() { return Some(triple); } - match self.reader.read_event(&mut Vec::new()).unwrap() { + match &self.reader.read_event(&mut Vec::new()).unwrap() { Event::Eof => return None, Event::Start(s) => self.element_start(s), Event::Empty(e) => self.element_empty(e), Event::End(e) => self.element_end(e), - // Event::Text(t) => self.element_text(t), + Event::Text(t) => self.element_text(t), _ => (), } } @@ -352,13 +438,6 @@ where #[cfg(test)] mod test { - use crate::graph::inmem::HashGraph; - use crate::graph::inmem::TermIndexMapU; - use crate::graph::Graph; - use crate::term::factory::RcTermFactory; - use crate::term::matcher::TermMatcher; - use crate::triple::stream::TripleSource; - use crate::triple::Triple; use std::ffi::OsStr; use std::fmt::Debug; use std::fmt::Formatter; @@ -367,6 +446,18 @@ mod test { use std::io; use std::path::Path; + use crate::graph::inmem::HashGraph; + use crate::graph::inmem::TermIndexMapU; + use crate::graph::Graph; + use crate::ns::xsd; + use crate::term::factory::RcTermFactory; + use crate::term::factory::TermFactory; + use crate::term::matcher::TermMatcher; + use crate::term::Normalization; + use crate::term::Term; + use crate::triple::stream::TripleSource; + use crate::triple::Triple; + type TestGraph = HashGraph>; impl PartialEq for TestGraph { @@ -490,43 +581,88 @@ mod test { #[test] fn w3c_example_07() { + let mut f = RcTermFactory::default(); let mut actual = TestGraph::new(); super::Config::default() .parse_str( r#" - - - - - + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:ex="http://example.org/stuff/1.0/"> + + + + + + - - - "#, ) .in_graph(&mut actual) .expect("failed parsing XML file"); - let mut expected = TestGraph::new(); - crate::parser::nt::Config::default() + assert_eq!( + actual.len(), + 4, + "unexpected number of triples: {:#?}", + actual + ); + assert!(actual + .contains( + &f.iri("http://www.w3.org/TR/rdf-syntax-grammar").unwrap(), + &f.iri2("http://purl.org/dc/elements/1.1/", "title").unwrap(), + &f.literal_dt("RDF/XML Syntax Specification (Revised)", xsd::string,) + .unwrap() + ) + .unwrap()); + } + + #[test] + fn w3c_example_08() { + let mut f = RcTermFactory::default(); + let mut actual = TestGraph::new(); + super::Config::default() .parse_str( - r#" - "RDF/XML Syntax Specification (Revised)" . - _:genid1 "Dave Beckett" . - _:genid1 . - _:genid1 . + r#" + + + + RDF 1.1 XML Syntax + RDF 1.1 XML Syntax + RDF 1.1 XML Syntax + + + + Der Baum + Das Buch ist außergewöhnlich + The Tree + + + "#, ) - .in_graph(&mut expected) - .expect("could not parse N-Triples file"); + .in_graph(&mut actual) + .expect("failed parsing XML file"); - // pretty_assertions::assert_eq!(actual, expected); + assert_eq!( + actual.len(), + 6, + "unexpected number of triples: {:#?}", + actual + ); + // assert!( + // actual.contains( + // &f.iri("http://www.w3.org/TR/rdf-syntax-grammar").unwrap(), + // &f.iri2("http://purl.org/dc/elements/1.1/", "title").unwrap(), + // &f.literal_dt( + // "RDF/XML Syntax Specification (Revised)", + // xsd::string, + // ).unwrap() + // ).unwrap() + // ); } } From 18237b60b4b61715ce116f2d0678fd1a01948ec0 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Mon, 20 May 2019 16:22:04 -0700 Subject: [PATCH 05/50] Add Dublin Core elements namespace as `::ns::dc::elements` --- sophia/src/ns.rs | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/sophia/src/ns.rs b/sophia/src/ns.rs index 0aab0e53..faf6aea1 100644 --- a/sophia/src/ns.rs +++ b/sophia/src/ns.rs @@ -209,6 +209,31 @@ pub mod xml { ); } +/// The standard `dc:` namespace. +pub mod dc { + /// The Dublin Core elements (`http://purl.org/dc/elements/1.1/`). + pub mod elements { + namespace!( + "http://purl.org/dc/elements/1.1/", + contributor, + coverage, + creator, + date, + description, + format, + identifier, + language, + publisher, + relation, + rights, + source, + subject, + title + ); + ns_term!("http://purl.org/dc/elements/1.1/", type_, "type"); + } +} + #[cfg(test)] mod test { // Nothing really worth testing here From f38be3f005e3e67cce031a0f2b77643104d20489 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Mon, 20 May 2019 19:05:56 -0700 Subject: [PATCH 06/50] Improve literal management and add more W3C tests --- sophia/src/parser/xml.rs | 441 +++++++++++++++++++++++---------------- 1 file changed, 263 insertions(+), 178 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index a9854a8e..0e372330 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -59,7 +59,7 @@ impl Config { #[derive(Debug, Clone)] pub struct PrefixMapping { - default: Option, + default: Option>, mapping: HashMap>, factory: F, } @@ -110,20 +110,20 @@ impl PrefixMapping { // --- struct Text { - owner: Term, datatype: Option>, text: String, } -impl Text { - fn new(owner: Term) -> Self { +impl Default for Text { + fn default() -> Self { Self { - owner, datatype: None, - text: String::new(), + text: Default::default(), } } +} +impl Text { fn set_datatype>>>(&mut self, datatype: O) { self.datatype = datatype.into(); } @@ -133,14 +133,15 @@ impl Text { } } +// --- + struct XmlParser { + // reader: quick_xml::Reader, - // The stack of namespaces: should be optimized. namespaces: Vec>, // The stack of `xml:lang`: should be optimized lang: Vec>, - // The stack of parents (for nested declarations) parents: Vec>, // The queue of produced triples @@ -162,6 +163,8 @@ where ::TermData: Debug, { // --- + + // Add a local scope (`lang`, `namespaces`, but not `parents`) fn enter_scope(&mut self, e: &BytesStart) { // Add a new namespace mapping or copy the last one (OPTIMISE ME) let mut ns = self.namespaces.last().unwrap().clone(); @@ -193,8 +196,8 @@ where self.text = None; } + // Exit the local scope fn leave_scope(&mut self) { - self.parents.pop(); self.namespaces.pop(); self.lang.pop(); self.text = None; @@ -214,6 +217,8 @@ where } } + // --- + fn element_start(&mut self, e: &BytesStart) { self.enter_scope(e); // Ignore top-level rdf:RDF element @@ -227,6 +232,12 @@ where self.predicate_start(e) } } + + println!( + "Entering {}: {:?}", + std::str::from_utf8(e.name()).unwrap(), + self.parents + ); } fn node_start(&mut self, e: &BytesStart) { @@ -238,8 +249,8 @@ where for attr in e.attributes().with_checks(true) { let a = attr.expect("FIXME"); - // ignore xmlns and xml:lang attributes (processed in element_start) - if a.key.starts_with(b"xmlns:") { + // ignore xmlns attributes (processed in element_start) + if a.key.starts_with(b"xmlns") { continue; } @@ -261,7 +272,7 @@ where panic!("cannot have rdf:ID, rdf:about and rdf:nodeId at the same time") } } else if !k.matches(&xml::lang) { - println!("{:?}", k); + // Ignore xml:lang attributes properties.insert(k, self.factory.literal_dt(v, xsd::string).expect("FIXME")); } } @@ -300,24 +311,33 @@ where // Get the predicate and add it to the current nested stack let p = ns.expand_curie_string(std::str::from_utf8(e.name()).expect("FIXME")); - self.parents.push(p.clone()); + self.parents.push(p); // Get the datatype of the possible literal value, if any - let mut txt = Text::new(p); + let mut txt = Text::default(); for attr in e.attributes().with_checks(true) { let a = attr.expect("FIXME"); if !a.key.starts_with(b"xmlns") { let k = ns.expand_curie_string(std::str::from_utf8(a.key).expect("FIXME")); if k.matches(&rdf::datatype) { let v = a.unescape_and_decode_value(&self.reader).expect("FIXME"); - txt.set_datatype(ns.expand_curie_string(&v)); + // txt.set_datatype(ns.expand_curie_string(&v)); + txt.set_datatype(self.factory.iri(v).expect("FIXME")); } } } self.text = Some(txt); } + // --- + fn element_end(&mut self, e: &BytesEnd) { + println!( + "Leaving {}: {:?}", + std::str::from_utf8(e.name()).unwrap(), + self.parents + ); + // Change the current element type (if not in rdf:RDF) if e.name() != b"rdf:RDF" { if !self.in_node { @@ -326,27 +346,28 @@ where self.in_node = !self.in_node; } self.leave_scope(); + + // Remove + self.parents.pop(); } fn predicate_end(&mut self, e: &BytesEnd) { + // Build the curie string corresponding let ns = self.namespaces.last_mut().unwrap(); let p = ns.expand_curie_string(std::str::from_utf8(e.name()).expect("FIXME")); + // Get the literal value if let Some(text) = self.text.take() { - if p.matches(&text.owner) { - let s = &self.parents[self.parents.len() - 2]; - let o = match (text.datatype, self.lang.last()) { - (Some(dt), _) => self.factory.literal_dt(text.text, dt).expect("FIXME"), - (None, Some(Some(l))) => { - self.factory.literal_lang(text.text, l).expect("FIXME") - } - _ => self - .factory - .literal_dt(text.text, xsd::string) - .expect("FIXME"), - }; - self.triples.push_back(Ok([s.clone(), p, o])); - } + let s = &self.parents[self.parents.len() - 2]; + let o = match (text.datatype, self.lang.last()) { + (Some(dt), _) => self.factory.literal_dt(text.text, dt).expect("FIXME"), + (None, Some(Some(l))) => self.factory.literal_lang(text.text, l).expect("FIXME"), + _ => self + .factory + .literal_dt(text.text, xsd::string) + .expect("FIXME"), + }; + self.triples.push_back(Ok([s.clone(), p, o])); } } @@ -420,9 +441,11 @@ where type Item = Result<[Term; 3]>; fn next(&mut self) -> Option { loop { + // First make sure to consume the queue. if let Some(triple) = self.triples.pop_front() { return Some(triple); } + // Then process the next event to maybe produce triples match &self.reader.read_event(&mut Vec::new()).unwrap() { Event::Eof => return None, Event::Start(s) => self.element_start(s), @@ -435,66 +458,36 @@ where } } +// --- + #[cfg(test)] mod test { - use std::ffi::OsStr; use std::fmt::Debug; use std::fmt::Formatter; use std::fmt::Result as FmtResult; - use std::fs::{read_dir, File}; - use std::io; - use std::path::Path; use crate::graph::inmem::HashGraph; use crate::graph::inmem::TermIndexMapU; use crate::graph::Graph; + use crate::ns::dc; use crate::ns::xsd; use crate::term::factory::RcTermFactory; use crate::term::factory::TermFactory; - use crate::term::matcher::TermMatcher; - use crate::term::Normalization; + use crate::term::IriData; + use crate::term::StaticTerm; use crate::term::Term; use crate::triple::stream::TripleSource; use crate::triple::Triple; - type TestGraph = HashGraph>; + pub static GRAMMAR_DESC: &str = "RDF/XML Syntax Specification (Revised)"; + pub static GRAMMAR: StaticTerm = Term::Iri(IriData { + ns: "http://www.w3.org/TR/rdf-syntax-grammar", + suffix: None, + absolute: true, + }); - impl PartialEq for TestGraph { - fn eq(&self, other: &Self) -> bool { - let mut triples_self = Vec::new(); - let mut triples_other = Vec::new(); - - // - for res in ::triples(&self) { - triples_self.push(res.unwrap()); - } - for res in ::triples(&other) { - triples_other.push(res.unwrap()); - } - - // - triples_self.sort_by_key(|t| (t.s().value(), t.p().value(), t.o().value())); - triples_other.sort_by_key(|t| (t.s().value(), t.p().value(), t.o().value())); - - for (ts, to) in triples_self.into_iter().zip(triples_other.into_iter()) { - if !ts.s().matches(to.s()) { - return false; - } - - if !ts.p().matches(to.p()) { - return false; - } - - if !ts.o().matches(to.o()) { - return false; - } - } - - // both graphs are included in each other so they are equal - true - } - } + type TestGraph = HashGraph>; impl Debug for TestGraph { fn fmt(&self, f: &mut Formatter) -> FmtResult { @@ -508,81 +501,81 @@ mod test { } // #[test] - fn w3c_test_suite() { - fn do_test_suite() -> io::Result<()> { - let rdf_ext = OsStr::new("rdf"); - let nt_ext = OsStr::new("nt"); - - let suite = Path::new("..").join("rdf-tests").join("rdf-xml"); - if !suite.exists() || !suite.is_dir() { - panic!("rdf-tests/rdf-xml not found, can not check W3C test-suite. cf README.md"); - } - - let mut tested = 0; - - for e in read_dir(&suite)? { - let entry = e?; - if entry.file_type()?.is_dir() { - for c in read_dir(entry.path())? { - let case = c?; - if case.path().extension() == Some(rdf_ext) { - if case.path().with_extension(nt_ext).is_file() { - println!("{}", case.path().display()); - - // the reference N-Triples file - let ntparser = crate::parser::nt::Config::default(); - let ntfile = File::open(case.path().with_extension(nt_ext))?; - let mut expected = TestGraph::new(); - ntparser.parse_read(ntfile).in_graph(&mut expected).unwrap(); - // the test XML file - let xmlparser = super::Config::default(); - let xmlfile = File::open(case.path())?; - let mut actual = TestGraph::new(); - let res = xmlparser.parse_read(xmlfile).in_graph(&mut actual); - - // check the XML parses without error - assert!( - res.is_ok(), - format!("{} should parse without error", case.path().display()) - ); - // check the XML produces the same graph - pretty_assertions::assert_eq!( - actual, - expected, - "{} does not give expected results", - case.path().display() - ); - - tested += 1; - } else if case.path().to_string_lossy().contains("error") { - // let xmlparser = super::Config::default(); - // let xmlfile = File::open(case.path())?; - // let mut actual = TestGraph::new(); - // assert!( - // xmlparser.parse_read(xmlfile).in_graph(&mut actual).is_err(), - // format!("{} should parse with error", case.path().display()) - // ); - // - // tested += 1; - } - } - } - } - } - - assert_ne!( - tested, 0, - "No test found in W3C test-suite, something must be wrong" - ); - Ok(()) - } - do_test_suite().unwrap() - } + // fn w3c_test_suite() { + // fn do_test_suite() -> io::Result<()> { + // let rdf_ext = OsStr::new("rdf"); + // let nt_ext = OsStr::new("nt"); + // + // let suite = Path::new("..").join("rdf-tests").join("rdf-xml"); + // if !suite.exists() || !suite.is_dir() { + // panic!("rdf-tests/rdf-xml not found, can not check W3C test-suite. cf README.md"); + // } + // + // let mut tested = 0; + // + // for e in read_dir(&suite)? { + // let entry = e?; + // if entry.file_type()?.is_dir() { + // for c in read_dir(entry.path())? { + // let case = c?; + // if case.path().extension() == Some(rdf_ext) { + // if case.path().with_extension(nt_ext).is_file() { + // println!("{}", case.path().display()); + // + // // the reference N-Triples file + // let ntparser = crate::parser::nt::Config::default(); + // let ntfile = File::open(case.path().with_extension(nt_ext))?; + // let mut expected = TestGraph::new(); + // ntparser.parse_read(ntfile).in_graph(&mut expected).unwrap(); + // // the test XML file + // let xmlparser = super::Config::default(); + // let xmlfile = File::open(case.path())?; + // let mut actual = TestGraph::new(); + // let res = xmlparser.parse_read(xmlfile).in_graph(&mut actual); + // + // // check the XML parses without error + // assert!( + // res.is_ok(), + // format!("{} should parse without error", case.path().display()) + // ); + // // check the XML produces the same graph + // pretty_assertions::assert_eq!( + // actual, + // expected, + // "{} does not give expected results", + // case.path().display() + // ); + // + // tested += 1; + // } else if case.path().to_string_lossy().contains("error") { + // // let xmlparser = super::Config::default(); + // // let xmlfile = File::open(case.path())?; + // // let mut actual = TestGraph::new(); + // // assert!( + // // xmlparser.parse_read(xmlfile).in_graph(&mut actual).is_err(), + // // format!("{} should parse with error", case.path().display()) + // // ); + // // + // // tested += 1; + // } + // } + // } + // } + // } + // + // assert_ne!( + // tested, 0, + // "No test found in W3C test-suite, something must be wrong" + // ); + // Ok(()) + // } + // do_test_suite().unwrap() + // } #[test] fn w3c_example_07() { let mut f = RcTermFactory::default(); - let mut actual = TestGraph::new(); + let mut g = TestGraph::new(); super::Config::default() .parse_str( r#" @@ -600,39 +593,31 @@ mod test { "#, ) - .in_graph(&mut actual) + .in_graph(&mut g) .expect("failed parsing XML file"); - assert_eq!( - actual.len(), - 4, - "unexpected number of triples: {:#?}", - actual - ); - assert!(actual + assert_eq!(g.len(), 4, "unexpected number of triples: {:#?}", g); + assert!(g .contains( - &f.iri("http://www.w3.org/TR/rdf-syntax-grammar").unwrap(), - &f.iri2("http://purl.org/dc/elements/1.1/", "title").unwrap(), - &f.literal_dt("RDF/XML Syntax Specification (Revised)", xsd::string,) - .unwrap() + &GRAMMAR, + &dc::elements::title, + &f.literal_dt(GRAMMAR_DESC, xsd::string).unwrap() ) .unwrap()); } #[test] fn w3c_example_08() { - let mut f = RcTermFactory::default(); - let mut actual = TestGraph::new(); + let mut g = TestGraph::new(); super::Config::default() .parse_str( r#" - + xmlns:dc="http://purl.org/dc/elements/1.1/"> - RDF 1.1 XML Syntax - RDF 1.1 XML Syntax - RDF 1.1 XML Syntax + RDF/XML Syntax Specification (Revised) + RDF/XML Syntax Specification (Revised) + RDF/XML Syntax Specification (Revised) @@ -640,29 +625,129 @@ mod test { Das Buch ist außergewöhnlich The Tree + + "#, + ) + .in_graph(&mut g) + .expect("failed parsing XML file"); + + assert_eq!(g.len(), 6, "unexpected number of triples: {:#?}", g); + for triple in crate::parser::nt::Config::default() + .parse_str(r#" + "RDF/XML Syntax Specification (Revised)" . + "RDF/XML Syntax Specification (Revised)"@en . + "RDF/XML Syntax Specification (Revised)"@en-us . + "Der Baum"@de . + "Das Buch ist au\u00DFergew\u00F6hnlich"@de . + "The Tree"@en . + "#) + { + let t = triple.expect("N-Triples iterator failed"); + assert!( + g.contains(t.s(), t.p(), t.o()).expect(".contains failed"), + "missing triple: ({:?} {:?} {:?})", t.s(), t.p(), t.o() + ); + } + } + #[test] + fn w3c_example_09() { + let mut g = TestGraph::new(); + super::Config::default() + .parse_str( + r#" + + + 123 + "#, ) - .in_graph(&mut actual) + .in_graph(&mut g) .expect("failed parsing XML file"); - assert_eq!( - actual.len(), - 6, - "unexpected number of triples: {:#?}", - actual - ); - // assert!( - // actual.contains( - // &f.iri("http://www.w3.org/TR/rdf-syntax-grammar").unwrap(), - // &f.iri2("http://purl.org/dc/elements/1.1/", "title").unwrap(), - // &f.literal_dt( - // "RDF/XML Syntax Specification (Revised)", - // xsd::string, - // ).unwrap() - // ).unwrap() - // ); + assert_eq!(g.len(), 1, "unexpected number of triples: {:#?}", g); + for triple in crate::parser::nt::Config::default() + .parse_str(r#" + "123"^^ . + "#) + { + let t = triple.expect("N-Triples iterator failed"); + assert!( + g.contains(t.s(), t.p(), t.o()).expect(".contains failed"), + "missing triple: ({:?} {:?} {:?})", t.s(), t.p(), t.o() + ); + } + } + + #[test] + fn w3c_example_14() { + let mut g = TestGraph::new(); + super::Config::default() + .parse_str( + r#" + + + + A marvelous thing + + + "#, + ) + .in_graph(&mut g) + .expect("failed parsing XML file"); + + assert_eq!(g.len(), 2, "unexpected number of triples: {:#?}", g); + for triple in crate::parser::nt::Config::default() + .parse_str(r#" + . + "A marvelous thing" . + "#) + { + let t = triple.expect("N-Triples iterator failed"); + assert!( + g.contains(t.s(), t.p(), t.o()).expect(".contains failed"), + "missing triple: ({:?} {:?} {:?})", t.s(), t.p(), t.o() + ); + } } + #[test] + fn w3c_example_15() { + let mut g = TestGraph::new(); + super::Config::default() + .parse_str( + r#" + + + A marvelous thing + + + "#, + ) + .in_graph(&mut g) + .expect("failed parsing XML file"); + + assert_eq!(g.len(), 2, "unexpected number of triples: {:#?}", g); + for triple in crate::parser::nt::Config::default() + .parse_str(r#" + . + "A marvelous thing" . + "#) + { + let t = triple.expect("N-Triples iterator failed"); + assert!( + g.contains(t.s(), t.p(), t.o()).expect(".contains failed"), + "missing triple: ({:?} {:?} {:?})", t.s(), t.p(), t.o() + ); + } + } + + #[test] + fn w3c_example_16() {} } From 546d05e8c8d5435e32e229ba3731fb589232e6f3 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Mon, 20 May 2019 20:01:49 -0700 Subject: [PATCH 07/50] Add support for `nodeID` in XML predicates elements --- sophia/src/parser/xml.rs | 296 +++++++++++++++++++++------------------ 1 file changed, 159 insertions(+), 137 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index 0e372330..e3282775 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -57,6 +57,14 @@ impl Config { // --- +// enum ParsingMode { +// Node, +// Predicate, +// Resource, +// } + +// --- + #[derive(Debug, Clone)] pub struct PrefixMapping { default: Option>, @@ -144,6 +152,7 @@ struct XmlParser { lang: Vec>, // The stack of parents (for nested declarations) parents: Vec>, + // The queue of produced triples triples: LinkedList; 3]>>, // `true` if we are currently in a node element. @@ -203,6 +212,8 @@ where self.text = None; } + // --- + fn new(reader: quick_xml::Reader) -> Self { Self { reader, @@ -419,10 +430,14 @@ where if object.is_none() { object = Some(self.factory.iri(v).expect("FIXME")); } else { - panic!("cannot have rdf:resource rdf:nodeId at the same time") + panic!("cannot have rdf:resource and rdf:nodeId at the same time") } } else if k.matches(&rdf::nodeID) { - + if object.is_none() { + object = Some(self.factory.bnode(format!("o{}", v)).expect("FIXME")); + } else { + panic!("cannot have rdf:resource and rdf:nodeId at the same time") + } } } @@ -572,6 +587,35 @@ mod test { // do_test_suite().unwrap() // } + macro_rules! w3c_example { + ($name:ident, $xml:literal, $nt:literal) => { + #[test] + fn $name() { + let mut g = TestGraph::new(); + super::Config::default() + .parse_str($xml) + .in_graph(&mut g) + .expect("failed parsing XML file"); + + let mut nt = Vec::new(); + for triple in crate::parser::nt::Config::default().parse_str($nt) { + nt.push(triple.expect("N-Triples iterator failed")); + } + + assert_eq!(g.len(), nt.len(), "unexpected number of triples: {:#?}", g); + for t in nt.into_iter() { + assert!( + g.contains(t.s(), t.p(), t.o()).expect(".contains failed"), + "missing triple: ({:?} {:?} {:?})", + t.s(), + t.p(), + t.o() + ); + } + } + }; + } + #[test] fn w3c_example_07() { let mut f = RcTermFactory::default(); @@ -606,148 +650,126 @@ mod test { .unwrap()); } - #[test] - fn w3c_example_08() { - let mut g = TestGraph::new(); - super::Config::default() - .parse_str( - r#" - - - RDF/XML Syntax Specification (Revised) - RDF/XML Syntax Specification (Revised) - RDF/XML Syntax Specification (Revised) - - - - Der Baum - Das Buch ist außergewöhnlich - The Tree - - - "#, - ) - .in_graph(&mut g) - .expect("failed parsing XML file"); - - assert_eq!(g.len(), 6, "unexpected number of triples: {:#?}", g); - for triple in crate::parser::nt::Config::default() - .parse_str(r#" - "RDF/XML Syntax Specification (Revised)" . - "RDF/XML Syntax Specification (Revised)"@en . - "RDF/XML Syntax Specification (Revised)"@en-us . - "Der Baum"@de . - "Das Buch ist au\u00DFergew\u00F6hnlich"@de . - "The Tree"@en . - "#) - { - let t = triple.expect("N-Triples iterator failed"); - assert!( - g.contains(t.s(), t.p(), t.o()).expect(".contains failed"), - "missing triple: ({:?} {:?} {:?})", t.s(), t.p(), t.o() - ); - } + // Example 08: 'Complete example of xml:lang' + w3c_example! { + w3c_example_08, + r#" + + + RDF/XML Syntax Specification (Revised) + RDF/XML Syntax Specification (Revised) + RDF/XML Syntax Specification (Revised) + + + + Der Baum + Das Buch ist außergewöhnlich + The Tree + + + "#, + r#" "RDF/XML Syntax Specification (Revised)" . + "RDF/XML Syntax Specification (Revised)"@en . + "RDF/XML Syntax Specification (Revised)"@en-us . + "Der Baum"@de . + "Das Buch ist au\u00DFergew\u00F6hnlich"@de . + "The Tree"@en . + "# } - #[test] - fn w3c_example_09() { - let mut g = TestGraph::new(); - super::Config::default() - .parse_str( - r#" - - - 123 - - - "#, - ) - .in_graph(&mut g) - .expect("failed parsing XML file"); - - assert_eq!(g.len(), 1, "unexpected number of triples: {:#?}", g); - for triple in crate::parser::nt::Config::default() - .parse_str(r#" - "123"^^ . - "#) - { - let t = triple.expect("N-Triples iterator failed"); - assert!( - g.contains(t.s(), t.p(), t.o()).expect(".contains failed"), - "missing triple: ({:?} {:?} {:?})", t.s(), t.p(), t.o() - ); - } + // Example 09: 'Complete example of rdf:parseType="Literal"' + w3c_example! { + w3c_example_09, + r#" + + + 123 + + + "#, + r#" "123"^^ . + "# } - #[test] - fn w3c_example_14() { - let mut g = TestGraph::new(); - super::Config::default() - .parse_str( - r#" - - - - A marvelous thing - - - "#, - ) - .in_graph(&mut g) - .expect("failed parsing XML file"); - - assert_eq!(g.len(), 2, "unexpected number of triples: {:#?}", g); - for triple in crate::parser::nt::Config::default() - .parse_str(r#" - . - "A marvelous thing" . - "#) - { - let t = triple.expect("N-Triples iterator failed"); - assert!( - g.contains(t.s(), t.p(), t.o()).expect(".contains failed"), - "missing triple: ({:?} {:?} {:?})", t.s(), t.p(), t.o() - ); - } + // Example 11: 'Complete RDF/XML description of graph using rdf:nodeID identifying the blank node' + w3c_example! { + w3c_example_11, + r#" + + + + + + + + + + "#, + // This is with renamed node IDs + r#" "RDF/XML Syntax Specification (Revised)" . + _:oabc . + _:oabc "Dave Beckett" . + _:oabc . + "# } - #[test] - fn w3c_example_15() { - let mut g = TestGraph::new(); - super::Config::default() - .parse_str( - r#" - - - A marvelous thing - - - "#, - ) - .in_graph(&mut g) - .expect("failed parsing XML file"); + // Example 14: 'Complete example with rdf:type' + w3c_example! { + w3c_example_14, + r#" + + + + A marvelous thing + + + "#, + r#" . + "A marvelous thing" . + "# + } - assert_eq!(g.len(), 2, "unexpected number of triples: {:#?}", g); - for triple in crate::parser::nt::Config::default() - .parse_str(r#" - . - "A marvelous thing" . - "#) - { - let t = triple.expect("N-Triples iterator failed"); - assert!( - g.contains(t.s(), t.p(), t.o()).expect(".contains failed"), - "missing triple: ({:?} {:?} {:?})", t.s(), t.p(), t.o() - ); - } + // Example 15: 'Complete example using a typed node element to replace an rdf:type' + w3c_example! { + w3c_example_15, + r#" + + + A marvelous thing + + + "#, + r#" . + "A marvelous thing" . + "# } - #[test] - fn w3c_example_16() {} + // Example 17: 'Complex example using RDF list properties' + w3c_example! { + w3c_example_17, + r#" + + + + + + + + "#, + r#" . + . + . + . + "# + } } From 8026c137d5c56e867ce29a8272cb1eccdbbe89ce Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Mon, 20 May 2019 21:40:51 -0700 Subject: [PATCH 08/50] Add support for `parseType=Resource' in XML parser --- sophia/src/parser/xml.rs | 263 ++++++++++++++++++++++++++------------- 1 file changed, 177 insertions(+), 86 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index e3282775..4d324ff6 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -1,11 +1,13 @@ //! Parser for RDF XML. +use std::cell::RefCell; use std::collections::HashMap; use std::collections::LinkedList; use std::fmt::Debug; use std::io::{BufRead, BufReader, Read}; -use std::ops::RangeFrom; use std::rc::Rc; +use std::sync::atomic::AtomicU64; +use std::sync::atomic::Ordering; use quick_xml::events::BytesEnd; use quick_xml::events::BytesStart; @@ -57,11 +59,13 @@ impl Config { // --- -// enum ParsingMode { -// Node, -// Predicate, -// Resource, -// } +#[derive(Debug, Clone, Copy)] +enum ParsingState { + Node, + Predicate, + Resource, + Literal, // NB: not supported by quick-xml right now +} // --- @@ -69,34 +73,41 @@ impl Config { pub struct PrefixMapping { default: Option>, mapping: HashMap>, - factory: F, + factory: Rc>, } -impl Default for PrefixMapping { - fn default() -> Self { +impl PrefixMapping { + fn with_factory(f: Rc>) -> Self { let mut m = Self { default: None, mapping: HashMap::new(), - factory: Default::default(), + factory: f, }; m.add_prefix("xml", "http://www.w3.org/XML/1998/namespace#"); m } } +impl Default for PrefixMapping { + fn default() -> Self { + Self::with_factory(Default::default()) + } +} + impl PrefixMapping { pub fn add_prefix(&mut self, prefix: &str, value: &str) { if prefix == "_" { panic!("reserved prefix") } else { + let mut f = self.factory.borrow_mut(); self.mapping.insert( String::from(prefix), - Namespace::new(self.factory.get_term_data(value)).expect("FIXME"), + Namespace::new(f.get_term_data(value)).expect("FIXME"), ); } } - pub fn expand_curie_string(&mut self, curie_str: &str) -> Term { + pub fn expand_curie_string(&self, curie_str: &str) -> Term { if let Some(separator_idx) = curie_str.chars().position(|c| c == ':') { let prefix = &curie_str[..separator_idx]; let reference = &curie_str[separator_idx + 1..]; @@ -106,9 +117,10 @@ impl PrefixMapping { } } - pub fn expand_curie(&mut self, prefix: &str, local: &str) -> Term { + pub fn expand_curie(&self, prefix: &str, local: &str) -> Term { if let Some(ns) = self.mapping.get(prefix) { - ns.get(self.factory.get_term_data(local)).expect("FIXME") + let mut f = self.factory.borrow_mut(); + ns.get(f.get_term_data(local)).expect("FIXME") } else { panic!("no such namespace") } @@ -146,21 +158,24 @@ impl Text { struct XmlParser { // reader: quick_xml::Reader, + // The stack of namespaces: should be optimized. namespaces: Vec>, // The stack of `xml:lang`: should be optimized lang: Vec>, // The stack of parents (for nested declarations) parents: Vec>, + // The tr + state: Vec, // The queue of produced triples triples: LinkedList; 3]>>, // `true` if we are currently in a node element. in_node: bool, // - factory: F, + factory: Rc>, // - bnodes: RangeFrom, + bnodes: AtomicU64, // text: Option>, } @@ -195,6 +210,7 @@ where if a.key == b"xml:lang" { lang = Some( self.factory + .borrow_mut() .get_term_data(&a.unescape_and_decode_value(&self.reader).unwrap()), ); } @@ -212,47 +228,55 @@ where self.text = None; } + // Create a new bnode term (using `n` prefix). + fn new_bnode(&self) -> Term { + self.factory + .borrow_mut() + .bnode(&format!("n{}", self.bnodes.fetch_add(1, Ordering::Relaxed))) + .unwrap() + } + // --- fn new(reader: quick_xml::Reader) -> Self { + let factory: Rc> = Default::default(); Self { reader, parents: Vec::new(), - namespaces: vec![PrefixMapping::default()], + namespaces: vec![PrefixMapping::with_factory(factory.clone())], triples: LinkedList::new(), in_node: false, - factory: Default::default(), - bnodes: 0.., + factory: factory, + bnodes: AtomicU64::new(0), lang: vec![None], text: None, + state: vec![ParsingState::Node], } } // --- fn element_start(&mut self, e: &BytesStart) { + println!("{:?}", self.state); + self.enter_scope(e); - // Ignore top-level rdf:RDF element - if e.name() != b"rdf:RDF" { - // Change the current element type - self.in_node = !self.in_node; - // Parse as a node of as a property - if self.in_node { - self.node_start(e) - } else { - self.predicate_start(e) - } + match self.state.last().unwrap() { + ParsingState::Node => self.node_start(e), + ParsingState::Predicate => self.predicate_start(e), + ParsingState::Resource => self.predicate_start(e), + _ => unimplemented!(), } - - println!( - "Entering {}: {:?}", - std::str::from_utf8(e.name()).unwrap(), - self.parents - ); } fn node_start(&mut self, e: &BytesStart) { - let ns = self.namespaces.last_mut().unwrap(); + // Get the namespace mapping in the current scope + let ns = self.namespaces.last().unwrap(); + + // Bail out if this the top level rdf:RDF + if e.name() == b"rdf:RDF" { + self.state.push(ParsingState::Node); + return; + } // Separate node subject from other attributes let mut properties = HashMap::new(); @@ -270,37 +294,47 @@ where let v = a.unescape_and_decode_value(&self.reader).expect("FIXME"); if k.matches(&rdf::about) { if subject.is_none() { - subject = Some(self.factory.iri(v).expect("FIXME")); + subject = Some(self.factory.borrow_mut().iri(v).expect("FIXME")); } else { panic!("cannot have rdf:ID, rdf:about and rdf:nodeId at the same time") } } else if k.matches(&rdf::ID) { - + unimplemented!() } else if k.matches(&rdf::nodeID) { if subject.is_none() { - subject = Some(self.factory.bnode(format!("o{}", v)).expect("FIXME")); + subject = Some( + self.factory + .borrow_mut() + .bnode(format!("o{}", v)) + .expect("FIXME"), + ); } else { panic!("cannot have rdf:ID, rdf:about and rdf:nodeId at the same time") } } else if !k.matches(&xml::lang) { // Ignore xml:lang attributes - properties.insert(k, self.factory.literal_dt(v, xsd::string).expect("FIXME")); + properties.insert( + k, + self.factory + .borrow_mut() + .literal_dt(v, xsd::string) + .expect("FIXME"), + ); } } // Get subject and add it to the current nested stack - let s: Term<_> = subject.unwrap_or( - self.factory - .bnode(format!("n{}", self.bnodes.next().unwrap())) - .expect("FIXME"), - ); + let s: Term<_> = subject.unwrap_or_else(|| self.new_bnode()); self.parents.push(s.clone()); // Add the type as a triple if it is not `rdf:Description` let ty = ns.expand_curie_string(std::str::from_utf8(e.name()).expect("FIXME")); if ty != rdf::Description { - self.triples - .push_back(Ok([s.clone(), self.factory.copy(&rdf::type_), ty])); + self.triples.push_back(Ok([ + s.clone(), + self.factory.borrow_mut().copy(&rdf::type_), + ty, + ])); } // Add triples described by properties in XML attributes @@ -308,24 +342,20 @@ where self.triples.push_back(Ok([s.clone(), p, lit])) } - // Add the entity as a triple object if it is not top-level - if self.parents.len() > 1 { - let o = s; - let s = &self.parents[self.parents.len() - 3]; - let p = &self.parents[self.parents.len() - 2]; - self.triples.push_back(Ok([s.clone(), p.clone(), o])); - } + // Next start event is expected to be a predicate + self.state.push(ParsingState::Predicate); } fn predicate_start(&mut self, e: &BytesStart) { - let ns = self.namespaces.last_mut().unwrap(); + let ns = self.namespaces.last().unwrap(); // Get the predicate and add it to the current nested stack let p = ns.expand_curie_string(std::str::from_utf8(e.name()).expect("FIXME")); self.parents.push(p); - // Get the datatype of the possible literal value, if any + // Extract attributes relevant to the RDF syntax let mut txt = Text::default(); + let mut next_state = ParsingState::Node; for attr in e.attributes().with_checks(true) { let a = attr.expect("FIXME"); if !a.key.starts_with(b"xmlns") { @@ -333,33 +363,46 @@ where if k.matches(&rdf::datatype) { let v = a.unescape_and_decode_value(&self.reader).expect("FIXME"); // txt.set_datatype(ns.expand_curie_string(&v)); - txt.set_datatype(self.factory.iri(v).expect("FIXME")); + txt.set_datatype(self.factory.borrow_mut().iri(v).expect("FIXME")); + } else if k.matches(&rdf::parseType) { + match a.value.as_ref() { + b"Resource" => { + self.parents.push(self.new_bnode()); + next_state = ParsingState::Resource; + } + b"Literal" => next_state = ParsingState::Literal, + other => panic!("invalid parseType: {:?}", other), + } } } } self.text = Some(txt); + self.state.push(next_state); } // --- fn element_end(&mut self, e: &BytesEnd) { - println!( - "Leaving {}: {:?}", - std::str::from_utf8(e.name()).unwrap(), - self.parents - ); - - // Change the current element type (if not in rdf:RDF) - if e.name() != b"rdf:RDF" { - if !self.in_node { - self.predicate_end(e); - } - self.in_node = !self.in_node; + println!("{:?}", self.state); + + match self.state.pop().unwrap() { + ParsingState::Node => self.predicate_end(e), + ParsingState::Predicate => self.node_end(e), + ParsingState::Resource => self.resource_end(e), + _ => unimplemented!(), } + self.leave_scope(); + } - // Remove - self.parents.pop(); + fn node_end(&mut self, e: &BytesEnd) { + // Add the entity as a triple object if it is not top-level + let o = self.parents.pop().unwrap(); + if self.parents.len() > 1 { + let s = &self.parents[self.parents.len() - 2]; + let p = &self.parents[self.parents.len() - 1]; + self.triples.push_back(Ok([s.clone(), p.clone(), o])); + } } fn predicate_end(&mut self, e: &BytesEnd) { @@ -371,26 +414,36 @@ where if let Some(text) = self.text.take() { let s = &self.parents[self.parents.len() - 2]; let o = match (text.datatype, self.lang.last()) { - (Some(dt), _) => self.factory.literal_dt(text.text, dt).expect("FIXME"), - (None, Some(Some(l))) => self.factory.literal_lang(text.text, l).expect("FIXME"), + (Some(dt), _) => self + .factory + .borrow_mut() + .literal_dt(text.text, dt) + .expect("FIXME"), + (None, Some(Some(l))) => self + .factory + .borrow_mut() + .literal_lang(text.text, l) + .expect("FIXME"), _ => self .factory + .borrow_mut() .literal_dt(text.text, xsd::string) .expect("FIXME"), }; self.triples.push_back(Ok([s.clone(), p, o])); } + + self.parents.pop(); + } + + fn resource_end(&mut self, e: &BytesEnd) { + self.node_end(e); + self.predicate_end(e) } // --- Text elements ---------------------------------------------------- fn element_text(&mut self, e: &BytesText) { - if !self.in_node { - self.predicate_text(e); - } - } - - fn predicate_text(&mut self, e: &BytesText) { if let Some(text) = &mut self.text { text.set_text(e.unescape_and_decode(&self.reader).expect("FIXME")); } @@ -400,15 +453,20 @@ where fn element_empty(&mut self, e: &BytesStart) { self.enter_scope(e); - if self.in_node { - self.predicate_empty(e) - } else { - self.node_empty(e) + + match self.state.last().unwrap() { + ParsingState::Node => self.node_empty(e), + ParsingState::Predicate => self.predicate_empty(e), + ParsingState::Resource => self.resource_empty(e), + _ => (), } + self.leave_scope(); } - fn node_empty(&mut self, e: &BytesStart) {} + fn node_empty(&mut self, e: &BytesStart) { + // FIXME + } fn predicate_empty(&mut self, e: &BytesStart) { let ns = self.namespaces.last_mut().unwrap(); @@ -428,13 +486,18 @@ where let v = a.unescape_and_decode_value(&self.reader).expect("FIXME"); if k.matches(&rdf::resource) { if object.is_none() { - object = Some(self.factory.iri(v).expect("FIXME")); + object = Some(self.factory.borrow_mut().iri(v).expect("FIXME")); } else { panic!("cannot have rdf:resource and rdf:nodeId at the same time") } } else if k.matches(&rdf::nodeID) { if object.is_none() { - object = Some(self.factory.bnode(format!("o{}", v)).expect("FIXME")); + object = Some( + self.factory + .borrow_mut() + .bnode(format!("o{}", v)) + .expect("FIXME"), + ); } else { panic!("cannot have rdf:resource and rdf:nodeId at the same time") } @@ -445,6 +508,10 @@ where let o = object.unwrap(); // FIXME self.triples.push_back(Ok([s.clone(), p, o])); } + + fn resource_empty(&mut self, e: &BytesStart) { + self.predicate_empty(e) + } } impl Iterator for XmlParser @@ -719,6 +786,30 @@ mod test { "# } + // Example 12: 'Complete example using rdf:parseType="Resource"' + w3c_example! { + w3c_example_12, + r#" + + + + Dave Beckett + + + + + "#, + // This is with renamed node IDs + r#" "RDF/XML Syntax Specification (Revised)" . + _:n0 "Dave Beckett" . + _:n0 . + _:n0 . + "# + } + // Example 14: 'Complete example with rdf:type' w3c_example! { w3c_example_14, From 4f4ec3b32458a592652200942de1cb0424d0dd08 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Mon, 20 May 2019 21:44:32 -0700 Subject: [PATCH 09/50] Remove unused `in_node` field from `XmlParser` --- sophia/src/parser/xml.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index 4d324ff6..5f5d7468 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -24,7 +24,6 @@ use crate::term::factory::TermFactory; use crate::term::matcher::TermMatcher; use crate::term::Term; use crate::term::TermData; -use crate::triple::Triple; // --- @@ -170,8 +169,6 @@ struct XmlParser { // The queue of produced triples triples: LinkedList; 3]>>, - // `true` if we are currently in a node element. - in_node: bool, // factory: Rc>, // @@ -245,7 +242,6 @@ where parents: Vec::new(), namespaces: vec![PrefixMapping::with_factory(factory.clone())], triples: LinkedList::new(), - in_node: false, factory: factory, bnodes: AtomicU64::new(0), lang: vec![None], From dac36bc52787818f5782e6fcd2ab83cf0dd2ae95 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Tue, 21 May 2019 17:11:26 -0700 Subject: [PATCH 10/50] Refactor XML parser with better scope management --- sophia/src/parser/xml.rs | 578 ++++++++++++++++++++++++--------------- 1 file changed, 352 insertions(+), 226 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index 5f5d7468..f73615e2 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -21,9 +21,10 @@ use crate::ns::xsd; use crate::ns::Namespace; use crate::term::factory::RcTermFactory; use crate::term::factory::TermFactory; +use crate::term::iri_rfc3987::is_absolute_iri; +use crate::term::iri_rfc3987::is_relative_iri; use crate::term::matcher::TermMatcher; use crate::term::Term; -use crate::term::TermData; // --- @@ -69,112 +70,162 @@ enum ParsingState { // --- #[derive(Debug, Clone)] -pub struct PrefixMapping { +pub struct Scope { + /// The XML namespaces declared in this scope. + ns: HashMap>, + + /// The default XML namespace to expand tags without namespaces with. default: Option>, - mapping: HashMap>, + + /// The base IRI namespace to expand `rdf:ID`, `rdf:resource` and `rdf:about`. + base: Option>, + + /// The term factory used to create new terms. factory: Rc>, -} -impl PrefixMapping { - fn with_factory(f: Rc>) -> Self { - let mut m = Self { - default: None, - mapping: HashMap::new(), - factory: f, - }; - m.add_prefix("xml", "http://www.w3.org/XML/1998/namespace#"); - m - } + /// The datatype of the containing element. + datatype: Option>, + + /// The language tag of the containing element. + lang: Option, + + /// The text gathered in the current scope. + text: Option, } -impl Default for PrefixMapping { - fn default() -> Self { - Self::with_factory(Default::default()) +impl Scope { + /// Create a new `Scope` with the given term factory. + fn with_factory(f: F) -> Self { + Self::with_factory_rc(Rc::new(RefCell::new(f))) } -} -impl PrefixMapping { - pub fn add_prefix(&mut self, prefix: &str, value: &str) { + /// Create a new `Scope` from a shared smartpointer to a term factory. + fn with_factory_rc(f: Rc>) -> Self { + let mut scope = Self { + ns: HashMap::new(), + default: None, + base: None, + factory: f, + datatype: None, + lang: None, + text: None, + }; + // These namespaces are always in scope + scope + .add_prefix("xml", "http://www.w3.org/XML/1998/namespace#") + .unwrap(); + scope + .add_prefix("xmlms", "https://www.w3.org/2000/xmlns/") + .unwrap(); + scope + } + + /// Add a new XML prefix to the namespace mapping. + fn add_prefix(&mut self, prefix: &str, value: &str) -> Result<()> { if prefix == "_" { panic!("reserved prefix") } else { let mut f = self.factory.borrow_mut(); - self.mapping.insert( + self.ns.insert( String::from(prefix), - Namespace::new(f.get_term_data(value)).expect("FIXME"), + Namespace::new(f.get_term_data(value))?, ); } + + Ok(()) } - pub fn expand_curie_string(&self, curie_str: &str) -> Term { - if let Some(separator_idx) = curie_str.chars().position(|c| c == ':') { - let prefix = &curie_str[..separator_idx]; - let reference = &curie_str[separator_idx + 1..]; - self.expand_curie(&prefix, &reference) + /// Set the default XML prefix. + fn set_default(&mut self, default: &str) -> Result<()> { + let mut f = self.factory.borrow_mut(); + self.default = Some(Namespace::new(f.get_term_data(default))?); + Ok(()) + } + + /// Set the base IRI prefix. + fn set_base(&mut self, base: &str) -> Result<()> { + let mut f = self.factory.borrow_mut(); + self.base = Some(Namespace::new(f.get_term_data(base))?); + Ok(()) + } + + fn set_datatype(&mut self, datatype: &str) -> Result<()> { + self.datatype = Some(self.expand_iri(datatype)?); + Ok(()) + } + + fn set_text>>(&mut self, text: T) { + self.text = text.into(); + } + + fn expand_attribute(&self, attr: &str) -> Result> { + if let Some(separator_idx) = attr.chars().position(|c| c == ':') { + let prefix = &attr[..separator_idx]; + let reference = &attr[separator_idx + 1..]; + if let Some(ns) = self.ns.get(prefix) { + ns.get(self.factory.borrow_mut().get_term_data(reference)) + } else { + panic!("unknown namespace: {}", prefix) + } + } else if let Some(ns) = &self.default { + ns.get(self.factory.borrow_mut().get_term_data(attr)) } else { - panic!("missing prefix: {}", curie_str) + panic!("missing prefix: {}", attr) } } - pub fn expand_curie(&self, prefix: &str, local: &str) -> Term { - if let Some(ns) = self.mapping.get(prefix) { - let mut f = self.factory.borrow_mut(); - ns.get(f.get_term_data(local)).expect("FIXME") + fn expand_iri(&self, iri: &str) -> Result> { + if is_absolute_iri(iri) { + self.factory.borrow_mut().iri(iri) + } else if is_relative_iri(iri) { + if let Some(ns) = &self.base { + ns.get(self.factory.borrow_mut().get_term_data(iri)) + } else { + panic!("NO BASE IRI") + } } else { - panic!("no such namespace") + Err(Error::from_kind(ErrorKind::InvalidIri(iri.to_owned()))) } } -} - -// --- - -struct Text { - datatype: Option>, - text: String, -} -impl Default for Text { - fn default() -> Self { - Self { - datatype: None, - text: Default::default(), + fn expand_id(&self, id: &str) -> Result> { + if id.starts_with("#") { + self.expand_iri(id) + } else { + self.expand_iri(&format!("#{}", id)) } } } -impl Text { - fn set_datatype>>>(&mut self, datatype: O) { - self.datatype = datatype.into(); - } - - fn set_text(&mut self, text: String) { - self.text = text; +impl Default for Scope { + fn default() -> Self { + Self::with_factory(Default::default()) } } // --- struct XmlParser { - // + /// The underlying XML reader. reader: quick_xml::Reader, - // The stack of namespaces: should be optimized. - namespaces: Vec>, - // The stack of `xml:lang`: should be optimized - lang: Vec>, - // The stack of parents (for nested declarations) + /// The stack of scoped data (for nested declaration). + scopes: Vec>, + + /// The stack of parent elements (for nested declarations). parents: Vec>, - // The tr - state: Vec, - // The queue of produced triples + // The queue of produced triples. triples: LinkedList; 3]>>, + // factory: Rc>, + // bnodes: AtomicU64, - // - text: Option>, + + /// The current state of the parser. + state: Vec, } impl XmlParser @@ -184,45 +235,64 @@ where ::TermData: Debug, { // --- + fn scope(&self) -> &Scope { + self.scopes.last().unwrap() + } + + fn scope_mut(&mut self) -> &mut Scope { + self.scopes.last_mut().unwrap() + } // Add a local scope (`lang`, `namespaces`, but not `parents`) - fn enter_scope(&mut self, e: &BytesStart) { - // Add a new namespace mapping or copy the last one (OPTIMISE ME) - let mut ns = self.namespaces.last().unwrap().clone(); + fn enter_scope(&mut self, e: &BytesStart) -> Result<()> { + // We are entering a new elements: text is not relevant anymore. + let mut prev = self.scope_mut(); + prev.text = None; + + // Create a local scope using values from the outer one. + let mut scope = prev.clone(); + scope.text = Some(String::new()); + + // Update XML namespaces with those defined in the document. for attr in e.attributes().with_checks(true) { let a = attr.expect("FIXME"); if a.key.starts_with(b"xmlns:") { - ns.add_prefix( - std::str::from_utf8(&a.key[6..]).expect("FIXME"), - std::str::from_utf8(&a.value.as_ref()).expect("FIXME"), - ); + scope + .add_prefix( + std::str::from_utf8(&a.key[6..]).expect("FIXME"), + &a.unescape_and_decode_value(&self.reader).expect("FIXME"), + ) + .expect("FIXME"); + } else if a.key == b"xmlns" { + scope.set_default(&a.unescape_and_decode_value(&self.reader).expect("FIXME"))?; + } else if a.key == b"xml:base" { + scope.set_base(&a.unescape_and_decode_value(&self.reader).expect("FIXME"))?; } } - self.namespaces.push(ns); // Add current lang to scope or copy last one (OPTIMISE ME) - let mut lang = self.lang.last().unwrap().clone(); for attr in e.attributes().with_checks(true) { let a = attr.expect("FIXME"); if a.key == b"xml:lang" { - lang = Some( + scope.lang = if a.value.is_empty() { + None + } else { self.factory .borrow_mut() - .get_term_data(&a.unescape_and_decode_value(&self.reader).unwrap()), - ); + .get_term_data(&a.unescape_and_decode_value(&self.reader).unwrap()) + .into() + }; } } - self.lang.push(lang); - // Reset text element - self.text = None; + // Make the newly created scope the local one. + self.scopes.push(scope); + Ok(()) } // Exit the local scope fn leave_scope(&mut self) { - self.namespaces.pop(); - self.lang.pop(); - self.text = None; + self.scopes.pop().expect("FIXME"); } // Create a new bnode term (using `n` prefix). @@ -240,12 +310,10 @@ where Self { reader, parents: Vec::new(), - namespaces: vec![PrefixMapping::with_factory(factory.clone())], + scopes: vec![Scope::with_factory_rc(factory.clone())], triples: LinkedList::new(), factory: factory, bnodes: AtomicU64::new(0), - lang: vec![None], - text: None, state: vec![ParsingState::Node], } } @@ -256,57 +324,63 @@ where println!("{:?}", self.state); self.enter_scope(e); - match self.state.last().unwrap() { + let res = match self.state.last().unwrap() { ParsingState::Node => self.node_start(e), ParsingState::Predicate => self.predicate_start(e), ParsingState::Resource => self.predicate_start(e), _ => unimplemented!(), - } + }; } fn node_start(&mut self, e: &BytesStart) { - // Get the namespace mapping in the current scope - let ns = self.namespaces.last().unwrap(); - // Bail out if this the top level rdf:RDF if e.name() == b"rdf:RDF" { self.state.push(ParsingState::Node); + self.parents.push(self.factory.borrow_mut().copy(&rdf::RDF)); return; } // Separate node subject from other attributes let mut properties = HashMap::new(); - let mut subject = None; + let mut subject = Vec::new(); for attr in e.attributes().with_checks(true) { let a = attr.expect("FIXME"); // ignore xmlns attributes (processed in element_start) - if a.key.starts_with(b"xmlns") { + if a.key.starts_with(b"xmlns:") || a.key == b"xmlns" { continue; } // try to extract the subject annotation - let k = ns.expand_curie_string(std::str::from_utf8(a.key).expect("FIXME")); + let k = self + .scope() + .expand_attribute(std::str::from_utf8(a.key).expect("FIXME")) + .expect("FIXME"); let v = a.unescape_and_decode_value(&self.reader).expect("FIXME"); + if k.matches(&rdf::about) { - if subject.is_none() { - subject = Some(self.factory.borrow_mut().iri(v).expect("FIXME")); - } else { - panic!("cannot have rdf:ID, rdf:about and rdf:nodeId at the same time") - } + subject.push(self.scope().expand_iri(&v).expect("INVALID IRI")); + + // + // if is_absolute_iri(&v) { + // subject.push(self.factory.borrow_mut().iri(v).expect("FIXME")); + // } else if is_relative_iri(&v) { + // subject.push(ns.expand_resource(&v)); + // } } else if k.matches(&rdf::ID) { - unimplemented!() + subject.push(self.scope().expand_id(&v).expect("INVALID NAME")); + // if v.starts_with("#") { + // subject.push(ns.expand_id(&v)) + // } else { + // subject.push(ns.expand_id(&format!("#{}", v))); + // } } else if k.matches(&rdf::nodeID) { - if subject.is_none() { - subject = Some( - self.factory - .borrow_mut() - .bnode(format!("o{}", v)) - .expect("FIXME"), - ); - } else { - panic!("cannot have rdf:ID, rdf:about and rdf:nodeId at the same time") - } + subject.push( + self.factory + .borrow_mut() + .bnode(&format!("o{}", v)) + .expect("INVALID BNODE"), + ); } else if !k.matches(&xml::lang) { // Ignore xml:lang attributes properties.insert( @@ -320,12 +394,20 @@ where } // Get subject and add it to the current nested stack - let s: Term<_> = subject.unwrap_or_else(|| self.new_bnode()); + if subject.len() > 1 { + panic!("cannot have rdf:ID, rdf:about and rdf:nodeId at the same time") + } + let s: Term<_> = subject.pop().unwrap_or_else(|| self.new_bnode()); self.parents.push(s.clone()); // Add the type as a triple if it is not `rdf:Description` - let ty = ns.expand_curie_string(std::str::from_utf8(e.name()).expect("FIXME")); - if ty != rdf::Description { + let ty = self + .scope() + .expand_attribute( + std::str::from_utf8(e.name()).expect("INVALID DATATYPE IRI REFERENCE"), + ) + .expect("INVALID DATATYPE IRI REFERENCE"); + if !ty.matches(&rdf::Description) { self.triples.push_back(Ok([ s.clone(), self.factory.borrow_mut().copy(&rdf::type_), @@ -343,36 +425,35 @@ where } fn predicate_start(&mut self, e: &BytesStart) { - let ns = self.namespaces.last().unwrap(); - // Get the predicate and add it to the current nested stack - let p = ns.expand_curie_string(std::str::from_utf8(e.name()).expect("FIXME")); + let p = self + .scope() + .expand_attribute(std::str::from_utf8(e.name()).expect("FIXME")) + .expect("FIXME"); self.parents.push(p); // Extract attributes relevant to the RDF syntax - let mut txt = Text::default(); let mut next_state = ParsingState::Node; for attr in e.attributes().with_checks(true) { let a = attr.expect("FIXME"); - if !a.key.starts_with(b"xmlns") { - let k = ns.expand_curie_string(std::str::from_utf8(a.key).expect("FIXME")); - if k.matches(&rdf::datatype) { - let v = a.unescape_and_decode_value(&self.reader).expect("FIXME"); - // txt.set_datatype(ns.expand_curie_string(&v)); - txt.set_datatype(self.factory.borrow_mut().iri(v).expect("FIXME")); - } else if k.matches(&rdf::parseType) { - match a.value.as_ref() { - b"Resource" => { - self.parents.push(self.new_bnode()); - next_state = ParsingState::Resource; - } - b"Literal" => next_state = ParsingState::Literal, - other => panic!("invalid parseType: {:?}", other), + let k = self + .scope() + .expand_attribute(std::str::from_utf8(a.key).expect("FIXME")) + .expect("INVALID ATTRIBUTE"); + if k.matches(&rdf::datatype) { + let v = a.unescape_and_decode_value(&self.reader).expect("FIXME"); + self.scope_mut().set_datatype(&v); + } else if k.matches(&rdf::parseType) { + match a.value.as_ref() { + b"Resource" => { + self.parents.push(self.new_bnode()); + next_state = ParsingState::Resource; } + b"Literal" => next_state = ParsingState::Literal, + other => panic!("invalid parseType: {:?}", other), } } } - self.text = Some(txt); self.state.push(next_state); } @@ -380,6 +461,11 @@ where fn element_end(&mut self, e: &BytesEnd) { println!("{:?}", self.state); + println!( + "exiting {:?}: {:?}", + std::str::from_utf8(e.name()), + self.parents + ); match self.state.pop().unwrap() { ParsingState::Node => self.predicate_end(e), @@ -394,39 +480,50 @@ where fn node_end(&mut self, e: &BytesEnd) { // Add the entity as a triple object if it is not top-level let o = self.parents.pop().unwrap(); - if self.parents.len() > 1 { + if self.parents.len() > 2 { let s = &self.parents[self.parents.len() - 2]; let p = &self.parents[self.parents.len() - 1]; - self.triples.push_back(Ok([s.clone(), p.clone(), o])); + if !s.matches(&rdf::RDF) { + self.triples.push_back(Ok([s.clone(), p.clone(), o])); + } } } fn predicate_end(&mut self, e: &BytesEnd) { // Build the curie string corresponding - let ns = self.namespaces.last_mut().unwrap(); - let p = ns.expand_curie_string(std::str::from_utf8(e.name()).expect("FIXME")); + let p = self + .scope() + .expand_attribute(std::str::from_utf8(e.name()).expect("FIXME")) + .expect("INVALID ATTRIBUTE IRI"); + + println!("exiting {}: {:?}", p.value(), self.parents); // Get the literal value - if let Some(text) = self.text.take() { - let s = &self.parents[self.parents.len() - 2]; - let o = match (text.datatype, self.lang.last()) { - (Some(dt), _) => self - .factory - .borrow_mut() - .literal_dt(text.text, dt) - .expect("FIXME"), - (None, Some(Some(l))) => self - .factory - .borrow_mut() - .literal_lang(text.text, l) - .expect("FIXME"), - _ => self - .factory - .borrow_mut() - .literal_dt(text.text, xsd::string) - .expect("FIXME"), - }; - self.triples.push_back(Ok([s.clone(), p, o])); + if self.parents.len() > 2 { + if let Some(text) = self.scope_mut().text.take() { + let s = self.parents[self.parents.len() - 2].clone(); + let o = match ( + self.scope_mut().datatype.take(), + self.scope_mut().lang.take(), + ) { + (Some(dt), _) => self + .factory + .borrow_mut() + .literal_dt(text, dt) + .expect("FIXME"), + (None, Some(l)) => self + .factory + .borrow_mut() + .literal_lang(text, l) + .expect("FIXME"), + _ => self + .factory + .borrow_mut() + .literal_dt(text, xsd::string) + .unwrap(), + }; + self.triples.push_back(Ok([s, p, o])); + } } self.parents.pop(); @@ -440,8 +537,9 @@ where // --- Text elements ---------------------------------------------------- fn element_text(&mut self, e: &BytesText) { - if let Some(text) = &mut self.text { - text.set_text(e.unescape_and_decode(&self.reader).expect("FIXME")); + if self.scope().text.is_some() { + let text = e.unescape_and_decode(&self.reader).expect("FIXME"); + self.scope_mut().set_text(text); } } @@ -465,43 +563,39 @@ where } fn predicate_empty(&mut self, e: &BytesStart) { - let ns = self.namespaces.last_mut().unwrap(); - let p = ns.expand_curie_string(std::str::from_utf8(e.name()).expect("FIXME")); + let p = self + .scope() + .expand_attribute(std::str::from_utf8(e.name()).expect("FIXME")) + .expect("INVALID ATTRIBUTE IRI"); - let mut object = None; + let mut object = Vec::with_capacity(1); for attr in e.attributes().with_checks(true) { let a = attr.expect("FIXME"); - // ignore xmlns attributes - if a.key.starts_with(b"xmlns:") { - continue; - } - // try to extract the annotation object - let k = ns.expand_curie_string(std::str::from_utf8(a.key).expect("FIXME")); + let k = self + .scope() + .expand_attribute(std::str::from_utf8(a.key).expect("FIXME")) + .expect("FIXME"); let v = a.unescape_and_decode_value(&self.reader).expect("FIXME"); if k.matches(&rdf::resource) { - if object.is_none() { - object = Some(self.factory.borrow_mut().iri(v).expect("FIXME")); - } else { - panic!("cannot have rdf:resource and rdf:nodeId at the same time") - } + object.push(self.scope().expand_iri(&v).expect("INVALID IRI")); } else if k.matches(&rdf::nodeID) { - if object.is_none() { - object = Some( - self.factory - .borrow_mut() - .bnode(format!("o{}", v)) - .expect("FIXME"), - ); - } else { - panic!("cannot have rdf:resource and rdf:nodeId at the same time") - } + object.push( + self.factory + .borrow_mut() + .bnode(format!("o{}", v)) + .expect("FIXME"), + ); } } let s = self.parents.last().unwrap(); - let o = object.unwrap(); // FIXME + let o = match object.len() { + 0 => panic!("missing resource in empty predicate !"), + 1 => object.pop().unwrap(), + _ => panic!("cannot have rdf:resource and rdf:nodeId at the same time"), + }; self.triples.push_back(Ok([s.clone(), p, o])); } @@ -548,23 +642,13 @@ mod test { use crate::graph::inmem::HashGraph; use crate::graph::inmem::TermIndexMapU; use crate::graph::Graph; - use crate::ns::dc; - use crate::ns::xsd; use crate::term::factory::RcTermFactory; - use crate::term::factory::TermFactory; use crate::term::IriData; use crate::term::StaticTerm; use crate::term::Term; use crate::triple::stream::TripleSource; use crate::triple::Triple; - pub static GRAMMAR_DESC: &str = "RDF/XML Syntax Specification (Revised)"; - pub static GRAMMAR: StaticTerm = Term::Iri(IriData { - ns: "http://www.w3.org/TR/rdf-syntax-grammar", - suffix: None, - absolute: true, - }); - type TestGraph = HashGraph>; impl Debug for TestGraph { @@ -669,48 +753,39 @@ mod test { for t in nt.into_iter() { assert!( g.contains(t.s(), t.p(), t.o()).expect(".contains failed"), - "missing triple: ({:?} {:?} {:?})", + "missing triple: ({:?} {:?} {:?}) in {:#?}", t.s(), t.p(), - t.o() + t.o(), + g ); } } }; } - #[test] - fn w3c_example_07() { - let mut f = RcTermFactory::default(); - let mut g = TestGraph::new(); - super::Config::default() - .parse_str( - r#" - - - - - - - - - - "#, - ) - .in_graph(&mut g) - .expect("failed parsing XML file"); - - assert_eq!(g.len(), 4, "unexpected number of triples: {:#?}", g); - assert!(g - .contains( - &GRAMMAR, - &dc::elements::title, - &f.literal_dt(GRAMMAR_DESC, xsd::string).unwrap() - ) - .unwrap()); + // Example 07: 'Complete RDF/XML description of Figure 1 graph' + w3c_example! { + w3c_example_07, + r#" + + + + + + + + + + "#, + r#" "RDF/XML Syntax Specification (Revised)" . + _:n0 "Dave Beckett" . + _:n0 . + _:n0 . + "# } // Example 08: 'Complete example of xml:lang' @@ -756,6 +831,21 @@ mod test { "# } + // Example 10: 'Complete example of rdf:datatype' + w3c_example! { + w3c_example_10, + r#" + + + 123 + + + "#, + r#" "123"^^ . + "# + } + // Example 11: 'Complete RDF/XML description of graph using rdf:nodeID identifying the blank node' w3c_example! { w3c_example_11, @@ -806,6 +896,26 @@ mod test { "# } + // Example 13: 'Complete example of property attributes on an empty property element' + w3c_example! { + w3c_example_13, + r#" + + + + + + + "#, + r#" "RDF/XML Syntax Specification (Revised)" . + _:n0 "Dave Beckett" . + _:n0 . + "# + } + // Example 14: 'Complete example with rdf:type' w3c_example! { w3c_example_14, @@ -841,6 +951,22 @@ mod test { "# } + // Example 16: 'Complete example using rdf:ID and xml:base for shortening URIs' + w3c_example! { + w3c_example_16, + r#" + + + + + + "#, + r#" . + "# + } + // Example 17: 'Complex example using RDF list properties' w3c_example! { w3c_example_17, From c8af31c06258bd17ae531f5f1979fa2bd54af8c0 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Tue, 21 May 2019 18:01:38 -0700 Subject: [PATCH 11/50] Remove unneeded `Debug` trait bound on `Term::new_literal_dt` --- sophia/src/term.rs | 2 +- sophia/src/term/factory.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sophia/src/term.rs b/sophia/src/term.rs index a60c462c..d42345c7 100644 --- a/sophia/src/term.rs +++ b/sophia/src/term.rs @@ -198,7 +198,7 @@ where /// May fail if `dt` is not a valid datatype. pub fn new_literal_dt(txt: U, dt: Term) -> Result> where - T: From + Debug, + T: From, { match dt { Iri(iri) => Ok(Literal(T::from(txt), Datatype(iri))), diff --git a/sophia/src/term/factory.rs b/sophia/src/term/factory.rs index 578bbe00..c9216211 100644 --- a/sophia/src/term/factory.rs +++ b/sophia/src/term/factory.rs @@ -58,7 +58,7 @@ pub trait TermFactory { where T: TermData, U: TermData, - Self::TermData: Debug, + // Self::TermData: Debug, { Term::new_literal_dt(self.get_term_data(txt.as_ref()), self.copy(&dt)) } From f102b8fc069e4b0b369189f3fb3775e77e7b7404 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Tue, 21 May 2019 18:07:44 -0700 Subject: [PATCH 12/50] Add support for property attributes on property elements --- sophia/src/parser/xml.rs | 67 ++++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 27 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index f73615e2..fb3f14e3 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -195,6 +195,15 @@ impl Scope { self.expand_iri(&format!("#{}", id)) } } + + /// Create a new literal with the `rdf:type` and `xml:lang` in scope. + fn new_literal(&self, text: String) -> Result> { + match (&self.datatype, &self.lang) { + (Some(dt), _) => self.factory.borrow_mut().literal_dt(text, dt.clone()), + (None, Some(l)) => self.factory.borrow_mut().literal_lang(text, l.clone()), + _ => self.factory.borrow_mut().literal_dt(text, xsd::string), + } + } } impl Default for Scope { @@ -502,26 +511,7 @@ where if self.parents.len() > 2 { if let Some(text) = self.scope_mut().text.take() { let s = self.parents[self.parents.len() - 2].clone(); - let o = match ( - self.scope_mut().datatype.take(), - self.scope_mut().lang.take(), - ) { - (Some(dt), _) => self - .factory - .borrow_mut() - .literal_dt(text, dt) - .expect("FIXME"), - (None, Some(l)) => self - .factory - .borrow_mut() - .literal_lang(text, l) - .expect("FIXME"), - _ => self - .factory - .borrow_mut() - .literal_dt(text, xsd::string) - .unwrap(), - }; + let o = self.scope_mut().new_literal(text).expect("FIXME"); self.triples.push_back(Ok([s, p, o])); } } @@ -569,6 +559,7 @@ where .expect("INVALID ATTRIBUTE IRI"); let mut object = Vec::with_capacity(1); + let mut attributes = HashMap::new(); for attr in e.attributes().with_checks(true) { let a = attr.expect("FIXME"); @@ -587,16 +578,38 @@ where .bnode(format!("o{}", v)) .expect("FIXME"), ); + } else if !k.matches(&xml::lang) && !a.key.starts_with(b"xmlns") { + attributes.insert(k, v); } } - let s = self.parents.last().unwrap(); - let o = match object.len() { - 0 => panic!("missing resource in empty predicate !"), - 1 => object.pop().unwrap(), - _ => panic!("cannot have rdf:resource and rdf:nodeId at the same time"), - }; - self.triples.push_back(Ok([s.clone(), p, o])); + match object.len() { + 0 => { + let s = self.parents.last().unwrap().clone(); + let o = self.new_bnode(); + self.triples.push_back(Ok([s, p, o.clone()])); + for (prop, value) in attributes.into_iter() { + let literal = self.scope().new_literal(value).expect("FIXME"); + self.triples.push_back(Ok([o.clone(), prop, literal])); + } + } + 1 => { + // Ignoring property attributes + let s = self.parents.last().unwrap().clone(); + let o = object.pop().unwrap(); + self.triples.push_back(Ok([s, p, o])); + } + _ => { + panic!("cannot have rdf:resource and rdf:nodeID at the same time"); + } + } + + // let o = match object.len() { + // 0 => panic!("missing resource in empty predicate !"), + // 1 => object.pop().unwrap(), + // _ => panic!(""), + // }; + // self.triples.push_back(Ok([s.clone(), p, o])); } fn resource_empty(&mut self, e: &BytesStart) { From 8f9e5607d662e1b04290f4f9ad4084dc5fa48a18 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Tue, 21 May 2019 19:05:33 -0700 Subject: [PATCH 13/50] Add support for `rdf:li` metaproperty syntax --- sophia/src/ns.rs | 1 + sophia/src/parser/xml.rs | 182 +++++++++++++++++++++++++++++++++++---- 2 files changed, 168 insertions(+), 15 deletions(-) diff --git a/sophia/src/ns.rs b/sophia/src/ns.rs index faf6aea1..859e419a 100644 --- a/sophia/src/ns.rs +++ b/sophia/src/ns.rs @@ -111,6 +111,7 @@ pub mod rdf { about, parseType, resource, + li, nodeID, datatype ); diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index fb3f14e3..de0ed5de 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -69,7 +69,7 @@ enum ParsingState { // --- -#[derive(Debug, Clone)] +#[derive(Debug)] pub struct Scope { /// The XML namespaces declared in this scope. ns: HashMap>, @@ -91,6 +91,27 @@ pub struct Scope { /// The text gathered in the current scope. text: Option, + + /// The current count of list elements + li: AtomicU64, +} + +// We implement it ourselves instead of deriving so that: +// * F does not need to be `Clone` (deriving requires it). +// * we can clone `li` although `AtomicU64` is not `Clone`. +impl Clone for Scope { + fn clone(&self) -> Self { + Self { + ns: self.ns.clone(), + default: self.default.clone(), + base: self.base.clone(), + factory: self.factory.clone(), + datatype: self.datatype.clone(), + lang: self.lang.clone(), + text: self.text.clone(), + li: AtomicU64::new(self.li.load(Ordering::Relaxed)), + } + } } impl Scope { @@ -109,6 +130,7 @@ impl Scope { datatype: None, lang: None, text: None, + li: AtomicU64::new(1), }; // These namespaces are always in scope scope @@ -204,6 +226,26 @@ impl Scope { _ => self.factory.borrow_mut().literal_dt(text, xsd::string), } } + + /// Create a new `rdf:li` property. + fn new_li(&self) -> Result> { + if let Some(ns) = self.ns.get("rdf") { + let mut f = self.factory.borrow_mut(); + ns.get(f.get_term_data(&format!("_{}", self.li.fetch_add(1, Ordering::Relaxed)))) + } else { + panic!("undeclared `rdf` prefix !") + } + } + + /// Get the current `rdf:li` property. + fn current_li(&self) -> Result> { + if let Some(ns) = self.ns.get("rdf") { + let mut f = self.factory.borrow_mut(); + ns.get(f.get_term_data(&format!("_{}", self.li.load(Ordering::Relaxed)))) + } else { + panic!("undeclared `rdf` prefix !") + } + } } impl Default for Scope { @@ -261,6 +303,7 @@ where // Create a local scope using values from the outer one. let mut scope = prev.clone(); scope.text = Some(String::new()); + scope.li.store(1, Ordering::Relaxed); // Update XML namespaces with those defined in the document. for attr in e.attributes().with_checks(true) { @@ -304,6 +347,8 @@ where self.scopes.pop().expect("FIXME"); } + // --- + // Create a new bnode term (using `n` prefix). fn new_bnode(&self) -> Term { self.factory @@ -312,6 +357,28 @@ where .unwrap() } + // Create a new predicate IRI from an XML name (or a RDF metasyntactic element) + fn predicate_iri_start(&self, name: &str) -> Result> { + let mut p = self.scope().expand_attribute(name)?; + if p.matches(&rdf::li) { + let parent_scope = self.scopes.get(self.scopes.len() - 2).unwrap(); + parent_scope.new_li() + } else { + Ok(p) + } + } + + // Retrieve a predicate IRI from an XML name + fn predicate_iri_end(&self, name: &str) -> Result> { + let mut p = self.scope().expand_attribute(name)?; + if p.matches(&rdf::li) { + let parent_scope = self.scopes.get(self.scopes.len() - 2).unwrap(); + parent_scope.current_li() + } else { + Ok(p) + } + } + // --- fn new(reader: quick_xml::Reader) -> Self { @@ -330,8 +397,6 @@ where // --- fn element_start(&mut self, e: &BytesStart) { - println!("{:?}", self.state); - self.enter_scope(e); let res = match self.state.last().unwrap() { ParsingState::Node => self.node_start(e), @@ -435,10 +500,10 @@ where fn predicate_start(&mut self, e: &BytesStart) { // Get the predicate and add it to the current nested stack + // or build a new `rdf:_n` IRI if the predicate is `rdf:li`. let p = self - .scope() - .expand_attribute(std::str::from_utf8(e.name()).expect("FIXME")) - .expect("FIXME"); + .predicate_iri_start(std::str::from_utf8(e.name()).expect("FIXME")) + .expect("INVALID PREDICATE IRI"); self.parents.push(p); // Extract attributes relevant to the RDF syntax @@ -499,13 +564,10 @@ where } fn predicate_end(&mut self, e: &BytesEnd) { - // Build the curie string corresponding + // Build the predicate IRI let p = self - .scope() - .expand_attribute(std::str::from_utf8(e.name()).expect("FIXME")) - .expect("INVALID ATTRIBUTE IRI"); - - println!("exiting {}: {:?}", p.value(), self.parents); + .predicate_iri_end(std::str::from_utf8(e.name()).expect("FIXME")) + .expect("INVALID PREDICATE IRI"); // Get the literal value if self.parents.len() > 2 { @@ -554,9 +616,8 @@ where fn predicate_empty(&mut self, e: &BytesStart) { let p = self - .scope() - .expand_attribute(std::str::from_utf8(e.name()).expect("FIXME")) - .expect("INVALID ATTRIBUTE IRI"); + .predicate_iri_start(std::str::from_utf8(e.name()).expect("FIXME")) + .expect("INVALID PREDICATE IRI"); let mut object = Vec::with_capacity(1); let mut attributes = HashMap::new(); @@ -998,4 +1059,95 @@ mod test { . "# } + + // Example 18: 'Complete example using rdf:li property element for list properties' + w3c_example! { + w3c_example_18, + r#" + + + + + + + + "#, + r#" . + . + . + . + "# + } + + // Example 19: 'Complete example of a RDF collection of nodes using rdf:parseType="Collection"' + w3c_example! { + w3c_example_19, + r#" + + + + + + + + + + "#, + r#"_:genid1 . + _:genid2 . + _:genid1 _:genid2 . + _:genid3 . + _:genid2 _:genid3 . + _:genid3 . + _:genid1 . + "# + } + + // Example 20: 'Complete example of rdf:ID reifying a property element' + w3c_example! { + w3c_example_20, + r#" + + + blah + + + "#, + r#" "blah" . + . + . + . + "blah" . + "# + } + + w3c_example! { + nested_li, + r#" + + + + + + + + + + + + + "#, + r#" . + . + . + . + . + . + . + "# + } + } From 7300bd1c816fdbf653ab926464938f98764d7fa7 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Tue, 21 May 2019 19:19:30 -0700 Subject: [PATCH 14/50] Remove `curie` and `pretty-assertions` from cargo dependencies --- sophia/Cargo.toml | 3 -- sophia/src/parser/xml.rs | 61 ++++++++++++++++++++-------------------- 2 files changed, 31 insertions(+), 33 deletions(-) diff --git a/sophia/Cargo.toml b/sophia/Cargo.toml index b5b01438..fa653014 100644 --- a/sophia/Cargo.toml +++ b/sophia/Cargo.toml @@ -14,7 +14,6 @@ edition = "2018" [dependencies] coercible_errors = "0.1.3" -curie = "0.0.8" error-chain = "0.12.0" language-tag = "0.9.0" lazy_static = "1.2.0" @@ -26,5 +25,3 @@ rental = "0.5.2" resiter = "0.3.0" url = "1.7.2" weak-table = "0.2.3" - -pretty_assertions = "*" diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index de0ed5de..49a7d741 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -808,7 +808,7 @@ mod test { // do_test_suite().unwrap() // } - macro_rules! w3c_example { + macro_rules! nt_example { ($name:ident, $xml:literal, $nt:literal) => { #[test] fn $name() { @@ -838,8 +838,8 @@ mod test { }; } - // Example 07: 'Complete RDF/XML description of Figure 1 graph' - w3c_example! { + // W3C Example 07: 'Complete RDF/XML description of Figure 1 graph' + nt_example! { w3c_example_07, r#" @@ -1060,8 +1060,8 @@ mod test { "# } - // Example 18: 'Complete example using rdf:li property element for list properties' - w3c_example! { + // W3C Example 18: 'Complete example using rdf:li property element for list properties' + nt_example! { w3c_example_18, r#" @@ -1079,8 +1079,8 @@ mod test { "# } - // Example 19: 'Complete example of a RDF collection of nodes using rdf:parseType="Collection"' - w3c_example! { + // W3C Example 19: 'Complete example of a RDF collection of nodes using rdf:parseType="Collection"' + nt_example! { w3c_example_19, r#" From 056f93ed917fdc9c0a16cea2bb571cb23ef516ec Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Wed, 22 May 2019 11:42:16 -0700 Subject: [PATCH 15/50] Add support for reified triples in RDF/XML documents --- sophia/src/parser/xml.rs | 50 +++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index 49a7d741..f2c4e162 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -286,6 +286,7 @@ where ::TermData: Debug, { // --- + fn scope(&self) -> &Scope { self.scopes.last().unwrap() } @@ -359,7 +360,7 @@ where // Create a new predicate IRI from an XML name (or a RDF metasyntactic element) fn predicate_iri_start(&self, name: &str) -> Result> { - let mut p = self.scope().expand_attribute(name)?; + let p = self.scope().expand_attribute(name)?; if p.matches(&rdf::li) { let parent_scope = self.scopes.get(self.scopes.len() - 2).unwrap(); parent_scope.new_li() @@ -370,7 +371,7 @@ where // Retrieve a predicate IRI from an XML name fn predicate_iri_end(&self, name: &str) -> Result> { - let mut p = self.scope().expand_attribute(name)?; + let p = self.scope().expand_attribute(name)?; if p.matches(&rdf::li) { let parent_scope = self.scopes.get(self.scopes.len() - 2).unwrap(); parent_scope.current_li() @@ -434,20 +435,8 @@ where if k.matches(&rdf::about) { subject.push(self.scope().expand_iri(&v).expect("INVALID IRI")); - - // - // if is_absolute_iri(&v) { - // subject.push(self.factory.borrow_mut().iri(v).expect("FIXME")); - // } else if is_relative_iri(&v) { - // subject.push(ns.expand_resource(&v)); - // } } else if k.matches(&rdf::ID) { subject.push(self.scope().expand_id(&v).expect("INVALID NAME")); - // if v.starts_with("#") { - // subject.push(ns.expand_id(&v)) - // } else { - // subject.push(ns.expand_id(&format!("#{}", v))); - // } } else if k.matches(&rdf::nodeID) { subject.push( self.factory @@ -517,6 +506,9 @@ where if k.matches(&rdf::datatype) { let v = a.unescape_and_decode_value(&self.reader).expect("FIXME"); self.scope_mut().set_datatype(&v); + } else if k.matches(&rdf::ID) { + let v = a.unescape_and_decode_value(&self.reader).expect("FIXME"); + return self.reification_start(e, self.scope().expand_id(&v).expect("FIXME")); } else if k.matches(&rdf::parseType) { match a.value.as_ref() { b"Resource" => { @@ -531,6 +523,34 @@ where self.state.push(next_state); } + fn reification_start(&mut self, e: &BytesStart, id: Term) { + // Get the subject and predicate of the triple + let p = self.parents.pop().unwrap(); + let s = self.parents.last().unwrap().clone(); + + // Get the object of the triple + let txt = self.reader.read_text(e.name(), &mut Vec::new()).unwrap(); + let o = self.scope().new_literal(txt).unwrap(); + + // Add the actual triple + self.triples + .push_back(Ok([s.clone(), p.clone(), o.clone()])); + + // Add the reified triples + let ty = self.factory.borrow_mut().copy(&rdf::type_); + let subject = self.factory.borrow_mut().copy(&rdf::subject); + let predicate = self.factory.borrow_mut().copy(&rdf::predicate); + let object = self.factory.borrow_mut().copy(&rdf::object); + self.triples.push_back(Ok([ + id.clone(), + ty, + self.factory.borrow_mut().copy(&rdf::Statement), + ])); + self.triples.push_back(Ok([id.clone(), subject, s])); + self.triples.push_back(Ok([id.clone(), predicate, p])); + self.triples.push_back(Ok([id.clone(), object, o])); + } + // --- fn element_end(&mut self, e: &BytesEnd) { @@ -582,7 +602,9 @@ where } fn resource_end(&mut self, e: &BytesEnd) { + // End of the implicit node element self.node_end(e); + // End of the resource predicate self.predicate_end(e) } From d4da809891b00cc90bc648ce23f376d9646addb3 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Wed, 22 May 2019 17:16:18 -0700 Subject: [PATCH 16/50] Add support for `parseType=Collection' in XML parser --- sophia/src/parser/xml.rs | 148 ++++++++++++++++++++++++++++++++++----- 1 file changed, 131 insertions(+), 17 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index f2c4e162..0fcce390 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -65,6 +65,9 @@ enum ParsingState { Predicate, Resource, Literal, // NB: not supported by quick-xml right now + + Collection, + CollectionItem, } // --- @@ -94,6 +97,8 @@ pub struct Scope { /// The current count of list elements li: AtomicU64, + + collection: Vec>, } // We implement it ourselves instead of deriving so that: @@ -110,6 +115,7 @@ impl Clone for Scope { lang: self.lang.clone(), text: self.text.clone(), li: AtomicU64::new(self.li.load(Ordering::Relaxed)), + collection: self.collection.clone(), } } } @@ -131,6 +137,7 @@ impl Scope { lang: None, text: None, li: AtomicU64::new(1), + collection: Vec::new(), }; // These namespaces are always in scope scope @@ -304,6 +311,7 @@ where // Create a local scope using values from the outer one. let mut scope = prev.clone(); scope.text = Some(String::new()); + scope.collection = Vec::new(); scope.li.store(1, Ordering::Relaxed); // Update XML namespaces with those defined in the document. @@ -398,11 +406,20 @@ where // --- fn element_start(&mut self, e: &BytesStart) { + // println!( + // "entering {:?}: \ntrace: {:?}\nparents: {:?}", + // std::str::from_utf8(e.name()), + // self.state, + // self.parents + // ); + self.enter_scope(e); let res = match self.state.last().unwrap() { ParsingState::Node => self.node_start(e), ParsingState::Predicate => self.predicate_start(e), ParsingState::Resource => self.predicate_start(e), + ParsingState::Collection => self.collection_start(e), + ParsingState::CollectionItem => unreachable!(), _ => unimplemented!(), }; } @@ -515,6 +532,9 @@ where self.parents.push(self.new_bnode()); next_state = ParsingState::Resource; } + b"Collection" => { + next_state = ParsingState::Collection; + } b"Literal" => next_state = ParsingState::Literal, other => panic!("invalid parseType: {:?}", other), } @@ -551,27 +571,38 @@ where self.triples.push_back(Ok([id.clone(), object, o])); } + fn collection_start(&mut self, e: &BytesStart) { + self.state.push(ParsingState::CollectionItem); + self.node_start(e); + let new_iri = self.parents.last().unwrap().clone(); + + let l = self.scopes.len(); + self.scopes.get_mut(l - 2).unwrap().collection.push(new_iri); + } + // --- fn element_end(&mut self, e: &BytesEnd) { - println!("{:?}", self.state); - println!( - "exiting {:?}: {:?}", - std::str::from_utf8(e.name()), - self.parents - ); + // println!( + // "exiting {:?}: \ntrace: {:?}\nparents: {:?}", + // std::str::from_utf8(e.name()), + // self.state, + // self.parents + // ); match self.state.pop().unwrap() { ParsingState::Node => self.predicate_end(e), - ParsingState::Predicate => self.node_end(e), + ParsingState::Predicate => self.node_end(), ParsingState::Resource => self.resource_end(e), + ParsingState::CollectionItem => self.collection_item_end(), + ParsingState::Collection => self.collection_end(e), _ => unimplemented!(), } self.leave_scope(); } - fn node_end(&mut self, e: &BytesEnd) { + fn node_end(&mut self) { // Add the entity as a triple object if it is not top-level let o = self.parents.pop().unwrap(); if self.parents.len() > 2 { @@ -603,11 +634,60 @@ where fn resource_end(&mut self, e: &BytesEnd) { // End of the implicit node element - self.node_end(e); + self.node_end(); // End of the resource predicate self.predicate_end(e) } + fn collection_item_end(&mut self) { + // End of the node parent. + self.parents.pop(); + // Remove `CollectionItem` + self.state.pop(); + } + + fn collection_end(&mut self, e: &BytesEnd) { + self.state.pop(); // Remove the `Predicate` parsing state as well. + + let collection = self.scope().collection.clone(); + if !collection.is_empty() { + let mut node = self.new_bnode(); + let mut elements = collection.into_iter().peekable(); + + self.triples.push_back(Ok([ + self.parents[self.parents.len() - 2].clone(), + self.parents[self.parents.len() - 1].clone(), + node.clone(), + ])); + + while let Some(e) = elements.next() { + self.triples.push_back(Ok([ + node.clone(), + self.factory.borrow_mut().copy(&rdf::first), + e.clone(), + ])); + if elements.peek().is_some() { + let next_node = self.new_bnode(); + self.triples.push_back(Ok([ + node, + self.factory.borrow_mut().copy(&rdf::rest), + next_node.clone(), + ])); + node = next_node; + } else { + let mut f = self.factory.borrow_mut(); + self.triples.push_back(Ok([ + node.clone(), + f.copy(&rdf::rest), + f.copy(&rdf::nil), + ])); + } + } + } + + self.predicate_end(e); + } + // --- Text elements ---------------------------------------------------- fn element_text(&mut self, e: &BytesText) { @@ -620,12 +700,21 @@ where // --- Empty elements ---------------------------------------------------- fn element_empty(&mut self, e: &BytesStart) { + // println!( + // "empty {:?}: \ntrace: {:?}\nparents: {:?}", + // std::str::from_utf8(e.name()), + // self.state, + // self.parents + // ); + self.enter_scope(e); match self.state.last().unwrap() { ParsingState::Node => self.node_empty(e), ParsingState::Predicate => self.predicate_empty(e), ParsingState::Resource => self.resource_empty(e), + ParsingState::Collection => self.collection_item_empty(e), + ParsingState::CollectionItem => unreachable!(), _ => (), } @@ -633,7 +722,9 @@ where } fn node_empty(&mut self, e: &BytesStart) { - // FIXME + self.node_start(e); + self.state.pop(); + self.node_end(); } fn predicate_empty(&mut self, e: &BytesStart) { @@ -698,6 +789,12 @@ where fn resource_empty(&mut self, e: &BytesStart) { self.predicate_empty(e) } + + fn collection_item_empty(&mut self, e: &BytesStart) { + self.collection_start(e); + self.state.pop(); + self.collection_item_end(); + } } impl Iterator for XmlParser @@ -1116,13 +1213,13 @@ mod test { "#, - r#"_:genid1 . - _:genid2 . - _:genid1 _:genid2 . - _:genid3 . - _:genid2 _:genid3 . - _:genid3 . - _:genid1 . + r#"_:n0 . + _:n0 _:n1 . + _:n1 . + _:n1 _:n2 . + _:n2 . + _:n2 . + _:n0 . "# } @@ -1173,4 +1270,21 @@ mod test { "# } + // Check that an empty node is used as a leaf. + nt_example! { + empty_node, + r#" + + + + + + + + "#, + r#" . + "# + } + } From 23f899a71bb7a4a205c26b3370ba6eb7114b1656 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Wed, 22 May 2019 18:34:09 -0700 Subject: [PATCH 17/50] Fix inconsistent state when exiting a collection in XML Parser --- sophia/src/parser/xml.rs | 36 +++++++----------------------------- 1 file changed, 7 insertions(+), 29 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index 0fcce390..48c9e4cf 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -406,20 +406,13 @@ where // --- fn element_start(&mut self, e: &BytesStart) { - // println!( - // "entering {:?}: \ntrace: {:?}\nparents: {:?}", - // std::str::from_utf8(e.name()), - // self.state, - // self.parents - // ); - self.enter_scope(e); let res = match self.state.last().unwrap() { ParsingState::Node => self.node_start(e), ParsingState::Predicate => self.predicate_start(e), ParsingState::Resource => self.predicate_start(e), ParsingState::Collection => self.collection_start(e), - ParsingState::CollectionItem => unreachable!(), + ParsingState::CollectionItem => self.collection_item_start(e), _ => unimplemented!(), }; } @@ -573,9 +566,14 @@ where fn collection_start(&mut self, e: &BytesStart) { self.state.push(ParsingState::CollectionItem); + self.collection_item_start(e); + } + + fn collection_item_start(&mut self, e: &BytesStart) { + // Start the inner node element and get its IRI. self.node_start(e); let new_iri = self.parents.last().unwrap().clone(); - + // Add the iri of the node to the parent scope (not current!) let l = self.scopes.len(); self.scopes.get_mut(l - 2).unwrap().collection.push(new_iri); } @@ -583,13 +581,6 @@ where // --- fn element_end(&mut self, e: &BytesEnd) { - // println!( - // "exiting {:?}: \ntrace: {:?}\nparents: {:?}", - // std::str::from_utf8(e.name()), - // self.state, - // self.parents - // ); - match self.state.pop().unwrap() { ParsingState::Node => self.predicate_end(e), ParsingState::Predicate => self.node_end(), @@ -598,7 +589,6 @@ where ParsingState::Collection => self.collection_end(e), _ => unimplemented!(), } - self.leave_scope(); } @@ -647,8 +637,6 @@ where } fn collection_end(&mut self, e: &BytesEnd) { - self.state.pop(); // Remove the `Predicate` parsing state as well. - let collection = self.scope().collection.clone(); if !collection.is_empty() { let mut node = self.new_bnode(); @@ -700,15 +688,7 @@ where // --- Empty elements ---------------------------------------------------- fn element_empty(&mut self, e: &BytesStart) { - // println!( - // "empty {:?}: \ntrace: {:?}\nparents: {:?}", - // std::str::from_utf8(e.name()), - // self.state, - // self.parents - // ); - self.enter_scope(e); - match self.state.last().unwrap() { ParsingState::Node => self.node_empty(e), ParsingState::Predicate => self.predicate_empty(e), @@ -717,7 +697,6 @@ where ParsingState::CollectionItem => unreachable!(), _ => (), } - self.leave_scope(); } @@ -1286,5 +1265,4 @@ mod test { r#" . "# } - } From f2c2f3244b5407675a265e3a373f42d02f4a3df9 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Wed, 22 May 2019 22:33:50 -0700 Subject: [PATCH 18/50] Add `rdf-containers-syntax-vs-schema` W3C tests to XML parser --- sophia/src/parser/xml.rs | 779 +++++++++++++++++++++------------------ 1 file changed, 430 insertions(+), 349 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index 48c9e4cf..86dd90e8 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -756,13 +756,6 @@ where panic!("cannot have rdf:resource and rdf:nodeID at the same time"); } } - - // let o = match object.len() { - // 0 => panic!("missing resource in empty predicate !"), - // 1 => object.pop().unwrap(), - // _ => panic!(""), - // }; - // self.triples.push_back(Ok([s.clone(), p, o])); } fn resource_empty(&mut self, e: &BytesStart) { @@ -834,90 +827,18 @@ mod test { } } - // #[test] - // fn w3c_test_suite() { - // fn do_test_suite() -> io::Result<()> { - // let rdf_ext = OsStr::new("rdf"); - // let nt_ext = OsStr::new("nt"); - // - // let suite = Path::new("..").join("rdf-tests").join("rdf-xml"); - // if !suite.exists() || !suite.is_dir() { - // panic!("rdf-tests/rdf-xml not found, can not check W3C test-suite. cf README.md"); - // } - // - // let mut tested = 0; - // - // for e in read_dir(&suite)? { - // let entry = e?; - // if entry.file_type()?.is_dir() { - // for c in read_dir(entry.path())? { - // let case = c?; - // if case.path().extension() == Some(rdf_ext) { - // if case.path().with_extension(nt_ext).is_file() { - // println!("{}", case.path().display()); - // - // // the reference N-Triples file - // let ntparser = crate::parser::nt::Config::default(); - // let ntfile = File::open(case.path().with_extension(nt_ext))?; - // let mut expected = TestGraph::new(); - // ntparser.parse_read(ntfile).in_graph(&mut expected).unwrap(); - // // the test XML file - // let xmlparser = super::Config::default(); - // let xmlfile = File::open(case.path())?; - // let mut actual = TestGraph::new(); - // let res = xmlparser.parse_read(xmlfile).in_graph(&mut actual); - // - // // check the XML parses without error - // assert!( - // res.is_ok(), - // format!("{} should parse without error", case.path().display()) - // ); - // // check the XML produces the same graph - // pretty_assertions::assert_eq!( - // actual, - // expected, - // "{} does not give expected results", - // case.path().display() - // ); - // - // tested += 1; - // } else if case.path().to_string_lossy().contains("error") { - // // let xmlparser = super::Config::default(); - // // let xmlfile = File::open(case.path())?; - // // let mut actual = TestGraph::new(); - // // assert!( - // // xmlparser.parse_read(xmlfile).in_graph(&mut actual).is_err(), - // // format!("{} should parse with error", case.path().display()) - // // ); - // // - // // tested += 1; - // } - // } - // } - // } - // } - // - // assert_ne!( - // tested, 0, - // "No test found in W3C test-suite, something must be wrong" - // ); - // Ok(()) - // } - // do_test_suite().unwrap() - // } - - macro_rules! nt_example { + macro_rules! nt_test { ($name:ident, $xml:literal, $nt:literal) => { #[test] fn $name() { let mut g = TestGraph::new(); - super::Config::default() + $crate::parser::xml::Config::default() .parse_str($xml) .in_graph(&mut g) .expect("failed parsing XML file"); let mut nt = Vec::new(); - for triple in crate::parser::nt::Config::default().parse_str($nt) { + for triple in $crate::parser::nt::Config::default().parse_str($nt) { nt.push(triple.expect("N-Triples iterator failed")); } @@ -936,294 +857,454 @@ mod test { }; } - // W3C Example 07: 'Complete RDF/XML description of Figure 1 graph' - nt_example! { - w3c_example_07, - r#" - - - - - + mod w3c_example { + use super::*; + + // W3C Example 07: 'Complete RDF/XML description of Figure 1 graph' + nt_test! { + ex07, + r#" + + + + + + + - - - - "#, - r#" "RDF/XML Syntax Specification (Revised)" . - _:n0 "Dave Beckett" . - _:n0 . - _:n0 . - "# - } + + "#, + r#" "RDF/XML Syntax Specification (Revised)" . + _:n0 "Dave Beckett" . + _:n0 . + _:n0 . + "# + } - // W3C Example 08: 'Complete example of xml:lang' - nt_example! { - w3c_example_08, - r#" - - - RDF/XML Syntax Specification (Revised) - RDF/XML Syntax Specification (Revised) - RDF/XML Syntax Specification (Revised) - + // W3C Example 08: 'Complete example of xml:lang' + nt_test! { + ex08, + r#" + + + RDF/XML Syntax Specification (Revised) + RDF/XML Syntax Specification (Revised) + RDF/XML Syntax Specification (Revised) + - - Der Baum - Das Buch ist außergewöhnlich - The Tree - - - "#, - r#" "RDF/XML Syntax Specification (Revised)" . - "RDF/XML Syntax Specification (Revised)"@en . - "RDF/XML Syntax Specification (Revised)"@en-us . - "Der Baum"@de . - "Das Buch ist au\u00DFergew\u00F6hnlich"@de . - "The Tree"@en . - "# - } + + Der Baum + Das Buch ist außergewöhnlich + The Tree + + + "#, + r#" "RDF/XML Syntax Specification (Revised)" . + "RDF/XML Syntax Specification (Revised)"@en . + "RDF/XML Syntax Specification (Revised)"@en-us . + "Der Baum"@de . + "Das Buch ist au\u00DFergew\u00F6hnlich"@de . + "The Tree"@en . + "# + } - // W3C Example 09: 'Complete example of rdf:parseType="Literal"' - nt_example! { - w3c_example_09, - r#" - - - 123 - - - "#, - r#" "123"^^ . - "# - } + // W3C Example 09: 'Complete example of rdf:parseType="Literal"' + nt_test! { + ex09, + r#" + + + 123 + + + "#, + r#" "123"^^ . + "# + } - // W3C Example 10: 'Complete example of rdf:datatype' - nt_example! { - w3c_example_10, - r#" - - - 123 - - - "#, - r#" "123"^^ . - "# - } + // W3C Example 10: 'Complete example of rdf:datatype' + nt_test! { + ex10, + r#" + + + 123 + + + "#, + r#" "123"^^ . + "# + } - // W3C Example 11: 'Complete RDF/XML description of graph using rdf:nodeID identifying the blank node' - nt_example! { - w3c_example_11, - r#" - - - - + // W3C Example 11: 'Complete RDF/XML description of graph using rdf:nodeID identifying the blank node' + nt_test! { + ex11, + r#" + + + + - - - - - "#, - // This is with renamed node IDs - r#" "RDF/XML Syntax Specification (Revised)" . - _:oabc . - _:oabc "Dave Beckett" . - _:oabc . - "# - } + + + + + "#, + // This is with renamed node IDs + r#" "RDF/XML Syntax Specification (Revised)" . + _:oabc . + _:oabc "Dave Beckett" . + _:oabc . + "# + } - // W3C Example 12: 'Complete example using rdf:parseType="Resource"' - nt_example! { - w3c_example_12, - r#" - - - - Dave Beckett - - - - - "#, - // This is with renamed node IDs - r#" "RDF/XML Syntax Specification (Revised)" . - _:n0 "Dave Beckett" . - _:n0 . - _:n0 . - "# - } + // W3C Example 12: 'Complete example using rdf:parseType="Resource"' + nt_test! { + ex12, + r#" + + + + Dave Beckett + + + + + "#, + // This is with renamed node IDs + r#" "RDF/XML Syntax Specification (Revised)" . + _:n0 "Dave Beckett" . + _:n0 . + _:n0 . + "# + } - // W3C Example 13: 'Complete example of property attributes on an empty property element' - nt_example! { - w3c_example_13, - r#" - - - - - - - "#, - r#" "RDF/XML Syntax Specification (Revised)" . - _:n0 "Dave Beckett" . - _:n0 . - "# - } + // W3C Example 13: 'Complete example of property attributes on an empty property element' + nt_test! { + ex13, + r#" + + + + + + + "#, + r#" "RDF/XML Syntax Specification (Revised)" . + _:n0 "Dave Beckett" . + _:n0 . + "# + } - // W3C Example 14: 'Complete example with rdf:type' - nt_example! { - w3c_example_14, - r#" - - - - A marvelous thing - - - "#, - r#" . - "A marvelous thing" . - "# - } + // W3C Example 14: 'Complete example with rdf:type' + nt_test! { + ex14, + r#" + + + + A marvelous thing + + + "#, + r#" . + "A marvelous thing" . + "# + } - // W3C Example 15: 'Complete example using a typed node element to replace an rdf:type' - nt_example! { - w3c_example_15, - r#" - + + + A marvelous thing + + + "#, + r#" . + "A marvelous thing" . + "# + } + + // W3C Example 16: 'Complete example using rdf:ID and xml:base for shortening URIs' + nt_test! { + ex16, + r#" + + + + + + "#, + r#" . + "# + } + + // W3C Example 17: 'Complex example using RDF list properties' + nt_test! { + ex17, + r#" + + + + + + + + "#, + r#" . + . + . + . + "# + } + + // W3C Example 18: 'Complete example using rdf:li property element for list properties' + nt_test! { + ex18, + r#" + + + + + + + + "#, + r#" . + . + . + . + "# + } + + // W3C Example 19: 'Complete example of a RDF collection of nodes using rdf:parseType="Collection"' + nt_test! { + ex19, + r#" + - - A marvelous thing - - - "#, - r#" . - "A marvelous thing" . - "# - } + + + + + + + + + "#, + r#"_:n0 . + _:n0 _:n1 . + _:n1 . + _:n1 _:n2 . + _:n2 . + _:n2 . + _:n0 . + "# + } - // W3C Example 16: 'Complete example using rdf:ID and xml:base for shortening URIs' - nt_example! { - w3c_example_16, - r#" - - - - - - "#, - r#" . - "# + // W3C Example 20: 'Complete example of rdf:ID reifying a property element' + nt_test! { + ex20, + r#" + + + blah + + + "#, + r#" "blah" . + . + . + . + "blah" . + "# + } } - // W3C Example 17: 'Complex example using RDF list properties' - nt_example! { - w3c_example_17, - r#" - - - - - - + mod rdf_containers_syntax_vs_schema { + use super::*; + + // Simple container + nt_test! { + test001, + r#" + + 1 + 2 + - "#, - r#" . - . - . - . - "# - } + "#, + r#"_:n0 . + _:n0 "1" . + _:n0 "2" . + "# + } - // W3C Example 18: 'Complete example using rdf:li property element for list properties' - nt_example! { - w3c_example_18, - r#" - - - - - - - - "#, - r#" . - . - . - . - "# - } + // rdf:li is unaffected by other rdf:_nnn properties. + nt_test! { + test002, + r#" + + _1 + 1 + _3 + 2 + + + "#, + r#"_:n0 . + _:n0 "_1" . + _:n0 "1" . + _:n0 "_3" . + _:n0 "2" . + "# + } - // W3C Example 19: 'Complete example of a RDF collection of nodes using rdf:parseType="Collection"' - nt_example! { - w3c_example_19, - r#" - - - - - - - - - - "#, - r#"_:n0 . - _:n0 _:n1 . - _:n1 . - _:n1 _:n2 . - _:n2 . - _:n2 . - _:n0 . - "# - } + // rdf:li elements can exist in any description element + nt_test! { + test003, + r#" + + 1 + 2 + + + "#, + r#"_:n0 . + _:n0 "1" . + _:n0 "2" . + "# + } - // W3C Example 20: 'Complete example of rdf:ID reifying a property element' - nt_example! { - w3c_example_20, - r#" - - - blah - - - "#, - r#" "blah" . - . - . - . - "blah" . - "# + // rdf:li elements match any of the property element productions + nt_test! { + test004, + r#" + + 1 + 2 + + + + + + + "#, + r#"_:n0 . + _:n0 "1" . + . + _:bar . + . + "1" . + _:n0 "2"^^ . + _:n0 _:res . + _:n0 . + _:n0 _:res2 . + . + _:bar . + . + _:res2 . + _:res2 "foobar" . + "# + } + + // containers match the typed node production + nt_test! { + test006, + r##" + + + barfoo + + + + "##, + r#" . + "3" . + "foobar" . + . + "2" . + "foobar" . + "barfoo" . + _:bag . + "# + } + + // rdf:li processing within each element is independent + nt_test! { + test007, + r##" + + + + 1 + 2 + + + 2 + + + "##, + r#"_:n0 _:n1 . + _:n1 "1" . + _:n1 "2" . + _:n0 "2" . + "# + } + + // rdf:li processing is per element, not per resource. + nt_test! { + test008, + r##" + + 1 + + + 1-again + + + "##, + r#" "1" . + "1-again" . + "# + } } // Check that nested `rdf:li` keeps independent counters for nested elements. - nt_example! { + nt_test! { nested_li, r#" @@ -1250,7 +1331,7 @@ mod test { } // Check that an empty node is used as a leaf. - nt_example! { + nt_test! { empty_node, r#" Date: Wed, 22 May 2019 23:47:26 -0700 Subject: [PATCH 19/50] Fix bug with `rdf:li` in XML parser --- sophia/src/parser/xml.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index 86dd90e8..8bc405b1 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -248,7 +248,7 @@ impl Scope { fn current_li(&self) -> Result> { if let Some(ns) = self.ns.get("rdf") { let mut f = self.factory.borrow_mut(); - ns.get(f.get_term_data(&format!("_{}", self.li.load(Ordering::Relaxed)))) + ns.get(f.get_term_data(&format!("_{}", self.li.load(Ordering::Relaxed) - 1))) } else { panic!("undeclared `rdf` prefix !") } @@ -1258,7 +1258,7 @@ mod test { "2" . "foobar" . "barfoo" . - _:bag . + _:n0 . "# } From 524dfc7b3e716009875ffa7ff751b96eb61801c8 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Thu, 23 May 2019 00:22:41 -0700 Subject: [PATCH 20/50] Improve RDF tests handling in `::parser::xml` module --- sophia/src/parser/xml.rs | 279 ++++++++++++++++----------------------- 1 file changed, 116 insertions(+), 163 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index 8bc405b1..5d2eaeae 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -827,32 +827,103 @@ mod test { } } + macro_rules! assert_graph_eq { + ($l:ident, $r:ident) => { + assert_eq!( + $l.len(), + $r.len(), + "unexpected number of triples: {:#?}", + $l + ); + for t in $r.triples().map(Result::unwrap) { + assert!( + $l.contains(t.s(), t.p(), t.o()).expect(".contains failed"), + "missing triple: ({:?} {:?} {:?}) in {:#?}", + t.s(), + t.p(), + t.o(), + $l + ); + } + }; + } + + macro_rules! rdf_test { + ($suite:ident / $case:ident where $($l:pat => $r:literal),*) => { + #[test] + fn $case() { + let path = std::path::PathBuf::from("..") + .join("rdf-tests") + .join("rdf-xml") + .join(stringify!($suite).replace('_', "-")) + .join(stringify!($case).replace('_', "-")); + + let ntfile = std::fs::File::open(path.with_extension("nt")).unwrap(); + let xmlfile = std::fs::File::open(path.with_extension("rdf")).unwrap(); + + let mut xml = TestGraph::new(); + $crate::parser::xml::Config::default() + .parse_read(xmlfile) + .in_graph(&mut xml) + .expect("failed parsing XML file"); + + let mut nt = TestGraph::new(); + $crate::parser::nt::Config::default() + .parse_read(ntfile) + .in_graph(&mut nt) + .expect("failed parsing N-Triples file"); + + use std::rc::Rc; + use crate::term::factory::TermFactory; + use crate::graph::MutableGraph; + + fn relabel(factory: &mut RcTermFactory, t: Term>) -> Term> { + if let Term::BNode(bnode) = t { + match bnode.as_ref() { + $($l => factory.bnode($r).unwrap(),)* + other => factory.bnode(other).unwrap(), + } + } else { + t + } + } + + let mut iso = TestGraph::new(); + let mut factory = RcTermFactory::default(); + for t in nt.triples().map(Result::unwrap) { + iso.insert( + &relabel(&mut factory, t.s().clone()), + &relabel(&mut factory, t.p().clone()), + &relabel(&mut factory, t.o().clone()), + ).unwrap(); + } + + assert_graph_eq!(xml, iso); + } + }; + + ($suite:ident / $case:ident) => { + rdf_test!($suite / $case where); + }; + } + macro_rules! nt_test { ($name:ident, $xml:literal, $nt:literal) => { #[test] fn $name() { - let mut g = TestGraph::new(); + let mut xml = TestGraph::new(); $crate::parser::xml::Config::default() .parse_str($xml) - .in_graph(&mut g) + .in_graph(&mut xml) .expect("failed parsing XML file"); - let mut nt = Vec::new(); - for triple in $crate::parser::nt::Config::default().parse_str($nt) { - nt.push(triple.expect("N-Triples iterator failed")); - } + let mut nt = TestGraph::new(); + $crate::parser::nt::Config::default() + .parse_str($nt) + .in_graph(&mut nt) + .expect("failed parsing N-Triples file"); - assert_eq!(g.len(), nt.len(), "unexpected number of triples: {:#?}", g); - for t in nt.into_iter() { - assert!( - g.contains(t.s(), t.p(), t.o()).expect(".contains failed"), - "missing triple: ({:?} {:?} {:?}) in {:#?}", - t.s(), - t.p(), - t.o(), - g - ); - } + assert_graph_eq!(xml, nt); } }; } @@ -1147,160 +1218,42 @@ mod test { } } - mod rdf_containers_syntax_vs_schema { + mod amp_in_url { use super::*; - // Simple container - nt_test! { - test001, - r#" - - 1 - 2 - - - "#, - r#"_:n0 . - _:n0 "1" . - _:n0 "2" . - "# - } + rdf_test!(amp_in_url / test001); + } - // rdf:li is unaffected by other rdf:_nnn properties. - nt_test! { - test002, - r#" - - _1 - 1 - _3 - 2 - - - "#, - r#"_:n0 . - _:n0 "_1" . - _:n0 "1" . - _:n0 "_3" . - _:n0 "2" . - "# - } + mod datatypes { + use super::*; - // rdf:li elements can exist in any description element - nt_test! { - test003, - r#" - - 1 - 2 - - - "#, - r#"_:n0 . - _:n0 "1" . - _:n0 "2" . - "# - } + rdf_test!(datatypes / test001); + rdf_test!(datatypes / test002); + } - // rdf:li elements match any of the property element productions - nt_test! { - test004, - r#" - - 1 - 2 - - - - - - - "#, - r#"_:n0 . - _:n0 "1" . - . - _:bar . - . - "1" . - _:n0 "2"^^ . - _:n0 _:res . - _:n0 . - _:n0 _:res2 . - . - _:bar . - . - _:res2 . - _:res2 "foobar" . - "# - } + mod rdf_charmod_literals { + use super::*; - // containers match the typed node production - nt_test! { - test006, - r##" - - - barfoo - - - - "##, - r#" . - "3" . - "foobar" . - . - "2" . - "foobar" . - "barfoo" . - _:n0 . - "# - } + rdf_test!(rdf_charmod_literals / test001 where "a" => "n0"); + } - // rdf:li processing within each element is independent - nt_test! { - test007, - r##" - - - - 1 - 2 - - - 2 - - - "##, - r#"_:n0 _:n1 . - _:n1 "1" . - _:n1 "2" . - _:n0 "2" . - "# - } + mod rdf_charmod_uris { + use super::*; - // rdf:li processing is per element, not per resource. - nt_test! { - test008, - r##" - - 1 - - - 1-again - - - "##, - r#" "1" . - "1-again" . - "# - } + rdf_test!(rdf_charmod_uris / test001); + rdf_test!(rdf_charmod_uris / test002); + } + + mod rdf_containers_syntax_vs_schema { + use super::*; + + rdf_test!(rdf_containers_syntax_vs_schema / test001 where "bag" => "n0"); + rdf_test!(rdf_containers_syntax_vs_schema / test002 where "bag" => "n0"); + rdf_test!(rdf_containers_syntax_vs_schema / test003 where "bar" => "n0"); + rdf_test!(rdf_containers_syntax_vs_schema / test004); + rdf_test!(rdf_containers_syntax_vs_schema / test006 where "bag" => "n0"); + rdf_test!(rdf_containers_syntax_vs_schema / test007 where "d1" => "n0", "d2" => "n1"); + rdf_test!(rdf_containers_syntax_vs_schema / test008); } // Check that nested `rdf:li` keeps independent counters for nested elements. From dbb6cec7c73bceaadddcb8c506c3f2979830ccdc Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Thu, 23 May 2019 12:24:07 -0700 Subject: [PATCH 21/50] Fix `predicate_end` not producing triple on missing top-level `RDF` element --- sophia/src/parser/xml.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index 5d2eaeae..6d2ecfad 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -611,7 +611,7 @@ where .expect("INVALID PREDICATE IRI"); // Get the literal value - if self.parents.len() > 2 { + if self.parents.len() > 1 { if let Some(text) = self.scope_mut().text.take() { let s = self.parents[self.parents.len() - 2].clone(); let o = self.scope_mut().new_literal(text).expect("FIXME"); @@ -1256,6 +1256,12 @@ mod test { rdf_test!(rdf_containers_syntax_vs_schema / test008); } + mod rdf_element_not_mandatory { + use super::*; + + rdf_test!(rdf_element_not_mandatory / test001 where "a" => "n0"); + } + // Check that nested `rdf:li` keeps independent counters for nested elements. nt_test! { nested_li, From b8de13c44df838b46b12f30a49bcd72407c9e1c0 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Thu, 23 May 2019 12:48:20 -0700 Subject: [PATCH 22/50] Expand `rdf:type` as URIs instead of literal in XML parser --- sophia/src/parser/xml.rs | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index 6d2ecfad..72a514e4 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -418,8 +418,14 @@ where } fn node_start(&mut self, e: &BytesStart) { - // Bail out if this the top level rdf:RDF - if e.name() == b"rdf:RDF" { + // Get node type from the XML attribute. + let ty = self + .scope() + .expand_attribute(std::str::from_utf8(e.name()).expect("INVALID UTF8")) + .expect("INVALID DATATYPE IRI REFERENCE"); + + // Bail out if an rdf:RDF element + if ty.matches(&rdf::RDF) { self.state.push(ParsingState::Node); self.parents.push(self.factory.borrow_mut().copy(&rdf::RDF)); return; @@ -454,6 +460,8 @@ where .bnode(&format!("o{}", v)) .expect("INVALID BNODE"), ); + } else if k.matches(&rdf::type_) { + properties.insert(k, self.scope().expand_iri(&v).expect("INVALID IRI")); } else if !k.matches(&xml::lang) { // Ignore xml:lang attributes properties.insert( @@ -474,12 +482,6 @@ where self.parents.push(s.clone()); // Add the type as a triple if it is not `rdf:Description` - let ty = self - .scope() - .expand_attribute( - std::str::from_utf8(e.name()).expect("INVALID DATATYPE IRI REFERENCE"), - ) - .expect("INVALID DATATYPE IRI REFERENCE"); if !ty.matches(&rdf::Description) { self.triples.push_back(Ok([ s.clone(), @@ -1262,6 +1264,22 @@ mod test { rdf_test!(rdf_element_not_mandatory / test001 where "a" => "n0"); } + mod rdf_ns_prefix_confusion { + use super::*; + + rdf_test!(rdf_ns_prefix_confusion / test0001); + rdf_test!(rdf_ns_prefix_confusion / test0003); + rdf_test!(rdf_ns_prefix_confusion / test0004); + rdf_test!(rdf_ns_prefix_confusion / test0005 where "genid" => "n0"); + rdf_test!(rdf_ns_prefix_confusion / test0006); + rdf_test!(rdf_ns_prefix_confusion / test0009); + rdf_test!(rdf_ns_prefix_confusion / test0010); + rdf_test!(rdf_ns_prefix_confusion / test0011); + rdf_test!(rdf_ns_prefix_confusion / test0012); + rdf_test!(rdf_ns_prefix_confusion / test0013); + rdf_test!(rdf_ns_prefix_confusion / test0014); + } + // Check that nested `rdf:li` keeps independent counters for nested elements. nt_test! { nested_li, From 642c527b97b2903c7c875b666b1050d52a78502c Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Thu, 23 May 2019 13:11:47 -0700 Subject: [PATCH 23/50] Add method to `parser::xml::Config` to parse using a base URI --- sophia/src/parser/xml.rs | 44 +++++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index 72a514e4..02624b48 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -13,6 +13,7 @@ use quick_xml::events::BytesEnd; use quick_xml::events::BytesStart; use quick_xml::events::BytesText; use quick_xml::events::Event; +use quick_xml::Reader; use crate::error::*; use crate::ns::rdf; @@ -29,7 +30,17 @@ use crate::term::Term; // --- #[derive(Clone, Debug, Default)] -pub struct Config; +pub struct Config { + base: Option>>, +} + +impl Config { + fn with_base(base: &str) -> Result { + Ok(Self { + base: Some(Namespace::new(Rc::from(base))?), + }) + } +} impl Config { #[inline] @@ -37,7 +48,11 @@ impl Config { &self, bufread: B, ) -> impl Iterator>; 3]>> + 'a { - XmlParser::<_, RcTermFactory>::new(quick_xml::Reader::from_reader(bufread)) + type Parser = XmlParser; + match &self.base { + Some(base) => Parser::with_base(Reader::from_reader(bufread), base.clone()), + None => Parser::new(Reader::from_reader(bufread)), + } } #[inline] @@ -53,7 +68,11 @@ impl Config { &self, txt: &'a str, ) -> impl Iterator>; 3]>> + 'a { - XmlParser::<_, RcTermFactory>::new(quick_xml::Reader::from_str(txt)) + type Parser = XmlParser; + match &self.base { + Some(base) => Parser::with_base(Reader::from_str(txt), base.clone()), + None => Parser::new(Reader::from_str(txt)), + } } } @@ -265,7 +284,7 @@ impl Default for Scope { struct XmlParser { /// The underlying XML reader. - reader: quick_xml::Reader, + reader: Reader, /// The stack of scoped data (for nested declaration). scopes: Vec>, @@ -390,7 +409,7 @@ where // --- - fn new(reader: quick_xml::Reader) -> Self { + fn new(reader: Reader) -> Self { let factory: Rc> = Default::default(); Self { reader, @@ -403,6 +422,13 @@ where } } + fn with_base(reader: Reader, base: Namespace) -> Self { + let mut parser = Self::new(reader); + let mut scope = parser.scope_mut(); + scope.base = Some(base); + parser + } + // --- fn element_start(&mut self, e: &BytesStart) { @@ -864,7 +890,12 @@ mod test { let xmlfile = std::fs::File::open(path.with_extension("rdf")).unwrap(); let mut xml = TestGraph::new(); - $crate::parser::xml::Config::default() + $crate::parser::xml::Config::with_base(&format!( + "http://www.w3.org/2013/RDFXMLTests/{}/{}.rdf", + stringify!($suite).replace('_', "-"), + stringify!($case).replace('_', "-"), + )) + .unwrap() .parse_read(xmlfile) .in_graph(&mut xml) .expect("failed parsing XML file"); @@ -903,7 +934,6 @@ mod test { assert_graph_eq!(xml, iso); } }; - ($suite:ident / $case:ident) => { rdf_test!($suite / $case where); }; From 22ebf809b808d5522d458cff7261a37a9ba8739f Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Thu, 23 May 2019 15:35:20 -0700 Subject: [PATCH 24/50] Add more W3C RDF/XML tests for XML parser and fix reification behaviour --- sophia/src/parser/xml.rs | 341 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 320 insertions(+), 21 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index 02624b48..9b4f8473 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -246,6 +246,11 @@ impl Scope { /// Create a new literal with the `rdf:type` and `xml:lang` in scope. fn new_literal(&self, text: String) -> Result> { + println!( + "creating new text: {} (scope lang is {:?})", + &text, + &self.lang.as_ref().map(|t| t.as_ref().to_owned()) + ); match (&self.datatype, &self.lang) { (Some(dt), _) => self.factory.borrow_mut().literal_dt(text, dt.clone()), (None, Some(l)) => self.factory.borrow_mut().literal_lang(text, l.clone()), @@ -488,15 +493,8 @@ where ); } else if k.matches(&rdf::type_) { properties.insert(k, self.scope().expand_iri(&v).expect("INVALID IRI")); - } else if !k.matches(&xml::lang) { - // Ignore xml:lang attributes - properties.insert( - k, - self.factory - .borrow_mut() - .literal_dt(v, xsd::string) - .expect("FIXME"), - ); + } else if !k.matches(&xml::lang) && a.key != b"xmlns" && !a.key.starts_with(b"xmlns:") { + properties.insert(k, self.scope().new_literal(v).expect("FIXME")); } } @@ -547,16 +545,27 @@ where } else if k.matches(&rdf::ID) { let v = a.unescape_and_decode_value(&self.reader).expect("FIXME"); return self.reification_start(e, self.scope().expand_id(&v).expect("FIXME")); + } else if k.matches(&rdf::resource) { + let v = a.unescape_and_decode_value(&self.reader).expect("FIXME"); + self.parents + .push(self.scope().expand_iri(&v).expect("FIXME")); + self.state.push(ParsingState::Node); + next_state = ParsingState::Predicate; + // self.reification_start(e, self.scope().expand_id(&v).expect("FIXME")); } else if k.matches(&rdf::parseType) { match a.value.as_ref() { b"Resource" => { self.parents.push(self.new_bnode()); + self.scope_mut().set_text(None); next_state = ParsingState::Resource; } b"Collection" => { next_state = ParsingState::Collection; } - b"Literal" => next_state = ParsingState::Literal, + b"Literal" => { + self.scope_mut().set_datatype(&rdf::XMLLiteral.value()); + next_state = ParsingState::Literal; + } other => panic!("invalid parseType: {:?}", other), } } @@ -612,6 +621,7 @@ where match self.state.pop().unwrap() { ParsingState::Node => self.predicate_end(e), ParsingState::Predicate => self.node_end(), + ParsingState::Literal => self.predicate_end(e), ParsingState::Resource => self.resource_end(e), ParsingState::CollectionItem => self.collection_item_end(), ParsingState::Collection => self.collection_end(e), @@ -741,6 +751,10 @@ where let mut object = Vec::with_capacity(1); let mut attributes = HashMap::new(); + let mut parse_type = None; + let mut reification = None; + + // Extract attributes for attr in e.attributes().with_checks(true) { let a = attr.expect("FIXME"); @@ -759,31 +773,73 @@ where .bnode(format!("o{}", v)) .expect("FIXME"), ); - } else if !k.matches(&xml::lang) && !a.key.starts_with(b"xmlns") { + } else if k.matches(&rdf::ID) { + reification = Some(self.scope().expand_id(&v).expect("FIXME")); + } else if k.matches(&rdf::parseType) { + match a.value.as_ref() { + b"Resource" => parse_type = Some(&b"Resource"[..]), + b"Literal" => parse_type = Some(&b"Literal"[..]), + other => panic!("invalid parseType: {:?}", other), + }; + } else if !k.matches(&xml::lang) && !a.key.starts_with(b"xmlns:") && a.key != b"xmlns" { attributes.insert(k, v); } } + // Make sure to create + if parse_type == Some(b"Resource") && object.is_empty() { + object.push(self.new_bnode()); + } else if parse_type == Some(b"Literal") { + let xmlliteral = self.factory.borrow_mut().copy(&rdf::XMLLiteral); + let mut scope = self.scope_mut(); + scope.datatype = Some(xmlliteral); + } + match object.len() { - 0 => { + 0 if !attributes.is_empty() => { let s = self.parents.last().unwrap().clone(); let o = self.new_bnode(); - self.triples.push_back(Ok([s, p, o.clone()])); + object.push(o.clone()); + self.triples.push_back(Ok([s, p.clone(), o.clone()])); for (prop, value) in attributes.into_iter() { let literal = self.scope().new_literal(value).expect("FIXME"); self.triples.push_back(Ok([o.clone(), prop, literal])); } } + 0 if attributes.is_empty() => { + let s = self.parents.last().unwrap().clone(); + let o = self.scope().new_literal(String::new()).expect("FIXME"); + object.push(o.clone()); + self.triples.push_back(Ok([s, p.clone(), o])); + } 1 => { - // Ignoring property attributes let s = self.parents.last().unwrap().clone(); - let o = object.pop().unwrap(); - self.triples.push_back(Ok([s, p, o])); + let o = object.last().unwrap().clone(); + self.triples.push_back(Ok([s, p.clone(), o])); } _ => { panic!("cannot have rdf:resource and rdf:nodeID at the same time"); } } + + if let Some(id) = reification { + // Types for the reification + let ty = self.factory.borrow_mut().copy(&rdf::type_); + let subject = self.factory.borrow_mut().copy(&rdf::subject); + let predicate = self.factory.borrow_mut().copy(&rdf::predicate); + let obj = self.factory.borrow_mut().copy(&rdf::object); + let stmt = self.factory.borrow_mut().copy(&rdf::Statement); + + // + let s = self.parents.last().unwrap().clone(); + let o = object.pop().unwrap(); + + // + self.triples.push_back(Ok([id.clone(), ty, stmt])); + self.triples.push_back(Ok([id.clone(), subject, s])); + self.triples.push_back(Ok([id.clone(), predicate, p])); + self.triples.push_back(Ok([id.clone(), obj, o])); + } } fn resource_empty(&mut self, e: &BytesStart) { @@ -877,7 +933,8 @@ mod test { } macro_rules! rdf_test { - ($suite:ident / $case:ident where $($l:pat => $r:literal),*) => { + ($(#[$attr:meta])* $suite:ident / $case:ident where $($l:pat => $r:literal),*) => { + $(#[$attr])* #[test] fn $case() { let path = std::path::PathBuf::from("..") @@ -934,8 +991,8 @@ mod test { assert_graph_eq!(xml, iso); } }; - ($suite:ident / $case:ident) => { - rdf_test!($suite / $case where); + ($(#[$attr:meta])* $suite:ident / $case:ident) => { + rdf_test!($(#[$attr])* $suite / $case where); }; } @@ -1273,7 +1330,11 @@ mod test { use super::*; rdf_test!(rdf_charmod_uris / test001); - rdf_test!(rdf_charmod_uris / test002); + rdf_test!( + #[ignore] + rdf_charmod_uris + / test002 + ); } mod rdf_containers_syntax_vs_schema { @@ -1282,7 +1343,11 @@ mod test { rdf_test!(rdf_containers_syntax_vs_schema / test001 where "bag" => "n0"); rdf_test!(rdf_containers_syntax_vs_schema / test002 where "bag" => "n0"); rdf_test!(rdf_containers_syntax_vs_schema / test003 where "bar" => "n0"); - rdf_test!(rdf_containers_syntax_vs_schema / test004); + rdf_test!( + #[ignore] + rdf_containers_syntax_vs_schema + / test004 + ); rdf_test!(rdf_containers_syntax_vs_schema / test006 where "bag" => "n0"); rdf_test!(rdf_containers_syntax_vs_schema / test007 where "d1" => "n0", "d2" => "n1"); rdf_test!(rdf_containers_syntax_vs_schema / test008); @@ -1310,6 +1375,240 @@ mod test { rdf_test!(rdf_ns_prefix_confusion / test0014); } + mod rdfms_duplicate_member_props { + use super::*; + + rdf_test!(rdfms_duplicate_member_props / test001); + } + + mod rdfms_empty_property_elements { + use super::*; + + rdf_test!(rdfms_empty_property_elements / test001); + rdf_test!(rdfms_empty_property_elements / test002); + rdf_test!(rdfms_empty_property_elements / test003); + rdf_test!(rdfms_empty_property_elements / test004 where "a1" => "n0"); + rdf_test!(rdfms_empty_property_elements / test005); + rdf_test!(rdfms_empty_property_elements / test006 where "a1" => "n0"); + rdf_test!(rdfms_empty_property_elements / test007); + rdf_test!(rdfms_empty_property_elements / test008); + rdf_test!(rdfms_empty_property_elements / test009); + rdf_test!(rdfms_empty_property_elements / test010 where "a1" => "n0"); + rdf_test!(rdfms_empty_property_elements / test011); + rdf_test!(#[ignore] rdfms_empty_property_elements / test012 where "a1" => "n0"); + rdf_test!( + #[ignore] + rdfms_empty_property_elements + / test013 + ); + rdf_test!(rdfms_empty_property_elements / test014 where "a1" => "n0"); + rdf_test!(#[ignore] rdfms_empty_property_elements / test015 where "a1" => "n0"); + rdf_test!(rdfms_empty_property_elements / test016); + rdf_test!(rdfms_empty_property_elements / test017); + } + + mod rdfms_identity_anon_resources { + use super::*; + + rdf_test!(rdfms_identity_anon_resources / test001 where "j0" => "n0"); + rdf_test!(rdfms_identity_anon_resources / test002 where "j0" => "n0"); + rdf_test!(rdfms_identity_anon_resources / test003 where "j0" => "n0"); + rdf_test!(rdfms_identity_anon_resources / test004 where "j0" => "n0"); + rdf_test!(rdfms_identity_anon_resources / test005 where "j0" => "n0"); + } + + mod rdfms_not_id_and_resource_attr { + use super::*; + + rdf_test!(#[ignore] rdfms_not_id_and_resource_attr / test001 where "j88090" => "n0", "j88091" => "n1"); + rdf_test!(#[ignore] rdfms_not_id_and_resource_attr / test002 where "j88093" => "n0"); + rdf_test!(rdfms_not_id_and_resource_attr / test004 where "j88101" => "n0"); + rdf_test!(#[ignore] rdfms_not_id_and_resource_attr / test005 where "j88106" => "n0"); + } + + mod rdfms_para196 { + use super::*; + + rdf_test!( + #[ignore] + rdfms_not_id_and_resource_attr + / test001 + ); + } + + mod rdfms_rdf_names_use { + use super::*; + + rdf_test!(rdfms_rdf_names_use / test_001); + rdf_test!(rdfms_rdf_names_use / test_002); + rdf_test!(rdfms_rdf_names_use / test_003); + rdf_test!(rdfms_rdf_names_use / test_004); + rdf_test!(rdfms_rdf_names_use / test_005); + rdf_test!(rdfms_rdf_names_use / test_006); + rdf_test!(rdfms_rdf_names_use / test_007); + rdf_test!(rdfms_rdf_names_use / test_008); + rdf_test!(rdfms_rdf_names_use / test_009); + rdf_test!(rdfms_rdf_names_use / test_010); + rdf_test!(rdfms_rdf_names_use / test_011); + rdf_test!(rdfms_rdf_names_use / test_012); + rdf_test!(rdfms_rdf_names_use / test_013); + rdf_test!(rdfms_rdf_names_use / test_014); + rdf_test!(rdfms_rdf_names_use / test_015); + rdf_test!(rdfms_rdf_names_use / test_016); + rdf_test!(rdfms_rdf_names_use / test_017); + rdf_test!(rdfms_rdf_names_use / test_018); + rdf_test!(rdfms_rdf_names_use / test_019); + rdf_test!(rdfms_rdf_names_use / test_020); + rdf_test!(rdfms_rdf_names_use / test_021); + rdf_test!(rdfms_rdf_names_use / test_022); + rdf_test!(rdfms_rdf_names_use / test_023); + rdf_test!(rdfms_rdf_names_use / test_024); + rdf_test!(rdfms_rdf_names_use / test_025); + rdf_test!(rdfms_rdf_names_use / test_026); + rdf_test!(rdfms_rdf_names_use / test_027); + rdf_test!(rdfms_rdf_names_use / test_028); + rdf_test!(rdfms_rdf_names_use / test_029); + rdf_test!(rdfms_rdf_names_use / test_030); + rdf_test!(rdfms_rdf_names_use / test_031); + rdf_test!(rdfms_rdf_names_use / test_032); + rdf_test!(rdfms_rdf_names_use / test_033); + rdf_test!(rdfms_rdf_names_use / test_034); + rdf_test!(rdfms_rdf_names_use / test_035); + rdf_test!(rdfms_rdf_names_use / test_036); + rdf_test!(rdfms_rdf_names_use / test_037); + } + + mod rdfms_reification_required { + use super::*; + + rdf_test!(rdfms_reification_required / test001); + } + + mod rdfms_seq_representation { + use super::*; + + rdf_test!(rdfms_seq_representation / test001 + where "a0" => "n0", "a1" => "n1", "a2" => "n2"); + } + + mod rdfms_syntax_incomplete { + use super::*; + + rdf_test!(rdfms_syntax_incomplete / test001 where "j0" => "oa"); + rdf_test!(rdfms_syntax_incomplete / test002 where "j0A" => "oa", "j2" => "n0", "j1B" => "ob"); + rdf_test!(rdfms_syntax_incomplete / test003 where "j0A" => "oa"); + rdf_test!(rdfms_syntax_incomplete / test004 where "j1A" => "oa", "j2" => "n1", "j0" => "n0"); + } + + mod rdfms_uri_substructure { + use super::*; + + rdf_test!(rdfms_uri_substructure / test001 where "a" => "n0"); + } + + mod rdfms_xml_literal_namespaces { + use super::*; + + rdf_test!( + #[ignore] + rdfms_xml_literal_namespaces + / test001 + ); + rdf_test!( + #[ignore] + rdfms_xml_literal_namespaces + / test002 + ); + } + + mod rdfms_xmllang { + use super::*; + + rdf_test!(rdfms_xmllang / test001); + rdf_test!(rdfms_xmllang / test002); + rdf_test!(rdfms_xmllang / test003); + rdf_test!(rdfms_xmllang / test004); + rdf_test!(rdfms_xmllang / test005); + rdf_test!(rdfms_xmllang / test006); + } + + mod rdfs_domain_and_range { + use super::*; + + rdf_test!(rdfs_domain_and_range / test001); + rdf_test!(rdfs_domain_and_range / test002); + } + + mod unrecognised_xml_attributes { + use super::*; + + rdf_test!(unrecognised_xml_attributes / test001); + rdf_test!( + #[ignore] + unrecognised_xml_attributes + / test002 + ); + } + + mod xml_canon { + use super::*; + + rdf_test!( + #[ignore] + xml_canon + / test001 + ); + } + + mod xmlbase { + use super::*; + + rdf_test!(xmlbase / test001); + rdf_test!(#[ignore] xmlbase / test002 where "j0" => "n0"); + rdf_test!( + #[ignore] + xmlbase + / test003 + ); + rdf_test!(xmlbase / test004 where "j0" => "n0"); + rdf_test!( + #[ignore] + xmlbase + / test006 + ); + rdf_test!( + #[ignore] + xmlbase + / test007 + ); + rdf_test!(xmlbase / test008); + rdf_test!( + #[ignore] + xmlbase + / test009 + ); + rdf_test!( + #[ignore] + xmlbase + / test010 + ); + rdf_test!( + #[ignore] + xmlbase + / test011 + ); + rdf_test!( + #[ignore] + xmlbase + / test013 + ); + rdf_test!( + #[ignore] + xmlbase + / test014 + ); + } + // Check that nested `rdf:li` keeps independent counters for nested elements. nt_test! { nested_li, From 3b84ceb6b8c2476520895df89a4eaddaa92e76ce Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Thu, 23 May 2019 17:49:30 -0700 Subject: [PATCH 25/50] Fix reification of triples in XML parser --- sophia/src/parser/xml.rs | 137 ++++++++++++++++++++++----------------- 1 file changed, 78 insertions(+), 59 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index 9b4f8473..89dc2322 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -85,6 +85,8 @@ enum ParsingState { Resource, Literal, // NB: not supported by quick-xml right now + Res, + Collection, CollectionItem, } @@ -246,11 +248,6 @@ impl Scope { /// Create a new literal with the `rdf:type` and `xml:lang` in scope. fn new_literal(&self, text: String) -> Result> { - println!( - "creating new text: {} (scope lang is {:?})", - &text, - &self.lang.as_ref().map(|t| t.as_ref().to_owned()) - ); match (&self.datatype, &self.lang) { (Some(dt), _) => self.factory.borrow_mut().literal_dt(text, dt.clone()), (None, Some(l)) => self.factory.borrow_mut().literal_lang(text, l.clone()), @@ -326,6 +323,15 @@ where self.scopes.last_mut().unwrap() } + fn parent_scope(&self) -> &Scope { + &self.scopes[self.scopes.len() - 2] + } + + fn parent_scope_mut(&mut self) -> &mut Scope { + let l = self.scopes.len(); + &mut self.scopes[l - 2] + } + // Add a local scope (`lang`, `namespaces`, but not `parents`) fn enter_scope(&mut self, e: &BytesStart) -> Result<()> { // We are entering a new elements: text is not relevant anymore. @@ -394,8 +400,7 @@ where fn predicate_iri_start(&self, name: &str) -> Result> { let p = self.scope().expand_attribute(name)?; if p.matches(&rdf::li) { - let parent_scope = self.scopes.get(self.scopes.len() - 2).unwrap(); - parent_scope.new_li() + self.parent_scope().new_li() } else { Ok(p) } @@ -405,8 +410,7 @@ where fn predicate_iri_end(&self, name: &str) -> Result> { let p = self.scope().expand_attribute(name)?; if p.matches(&rdf::li) { - let parent_scope = self.scopes.get(self.scopes.len() - 2).unwrap(); - parent_scope.current_li() + self.parent_scope().current_li() } else { Ok(p) } @@ -444,6 +448,7 @@ where ParsingState::Resource => self.predicate_start(e), ParsingState::Collection => self.collection_start(e), ParsingState::CollectionItem => self.collection_item_start(e), + ParsingState::Res => panic!("expecting text, not new element"), _ => unimplemented!(), }; } @@ -544,12 +549,14 @@ where self.scope_mut().set_datatype(&v); } else if k.matches(&rdf::ID) { let v = a.unescape_and_decode_value(&self.reader).expect("FIXME"); - return self.reification_start(e, self.scope().expand_id(&v).expect("FIXME")); + self.parents + .push(self.scope().expand_id(&v).expect("FIXME")); + next_state = ParsingState::Res; } else if k.matches(&rdf::resource) { let v = a.unescape_and_decode_value(&self.reader).expect("FIXME"); self.parents .push(self.scope().expand_iri(&v).expect("FIXME")); - self.state.push(ParsingState::Node); + self.state.pop(); next_state = ParsingState::Predicate; // self.reification_start(e, self.scope().expand_id(&v).expect("FIXME")); } else if k.matches(&rdf::parseType) { @@ -573,34 +580,6 @@ where self.state.push(next_state); } - fn reification_start(&mut self, e: &BytesStart, id: Term) { - // Get the subject and predicate of the triple - let p = self.parents.pop().unwrap(); - let s = self.parents.last().unwrap().clone(); - - // Get the object of the triple - let txt = self.reader.read_text(e.name(), &mut Vec::new()).unwrap(); - let o = self.scope().new_literal(txt).unwrap(); - - // Add the actual triple - self.triples - .push_back(Ok([s.clone(), p.clone(), o.clone()])); - - // Add the reified triples - let ty = self.factory.borrow_mut().copy(&rdf::type_); - let subject = self.factory.borrow_mut().copy(&rdf::subject); - let predicate = self.factory.borrow_mut().copy(&rdf::predicate); - let object = self.factory.borrow_mut().copy(&rdf::object); - self.triples.push_back(Ok([ - id.clone(), - ty, - self.factory.borrow_mut().copy(&rdf::Statement), - ])); - self.triples.push_back(Ok([id.clone(), subject, s])); - self.triples.push_back(Ok([id.clone(), predicate, p])); - self.triples.push_back(Ok([id.clone(), object, o])); - } - fn collection_start(&mut self, e: &BytesStart) { self.state.push(ParsingState::CollectionItem); self.collection_item_start(e); @@ -611,8 +590,7 @@ where self.node_start(e); let new_iri = self.parents.last().unwrap().clone(); // Add the iri of the node to the parent scope (not current!) - let l = self.scopes.len(); - self.scopes.get_mut(l - 2).unwrap().collection.push(new_iri); + self.parent_scope_mut().collection.push(new_iri); } // --- @@ -625,6 +603,7 @@ where ParsingState::Resource => self.resource_end(e), ParsingState::CollectionItem => self.collection_item_end(), ParsingState::Collection => self.collection_end(e), + ParsingState::Res => self.res_end(), _ => unimplemented!(), } self.leave_scope(); @@ -714,6 +693,32 @@ where self.predicate_end(e); } + fn res_end(&mut self) { + // Types for the reification + let mut factory = self.factory.borrow_mut(); + let ty = factory.copy(&rdf::type_); + let subject = factory.copy(&rdf::subject); + let predicate = factory.copy(&rdf::predicate); + let object = factory.copy(&rdf::object); + let stmt = factory.copy(&rdf::Statement); + drop(factory); + + // Subject, predicate, object and ID of the reified triple + let id = self.parents.pop().unwrap(); + let p = self.parents.pop().unwrap(); + let s = self.parents.last().unwrap().clone(); + let txt = self.scope_mut().text.take().unwrap_or_default(); + let o = self.scope().new_literal(txt).expect("FIXME"); + + // Add all triples + self.triples + .push_back(Ok([s.clone(), p.clone(), o.clone()])); + self.triples.push_back(Ok([id.clone(), ty, stmt])); + self.triples.push_back(Ok([id.clone(), subject, s])); + self.triples.push_back(Ok([id.clone(), predicate, p])); + self.triples.push_back(Ok([id.clone(), object, o])); + } + // --- Text elements ---------------------------------------------------- fn element_text(&mut self, e: &BytesText) { @@ -733,6 +738,7 @@ where ParsingState::Resource => self.resource_empty(e), ParsingState::Collection => self.collection_item_empty(e), ParsingState::CollectionItem => unreachable!(), + ParsingState::Res => panic!("expected end element, not empty"), _ => (), } self.leave_scope(); @@ -786,7 +792,8 @@ where } } - // Make sure to create + // Make sure to create the right kind of object if `parseType` was + // explicitly given in the source document. if parse_type == Some(b"Resource") && object.is_empty() { object.push(self.new_bnode()); } else if parse_type == Some(b"Literal") { @@ -795,6 +802,7 @@ where scope.datatype = Some(xmlliteral); } + // Process the object of the triple. match object.len() { 0 if !attributes.is_empty() => { let s = self.parents.last().unwrap().clone(); @@ -822,19 +830,22 @@ where } } + // Reify the triple if needed. if let Some(id) = reification { // Types for the reification - let ty = self.factory.borrow_mut().copy(&rdf::type_); - let subject = self.factory.borrow_mut().copy(&rdf::subject); - let predicate = self.factory.borrow_mut().copy(&rdf::predicate); - let obj = self.factory.borrow_mut().copy(&rdf::object); - let stmt = self.factory.borrow_mut().copy(&rdf::Statement); - - // + let mut factory = self.factory.borrow_mut(); + let ty = factory.copy(&rdf::type_); + let subject = factory.copy(&rdf::subject); + let predicate = factory.copy(&rdf::predicate); + let obj = factory.copy(&rdf::object); + let stmt = factory.copy(&rdf::Statement); + drop(factory); + + // Subject and object of the reification let s = self.parents.last().unwrap().clone(); let o = object.pop().unwrap(); - // + // Add all triples self.triples.push_back(Ok([id.clone(), ty, stmt])); self.triples.push_back(Ok([id.clone(), subject, s])); self.triples.push_back(Ok([id.clone(), predicate, p])); @@ -1340,16 +1351,24 @@ mod test { mod rdf_containers_syntax_vs_schema { use super::*; - rdf_test!(rdf_containers_syntax_vs_schema / test001 where "bag" => "n0"); - rdf_test!(rdf_containers_syntax_vs_schema / test002 where "bag" => "n0"); - rdf_test!(rdf_containers_syntax_vs_schema / test003 where "bar" => "n0"); - rdf_test!( - #[ignore] - rdf_containers_syntax_vs_schema - / test004 + rdf_test!(rdf_containers_syntax_vs_schema / test001 + where "bag" => "n0" + ); + rdf_test!(rdf_containers_syntax_vs_schema / test002 + where "bag" => "n0" + ); + rdf_test!(rdf_containers_syntax_vs_schema / test003 + where "bar" => "n0" + ); + rdf_test!(rdf_containers_syntax_vs_schema / test004 + where "res2" => "n2", "bar" => "n0", "res" => "n1" + ); + rdf_test!(rdf_containers_syntax_vs_schema / test006 + where "bag" => "n0" + ); + rdf_test!(rdf_containers_syntax_vs_schema / test007 + where "d1" => "n0", "d2" => "n1" ); - rdf_test!(rdf_containers_syntax_vs_schema / test006 where "bag" => "n0"); - rdf_test!(rdf_containers_syntax_vs_schema / test007 where "d1" => "n0", "d2" => "n1"); rdf_test!(rdf_containers_syntax_vs_schema / test008); } From e7fb7dc2e9d04ee97327c644186ad62d303bca43 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Thu, 23 May 2019 18:15:04 -0700 Subject: [PATCH 26/50] Use `url` crate to handle `xml:base` IRI expansion in `::parser::xml` --- sophia/src/parser/xml.rs | 114 ++++++++++++++++++--------------------- 1 file changed, 53 insertions(+), 61 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index 89dc2322..b981e069 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -14,6 +14,7 @@ use quick_xml::events::BytesStart; use quick_xml::events::BytesText; use quick_xml::events::Event; use quick_xml::Reader; +use url::Url; use crate::error::*; use crate::ns::rdf; @@ -24,6 +25,7 @@ use crate::term::factory::RcTermFactory; use crate::term::factory::TermFactory; use crate::term::iri_rfc3987::is_absolute_iri; use crate::term::iri_rfc3987::is_relative_iri; +use crate::term::iri_rfc3987::is_valid_iri; use crate::term::matcher::TermMatcher; use crate::term::Term; @@ -31,14 +33,15 @@ use crate::term::Term; #[derive(Clone, Debug, Default)] pub struct Config { - base: Option>>, + base: Option, } impl Config { fn with_base(base: &str) -> Result { - Ok(Self { - base: Some(Namespace::new(Rc::from(base))?), - }) + match Url::parse(base) { + Ok(url) => Ok(Self { base: Some(url) }), + Err(_) => Err(Error::from_kind(ErrorKind::InvalidIri(base.to_owned()))), + } } } @@ -102,7 +105,7 @@ pub struct Scope { default: Option>, /// The base IRI namespace to expand `rdf:ID`, `rdf:resource` and `rdf:about`. - base: Option>, + base: Option, /// The term factory used to create new terms. factory: Rc>, @@ -194,9 +197,12 @@ impl Scope { /// Set the base IRI prefix. fn set_base(&mut self, base: &str) -> Result<()> { - let mut f = self.factory.borrow_mut(); - self.base = Some(Namespace::new(f.get_term_data(base))?); - Ok(()) + if let Ok(url) = Url::parse(base) { + self.base = Some(url); + Ok(()) + } else { + Err(Error::from_kind(ErrorKind::InvalidIri(String::from(base)))) + } } fn set_datatype(&mut self, datatype: &str) -> Result<()> { @@ -228,8 +234,19 @@ impl Scope { if is_absolute_iri(iri) { self.factory.borrow_mut().iri(iri) } else if is_relative_iri(iri) { - if let Some(ns) = &self.base { - ns.get(self.factory.borrow_mut().get_term_data(iri)) + if let Some(url) = &self.base { + + match url.join(iri) { + Ok(iri) => self.factory.borrow_mut().iri(iri), + Err(e) => Err(Error::from_kind(ErrorKind::InvalidIri(String::from(iri)))), + } + // self.factory.borrow_mut().iri( + // url.join(iri) + // ); + // + // + // + // ns.get(self.factory.borrow_mut().get_term_data(iri)) } else { panic!("NO BASE IRI") } @@ -431,7 +448,7 @@ where } } - fn with_base(reader: Reader, base: Namespace) -> Self { + fn with_base(reader: Reader, base: Url) -> Self { let mut parser = Self::new(reader); let mut scope = parser.scope_mut(); scope.base = Some(base); @@ -473,8 +490,8 @@ where for attr in e.attributes().with_checks(true) { let a = attr.expect("FIXME"); - // ignore xmlns attributes (processed in element_start) - if a.key.starts_with(b"xmlns:") || a.key == b"xmlns" { + // ignore xml attributes (processed in element_start) + if a.key.starts_with(b"xml") { continue; } @@ -498,7 +515,7 @@ where ); } else if k.matches(&rdf::type_) { properties.insert(k, self.scope().expand_iri(&v).expect("INVALID IRI")); - } else if !k.matches(&xml::lang) && a.key != b"xmlns" && !a.key.starts_with(b"xmlns:") { + } else { properties.insert(k, self.scope().new_literal(v).expect("FIXME")); } } @@ -540,6 +557,12 @@ where let mut next_state = ParsingState::Node; for attr in e.attributes().with_checks(true) { let a = attr.expect("FIXME"); + + // Ignore `xml` attributes + if a.key.starts_with(b"xml") { + continue + } + let k = self .scope() .expand_attribute(std::str::from_utf8(a.key).expect("FIXME")) @@ -764,6 +787,11 @@ where for attr in e.attributes().with_checks(true) { let a = attr.expect("FIXME"); + // ignore XML attributes (processed when entering scope) + if a.key.starts_with(b"xml") { + continue + } + // try to extract the annotation object let k = self .scope() @@ -787,7 +815,7 @@ where b"Literal" => parse_type = Some(&b"Literal"[..]), other => panic!("invalid parseType: {:?}", other), }; - } else if !k.matches(&xml::lang) && !a.key.starts_with(b"xmlns:") && a.key != b"xmlns" { + } else { attributes.insert(k, v); } } @@ -1562,11 +1590,7 @@ mod test { use super::*; rdf_test!(unrecognised_xml_attributes / test001); - rdf_test!( - #[ignore] - unrecognised_xml_attributes - / test002 - ); + rdf_test!(unrecognised_xml_attributes / test002); } mod xml_canon { @@ -1583,49 +1607,17 @@ mod test { use super::*; rdf_test!(xmlbase / test001); - rdf_test!(#[ignore] xmlbase / test002 where "j0" => "n0"); - rdf_test!( - #[ignore] - xmlbase - / test003 - ); + rdf_test!(xmlbase / test002 where "j0" => "n0"); + rdf_test!(xmlbase / test003); rdf_test!(xmlbase / test004 where "j0" => "n0"); - rdf_test!( - #[ignore] - xmlbase - / test006 - ); - rdf_test!( - #[ignore] - xmlbase - / test007 - ); + rdf_test!(xmlbase / test006); + rdf_test!(xmlbase / test007); rdf_test!(xmlbase / test008); - rdf_test!( - #[ignore] - xmlbase - / test009 - ); - rdf_test!( - #[ignore] - xmlbase - / test010 - ); - rdf_test!( - #[ignore] - xmlbase - / test011 - ); - rdf_test!( - #[ignore] - xmlbase - / test013 - ); - rdf_test!( - #[ignore] - xmlbase - / test014 - ); + rdf_test!(xmlbase / test009); + rdf_test!(xmlbase / test010); + rdf_test!(xmlbase / test011); + rdf_test!(xmlbase / test013); + rdf_test!(xmlbase / test014); } // Check that nested `rdf:li` keeps independent counters for nested elements. From be0b530ff885602208fb98db7474729ab5caa870 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Thu, 23 May 2019 18:41:34 -0700 Subject: [PATCH 27/50] Fix handling of attributes in reified XML predicates --- sophia/src/parser/xml.rs | 68 ++++++++++++++-------------------------- 1 file changed, 23 insertions(+), 45 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index b981e069..12c80fd1 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -717,6 +717,13 @@ where } fn res_end(&mut self) { + // Subject, predicate, object and ID of the reified triple + let id = self.parents.pop().unwrap(); + let p = self.parents.pop().unwrap(); + let s = self.parents.last().unwrap().clone(); + let txt = self.scope_mut().text.take().unwrap_or_default(); + let o = self.scope().new_literal(txt).expect("FIXME"); + // Types for the reification let mut factory = self.factory.borrow_mut(); let ty = factory.copy(&rdf::type_); @@ -724,14 +731,6 @@ where let predicate = factory.copy(&rdf::predicate); let object = factory.copy(&rdf::object); let stmt = factory.copy(&rdf::Statement); - drop(factory); - - // Subject, predicate, object and ID of the reified triple - let id = self.parents.pop().unwrap(); - let p = self.parents.pop().unwrap(); - let s = self.parents.last().unwrap().clone(); - let txt = self.scope_mut().text.take().unwrap_or_default(); - let o = self.scope().new_literal(txt).expect("FIXME"); // Add all triples self.triples @@ -830,32 +829,20 @@ where scope.datatype = Some(xmlliteral); } - // Process the object of the triple. - match object.len() { - 0 if !attributes.is_empty() => { - let s = self.parents.last().unwrap().clone(); - let o = self.new_bnode(); - object.push(o.clone()); - self.triples.push_back(Ok([s, p.clone(), o.clone()])); - for (prop, value) in attributes.into_iter() { - let literal = self.scope().new_literal(value).expect("FIXME"); - self.triples.push_back(Ok([o.clone(), prop, literal])); - } - } - 0 if attributes.is_empty() => { - let s = self.parents.last().unwrap().clone(); - let o = self.scope().new_literal(String::new()).expect("FIXME"); - object.push(o.clone()); - self.triples.push_back(Ok([s, p.clone(), o])); - } - 1 => { - let s = self.parents.last().unwrap().clone(); - let o = object.last().unwrap().clone(); - self.triples.push_back(Ok([s, p.clone(), o])); - } - _ => { - panic!("cannot have rdf:resource and rdf:nodeID at the same time"); - } + // Extract subjet and object of the triple + let s = self.parents.last().unwrap().clone(); + let o = match object.len() { + 0 if !attributes.is_empty() => self.new_bnode(), + 1 => object.last().unwrap().clone(), + 0 if attributes.is_empty() => self.scope().new_literal(String::new()).expect("FIXME"), + _ => panic!("cannot have rdf:resource and rdf:nodeID at the same time"), + }; + + // Add the triple and all subsequent triples as attributes + self.triples.push_back(Ok([s.clone(), p.clone(), o.clone()])); + for (prop, value) in attributes.into_iter() { + let literal = self.scope().new_literal(value).expect("FIXME"); + self.triples.push_back(Ok([o.clone(), prop, literal])); } // Reify the triple if needed. @@ -867,11 +854,6 @@ where let predicate = factory.copy(&rdf::predicate); let obj = factory.copy(&rdf::object); let stmt = factory.copy(&rdf::Statement); - drop(factory); - - // Subject and object of the reification - let s = self.parents.last().unwrap().clone(); - let o = object.pop().unwrap(); // Add all triples self.triples.push_back(Ok([id.clone(), ty, stmt])); @@ -1443,11 +1425,7 @@ mod test { rdf_test!(rdfms_empty_property_elements / test010 where "a1" => "n0"); rdf_test!(rdfms_empty_property_elements / test011); rdf_test!(#[ignore] rdfms_empty_property_elements / test012 where "a1" => "n0"); - rdf_test!( - #[ignore] - rdfms_empty_property_elements - / test013 - ); + rdf_test!(rdfms_empty_property_elements / test013); rdf_test!(rdfms_empty_property_elements / test014 where "a1" => "n0"); rdf_test!(#[ignore] rdfms_empty_property_elements / test015 where "a1" => "n0"); rdf_test!(rdfms_empty_property_elements / test016); @@ -1470,7 +1448,7 @@ mod test { rdf_test!(#[ignore] rdfms_not_id_and_resource_attr / test001 where "j88090" => "n0", "j88091" => "n1"); rdf_test!(#[ignore] rdfms_not_id_and_resource_attr / test002 where "j88093" => "n0"); rdf_test!(rdfms_not_id_and_resource_attr / test004 where "j88101" => "n0"); - rdf_test!(#[ignore] rdfms_not_id_and_resource_attr / test005 where "j88106" => "n0"); + rdf_test!(rdfms_not_id_and_resource_attr / test005 where "j88106" => "n0"); } mod rdfms_para196 { From 03da20535435ecf0d672db7b8660730e7add27a4 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Thu, 23 May 2019 19:03:42 -0700 Subject: [PATCH 28/50] Fix more RDF/XML behaviour tests in `::parser::xml` --- sophia/src/parser/xml.rs | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index 12c80fd1..c8589282 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -18,7 +18,6 @@ use url::Url; use crate::error::*; use crate::ns::rdf; -use crate::ns::xml; use crate::ns::xsd; use crate::ns::Namespace; use crate::term::factory::RcTermFactory; @@ -466,7 +465,7 @@ where ParsingState::Collection => self.collection_start(e), ParsingState::CollectionItem => self.collection_item_start(e), ParsingState::Res => panic!("expecting text, not new element"), - _ => unimplemented!(), + ParsingState::Literal => unimplemented!("entering element as literal"), }; } @@ -554,6 +553,7 @@ where self.parents.push(p); // Extract attributes relevant to the RDF syntax + let mut attributes = HashMap::new(); let mut next_state = ParsingState::Node; for attr in e.attributes().with_checks(true) { let a = attr.expect("FIXME"); @@ -598,9 +598,22 @@ where } other => panic!("invalid parseType: {:?}", other), } + } else { + let v = a.unescape_and_decode_value(&self.reader).expect("FIXME"); + attributes.insert(k, self.scope().new_literal(v).expect("FIXME")); } } - self.state.push(next_state); + + if !attributes.is_empty() { + let o = self.new_bnode(); + self.parents.push(o.clone()); + std::mem::replace(self.state.last_mut().unwrap(), ParsingState::Resource); + for (k, v) in attributes.into_iter() { + self.triples.push_back(Ok([o.clone(), k, v])); + } + } else { + self.state.push(next_state); + } } fn collection_start(&mut self, e: &BytesStart) { @@ -627,7 +640,6 @@ where ParsingState::CollectionItem => self.collection_item_end(), ParsingState::Collection => self.collection_end(e), ParsingState::Res => self.res_end(), - _ => unimplemented!(), } self.leave_scope(); } @@ -665,6 +677,8 @@ where fn resource_end(&mut self, e: &BytesEnd) { // End of the implicit node element self.node_end(); + // Drop text, since it is not relevant in a Resource predicate. + self.scope_mut().text.take(); // End of the resource predicate self.predicate_end(e) } @@ -761,7 +775,7 @@ where ParsingState::Collection => self.collection_item_empty(e), ParsingState::CollectionItem => unreachable!(), ParsingState::Res => panic!("expected end element, not empty"), - _ => (), + ParsingState::Literal => unimplemented!("empty element as literal"), } self.leave_scope(); } @@ -1427,7 +1441,7 @@ mod test { rdf_test!(#[ignore] rdfms_empty_property_elements / test012 where "a1" => "n0"); rdf_test!(rdfms_empty_property_elements / test013); rdf_test!(rdfms_empty_property_elements / test014 where "a1" => "n0"); - rdf_test!(#[ignore] rdfms_empty_property_elements / test015 where "a1" => "n0"); + rdf_test!(rdfms_empty_property_elements / test015 where "a1" => "n0"); rdf_test!(rdfms_empty_property_elements / test016); rdf_test!(rdfms_empty_property_elements / test017); } @@ -1571,6 +1585,7 @@ mod test { rdf_test!(unrecognised_xml_attributes / test002); } + // FIXME(@althonos): requires `parseType=Literal` to work. mod xml_canon { use super::*; From f3846cda5d63f364422605d27362eb5470f0d345 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Thu, 23 May 2019 19:16:50 -0700 Subject: [PATCH 29/50] Fix test predicate XML element declaring annotation of implicit bnode --- sophia/src/parser/xml.rs | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index c8589282..ba26e54d 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -555,6 +555,7 @@ where // Extract attributes relevant to the RDF syntax let mut attributes = HashMap::new(); let mut next_state = ParsingState::Node; + let mut object = Vec::with_capacity(1); for attr in e.attributes().with_checks(true) { let a = attr.expect("FIXME"); @@ -572,20 +573,18 @@ where self.scope_mut().set_datatype(&v); } else if k.matches(&rdf::ID) { let v = a.unescape_and_decode_value(&self.reader).expect("FIXME"); - self.parents - .push(self.scope().expand_id(&v).expect("FIXME")); + object.push(self.scope().expand_id(&v).expect("FIXME")); next_state = ParsingState::Res; } else if k.matches(&rdf::resource) { let v = a.unescape_and_decode_value(&self.reader).expect("FIXME"); - self.parents - .push(self.scope().expand_iri(&v).expect("FIXME")); - self.state.pop(); + object.push(self.scope().expand_iri(&v).expect("FIXME")); + // self.state.pop(); next_state = ParsingState::Predicate; // self.reification_start(e, self.scope().expand_id(&v).expect("FIXME")); } else if k.matches(&rdf::parseType) { match a.value.as_ref() { b"Resource" => { - self.parents.push(self.new_bnode()); + object.push(self.new_bnode()); self.scope_mut().set_text(None); next_state = ParsingState::Resource; } @@ -601,19 +600,29 @@ where } else { let v = a.unescape_and_decode_value(&self.reader).expect("FIXME"); attributes.insert(k, self.scope().new_literal(v).expect("FIXME")); + next_state = ParsingState::Resource; } } - if !attributes.is_empty() { - let o = self.new_bnode(); + // Extract subjet and object of the triple + let s = self.parents.last().unwrap().clone(); + let o = match object.len() { + 0 if !attributes.is_empty() => Some(self.new_bnode()), + 0 if attributes.is_empty() => None, + 1 => Some(object.last().unwrap().clone()), + _ => panic!("cannot have rdf:resource, rdf::ID or rdf:nodeID at the same time"), + }; + + // Make the predicate a resource element if an objec tis present. + if let Some(o) = o { self.parents.push(o.clone()); std::mem::replace(self.state.last_mut().unwrap(), ParsingState::Resource); for (k, v) in attributes.into_iter() { self.triples.push_back(Ok([o.clone(), k, v])); } - } else { - self.state.push(next_state); } + + self.state.push(next_state); } fn collection_start(&mut self, e: &BytesStart) { @@ -1460,7 +1469,7 @@ mod test { use super::*; rdf_test!(#[ignore] rdfms_not_id_and_resource_attr / test001 where "j88090" => "n0", "j88091" => "n1"); - rdf_test!(#[ignore] rdfms_not_id_and_resource_attr / test002 where "j88093" => "n0"); + rdf_test!(rdfms_not_id_and_resource_attr / test002 where "j88093" => "n0"); rdf_test!(rdfms_not_id_and_resource_attr / test004 where "j88101" => "n0"); rdf_test!(rdfms_not_id_and_resource_attr / test005 where "j88106" => "n0"); } From 6aced015ac531b51b443c377971fd1f8a38788a0 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Thu, 23 May 2019 20:09:06 -0700 Subject: [PATCH 30/50] Fix `::parser::xml::rdfms_para196::test001` using wrong resource file --- sophia/src/parser/xml.rs | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index ba26e54d..7794d1a0 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -234,18 +234,17 @@ impl Scope { self.factory.borrow_mut().iri(iri) } else if is_relative_iri(iri) { if let Some(url) = &self.base { - match url.join(iri) { Ok(iri) => self.factory.borrow_mut().iri(iri), Err(e) => Err(Error::from_kind(ErrorKind::InvalidIri(String::from(iri)))), } - // self.factory.borrow_mut().iri( - // url.join(iri) - // ); - // - // - // - // ns.get(self.factory.borrow_mut().get_term_data(iri)) + // self.factory.borrow_mut().iri( + // url.join(iri) + // ); + // + // + // + // ns.get(self.factory.borrow_mut().get_term_data(iri)) } else { panic!("NO BASE IRI") } @@ -561,7 +560,7 @@ where // Ignore `xml` attributes if a.key.starts_with(b"xml") { - continue + continue; } let k = self @@ -584,7 +583,9 @@ where } else if k.matches(&rdf::parseType) { match a.value.as_ref() { b"Resource" => { - object.push(self.new_bnode()); + if object.is_empty() { + object.push(self.new_bnode()); + } self.scope_mut().set_text(None); next_state = ParsingState::Resource; } @@ -811,7 +812,7 @@ where // ignore XML attributes (processed when entering scope) if a.key.starts_with(b"xml") { - continue + continue; } // try to extract the annotation object @@ -862,7 +863,8 @@ where }; // Add the triple and all subsequent triples as attributes - self.triples.push_back(Ok([s.clone(), p.clone(), o.clone()])); + self.triples + .push_back(Ok([s.clone(), p.clone(), o.clone()])); for (prop, value) in attributes.into_iter() { let literal = self.scope().new_literal(value).expect("FIXME"); self.triples.push_back(Ok([o.clone(), prop, literal])); @@ -1477,11 +1479,7 @@ mod test { mod rdfms_para196 { use super::*; - rdf_test!( - #[ignore] - rdfms_not_id_and_resource_attr - / test001 - ); + rdf_test!(rdfms_para196 / test001); } mod rdfms_rdf_names_use { From e649ff322e35842d2906401ae15a6e73a56bf568 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Fri, 24 May 2019 15:02:29 -0700 Subject: [PATCH 31/50] Transform empty Start/End events into a single Empty event in XML parser --- sophia/src/parser/xml.rs | 173 ++++++++++++++++++++++++++++++--------- 1 file changed, 135 insertions(+), 38 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index 7794d1a0..7f87e4c2 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -5,6 +5,7 @@ use std::collections::HashMap; use std::collections::LinkedList; use std::fmt::Debug; use std::io::{BufRead, BufReader, Read}; +use std::ops::Deref; use std::rc::Rc; use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; @@ -14,6 +15,7 @@ use quick_xml::events::BytesStart; use quick_xml::events::BytesText; use quick_xml::events::Event; use quick_xml::Reader; +use quick_xml::Result as XmlResult; use url::Url; use crate::error::*; @@ -30,13 +32,20 @@ use crate::term::Term; // --- +/// RDF/XML parser configuration. +/// +/// For more information, +/// see the [uniform interface] of parsers. +/// +/// [uniform interface]: ../index.html#uniform-interface +/// #[derive(Clone, Debug, Default)] pub struct Config { base: Option, } impl Config { - fn with_base(base: &str) -> Result { + pub fn with_base(base: &str) -> Result { match Url::parse(base) { Ok(url) => Ok(Self { base: Some(url) }), Err(_) => Err(Error::from_kind(ErrorKind::InvalidIri(base.to_owned()))), @@ -80,47 +89,119 @@ impl Config { // --- +/// The state of the parser. #[derive(Debug, Clone, Copy)] enum ParsingState { + /// The parser is in a predicate, and the next expected element is a node. Node, + /// The parser is in a node, and the next expected element is a predicate. Predicate, + /// The parser is in a resource predicate, and the next expected element + /// is a predicate. Resource, + /// The parser is in a predicate and will process its content as a literal. Literal, // NB: not supported by quick-xml right now - + /// The parser is in a reified element, and the next expected element is + /// a text to be used as a reified triple object. Res, - + /// The parser is in a collection, and the next expected element is a node. Collection, + /// The parser is in a collection, and the next expected element is a node, + /// but exiting this does not exit the collection. CollectionItem, } // --- +/// A wrapper for `quick_xml::Reader` ignoring or merging some events. +pub struct XmlReader { + inner: Reader, + event: Option>, // actually 'buffer + buffer: Vec, +} + +impl XmlReader { + /// Read an XML event. + pub fn read_event<'a>(&mut self, buf: &'a mut Vec) -> XmlResult> { + + // Clear the event peeking cache if it is not empty. + if let Some(e) = self.event.take() { + return Ok(e); + } + + // Get a `Start` event, or return if it is something else. + let start = match self.inner.read_event(buf)? { + Event::Start(ref s) => s.clone(), + other => return Ok(other), + }; + + // Get a `Text` event, or return if it is something else. + // The `transmute` make the compiler think the event now has a + // static lifetime, where it only has the lifetime of the struct. + // This is OK because we never return an event exceeding the lifetime + // of the `XmlReader` itself. + self.buffer.clear(); + match self.inner.read_event(&mut self.buffer)? { + Event::Text(ref e) if e.is_empty() => (), + other => unsafe { + self.event = Some(std::mem::transmute(other)); + return Ok(Event::Start(start)); + } + } + + // Get an `End` event, org return if it is something else. + self.buffer.clear(); + match self.inner.read_event(&mut self.buffer)? { + Event::End(_) => { + Ok(Event::Empty(start)) + }, + other => unsafe { + self.event = Some(std::mem::transmute(other)); + Ok(Event::Start(start)) + } + } + } +} + +impl From> for XmlReader { + fn from(r: Reader) -> Self { + Self { + inner: r, + event: None, + buffer: Vec::new(), + } + } +} + +impl Deref for XmlReader { + type Target = Reader; + fn deref(&self) -> &Self::Target { + &self.inner + } +} + +// --- + +/// Data relevant to an XML scope. #[derive(Debug)] pub struct Scope { /// The XML namespaces declared in this scope. ns: HashMap>, - /// The default XML namespace to expand tags without namespaces with. default: Option>, - /// The base IRI namespace to expand `rdf:ID`, `rdf:resource` and `rdf:about`. base: Option, - /// The term factory used to create new terms. factory: Rc>, - /// The datatype of the containing element. datatype: Option>, - /// The language tag of the containing element. lang: Option, - /// The text gathered in the current scope. text: Option, - /// The current count of list elements li: AtomicU64, - + /// The collection: Vec>, } @@ -196,23 +277,32 @@ impl Scope { /// Set the base IRI prefix. fn set_base(&mut self, base: &str) -> Result<()> { + // Accept the URL only if it is a valid URL and a valid base. if let Ok(url) = Url::parse(base) { - self.base = Some(url); - Ok(()) - } else { - Err(Error::from_kind(ErrorKind::InvalidIri(String::from(base)))) + if !url.cannot_be_a_base() { + self.base = Some(url); + return Ok(()); + } } + + Err(Error::from_kind(ErrorKind::InvalidIri(String::from(base)))) } + /// Set the scope datatype. fn set_datatype(&mut self, datatype: &str) -> Result<()> { self.datatype = Some(self.expand_iri(datatype)?); Ok(()) } + /// Set the scope text. fn set_text>>(&mut self, text: T) { self.text = text.into(); } + /// Expand an XML attribute in the form `namespace:id` into an IRI. + /// + /// This uses the `xmlns` default namespace to expand local attributes, + /// or any declared namespace in the current scope. fn expand_attribute(&self, attr: &str) -> Result> { if let Some(separator_idx) = attr.chars().position(|c| c == ':') { let prefix = &attr[..separator_idx]; @@ -229,30 +319,29 @@ impl Scope { } } + /// Expand an IRI reference (in a `rdf:resource` or `rdf:about`) into an IRI. + /// + /// This uses `xml:base` to expand local resources, and does nothing in + /// case the IRI is already in expanded form. fn expand_iri(&self, iri: &str) -> Result> { - if is_absolute_iri(iri) { - self.factory.borrow_mut().iri(iri) - } else if is_relative_iri(iri) { + if is_relative_iri(&iri) { if let Some(url) = &self.base { match url.join(iri) { Ok(iri) => self.factory.borrow_mut().iri(iri), - Err(e) => Err(Error::from_kind(ErrorKind::InvalidIri(String::from(iri)))), + Err(e) => bail!(ErrorKind::InvalidIri(String::from(iri))), } - // self.factory.borrow_mut().iri( - // url.join(iri) - // ); - // - // - // - // ns.get(self.factory.borrow_mut().get_term_data(iri)) } else { panic!("NO BASE IRI") } } else { - Err(Error::from_kind(ErrorKind::InvalidIri(iri.to_owned()))) + self.factory.borrow_mut().iri(&iri) } } + /// Expand an ID (in a `rdf:ID`) into an IRI. + /// + /// This also uses `xml:base` to expand local resources, and prefixes + /// identifiers in the document with a `#` if needed. fn expand_id(&self, id: &str) -> Result> { if id.starts_with("#") { self.expand_iri(id) @@ -270,7 +359,7 @@ impl Scope { } } - /// Create a new `rdf:li` property. + /// Create a new `rdf:li` property by incrementing the scope `li` counter. fn new_li(&self) -> Result> { if let Some(ns) = self.ns.get("rdf") { let mut f = self.factory.borrow_mut(); @@ -299,9 +388,10 @@ impl Default for Scope { // --- +/// An XML parser supporting any term factory as a backend. struct XmlParser { /// The underlying XML reader. - reader: Reader, + reader: XmlReader, /// The stack of scoped data (for nested declaration). scopes: Vec>, @@ -330,24 +420,28 @@ where { // --- + /// Get a reference to the current scope. fn scope(&self) -> &Scope { self.scopes.last().unwrap() } + /// Get a mutable reference to the current scope. fn scope_mut(&mut self) -> &mut Scope { self.scopes.last_mut().unwrap() } + /// Get a reference to the parent scope. fn parent_scope(&self) -> &Scope { &self.scopes[self.scopes.len() - 2] } + /// Get a mutable reference to the current scope. fn parent_scope_mut(&mut self) -> &mut Scope { let l = self.scopes.len(); &mut self.scopes[l - 2] } - // Add a local scope (`lang`, `namespaces`, but not `parents`) + // Enter a new scope, parsing `xml:lang`, `xmlns` and `rdf:datatype`. fn enter_scope(&mut self, e: &BytesStart) -> Result<()> { // We are entering a new elements: text is not relevant anymore. let mut prev = self.scope_mut(); @@ -376,7 +470,7 @@ where } } - // Add current lang to scope or copy last one (OPTIMISE ME) + // Add current lang to scope or copy last one for attr in e.attributes().with_checks(true) { let a = attr.expect("FIXME"); if a.key == b"xml:lang" { @@ -396,7 +490,7 @@ where Ok(()) } - // Exit the local scope + // Exit the local scope. fn leave_scope(&mut self) { self.scopes.pop().expect("FIXME"); } @@ -433,10 +527,11 @@ where // --- + /// Create a new `XmlParser` from the given `quick_xml::Reader`. fn new(reader: Reader) -> Self { let factory: Rc> = Default::default(); Self { - reader, + reader: XmlReader::from(reader), parents: Vec::new(), scopes: vec![Scope::with_factory_rc(factory.clone())], triples: LinkedList::new(), @@ -446,6 +541,7 @@ where } } + /// Create a new `XmlParser` using the given URL as the top-level `xml:base`. fn with_base(reader: Reader, base: Url) -> Self { let mut parser = Self::new(reader); let mut scope = parser.scope_mut(); @@ -577,9 +673,7 @@ where } else if k.matches(&rdf::resource) { let v = a.unescape_and_decode_value(&self.reader).expect("FIXME"); object.push(self.scope().expand_iri(&v).expect("FIXME")); - // self.state.pop(); next_state = ParsingState::Predicate; - // self.reification_start(e, self.scope().expand_id(&v).expect("FIXME")); } else if k.matches(&rdf::parseType) { match a.value.as_ref() { b"Resource" => { @@ -907,13 +1001,14 @@ where { type Item = Result<[Term; 3]>; fn next(&mut self) -> Option { + let mut buffer = Vec::new(); loop { // First make sure to consume the queue. if let Some(triple) = self.triples.pop_front() { return Some(triple); } // Then process the next event to maybe produce triples - match &self.reader.read_event(&mut Vec::new()).unwrap() { + match &self.reader.read_event(&mut buffer).unwrap() { Event::Eof => return None, Event::Start(s) => self.element_start(s), Event::Empty(e) => self.element_empty(e), @@ -921,6 +1016,8 @@ where Event::Text(t) => self.element_text(t), _ => (), } + // Finally clear the buffer if we are going to use it again. + buffer.clear(); } } } @@ -1449,7 +1546,7 @@ mod test { rdf_test!(rdfms_empty_property_elements / test009); rdf_test!(rdfms_empty_property_elements / test010 where "a1" => "n0"); rdf_test!(rdfms_empty_property_elements / test011); - rdf_test!(#[ignore] rdfms_empty_property_elements / test012 where "a1" => "n0"); + rdf_test!(rdfms_empty_property_elements / test012 where "a1" => "n0"); rdf_test!(rdfms_empty_property_elements / test013); rdf_test!(rdfms_empty_property_elements / test014 where "a1" => "n0"); rdf_test!(rdfms_empty_property_elements / test015 where "a1" => "n0"); @@ -1470,7 +1567,7 @@ mod test { mod rdfms_not_id_and_resource_attr { use super::*; - rdf_test!(#[ignore] rdfms_not_id_and_resource_attr / test001 where "j88090" => "n0", "j88091" => "n1"); + rdf_test!(rdfms_not_id_and_resource_attr / test001 where "j88090" => "n0", "j88091" => "n1"); rdf_test!(rdfms_not_id_and_resource_attr / test002 where "j88093" => "n0"); rdf_test!(rdfms_not_id_and_resource_attr / test004 where "j88101" => "n0"); rdf_test!(rdfms_not_id_and_resource_attr / test005 where "j88106" => "n0"); From 2c1c8966b03009ee4125adaca9b85ab1132a8489 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Fri, 24 May 2019 16:08:30 -0700 Subject: [PATCH 32/50] Use `AtomicUsize` instead of `AtomicU64` in `::parser::xml` --- sophia/src/parser/xml.rs | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index 7f87e4c2..d2a496ed 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -7,7 +7,7 @@ use std::fmt::Debug; use std::io::{BufRead, BufReader, Read}; use std::ops::Deref; use std::rc::Rc; -use std::sync::atomic::AtomicU64; +use std::sync::atomic::AtomicUsize; use std::sync::atomic::Ordering; use quick_xml::events::BytesEnd; @@ -123,7 +123,6 @@ pub struct XmlReader { impl XmlReader { /// Read an XML event. pub fn read_event<'a>(&mut self, buf: &'a mut Vec) -> XmlResult> { - // Clear the event peeking cache if it is not empty. if let Some(e) = self.event.take() { return Ok(e); @@ -146,19 +145,17 @@ impl XmlReader { other => unsafe { self.event = Some(std::mem::transmute(other)); return Ok(Event::Start(start)); - } + }, } // Get an `End` event, org return if it is something else. self.buffer.clear(); match self.inner.read_event(&mut self.buffer)? { - Event::End(_) => { - Ok(Event::Empty(start)) - }, + Event::End(_) => Ok(Event::Empty(start)), other => unsafe { self.event = Some(std::mem::transmute(other)); Ok(Event::Start(start)) - } + }, } } } @@ -200,14 +197,14 @@ pub struct Scope { /// The text gathered in the current scope. text: Option, /// The current count of list elements - li: AtomicU64, + li: AtomicUsize, /// The collection: Vec>, } // We implement it ourselves instead of deriving so that: // * F does not need to be `Clone` (deriving requires it). -// * we can clone `li` although `AtomicU64` is not `Clone`. +// * we can clone `li` although `AtomicUsize` is not `Clone`. impl Clone for Scope { fn clone(&self) -> Self { Self { @@ -218,7 +215,7 @@ impl Clone for Scope { datatype: self.datatype.clone(), lang: self.lang.clone(), text: self.text.clone(), - li: AtomicU64::new(self.li.load(Ordering::Relaxed)), + li: AtomicUsize::new(self.li.load(Ordering::Relaxed)), collection: self.collection.clone(), } } @@ -240,7 +237,7 @@ impl Scope { datatype: None, lang: None, text: None, - li: AtomicU64::new(1), + li: AtomicUsize::new(1), collection: Vec::new(), }; // These namespaces are always in scope @@ -406,7 +403,7 @@ struct XmlParser { factory: Rc>, // - bnodes: AtomicU64, + bnodes: AtomicUsize, /// The current state of the parser. state: Vec, @@ -536,7 +533,7 @@ where scopes: vec![Scope::with_factory_rc(factory.clone())], triples: LinkedList::new(), factory: factory, - bnodes: AtomicU64::new(0), + bnodes: AtomicUsize::new(0), state: vec![ParsingState::Node], } } @@ -1076,7 +1073,7 @@ mod test { } macro_rules! rdf_test { - ($(#[$attr:meta])* $suite:ident / $case:ident where $($l:pat => $r:literal),*) => { + ($(#[$attr:meta])* $suite:ident / $case:ident where $($l:pat => $r:expr),*) => { $(#[$attr])* #[test] fn $case() { @@ -1140,7 +1137,7 @@ mod test { } macro_rules! nt_test { - ($name:ident, $xml:literal, $nt:literal) => { + ($name:ident, $xml:expr, $nt:expr) => { #[test] fn $name() { let mut xml = TestGraph::new(); From b7139792bcd9d133ce1cf88d4c954e063cc0d07e Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Tue, 4 Jun 2019 13:25:33 -0700 Subject: [PATCH 33/50] Improve `XmlReader.read_event` to merge start/end events even when they contain comments --- sophia/src/parser/xml.rs | 46 +++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index d2a496ed..13bd1f0f 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -123,6 +123,9 @@ pub struct XmlReader { impl XmlReader { /// Read an XML event. pub fn read_event<'a>(&mut self, buf: &'a mut Vec) -> XmlResult> { + + use quick_xml::events::Event::*; + // Clear the event peeking cache if it is not empty. if let Some(e) = self.event.take() { return Ok(e); @@ -130,32 +133,41 @@ impl XmlReader { // Get a `Start` event, or return if it is something else. let start = match self.inner.read_event(buf)? { - Event::Start(ref s) => s.clone(), + Start(ref s) => s.clone(), other => return Ok(other), }; - // Get a `Text` event, or return if it is something else. + // Get a `Text` event, return `Start`, `End`, `Empty` or `Eof`, + // or ignore other event (such as `Comment`). // The `transmute` make the compiler think the event now has a // static lifetime, where it only has the lifetime of the struct. // This is OK because we never return an event exceeding the lifetime // of the `XmlReader` itself. - self.buffer.clear(); - match self.inner.read_event(&mut self.buffer)? { - Event::Text(ref e) if e.is_empty() => (), - other => unsafe { - self.event = Some(std::mem::transmute(other)); - return Ok(Event::Start(start)); - }, + loop { + self.buffer.clear(); + match self.inner.read_event(&mut self.buffer)? { + Text(ref e) if e.is_empty() => break, + Comment(_) | CData(_) | Decl(_) | PI(_) | DocType(_) => (), + other => unsafe { + self.event = Some(std::mem::transmute(other)); + return Ok(Start(start)); + }, + } } - // Get an `End` event, org return if it is something else. - self.buffer.clear(); - match self.inner.read_event(&mut self.buffer)? { - Event::End(_) => Ok(Event::Empty(start)), - other => unsafe { - self.event = Some(std::mem::transmute(other)); - Ok(Event::Start(start)) - }, + // Get an `End` event, or return if it is something else with + // semantic value. + loop { + self.buffer.clear(); + match self.inner.read_event(&mut self.buffer)? { + End(_) => return Ok(Empty(start)), + Text(ref e) if e.is_empty() => (), + Comment(_) | CData(_) | Decl(_) | PI(_) | DocType(_) => (), + other => unsafe { + self.event = Some(std::mem::transmute(other)); + return Ok(Start(start)); + }, + } } } } From 26371d0bbffbce5d20de67c73852cba9bce90020 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Tue, 4 Jun 2019 13:40:28 -0700 Subject: [PATCH 34/50] Use `XmlReader.decode` to decode element names instead of assuming UTF-8 --- sophia/src/parser/xml.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index 13bd1f0f..0c8a83a6 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -468,7 +468,7 @@ where if a.key.starts_with(b"xmlns:") { scope .add_prefix( - std::str::from_utf8(&a.key[6..]).expect("FIXME"), + &self.reader.decode(&a.key[6..]), &a.unescape_and_decode_value(&self.reader).expect("FIXME"), ) .expect("FIXME"); @@ -577,7 +577,7 @@ where // Get node type from the XML attribute. let ty = self .scope() - .expand_attribute(std::str::from_utf8(e.name()).expect("INVALID UTF8")) + .expand_attribute(&self.reader.decode(e.name())) .expect("INVALID DATATYPE IRI REFERENCE"); // Bail out if an rdf:RDF element @@ -601,7 +601,7 @@ where // try to extract the subject annotation let k = self .scope() - .expand_attribute(std::str::from_utf8(a.key).expect("FIXME")) + .expand_attribute(&self.reader.decode(a.key)) .expect("FIXME"); let v = a.unescape_and_decode_value(&self.reader).expect("FIXME"); @@ -652,7 +652,7 @@ where // Get the predicate and add it to the current nested stack // or build a new `rdf:_n` IRI if the predicate is `rdf:li`. let p = self - .predicate_iri_start(std::str::from_utf8(e.name()).expect("FIXME")) + .predicate_iri_start(&self.reader.decode(e.name())) .expect("INVALID PREDICATE IRI"); self.parents.push(p); @@ -670,7 +670,7 @@ where let k = self .scope() - .expand_attribute(std::str::from_utf8(a.key).expect("FIXME")) + .expand_attribute(&self.reader.decode(a.key)) .expect("INVALID ATTRIBUTE"); if k.matches(&rdf::datatype) { let v = a.unescape_and_decode_value(&self.reader).expect("FIXME"); @@ -772,7 +772,7 @@ where fn predicate_end(&mut self, e: &BytesEnd) { // Build the predicate IRI let p = self - .predicate_iri_end(std::str::from_utf8(e.name()).expect("FIXME")) + .predicate_iri_end(&self.reader.decode(e.name())) .expect("INVALID PREDICATE IRI"); // Get the literal value @@ -901,7 +901,7 @@ where fn predicate_empty(&mut self, e: &BytesStart) { let p = self - .predicate_iri_start(std::str::from_utf8(e.name()).expect("FIXME")) + .predicate_iri_start(&self.reader.decode(e.name())) .expect("INVALID PREDICATE IRI"); let mut object = Vec::with_capacity(1); @@ -921,7 +921,7 @@ where // try to extract the annotation object let k = self .scope() - .expand_attribute(std::str::from_utf8(a.key).expect("FIXME")) + .expand_attribute(&self.reader.decode(a.key)) .expect("FIXME"); let v = a.unescape_and_decode_value(&self.reader).expect("FIXME"); if k.matches(&rdf::resource) { From 6786a5c6485b2817a103c04b0bbf550a6461486d Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Tue, 4 Jun 2019 14:30:29 -0700 Subject: [PATCH 35/50] Fix reader creating percent-encoding URIs in `::parser::xml` --- sophia/src/parser/xml.rs | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index 0c8a83a6..4c42d113 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -123,7 +123,6 @@ pub struct XmlReader { impl XmlReader { /// Read an XML event. pub fn read_event<'a>(&mut self, buf: &'a mut Vec) -> XmlResult> { - use quick_xml::events::Event::*; // Clear the event peeking cache if it is not empty. @@ -333,17 +332,23 @@ impl Scope { /// This uses `xml:base` to expand local resources, and does nothing in /// case the IRI is already in expanded form. fn expand_iri(&self, iri: &str) -> Result> { - if is_relative_iri(&iri) { + fn dec(x: &str) -> std::borrow::Cow { + url::percent_encoding::percent_decode(x.as_bytes()) + .decode_utf8() + .unwrap() + } + + if is_relative_iri(iri) { if let Some(url) = &self.base { match url.join(iri) { - Ok(iri) => self.factory.borrow_mut().iri(iri), + Ok(u) => self.factory.borrow_mut().iri(&dec(u.as_str())), Err(e) => bail!(ErrorKind::InvalidIri(String::from(iri))), } } else { panic!("NO BASE IRI") } } else { - self.factory.borrow_mut().iri(&iri) + self.factory.borrow_mut().iri(&dec(iri)) } } From d68c28f8b0b5d441d2befc1b9fed72db2c243af7 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Tue, 4 Jun 2019 14:32:46 -0700 Subject: [PATCH 36/50] Add test harness for RDF/XML expected failures in `rdf-tests` --- sophia/src/parser/xml.rs | 91 ++++++++++++++++++++++++++++++++-------- 1 file changed, 73 insertions(+), 18 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index 4c42d113..e6e83055 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -1153,6 +1153,10 @@ mod test { }; } + macro_rules! rdf_failure { + ($(#[$attr:meta])* $suite:ident / $case:ident) => {}; + } + macro_rules! nt_test { ($name:ident, $xml:expr, $nt:expr) => { #[test] @@ -1497,24 +1501,15 @@ mod test { mod rdf_containers_syntax_vs_schema { use super::*; - rdf_test!(rdf_containers_syntax_vs_schema / test001 - where "bag" => "n0" - ); - rdf_test!(rdf_containers_syntax_vs_schema / test002 - where "bag" => "n0" - ); - rdf_test!(rdf_containers_syntax_vs_schema / test003 - where "bar" => "n0" - ); - rdf_test!(rdf_containers_syntax_vs_schema / test004 - where "res2" => "n2", "bar" => "n0", "res" => "n1" - ); - rdf_test!(rdf_containers_syntax_vs_schema / test006 - where "bag" => "n0" - ); - rdf_test!(rdf_containers_syntax_vs_schema / test007 - where "d1" => "n0", "d2" => "n1" - ); + rdf_failure!(rdf_containers_syntax_vs_schema / error001); + rdf_failure!(rdf_containers_syntax_vs_schema / error002); + + rdf_test!(rdf_containers_syntax_vs_schema / test001 where "bag" => "n0"); + rdf_test!(rdf_containers_syntax_vs_schema / test002 where "bag" => "n0"); + rdf_test!(rdf_containers_syntax_vs_schema / test003 where "bar" => "n0"); + rdf_test!(rdf_containers_syntax_vs_schema / test004 where "res2" => "n2", "bar" => "n0", "res" => "n1"); + rdf_test!(rdf_containers_syntax_vs_schema / test006 where "bag" => "n0"); + rdf_test!(rdf_containers_syntax_vs_schema / test007 where "d1" => "n0", "d2" => "n1"); rdf_test!(rdf_containers_syntax_vs_schema / test008); } @@ -1540,6 +1535,22 @@ mod test { rdf_test!(rdf_ns_prefix_confusion / test0014); } + mod rdfms_abouteach { + use super::*; + + rdf_failure!(rdfms_abouteach / error001); + rdf_failure!(rdfms_abouteach / error002); + } + + mod rdfms_difference_between_ID_and_about { + use super::*; + + rdf_failure!(rdfms_difference_between_ID_and_about / error1); + rdf_test!(rdfms_difference_between_ID_and_about / test1); + rdf_test!(rdfms_difference_between_ID_and_about / test2); + rdf_test!(rdfms_difference_between_ID_and_about / test3); + } + mod rdfms_duplicate_member_props { use super::*; @@ -1549,6 +1560,10 @@ mod test { mod rdfms_empty_property_elements { use super::*; + rdf_failure!(rdfms_empty_property_elements / error001); + rdf_failure!(rdfms_empty_property_elements / error002); + rdf_failure!(rdfms_empty_property_elements / error003); + rdf_test!(rdfms_empty_property_elements / test001); rdf_test!(rdfms_empty_property_elements / test002); rdf_test!(rdfms_empty_property_elements / test003); @@ -1593,9 +1608,42 @@ mod test { rdf_test!(rdfms_para196 / test001); } + mod rdfms_rdf_id { + use super::*; + + rdf_failure!(rdfms_rdf_id / error_001); + rdf_failure!(rdfms_rdf_id / error_002); + rdf_failure!(rdfms_rdf_id / error_003); + rdf_failure!(rdfms_rdf_id / error_004); + rdf_failure!(rdfms_rdf_id / error_005); + rdf_failure!(rdfms_rdf_id / error_006); + rdf_failure!(rdfms_rdf_id / error_007); + } + mod rdfms_rdf_names_use { use super::*; + rdf_failure!(rdfms_rdf_names_use / error_001); + rdf_failure!(rdfms_rdf_names_use / error_002); + rdf_failure!(rdfms_rdf_names_use / error_003); + rdf_failure!(rdfms_rdf_names_use / error_004); + rdf_failure!(rdfms_rdf_names_use / error_005); + rdf_failure!(rdfms_rdf_names_use / error_006); + rdf_failure!(rdfms_rdf_names_use / error_007); + rdf_failure!(rdfms_rdf_names_use / error_008); + rdf_failure!(rdfms_rdf_names_use / error_009); + rdf_failure!(rdfms_rdf_names_use / error_010); + rdf_failure!(rdfms_rdf_names_use / error_011); + rdf_failure!(rdfms_rdf_names_use / error_012); + rdf_failure!(rdfms_rdf_names_use / error_013); + rdf_failure!(rdfms_rdf_names_use / error_014); + rdf_failure!(rdfms_rdf_names_use / error_015); + rdf_failure!(rdfms_rdf_names_use / error_016); + rdf_failure!(rdfms_rdf_names_use / error_017); + rdf_failure!(rdfms_rdf_names_use / error_018); + rdf_failure!(rdfms_rdf_names_use / error_019); + rdf_failure!(rdfms_rdf_names_use / error_020); + rdf_test!(rdfms_rdf_names_use / test_001); rdf_test!(rdfms_rdf_names_use / test_002); rdf_test!(rdfms_rdf_names_use / test_003); @@ -1651,6 +1699,13 @@ mod test { mod rdfms_syntax_incomplete { use super::*; + rdf_failure!(rdfms_syntax_incomplete / error001); + rdf_failure!(rdfms_syntax_incomplete / error002); + rdf_failure!(rdfms_syntax_incomplete / error003); + rdf_failure!(rdfms_syntax_incomplete / error004); + rdf_failure!(rdfms_syntax_incomplete / error005); + rdf_failure!(rdfms_syntax_incomplete / error006); + rdf_test!(rdfms_syntax_incomplete / test001 where "j0" => "oa"); rdf_test!(rdfms_syntax_incomplete / test002 where "j0A" => "oa", "j2" => "n0", "j1B" => "ob"); rdf_test!(rdfms_syntax_incomplete / test003 where "j0A" => "oa"); From 0cf3c65fe7d6cbab62ec94b924ee18308bbf8bbd Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Thu, 6 Jun 2019 17:13:53 -0700 Subject: [PATCH 37/50] Add missing RDF syntax elements to `rdf` namespace in `::ns` --- sophia/src/ns.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sophia/src/ns.rs b/sophia/src/ns.rs index 859e419a..3a156d6f 100644 --- a/sophia/src/ns.rs +++ b/sophia/src/ns.rs @@ -113,7 +113,10 @@ pub mod rdf { resource, li, nodeID, - datatype + datatype, + bagID, + aboutEach, + aboutEachPrefix ); ns_term!("http://www.w3.org/1999/02/22-rdf-syntax-ns#", type_, "type"); } From 669495b11f2aff73d7a6162835c2e219579de718 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Thu, 6 Jun 2019 17:15:45 -0700 Subject: [PATCH 38/50] Add error type and add RDF property name verification in XML parser --- sophia/src/error.rs | 3 + sophia/src/parser/xml.rs | 368 +++++++++++++++++++++++++++------------ 2 files changed, 260 insertions(+), 111 deletions(-) diff --git a/sophia/src/error.rs b/sophia/src/error.rs index f85ece3b..f71d1850 100644 --- a/sophia/src/error.rs +++ b/sophia/src/error.rs @@ -3,6 +3,9 @@ use pest::error::{InputLocation, LineColLocation}; error_chain! { + links { + XmlError(crate::parser::xml::error::Error, crate::parser::xml::error::ErrorKind); + } errors { /// Raised by the methods of the [`Graph`](../graph/trait.Graph.html) trait. GraphError(message: String) { diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index e6e83055..76f5944e 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -22,6 +22,7 @@ use crate::error::*; use crate::ns::rdf; use crate::ns::xsd; use crate::ns::Namespace; +use crate::term::StaticTerm; use crate::term::factory::RcTermFactory; use crate::term::factory::TermFactory; use crate::term::iri_rfc3987::is_absolute_iri; @@ -30,7 +31,68 @@ use crate::term::iri_rfc3987::is_valid_iri; use crate::term::matcher::TermMatcher; use crate::term::Term; +static RESERVED_NODE_NAMES: &'static [StaticTerm] = &[ + rdf::RDF, + rdf::ID, + rdf::about, + rdf::bagID, + rdf::parseType, + rdf::resource, + rdf::nodeID, + rdf::li, + rdf::aboutEach, + rdf::aboutEachPrefix, +]; + +static RESERVED_PROPERTY_NAMES: &'static [StaticTerm] = &[ + rdf::Description, + rdf::RDF, + rdf::ID, + rdf::about, + rdf::bagID, + rdf::parseType, + rdf::resource, + rdf::nodeID, + rdf::aboutEach, + rdf::aboutEachPrefix, +]; + // --- +pub mod error { + error_chain! { + types { + Error, ErrorKind, ResultExt; + } + errors { + XmlError(e: ::quick_xml::Error) { + description("xml parser failed") + display("xml parser failed: {:?}", e) + } + InvalidNodeName(n: String) { + description("invalid property name") + display("invalid property name: {:?}", n) + } + InvalidPropertyName(n: String) { + description("invalid property name") + display("invalid property name: {:?}", n) + } + AmbiguousSubject { + description("cannot have `rdf:ID`, `rdf:nodeID` and `rdf:about` at the same time") + display("cannot have `rdf:ID`, `rdf:nodeID` and `rdf:about` at the same time") + } + InvalidPrefix(p: String) { + description("invalid prefix") + display("{:?} cannot be used as a prefix", p) + } + } + } + + impl From for Error { + fn from(e: quick_xml::Error) -> Self { + Self::from_kind(ErrorKind::XmlError(e)) + } + } +} /// RDF/XML parser configuration. /// @@ -264,16 +326,17 @@ impl Scope { /// Add a new XML prefix to the namespace mapping. fn add_prefix(&mut self, prefix: &str, value: &str) -> Result<()> { if prefix == "_" { - panic!("reserved prefix") + Err(Error::from( + self::error::Error::from(self::error::ErrorKind::InvalidPrefix(prefix.into())) + )) } else { let mut f = self.factory.borrow_mut(); self.ns.insert( String::from(prefix), Namespace::new(f.get_term_data(value))?, ); + Ok(()) } - - Ok(()) } /// Set the default XML prefix. @@ -469,31 +532,35 @@ where // Update XML namespaces with those defined in the document. for attr in e.attributes().with_checks(true) { - let a = attr.expect("FIXME"); + let a = attr.map_err(self::error::Error::from)?; if a.key.starts_with(b"xmlns:") { scope .add_prefix( &self.reader.decode(&a.key[6..]), - &a.unescape_and_decode_value(&self.reader).expect("FIXME"), - ) - .expect("FIXME"); + &a.unescape_and_decode_value(&self.reader) + .map_err(self::error::Error::from)? + )?; } else if a.key == b"xmlns" { - scope.set_default(&a.unescape_and_decode_value(&self.reader).expect("FIXME"))?; + scope.set_default(&a.unescape_and_decode_value(&self.reader) + .map_err(self::error::Error::from)?)?; } else if a.key == b"xml:base" { - scope.set_base(&a.unescape_and_decode_value(&self.reader).expect("FIXME"))?; + scope.set_base(&a.unescape_and_decode_value(&self.reader) + .map_err(self::error::Error::from)?)?; } } // Add current lang to scope or copy last one for attr in e.attributes().with_checks(true) { - let a = attr.expect("FIXME"); + let a = attr.map_err(self::error::Error::from)?; if a.key == b"xml:lang" { scope.lang = if a.value.is_empty() { None } else { + let v = &a.unescape_and_decode_value(&self.reader) + .map_err(|e| self::error::Error::from(e))?; self.factory .borrow_mut() - .get_term_data(&a.unescape_and_decode_value(&self.reader).unwrap()) + .get_term_data(v) .into() }; } @@ -506,7 +573,7 @@ where // Exit the local scope. fn leave_scope(&mut self) { - self.scopes.pop().expect("FIXME"); + self.scopes.pop().expect("XML is not balanced"); } // --- @@ -566,8 +633,11 @@ where // --- fn element_start(&mut self, e: &BytesStart) { - self.enter_scope(e); - let res = match self.state.last().unwrap() { + if let Err(e) = self.enter_scope(e) { + self.triples.push_back(Err(e)); + } + + if let Err(e) = match self.state.last().unwrap() { ParsingState::Node => self.node_start(e), ParsingState::Predicate => self.predicate_start(e), ParsingState::Resource => self.predicate_start(e), @@ -575,28 +645,35 @@ where ParsingState::CollectionItem => self.collection_item_start(e), ParsingState::Res => panic!("expecting text, not new element"), ParsingState::Literal => unimplemented!("entering element as literal"), - }; + } { + self.triples.push_back(Err(e)); + } } - fn node_start(&mut self, e: &BytesStart) { + fn node_start(&mut self, e: &BytesStart) -> Result<()> { // Get node type from the XML attribute. let ty = self .scope() - .expand_attribute(&self.reader.decode(e.name())) - .expect("INVALID DATATYPE IRI REFERENCE"); + .expand_attribute(&self.reader.decode(e.name()))?; - // Bail out if an rdf:RDF element - if ty.matches(&rdf::RDF) { + // Bail out if in a top-level rdf:RDF element + if rdf::RDF.matches(&ty) && self.parents.is_empty() { self.state.push(ParsingState::Node); self.parents.push(self.factory.borrow_mut().copy(&rdf::RDF)); - return; + return Ok(()); + } + + // + if RESERVED_NODE_NAMES.matches(&ty) { + let kind = self::error::ErrorKind::InvalidNodeName(ty.value()); + return Err(Error::from(self::error::Error::from_kind(kind))); } // Separate node subject from other attributes let mut properties = HashMap::new(); let mut subject = Vec::new(); for attr in e.attributes().with_checks(true) { - let a = attr.expect("FIXME"); + let a = attr.map_err(self::error::Error::from)?; // ignore xml attributes (processed in element_start) if a.key.starts_with(b"xml") { @@ -606,31 +683,33 @@ where // try to extract the subject annotation let k = self .scope() - .expand_attribute(&self.reader.decode(a.key)) - .expect("FIXME"); - let v = a.unescape_and_decode_value(&self.reader).expect("FIXME"); + .expand_attribute(&self.reader.decode(a.key))?; + let v = a.unescape_and_decode_value(&self.reader) + .map_err(self::error::Error::from)?; if k.matches(&rdf::about) { - subject.push(self.scope().expand_iri(&v).expect("INVALID IRI")); + subject.push(self.scope().expand_iri(&v)?); } else if k.matches(&rdf::ID) { - subject.push(self.scope().expand_id(&v).expect("INVALID NAME")); + subject.push(self.scope().expand_id(&v)?); } else if k.matches(&rdf::nodeID) { subject.push( self.factory .borrow_mut() - .bnode(&format!("o{}", v)) - .expect("INVALID BNODE"), + .bnode(&format!("o{}", v))?, ); } else if k.matches(&rdf::type_) { - properties.insert(k, self.scope().expand_iri(&v).expect("INVALID IRI")); + properties.insert(k, self.scope().expand_iri(&v)?); } else { - properties.insert(k, self.scope().new_literal(v).expect("FIXME")); + properties.insert(k, self.scope().new_literal(v)?); } } // Get subject and add it to the current nested stack if subject.len() > 1 { - panic!("cannot have rdf:ID, rdf:about and rdf:nodeId at the same time") + return Err( + self::error::Error::from_kind(self::error::ErrorKind::AmbiguousSubject) + .into() + ); } let s: Term<_> = subject.pop().unwrap_or_else(|| self.new_bnode()); self.parents.push(s.clone()); @@ -651,22 +730,28 @@ where // Next start event is expected to be a predicate self.state.push(ParsingState::Predicate); + Ok(()) } - fn predicate_start(&mut self, e: &BytesStart) { + fn predicate_start(&mut self, e: &BytesStart) -> Result<()> { // Get the predicate and add it to the current nested stack // or build a new `rdf:_n` IRI if the predicate is `rdf:li`. - let p = self - .predicate_iri_start(&self.reader.decode(e.name())) - .expect("INVALID PREDICATE IRI"); - self.parents.push(p); + let p = self.predicate_iri_start(&self.reader.decode(e.name()))?; + + // Fail if the property is among forbidden names. + if RESERVED_PROPERTY_NAMES.matches(&p) { + let kind = self::error::ErrorKind::InvalidNodeName(p.value()); + return Err(Error::from(self::error::Error::from_kind(kind))); + } else { + self.parents.push(p); + } // Extract attributes relevant to the RDF syntax let mut attributes = HashMap::new(); let mut next_state = ParsingState::Node; let mut object = Vec::with_capacity(1); for attr in e.attributes().with_checks(true) { - let a = attr.expect("FIXME"); + let a = attr.map_err(self::error::Error::from)?; // Ignore `xml` attributes if a.key.starts_with(b"xml") { @@ -675,18 +760,20 @@ where let k = self .scope() - .expand_attribute(&self.reader.decode(a.key)) - .expect("INVALID ATTRIBUTE"); + .expand_attribute(&self.reader.decode(a.key))?; if k.matches(&rdf::datatype) { - let v = a.unescape_and_decode_value(&self.reader).expect("FIXME"); - self.scope_mut().set_datatype(&v); + let v = a.unescape_and_decode_value(&self.reader) + .map_err(self::error::Error::from)?; + self.scope_mut().set_datatype(&v)?; } else if k.matches(&rdf::ID) { - let v = a.unescape_and_decode_value(&self.reader).expect("FIXME"); - object.push(self.scope().expand_id(&v).expect("FIXME")); + let v = a.unescape_and_decode_value(&self.reader) + .map_err(self::error::Error::from)?; + object.push(self.scope().expand_id(&v)?); next_state = ParsingState::Res; } else if k.matches(&rdf::resource) { - let v = a.unescape_and_decode_value(&self.reader).expect("FIXME"); - object.push(self.scope().expand_iri(&v).expect("FIXME")); + let v = a.unescape_and_decode_value(&self.reader) + .map_err(self::error::Error::from)?; + object.push(self.scope().expand_iri(&v)?); next_state = ParsingState::Predicate; } else if k.matches(&rdf::parseType) { match a.value.as_ref() { @@ -701,14 +788,15 @@ where next_state = ParsingState::Collection; } b"Literal" => { - self.scope_mut().set_datatype(&rdf::XMLLiteral.value()); + self.scope_mut().set_datatype(&rdf::XMLLiteral.value())?; next_state = ParsingState::Literal; } other => panic!("invalid parseType: {:?}", other), } } else { - let v = a.unescape_and_decode_value(&self.reader).expect("FIXME"); - attributes.insert(k, self.scope().new_literal(v).expect("FIXME")); + let v = a.unescape_and_decode_value(&self.reader) + .map_err(self::error::Error::from)?; + attributes.insert(k, self.scope().new_literal(v)?); next_state = ParsingState::Resource; } } @@ -719,7 +807,10 @@ where 0 if !attributes.is_empty() => Some(self.new_bnode()), 0 if attributes.is_empty() => None, 1 => Some(object.last().unwrap().clone()), - _ => panic!("cannot have rdf:resource, rdf::ID or rdf:nodeID at the same time"), + _ => return Err( + self::error::Error::from_kind(self::error::ErrorKind::AmbiguousSubject) + .into() + ) }; // Make the predicate a resource element if an objec tis present. @@ -732,25 +823,27 @@ where } self.state.push(next_state); + Ok(()) } - fn collection_start(&mut self, e: &BytesStart) { + fn collection_start(&mut self, e: &BytesStart) -> Result<()> { self.state.push(ParsingState::CollectionItem); - self.collection_item_start(e); + self.collection_item_start(e) } - fn collection_item_start(&mut self, e: &BytesStart) { + fn collection_item_start(&mut self, e: &BytesStart) -> Result<()> { // Start the inner node element and get its IRI. - self.node_start(e); + self.node_start(e)?; let new_iri = self.parents.last().unwrap().clone(); // Add the iri of the node to the parent scope (not current!) self.parent_scope_mut().collection.push(new_iri); + Ok(()) } // --- fn element_end(&mut self, e: &BytesEnd) { - match self.state.pop().unwrap() { + if let Err(e) = match self.state.pop().unwrap() { ParsingState::Node => self.predicate_end(e), ParsingState::Predicate => self.node_end(), ParsingState::Literal => self.predicate_end(e), @@ -758,11 +851,13 @@ where ParsingState::CollectionItem => self.collection_item_end(), ParsingState::Collection => self.collection_end(e), ParsingState::Res => self.res_end(), + } { + self.triples.push_back(Err(e)); } self.leave_scope(); } - fn node_end(&mut self) { + fn node_end(&mut self) -> Result<()> { // Add the entity as a triple object if it is not top-level let o = self.parents.pop().unwrap(); if self.parents.len() > 2 { @@ -772,43 +867,45 @@ where self.triples.push_back(Ok([s.clone(), p.clone(), o])); } } + + Ok(()) } - fn predicate_end(&mut self, e: &BytesEnd) { + fn predicate_end(&mut self, e: &BytesEnd) -> Result<()> { // Build the predicate IRI - let p = self - .predicate_iri_end(&self.reader.decode(e.name())) - .expect("INVALID PREDICATE IRI"); + let p = self.predicate_iri_end(&self.reader.decode(e.name()))?; // Get the literal value if self.parents.len() > 1 { if let Some(text) = self.scope_mut().text.take() { let s = self.parents[self.parents.len() - 2].clone(); - let o = self.scope_mut().new_literal(text).expect("FIXME"); + let o = self.scope_mut().new_literal(text)?; self.triples.push_back(Ok([s, p, o])); } } self.parents.pop(); + Ok(()) } - fn resource_end(&mut self, e: &BytesEnd) { + fn resource_end(&mut self, e: &BytesEnd) -> Result<()> { // End of the implicit node element - self.node_end(); + self.node_end()?; // Drop text, since it is not relevant in a Resource predicate. self.scope_mut().text.take(); // End of the resource predicate self.predicate_end(e) } - fn collection_item_end(&mut self) { + fn collection_item_end(&mut self) -> Result<()> { // End of the node parent. self.parents.pop(); // Remove `CollectionItem` self.state.pop(); + Ok(()) } - fn collection_end(&mut self, e: &BytesEnd) { + fn collection_end(&mut self, e: &BytesEnd) -> Result<()> { let collection = self.scope().collection.clone(); if !collection.is_empty() { let mut node = self.new_bnode(); @@ -845,16 +942,16 @@ where } } - self.predicate_end(e); + self.predicate_end(e) } - fn res_end(&mut self) { + fn res_end(&mut self) -> Result<()> { // Subject, predicate, object and ID of the reified triple let id = self.parents.pop().unwrap(); let p = self.parents.pop().unwrap(); let s = self.parents.last().unwrap().clone(); let txt = self.scope_mut().text.take().unwrap_or_default(); - let o = self.scope().new_literal(txt).expect("FIXME"); + let o = self.scope().new_literal(txt)?; // Types for the reification let mut factory = self.factory.borrow_mut(); @@ -871,22 +968,31 @@ where self.triples.push_back(Ok([id.clone(), subject, s])); self.triples.push_back(Ok([id.clone(), predicate, p])); self.triples.push_back(Ok([id.clone(), object, o])); + + Ok(()) } // --- Text elements ---------------------------------------------------- fn element_text(&mut self, e: &BytesText) { if self.scope().text.is_some() { - let text = e.unescape_and_decode(&self.reader).expect("FIXME"); - self.scope_mut().set_text(text); + match e.unescape_and_decode(&self.reader) { + Ok(text) => self.scope_mut().set_text(text), + Err(e) => self.triples.push_back(Err(self::error::Error::from(e).into())), + } } } // --- Empty elements ---------------------------------------------------- fn element_empty(&mut self, e: &BytesStart) { - self.enter_scope(e); - match self.state.last().unwrap() { + + + if let Err(e) = self.enter_scope(e) { + self.triples.push_back(Err(e)); + } + + if let Err(e) = match self.state.last().unwrap() { ParsingState::Node => self.node_empty(e), ParsingState::Predicate => self.predicate_empty(e), ParsingState::Resource => self.resource_empty(e), @@ -894,21 +1000,28 @@ where ParsingState::CollectionItem => unreachable!(), ParsingState::Res => panic!("expected end element, not empty"), ParsingState::Literal => unimplemented!("empty element as literal"), + } { + self.triples.push_back(Err(e)); } + self.leave_scope(); } - fn node_empty(&mut self, e: &BytesStart) { - self.node_start(e); + fn node_empty(&mut self, e: &BytesStart) -> Result<()> { + self.node_start(e)?; self.state.pop(); - self.node_end(); + self.node_end() } - fn predicate_empty(&mut self, e: &BytesStart) { - let p = self - .predicate_iri_start(&self.reader.decode(e.name())) - .expect("INVALID PREDICATE IRI"); + fn predicate_empty(&mut self, e: &BytesStart) -> Result<()> { + let p = self.predicate_iri_start(&self.reader.decode(e.name()))?; + // Fail if the property is among forbidden names. + if RESERVED_PROPERTY_NAMES.matches(&p) { + let kind = self::error::ErrorKind::InvalidNodeName(p.value()); + return Err(Error::from(self::error::Error::from_kind(kind))); + } + let mut object = Vec::with_capacity(1); let mut attributes = HashMap::new(); let mut parse_type = None; @@ -916,7 +1029,7 @@ where // Extract attributes for attr in e.attributes().with_checks(true) { - let a = attr.expect("FIXME"); + let a = attr.map_err(self::error::Error::from)?; // ignore XML attributes (processed when entering scope) if a.key.starts_with(b"xml") { @@ -926,20 +1039,19 @@ where // try to extract the annotation object let k = self .scope() - .expand_attribute(&self.reader.decode(a.key)) - .expect("FIXME"); - let v = a.unescape_and_decode_value(&self.reader).expect("FIXME"); + .expand_attribute(&self.reader.decode(a.key))?; + let v = a.unescape_and_decode_value(&self.reader) + .map_err(self::error::Error::from)?; if k.matches(&rdf::resource) { - object.push(self.scope().expand_iri(&v).expect("INVALID IRI")); + object.push(self.scope().expand_iri(&v)?); } else if k.matches(&rdf::nodeID) { object.push( self.factory .borrow_mut() - .bnode(format!("o{}", v)) - .expect("FIXME"), + .bnode(format!("o{}", v))? ); } else if k.matches(&rdf::ID) { - reification = Some(self.scope().expand_id(&v).expect("FIXME")); + reification = Some(self.scope().expand_id(&v)?); } else if k.matches(&rdf::parseType) { match a.value.as_ref() { b"Resource" => parse_type = Some(&b"Resource"[..]), @@ -966,15 +1078,18 @@ where let o = match object.len() { 0 if !attributes.is_empty() => self.new_bnode(), 1 => object.last().unwrap().clone(), - 0 if attributes.is_empty() => self.scope().new_literal(String::new()).expect("FIXME"), - _ => panic!("cannot have rdf:resource and rdf:nodeID at the same time"), + 0 if attributes.is_empty() => self.scope().new_literal(String::new())?, + _ => return Err( + self::error::Error::from_kind(self::error::ErrorKind::AmbiguousSubject) + .into() + ), }; // Add the triple and all subsequent triples as attributes self.triples .push_back(Ok([s.clone(), p.clone(), o.clone()])); for (prop, value) in attributes.into_iter() { - let literal = self.scope().new_literal(value).expect("FIXME"); + let literal = self.scope().new_literal(value)?; self.triples.push_back(Ok([o.clone(), prop, literal])); } @@ -994,16 +1109,18 @@ where self.triples.push_back(Ok([id.clone(), predicate, p])); self.triples.push_back(Ok([id.clone(), obj, o])); } + + Ok(()) } - fn resource_empty(&mut self, e: &BytesStart) { + fn resource_empty(&mut self, e: &BytesStart) -> Result<()> { self.predicate_empty(e) } - fn collection_item_empty(&mut self, e: &BytesStart) { - self.collection_start(e); + fn collection_item_empty(&mut self, e: &BytesStart) -> Result<()> { + self.collection_start(e)?; self.state.pop(); - self.collection_item_end(); + self.collection_item_end() } } @@ -1018,17 +1135,22 @@ where let mut buffer = Vec::new(); loop { // First make sure to consume the queue. - if let Some(triple) = self.triples.pop_front() { - return Some(triple); + if let Some(res) = self.triples.pop_front() { + return Some(res); } // Then process the next event to maybe produce triples - match &self.reader.read_event(&mut buffer).unwrap() { - Event::Eof => return None, - Event::Start(s) => self.element_start(s), - Event::Empty(e) => self.element_empty(e), - Event::End(e) => self.element_end(e), - Event::Text(t) => self.element_text(t), - _ => (), + match self.reader.read_event(&mut buffer) { + Ok(Event::Eof) => return None, + Ok(Event::Start(s)) => self.element_start(&s), + Ok(Event::Empty(e)) => self.element_empty(&e), + Ok(Event::End(e)) => self.element_end(&e), + Ok(Event::Text(t)) => self.element_text(&t), + Ok(_) => (), + Err(e) => { + let kind = self::error::ErrorKind::XmlError(e); + let err = self::error::Error::from_kind(kind); + self.triples.push_back(Err(Error::from(err))); + } } // Finally clear the buffer if we are going to use it again. buffer.clear(); @@ -1154,7 +1276,31 @@ mod test { } macro_rules! rdf_failure { - ($(#[$attr:meta])* $suite:ident / $case:ident) => {}; + ($(#[$attr:meta])* $suite:ident / $case:ident) => { + $(#[$attr])* + #[test] + fn $case() { + let path = std::path::PathBuf::from("..") + .join("rdf-tests") + .join("rdf-xml") + .join(stringify!($suite).replace('_', "-")) + .join(stringify!($case).replace('_', "-")); + + let xmlfile = std::fs::File::open(path.with_extension("rdf")).unwrap(); + let mut xml = TestGraph::new(); + assert!( + $crate::parser::xml::Config::with_base(&format!( + "http://www.w3.org/2013/RDFXMLTests/{}/{}.rdf", + stringify!($suite).replace('_', "-"), + stringify!($case).replace('_', "-"), + )) + .unwrap() + .parse_read(xmlfile) + .in_graph(&mut xml) + .is_err() + ); + } + }; } macro_rules! nt_test { @@ -1611,13 +1757,13 @@ mod test { mod rdfms_rdf_id { use super::*; - rdf_failure!(rdfms_rdf_id / error_001); - rdf_failure!(rdfms_rdf_id / error_002); - rdf_failure!(rdfms_rdf_id / error_003); - rdf_failure!(rdfms_rdf_id / error_004); - rdf_failure!(rdfms_rdf_id / error_005); - rdf_failure!(rdfms_rdf_id / error_006); - rdf_failure!(rdfms_rdf_id / error_007); + rdf_failure!(rdfms_rdf_id / error001); + rdf_failure!(rdfms_rdf_id / error002); + rdf_failure!(rdfms_rdf_id / error003); + rdf_failure!(rdfms_rdf_id / error004); + rdf_failure!(rdfms_rdf_id / error005); + rdf_failure!(rdfms_rdf_id / error006); + rdf_failure!(rdfms_rdf_id / error007); } mod rdfms_rdf_names_use { From cee569249556c3e4d41ed76697d8377307e6d7b4 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Thu, 6 Jun 2019 17:52:13 -0700 Subject: [PATCH 39/50] Add an XML name validator using a `pest` parser --- sophia/src/ns.rs | 2 +- sophia/src/parser/xml.rs | 166 +++++++++++++++++++-------------- sophia/src/parser/xmlname.pest | 30 ++++++ 3 files changed, 129 insertions(+), 69 deletions(-) create mode 100644 sophia/src/parser/xmlname.pest diff --git a/sophia/src/ns.rs b/sophia/src/ns.rs index 3a156d6f..85579e41 100644 --- a/sophia/src/ns.rs +++ b/sophia/src/ns.rs @@ -116,7 +116,7 @@ pub mod rdf { datatype, bagID, aboutEach, - aboutEachPrefix + aboutEachPrefix ); ns_term!("http://www.w3.org/1999/02/22-rdf-syntax-ns#", type_, "type"); } diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index 76f5944e..6a273539 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -22,13 +22,13 @@ use crate::error::*; use crate::ns::rdf; use crate::ns::xsd; use crate::ns::Namespace; -use crate::term::StaticTerm; use crate::term::factory::RcTermFactory; use crate::term::factory::TermFactory; use crate::term::iri_rfc3987::is_absolute_iri; use crate::term::iri_rfc3987::is_relative_iri; use crate::term::iri_rfc3987::is_valid_iri; use crate::term::matcher::TermMatcher; +use crate::term::StaticTerm; use crate::term::Term; static RESERVED_NODE_NAMES: &'static [StaticTerm] = &[ @@ -57,6 +57,30 @@ static RESERVED_PROPERTY_NAMES: &'static [StaticTerm] = &[ rdf::aboutEachPrefix, ]; +static RESERVED_ATTRIBUTES_NAMES: &'static [StaticTerm] = &[ + rdf::li, + rdf::aboutEach, + rdf::aboutEachPrefix, + rdf::bagID, +]; + +mod xmlname { + + use pest::Parser; + + #[cfg(debug_assertions)] + const _GRAMMAR: &str = include_str!("xmlname.pest"); + + #[derive(Parser)] + #[grammar = "parser/xmlname.pest"] + struct PestXmlNameParser; + + pub fn is_valid_xmlname(n: &str) -> bool { + PestXmlNameParser::parse(Rule::Name, n).is_ok() + } +} + + // --- pub mod error { error_chain! { @@ -76,6 +100,10 @@ pub mod error { description("invalid property name") display("invalid property name: {:?}", n) } + InvalidAttribute(n: String) { + description("invalid attribute") + display("invalid attribute: {:?}", n) + } AmbiguousSubject { description("cannot have `rdf:ID`, `rdf:nodeID` and `rdf:about` at the same time") display("cannot have `rdf:ID`, `rdf:nodeID` and `rdf:about` at the same time") @@ -326,9 +354,9 @@ impl Scope { /// Add a new XML prefix to the namespace mapping. fn add_prefix(&mut self, prefix: &str, value: &str) -> Result<()> { if prefix == "_" { - Err(Error::from( - self::error::Error::from(self::error::ErrorKind::InvalidPrefix(prefix.into())) - )) + Err(Error::from(self::error::Error::from( + self::error::ErrorKind::InvalidPrefix(prefix.into()), + ))) } else { let mut f = self.factory.borrow_mut(); self.ns.insert( @@ -530,38 +558,35 @@ where scope.collection = Vec::new(); scope.li.store(1, Ordering::Relaxed); - // Update XML namespaces with those defined in the document. + // * Update XML namespaces with those defined in the document. + // * Change scope language if there is any `xml:lang` attribute + // * Fail if there is an invalid `rdf:li` attribute for attr in e.attributes().with_checks(true) { let a = attr.map_err(self::error::Error::from)?; if a.key.starts_with(b"xmlns:") { - scope - .add_prefix( - &self.reader.decode(&a.key[6..]), - &a.unescape_and_decode_value(&self.reader) - .map_err(self::error::Error::from)? - )?; + scope.add_prefix( + &self.reader.decode(&a.key[6..]), + &a.unescape_and_decode_value(&self.reader) + .map_err(self::error::Error::from)?, + )?; } else if a.key == b"xmlns" { - scope.set_default(&a.unescape_and_decode_value(&self.reader) - .map_err(self::error::Error::from)?)?; + scope.set_default( + &a.unescape_and_decode_value(&self.reader) + .map_err(self::error::Error::from)?, + )?; } else if a.key == b"xml:base" { - scope.set_base(&a.unescape_and_decode_value(&self.reader) - .map_err(self::error::Error::from)?)?; - } - } - - // Add current lang to scope or copy last one - for attr in e.attributes().with_checks(true) { - let a = attr.map_err(self::error::Error::from)?; - if a.key == b"xml:lang" { + scope.set_base( + &a.unescape_and_decode_value(&self.reader) + .map_err(self::error::Error::from)?, + )?; + } else if a.key == b"xml:lang" { scope.lang = if a.value.is_empty() { None } else { - let v = &a.unescape_and_decode_value(&self.reader) + let v = &a + .unescape_and_decode_value(&self.reader) .map_err(|e| self::error::Error::from(e))?; - self.factory - .borrow_mut() - .get_term_data(v) - .into() + self.factory.borrow_mut().get_term_data(v).into() }; } } @@ -681,10 +706,9 @@ where } // try to extract the subject annotation - let k = self - .scope() - .expand_attribute(&self.reader.decode(a.key))?; - let v = a.unescape_and_decode_value(&self.reader) + let k = self.scope().expand_attribute(&self.reader.decode(a.key))?; + let v = a + .unescape_and_decode_value(&self.reader) .map_err(self::error::Error::from)?; if k.matches(&rdf::about) { @@ -692,13 +716,12 @@ where } else if k.matches(&rdf::ID) { subject.push(self.scope().expand_id(&v)?); } else if k.matches(&rdf::nodeID) { - subject.push( - self.factory - .borrow_mut() - .bnode(&format!("o{}", v))?, - ); + subject.push(self.factory.borrow_mut().bnode(&format!("o{}", v))?); } else if k.matches(&rdf::type_) { properties.insert(k, self.scope().expand_iri(&v)?); + } else if RESERVED_ATTRIBUTES_NAMES.matches(&k) { + let kind = self::error::ErrorKind::InvalidAttribute(k.value()); + return Err(Error::from(self::error::Error::from_kind(kind))); } else { properties.insert(k, self.scope().new_literal(v)?); } @@ -707,8 +730,7 @@ where // Get subject and add it to the current nested stack if subject.len() > 1 { return Err( - self::error::Error::from_kind(self::error::ErrorKind::AmbiguousSubject) - .into() + self::error::Error::from_kind(self::error::ErrorKind::AmbiguousSubject).into(), ); } let s: Term<_> = subject.pop().unwrap_or_else(|| self.new_bnode()); @@ -758,20 +780,21 @@ where continue; } - let k = self - .scope() - .expand_attribute(&self.reader.decode(a.key))?; + let k = self.scope().expand_attribute(&self.reader.decode(a.key))?; if k.matches(&rdf::datatype) { - let v = a.unescape_and_decode_value(&self.reader) + let v = a + .unescape_and_decode_value(&self.reader) .map_err(self::error::Error::from)?; self.scope_mut().set_datatype(&v)?; } else if k.matches(&rdf::ID) { - let v = a.unescape_and_decode_value(&self.reader) + let v = a + .unescape_and_decode_value(&self.reader) .map_err(self::error::Error::from)?; object.push(self.scope().expand_id(&v)?); next_state = ParsingState::Res; } else if k.matches(&rdf::resource) { - let v = a.unescape_and_decode_value(&self.reader) + let v = a + .unescape_and_decode_value(&self.reader) .map_err(self::error::Error::from)?; object.push(self.scope().expand_iri(&v)?); next_state = ParsingState::Predicate; @@ -793,8 +816,12 @@ where } other => panic!("invalid parseType: {:?}", other), } + } else if RESERVED_ATTRIBUTES_NAMES.matches(&k) { + let kind = self::error::ErrorKind::InvalidAttribute(k.value()); + return Err(Error::from(self::error::Error::from_kind(kind))); } else { - let v = a.unescape_and_decode_value(&self.reader) + let v = a + .unescape_and_decode_value(&self.reader) .map_err(self::error::Error::from)?; attributes.insert(k, self.scope().new_literal(v)?); next_state = ParsingState::Resource; @@ -807,10 +834,11 @@ where 0 if !attributes.is_empty() => Some(self.new_bnode()), 0 if attributes.is_empty() => None, 1 => Some(object.last().unwrap().clone()), - _ => return Err( - self::error::Error::from_kind(self::error::ErrorKind::AmbiguousSubject) - .into() - ) + _ => { + return Err( + self::error::Error::from_kind(self::error::ErrorKind::AmbiguousSubject).into(), + ) + } }; // Make the predicate a resource element if an objec tis present. @@ -978,7 +1006,9 @@ where if self.scope().text.is_some() { match e.unescape_and_decode(&self.reader) { Ok(text) => self.scope_mut().set_text(text), - Err(e) => self.triples.push_back(Err(self::error::Error::from(e).into())), + Err(e) => self + .triples + .push_back(Err(self::error::Error::from(e).into())), } } } @@ -986,8 +1016,6 @@ where // --- Empty elements ---------------------------------------------------- fn element_empty(&mut self, e: &BytesStart) { - - if let Err(e) = self.enter_scope(e) { self.triples.push_back(Err(e)); } @@ -1021,7 +1049,7 @@ where let kind = self::error::ErrorKind::InvalidNodeName(p.value()); return Err(Error::from(self::error::Error::from_kind(kind))); } - + let mut object = Vec::with_capacity(1); let mut attributes = HashMap::new(); let mut parse_type = None; @@ -1037,19 +1065,14 @@ where } // try to extract the annotation object - let k = self - .scope() - .expand_attribute(&self.reader.decode(a.key))?; - let v = a.unescape_and_decode_value(&self.reader) + let k = self.scope().expand_attribute(&self.reader.decode(a.key))?; + let v = a + .unescape_and_decode_value(&self.reader) .map_err(self::error::Error::from)?; if k.matches(&rdf::resource) { object.push(self.scope().expand_iri(&v)?); } else if k.matches(&rdf::nodeID) { - object.push( - self.factory - .borrow_mut() - .bnode(format!("o{}", v))? - ); + object.push(self.factory.borrow_mut().bnode(format!("o{}", v))?); } else if k.matches(&rdf::ID) { reification = Some(self.scope().expand_id(&v)?); } else if k.matches(&rdf::parseType) { @@ -1079,10 +1102,11 @@ where 0 if !attributes.is_empty() => self.new_bnode(), 1 => object.last().unwrap().clone(), 0 if attributes.is_empty() => self.scope().new_literal(String::new())?, - _ => return Err( - self::error::Error::from_kind(self::error::ErrorKind::AmbiguousSubject) - .into() - ), + _ => { + return Err( + self::error::Error::from_kind(self::error::ErrorKind::AmbiguousSubject).into(), + ) + } }; // Add the triple and all subsequent triples as attributes @@ -1762,8 +1786,14 @@ mod test { rdf_failure!(rdfms_rdf_id / error003); rdf_failure!(rdfms_rdf_id / error004); rdf_failure!(rdfms_rdf_id / error005); - rdf_failure!(rdfms_rdf_id / error006); - rdf_failure!(rdfms_rdf_id / error007); + rdf_failure!( + rdfms_rdf_id + / error006 + ); + rdf_failure!( + rdfms_rdf_id + / error007 + ); } mod rdfms_rdf_names_use { diff --git a/sophia/src/parser/xmlname.pest b/sophia/src/parser/xmlname.pest new file mode 100644 index 00000000..43a10275 --- /dev/null +++ b/sophia/src/parser/xmlname.pest @@ -0,0 +1,30 @@ +NameStartChar = { + ":" + | 'A'..'Z' + | "_" + | 'a'..'z' + | '\u{C0}'..'\u{D6}' + | '\u{D8}'..'\u{F6}' + | '\u{F8}'..'\u{2FF}' + | '\u{370}'..'\u{37D}' + | '\u{37F}'..'\u{1FFF}' + | '\u{200C}'..'\u{200D}' + | '\u{2070}'..'\u{218F}' + | '\u{2C00}'..'\u{2FEF}' + | '\u{3001}'..'\u{D7FF}' + | '\u{F900}'..'\u{FDCF}' + | '\u{FDF0}'..'\u{FFFD}' + | '\u{10000}'..'\u{EFFFF}' +} + +NameChar = { + NameStartChar + | "-" + | "." + | '0'..'9' + | "\u{B7}" + | '\u{0300}'..'\u{036F}' + | '\u{203F}'..'\u{2040}' +} + +Name = { NameStartChar ~ NameChar* } From 59d1dba28515a83a424fba8e3f3407ad9ca89ee1 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Thu, 6 Jun 2019 18:25:48 -0700 Subject: [PATCH 40/50] Add XML name validation for `ID` and `nodeID` attributes of node elements --- sophia/src/parser/xml.rs | 60 ++++++++++++++++++++++++---------- sophia/src/parser/xmlname.pest | 5 ++- 2 files changed, 45 insertions(+), 20 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index 6a273539..a09e38a9 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -78,6 +78,16 @@ mod xmlname { pub fn is_valid_xmlname(n: &str) -> bool { PestXmlNameParser::parse(Rule::Name, n).is_ok() } + + pub fn validate(n: &str) -> Result<&str, super::error::Error> { + if is_valid_xmlname(n) { + Ok(n) + } else { + Err(super::error::Error::from_kind( + super::error::ErrorKind::InvalidXmlName(n.to_string()) + )) + } + } } @@ -104,6 +114,10 @@ pub mod error { description("invalid attribute") display("invalid attribute: {:?}", n) } + InvalidXmlName(n: String) { + description("invalid XML name") + display("invalid XML name: {:?}", n) + } AmbiguousSubject { description("cannot have `rdf:ID`, `rdf:nodeID` and `rdf:about` at the same time") display("cannot have `rdf:ID`, `rdf:nodeID` and `rdf:about` at the same time") @@ -448,7 +462,11 @@ impl Scope { /// This also uses `xml:base` to expand local resources, and prefixes /// identifiers in the document with a `#` if needed. fn expand_id(&self, id: &str) -> Result> { - if id.starts_with("#") { + if !xmlname::is_valid_xmlname(id) { + return Err(Error::from(self::error::Error::from_kind( + self::error::ErrorKind::InvalidXmlName(id.to_string()) + ))) + } else if id.starts_with("#") { self.expand_iri(id) } else { self.expand_iri(&format!("#{}", id)) @@ -603,7 +621,7 @@ where // --- - // Create a new bnode term (using `n` prefix). + /// Create a new bnode term (using `n` prefix). fn new_bnode(&self) -> Term { self.factory .borrow_mut() @@ -611,7 +629,18 @@ where .unwrap() } - // Create a new predicate IRI from an XML name (or a RDF metasyntactic element) + /// Rename a bnode using the `nodeID` in the document (using `o` prefix) + fn rename_bnode(&self, id: &str) -> Result> { + if xmlname::is_valid_xmlname(id) { + self.factory.borrow_mut().bnode(&format!("o{}", id)) + } else { + Err(Error::from(self::error::Error::from_kind( + self::error::ErrorKind::InvalidXmlName(id.to_string()) + ))) + } + } + + /// Create a new predicate IRI from an XML name (or a RDF metasyntactic element) fn predicate_iri_start(&self, name: &str) -> Result> { let p = self.scope().expand_attribute(name)?; if p.matches(&rdf::li) { @@ -621,7 +650,7 @@ where } } - // Retrieve a predicate IRI from an XML name + /// Retrieve a predicate IRI from an XML name fn predicate_iri_end(&self, name: &str) -> Result> { let p = self.scope().expand_attribute(name)?; if p.matches(&rdf::li) { @@ -716,7 +745,7 @@ where } else if k.matches(&rdf::ID) { subject.push(self.scope().expand_id(&v)?); } else if k.matches(&rdf::nodeID) { - subject.push(self.factory.borrow_mut().bnode(&format!("o{}", v))?); + subject.push(self.rename_bnode(&v)?); } else if k.matches(&rdf::type_) { properties.insert(k, self.scope().expand_iri(&v)?); } else if RESERVED_ATTRIBUTES_NAMES.matches(&k) { @@ -1072,7 +1101,7 @@ where if k.matches(&rdf::resource) { object.push(self.scope().expand_iri(&v)?); } else if k.matches(&rdf::nodeID) { - object.push(self.factory.borrow_mut().bnode(format!("o{}", v))?); + object.push(self.rename_bnode(&v)?); } else if k.matches(&rdf::ID) { reification = Some(self.scope().expand_id(&v)?); } else if k.matches(&rdf::parseType) { @@ -1081,6 +1110,9 @@ where b"Literal" => parse_type = Some(&b"Literal"[..]), other => panic!("invalid parseType: {:?}", other), }; + } else if RESERVED_ATTRIBUTES_NAMES.matches(&k) { + let kind = self::error::ErrorKind::InvalidAttribute(k.value()); + return Err(Error::from(self::error::Error::from_kind(kind))); } else { attributes.insert(k, v); } @@ -1730,9 +1762,9 @@ mod test { mod rdfms_empty_property_elements { use super::*; - rdf_failure!(rdfms_empty_property_elements / error001); - rdf_failure!(rdfms_empty_property_elements / error002); - rdf_failure!(rdfms_empty_property_elements / error003); + // rdf_failure!(rdfms_empty_property_elements / error001); + // rdf_failure!(rdfms_empty_property_elements / error002); + // rdf_failure!(rdfms_empty_property_elements / error003); rdf_test!(rdfms_empty_property_elements / test001); rdf_test!(rdfms_empty_property_elements / test002); @@ -1786,14 +1818,8 @@ mod test { rdf_failure!(rdfms_rdf_id / error003); rdf_failure!(rdfms_rdf_id / error004); rdf_failure!(rdfms_rdf_id / error005); - rdf_failure!( - rdfms_rdf_id - / error006 - ); - rdf_failure!( - rdfms_rdf_id - / error007 - ); + rdf_failure!(rdfms_rdf_id / error006); + rdf_failure!(rdfms_rdf_id / error007); } mod rdfms_rdf_names_use { diff --git a/sophia/src/parser/xmlname.pest b/sophia/src/parser/xmlname.pest index 43a10275..313e0720 100644 --- a/sophia/src/parser/xmlname.pest +++ b/sophia/src/parser/xmlname.pest @@ -1,6 +1,5 @@ NameStartChar = { - ":" - | 'A'..'Z' + 'A'..'Z' | "_" | 'a'..'z' | '\u{C0}'..'\u{D6}' @@ -27,4 +26,4 @@ NameChar = { | '\u{203F}'..'\u{2040}' } -Name = { NameStartChar ~ NameChar* } +Name = { NameStartChar ~ NameChar* ~ EOI } From 8360b277beeb17b57a9c03bdb798a07b9c702718 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Thu, 6 Jun 2019 18:40:11 -0700 Subject: [PATCH 41/50] Make sure `ID` values are unique (after expansion) in a RDF/XML document --- sophia/src/parser/xml.rs | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index a09e38a9..9b93244e 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -2,6 +2,7 @@ use std::cell::RefCell; use std::collections::HashMap; +use std::collections::HashSet; use std::collections::LinkedList; use std::fmt::Debug; use std::io::{BufRead, BufReader, Read}; @@ -102,6 +103,10 @@ pub mod error { description("xml parser failed") display("xml parser failed: {:?}", e) } + DuplicateId(id: String) { + description("duplicate ID") + display("duplicate ID: {:?}", id) + } InvalidNodeName(n: String) { description("invalid property name") display("invalid property name: {:?}", n) @@ -531,6 +536,9 @@ struct XmlParser { // bnodes: AtomicUsize, + // + ids: HashSet>, + /// The current state of the parser. state: Vec, } @@ -640,6 +648,17 @@ where } } + /// Check the given `ID` is unique. + fn check_unique_id(&mut self, id: Term) -> Result> { + if self.ids.contains(&id) { + let kind = self::error::ErrorKind::DuplicateId(id.value()); + Err(Error::from(self::error::Error::from_kind(kind))) + } else { + self.ids.insert(id.clone()); + Ok(id) + } + } + /// Create a new predicate IRI from an XML name (or a RDF metasyntactic element) fn predicate_iri_start(&self, name: &str) -> Result> { let p = self.scope().expand_attribute(name)?; @@ -673,6 +692,7 @@ where factory: factory, bnodes: AtomicUsize::new(0), state: vec![ParsingState::Node], + ids: HashSet::new(), } } @@ -743,7 +763,8 @@ where if k.matches(&rdf::about) { subject.push(self.scope().expand_iri(&v)?); } else if k.matches(&rdf::ID) { - subject.push(self.scope().expand_id(&v)?); + let id = self.scope().expand_id(&v)?; + subject.push(self.check_unique_id(id)?); } else if k.matches(&rdf::nodeID) { subject.push(self.rename_bnode(&v)?); } else if k.matches(&rdf::type_) { @@ -819,7 +840,8 @@ where let v = a .unescape_and_decode_value(&self.reader) .map_err(self::error::Error::from)?; - object.push(self.scope().expand_id(&v)?); + let id = self.scope().expand_id(&v)?; + object.push(self.check_unique_id(id)?); next_state = ParsingState::Res; } else if k.matches(&rdf::resource) { let v = a @@ -1103,7 +1125,8 @@ where } else if k.matches(&rdf::nodeID) { object.push(self.rename_bnode(&v)?); } else if k.matches(&rdf::ID) { - reification = Some(self.scope().expand_id(&v)?); + let id = self.scope().expand_id(&v)?; + reification = Some(self.check_unique_id(id)?); } else if k.matches(&rdf::parseType) { match a.value.as_ref() { b"Resource" => parse_type = Some(&b"Resource"[..]), @@ -1762,9 +1785,9 @@ mod test { mod rdfms_empty_property_elements { use super::*; - // rdf_failure!(rdfms_empty_property_elements / error001); - // rdf_failure!(rdfms_empty_property_elements / error002); - // rdf_failure!(rdfms_empty_property_elements / error003); + rdf_failure!(#[ignore] rdfms_empty_property_elements / error001); + rdf_failure!(#[ignore] rdfms_empty_property_elements / error002); + rdf_failure!(#[ignore] rdfms_empty_property_elements / error003); rdf_test!(rdfms_empty_property_elements / test001); rdf_test!(rdfms_empty_property_elements / test002); From 0b0b0d2d9c52c68cfb4c897c4debb23c615a5e14 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Thu, 6 Jun 2019 18:48:13 -0700 Subject: [PATCH 42/50] Detect invalid use of `parseType=Literal` in XML document --- sophia/src/parser/xml.rs | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index 9b93244e..2f64ab93 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -123,6 +123,10 @@ pub mod error { description("invalid XML name") display("invalid XML name: {:?}", n) } + InvalidParseType(ty: String) { + description("invalid `parseType` value in this context") + display("cannot use `parseType={}` in this context", ty) + } AmbiguousSubject { description("cannot have `rdf:ID`, `rdf:nodeID` and `rdf:about` at the same time") display("cannot have `rdf:ID`, `rdf:nodeID` and `rdf:about` at the same time") @@ -1146,9 +1150,14 @@ where if parse_type == Some(b"Resource") && object.is_empty() { object.push(self.new_bnode()); } else if parse_type == Some(b"Literal") { - let xmlliteral = self.factory.borrow_mut().copy(&rdf::XMLLiteral); - let mut scope = self.scope_mut(); - scope.datatype = Some(xmlliteral); + if object.is_empty() && attributes.is_empty() { + let xmlliteral = self.factory.borrow_mut().copy(&rdf::XMLLiteral); + let mut scope = self.scope_mut(); + scope.datatype = Some(xmlliteral); + } else { + let kind = self::error::ErrorKind::InvalidParseType("Literal".to_string()); + return Err(Error::from(self::error::Error::from_kind(kind))); + } } // Extract subjet and object of the triple @@ -1785,9 +1794,9 @@ mod test { mod rdfms_empty_property_elements { use super::*; - rdf_failure!(#[ignore] rdfms_empty_property_elements / error001); - rdf_failure!(#[ignore] rdfms_empty_property_elements / error002); - rdf_failure!(#[ignore] rdfms_empty_property_elements / error003); + rdf_failure!(rdfms_empty_property_elements / error001); + rdf_failure!(rdfms_empty_property_elements / error002); + rdf_failure!(rdfms_empty_property_elements / error003); rdf_test!(rdfms_empty_property_elements / test001); rdf_test!(rdfms_empty_property_elements / test002); From 2ccd8b3e3d22b344eca53650c2da0272fb32b7e6 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Thu, 6 Jun 2019 20:33:09 -0700 Subject: [PATCH 43/50] Fudge-fix inconsistent percent-encoding of IRI when parsing RDF/XML docs --- sophia/src/parser/xml.rs | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index 2f64ab93..66be4680 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -446,23 +446,36 @@ impl Scope { /// This uses `xml:base` to expand local resources, and does nothing in /// case the IRI is already in expanded form. fn expand_iri(&self, iri: &str) -> Result> { - fn dec(x: &str) -> std::borrow::Cow { - url::percent_encoding::percent_decode(x.as_bytes()) - .decode_utf8() - .unwrap() - } + + let mut factory = self.factory.borrow_mut(); if is_relative_iri(iri) { + + // NB: We should not be percent-encoding, but `url::Url::parse` + // does it anyway: as a fudge, we percent-decode any input that + // contained non-ASCII characters back. This may cause strange + // behaviour with URLs that contain a mix of percent-encoded and + // raw Unicode characters, but this is the best we can do without + // reimplementing the `url` crate from scratch. + let ascii = iri.chars().all(|c| c.is_ascii()); + + fn decode(s: &str) -> std::borrow::Cow { + url::percent_encoding::percent_decode(s.as_bytes()) + .decode_utf8() + .unwrap() + } + if let Some(url) = &self.base { match url.join(iri) { - Ok(u) => self.factory.borrow_mut().iri(&dec(u.as_str())), - Err(e) => bail!(ErrorKind::InvalidIri(String::from(iri))), + Ok(ref u) if ascii => factory.iri(u), + Ok(ref u) => factory.iri(decode(u.as_ref())), + Err(ref e) => bail!(ErrorKind::InvalidIri(String::from(iri))), } } else { panic!("NO BASE IRI") } } else { - self.factory.borrow_mut().iri(&dec(iri)) + factory.iri(iri) } } @@ -1725,11 +1738,7 @@ mod test { use super::*; rdf_test!(rdf_charmod_uris / test001); - rdf_test!( - #[ignore] - rdf_charmod_uris - / test002 - ); + rdf_test!(rdf_charmod_uris / test002); } mod rdf_containers_syntax_vs_schema { From c7ec2851e003a0c430634b90d91cd21b69fbcf94 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Thu, 6 Jun 2019 21:42:08 -0700 Subject: [PATCH 44/50] Refactor `::parser::xml` to allow reusing the same XML buffer --- sophia/src/parser/xml.rs | 86 +++++++++++++++++++++++++--------------- 1 file changed, 54 insertions(+), 32 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index 66be4680..bfc5d136 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -27,11 +27,12 @@ use crate::term::factory::RcTermFactory; use crate::term::factory::TermFactory; use crate::term::iri_rfc3987::is_absolute_iri; use crate::term::iri_rfc3987::is_relative_iri; -use crate::term::iri_rfc3987::is_valid_iri; use crate::term::matcher::TermMatcher; use crate::term::StaticTerm; use crate::term::Term; +const DEFAULT_BUFFER_SIZE: usize = 8 * 1024; + static RESERVED_NODE_NAMES: &'static [StaticTerm] = &[ rdf::RDF, rdf::ID, @@ -79,16 +80,6 @@ mod xmlname { pub fn is_valid_xmlname(n: &str) -> bool { PestXmlNameParser::parse(Rule::Name, n).is_ok() } - - pub fn validate(n: &str) -> Result<&str, super::error::Error> { - if is_valid_xmlname(n) { - Ok(n) - } else { - Err(super::error::Error::from_kind( - super::error::ErrorKind::InvalidXmlName(n.to_string()) - )) - } - } } @@ -289,7 +280,7 @@ impl From> for XmlReader { Self { inner: r, event: None, - buffer: Vec::new(), + buffer: Vec::with_capacity(DEFAULT_BUFFER_SIZE), } } } @@ -469,13 +460,15 @@ impl Scope { match url.join(iri) { Ok(ref u) if ascii => factory.iri(u), Ok(ref u) => factory.iri(decode(u.as_ref())), - Err(ref e) => bail!(ErrorKind::InvalidIri(String::from(iri))), + Err(_) => bail!(ErrorKind::InvalidIri(String::from(iri))), } } else { panic!("NO BASE IRI") } - } else { + } else if is_absolute_iri(iri) { factory.iri(iri) + } else { + bail!(ErrorKind::InvalidIri(String::from(iri))) } } @@ -533,8 +526,36 @@ impl Default for Scope { // --- -/// An XML parser supporting any term factory as a backend. struct XmlParser { + handler: XmlHandler, + buffer: Vec, +} + +impl XmlParser +where + B: BufRead, + F: TermFactory + Clone + Default + Debug, + ::TermData: Debug, +{ + /// Create a new `XmlParser` from the given `quick_xml::Reader`. + fn new(reader: Reader) -> Self { + Self { + handler: XmlHandler::new(reader), + buffer: Vec::with_capacity(DEFAULT_BUFFER_SIZE), + } + } + + /// Create a new `XmlParser` using the given URL as the top-level `xml:base`. + fn with_base(reader: Reader, base: Url) -> Self { + Self { + handler: XmlHandler::with_base(reader, base), + buffer: Vec::with_capacity(DEFAULT_BUFFER_SIZE), + } + } +} + +/// An XML parser supporting any term factory as a backend. +struct XmlHandler { /// The underlying XML reader. reader: XmlReader, @@ -560,7 +581,7 @@ struct XmlParser { state: Vec, } -impl XmlParser +impl XmlHandler where B: BufRead, F: TermFactory + Clone + Default + Debug, @@ -698,7 +719,7 @@ where // --- - /// Create a new `XmlParser` from the given `quick_xml::Reader`. + /// Create a new `XmlHandler` from the given `quick_xml::Reader`. fn new(reader: Reader) -> Self { let factory: Rc> = Default::default(); Self { @@ -713,7 +734,7 @@ where } } - /// Create a new `XmlParser` using the given URL as the top-level `xml:base`. + /// Create a new `XmlHandler` using the given URL as the top-level `xml:base`. fn with_base(reader: Reader, base: Url) -> Self { let mut parser = Self::new(reader); let mut scope = parser.scope_mut(); @@ -896,8 +917,7 @@ where } } - // Extract subjet and object of the triple - let s = self.parents.last().unwrap().clone(); + // Extract object of the triple let o = match object.len() { 0 if !attributes.is_empty() => Some(self.new_bnode()), 0 if attributes.is_empty() => None, @@ -1233,28 +1253,31 @@ where { type Item = Result<[Term; 3]>; fn next(&mut self) -> Option { - let mut buffer = Vec::new(); + loop { + // First make sure to consume the queue. - if let Some(res) = self.triples.pop_front() { + if let Some(res) = self.handler.triples.pop_front() { return Some(res); } + + // + self.buffer.clear(); + // Then process the next event to maybe produce triples - match self.reader.read_event(&mut buffer) { + match self.handler.reader.read_event(&mut self.buffer) { Ok(Event::Eof) => return None, - Ok(Event::Start(s)) => self.element_start(&s), - Ok(Event::Empty(e)) => self.element_empty(&e), - Ok(Event::End(e)) => self.element_end(&e), - Ok(Event::Text(t)) => self.element_text(&t), + Ok(Event::Start(s)) => self.handler.element_start(&s), + Ok(Event::Empty(e)) => self.handler.element_empty(&e), + Ok(Event::End(e)) => self.handler.element_end(&e), + Ok(Event::Text(t)) => self.handler.element_text(&t), Ok(_) => (), Err(e) => { let kind = self::error::ErrorKind::XmlError(e); let err = self::error::Error::from_kind(kind); - self.triples.push_back(Err(Error::from(err))); + self.handler.triples.push_back(Err(Error::from(err))); } } - // Finally clear the buffer if we are going to use it again. - buffer.clear(); } } } @@ -1262,6 +1285,7 @@ where // --- #[cfg(test)] +#[allow(non_snake_case)] mod test { use std::fmt::Debug; @@ -1272,8 +1296,6 @@ mod test { use crate::graph::inmem::TermIndexMapU; use crate::graph::Graph; use crate::term::factory::RcTermFactory; - use crate::term::IriData; - use crate::term::StaticTerm; use crate::term::Term; use crate::triple::stream::TripleSource; use crate::triple::Triple; From 6882ef99e2b37f245228e71bb2a0ccd931e9d805 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Thu, 6 Jun 2019 22:01:06 -0700 Subject: [PATCH 45/50] Remove all panics from RDF/XML parser and use proper error chaining --- sophia/src/parser/xml.rs | 55 +++++++++++++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 9 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index bfc5d136..2c936855 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -98,6 +98,10 @@ pub mod error { description("duplicate ID") display("duplicate ID: {:?}", id) } + UnknownNamespace(ns: String) { + description("unknown namespace") + display("unknown or undeclared namespace: {:?}", ns) + } InvalidNodeName(n: String) { description("invalid property name") display("invalid property name: {:?}", n) @@ -122,6 +126,14 @@ pub mod error { description("cannot have `rdf:ID`, `rdf:nodeID` and `rdf:about` at the same time") display("cannot have `rdf:ID`, `rdf:nodeID` and `rdf:about` at the same time") } + UnexpectedEvent(exp: String, found: String) { + description("unexpected XML event") + display("unexpected XML event: expected {}, found {}", exp, found) + } + NoBaseIri(iri: String) { + description("document does not define a base IRI") + display("base IRI needed to expand: {:?}", iri) + } InvalidPrefix(p: String) { description("invalid prefix") display("{:?} cannot be used as a prefix", p) @@ -423,12 +435,14 @@ impl Scope { if let Some(ns) = self.ns.get(prefix) { ns.get(self.factory.borrow_mut().get_term_data(reference)) } else { - panic!("unknown namespace: {}", prefix) + let kind = self::error::ErrorKind::UnknownNamespace(prefix.to_string()); + Err(Error::from(self::error::Error::from_kind(kind))) } } else if let Some(ns) = &self.default { ns.get(self.factory.borrow_mut().get_term_data(attr)) } else { - panic!("missing prefix: {}", attr) + let kind = self::error::ErrorKind::UnknownNamespace("_".to_string()); + Err(Error::from(self::error::Error::from_kind(kind))) } } @@ -463,7 +477,8 @@ impl Scope { Err(_) => bail!(ErrorKind::InvalidIri(String::from(iri))), } } else { - panic!("NO BASE IRI") + let kind = self::error::ErrorKind::NoBaseIri(iri.to_string()); + Err(Error::from(self::error::Error::from_kind(kind))) } } else if is_absolute_iri(iri) { factory.iri(iri) @@ -503,7 +518,8 @@ impl Scope { let mut f = self.factory.borrow_mut(); ns.get(f.get_term_data(&format!("_{}", self.li.fetch_add(1, Ordering::Relaxed)))) } else { - panic!("undeclared `rdf` prefix !") + let kind = self::error::ErrorKind::UnknownNamespace("rdf".to_string()); + Err(Error::from(self::error::Error::from_kind(kind))) } } @@ -513,7 +529,8 @@ impl Scope { let mut f = self.factory.borrow_mut(); ns.get(f.get_term_data(&format!("_{}", self.li.load(Ordering::Relaxed) - 1))) } else { - panic!("undeclared `rdf` prefix !") + let kind = self::error::ErrorKind::UnknownNamespace("rdf".to_string()); + Err(Error::from(self::error::Error::from_kind(kind))) } } } @@ -755,8 +772,14 @@ where ParsingState::Resource => self.predicate_start(e), ParsingState::Collection => self.collection_start(e), ParsingState::CollectionItem => self.collection_item_start(e), - ParsingState::Res => panic!("expecting text, not new element"), ParsingState::Literal => unimplemented!("entering element as literal"), + ParsingState::Res => { + let kind = self::error::ErrorKind::UnexpectedEvent( + format!("<{}>", self.reader.decode(e.name())), + "text".to_string() + ); + Err(Error::from(self::error::Error::from_kind(kind))) + }, } { self.triples.push_back(Err(e)); } @@ -903,7 +926,11 @@ where self.scope_mut().set_datatype(&rdf::XMLLiteral.value())?; next_state = ParsingState::Literal; } - other => panic!("invalid parseType: {:?}", other), + other => { + let ty = String::from_utf8_lossy(other).to_string(); + let kind = self::error::ErrorKind::InvalidParseType(ty); + return Err(Error::from(self::error::Error::from_kind(kind))); + } } } else if RESERVED_ATTRIBUTES_NAMES.matches(&k) { let kind = self::error::ErrorKind::InvalidAttribute(k.value()); @@ -1114,8 +1141,14 @@ where ParsingState::Resource => self.resource_empty(e), ParsingState::Collection => self.collection_item_empty(e), ParsingState::CollectionItem => unreachable!(), - ParsingState::Res => panic!("expected end element, not empty"), ParsingState::Literal => unimplemented!("empty element as literal"), + ParsingState::Res => { + let kind = self::error::ErrorKind::UnexpectedEvent( + format!("<{}/>", self.reader.decode(e.name())), + "end".to_string() + ); + Err(Error::from(self::error::Error::from_kind(kind))) + } } { self.triples.push_back(Err(e)); } @@ -1168,7 +1201,11 @@ where match a.value.as_ref() { b"Resource" => parse_type = Some(&b"Resource"[..]), b"Literal" => parse_type = Some(&b"Literal"[..]), - other => panic!("invalid parseType: {:?}", other), + other => { + let ty = String::from_utf8_lossy(other).to_string(); + let kind = self::error::ErrorKind::InvalidParseType(ty); + return Err(Error::from(self::error::Error::from_kind(kind))); + } }; } else if RESERVED_ATTRIBUTES_NAMES.matches(&k) { let kind = self::error::ErrorKind::InvalidAttribute(k.value()); From 768093392e9f385fa884e1696d3ebd891ea1c6c0 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Thu, 6 Jun 2019 22:06:04 -0700 Subject: [PATCH 46/50] Remove `Debug` constraint on term data in `::parser::xml` --- sophia/src/parser/xml.rs | 45 ++++++++++++++-------------------------- 1 file changed, 16 insertions(+), 29 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index 2c936855..6e0ea41a 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -4,7 +4,6 @@ use std::cell::RefCell; use std::collections::HashMap; use std::collections::HashSet; use std::collections::LinkedList; -use std::fmt::Debug; use std::io::{BufRead, BufReader, Read}; use std::ops::Deref; use std::rc::Rc; @@ -59,12 +58,8 @@ static RESERVED_PROPERTY_NAMES: &'static [StaticTerm] = &[ rdf::aboutEachPrefix, ]; -static RESERVED_ATTRIBUTES_NAMES: &'static [StaticTerm] = &[ - rdf::li, - rdf::aboutEach, - rdf::aboutEachPrefix, - rdf::bagID, -]; +static RESERVED_ATTRIBUTES_NAMES: &'static [StaticTerm] = + &[rdf::li, rdf::aboutEach, rdf::aboutEachPrefix, rdf::bagID]; mod xmlname { @@ -82,7 +77,6 @@ mod xmlname { } } - // --- pub mod error { error_chain! { @@ -451,11 +445,8 @@ impl Scope { /// This uses `xml:base` to expand local resources, and does nothing in /// case the IRI is already in expanded form. fn expand_iri(&self, iri: &str) -> Result> { - - let mut factory = self.factory.borrow_mut(); if is_relative_iri(iri) { - // NB: We should not be percent-encoding, but `url::Url::parse` // does it anyway: as a fudge, we percent-decode any input that // contained non-ASCII characters back. This may cause strange @@ -467,7 +458,7 @@ impl Scope { fn decode(s: &str) -> std::borrow::Cow { url::percent_encoding::percent_decode(s.as_bytes()) .decode_utf8() - .unwrap() + .expect("always OK since validated with `is_relative_iri`") } if let Some(url) = &self.base { @@ -494,8 +485,8 @@ impl Scope { fn expand_id(&self, id: &str) -> Result> { if !xmlname::is_valid_xmlname(id) { return Err(Error::from(self::error::Error::from_kind( - self::error::ErrorKind::InvalidXmlName(id.to_string()) - ))) + self::error::ErrorKind::InvalidXmlName(id.to_string()), + ))); } else if id.starts_with("#") { self.expand_iri(id) } else { @@ -551,8 +542,7 @@ struct XmlParser { impl XmlParser where B: BufRead, - F: TermFactory + Clone + Default + Debug, - ::TermData: Debug, + F: TermFactory + Clone + Default, { /// Create a new `XmlParser` from the given `quick_xml::Reader`. fn new(reader: Reader) -> Self { @@ -601,19 +591,19 @@ struct XmlHandler { impl XmlHandler where B: BufRead, - F: TermFactory + Clone + Default + Debug, - ::TermData: Debug, + F: TermFactory + Clone + Default, { // --- /// Get a reference to the current scope. fn scope(&self) -> &Scope { - self.scopes.last().unwrap() + &self.scopes[self.scopes.len() - 1] } /// Get a mutable reference to the current scope. fn scope_mut(&mut self) -> &mut Scope { - self.scopes.last_mut().unwrap() + let l = self.scopes.len(); + &mut self.scopes[l - 1] } /// Get a reference to the parent scope. @@ -689,7 +679,7 @@ where self.factory .borrow_mut() .bnode(&format!("n{}", self.bnodes.fetch_add(1, Ordering::Relaxed))) - .unwrap() + .expect("always produces a correct BNode") } /// Rename a bnode using the `nodeID` in the document (using `o` prefix) @@ -698,7 +688,7 @@ where self.factory.borrow_mut().bnode(&format!("o{}", id)) } else { Err(Error::from(self::error::Error::from_kind( - self::error::ErrorKind::InvalidXmlName(id.to_string()) + self::error::ErrorKind::InvalidXmlName(id.to_string()), ))) } } @@ -776,10 +766,10 @@ where ParsingState::Res => { let kind = self::error::ErrorKind::UnexpectedEvent( format!("<{}>", self.reader.decode(e.name())), - "text".to_string() + "text".to_string(), ); Err(Error::from(self::error::Error::from_kind(kind))) - }, + } } { self.triples.push_back(Err(e)); } @@ -1145,7 +1135,7 @@ where ParsingState::Res => { let kind = self::error::ErrorKind::UnexpectedEvent( format!("<{}/>", self.reader.decode(e.name())), - "end".to_string() + "end".to_string(), ); Err(Error::from(self::error::Error::from_kind(kind))) } @@ -1285,14 +1275,11 @@ where impl Iterator for XmlParser where B: BufRead, - F: TermFactory + Clone + Default + Debug, - ::TermData: Debug, + F: TermFactory + Clone + Default, { type Item = Result<[Term; 3]>; fn next(&mut self) -> Option { - loop { - // First make sure to consume the queue. if let Some(res) = self.handler.triples.pop_front() { return Some(res); From f6eb5fe4c8d20b70e9379e14e14a6394effba263 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Fri, 7 Jun 2019 12:12:32 -0700 Subject: [PATCH 47/50] Use conversion traits to reduce boilerplate in `::parser::xml` --- sophia/src/parser/xml.rs | 256 ++++++++++++++++----------------------- 1 file changed, 103 insertions(+), 153 deletions(-) diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index 6e0ea41a..331e9111 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -15,7 +15,7 @@ use quick_xml::events::BytesStart; use quick_xml::events::BytesText; use quick_xml::events::Event; use quick_xml::Reader; -use quick_xml::Result as XmlResult; +use quick_xml::Result as QuickXmlResult; use url::Url; use crate::error::*; @@ -140,8 +140,22 @@ pub mod error { Self::from_kind(ErrorKind::XmlError(e)) } } + + impl From for crate::error::Error { + fn from(e: quick_xml::Error) -> Self { + Error::from(e).into() + } + } + + impl From for crate::error::Error { + fn from(kind: ErrorKind) -> Self { + Self::from(Error::from_kind(kind)) + } + } } +use self::error::ErrorKind as XmlErrorKind; + /// RDF/XML parser configuration. /// /// For more information, @@ -232,7 +246,7 @@ pub struct XmlReader { impl XmlReader { /// Read an XML event. - pub fn read_event<'a>(&mut self, buf: &'a mut Vec) -> XmlResult> { + pub fn read_event<'a>(&mut self, buf: &'a mut Vec) -> QuickXmlResult> { use quick_xml::events::Event::*; // Clear the event peeking cache if it is not empty. @@ -374,9 +388,7 @@ impl Scope { /// Add a new XML prefix to the namespace mapping. fn add_prefix(&mut self, prefix: &str, value: &str) -> Result<()> { if prefix == "_" { - Err(Error::from(self::error::Error::from( - self::error::ErrorKind::InvalidPrefix(prefix.into()), - ))) + bail!(XmlErrorKind::InvalidPrefix(prefix.into())) } else { let mut f = self.factory.borrow_mut(); self.ns.insert( @@ -404,7 +416,7 @@ impl Scope { } } - Err(Error::from_kind(ErrorKind::InvalidIri(String::from(base)))) + bail!(ErrorKind::InvalidIri(String::from(base))) } /// Set the scope datatype. @@ -429,14 +441,12 @@ impl Scope { if let Some(ns) = self.ns.get(prefix) { ns.get(self.factory.borrow_mut().get_term_data(reference)) } else { - let kind = self::error::ErrorKind::UnknownNamespace(prefix.to_string()); - Err(Error::from(self::error::Error::from_kind(kind))) + bail!(XmlErrorKind::UnknownNamespace(prefix.to_string())) } } else if let Some(ns) = &self.default { ns.get(self.factory.borrow_mut().get_term_data(attr)) } else { - let kind = self::error::ErrorKind::UnknownNamespace("_".to_string()); - Err(Error::from(self::error::Error::from_kind(kind))) + bail!(XmlErrorKind::UnknownNamespace("_".to_string())) } } @@ -468,8 +478,7 @@ impl Scope { Err(_) => bail!(ErrorKind::InvalidIri(String::from(iri))), } } else { - let kind = self::error::ErrorKind::NoBaseIri(iri.to_string()); - Err(Error::from(self::error::Error::from_kind(kind))) + bail!(XmlErrorKind::NoBaseIri(iri.to_string())) } } else if is_absolute_iri(iri) { factory.iri(iri) @@ -484,10 +493,8 @@ impl Scope { /// identifiers in the document with a `#` if needed. fn expand_id(&self, id: &str) -> Result> { if !xmlname::is_valid_xmlname(id) { - return Err(Error::from(self::error::Error::from_kind( - self::error::ErrorKind::InvalidXmlName(id.to_string()), - ))); - } else if id.starts_with("#") { + bail!(XmlErrorKind::InvalidXmlName(id.to_string())) + } else if id.starts_with('#') { self.expand_iri(id) } else { self.expand_iri(&format!("#{}", id)) @@ -509,8 +516,7 @@ impl Scope { let mut f = self.factory.borrow_mut(); ns.get(f.get_term_data(&format!("_{}", self.li.fetch_add(1, Ordering::Relaxed)))) } else { - let kind = self::error::ErrorKind::UnknownNamespace("rdf".to_string()); - Err(Error::from(self::error::Error::from_kind(kind))) + bail!(XmlErrorKind::UnknownNamespace("rdf".to_string())) } } @@ -520,8 +526,7 @@ impl Scope { let mut f = self.factory.borrow_mut(); ns.get(f.get_term_data(&format!("_{}", self.li.load(Ordering::Relaxed) - 1))) } else { - let kind = self::error::ErrorKind::UnknownNamespace("rdf".to_string()); - Err(Error::from(self::error::Error::from_kind(kind))) + bail!(XmlErrorKind::UnknownNamespace("rdf".to_string())) } } } @@ -633,30 +638,21 @@ where // * Change scope language if there is any `xml:lang` attribute // * Fail if there is an invalid `rdf:li` attribute for attr in e.attributes().with_checks(true) { - let a = attr.map_err(self::error::Error::from)?; + let a = attr?; if a.key.starts_with(b"xmlns:") { scope.add_prefix( &self.reader.decode(&a.key[6..]), - &a.unescape_and_decode_value(&self.reader) - .map_err(self::error::Error::from)?, + &a.unescape_and_decode_value(&self.reader)?, )?; } else if a.key == b"xmlns" { - scope.set_default( - &a.unescape_and_decode_value(&self.reader) - .map_err(self::error::Error::from)?, - )?; + scope.set_default(&a.unescape_and_decode_value(&self.reader)?)?; } else if a.key == b"xml:base" { - scope.set_base( - &a.unescape_and_decode_value(&self.reader) - .map_err(self::error::Error::from)?, - )?; + scope.set_base(&a.unescape_and_decode_value(&self.reader)?)?; } else if a.key == b"xml:lang" { scope.lang = if a.value.is_empty() { None } else { - let v = &a - .unescape_and_decode_value(&self.reader) - .map_err(|e| self::error::Error::from(e))?; + let v = &a.unescape_and_decode_value(&self.reader)?; self.factory.borrow_mut().get_term_data(v).into() }; } @@ -687,17 +683,14 @@ where if xmlname::is_valid_xmlname(id) { self.factory.borrow_mut().bnode(&format!("o{}", id)) } else { - Err(Error::from(self::error::Error::from_kind( - self::error::ErrorKind::InvalidXmlName(id.to_string()), - ))) + bail!(XmlErrorKind::InvalidXmlName(id.to_string())) } } /// Check the given `ID` is unique. fn check_unique_id(&mut self, id: Term) -> Result> { if self.ids.contains(&id) { - let kind = self::error::ErrorKind::DuplicateId(id.value()); - Err(Error::from(self::error::Error::from_kind(kind))) + bail!(XmlErrorKind::DuplicateId(id.value())) } else { self.ids.insert(id.clone()); Ok(id) @@ -734,7 +727,7 @@ where parents: Vec::new(), scopes: vec![Scope::with_factory_rc(factory.clone())], triples: LinkedList::new(), - factory: factory, + factory, bnodes: AtomicUsize::new(0), state: vec![ParsingState::Node], ids: HashSet::new(), @@ -763,13 +756,10 @@ where ParsingState::Collection => self.collection_start(e), ParsingState::CollectionItem => self.collection_item_start(e), ParsingState::Literal => unimplemented!("entering element as literal"), - ParsingState::Res => { - let kind = self::error::ErrorKind::UnexpectedEvent( - format!("<{}>", self.reader.decode(e.name())), - "text".to_string(), - ); - Err(Error::from(self::error::Error::from_kind(kind))) - } + ParsingState::Res => Err(Error::from(XmlErrorKind::UnexpectedEvent( + format!("<{}>", self.reader.decode(e.name())), + "text".to_string(), + ))), } { self.triples.push_back(Err(e)); } @@ -781,24 +771,23 @@ where .scope() .expand_attribute(&self.reader.decode(e.name()))?; - // Bail out if in a top-level rdf:RDF element + // Return early if in a top-level rdf:RDF element if rdf::RDF.matches(&ty) && self.parents.is_empty() { self.state.push(ParsingState::Node); self.parents.push(self.factory.borrow_mut().copy(&rdf::RDF)); return Ok(()); } - // + // Bail out if the node has an invalid name. if RESERVED_NODE_NAMES.matches(&ty) { - let kind = self::error::ErrorKind::InvalidNodeName(ty.value()); - return Err(Error::from(self::error::Error::from_kind(kind))); + bail!(XmlErrorKind::InvalidNodeName(ty.value())) } // Separate node subject from other attributes let mut properties = HashMap::new(); let mut subject = Vec::new(); for attr in e.attributes().with_checks(true) { - let a = attr.map_err(self::error::Error::from)?; + let a = attr?; // ignore xml attributes (processed in element_start) if a.key.starts_with(b"xml") { @@ -807,9 +796,7 @@ where // try to extract the subject annotation let k = self.scope().expand_attribute(&self.reader.decode(a.key))?; - let v = a - .unescape_and_decode_value(&self.reader) - .map_err(self::error::Error::from)?; + let v = a.unescape_and_decode_value(&self.reader)?; if k.matches(&rdf::about) { subject.push(self.scope().expand_iri(&v)?); @@ -821,29 +808,21 @@ where } else if k.matches(&rdf::type_) { properties.insert(k, self.scope().expand_iri(&v)?); } else if RESERVED_ATTRIBUTES_NAMES.matches(&k) { - let kind = self::error::ErrorKind::InvalidAttribute(k.value()); - return Err(Error::from(self::error::Error::from_kind(kind))); + bail!(self::error::ErrorKind::InvalidAttribute(k.value())); } else { properties.insert(k, self.scope().new_literal(v)?); } } // Get subject and add it to the current nested stack - if subject.len() > 1 { - return Err( - self::error::Error::from_kind(self::error::ErrorKind::AmbiguousSubject).into(), - ); - } + ensure!(subject.len() < 2, XmlErrorKind::AmbiguousSubject); let s: Term<_> = subject.pop().unwrap_or_else(|| self.new_bnode()); self.parents.push(s.clone()); // Add the type as a triple if it is not `rdf:Description` if !ty.matches(&rdf::Description) { - self.triples.push_back(Ok([ - s.clone(), - self.factory.borrow_mut().copy(&rdf::type_), - ty, - ])); + let type_ = self.factory.borrow_mut().copy(&rdf::type_); + self.triples.push_back(Ok([s.clone(), type_, ty])); } // Add triples described by properties in XML attributes @@ -859,14 +838,13 @@ where fn predicate_start(&mut self, e: &BytesStart) -> Result<()> { // Get the predicate and add it to the current nested stack // or build a new `rdf:_n` IRI if the predicate is `rdf:li`. - let p = self.predicate_iri_start(&self.reader.decode(e.name()))?; + let pred = self.predicate_iri_start(&self.reader.decode(e.name()))?; // Fail if the property is among forbidden names. - if RESERVED_PROPERTY_NAMES.matches(&p) { - let kind = self::error::ErrorKind::InvalidNodeName(p.value()); - return Err(Error::from(self::error::Error::from_kind(kind))); + if RESERVED_PROPERTY_NAMES.matches(&pred) { + bail!(XmlErrorKind::InvalidNodeName(pred.value())) } else { - self.parents.push(p); + self.parents.push(pred); } // Extract attributes relevant to the RDF syntax @@ -874,7 +852,7 @@ where let mut next_state = ParsingState::Node; let mut object = Vec::with_capacity(1); for attr in e.attributes().with_checks(true) { - let a = attr.map_err(self::error::Error::from)?; + let a = attr?; // Ignore `xml` attributes if a.key.starts_with(b"xml") { @@ -883,21 +861,15 @@ where let k = self.scope().expand_attribute(&self.reader.decode(a.key))?; if k.matches(&rdf::datatype) { - let v = a - .unescape_and_decode_value(&self.reader) - .map_err(self::error::Error::from)?; + let v = a.unescape_and_decode_value(&self.reader)?; self.scope_mut().set_datatype(&v)?; } else if k.matches(&rdf::ID) { - let v = a - .unescape_and_decode_value(&self.reader) - .map_err(self::error::Error::from)?; + let v = a.unescape_and_decode_value(&self.reader)?; let id = self.scope().expand_id(&v)?; object.push(self.check_unique_id(id)?); next_state = ParsingState::Res; } else if k.matches(&rdf::resource) { - let v = a - .unescape_and_decode_value(&self.reader) - .map_err(self::error::Error::from)?; + let v = a.unescape_and_decode_value(&self.reader)?; object.push(self.scope().expand_iri(&v)?); next_state = ParsingState::Predicate; } else if k.matches(&rdf::parseType) { @@ -918,17 +890,13 @@ where } other => { let ty = String::from_utf8_lossy(other).to_string(); - let kind = self::error::ErrorKind::InvalidParseType(ty); - return Err(Error::from(self::error::Error::from_kind(kind))); + bail!(XmlErrorKind::InvalidParseType(ty)); } } } else if RESERVED_ATTRIBUTES_NAMES.matches(&k) { - let kind = self::error::ErrorKind::InvalidAttribute(k.value()); - return Err(Error::from(self::error::Error::from_kind(kind))); + bail!(XmlErrorKind::InvalidAttribute(k.value())); } else { - let v = a - .unescape_and_decode_value(&self.reader) - .map_err(self::error::Error::from)?; + let v = a.unescape_and_decode_value(&self.reader)?; attributes.insert(k, self.scope().new_literal(v)?); next_state = ParsingState::Resource; } @@ -939,11 +907,7 @@ where 0 if !attributes.is_empty() => Some(self.new_bnode()), 0 if attributes.is_empty() => None, 1 => Some(object.last().unwrap().clone()), - _ => { - return Err( - self::error::Error::from_kind(self::error::ErrorKind::AmbiguousSubject).into(), - ) - } + _ => bail!(XmlErrorKind::AmbiguousSubject), }; // Make the predicate a resource element if an objec tis present. @@ -1081,26 +1045,15 @@ where fn res_end(&mut self) -> Result<()> { // Subject, predicate, object and ID of the reified triple let id = self.parents.pop().unwrap(); - let p = self.parents.pop().unwrap(); - let s = self.parents.last().unwrap().clone(); + let pred = self.parents.pop().unwrap(); + let sbj = self.parents.last().unwrap().clone(); let txt = self.scope_mut().text.take().unwrap_or_default(); - let o = self.scope().new_literal(txt)?; - - // Types for the reification - let mut factory = self.factory.borrow_mut(); - let ty = factory.copy(&rdf::type_); - let subject = factory.copy(&rdf::subject); - let predicate = factory.copy(&rdf::predicate); - let object = factory.copy(&rdf::object); - let stmt = factory.copy(&rdf::Statement); + let obj = self.scope().new_literal(txt)?; // Add all triples self.triples - .push_back(Ok([s.clone(), p.clone(), o.clone()])); - self.triples.push_back(Ok([id.clone(), ty, stmt])); - self.triples.push_back(Ok([id.clone(), subject, s])); - self.triples.push_back(Ok([id.clone(), predicate, p])); - self.triples.push_back(Ok([id.clone(), object, o])); + .push_back(Ok([sbj.clone(), pred.clone(), obj.clone()])); + self.reify(id, sbj, pred, obj); Ok(()) } @@ -1111,9 +1064,7 @@ where if self.scope().text.is_some() { match e.unescape_and_decode(&self.reader) { Ok(text) => self.scope_mut().set_text(text), - Err(e) => self - .triples - .push_back(Err(self::error::Error::from(e).into())), + Err(e) => self.triples.push_back(Err(Error::from(e))), } } } @@ -1137,7 +1088,7 @@ where format!("<{}/>", self.reader.decode(e.name())), "end".to_string(), ); - Err(Error::from(self::error::Error::from_kind(kind))) + Err(Error::from(kind)) } } { self.triples.push_back(Err(e)); @@ -1153,12 +1104,11 @@ where } fn predicate_empty(&mut self, e: &BytesStart) -> Result<()> { - let p = self.predicate_iri_start(&self.reader.decode(e.name()))?; + let pred = self.predicate_iri_start(&self.reader.decode(e.name()))?; // Fail if the property is among forbidden names. - if RESERVED_PROPERTY_NAMES.matches(&p) { - let kind = self::error::ErrorKind::InvalidNodeName(p.value()); - return Err(Error::from(self::error::Error::from_kind(kind))); + if RESERVED_PROPERTY_NAMES.matches(&pred) { + bail!(XmlErrorKind::InvalidNodeName(pred.value())); } let mut object = Vec::with_capacity(1); @@ -1168,7 +1118,7 @@ where // Extract attributes for attr in e.attributes().with_checks(true) { - let a = attr.map_err(self::error::Error::from)?; + let a = attr?; // ignore XML attributes (processed when entering scope) if a.key.starts_with(b"xml") { @@ -1177,9 +1127,7 @@ where // try to extract the annotation object let k = self.scope().expand_attribute(&self.reader.decode(a.key))?; - let v = a - .unescape_and_decode_value(&self.reader) - .map_err(self::error::Error::from)?; + let v = a.unescape_and_decode_value(&self.reader)?; if k.matches(&rdf::resource) { object.push(self.scope().expand_iri(&v)?); } else if k.matches(&rdf::nodeID) { @@ -1193,13 +1141,11 @@ where b"Literal" => parse_type = Some(&b"Literal"[..]), other => { let ty = String::from_utf8_lossy(other).to_string(); - let kind = self::error::ErrorKind::InvalidParseType(ty); - return Err(Error::from(self::error::Error::from_kind(kind))); + bail!(XmlErrorKind::InvalidParseType(ty)); } }; } else if RESERVED_ATTRIBUTES_NAMES.matches(&k) { - let kind = self::error::ErrorKind::InvalidAttribute(k.value()); - return Err(Error::from(self::error::Error::from_kind(kind))); + bail!(XmlErrorKind::InvalidAttribute(k.value())); } else { attributes.insert(k, v); } @@ -1215,47 +1161,30 @@ where let mut scope = self.scope_mut(); scope.datatype = Some(xmlliteral); } else { - let kind = self::error::ErrorKind::InvalidParseType("Literal".to_string()); - return Err(Error::from(self::error::Error::from_kind(kind))); + bail!(XmlErrorKind::InvalidParseType("Literal".to_string())); } } - // Extract subjet and object of the triple - let s = self.parents.last().unwrap().clone(); - let o = match object.len() { + // Extract subject and object of the triple + let sbj = self.parents.last().unwrap().clone(); + let obj = match object.len() { 0 if !attributes.is_empty() => self.new_bnode(), 1 => object.last().unwrap().clone(), 0 if attributes.is_empty() => self.scope().new_literal(String::new())?, - _ => { - return Err( - self::error::Error::from_kind(self::error::ErrorKind::AmbiguousSubject).into(), - ) - } + _ => bail!(XmlErrorKind::AmbiguousSubject), }; // Add the triple and all subsequent triples as attributes self.triples - .push_back(Ok([s.clone(), p.clone(), o.clone()])); + .push_back(Ok([sbj.clone(), pred.clone(), obj.clone()])); for (prop, value) in attributes.into_iter() { let literal = self.scope().new_literal(value)?; - self.triples.push_back(Ok([o.clone(), prop, literal])); + self.triples.push_back(Ok([obj.clone(), prop, literal])); } // Reify the triple if needed. if let Some(id) = reification { - // Types for the reification - let mut factory = self.factory.borrow_mut(); - let ty = factory.copy(&rdf::type_); - let subject = factory.copy(&rdf::subject); - let predicate = factory.copy(&rdf::predicate); - let obj = factory.copy(&rdf::object); - let stmt = factory.copy(&rdf::Statement); - - // Add all triples - self.triples.push_back(Ok([id.clone(), ty, stmt])); - self.triples.push_back(Ok([id.clone(), subject, s])); - self.triples.push_back(Ok([id.clone(), predicate, p])); - self.triples.push_back(Ok([id.clone(), obj, o])); + self.reify(id, sbj, pred, obj); } Ok(()) @@ -1270,6 +1199,28 @@ where self.state.pop(); self.collection_item_end() } + + // --- + + fn reify( + &mut self, + id: Term, + sbj: Term, + pred: Term, + obj: Term, + ) { + let mut factory = self.factory.borrow_mut(); + let ty = factory.copy(&rdf::type_); + let subject = factory.copy(&rdf::subject); + let predicate = factory.copy(&rdf::predicate); + let object = factory.copy(&rdf::object); + let stmt = factory.copy(&rdf::Statement); + + self.triples.push_back(Ok([id.clone(), ty, stmt])); + self.triples.push_back(Ok([id.clone(), subject, sbj])); + self.triples.push_back(Ok([id.clone(), predicate, pred])); + self.triples.push_back(Ok([id.clone(), object, obj])); + } } impl Iterator for XmlParser @@ -1298,8 +1249,7 @@ where Ok(_) => (), Err(e) => { let kind = self::error::ErrorKind::XmlError(e); - let err = self::error::Error::from_kind(kind); - self.handler.triples.push_back(Err(Error::from(err))); + self.handler.triples.push_back(Err(Error::from(kind))); } } } From 30c60d903d1ff4f0a811d69d7026a1e94ebcfe76 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Tue, 11 Jun 2019 12:17:21 -0700 Subject: [PATCH 48/50] Remove `dc` from `::ns` module and fix `xml` and `rdf` namespaces --- sophia/src/ns.rs | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/sophia/src/ns.rs b/sophia/src/ns.rs index 85579e41..45441d2c 100644 --- a/sophia/src/ns.rs +++ b/sophia/src/ns.rs @@ -86,7 +86,6 @@ pub mod rdf { // classes Alt, Bad, - Description, List, PlainLiteral, Property, @@ -108,6 +107,7 @@ pub mod rdf { // core syntax terms RDF, ID, + Description, about, parseType, resource, @@ -208,36 +208,11 @@ pub mod xml { space, base, id, - // John Bosak + // Jon Bosak Father ); } -/// The standard `dc:` namespace. -pub mod dc { - /// The Dublin Core elements (`http://purl.org/dc/elements/1.1/`). - pub mod elements { - namespace!( - "http://purl.org/dc/elements/1.1/", - contributor, - coverage, - creator, - date, - description, - format, - identifier, - language, - publisher, - relation, - rights, - source, - subject, - title - ); - ns_term!("http://purl.org/dc/elements/1.1/", type_, "type"); - } -} - #[cfg(test)] mod test { // Nothing really worth testing here From debd9e8345ed0ff8a5fdfc5f59f1c7d9f5bf3bfa Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Tue, 11 Jun 2019 12:31:23 -0700 Subject: [PATCH 49/50] Remove `xmlname.pest` and use regex instead --- sophia/src/parser/xml.rs | 18 ++++++++++-------- sophia/src/parser/xmlname.pest | 29 ----------------------------- 2 files changed, 10 insertions(+), 37 deletions(-) delete mode 100644 sophia/src/parser/xmlname.pest diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index 331e9111..8c682820 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -63,17 +63,19 @@ static RESERVED_ATTRIBUTES_NAMES: &'static [StaticTerm] = mod xmlname { - use pest::Parser; + use regex::Regex; - #[cfg(debug_assertions)] - const _GRAMMAR: &str = include_str!("xmlname.pest"); - - #[derive(Parser)] - #[grammar = "parser/xmlname.pest"] - struct PestXmlNameParser; + lazy_static! { + static ref XMLNAME_REGEX: Regex = Regex::new(r"(?x)^ + # NameStartChar + [_A-Za-z\u{C0}-\u{D6}\u{D8}-\u{F6}\u{F8}-\u{2FF}\u{370}-\u{37D}\u{37F}-\u{1FFF}\u{200C}-\u{200D}\u{2070}-\u{218F}\u{2C00}-\u{2FEF}\u{3001}-\u{D7FF}\u{F900}-\u{FDCF}\u{FDF0}-\u{FFFD}\u{10000}-\u{EFFFF}] + # NameChar + [-.0-9\u{B7}_A-Za-z\u{C0}-\u{D6}\u{D8}-\u{F6}\u{F8}-\u{37D}\u{37F}-\u{1FFF}\u{200C}-\u{200D}\u{203F}-\u{2040}\u{2070}-\u{218F}\u{2C00}-\u{2FEF}\u{3001}-\u{D7FF}\u{F900}-\u{FDCF}\u{FDF0}-\u{FFFD}\u{10000}-\u{EFFFF}]* + $").unwrap(); + } pub fn is_valid_xmlname(n: &str) -> bool { - PestXmlNameParser::parse(Rule::Name, n).is_ok() + XMLNAME_REGEX.is_match(n) } } diff --git a/sophia/src/parser/xmlname.pest b/sophia/src/parser/xmlname.pest deleted file mode 100644 index 313e0720..00000000 --- a/sophia/src/parser/xmlname.pest +++ /dev/null @@ -1,29 +0,0 @@ -NameStartChar = { - 'A'..'Z' - | "_" - | 'a'..'z' - | '\u{C0}'..'\u{D6}' - | '\u{D8}'..'\u{F6}' - | '\u{F8}'..'\u{2FF}' - | '\u{370}'..'\u{37D}' - | '\u{37F}'..'\u{1FFF}' - | '\u{200C}'..'\u{200D}' - | '\u{2070}'..'\u{218F}' - | '\u{2C00}'..'\u{2FEF}' - | '\u{3001}'..'\u{D7FF}' - | '\u{F900}'..'\u{FDCF}' - | '\u{FDF0}'..'\u{FFFD}' - | '\u{10000}'..'\u{EFFFF}' -} - -NameChar = { - NameStartChar - | "-" - | "." - | '0'..'9' - | "\u{B7}" - | '\u{0300}'..'\u{036F}' - | '\u{203F}'..'\u{2040}' -} - -Name = { NameStartChar ~ NameChar* ~ EOI } From abc5888b99a812a9c4a9e75d1c2802382710ea53 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Wed, 12 Jun 2019 19:10:01 -0700 Subject: [PATCH 50/50] Rework error management to chain XML parser errors into `ParserError` --- sophia/src/error.rs | 3 --- sophia/src/parser/xml.rs | 35 +++++++++++++++++++++++++++++++++-- 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/sophia/src/error.rs b/sophia/src/error.rs index f71d1850..f85ece3b 100644 --- a/sophia/src/error.rs +++ b/sophia/src/error.rs @@ -3,9 +3,6 @@ use pest::error::{InputLocation, LineColLocation}; error_chain! { - links { - XmlError(crate::parser::xml::error::Error, crate::parser::xml::error::ErrorKind); - } errors { /// Raised by the methods of the [`Graph`](../graph/trait.Graph.html) trait. GraphError(message: String) { diff --git a/sophia/src/parser/xml.rs b/sophia/src/parser/xml.rs index 8c682820..34e0638b 100644 --- a/sophia/src/parser/xml.rs +++ b/sophia/src/parser/xml.rs @@ -149,10 +149,33 @@ pub mod error { } } + impl From for crate::error::Error { + fn from(error: Error) -> Self { + let message = error.0.to_string(); + crate::error::Error::with_chain( + error, + crate::error::ErrorKind::ParserError( + message, + pest::error::InputLocation::Pos(0), + pest::error::LineColLocation::Pos((0, 0)), + ) + ) + } + } + impl From for crate::error::Error { fn from(kind: ErrorKind) -> Self { - Self::from(Error::from_kind(kind)) + let error = Error::from_kind(kind); + Self::from(error) + } + } + + /// Patch the `location` field of the error kind if the error is a `ParserError`. + pub fn with_position(mut e: crate::error::Error, n: usize) -> crate::error::Error { + if let crate::error::ErrorKind::ParserError(_, ref mut location, _) = e.0 { + *location = pest::error::InputLocation::Pos(n); } + e } } @@ -748,6 +771,7 @@ where fn element_start(&mut self, e: &BytesStart) { if let Err(e) = self.enter_scope(e) { + let e = error::with_position(e, self.reader.buffer_position()); self.triples.push_back(Err(e)); } @@ -763,6 +787,7 @@ where "text".to_string(), ))), } { + let e = error::with_position(e, self.reader.buffer_position()); self.triples.push_back(Err(e)); } } @@ -951,6 +976,7 @@ where ParsingState::Collection => self.collection_end(e), ParsingState::Res => self.res_end(), } { + let e = error::with_position(e, self.reader.buffer_position()); self.triples.push_back(Err(e)); } self.leave_scope(); @@ -1066,7 +1092,10 @@ where if self.scope().text.is_some() { match e.unescape_and_decode(&self.reader) { Ok(text) => self.scope_mut().set_text(text), - Err(e) => self.triples.push_back(Err(Error::from(e))), + Err(e) => { + let e = error::with_position(Error::from(e), self.reader.buffer_position()); + self.triples.push_back(Err(e)) + }, } } } @@ -1075,6 +1104,7 @@ where fn element_empty(&mut self, e: &BytesStart) { if let Err(e) = self.enter_scope(e) { + let e = error::with_position(e, self.reader.buffer_position()); self.triples.push_back(Err(e)); } @@ -1093,6 +1123,7 @@ where Err(Error::from(kind)) } } { + let e = error::with_position(e, self.reader.buffer_position()); self.triples.push_back(Err(e)); }