From a7343f58edea694710b8c30836117b0a4679fbc3 Mon Sep 17 00:00:00 2001 From: arnaud <arnaud.cassaigne@mondeca.com> Date: Wed, 20 Oct 2021 16:34:52 +0200 Subject: [PATCH] N3 parsing of local names with special characters --- src/n3parser.js | 84 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 67 insertions(+), 17 deletions(-) diff --git a/src/n3parser.js b/src/n3parser.js index 779d7c1c9..97bddff3f 100644 --- a/src/n3parser.js +++ b/src/n3parser.js @@ -168,7 +168,7 @@ $Id: n3parser.js 14561 2008-02-23 06:37:26Z kennyluck $ HAND EDITED FOR CONVERSION TO JAVASCRIPT -This module implements a Nptation3 parser, and the final +This module implements a Notation3 parser, and the final part of a notation3 serializer. See also: @@ -201,9 +201,12 @@ var DATE_DATATYPE = "http://www.w3.org/2001/XMLSchema#date"; var DATETIME_DATATYPE = "http://www.w3.org/2001/XMLSchema#dateTime"; var BOOLEAN_DATATYPE = "http://www.w3.org/2001/XMLSchema#boolean"; var option_noregen = 0; -var _notQNameChars = "\t\r\n !\"#$%&'()*.,+/;<=>?@[\\]^`{|}~"; -var _notNameChars = ( _notQNameChars + ":" ) ; +var _notQNameChars = "\t\r\n !\"#$&'()*,+/;<=>?@[\\]^`{|}~"; // else valid qname :-/ +var _notKeywordsChars = ( _notQNameChars + "." ) ; +var _notNameChars = ( _notQNameChars + ":" ) ; // Assume anything else valid name :-/ var _rdfns = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"; +var hexChars = "ABCDEFabcdef0123456789"; +var escapeChars = "(_~.-!$&'()*+,;=/?#@%)"; // valid for \ escapes in localnames var N3CommentCharacter = "#"; var eol = new RegExp("^[ \\t]*(#[^\\n]*)?\\r?\\n", 'g'); var eof = new RegExp("^[ \\t]*(#[^\\n]*)?$", 'g'); @@ -360,7 +363,7 @@ __SinkParser.prototype.tok = function(tok, str, i) { } } var k = ( i + pyjslib_len(tok) ) ; - if ((str.slice( i, k) == tok) && (_notQNameChars.indexOf(str.charAt(k)) >= 0)) { + if ((str.slice( i, k) == tok) && (_notKeywordsChars.indexOf(str.charAt(k)) >= 0)) { return k; } else { @@ -626,7 +629,7 @@ __SinkParser.prototype.path = function(str, i, res) { var ch = str.slice( j, ( j + 1 ) ); if ((ch == ".")) { var ahead = str.slice( ( j + 1 ) , ( j + 2 ) ); - if (!(ahead) || (_notNameChars.indexOf(ahead) >= 0) && (":?<[{(".indexOf(ahead) < 0)) { + if (!(ahead) || (_notKeywordsChars.indexOf(ahead) >= 0) && (":?<[{(".indexOf(ahead) < 0) || (ahead == "%") ) { break; } } @@ -1168,7 +1171,7 @@ __SinkParser.prototype.variable = function(str, i, res) { throw BadSyntax(this._thisDoc, this.lines, str, j, ( ( "Varible name can't start with '" + str.charAt(j) ) + "s'" ) ); return -1; } - while ((i < pyjslib_len(str)) && (_notNameChars.indexOf(str.charAt(i)) < 0)) { + while ((i < pyjslib_len(str)) && (_notKeywordsChars.indexOf(str.charAt(i)) < 0)) { var i = ( i + 1 ) ; } if ((this._parentContext == null)) { @@ -1190,11 +1193,11 @@ __SinkParser.prototype.bareWord = function(str, i, res) { if (("0123456789-".indexOf(ch) >= 0)) { return -1; } - if ((_notNameChars.indexOf(ch) >= 0)) { + if ((_notKeywordsChars.indexOf(ch) >= 0)) { return -1; } var i = j; - while ((i < pyjslib_len(str)) && (_notNameChars.indexOf(str.charAt(i)) < 0)) { + while ((i < pyjslib_len(str)) && (_notKeywordsChars.indexOf(str.charAt(i)) < 0)) { var i = ( i + 1 ) ; } res.push(str.slice( j, i)); @@ -1202,7 +1205,6 @@ __SinkParser.prototype.bareWord = function(str, i, res) { }; __SinkParser.prototype.qname = function(str, i, res) { /* - xyz:def -> ('xyz', 'def') If not in keywords and keywordsSet: def -> ('', 'def') :def -> ('', 'def') @@ -1218,10 +1220,10 @@ __SinkParser.prototype.qname = function(str, i, res) { } if ((_notNameChars.indexOf(c) < 0)) { var ln = c; - var i = ( i + 1 ) ; - while ((i < pyjslib_len(str))) { + var i = ( i + 1 ) ; + while (i < pyjslib_len(str)) { var c = str.charAt(i); - if ((_notNameChars.indexOf(c) < 0)) { + if ((_notNameChars.indexOf(c) < 0)) { var ln = ( ln + c ) ; var i = ( i + 1 ) ; } @@ -1229,24 +1231,72 @@ __SinkParser.prototype.qname = function(str, i, res) { break; } } + + if(str.charAt(i - 1) == ".") { // qname cannot end with "." + var i = ( i - 1 ) ; + if (ln.length == 0) { + return -1; + } + ln = ln.slice(0, -1); + } } else { var ln = ""; } if ((i < pyjslib_len(str)) && (str.charAt(i) == ":")) { var pfx = ln; - var i = ( i + 1 ) ; + // bnodes names have different rules + if (pfx == "_") { + var allowedChars = _notNameChars + } + else { + var allowedChars = _notQNameChars + } + var i = ( i + 1 ) ; + var lastslash = false; var ln = ""; - while ((i < pyjslib_len(str))) { + while (i < pyjslib_len(str)) { var c = str.charAt(i); - if ((_notNameChars.indexOf(c) < 0)) { - var ln = ( ln + c ) ; - var i = ( i + 1 ) ; + if (c == "\\" && !(lastslash)) { + var lastslash = true; + } + else if ((allowedChars.indexOf(c) < 0) || lastslash) { + if (lastslash) { + if (escapeChars.indexOf(c) < 0) { + throw BadSyntax(this._thisDoc, this.lines, str, i, "illegal escape " + c); + } + } + else if (c == "%") { + if (i == pyjslib_len(str) - 2) { + throw BadSyntax(this._thisDoc, this.lines, str, i, "illegal hex escape % (EOF)"); + } + var ec1 = str.charAt(i + 1); + var ec2 = str.charAt(i + 2); + if (hexChars.indexOf(ec1) < 0 || hexChars.indexOf(ec2) < 0 ) { + throw BadSyntax(this._thisDoc, this.lines, str, i, "illegal hex escape %" + ec1 + ec2); + } + } + var lastslash = false; + var ln = ( ln + c ) ; } else { break; } + var i = ( i + 1 ) ; + } + + if (lastslash) { + throw BadSyntax(this._thisDoc, this.lines, str, i, "qname cannot end with \\"); + } + + if(str.charAt(i - 1) == ".") { // localname cannot end in . + if (ln.length == 0) { + return -1; + } + var i = ( i - 1 ) ; + ln = ln.slice(0, -1); } + res.push(new pyjslib_Tuple([pfx, ln])); return i; }