Skip to content

Commit

Permalink
[htm8] Quote attributes in XML conversion
Browse files Browse the repository at this point in the history
We still have to do

    &  ->  &amp;
    <  ->  &lt;

in attributes.
  • Loading branch information
Andy C committed Jan 18, 2025
1 parent 3e5f085 commit 194a6ea
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 40 deletions.
38 changes: 23 additions & 15 deletions data_lang/htm8.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,20 @@
TODO
Migrate:
- maybe: migrate everything off of TagLexer()
- and AttrValueLexer() - this should require Validate()
- would be nice: migrate everything off of TagLexer()
- oils_doc.py and help_gen.py
- this old API is stateful and uses Python iterators, which is problematic
- maybe we can use a better CSS selector abstraction
API:
- Get rid of Reset()?
- Deprecate tag_lexer.GetTagName() in favor of lx.CanonicalTagName() or
_LiteralTagName()
- Validate() can be improved
Features:
- work on ToXml() test cases? This is another test of AttrLexer
C++:
- UTF-8 check, like JSON8
- re2c
- port lexer, which will fix static typing issues
Expand Down Expand Up @@ -475,6 +480,7 @@ def __init__(self, s):

self.name_start = -1
self.name_end = -1
self.equal_end = -1
self.next_value_is_missing = False

self.init_t = -1
Expand Down Expand Up @@ -523,7 +529,7 @@ def Reset(self):
self.pos = self.init_t

def ReadName(self):
# type: () -> Tuple[attr_name_t, int, int]
# type: () -> Tuple[attr_name_t, int, int, int]
"""Reads the attribute name
EOF case:
Expand All @@ -541,29 +547,30 @@ def ReadName(self):
#log('ReadName() tag_name_pos %d pos, %d %s', self.tag_name_pos, self.pos, m.groups())
if a == attr_name.Invalid:
#log('m.groups %s', m.groups())
return attr_name.Invalid, -1, -1
return attr_name.Invalid, -1, -1, -1

self.pos = m.end(0) # Advance if it's not invalid

if a == attr_name.Ok:
#log('%r', m.groups())
self.name_start = m.start(1)
self.name_end = m.end(1)
self.equal_end = m.end(0) # XML conversion needs this
# Is the equals sign missing? Set state.
if m.group(2) is None:
self.next_value_is_missing = True
# HACK: REWIND, since we don't want to consume whitespace
self.pos = self.name_end
else:
self.next_value_is_missing = False
return attr_name.Ok, self.name_start, self.name_end
return attr_name.Ok, self.name_start, self.name_end, self.equal_end
else:
# Reset state - e.g. you must call AttrNameEquals
self.name_start = -1
self.name_end = -1

if a == attr_name.Done:
return attr_name.Done, -1, -1
return attr_name.Done, -1, -1, -1
else:
context = self.s[self.pos:]
#log('s %r %d', self.s, self.pos)
Expand Down Expand Up @@ -692,7 +699,7 @@ def ReadValue(self, tokens_out=None):
def GetAttrRaw(attr_lx, name):
# type: (AttrLexer, str) -> Optional[str]
while True:
n, name_start, name_end = attr_lx.ReadName()
n, name_start, name_end, _ = attr_lx.ReadName()
#log('==> ReadName %s %d %d', attr_name_str(n), name_start, name_end)
if n == attr_name.Ok:
if attr_lx.AttrNameEquals(name):
Expand All @@ -714,10 +721,10 @@ def GetAttrRaw(attr_lx, name):


def AllAttrsRawSlice(attr_lx):
# type: (AttrLexer) -> List[Tuple[int, int, attr_value_t, int, int]]
# type: (AttrLexer) -> List[Tuple[int, int, int, attr_value_t, int, int]]
result = []
while True:
n, name_start, name_end = attr_lx.ReadName()
n, name_start, name_end, equal_end = attr_lx.ReadName()
if 0:
log(' AllAttrsRaw ==> ReadName %s %d %d %r', attr_name_str(n),
name_start, name_end, attr_lx.s[attr_lx.pos:attr_lx.pos + 10])
Expand All @@ -728,7 +735,8 @@ def AllAttrsRawSlice(attr_lx):
v, val_start, val_end = attr_lx.ReadValue()
#val = attr_lx.s[val_start:val_end]
#log(' ReadValue %r', val)
result.append((name_start, name_end, v, val_start, val_end))
result.append(
(name_start, name_end, equal_end, v, val_start, val_end))
elif n == attr_name.Done:
break
elif n == attr_name.Invalid:
Expand All @@ -751,7 +759,7 @@ def AllAttrsRaw(attr_lx):
slices = AllAttrsRawSlice(attr_lx)
pairs = []
s = attr_lx.s
for name_start, name_end, val_id, val_start, val_end in slices:
for name_start, name_end, equal_end, val_id, val_start, val_end in slices:
n = s[name_start:name_end]
v = s[val_start:val_end]
pairs.append((n, v))
Expand Down
28 changes: 14 additions & 14 deletions data_lang/htm8_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def testNoAttrs(self):
attr_lx.Init(tok_id, lx.TagNamePos(), end_pos)

# There is no tag
n, name_start, name_end = attr_lx.ReadName()
n, name_start, name_end, _ = attr_lx.ReadName()
self.assertEqual(n, attr_name.Done)
self.assertEqual(-1, name_start)
self.assertEqual(-1, name_end)
Expand All @@ -85,7 +85,7 @@ def testInvalid(self):
h = '<a !>'
attr_lx = _MakeAttrLexer(self, h)

n, name_start, name_end = attr_lx.ReadName()
n, name_start, name_end, _ = attr_lx.ReadName()
self.assertEqual(n, attr_name.Invalid)
self.assertEqual(-1, name_start)
self.assertEqual(-1, name_end)
Expand All @@ -101,7 +101,7 @@ def testEmpty(self):
h = '<img src=>'
attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

n, name_start, name_end = attr_lx.ReadName()
n, name_start, name_end, _ = attr_lx.ReadName()
self.assertEqual(n, attr_name.Ok)
self.assertEqual(5, name_start)
self.assertEqual(8, name_end)
Expand All @@ -116,14 +116,14 @@ def testEmpty(self):
self.assertEqual(-1, attr_start)
self.assertEqual(-1, attr_end)

n, name_start, name_end = attr_lx.ReadName()
n, name_start, name_end, _ = attr_lx.ReadName()
self.assertEqual(n, attr_name.Done)

def testMissing(self):
h = '<img SRC/>'
attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartEndTag)

n, name_start, name_end = attr_lx.ReadName()
n, name_start, name_end, _ = attr_lx.ReadName()
self.assertEqual(n, attr_name.Ok)
self.assertEqual(5, name_start)
self.assertEqual(8, name_end)
Expand All @@ -137,15 +137,15 @@ def testMissing(self):
self.assertEqual(-1, attr_start)
self.assertEqual(-1, attr_end)

n, name_start, name_end = attr_lx.ReadName()
n, name_start, name_end, _ = attr_lx.ReadName()
self.assertEqual(n, attr_name.Done)

def testUnquoted(self):
# CAREFUL: /> is a StartEndTag, and / is not part of unquoted value
h = '<a x=foo />'
attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartEndTag)

n, name_start, name_end = attr_lx.ReadName()
n, name_start, name_end, _ = attr_lx.ReadName()
self.assertEqual(n, attr_name.Ok)
self.assertEqual(3, name_start)
self.assertEqual(4, name_end)
Expand All @@ -159,14 +159,14 @@ def testUnquoted(self):
self.assertEqual(5, attr_start)
self.assertEqual(8, attr_end)

n, name_start, name_end = attr_lx.ReadName()
n, name_start, name_end, _ = attr_lx.ReadName()
self.assertEqual(n, attr_name.Done)

def testDoubleQuoted(self):
h = '<a x="f&">'
attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

n, name_start, name_end = attr_lx.ReadName()
n, name_start, name_end, _ = attr_lx.ReadName()
self.assertEqual(n, attr_name.Ok)
self.assertEqual(3, name_start)
self.assertEqual(4, name_end)
Expand All @@ -181,15 +181,15 @@ def testDoubleQuoted(self):
self.assertEqual(8, attr_end)
self.assertEqual(9, attr_lx.pos)

n, name_start, name_end = attr_lx.ReadName()
n, name_start, name_end, _ = attr_lx.ReadName()
log('n = %r', attr_name_str(n))
self.assertEqual(n, attr_name.Done)

def testSingleQuoted(self):
h = "<a x='&f'>"
attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

n, name_start, name_end = attr_lx.ReadName()
n, name_start, name_end, _ = attr_lx.ReadName()
self.assertEqual(n, attr_name.Ok)
self.assertEqual(3, name_start)
self.assertEqual(4, name_end)
Expand All @@ -204,15 +204,15 @@ def testSingleQuoted(self):
self.assertEqual(8, attr_end)
self.assertEqual(9, attr_lx.pos)

n, name_start, name_end = attr_lx.ReadName()
n, name_start, name_end, _ = attr_lx.ReadName()
#log('n = %r', attr_name_str(n))
self.assertEqual(n, attr_name.Done)

def testDoubleQuoted_Bad(self):
h = '<a x="foo>'
attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

n, name_start, name_end = attr_lx.ReadName()
n, name_start, name_end, _ = attr_lx.ReadName()
self.assertEqual(n, attr_name.Ok)
self.assertEqual(3, name_start)
self.assertEqual(4, name_end)
Expand All @@ -228,7 +228,7 @@ def testSingleQuoted_Bad(self):
h = "<a x='foo>"
attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

n, name_start, name_end = attr_lx.ReadName()
n, name_start, name_end, _ = attr_lx.ReadName()
self.assertEqual(n, attr_name.Ok)
self.assertEqual(3, name_start)
self.assertEqual(4, name_end)
Expand Down
17 changes: 15 additions & 2 deletions data_lang/htm8_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from typing import List

from _devbuild.gen.htm8_asdl import (h8_id, h8_id_str)
from _devbuild.gen.htm8_asdl import (h8_id, h8_id_str, attr_value_e)
from data_lang import htm8
from data_lang.htm8 import (Lexer, LexError, ParseError, Output)
from doctools.util import log
Expand Down Expand Up @@ -149,7 +149,20 @@ def ToXml(htm8_str):
elif tok_id in (h8_id.StartTag, h8_id.StartEndTag):
attr_lexer.Init(tok_id, lx.TagNamePos(), end_pos)
all_attrs = htm8.AllAttrsRawSlice(attr_lexer)
for name_start, name_end, v, val_start, val_end in all_attrs:
for name_start, name_end, equal_end, v, val_start, val_end in all_attrs:
if v == attr_value_e.Missing: # <a missing>
out.PrintUntil(name_end)
out.Print('=""')
elif v == attr_value_e.Empty: # <a empty=>
out.PrintUntil(equal_end)
out.Print('""')
elif v == attr_value_e.Unquoted: # <a foo=bar>
# Because we disallow ", we can just surround with quotes
out.PrintUntil(val_start)
out.Print('"')
out.PrintUntil(val_end)
out.Print('"')

#val_lexer.Reset(val_start, val_end)
pass
# TODO: get the kind of string
Expand Down
18 changes: 9 additions & 9 deletions data_lang/htm8_util_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,17 +55,17 @@ def testValid(self):
VALID_LEX = [
# TODO: convert these to XML
('<foo></foo>', UNCHANGED),
('<foo x=y></foo>', ''),
('<foo x=y></foo>', '<foo x="y"></foo>'),
#('<foo x="&"></foo>', '<foo x="&amp;"></foo>'),
('<foo x="&"></foo>', ''),

# Allowed with BadAmpersand
('<p> x & y </p>', '<p> x &amp; y </p>'),

# No ambiguity
('<img src=/ >', ''),
('<img src="/">', ''),
('<img src=foo/ >', ''),
('<img src=/ >', '<img src="/" >'),
('<img src="/">', UNCHANGED),
('<img src=foo/ >', '<img src="foo/" >'),
]

INVALID_PARSE = [
Expand Down Expand Up @@ -112,16 +112,16 @@ def testValid(self):
('<meta><a></a>', ''),

# no attribute
('<button disabled></button>', ''),
('<button disabled=></button>', ''),
('<button disabled= ></button>', ''),
('<button disabled></button>', '<button disabled=""></button>'),
('<button disabled=></button>', '<button disabled=""></button>'),
('<button disabled= ></button>', '<button disabled= ""></button>'),

# single quoted is pretty common
("<a href='single'></a>", ''),

# Conceding to reality - I used these myself
('<a href=ble.sh></a>', ''),
('<a href=foo.html></a>', ''),
('<a href=ble.sh></a>', '<a href="ble.sh"></a>'),
('<a href=foo.html></a>', '<a href="foo.html"></a>'),
('<foo x="&"></foo>', ''),

# caps
Expand Down

0 comments on commit 194a6ea

Please sign in to comment.