Skip to content

Commit 391f24f

Browse files
authored
Add support for parsing HTML numeric entities (#645)
1 parent 072b2b0 commit 391f24f

File tree

5 files changed

+43
-0
lines changed

5 files changed

+43
-0
lines changed

docs/v4/5.Entities.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,8 @@ Following HTML entities are supported by the parser by default when `htmlEntitie
132132
|| Indian Rupee | `&inr;` | `₹` |
133133
---
134134

135+
In addition, [numeric character references](https://html.spec.whatwg.org/multipage/syntax.html#syntax-charref) are supported in both decimal (`num_dec`) and hexadecimal (`num_hex`) forms.
136+
135137
In a future version of FXP, we will support more DOCTYPE features, such as `ELEMENT` and reading the content of an entity from a file.
136138

137139
## External Entities

spec/entities_spec.js

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,41 @@ describe("XMLParser Entities", function() {
377377
expect(result).toEqual(expected);
378378
});
379379

380+
381+
it("should parse HTML numeric entities when htmlEntities:true", function() {
382+
const xmlData = `
383+
<?xml version="1.0" encoding="UTF-8"?>
384+
<note>
385+
<heading>Bear</heading>
386+
<body face="&#x295;&#x2022;&#x1D25;&#x2022;&#x294;">Bears are called B&#228;ren in German!</body>
387+
</note> `;
388+
389+
const expected = {
390+
"?xml": {
391+
"version": "1.0",
392+
"encoding": "UTF-8"
393+
},
394+
"note": {
395+
"heading": "Bear",
396+
"body": {
397+
"#text": "Bears are called Bären in German!",
398+
"face": "ʕ•ᴥ•ʔ"
399+
}
400+
}
401+
};
402+
403+
const options = {
404+
attributeNamePrefix: "",
405+
ignoreAttributes: false,
406+
processEntities: true,
407+
htmlEntities: true,
408+
};
409+
const parser = new XMLParser(options);
410+
let result = parser.parse(xmlData);
411+
412+
expect(result).toEqual(expected);
413+
});
414+
380415
it("should throw error if an entity name contains special char", function() {
381416
const xmlData = `
382417
<?xml version="1.0" encoding="UTF-8"?>

src/v5/EntitiesParser.js

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ const htmlEntities = {
1313
"copyright" : { regex: /&(copy|#169);/g, val: "©" },
1414
"reg" : { regex: /&(reg|#174);/g, val: "®" },
1515
"inr" : { regex: /&(inr|#8377);/g, val: "₹" },
16+
"num_dec": { regex: /&#([0-9]{1,7});/g, val : (_, str) => String.fromCharCode(Number.parseInt(str, 10)) },
17+
"num_hex": { regex: /&#x([0-9a-fA-F]{1,6});/g, val : (_, str) => String.fromCharCode(Number.parseInt(str, 16)) },
1618
};
1719

1820
class EntitiesParser{

src/v5/valueParsers/EntitiesParser.js

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ const htmlEntities = {
1313
"copyright" : { regex: /&(copy|#169);/g, val: "©" },
1414
"reg" : { regex: /&(reg|#174);/g, val: "®" },
1515
"inr" : { regex: /&(inr|#8377);/g, val: "₹" },
16+
"num_dec": { regex: /&#([0-9]{1,7});/g, val : (_, str) => String.fromCharCode(Number.parseInt(str, 10)) },
17+
"num_hex": { regex: /&#x([0-9a-fA-F]{1,6});/g, val : (_, str) => String.fromCharCode(Number.parseInt(str, 16)) },
1618
};
1719

1820
class EntitiesParser{

src/xmlparser/OrderedObjParser.js

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ class OrderedObjParser{
4040
"copyright" : { regex: /&(copy|#169);/g, val: "©" },
4141
"reg" : { regex: /&(reg|#174);/g, val: "®" },
4242
"inr" : { regex: /&(inr|#8377);/g, val: "₹" },
43+
"num_dec": { regex: /&#([0-9]{1,7});/g, val : (_, str) => String.fromCharCode(Number.parseInt(str, 10)) },
44+
"num_hex": { regex: /&#x([0-9a-fA-F]{1,6});/g, val : (_, str) => String.fromCharCode(Number.parseInt(str, 16)) },
4345
};
4446
this.addExternalEntities = addExternalEntities;
4547
this.parseXml = parseXml;

0 commit comments

Comments
 (0)