Skip to content
This repository has been archived by the owner on Oct 22, 2022. It is now read-only.

HTMLFileContent and component for extracting IDL fragments #46

Merged
merged 27 commits into from
Jun 27, 2017
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
dd775b5
WIP: Adding HTML Content parsing
m-cheung May 23, 2017
58e5e48
Adding preliminary HTMLParsing and corresponding tests
m-cheung May 29, 2017
32366c5
Merge branch 'htmlContent'
m-cheung May 29, 2017
6376b3d
Renaming HTMLParser to HTMLFileContent and adding additional tests
m-cheung May 30, 2017
959d608
Removing additional files
m-cheung May 30, 2017
b35f750
Adding some additional tests and filtering out examples / notes
m-cheung May 30, 2017
853d12d
Adding additional tests
m-cheung May 30, 2017
0f92a6b
Temporary work on pipeline
m-cheung May 31, 2017
b640239
Merge branch 'master' into htmlContent
m-cheung Jun 1, 2017
e9eb44d
Merge branch 'master' into htmlContent
m-cheung Jun 6, 2017
5bcfc90
Merge branch 'master' into htmlContent
m-cheung Jun 6, 2017
a041087
Refactoring HTMLFileContent to correspond with HTMLLexer changes
m-cheung Jun 7, 2017
5969d4b
Part 2 of refactor due to HTMLLexer changes
m-cheung Jun 7, 2017
ad7669e
Renaming components
m-cheung Jun 8, 2017
dd2bd1d
Removing parsing step from extractor
m-cheung Jun 8, 2017
f812b58
Removing unused variables and fixing comment styling
m-cheung Jun 8, 2017
ff6ad90
Fixing description for HTMLFileContents
m-cheung Jun 8, 2017
c5fb6fb
Removing requirement for HTTPRequest in HTMLFileContents
m-cheung Jun 8, 2017
c736c28
Allowing for nested excluded tags
m-cheung Jun 9, 2017
af4cd9d
Adding support for nested exclude tags
m-cheung Jun 9, 2017
20932a6
Adding Future message with regards to nested pre tags
m-cheung Jun 13, 2017
5f93bbb
Fixing minor formatting problem
m-cheung Jun 13, 2017
30df62a
Additional documentation in IDLFragmentExtractor and HTMLFileContents
m-cheung Jun 14, 2017
7c3d78e
Adding additional documentation to IDLFragmentExtractor
m-cheung Jun 14, 2017
6865625
Formatting and style fixes on HTMLFileContents and IDLFragmentExtractor
m-cheung Jun 14, 2017
2dc37e8
Minor changes to HTMLFileContents and removing unneeded test
m-cheung Jun 26, 2017
c93dec6
Addressing formatting changes in HTMLFileContents
m-cheung Jun 27, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions config/files.js
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@
`lib${sep}org${sep}chromium${sep}webidl${sep}ast${sep}Stringifier.js`,
`lib${sep}org${sep}chromium${sep}webidl${sep}ast${sep}Float.js`,

// HTML File Content and Extractor
`lib${sep}org${sep}chromium${sep}webidl${sep}HTMLFileContents.js`,
`lib${sep}org${sep}chromium${sep}webidl${sep}IDLFragmentExtractor.js`,

// Web IDL parsers
`lib${sep}org${sep}chromium${sep}webidl${sep}BaseParser.js`,
`lib${sep}org${sep}chromium${sep}webidl${sep}Parser.js`,
Expand Down
33 changes: 33 additions & 0 deletions lib/org/chromium/webidl/HTMLFileContents.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// Copyright 2017 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
'use strict';

foam.CLASS({
package: 'org.chromium.webidl',
name: 'HTMLFileContents',

documentation: 'An HTML file that stores it contents.',
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

More docs: Is the HTMLFileContents.contents pre-processed in any way? (E.g., &foo;-escaped?) or is it the raw request body?


ids: ['url', 'timestamp'],

properties: [
{
class: 'String',
name: 'url',
required: true,
final: true
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: add trailing comma

},
{
class: 'Date',
name: 'timestamp',
required: true,
final: true,
},
{
class: 'String',
name: 'content',
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

HTMLFileContents.contents would be a more consistent name.

final: true,
},
],
});
1 change: 0 additions & 1 deletion lib/org/chromium/webidl/IDLFileContents.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ foam.CLASS({
requires: [
'foam.core.Property',
'foam.net.HTTPrequest',
'org.chromium.webidl.IDLFileBase',
],

properties: [
Expand Down
101 changes: 101 additions & 0 deletions lib/org/chromium/webidl/IDLFragmentExtractor.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
// Copyright 2017 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
'use strict';

foam.CLASS({
package: 'org.chromium.webidl',
name: 'IDLFragmentExtractor',
documentation: 'extracts IDL Fragments from HTML files',
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: Full sentence. (with capital and period)

requires: [
'foam.parsers.html.HTMLLexer'
],

properties: [
{
name: 'file',
of: 'org.chromium.webidl.HTMLFileContent',
postSet: function(_, file) {
this.idlFragments = this.extract();
},
},
{
name: 'idlFragments',
// description: 'all idlFragments found in the parsed HTML file',
},
],

methods: [
function extract() {
var self = this;
var lexer = self.HTMLLexer.create();
var OPEN = lexer.TagType.OPEN.name;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here and elsewhere (lexer.SomeClassName and lexer.lib): We should probably require classes we refer to and use this.TagType (or similar); this depends on the implementation detail of "lexer happens to require Tag". For the lib we can just refer to the foam.path.to.lib once and store it in a variable.

var CLOSE = lexer.TagType.CLOSE.name;
var extractAttr = function(node, attrName) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's legitimate to have:

<node-name attr="value1
    value2" attr="value3">

to yield {attr: ['value1', 'value2', 'value3']}

I assume HTMLLexer doesn't collapse whitespace, so I think we need to revise this.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have made minor changes to the extract code which allows for this. It will be part of the next set of changes.

var retVal;
node.attributes.some(function(attr) {
if (attr.name === attrName) {
retVal = attr.value.split(' ');
}
});
return retVal;
};

var results = lexer.parseString(self.file.content).value;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should probably throw something intentional on incomplete or failed parse.


var idlFragments = [];
var tagStack = []; // Used for tag matching.
var excludeStack = []; // Used for tracking excluded (example/note) tags.
var tagMatching = true; // Set when not inside a pre tag of interest.
var content = ''; // Used to group together related content.
for (var i = 0; i < results.length; i++) {
var item = results[i];
var isTag = lexer.Tag.isInstance(item);

// FUTURE: Handle nested pre tags.
// As of this writing, no IDL fragments have been found
// within nested pre tags.
if (!tagMatching) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Clearer IMHO: if (isTag) { if (item.nodeName === 'pre') { ... } else { ... } }

// Ignoring all tags. Only extracting text within pre tags.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like this comment. Can we get a comment at the top of each if branch in this method? The logic is pretty complex.

if (isTag && item.nodeName === 'pre') {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this need to handle nested pre tags?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is hard to say at this point whether nested pre tags affect the information we care about. From my current observations, there haven't been any IDL fragments within nested pre tags (they seem to be mostly used for formatting). Thus, the current approach seems sufficient for our purposes (at least I hope so).

We could attempt to put the content through another round of processing or implement a proper HTML parser (which was my first attempt at this problem, but was scrapped since it did a lot more than it needed to and likely had other issues too).

tagMatching = true;
tagStack.pop();
idlFragments.push(lexer.lib.unescapeString(content));
content = '';
} else {
content += isTag ? '' : item;
}
} else if (isTag) {
var top = tagStack[tagStack.length - 1];
var classes = extractAttr(item, 'class');
var isIDL = classes && classes.includes('idl');
var isExcluded = function(cls) {
return cls && (cls.includes('example') || cls.includes('note'));
};

if (item.type.name === OPEN) {
if (isExcluded(classes)) {
excludeStack.push(item);
} else if (excludeStack.length === 0 && item.nodeName === 'pre' && isIDL) {
// Found a <pre class="idl.*">.
// Ignore tags and only extract text now.
tagMatching = false;
}
tagStack.push(item);
} else if (top && item.type.name === CLOSE && top.nodeName === item.nodeName) {
var parentCls = extractAttr(top, 'class');
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't parent, right? It's openTag? Maybe the code would be easier to read if this branch started with:

var openTag = top;
var closeTag = item

and open and close prefixes are used to refer to tag-related things.

var excludeStackTop = excludeStack[excludeStack.length - 1];
if (isExcluded(parentCls) && item.nodeName === excludeStackTop.nodeName) {
excludeStack.pop();
}
tagStack.pop();
} else {
// Mismatched close tags and OPEN_CLOSE tags are ignored.
}
}
}
return idlFragments;
}
]
});

51 changes: 51 additions & 0 deletions test/any/htmlFileClasses-test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
// Copyright 2017 The Chromium Authors. All rights reserved.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

file name: we just test HTMLFileContents, yes? Let's name this file after that: HTMLFileContents-test.js.

// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
'use strict';

describe('HTML file classes', function() {
var HTMLFileContents;

beforeEach(function() {
HTMLFileContents = foam.lookup('org.chromium.webidl.HTMLFileContents');
});

it('should fetch some content and properly set the timestamp', function() {
  var url = 'http://someTest.url/index.html';
  var content = '<html></html>';
  // Create the timestamp once and pass the same instance to create().
  // Constructing a second `new Date()` for the file could land on a later
  // millisecond and make the getTime() comparison below fail intermittently.
  var date = new Date();
  var file = HTMLFileContents.create({
    url: url,
    timestamp: date,
    content: content
  });

  // Verify properties are as set.
  expect(file.url).toBe(url);
  expect(file.timestamp.getTime()).toBe(date.getTime());
  expect(file.content).toBe(content);
});

it('should fail to set HTMLFileContent props after creation', function() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should probably nix this due to how DatastoreDAO works.

var initUrl = 'http://someTest.url/index.html';
var newUrl = 'http://someOther.url/index.html';
var initContent = '<html></html>';
var newContent = '<html>Potato</html>';
var origDate = new Date(0);
var newDate = new Date();

var file = HTMLFileContents.create({
url: initUrl,
timestamp: origDate,
content: initContent
});

// On set, they should fail
expect(function() { file.url = newUrl; }).toThrow();
expect(file.url).toBe(initUrl);
expect(function() { file.content = newContent; }).toThrow();
expect(file.content).toBe(initContent);
expect(function() { file.timestamp = newDate; }).toThrow();
expect(file.timestamp.getTime()).toBe(origDate.getTime());
});
});
25 changes: 25 additions & 0 deletions test/node/parsing/Console/2
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
[Exposed=(Window,Worker,Worklet)]
namespace console { // but see namespace object requirements below
// Logging
void assert(optional boolean condition = false, any... data);
void clear();
void count(optional DOMString label = "default");
void debug(any... data);
void error(any... data);
void info(any... data);
void log(any... data);
void table(any tabularData, optional sequence<DOMString> properties);
void trace(any... data);
void warn(any... data);
void dir(any item, optional object? options);
void dirxml(any... data);

// Grouping
void group(any... data);
void groupCollapsed(any... data);
void groupEnd();

// Timing
void time(optional DOMString label = "default");
void timeEnd(optional DOMString label = "default");
};
25 changes: 25 additions & 0 deletions test/node/parsing/Console/3
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
[Exposed=(Window,Worker,Worklet)]
namespace console { // but see namespace object requirements below
// Logging
void assert(optional boolean condition = false, any... data);
void clear();
void count(optional DOMString label = "default");
void debug(any... data);
void error(any... data);
void info(any... data);
void log(any... data);
void table(any tabularData, optional sequence<DOMString> properties);
void trace(any... data);
void warn(any... data);
void dir(any item, optional object? options);
void dirxml(any... data);

// Grouping
void group(any... data);
void groupCollapsed(any... data);
void groupEnd();

// Timing
void time(optional DOMString label = "default");
void timeEnd(optional DOMString label = "default");
};
Loading