Skip to content

Commit

Permalink
AG-16475 parse complex selector with extended pseudo-class inside
Browse files Browse the repository at this point in the history
Squashed commit of the following:

commit f6d2aecf144c3daf50688f7408fccf4243d13c24
Author: Slava Leleka <v.leleka@adguard.com>
Date:   Wed Oct 5 21:39:02 2022 +0300

    fix typo: inside of it -> inside it

commit a9a211f32fa23275becf4ac4ea34566b491099b7
Author: Slava Leleka <v.leleka@adguard.com>
Date:   Wed Oct 5 21:37:02 2022 +0300

    fix comment for square bracket left

commit b97f90e77138763b067bbe2f1472db3daad2fb15
Author: Slava Leleka <v.leleka@adguard.com>
Date:   Wed Oct 5 21:34:14 2022 +0300

    parse standard pseudo-class with brackets after extended one in single complex selector

commit 807f1c4998f6e09086a760d084890868989ec649
Merge: 0e8d5c3 92d6b8b
Author: Slava Leleka <v.leleka@adguard.com>
Date:   Wed Oct 5 14:08:47 2022 +0300

    Merge branch 'epic/AG-3532' into fix/AG-16475

commit 0e8d5c377e40f7cbd7c62e6cab8160087493016d
Author: Slava Leleka <v.leleka@adguard.com>
Date:   Tue Oct 4 15:29:20 2022 +0300

    add few more query-jsdom tests for complex selectors

commit a2872da1456580dfc506ecdd360184ab70ed6462
Author: Slava Leleka <v.leleka@adguard.com>
Date:   Tue Oct 4 15:26:17 2022 +0300

    add query-jsdom tests for complex selectors

commit 189a08c2d324fe8d09f7eef91fcb945f281f2577
Author: Slava Leleka <v.leleka@adguard.com>
Date:   Tue Oct 4 15:14:25 2022 +0300

    parse complex selector with standard pseudo after extended one

commit 02607e20ef29e6c2c670b962fda7b2b7eda02f0f
Author: Slava Leleka <v.leleka@adguard.com>
Date:   Mon Oct 3 20:07:58 2022 +0300

    add one more xpath limitation to readme

commit 8e6a50770a918a0aaf67863f20ace32266f1552c
Merge: e990fc8 1e77dac
Author: Slava Leleka <v.leleka@adguard.com>
Date:   Mon Oct 3 20:04:34 2022 +0300

    Merge branch 'epic/AG-3532' into fix/AG-16475

commit e990fc83fde9bc5a022ff9a6fcd002087bd6f4c0
Author: Slava Leleka <v.leleka@adguard.com>
Date:   Mon Oct 3 20:04:17 2022 +0300

    fix xpath parsing

commit e9af913aa4d07b3e21bb9d2c7ff9a262e959bc7b
Author: Slava Leleka <v.leleka@adguard.com>
Date:   Mon Oct 3 19:59:28 2022 +0300

    add more complex selector tests

commit 6a31811f72e5fd79c8cb24361c363061a6c6e88c
Author: Slava Leleka <v.leleka@adguard.com>
Date:   Mon Oct 3 19:57:31 2022 +0300

    parse complex selector with extended pseudo-class inside of it
  • Loading branch information
slavaleleka committed Oct 6, 2022
1 parent 92d6b8b commit 36433d9
Show file tree
Hide file tree
Showing 4 changed files with 390 additions and 57 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,8 @@ Pseudo-class `:xpath()` allows to select an element by evaluating a XPath expres
> Extended selectors with defined `target` as *any* selector — `*:xpath(expression)` — can still be used but it is not recommended, so `target` should be specified instead.
> Works properly only at the end of a selector, except for the [pseudo-class :remove()](#remove-pseudos).
**Examples**

`:xpath(//*[@class="banner"])` will select `div#target1`:
Expand Down
236 changes: 185 additions & 51 deletions src/selector/parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ import {
WHITE_SPACE_CHARACTERS,
SUPPORTED_PSEUDO_CLASSES,
ABSOLUTE_PSEUDO_CLASSES,
RELATIVE_PSEUDO_CLASSES,
XPATH_PSEUDO_CLASS_MARKER,
HAS_PSEUDO_CLASS_MARKERS,
IS_PSEUDO_CLASS_MARKER,
Expand Down Expand Up @@ -143,6 +144,28 @@ const getBufferNode = (context: Context): AnySelectorNodeInterface | null => {
return getLast(context.pathToBufferNode);
};

/**
 * Finds the last RegularSelector child of the current buffer node,
 * appends it to `context.pathToBufferNode` (so it becomes the buffer node) and returns it.
 * Used while parsing a complex selector which has an extended pseudo-class inside it,
 * e.g. 'div:has(img).banner'
 *
 * @param context parser context
 * @returns the last child of the current Selector node whose type is RegularSelector
 * @throws if there is no buffer node, if the buffer node is not a Selector node,
 * or if the Selector node has no RegularSelector children
 */
const getLastRegularSelectorNode = (context: Context): AnySelectorNodeInterface => {
    const currentNode = getBufferNode(context);
    if (!currentNode) {
        throw new Error('No bufferNode found');
    }
    if (currentNode.type !== NodeType.Selector) {
        throw new Error('Unsupported bufferNode type');
    }

    // only RegularSelector children are of interest, ExtendedSelector ones are skipped
    const regularChildren = currentNode.children.filter((child) => {
        return child.type === NodeType.RegularSelector;
    });
    if (regularChildren.length === 0) {
        throw new Error('No RegularSelector node found');
    }

    const lastRegularChild = getLast(regularChildren);
    // the found node is put on the path so that following tokens update it
    context.pathToBufferNode.push(lastRegularChild);
    return lastRegularChild;
};

/**
* Updates needed buffer node value while tokens iterating
* @param context parser context
Expand Down Expand Up @@ -239,6 +262,109 @@ const upToClosest = (context: Context, parentType: NodeType): void => {
}
};

/**
 * Chooses the ast node which should become the buffer node
 * while a complex selector with an extended pseudo-class inside it is being parsed,
 * appends that node to `context.pathToBufferNode` and returns it.
 *
 * The path is first unwound to the closest Selector node, then its last child is inspected:
 * - last child is ExtendedSelector and parser is still inside its relative or absolute pseudo-class —
 *   the pseudo-class node (first child of the ExtendedSelector) is returned to be updated later;
 * - last child is ExtendedSelector otherwise — the Selector node itself is returned,
 *   so a new RegularSelector node may be added to it later;
 * - in any other case the last RegularSelector child is returned to be updated later.
 *
 * @param context parser context
 * @returns ast node to be used as buffer node
 * @throws if there is no buffer node after unwinding to the closest Selector node
 */
const getUpdatedBufferNode = (context: Context): AnySelectorNodeInterface | null => {
    upToClosest(context, NodeType.Selector);
    const selectorNode = getBufferNode(context);
    if (!selectorNode) {
        throw new Error('No SelectorNode, impossible to continue selector parsing');
    }

    const lastChild = getLast(selectorNode.children);
    // parser position might be inside brackets of a standard pseudo-class which contain a space,
    // e.g. 'div:contains(/a/):nth-child(100n + 2)',
    // so the extended selector counts only if no standard pseudo-class bracket is open
    const endsWithExtended = lastChild.type === NodeType.ExtendedSelector
        && context.standardPseudoBracketsStack.length === 0;
    const extendedPseudoName = endsWithExtended
        && lastChild.children[0].name;

    const bracketsStack = context.extendedPseudoBracketsStack;
    const namesStack = context.extendedPseudoNamesStack;

    let isInsideExtendedPseudo = false;
    if (extendedPseudoName) {
        const isInsideRelative = RELATIVE_PSEUDO_CLASSES.includes(extendedPseudoName)
            && bracketsStack.length > 0
            && bracketsStack.length === namesStack.length;
        const isInsideAbsolute = ABSOLUTE_PSEUDO_CLASSES.includes(extendedPseudoName)
            && extendedPseudoName === getLast(namesStack);
        isInsideExtendedPseudo = isInsideRelative || isInsideAbsolute;
    }

    let nextBufferNode: AnySelectorNodeInterface;
    if (isInsideExtendedPseudo) {
        // relative or absolute pseudo-class node is to be updated later,
        // its ExtendedSelector parent has to be on the path as well
        context.pathToBufferNode.push(lastChild);
        nextBufferNode = lastChild.children[0];
    } else if (endsWithExtended) {
        // Selector node is returned to get a new RegularSelector node added later
        nextBufferNode = selectorNode;
    } else {
        // last RegularSelector node is to be updated later.
        // NOTE(review): getLastRegularSelectorNode() already pushes the node to the path,
        // so the push below puts it there for the second time — confirm this is intended
        nextBufferNode = getLastRegularSelectorNode(context);
    }
    context.pathToBufferNode.push(nextBufferNode);
    return nextBufferNode;
};

/**
 * Handles the colon token `:` by checking the tokens which follow it:
 * - for a standard pseudo-class the colon is collected to the value of the current buffer node,
 *   and the pseudo-class name is stacked if it is followed by the left parenthesis;
 * - for a supported extended pseudo-class RegularSelector value collecting is stopped
 *   and a new ExtendedSelector ast node is added;
 * - `:remove()` at this position and `:has()` inside a standard pseudo-class with brackets are rejected.
 *
 * The pseudo-class name is compared case-insensitively in all of the checks,
 * so e.g. `:HAS(` is validated the same way as `:has(`.
 *
 * @param context parser context
 * @param selector the whole selector being parsed, used only in the error message
 * @param tokenValue value of the current token, i.e. the colon
 * @param nextTokenValue value of the token after the colon, i.e. supposed pseudo-class name
 * @param nextToNextTokenValue value of the token after the supposed pseudo-class name
 * @throws on `:remove()` pseudo-class met here,
 * or on `:has()` pseudo-class used inside a standard pseudo-class with brackets
 */
const handleNextTokenOnColon = (
    context: Context,
    selector: string,
    tokenValue: string,
    nextTokenValue: string,
    nextToNextTokenValue: string,
): void => {
    // normalized once so that every name check below treats the case the same way
    const pseudoName = nextTokenValue.toLowerCase();

    if (!isSupportedExtendedPseudo(pseudoName)) {
        if (pseudoName === REMOVE_PSEUDO_MARKER) {
            // :remove() pseudo-class should be handled before
            // as it is not about element selecting but actions with elements
            // e.g. 'body > div:empty:remove()'
            throw new Error(`Selector parser error: invalid :remove() pseudo-class in selector: '${selector}'`); // eslint-disable-line max-len
        }
        // if following token is not an extended pseudo
        // the colon should be collected to value of RegularSelector
        // e.g. '.entry_text:nth-child(2)'
        updateBufferNode(context, tokenValue);
        // parentheses should be balanced later
        // only for functional pseudo-class (standard with brackets, e.g. ':lang()').
        // no brackets balance is needed if parser position is on first colon after the 'div':
        // e.g. 'div:last-child:has(button.privacy-policy__btn)'
        if (nextToNextTokenValue === BRACKETS.PARENTHESES.LEFT) {
            context.standardPseudoNamesStack.push(nextTokenValue);
        }
        return;
    }

    // it is supported extended pseudo-class.
    // Disallow :has() inside the pseudos accepting only compound selectors
    // https://bugs.chromium.org/p/chromium/issues/detail?id=669058#c54 [2]
    // presumably HAS_PSEUDO_CLASS_MARKERS holds lowercase names
    // as isSupportedExtendedPseudo() is called with a lowercase name as well — TODO confirm
    if (HAS_PSEUDO_CLASS_MARKERS.includes(pseudoName)
        && context.standardPseudoNamesStack.length > 0) {
        // eslint-disable-next-line max-len
        throw new Error(`Usage of :${nextTokenValue} pseudo-class is not allowed inside regular pseudo: '${getLast(context.standardPseudoNamesStack)}'`);
    }
    // stop RegularSelector value collecting
    upToClosest(context, NodeType.Selector);
    // add ExtendedSelector to Selector children
    addAstNodeByType(context, NodeType.ExtendedSelector);
};

/**
* Parses selector into ast for following element selection
* @param selector
Expand Down Expand Up @@ -349,6 +475,15 @@ export const parse = (selector: string): AnySelectorNodeInterface => {
}
break;
case SPACE:
// it might be complex selector with extended pseudo-class inside it
// and the space is between that complex selector and following regular selector
// parser position is on ` ` before `span` now:
// e.g. 'div:has(img).banner span'
// so we need to check whether the new ast node should be added (example above)
// or previous regular selector node should be updated
if (bufferNode?.type === NodeType.RegularSelector) {
bufferNode = getUpdatedBufferNode(context);
}
if (bufferNode?.type === NodeType.RegularSelector) {
// standard selectors with white space between colon and name of pseudo
// are invalid for native document.querySelectorAll() anyway,
Expand Down Expand Up @@ -419,6 +554,15 @@ export const parse = (selector: string): AnySelectorNodeInterface => {
case ID_MARKER:
case CLASS_MARKER:
case BRACKETS.SQUARE.LEFT:
// it might be complex selector with extended pseudo-class inside it
// and the space is between that complex selector and following regular selector
// e.g. 'div:has(img).banner' // parser position is on `.` before `banner` now
// 'div:has(img)[attr]' // parser position is on `[` before `attr` now
// so we need to check whether the new ast node should be added (example above)
// or previous regular selector node should be updated
if (COMBINATORS.includes(tokenValue)) {
bufferNode = getUpdatedBufferNode(context);
}
if (bufferNode === null) {
// no ast collecting has been started
if (tokenValue === ASTERISK
Expand Down Expand Up @@ -465,6 +609,13 @@ export const parse = (selector: string): AnySelectorNodeInterface => {
// or '.inner:nth-ancestor(1)~ .banner'
if (COMBINATORS.includes(tokenValue)) {
addAstNodeByType(context, NodeType.RegularSelector, tokenValue);
} else if (!context.isRegexpOpen) {
// it might be complex selector with extended pseudo-class inside it.
// parser position is on `.` now:
// e.g. 'div:has(img).banner'
// so we need to get last regular selector node and update its value
bufferNode = getLastRegularSelectorNode(context);
updateBufferNode(context, tokenValue);
}
} else if (bufferNode.type === NodeType.SelectorList) {
// add Selector to SelectorList
Expand Down Expand Up @@ -569,41 +720,7 @@ export const parse = (selector: string): AnySelectorNodeInterface => {
|| nextTokenValue === REGULAR_PSEUDO_CLASSES.WHERE)) {
throw new Error(`Usage of :${nextTokenValue} pseudo-class is not allowed inside upper :has`); // eslint-disable-line max-len
}

if (!isSupportedExtendedPseudo(nextTokenValue.toLowerCase())) {
if (nextTokenValue.toLowerCase() === REMOVE_PSEUDO_MARKER) {
// :remove() pseudo-class should be handled before
// as it is not about element selecting but actions with elements
// e.g. 'body > div:empty:remove()'
throw new Error(`Selector parser error: invalid :remove() pseudo-class in selector: '${selector}'`); // eslint-disable-line max-len
}
// if following token is not an extended pseudo
// the colon should be collected to value of RegularSelector
// e.g. '.entry_text:nth-child(2)'
updateBufferNode(context, tokenValue);
// check the token after the pseudo and do balance parentheses later
// only if it is functional pseudo-class (standard with brackets, e.g. ':lang()').
// no brackets balance needed for such case,
// parser position is on first colon after the 'div':
// e.g. 'div:last-child:has(button.privacy-policy__btn)'
if (nextToNextTokenValue === BRACKETS.PARENTHESES.LEFT) {
context.standardPseudoNamesStack.push(nextTokenValue);
}
} else {
// it is supported extended pseudo-class.
// Disallow :has() inside the pseudos accepting only compound selectors
// https://bugs.chromium.org/p/chromium/issues/detail?id=669058#c54 [2]
if (HAS_PSEUDO_CLASS_MARKERS.includes(nextTokenValue)
&& context.standardPseudoNamesStack.length > 0) {
// eslint-disable-next-line max-len
throw new Error(`Usage of :${nextTokenValue} pseudo-class is not allowed inside regular pseudo: '${getLast(context.standardPseudoNamesStack)}'`);
} else {
// stop RegularSelector value collecting
upToClosest(context, NodeType.Selector);
// add ExtendedSelector to Selector children
addAstNodeByType(context, NodeType.ExtendedSelector);
}
}
handleNextTokenOnColon(context, selector, tokenValue, nextTokenValue, nextToNextTokenValue);
}
if (bufferNode?.type === NodeType.Selector) {
// after the extended pseudo closing parentheses
Expand All @@ -621,10 +738,12 @@ export const parse = (selector: string): AnySelectorNodeInterface => {
// e.g. '#banner:upward(2):remove()'
throw new Error(`Selector parser error: invalid :remove() pseudo-class in selector: '${selector}'`); // eslint-disable-line max-len
} else {
// otherwise it is standard pseudo after extended pseudo-class
// and colon should be collected to value of RegularSelector
// otherwise it is standard pseudo after extended pseudo-class in complex selector
// and colon should be collected to value of previous RegularSelector
// e.g. 'body *:not(input)::selection'
addAstNodeByType(context, NodeType.RegularSelector, tokenValue);
// 'input:matches-css(padding: 10):checked'
bufferNode = getLastRegularSelectorNode(context);
handleNextTokenOnColon(context, selector, tokenValue, nextTokenType, nextToNextTokenValue); // eslint-disable-line max-len
}
}
if (bufferNode?.type === NodeType.AbsolutePseudoClass) {
Expand Down Expand Up @@ -701,22 +820,37 @@ export const parse = (selector: string): AnySelectorNodeInterface => {
updateBufferNode(context, tokenValue);
} else {
// remove stacked open parentheses for brackets balance
// and stacked name of extended pseudo-class
// e.g. 'h3:contains((Ads))'
// or 'div:xpath(//h3[contains(text(),"Share it!")]/..)'
context.extendedPseudoBracketsStack.pop();
context.extendedPseudoNamesStack.pop();
if (context.extendedPseudoBracketsStack.length > context.extendedPseudoNamesStack.length) { // eslint-disable-line max-len
// if brackets stack is not empty yet, save tokenValue to arg of AbsolutePseudoClass
// parser position on first closing bracket after 'Ads':
// e.g. 'h3:contains((Ads))'
updateBufferNode(context, tokenValue);
} else if (context.extendedPseudoBracketsStack.length >= 0
&& context.extendedPseudoNamesStack.length >= 0) {
// assume it is combined extended pseudo-classes
// parser position on first closing bracket after 'advert':
// e.g. 'div:has(.banner, :contains(advert))'
upToClosest(context, NodeType.Selector);
if (bufferNode.name !== XPATH_PSEUDO_CLASS_MARKER) {
// for all other absolute pseudo-classes except :xpath()
// remove stacked name of extended pseudo-class
context.extendedPseudoNamesStack.pop();
if (context.extendedPseudoBracketsStack.length > context.extendedPseudoNamesStack.length) { // eslint-disable-line max-len
// if brackets stack is not empty yet,
// save tokenValue to arg of AbsolutePseudoClass
// parser position on first closing bracket after 'Ads':
// e.g. 'h3:contains((Ads))'
updateBufferNode(context, tokenValue);
} else if (context.extendedPseudoBracketsStack.length >= 0
&& context.extendedPseudoNamesStack.length >= 0) {
// assume it is combined extended pseudo-classes
// parser position on first closing bracket after 'advert':
// e.g. 'div:has(.banner, :contains(advert))'
upToClosest(context, NodeType.Selector);
}
} else {
// for :xpath()
if (context.extendedPseudoBracketsStack.length < context.extendedPseudoNamesStack.length) { // eslint-disable-line max-len
// remove stacked name of extended pseudo-class
// if there are less brackets than pseudo-class names
// which means the last removed bracket was the closing one for the pseudo-class
context.extendedPseudoNamesStack.pop();
} else {
// otherwise the bracket is part of arg
updateBufferNode(context, tokenValue);
}
}
}
}
Expand Down
Loading

0 comments on commit 36433d9

Please sign in to comment.