Skip to content

Commit 7d05736

Browse files
committed
switch back to a simpler version (similar to 4.16.0) as suggested by René Schwietzke in PR #152
1 parent 12c5283 commit 7d05736

File tree

1 file changed

+56
-149
lines changed

1 file changed

+56
-149
lines changed

src/main/java/org/htmlunit/cyberneko/HTMLElements.java

Lines changed: 56 additions & 149 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,7 @@
1515
*/
1616
package org.htmlunit.cyberneko;
1717

18-
import java.util.ArrayList;
1918
import java.util.HashMap;
20-
import java.util.List;
2119
import java.util.Locale;
2220

2321
import org.htmlunit.cyberneko.util.FastHashMap;
@@ -32,6 +30,7 @@
3230
* @author Ahmed Ashour
3331
* @author Marc Guillemot
3432
* @author Ronald Brill
33+
* @author René Schwietzke
3534
*/
3635
public class HTMLElements implements HTMLElementsProvider {
3736

@@ -202,7 +201,7 @@ public class HTMLElements implements HTMLElementsProvider {
202201
private final HashMap<String, Element> elementsByNameForReference_ = new HashMap<>();
203202

204203
// this is an optimized version which will be queried later
205-
FastHashMap<String, Element>[] elementsByNamePerLength_;
204+
FastHashMap<String, Element> elementsByName_;
206205

207206
public HTMLElements() {
208207
final Element[][] elementsArray = new Element[26][];
@@ -557,7 +556,7 @@ public HTMLElements() {
557556
for (final Element[] elements : elementsArray) {
558557
if (elements != null) {
559558
for (final Element element : elements) {
560-
this.elementsByNameForReference_.put(element.name, element);
559+
elementsByNameForReference_.put(element.name, element);
561560
}
562561
}
563562
}
@@ -566,28 +565,22 @@ public HTMLElements() {
566565
setupOptimizedVersions();
567566
}
568567

568+
/**
569+
* Adds or replaces an element definition in the collection.
570+
* Rebuilds the internal lookup structures to reflect the change.
571+
*
572+
* @param element the element to add or replace
573+
*/
569574
public void setElement(final Element element) {
570-
this.elementsByNameForReference_.put(element.name, element);
575+
elementsByNameForReference_.put(element.name, element);
571576

572577
// rebuild the information "trees"
573578
setupOptimizedVersions();
574579
}
575580

576581
private void setupOptimizedVersions() {
577-
int maxCode = -1;
578-
ArrayList<List<Element>> elementsByLength = new ArrayList<>(10);
579-
for (final Element element : elementsByNameForReference_.values()) {
580-
if (element.code > maxCode) {
581-
maxCode = element.code;
582-
}
583-
584-
int length = element.lowercaseName.length();
585-
while (elementsByLength.size() < length) {
586-
elementsByLength.add(new ArrayList<>(30));
587-
}
588-
List<Element> elements = elementsByLength.get(length - 1);
589-
elements.add(element);
590-
}
582+
// get us the max element code
583+
final int maxCode = elementsByNameForReference_.values().stream().mapToInt(e -> e.code).max().orElse(0);
591584

592585
// we got x amount of elements + 1 unknown
593586
// put that into an array instead of a map, that
@@ -598,22 +591,16 @@ private void setupOptimizedVersions() {
598591
elementsByNameForReference_.values().forEach(v -> elementsByCode_[v.code] = v);
599592
elementsByCode_[NO_SUCH_ELEMENT.code] = NO_SUCH_ELEMENT;
600593

601-
// get us a second version that is lowercase stringified to
602-
// reduce lookup overhead
603-
elementsByNamePerLength_ = new FastHashMap[elementsByLength.size()];
604-
int i = 0;
605-
for (final List<Element> elements : elementsByLength) {
606-
if (elements.size() > 0) {
607-
FastHashMap<String, Element> entry = new FastHashMap<>(elements.size(), 0.70f);
608-
for (Element element : elements) {
609-
entry.put(element.lowercaseName, element);
610-
611-
// initialize cross references to parent elements
612-
defineParents(element);
613-
}
614-
elementsByNamePerLength_[i] = entry;
615-
}
616-
i++;
594+
// add all together and also get us a second version that is
595+
// lowercase only for faster lower case lookups, hence we have twice
596+
// the size of the map as we need to store both versions
597+
elementsByName_ = new FastHashMap<>(2 * maxCode, 0.50f);
598+
599+
for (final Element element : elementsByNameForReference_.values()) {
600+
// initialize cross references to parent elements
601+
defineParents(element);
602+
603+
elementsByName_.put(element.lowercaseName, element);
617604
}
618605

619606
// NO_SUCH_ELEMENT is not part of elementsByNameForReference_
@@ -661,24 +648,14 @@ public final Element getElement(final String ename) {
661648
*/
662649
@Override
663650
public final Element getElement(final String ename, final Element elementIfNotFound) {
664-
int length = ename.length();
665-
if (length > elementsByNamePerLength_.length) {
666-
return elementIfNotFound;
667-
}
668-
669-
FastHashMap<String, Element> entry = elementsByNamePerLength_[length - 1];
670-
if (entry == null) {
671-
return elementIfNotFound;
672-
}
673-
674651
// check the current form casing first, which is mostly lowercase only
675-
Element r = entry.get(ename);
652+
Element r = elementsByName_.get(ename);
676653
if (r == null) {
677654
// we have not found it in its current form, might be uppercase
678655
// or mixed case, so try all lowercase for sanity, we speculated that
679656
// good HTML is mostly all lowercase in the first place so this is the
680657
// fallback for atypical HTML
681-
r = entry.get(ename.toLowerCase(Locale.ROOT));
658+
r = elementsByName_.get(ename.toLowerCase(Locale.ROOT));
682659
}
683660
if (r == null) {
684661
return elementIfNotFound;
@@ -692,24 +669,24 @@ public final Element getElement(final String ename, final Element elementIfNotFo
692669
*/
693670
@Override
694671
public final Element getElementLC(final String enameLC, final Element elementIfNotFound) {
695-
int length = enameLC.length();
696-
if (length > elementsByNamePerLength_.length) {
697-
return elementIfNotFound;
698-
}
699-
700-
FastHashMap<String, Element> entry = elementsByNamePerLength_[length - 1];
701-
if (entry == null) {
702-
return elementIfNotFound;
703-
}
704-
705-
Element r = entry.get(enameLC);
706-
if (r == null) {
707-
return elementIfNotFound;
708-
}
709-
710-
return r;
672+
final Element r = elementsByName_.get(enameLC);
673+
return r == null ? elementIfNotFound : r;
711674
}
712675

676+
/**
677+
* An implementation of {@link HTMLElementsProvider} that wraps an {@link HTMLElements}
678+
* instance and adds a simple cache for unknown element lookups.
679+
* <p>
680+
* This class is optimized for repeated element name lookups, especially for names
681+
* that are not present in the known elements set. It avoids repeated lowercasing
682+
* and unnecessary lookups for unknown elements by caching misses.
683+
* </p>
684+
*
685+
* <p>
686+
* This implementation is not thread-safe.
687+
* </p>
688+
*
689+
*/
713690
public static class HTMLElementsWithCache implements HTMLElementsProvider {
714691

715692
private final HTMLElements htmlElements_;
@@ -719,16 +696,22 @@ public static class HTMLElementsWithCache implements HTMLElementsProvider {
719696

720697
public HTMLElementsWithCache(final HTMLElements htmlElements) {
721698
htmlElements_ = htmlElements;
722-
unknownElements_ = new FastHashMap<>(11, 0.70f);
699+
unknownElements_ = new FastHashMap<>(11, 0.50f);
723700
}
724701

702+
/**
703+
* {@inheritDoc}
704+
*/
725705
@Override
726706
public Element getElement(short code) {
727707
return htmlElements_.getElement(code);
728708
}
729709

710+
/**
711+
* {@inheritDoc}
712+
*/
730713
@Override
731-
public Element getElement(String ename) {
714+
public Element getElement(final String ename) {
732715
Element element = getElement(ename, htmlElements_.NO_SUCH_ELEMENT);
733716
if (element == htmlElements_.NO_SUCH_ELEMENT) {
734717
element = new Element(UNKNOWN,
@@ -742,39 +725,12 @@ public Element getElement(String ename) {
742725
return element;
743726
}
744727

728+
/**
729+
* {@inheritDoc}
730+
*/
745731
@Override
746-
public Element getElement(String ename, Element elementIfNotFound) {
747-
int length = ename.length();
748-
if (length > htmlElements_.elementsByNamePerLength_.length) {
749-
if (unknownElements_.get(ename) != null) {
750-
// we added it to the cache, so we know it has been
751-
// queried once unsuccessfully before
752-
return elementIfNotFound;
753-
}
754-
755-
// remember that we had a miss
756-
unknownElements_.put(ename, Boolean.TRUE);
757-
758-
return elementIfNotFound;
759-
}
760-
761-
FastHashMap<String, Element> entry = htmlElements_.elementsByNamePerLength_[length - 1];
762-
if (entry == null) {
763-
// check first if we know that we don't know and avoid the
764-
// lowercasing later
765-
if (unknownElements_.get(ename) != null) {
766-
// we added it to the cache, so we know it has been
767-
// queried once unsuccessfully before
768-
return elementIfNotFound;
769-
}
770-
771-
// remember that we had a miss
772-
unknownElements_.put(ename, Boolean.TRUE);
773-
774-
return elementIfNotFound;
775-
}
776-
777-
Element r = entry.get(ename);
732+
public Element getElement(final String ename, final Element elementIfNotFound) {
733+
Element r = htmlElements_.elementsByName_.get(ename);
778734
if (r == null) {
779735
// check first if we know that we don't know and avoid the
780736
// lowercasing later
@@ -789,7 +745,7 @@ public Element getElement(String ename, Element elementIfNotFound) {
789745
// good HTML is mostly all lowercase in the first place so this is the
790746
// fallback for atypical HTML
791747
// we also have not seen that element missing yet
792-
r = entry.get(ename.toLowerCase(Locale.ROOT));
748+
r = htmlElements_.elementsByName_.get(ename.toLowerCase(Locale.ROOT));
793749
if (r == null) {
794750
// remember that we had a miss
795751
unknownElements_.put(ename, Boolean.TRUE);
@@ -801,62 +757,13 @@ public Element getElement(String ename, Element elementIfNotFound) {
801757
}
802758

803759
@Override
804-
public Element getElementLC(String enameLC, Element elementIfNotFound) {
805-
int length = enameLC.length();
806-
if (length > htmlElements_.elementsByNamePerLength_.length) {
807-
if (unknownElements_.get(enameLC) != null) {
808-
// we added it to the cache, so we know it has been
809-
// queried once unsuccessfully before
810-
return elementIfNotFound;
811-
}
812-
813-
// remember that we had a miss
814-
unknownElements_.put(enameLC, Boolean.TRUE);
815-
816-
return elementIfNotFound;
817-
}
818-
819-
FastHashMap<String, Element> entry = htmlElements_.elementsByNamePerLength_[length - 1];
820-
if (entry == null) {
821-
// check first if we know that we don't know and avoid the
822-
// lowercasing later
823-
if (unknownElements_.get(enameLC) != null) {
824-
// we added it to the cache, so we know it has been
825-
// queried once unsuccessfully before
826-
return elementIfNotFound;
827-
}
828-
829-
// remember that we had a miss
830-
unknownElements_.put(enameLC, Boolean.TRUE);
831-
832-
return elementIfNotFound;
833-
}
834-
835-
Element r = entry.get(enameLC);
836-
if (r == null) {
837-
// check first if we know that we don't know and avoid the
838-
// lowercasing later
839-
if (unknownElements_.get(enameLC) != null) {
840-
// we added it to the cache, so we know it has been
841-
// queried once unsuccessfully before
842-
return elementIfNotFound;
843-
}
844-
845-
// remember that we had a miss
846-
unknownElements_.put(enameLC, Boolean.TRUE);
847-
848-
return elementIfNotFound;
849-
}
850-
851-
return r;
760+
public Element getElementLC(final String enameLC, final Element elementIfNotFound) {
761+
return htmlElements_.getElementLC(enameLC, elementIfNotFound);
852762
}
853-
854763
}
855764

856765
/**
857766
* Element information.
858-
*
859-
* @author Andy Clark
860767
*/
861768
public static class Element {
862769

0 commit comments

Comments
 (0)