Skip to content

Commit

Permalink
Fix encoding issues and improve parsing performance
Browse files Browse the repository at this point in the history
  • Loading branch information
Petr Skoda committed Aug 20, 2016
1 parent 50a802f commit 888af13
Show file tree
Hide file tree
Showing 8 changed files with 248 additions and 68 deletions.
142 changes: 94 additions & 48 deletions lib/Sabberworm/CSS/Parser.php
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,11 @@ class Parser {
private $aText;
private $iCurrentPosition;
private $oParserSettings;
private $sCharset;
private $iLength;
private $blockRules;
private $aSizeUnits;
private $iLineNo;
private $sTextLibrary;

/**
* Parser constructor.
Expand Down Expand Up @@ -64,20 +64,75 @@ public function __construct($sText, Settings $oParserSettings = null, $iLineNo =
$this->aSizeUnits[$iSize][strtolower($val)] = $val;
}
ksort($this->aSizeUnits, SORT_NUMERIC);
$this->fixCharset();
}

public function setCharset($sCharset) {
$this->sCharset = $sCharset;
$this->aText = $this->strsplit($this->sText);
$this->iLength = count($this->aText);
/**
 * Normalises the raw CSS text to UTF-8 and selects the string backend used while parsing.
 *
 * Steps performed, in order:
 *  1. Detect the source charset: a UTF-8 BOM wins (and is stripped); otherwise a leading
 *     @charset rule; otherwise the default charset from the parser settings.
 *  2. Convert the text to UTF-8 when the detected charset is anything else.
 *  3. Choose the text library: 'ascii' (plain PHP byte functions), 'mb' or 'iconv'.
 *  4. Cache the text length and, for non-ASCII text, a per-character array that is used
 *     for fast emulation of short substring operations.
 */
private function fixCharset() {
	$sBom = "\xef\xbb\xbf";

	// We need to know the charset before the parsing starts.
	$sCharset = strtolower($this->oParserSettings->sDefaultCharset);
	if (strpos($this->sText, $sBom) === 0) {
		// Remove BOM before any processing. The cast covers old PHP versions where
		// substr() yields false when nothing is left after the BOM.
		$this->sText = (string) substr($this->sText, strlen($sBom));
		$sCharset = 'utf-8';
	} else if (preg_match('/\A[ \t\n\r\0\x0B]*^@charset\s+["\']([a-z0-9-]+)["\']\s*;/im', $this->sText, $aMatches)) {
		// This is a simplified guessing, the charset atRule location is validated later.
		// The rule has to start a line and may only be preceded by the characters trim()
		// strips. Anchoring at the start keeps the match cheap on large files and makes
		// sure the FIRST @charset is the one that counts.
		$sCharset = strtolower($aMatches[1]);
	}

	// Convert all text to utf-8 so that code does not have to deal with encoding conversions and incompatible characters.
	if ($sCharset !== 'utf-8') {
		$mConverted = false;
		try {
			if (function_exists('mb_convert_encoding')) {
				$mConverted = mb_convert_encoding($this->sText, 'utf-8', $sCharset);
			} else {
				$mConverted = iconv($sCharset, 'utf-8', $this->sText);
			}
		} catch (\ValueError $oError) {
			// PHP 8 mbstring rejects unknown encoding names with a ValueError;
			// fall through and keep the unconverted text.
		}
		// Best effort: a failed conversion must not replace the text with false.
		if (is_string($mConverted)) {
			$this->sText = $mConverted;
		}
	}

	// Multibyte support can make the parsing 10x slower,
	// but even if it is disabled the unicode characters usually survive this parsing unharmed.
	// If there are only ASCII characters in the CSS then we can safely use good old PHP string functions.
	$this->sTextLibrary = 'ascii';
	if ($this->oParserSettings->bMultibyteSupport && preg_match('/[\x80-\xff]/', $this->sText) === 1) {
		// Usually mbstring extension is much faster than iconv.
		$this->sTextLibrary = function_exists('mb_convert_encoding') ? 'mb' : 'iconv';
	}

	// strlen() dispatches on sTextLibrary, so it has to be called after the library is chosen.
	$this->iLength = $this->strlen($this->sText);

	// Substring operations are slower with unicode, aText array is used for faster emulation.
	if ($this->sTextLibrary !== 'ascii') {
		// -1 means "no limit"; passing null is deprecated as of PHP 8.1.
		$aChars = preg_split('//u', $this->sText, -1, PREG_SPLIT_NO_EMPTY);
		// Only trust the array when it agrees with the measured length (the split fails on invalid UTF-8).
		$this->aText = (is_array($aChars) && count($aChars) === $this->iLength) ? $aChars : null;
	}
}

/**
 * Returns the charset of the text the parser works on.
 *
 * The input is converted to UTF-8 before parsing starts, so the result is constant.
 *
 * @return string always 'utf-8'
 */
public function getCharset() {
	return 'utf-8';
}

public function parse() {
$this->setCharset($this->oParserSettings->sDefaultCharset);
$oResult = new Document($this->iLineNo);
$this->parseDocument($oResult);
return $oResult;
Expand Down Expand Up @@ -113,7 +168,7 @@ private function parseList(CSSList $oList, $bIsRoot = false) {
throw new SourceException("Unexpected end of document", $this->iLineNo);
}
}

private function parseListItem(CSSList $oList, $bIsRoot = false) {
if ($this->comes('@')) {
$oAtRule = $this->parseAtRule();
Expand All @@ -124,7 +179,7 @@ private function parseListItem(CSSList $oList, $bIsRoot = false) {
if(count($oList->getContents()) > 0) {
throw new UnexpectedTokenException('@charset must be the first parseable token in a document', '', 'custom', $this->iLineNo);
}
$this->setCharset($oAtRule->getCharset()->getString());
// We have already guessed the charset in the constructor, it cannot be changed now.
}
return $oAtRule;
} else if ($this->comes('}')) {
Expand Down Expand Up @@ -157,7 +212,8 @@ private function parseAtRule() {
$sCharset = $this->parseStringValue();
$this->consumeWhiteSpace();
$this->consume(';');
return new Charset($sCharset, $iIdentifierLineNum);
// Replace the original charset with utf-8 because we have changed the encoding in the constructor.
return new Charset(new CSSString('utf-8', $this->iLineNo), $iIdentifierLineNum);
} else if ($this->identifierIs($sIdentifier, 'keyframes')) {
$oResult = new KeyFrame($iIdentifierLineNum);
$oResult->setVendorKeyFrame($sIdentifier);
Expand Down Expand Up @@ -282,7 +338,12 @@ private function parseCharacter($bIsForIdentifier) {
$sUtf32 .= chr($iUnicode & 0xff);
$iUnicode = $iUnicode >> 8;
}
return iconv('utf-32le', $this->sCharset, $sUtf32);
$sChar = iconv('utf-32le', 'utf-8', $sUtf32);
if ($sChar === chr(0)) {
// PHP does not like null characters in strings for security reasons, just ignore them.
return '';
}
return $sChar;
}
if ($bIsForIdentifier) {
$peek = ord($this->peek());
Expand Down Expand Up @@ -529,7 +590,7 @@ private function identifierIs($sIdentifier, $sMatch) {
}

private function comes($sString, $bCaseInsensitive = false) {
$sPeek = $this->peek(strlen($sString));
$sPeek = $this->peek($this->strlen($sString));
return ($sPeek == '')
? false
: $this->streql($sPeek, $sString, $bCaseInsensitive);
Expand Down Expand Up @@ -652,13 +713,24 @@ private function consumeUntil($aEnd, $bIncludeEnd = false, $consumeEnd = false,
}

/**
 * Returns the not yet consumed input, starting at the current position.
 *
 * The length is passed explicitly because the substr() helper returns an empty
 * string for non-positive lengths.
 *
 * NOTE(review): the trailing "- 1" leaves out the very last character of the text,
 * which is what substr($iCurrentPosition, -1) would yield as well — confirm this is intended.
 *
 * @return string
 */
private function inputLeft() {
	$iRemaining = $this->iLength - $this->iCurrentPosition - 1;
	return $this->substr($this->iCurrentPosition, $iRemaining);
}

private function substr($iStart, $iLength) {
if ($iLength < 0) {
$iLength = $this->iLength - $iStart + $iLength;
if ($iLength <= 0 || $iStart >= $this->iLength) {
return '';
}
if ($this->sTextLibrary === 'ascii') {
return substr($this->sText, $iStart, $iLength);
}
if ($iLength > 100 || $iStart < 0 || !isset($this->aText)) {
if ($this->sTextLibrary === 'mb') {
return mb_substr($this->sText, $iStart, $iLength, 'utf-8');
} else {
return iconv_substr($this->sText, $iStart, $iLength, 'utf-8');
}
}
// Use faster substr emulation for short unicode lengths.
if ($iStart + $iLength > $this->iLength) {
$iLength = $this->iLength - $iStart;
}
Expand All @@ -672,8 +744,10 @@ private function substr($iStart, $iLength) {
}

private function strlen($sString) {
if ($this->oParserSettings->bMultibyteSupport) {
return mb_strlen($sString, $this->sCharset);
if ($this->sTextLibrary === 'mb') {
return mb_strlen($sString, 'utf-8');
} else if ($this->sTextLibrary === 'iconv') {
return iconv_strlen($sString, 'utf-8');
} else {
return strlen($sString);
}
Expand All @@ -688,40 +762,12 @@ private function streql($sString1, $sString2, $bCaseInsensitive = true) {
}

/**
 * Lowercases a string using the text library selected for the parsed text.
 *
 * @param string $sString
 * @return string
 */
private function strtolower($sString) {
	if ($this->sTextLibrary === 'mb') {
		return mb_strtolower($sString, 'utf-8');
	}
	// Iconv cannot lowercase strings, bad luck: the 'iconv' and 'ascii' libraries
	// both end up in the byte-wise strtolower().
	return strtolower($sString);
}

/**
 * Splits a string into an array of single characters.
 *
 * Without multibyte support the string is split into bytes. With multibyte support,
 * UTF-8 text is split with a unicode-aware regular expression and any other charset
 * is walked character by character through mb_substr().
 *
 * @param string $sString
 * @return array|false list of characters; preg_split() may return false on failure
 */
private function strsplit($sString) {
	if (!$this->oParserSettings->bMultibyteSupport) {
		// The empty string is special-cased so that the result is always an empty array for it.
		return ($sString === '') ? array() : str_split($sString);
	}

	if ($this->streql($this->sCharset, 'utf-8')) {
		// -1 means "no limit"; passing null is deprecated as of PHP 8.1.
		return preg_split('//u', $sString, -1, PREG_SPLIT_NO_EMPTY);
	}

	$aResult = array();
	$iLength = mb_strlen($sString, $this->sCharset);
	for ($i = 0; $i < $iLength; ++$i) {
		$aResult[] = mb_substr($sString, $i, 1, $this->sCharset);
	}
	return $aResult;
}

/**
 * Finds the first occurrence of a needle in a string, starting at the given offset.
 *
 * With multibyte support enabled, positions are counted by mb_strpos() in the
 * current charset; otherwise the byte-wise strpos() is used.
 *
 * @param string $sString haystack
 * @param string $sNeedle
 * @param int $iOffset
 * @return int|false position of the needle, or false when it is not found
 */
private function strpos($sString, $sNeedle, $iOffset) {
	if (!$this->oParserSettings->bMultibyteSupport) {
		return strpos($sString, $sNeedle, $iOffset);
	}
	return mb_strpos($sString, $sNeedle, $iOffset, $this->sCharset);
}

}
18 changes: 15 additions & 3 deletions lib/Sabberworm/CSS/RuleSet/DeclarationBlock.php
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,11 @@ public function expandFontShorthand() {
}
foreach ($aValues as $mValue) {
if (!$mValue instanceof Value) {
$mValue = mb_strtolower($mValue);
if (function_exists('mb_strtolower')) {
$mValue = mb_strtolower($mValue, 'utf-8');
} else {
$mValue = strtolower($mValue);
}
}
if (in_array($mValue, array('normal', 'inherit'))) {
foreach (array('font-style', 'font-weight', 'font-variant') as $sProperty) {
Expand Down Expand Up @@ -300,7 +304,11 @@ public function expandBackgroundShorthand() {
$iNumBgPos = 0;
foreach ($aValues as $mValue) {
if (!$mValue instanceof Value) {
$mValue = mb_strtolower($mValue);
if (function_exists('mb_strtolower')) {
$mValue = mb_strtolower($mValue, 'utf-8');
} else {
$mValue = strtolower($mValue);
}
}
if ($mValue instanceof URL) {
$aBgProperties['background-image'] = $mValue;
Expand Down Expand Up @@ -369,7 +377,11 @@ public function expandListStyleShorthand() {
}
foreach ($aValues as $mValue) {
if (!$mValue instanceof Value) {
$mValue = mb_strtolower($mValue);
if (function_exists('mb_strtolower')) {
$mValue = mb_strtolower($mValue, 'utf-8');
} else {
$mValue = strtolower($mValue);
}
}
if ($mValue instanceof Url) {
$aListProperties['list-style-image'] = $mValue;
Expand Down
43 changes: 40 additions & 3 deletions lib/Sabberworm/CSS/Value/CSSString.php
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,46 @@ public function __toString() {
}

/**
 * Renders the string as a quoted CSS string literal.
 *
 * Printable ASCII characters are emitted as they are, except for both quote
 * characters and the backslash. Those three, control characters and every
 * non-ASCII character are written as CSS hex escapes (a backslash followed by the
 * code point in hex). NUL characters are dropped.
 *
 * A hex escape swallows a directly following whitespace character and would merge
 * with following hex digits, so a single separating space is added whenever the next
 * literal character is a space or a hex digit.
 *
 * @param \Sabberworm\CSS\OutputFormat $oOutputFormat supplies the quote character
 * @return string
 */
public function render(\Sabberworm\CSS\OutputFormat $oOutputFormat) {
	$sQuote = $oOutputFormat->getStringQuotingType();
	$sString = (string) $this->sString;

	// -1 means "no limit"; passing null is deprecated as of PHP 8.1.
	$aChars = preg_split('//u', $sString, -1, PREG_SPLIT_NO_EMPTY);
	if (!is_array($aChars)) {
		// The string is not valid UTF-8. Best effort: work on single bytes,
		// bytes above 127 are then escaped by their byte value.
		$aChars = str_split($sString);
	}

	$sResult = '';
	$bAfterEscape = false;
	foreach ($aChars as $sChar) {
		if (strlen($sChar) === 1) {
			$iCodePoint = ord($sChar);
			if ($iCodePoint === 0) {
				// Null characters are not allowed in CSS, just ignore them.
				continue;
			}
			$bLiteral = $iCodePoint > 31 && $iCodePoint < 127
				&& $sChar !== "'" && $sChar !== '"' && $sChar !== '\\';
			if ($bLiteral) {
				if ($bAfterEscape && ($sChar === ' ' || stripos('0123456789abcdef', $sChar) !== false)) {
					// Terminate the preceding hex escape so that it does not absorb this character.
					$sResult .= ' ';
				}
				$sResult .= $sChar;
				$bAfterEscape = false;
				continue;
			}
		} else {
			// Multi-byte characters only come from the unicode-aware split, so they are valid UTF-8.
			$aCodePoint = unpack('V', iconv('utf-8', 'utf-32le', $sChar));
			$iCodePoint = $aCodePoint[1];
		}

		$sResult .= '\\' . dechex($iCodePoint);
		$bAfterEscape = true;
	}

	return $sQuote . $sResult . $sQuote;
}

}
Loading

0 comments on commit 888af13

Please sign in to comment.