Fix encoding issues and improve parsing performance

MyIntervals · Aug 20, 2016 · 888af13 · 888af13
1 parent 50a802f
commit 888af13
Show file tree

Hide file tree

Showing 8 changed files with 248 additions and 68 deletions.
diff --git a/lib/Sabberworm/CSS/Parser.php b/lib/Sabberworm/CSS/Parser.php
@@ -32,11 +32,11 @@ class Parser {
 	private $aText;
 	private $iCurrentPosition;
 	private $oParserSettings;
-	private $sCharset;
 	private $iLength;
 	private $blockRules;
 	private $aSizeUnits;
 	private $iLineNo;
+	private $sTextLibrary;
 
 	/**
 	 * Parser constructor.
@@ -64,20 +64,75 @@ public function __construct($sText, Settings $oParserSettings = null, $iLineNo =
 			$this->aSizeUnits[$iSize][strtolower($val)] = $val;
 		}
 		ksort($this->aSizeUnits, SORT_NUMERIC);
+		$this->fixCharset();
 	}
 
-	public function setCharset($sCharset) {
-		$this->sCharset = $sCharset;
-		$this->aText = $this->strsplit($this->sText);
-		$this->iLength = count($this->aText);
+	private function fixCharset() {
+		// We need to know the charset before the parsing starts.
+		$sCharset = strtolower($this->oParserSettings->sDefaultCharset);
+		if (strpos($this->sText, "\xef\xbb\xbf") === 0) {
+			// Remove BOM before any processing.
+			$this->sText = substr($this->sText, strlen("\xef\xbb\xbf"));
+			$sCharset = 'utf-8';
+		} else if (preg_match('/(.*)@charset\s+["\']([a-z0-9-]+)["\']\s*;/ims', $this->sText, $matches)) {
+			// This is a simplified guessing, the charset atRule location is validated later,
+			// hopefully this is not used much these days.
+			if (trim($matches[1]) === '' and preg_match('/^@charset\s+["\']([a-z0-9-]+)["\']\s*;/im', $matches[0])) {
+				$sCharset = strtolower($matches[2]);
+			}
+		}
+
+		// Convert all text to utf-8 so that code does not have to deal with encoding conversions and incompatible characters.
+		if ($sCharset !== 'utf-8') {
+			if (function_exists('mb_convert_encoding')) {
+				$this->sText = mb_convert_encoding($this->sText, 'utf-8', $sCharset);
+			} else {
+				$this->sText = iconv($sCharset, 'utf-8', $this->sText);
+			}
+		}
+
+		// Multibyte support can make the parsing 10x slower,
+		// but even if it is disabled the unicode characters usually survive this parsing unharmed.
+		$this->sTextLibrary = 'ascii';
+		if (!$this->oParserSettings->bMultibyteSupport) {
+			$this->iLength = $this->strlen($this->sText);
+			return;
+		}
+
+		// If there are only ASCII characters in the CSS then we can safely use good old PHP string functions here.
+		if (function_exists('mb_convert_encoding')) {
+			$sSubst = mb_substitute_character();
+			mb_substitute_character('none');
+			$asciiText = mb_convert_encoding($this->sText, 'ASCII', 'utf-8');
+			mb_substitute_character($sSubst);
+		} else {
+			$asciiText = @iconv('utf-8', 'ASCII//IGNORE', $this->sText);
+		}
+		if ($this->sText !== $asciiText) {
+			if (function_exists('mb_convert_encoding')) {
+				// Usually mbstring extension is much faster than iconv.
+				$this->sTextLibrary = 'mb';
+			} else {
+				$this->sTextLibrary = 'iconv';
+			}
+		}
+		unset($asciiText);
+		$this->iLength = $this->strlen($this->sText);
+
+		// Substring operations are slower with unicode, aText array is used for faster emulation.
+		if ($this->sTextLibrary !== 'ascii') {
+			$this->aText = preg_split('//u', $this->sText, null, PREG_SPLIT_NO_EMPTY);
+			if (!is_array($this->aText) || count($this->aText) !== $this->iLength) {
+				$this->aText = null;
+			}
+		}
 	}
 
 	public function getCharset() {
-		return $this->sCharset;
+		return 'utf-8';
 	}
 
 	public function parse() {
-		$this->setCharset($this->oParserSettings->sDefaultCharset);
 		$oResult = new Document($this->iLineNo);
 		$this->parseDocument($oResult);
 		return $oResult;
@@ -113,7 +168,7 @@ private function parseList(CSSList $oList, $bIsRoot = false) {
 			throw new SourceException("Unexpected end of document", $this->iLineNo);
 		}
 	}
-	
+
 	private function parseListItem(CSSList $oList, $bIsRoot = false) {
 		if ($this->comes('@')) {
 			$oAtRule = $this->parseAtRule();
@@ -124,7 +179,7 @@ private function parseListItem(CSSList $oList, $bIsRoot = false) {
 				if(count($oList->getContents()) > 0) {
 					throw new UnexpectedTokenException('@charset must be the first parseable token in a document', '', 'custom', $this->iLineNo);
 				}
-				$this->setCharset($oAtRule->getCharset()->getString());
+				// We have already guessed the charset in the constructor, it cannot be changed now.
 			}
 			return $oAtRule;
 		} else if ($this->comes('}')) {
@@ -157,7 +212,8 @@ private function parseAtRule() {
 			$sCharset = $this->parseStringValue();
 			$this->consumeWhiteSpace();
 			$this->consume(';');
-			return new Charset($sCharset, $iIdentifierLineNum);
+			// Replace the original charset with utf-8 because we have changed the encoding in the constructor.
+			return new Charset(new CSSString('utf-8', $this->iLineNo), $iIdentifierLineNum);
 		} else if ($this->identifierIs($sIdentifier, 'keyframes')) {
 			$oResult = new KeyFrame($iIdentifierLineNum);
 			$oResult->setVendorKeyFrame($sIdentifier);
@@ -282,7 +338,12 @@ private function parseCharacter($bIsForIdentifier) {
 				$sUtf32 .= chr($iUnicode & 0xff);
 				$iUnicode = $iUnicode >> 8;
 			}
-			return iconv('utf-32le', $this->sCharset, $sUtf32);
+			$sChar = iconv('utf-32le', 'utf-8', $sUtf32);
+			if ($sChar === chr(0)) {
+				// PHP does not like null characters in strings for security reasons, just ignore them.
+				return '';
+			}
+			return $sChar;
 		}
 		if ($bIsForIdentifier) {
 			$peek = ord($this->peek());
@@ -529,7 +590,7 @@ private function identifierIs($sIdentifier, $sMatch) {
 	}
 
 	private function comes($sString, $bCaseInsensitive = false) {
-		$sPeek = $this->peek(strlen($sString));
+		$sPeek = $this->peek($this->strlen($sString));
 		return ($sPeek == '')
 			? false
 			: $this->streql($sPeek, $sString, $bCaseInsensitive);
@@ -652,13 +713,24 @@ private function consumeUntil($aEnd, $bIncludeEnd = false, $consumeEnd = false,
 	}
 
 	private function inputLeft() {
-		return $this->substr($this->iCurrentPosition, -1);
+		return $this->substr($this->iCurrentPosition, $this->iLength - $this->iCurrentPosition -1);
 	}
 
 	private function substr($iStart, $iLength) {
-		if ($iLength < 0) {
-			$iLength = $this->iLength - $iStart + $iLength;
+		if ($iLength <= 0 || $iStart >= $this->iLength) {
+			return '';
+		}
+		if ($this->sTextLibrary === 'ascii') {
+			return substr($this->sText, $iStart, $iLength);
+		}
+		if ($iLength > 100 || $iStart < 0 || !isset($this->aText)) {
+			if ($this->sTextLibrary === 'mb') {
+				return mb_substr($this->sText, $iStart, $iLength, 'utf-8');
+			} else {
+				return iconv_substr($this->sText, $iStart, $iLength, 'utf-8');
+			}
 		}
+		// Use faster substr emulation for short unicode lengths.
 		if ($iStart + $iLength > $this->iLength) {
 			$iLength = $this->iLength - $iStart;
 		}
@@ -672,8 +744,10 @@ private function substr($iStart, $iLength) {
 	}
 
 	private function strlen($sString) {
-		if ($this->oParserSettings->bMultibyteSupport) {
-			return mb_strlen($sString, $this->sCharset);
+		if ($this->sTextLibrary === 'mb') {
+			return mb_strlen($sString, 'utf-8');
+		} else if ($this->sTextLibrary === 'iconv') {
+			return iconv_strlen($sString, 'utf-8');
 		} else {
 			return strlen($sString);
 		}
@@ -688,40 +762,12 @@ private function streql($sString1, $sString2, $bCaseInsensitive = true) {
 	}
 
 	private function strtolower($sString) {
-		if ($this->oParserSettings->bMultibyteSupport) {
-			return mb_strtolower($sString, $this->sCharset);
+		if ($this->sTextLibrary === 'mb') {
+			return mb_strtolower($sString, 'utf-8');
 		} else {
+			// Iconv cannot lowercase strings, bad luck.
 			return strtolower($sString);
 		}
 	}
 
-	private function strsplit($sString) {
-		if ($this->oParserSettings->bMultibyteSupport) {
-			if ($this->streql($this->sCharset, 'utf-8')) {
-				return preg_split('//u', $sString, null, PREG_SPLIT_NO_EMPTY);
-			} else {
-				$iLength = mb_strlen($sString, $this->sCharset);
-				$aResult = array();
-				for ($i = 0; $i < $iLength; ++$i) {
-					$aResult[] = mb_substr($sString, $i, 1, $this->sCharset);
-				}
-				return $aResult;
-			}
-		} else {
-			if($sString === '') {
-				return array();
-			} else {
-				return str_split($sString);
-			}
-		}
-	}
-
-	private function strpos($sString, $sNeedle, $iOffset) {
-		if ($this->oParserSettings->bMultibyteSupport) {
-			return mb_strpos($sString, $sNeedle, $iOffset, $this->sCharset);
-		} else {
-			return strpos($sString, $sNeedle, $iOffset);
-		}
-	}
-
 }
diff --git a/lib/Sabberworm/CSS/RuleSet/DeclarationBlock.php b/lib/Sabberworm/CSS/RuleSet/DeclarationBlock.php
@@ -226,7 +226,11 @@ public function expandFontShorthand() {
 		}
 		foreach ($aValues as $mValue) {
 			if (!$mValue instanceof Value) {
-				$mValue = mb_strtolower($mValue);
+				if (function_exists('mb_strtolower')) {
+					$mValue = mb_strtolower($mValue, 'utf-8');
+				} else {
+					$mValue = strtolower($mValue);
+				}
 			}
 			if (in_array($mValue, array('normal', 'inherit'))) {
 				foreach (array('font-style', 'font-weight', 'font-variant') as $sProperty) {
@@ -300,7 +304,11 @@ public function expandBackgroundShorthand() {
 		$iNumBgPos = 0;
 		foreach ($aValues as $mValue) {
 			if (!$mValue instanceof Value) {
-				$mValue = mb_strtolower($mValue);
+				if (function_exists('mb_strtolower')) {
+					$mValue = mb_strtolower($mValue, 'utf-8');
+				} else {
+					$mValue = strtolower($mValue);
+				}
 			}
 			if ($mValue instanceof URL) {
 				$aBgProperties['background-image'] = $mValue;
@@ -369,7 +377,11 @@ public function expandListStyleShorthand() {
 		}
 		foreach ($aValues as $mValue) {
 			if (!$mValue instanceof Value) {
-				$mValue = mb_strtolower($mValue);
+				if (function_exists('mb_strtolower')) {
+					$mValue = mb_strtolower($mValue, 'utf-8');
+				} else {
+					$mValue = strtolower($mValue);
+				}
 			}
 			if ($mValue instanceof Url) {
 				$aListProperties['list-style-image'] = $mValue;

diff --git a/lib/Sabberworm/CSS/Value/CSSString.php b/lib/Sabberworm/CSS/Value/CSSString.php
@@ -24,9 +24,46 @@ public function __toString() {
 	}
 
 	public function render(\Sabberworm\CSS\OutputFormat $oOutputFormat) {
-		$sString = addslashes($this->sString);
-		$sString = str_replace("\n", '\A', $sString);
-		return $oOutputFormat->getStringQuotingType() . $sString . $oOutputFormat->getStringQuotingType();
+		$sQuote = $oOutputFormat->getStringQuotingType();
+		$aString = preg_split('//u', $this->sString, null, PREG_SPLIT_NO_EMPTY);
+		foreach ($aString as $i => $sChar) {
+			if (strlen($sChar) === 1) {
+				if ($sChar === "\n") {
+					$aString[$i] = '\a';
+					continue;
+				}
+				if ($sChar === "'") {
+					$aString[$i] = '\27';
+					continue;
+				}
+				if ($sChar === '"') {
+					$aString[$i] = '\22';
+					continue;
+				}
+				if ($sChar === '\\') {
+					$aString[$i] = '\5c';
+					continue;
+				}
+				$iOrd = ord($sChar);
+				if ($iOrd === 0) {
+					$aString[$i] = '';
+					continue;
+				}
+				if ($iOrd > 31 && $iOrd < 127) {
+					continue;
+				}
+			}
+
+			$sHex = '';
+			$sUtf32 = iconv('utf-8', 'utf-32le', $sChar);
+			$aBytes = str_split($sUtf32);
+			foreach (array_reverse($aBytes) as $sByte) {
+				$sHex .= str_pad(dechex(ord($sByte)), 2, '0', STR_PAD_LEFT);
+			}
+			$aString[$i] = '\\' . ltrim($sHex, '0');
+		}
+
+		return $sQuote . implode($aString) . $sQuote;
 	}
 
 }