Extra tests for assembly name parser. (#64022)

* Dead code in native assembly name parsing
* disallow `\u` escaping in assembly names
* misc cleanup
* forward slash is illegal, escaped or not
* ignore "language" attribute in assembly name ("culture" must be used)
* duplicate attributes are ok if unrecognized (just add tests)
* drop support for "custom" blob attribute
* drop support for publickey[token]=neutral ("null" must be used)
* ignore unknown assembly name attributes in mono (compat)
* disallow \0 anywhere in the assembly name
* disallow \0 in assembly names on mono (compat)
* only check for embedded nulls when parsing
* fix mono build
* make GCC happy
* a couple of test scenarios for publickey vs. publickeytoken (CoreRT parser might trip on these)
* produce errors on duplicate known attributes in mono
dotnet · Jan 22, 2022 · 8d2268a · 8d2268a
1 parent feb25b0
commit 8d2268a
Show file tree

Hide file tree

Showing 11 changed files with 137 additions and 238 deletions.
diff --git a/src/coreclr/binder/inc/assemblyidentity.hpp b/src/coreclr/binder/inc/assemblyidentity.hpp
@@ -30,12 +30,9 @@ namespace BINDER_SPACE
             IDENTITY_FLAG_PUBLIC_KEY_TOKEN       = 0x004,
             IDENTITY_FLAG_PUBLIC_KEY             = 0x008,
             IDENTITY_FLAG_CULTURE                = 0x010,
-            IDENTITY_FLAG_LANGUAGE               = 0x020,
             IDENTITY_FLAG_PROCESSOR_ARCHITECTURE = 0x040,
             IDENTITY_FLAG_RETARGETABLE           = 0x080,
             IDENTITY_FLAG_PUBLIC_KEY_TOKEN_NULL  = 0x100,
-            IDENTITY_FLAG_CUSTOM                 = 0x200,
-            IDENTITY_FLAG_CUSTOM_NULL            = 0x400,
             IDENTITY_FLAG_CONTENT_TYPE           = 0x800,
             IDENTITY_FLAG_FULL_NAME              = (IDENTITY_FLAG_SIMPLE_NAME |
                                                     IDENTITY_FLAG_VERSION)
@@ -50,7 +47,6 @@ namespace BINDER_SPACE
             // Need to pre-populate SBuffers because of bogus asserts
             static const BYTE byteArr[] = { 0 };
             m_publicKeyOrTokenBLOB.SetImmutable(byteArr, sizeof(byteArr));
-            m_customBLOB.SetImmutable(byteArr, sizeof(byteArr));
         }
         ~AssemblyIdentity()
         {
@@ -83,7 +79,6 @@ namespace BINDER_SPACE
         SBuffer             m_publicKeyOrTokenBLOB;
         PEKIND              m_kProcessorArchitecture;
         AssemblyContentType m_kContentType;
-        SBuffer             m_customBLOB;
         DWORD               m_dwIdentityFlags;
     };
 

diff --git a/src/coreclr/binder/inc/stringlexer.hpp b/src/coreclr/binder/inc/stringlexer.hpp
@@ -55,32 +55,28 @@ namespace BINDER_SPACE
         inline StringLexer();
         inline ~StringLexer();
 
-        inline void Init(SString &inputString, BOOL fSupportEscaping);
+        inline void Init(SString &inputString);
 
         static inline BOOL IsWhitespace(WCHAR wcChar);
         static inline BOOL IsEOS(WCHAR wcChar);
         static inline BOOL IsQuoteCharacter(WCHAR wcChar);
 
-        virtual BOOL IsSeparatorChar(WCHAR wcChar) = NULL;
-        virtual LEXEME_TYPE GetLexemeType(WCHAR wcChar) = NULL;
+        BOOL IsSeparatorChar(WCHAR wcChar);
+        LEXEME_TYPE GetLexemeType(WCHAR wcChar);
 
     protected:
         static const WCHAR INVALID_CHARACTER = -1;
 
-        LEXEME_TYPE GetNextLexeme(SString &currentString, BOOL fPermitUnescapedQuotes = FALSE);
+        LEXEME_TYPE GetNextLexeme(SString &currentString);
 
         inline WCHAR PopCharacter(BOOL *pfIsEscaped);
         inline void PushCharacter(WCHAR wcCurrentChar,
                                   BOOL fIsEscaped);
 
         inline WCHAR GetRawCharacter();
-        inline void PushRawCharacter();
-        inline WCHAR DecodeUTF16Character();
         inline WCHAR GetNextCharacter(BOOL *pfIsEscaped);
 
-        inline WCHAR ParseUnicode();
-        LEXEME_TYPE ParseString(SString &currentString,
-                                BOOL     fPermitUnescapeQuotes);
+        LEXEME_TYPE ParseString(SString &currentString);
 
         void TrimTrailingWhiteSpaces(SString &currentString);
 
@@ -89,8 +85,6 @@ namespace BINDER_SPACE
 
         WCHAR m_wcCurrentChar;
         BOOL m_fCurrentCharIsEscaped;
-        BOOL m_fSupportEscaping;
-        BOOL m_fReadRawCharacter;
     };
 
 #include "stringlexer.inl"

diff --git a/src/coreclr/binder/inc/stringlexer.inl b/src/coreclr/binder/inc/stringlexer.inl
@@ -25,12 +25,10 @@ StringLexer::~StringLexer()
     // Nothing to do here
 }
 
-void StringLexer::Init(SString &inputString, BOOL fSupportEscaping)
+void StringLexer::Init(SString &inputString)
 {
     m_cursor = inputString.Begin();
     m_end = inputString.End();
-    m_fSupportEscaping = fSupportEscaping;
-    m_fReadRawCharacter = FALSE;
 }
 
 BOOL StringLexer::IsWhitespace(WCHAR wcChar)
@@ -55,6 +53,7 @@ WCHAR StringLexer::PopCharacter(BOOL *pfIsEscaped)
     {
         m_wcCurrentChar = INVALID_CHARACTER;
         *pfIsEscaped = m_fCurrentCharIsEscaped;
+        m_cursor++;
     }
     else
     {
@@ -71,172 +70,63 @@ void StringLexer::PushCharacter(WCHAR wcCurrentChar,
 
     m_wcCurrentChar = wcCurrentChar;
     m_fCurrentCharIsEscaped = fIsEscaped;
+    m_cursor--;
 }
 
 WCHAR StringLexer::GetRawCharacter()
 {
     WCHAR wcCurrentChar = 0;
 
-    if (m_cursor <= m_end)
+    if (m_cursor < m_end)
     {
         wcCurrentChar = m_cursor[0];
-        m_fReadRawCharacter = TRUE;
         m_cursor++;
-    }
-    else
-    {
-        m_fReadRawCharacter = FALSE;
-    }
-
-    return wcCurrentChar;
-}
-
-void StringLexer::PushRawCharacter()
-{
-    if (m_fReadRawCharacter)
-    {
-        m_cursor--;
-        m_fReadRawCharacter = FALSE;
-    }
-}
 
-WCHAR StringLexer::DecodeUTF16Character()
-{
-    // See http://www.ietf.org/rfc/rfc2781.txt for details on UTF-16 encoding.
-
-    WCHAR wcCurrentChar = 0;
-    SCOUNT_T nCharacters = m_end - m_cursor + 1;
-    WCHAR wcChar1 = GetRawCharacter();
-
-    if (wcChar1 < 0xd800)
-    {
-        wcCurrentChar = wcChar1;
+        // do not allow \0 anywhere in the string.
+        if (wcCurrentChar == 0)
+        {
+            wcCurrentChar = INVALID_CHARACTER;
+        }
     }
     else
     {
-        // StringLexer is not designed to handle UTF-16 characters beyond the Basic Multilingual Plane,
-        // since it stores all characters in 16-bit WCHARs.
-        // However, since the vast majority of the time, we (Microsoft) produce the manifests,
-        // this is likely a non-scenario, as the other Unicode planes would never be used in practice.
-
-        if (wcChar1 <= 0xdbff) // 0xd800 - 0xdbff indicates the first WCHAR of a surrogate pair
-        {
-            if (nCharacters >= 2)
-            {
-                GetRawCharacter(); // Skip the second WCHAR of the surrogate pair
-            }
-        }
-        // Otherwise, the character is either in the 0xdc00 - 0xdfff range, indicating the second WCHAR of a surrogate pair,
-        // or in the 0xE000 - 0xFFFF range, which has within it ranges of invalid characters, and which we conservatively treat
-        // as invalid.
-
-        wcCurrentChar = INVALID_CHARACTER;
+        // EOS
+        wcCurrentChar = 0;
     }
 
     return wcCurrentChar;
 }
 
-
 WCHAR StringLexer::GetNextCharacter(BOOL *pfIsEscaped)
 {
     *pfIsEscaped = FALSE;
 
-    WCHAR wcCurrentChar = GetRawCharacter(); // DecodeUTF16Character()
+    WCHAR wcCurrentChar = GetRawCharacter();
     if (wcCurrentChar == L'\\')
     {
-        WCHAR wcTempChar = GetRawCharacter(); // DecodeUTF16Character()
+        WCHAR wcTempChar = GetRawCharacter();
 
-        if (m_fSupportEscaping)
-        {
-            // Handle standard escapes
-            switch (wcTempChar)
-            {
-            case L'"':
-            case L'\'':
-            case L',':
-            case L'\\':
-            case L'/':
-            case L'=':
-                break;
-            case L't':
-                wcTempChar = 9;
-                break;
-            case L'n':
-                wcTempChar = 10;
-                break;
-            case L'r':
-                wcTempChar = 13;
-                break;
-            case L'u':
-                wcTempChar = ParseUnicode();
-                break;
-            default:
-                return INVALID_CHARACTER;
-            }
-
-            *pfIsEscaped = TRUE;
-            wcCurrentChar = wcTempChar;
-        }
-        else
-        {
-            // Do not handle escapes except for quotes
-            switch (wcTempChar)
-            {
-            case L'"':
-            case L'\'':
-                *pfIsEscaped = TRUE;
-                wcCurrentChar = wcTempChar;
-                break;
-            default:
-                PushRawCharacter();
-                break;
-            }
-        }
-    }
-
-    return wcCurrentChar;
-}
-
-WCHAR StringLexer::ParseUnicode()
-{
-    int nCharacters = 0;
-    WCHAR wcUnicodeChar = 0;
-
-    for(;;)
-    {
-        WCHAR wcCurrentChar = DecodeUTF16Character();
-        nCharacters++;
-
-        if (wcCurrentChar == L';')
+        // Handle standard escapes
+        switch (wcTempChar)
         {
+        case L'"':
+        case L'\'':
+        case L',':
+        case L'\\':
+        case L'=':
+        case L't':
+        case L'n':
+        case L'r':
             break;
-        }
-        else if ((wcCurrentChar == INVALID_CHARACTER) || (nCharacters >= 9))
-        {
+        default:
             return INVALID_CHARACTER;
         }
 
-        wcUnicodeChar <<= 4;
-
-        if ((wcCurrentChar >= L'0') && (wcCurrentChar <= L'9'))
-        {
-            wcUnicodeChar += (wcCurrentChar - L'0');
-        }
-        else if ((wcCurrentChar >= L'a') && (wcCurrentChar <= L'f'))
-        {
-            wcUnicodeChar += (wcCurrentChar - L'a') + 10;
-        }
-        else if ((wcCurrentChar >= L'A') && (wcCurrentChar <= L'F'))
-        {
-            wcUnicodeChar += (wcCurrentChar - L'A') + 10;
-        }
-        else
-        {
-            return INVALID_CHARACTER;
-        }
+        *pfIsEscaped = TRUE;
+        wcCurrentChar = wcTempChar;
     }
 
-    return wcUnicodeChar;
+    return wcCurrentChar;
 }
 
 #endif
diff --git a/src/coreclr/binder/inc/textualidentityparser.hpp b/src/coreclr/binder/inc/textualidentityparser.hpp
@@ -28,12 +28,9 @@ namespace BINDER_SPACE
         TextualIdentityParser(AssemblyIdentity *pAssemblyIdentity);
         ~TextualIdentityParser();
 
-        virtual BOOL IsSeparatorChar(WCHAR wcChar);
-        virtual StringLexer::LEXEME_TYPE GetLexemeType(WCHAR wcChar);
-
         static HRESULT Parse(/* in */  SString           &textualIdentity,
-                             /* out */ AssemblyIdentity *pAssemblyIdentity,
-                             /* in */  BOOL              fPermitUnescapedQuotes = FALSE);
+                             /* out */ AssemblyIdentity *pAssemblyIdentity);
+
         static HRESULT ToString(/* in */  AssemblyIdentity *pAssemblyIdentity,
                                 /* in */  DWORD             dwIdentityFlags,
                                 /* out */ SString          &textualIdentity);
@@ -45,15 +42,15 @@ namespace BINDER_SPACE
                               /* in */  BOOL     fValidateHex,
                               /* in */  BOOL     fIsToken,
                               /* out */ SBuffer &publicKeyOrTokenBLOB);
+
         static void BlobToHex(/* in */  SBuffer &publicKeyOrTokenBLOB,
                               /* out */ SString &publicKeyOrToken);
 
         BOOL ParseString(/* in */  SString &textualString,
                          /* out */ SString &contentString);
 
     protected:
-        BOOL Parse(/* in */  SString &textualIdentity,
-                   /* in */  BOOL     fPermitUnescapedQuotes = FALSE);
+        BOOL Parse(/* in */  SString &textualIdentity);
 
         BOOL PopulateAssemblyIdentity(/* in */ SString &attributeString,
                                       /* in */ SString &valueString);

diff --git a/src/coreclr/binder/stringlexer.cpp b/src/coreclr/binder/stringlexer.cpp
@@ -19,7 +19,7 @@
 namespace BINDER_SPACE
 {
     StringLexer::LEXEME_TYPE
-    StringLexer::GetNextLexeme(SString &currentString, BOOL fPermitUnescapedQuotes)
+    StringLexer::GetNextLexeme(SString &currentString)
     {
         BOOL fIsEscaped = FALSE;
         WCHAR wcCurrentChar = INVALID_CHARACTER;
@@ -43,11 +43,11 @@ namespace BINDER_SPACE
 
         // First character of string lexeme; push it back
         PushCharacter(wcCurrentChar, fIsEscaped);
-        return ParseString(currentString, fPermitUnescapedQuotes);
+        return ParseString(currentString);
     }
 
     StringLexer::LEXEME_TYPE
-    StringLexer::ParseString(SString &currentString, BOOL fPermitUnescapedQuotes)
+    StringLexer::ParseString(SString &currentString)
     {
         BOOL fIsFirstCharacter = TRUE;
         WCHAR wcCurrentChar = INVALID_CHARACTER;
@@ -99,7 +99,7 @@ namespace BINDER_SPACE
                 break;
             }
 
-            if (!fPermitUnescapedQuotes && !fIsEscaped && IsQuoteCharacter(wcCurrentChar) && !IsQuoteCharacter(wcOpeningQuote))
+            if (!fIsEscaped && IsQuoteCharacter(wcCurrentChar) && !IsQuoteCharacter(wcOpeningQuote))
             {
                 // Unescaped quotes in the middle of the string are an error
                 return LEXEME_TYPE_INVALID;
@@ -147,4 +147,24 @@ namespace BINDER_SPACE
             currentString.Truncate(cursor + 1);
         }
     }
+
+    BOOL StringLexer::IsSeparatorChar(WCHAR wcChar)
+    {
+        return ((wcChar == W(',')) || (wcChar == W('=')));
+    }
+
+    StringLexer::LEXEME_TYPE StringLexer::GetLexemeType(WCHAR wcChar)
+    {
+        switch (wcChar)
+        {
+        case W('='):
+            return LEXEME_TYPE_EQUALS;
+        case W(','):
+            return LEXEME_TYPE_COMMA;
+        case 0:
+            return LEXEME_TYPE_END_OF_STREAM;
+        default:
+            return LEXEME_TYPE_STRING;
+        }
+    }
 };