PixarAnimationStudios · pixar-oss · Jan 6, 2024 · Dec 7, 2023 · gitamohr · Dec 19, 2023
diff --git a/pxr/usd/sdf/path.cpp b/pxr/usd/sdf/path.cpp
@@ -34,6 +34,7 @@
 #include "pxr/base/tf/staticData.h"
 #include "pxr/base/tf/stl.h"
 #include "pxr/base/tf/stringUtils.h"
+#include "pxr/base/tf/unicodeUtils.h"
 #include "pxr/base/tf/type.h"
 
 #include "pxr/base/trace/trace.h"
@@ -117,6 +118,7 @@ class _DeferredDiagnostics
 
 } // anon
 
+static inline bool _IsValidIdentifier(const std::string_view& name);
 static inline bool _IsValidIdentifier(TfToken const &name);
 
 // XXX: Enable this define to make bad path strings
@@ -1787,48 +1789,90 @@ SdfPath::MakeRelativePath(const SdfPath & anchor) const
     return result;
 }
 
-static inline bool _IsValidIdentifier(TfToken const &name)
+static
+inline bool 
+_IsValidIdentifier(const std::string_view& name)
 {
-    return TfIsValidIdentifier(name.GetString());
+    // empty strings are not valid identifiers
+    if (name.empty())
+    {
+        return false;
+    }
+
+    TfUtf8CodePointView view {name};
+    bool first = true;
+    for (const uint32_t codePoint : view)
+    {
+        bool result = first ? 
+            ((codePoint == SDF_UNDERSCORE_CODE_POINT) ||
+                TfIsUtf8CodePointXidStart(codePoint))
+            : TfIsUtf8CodePointXidContinue(codePoint);
+
+        if (!result)
+        {
+            return false;
+        }
+
+        first = false;
+    }
+
+    return true;
 }
 
-bool
-SdfPath::IsValidIdentifier(const std::string &name)
+static
+inline bool 
+_IsValidIdentifier(TfToken const &name)
 {
-    return TfIsValidIdentifier(name);
+    return _IsValidIdentifier(name.GetString());
 }
 
-// We use our own _IsAlpha and _IsAlnum here for two reasons.  One, we want to
-// ensure that they follow C/Python identifier rules and are not subject to
-// various locale differences.  And two, since we are not consulting a locale,
-// it is faster.
-static constexpr bool _IsAlpha(int x) {
-    return ('a' <= (x|32)) && ((x|32) <= 'z');
-}
-static constexpr bool _IsAlnum(int x) {
-    return _IsAlpha(x) || (('0' <= x) && (x <= '9'));
+bool
+SdfPath::IsValidIdentifier(const std::string &name)
+{
+    return _IsValidIdentifier(name);
 }
 
 bool
 SdfPath::IsValidNamespacedIdentifier(const std::string &name)
 {
+    // empty strings are not valid identifiers
+    if (name.empty())
+    {
+        return false;
+    }
+
     // A valid C/Python identifier except we also allow the namespace delimiter
     // and if we tokenize on that delimiter then all tokens are valid C/Python
-    // identifiers.  That means following a delimiter there must be an '_' or
-    // alphabetic character.
-    constexpr char delim = SDF_PATH_NS_DELIMITER_CHAR;
-    for (char const *p = name.c_str(); *p; ++p) {
-        if (!_IsAlpha(*p) && *p != '_') {
+    // identifiers.
+    std::string_view remainder {name};
+    while (!remainder.empty()) {
+        const auto index = remainder.find(':');
+
+        // can't start with ':'
+        if (index == 0) {
             return false;
         }
-        for (++p; _IsAlnum(*p) ||*p == '_'; ++p) {
-            /* consume identifier */
+
+        // can't end with ':'
+        if (index == remainder.size() - 1) {
+            return false;
         }
-        if (*p != delim) {
-            return !*p;
+
+        // substring must be a valid identifier
+        if (!_IsValidIdentifier(remainder.substr(0, index))) {
+            return false;
         }
+
+        // if ':' wasn't found, we are done
+        if (index == std::string_view::npos) {
+            break;
+        }
+
+        // otherwise check the next substring
+        remainder = remainder.substr(index + 1);
     }
-    return false;
+
+    return true;
 }
 
 std::vector<std::string>
@@ -1840,48 +1884,61 @@ SdfPath::TokenizeIdentifier(const std::string &name)
     const char namespaceDelimiter =
         SdfPathTokens->namespaceDelimiter.GetText()[0];
 
-    std::string::const_iterator first = name.begin();
-    std::string::const_iterator last = name.end();
-
-    // Not empty and first character is alpha or '_'.
-    if (first == last || !(isalpha(*first) || (*first == '_')))
+    // Empty or last character is namespace delimiter
+    if (name.empty() || name.back() == namespaceDelimiter)
+    {
         return result;
-    // Last character is not the namespace delimiter.
-    if (*(last - 1) == namespaceDelimiter)
+    }
+
+    TfUtf8CodePointView view {name};
+    TfUtf8CodePointIterator iterator = view.begin();
+    TfUtf8CodePointIterator anchor = iterator;
+
+    // Check first character is in XidStart or '_'
+    if(!TfIsUtf8CodePointXidStart(*iterator) &&
+        *iterator != SDF_UNDERSCORE_CODE_POINT)
+    {
         return result;
+    }
 
     // Count delimiters and reserve space in result.
-    result.reserve(1 + std::count(first, last, namespaceDelimiter));
+    result.reserve(1 + std::count(name.begin(), name.end(),
+        namespaceDelimiter));
 
-    std::string::const_iterator anchor = first;
-    for (++first; first != last; ++first) {
+    for (++iterator; iterator != view.end(); ++iterator)
+    {
         // Allow a namespace delimiter.
-        if (*first == namespaceDelimiter) {
+        if (*iterator == SDF_NAMESPACE_DELIMITER_CODE_POINT)
+        {
             // Record token.
-            result.push_back(std::string(anchor, first));
+            result.push_back(std::string(anchor.GetBase(), iterator.GetBase()));
 
             // Skip delimiter.  We know we will not go beyond the end of
             // the string because we checked before the loop that the
             // last character was not the delimiter.
-            anchor = ++first;
+            anchor = ++iterator;
 
             // First character.
-            if (!(isalpha(*first) || (*first == '_'))) {
+            if (!TfIsUtf8CodePointXidStart(*iterator) && 
+                *iterator != SDF_UNDERSCORE_CODE_POINT)
+            {
                 TfReset(result);
                 return result;
             }
         }
-        else {
-            // Next character 
-            if (!(isalnum(*first) || (*first == '_'))) {
+        else
+        {
+            // Next character
+            if (!TfIsUtf8CodePointXidContinue(*iterator))
+            {
                 TfReset(result);
                 return result;
             }
         }
     }
 
     // Record the last token.
-    result.push_back(std::string(anchor, first));
+    result.push_back(std::string(anchor.GetBase(), iterator.GetBase()));
 
     return result;
 }

diff --git a/pxr/usd/sdf/pathParser.h b/pxr/usd/sdf/pathParser.h
@@ -26,6 +26,7 @@
 
 #include "pxr/pxr.h"
 #include "pxr/base/tf/stringUtils.h"
+#include "pxr/base/tf/unicodeUtils.h"
 #include "pxr/usd/sdf/path.h"
 
 #include "pxr/base/tf/pxrPEGTL/pegtl.h"
@@ -39,6 +40,60 @@ namespace Sdf_PathParser {
 
 namespace PEGTL_NS = tao::TAO_PEGTL_NAMESPACE;
 
+////////////////////////////////////////////////////////////////////////
+// Helper rules for parsing UTF8 content
+struct XidStart
+{
+    template <typename ParseInput>
+    static bool match(ParseInput& in)
+    {
+        if (!in.empty())
+        {
+            // peek at the next character in the input
+            // if the size is not 0, it was a valid code point
+            auto utf8_char = tao::TAO_PEGTL_NAMESPACE::internal::peek_utf8::peek(in);
+            if (utf8_char.size != 0)
+            {
+                // valid utf8_char, data has the code point
+                if (TfIsUtf8CodePointXidStart(static_cast<uint32_t>(utf8_char.data)))
+                {
+                    // it has the property we want, consume the input
+                    in.bump(utf8_char.size);
+                    return true;
+                }
+            }
+        }
+
+        return false;
+    }
+};
+
+struct XidContinue
+{
+    template <typename ParseInput>
+    static bool match(ParseInput& in)
+    {
+        if (!in.empty())
+        {
+            // peek at the next character in the input
+            // if the size is not 0, it was a valid code point
+            auto utf8_char = tao::TAO_PEGTL_NAMESPACE::internal::peek_utf8::peek(in);
+            if (utf8_char.size != 0)
+            {
+                // valid utf8_char, data has the code point
+                if (TfIsUtf8CodePointXidContinue(static_cast<uint32_t>(utf8_char.data)))
+                {
+                    // it has the property we want, consume the input
+                    in.bump(utf8_char.size); 
+                    return true;
+                }
+            }
+        }
+
+        return false;
+    }
+};
+
 ////////////////////////////////////////////////////////////////////////
 // SdfPath grammar:
 
@@ -51,21 +106,31 @@ struct ReflexiveRelative : Dot {};
 
 struct DotDots : PEGTL_NS::list<DotDot, Slash> {};
 
-struct PrimName : PEGTL_NS::identifier {};
+// valid identifiers start with an '_' character or anything in the XidStart
+// character class, then continue with zero or more characters in the
+// XidContinue character class
+struct Utf8IdentifierStart : PEGTL_NS::sor<
+    PEGTL_NS::one<'_'>,
+    XidStart> {};
+struct Utf8Identifier : PEGTL_NS::seq<
+    Utf8IdentifierStart,
+    PEGTL_NS::star<XidContinue>> {};
+
+struct PrimName : Utf8Identifier {};
 
 // XXX This replicates old behavior where '-' chars are allowed in variant set
 // names in SdfPaths, but variant sets in layers cannot have '-' in their names.
 // For now we preserve the behavior. Internal bug USD-8321 tracks removing
 // support for '-' characters in variant set names in SdfPath.
 struct VariantSetName :
-    PEGTL_NS::seq<PEGTL_NS::identifier_first,
+    PEGTL_NS::seq<Utf8IdentifierStart,
     PEGTL_NS::star<PEGTL_NS::sor<
-    PEGTL_NS::identifier_other, PEGTL_NS::one<'-'>>>> {};
+    XidContinue, PEGTL_NS::one<'-'>>>> {};
 
 struct VariantName :
     PEGTL_NS::seq<PEGTL_NS::opt<
     PEGTL_NS::one<'.'>>, PEGTL_NS::star<
-    PEGTL_NS::sor<PEGTL_NS::identifier_other,
+    PEGTL_NS::sor<XidContinue,
     PEGTL_NS::one<'|', '-'>>>> {};
 
 struct VarSelOpen : PEGTL_NS::pad<PEGTL_NS::one<'{'>, PEGTL_NS::blank> {};
@@ -88,7 +153,7 @@ struct PrimElts : PEGTL_NS::seq<
     LookaheadList<PrimName, PEGTL_NS::sor<Slash, VariantSelections>>,
     PEGTL_NS::opt<VariantSelections>> {};
 
-struct PropertyName : PEGTL_NS::list<PEGTL_NS::identifier, PEGTL_NS::one<':'>> {};
+struct PropertyName : PEGTL_NS::list<Utf8Identifier, PEGTL_NS::one<':'>> {};
 
 struct MapperPath;
 struct TargetPath;

diff --git a/pxr/usd/sdf/schema.cpp b/pxr/usd/sdf/schema.cpp
@@ -28,6 +28,7 @@
 #include "pxr/usd/sdf/layerOffset.h"
 #include "pxr/usd/sdf/parserValueContext.h"
 #include "pxr/usd/sdf/pathExpression.h"
+#include "pxr/usd/sdf/pathParser.h"
 #include "pxr/usd/sdf/payload.h"
 #include "pxr/usd/sdf/reference.h"
 #include "pxr/usd/sdf/schemaTypeRegistration.h"
@@ -41,6 +42,7 @@
 #include "pxr/base/tf/diagnostic.h"
 #include "pxr/base/tf/envSetting.h"
 #include "pxr/base/tf/instantiateSingleton.h"
+#include "pxr/base/tf/unicodeUtils.h"
 #include "pxr/base/trace/trace.h"
 #include "pxr/base/vt/dictionary.h"
 
@@ -1287,28 +1289,34 @@ SdfSchemaBase::IsValidNamespacedIdentifier(const std::string& identifier)
 SdfAllowed
 SdfSchemaBase::IsValidVariantIdentifier(const std::string& identifier)
 {
-    // Allow [[:alnum:]_|\-]+ with an optional leading dot.
-
-    std::string::const_iterator first = identifier.begin();
-    std::string::const_iterator last = identifier.end();
-
-    // Allow optional leading dot.
-    if (first != last && *first == '.') {
-        ++first;
-    }
+    // use the path parser rules to determine validity of variant name
+    Sdf_PathParser::PPContext context;
+    bool result = false;
+    try
+    {
+        result = Sdf_PathParser::PEGTL_NS::parse<
+            Sdf_PathParser::PEGTL_NS::must<Sdf_PathParser::VariantName,
+            Sdf_PathParser::PEGTL_NS::eof>>(
+                Sdf_PathParser::PEGTL_NS::string_input<> {identifier, ""}, context);
 
-    for (; first != last; ++first) {
-        char c = *first;
-        if (!(isalnum(c) || (c == '_') || (c == '|') || (c == '-'))) {
+        if (!result)
+        {
             return SdfAllowed(TfStringPrintf(
-                    "\"%s\" is not a valid variant "
-                    "name due to '%c' at index %d",
-                    identifier.c_str(),
-                    c,
-                    (int)(first - identifier.begin())));
+                "\"%s\" is not a valid variant name",
+                identifier.c_str()));
         }
     }
+    catch(const Sdf_PathParser::PEGTL_NS::parse_error& e)
+    {
+        return SdfAllowed(TfStringPrintf(
+            "\"%s\" is not a valid variant "
+            "name due to '%s'",
+            identifier.c_str(),
+            e.what()));
 
+        return false;
+    }
+
     return true;
 }