Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for UTF-8 Identifiers #2848

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 99 additions & 42 deletions pxr/usd/sdf/path.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include "pxr/base/tf/staticData.h"
#include "pxr/base/tf/stl.h"
#include "pxr/base/tf/stringUtils.h"
#include "pxr/base/tf/unicodeUtils.h"
#include "pxr/base/tf/type.h"

#include "pxr/base/trace/trace.h"
Expand Down Expand Up @@ -117,6 +118,7 @@ class _DeferredDiagnostics

} // anon

static inline bool _IsValidIdentifier(const std::string_view& name);
static inline bool _IsValidIdentifier(TfToken const &name);

// XXX: Enable this define to make bad path strings
Expand Down Expand Up @@ -1787,48 +1789,90 @@ SdfPath::MakeRelativePath(const SdfPath & anchor) const
return result;
}

static inline bool _IsValidIdentifier(TfToken const &name)
static
inline bool
_IsValidIdentifier(const std::string_view& name)
{
return TfIsValidIdentifier(name.GetString());
// empty strings are not valid identifiers
if (name.empty())
{
return false;
}

TfUtf8CodePointView view {name};
bool first = true;
for (const uint32_t codePoint : view)
{
bool result = first ?
((codePoint == SDF_UNDERSCORE_CODE_POINT) ||
TfIsUtf8CodePointXidStart(codePoint))
: TfIsUtf8CodePointXidContinue(codePoint);

if (!result)
{
return false;
}

first = false;
}

return true;
}

bool
SdfPath::IsValidIdentifier(const std::string &name)
static
inline bool
_IsValidIdentifier(TfToken const &name)
{
return TfIsValidIdentifier(name);
return _IsValidIdentifier(name.GetString());
}

// We use our own _IsAlpha and _IsAlnum here for two reasons. One, we want to
// ensure that they follow C/Python identifier rules and are not subject to
// various locale differences. And two, since we are not consulting a locale,
// it is faster.
static constexpr bool _IsAlpha(int x) {
return ('a' <= (x|32)) && ((x|32) <= 'z');
}
static constexpr bool _IsAlnum(int x) {
return _IsAlpha(x) || (('0' <= x) && (x <= '9'));
bool
SdfPath::IsValidIdentifier(const std::string &name)
{
return _IsValidIdentifier(name);
}

bool
SdfPath::IsValidNamespacedIdentifier(const std::string &name)
{
// empty strings are not valid identifiers
if (name.empty())
{
return false;
}

// A valid C/Python identifier except we also allow the namespace delimiter
// and if we tokenize on that delimiter then all tokens are valid C/Python
// identifiers. That means following a delimiter there must be an '_' or
// alphabetic character.
constexpr char delim = SDF_PATH_NS_DELIMITER_CHAR;
for (char const *p = name.c_str(); *p; ++p) {
if (!_IsAlpha(*p) && *p != '_') {
// identifiers.
std::string_view remainder {name};
while (!remainder.empty()) {
const auto index = remainder.find(':');

// can't start with ':'
if (index == 0) {
return false;
}
for (++p; _IsAlnum(*p) ||*p == '_'; ++p) {
/* consume identifier */

// can't end with ':'
if (index == remainder.size() - 1) {
return false;
}
if (*p != delim) {
return !*p;

// substring must be a valid identifier
if (!_IsValidIdentifier(remainder.substr(0, index))) {
return false;
}

// if ':' wasn't found, we are done
if (index == std::string_view::npos) {
break;
}

// otherwise check the next substring
remainder = remainder.substr(index + 1);
}
return false;

return true;
}

std::vector<std::string>
Expand All @@ -1840,48 +1884,61 @@ SdfPath::TokenizeIdentifier(const std::string &name)
const char namespaceDelimiter =
SdfPathTokens->namespaceDelimiter.GetText()[0];

std::string::const_iterator first = name.begin();
std::string::const_iterator last = name.end();

// Not empty and first character is alpha or '_'.
if (first == last || !(isalpha(*first) || (*first == '_')))
// Empty or last character is namespace delimiter
if (name.empty() || name.back() == namespaceDelimiter)
{
return result;
// Last character is not the namespace delimiter.
if (*(last - 1) == namespaceDelimiter)
}

TfUtf8CodePointView view {name};
TfUtf8CodePointIterator iterator = view.begin();
TfUtf8CodePointIterator anchor = iterator;

// Check first character is in XidStart or '_'
if(!TfIsUtf8CodePointXidStart(*iterator) &&
*iterator != SDF_UNDERSCORE_CODE_POINT)
{
return result;
}

// Count delimiters and reserve space in result.
result.reserve(1 + std::count(first, last, namespaceDelimiter));
result.reserve(1 + std::count(name.begin(), name.end(),
namespaceDelimiter));

std::string::const_iterator anchor = first;
for (++first; first != last; ++first) {
for (++iterator; iterator != view.end(); ++iterator)
{
// Allow a namespace delimiter.
if (*first == namespaceDelimiter) {
if (*iterator == SDF_NAMESPACE_DELIMITER_CODE_POINT)
{
// Record token.
result.push_back(std::string(anchor, first));
result.push_back(std::string(anchor.GetBase(), iterator.GetBase()));

// Skip delimiter. We know we will not go beyond the end of
// the string because we checked before the loop that the
// last character was not the delimiter.
anchor = ++first;
anchor = ++iterator;

// First character.
if (!(isalpha(*first) || (*first == '_'))) {
if (!TfIsUtf8CodePointXidStart(*iterator) &&
*iterator != SDF_UNDERSCORE_CODE_POINT)
{
TfReset(result);
return result;
}
}
else {
// Next character
if (!(isalnum(*first) || (*first == '_'))) {
else
{
// Next character
if (!TfIsUtf8CodePointXidContinue(*iterator))
{
TfReset(result);
return result;
}
}
}

// Record the last token.
result.push_back(std::string(anchor, first));
result.push_back(std::string(anchor.GetBase(), iterator.GetBase()));

return result;
}
Expand Down
75 changes: 70 additions & 5 deletions pxr/usd/sdf/pathParser.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

#include "pxr/pxr.h"
#include "pxr/base/tf/stringUtils.h"
#include "pxr/base/tf/unicodeUtils.h"
#include "pxr/usd/sdf/path.h"

#include "pxr/base/tf/pxrPEGTL/pegtl.h"
Expand All @@ -39,6 +40,60 @@ namespace Sdf_PathParser {

namespace PEGTL_NS = tao::TAO_PEGTL_NAMESPACE;

////////////////////////////////////////////////////////////////////////
// Helper rules for parsing UTF8 content
struct XidStart
{
template <typename ParseInput>
static bool match(ParseInput& in)
{
if (!in.empty())
{
// peek at the next character in the input
// if the size is not 0, it was a valid code point
auto utf8_char = tao::TAO_PEGTL_NAMESPACE::internal::peek_utf8::peek(in);
if (utf8_char.size != 0)
{
// valid utf8_char, data has the code point
if (TfIsUtf8CodePointXidStart(static_cast<uint32_t>(utf8_char.data)))
{
// it has the property we want, consume the input
in.bump(utf8_char.size);
return true;
}
}
}

return false;
}
};

struct XidContinue
{
template <typename ParseInput>
static bool match(ParseInput& in)
{
if (!in.empty())
{
// peek at the next character in the input
// if the size is not 0, it was a valid code point
auto utf8_char = tao::TAO_PEGTL_NAMESPACE::internal::peek_utf8::peek(in);
if (utf8_char.size != 0)
{
// valid utf8_char, data has the code point
if (TfIsUtf8CodePointXidContinue(static_cast<uint32_t>(utf8_char.data)))
{
// it has the property we want, consume the input
in.bump(utf8_char.size);
return true;
}
}
}

return false;
}
};

////////////////////////////////////////////////////////////////////////
// SdfPath grammar:

Expand All @@ -51,21 +106,31 @@ struct ReflexiveRelative : Dot {};

struct DotDots : PEGTL_NS::list<DotDot, Slash> {};

struct PrimName : PEGTL_NS::identifier {};
// valid identifiers start with an '_' character or anything in the XidStart
// character class, then continue with zero or more characters in the
// XidContinue character class
struct Utf8IdentifierStart : PEGTL_NS::sor<
PEGTL_NS::one<'_'>,
XidStart> {};
struct Utf8Identifier : PEGTL_NS::seq<
Utf8IdentifierStart,
PEGTL_NS::star<XidContinue>> {};

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it turns out this slows down the parser, one way we could speed it up perhaps is to make an 'Identifier' rule that first tries an ASCII ident first and only invokes the UTF-8 rule if that doesn't match (along the lines of sor<PEGTL_NS::identifier, Utf8Identifier>),

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The perf runs came back ok, the cost is really just the cost of decoding and class validation

struct PrimName : Utf8Identifier {};

// XXX This replicates old behavior where '-' chars are allowed in variant set
// names in SdfPaths, but variant sets in layers cannot have '-' in their names.
// For now we preserve the behavior. Internal bug USD-8321 tracks removing
// support for '-' characters in variant set names in SdfPath.
struct VariantSetName :
PEGTL_NS::seq<PEGTL_NS::identifier_first,
PEGTL_NS::seq<Utf8IdentifierStart,
PEGTL_NS::star<PEGTL_NS::sor<
PEGTL_NS::identifier_other, PEGTL_NS::one<'-'>>>> {};
XidContinue, PEGTL_NS::one<'-'>>>> {};

struct VariantName :
PEGTL_NS::seq<PEGTL_NS::opt<
PEGTL_NS::one<'.'>>, PEGTL_NS::star<
PEGTL_NS::sor<PEGTL_NS::identifier_other,
PEGTL_NS::sor<XidContinue,
PEGTL_NS::one<'|', '-'>>>> {};

struct VarSelOpen : PEGTL_NS::pad<PEGTL_NS::one<'{'>, PEGTL_NS::blank> {};
Expand All @@ -88,7 +153,7 @@ struct PrimElts : PEGTL_NS::seq<
LookaheadList<PrimName, PEGTL_NS::sor<Slash, VariantSelections>>,
PEGTL_NS::opt<VariantSelections>> {};

struct PropertyName : PEGTL_NS::list<PEGTL_NS::identifier, PEGTL_NS::one<':'>> {};
struct PropertyName : PEGTL_NS::list<Utf8Identifier, PEGTL_NS::one<':'>> {};

struct MapperPath;
struct TargetPath;
Expand Down
42 changes: 25 additions & 17 deletions pxr/usd/sdf/schema.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include "pxr/usd/sdf/layerOffset.h"
#include "pxr/usd/sdf/parserValueContext.h"
#include "pxr/usd/sdf/pathExpression.h"
#include "pxr/usd/sdf/pathParser.h"
#include "pxr/usd/sdf/payload.h"
#include "pxr/usd/sdf/reference.h"
#include "pxr/usd/sdf/schemaTypeRegistration.h"
Expand All @@ -41,6 +42,7 @@
#include "pxr/base/tf/diagnostic.h"
#include "pxr/base/tf/envSetting.h"
#include "pxr/base/tf/instantiateSingleton.h"
#include "pxr/base/tf/unicodeUtils.h"
#include "pxr/base/trace/trace.h"
#include "pxr/base/vt/dictionary.h"

Expand Down Expand Up @@ -1287,28 +1289,34 @@ SdfSchemaBase::IsValidNamespacedIdentifier(const std::string& identifier)
SdfAllowed
SdfSchemaBase::IsValidVariantIdentifier(const std::string& identifier)
{
// Allow [[:alnum:]_|\-]+ with an optional leading dot.

std::string::const_iterator first = identifier.begin();
std::string::const_iterator last = identifier.end();

// Allow optional leading dot.
if (first != last && *first == '.') {
++first;
}
// use the path parser rules to determine validity of variant name
Sdf_PathParser::PPContext context;
bool result = false;
try
{
result = Sdf_PathParser::PEGTL_NS::parse<
Sdf_PathParser::PEGTL_NS::must<Sdf_PathParser::VariantName,
Sdf_PathParser::PEGTL_NS::eof>>(
Sdf_PathParser::PEGTL_NS::string_input<> {identifier, ""}, context);

for (; first != last; ++first) {
char c = *first;
if (!(isalnum(c) || (c == '_') || (c == '|') || (c == '-'))) {
if (!result)
{
return SdfAllowed(TfStringPrintf(
"\"%s\" is not a valid variant "
"name due to '%c' at index %d",
identifier.c_str(),
c,
(int)(first - identifier.begin())));
"\"%s\" is not a valid variant name",
identifier.c_str()));
}
}
catch(const Sdf_PathParser::PEGTL_NS::parse_error& e)
{
return SdfAllowed(TfStringPrintf(
"\"%s\" is not a valid variant "
"name due to '%s'",
identifier.c_str(),
e.what()));

return false;
}

return true;
}

Expand Down
Loading