StackStorm · Kami · Apr 5, 2021 · Mar 13, 2021 · Mar 13, 2021 · Mar 13, 2021
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -28,6 +28,20 @@ Added
 * Added the command line utility `st2-validate-pack` that can be used by pack developers to
   validate pack contents. (improvement)
 
+* Fix a bug in the API and CLI code which would prevent users from being able to retrieve resources
+  which contain non-ascii (utf-8) characters in the names / references. (bug fix) #5189
+
+  Contributed by @Kami.
+
+* Fix a bug in the API router code and make sure we return correct and user-friendly error to the
+  user in case we fail to parse the request URL / path because it contains invalid or incorrectly
+  URL encoded data.
+
+  Previously such errors weren't handled correctly which meant original exception with a stack
+  trace got propagated to the user. (bug fix) #5189
+
+  Contributed by @Kami.
+
 Changed
 ~~~~~~~
 

diff --git a/st2api/tests/unit/controllers/v1/test_actions.py b/st2api/tests/unit/controllers/v1/test_actions.py
@@ -17,6 +17,7 @@
 import os
 import os.path
 import copy
+import urllib
 
 try:
     import simplejson as json
@@ -277,6 +278,22 @@
 }
 
 
+ACTION_WITH_UNICODE_NAME = {
+    "name": "st2.dummy.action_unicode_我爱狗",
+    "description": "test description",
+    "enabled": True,
+    "pack": "dummy_pack_1",
+    "entry_point": "/tmp/test/action1.sh",
+    "runner_type": "local-shell-script",
+    "parameters": {
+        "a": {"type": "string", "default": "A1"},
+        "b": {"type": "string", "default": "B1"},
+        "sudo": {"default": True, "immutable": True},
+    },
+    "notify": {"on-complete": {"message": "Woohoo! I completed!!!"}},
+}
+
+
 class ActionsControllerTestCase(
     FunctionalTest, APIControllerWithIncludeAndExcludeFilterTestCase, CleanFilesTestCase
 ):
@@ -640,6 +657,48 @@ def test_action_with_notify_update(self):
         self.assertEqual(get_resp.json["notify"], {})
         self.__do_delete(action_id)
 
+    @mock.patch.object(
+        action_validator, "validate_action", mock.MagicMock(return_value=True)
+    )
+    def test_action_with_unicode_name_create(self):
+        post_resp = self.__do_post(ACTION_WITH_UNICODE_NAME)
+        action_id = self.__get_action_id(post_resp)
+        get_resp = self.__do_get_one(action_id)
+        self.assertEqual(get_resp.status_int, 200)
+        self.assertEqual(self.__get_action_id(get_resp), action_id)
+        self.assertEqual(get_resp.json["name"], "st2.dummy.action_unicode_我爱狗")
+        self.assertEqual(
+            get_resp.json["ref"], "dummy_pack_1.st2.dummy.action_unicode_我爱狗"
+        )
+        self.assertEqual(
+            get_resp.json["uid"], "action:dummy_pack_1:st2.dummy.action_unicode_我爱狗"
+        )
+
+        get_resp = self.__do_get_one("dummy_pack_1.st2.dummy.action_unicode_我爱狗")
+        self.assertEqual(get_resp.json["name"], "st2.dummy.action_unicode_我爱狗")
+        self.assertEqual(
+            get_resp.json["ref"], "dummy_pack_1.st2.dummy.action_unicode_我爱狗"
+        )
+        self.assertEqual(
+            get_resp.json["uid"], "action:dummy_pack_1:st2.dummy.action_unicode_我爱狗"
+        )
+
+        # Now retrieve the action using the ref and ensure it works correctly
+        # NOTE: We need to use urlquoted value when retrieving the item since that's how all the
+        # http clients work - non ascii characters get escaped / quoted. Passing in unquoted
+        # value will result in exception (as expected).
+        ref_quoted = urllib.parse.quote("dummy_pack_1.st2.dummy.action_unicode_我爱狗")
+        get_resp = self.__do_get_one(ref_quoted)
+        self.assertEqual(get_resp.json["name"], "st2.dummy.action_unicode_我爱狗")
+        self.assertEqual(
+            get_resp.json["ref"], "dummy_pack_1.st2.dummy.action_unicode_我爱狗"
+        )
+        self.assertEqual(
+            get_resp.json["uid"], "action:dummy_pack_1:st2.dummy.action_unicode_我爱狗"
+        )
+
+        self.__do_delete(action_id)
+
     @mock.patch.object(
         action_validator, "validate_action", mock.MagicMock(return_value=True)
     )

diff --git a/st2api/tests/unit/controllers/v1/test_base.py b/st2api/tests/unit/controllers/v1/test_base.py
@@ -13,7 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import webob
 from oslo_config import cfg
+from webob.request import Request
+
+from st2common.router import Router
+
 from st2tests.api import FunctionalTest
 
 
@@ -95,3 +100,16 @@ def test_valid_status_code_is_returned_on_invalid_path(self):
             "/v1/executions/577f775b0640fd1451f2030b/re_run/a/b", expect_errors=True
         )
         self.assertEqual(resp.status_int, 404)
+
+    def test_router_invalid_url_path_friendly_error(self):
+        # NOTE: We intentionally don't use sp.app.get here since that goes through the webtest
+        # layer which manipulates the path which means we won't be testing what we actually want
+        # to test (an edge case). To test the edge case correctly, we need to manually call router
+        # with specifically crafted data.
+        router = Router()
+        request = Request(environ={"PATH_INFO": "/v1/rules/好的".encode("utf-8")})
+
+        expected_msg = "URL likely contains invalid or incorrectly URL encoded values"
+        self.assertRaisesRegexp(
+            webob.exc.HTTPBadRequest, expected_msg, router.match, request
+        )
diff --git a/st2client/st2client/shell.py b/st2client/st2client/shell.py
@@ -63,6 +63,7 @@
 from st2client.config import set_config
 from st2client.exceptions.operations import OperationFailureException
 from st2client.utils.logging import LogLevelFilter, set_log_level_for_all_loggers
+from st2client.utils.misc import reencode_list_with_surrogate_escape_sequences
 from st2client.commands.auth import TokenCreateCommand
 from st2client.commands.auth import LoginCommand
 
@@ -98,6 +99,42 @@
 
 PACKAGE_METADATA_FILE_PATH = "/opt/stackstorm/st2/package.meta"
 
+"""
+Here we sanitize the provided args and ensure they contain valid unicode values.
+
+By default, sys.argv will contain a unicode string where the actual item values which contain
+unicode sequences are escaped using unicode surrogates.
+
+For example, if "examples.test_rule_utf8_náme" value is specified as a CLI argument, sys.argv
+and as such also url, would contain "examples.test_rule_utf8_n%ED%B3%83%ED%B2%A1me" which is not
+what we want.
+
+Complete sys.argv example:
+
+1. Default - ['shell.py', '--debug', 'rule', 'get', 'examples.test_rule_utf8_n\udcc3\udca1me']
+2. What we want - ['shell.py', '--debug', 'rule', 'get', 'examples.test_rule_utf8_náme']
+
+This won't work correctly when sending requests to the API. As such, we correctly escape the
+value to the unicode string here and then let the http layer (requests) correctly url encode
+this value.
+
+Technically, we could also just try to re-encode it in the HTTPClient and I tried that first, but
+it turns out more code in the client results in exceptions if it's not re-encoded as early as
+possible.
+"""
+
+REENCODE_ARGV = os.environ.get("ST2_CLI_RENCODE_ARGV", "true").lower() in [
+    "true",
+    "1",
+    "yes",
+]
+
+if REENCODE_ARGV:
+    try:
+        sys.argv = reencode_list_with_surrogate_escape_sequences(sys.argv)
+    except Exception as e:
+        print("Failed to re-encode sys.argv: %s" % (str(e)))
+
 
 def get_stackstorm_version():
     """

diff --git a/st2client/st2client/utils/misc.py b/st2client/st2client/utils/misc.py
@@ -14,11 +14,14 @@
 # limitations under the License.
 
 from __future__ import absolute_import
+
+from typing import List
+
 import copy
 
 import six
 
-__all__ = ["merge_dicts"]
+__all__ = ["merge_dicts", "reencode_list_with_surrogate_escape_sequences"]
 
 
 def merge_dicts(d1, d2):
@@ -39,3 +42,22 @@ def merge_dicts(d1, d2):
             result[key] = value
 
     return result
+
+
+def reencode_list_with_surrogate_escape_sequences(value: List[str]) -> List[str]:
+    """
+    Function which reencodes each item in the provided list replacing unicode surrogate escape
+    sequences using actual unicode values.
+    """
+    result = []
+
+    for item in value:
+        try:
+            item = item.encode("ascii", "surrogateescape").decode("utf-8")
+        except UnicodeEncodeError:
+            # Already a unicode string, nothing to do
+            pass
+
+        result.append(item)
+
+    return result
diff --git a/st2client/tests/unit/test_commands.py b/st2client/tests/unit/test_commands.py
@@ -295,6 +295,21 @@ def test_command_delete_failed(self):
         args = self.parser.parse_args(["fakeresource", "delete", "cba"])
         self.assertRaises(Exception, self.branch.commands["delete"].run, args)
 
+    @mock.patch.object(
+        models.ResourceManager,
+        "get_by_id",
+        mock.MagicMock(return_value=base.FakeResource(**base.RESOURCES[0])),
+    )
+    def test_command_get_unicode_primary_key(self):
+        args = self.parser.parse_args(
+            ["fakeresource", "get", "examples.test_rule_utf8_náme"]
+        )
+        self.assertEqual(args.func, self.branch.commands["get"].run_and_print)
+        instance = self.branch.commands["get"].run(args)
+        actual = instance.serialize()
+        expected = json.loads(json.dumps(base.RESOURCES[0]))
+        self.assertEqual(actual, expected)
+
 
 class ResourceViewCommandTestCase(unittest2.TestCase):
     def setUp(self):

diff --git a/st2client/tests/unit/test_shell.py b/st2client/tests/unit/test_shell.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # Copyright 2020 The StackStorm Authors.
 # Copyright 2019 Extreme Networks, Inc.
 #
@@ -32,6 +33,7 @@
 from st2client.shell import Shell
 from st2client.client import Client
 from st2client.utils import httpclient
+from st2client.utils.misc import reencode_list_with_surrogate_escape_sequences
 from st2common.models.db.auth import TokenDB
 from tests import base
 
@@ -828,3 +830,34 @@ def test_automatic_auth_skipped_if_token_provided_as_cli_argument(self):
         args = shell.parser.parse_args(args=argv)
         shell.get_client(args=args)
         self.assertEqual(shell._get_auth_token.call_count, 0)
+
+    def test_get_one_unicode_character_in_name(self):
+        self._write_mock_config()
+
+        shell = Shell()
+        shell._get_auth_token = mock.Mock()
+
+        os.environ["ST2_AUTH_TOKEN"] = "fooo"
+        argv = ["action", "get", "examples.test_rule_utf8_náme"]
+        args = shell.parser.parse_args(args=argv)
+        shell.get_client(args=args)
+        self.assertEqual(args.ref_or_id, "examples.test_rule_utf8_náme")
+
+    def test_reencode_list_replace_surrogate_escape(self):
+        value = ["a", "b", "c", "d"]
+        expected = value
+        result = reencode_list_with_surrogate_escape_sequences(value)
+
+        self.assertEqual(result, expected)
+
+        value = ["a", "b", "c", "n\udcc3\udca1me"]
+        expected = ["a", "b", "c", "náme"]
+        result = reencode_list_with_surrogate_escape_sequences(value)
+
+        self.assertEqual(result, expected)
+
+        value = ["a", "b", "c", "你好"]
+        expected = value
+        result = reencode_list_with_surrogate_escape_sequences(value)
+
+        self.assertEqual(result, expected)
diff --git a/st2common/st2common/middleware/instrumentation.py b/st2common/st2common/middleware/instrumentation.py
@@ -20,6 +20,8 @@
 from st2common.metrics.base import get_driver
 from st2common.util.date import get_datetime_utc_now
 from st2common.router import NotFoundException
+from st2common.router import Response
+from st2common.util.jsonify import json_encode
 
 __all__ = ["RequestInstrumentationMiddleware", "ResponseInstrumentationMiddleware"]
 
@@ -47,6 +49,16 @@ def __call__(self, environ, start_response):
             endpoint, _ = self.router.match(request)
         except NotFoundException:
             endpoint = {}
+        except Exception as e:
+            # Special case to make sure we return friendly error to the user.
+            # If we don't do that and router.match() throws an exception, we will return stack trace
+            # to the end user which is not good.
+            status_code = getattr(e, "status_code", 500)
+            headers = {"Content-Type": "application/json"}
+            body = {"faultstring": getattr(e, "detail", str(e))}
+            response_body = json_encode(body)
+            resp = Response(response_body, status=status_code, headers=headers)
+            return resp(environ, start_response)
 
         # NOTE: We don't track per request and response metrics for /v1/executions/<id> and some
         # other endpoints because this would result in a lot of unique metrics which is an

diff --git a/st2common/st2common/router.py b/st2common/st2common/router.py
@@ -29,7 +29,7 @@
 from six.moves.urllib import parse as urlparse  # pylint: disable=import-error
 import webob
 from webob import cookies, exc
-from webob.compat import url_unquote
+from six.moves import urllib
 
 from st2common.exceptions import rbac as rbac_exc
 from st2common.exceptions import auth as auth_exc
@@ -264,7 +264,31 @@ def add_spec(self, spec, transforms):
             )
 
     def match(self, req):
-        path = url_unquote(req.path)
+        # NOTE: webob.url_unquote doesn't work correctly under Python 3 when paths contain non-ascii
+        # characters. That method supposed to handle Python 2 and Python 3 compatibility, but it
+        # doesn't work correctly under Python 3.
+        try:
+            path = urllib.parse.unquote(req.path)
+        except Exception as e:
+            # This exception being thrown indicates that the URL / path contains bad or incorrectly
+            # URL escaped characters. Instead of returning this stack track + 500 error to the
+            # user we return a friendly and more correct exception
+            # NOTE: We should not access or log req.path here since it's a property which results
+            # in exception and if we try to log it, it will fail.
+            try:
+                path = req.environ["PATH_INFO"]
+            except Exception:
+                path = "unknown"
+
+            LOG.error('Failed to parse request URL / path "%s": %s' % (path, str(e)))
+
+            abort(
+                400,
+                'Failed to parse request path "%s". URL likely contains invalid or incorrectly '
+                "URL encoded values." % (path),
+            )
+            return
+
         LOG.debug("Match path: %s", path)
 
         if len(path) > 1 and path.endswith("/"):

diff --git a/st2tests/st2tests/api.py b/st2tests/st2tests/api.py
@@ -23,8 +23,12 @@
 
 import six
 import webtest
+import webob.compat
+import webob.request
 import mock
 
+from six.moves import urllib
+
 from oslo_config import cfg
 
 from st2common.router import Router
@@ -57,6 +61,25 @@ class ResponseLeakError(ValueError):
     pass
 
 
+# NOTE: This is not ideal, but we need to patch those functions so it works correctly for the
+# tests.
+# The problem is that for the unit based api tests we utilize webtest which has the same bug as
+# webob when handling unicode characters in the path names and the actual unit test API code doesn't
+# follow exactly the same code path as actual production code which doesn't utilize webtest
+# In short, that's why important we also have end to end tests for API endpoints!
+webob.request.url_unquote = urllib.parse.unquote
+webob.compat.url_unquote = urllib.parse.unquote
+
+
+def bytes_(s, encoding="utf-8", errors="strict"):
+    if isinstance(s, six.text_type):
+        return s.encode("utf-8", errors)
+
+
+webob.compat.bytes_ = bytes_
+webob.request.bytes_ = bytes_
+
+
 class TestApp(webtest.TestApp):
     def do_request(self, req, **kwargs):
         self.cookiejar.clear()