From 010ca7e460d13160b465de24dd3982a397f94c16 Mon Sep 17 00:00:00 2001 From: fisx Date: Mon, 7 Oct 2019 17:27:26 +0200 Subject: [PATCH 01/26] Update docs. (#870) --- docs/reference/provisioning/scim-via-curl.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/reference/provisioning/scim-via-curl.md b/docs/reference/provisioning/scim-via-curl.md index 7e28794a326..a3fd2bc0c3f 100644 --- a/docs/reference/provisioning/scim-via-curl.md +++ b/docs/reference/provisioning/scim-via-curl.md @@ -8,6 +8,11 @@ This page shows you how to communicate with the wire backend through the [SCIM API](http://www.simplecloud.info/) by example. All examples are [curl](https://curl.haxx.se/) (in bash syntax). +We support setting the handle and user name in wire (the thing with +`@` and the longer thing without `@`). There is also support for +setting rich-info. Group provisioning is planned, but the release +date hasn't been fixed yet. + If you want to dive into the backend code, start [reading here in our backend](https://github.com/wireapp/wire-server/blob/develop/services/spar/src/Spar/Scim.hs) and [our hscim library](https://github.com/wireapp/hscim). From 9130a37f0b2f9689898b0833ccf62c86291a48d8 Mon Sep 17 00:00:00 2001 From: fisx Date: Wed, 16 Oct 2019 14:27:21 +0200 Subject: [PATCH 02/26] Support HEAD requests for `/sso/initiate-bind` (#878) --- services/spar/src/Spar/API.hs | 1 + services/spar/src/Spar/API/Types.hs | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/services/spar/src/Spar/API.hs b/services/spar/src/Spar/API.hs index 8705a376cdb..cccae4a5390 100644 --- a/services/spar/src/Spar/API.hs +++ b/services/spar/src/Spar/API.hs @@ -57,6 +57,7 @@ app ctx = SAML.setHttpCachePolicy api :: Opts -> ServerT API Spar api opts = apiSSO opts + :<|> authreqPrecheck :<|> authreq (maxttlAuthreqDiffTime opts) DoInitiateBind :<|> apiIDP :<|> apiScim diff --git a/services/spar/src/Spar/API/Types.hs b/services/spar/src/Spar/API/Types.hs index f0fc4e629cd..902bff25381 100644 --- a/services/spar/src/Spar/API/Types.hs +++ b/services/spar/src/Spar/API/Types.hs @@ -33,7 +33,8 @@ import "swagger2" Data.Swagger hiding (Header(..)) type API = "sso" :> APISSO - :<|> "sso-initiate-bind" :> APIAuthReq -- (see comment on 'APIAuthReq') + :<|> "sso-initiate-bind" :> APIAuthReqPrecheck -- (see comment on 'APIAuthReq') + :<|> "sso-initiate-bind" :> APIAuthReq -- (see comment on 'APIAuthReq') :<|> "identity-providers" :> APIIDP :<|> "scim" :> APIScim :<|> OmitDocs :> "i" :> APIINTERNAL From d972d2dfc9d87c14f565b21e7b59b5b96a237152 Mon Sep 17 00:00:00 2001 From: fisx Date: Thu, 17 Oct 2019 11:32:49 +0200 Subject: [PATCH 03/26] spar docs (#871) * docs * Add missing spar integration test cases --- deploy/services-demo/register_idp_internal.sh | 25 +++ docs/reference/spar-braindump.md | 183 ++++++++++++++++++ libs/brig-types/src/Brig/Types/Common.hs | 2 +- .../test-integration/Test/Spar/APISpec.hs | 74 ++++++- 4 files changed, 282 insertions(+), 2 deletions(-) create mode 100755 deploy/services-demo/register_idp_internal.sh create mode 100644 docs/reference/spar-braindump.md diff --git a/deploy/services-demo/register_idp_internal.sh b/deploy/services-demo/register_idp_internal.sh new file mode 100755 index 00000000000..8a142efbad1 --- /dev/null +++ b/deploy/services-demo/register_idp_internal.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +set -e + +# server-side variant of ./register_idp.sh; use if you have ssh access to one of your spar instances. 
+# usage: ./register_idp_internal.sh <metadata file> <z-user uuid>
+
+backend="http://localhost:8080"
+
+metadata_file=$1
+if [ ! -e "${metadata_file}" ]; then
+    echo "*** no metadata: '$1'"
+    exit 80
+fi
+
+z_user=$2
+if [ ! -n "${z_user}" ]; then
+    echo "*** no z_user uuid"
+    exit 80
+fi
+
+which curl >/dev/null || ( echo "*** please install https://curl.haxx.se/ in your path."; exit 81 )
+curl_exe=$(which curl)
+
+${curl_exe} -is -v -XPOST ${backend}/identity-providers -H"Z-User: ${z_user}" -H'Content-type: application/xml' -d@"${metadata_file}"
diff --git a/docs/reference/spar-braindump.md b/docs/reference/spar-braindump.md
new file mode 100644
index 00000000000..23e2d80fd1c
--- /dev/null
+++ b/docs/reference/spar-braindump.md
@@ -0,0 +1,183 @@
+# the spar service for user provisioning (scim) and authentication (saml) - a brain dump
+
+this is a mix of information on implementation details, architecture,
+and operation. it should probably be sorted into different places in
+the future, but if you can't find any more well-structured
+documentation answering your questions, look here!
+
+
+## related documentation
+
+- [list of howtos for supported SAML IdP vendors](https://docs.wire.com/how-to/single-sign-on/index.html)
+- [fragment](https://docs.wire.com/understand/single-sign-on/design.html) (TODO: clean up the section "common misconceptions" below and move it here.)
+- [official docs for team admin from customer support](https://support.wire.com/hc/en-us/categories/360000248538?section=administration%3Fsection%3Dadministration) (skip to "Authentication")
+- [talk scim using curl](https://github.com/wireapp/wire-server/blob/develop/docs/reference/provisioning/scim-via-curl.md)
+- if you want to work on our saml/scim implementation and do not have access to [https://github.com/wireapp/design-specs/tree/master/Single%20Sign%20On], please get in touch with us.
+
+
+## operations
+
+### enabling / disabling the sso feature for a team
+
+if you have sso disabled by default, you need to turn on the feature
+for every team that wants to use it. you can do this in the stern
+service (aka backoffice). look for `get/put
+/teams/{tid}/features/sso`
+
+
+### registering an idp with a team via curl
+
+you need to have:
+
+```sh
+# user id of an admin of the team (or the creator from the team info
+# in the backoffice, if you only have the team id).
+export ADMIN_ID=...
+
+# path of the xml metadata file (if you only have the url, curl it)
+export METADATA_FILE=...
+```
+
+then you need to do:
+
+```sh
+./khan.me scp upload -e prod -r spar -s .../wire-server/deploy/services-demo/register_idp_internal.sh -d ./register.sh
+./khan.me scp upload -e prod -r spar -s ${METADATA_FILE} -d ./metadata.xml
+./khan.me ssh -e prod -r spar
+export TEAM_OWNER_ID=...
+./register.sh metadata.xml $TEAM_OWNER_ID
+```
+
+the output contains a json object representing the idp. construct
+the login code from the `id` field of that object by adding `wire-` in
+front, e.g.:
+
+```
+wire-e97fbe2e-eeb1-11e9-acf3-9ba77d8a04bf
+```
+
+give this login code to the users that you want to connect to wire
+using this idp. see
+[here](https://support.wire.com/hc/en-us/articles/360000954617-Login-with-SSO)
+on how to use the login code.
+
+
+### troubleshooting
+
+#### gathering information
+
+- find metadata for team in table `spar.idp_raw_metadata` via cqlsh
+  (since https://github.com/wireapp/wire-server/pull/872)
+
+- ask user for screenshots of the error message, or even better, for
+  the text.
the error message contains lots of strings that you can
+  grep for in the spar sources.
+
+
+#### making spar work with a new IdP
+
+often, new IdPs work out of the box, because there appears to be some
+consensus about what minimum feature set everybody should support.
+
+if there are problems: collect the metadata xml and an authentication
+response xml (e.g. from the browser http logs, via a more technically
+savvy customer; FUTUREWORK: it would be nice to log all saml response
+xml files that spar receives in prod and cannot process).
+
+https://github.com/wireapp/saml2-web-sso supports writing [unit vendor
+compatibility
+tests](https://github.com/wireapp/saml2-web-sso/blob/ff9b9f445475809d1fa31ef7f2932caa0ed31613/test/Test/SAML2/WebSSO/APISpec.hs#L266-L329)
+against that response value. once that test passes, it should all
+work fine.
+
+
+### common misconceptions
+
+
+#### an email address can be one of two things
+
+the email used for saml auth is only a name: it is never used for
+sending out emails, and does not show as the email address of the user
+in the team settings.
+
+RATIONALE: emails that are passed in from an external identity
+provider would have to be blindly trusted, so the user cannot have them
+as an actual email address that wire is sending emails to.
+
+POSSIBLE FEATURE: we could authenticate the emails sent in from the
+identity provider in the same way we are doing that for
+password-authenticated non-team users: email receives a link
+containing a crypto token, user clicks on link if the email is
+authentic, email gets authenticated.
+
+
+#### scim, provisioning, metadata
+
+for changing the user information (name, handle, email, ...), saml
+isn't enough. the identity management software (AD in this case, or
+some add-on) needs to support scim. we *could* support doing that via
+saml, but the parts of the standards that are needed for that are in
+even worse shape than the ones for the authentication bits, and it
+would not lead to a good user experience. so instead we require users
+to adopt the more robust and contemporary scim standard.
+
+
+#### we don't support binding password/phone-auth'ed users to saml yet
+
+to keep track of whether we have added that, see https://github.com/zinfra/backend-issues/issues/731
+
+
+
+## application logic
+
+### deleting users that exist on spar
+
+For scim- or saml-created users, there are three locations for user data:
+
+- `brig.user` (and a few things associated with that on brig and galley)
+- `spar.user`
+- `spar.scim_user`
+
+The single source of truth is `brig.user`. Dangling entries in the
+other places are allowed, and must be checked by the application code
+for danglingness. ([test case for
+scim](https://github.com/wireapp/wire-server/blob/010ca7e460d13160b465de24dd3982a397f94c16/services/spar/test-integration/Test/Spar/Scim/UserSpec.hs#L239-L308);
+[test case for
+saml](https://github.com/wireapp/wire-server/blob/293518655d7bae60fbcb0c4aaa06034785bfb6fc/services/spar/test-integration/Test/Spar/APISpec.hs#L742-L795))
+
+For the semantics of interesting corner cases, consult [the test
+suite](https://github.com/wireapp/wire-server/blob/develop/services/spar/test-integration/Test/Spar/APISpec.hs).
+If you can't find what you're looking for there, please add at least a
+pending test case explaining what's missing.
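+
+to check an individual user for dangling entries by hand, cqlsh
+queries along the following lines can help. (a sketch only: the table
+names are the three listed above, but the column names here are
+assumptions from memory; double-check them against the brig and spar
+schema migrations before relying on them. `<uid>`, `<issuer>`, and
+`<sso-id>` are placeholders.)
+
+```sh
+# brig is the source of truth; if this comes up empty, everything
+# else for this user is dangling:
+cqlsh -e "SELECT id FROM brig.user WHERE id = <uid>;"
+
+# saml user ref on spar (may dangle; column names are assumptions):
+cqlsh -e "SELECT uid FROM spar.user WHERE issuer = '<issuer>' AND sso_id = '<sso-id>';"
+
+# scim blob on spar (may dangle; column name is an assumption):
+cqlsh -e "SELECT id FROM spar.scim_user WHERE id = <uid>;"
+```
+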
+ +Side note: Users in brig carry an enum type +[`ManagedBy`](https://github.com/wireapp/wire-server/blob/010ca7e460d13160b465de24dd3982a397f94c16/libs/brig-types/src/Brig/Types/Common.hs#L393-L413); +see also {#DevScimOneWaySync}. This is a half-implemented feature for +managing conflicts between changes via scim vs. changes from wire +clients; and does currently not affect deletability of users. + + +#### delete via deleting idp + +[Currently](https://github.com/wireapp/wire-server/blob/010ca7e460d13160b465de24dd3982a397f94c16/services/spar/src/Spar/API.hs#L172-L187), +deleting an IdP does not delete any user data. In particular: + +- cookies of users that have authenticated via an IdP will remain valid if the IdP gets deleted. +- if a user authenticates via an IdP that has been deleted to obtain a new cookie, the login code will not work, and the user will never be able to login again. +- the user will still show in the team settings, and can be manually deleted from there. +- if a new idp is registered, and a user authenticates via that idp, the old user is unreachable. (spar will look up the wire `UserId` under the saml user id that consists partly of the id of the new IdP, come up empty, and [create a fresh user on brig](https://github.com/wireapp/wire-server/blob/010ca7e460d13160b465de24dd3982a397f94c16/services/spar/src/Spar/App.hs#L306).) + + +#### user deletes herself + +TODO + + +#### delete in team settings + +TODO (probably little difference between this and "user deletes herself"?) + + +#### delete via scim + +TODO diff --git a/libs/brig-types/src/Brig/Types/Common.hs b/libs/brig-types/src/Brig/Types/Common.hs index 4f43c0c207c..14937d3a175 100644 --- a/libs/brig-types/src/Brig/Types/Common.hs +++ b/libs/brig-types/src/Brig/Types/Common.hs @@ -391,7 +391,7 @@ codeParser err conv = do -- ManagedBy -- | Who controls changes to the user profile (where the profile is defined as "all --- user-editable, user-visible attributes"). +-- user-editable, user-visible attributes"). See {#DevScimOneWaySync}. data ManagedBy -- | The profile can be changed in-app; user doesn't show up via SCIM at all. 
= ManagedByWire diff --git a/services/spar/test-integration/Test/Spar/APISpec.hs b/services/spar/test-integration/Test/Spar/APISpec.hs index 372d15fefc1..c9bb6591a2e 100644 --- a/services/spar/test-integration/Test/Spar/APISpec.hs +++ b/services/spar/test-integration/Test/Spar/APISpec.hs @@ -5,6 +5,7 @@ import Imports hiding (head) import Bilge import Brig.Types.User import Control.Lens +import Control.Monad.Random.Class (getRandomR) import Data.Aeson import Data.Aeson.Lens import Data.ByteString.Conversion @@ -12,7 +13,7 @@ import Data.Id import Data.Proxy import Data.String.Conversions import Data.UUID as UUID hiding (null, fromByteString) -import Network.HTTP.Types (status200) +import Network.HTTP.Types (status200, status202) import SAML2.WebSSO as SAML import SAML2.WebSSO.Test.Lenses import SAML2.WebSSO.Test.MockResponse @@ -38,6 +39,7 @@ spec = do specFinalizeLogin specBindingUsers specCRUDIdentityProvider + specDeleteCornerCases specScimAndSAML specAux @@ -735,6 +737,76 @@ specCRUDIdentityProvider = do liftIO $ idp `shouldBe` idp' +specDeleteCornerCases :: SpecWith TestEnv +specDeleteCornerCases = describe "delete corner cases" $ do + it "create user1 via idp1 (saml); delete user1; create user via newly created idp2 (saml)" $ do + pending + + it "create user1 via saml; delete user1; create via scim (in same team)" $ do + pending + + it "create user1 via saml; delete user1; create via password (outside team)" $ do + pending + + it "delete idp; create idp with same issuer id" $ do + pending + + -- clone of 'testScimCreateVsUserRef', without the scim part: Create a user implicitly via + -- saml login; remove it via brig leaving a dangling entry in @spar.user@; create it via saml + -- login once more. This should work despite the dangling database entry. + it "re-create previously deleted, dangling users" $ do + (_ownerid, _teamid, idp) <- registerTestIdP + uname :: SAML.UnqualifiedNameID <- do + suffix <- cs <$> replicateM 7 (getRandomR ('0', '9')) + either (error . show) pure $ + SAML.mkUNameIDEmail ("email_" <> suffix <> "@example.com") + let uref = SAML.UserRef tenant subj + subj = either (error . show) id $ SAML.mkNameID uname Nothing Nothing Nothing + tenant = idp ^. SAML.idpMetadata . SAML.edIssuer + + !(Just !uid) <- createViaSaml idp uref + samlUserShouldSatisfy uref isJust + + deleteViaBrig uid + samlUserShouldSatisfy uref isJust -- brig doesn't talk to spar right now when users + -- are deleted there. we need to work around this + -- fact for now. (if the test fails here, this may + -- mean that you fixed the behavior and can + -- change this to 'isNothing'.) 
+ + (Just _) <- createViaSaml idp uref + samlUserShouldSatisfy uref isJust + where + samlUserShouldSatisfy :: HasCallStack => SAML.UserRef -> (Maybe UserId -> Bool) -> TestSpar () + samlUserShouldSatisfy uref property = do + muid <- getUserIdViaRef' uref + liftIO $ muid `shouldSatisfy` property + + createViaSamlResp :: HasCallStack => IdP -> SAML.UserRef -> TestSpar ResponseLBS + createViaSamlResp idp (SAML.UserRef _ subj) = do + (privCreds, authnReq) <- negotiateAuthnRequest idp + spmeta <- getTestSPMetadata + authnResp <- runSimpleSP $ + mkAuthnResponseWithSubj subj privCreds idp spmeta authnReq True + createResp <- submitAuthnResponse authnResp + liftIO $ responseStatus createResp `shouldBe` status200 + pure createResp + + createViaSaml :: HasCallStack => IdP -> SAML.UserRef -> TestSpar (Maybe UserId) + createViaSaml idp uref = do + resp <- createViaSamlResp idp uref + liftIO $ do + maybe (error "no body") cs (responseBody resp) + `shouldContain` "wire:sso:success" + getUserIdViaRef' uref + + deleteViaBrig :: UserId -> TestSpar () + deleteViaBrig uid = do + brig <- view teBrig + resp <- (call . delete $ brig . paths ["i", "users", toByteString' uid]) + liftIO $ responseStatus resp `shouldBe` status202 + + specScimAndSAML :: SpecWith TestEnv specScimAndSAML = do it "SCIM and SAML work together and SCIM-created users can login" $ do From 9575ca3947ea14eeaced902a832c64a46c63870b Mon Sep 17 00:00:00 2001 From: jschaul Date: Thu, 17 Oct 2019 12:04:54 +0200 Subject: [PATCH 04/26] rename docker-ephemeral to docker_ephemeral (#880) Fixes #615 --- .gitignore | 16 ++++++++-------- README.md | 2 +- .../build/Makefile | 0 .../build/README.md | 0 .../db-migrate/brig-index.sh | 0 .../db-migrate/brig-schema.sh | 0 .../db-migrate/galley-schema.sh | 0 .../db-migrate/gundeck-schema.sh | 0 .../db-migrate/spar-schema.sh | 0 .../docker-compose.yaml | 0 .../init.sh | 0 .../run.sh | 0 deploy/services-demo/README.md | 2 +- deploy/services-demo/demo.sh | 4 ++-- deploy/services-demo/docker-compose.yaml | 18 +++++++++--------- services/brig/brig.integration.yaml | 2 +- services/integration.sh | 4 ++-- 17 files changed, 24 insertions(+), 24 deletions(-) rename deploy/{docker-ephemeral => dockerephemeral}/build/Makefile (100%) rename deploy/{docker-ephemeral => dockerephemeral}/build/README.md (100%) rename deploy/{docker-ephemeral => dockerephemeral}/db-migrate/brig-index.sh (100%) rename deploy/{docker-ephemeral => dockerephemeral}/db-migrate/brig-schema.sh (100%) rename deploy/{docker-ephemeral => dockerephemeral}/db-migrate/galley-schema.sh (100%) rename deploy/{docker-ephemeral => dockerephemeral}/db-migrate/gundeck-schema.sh (100%) rename deploy/{docker-ephemeral => dockerephemeral}/db-migrate/spar-schema.sh (100%) rename deploy/{docker-ephemeral => dockerephemeral}/docker-compose.yaml (100%) rename deploy/{docker-ephemeral => dockerephemeral}/init.sh (100%) rename deploy/{docker-ephemeral => dockerephemeral}/run.sh (100%) diff --git a/.gitignore b/.gitignore index 28cbc6beab1..50a4d5aff20 100644 --- a/.gitignore +++ b/.gitignore @@ -63,14 +63,14 @@ swagger-ui deploy/services-demo/resources/templates/* deploy/services-demo/conf/nginz/zwagger-ui/* -deploy/docker-ephemeral/build/airdock_base-all/ -deploy/docker-ephemeral/build/airdock_base/ -deploy/docker-ephemeral/build/airdock_fakesqs-all/ -deploy/docker-ephemeral/build/airdock_fakesqs/ -deploy/docker-ephemeral/build/airdock_rvm-all/ -deploy/docker-ephemeral/build/airdock_rvm/ -deploy/docker-ephemeral/build/dynamodb_local/ -deploy/docker-ephemeral/build/smtp/ 
+deploy/dockerephemeral/build/airdock_base-all/ +deploy/dockerephemeral/build/airdock_base/ +deploy/dockerephemeral/build/airdock_fakesqs-all/ +deploy/dockerephemeral/build/airdock_fakesqs/ +deploy/dockerephemeral/build/airdock_rvm-all/ +deploy/dockerephemeral/build/airdock_rvm/ +deploy/dockerephemeral/build/dynamodb_local/ +deploy/dockerephemeral/build/smtp/ # Ignore cabal files; use package.yaml instead *.cabal diff --git a/README.md b/README.md index 593458473ec..d43f8cfe905 100644 --- a/README.md +++ b/README.md @@ -143,7 +143,7 @@ Integration tests require all of the haskell services (brig, galley, cannon, gun Setting up these real, but in-memory internal and "fake" external dependencies is done easiest using [`docker-compose`](https://docs.docker.com/compose/install/). Run the following in a separate terminal (it will block that terminal, C-c to shut all these docker images down again): ``` -deploy/docker-ephemeral/run.sh +deploy/dockerephemeral/run.sh ``` Then, to run all integration tests: diff --git a/deploy/docker-ephemeral/build/Makefile b/deploy/dockerephemeral/build/Makefile similarity index 100% rename from deploy/docker-ephemeral/build/Makefile rename to deploy/dockerephemeral/build/Makefile diff --git a/deploy/docker-ephemeral/build/README.md b/deploy/dockerephemeral/build/README.md similarity index 100% rename from deploy/docker-ephemeral/build/README.md rename to deploy/dockerephemeral/build/README.md diff --git a/deploy/docker-ephemeral/db-migrate/brig-index.sh b/deploy/dockerephemeral/db-migrate/brig-index.sh similarity index 100% rename from deploy/docker-ephemeral/db-migrate/brig-index.sh rename to deploy/dockerephemeral/db-migrate/brig-index.sh diff --git a/deploy/docker-ephemeral/db-migrate/brig-schema.sh b/deploy/dockerephemeral/db-migrate/brig-schema.sh similarity index 100% rename from deploy/docker-ephemeral/db-migrate/brig-schema.sh rename to deploy/dockerephemeral/db-migrate/brig-schema.sh diff --git a/deploy/docker-ephemeral/db-migrate/galley-schema.sh b/deploy/dockerephemeral/db-migrate/galley-schema.sh similarity index 100% rename from deploy/docker-ephemeral/db-migrate/galley-schema.sh rename to deploy/dockerephemeral/db-migrate/galley-schema.sh diff --git a/deploy/docker-ephemeral/db-migrate/gundeck-schema.sh b/deploy/dockerephemeral/db-migrate/gundeck-schema.sh similarity index 100% rename from deploy/docker-ephemeral/db-migrate/gundeck-schema.sh rename to deploy/dockerephemeral/db-migrate/gundeck-schema.sh diff --git a/deploy/docker-ephemeral/db-migrate/spar-schema.sh b/deploy/dockerephemeral/db-migrate/spar-schema.sh similarity index 100% rename from deploy/docker-ephemeral/db-migrate/spar-schema.sh rename to deploy/dockerephemeral/db-migrate/spar-schema.sh diff --git a/deploy/docker-ephemeral/docker-compose.yaml b/deploy/dockerephemeral/docker-compose.yaml similarity index 100% rename from deploy/docker-ephemeral/docker-compose.yaml rename to deploy/dockerephemeral/docker-compose.yaml diff --git a/deploy/docker-ephemeral/init.sh b/deploy/dockerephemeral/init.sh similarity index 100% rename from deploy/docker-ephemeral/init.sh rename to deploy/dockerephemeral/init.sh diff --git a/deploy/docker-ephemeral/run.sh b/deploy/dockerephemeral/run.sh similarity index 100% rename from deploy/docker-ephemeral/run.sh rename to deploy/dockerephemeral/run.sh diff --git a/deploy/services-demo/README.md b/deploy/services-demo/README.md index d82818c905d..16654b2a8d3 100644 --- a/deploy/services-demo/README.md +++ b/deploy/services-demo/README.md @@ -7,7 +7,7 @@ Use 2 
different terminals and run: ``` # On terminal 1, start the dependencies. Note that you should turn up the max memory # limit of docker. More on https://github.com/wireapp/wire-server/issues/326 -deploy/docker-ephemeral/run.sh +deploy/dockerephemeral/run.sh ``` ``` diff --git a/deploy/services-demo/demo.sh b/deploy/services-demo/demo.sh index 46c6936dd14..e21b0a9eb9e 100755 --- a/deploy/services-demo/demo.sh +++ b/deploy/services-demo/demo.sh @@ -73,7 +73,7 @@ function check_prerequisites() { nc -z 127.0.0.1 9042 \ && nc -z 127.0.0.1 9200 \ && nc -z 127.0.0.1 6379 \ - || { echo "Databases not up. Maybe run 'deploy/docker-ephemeral/run.sh' in a separate terminal first?"; exit 1; } + || { echo "Databases not up. Maybe run 'deploy/dockerephemeral/run.sh' in a separate terminal first?"; exit 1; } if [ "$docker_deployment" = "false" ]; then test -f ${DIR}/../dist/brig \ && test -f ${DIR}/../dist/galley \ @@ -127,7 +127,7 @@ function copy_nginz_configs() { } # brig,gundeck,galley use the amazonka library's 'Discover', which expects AWS credentials -# even if those are not used/can be dummy values with the fake sqs/ses/etc containers used (see deploy/docker-ephemeral/docker-compose.yaml) +# even if those are not used/can be dummy values with the fake sqs/ses/etc containers used (see deploy/dockerephemeral/docker-compose.yaml) export AWS_REGION=${AWS_REGION:-eu-west-1} export AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID:-dummy} export AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY:-dummy} diff --git a/deploy/services-demo/docker-compose.yaml b/deploy/services-demo/docker-compose.yaml index 5435a02de02..62ff5165f03 100644 --- a/deploy/services-demo/docker-compose.yaml +++ b/deploy/services-demo/docker-compose.yaml @@ -1,5 +1,5 @@ networks: - docker-ephemeral_demo_wire: + dockerephemeral_demo_wire: external: true version: '2' @@ -30,7 +30,7 @@ services: - demo_wire_sqs:sqs - demo_wire_smtp:smtp networks: - - docker-ephemeral_demo_wire + - dockerephemeral_demo_wire galley: image: quay.io/wire/galley @@ -50,7 +50,7 @@ services: external_links: - demo_wire_cassandra:cassandra networks: - - docker-ephemeral_demo_wire + - dockerephemeral_demo_wire gundeck: image: quay.io/wire/gundeck @@ -72,7 +72,7 @@ services: - demo_wire_sqs:sqs - demo_wire_localstack:sns networks: - - docker-ephemeral_demo_wire + - dockerephemeral_demo_wire cannon: image: quay.io/wire/cannon @@ -86,7 +86,7 @@ services: - /configs/conf/cannon.demo-docker.yaml working_dir: /configs networks: - - docker-ephemeral_demo_wire + - dockerephemeral_demo_wire cargohold: image: quay.io/wire/cargohold @@ -102,7 +102,7 @@ services: external_links: - demo_wire_s3:s3 networks: - - docker-ephemeral_demo_wire + - dockerephemeral_demo_wire proxy: image: quay.io/wire/proxy @@ -116,7 +116,7 @@ services: - /configs/conf/proxy.demo.yaml working_dir: /configs networks: - - docker-ephemeral_demo_wire + - dockerephemeral_demo_wire spar: image: quay.io/wire/spar @@ -132,7 +132,7 @@ services: external_links: - demo_wire_cassandra:cassandra networks: - - docker-ephemeral_demo_wire + - dockerephemeral_demo_wire nginz: image: quay.io/wire/nginz @@ -157,4 +157,4 @@ services: - /configs/conf/nginz/nginx-docker.conf working_dir: /configs networks: - - docker-ephemeral_demo_wire + - dockerephemeral_demo_wire diff --git a/services/brig/brig.integration.yaml b/services/brig/brig.integration.yaml index 69aa47ead48..ca5d4b37808 100644 --- a/services/brig/brig.integration.yaml +++ b/services/brig/brig.integration.yaml @@ -24,7 +24,7 @@ gundeck: host: 127.0.0.1 port: 8086 -# You 
can set up local SQS/Dynamo running e.g. `../../deploy/docker-ephemeral/run.sh` +# You can set up local SQS/Dynamo running e.g. `../../deploy/dockerephemeral/run.sh` aws: userJournalQueue: integration-user-events.fifo # ^ Comment this out if you don't want to journal user events diff --git a/services/integration.sh b/services/integration.sh index e40c67d2643..bae017d675d 100755 --- a/services/integration.sh +++ b/services/integration.sh @@ -37,7 +37,7 @@ function check_prerequisites() { if ! ( nc -z 127.0.0.1 9042 \ && nc -z 127.0.0.1 9200 \ && nc -z 127.0.0.1 6379 ); then - echo "Databases not up. Maybe run 'deploy/docker-ephemeral/run.sh' in a separate terminal first?"; exit 1; + echo "Databases not up. Maybe run 'deploy/dockerephemeral/run.sh' in a separate terminal first?"; exit 1; fi if [ ! -f "${TOP_LEVEL}/dist/brig" ] \ && [ ! -f "${TOP_LEVEL}/dist/galley" ] \ @@ -65,7 +65,7 @@ if [[ $INTEGRATION_USE_REAL_AWS -eq 1 ]]; then else # brig,gundeck,galley use the amazonka library's 'Discover', which expects AWS credentials # even if those are not used/can be dummy values with the fake sqs/ses/etc containers used - # (see deploy/docker-ephemeral/docker-compose.yaml ) + # (see deploy/dockerephemeral/docker-compose.yaml ) echo 'Running tests using mocked AWS services' export AWS_REGION=eu-west-1 export AWS_ACCESS_KEY_ID=dummykey From e5e2207ed92af454b70d29b3dad1a8e633c1a6d0 Mon Sep 17 00:00:00 2001 From: fisx Date: Thu, 17 Oct 2019 12:47:50 +0200 Subject: [PATCH 05/26] Store raw idp metadata with typed details in c* (#872) --- services/spar/package.yaml | 1 + services/spar/schema/src/Main.hs | 2 + services/spar/schema/src/V6.hs | 18 +++++++++ services/spar/src/Spar/API.hs | 17 ++++++-- services/spar/src/Spar/API/Swagger.hs | 3 ++ services/spar/src/Spar/API/Types.hs | 5 ++- services/spar/src/Spar/Data.hs | 30 +++++++++++++- services/spar/src/Spar/Types.hs | 39 ++++++++++++------- .../test-integration/Test/Spar/APISpec.hs | 27 +++++++++---- services/spar/test-integration/Util/Core.hs | 10 +++++ services/spar/test/Arbitrary.hs | 5 ++- services/spar/test/Test/Spar/APISpec.hs | 5 ++- 12 files changed, 133 insertions(+), 29 deletions(-) create mode 100644 services/spar/schema/src/V6.hs diff --git a/services/spar/package.yaml b/services/spar/package.yaml index 93fbee9f640..ee3d1d1d50c 100644 --- a/services/spar/package.yaml +++ b/services/spar/package.yaml @@ -45,6 +45,7 @@ dependencies: - http-api-data - http-client - http-client-tls + - http-media - http-types - imports - insert-ordered-containers diff --git a/services/spar/schema/src/Main.hs b/services/spar/schema/src/Main.hs index e443d29fed0..9d9988d5e79 100644 --- a/services/spar/schema/src/Main.hs +++ b/services/spar/schema/src/Main.hs @@ -13,6 +13,7 @@ import qualified V2 import qualified V3 import qualified V4 import qualified V5 +import qualified V6 main :: IO () main = do @@ -27,6 +28,7 @@ main = do , V3.migration , V4.migration , V5.migration + , V6.migration -- When adding migrations here, don't forget to update -- 'schemaVersion' in Spar.Data diff --git a/services/spar/schema/src/V6.hs b/services/spar/schema/src/V6.hs new file mode 100644 index 00000000000..2f26b02a147 --- /dev/null +++ b/services/spar/schema/src/V6.hs @@ -0,0 +1,18 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE QuasiQuotes #-} + +module V6 (migration) where + +import Imports +import Cassandra.Schema +import Text.RawString.QQ + +migration :: Migration +migration = Migration 6 "Store raw XML metadata" $ do + void $ schema' [r| + CREATE TABLE if not exists 
idp_raw_metadata + ( id uuid + , metadata text + , primary key (id) + ) with compaction = {'class': 'LeveledCompactionStrategy'}; + |] diff --git a/services/spar/src/Spar/API.hs b/services/spar/src/Spar/API.hs index cccae4a5390..3c503e3ba83 100644 --- a/services/spar/src/Spar/API.hs +++ b/services/spar/src/Spar/API.hs @@ -74,6 +74,7 @@ apiSSO opts apiIDP :: ServerT APIIDP Spar apiIDP = idpGet + :<|> idpGetRaw :<|> idpGetAll :<|> idpCreate :<|> idpDelete @@ -164,6 +165,14 @@ idpGet zusr idpid = withDebugLog "idpGet" (Just . show . (^. SAML.idpId)) $ do authorizeIdP zusr idp pure idp +idpGetRaw :: Maybe UserId -> SAML.IdPId -> Spar RawIdPMetadata +idpGetRaw zusr idpid = do + idp <- SAML.getIdPConfig idpid + authorizeIdP zusr idp + wrapMonadClient (Data.getIdPRawMetadata idpid) >>= \case + Just txt -> pure $ RawIdPMetadata txt + Nothing -> throwSpar SparNotFound + idpGetAll :: Maybe UserId -> Spar IdPList idpGetAll zusr = withDebugLog "idpGetAll" (const Nothing) $ do teamid <- Intra.getZUsrOwnedTeam zusr @@ -185,21 +194,23 @@ idpDelete zusr idpid = withDebugLog "idpDelete" (const Nothing) $ do when (stiIdP == Just idpid) $ Data.deleteScimToken team stiId -- Delete IdP config Data.deleteIdPConfig idpid issuer team + Data.deleteIdPRawMetadata idpid return NoContent -- | We generate a new UUID for each IdP used as IdPConfig's path, thereby ensuring uniqueness. -idpCreateXML :: Maybe UserId -> SAML.IdPMetadata -> Spar IdP -idpCreateXML zusr idpmeta = withDebugLog "idpCreate" (Just . show . (^. SAML.idpId)) $ do +idpCreateXML :: Maybe UserId -> Text -> SAML.IdPMetadata -> Spar IdP +idpCreateXML zusr raw idpmeta = withDebugLog "idpCreate" (Just . show . (^. SAML.idpId)) $ do teamid <- Intra.getZUsrOwnedTeam zusr Galley.assertSSOEnabled teamid idp <- validateNewIdP idpmeta teamid + wrapMonadClient $ Data.storeIdPRawMetadata (idp ^. SAML.idpId) raw SAML.storeIdPConfig idp pure idp -- | This handler only does the json parsing, and leaves all authorization checks and -- application logic to 'idpCreateXML'. 
idpCreate :: Maybe UserId -> IdPMetadataInfo -> Spar IdP -idpCreate zusr (IdPMetadataValue xml) = idpCreateXML zusr xml +idpCreate zusr (IdPMetadataValue raw xml) = idpCreateXML zusr raw xml withDebugLog :: SAML.SP m => String -> (a -> Maybe String) -> m a -> m a diff --git a/services/spar/src/Spar/API/Swagger.hs b/services/spar/src/Spar/API/Swagger.hs index dc07411ef30..078756d0e63 100644 --- a/services/spar/src/Spar/API/Swagger.hs +++ b/services/spar/src/Spar/API/Swagger.hs @@ -157,3 +157,6 @@ instance ToParamSchema BindCookie where instance ToSchema Void where declareNamedSchema _ = declareNamedSchema (Proxy @String) + +instance ToSchema RawIdPMetadata where + declareNamedSchema _ = declareNamedSchema (Proxy @String) diff --git a/services/spar/src/Spar/API/Types.hs b/services/spar/src/Spar/API/Types.hs index 902bff25381..95802beecf9 100644 --- a/services/spar/src/Spar/API/Types.hs +++ b/services/spar/src/Spar/API/Types.hs @@ -132,13 +132,16 @@ type APIAuthResp type APIIDP = Header "Z-User" UserId :> IdpGet + :<|> Header "Z-User" UserId :> IdpGetRaw :<|> Header "Z-User" UserId :> IdpGetAll :<|> Header "Z-User" UserId :> IdpCreate :<|> Header "Z-User" UserId :> IdpDelete +type IdpGetRaw = Capture "id" SAML.IdPId :> "raw" :> Get '[RawXML] RawIdPMetadata + type IdpGet = Capture "id" SAML.IdPId :> Get '[JSON] IdP type IdpGetAll = Get '[JSON] IdPList -type IdpCreate = ReqBodyCustomError '[SAML.XML, JSON] "wai-error" IdPMetadataInfo :> PostCreated '[JSON] IdP +type IdpCreate = ReqBodyCustomError '[RawXML, JSON] "wai-error" IdPMetadataInfo :> PostCreated '[JSON] IdP type IdpDelete = Capture "id" SAML.IdPId :> DeleteNoContent '[JSON] NoContent instance MakeCustomError "wai-error" IdPMetadataInfo where diff --git a/services/spar/src/Spar/Data.hs b/services/spar/src/Spar/Data.hs index 35e4a395242..ba6d3d9dd67 100644 --- a/services/spar/src/Spar/Data.hs +++ b/services/spar/src/Spar/Data.hs @@ -31,6 +31,9 @@ module Spar.Data , getIdPConfigsByTeam , deleteIdPConfig , deleteTeam + , storeIdPRawMetadata + , getIdPRawMetadata + , deleteIdPRawMetadata -- * SCIM auth , insertScimToken @@ -70,7 +73,7 @@ import qualified Web.Scim.Class.User as ScimC.User -- | A lower bound: @schemaVersion <= whatWeFoundOnCassandra@, not @==@. schemaVersion :: Int32 -schemaVersion = 5 +schemaVersion = 6 ---------------------------------------------------------------------- @@ -382,6 +385,31 @@ deleteTeam team = do deleteSAMLUsersByIssuer issuer deleteIdPConfig idpid issuer team +storeIdPRawMetadata + :: (HasCallStack, MonadClient m) + => SAML.IdPId -> ST -> m () +storeIdPRawMetadata idp raw = retry x5 . write ins $ params Quorum (idp, raw) + where + ins :: PrepQuery W (SAML.IdPId, ST) () + ins = "INSERT INTO idp_raw_metadata (id, metadata) VALUES (?, ?)" + +getIdPRawMetadata + :: (HasCallStack, MonadClient m) + => SAML.IdPId -> m (Maybe ST) +getIdPRawMetadata idp = runIdentity <$$> + (retry x1 . query1 sel $ params Quorum (Identity idp)) + where + sel :: PrepQuery R (Identity SAML.IdPId) (Identity ST) + sel = "SELECT metadata FROM idp_raw_metadata WHERE id = ?" + +deleteIdPRawMetadata + :: (HasCallStack, MonadClient m) + => SAML.IdPId -> m () +deleteIdPRawMetadata idp = retry x5 . write del $ params Quorum (Identity idp) + where + del :: PrepQuery W (Identity SAML.IdPId) () + del = "DELETE FROM idp_raw_metadata WHERE id = ?" 
+ ---------------------------------------------------------------------- -- SCIM auth -- diff --git a/services/spar/src/Spar/Types.hs b/services/spar/src/Spar/Types.hs index 425ccf7654b..90943d3a745 100644 --- a/services/spar/src/Spar/Types.hs +++ b/services/spar/src/Spar/Types.hs @@ -6,28 +6,31 @@ module Spar.Types where import Imports + import Control.Lens (makeLenses) import Control.Monad.Except import Data.Aeson import Data.Aeson.TH -import Data.Id (TeamId, UserId, ScimTokenId) import Data.ByteString.Conversion +import Data.Id (TeamId, UserId, ScimTokenId) import Data.Json.Util -import Data.Text.Encoding (encodeUtf8) import Data.Proxy (Proxy(Proxy)) import Data.String.Conversions import Data.String.Conversions (ST) +import Data.Text.Encoding (encodeUtf8) import Data.Time import GHC.TypeLits (KnownSymbol, symbolVal) import GHC.Types (Symbol) +import Network.HTTP.Media ((//)) import SAML2.Util (renderURI, parseURI') import SAML2.WebSSO (IdPConfig, IdPId, ID, AuthnRequest, Assertion, SimpleSetCookie) import SAML2.WebSSO.Types.TH (deriveJSONOptions) +import Servant.API as Servant hiding (MkLink, URI(..)) +import System.Logger.Extended (LogFormat) import URI.ByteString import Util.Options import Web.Cookie import Web.HttpApiData -import System.Logger.Extended (LogFormat) import qualified Data.ByteString.Builder as Builder import qualified Data.Text as ST @@ -67,24 +70,34 @@ deriveJSON deriveJSONOptions ''IdPList -- implement @{"uri": , "cert": }@. check both the certificate we get -- from the server against the pinned one and the metadata url in the metadata against the one -- we fetched the xml from, but it's unclear what the benefit would be.) -newtype IdPMetadataInfo = IdPMetadataValue SAML.IdPMetadata +data IdPMetadataInfo = IdPMetadataValue Text SAML.IdPMetadata deriving (Eq, Show, Generic) -instance SAML.HasXMLRoot IdPMetadataInfo where - renderRoot = error "instance SAML.HasXML IdPMetadataInfo: render not implemented" - -- FUTUREWORK: split up HasXML in saml-web-sso into FromXML and ToXML, then we probably - -- can actually not implement this (this even as an error). should be a nice, - -- backwards-compatible change! +-- | We want to store the raw xml text from the registration request in the database for +-- trouble shooting, but @SAML.XML@ only gives us access to the xml tree, not the raw text. +-- 'RawXML' helps with that. +data RawXML + +instance Accept RawXML where + contentType Proxy = "application" // "xml" + +instance MimeUnrender RawXML IdPMetadataInfo where + mimeUnrender Proxy raw = IdPMetadataValue (cs raw) <$> mimeUnrender (Proxy @SAML.XML) raw + +instance MimeRender RawXML RawIdPMetadata where + mimeRender Proxy (RawIdPMetadata raw) = cs raw + +newtype RawIdPMetadata = RawIdPMetadata Text + deriving (Eq, Show, Generic) -instance SAML.HasXML IdPMetadataInfo where - parse = fmap IdPMetadataValue . SAML.parse instance FromJSON IdPMetadataInfo where parseJSON = withObject "IdPMetadataInfo" $ \obj -> do - either fail (pure . IdPMetadataValue) . SAML.decode =<< (obj .: "value") + raw <- obj .: "value" + either fail (pure . 
IdPMetadataValue raw) (SAML.decode (cs raw)) instance ToJSON IdPMetadataInfo where - toJSON (IdPMetadataValue xml) = + toJSON (IdPMetadataValue _ xml) = object [ "value" .= SAML.encode xml ] diff --git a/services/spar/test-integration/Test/Spar/APISpec.hs b/services/spar/test-integration/Test/Spar/APISpec.hs index c9bb6591a2e..9767093647c 100644 --- a/services/spar/test-integration/Test/Spar/APISpec.hs +++ b/services/spar/test-integration/Test/Spar/APISpec.hs @@ -24,6 +24,7 @@ import Util.Core import Util.Types import qualified Data.ByteString.Builder as LB +import qualified Data.Text as ST import qualified Data.ZAuth.Token as ZAuth import qualified Galley.Types.Teams as Galley import qualified Spar.Intra.Brig as Intra @@ -298,8 +299,8 @@ specBindingUsers = describe "binding existing users to sso identities" $ do . expect2xx ) - let checkInitiateLogin :: HasCallStack => Bool -> TestSpar UserId -> SpecWith TestEnv - checkInitiateLogin hasZUser createUser = do + let checkInitiateBind :: HasCallStack => Bool -> TestSpar UserId -> SpecWith TestEnv + checkInitiateBind hasZUser createUser = do let testmsg = if hasZUser then "responds with 200 and a bind cookie" else "responds with 403 and 'bind-without-auth'" @@ -333,13 +334,13 @@ specBindingUsers = describe "binding existing users to sso identities" $ do describe "GET /sso-initiate-bind/:idp" $ do context "known IdP, running session without authentication" $ do - checkInitiateLogin False (fmap fst . call . createRandomPhoneUser =<< asks (^. teBrig)) + checkInitiateBind False (fmap fst . call . createRandomPhoneUser =<< asks (^. teBrig)) context "known IdP, running session with non-sso user" $ do - checkInitiateLogin True (fmap fst . call . createRandomPhoneUser =<< asks (^. teBrig)) + checkInitiateBind True (fmap fst . call . createRandomPhoneUser =<< asks (^. teBrig)) context "known IdP, running session with sso user" $ do - checkInitiateLogin True (registerTestIdP >>= \(_, _, idp) -> loginSsoUserFirstTime idp) + checkInitiateBind True (registerTestIdP >>= \(_, _, idp) -> loginSsoUserFirstTime idp) describe "POST /sso/finalize-login" $ do let checkGrantingAuthnResp :: HasCallStack => UserId -> SignedAuthnResponse -> ResponseLBS -> TestSpar () @@ -619,6 +620,8 @@ specCRUDIdentityProvider = do `shouldRespondWith` \resp -> statusCode resp < 300 callIdpGet' (env ^. teSpar) (Just userid) idpid `shouldRespondWith` checkErr (== 404) "not-found" + callIdpGetRaw' (env ^. teSpar) (Just userid) idpid + `shouldRespondWith` checkErr (== 404) "not-found" context "with email" $ it "responds with 2xx and removes IdP" $ do env <- ask @@ -714,7 +717,11 @@ specCRUDIdentityProvider = do metadata <- makeTestIdPMetadata idp <- call $ callIdpCreate (env ^. teSpar) (Just owner) metadata idp' <- call $ callIdpGet (env ^. teSpar) (Just owner) (idp ^. idpId) - liftIO $ idp `shouldBe` idp' + rawmeta <- call $ callIdpGetRaw (env ^. teSpar) (Just owner) (idp ^. idpId) + liftIO $ do + idp `shouldBe` idp' + let prefix = " makeTestIdPMetadata + metadata <- Data.Aeson.encode . (IdPMetadataValue mempty) <$> makeTestIdPMetadata idp <- call $ callIdpCreateRaw (env ^. teSpar) (Just owner) "application/json" metadata idp' <- call $ callIdpGet (env ^. teSpar) (Just owner) (idp ^. idpId) - liftIO $ idp `shouldBe` idp' + rawmeta <- call $ callIdpGetRaw (env ^. teSpar) (Just owner) (idp ^. idpId) + liftIO $ do + idp `shouldBe` idp' + let prefix = " SparReq -> Maybe UserId -> SAML.IdPId callIdpGet' sparreq_ muid idpid = do get $ sparreq_ . maybe id zUser muid . 
path (cs $ "/identity-providers/" -/ SAML.idPIdToST idpid) +callIdpGetRaw :: (MonadIO m, MonadHttp m) => SparReq -> Maybe UserId -> SAML.IdPId -> m Text +callIdpGetRaw sparreq_ muid idpid = do + resp <- callIdpGetRaw' (sparreq_ . expect2xx) muid idpid + maybe (liftIO . throwIO $ ErrorCall "Nothing") (pure . cs) (responseBody resp) + +callIdpGetRaw' :: (MonadIO m, MonadHttp m) => SparReq -> Maybe UserId -> SAML.IdPId -> m ResponseLBS +callIdpGetRaw' sparreq_ muid idpid = do + get $ sparreq_ . maybe id zUser muid . path (cs $ "/identity-providers/" -/ SAML.idPIdToST idpid -/ "raw") + callIdpGetAll :: (MonadIO m, MonadHttp m) => SparReq -> Maybe UserId -> m IdPList callIdpGetAll sparreq_ muid = do resp <- callIdpGetAll' (sparreq_ . expect2xx) muid diff --git a/services/spar/test/Arbitrary.hs b/services/spar/test/Arbitrary.hs index ad3b706c82a..09393b2abed 100644 --- a/services/spar/test/Arbitrary.hs +++ b/services/spar/test/Arbitrary.hs @@ -11,6 +11,7 @@ import Data.Proxy import "swagger2" Data.Swagger hiding (Header(..)) import Data.Aeson import Data.Id () +import Data.String.Conversions (cs) import SAML2.WebSSO.Test.Arbitrary () import Servant.API.ContentTypes import Spar.Types @@ -46,7 +47,9 @@ instance Arbitrary NoContent where arbitrary = pure NoContent instance Arbitrary IdPMetadataInfo where - arbitrary = IdPMetadataValue <$> arbitrary + arbitrary = do + mdata <- arbitrary + pure $ IdPMetadataValue (cs $ encode mdata) mdata -- This is not required by the servant-server instances, but the swagger -- tests want it. See https://github.com/haskell-servant/servant-swagger/issues/58 diff --git a/services/spar/test/Test/Spar/APISpec.hs b/services/spar/test/Test/Spar/APISpec.hs index 471b92cb600..1696ee29122 100644 --- a/services/spar/test/Test/Spar/APISpec.hs +++ b/services/spar/test/Test/Spar/APISpec.hs @@ -11,7 +11,7 @@ import Data.Metrics.Test (pathsConsistencyCheck) import Data.Proxy (Proxy(Proxy)) import Servant.Swagger (validateEveryToJSON) import Spar.API as API -import Spar.Types (IdPMetadataInfo) +import Spar.Types (IdPMetadataInfo(IdPMetadataValue)) import Test.Hspec (Spec, it, shouldBe) import Test.QuickCheck (property) @@ -25,4 +25,5 @@ spec = do pathsConsistencyCheck (routesToPaths @API.API) `shouldBe` mempty it "roundtrip: IdPMetadataInfo" . property $ \(val :: IdPMetadataInfo) -> do - (eitherDecode . encode) val `shouldBe` Right val + let withoutRaw (IdPMetadataValue _ x) = x + (withoutRaw <$> (eitherDecode . encode) val) `shouldBe` Right (withoutRaw val) From 4a78cd24b79048c6e3a4bd11c6b5f0fda1ead992 Mon Sep 17 00:00:00 2001 From: fisx Date: Fri, 18 Oct 2019 13:52:18 +0200 Subject: [PATCH 06/26] Tweak invite script (#877) * Do not have default args that never apply * Make errors more greppable, do not abort on first error * Usage info --- deploy/services-demo/create_team_members.sh | 22 +++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/deploy/services-demo/create_team_members.sh b/deploy/services-demo/create_team_members.sh index c899f8098cc..c0fabeafe51 100755 --- a/deploy/services-demo/create_team_members.sh +++ b/deploy/services-demo/create_team_members.sh @@ -2,10 +2,10 @@ set -e -ADMIN_UUID="a09e9521-e14e-4285-ad71-47caa97f4a16" -TEAM_UUID="9e57a378-0dca-468f-9661-7872f5f1c910" -BRIG_HOST="http://localhost:8082" -CSV_FILE="myfile.csv" +ADMIN_UUID="n/a" +TEAM_UUID="n/a" +BRIG_HOST="http://localhost:8080" +CSV_FILE="n/a" USAGE=" This bash script can be used to invite members to a given team. 
Input

 USAGE: $0
    -t <team uuid> : ID of the inviting team. default: ${TEAM_UUID}
    -h <base uri> : Base URI of brig. default: ${BRIG_HOST}
    -c <csv file> : file containing info on the invitees in format 'Email,UserName'. default: ${CSV_FILE}
+
+If you tee(1) stdout, stderr of this script into a log file, you can
+grep that log file for errors like this:
+
+$ grep code out.log | grep email-exists # the most common case
+$ grep code out.log | grep -v email-exists
+
+If you are in a hurry, you may want to change the sleep(1) at the end
+of the invite loop to less than a second. If you want to give up on
+the first error, add an exit(1) where we check the $INVITATION_ID.
+
 "

 # Option parsing:
@@ -73,8 +84,7 @@ do
     if ( ( echo "$INVITATION_ID" | grep -q '"code"' )
        && ( echo "$INVITATION_ID" | grep -q '"label"' ) ) ; then
-        echo "Got an error, aborting: $INVITATION_ID"
-        exit 1
+        echo "failed inviting $USER_NAME <$EMAIL>: $INVITATION_ID"
     fi

     echo "Sleeping 1 second..." 1>&2

From c507ed64a7d4f0af2bffe2f9c3eb4b5f89a477c0 Mon Sep 17 00:00:00 2001
From: fisx
Date: Mon, 21 Oct 2019 11:22:28 +0200
Subject: [PATCH 07/26] Cleanup (#879)

* Fix: remove duplicate object field in ToJSON

* Remove unnecessary toJSON call
---
 libs/brig-types/src/Brig/Types/User.hs | 1 -
 libs/galley-types/src/Galley/Types.hs  | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/libs/brig-types/src/Brig/Types/User.hs b/libs/brig-types/src/Brig/Types/User.hs
index 98ff7e6d99c..f68cfb68aad 100644
--- a/libs/brig-types/src/Brig/Types/User.hs
+++ b/libs/brig-types/src/Brig/Types/User.hs
@@ -447,7 +447,6 @@ instance ToJSON NewUser where
         # "uuid" .= newUserUUID u
         # "email" .= newUserEmail u
         # "email_code" .= newUserEmailCode u
-        # "password" .= newUserPassword u
         # "picture" .= newUserPict u
         # "assets" .= newUserAssets u
         # "phone" .= newUserPhone u
diff --git a/libs/galley-types/src/Galley/Types.hs b/libs/galley-types/src/Galley/Types.hs
index af50bddebed..9f826949b45 100644
--- a/libs/galley-types/src/Galley/Types.hs
+++ b/libs/galley-types/src/Galley/Types.hs
@@ -493,7 +493,7 @@ instance ToJSON UserClients where
       where
         fn u c m =
             let k = T.decodeLatin1 (toASCIIBytes (toUUID u))
             in
-                Map.insert k (toJSON c) m
+                Map.insert k c m

From 1815fdb79f3e14de4818d9ef852588203a1c92f0 Mon Sep 17 00:00:00 2001
From: Arian van Putten
Date: Mon, 21 Oct 2019 13:11:21 +0200
Subject: [PATCH 08/26] Change SCIM documentation to actually use SAML Ids
 (#883)

* Change SCIM documentation to actually use SAML Ids

UUIDs are a bit confusing here. It is more common that your SAML
provider is configured to use email addresses instead.

Co-Authored-By: fisx
---
 docs/reference/provisioning/scim-via-curl.md | 21 +++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/docs/reference/provisioning/scim-via-curl.md b/docs/reference/provisioning/scim-via-curl.md
index a3fd2bc0c3f..2b64e425f4b 100644
--- a/docs/reference/provisioning/scim-via-curl.md
+++ b/docs/reference/provisioning/scim-via-curl.md
@@ -92,19 +92,34 @@ A minimal definition of a user looks like this:

 ```bash
 export SCIM_USER='{
   "schemas" : ["urn:ietf:params:scim:schemas:core:2.0:User"],
-  "externalId" : "f8c4ffde-4592-11e9-8600-afe11dc7d07b",
+  "externalId" : "nick@example.com",
   "userName" : "nick",
   "displayName" : "The Nick"
 }'
 ```

+The `externalId` is used to construct a saml identity. Two cases are
+currently supported:
+
+1. `externalId` contains a valid email address. The SAML `NameID` has
+the form `<NameID Format="urn:oasis:names:tc:SAML:1.1:nameid-format:emailAddress">me@example.com</NameID>`.
+2. 
`externalId` contains anything that is *not* an email address. The
+SAML `NameID` has the form `<NameID Format="urn:oasis:names:tc:SAML:2.0:nameid-format:unspecified">...</NameID>`.
+
+*NOTE: It is important to configure your SAML provider to use
+`nameid-format:emailAddress` or `nameid-format:unspecified`. Other
+nameid formats are not supported at this moment*.
+See also: https://github.com/wireapp/wire-server/blob/c507ed64a7d4f0af2bffe2f9c3eb4b5f89a477c0/services/spar/src/Spar/Scim/User.hs#L149-L158
+
 We also support custom fields that are used in rich profiles in this
 form [see {#RefRichInfo}](../user/rich-info.md):

 ```bash
 export SCIM_USER='{
   "schemas" : ["urn:ietf:params:scim:schemas:core:2.0:User", "urn:wire:scim:schemas:profile:1.0"],
-  "externalId" : "f8c4ffde-4592-11e9-8600-afe11dc7d07b",
+  "externalId" : "rnick@example.com",
   "userName" : "rnick",
   "displayName" : "The Rich Nick",
   "urn:wire:scim:schemas:profile:1.0": {
@@ -160,7 +175,7 @@ up-to-date user present, just `GET` one right before the `PUT`.)
 ```bash
 export SCIM_USER='{
   "schemas" : ["urn:ietf:params:scim:schemas:core:2.0:User"],
-  "externalId" : "updated-user-id",
+  "externalId" : "rnick@example.com",
   "userName" : "newnick",
   "displayName" : "The New Nick"
 }'

From 8d0698dbc45f675a2b09943b0237c03b9c442f7d Mon Sep 17 00:00:00 2001
From: fisx
Date: Mon, 21 Oct 2019 14:31:59 +0200
Subject: [PATCH 09/26] stack: Split nix stuff into build deps and runtime
 deps. (#884)

Stack started not liking being called from a nix-shell anymore,
because stack wants to create the nix-shell itself so it does some
auto-detection and crashes. To trick it into doing the right thing
again, we only use the nix-shell for non-build-deps and just for dev
tools, then stack itself will call another nix-shell file that just
handles build deps for the stack project.
---
 shell.nix      | 46 +++++++++-------------------------------------
 stack-deps.nix | 36 ++++++++++++++++++++++++++++++++++++
 stack.yaml     |  2 +-
 3 files changed, 46 insertions(+), 38 deletions(-)
 create mode 100644 stack-deps.nix

diff --git a/shell.nix b/shell.nix
index 47ba31c95c9..af3393d55a3 100644
--- a/shell.nix
+++ b/shell.nix
@@ -1,37 +1,9 @@
-{ pkgs ? import <nixpkgs> {} }:
-let
-  cryptobox-c = pkgs.callPackage ({fetchFromGitHub, rustPlatform, pkgconfig, libsodium}:
-    rustPlatform.buildRustPackage rec {
-      name = "cryptobox-c-${version}";
-      version = "2019-06-17";
-      buildInputs = [ pkgconfig libsodium ];
-      src = fetchFromGitHub {
-        owner = "wireapp";
-        repo = "cryptobox-c";
-        rev = "4067ad96b125942545dbdec8c1a89f1e1b65d013";
-        sha256 = "1i9dlhw0xk1viglyhail9fb36v1awrypps8jmhrkz8k1bhx98ci3";
-      };
-      cargoSha256 = "1373rpy0fi3cpacv06x1cv4cv0brwdri2680ymdkq8w44syp20ym";
-      postInstall = ''
-        mkdir -p $out/include
-        cp src/cbox.h $out/include
-      '';
-    }) {};
-in
-  pkgs.haskell.lib.buildStackProject {
-    name = "wire-server";
-    buildInputs = with pkgs; [
-      docker-compose
-      pkgconfig
-      cryptobox-c
-      libsodium
-      geoip
-      protobuf
-      openssl
-      snappy
-      icu
-      zlib
-      libxml2
-    ];
-    ghc = pkgs.haskell.compiler.ghc844;
-  }
+{ pkgs ? import <nixpkgs> {}}:
+with pkgs; mkShell {
+  name = "shell";
+  buildInputs = [
+    docker-compose
+    gnumake
+    stack
+  ];
+}
diff --git a/stack-deps.nix b/stack-deps.nix
new file mode 100644
index 00000000000..815b2462449
--- /dev/null
+++ b/stack-deps.nix
@@ -0,0 +1,36 @@
+{ pkgs ? 
import <nixpkgs> {} }:
+let
+  cryptobox-c = pkgs.callPackage ({fetchFromGitHub, rustPlatform, pkgconfig, libsodium}:
+    rustPlatform.buildRustPackage rec {
+      name = "cryptobox-c-${version}";
+      version = "2019-06-17";
+      buildInputs = [ pkgconfig libsodium ];
+      src = fetchFromGitHub {
+        owner = "wireapp";
+        repo = "cryptobox-c";
+        rev = "4067ad96b125942545dbdec8c1a89f1e1b65d013";
+        sha256 = "1i9dlhw0xk1viglyhail9fb36v1awrypps8jmhrkz8k1bhx98ci3";
+      };
+      cargoSha256 = "1373rpy0fi3cpacv06x1cv4cv0brwdri2680ymdkq8w44syp20ym";
+      postInstall = ''
+        mkdir -p $out/include
+        cp src/cbox.h $out/include
+      '';
+    }) {};
+in
+  pkgs.haskell.lib.buildStackProject {
+    name = "wire-server";
+    buildInputs = with pkgs; [
+      pkgconfig
+      cryptobox-c
+      libsodium
+      geoip
+      protobuf
+      openssl
+      snappy
+      icu
+      zlib
+      libxml2
+    ];
+    ghc = pkgs.haskell.compiler.ghc844;
+  }
diff --git a/stack.yaml b/stack.yaml
index 4bc77c623b8..a617362e07e 100644
--- a/stack.yaml
+++ b/stack.yaml
@@ -74,4 +74,4 @@ flags:
 allow-newer: false

 nix:
-  shell-file: shell.nix
+  shell-file: stack-deps.nix

From 08dd74232f2d577d9b03d4420fba4611b6a5cc6e Mon Sep 17 00:00:00 2001
From: fisx
Date: Tue, 22 Oct 2019 11:02:39 +0200
Subject: [PATCH 10/26] Make gundeck handle AWS outages better. (#869)

Introduce `Gundeck.ThreadBudget.runWithBudget` that can be wrapped
around actions. Wrapped actions are executed only if there is budget
available, otherwise they are dropped and a warning is logged. See the
haddocks at the beginning of `Gundeck.ThreadBudget` for a rationale.
---
 libs/types-common/package.yaml               |   1 +
 libs/types-common/src/Data/SizedHashMap.hs   |  54 +++
 libs/types-common/test/Main.hs               |   3 +-
 libs/types-common/test/Test/SizedHashMap.hs  |  62 ++++
 services/brig/src/Brig/Budget.hs             |   1 -
 services/cannon/src/Cannon/Dict.hs           |  52 +--
 services/gundeck/gundeck.integration.yaml    |   3 +
 services/gundeck/package.yaml                |   5 +
 services/gundeck/src/Gundeck/Env.hs          |   5 +-
 services/gundeck/src/Gundeck/Options.hs      |  10 +
 services/gundeck/src/Gundeck/Push.hs         | 107 ++++--
 services/gundeck/src/Gundeck/Push/Native.hs  |  11 +-
 services/gundeck/src/Gundeck/Run.hs          |   7 +-
 services/gundeck/src/Gundeck/ThreadBudget.hs | 255 ++++++++++++++
 services/gundeck/test/unit/Main.hs           |   2 +
 services/gundeck/test/unit/MockGundeck.hs    |   3 +
 services/gundeck/test/unit/ThreadBudget.hs   | 334 +++++++++++++++++++
 stack.yaml                                   |   1 +
 18 files changed, 833 insertions(+), 83 deletions(-)
 create mode 100644 libs/types-common/src/Data/SizedHashMap.hs
 create mode 100644 libs/types-common/test/Test/SizedHashMap.hs
 create mode 100644 services/gundeck/src/Gundeck/ThreadBudget.hs
 create mode 100644 services/gundeck/test/unit/ThreadBudget.hs

diff --git a/libs/types-common/package.yaml b/libs/types-common/package.yaml
index 217c0c8bab9..ebeecd958d5 100644
--- a/libs/types-common/package.yaml
+++ b/libs/types-common/package.yaml
@@ -91,6 +91,7 @@ tests:
     - text-format
    - time
     - types-common
+    - unordered-containers
     - uuid
 flags:
   arbitrary:
diff --git a/libs/types-common/src/Data/SizedHashMap.hs b/libs/types-common/src/Data/SizedHashMap.hs
new file mode 100644
index 00000000000..c372d9e57e0
--- /dev/null
+++ b/libs/types-common/src/Data/SizedHashMap.hs
@@ -0,0 +1,54 @@
+module Data.SizedHashMap
+  ( SizedHashMap
+  , fromSizedHashMap
+  , size
+  , empty
+  , insert
+  , keys
+  , elems
+  , toList
+  , lookup
+  , delete
+  )
+where
+
+import Imports hiding (lookup, toList)
+import Data.Hashable (Hashable)
+
+import qualified Data.HashMap.Strict as M
+
+
+data SizedHashMap k v = SizedHashMap !Int !(HashMap k v)
+
+fromSizedHashMap :: forall k v. 
SizedHashMap k v -> HashMap k v +fromSizedHashMap (SizedHashMap _ hm) = hm + +size :: forall k v. SizedHashMap k v -> Int +size (SizedHashMap s _) = s + +empty :: forall k v. SizedHashMap k v +empty = SizedHashMap 0 M.empty + +insert :: forall k v. (Eq k, Hashable k) => k -> v -> SizedHashMap k v -> SizedHashMap k v +insert k v (SizedHashMap n hm) = SizedHashMap n' hm' + where + n' = if M.member k hm then n else n + 1 + hm' = M.insert k v hm + +keys :: forall k v. SizedHashMap k v -> [k] +keys (SizedHashMap _ hm) = M.keys hm + +elems :: forall k v. SizedHashMap k v -> [v] +elems (SizedHashMap _ hm) = M.elems hm + +toList :: forall k v. SizedHashMap k v -> [(k, v)] +toList (SizedHashMap _ hm) = M.toList hm + +lookup :: forall k v. (Eq k, Hashable k) => k -> SizedHashMap k v -> Maybe v +lookup k (SizedHashMap _ hm) = M.lookup k hm + +delete :: forall k v. (Eq k, Hashable k) => k -> SizedHashMap k v -> SizedHashMap k v +delete k (SizedHashMap n hm) = SizedHashMap n' hm' + where + n' = if M.member k hm then n - 1 else n + hm' = M.delete k hm diff --git a/libs/types-common/test/Main.hs b/libs/types-common/test/Main.hs index 6181799fd6b..39e33f1b7b3 100644 --- a/libs/types-common/test/Main.hs +++ b/libs/types-common/test/Main.hs @@ -2,8 +2,9 @@ module Main (main) where import Imports import qualified Test.Properties as Properties +import qualified Test.SizedHashMap as SizedHashMap import Test.Tasty main :: IO () -main = defaultMain $ testGroup "Tests" [ Properties.tests ] +main = defaultMain $ testGroup "Tests" [ Properties.tests, SizedHashMap.tests ] diff --git a/libs/types-common/test/Test/SizedHashMap.hs b/libs/types-common/test/Test/SizedHashMap.hs new file mode 100644 index 00000000000..f5c89ce9314 --- /dev/null +++ b/libs/types-common/test/Test/SizedHashMap.hs @@ -0,0 +1,62 @@ +module Test.SizedHashMap (tests) where + +import Imports + +import Data.SizedHashMap as SHM +import Test.Tasty +import Test.Tasty.HUnit + +import qualified Data.HashMap.Strict as HM + + +zro, one, two :: HM.HashMap Char Char +zro = HM.empty +one = HM.insert '0' '0' zro +two = HM.insert '1' '1' one + +zro', one', two' :: SHM.SizedHashMap Char Char +zro' = SHM.empty +one' = SHM.insert '0' '0' zro' +two' = SHM.insert '1' '1' one' + + +tests :: TestTree +tests = testGroup "SizedHashMap" + [ testCase "empty" $ do + SHM.size zro' @=? 0 + fromSizedHashMap zro' @=? zro + + , testCase "insert" $ do + SHM.size one' @=? HM.size one + SHM.size two' @=? HM.size two + fromSizedHashMap one' @=? one + fromSizedHashMap two' @=? two + + , testCase "keys" $ do + SHM.keys zro' @=? HM.keys zro + SHM.keys one' @=? HM.keys one + SHM.keys two' @=? HM.keys two + + , testCase "elems" $ do + SHM.elems zro' @=? HM.elems zro + SHM.elems one' @=? HM.elems one + SHM.elems two' @=? HM.elems two + + , testCase "toList" $ do + SHM.toList zro' @=? HM.toList zro + SHM.toList one' @=? HM.toList one + SHM.toList two' @=? HM.toList two + + , testCase "lookup" $ do + SHM.lookup '0' zro' @=? HM.lookup '0' zro + SHM.lookup '0' one' @=? HM.lookup '0' one + SHM.lookup '0' two' @=? HM.lookup '0' two + + , testCase "delete" $ do + size (SHM.delete '0' zro') @=? 0 + size (SHM.delete '0' one') @=? 0 + size (SHM.delete '0' two') @=? 1 + fromSizedHashMap (SHM.delete '0' zro') @=? HM.delete '0' zro + fromSizedHashMap (SHM.delete '0' one') @=? HM.delete '0' one + fromSizedHashMap (SHM.delete '0' two') @=? 
HM.delete '0' two + ] diff --git a/services/brig/src/Brig/Budget.hs b/services/brig/src/Brig/Budget.hs index 656ac0252d1..2f7dcbe8a0b 100644 --- a/services/brig/src/Brig/Budget.hs +++ b/services/brig/src/Brig/Budget.hs @@ -11,7 +11,6 @@ module Brig.Budget ) where import Imports -import Brig.Data.Instances () import Cassandra import Data.Time.Clock diff --git a/services/cannon/src/Cannon/Dict.hs b/services/cannon/src/Cannon/Dict.hs index 730e49bdabf..dc93edcfc8f 100644 --- a/services/cannon/src/Cannon/Dict.hs +++ b/services/cannon/src/Cannon/Dict.hs @@ -11,43 +11,46 @@ module Cannon.Dict where import Imports hiding (lookup) + import Data.Hashable (hash, Hashable) +import Data.SizedHashMap (SizedHashMap) import Data.Vector (Vector, (!)) -import qualified Data.HashMap.Strict as M -import qualified Data.Vector as V +import qualified Data.SizedHashMap as SHM +import qualified Data.Vector as V + newtype Dict a b = Dict { _map :: Vector (IORef (SizedHashMap a b)) } size :: MonadIO m => Dict a b -> m Int -size d = liftIO $ sum <$> mapM (\r -> hmsize <$> readIORef r) (_map d) +size d = liftIO $ sum <$> mapM (\r -> SHM.size <$> readIORef r) (_map d) empty :: MonadIO m => Int -> m (Dict a b) empty w = liftIO $ if w > 0 && w < 8192 - then Dict <$> V.generateM w (const $ newIORef hmempty) + then Dict <$> V.generateM w (const $ newIORef SHM.empty) else error "Dict.empty: slice number out of range [1, 8191]" insert :: (Eq a, Hashable a, MonadIO m) => a -> b -> Dict a b -> m () -insert k v = mutDict (hminsert k v) . getSlice k +insert k v = mutDict (SHM.insert k v) . getSlice k add :: (Eq a, Hashable a, MonadIO m) => a -> b -> Dict a b -> m Bool add k v d = liftIO $ atomicModifyIORef' (getSlice k d) $ \m -> - if k `elem` hmkeys m + if k `elem` SHM.keys m then (m, False) - else (hminsert k v m, True) + else (SHM.insert k v m, True) remove :: (Eq a, Hashable a, MonadIO m) => a -> Dict a b -> m Bool remove = removeIf (const True) removeIf :: (Eq a, Hashable a, MonadIO m) => (Maybe b -> Bool) -> a -> Dict a b -> m Bool removeIf f k d = liftIO $ atomicModifyIORef' (getSlice k d) $ \m -> - if f (hmlookup k m) - then (hmdelete k m, True) + if f (SHM.lookup k m) + then (SHM.delete k m, True) else (m, False) lookup :: (Eq a, Hashable a, MonadIO m) => a -> Dict a b -> m (Maybe b) -lookup k = liftIO . fmap (hmlookup k) . readIORef . getSlice k +lookup k = liftIO . fmap (SHM.lookup k) . readIORef . getSlice k ----------------------------------------------------------------------------- -- Internal @@ -60,32 +63,3 @@ mutDict f d = liftIO $ atomicModifyIORef' d $ \m -> (f m, ()) getSlice :: (Hashable a) => a -> Dict a b -> IORef (SizedHashMap a b) getSlice k (Dict m) = m ! (hash k `mod` V.length m) - ----------------------------------------------------------------------- --- hashmap with O(1) size operator - -data SizedHashMap a b = SizedHashMap !Int !(HashMap a b) - -hmsize :: forall k v. SizedHashMap k v -> Int -hmsize (SizedHashMap s _) = s - -hmempty :: forall k v. SizedHashMap k v -hmempty = SizedHashMap 0 M.empty - -hminsert :: forall k v. (Eq k, Hashable k) => k -> v -> SizedHashMap k v -> SizedHashMap k v -hminsert k v (SizedHashMap n hm) = SizedHashMap n' hm' - where - n' = if M.member k hm then n else n + 1 - hm' = M.insert k v hm - -hmkeys :: forall k v. SizedHashMap k v -> [k] -hmkeys (SizedHashMap _ hm) = M.keys hm - -hmlookup :: forall k v. (Eq k, Hashable k) => k -> SizedHashMap k v -> Maybe v -hmlookup k (SizedHashMap _ hm) = M.lookup k hm - -hmdelete :: forall k v. 
(Eq k, Hashable k) => k -> SizedHashMap k v -> SizedHashMap k v -hmdelete k (SizedHashMap n hm) = SizedHashMap n' hm' - where - n' = if M.member k hm then n - 1 else n - hm' = M.delete k hm diff --git a/services/gundeck/gundeck.integration.yaml b/services/gundeck/gundeck.integration.yaml index ebd3897ce8a..9f6d0820a55 100644 --- a/services/gundeck/gundeck.integration.yaml +++ b/services/gundeck/gundeck.integration.yaml @@ -24,6 +24,9 @@ settings: httpPoolSize: 1024 notificationTTL: 24192200 bulkPush: true + maxConcurrentNativePushes: + hard: 30 # more than this number of threads will not be allowed + soft: 10 # more than this number of threads will be warned about logLevel: Info logNetStrings: false diff --git a/services/gundeck/package.yaml b/services/gundeck/package.yaml index c4ca4033a46..bb00596df9e 100644 --- a/services/gundeck/package.yaml +++ b/services/gundeck/package.yaml @@ -57,6 +57,7 @@ library: - redis-io >=0.4 - resourcet >=1.1 - retry >=0.5 + - safe-exceptions - semigroups >=0.12 - singletons >=1.0 - split >=0.2 @@ -173,6 +174,7 @@ tests: - -threaded dependencies: - base + - async - aeson - aeson-pretty - amazonka @@ -192,6 +194,7 @@ tests: - network-uri - QuickCheck - quickcheck-instances + - quickcheck-state-machine - random - scientific - string-conversions @@ -199,8 +202,10 @@ tests: - tasty-hunit - tasty-quickcheck - text + - time - tinylog - transformers + - tree-diff - types-common - unordered-containers - uuid diff --git a/services/gundeck/src/Gundeck/Env.hs b/services/gundeck/src/Gundeck/Env.hs index df349104d52..e861566510f 100644 --- a/services/gundeck/src/Gundeck/Env.hs +++ b/services/gundeck/src/Gundeck/Env.hs @@ -12,6 +12,7 @@ import Data.Text (unpack) import Data.Time.Clock.POSIX import Util.Options import Gundeck.Options as Opt +import Gundeck.ThreadBudget import Network.HTTP.Client (responseTimeoutMicro) import Network.HTTP.Client.TLS (tlsManagerSettings) @@ -32,6 +33,7 @@ data Env = Env , _rstate :: !Redis.Pool , _awsEnv :: !Aws.Env , _time :: !(IO Milliseconds) + , _threadBudgetState :: !(Maybe ThreadBudgetState) } makeLenses ''Env @@ -74,7 +76,8 @@ createEnv m o = do io <- mkAutoUpdate defaultUpdateSettings { updateAction = Ms . round . (* 1000) <$> getPOSIXTime } - return $! Env def m o l n p r a io + mtbs <- mkThreadBudgetState `mapM` (o ^. optSettings . setMaxConcurrentNativePushes) + return $! Env def m o l n p r a io mtbs reqIdMsg :: RequestId -> Logger.Msg -> Logger.Msg reqIdMsg = ("request" Logger..=) . unRequestId diff --git a/services/gundeck/src/Gundeck/Options.hs b/services/gundeck/src/Gundeck/Options.hs index eb0effdc4f0..af888da2716 100644 --- a/services/gundeck/src/Gundeck/Options.hs +++ b/services/gundeck/src/Gundeck/Options.hs @@ -36,11 +36,21 @@ data Settings = Settings -- | Use this option to group push notifications and send them in bulk to Cannon, instead -- of in individual requests , _setBulkPush :: !Bool + -- | Maximum number of concurrent threads calling SNS. 
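+    -- (Both limits are optional. As a sketch, a config value mirroring the
+    -- gundeck.integration.yaml settings added in this same patch would be:
+    --
+    -- > maxConcurrentNativePushes:
+    -- >   hard: 30
+    -- >   soft: 10
+    -- )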
+ , _setMaxConcurrentNativePushes :: !(Maybe MaxConcurrentNativePushes) + } deriving (Show, Generic) + +data MaxConcurrentNativePushes = MaxConcurrentNativePushes + { _limitHard :: !(Maybe Int) -- ^ more than this number of threads will not be allowed + , _limitSoft :: !(Maybe Int) -- ^ more than this number of threads will be warned about } deriving (Show, Generic) deriveFromJSON toOptionFieldName ''Settings makeLenses ''Settings +deriveFromJSON toOptionFieldName ''MaxConcurrentNativePushes +makeLenses ''MaxConcurrentNativePushes + data Opts = Opts { _optGundeck :: !Endpoint -- ^ Hostname and port to bind to , _optCassandra :: !CassandraOpts diff --git a/services/gundeck/src/Gundeck/Push.hs b/services/gundeck/src/Gundeck/Push.hs index 84427c12911..be3fb5f37f6 100644 --- a/services/gundeck/src/Gundeck/Push.hs +++ b/services/gundeck/src/Gundeck/Push.hs @@ -8,6 +8,7 @@ module Gundeck.Push , pushAll, pushAny , MonadPushAll(..) , MonadNativeTargets(..) + , MonadMapAsync(..) , MonadPushAny(..) ) where @@ -26,8 +27,9 @@ import Gundeck.Aws (endpointUsers) import Gundeck.Aws.Arn import Gundeck.Env import Gundeck.Monad -import Gundeck.Push.Native.Types import Gundeck.Options +import Gundeck.Push.Native.Types +import Gundeck.ThreadBudget import Gundeck.Types import Gundeck.Util import Network.HTTP.Types @@ -74,6 +76,7 @@ class MonadThrow m => MonadPushAll m where mpaStreamAdd :: NotificationId -> List1 NotificationTarget -> List1 Aeson.Object -> NotificationTTL -> m () mpaPushNative :: Notification -> Push -> [Address] -> m () mpaForkIO :: m () -> m () + mpaRunWithBudget :: Int -> a -> m a -> m a instance MonadPushAll Gundeck where mpaNotificationTTL = view (options . optSettings . setNotificationTTL) @@ -83,20 +86,33 @@ instance MonadPushAll Gundeck where mpaStreamAdd = Stream.add mpaPushNative = pushNative mpaForkIO = void . forkIO + mpaRunWithBudget = runWithBudget'' + +-- | Another layer of wrap around 'runWithBudget'. +runWithBudget'' :: Int -> a -> Gundeck a -> Gundeck a +runWithBudget'' budget fallback action = do + view threadBudgetState >>= \case + Nothing -> action + Just tbs -> runWithBudget' tbs budget fallback action + -- | Abstract over all effects in 'nativeTargets' (for unit testing). class Monad m => MonadNativeTargets m where mntgtLogErr :: SomeException -> m () mntgtLookupAddresses :: UserId -> m [Address] - mntgtMapAsync :: (a -> m b) -> [a] -> m [Either SomeException b] instance MonadNativeTargets Gundeck where mntgtLogErr e = Log.err (msg (val "Failed to get native push address: " +++ show e)) mntgtLookupAddresses rcp = Data.lookup rcp Data.One + +class Monad m => MonadMapAsync m where + mntgtMapAsync :: (a -> m b) -> [a] -> m [Either SomeException b] + +instance MonadMapAsync Gundeck where mntgtMapAsync = mapAsync -- | Abstract over all effects in 'pushAny' (for unit testing). 
-class (MonadPushAll m, MonadNativeTargets m) => MonadPushAny m where +class (MonadPushAll m, MonadNativeTargets m, MonadMapAsync m) => MonadPushAny m where mpyPush :: Notification -> List1 NotificationTarget -> UserId @@ -132,9 +148,9 @@ pushAny' p = do unless (p^.pushTransient) $ mpaStreamAdd i tgts pload =<< mpaNotificationTTL mpaForkIO $ do - prs <- mpyPush notif tgts (p^.pushOrigin) (p^.pushOriginConnection) (p^.pushConnections) + alreadySent <- mpyPush notif tgts (p^.pushOrigin) (p^.pushOriginConnection) (p^.pushConnections) unless (p^.pushTransient) $ - mpaPushNative notif p =<< nativeTargets p prs + mpaPushNative notif p =<< nativeTargets p (nativeTargetsRecipients p) alreadySent where mkTarget :: Recipient -> NotificationTarget mkTarget r = @@ -145,7 +161,7 @@ pushAny' p = do -- | Construct and send a single bulk push request to the client. Write the 'Notification's from -- the request to C*. Trigger native pushes for all delivery failures notifications. -pushAll :: (MonadPushAll m, MonadNativeTargets m) => [Push] -> m () +pushAll :: (MonadPushAll m, MonadNativeTargets m, MonadMapAsync m) => [Push] -> m () pushAll pushes = do targets :: [(Push, (Notification, List1 (Recipient, [Presence])))] <- zip pushes <$> (mkNotificationAndTargets `mapM` pushes) @@ -172,8 +188,15 @@ pushAll pushes = do resp <- compilePushResps targets <$> mpaBulkPush (compilePushReq <$> targets) -- native push - forM_ resp $ \((notif, psh), alreadySent) -> unless (psh ^. pushTransient) $ - mpaPushNative notif psh =<< nativeTargets psh alreadySent + forM_ resp $ \((notif :: Notification, psh :: Push), alreadySent :: [Presence]) -> do + let rcps' = nativeTargetsRecipients psh + budget = length rcps' + -- this is a rough budget, since there may be more than one device in a + -- 'Presence', so one budget token may trigger at most 8 push notifications + -- to be sent out. + unless (psh ^. pushTransient) + $ mpaRunWithBudget budget () + $ mpaPushNative notif psh =<< nativeTargets psh rcps' alreadySent -- REFACTOR: @[Presence]@ here should be @newtype WebSockedDelivered = WebSockedDelivered [Presence]@ @@ -246,52 +269,65 @@ pushNative :: Notification -> Push -> [Address] -> Gundeck () pushNative _ _ [] = return () pushNative notif p rcps = do let prio = p^.pushNativePriority - void $ Native.push (Native.NativePush (ntfId notif) prio Nothing) rcps + Native.push (Native.NativePush (ntfId notif) prio Nothing) rcps -nativeTargets :: forall m. MonadNativeTargets m => Push -> [Presence] -> m [Address] -nativeTargets p pres = - let rcps' = filter routeNative (toList (fromRange (p^.pushRecipients))) - in mntgtMapAsync addresses rcps' >>= fmap concat . mapM check +-- | Compute list of 'Recipient's from a 'Push' that may be interested in a native push. More +-- filtering in 'nativeTargets'. +nativeTargetsRecipients :: Push -> [Recipient] +nativeTargetsRecipients psh = filter routeNative (toList (fromRange (psh ^. pushRecipients))) where - -- Interested in native pushes? - routeNative u = u^.recipientRoute /= RouteDirect - && (u^.recipientId /= p^.pushOrigin || p^.pushNativeIncludeOrigin) + routeNative u = u ^. recipientRoute /= RouteDirect + && (u ^. recipientId /= psh ^. pushOrigin || psh ^. pushNativeIncludeOrigin) +-- | TODO: 'nativeTargets' calls cassandra once for each 'Recipient' of the 'Push'. Instead, +-- it should be called once with @[Push]@ for every call to 'pushAll', and that call should +-- only call cassandra once in total, yielding all addresses of all recipients of all pushes. 
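+-- A batched variant (hypothetical sketch only, not implemented anywhere) could
+-- have a shape like
+--
+-- > nativeTargetsBulk :: [(Push, [Presence])] -> m [(Push, [Address])]
+--
+-- so that address lookup for all recipients of all pushes of one 'pushAll'
+-- call becomes a single cassandra round trip.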
+-- +-- FUTUREWORK: we may want to turn 'mntgtMapAsync' into an ordinary `mapM`: it's cassandra +-- access, so it'll be fast either way given the size of the input, and synchronous calls +-- impose a much lower risk of choking when system load peaks. +nativeTargets + :: forall m. (MonadNativeTargets m, MonadMapAsync m) + => Push -> [Recipient] -> [Presence] -> m [Address] +nativeTargets psh rcps' alreadySent = + mntgtMapAsync addresses rcps' >>= fmap concat . mapM check + where addresses :: Recipient -> m [Address] addresses u = do - addrs <- mntgtLookupAddresses (u^.recipientId) + addrs <- mntgtLookupAddresses (u ^. recipientId) return $ preference . filter (eligible u) $ addrs + eligible :: Recipient -> Address -> Bool eligible u a -- Never include the origin client. - | a^.addrUser == p^.pushOrigin && Just (a^.addrConn) == p^.pushOriginConnection = False + | a ^. addrUser == psh ^. pushOrigin && Just (a ^. addrConn) == psh ^. pushOriginConnection = False -- Is the specific client an intended recipient? - | not (eligibleClient a (u^.recipientClients)) = False + | not (eligibleClient a (u ^. recipientClients)) = False -- Is the client not whitelisted? | not (whitelistedOrNoWhitelist a) = False - -- Include client if not found in presences. - | otherwise = isNothing (List.find (isOnline a) pres) + -- Include client if not found in already served presences. + | otherwise = isNothing (List.find (isOnline a) alreadySent) - isOnline a x = a^.addrUser == Presence.userId x - && (a^.addrConn == Presence.connId x || equalClient a x) + isOnline a x = a ^. addrUser == Presence.userId x + && (a ^. addrConn == Presence.connId x || equalClient a x) - equalClient a x = Just (a^.addrClient) == Presence.clientId x + equalClient a x = Just (a ^. addrClient) == Presence.clientId x eligibleClient _ RecipientClientsAll = True - eligibleClient a (RecipientClientsSome cs) = (a^.addrClient) `elem` cs + eligibleClient a (RecipientClientsSome cs) = (a ^. addrClient) `elem` cs - whitelistedOrNoWhitelist a = null (p^.pushConnections) - || a^.addrConn `elem` p^.pushConnections + whitelistedOrNoWhitelist a = null (psh ^. pushConnections) + || a ^. addrConn `elem` psh ^. pushConnections -- Apply transport preference in case of alternative transports for the -- same client (currently only APNS vs APNS VoIP). If no explicit -- preference is given, the default preference depends on the priority. - preference as = let pref = p^.pushNativeAps >>= view apsPreference in + preference as = let pref = psh ^. pushNativeAps >>= view apsPreference in filter (pick (fromMaybe defPreference pref)) as where - pick pr a = case a^.addrTransport of + pick pr a = case a ^. addrTransport of GCM -> True APNS -> pr == ApsStdPreference || notAny a APNSVoIP APNSSandbox -> pr == ApsStdPreference || notAny a APNSVoIPSandbox @@ -300,10 +336,10 @@ nativeTargets p pres = notAny a t = not (any (\a' -> addrEqualClient a a' - && a^.addrApp == a'^.addrApp - && a'^.addrTransport == t) as) + && a ^. addrApp == a' ^. addrApp + && a' ^. addrTransport == t) as) - defPreference = case p^.pushNativePriority of + defPreference = case psh ^. 
pushNativePriority of LowPriority -> ApsStdPreference HighPriority -> ApsVoIPPreference @@ -311,8 +347,9 @@ nativeTargets p pres = check (Left e) = mntgtLogErr e >> return [] check (Right r) = return r + addToken :: UserId ::: ConnId ::: JsonRequest PushToken ::: JSON -> Gundeck Response -addToken (uid ::: cid ::: req ::: _) = do +addToken (uid ::: cid ::: req ::: _) = mpaRunWithBudget 1 snsThreadBudgetReached $ do new <- fromJsonBody req (cur, old) <- foldl' (matching new) (Nothing, []) <$> Data.lookup uid Data.Quorum Log.info $ "user" .= UUID.toASCIIBytes (toUUID uid) @@ -425,6 +462,10 @@ invalidToken :: Response invalidToken = json (Error status400 "invalid-token" "Invalid push token") & setStatus status404 +snsThreadBudgetReached :: Response +snsThreadBudgetReached = json (Error status400 "sns-thread-budget-reached" "Too many concurrent calls to SNS; is SNS down?") + & setStatus status413 + tokenTooLong :: Response tokenTooLong = json (Error status400 "token-too-long" "Push token length must be < 8192 for GCM or 400 for APNS") & setStatus status413 diff --git a/services/gundeck/src/Gundeck/Push/Native.hs b/services/gundeck/src/Gundeck/Push/Native.hs index d25dcb2d056..86add930825 100644 --- a/services/gundeck/src/Gundeck/Push/Native.hs +++ b/services/gundeck/src/Gundeck/Push/Native.hs @@ -30,12 +30,12 @@ import qualified Gundeck.Notification.Data as Stream import qualified Gundeck.Push.Data as Data import qualified System.Logger.Class as Log -push :: NativePush -> [Address] -> Gundeck [Result] -push _ [] = return [] -push m [a] = pure <$> push1 m a -push m addrs = mapConcurrently (push1 m) addrs +push :: NativePush -> [Address] -> Gundeck () +push _ [] = pure () +push m [a] = push1 m a +push m addrs = void $ mapConcurrently (push1 m) addrs -push1 :: NativePush -> Address -> Gundeck Result +push1 :: NativePush -> Address -> Gundeck () push1 m a = do e <- view awsEnv r <- Aws.execute e $ publish m a @@ -51,7 +51,6 @@ push1 m a = do Failure (PushException ex) _ -> do logError a "Native push failed" ex view monitor >>= counterIncr (path "push.native.errors") - return r where onDisabled = handleAny (logError a "Failed to cleanup disabled endpoint") $ do diff --git a/services/gundeck/src/Gundeck/Run.hs b/services/gundeck/src/Gundeck/Run.hs index c80a0425a95..9293084ee27 100644 --- a/services/gundeck/src/Gundeck/Run.hs +++ b/services/gundeck/src/Gundeck/Run.hs @@ -5,7 +5,7 @@ import Cassandra (runClient, shutdown) import Cassandra.Schema (versionCheck) import Control.Exception (finally) import Control.Lens hiding (enum) -import Data.Metrics.Middleware +import Data.Metrics.Middleware (metrics) import Data.Metrics.Middleware.Prometheus (waiPrometheusMiddleware) import Data.Metrics.WaiRoute (treeToPaths) import Data.Text (unpack) @@ -14,11 +14,12 @@ import Gundeck.Env import Gundeck.Monad import Gundeck.Options import Gundeck.React +import Gundeck.ThreadBudget import Network.Wai as Wai import Network.Wai.Utilities.Server hiding (serverPort) import Util.Options -import qualified Control.Concurrent.Async as Async +import qualified UnliftIO.Async as Async import qualified Gundeck.Aws as Aws import qualified Network.Wai.Middleware.Gzip as GZip import qualified Network.Wai.Middleware.Gunzip as GZip @@ -33,10 +34,12 @@ run o = do let l = e^.applog s <- newSettings $ defaultServer (unpack $ o^.optGundeck.epHost) (o^.optGundeck.epPort) l m lst <- Async.async $ Aws.execute (e^.awsEnv) (Aws.listen (runDirect e . onEvent)) + wtbs <- forM (e ^. 
threadBudgetState) $ \tbs -> Async.async $ runDirect e $ watchThreadBudgetState m tbs 10
   runSettingsWithShutdown s (middleware e $ app e) 5 `finally` do
     Log.info l $ Log.msg (Log.val "Shutting down ...")
     shutdown (e^.cstate)
     Async.cancel lst
+    forM_ wtbs Async.cancel
     Log.close (e^.applog)
   where
     middleware :: Env -> Wai.Middleware
diff --git a/services/gundeck/src/Gundeck/ThreadBudget.hs b/services/gundeck/src/Gundeck/ThreadBudget.hs
new file mode 100644
index 00000000000..e4085c22860
--- /dev/null
+++ b/services/gundeck/src/Gundeck/ThreadBudget.hs
@@ -0,0 +1,255 @@
+-- | Like "Brig.Budget", but in-memory, per host (not per service), and with a strict/exact
+-- upper bound.  Like https://hackage.haskell.org/package/token-bucket, but takes the entire
+-- run-time of the actions into account, not just the number of executions.
+-- http://hackage.haskell.org/package/rate-limit also looks related.
+--
+-- USE CASE: keep a lid on stalled native push notification threads.  if SNS is up, there
+-- will be many short-running executions of the action.  when SNS is down, the threads will
+-- accumulate in memory and choke the gundeck instances.  so we want to stop spawning more
+-- threads (and discard or queue native push notifications) before we run out of memory (which
+-- could cause system outages).
+--
+-- FUTUREWORK: http-client connection pools should handle this naturally and without doing
+-- anything, but instead connection pools grow infinitely until system resources (file
+-- handles, memory) are exhausted.  See
+-- https://github.com/snoyberg/http-client/issues/307#issuecomment-343829351.  We tried to fix
+-- this here: https://github.com/wireapp/wire-server/pull/609, but getting this right requires
+-- quite some digging: https://github.com/snoyberg/http-client/issues/394.  So if you ever
+-- want to figure this out properly, plan in some time for it.
+module Gundeck.ThreadBudget
+  ( ThreadBudgetState
+  , mkThreadBudgetState
+  , runWithBudget
+  , runWithBudget'
+  , watchThreadBudgetState
+
+    -- * for testing
+  , threadBudgetLimits
+  , budgetSpent
+  , cancelAllThreads
+  ) where
+
+import Imports
+
+import Control.Exception.Safe (catchAny)
+import Control.Lens
+import Control.Monad.Catch (MonadCatch)
+import Data.Metrics (Metrics)
+import Data.Metrics.Middleware (gaugeSet, path)
+import Data.Time
+import Data.UUID (UUID, toText)
+import Data.UUID.V4 (nextRandom)
+import Gundeck.Options
+import UnliftIO.Async
+import UnliftIO.Exception (finally)
+
+import qualified Data.Set as Set
+import qualified Data.HashMap.Strict as HM
+import qualified System.Logger.Class as LC
+
+
+data ThreadBudgetState = ThreadBudgetState
+  { threadBudgetLimits :: MaxConcurrentNativePushes
+  , _threadBudgetRunning :: IORef BudgetMap
+  } deriving (Generic)
+
+-- | Store all handles for cleanup in 'watchThreadBudgetState'.
+data BudgetMap = BudgetMap
+  { bspent :: Int
+  , bmap :: HashMap UUID (Int, Maybe (Async ()))
+  }
+  deriving (Eq, Generic)
+
+-- | Instead of taking the pre-computed total budget spent of the 'BudgetMap' (O(1)), this
+-- counts all the threads that are successfully running (dropping the ones that are just about
+-- to try to grab a token).
+--
+-- WARNING: takes O(n)) and should only be used in testing.
+budgetSpent :: ThreadBudgetState -> IO Int
+budgetSpent (ThreadBudgetState _ running) = budgetSpent' <$> readIORef running
+
+budgetSpent' :: BudgetMap -> Int
+budgetSpent' = sum . fmap fst . filter (isJust . snd) . HM.elems . 
bmap + +cancelAllThreads :: ThreadBudgetState -> IO () +cancelAllThreads (ThreadBudgetState _ ref) = readIORef ref + >>= mapM_ cancel . catMaybes . fmap snd . HM.elems . bmap + +mkThreadBudgetState :: HasCallStack => MaxConcurrentNativePushes -> IO ThreadBudgetState +mkThreadBudgetState limits = ThreadBudgetState limits <$> newIORef (BudgetMap 0 HM.empty) + + +-- | Allocate the resources for a new action to be called (but don't call the action yet). +allocate + :: IORef BudgetMap -> UUID -> Int -> MonadIO m => m Int +allocate ref key newspent + = atomicModifyIORef' ref $ + \(BudgetMap spent hm) -> + ( BudgetMap (spent + newspent) (HM.insert key (newspent, Nothing) hm) + , spent + ) + +-- | Register an already-allocated action with its 'Async'. +register + :: IORef BudgetMap -> UUID -> Async () -> MonadIO m => m Int +register ref key handle + = atomicModifyIORef' ref $ + \(BudgetMap spent hm) -> + ( BudgetMap spent (HM.adjust (_2 .~ Just handle) key hm) + , spent + ) + +-- | Remove an registered and/or allocated action from a 'BudgetMap'. +unregister + :: IORef BudgetMap -> UUID -> MonadIO m => m () +unregister ref key + = atomicModifyIORef' ref $ + \bhm@(BudgetMap spent hm) -> + case HM.lookup key hm of + Just (newspent, _) -> (BudgetMap (spent - newspent) (HM.delete key hm), ()) + Nothing -> (bhm, ()) + + +-- | If there is budget available, execute the action synchronously; otherwise, log a warning +-- and return immediately. Make sure the budget state is updated accordingly both when +-- starting and ending the execution. +-- +-- The hard limit in the 'ThreadBudgetState' argument is guaranteed to be an upper bound for +-- the number of concurrently spent budget tokens; surpassing the soft limit will trigger a +-- warning, but still execute the action. One action can use up any integer number of budget +-- tokens, including 0 and negative. +-- +-- The action is called in an 'Async', but 'runWithBudget' waits for it to finish so it can +-- update the budget. +runWithBudget + :: forall m. (MonadIO m, LC.MonadLogger m, MonadUnliftIO m) + => ThreadBudgetState -> Int -> m () -> m () +runWithBudget tbs spent = runWithBudget' tbs spent () + +-- | More flexible variant of 'runWithBudget' that allows the action to return a value. With +-- a default in case of budget exhaustion. +runWithBudget' + :: forall m a. (MonadIO m, LC.MonadLogger m, MonadUnliftIO m) + => ThreadBudgetState -> Int -> a -> m a -> m a +runWithBudget' (ThreadBudgetState limits ref) spent fallback action = do + key <- liftIO nextRandom + (`finally` unregister ref key) $ do + oldsize <- allocate ref key spent + + warnNoBudget (maybe False (oldsize >=) (limits ^. limitSoft)) + (maybe False (oldsize >=) (limits ^. limitHard)) + oldsize + + if (maybe True (oldsize <) (limits ^. limitHard)) + then go key oldsize + else pure fallback + where + go :: UUID -> Int -> m a + go key oldsize = do + LC.debug $ + "key" LC..= (toText key) LC.~~ + "spent" LC..= oldsize LC.~~ + LC.msg (LC.val "runWithBudget: go") + + handle <- async action + _ <- register ref key (const () <$> handle) + wait handle + + -- iff soft and/or hard limit are breached, log a warning-level message. 
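+    -- (only a hard breach short-circuits to the fallback above; on a mere soft
+    -- breach the action still runs, we just log.)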
+ warnNoBudget :: Bool -> Bool -> Int -> m () + warnNoBudget False False _ = pure () + warnNoBudget soft hard oldsize = do + let limit = if hard then "hard" else "soft" + LC.warn $ + "spent" LC..= show oldsize LC.~~ + "soft-breach" LC..= soft LC.~~ + "hard-breach" LC..= hard LC.~~ + LC.msg (LC.val "runWithBudget: " <> limit <> " limit reached") + + +-- | Fork a thread that checks with the given frequency if any async handles stored in the +-- state are stale (ie., have terminated with or without exception, but not been removed). If +-- that happens, log a warning. +-- +-- 'runWithBudget' should keep track of the state itself; 'watchThreadBudgetState' is solely a +-- safety precaution to see if there aren't any corner cases we missed. +-- +-- Also, issue some metrics. +watchThreadBudgetState + :: forall m. (MonadIO m, LC.MonadLogger m, MonadCatch m) + => Metrics -> ThreadBudgetState -> NominalDiffTime -> m () +watchThreadBudgetState metrics (ThreadBudgetState limits ref) freq = safeForever $ do + recordMetrics metrics limits ref + removeStaleHandles ref + threadDelayNominalDiffTime freq + +recordMetrics + :: forall m. (MonadIO m, LC.MonadLogger m, MonadCatch m) + => Metrics -> MaxConcurrentNativePushes -> IORef BudgetMap -> m () +recordMetrics metrics limits ref = do + (BudgetMap spent _) <- readIORef ref + gaugeSet (fromIntegral spent) (path "net.nativepush.thread_budget_allocated") metrics + forM_ (limits ^. limitHard) $ \lim -> + gaugeSet (fromIntegral lim) (path "net.nativepush.thread_budget_hard_limit") metrics + forM_ (limits ^. limitSoft) $ \lim -> + gaugeSet (fromIntegral lim) (path "net.nativepush.thread_budget_soft_limit") metrics + +threadDelayNominalDiffTime :: NominalDiffTime -> MonadIO m => m () +threadDelayNominalDiffTime = threadDelay . round . (* 1000000) . toRational + +staleTolerance :: NominalDiffTime +staleTolerance = 3 + +-- | Get all handles for asyncs that have terminated, but not been removed from the state. Do +-- that again after 'staleTolerance' to make sure that we don't catch any handles that would +-- have been removed during the 'runWithBudget' cleanup, but we were faster. The intersection +-- between the two rounds constitutes the legitimately stale handles: warn about them, and +-- then remove them from the budgetmap. +removeStaleHandles + :: forall m. (MonadIO m, LC.MonadLogger m, MonadCatch m) + => IORef BudgetMap -> m () +removeStaleHandles ref = do + round1 <- getStaleHandles + threadDelayNominalDiffTime staleTolerance + round2 <- getStaleHandles + + let staleHandles = Set.intersection round1 round2 + + unless (null staleHandles) $ do + warnStaleHandles (Set.size staleHandles) =<< readIORef ref + forM_ staleHandles $ \key -> do + mapM_ waitCatch . join . fmap snd =<< HM.lookup key . bmap <$> readIORef ref + unregister ref key + + isUnsanitary <- atomicModifyIORef' ref sanitize + when isUnsanitary . LC.warn . LC.msg . LC.val $ + "watchThreadBudgetState: total overall thread budget diverged from async weights (repaired)." + + where + getStaleHandles :: m (Set UUID) + getStaleHandles = Set.fromList . mconcat <$> do + handles <- HM.toList . 
bmap <$> readIORef ref + forM handles $ \case + (_, (_, Nothing)) -> do + pure [] + (key, (_, Just handle)) -> do + status <- poll handle + pure [key | isJust status] + + warnStaleHandles :: Int -> BudgetMap -> m () + warnStaleHandles num (BudgetMap spent _) = LC.warn $ + "spent" LC..= show spent + LC.~~ LC.msg ("watchThreadBudgetState: removed " <> show num <> " stale handles.") + + -- check that overall budget matches the budget in the indiviual map entries. + sanitize :: BudgetMap -> (BudgetMap, Bool) + sanitize bm@(BudgetMap spent hm) = (BudgetMap spent' hm, spent == spent') + where spent' = budgetSpent' bm + + +safeForever + :: forall m. (MonadIO m, LC.MonadLogger m, MonadCatch m) + => m () -> m () +safeForever action = forever $ action `catchAny` \exc -> do + LC.err $ "error" LC..= show exc LC.~~ LC.msg (LC.val "watchThreadBudgetState: crashed; retrying") + threadDelay 60000000 -- pause to keep worst-case noise in logs manageable diff --git a/services/gundeck/test/unit/Main.hs b/services/gundeck/test/unit/Main.hs index 36570e300c3..2e7d251a8d9 100644 --- a/services/gundeck/test/unit/Main.hs +++ b/services/gundeck/test/unit/Main.hs @@ -14,6 +14,7 @@ import qualified Gundeck.API import qualified Json import qualified Native import qualified Push +import qualified ThreadBudget main :: IO () main = withOpenSSL . defaultMain $ @@ -25,4 +26,5 @@ main = withOpenSSL . defaultMain $ , Json.tests , Native.tests , Push.tests + , ThreadBudget.tests ] diff --git a/services/gundeck/test/unit/MockGundeck.hs b/services/gundeck/test/unit/MockGundeck.hs index 972049bc7d3..db02cdbd663 100644 --- a/services/gundeck/test/unit/MockGundeck.hs +++ b/services/gundeck/test/unit/MockGundeck.hs @@ -393,10 +393,13 @@ instance MonadPushAll MockGundeck where mpaPushNative = mockPushNative mpaForkIO = id -- just don't fork. (this *may* cause deadlocks in principle, but as long as it -- doesn't, this is good enough for testing). + mpaRunWithBudget = \_ _ -> id -- no throttling needed as long as we don't overdo it in the tests... 
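+  -- (the 'mpaRunWithBudget' mock above thus grants unlimited budget: budget
+  -- size and fallback value are ignored, and the wrapped action always runs.)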
instance MonadNativeTargets MockGundeck where mntgtLogErr _ = pure () mntgtLookupAddresses = mockLookupAddresses + +instance MonadMapAsync MockGundeck where mntgtMapAsync f xs = Right <$$> mapM f xs -- (no concurrency) instance MonadPushAny MockGundeck where diff --git a/services/gundeck/test/unit/ThreadBudget.hs b/services/gundeck/test/unit/ThreadBudget.hs new file mode 100644 index 00000000000..43c4beee6d0 --- /dev/null +++ b/services/gundeck/test/unit/ThreadBudget.hs @@ -0,0 +1,334 @@ +{-# LANGUAGE DeriveAnyClass #-} +{-# LANGUAGE RecordWildCards #-} + +{-# OPTIONS_GHC -Wno-orphans #-} + +module ThreadBudget where + +import Imports + +import Control.Concurrent.Async +import Control.Lens +import Control.Monad.Catch (MonadCatch, catch) +import Data.Metrics.Middleware (metrics) +import Data.String.Conversions (cs) +import Data.Time +import Data.TreeDiff.Class (ToExpr) +import GHC.Generics +import Gundeck.ThreadBudget +import Gundeck.Options +import Test.QuickCheck +import Test.QuickCheck.Monadic +import Test.StateMachine +import Test.Tasty +import Test.Tasty.HUnit +import Test.Tasty.QuickCheck + +import qualified System.Logger.Class as LC +import qualified Test.StateMachine.Types as STM +import qualified Test.StateMachine.Types.Rank2 as Rank2 + + +---------------------------------------------------------------------- +-- helpers + +newtype NumberOfThreads = NumberOfThreads { fromNumberOfThreads :: Int } + deriving (Eq, Ord, Show, Generic, ToExpr) + +-- | 'microseconds' determines how long one unit lasts. there is a trade-off of fast +-- vs. robust in this whole setup. this type is supposed to help us find a good sweet spot. +newtype MilliSeconds = MilliSeconds { fromMilliSeconds :: Int } + deriving (Eq, Ord, Show, Generic, ToExpr) + +-- toMillisecondsCeiling 0.03 == MilliSeconds 30 +-- toMillisecondsCeiling 0.003 == MilliSeconds 3 +-- toMillisecondsCeiling 0.0003 == MilliSeconds 1 +-- toMillisecondsCeiling 0.0000003 == MilliSeconds 1 +toMillisecondsCeiling :: NominalDiffTime -> MilliSeconds +toMillisecondsCeiling = MilliSeconds . ceiling . (* 1000) . toRational + +milliSecondsToNominalDiffTime :: MilliSeconds -> NominalDiffTime +milliSecondsToNominalDiffTime = fromRational . (/ 1000) . toRational . fromMilliSeconds + +instance Arbitrary NumberOfThreads where + arbitrary = NumberOfThreads <$> choose (1, 30) + shrink (NumberOfThreads n) = NumberOfThreads <$> filter (> 0) (shrink n) + +instance Arbitrary MilliSeconds where + arbitrary = MilliSeconds <$> choose (1, 30) + shrink (MilliSeconds n) = MilliSeconds <$> filter (> 0) (shrink n) + + +data LogEntry = NoBudget | Debug String | Unknown String + deriving (Eq, Show, Generic) + +makePrisms ''LogEntry + +type LogHistory = MVar [LogEntry] + + +extractLogHistory :: (HasCallStack, MonadReader LogHistory m, MonadIO m) => m [LogEntry] +extractLogHistory = do + logHistory <- ask + liftIO $ modifyMVar logHistory (pure . ([],)) + +expectLogHistory :: (HasCallStack, MonadReader LogHistory m, MonadIO m) => ([LogEntry] -> Bool) -> m () +expectLogHistory expected = do + logHistory <- ask + liftIO $ do + found <- modifyMVar logHistory (\found -> pure ([], found)) + expected (filter (isn't _Debug) found) @? 
("unexpected log data: " <> show found) + +enterLogHistory :: (HasCallStack, MonadReader LogHistory m, MonadIO m) => LogEntry -> m () +enterLogHistory entry = do + logHistory <- ask + liftIO $ do + modifyMVar_ logHistory (\found -> pure (entry : found)) + +instance LC.MonadLogger (ReaderT LogHistory IO) where + log level msg = do + let raw :: String = cs $ LC.render LC.renderNetstr msg + parsed + | level == LC.Debug = Debug raw + | "runWithBudget: hard limit reached" `isInfixOf` raw = NoBudget + | "runWithBudget: soft limit reached" `isInfixOf` raw = NoBudget + | otherwise = Unknown raw + enterLogHistory parsed + +delayms :: MilliSeconds -> (MonadCatch m, MonadIO m) => m () +delayms = delay' . (* 1000) . fromMilliSeconds + +delayndt :: NominalDiffTime -> (MonadCatch m, MonadIO m) => m () +delayndt = delay' . round . (* 1000) . (* 1000) . toRational + +delay' :: Int -> (MonadCatch m, MonadIO m) => m () +delay' microsecs = threadDelay microsecs `catch` \AsyncCancelled -> pure () + +burstActions + :: HasCallStack + => ThreadBudgetState + -> LogHistory + -> MilliSeconds + -> NumberOfThreads + -> (MonadIO m) => m () +burstActions tbs logHistory howlong (NumberOfThreads howmany) + = let budgeted = runWithBudget tbs 1 (delayms howlong) + in liftIO . replicateM_ howmany . forkIO $ runReaderT budgeted logHistory + +-- | Start a watcher with given params and a frequency of 10 milliseconds, so we are more +-- likely to find weird race conditions. +mkWatcher :: ThreadBudgetState -> LogHistory -> IO (Async ()) +mkWatcher tbs logHistory = do + mtr <- metrics + async $ runReaderT (watchThreadBudgetState mtr tbs 0.01) logHistory + `catch` \AsyncCancelled -> pure () + + +---------------------------------------------------------------------- +-- TOC + +tests :: TestTree +tests = testGroup "thread budgets" $ + [ testCase "unit test" testThreadBudgets + , testProperty "qc stm (sequential)" propSequential + ] + + +---------------------------------------------------------------------- +-- deterministic unit test + +testThreadBudgets :: Assertion +testThreadBudgets = do + tbs <- mkThreadBudgetState (MaxConcurrentNativePushes (Just 5) (Just 5)) + logHistory :: LogHistory <- newMVar [] + watcher <- mkWatcher tbs logHistory + + flip runReaderT logHistory $ do + burstActions tbs logHistory (MilliSeconds 100) (NumberOfThreads 5) + delayms (MilliSeconds 10) + expectLogHistory null + liftIO $ budgetSpent tbs >>= (@=? 5) + + burstActions tbs logHistory (MilliSeconds 100) (NumberOfThreads 3) + delayms (MilliSeconds 10) + expectLogHistory (== [NoBudget, NoBudget, NoBudget]) + liftIO $ budgetSpent tbs >>= (@=? 5) + + burstActions tbs logHistory (MilliSeconds 100) (NumberOfThreads 3) + delayms (MilliSeconds 10) + expectLogHistory (== [NoBudget, NoBudget, NoBudget]) + liftIO $ budgetSpent tbs >>= (@=? 5) + + delayms (MilliSeconds 80) + + burstActions tbs logHistory (MilliSeconds 100) (NumberOfThreads 3) + delayms (MilliSeconds 10) + expectLogHistory null + liftIO $ budgetSpent tbs >>= (@=? 3) + + burstActions tbs logHistory (MilliSeconds 100) (NumberOfThreads 3) + delayms (MilliSeconds 10) + expectLogHistory (== [NoBudget]) + liftIO $ budgetSpent tbs >>= (@=? 
5) + + cancel watcher + + +---------------------------------------------------------------------- +-- property-based state machine tests + +type State = Reference (Opaque (ThreadBudgetState, Async (), LogHistory)) + +newtype Model r = Model (Maybe (State r)) + deriving (Show, Generic) + +instance ToExpr (Model Symbolic) +instance ToExpr (Model Concrete) + + +data Command r + = Init NumberOfThreads + | Run (State r) NumberOfThreads MilliSeconds + | Wait (State r) MilliSeconds + | Measure (State r) + deriving (Show, Generic, Generic1, Rank2.Functor, Rank2.Foldable, Rank2.Traversable) + +data Response r + = InitResponse (State r) + | RunResponse + | WaitResponse + | MeasureResponse { rspConcreteRunning :: Int } + deriving (Show, Generic, Generic1, Rank2.Functor, Rank2.Foldable, Rank2.Traversable) + + +generator :: HasCallStack => Model Symbolic -> Gen (Command Symbolic) +generator (Model Nothing) = Init <$> arbitrary +generator (Model (Just st)) = + oneof [ Run st <$> arbitrary <*> arbitrary + , Wait st <$> arbitrary + , pure $ Measure st + ] + +shrinker :: HasCallStack => Command Symbolic -> [Command Symbolic] +shrinker (Init _) = [] +shrinker (Run st n m) = Wait st (MilliSeconds 1) : (Run st <$> shrink n <*> shrink m) +shrinker (Wait st n) = Wait st <$> shrink n +shrinker (Measure _) = [] + + +initModel :: HasCallStack => Model r +initModel = Model Nothing + + +semantics :: HasCallStack => Command Concrete -> IO (Response Concrete) +semantics (Init (NumberOfThreads limit)) + = do + tbs <- mkThreadBudgetState (MaxConcurrentNativePushes (Just limit) (Just limit)) + logHistory <- newMVar [] + watcher <- mkWatcher tbs logHistory + pure . InitResponse . reference . Opaque $ (tbs, watcher, logHistory) + +-- 'Run' works asynchronously: start new threads, but return without any time passing. +semantics (Run (opaque -> (tbs, _, logHistory)) howmany howlong) + = do + burstActions tbs logHistory howlong howmany + pure RunResponse + +-- 'Wait' makes time pass, ie. reduces the run time of running threads, and removes the ones +-- that drop below 0. +semantics (Wait _ howlong) + = do + delayms howlong + pure WaitResponse + +-- 'Measure' looks at the concrete state and records it into the model. +semantics (Measure (opaque -> (tbs, _, _))) + = do + rspConcreteRunning <- budgetSpent tbs + pure MeasureResponse{..} + + +transition :: HasCallStack => Model r -> Command r -> Response r -> Model r +transition (Model Nothing) (Init _) (InitResponse st) = Model (Just st) +transition (Model (Just st)) Run{} RunResponse = Model (Just st) +transition (Model (Just st)) Wait{} WaitResponse = Model (Just st) +transition (Model (Just st)) Measure{} MeasureResponse{..} = Model (Just st) +transition _ _ _ = error "impossible." + + +precondition :: HasCallStack => Model Symbolic -> Command Symbolic -> Logic +precondition _ _ = Top + +postcondition :: HasCallStack => Model Concrete -> Command Concrete -> Response Concrete -> Logic +postcondition (Model Nothing) Init{} InitResponse{} = Top +postcondition (Model (Just _)) Run{} RunResponse{} = Top +postcondition (Model (Just _)) Wait{} WaitResponse{} = Top +postcondition model@(Model (Just _)) cmd@Measure{} resp@MeasureResponse{..} + = threadLimitExceeded + where + Model (Just state) = transition model cmd resp + + rspThreadLimit :: Int + rspThreadLimit = case opaque state of + (tbs, _, _) -> tbs ^?! Control.Lens.to threadBudgetLimits . limitHard . _Just + + -- number of running threads is never above the limit. 
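+    -- ('.<=' is the quickcheck-state-machine 'Logic' counterpart of '<=';
+    -- 'Annotate' merely labels the check in counterexample output.)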
+ threadLimitExceeded = Annotate "thread limit exceeded" $ rspConcreteRunning .<= rspThreadLimit + + -- FUTUREWORK: check that the number of running threads matches the model exactly. looks + -- plausible, but when i tried to make the model rich enough to express this test i didn't + -- manage to sort out the timing. + -- syncNumRunning = Annotate "out of sync" $ rspConcreteRunning .== rspModelRunning + +postcondition m c r = error $ "impossible: " <> show (m, c, r) + + +mock :: HasCallStack => Model Symbolic -> Command Symbolic -> GenSym (Response Symbolic) +mock (Model Nothing) (Init _) + = InitResponse <$> genSym + +mock (Model (Just _)) Run{} = pure RunResponse +mock (Model (Just _)) Wait{} = pure WaitResponse +mock (Model (Just _)) Measure{} = pure MeasureResponse{..} + where + rspConcreteRunning = undefined + -- FUTUREWORK: mock is cool because if we do this right, it gives us a quickcheck- + -- validated mock component that we can use in other tests. it appears it's not needed in + -- the tests in this module, though, and we will need to keep track of more of the + -- concrete state in the model if we want to fill in this 'undefined'. + -- + -- See also: https://www.well-typed.com/blog/2019/01/qsm-in-depth/ + +mock badmodel badcmd = error $ "impossible: " <> show (badmodel, badcmd) + + +sm :: StateMachine Model Command IO Response +sm = StateMachine + { STM.initModel = initModel + , STM.transition = transition + , STM.precondition = precondition + , STM.postcondition = postcondition + , STM.invariant = Nothing + , STM.generator = generator + , STM.distribution = Nothing + , STM.shrinker = shrinker + , STM.semantics = semantics + , STM.mock = mock + } + + +-- | Remove resources created by the concrete 'STM.Commands', namely watcher and budgeted +-- async threads. +shutdown :: Model Concrete -> MonadIO m => m () +shutdown (Model Nothing) = pure () +shutdown (Model (Just (opaque -> (tbs, watcher, _)))) = liftIO $ do + cancelAllThreads tbs + cancel watcher + +-- | FUTUREWORK: in this use case of quickcheck-state-machine it may be more interesting to +-- look at fewer, but longer command sequences. +propSequential :: Property +propSequential = forAllCommands sm Nothing $ \cmds -> monadicIO $ do + (hist, model, res) <- runCommands sm cmds + shutdown model + prettyCommands sm hist (checkCommandNames cmds (res === Ok)) diff --git a/stack.yaml b/stack.yaml index a617362e07e..ee6dff9619a 100644 --- a/stack.yaml +++ b/stack.yaml @@ -58,6 +58,7 @@ extra-deps: commit: c0bcbe8ae5bb6fdc0b5b94f640f63a615c068cbf # master (Apr 25, 2019) - git: https://github.com/wireapp/hscim commit: 6b98b894c127eed4a5bde646ebf20febcfa656fa # master (Apr 2, 2019) +- quickcheck-state-machine-0.4.2 flags: types-common: From c4f7eb73da4c42424833e02498d01cbc5fc79363 Mon Sep 17 00:00:00 2001 From: fisx Date: Tue, 22 Oct 2019 15:15:06 +0200 Subject: [PATCH 11/26] Fix detail in swagger docs (#885) Co-Authored-By: Tiago Manuel Ventura Loureiro --- services/galley/src/Galley/API.hs | 1 + 1 file changed, 1 insertion(+) diff --git a/services/galley/src/Galley/API.hs b/services/galley/src/Galley/API.hs index 2b7f698e882..4b304512d9a 100644 --- a/services/galley/src/Galley/API.hs +++ b/services/galley/src/Galley/API.hs @@ -200,6 +200,7 @@ sitemap = do body (ref TeamsModel.teamMemberDelete) $ do optional description "JSON body, required only for binding teams." 
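+      -- (deletion is only scheduled here and completes asynchronously, hence
+      -- the 202 Accepted status rather than 200)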
+      response 202 "Team member scheduled for deletion" end
     errorResponse Error.noTeamMember
     errorResponse (Error.operationDenied RemoveTeamMember)
     errorResponse Error.reAuthFailed

From ff67fc55404bce5af7f416360448084190382b66 Mon Sep 17 00:00:00 2001
From: jschaul
Date: Thu, 24 Oct 2019 12:31:56 +0200
Subject: [PATCH 12/26] add issue template (#889)

---
 .../ISSUE_TEMPLATE/question-installation.md   | 37 +++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 .github/ISSUE_TEMPLATE/question-installation.md

diff --git a/.github/ISSUE_TEMPLATE/question-installation.md b/.github/ISSUE_TEMPLATE/question-installation.md
new file mode 100644
index 00000000000..4becf8d53fd
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/question-installation.md
@@ -0,0 +1,37 @@
+---
+name: Question-Installation
+about: Question about my installation of wire-server
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+* [ ] I have seen https://docs.wire.com/ and https://github.com/wireapp/wire-server-deploy - the documentation there does not answer my question.
+
+## My question:
+
+## Context:
+
+> Please provide sufficient context about your problem:
+
+### How did you install wire-server?
+
+> On kubernetes? With docker-compose? By manually compiling and running?
+
+### How many servers are involved?
+
+### What is installed on which servers?
+
+> E.g. Server A has component X and server B has component Y
+
+### Provide details about networking
+
+> We don't need to know any specific IP address, but it helps if you tell us whether an IP is ipv4 or ipv6, whether it is publicly reachable from the global internet or not, and, if you installed any component of wire-server, which network interfaces its processes are listening on.
+
+### How did you configure wire-server?
+
+> *Note: only the configuration from helm charts in wire-server-deploy is what we support, like [these defaults](https://github.com/wireapp/wire-server-deploy/blob/develop/charts/brig/values.yaml) applied [here](https://github.com/wireapp/wire-server-deploy/blob/develop/charts/brig/templates/configmap.yaml) in the case of `brig`. If you have used other configuration files, please post them (or the relevant parts of them).*
+> Did you use the helm charts from wire-server-deploy?
+> Did you use and adapt configuration files from wire-server? If so, which ones?
+> Are there any overrides?

From 32c605730eff3d9df1bb4d7e2dc5b3e517b3eb00 Mon Sep 17 00:00:00 2001
From: jschaul
Date: Thu, 24 Oct 2019 12:42:04 +0200
Subject: [PATCH 13/26] Update README (#887)

---
 tools/db/migrate-sso-feature-flag/README.md | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/tools/db/migrate-sso-feature-flag/README.md b/tools/db/migrate-sso-feature-flag/README.md
index 9c11ec2cc57..13f1f32628d 100644
--- a/tools/db/migrate-sso-feature-flag/README.md
+++ b/tools/db/migrate-sso-feature-flag/README.md
@@ -1,3 +1,11 @@
 A clone of ../service-backfill/ that enables the SSO feature flag for
-all teams that already have an IdP. See README.md there on how to run
-this.
+all teams that already have an IdP.
+
+Connects to the galley and spar databases.
+
+Set up port-forwarding to the spar and galley databases (hacky, slow, maybe dangerous), or run from a machine with access to those databases (preferred approach). Refer to ../service-backfill/ for an example. 
Then: + +```sh +# assuming local port forwarding cassandra_galley on 2021 and cassandra_spar on 2022: +./dist/migrate-sso-feature-flag --cassandra-host-spar localhost --cassandra-port-spar 2022 --cassandra-keyspace-spar spar --cassandra-host-galley localhost --cassandra-port-galley 2021 --cassandra-keyspace-galley galley +``` From 28f6deee4816d063cff1d20258244f14515fc4f9 Mon Sep 17 00:00:00 2001 From: jschaul Date: Thu, 24 Oct 2019 12:42:40 +0200 Subject: [PATCH 14/26] fix incorrect boolean logic (#890) Currently, we get spammed with these warnings before any traffic happens, due to an incorrect handling of boolean. [gundeck] W, request=N/A, watchThreadBudgetState: total overall thread budget diverged from async weights (repaired). [gundeck] W, request=N/A, watchThreadBudgetState: total overall thread budget diverged from async weights (repaired). [gundeck] W, request=N/A, watchThreadBudgetState: total overall thread budget diverged from async weights (repaired). --- services/gundeck/src/Gundeck/ThreadBudget.hs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/gundeck/src/Gundeck/ThreadBudget.hs b/services/gundeck/src/Gundeck/ThreadBudget.hs index e4085c22860..636ce2a1788 100644 --- a/services/gundeck/src/Gundeck/ThreadBudget.hs +++ b/services/gundeck/src/Gundeck/ThreadBudget.hs @@ -221,8 +221,8 @@ removeStaleHandles ref = do mapM_ waitCatch . join . fmap snd =<< HM.lookup key . bmap <$> readIORef ref unregister ref key - isUnsanitary <- atomicModifyIORef' ref sanitize - when isUnsanitary . LC.warn . LC.msg . LC.val $ + isSanitary <- atomicModifyIORef' ref sanitize + unless isSanitary . LC.warn . LC.msg . LC.val $ "watchThreadBudgetState: total overall thread budget diverged from async weights (repaired)." where From a95becdf5794897ff04489ae819e71f7660f2e84 Mon Sep 17 00:00:00 2001 From: fisx Date: Fri, 25 Oct 2019 10:06:54 +0200 Subject: [PATCH 15/26] benchmarking for O(n) function, remove expensive call from atomicModifyIORef (#891) * Add ad-hoc bench for O(n) function * Fix: do not block ThreadBudgetState for expensive sanity check * move benchmarking code out from production code, do real benchmarks --- services/gundeck/package.yaml | 3 +++ services/gundeck/src/Gundeck/ThreadBudget.hs | 15 ++++++------- services/gundeck/test/bench/Main.hs | 23 ++++++++++++++++++++ 3 files changed, 33 insertions(+), 8 deletions(-) diff --git a/services/gundeck/package.yaml b/services/gundeck/package.yaml index bb00596df9e..6cdac2ce3bd 100644 --- a/services/gundeck/package.yaml +++ b/services/gundeck/package.yaml @@ -224,9 +224,12 @@ benchmarks: - gundeck - gundeck-types - HsOpenSSL + - random - text - types-common - unordered-containers + - time + - uuid flags: static: description: Enable static linking diff --git a/services/gundeck/src/Gundeck/ThreadBudget.hs b/services/gundeck/src/Gundeck/ThreadBudget.hs index 636ce2a1788..7257cb090f8 100644 --- a/services/gundeck/src/Gundeck/ThreadBudget.hs +++ b/services/gundeck/src/Gundeck/ThreadBudget.hs @@ -23,10 +23,14 @@ module Gundeck.ThreadBudget , runWithBudget' , watchThreadBudgetState - -- * for testing + -- * for testing: FUTUREWORK: factor out into a library? , threadBudgetLimits , budgetSpent + , budgetSpent' + , _threadBudgetRunning , cancelAllThreads + , allocate + , BudgetMap ) where import Imports @@ -64,7 +68,7 @@ data BudgetMap = BudgetMap -- counts all the threads that are successfully running (dropping the ones that are just about -- to try to grab a token). 
-- --- WARNING: takes O(n)) and should only be used in testing. +-- WARNING: takes O(n)), use with care. See 'bench_BudgetSpent''. budgetSpent :: ThreadBudgetState -> IO Int budgetSpent (ThreadBudgetState _ running) = budgetSpent' <$> readIORef running @@ -221,7 +225,7 @@ removeStaleHandles ref = do mapM_ waitCatch . join . fmap snd =<< HM.lookup key . bmap <$> readIORef ref unregister ref key - isSanitary <- atomicModifyIORef' ref sanitize + isSanitary <- (\bm -> bspent bm == budgetSpent' bm) <$> readIORef ref unless isSanitary . LC.warn . LC.msg . LC.val $ "watchThreadBudgetState: total overall thread budget diverged from async weights (repaired)." @@ -241,11 +245,6 @@ removeStaleHandles ref = do "spent" LC..= show spent LC.~~ LC.msg ("watchThreadBudgetState: removed " <> show num <> " stale handles.") - -- check that overall budget matches the budget in the indiviual map entries. - sanitize :: BudgetMap -> (BudgetMap, Bool) - sanitize bm@(BudgetMap spent hm) = (BudgetMap spent' hm, spent == spent') - where spent' = budgetSpent' bm - safeForever :: forall m. (MonadIO m, LC.MonadLogger m, MonadCatch m) diff --git a/services/gundeck/test/bench/Main.hs b/services/gundeck/test/bench/Main.hs index 98a3e775285..2d3badd4d79 100644 --- a/services/gundeck/test/bench/Main.hs +++ b/services/gundeck/test/bench/Main.hs @@ -6,17 +6,26 @@ import Data.Id (randomId, ConnId (..), ClientId (..)) import Gundeck.Types.Push import Gundeck.Push.Native.Serialise import Gundeck.Push.Native.Types +import Gundeck.ThreadBudget +import Gundeck.Options import Network.AWS (Region (Ireland)) import OpenSSL (withOpenSSL) +import System.Random (randomRIO) +import Data.UUID.V4 (nextRandom) import qualified Data.Text.Lazy as LT main :: IO () main = withOpenSSL $ do + prepared <- prepareBudgetState (100000) defaultMain [ bgroup "notice" [ bench "32" $ nfIO notice ] + , bgroup "ThreadBudget" + [ bench "budgetSpent'" $ nfIO (bench_BudgetSpent' prepared) + , bench "prepare + budgetSpent'" $ nfIO (bench_BudgetSpent' =<< prepareBudgetState 100000) + ] ] ----------------------------------------------------------------------------- @@ -30,6 +39,11 @@ notice = do Right txt <- serialise msg a return $! 
LT.toStrict txt
 
+bench_BudgetSpent' :: IORef BudgetMap -> IO ()
+bench_BudgetSpent' ref = do
+  budgetmap <- readIORef ref
+  void $ return $ budgetSpent' budgetmap
+
 -----------------------------------------------------------------------------
 -- Utilities
@@ -47,3 +61,12 @@ mkEndpoint :: Transport -> AppName -> EndpointArn
 mkEndpoint t a = mkSnsArn Ireland (Account "test") topic
   where
     topic = mkEndpointTopic (ArnEnv "test") t a (EndpointId "test")
+
+prepareBudgetState :: Int -> IO (IORef BudgetMap)
+prepareBudgetState size = do
+  ref <- _threadBudgetRunning <$> mkThreadBudgetState (MaxConcurrentNativePushes Nothing Nothing)
+  forM_ [1..size] $ \_ -> do
+    key <- nextRandom
+    weight <- randomRIO (1, 1000)
+    allocate ref key weight
+  return ref

From a6ed328e13a97fa90e2efd26c8497d039787300b Mon Sep 17 00:00:00 2001
From: jschaul
Date: Sun, 27 Oct 2019 00:37:13 +0200
Subject: [PATCH 16/26] emit counter metric on soft/hard limit breached (#892)

* emit counter metric on soft/hard limit breached
* Cleanup exports
* More related work
---
 services/gundeck/src/Gundeck/Push.hs          |   3 +-
 services/gundeck/src/Gundeck/ThreadBudget.hs  | 233 +-----------------
 .../src/Gundeck/ThreadBudget/Internal.hs      | 223 +++++++++++++++++
 services/gundeck/test/bench/Main.hs           |   2 +-
 services/gundeck/test/unit/ThreadBudget.hs    |   9 +-
 5 files changed, 236 insertions(+), 234 deletions(-)
 create mode 100644 services/gundeck/src/Gundeck/ThreadBudget/Internal.hs

diff --git a/services/gundeck/src/Gundeck/Push.hs b/services/gundeck/src/Gundeck/Push.hs
index be3fb5f37f6..f3dc5e73d6b 100644
--- a/services/gundeck/src/Gundeck/Push.hs
+++ b/services/gundeck/src/Gundeck/Push.hs
@@ -91,9 +91,10 @@ instance MonadPushAll Gundeck where
 -- | Another layer of wrap around 'runWithBudget'.
 runWithBudget'' :: Int -> a -> Gundeck a -> Gundeck a
 runWithBudget'' budget fallback action = do
+  metrics <- view monitor
   view threadBudgetState >>= \case
     Nothing -> action
-    Just tbs -> runWithBudget' tbs budget fallback action
+    Just tbs -> runWithBudget' metrics tbs budget fallback action
diff --git a/services/gundeck/src/Gundeck/ThreadBudget.hs b/services/gundeck/src/Gundeck/ThreadBudget.hs
index 7257cb090f8..d7aabfe8b4d 100644
--- a/services/gundeck/src/Gundeck/ThreadBudget.hs
+++ b/services/gundeck/src/Gundeck/ThreadBudget.hs
@@ -3,6 +3,10 @@
 -- run-time of the actions into account, not just the number of executions.
 -- http://hackage.haskell.org/package/rate-limit also looks related.
 --
+-- FUTUREWORK: https://github.com/layer-3-communications/lockpool seems like almost exactly
+-- the same thing, but I only found this after ThreadBudget was done.  Before turning
+-- ThreadBudget into a standalone package, take a closer look at lockpool!
+--
 -- USE CASE: keep a lid on stalled native push notification threads.  if SNS is up, there
 -- will be many short-running executions of the action.  when SNS is down, the threads will
 -- accumulate in memory and choke the gundeck instances.  so we want to stop spawning more
@@ -22,233 +26,6 @@ module Gundeck.ThreadBudget
   , runWithBudget
   , runWithBudget'
   , watchThreadBudgetState
-
-    -- * for testing: FUTUREWORK: factor out into a library? 
- , threadBudgetLimits - , budgetSpent - , budgetSpent' - , _threadBudgetRunning - , cancelAllThreads - , allocate - , BudgetMap ) where -import Imports - -import Control.Exception.Safe (catchAny) -import Control.Lens -import Control.Monad.Catch (MonadCatch) -import Data.Metrics (Metrics) -import Data.Metrics.Middleware (gaugeSet, path) -import Data.Time -import Data.UUID (UUID, toText) -import Data.UUID.V4 (nextRandom) -import Gundeck.Options -import UnliftIO.Async -import UnliftIO.Exception (finally) - -import qualified Data.Set as Set -import qualified Data.HashMap.Strict as HM -import qualified System.Logger.Class as LC - - -data ThreadBudgetState = ThreadBudgetState - { threadBudgetLimits :: MaxConcurrentNativePushes - , _threadBudgetRunning :: IORef BudgetMap - } deriving (Generic) - --- | Store all handles for cleanup in 'watchThreadBudgetState'. -data BudgetMap = BudgetMap - { bspent :: Int - , bmap :: HashMap UUID (Int, Maybe (Async ())) - } - deriving (Eq, Generic) - --- | Instead of taking the pre-computed total budget spent of the 'BudgetMap' (O(1)), this --- counts all the threads that are successfully running (dropping the ones that are just about --- to try to grab a token). --- --- WARNING: takes O(n)), use with care. See 'bench_BudgetSpent''. -budgetSpent :: ThreadBudgetState -> IO Int -budgetSpent (ThreadBudgetState _ running) = budgetSpent' <$> readIORef running - -budgetSpent' :: BudgetMap -> Int -budgetSpent' = sum . fmap fst . filter (isJust . snd) . HM.elems . bmap - -cancelAllThreads :: ThreadBudgetState -> IO () -cancelAllThreads (ThreadBudgetState _ ref) = readIORef ref - >>= mapM_ cancel . catMaybes . fmap snd . HM.elems . bmap - -mkThreadBudgetState :: HasCallStack => MaxConcurrentNativePushes -> IO ThreadBudgetState -mkThreadBudgetState limits = ThreadBudgetState limits <$> newIORef (BudgetMap 0 HM.empty) - - --- | Allocate the resources for a new action to be called (but don't call the action yet). -allocate - :: IORef BudgetMap -> UUID -> Int -> MonadIO m => m Int -allocate ref key newspent - = atomicModifyIORef' ref $ - \(BudgetMap spent hm) -> - ( BudgetMap (spent + newspent) (HM.insert key (newspent, Nothing) hm) - , spent - ) - --- | Register an already-allocated action with its 'Async'. -register - :: IORef BudgetMap -> UUID -> Async () -> MonadIO m => m Int -register ref key handle - = atomicModifyIORef' ref $ - \(BudgetMap spent hm) -> - ( BudgetMap spent (HM.adjust (_2 .~ Just handle) key hm) - , spent - ) - --- | Remove an registered and/or allocated action from a 'BudgetMap'. -unregister - :: IORef BudgetMap -> UUID -> MonadIO m => m () -unregister ref key - = atomicModifyIORef' ref $ - \bhm@(BudgetMap spent hm) -> - case HM.lookup key hm of - Just (newspent, _) -> (BudgetMap (spent - newspent) (HM.delete key hm), ()) - Nothing -> (bhm, ()) - - --- | If there is budget available, execute the action synchronously; otherwise, log a warning --- and return immediately. Make sure the budget state is updated accordingly both when --- starting and ending the execution. --- --- The hard limit in the 'ThreadBudgetState' argument is guaranteed to be an upper bound for --- the number of concurrently spent budget tokens; surpassing the soft limit will trigger a --- warning, but still execute the action. One action can use up any integer number of budget --- tokens, including 0 and negative. --- --- The action is called in an 'Async', but 'runWithBudget' waits for it to finish so it can --- update the budget. -runWithBudget - :: forall m. 
(MonadIO m, LC.MonadLogger m, MonadUnliftIO m) - => ThreadBudgetState -> Int -> m () -> m () -runWithBudget tbs spent = runWithBudget' tbs spent () - --- | More flexible variant of 'runWithBudget' that allows the action to return a value. With --- a default in case of budget exhaustion. -runWithBudget' - :: forall m a. (MonadIO m, LC.MonadLogger m, MonadUnliftIO m) - => ThreadBudgetState -> Int -> a -> m a -> m a -runWithBudget' (ThreadBudgetState limits ref) spent fallback action = do - key <- liftIO nextRandom - (`finally` unregister ref key) $ do - oldsize <- allocate ref key spent - - warnNoBudget (maybe False (oldsize >=) (limits ^. limitSoft)) - (maybe False (oldsize >=) (limits ^. limitHard)) - oldsize - - if (maybe True (oldsize <) (limits ^. limitHard)) - then go key oldsize - else pure fallback - where - go :: UUID -> Int -> m a - go key oldsize = do - LC.debug $ - "key" LC..= (toText key) LC.~~ - "spent" LC..= oldsize LC.~~ - LC.msg (LC.val "runWithBudget: go") - - handle <- async action - _ <- register ref key (const () <$> handle) - wait handle - - -- iff soft and/or hard limit are breached, log a warning-level message. - warnNoBudget :: Bool -> Bool -> Int -> m () - warnNoBudget False False _ = pure () - warnNoBudget soft hard oldsize = do - let limit = if hard then "hard" else "soft" - LC.warn $ - "spent" LC..= show oldsize LC.~~ - "soft-breach" LC..= soft LC.~~ - "hard-breach" LC..= hard LC.~~ - LC.msg (LC.val "runWithBudget: " <> limit <> " limit reached") - - --- | Fork a thread that checks with the given frequency if any async handles stored in the --- state are stale (ie., have terminated with or without exception, but not been removed). If --- that happens, log a warning. --- --- 'runWithBudget' should keep track of the state itself; 'watchThreadBudgetState' is solely a --- safety precaution to see if there aren't any corner cases we missed. --- --- Also, issue some metrics. -watchThreadBudgetState - :: forall m. (MonadIO m, LC.MonadLogger m, MonadCatch m) - => Metrics -> ThreadBudgetState -> NominalDiffTime -> m () -watchThreadBudgetState metrics (ThreadBudgetState limits ref) freq = safeForever $ do - recordMetrics metrics limits ref - removeStaleHandles ref - threadDelayNominalDiffTime freq - -recordMetrics - :: forall m. (MonadIO m, LC.MonadLogger m, MonadCatch m) - => Metrics -> MaxConcurrentNativePushes -> IORef BudgetMap -> m () -recordMetrics metrics limits ref = do - (BudgetMap spent _) <- readIORef ref - gaugeSet (fromIntegral spent) (path "net.nativepush.thread_budget_allocated") metrics - forM_ (limits ^. limitHard) $ \lim -> - gaugeSet (fromIntegral lim) (path "net.nativepush.thread_budget_hard_limit") metrics - forM_ (limits ^. limitSoft) $ \lim -> - gaugeSet (fromIntegral lim) (path "net.nativepush.thread_budget_soft_limit") metrics - -threadDelayNominalDiffTime :: NominalDiffTime -> MonadIO m => m () -threadDelayNominalDiffTime = threadDelay . round . (* 1000000) . toRational - -staleTolerance :: NominalDiffTime -staleTolerance = 3 - --- | Get all handles for asyncs that have terminated, but not been removed from the state. Do --- that again after 'staleTolerance' to make sure that we don't catch any handles that would --- have been removed during the 'runWithBudget' cleanup, but we were faster. The intersection --- between the two rounds constitutes the legitimately stale handles: warn about them, and --- then remove them from the budgetmap. -removeStaleHandles - :: forall m. 
(MonadIO m, LC.MonadLogger m, MonadCatch m)
-  => IORef BudgetMap -> m ()
-removeStaleHandles ref = do
-  round1 <- getStaleHandles
-  threadDelayNominalDiffTime staleTolerance
-  round2 <- getStaleHandles
-
-  let staleHandles = Set.intersection round1 round2
-
-  unless (null staleHandles) $ do
-    warnStaleHandles (Set.size staleHandles) =<< readIORef ref
-    forM_ staleHandles $ \key -> do
-      mapM_ waitCatch . join . fmap snd =<< HM.lookup key . bmap <$> readIORef ref
-      unregister ref key
-
-  isSanitary <- (\bm -> bspent bm == budgetSpent' bm) <$> readIORef ref
-  unless isSanitary . LC.warn . LC.msg . LC.val $
-    "watchThreadBudgetState: total overall thread budget diverged from async weights (repaired)."
-
-  where
-    getStaleHandles :: m (Set UUID)
-    getStaleHandles = Set.fromList . mconcat <$> do
-      handles <- HM.toList . bmap <$> readIORef ref
-      forM handles $ \case
-        (_, (_, Nothing)) -> do
-          pure []
-        (key, (_, Just handle)) -> do
-          status <- poll handle
-          pure [key | isJust status]
-
-    warnStaleHandles :: Int -> BudgetMap -> m ()
-    warnStaleHandles num (BudgetMap spent _) = LC.warn $
-      "spent" LC..= show spent
-      LC.~~ LC.msg ("watchThreadBudgetState: removed " <> show num <> " stale handles.")
-
-
-safeForever
-  :: forall m. (MonadIO m, LC.MonadLogger m, MonadCatch m)
-  => m () -> m ()
-safeForever action = forever $ action `catchAny` \exc -> do
-  LC.err $ "error" LC..= show exc LC.~~ LC.msg (LC.val "watchThreadBudgetState: crashed; retrying")
-  threadDelay 60000000  -- pause to keep worst-case noise in logs manageable
+import Gundeck.ThreadBudget.Internal
diff --git a/services/gundeck/src/Gundeck/ThreadBudget/Internal.hs b/services/gundeck/src/Gundeck/ThreadBudget/Internal.hs
new file mode 100644
index 00000000000..b8bfa822b78
--- /dev/null
+++ b/services/gundeck/src/Gundeck/ThreadBudget/Internal.hs
@@ -0,0 +1,223 @@
+module Gundeck.ThreadBudget.Internal where
+
+import Imports
+
+import Control.Exception.Safe (catchAny)
+import Control.Lens
+import Control.Monad.Catch (MonadCatch)
+import Data.Metrics (Metrics, counterIncr)
+import Data.Metrics.Middleware (gaugeSet, path)
+import Data.Time
+import Data.UUID (UUID, toText)
+import Data.UUID.V4 (nextRandom)
+import Gundeck.Options
+import UnliftIO.Async
+import UnliftIO.Exception (finally)
+
+import qualified Data.Set as Set
+import qualified Data.HashMap.Strict as HM
+import qualified System.Logger.Class as LC
+
+
+data ThreadBudgetState = ThreadBudgetState
+  { threadBudgetLimits :: MaxConcurrentNativePushes
+  , _threadBudgetRunning :: IORef BudgetMap
+  } deriving (Generic)
+
+-- | Store all handles for cleanup in 'watchThreadBudgetState'.
+data BudgetMap = BudgetMap
+  { bspent :: Int
+  , bmap :: HashMap UUID (Int, Maybe (Async ()))
+  }
+  deriving (Eq, Generic)
+
+-- | Instead of taking the pre-computed total budget spent of the 'BudgetMap' (O(1)), this
+-- counts all the threads that are successfully running (dropping the ones that are just about
+-- to try to grab a token).
+--
+-- WARNING: takes O(n), use with care. See 'bench_BudgetSpent''.
+budgetSpent :: ThreadBudgetState -> IO Int
+budgetSpent (ThreadBudgetState _ running) = budgetSpent' <$> readIORef running
+
+budgetSpent' :: BudgetMap -> Int
+budgetSpent' = sum . fmap fst . filter (isJust . snd) . HM.elems . bmap
+
+cancelAllThreads :: ThreadBudgetState -> IO ()
+cancelAllThreads (ThreadBudgetState _ ref) = readIORef ref
+  >>= mapM_ cancel . catMaybes . fmap snd . HM.elems . 
bmap
+
+mkThreadBudgetState :: HasCallStack => MaxConcurrentNativePushes -> IO ThreadBudgetState
+mkThreadBudgetState limits = ThreadBudgetState limits <$> newIORef (BudgetMap 0 HM.empty)
+
+
+-- | Allocate the resources for a new action to be called (but don't call the action yet).
+allocate
+  :: IORef BudgetMap -> UUID -> Int -> MonadIO m => m Int
+allocate ref key newspent
+  = atomicModifyIORef' ref $
+      \(BudgetMap spent hm) ->
+        ( BudgetMap (spent + newspent) (HM.insert key (newspent, Nothing) hm)
+        , spent
+        )
+
+-- | Register an already-allocated action with its 'Async'.
+register
+  :: IORef BudgetMap -> UUID -> Async () -> MonadIO m => m Int
+register ref key handle
+  = atomicModifyIORef' ref $
+      \(BudgetMap spent hm) ->
+        ( BudgetMap spent (HM.adjust (_2 .~ Just handle) key hm)
+        , spent
+        )
+
+-- | Remove a registered and/or allocated action from a 'BudgetMap'.
+unregister
+  :: IORef BudgetMap -> UUID -> MonadIO m => m ()
+unregister ref key
+  = atomicModifyIORef' ref $
+      \bhm@(BudgetMap spent hm) ->
+        case HM.lookup key hm of
+          Just (newspent, _) -> (BudgetMap (spent - newspent) (HM.delete key hm), ())
+          Nothing -> (bhm, ())
+
+
+-- | If there is budget available, execute the action synchronously; otherwise, log a warning
+-- and return immediately. Make sure the budget state is updated accordingly both when
+-- starting and ending the execution.
+--
+-- The hard limit in the 'ThreadBudgetState' argument is guaranteed to be an upper bound for
+-- the number of concurrently spent budget tokens; surpassing the soft limit will trigger a
+-- warning, but still execute the action. One action can use up any integer number of budget
+-- tokens, including 0 and negative.
+--
+-- The action is called in an 'Async', but 'runWithBudget' waits for it to finish so it can
+-- update the budget.
+runWithBudget
+  :: forall m. (MonadIO m, LC.MonadLogger m, MonadUnliftIO m)
+  => Metrics -> ThreadBudgetState -> Int -> m () -> m ()
+runWithBudget metrics tbs spent = runWithBudget' metrics tbs spent ()
+
+-- | More flexible variant of 'runWithBudget' that allows the action to return a value. With
+-- a default in case of budget exhaustion.
+runWithBudget'
+  :: forall m a. (MonadIO m, LC.MonadLogger m, MonadUnliftIO m)
+  => Metrics -> ThreadBudgetState -> Int -> a -> m a -> m a
+runWithBudget' metrics (ThreadBudgetState limits ref) spent fallback action = do
+  key <- liftIO nextRandom
+  (`finally` unregister ref key) $ do
+    oldsize <- allocate ref key spent
+
+    let softLimitBreached = maybe False (oldsize >=) (limits ^. limitSoft)
+        hardLimitBreached = maybe False (oldsize >=) (limits ^. limitHard)
+    warnNoBudget softLimitBreached hardLimitBreached oldsize
+
+    if (maybe True (oldsize <) (limits ^. limitHard))
+      then go key oldsize
+      else pure fallback
+  where
+    go :: UUID -> Int -> m a
+    go key oldsize = do
+      LC.debug $
+        "key" LC..= (toText key) LC.~~
+        "spent" LC..= oldsize LC.~~
+        LC.msg (LC.val "runWithBudget: go")
+
+      handle <- async action
+      _ <- register ref key (const () <$> handle)
+      wait handle
+
+    -- iff soft and/or hard limit are breached, log a warning-level message.
+    warnNoBudget :: Bool -> Bool -> Int -> m ()
+    warnNoBudget False False _ = pure ()
+    warnNoBudget soft hard oldsize = do
+      let limit = if hard then "hard" else "soft"
+          metric = "net.nativepush." 
<> limit <> "_limit_breached"
+      counterIncr (path metric) metrics
+      LC.warn $
+        "spent" LC..= show oldsize LC.~~
+        "soft-breach" LC..= soft LC.~~
+        "hard-breach" LC..= hard LC.~~
+        LC.msg ("runWithBudget: " <> limit <> " limit reached")
+
+
+-- | Fork a thread that checks with the given frequency if any async handles stored in the
+-- state are stale (i.e., have terminated with or without exception, but not been removed). If
+-- that happens, log a warning.
+--
+-- 'runWithBudget' should keep track of the state itself; 'watchThreadBudgetState' is solely a
+-- safety precaution to see if there aren't any corner cases we missed.
+--
+-- Also, issue some metrics.
+watchThreadBudgetState
+  :: forall m. (MonadIO m, LC.MonadLogger m, MonadCatch m)
+  => Metrics -> ThreadBudgetState -> NominalDiffTime -> m ()
+watchThreadBudgetState metrics (ThreadBudgetState limits ref) freq = safeForever $ do
+  recordMetrics metrics limits ref
+  removeStaleHandles ref
+  threadDelayNominalDiffTime freq
+
+recordMetrics
+  :: forall m. (MonadIO m, LC.MonadLogger m, MonadCatch m)
+  => Metrics -> MaxConcurrentNativePushes -> IORef BudgetMap -> m ()
+recordMetrics metrics limits ref = do
+  (BudgetMap spent _) <- readIORef ref
+  gaugeSet (fromIntegral spent) (path "net.nativepush.thread_budget_allocated") metrics
+  forM_ (limits ^. limitHard) $ \lim ->
+    gaugeSet (fromIntegral lim) (path "net.nativepush.thread_budget_hard_limit") metrics
+  forM_ (limits ^. limitSoft) $ \lim ->
+    gaugeSet (fromIntegral lim) (path "net.nativepush.thread_budget_soft_limit") metrics
+
+threadDelayNominalDiffTime :: NominalDiffTime -> MonadIO m => m ()
+threadDelayNominalDiffTime = threadDelay . round . (* 1000000) . toRational
+
+staleTolerance :: NominalDiffTime
+staleTolerance = 3
+
+-- | Get all handles for asyncs that have terminated, but not been removed from the state. Do
+-- that again after 'staleTolerance' to make sure that we don't catch any handles that would
+-- have been removed during the 'runWithBudget' cleanup, but we were faster. The intersection
+-- between the two rounds constitutes the legitimately stale handles: warn about them, and
+-- then remove them from the budgetmap.
+removeStaleHandles
+  :: forall m. (MonadIO m, LC.MonadLogger m, MonadCatch m)
+  => IORef BudgetMap -> m ()
+removeStaleHandles ref = do
+  round1 <- getStaleHandles
+  threadDelayNominalDiffTime staleTolerance
+  round2 <- getStaleHandles
+
+  let staleHandles = Set.intersection round1 round2
+
+  unless (null staleHandles) $ do
+    warnStaleHandles (Set.size staleHandles) =<< readIORef ref
+    forM_ staleHandles $ \key -> do
+      mapM_ waitCatch . join . fmap snd =<< HM.lookup key . bmap <$> readIORef ref
+      unregister ref key
+
+  isSanitary <- (\bm -> bspent bm == budgetSpent' bm) <$> readIORef ref
+  unless isSanitary . LC.warn . LC.msg . LC.val $
+    "watchThreadBudgetState: total overall thread budget diverged from async weights (repaired)."
+
+  where
+    getStaleHandles :: m (Set UUID)
+    getStaleHandles = Set.fromList . mconcat <$> do
+      handles <- HM.toList . bmap <$> readIORef ref
+      forM handles $ \case
+        (_, (_, Nothing)) -> do
+          pure []
+        (key, (_, Just handle)) -> do
+          status <- poll handle
+          pure [key | isJust status]
+
+    warnStaleHandles :: Int -> BudgetMap -> m ()
+    warnStaleHandles num (BudgetMap spent _) = LC.warn $
+      "spent" LC..= show spent
+      LC.~~ LC.msg ("watchThreadBudgetState: removed " <> show num <> " stale handles.")
+
+
+safeForever
+  :: forall m. 
(MonadIO m, LC.MonadLogger m, MonadCatch m)
+  => m () -> m ()
+safeForever action = forever $ action `catchAny` \exc -> do
+  LC.err $ "error" LC..= show exc LC.~~ LC.msg (LC.val "watchThreadBudgetState: crashed; retrying")
+  threadDelay 60000000  -- pause to keep worst-case noise in logs manageable
diff --git a/services/gundeck/test/bench/Main.hs b/services/gundeck/test/bench/Main.hs
index 2d3badd4d79..712293407ff 100644
--- a/services/gundeck/test/bench/Main.hs
+++ b/services/gundeck/test/bench/Main.hs
@@ -6,7 +6,7 @@ import Data.Id (randomId, ConnId (..), ClientId (..))
 import Gundeck.Types.Push
 import Gundeck.Push.Native.Serialise
 import Gundeck.Push.Native.Types
-import Gundeck.ThreadBudget
+import Gundeck.ThreadBudget.Internal
 import Gundeck.Options
 import Network.AWS (Region (Ireland))
 import OpenSSL (withOpenSSL)
diff --git a/services/gundeck/test/unit/ThreadBudget.hs b/services/gundeck/test/unit/ThreadBudget.hs
index 43c4beee6d0..4f135866271 100644
--- a/services/gundeck/test/unit/ThreadBudget.hs
+++ b/services/gundeck/test/unit/ThreadBudget.hs
@@ -15,7 +15,7 @@ import Data.String.Conversions (cs)
 import Data.Time
 import Data.TreeDiff.Class (ToExpr)
 import GHC.Generics
-import Gundeck.ThreadBudget
+import Gundeck.ThreadBudget.Internal
 import Gundeck.Options
 import Test.QuickCheck
 import Test.QuickCheck.Monadic
@@ -111,9 +111,10 @@ burstActions
   -> MilliSeconds
   -> NumberOfThreads
   -> (MonadIO m) => m ()
-burstActions tbs logHistory howlong (NumberOfThreads howmany)
-    = let budgeted = runWithBudget tbs 1 (delayms howlong)
-      in liftIO . replicateM_ howmany . forkIO $ runReaderT budgeted logHistory
+burstActions tbs logHistory howlong (NumberOfThreads howmany) = do
+    mtr <- metrics
+    let budgeted = runWithBudget mtr tbs 1 (delayms howlong)
+    liftIO . replicateM_ howmany . forkIO $ runReaderT budgeted logHistory
 
 -- | Start a watcher with given params and a frequency of 10 milliseconds, so we are more
 -- likely to find weird race conditions.

From 51b25f7df01f79d2775de3e3f75dbcb3242e1f90 Mon Sep 17 00:00:00 2001
From: jschaul
Date: Mon, 4 Nov 2019 16:40:17 +0100
Subject: [PATCH 17/26] Do not send events to team members to be deleted (#897)

Events about deleted conversations do not need to be sent to everyone
when a team is deleted. Indeed, it is harmful in that it leads to
performance problems: For large teams with many conversations,
notifying every team member about the fact that every conversation has
been deleted creates a lot of events. None of these events are useful,
since on a team deletion, the accounts of every user are deleted, i.e.
in practice these events cannot be read.

This PR fixes the bug that was introduced in #849
---
 services/galley/src/Galley/API/Teams.hs       | 7 +++++--
 services/galley/test/integration/API/Teams.hs | 3 +--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/services/galley/src/Galley/API/Teams.hs b/services/galley/src/Galley/API/Teams.hs
index b90c7207500..29aace4ca78 100644
--- a/services/galley/src/Galley/API/Teams.hs
+++ b/services/galley/src/Galley/API/Teams.hs
@@ -214,9 +214,12 @@ uncheckedDeleteTeam zusr zcon tid = do
       -> Galley ([Push],[(BotMember, Conv.Event)])
     pushConvDeleteEvents now teamMembs c (pp, ee) = do
       (bots, convMembs) <- botsAndUsers <$> Data.members (c ^. 
conversationId) - let mm = convMembsAndTeamMembs convMembs teamMembs + -- Only nonTeamMembers need to get any events, since on team deletion, + -- all team users are deleted immediately after these events are sent + -- and will thus never be able to see these events in practice. + let mm = nonTeamMembers convMembs teamMembs let e = Conv.Event Conv.ConvDelete (c ^. conversationId) zusr now Nothing - let p = newPush zusr (ConvEvent e) mm + let p = newPush zusr (ConvEvent e) (map recipient mm) let ee' = bots `zip` repeat e let pp' = maybe pp (\x -> (x & pushConn .~ zcon) : pp) p pure (pp', ee' ++ ee) diff --git a/services/galley/test/integration/API/Teams.hs b/services/galley/test/integration/API/Teams.hs index 341b7ff5023..84b6f634b9f 100644 --- a/services/galley/test/integration/API/Teams.hs +++ b/services/galley/test/integration/API/Teams.hs @@ -598,8 +598,7 @@ testDeleteTeam = do const 202 === statusCode checkTeamDeleteEvent tid wsOwner checkTeamDeleteEvent tid wsMember - checkConvDeleteEvent cid1 wsOwner - checkConvDeleteEvent cid1 wsMember + -- team members should not receive conversation delete events checkConvDeleteEvent cid1 wsExtern WS.assertNoEvent timeout [wsOwner, wsExtern, wsMember] From 268520cbdeb8e64b68ddedfd273dbe35a16ab6e6 Mon Sep 17 00:00:00 2001 From: Serge Bazanski Date: Mon, 4 Nov 2019 18:18:53 +0100 Subject: [PATCH 18/26] http-client: bump to wire-20191104 --- snapshots/wire-1.4.yaml | 12 ++++++++++++ stack.yaml | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 snapshots/wire-1.4.yaml diff --git a/snapshots/wire-1.4.yaml b/snapshots/wire-1.4.yaml new file mode 100644 index 00000000000..4e83c24eced --- /dev/null +++ b/snapshots/wire-1.4.yaml @@ -0,0 +1,12 @@ +# DO NOT MODIFY THIS FILE. See README.md to learn why. + +resolver: https://raw.githubusercontent.com/wireapp/wire-server/develop/snapshots/wire-1.3.yaml +name: wire-1.4 + +packages: +- archive: https://github.com/wireapp/http-client/archive/wire-2019-11-04.tar.gz + subdirs: + - http-client + - http-client-openssl + - http-client-tls + - http-conduit diff --git a/stack.yaml b/stack.yaml index ee6dff9619a..c90fe32dd0b 100644 --- a/stack.yaml +++ b/stack.yaml @@ -1,4 +1,4 @@ -resolver: snapshots/wire-1.3.yaml +resolver: snapshots/wire-1.4.yaml packages: - libs/api-bot From 13ff5a76a114f3b073e6d2841e9b4535862c31fa Mon Sep 17 00:00:00 2001 From: Serge Bazanski Date: Mon, 4 Nov 2019 18:40:05 +0100 Subject: [PATCH 19/26] snapshots/wire-1.4.yaml: comment --- snapshots/wire-1.4.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/snapshots/wire-1.4.yaml b/snapshots/wire-1.4.yaml index 4e83c24eced..82e4d23959e 100644 --- a/snapshots/wire-1.4.yaml +++ b/snapshots/wire-1.4.yaml @@ -4,6 +4,14 @@ resolver: https://raw.githubusercontent.com/wireapp/wire-server/develop/snapshot name: wire-1.4 packages: + # http-client forked by wire, commit 032b6503ab0c47f8f85bf48e0beb1f895a95bb27 + # Contains patches on top of http-client-openssl-0.2.2.0: + # - 89136497b8e0fa0624c1451883eb011347203532 + # - 916b04313c6864e02ebed4278b43b971189c61cd + # - 64ebec4fe7b48c131b7d6f0f8d7a6c6cacae70e6 + # - 78ebeead1a2efb17c55fb72d2d0041295d1271b8 + # - 032b6503ab0c47f8f85bf48e0beb1f895a95bb27 + # These provide a hacky way to implement TLS ceritifcate pinning. 
- archive: https://github.com/wireapp/http-client/archive/wire-2019-11-04.tar.gz subdirs: - http-client From 2f314e6341188c59644ec7e8ec2e51bd3c8f01cb Mon Sep 17 00:00:00 2001 From: jschaul Date: Tue, 5 Nov 2019 14:06:37 +0100 Subject: [PATCH 20/26] gundeck: cleanup of deprecated code (#894) Follow-up from #549 --- services/gundeck/src/Gundeck/API.hs | 14 -------------- services/gundeck/src/Gundeck/Client.hs | 10 ++-------- 2 files changed, 2 insertions(+), 22 deletions(-) diff --git a/services/gundeck/src/Gundeck/API.hs b/services/gundeck/src/Gundeck/API.hs index 8f9cc5d31a5..78b8b33dc4b 100644 --- a/services/gundeck/src/Gundeck/API.hs +++ b/services/gundeck/src/Gundeck/API.hs @@ -64,11 +64,6 @@ sitemap = do returns (ref Model.pushTokenList) response 200 "Object containing list of push tokens" end - post "/i/push" (continue Push.push) $ - request .&. accept "application" "json" - -- TODO: REFACTOR: this end-point is probably noise, and should be dropped. @/i/push/v2@ does exactly - -- the same thing. - post "/i/push/v2" (continue Push.push) $ request .&. accept "application" "json" @@ -143,15 +138,6 @@ sitemap = do -- User-Client API ------------------------------------------------------- - -- DEPRECATED: this is deprecated as of https://github.com/wireapp/wire-server/pull/549 (can be - -- removed once brig is deployed everywhere and won't trip over this missing any more.) - put "/i/clients/:cid" (continue Client.register) $ - header "Z-User" - .&. param "cid" - .&. request - .&. contentType "application" "json" - .&. accept "application" "json" - delete "/i/clients/:cid" (continue Client.unregister) $ header "Z-User" .&. param "cid" diff --git a/services/gundeck/src/Gundeck/Client.hs b/services/gundeck/src/Gundeck/Client.hs index cd110218c28..8b783d52ff4 100644 --- a/services/gundeck/src/Gundeck/Client.hs +++ b/services/gundeck/src/Gundeck/Client.hs @@ -1,6 +1,5 @@ module Gundeck.Client - ( register - , unregister + ( unregister , removeUser ) where @@ -10,17 +9,12 @@ import Data.Id import Data.Predicate import Gundeck.Monad import Gundeck.Push.Native -import Gundeck.Util -import Network.Wai (Request, Response) +import Network.Wai (Response) import Network.Wai.Utilities import qualified Gundeck.Notification.Data as Notifications import qualified Gundeck.Push.Data as Push --- | DEPRECATED: remove once brig is upgraded everywhere. -register :: UserId ::: ClientId ::: Request ::: JSON ::: JSON -> Gundeck Response -register (_uid ::: _cid ::: _req ::: _) = return empty - unregister :: UserId ::: ClientId -> Gundeck Response unregister (uid ::: cid) = do toks <- filter byClient <$> Push.lookup uid Push.Quorum From c2a79607b7a8bae340e3ff415e22a0e1781e0d80 Mon Sep 17 00:00:00 2001 From: Serge Bazanski Date: Tue, 5 Nov 2019 11:09:38 +0100 Subject: [PATCH 21/26] snapshots/wire-1.4.yaml: typo --- snapshots/wire-1.4.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snapshots/wire-1.4.yaml b/snapshots/wire-1.4.yaml index 82e4d23959e..023c078c9ba 100644 --- a/snapshots/wire-1.4.yaml +++ b/snapshots/wire-1.4.yaml @@ -11,7 +11,7 @@ packages: # - 64ebec4fe7b48c131b7d6f0f8d7a6c6cacae70e6 # - 78ebeead1a2efb17c55fb72d2d0041295d1271b8 # - 032b6503ab0c47f8f85bf48e0beb1f895a95bb27 - # These provide a hacky way to implement TLS ceritifcate pinning. + # These provide a hacky way to implement TLS certificate pinning. 
- archive: https://github.com/wireapp/http-client/archive/wire-2019-11-04.tar.gz subdirs: - http-client From 407a8aef4f3e51423269ab92ba811886273d1336 Mon Sep 17 00:00:00 2001 From: Arian van Putten Date: Wed, 6 Nov 2019 12:06:51 +0100 Subject: [PATCH 22/26] Pin nixpkgs (#898) --- stack-deps.nix | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/stack-deps.nix b/stack-deps.nix index 815b2462449..945c6778e77 100644 --- a/stack-deps.nix +++ b/stack-deps.nix @@ -1,5 +1,22 @@ -{ pkgs ? import {} }: let + # Pin nixpkgs for all dependencies. + # If you want to update. + # 1. go to https://nixos.org/channels/nixos-19.09 + # 2. copy the URL to nixexprs.tar.gz and the sha256 hash + # 3. Uncomment the sha256 = 00000 field + # 4. nix-build + # 5. Make nix complain to you what the correct hash is. + # 6. comment sha256 = 0000 and add sha256 = + # 7. nix-build + # 8. commit + # TODO(arianvp): There are tools that automate this; we should use them + pkgsTar = builtins.fetchTarball { + name = "nixos-1909"; + url = "https://releases.nixos.org/nixos/19.09/nixos-19.09.1019.c5aabb0d603/nixexprs.tar.xz"; + sha256 = "1hjw843g964aj9cd9p6x5473yy4sfmqnqlvavc5c1lbqa8v676zg"; + # sha256 = "0000000000000000000000000000000000000000000000000000"; + }; + pkgs = import pkgsTar {}; cryptobox-c = pkgs.callPackage ({fetchFromGitHub, rustPlatform, pkgconfig, libsodium}: rustPlatform.buildRustPackage rec { name = "cryptobox-c-${version}"; From 80f1adc8c2ffcde81eb6d118ff222b1006cd649a Mon Sep 17 00:00:00 2001 From: jschaul Date: Wed, 6 Nov 2019 12:42:41 +0100 Subject: [PATCH 23/26] throttle team deletion (#900) * throttle galley->gundeck requests on team deletion by sending events slower (configurable, optional, slow/cautious defaults) * throttle user deletions (brig) by waiting in between each user deletion (configurable, optional, default: 100ms) --- services/brig/brig.integration.yaml | 1 + .../brig/src/Brig/InternalEvent/Process.hs | 7 +++++ services/brig/src/Brig/Options.hs | 6 ++++ services/galley/galley.integration.yaml | 3 ++ services/galley/src/Galley/API/Teams.hs | 29 +++++++++++++++---- services/galley/src/Galley/Options.hs | 22 ++++++++++---- 6 files changed, 57 insertions(+), 11 deletions(-) diff --git a/services/brig/brig.integration.yaml b/services/brig/brig.integration.yaml index ca5d4b37808..605a2705cc0 100644 --- a/services/brig/brig.integration.yaml +++ b/services/brig/brig.integration.yaml @@ -150,6 +150,7 @@ optSettings: setEmailVisibility: visible_to_self setPropertyMaxKeyLen: 1024 setPropertyMaxValueLen: 4096 + setDeleteThrottleMillis: 0 logLevel: Warn logNetStrings: false diff --git a/services/brig/src/Brig/InternalEvent/Process.hs b/services/brig/src/Brig/InternalEvent/Process.hs index 16054a20715..3ae069ba699 100644 --- a/services/brig/src/Brig/InternalEvent/Process.hs +++ b/services/brig/src/Brig/InternalEvent/Process.hs @@ -4,7 +4,9 @@ module Brig.InternalEvent.Process import Imports import Brig.App +import Brig.Options (setDeleteThrottleMillis, defDeleteThrottleMillis) import Brig.InternalEvent.Types +import Control.Lens (view) import Control.Monad.Catch import Data.ByteString.Conversion import System.Logger.Class (field, msg, (~~), val) @@ -23,6 +25,11 @@ onEvent n = handleTimeout $ case n of Log.info $ msg (val "Processing user delete event") ~~ field "user" (toByteString uid) API.lookupAccount uid >>= mapM_ API.deleteAccount + -- As user deletions are expensive resource-wise in the context of + -- bulk user deletions (e.g. 
during team deletions), + -- wait 'delay' ms before processing the next event + delay <- fromMaybe defDeleteThrottleMillis . setDeleteThrottleMillis <$> view settings + liftIO $ threadDelay (1000 * delay) DeleteService pid sid -> do Log.info $ msg (val "Processing service delete event") ~~ field "provider" (toByteString pid) diff --git a/services/brig/src/Brig/Options.hs b/services/brig/src/Brig/Options.hs index 49937ca5541..24170d59587 100644 --- a/services/brig/src/Brig/Options.hs +++ b/services/brig/src/Brig/Options.hs @@ -312,6 +312,9 @@ data Settings = Settings , setPropertyMaxKeyLen :: !(Maybe Int64) , setPropertyMaxValueLen :: !(Maybe Int64) + , setDeleteThrottleMillis :: !(Maybe Int) -- ^ How long, in milliseconds, to wait + -- in between processing delete events + -- from the internal delete queue } deriving (Show, Generic) @@ -321,6 +324,9 @@ defMaxKeyLen = 256 defMaxValueLen :: Int64 defMaxValueLen = 512 +defDeleteThrottleMillis :: Int +defDeleteThrottleMillis = 100 + instance FromJSON Timeout where parseJSON (Y.Number n) = let defaultV = 3600 diff --git a/services/galley/galley.integration.yaml b/services/galley/galley.integration.yaml index 5e4b4924c14..236968b3ee4 100644 --- a/services/galley/galley.integration.yaml +++ b/services/galley/galley.integration.yaml @@ -26,6 +26,9 @@ settings: maxConvSize: 16 intraListing: false conversationCodeURI: https://app.wire.com/join/ + concurrentDeletionEvents: 1024 + deleteConvThrottleMillis: 0 + featureFlags: # see #RefConfigOptions in `/docs/reference` sso: disabled-by-default legalhold: disabled-by-default diff --git a/services/galley/src/Galley/API/Teams.hs b/services/galley/src/Galley/API/Teams.hs index 29aace4ca78..cded2143c59 100644 --- a/services/galley/src/Galley/API/Teams.hs +++ b/services/galley/src/Galley/API/Teams.hs @@ -61,6 +61,7 @@ import Network.Wai.Predicate hiding (setStatus, result, or) import Network.Wai.Utilities import UnliftIO (mapConcurrently) +import qualified Data.List.Extra as List import qualified Data.Set as Set import qualified Galley.Data as Data import qualified Galley.Data.LegalHold as LegalHoldData @@ -193,10 +194,9 @@ uncheckedDeleteTeam zusr zcon tid = do membs <- Data.teamMembers tid now <- liftIO getCurrentTime convs <- filter (not . view managedConversation) <$> Data.teamConversations tid - (ue, be) <- foldrM (pushConvDeleteEvents now membs) ([],[]) convs + (ue, be) <- foldrM (createConvDeleteEvents now membs) ([],[]) convs let e = newEvent TeamDelete tid now - let r = list1 (userRecipient zusr) (membersToRecipients (Just zusr) membs) - pushSome ((newPush1 zusr (TeamEvent e) r & pushConn .~ zcon) : ue) + pushDeleteEvents membs e ue void . forkIO $ void $ External.deliver be -- TODO: we don't delete bots here, but we should do that, since -- every bot user can only be in a single conversation. Just @@ -206,13 +206,32 @@ uncheckedDeleteTeam zusr zcon tid = do Journal.teamDelete tid Data.deleteTeam tid where - pushConvDeleteEvents + pushDeleteEvents :: [TeamMember] -> Event -> [Push] -> Galley () + pushDeleteEvents membs e ue = do + o <- view $ options . 
optSettings
+      let r = list1 (userRecipient zusr) (membersToRecipients (Just zusr) membs)
+      -- To avoid DoS on gundeck, send team deletion events in chunks
+      let chunkSize = fromMaybe defConcurrentDeletionEvents (o^.setConcurrentDeletionEvents)
+      let chunks = List.chunksOf chunkSize (toList r)
+      forM_ chunks $ \chunk -> case chunk of
+        [] -> return ()
+        -- push TeamDelete events
+        x:xs -> push1 (newPush1 zusr (TeamEvent e) (list1 x xs) & pushConn .~ zcon)
+
+      -- To avoid DoS on gundeck, send conversation deletion events slowly
+      let delay = 1000 * (fromMaybe defDeleteConvThrottleMillis (o^.setDeleteConvThrottleMillis))
+      forM_ ue $ \event -> do
+        -- push ConversationDelete events
+        push1 event
+        threadDelay delay
+
+    createConvDeleteEvents
       :: UTCTime
       -> [TeamMember]
       -> TeamConversation
       -> ([Push],[(BotMember, Conv.Event)])
       -> Galley ([Push],[(BotMember, Conv.Event)])
-    pushConvDeleteEvents now teamMembs c (pp, ee) = do
+    createConvDeleteEvents now teamMembs c (pp, ee) = do
       (bots, convMembs) <- botsAndUsers <$> Data.members (c ^. conversationId)
       -- Only nonTeamMembers need to get any events, since on team deletion,
       -- all team users are deleted immediately after these events are sent
       -- and will thus never be able to see these events in practice.
       let mm = nonTeamMembers convMembs teamMembs
diff --git a/services/galley/src/Galley/Options.hs b/services/galley/src/Galley/Options.hs
index 33989be8b0d..fa0739f15b1 100644
--- a/services/galley/src/Galley/Options.hs
+++ b/services/galley/src/Galley/Options.hs
@@ -12,21 +12,31 @@ import Galley.Types.Teams (FeatureFlags(..))
 
 data Settings = Settings
     { -- | Number of connections for the HTTP client pool
-      _setHttpPoolSize :: !Int
+      _setHttpPoolSize            :: !Int
       -- | Max number of members in a team. NOTE: This must be in sync with Brig
-    , _setMaxTeamSize :: !Word16
+    , _setMaxTeamSize             :: !Word16
       -- | Max number of members in a conversation. NOTE: This must be in sync with Brig
-    , _setMaxConvSize :: !Word16
+    , _setMaxConvSize             :: !Word16
       -- | Whether to call Brig for device listing
-    , _setIntraListing :: !Bool
+    , _setIntraListing            :: !Bool
       -- | URI prefix for conversations with access mode @code@
-    , _setConversationCodeURI :: !HttpsUrl
-    , _setFeatureFlags :: !FeatureFlags
+    , _setConversationCodeURI     :: !HttpsUrl
+      -- | Throttling: limits to concurrent deletion events
+    , _setConcurrentDeletionEvents :: !(Maybe Int)
+      -- | Throttling: delay between sending events upon team deletion
+    , _setDeleteConvThrottleMillis :: !(Maybe Int)
+    , _setFeatureFlags            :: !FeatureFlags
     } deriving (Show, Generic)
 
 deriveFromJSON toOptionFieldName ''Settings
 makeLenses ''Settings
 
+defConcurrentDeletionEvents :: Int
+defConcurrentDeletionEvents = 128
+
+defDeleteConvThrottleMillis :: Int
+defDeleteConvThrottleMillis = 20
+
 data JournalOpts = JournalOpts
   { _awsQueueName :: !Text  -- ^ SQS queue name to send team events
   , _awsEndpoint  :: !AWSEndpoint  -- ^ AWS endpoint

From 83c30f9fe24b2bbcd7f037f550ab4dcee5a86a23 Mon Sep 17 00:00:00 2001
From: jschaul
Date: Wed, 6 Nov 2019 14:23:53 +0100
Subject: [PATCH 24/26] Feature/gundeck limit parallel connections (#895)

Chunk nativeToken lookups and chunk native pushes to SNS, to avoid
opening an unbounded number of new connections when sending to large
numbers of recipients. 
* configurable (suggestion: 32); default to unbounded --- services/gundeck/gundeck.integration.yaml | 1 + services/gundeck/src/Gundeck/Options.hs | 4 ++++ services/gundeck/src/Gundeck/Push.hs | 25 ++++++++++++--------- services/gundeck/src/Gundeck/Push/Native.hs | 12 +++++++++- services/gundeck/test/unit/MockGundeck.hs | 1 + 5 files changed, 32 insertions(+), 11 deletions(-) diff --git a/services/gundeck/gundeck.integration.yaml b/services/gundeck/gundeck.integration.yaml index 9f6d0820a55..6ffd0d2e8bd 100644 --- a/services/gundeck/gundeck.integration.yaml +++ b/services/gundeck/gundeck.integration.yaml @@ -24,6 +24,7 @@ settings: httpPoolSize: 1024 notificationTTL: 24192200 bulkPush: true + perNativePushConcurrency: 32 maxConcurrentNativePushes: hard: 30 # more than this number of threads will not be allowed soft: 10 # more than this number of threads will be warned about diff --git a/services/gundeck/src/Gundeck/Options.hs b/services/gundeck/src/Gundeck/Options.hs index af888da2716..484fcfc3687 100644 --- a/services/gundeck/src/Gundeck/Options.hs +++ b/services/gundeck/src/Gundeck/Options.hs @@ -38,6 +38,10 @@ data Settings = Settings , _setBulkPush :: !Bool -- | Maximum number of concurrent threads calling SNS. , _setMaxConcurrentNativePushes :: !(Maybe MaxConcurrentNativePushes) + -- | Maximum number of parallel requests to SNS and cassandra + -- during native push processing (per incoming push request) + -- defaults to unbounded, if unset. + , _setPerNativePushConcurrency :: !(Maybe Int) } deriving (Show, Generic) data MaxConcurrentNativePushes = MaxConcurrentNativePushes diff --git a/services/gundeck/src/Gundeck/Push.hs b/services/gundeck/src/Gundeck/Push.hs index f3dc5e73d6b..7d4ab86a07f 100644 --- a/services/gundeck/src/Gundeck/Push.hs +++ b/services/gundeck/src/Gundeck/Push.hs @@ -107,10 +107,16 @@ instance MonadNativeTargets Gundeck where mntgtLookupAddresses rcp = Data.lookup rcp Data.One class Monad m => MonadMapAsync m where - mntgtMapAsync :: (a -> m b) -> [a] -> m [Either SomeException b] + mntgtMapAsync :: (a -> m b) -> [a] -> m [Either SomeException b] + mntgtPerPushConcurrency :: m (Maybe Int) instance MonadMapAsync Gundeck where - mntgtMapAsync = mapAsync + mntgtPerPushConcurrency = view (options . optSettings . setPerNativePushConcurrency ) + mntgtMapAsync f l = do + perPushConcurrency <- mntgtPerPushConcurrency + case perPushConcurrency of + Nothing -> mapAsync f l + Just chunkSize -> concat <$> mapM (mapAsync f) (List.chunksOf chunkSize l) -- | Abstract over all effects in 'pushAny' (for unit testing). class (MonadPushAll m, MonadNativeTargets m, MonadMapAsync m) => MonadPushAny m where @@ -189,14 +195,17 @@ pushAll pushes = do resp <- compilePushResps targets <$> mpaBulkPush (compilePushReq <$> targets) -- native push + perPushConcurrency <- mntgtPerPushConcurrency forM_ resp $ \((notif :: Notification, psh :: Push), alreadySent :: [Presence]) -> do let rcps' = nativeTargetsRecipients psh - budget = length rcps' - -- this is a rough budget, since there may be more than one device in a + cost = maybe (length rcps') (min (length rcps')) perPushConcurrency + -- ^ this is a rough budget cost, since there may be more than one device in a -- 'Presence', so one budget token may trigger at most 8 push notifications -- to be sent out. + -- If perPushConcurrency is defined, we take the min with 'perNativePushConcurrency', as native push requests + -- to cassandra and SNS are limited to 'perNativePushConcurrency' in parallel. unless (psh ^. 
pushTransient) - $ mpaRunWithBudget budget () + $ mpaRunWithBudget cost () $ mpaPushNative notif psh =<< nativeTargets psh rcps' alreadySent @@ -280,13 +289,9 @@ nativeTargetsRecipients psh = filter routeNative (toList (fromRange (psh ^. push routeNative u = u ^. recipientRoute /= RouteDirect && (u ^. recipientId /= psh ^. pushOrigin || psh ^. pushNativeIncludeOrigin) --- | TODO: 'nativeTargets' calls cassandra once for each 'Recipient' of the 'Push'. Instead, +-- | FUTUREWORK: 'nativeTargets' calls cassandra once for each 'Recipient' of the 'Push'. Instead, -- it should be called once with @[Push]@ for every call to 'pushAll', and that call should -- only call cassandra once in total, yielding all addresses of all recipients of all pushes. --- --- FUTUREWORK: we may want to turn 'mntgtMapAsync' into an ordinary `mapM`: it's cassandra --- access, so it'll be fast either way given the size of the input, and synchronous calls --- impose a much lower risk of choking when system load peaks. nativeTargets :: forall m. (MonadNativeTargets m, MonadMapAsync m) => Push -> [Recipient] -> [Presence] -> m [Address] diff --git a/services/gundeck/src/Gundeck/Push/Native.hs b/services/gundeck/src/Gundeck/Push/Native.hs index 86add930825..82655ce60f0 100644 --- a/services/gundeck/src/Gundeck/Push/Native.hs +++ b/services/gundeck/src/Gundeck/Push/Native.hs @@ -29,11 +29,21 @@ import qualified Gundeck.Aws as Aws import qualified Gundeck.Notification.Data as Stream import qualified Gundeck.Push.Data as Data import qualified System.Logger.Class as Log +import qualified Data.List.Extra as List push :: NativePush -> [Address] -> Gundeck () push _ [] = pure () push m [a] = push1 m a -push m addrs = void $ mapConcurrently (push1 m) addrs +push m addrs = do + perPushConcurrency <- view (options . optSettings . 
setPerNativePushConcurrency ) + case perPushConcurrency of + -- send all at once + Nothing -> void $ mapConcurrently (push1 m) addrs + -- avoid high amounts of fresh parallel network requests by + -- parallelizing only chunkSize native pushes at a time + Just chunkSize -> do + let chunks = List.chunksOf chunkSize addrs + mapM_ (mapConcurrently (push1 m)) chunks push1 :: NativePush -> Address -> Gundeck () push1 m a = do diff --git a/services/gundeck/test/unit/MockGundeck.hs b/services/gundeck/test/unit/MockGundeck.hs index db02cdbd663..7de66181428 100644 --- a/services/gundeck/test/unit/MockGundeck.hs +++ b/services/gundeck/test/unit/MockGundeck.hs @@ -400,6 +400,7 @@ instance MonadNativeTargets MockGundeck where mntgtLookupAddresses = mockLookupAddresses instance MonadMapAsync MockGundeck where + mntgtPerPushConcurrency = pure Nothing -- (unbounded) mntgtMapAsync f xs = Right <$$> mapM f xs -- (no concurrency) instance MonadPushAny MockGundeck where From 7bc768606601a13497a1e64fa9b26c5b5a5cb74e Mon Sep 17 00:00:00 2001 From: jschaul Date: Wed, 6 Nov 2019 14:25:10 +0100 Subject: [PATCH 25/26] stern add endpoint (#896) --- tools/stern/src/Stern/API.hs | 11 +++++++++++ tools/stern/src/Stern/Types.hs | 31 ++++++++++++++++++++++++++++++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/tools/stern/src/Stern/API.hs b/tools/stern/src/Stern/API.hs index 920bcadc0cc..010b15c0398 100644 --- a/tools/stern/src/Stern/API.hs +++ b/tools/stern/src/Stern/API.hs @@ -293,6 +293,15 @@ sitemap = do description "Team ID" Doc.response 200 "Team Information" Doc.end + get "/teams/:tid" (continue getTeamAdminInfo) $ + capture "tid" + + document "GET" "getTeamAdminInfo" $ do + summary "Gets information about a team's owners and admins only" + Doc.parameter Doc.Path "tid" Doc.bytes' $ + description "Team ID" + Doc.response 200 "Team Information about Owners and Admins" Doc.end + -- feature flags get "/teams/:tid/features/legalhold" (continue (liftM json . Intra.getLegalholdStatus)) $ @@ -533,6 +542,8 @@ deleteFromBlacklist emailOrPhone = do getTeamInfo :: TeamId -> Handler Response getTeamInfo = liftM json . Intra.getTeamInfo +getTeamAdminInfo :: TeamId -> Handler Response +getTeamAdminInfo = liftM (json . toAdminInfo) . 
Intra.getTeamInfo setLegalholdStatus :: JSON ::: TeamId ::: JsonRequest SetLegalHoldStatus -> Handler Response setLegalholdStatus (_ ::: tid ::: req) = do diff --git a/tools/stern/src/Stern/Types.hs b/tools/stern/src/Stern/Types.hs index a24ca114aae..d184175b41e 100644 --- a/tools/stern/src/Stern/Types.hs +++ b/tools/stern/src/Stern/Types.hs @@ -14,7 +14,6 @@ import Data.Aeson.TH import Data.ByteString.Conversion import Data.Json.Util import Data.Range -import Data.Text import Imports import Galley.Types.Teams import Galley.Types.Teams.Intra @@ -35,12 +34,42 @@ data TeamInfo = TeamInfo , tiMembers :: [TeamMemberInfo] } +data TeamAdminInfo = TeamAdminInfo + { taData :: TeamData + , taOwners :: [TeamMemberInfo] + , taAdmins :: [TeamMemberInfo] + , taMembers :: Int + } + +toAdminInfo :: TeamInfo -> TeamAdminInfo +toAdminInfo (TeamInfo d members) = TeamAdminInfo + { taData = d + , taMembers = length members + , taOwners = filter (\(TeamMemberInfo m) -> isOwner m) members + , taAdmins = filter (\(TeamMemberInfo m) -> isAdmin m) members + } + +-- FUTUREWORK: use the same criteria as in RoleOwner, RoleAdmin +isOwner :: TeamMember -> Bool +isOwner m = hasPermission m SetBilling + +isAdmin :: TeamMember -> Bool +isAdmin m = (hasPermission m AddTeamMember) && not (hasPermission m SetBilling) + instance ToJSON TeamInfo where toJSON (TeamInfo d m) = object [ "info" .= d , "members" .= m ] +instance ToJSON TeamAdminInfo where + toJSON (TeamAdminInfo d o a m) = object + [ "info" .= d + , "owners" .= o + , "admins" .= a + , "total_members" .= m + ] + newtype UserProperties = UserProperties { unUserProperties :: M.HashMap PropertyKey PropertyValue } deriving (Eq, Show, ToJSON) From 4b58cf99d2c53a63aa33ae4a588732890d3faeb8 Mon Sep 17 00:00:00 2001 From: jschaul Date: Wed, 6 Nov 2019 18:10:19 +0100 Subject: [PATCH 26/26] Changelog for release 2019-11-06 - New configuration options available (none mandatory). See #895 #900 #869 - Support HEAD requests for `/sso/initiate-bind` (#878) - Do not send conversation delete events to team members upon team deletion (#897) - Support SNI for bot registrations (by bumping http-client version) (#899) - Make gundeck handle AWS outages better. (#869, #890, #892) - Improve performance by avoiding unbounded intra-service traffic spikes on team deletions (#900) - Add optional native push connection throttling (#895) - New backoffice/stern endpoint (#896) - SAML: Store raw idp metadata with typed details in c* (#872) - documentation/script updates --- CHANGELOG.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 58fbbc44fcd..68c480da637 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,27 @@ +# 2019-11-06 #901 + +## Relevant for self-hosters + +- New configuration options available (none mandatory). See #895 #900 #869 + +## Relevant for client developers + +- Support HEAD requests for `/sso/initiate-bind` (#878) + +## Bug fixes + +- Do not send conversation delete events to team members upon team deletion (#897) +- Support SNI for bot registrations (by bumping http-client version) (#899) + +## Internal changes + +- Make gundeck handle AWS outages better. (#869, #890, #892) +- Improve performance by avoiding unbounded intra-service traffic spikes on team deletions (#900) +- Add optional native push connection throttling (#895) +- New backoffice/stern endpoint (#896) +- SAML: Store raw idp metadata with typed details in c* (#872) +- documentation/script updates + # 2019-09-30 #868 ## Relevant for self-hosters
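
The throttling patches in this release (#895, #900) share one small pattern: split a
large list of work items into fixed-size chunks and only run one chunk's worth of
actions concurrently at a time, as in gundeck's
`concat <$> mapM (mapAsync f) (List.chunksOf chunkSize l)` above. A minimal,
self-contained sketch of that pattern follows; it is illustrative only — the name
`chunkedConcurrently` is invented here and this is not the shipped wire-server code.
It assumes the `async` and `extra` packages (the latter provides `Data.List.Extra`,
which the patches above also import).

```haskell
import Control.Concurrent.Async (mapConcurrently)
import Data.List.Extra (chunksOf)

-- Run an action over all inputs with at most 'chunkSize' actions in
-- flight at once: each chunk runs concurrently, and the chunks run one
-- after another.  A 'Nothing' chunk size means "unbounded", mirroring
-- gundeck's optional 'perNativePushConcurrency' setting.
chunkedConcurrently :: Maybe Int -> (a -> IO b) -> [a] -> IO [b]
chunkedConcurrently Nothing          f xs = mapConcurrently f xs
chunkedConcurrently (Just chunkSize) f xs =
  concat <$> mapM (mapConcurrently f) (chunksOf chunkSize xs)
```

For example, given some hypothetical `pushToSNS :: Address -> IO ()`, calling
`chunkedConcurrently (Just 32) pushToSNS addrs` would cap parallel SNS requests at
32 per incoming push, matching the `perNativePushConcurrency: 32` suggestion in
`gundeck.integration.yaml`, while `chunkedConcurrently Nothing` keeps the old
unbounded behaviour.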