From 866ad444af263f70ce0043166ad79ccf63a06e1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Filipe=20Caba=C3=A7o?= Date: Thu, 21 Nov 2024 22:38:13 +0000 Subject: [PATCH] fix: Metrics cleanup based on syn module Adds a new worker that will cleanup metrics based on our syn information to reduce the amount of data pulled by Prometheus. --- README.md | 72 ++++++++++++++++----------------- config/runtime.exs | 2 + lib/realtime/application.ex | 3 +- lib/realtime/metrics_cleaner.ex | 59 +++++++++++++++++++++++++++ lib/realtime/tenants/connect.ex | 6 +++ mix.exs | 2 +- 6 files changed, 106 insertions(+), 38 deletions(-) create mode 100644 lib/realtime/metrics_cleaner.ex diff --git a/README.md b/README.md index 3ffd4c829..c3c3700a6 100644 --- a/README.md +++ b/README.md @@ -120,42 +120,42 @@ If you're using the default tenant, the URL is `ws://realtime-dev.localhost:4000 **Environment Variables** -| Variable | Type | Description | -| --------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| PORT | number | Port which you can connect your client/listeners | -| DB_HOST | string | Database host URL | -| DB_PORT | number | Database port | -| DB_USER | string | Database user | -| DB_PASSWORD | string | Database password | -| DB_NAME | string | Postgres database name | -| DB_ENC_KEY | string | Key used to encrypt sensitive fields in \_realtime.tenants and \_realtime.extensions tables. Recommended: 16 characters. | -| DB_AFTER_CONNECT_QUERY | string | Query that is run after server connects to database. | -| API_JWT_SECRET | string | Secret that is used to sign tokens used to manage tenants and their extensions via HTTP requests. 
| -| SECRET_KEY_BASE | string | Secret used by the server to sign cookies. Recommended: 64 characters. | -| ERL_AFLAGS | string | Set to either "-proto_dist inet_tcp" or "-proto_dist inet6_tcp" depending on whether or not your network uses IPv4 or IPv6, respectively. | -| APP_NAME | string | A name of the server. | -| DNS_NODES | string | Node name used when running server in a cluster. | -| MAX_CONNECTIONS | string | Set the soft maximum for WebSocket connections. Defaults to '16384'. | -| MAX_HEADER_LENGTH | string | Set the maximum header length for connections (in bytes). Defaults to '4096'. | -| NUM_ACCEPTORS | string | Set the number of server processes that will relay incoming WebSocket connection requests. Defaults to '100'. | -| DB_QUEUE_TARGET | string | Maximum time to wait for a connection from the pool. Defaults to '5000' or 5 seconds. See for more info: [DBConnection](https://hexdocs.pm/db_connection/DBConnection.html#start_link/2-queue-config). | -| DB_QUEUE_INTERVAL | string | Interval to wait to check if all connections were checked out under DB_QUEUE_TARGET. If all connections surpassed the target during this interval than the target is doubled. Defaults to '5000' or 5 seconds. See for more info: [DBConnection](https://hexdocs.pm/db_connection/DBConnection.html#start_link/2-queue-config). | -| DB_POOL_SIZE | string | Sets the number of connections in the database pool. Defaults to '5'. | -| SLOT_NAME_SUFFIX | string | This is appended to the replication slot which allows making a custom slot name. May contain lowercase letters, numbers, and the underscore character. Together with the default `supabase_realtime_replication_slot`, slot name should be up to 64 characters long. | -| TENANT_MAX_BYTES_PER_SECOND | string | The default value of maximum bytes per second that each tenant can support, used when creating a tenant for the first time. Defaults to '100_000'. 
| -| TENANT_MAX_CHANNELS_PER_CLIENT | string | The default value of maximum number of channels each tenant can support, used when creating a tenant for the first time. Defaults to '100'. | -| TENANT_MAX_CONCURRENT_USERS | string | The default value of maximum concurrent users per channel that each tenant can support, used when creating a tenant for the first time. Defaults to '200'. | -| TENANT_MAX_EVENTS_PER_SECOND | string | The default value of maximum events per second that each tenant can support, used when creating a tenant for the first time. Defaults to '100'. | -| TENANT_MAX_JOINS_PER_SECOND | string | The default value of maximum channel joins per second that each tenant can support, used when creating a tenant for the first time. Defaults to '100'. | -| SEED_SELF_HOST | boolean | Seeds the system with default tenant | -| RUN_JANITOR | boolean | Do you want to janitor tasks to run | -| JANITOR_SCHEDULE_TIMER_IN_MS | number | Time in ms to run the janitor task | -| JANITOR_SCHEDULE_RANDOMIZE | boolean | Adds a randomized value of minutes to the timer | -| JANITOR_RUN_AFTER_IN_MS | number | Tells system when to start janitor tasks after boot | -| JANITOR_CLEANUP_MAX_CHILDREN | number | Maximum number of concurrent tasks working on janitor cleanup | -| JANITOR_CLEANUP_CHILDREN_TIMEOUT | number | Timeout for each async task for janitor cleanup | -| JANITOR_CHUNK_SIZE | number | Number of tenants to process per chunk. 
Each chunk will be processed by a Task | - +| Variable | Type | Description | +| ----------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| PORT | number | Port which you can connect your client/listeners | +| DB_HOST | string | Database host URL | +| DB_PORT | number | Database port | +| DB_USER | string | Database user | +| DB_PASSWORD | string | Database password | +| DB_NAME | string | Postgres database name | +| DB_ENC_KEY | string | Key used to encrypt sensitive fields in \_realtime.tenants and \_realtime.extensions tables. Recommended: 16 characters. | +| DB_AFTER_CONNECT_QUERY | string | Query that is run after server connects to database. | +| API_JWT_SECRET | string | Secret that is used to sign tokens used to manage tenants and their extensions via HTTP requests. | +| SECRET_KEY_BASE | string | Secret used by the server to sign cookies. Recommended: 64 characters. | +| ERL_AFLAGS | string | Set to either "-proto_dist inet_tcp" or "-proto_dist inet6_tcp" depending on whether or not your network uses IPv4 or IPv6, respectively. | +| APP_NAME | string | A name of the server. | +| DNS_NODES | string | Node name used when running server in a cluster. | +| MAX_CONNECTIONS | string | Set the soft maximum for WebSocket connections. Defaults to '16384'. | +| MAX_HEADER_LENGTH | string | Set the maximum header length for connections (in bytes). Defaults to '4096'. | +| NUM_ACCEPTORS | string | Set the number of server processes that will relay incoming WebSocket connection requests. Defaults to '100'. | +| DB_QUEUE_TARGET | string | Maximum time to wait for a connection from the pool. Defaults to '5000' or 5 seconds. 
See for more info: [DBConnection](https://hexdocs.pm/db_connection/DBConnection.html#start_link/2-queue-config). | +| DB_QUEUE_INTERVAL | string | Interval to wait to check if all connections were checked out under DB_QUEUE_TARGET. If all connections surpassed the target during this interval then the target is doubled. Defaults to '5000' or 5 seconds. See for more info: [DBConnection](https://hexdocs.pm/db_connection/DBConnection.html#start_link/2-queue-config). | +| DB_POOL_SIZE | string | Sets the number of connections in the database pool. Defaults to '5'. | +| SLOT_NAME_SUFFIX | string | This is appended to the replication slot which allows making a custom slot name. May contain lowercase letters, numbers, and the underscore character. Together with the default `supabase_realtime_replication_slot`, slot name should be up to 64 characters long. | +| TENANT_MAX_BYTES_PER_SECOND | string | The default value of maximum bytes per second that each tenant can support, used when creating a tenant for the first time. Defaults to '100_000'. | +| TENANT_MAX_CHANNELS_PER_CLIENT | string | The default value of maximum number of channels each tenant can support, used when creating a tenant for the first time. Defaults to '100'. | +| TENANT_MAX_CONCURRENT_USERS | string | The default value of maximum concurrent users per channel that each tenant can support, used when creating a tenant for the first time. Defaults to '200'. | +| TENANT_MAX_EVENTS_PER_SECOND | string | The default value of maximum events per second that each tenant can support, used when creating a tenant for the first time. Defaults to '100'. | +| TENANT_MAX_JOINS_PER_SECOND | string | The default value of maximum channel joins per second that each tenant can support, used when creating a tenant for the first time. Defaults to '100'. 
|
+| SEED_SELF_HOST | boolean | Seeds the system with default tenant |
+| RUN_JANITOR | boolean | Do you want to janitor tasks to run |
+| JANITOR_SCHEDULE_TIMER_IN_MS | number | Time in ms to run the janitor task |
+| JANITOR_SCHEDULE_RANDOMIZE | boolean | Adds a randomized value of minutes to the timer |
+| JANITOR_RUN_AFTER_IN_MS | number | Tells system when to start janitor tasks after boot |
+| JANITOR_CLEANUP_MAX_CHILDREN | number | Maximum number of concurrent tasks working on janitor cleanup |
+| JANITOR_CLEANUP_CHILDREN_TIMEOUT | number | Timeout for each async task for janitor cleanup |
+| JANITOR_CHUNK_SIZE | number | Number of tenants to process per chunk. Each chunk will be processed by a Task |
+| METRICS_CLEANER_SCHEDULE_TIMER_IN_MS | number | Time in ms to run the Metric Cleaner task |
 ## WebSocket URL
 
 The WebSocket URL is in the following format for local development: `ws://[external_id].localhost:4000/socket/websocket`
diff --git a/config/runtime.exs b/config/runtime.exs
index 210132a27..7bab46148 100644
--- a/config/runtime.exs
+++ b/config/runtime.exs
@@ -22,6 +22,8 @@ config :realtime,
     System.get_env("TENANT_MAX_EVENTS_PER_SECOND", "100") |> String.to_integer(),
   tenant_max_joins_per_second:
     System.get_env("TENANT_MAX_JOINS_PER_SECOND", "100") |> String.to_integer(),
+  metrics_cleaner_schedule_timer_in_ms:
+    System.get_env("METRICS_CLEANER_SCHEDULE_TIMER_IN_MS", "1800000") |> String.to_integer(),
   rpc_timeout: System.get_env("RPC_TIMEOUT", "30000") |> String.to_integer()
 
 run_janitor? = System.get_env("RUN_JANITOR", "false") == "true"
diff --git a/lib/realtime/application.ex b/lib/realtime/application.ex
index 2b893850a..0fe603c6d 100644
--- a/lib/realtime/application.ex
+++ b/lib/realtime/application.ex
@@ -84,7 +84,8 @@ defmodule Realtime.Application do
          strategy: :one_for_one,
          name: Realtime.BroadcastChanges.Handler.DynamicSupervisor},
         RealtimeWeb.Endpoint,
-        RealtimeWeb.Presence
+        RealtimeWeb.Presence,
+        Realtime.MetricsCleaner
       ] ++ extensions_supervisors() ++ janitor_tasks()
 
     children =
diff --git a/lib/realtime/metrics_cleaner.ex b/lib/realtime/metrics_cleaner.ex
new file mode 100644
index 000000000..eec0a1d78
--- /dev/null
+++ b/lib/realtime/metrics_cleaner.ex
@@ -0,0 +1,73 @@
+defmodule Realtime.MetricsCleaner do
+  @moduledoc false
+
+  use GenServer
+  require Logger
+
+  defstruct [:check_ref, :interval]
+
+  # Log a warning when a single cleanup pass takes longer than this (in ms).
+  @slow_check_threshold_ms :timer.seconds(5)
+
+  @table_name :"syn_registry_by_name_Elixir.Realtime.Tenants.Connect"
+  @metrics_table Realtime.PromEx.Metrics
+  @filter_spec [{{{:_, %{tenant: :"$1"}}, :_}, [], [:"$1"]}]
+  @tenant_id_spec [{{:"$1", :_, :_, :_, :_, :_}, [], [:"$1"]}]
+
+  def start_link(args),
+    do: GenServer.start_link(__MODULE__, args, name: __MODULE__)
+
+  @impl true
+  def init(_args) do
+    interval = Application.fetch_env!(:realtime, :metrics_cleaner_schedule_timer_in_ms)
+
+    Logger.info("Starting MetricsCleaner")
+    {:ok, %__MODULE__{check_ref: check(interval), interval: interval}}
+  end
+
+  @impl true
+  def handle_info(:check, %{interval: interval} = state) do
+    Process.cancel_timer(state.check_ref)
+
+    # :timer.tc/1 reports microseconds; convert to milliseconds so the value
+    # matches both the threshold and the unit printed in the log message.
+    {exec_time_us, _} = :timer.tc(fn -> loop_and_cleanup_metrics_table() end)
+    exec_time_ms = System.convert_time_unit(exec_time_us, :microsecond, :millisecond)
+
+    if exec_time_ms > @slow_check_threshold_ms,
+      do: Logger.warning("Metrics check took: #{exec_time_ms} ms")
+
+    {:noreply, %{state | check_ref: check(interval)}}
+  end
+
+  def handle_info(msg, state) do
+    Logger.error("Unexpected message: #{inspect(msg)}")
+    {:noreply, state}
+  end
+
+  defp check(interval) do
+    Process.send_after(self(), :check, interval)
+  end
+
+  # Deletes the metrics of every tenant found in the metrics table that is
+  # absent from the syn registry table for Realtime.Tenants.Connect.
+  defp loop_and_cleanup_metrics_table do
+    registered_tenants = @table_name |> :ets.select(@tenant_id_spec) |> MapSet.new()
+
+    @metrics_table
+    |> :ets.select(@filter_spec)
+    |> Enum.uniq()
+    |> Enum.reject(&MapSet.member?(registered_tenants, &1))
+    |> Enum.each(&delete_metric/1)
+  end
+
+  @doc """
+  Deletes all metrics that contain the given tenant.
+
+  Returns the number of entries removed from the metrics table.
+  """
+  @spec delete_metric(String.t()) :: non_neg_integer()
+  def delete_metric(tenant) do
+    :ets.select_delete(@metrics_table, [{{{:_, %{tenant: tenant}}, :_}, [], [true]}])
+  end
+end
diff --git a/lib/realtime/tenants/connect.ex b/lib/realtime/tenants/connect.ex
index c4aaa23c4..01e9748ce 100644
--- a/lib/realtime/tenants/connect.ex
+++ b/lib/realtime/tenants/connect.ex
@@ -248,6 +248,12 @@ defmodule Realtime.Tenants.Connect do
     {:stop, :kill, state}
   end
 
+  @impl true
+  def terminate(_, %{tenant_id: tenant_id}) do
+    Realtime.MetricsCleaner.delete_metric(tenant_id)
+    :ok
+  end
+
   ## Private functions
 
   defp call_external_node(tenant_id, opts) do
diff --git a/mix.exs b/mix.exs
index 997445cf8..e95bfd37a 100644
--- a/mix.exs
+++ b/mix.exs
@@ -4,7 +4,7 @@ defmodule Realtime.MixProject do
   def project do
     [
       app: :realtime,
-      version: "2.33.55",
+      version: "2.33.56",
       elixir: "~> 1.17.3",
       elixirc_paths: elixirc_paths(Mix.env()),
       start_permanent: Mix.env() == :prod,