@@ -10,6 +10,8 @@ import (
1010 "time"
1111
1212 "github.com/coder/websocket/wsjson"
13+ "github.com/prometheus/client_golang/prometheus"
14+ "github.com/prometheus/client_golang/prometheus/promauto"
1315
1416 "github.com/coreos/go-oidc/v3/oidc"
1517
@@ -36,6 +38,97 @@ const (
3638 CloudWebSocketPingInterval = 15 * time .Second
3739)
3840
41+ var (
42+ metricCloudConnectionStatus = promauto .NewGauge (
43+ prometheus.GaugeOpts {
44+ Name : "jetkvm_cloud_connection_status" ,
45+ Help : "The status of the cloud connection" ,
46+ },
47+ )
48+ metricCloudConnectionEstablishedTimestamp = promauto .NewGauge (
49+ prometheus.GaugeOpts {
50+ Name : "jetkvm_cloud_connection_established_timestamp" ,
51+ Help : "The timestamp when the cloud connection was established" ,
52+ },
53+ )
54+ metricCloudConnectionLastPingTimestamp = promauto .NewGauge (
55+ prometheus.GaugeOpts {
56+ Name : "jetkvm_cloud_connection_last_ping_timestamp" ,
57+ Help : "The timestamp when the last ping response was received" ,
58+ },
59+ )
60+ metricCloudConnectionLastPingDuration = promauto .NewGauge (
61+ prometheus.GaugeOpts {
62+ Name : "jetkvm_cloud_connection_last_ping_duration" ,
63+ Help : "The duration of the last ping response" ,
64+ },
65+ )
66+ metricCloudConnectionPingDuration = promauto .NewHistogram (
67+ prometheus.HistogramOpts {
68+ Name : "jetkvm_cloud_connection_ping_duration" ,
69+ Help : "The duration of the ping response" ,
70+ Buckets : []float64 {
71+ 0.1 , 0.5 , 1 , 10 ,
72+ },
73+ },
74+ )
75+ metricCloudConnectionTotalPingCount = promauto .NewCounter (
76+ prometheus.CounterOpts {
77+ Name : "jetkvm_cloud_connection_total_ping_count" ,
78+ Help : "The total number of pings sent to the cloud" ,
79+ },
80+ )
81+ metricCloudConnectionSessionRequestCount = promauto .NewCounter (
82+ prometheus.CounterOpts {
83+ Name : "jetkvm_cloud_connection_session_total_request_count" ,
84+ Help : "The total number of session requests received from the cloud" ,
85+ },
86+ )
87+ metricCloudConnectionSessionRequestDuration = promauto .NewHistogram (
88+ prometheus.HistogramOpts {
89+ Name : "jetkvm_cloud_connection_session_request_duration" ,
90+ Help : "The duration of session requests" ,
91+ Buckets : []float64 {
92+ 0.1 , 0.5 , 1 , 10 ,
93+ },
94+ },
95+ )
96+ metricCloudConnectionLastSessionRequestTimestamp = promauto .NewGauge (
97+ prometheus.GaugeOpts {
98+ Name : "jetkvm_cloud_connection_last_session_request_timestamp" ,
99+ Help : "The timestamp of the last session request" ,
100+ },
101+ )
102+ metricCloudConnectionLastSessionRequestDuration = promauto .NewGauge (
103+ prometheus.GaugeOpts {
104+ Name : "jetkvm_cloud_connection_last_session_request_duration" ,
105+ Help : "The duration of the last session request" ,
106+ },
107+ )
108+ metricCloudConnectionFailureCount = promauto .NewCounter (
109+ prometheus.CounterOpts {
110+ Name : "jetkvm_cloud_connection_failure_count" ,
111+ Help : "The number of times the cloud connection has failed" ,
112+ },
113+ )
114+ )
115+
116+ func cloudResetMetrics (established bool ) {
117+ metricCloudConnectionLastPingTimestamp .Set (- 1 )
118+ metricCloudConnectionLastPingDuration .Set (- 1 )
119+
120+ metricCloudConnectionLastSessionRequestTimestamp .Set (- 1 )
121+ metricCloudConnectionLastSessionRequestDuration .Set (- 1 )
122+
123+ if established {
124+ metricCloudConnectionEstablishedTimestamp .SetToCurrentTime ()
125+ metricCloudConnectionStatus .Set (1 )
126+ } else {
127+ metricCloudConnectionEstablishedTimestamp .Set (- 1 )
128+ metricCloudConnectionStatus .Set (- 1 )
129+ }
130+ }
131+
39132func handleCloudRegister (c * gin.Context ) {
40133 var req CloudRegisterRequest
41134
@@ -130,15 +223,18 @@ func runWebsocketClient() error {
130223 if err != nil {
131224 return fmt .Errorf ("failed to parse config.CloudURL: %w" , err )
132225 }
226+
133227 if wsURL .Scheme == "http" {
134228 wsURL .Scheme = "ws"
135229 } else {
136230 wsURL .Scheme = "wss"
137231 }
232+
138233 header := http.Header {}
139234 header .Set ("X-Device-ID" , GetDeviceID ())
140235 header .Set ("Authorization" , "Bearer " + config .CloudToken )
141236 dialCtx , cancelDial := context .WithTimeout (context .Background (), CloudWebSocketConnectTimeout )
237+
142238 defer cancelDial ()
143239 c , _ , err := websocket .Dial (dialCtx , wsURL .String (), & websocket.DialOptions {
144240 HTTPHeader : header ,
@@ -148,17 +244,35 @@ func runWebsocketClient() error {
148244 }
149245 defer c .CloseNow () //nolint:errcheck
150246 cloudLogger .Infof ("websocket connected to %s" , wsURL )
247+
248+ // set the metrics when we successfully connect to the cloud.
249+ cloudResetMetrics (true )
250+
151251 runCtx , cancelRun := context .WithCancel (context .Background ())
152252 defer cancelRun ()
153253 go func () {
154254 for {
155255 time .Sleep (CloudWebSocketPingInterval )
256+
257+ // set the timer for the ping duration
258+ timer := prometheus .NewTimer (prometheus .ObserverFunc (func (v float64 ) {
259+ metricCloudConnectionLastPingDuration .Set (v )
260+ metricCloudConnectionPingDuration .Observe (v )
261+ }))
262+
156263 err := c .Ping (runCtx )
264+
157265 if err != nil {
158266 cloudLogger .Warnf ("websocket ping error: %v" , err )
159267 cancelRun ()
160268 return
161269 }
270+
271+ // dont use `defer` here because we want to observe the duration of the ping
272+ timer .ObserveDuration ()
273+
274+ metricCloudConnectionTotalPingCount .Inc ()
275+ metricCloudConnectionLastPingTimestamp .SetToCurrentTime ()
162276 }
163277 }()
164278 for {
@@ -180,6 +294,8 @@ func runWebsocketClient() error {
180294 cloudLogger .Infof ("new session request: %v" , req .OidcGoogle )
181295 cloudLogger .Tracef ("session request info: %v" , req )
182296
297+ metricCloudConnectionSessionRequestCount .Inc ()
298+ metricCloudConnectionLastSessionRequestTimestamp .SetToCurrentTime ()
183299 err = handleSessionRequest (runCtx , c , req )
184300 if err != nil {
185301 cloudLogger .Infof ("error starting new session: %v" , err )
@@ -189,6 +305,12 @@ func runWebsocketClient() error {
189305}
190306
191307func handleSessionRequest (ctx context.Context , c * websocket.Conn , req WebRTCSessionRequest ) error {
308+ timer := prometheus .NewTimer (prometheus .ObserverFunc (func (v float64 ) {
309+ metricCloudConnectionLastSessionRequestDuration .Set (v )
310+ metricCloudConnectionSessionRequestDuration .Observe (v )
311+ }))
312+ defer timer .ObserveDuration ()
313+
192314 oidcCtx , cancelOIDC := context .WithTimeout (ctx , CloudOidcRequestTimeout )
193315 defer cancelOIDC ()
194316 provider , err := oidc .NewProvider (oidcCtx , "https://accounts.google.com" )
@@ -249,6 +371,9 @@ func handleSessionRequest(ctx context.Context, c *websocket.Conn, req WebRTCSess
249371
250372func RunWebsocketClient () {
251373 for {
374+ // reset the metrics when we start the websocket client.
375+ cloudResetMetrics (false )
376+
252377 // If the cloud token is not set, we don't need to run the websocket client.
253378 if config .CloudToken == "" {
254379 time .Sleep (5 * time .Second )
@@ -272,6 +397,8 @@ func RunWebsocketClient() {
272397 err := runWebsocketClient ()
273398 if err != nil {
274399 cloudLogger .Errorf ("websocket client error: %v" , err )
400+ metricCloudConnectionStatus .Set (0 )
401+ metricCloudConnectionFailureCount .Inc ()
275402 time .Sleep (5 * time .Second )
276403 }
277404 }
0 commit comments