-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathmain.go
424 lines (370 loc) · 13.9 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
package main
import (
"context"
"encoding/json"
"flag"
"fmt"
"io/ioutil"
"os"
"path"
"strings"
"time"
"github.com/coreos/etcd/clientv3"
etcdregistry "github.com/flaviostutz/etcd-registry/etcd-registry"
"github.com/serialx/hashring"
"github.com/sirupsen/logrus"
)
// SourceTarget defines the structure of a prometheus source target.
// A slice of these is marshalled to JSON and written to /servers.json by
// updatePrometheusTargets.
type SourceTarget struct {
// Targets lists the target addresses of this entry (JSON key "targets").
Targets []string `json:"targets"`
// Labels holds the labels attached to these targets (JSON key "labels");
// the key is omitted from the JSON when the map is empty.
Labels map[string]string `json:"labels,omitempty"`
}
// main parses the command line flags, writes the initial Prometheus
// configuration and rules files, and then keeps the scrape targets file in
// sync with what is found in ETCD.
//
// When --registry-etcd-url is set, this instance is registered in ETCD and the
// other registered Prometheus nodes are watched; otherwise the node list only
// contains this instance. The function never returns: it loops forever
// reacting to node and scrape target updates. It panics on invalid flags and
// on initialization errors.
func main() {
	logLevel := flag.String("loglevel", "info", "debug, info, warning, error")
	etcdURLRegistry0 := flag.String("registry-etcd-url", "", "ETCD URLs. ex: http://etcd0:2379")
	etcdBase0 := flag.String("registry-etcd-base", "/registry", "ETCD base path for services")
	etcdServiceName0 := flag.String("registry-service-name", "", "Prometheus cluster service name. Ex.: proml1")
	etcdServiceTTL0 := flag.Int("registry-node-ttl", -1, "Node registration TTL in ETCD. After killing Promster instance, it will vanish from ETCD registry after this time")
	// The help text used to promise a fallback to "--etcd-url", which neither
	// exists nor is implemented: this flag is mandatory (see the check below).
	etcdURLScrape0 := flag.String("scrape-etcd-url", "", "ETCD URLs for scrape source server. ex: http://etcd0:2379")
	scrapeEtcdPath0 := flag.String("scrape-etcd-path", "", "Base ETCD path for getting servers to be scrapped")
	scrapePaths0 := flag.String("scrape-paths", "/metrics", "URI for scrape of each target. May contain a list separated by ','.")
	scrapeInterval0 := flag.String("scrape-interval", "30s", "Prometheus scrape interval")
	scrapeTimeout0 := flag.String("scrape-timeout", "30s", "Prometheus scrape timeout")
	scrapeMatch0 := flag.String("scrape-match", "", "Metrics regex filter applied on scraped targets. Commonly used in conjunction with /federate metrics endpoint")
	// The help text used to say "Defaults to true" although the default is false.
	scrapeShardingEnable0 := flag.Bool("scrape-shard-enable", false, "Enable sharding distribution among targets so that each Promster instance will scrape a different set of targets, enabling distribution of load among instances. Defaults to false.")
	evaluationInterval0 := flag.String("evaluation-interval", "30s", "Prometheus evaluation interval")
	scheme0 := flag.String("scheme", "http", "Scrape scheme, either http or https")
	tlsInsecure0 := flag.String("tls-insecure", "false", "Disable validation of the server certificate. true or false")

	flag.Parse()

	etcdURLRegistry := *etcdURLRegistry0
	etcdURLScrape := *etcdURLScrape0
	etcdBase := *etcdBase0
	etcdServiceName := *etcdServiceName0
	scrapeEtcdPath := *scrapeEtcdPath0
	etcdServiceTTL := *etcdServiceTTL0
	scrapeInterval := *scrapeInterval0
	scrapeTimeout := *scrapeTimeout0
	scrapeMatch := *scrapeMatch0
	scrapeShardingEnable := *scrapeShardingEnable0
	evaluationInterval := *evaluationInterval0
	scrapePaths := strings.Split(*scrapePaths0, ",")
	scheme := *scheme0
	tlsInsecure := *tlsInsecure0

	// The registry is optional (--registry-etcd-url may be empty), but the
	// scrape source is not. Messages name the flags exactly as registered above.
	if etcdURLScrape == "" {
		panic("--scrape-etcd-url should be defined")
	}
	if etcdURLRegistry != "" {
		if etcdBase == "" {
			panic("--registry-etcd-base should be defined")
		}
		if etcdServiceName == "" {
			panic("--registry-service-name should be defined")
		}
		if etcdServiceTTL == -1 {
			panic("--registry-node-ttl should be defined")
		}
	}
	if scrapeEtcdPath == "" {
		panic("--scrape-etcd-path should be defined")
	}

	switch *logLevel {
	case "debug":
		logrus.SetLevel(logrus.DebugLevel)
	case "warning":
		logrus.SetLevel(logrus.WarnLevel)
	case "error":
		logrus.SetLevel(logrus.ErrorLevel)
	default:
		logrus.SetLevel(logrus.InfoLevel)
	}

	logrus.Infof("====Starting Promster====")

	logrus.Debugf("Updating prometheus file...")
	time.Sleep(5 * time.Second)
	err := updatePrometheusConfig("/prometheus.yml", scrapeInterval, scrapeTimeout, evaluationInterval, scrapePaths, scrapeMatch, scheme, tlsInsecure)
	if err != nil {
		panic(err)
	}

	logrus.Debugf("Creating rules file...")
	err = createRulesFromENV("/rules.yml")
	if err != nil {
		panic(err)
	}

	nodesChan := make(chan []string)
	if etcdURLRegistry != "" {
		logrus.Debugf("Initializing Registry client. etcdURLRegistry=%s", etcdURLRegistry)
		endpointsRegistry := strings.Split(etcdURLRegistry, ",")
		registry, err := etcdregistry.NewEtcdRegistry(endpointsRegistry, etcdBase, 10*time.Second)
		if err != nil {
			panic(err)
		}
		logrus.Infof("Keeping self node registered on ETCD...")
		go keepSelfNodeRegistered(registry, etcdServiceName, time.Duration(etcdServiceTTL)*time.Second)

		logrus.Debugf("Initializing ETCD client for registry")
		cliRegistry, err := clientv3.New(clientv3.Config{Endpoints: endpointsRegistry, DialTimeout: 10 * time.Second})
		if err != nil {
			logrus.Errorf("Could not initialize ETCD client. err=%s", err)
			panic(err)
		}
		logrus.Infof("Etcd client initialized")
		servicePath := fmt.Sprintf("%s/%s/", etcdBase, etcdServiceName)

		logrus.Infof("Starting to watch registered prometheus nodes...")
		go watchRegisteredNodes(cliRegistry, servicePath, nodesChan)
	} else {
		// Without a registry the node list only holds this instance.
		go func() {
			nodesChan <- []string{getSelfNodeName()}
		}()
	}

	logrus.Debugf("Initializing ETCD client for source scrape targets")
	logrus.Infof("Starting to watch source scrape targets. etcdURLScrape=%s", etcdURLScrape)
	endpointsScrape := strings.Split(etcdURLScrape, ",")
	cliScrape, err := clientv3.New(clientv3.Config{Endpoints: endpointsScrape, DialTimeout: 10 * time.Second})
	if err != nil {
		logrus.Errorf("Could not initialize ETCD client. err=%s", err)
		panic(err)
	}
	logrus.Infof("Etcd client initialized for scrape")

	sourceTargetsChan := make(chan []SourceTarget)
	go watchSourceScrapeTargets(cliScrape, scrapeEtcdPath, sourceTargetsChan)

	promNodes := make([]string, 0)
	scrapeTargets := make([]SourceTarget, 0)

	// The periodic state dump is served by the same select that updates
	// promNodes and scrapeTargets, so both are only ever touched by this
	// goroutine. A separate logging goroutine would read them while they are
	// being reassigned below, which is a data race.
	statusTicker := time.NewTicker(5 * time.Second)

	for {
		select {
		case promNodes = <-nodesChan:
			logrus.Debugf("updated promNodes: %s", promNodes)
		case scrapeTargets = <-sourceTargetsChan:
			logrus.Debugf("updated scapeTargets: %v", scrapeTargets)
		case <-statusTicker.C:
			logrus.Debugf("Prometheus nodes found: %s", promNodes)
			logrus.Debugf("Scrape targets found: %v", scrapeTargets)
			// nothing changed: no need to rewrite the targets file
			continue
		}
		err := updatePrometheusTargets(scrapeTargets, promNodes, scrapeShardingEnable)
		if err != nil {
			logrus.Warnf("Couldn't update Prometheus scrape targets. err=%s", err)
		}
	}
}
// updatePrometheusConfig renders the "prometheus.yml.tmpl" template with the
// given scrape settings, writes the result to prometheusFile and then asks the
// Prometheus instance at localhost:9090 to reload its configuration.
//
// A template or file write failure is returned to the caller. A failed reload
// request is only logged as a warning and nil is returned.
func updatePrometheusConfig(prometheusFile string, scrapeInterval string, scrapeTimeout string, evaluationInterval string, scrapePaths []string, scrapeMatch string, scheme string, tlsInsecure string) error {
	logrus.Infof("updatePrometheusConfig. scrapeInterval=%s,scrapeTimeout=%s,evaluationInterval=%s,scrapePaths=%s,scrapeMatch=%s,scheme=%s,tlsInsecure=%s", scrapeInterval, scrapeTimeout, evaluationInterval, scrapePaths, scrapeMatch, scheme, tlsInsecure)

	// values made available to the template
	templateData := map[string]interface{}{
		"scrapeInterval":     scrapeInterval,
		"scrapeTimeout":      scrapeTimeout,
		"evaluationInterval": evaluationInterval,
		"scrapePaths":        scrapePaths,
		"scrapeMatch":        scrapeMatch,
		"scheme":             scheme,
		"tlsInsecure":        tlsInsecure,
		"prometheusServer":   getSelfNodeName(),
	}

	rendered, err := executeTemplate("/", "prometheus.yml.tmpl", templateData)
	if err != nil {
		return err
	}
	logrus.Debugf("%s: '%s'", prometheusFile, rendered)

	if err := ioutil.WriteFile(prometheusFile, []byte(rendered), 0666); err != nil {
		return err
	}

	// best effort: the reload is allowed to fail
	if _, err := ExecShell("wget --post-data='' http://localhost:9090/-/reload -O -"); err != nil {
		logrus.Warnf("Couldn't reload Prometheus config. Maybe it wasn't initialized at this time and will get the config as soon as getting started. Ignoring.")
	}
	return nil
}
// RecordingRule groups the pieces of a single Prometheus recording rule as
// read from the environment by createRulesFromENV.
type RecordingRule struct {
	// name is written as the rule's "record" entry, expr as its "expr" entry.
	name, expr string
	// labels are the optional labels written along with the rule.
	labels map[string]string
}
// getLabelMap builds a label map from a raw configuration string.
//
// rawLabels is a comma separated list of "key:value" pairs, for example
// "env:prod,team:core". Only the first ':' of a pair separates the key from
// the value, so a value may itself contain ':'. Empty entries and entries
// without any ':' are ignored. The returned map is never nil.
func getLabelMap(rawLabels string) map[string]string {
	labels := make(map[string]string)
	for _, mapping := range strings.Split(rawLabels, ",") {
		if mapping == "" {
			continue
		}
		// SplitN keeps everything after the first ':' as the value.
		keyValue := strings.SplitN(mapping, ":", 2)
		if len(keyValue) != 2 {
			// malformed entry (no ':'): skip it instead of indexing out of range
			continue
		}
		labels[keyValue[0]] = keyValue[1]
	}
	return labels
}
// getPrintableLabels renders labels as text: a "labels:" line followed by one
// "key: value" line per label, each preceded by a line break. An empty or nil
// map yields the empty string. Labels are emitted in map iteration order.
func getPrintableLabels(labels map[string]string) string {
	if len(labels) == 0 {
		return ""
	}

	printable := "\nlabels:"
	for name, value := range labels {
		printable = fmt.Sprintf("%s\n%s: %s", printable, name, value)
	}
	return printable
}
func createRulesFromENV(rulesFile string) error {
env := make(map[string]string)
for _, e := range os.Environ() {
pair := strings.Split(e, "=")
env[pair[0]] = pair[1]
}
rules := make([]RecordingRule, 0)
for i := 1; i < 100; i++ {
kname := fmt.Sprintf("RECORD_RULE_%d_NAME", i)
kexpr := fmt.Sprintf("RECORD_RULE_%d_EXPR", i)
klabels := fmt.Sprintf("RECORD_RULE_%d_LABELS", i)
vname, exists := env[kname]
if !exists {
break
}
vexpr, exists := env[kexpr]
if !exists {
break
}
rules = append(rules, RecordingRule{name: vname, expr: vexpr, labels: getLabelMap(env[klabels])})
}
if len(rules) == 0 {
logrus.Infof("No prometheus rules found in environment variables")
return nil
}
logrus.Debugf("Found %d rules: %s", len(rules), rules)
rulesContents := `groups:
- name: env-rules
rules:`
for _, v := range rules {
rc := `%s
- record: %s
expr: %s
%s
`
rulesContents = fmt.Sprintf(rc, rulesContents, v.name, v.expr, getPrintableLabels(v.labels))
}
logrus.Debugf("%s: '%v'", rulesFile, rulesContents)
err := ioutil.WriteFile(rulesFile, []byte(rulesContents), 0666)
if err != nil {
return err
}
_, err = ExecShell("wget --post-data='' http://localhost:9090/-/reload -O -")
if err != nil {
logrus.Warnf("Couldn't reload Prometheus config. Maybe it wasn't initialized at this time and will get the config as soon as getting started. Ignoring.")
}
return nil
}
// updatePrometheusTargets selects the scrape targets this instance is
// responsible for, writes them as JSON to /servers.json and asks the
// Prometheus instance at localhost:9090 to reload.
//
// With shardingEnabled false every entry of scrapeTargets is kept. Otherwise a
// consistent hash ring is built from promNodes and an entry is kept only when
// the hash of its first target address maps to this instance; entries without
// any target address are skipped with a warning.
//
// An error is returned when a target cannot be mapped to a node (for example
// when promNodes is empty while sharding is enabled), when the JSON cannot be
// produced or written, or when the reload command fails.
func updatePrometheusTargets(scrapeTargets []SourceTarget, promNodes []string, shardingEnabled bool) error {
	logrus.Debugf("updatePrometheusTargets. scrapeTargets=%v, promNodes=%s", scrapeTargets, promNodes)

	// non-nil on purpose: an empty selection is written as "[]", not "null"
	selfScrapeTargets := make([]SourceTarget, 0)

	if !shardingEnabled {
		// no ring needed: this instance scrapes everything, even before any
		// Prometheus node is known
		selfScrapeTargets = append(selfScrapeTargets, scrapeTargets...)
	} else {
		//Apply consistent hashing to determine which scrape endpoints will
		//be handled by this Prometheus instance
		ring := hashring.New(hashList(promNodes))
		selfNodeName := getSelfNodeName()
		// the same for every target, so it is computed once
		hashedSelf := stringSha512(selfNodeName)

		for _, starget := range scrapeTargets {
			if len(starget.Targets) == 0 {
				logrus.Warnf("Ignoring scrape target without addresses: %v", starget)
				continue
			}
			hashedPromNode, ok := ring.GetNode(stringSha512(starget.Targets[0]))
			if !ok {
				return fmt.Errorf("Couldn't get prometheus node for %s in consistent hash", starget.Targets[0])
			}
			logrus.Debugf("Target %v - Prometheus %x", starget, hashedPromNode)
			if hashedSelf == hashedPromNode {
				logrus.Debugf("Target %v - Prometheus %s", starget, selfNodeName)
				selfScrapeTargets = append(selfScrapeTargets, starget)
			}
		}
	}

	//generate json file
	contents, err := json.Marshal(selfScrapeTargets)
	if err != nil {
		return err
	}
	logrus.Debugf("Writing /servers.json: '%s'", string(contents))
	err = ioutil.WriteFile("/servers.json", contents, 0666)
	if err != nil {
		return err
	}

	//force Prometheus to update its configuration live
	_, err = ExecShell("wget --post-data='' http://localhost:9090/-/reload -O -")
	if err != nil {
		return err
	}
	return nil
}
// keepSelfNodeRegistered registers this instance, named by getSelfNodeName,
// under etcdServiceName in the given registry with the given TTL.
// It panics when RegisterNode returns an error.
func keepSelfNodeRegistered(reg *etcdregistry.EtcdRegistry, etcdServiceName string, ttl time.Duration) {
	self := etcdregistry.Node{}
	self.Name = getSelfNodeName()

	logrus.Debugf("Registering Prometheus instance on ETCD registry. service=%s; node=%s", etcdServiceName, self)
	if err := reg.RegisterNode(context.TODO(), etcdServiceName, self, ttl); err != nil {
		panic(err)
	}
}
// getSelfNodeName returns the name of this instance in the form "<ip>:9090".
// The ip is the "src" address printed by "ip route get 8.8.8.8", with
// surrounding whitespace removed. It panics when the shell command fails.
func getSelfNodeName() string {
	const srcAddressCmd = "ip route get 8.8.8.8 | grep -oE 'src ([0-9\\.]+)' | cut -d ' ' -f 2"

	output, err := ExecShell(srcAddressCmd)
	if err != nil {
		panic(err)
	}
	address := strings.TrimSpace(output)
	return fmt.Sprintf("%s:9090", address)
}
// watchSourceScrapeTargets publishes on sourceTargetsChan the scrape targets
// found under the ETCD prefix sourceTargetsPath: once at start and again after
// every watch event on that prefix. It never returns and is meant to run in
// its own goroutine.
//
// For every key, the last path element becomes the target address and the
// element before it becomes the value of the "prsn" label.
//
// A failed read is logged and retried after a short delay. If the watch
// channel gets closed, the watch is started again after the same delay.
func watchSourceScrapeTargets(cli *clientv3.Client, sourceTargetsPath string, sourceTargetsChan chan []SourceTarget) {
	const retryDelay = 5 * time.Second

	logrus.Debugf("Getting source scrape targets from %s", sourceTargetsPath)

	watchChan := cli.Watch(context.TODO(), sourceTargetsPath, clientv3.WithPrefix())
	for {
		logrus.Debugf("Source scrape targets updated")
		rsp, err0 := cli.Get(context.TODO(), sourceTargetsPath, clientv3.WithPrefix())
		if err0 != nil {
			// the response must not be used when Get fails; dereferencing it
			// here used to crash the process with a nil pointer panic
			logrus.Warnf("Error retrieving source scrape targets. err=%s", err0)
			time.Sleep(retryDelay)
			continue
		}

		if len(rsp.Kvs) == 0 {
			// NOTE(review): an empty result is not forwarded, so the
			// previously published targets stay in place - confirm this is intended
			logrus.Debugf("No source scrape targets were found under %s", sourceTargetsPath)
		} else {
			sourceTargets := make([]SourceTarget, 0)
			for _, kv := range rsp.Kvs {
				record := string(kv.Key)
				targetAddress := path.Base(record)
				serviceName := path.Base(path.Dir(record))
				sourceTargets = append(sourceTargets, SourceTarget{Labels: map[string]string{"prsn": serviceName}, Targets: []string{targetAddress}})
			}
			sourceTargetsChan <- sourceTargets
			logrus.Debugf("Found source scrape targets: %v", sourceTargets)
		}

		// Wait for the next change. A receive on a closed channel returns
		// immediately, which would turn this loop into a busy loop against
		// ETCD, so a closed watch is recreated after a pause.
		if _, open := <-watchChan; !open {
			logrus.Warnf("Watch on %s was closed. Restarting it", sourceTargetsPath)
			time.Sleep(retryDelay)
			watchChan = cli.Watch(context.TODO(), sourceTargetsPath, clientv3.WithPrefix())
		}
	}
}
// watchRegisteredNodes publishes on nodesChan the names of the nodes found
// under the ETCD prefix servicePath: once at start and again after every watch
// event on that prefix. The node name is the last path element of each key.
// It never returns and is meant to run in its own goroutine.
//
// A failed read is logged and retried after a short delay. If the watch
// channel gets closed, the watch is started again after the same delay.
func watchRegisteredNodes(cli *clientv3.Client, servicePath string, nodesChan chan []string) {
	const retryDelay = 5 * time.Second

	watchChan := cli.Watch(context.TODO(), servicePath, clientv3.WithPrefix())
	for {
		logrus.Debugf("Registered nodes updated")
		rsp, err0 := cli.Get(context.TODO(), servicePath, clientv3.WithPrefix())
		if err0 != nil {
			// the response must not be used when Get fails; dereferencing it
			// here used to crash the process with a nil pointer panic
			logrus.Warnf("Error retrieving service nodes. err=%s", err0)
			time.Sleep(retryDelay)
			continue
		}

		if len(rsp.Kvs) == 0 {
			// NOTE(review): an empty result is not forwarded, so the
			// previously published node list stays in place - confirm this is intended
			logrus.Debugf("No services nodes were found under %s", servicePath)
		} else {
			promNodes := make([]string, 0)
			for _, kv := range rsp.Kvs {
				promNodes = append(promNodes, path.Base(string(kv.Key)))
			}
			nodesChan <- promNodes
			logrus.Debugf("Found registered nodes %s", promNodes)
		}

		// Wait for the next change. A receive on a closed channel returns
		// immediately, which would turn this loop into a busy loop against
		// ETCD, so a closed watch is recreated after a pause.
		if _, open := <-watchChan; !open {
			logrus.Warnf("Watch on %s was closed. Restarting it", servicePath)
			time.Sleep(retryDelay)
			watchChan = cli.Watch(context.TODO(), servicePath, clientv3.WithPrefix())
		}
	}
}