Skip to content

Commit

Permalink
Add basic monitoring with Prometheus/Grafana
Browse files Browse the repository at this point in the history
C3SR-385
  • Loading branch information
youngest committed Jun 2, 2022
1 parent 08049d5 commit a421936
Show file tree
Hide file tree
Showing 11 changed files with 6,178 additions and 9 deletions.
3 changes: 2 additions & 1 deletion .env
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@ MQ_USER=admin
MQ_PASSWORD=password
MQ_HOST=mq
MQ_PORT=5672
TRACER_ADDRESS=trace.local.mlmodelscope.org
MQ_ERLANG_COOKIE=quadruple-chocolate-chunk
TRACER_ADDRESS=trace.local.mlmodelscope.org
51 changes: 49 additions & 2 deletions docker-compose.override.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ services:
labels:
- "traefik.enable=true"
- "traefik.docker.network=traefik"
- "traefik.http.routers.mlmodelscope-api.rule=Host(`api.local.mlmodelscope.org`)"
- "traefik.http.routers.mlmodelscope-api.rule=Host(`api.${ENVIRONMENT}mlmodelscope.org`)"
- "traefik.http.routers.mlmodelscope-api.entrypoints=web"
- "traefik.http.services.mlmodelscope-api.loadbalancer.server.port=8080"
- "traefik.http.middlewares.add-cors.headers.accessControlAllowMethods=GET,OPTIONS,POST,PUT,DELETE"
Expand Down Expand Up @@ -47,7 +47,7 @@ services:
labels:
- "traefik.enable=true"
- "traefik.docker.network=traefik"
- "traefik.http.routers.mlmodelscope-companion.rule=Host(`companion.local.mlmodelscope.org`)"
- "traefik.http.routers.mlmodelscope-companion.rule=Host(`companion.${ENVIRONMENT}mlmodelscope.org`)"
- "traefik.http.routers.mlmodelscope-companion.entrypoints=web"
- "traefik.http.services.mlmodelscope-companion.loadbalancer.server.port=3020"
- "traefik.http.middlewares.add-cors-companion.headers.accessControlAllowMethods=GET,OPTIONS,POST"
Expand Down Expand Up @@ -75,6 +75,21 @@ services:
volumes:
- ./docker/data:/docker-entrypoint-initdb.d/

# RabbitMQ (local override): publishes the broker ports on the host and
# bind-mounts the broker configuration and definitions read-only.
mq:
  volumes:
    - "./docker/rabbitmq/rabbitmq.conf:/etc/rabbitmq/rabbitmq.conf:ro"
    - "./docker/rabbitmq/definitions.json:/etc/rabbitmq/definitions.json:ro"

  ports:
    - "5672:5672"    # same port as MQ_PORT in .env
    - "15672:15672"  # port the swarm file points traefik's mlmodelscope-mq service at
    - "15692:15692"  # presumably the RabbitMQ Prometheus metrics port - confirm

# Consul (local override): agent state goes to the consul_data named volume;
# server.json is bind-mounted read-only into the agent's config directory.
consul:
  volumes:
    - "consul_data:/consul/data"
    - "./docker/consul/server.json:/consul/config/server.json:ro"

traefik:
image: traefik:v2.5
command:
Expand All @@ -96,7 +111,39 @@ services:
- ./docker/mlmodelscope-www.yml:/root/mlmodelscope-www.yml
- /var/run/docker.sock:/var/run/docker.sock:ro

### Monitoring services

# Grafana (local override). Traefik routes
# monitoring.<ENVIRONMENT>mlmodelscope.org on the "web" entrypoint to
# Grafana's port 3000.
# NOTE(review): the rule concatenates ${ENVIRONMENT} directly with
# "mlmodelscope.org", so ENVIRONMENT presumably carries its own trailing dot
# (e.g. "local.") - confirm.
grafana:
  labels:
    - "traefik.enable=true"
    - "traefik.docker.network=traefik"
    - "traefik.http.routers.grafana.rule=Host(`monitoring.${ENVIRONMENT}mlmodelscope.org`)"
    - "traefik.http.routers.grafana.entrypoints=web"
    - "traefik.http.services.grafana.loadbalancer.server.port=3000"

  networks:
    - default
    - traefik

  volumes:
    # Grafana's own state lives in a named volume and must stay writable.
    - grafana_data:/var/lib/grafana
    # Provisioning inputs are mounted read-only, matching the :ro config
    # mounts used for rabbitmq and consul in this file.
    - ./docker/grafana/dashboards.yml:/etc/grafana/provisioning/dashboards/rabbitmq.yaml:ro
    - ./docker/grafana/datasources.yml:/etc/grafana/provisioning/datasources/prometheus.yaml:ro
    - ./docker/grafana/dashboards:/dashboards:ro

# Prometheus (local override): port 9090 is published on the host and
# /prometheus is backed by the prometheus_data named volume.
prometheus:
  ports:
    - "9090:9090"
  volumes:
    - prometheus_data:/prometheus
    # The scrape configuration is mounted read-only, matching the :ro config
    # mounts used for rabbitmq and consul in this file.
    - ./docker/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro

# Network joined by the services that carry traefik labels. It is given the
# fixed name "traefik" and declared attachable.
networks:
  traefik:
    attachable: true
    name: traefik

# Named volumes used by the consul, prometheus and grafana services.
# No options are set, so each one uses the default volume settings.
volumes:
  consul_data: {}
  prometheus_data: {}
  grafana_data: {}
64 changes: 63 additions & 1 deletion docker-compose.swarm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,68 @@ services:
- "traefik.http.routers.mlmodelscope-mq.entrypoints=websecure"
- "traefik.http.routers.mlmodelscope-mq.tls.certresolver=letsencrypt"
- "traefik.http.services.mlmodelscope-mq.loadbalancer.server.port=15672"
configs:
- source: rabbitmq_config
target: /etc/rabbitmq/rabbitmq.conf
- source: rabbitmq_definitions
target: /etc/rabbitmq/definitions.json
environment:
RABBITMQ_NODENAME: "$RABBITMQ_NODENAME"
volumes:
- staging-rabbit-efs:/var/lib/rabbitmq/mnesia

# Consul (swarm): server.json is delivered through the external swarm config
# "consul_config"; agent state goes to the consul_data volume.
consul:
  volumes:
    - "consul_data:/consul/data"
  configs:
    - source: consul_config
      target: /consul/config/server.json

### Monitoring services

# Grafana (swarm): provisioning files come from external swarm configs, and
# traefik routes monitoring.<ENVIRONMENT>mlmodelscope.org on the "websecure"
# entrypoint with the "letsencrypt" certificate resolver.
# NOTE(review): these labels are declared directly on the service; Traefik's
# swarm-mode provider normally reads labels from `deploy.labels` - confirm
# they are picked up in this deployment.
# NOTE(review): /dashboards is backed by the grafana_dashboards named volume
# here, not a bind mount - confirm how the dashboard files get into it.
grafana:
  networks:
    - default
    - traefik

  volumes:
    - "grafana_data:/var/lib/grafana"
    - "grafana_dashboards:/dashboards"

  configs:
    - source: grafana_dashboards_config
      target: /etc/grafana/provisioning/dashboards/rabbitmq.yaml
    - source: grafana_prometheus_datasource
      target: /etc/grafana/provisioning/datasources/prometheus.yaml

  labels:
    - "traefik.enable=true"
    - "traefik.docker.network=traefik"
    - "traefik.http.routers.grafana.rule=Host(`monitoring.${ENVIRONMENT}mlmodelscope.org`)"
    - "traefik.http.routers.grafana.entrypoints=websecure"
    - "traefik.http.routers.grafana.tls.certresolver=letsencrypt"
    - "traefik.http.services.grafana.loadbalancer.server.port=3000"

# Prometheus (swarm): prometheus.yml is delivered through the external swarm
# config "prometheus_config"; /prometheus is backed by the prometheus_data
# volume.
prometheus:
  volumes:
    - "prometheus_data:/prometheus"
  configs:
    - source: prometheus_config
      target: /etc/prometheus/prometheus.yml

# Swarm configs referenced by the consul, grafana, prometheus and mq services.
# All are declared external, i.e. they are not created by this file.
configs:
  consul_config: { external: true }
  grafana_dashboards_config: { external: true }
  grafana_prometheus_datasource: { external: true }
  prometheus_config: { external: true }
  rabbitmq_config: { external: true }
  rabbitmq_definitions: { external: true }

networks:
traefik:
external: true
Expand All @@ -77,4 +134,9 @@ networks:
volumes:
staging-rabbit-efs:
external: true
staging-companion-data:
staging-companion-data:
consul_data:
grafana_dashboards:
grafana_data:
prometheus_data:

47 changes: 42 additions & 5 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,46 @@ services:

mq:
image: rabbitmq:3-management-alpine
depends_on:
- consul
environment:
RABBITMQ_DEFAULT_USER: "$MQ_USER"
RABBITMQ_DEFAULT_PASS: "$MQ_PASSWORD"
ports:
- "15672:15672"
- "5672:5672"
RABBITMQ_ERLANG_COOKIE: "$MQ_ERLANG_COOKIE"

# Consul agent started with "-bootstrap=1". Its config file and data volume
# are attached by the override and swarm compose files.
consul:
  command: ["agent", "-bootstrap=1"]
  image: "hashicorp/consul:1.12"

### Monitoring services

# Grafana base definition: pinned image plus the plugins to install, passed
# through GF_INSTALL_PLUGINS as a comma-separated list.
grafana:
  environment:
    GF_INSTALL_PLUGINS: 'flant-statusmap-panel,grafana-piechart-panel'
  image: 'grafana/grafana:8.5.3'

# Prometheus base definition: only the pinned image. Ports, config and
# storage are added by the override and swarm compose files.
prometheus:
  image: 'prom/prometheus:v2.35.0'

# Prometheus node-exporter. The host's /proc, /sys and / are bind-mounted
# read-only, and the --path.* flags point the exporter at those mount points.
node-exporter:
  image: prom/node-exporter:v1.3.1
  command:
    - '--path.procfs=/host/proc'
    - '--path.rootfs=/rootfs'
    - '--path.sysfs=/host/sys'
    # node_exporter 1.3.0 renamed --collector.filesystem.ignored-mount-points
    # to --collector.filesystem.mount-points-exclude; the old name is
    # deprecated. "$$" is Compose's escape for a literal "$" in the regex.
    - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
  expose:
    - "9100"
  volumes:
    - /proc:/host/proc:ro
    - /sys:/host/sys:ro
    - /:/rootfs:ro

# cAdvisor: exposes port 8080 to other containers (not published on the
# host) and reads host state through the bind mounts below.
cadvisor:
  image: "gcr.io/cadvisor/cadvisor:v0.44.0"
  expose:
    - "8080"
  volumes:
    - "/:/rootfs:ro"
    # NOTE(review): this is the only read-write mount in the block - confirm
    # that write access to /var/run is actually required.
    - "/var/run:/var/run:rw"
    - "/sys:/sys:ro"
    - "/var/lib/docker/:/var/lib/docker:ro"
    # - /cgroup:/cgroup:ro  # doesn't work on MacOS, only for Linux
8 changes: 8 additions & 0 deletions docker/consul/server.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
  "node_name": "consul-server",
  "server": true,
  "data_dir": "/consul/data",
  "addresses": { "http": "0.0.0.0" }
}
10 changes: 10 additions & 0 deletions docker/grafana/dashboards.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Grafana dashboard provisioning: a single file-based provider named
# "rabbitmq" that loads dashboards from /dashboards (the path the compose
# files mount into the Grafana container). disableDeletion is enabled.
apiVersion: 1

providers:
  # Every field below belongs to this one provider entry; `path` is nested
  # under `options`.
  - name: 'rabbitmq'
    orgId: 1
    folder: ''
    type: file
    disableDeletion: true
    options:
      path: /dashboards
Loading

0 comments on commit a421936

Please sign in to comment.