Commit 3c16800

Merge pull request #1215 from GeorgianaElena/grafana-datasources
Add a script that can add all clusters as datasources for central grafana
GeorgianaElena authored Apr 20, 2022
2 parents 78e4916 + b13d590 commit 3c16800
Showing 3 changed files with 299 additions and 9 deletions.
256 changes: 256 additions & 0 deletions deployer/update_central_grafana_datasources.py
@@ -0,0 +1,256 @@
"""
### Summary
Ensures that the central Grafana at https://grafana.pilot.2i2c.cloud has a datasource configured for the authenticated prometheus instance of each cluster that we run.
### How to use
This is meant to be run as a script from the command line, like so:
$ python deployer/update_central_grafana_datasources.py
"""

import argparse
import json

import requests
from file_acquisition import find_absolute_path_to_cluster_file, get_decrypted_file
from helm_upgrade_decision import get_all_cluster_yaml_files
from ruamel.yaml import YAML
from utils import print_colour

yaml = YAML(typ="safe")


def build_datasource_details(cluster_name):
    """Builds the payload needed to create an authenticated datasource in Grafana for `cluster_name`.

    Args:
        cluster_name: name of the cluster

    Returns:
        dict object: request payload to be consumed by Grafana
    """
    # Get the prometheus address for cluster_name
    datasource_url = get_cluster_prometheus_address(cluster_name)

    # Get the credentials of this prometheus instance
    prometheus_creds = get_cluster_prometheus_creds(cluster_name)

    datasource_details = {
        "name": cluster_name,
        "type": "prometheus",
        "access": "proxy",
        "url": f"https://{datasource_url}",
        "basicAuth": True,
        "basicAuthUser": prometheus_creds["username"],
        "secureJsonData": {"basicAuthPassword": prometheus_creds["password"]},
    }

    return datasource_details
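
# For illustration only: for a hypothetical cluster named "leap", the payload
# built by build_datasource_details would look roughly like this (the url and
# credentials below are invented placeholders, not real values):
#
#     {
#         "name": "leap",
#         "type": "prometheus",
#         "access": "proxy",
#         "url": "https://prometheus.leap.example.org",
#         "basicAuth": True,
#         "basicAuthUser": "<username from enc-support.secret.values.yaml>",
#         "secureJsonData": {"basicAuthPassword": "<password>"},
#     }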


def get_central_grafana_url(central_cluster_name):
    cluster_config_dir_path = find_absolute_path_to_cluster_file(
        central_cluster_name
    ).parent

    config_file = cluster_config_dir_path.joinpath("support.values.yaml")
    with open(config_file) as f:
        support_config = yaml.load(f)

    grafana_tls_config = (
        support_config.get("grafana", {}).get("ingress", {}).get("tls", [])
    )

    if not grafana_tls_config:
        raise ValueError(
            f"No tls config was found for the Grafana instance of {central_cluster_name}. Please consider enabling it before using it as the central Grafana."
        )

    # We only have one tls host right now. Modify this when things change.
    return grafana_tls_config[0]["hosts"][0]
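
# The lookup above expects the central cluster's support.values.yaml to define
# the Grafana ingress hosts, roughly like this sketch (the host is made up):
#
#     grafana:
#       ingress:
#         tls:
#           - hosts:
#               - grafana.<central-cluster>.example.org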


def get_cluster_prometheus_address(cluster_name):
    """Retrieves the address of the prometheus instance running on the `cluster_name` cluster.

    This address is stored in the `support.values.yaml` file of each cluster config directory.

    Args:
        cluster_name: name of the cluster

    Returns:
        string object: https address of the prometheus instance

    Raises ValueError if:
        - `prometheusIngressAuthSecret` isn't configured
        - `support["prometheus"]["server"]["ingress"]["tls"]` doesn't exist
    """
    cluster_config_dir_path = find_absolute_path_to_cluster_file(cluster_name).parent

    config_file = cluster_config_dir_path.joinpath("support.values.yaml")
    with open(config_file) as f:
        support_config = yaml.load(f)

    # Don't return the address if the prometheus instance wasn't securely exposed to the outside.
    if not support_config.get("prometheusIngressAuthSecret", {}).get("enabled", False):
        raise ValueError(
            f"`prometheusIngressAuthSecret` wasn't configured for {cluster_name}"
        )

    tls_config = (
        support_config.get("prometheus", {})
        .get("server", {})
        .get("ingress", {})
        .get("tls", [])
    )

    if not tls_config:
        raise ValueError(
            f"No tls config was found for the prometheus instance of {cluster_name}"
        )

    # We only have one tls host right now. Modify this when things change.
    return tls_config[0]["hosts"][0]
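
# The parsing above assumes a support.values.yaml shaped roughly like the
# following sketch (illustrative values only):
#
#     prometheusIngressAuthSecret:
#       enabled: true
#     prometheus:
#       server:
#         ingress:
#           tls:
#             - hosts:
#                 - prometheus.<cluster>.example.org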


def get_cluster_prometheus_creds(cluster_name):
    """Retrieves the credentials of the prometheus instance running on the `cluster_name` cluster.

    These credentials are stored in the `enc-support.secret.values.yaml` file of each cluster config directory.

    Args:
        cluster_name: name of the cluster

    Returns:
        dict object: {username: `username`, password: `password`}
    """
    cluster_config_dir_path = find_absolute_path_to_cluster_file(cluster_name).parent

    config_filename = cluster_config_dir_path.joinpath("enc-support.secret.values.yaml")

    with get_decrypted_file(config_filename) as decrypted_path:
        with open(decrypted_path) as f:
            prometheus_config = yaml.load(f)

    return prometheus_config.get("prometheusIngressAuthSecret", {})
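
# The decrypted enc-support.secret.values.yaml is expected to contain a
# mapping like this sketch (placeholder values, not real credentials):
#
#     prometheusIngressAuthSecret:
#       username: <basic-auth-username>
#       password: <basic-auth-password>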


def get_central_grafana_token(cluster_name):
    """Returns the access token of the Grafana located in the `cluster_name` cluster.

    This access token should have enough permissions to create datasources.
    """
    # Get the location of the file that stores the central grafana token
    cluster_config_dir_path = find_absolute_path_to_cluster_file(cluster_name).parent

    grafana_token_file = cluster_config_dir_path.joinpath(
        "enc-grafana-token.secret.yaml"
    )

    # Read the secret grafana token file
    with get_decrypted_file(grafana_token_file) as decrypted_file_path:
        with open(decrypted_file_path) as f:
            config = yaml.load(f)

    return config["grafana_token"]


def build_request_headers(cluster_name):
    token = get_central_grafana_token(cluster_name)

    headers = {
        "Accept": "application/json",
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
    }

    return headers


def get_clusters_used_as_datasources(cluster_name, datasource_endpoint):
    """Returns a list of cluster names that have prometheus instances already defined as datasources of the centralized Grafana."""
    headers = build_request_headers(cluster_name)

    # Get a list of all the currently existing datasources
    response = requests.get(datasource_endpoint, headers=headers)

    if response.status_code != 200:
        print(
            f"An error occurred when retrieving the datasources from {datasource_endpoint}.\nError was {response.text}."
        )
        response.raise_for_status()

    datasources = response.json()
    return [datasource["name"] for datasource in datasources]
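
# Grafana's `GET /api/datasources` endpoint returns a JSON list of datasource
# objects; only the "name" field of each is used here. A trimmed, illustrative
# response might look like:
#
#     [{"id": 1, "name": "2i2c", "type": "prometheus", ...}, ...]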


def main():
    argparser = argparse.ArgumentParser(
        description="""A command line tool to update Grafana
        datasources.
        """
    )

    argparser.add_argument(
        "cluster_name",
        type=str,
        nargs="?",
        help="The name of the cluster where the Grafana lives",
        default="2i2c",
    )

    args = argparser.parse_args()
    cluster = args.cluster_name
    grafana_host = get_central_grafana_url(cluster)
    datasource_endpoint = f"https://{grafana_host}/api/datasources"

    # Get a list of the clusters that already have their prometheus instances used as datasources
    datasources = get_clusters_used_as_datasources(cluster, datasource_endpoint)

    # Get a list of filepaths to all cluster.yaml files in the repo
    cluster_files = get_all_cluster_yaml_files()

    print("Searching for clusters that aren't Grafana datasources...")
    # Count how many clusters we can't add as datasources for logging
    exceptions = 0
    for cluster_file in cluster_files:
        # Read in the cluster.yaml file
        with open(cluster_file) as f:
            cluster_config = yaml.load(f)

        # Get the cluster's name
        cluster_name = cluster_config.get("name", "")
        if cluster_name and cluster_name not in datasources:
            print(f"Found {cluster_name} cluster. Checking if it can be added...")
            # Build the datasource details for the instances that aren't configured as datasources
            try:
                datasource_details = build_datasource_details(cluster_name)
                req_body = json.dumps(datasource_details)
                print(req_body)

                # Tell Grafana to create and register a datasource for this cluster
                headers = build_request_headers(cluster)
                response = requests.post(
                    datasource_endpoint, data=req_body, headers=headers
                )
                if response.status_code != 200:
                    print(
                        f"An error occurred when creating the datasource.\nError was {response.text}."
                    )
                    response.raise_for_status()
                print_colour(
                    f"Successfully created a new datasource for {cluster_name}!"
                )
            except Exception as e:
                print_colour(
                    f"An error occurred for {cluster_name}.\nError was: {e}.\nSkipping...",
                    "yellow",
                )
                exceptions += 1

    if exceptions:
        print_colour(
            f"Failed to add {exceptions} clusters as datasources. See errors above!",
            "red",
        )
    print_colour(
        f"Successfully retrieved {len(datasources)} existing datasources! {datasources}"
    )


if __name__ == "__main__":
main()
28 changes: 20 additions & 8 deletions deployer/utils.py
@@ -2,7 +2,7 @@
 import subprocess
 
 
-def print_colour(msg: str):
+def print_colour(msg: str, colour="green"):
     """Print messages in colour to be distinguishable in CI logs
 
     See the mybinder.org deploy.py script for more details:
@@ -11,12 +11,24 @@ def print_colour(msg: str):
     Args:
         msg (str): The message to print in colour
     """
-    if os.environ.get("TERM"):
-        BOLD = subprocess.check_output(["tput", "bold"]).decode()
-        GREEN = subprocess.check_output(["tput", "setaf", "2"]).decode()
-        NC = subprocess.check_output(["tput", "sgr0"]).decode()
-    else:
+    if not os.environ.get("TERM"):
         # no term, no colors
-        BOLD = GREEN = NC = ""
+        print(msg)
+
+        return
 
-    print(BOLD + GREEN + msg + NC, flush=True)
+    BOLD = subprocess.check_output(["tput", "bold"]).decode()
+    YELLOW = subprocess.check_output(["tput", "setaf", "3"]).decode()
+    GREEN = subprocess.check_output(["tput", "setaf", "2"]).decode()
+    RED = subprocess.check_output(["tput", "setaf", "1"]).decode()
+    NC = subprocess.check_output(["tput", "sgr0"]).decode()
+
+    if colour == "green":
+        print(BOLD + GREEN + msg + NC, flush=True)
+    elif colour == "red":
+        print(BOLD + RED + msg + NC, flush=True)
+    elif colour == "yellow":
+        print(BOLD + YELLOW + msg + NC, flush=True)
+    else:
+        # colour not recognized, no colors
+        print(msg)
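
For reference, here is a minimal sketch of how the updated `print_colour` is exercised by the new deployer script (messages shortened; the call sites are taken from the diff above):

```python
from utils import print_colour

print_colour("Successfully created a new datasource!")           # green by default
print_colour("An error occurred ... Skipping...", "yellow")      # warning, cluster skipped
print_colour("Failed to add 2 clusters as datasources.", "red")  # failure summary
```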
24 changes: 23 additions & 1 deletion docs/howto/operate/grafana.md
@@ -2,6 +2,7 @@
 # Grafana Dashboards
 
 Each 2i2c Hub is set up with [a Prometheus server](https://prometheus.io/) to generate metrics and information about activity on the hub, and each cluster of hubs has a [Grafana deployment](https://grafana.com/) to ingest and visualize this data.
+
 This section describes how to use these dashboards for a cluster.
 
 ## Access Hub Grafana Dashboards
@@ -13,6 +14,14 @@ To access the Grafana dashboards you'll need a **username** and **password**.
 These can be accessed using `sops` (see {ref}`tc:secrets:sops` for how to set up `sops` on your machine).
 See [](grafana:log-in) for how to find the credentials information.
 
+## The Central Grafana
+
+The Grafana deployment in the `2i2c` cluster ingests data from all the 2i2c clusters and will soon be able to be used as "the central Grafana".
+
+```{note}
+TODO: should add more info once this is ready to use.
+```
+
 (grafana:new-grafana)=
 ## Set up Grafana Dashboards for a cluster
 
@@ -118,14 +127,27 @@ IPv4 address), or `CNAME` records if using AWS (where external IP is a domain na
 **Wait a while for the DNS to propagate!**
 
 (grafana:log-in)=
-### Log in to the Grafana dashboard
+### Log in to the cluster-specific Grafana dashboard
 
 Eventually, visiting `GRAFANA_URL` will present you with a login page.
 Here are the credentials for logging in:
 
 - **username**: `admin`
 - **password**: located in `helm-charts/support/enc-support.secret.values.yaml` (`sops` encrypted).
 
+### Register the cluster's Prometheus Server with the central Grafana
+
+Once you have deployed the support chart, you must also register this cluster as a datasource for the central Grafana dashboard. This will allow you to visualize cluster statistics not only from the cluster-specific Grafana deployment but also from the central dashboard, which aggregates data from all the clusters.
+
+Run the `update_central_grafana_datasources.py` script in the deployer to let the central Grafana know about this new prometheus server:
+
+```
+$ python3 deployer/update_central_grafana_datasources.py <grafana-cluster-name>
+```
+
+Where:
+- `<grafana-cluster-name>` is the name of the cluster where the central Grafana lives. Right now, this defaults to "2i2c".
+
 ### Setting up Grafana Dashboards
 
 Once you have logged into grafana as the admin user, create a new API key.