Commit 3c16800

Merge pull request #1215 from GeorgianaElena/grafana-datasources
Add a script that can add all clusters as datasources for central grafana
GeorgianaElena authored Apr 20, 2022
2 parents 78e4916 + b13d590 commit 3c16800
Showing 3 changed files with 299 additions and 9 deletions.
256 changes: 256 additions & 0 deletions deployer/update_central_grafana_datasources.py
@@ -0,0 +1,256 @@
"""
### Summary
Ensures that the central Grafana at https://grafana.pilot.2i2c.cloud has a datasource configured for the authenticated prometheus instance of each cluster that we run.
### How to use
This is meant to be run as a script from the command line, like so:
$ python deployer/update_central_grafana_datasources.py
"""

import argparse
import json

import requests
from file_acquisition import find_absolute_path_to_cluster_file, get_decrypted_file
from helm_upgrade_decision import get_all_cluster_yaml_files
from ruamel.yaml import YAML
from utils import print_colour

yaml = YAML(typ="safe")


def build_datasource_details(cluster_name):
    """Builds the payload needed to create an authenticated datasource in Grafana for `cluster_name`.

    Args:
        cluster_name: name of the cluster

    Returns:
        dict object: request payload to be consumed by Grafana
    """
    # Get the prometheus address for cluster_name
    datasource_url = get_cluster_prometheus_address(cluster_name)

    # Get the credentials of this prometheus instance
    prometheus_creds = get_cluster_prometheus_creds(cluster_name)

    datasource_details = {
        "name": cluster_name,
        "type": "prometheus",
        "access": "proxy",
        "url": f"https://{datasource_url}",
        "basicAuth": True,
        "basicAuthUser": prometheus_creds["username"],
        "secureJsonData": {"basicAuthPassword": prometheus_creds["password"]},
    }

    return datasource_details
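
# For illustration only: for a hypothetical cluster named "leap", the payload
# built by build_datasource_details would look roughly like this (the url and
# credentials below are invented placeholders, not real values):
#
#     {
#         "name": "leap",
#         "type": "prometheus",
#         "access": "proxy",
#         "url": "https://prometheus.leap.example.org",
#         "basicAuth": True,
#         "basicAuthUser": "<username from enc-support.secret.values.yaml>",
#         "secureJsonData": {"basicAuthPassword": "<password>"},
#     }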


def get_central_grafana_url(central_cluster_name):
    cluster_config_dir_path = find_absolute_path_to_cluster_file(
        central_cluster_name
    ).parent

    config_file = cluster_config_dir_path.joinpath("support.values.yaml")
    with open(config_file) as f:
        support_config = yaml.load(f)

    grafana_tls_config = (
        support_config.get("grafana", {}).get("ingress", {}).get("tls", [])
    )

    if not grafana_tls_config:
        raise ValueError(
            f"No tls config was found for the Grafana instance of {central_cluster_name}. Please consider enabling it before using it as the central Grafana."
        )

    # We only have one tls host right now. Modify this when things change.
    return grafana_tls_config[0]["hosts"][0]
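
# The lookup above expects the central cluster's support.values.yaml to define
# the Grafana ingress hosts, roughly like this sketch (the host is made up):
#
#     grafana:
#       ingress:
#         tls:
#           - hosts:
#               - grafana.<central-cluster>.example.org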


def get_cluster_prometheus_address(cluster_name):
    """Retrieves the address of the prometheus instance running on the `cluster_name` cluster.

    This address is stored in the `support.values.yaml` file of each cluster config directory.

    Args:
        cluster_name: name of the cluster

    Returns:
        string object: https address of the prometheus instance

    Raises ValueError if:
        - `prometheusIngressAuthSecret` isn't configured
        - `support["prometheus"]["server"]["ingress"]["tls"]` doesn't exist
    """
    cluster_config_dir_path = find_absolute_path_to_cluster_file(cluster_name).parent

    config_file = cluster_config_dir_path.joinpath("support.values.yaml")
    with open(config_file) as f:
        support_config = yaml.load(f)

    # Don't return the address if the prometheus instance wasn't securely exposed to the outside.
    if not support_config.get("prometheusIngressAuthSecret", {}).get("enabled", False):
        raise ValueError(
            f"`prometheusIngressAuthSecret` wasn't configured for {cluster_name}"
        )

    tls_config = (
        support_config.get("prometheus", {})
        .get("server", {})
        .get("ingress", {})
        .get("tls", [])
    )

    if not tls_config:
        raise ValueError(
            f"No tls config was found for the prometheus instance of {cluster_name}"
        )

    # We only have one tls host right now. Modify this when things change.
    return tls_config[0]["hosts"][0]
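
# The parsing above assumes a support.values.yaml shaped roughly like the
# following sketch (illustrative values only):
#
#     prometheusIngressAuthSecret:
#       enabled: true
#     prometheus:
#       server:
#         ingress:
#           tls:
#             - hosts:
#                 - prometheus.<cluster>.example.org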


def get_cluster_prometheus_creds(cluster_name):
    """Retrieves the credentials of the prometheus instance running on the `cluster_name` cluster.

    These credentials are stored in the `enc-support.secret.values.yaml` file of each cluster config directory.

    Args:
        cluster_name: name of the cluster

    Returns:
        dict object: {username: `username`, password: `password`}
    """
    cluster_config_dir_path = find_absolute_path_to_cluster_file(cluster_name).parent

    config_filename = cluster_config_dir_path.joinpath("enc-support.secret.values.yaml")

    with get_decrypted_file(config_filename) as decrypted_path:
        with open(decrypted_path) as f:
            prometheus_config = yaml.load(f)

    return prometheus_config.get("prometheusIngressAuthSecret", {})
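
# The decrypted enc-support.secret.values.yaml is expected to contain a
# mapping like this sketch (placeholder values, not real credentials):
#
#     prometheusIngressAuthSecret:
#       username: <basic-auth-username>
#       password: <basic-auth-password>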


def get_central_grafana_token(cluster_name):
    """Returns the access token of the Grafana located in the `cluster_name` cluster.

    This access token should have enough permissions to create datasources.
    """
    # Get the location of the file that stores the central grafana token
    cluster_config_dir_path = find_absolute_path_to_cluster_file(cluster_name).parent

    grafana_token_file = cluster_config_dir_path.joinpath(
        "enc-grafana-token.secret.yaml"
    )

    # Read the secret grafana token file
    with get_decrypted_file(grafana_token_file) as decrypted_file_path:
        with open(decrypted_file_path) as f:
            config = yaml.load(f)

    return config["grafana_token"]


def build_request_headers(cluster_name):
    token = get_central_grafana_token(cluster_name)

    headers = {
        "Accept": "application/json",
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
    }

    return headers


def get_clusters_used_as_datasources(cluster_name, datasource_endpoint):
    """Returns a list of cluster names that have prometheus instances already defined as datasources of the centralized Grafana."""
    headers = build_request_headers(cluster_name)

    # Get a list of all the currently existing datasources
    response = requests.get(datasource_endpoint, headers=headers)

    if response.status_code != 200:
        print(
            f"An error occurred when retrieving the datasources from {datasource_endpoint}.\nError was {response.text}."
        )
        response.raise_for_status()

    datasources = response.json()
    return [datasource["name"] for datasource in datasources]
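
# Grafana's `GET /api/datasources` endpoint returns a JSON list of datasource
# objects; only the "name" field of each is used here. A trimmed, illustrative
# response might look like:
#
#     [{"id": 1, "name": "2i2c", "type": "prometheus", ...}, ...]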


def main():
    argparser = argparse.ArgumentParser(
        description="""A command line tool to update Grafana
        datasources.
        """
    )

    argparser.add_argument(
        "cluster_name",
        type=str,
        nargs="?",
        help="The name of the cluster where the Grafana lives",
        default="2i2c",
    )

    args = argparser.parse_args()
    cluster = args.cluster_name
    grafana_host = get_central_grafana_url(cluster)
    datasource_endpoint = f"https://{grafana_host}/api/datasources"

    # Get a list of the clusters that already have their prometheus instances used as datasources
    datasources = get_clusters_used_as_datasources(cluster, datasource_endpoint)

    # Get a list of filepaths to all cluster.yaml files in the repo
    cluster_files = get_all_cluster_yaml_files()

    print("Searching for clusters that aren't Grafana datasources...")
    # Count how many clusters we can't add as datasources for logging
    exceptions = 0
    for cluster_file in cluster_files:
        # Read in the cluster.yaml file
        with open(cluster_file) as f:
            cluster_config = yaml.load(f)

        # Get the cluster's name
        cluster_name = cluster_config.get("name", "")
        if cluster_name and cluster_name not in datasources:
            print(f"Found {cluster_name} cluster. Checking if it can be added...")
            # Build the datasource details for the instances that aren't configured as datasources
            try:
                datasource_details = build_datasource_details(cluster_name)
                req_body = json.dumps(datasource_details)
                print(req_body)

                # Tell Grafana to create and register a datasource for this cluster
                headers = build_request_headers(cluster)
                response = requests.post(
                    datasource_endpoint, data=req_body, headers=headers
                )
                if response.status_code != 200:
                    print(
                        f"An error occurred when creating the datasource.\nError was {response.text}."
                    )
                    response.raise_for_status()
                print_colour(
                    f"Successfully created a new datasource for {cluster_name}!"
                )
            except Exception as e:
                print_colour(
                    f"An error occurred for {cluster_name}.\nError was: {e}.\nSkipping...",
                    "yellow",
                )
                exceptions += 1

    if exceptions:
        print_colour(
            f"Failed to add {exceptions} clusters as datasources. See errors above!",
            "red",
        )
    print_colour(
        f"Successfully retrieved {len(datasources)} existing datasources! {datasources}"
    )


if __name__ == "__main__":
main()
28 changes: 20 additions & 8 deletions deployer/utils.py
@@ -2,7 +2,7 @@
 import subprocess
 
 
-def print_colour(msg: str):
+def print_colour(msg: str, colour="green"):
     """Print messages in colour to be distinguishable in CI logs
 
     See the mybinder.org deploy.py script for more details:
@@ -11,12 +11,24 @@ def print_colour(msg: str):
     Args:
         msg (str): The message to print in colour
     """
-    if os.environ.get("TERM"):
-        BOLD = subprocess.check_output(["tput", "bold"]).decode()
-        GREEN = subprocess.check_output(["tput", "setaf", "2"]).decode()
-        NC = subprocess.check_output(["tput", "sgr0"]).decode()
-    else:
+    if not os.environ.get("TERM"):
         # no term, no colors
-        BOLD = GREEN = NC = ""
+        print(msg)
+
+        return
 
-    print(BOLD + GREEN + msg + NC, flush=True)
+    BOLD = subprocess.check_output(["tput", "bold"]).decode()
+    YELLOW = subprocess.check_output(["tput", "setaf", "3"]).decode()
+    GREEN = subprocess.check_output(["tput", "setaf", "2"]).decode()
+    RED = subprocess.check_output(["tput", "setaf", "1"]).decode()
+    NC = subprocess.check_output(["tput", "sgr0"]).decode()
+
+    if colour == "green":
+        print(BOLD + GREEN + msg + NC, flush=True)
+    elif colour == "red":
+        print(BOLD + RED + msg + NC, flush=True)
+    elif colour == "yellow":
+        print(BOLD + YELLOW + msg + NC, flush=True)
+    else:
+        # colour not recognized, no colors
+        print(msg)
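
For reference, here is a minimal sketch of how the updated `print_colour` is exercised by the new deployer script (messages shortened; the call sites are taken from the diff above):

```python
from utils import print_colour

print_colour("Successfully created a new datasource!")           # green by default
print_colour("An error occurred ... Skipping...", "yellow")      # warning, cluster skipped
print_colour("Failed to add 2 clusters as datasources.", "red")  # failure summary
```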
24 changes: 23 additions & 1 deletion docs/howto/operate/grafana.md
@@ -2,6 +2,7 @@
 # Grafana Dashboards
 
 Each 2i2c Hub is set up with [a Prometheus server](https://prometheus.io/) to generate metrics and information about activity on the hub, and each cluster of hubs has a [Grafana deployment](https://grafana.com/) to ingest and visualize this data.
+
 This section describes how to use these dashboards for a cluster.
 
 ## Access Hub Grafana Dashboards
@@ -13,6 +14,14 @@ To access the Grafana dashboards you'll need a **username** and **password**.
 These can be accessed using `sops` (see {ref}`tc:secrets:sops` for how to set up `sops` on your machine).
 See [](grafana:log-in) for how to find the credentials information.
 
+## The Central Grafana
+
+The Grafana deployment in the `2i2c` cluster ingests data from all the 2i2c clusters and will soon be able to be used as "the central Grafana".
+
+```{note}
+TODO: should add more info once this is ready to use.
+```
+
 (grafana:new-grafana)=
 ## Set up Grafana Dashboards for a cluster
 
@@ -118,14 +127,27 @@ IPv4 address), or `CNAME` records if using AWS (where external IP is a domain na
 **Wait a while for the DNS to propagate!**
 
 (grafana:log-in)=
-### Log in to the Grafana dashboard
+### Log in to the cluster-specific Grafana dashboard
 
 Eventually, visiting `GRAFANA_URL` will present you with a login page.
 Here are the credentials for logging in:
 
 - **username**: `admin`
 - **password**: located in `helm-charts/support/enc-support.secret.values.yaml` (`sops` encrypted).
 
+### Register the cluster's Prometheus Server with the central Grafana
+
+Once you have deployed the support chart, you must also register this cluster as a datasource for the central Grafana dashboard. This will allow you to visualize cluster statistics not only from the cluster-specific Grafana deployment but also from the central dashboard, which aggregates data from all the clusters.
+
+Run the `update_central_grafana_datasources.py` script in the deployer to let the central Grafana know about this new prometheus server:
+
+```
+$ python3 deployer/update_central_grafana_datasources.py <grafana-cluster-name>
+```
+
+Where:
+- `<grafana-cluster-name>` is the name of the cluster where the central Grafana lives. Right now, this defaults to "2i2c".
+
 ### Setting up Grafana Dashboards
 
 Once you have logged into grafana as the admin user, create a new API key.