Make pre-commit happy #138

Merged: 2 commits, Mar 16, 2022

24 changes: 6 additions & 18 deletions .pre-commit-config.yaml
@@ -1,11 +1,11 @@
 ---
 repos:
-  - repo: git://github.com/Lucas-C/pre-commit-hooks
+  - repo: https://github.com/Lucas-C/pre-commit-hooks
     rev: v1.1.10
     hooks:
       - id: remove-tabs

-  - repo: git://github.com/pre-commit/pre-commit-hooks
+  - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v4.0.1
     hooks:
       - id: trailing-whitespace
@@ -21,36 +21,24 @@ repos:
       - id: check-ast
       - id: debug-statements

-  - repo: git://github.com/pycqa/pydocstyle.git
+  - repo: https://github.com/pycqa/pydocstyle.git
     rev: 6.1.1
     hooks:
       - id: pydocstyle

   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.0.1
+    rev: v4.1.0
     hooks:
       - id: check-toml
       - id: check-yaml
       - id: end-of-file-fixer
       - id: trailing-whitespace

-  - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v0.902
-    hooks:
-      - id: mypy
-        exclude: '^(docs|tasks|tests)|setup\.py'
-        args: [--ignore-missing-imports]
-
   - repo: https://github.com/psf/black
-    rev: 21.6b0
+    rev: 22.1.0
     hooks:
       - id: black

-  - repo: https://github.com/tomcatling/black-nb
-    rev: '0.5.0'
-    hooks:
-      - id: black-nb
-
   # Enable this in repositories with python packages.
   # - repo: https://github.com/mgedmin/check-manifest
   #   rev: '0.39'
@@ -63,4 +51,4 @@ repos:
       - id: flake8-nb
         additional_dependencies: ['pep8-naming']
         # Ignore all format-related checks as Black takes care of those.
-        args: ['--ignore', 'E2,W5', '--select', 'E,W,F,N', '--max-line-length=120']
+        args: ['--ignore', 'E2,W5', '--select', 'E,W,F,N', '--max-line-length=130']

2 changes: 1 addition & 1 deletion README.md
@@ -10,7 +10,7 @@ The following demos provide examples of how to use the tools available with [Ope
 * [Run SQL queries from a Jupyter Notebook environment](notebooks/demo1/demo1-join-tables.ipynb)
 * [Demo 1 Elyra Pipeline](https://github.com/os-climate/aicoe-osc-demo/blob/master/notebooks/demo1/demo1.pipeline)
 * [Results visualized on a Superset Dashboard](https://superset-secure-odh-superset.apps.odh-cl1.apps.os-climate.org/superset/dashboard/3/)
-* [Video on creating Elyra Pipelines and Superset Dashboard](https://youtu.be/TFgsR7UlcHA)
+* [Video on creating Elyra Pipelines and Superset Dashboard](https://youtu.be/TFgsR7UlcHA)


 ## [Demo 2 - Automated Inference Pipeline and Dashboarding](notebooks/demo2/README.md)

4 changes: 2 additions & 2 deletions _config.yml
@@ -1,4 +1,4 @@
-title: Open Data Hub and Operate First for OS-Climate
+title: Open Data Hub and Operate First for OS-Climate
 author: AIOps
 # logo: logo.png
 execute:
@@ -33,4 +33,4 @@ launch_buttons:
 sphinx:
   config:
     html_js_files:
-      - https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js
+      - https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js

2 changes: 1 addition & 1 deletion _toc.yml
@@ -1,7 +1,7 @@
 format: jb-book
 root: README
 parts:
-  - caption: Demo 1 - ETL & Dashboarding
+  - caption: Demo 1 - ETL & Dashboarding
     chapters:
       - file: notebooks/demo1/README
       - file: notebooks/demo1/demo1-create-tables

2 changes: 1 addition & 1 deletion data/superset/demo1.json
@@ -10051,4 +10051,4 @@
 }
 }
 ]
-}
+}

2 changes: 1 addition & 1 deletion data/superset/demo2.json
@@ -358,4 +358,4 @@
 }
 }
 ]
-}
+}

14 changes: 7 additions & 7 deletions docs/conf.py
@@ -40,7 +40,7 @@
 master_doc = "index"

 # General information about the project.
-project = u"project-template"
+project = "project-template"

 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
@@ -183,8 +183,8 @@
     (
         "index",
         "project-template.tex",
-        u"project-template Documentation",
-        u"aicoe-aiops",
+        "project-template Documentation",
+        "aicoe-aiops",
         "manual",
     ),
 ]
@@ -218,8 +218,8 @@
     (
         "index",
         "project-template",
-        u"project-template Documentation",
-        [u"aicoe-aiops"],
+        "project-template Documentation",
+        ["aicoe-aiops"],
         1,
     )
 ]
@@ -237,8 +237,8 @@
     (
         "index",
         "project-template",
-        u"project-template Documentation",
-        u"aicoe-aiops",
+        "project-template Documentation",
+        "aicoe-aiops",
         "project-template",
         "template for the team to use",
         "Miscellaneous",

1 change: 1 addition & 0 deletions notebooks/demo1/demo1-create-tables.ipynb
@@ -216,6 +216,7 @@
 "    pz = list(zip(df.columns.to_list(), stypes))\n",
 "    return \",\\n\".join([\"    {n} {t}\".format(n=e[0], t=e[1]) for e in pz])\n",
 "\n",
+"\n",
 "# Convert GHG values with string representation of numbers to float\n",
 "def str_w_spaces_to_numeric(df, col):\n",
 "    df[col] = df[col].str.replace(' ','').str.replace(',','')\n",
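
The str_w_spaces_to_numeric helper above is truncated by the diff view. Judging from its comment, it presumably finishes by casting the cleaned strings to a numeric dtype. A minimal sketch of the complete helper under that assumption (the pd.to_numeric cast is an assumption, not the repo's verified code):

import pandas as pd

# Convert GHG values stored as strings with separators (spaces or
# commas) into floats. Assumption: the original helper ends with a
# pandas to_numeric cast after stripping the separators.
def str_w_spaces_to_numeric(df, col):
    df[col] = df[col].str.replace(" ", "").str.replace(",", "")
    df[col] = pd.to_numeric(df[col], errors="coerce")
    return df

# Example: "1 234" -> 1234.0, "56,789" -> 56789.0, "n/a" -> NaN
df = pd.DataFrame({"ghg": ["1 234", "56,789", "n/a"]})
print(str_w_spaces_to_numeric(df, "ghg")["ghg"].tolist())
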
13 changes: 6 additions & 7 deletions notebooks/demo2/README.md
@@ -28,9 +28,9 @@ You can also use AICoE-CI to enable other Thoth services such as pre-commit chec
 ![Spawn JupyterHub](../../docs/assets/demo1-spawn-jupyter.png)


-## Data Preprocessing
+## Data Preprocessing

-Now let’s look at how we process raw data and prepare it for model training.
+Now let’s look at how we process raw data and prepare it for model training.
 The source code for preprocessing is available in the `src` directory preinstalled in the JupyterHub image. This directory follows the project structure laid out in the [aicoe-aiops project template](https://github.com/aicoe-aiops/project-template).

 * Extraction
@@ -41,20 +41,20 @@ The source code for preprocessing is available in the `src` directory preinstall

 * Curation

-    * In the text and table curation notebook, we will load the json files (one per pdf) and the corresponding csv files from the s3 bucket, and then add labels to it. For each text extract or table, we will assign label "1" to the correct corresponding text, and label "0" to a randomly selected text that does not correspond to the table.
+    * In the text and table curation notebook, we will load the json files (one per pdf) and the corresponding csv files from the s3 bucket, and then add labels to it. For each text extract or table, we will assign label "1" to the correct corresponding text, and label "0" to a randomly selected text that does not correspond to the table.

-## Inference
+## Inference

 * Infer relevance
     * The infer relevance notebook takes in extracted text from the preprocessing stage and for a predefined set of KPI questions, finds relevant paragraphs from the text. These paragraphs are then used to find the exact answers to the questions. The notebook uses a fine-tuned language model stored on s3. The output prediction csv files are saved back on s3.

 * Infer KPI
     * The infer kpi notebook takes in the results from the infer relevance stage and for the predefined set of KPI questions, it finds the exact answers from the relevant paragraphs. The notebook uses a fine-tuned language model stored on s3. The output prediction csv files are saved back on s3.

-## Trino
+## Trino

 * Results table
-    * The create results table notebook takes the prediction output csv from infer KPI step and creates a Trino SQL table that can be used for querying and visualization in Superset.
+    * The create results table notebook takes the prediction output csv from infer KPI step and creates a Trino SQL table that can be used for querying and visualization in Superset.

 ## Elyra pipeline

@@ -68,4 +68,3 @@
 ## Superset Visualization

 * The Superset dashboard is the final step of demo 2. The automated Elyra inference pipeline answers KPI questions from raw pdfs and stores the results in the Trino table. The dashboard queries the table according to user selected filters and shows the answers. To interact with the results, find the [dashboard here](https://superset-secure-odh-superset.apps.odh-cl1.apps.os-climate.org/superset/dashboard/15).
-
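
The curation bullet in the README diff above describes a simple positive/negative labeling scheme: each table keeps its true matching text with label "1" and is paired with one randomly drawn non-matching text with label "0". A minimal sketch of that scheme, assuming illustrative column names rather than the repo's actual curation code:

import random
import pandas as pd

def build_curation_labels(pairs, texts, seed=42):
    """Label 1 for the true text/table pair, 0 for a random mismatch."""
    rng = random.Random(seed)
    rows = []
    for table_id, true_text in pairs:
        # Positive example: the text that actually describes the table.
        rows.append({"table_id": table_id, "text": true_text, "label": 1})
        # Negative example: a randomly selected non-matching text.
        negative = rng.choice([t for t in texts if t != true_text])
        rows.append({"table_id": table_id, "text": negative, "label": 0})
    return pd.DataFrame(rows)

# Example with two tables and three candidate texts.
texts = ["Scope 1 emissions fell 5%.", "Revenue grew 12%.", "Headcount is 4,200."]
pairs = [("table-1", texts[0]), ("table-2", texts[1])]
print(build_curation_labels(pairs, texts))
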
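The infer relevance and infer KPI bullets amount to a retrieve-then-read pattern: rank paragraphs by relevance to each KPI question, then extract the exact answer span from the best candidates. The sketch below approximates both stages with a single extractive QA model; the public checkpoint named here is a stand-in for the demo's fine-tuned models on s3:

from transformers import pipeline

# Stand-in checkpoint; the demo loads its own fine-tuned model from s3.
qa = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

question = "What were the company's total Scope 1 emissions?"
paragraphs = [
    "In 2020 our Scope 1 emissions amounted to 1.2 million tonnes CO2e.",
    "The board of directors met four times during the reporting period.",
]

# Score every paragraph (infer relevance), then keep the single
# highest-confidence answer span (infer KPI).
results = [qa(question=question, context=p) for p in paragraphs]
best = max(results, key=lambda r: r["score"])
print(best["answer"], round(best["score"], 3))
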
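The results table bullet says the create results table notebook turns the infer KPI csv output into a Trino table that Superset can query. A sketch using the trino Python client; the host, catalog, schema, and column layout are assumptions for illustration, not the demo's actual settings:

import trino

# Connection details are illustrative; the demo targets the
# OS-Climate Trino instance with its own catalog and schema.
conn = trino.dbapi.connect(
    host="trino.example.com", port=443, user="demo",
    http_scheme="https", catalog="hive", schema="demo2",
)
cur = conn.cursor()

# A table shaped like the prediction csv; the notebook's real DDL
# may differ (e.g. an external table over the csvs on s3).
cur.execute("""
    CREATE TABLE IF NOT EXISTS kpi_results (
        pdf_name varchar,
        kpi_question varchar,
        answer varchar,
        score double
    )
""")
cur.execute("SELECT kpi_question, answer FROM kpi_results LIMIT 10")
print(cur.fetchall())
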
4 changes: 1 addition & 3 deletions notebooks/demo2/config.py
@@ -1,15 +1,13 @@
"""Default runtime config."""
import src
import pathlib
import torch
import os

# General config
STAGE = "extract" # "extract" | "curate "
SEED = 42

if os.getenv("AUTOMATION"):
ROOT = pathlib.Path("/opt/app-root")
ROOT = pathlib.Path("/opt/app-root")
else:
ROOT = pathlib.Path(__file__).resolve().parent.parent.parent

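A hedged sketch of how a notebook might consume this config module; the data subdirectory and the import path are assumptions for illustration, not taken from the repo:

# Hypothetical downstream use of config.py: ROOT resolves to
# /opt/app-root when AUTOMATION is set (CI), otherwise to the local
# repo checkout, so derived paths work in both environments.
from config import ROOT, SEED, STAGE

DATA_DIR = ROOT / "data"  # assumed layout, for illustration only
print(f"stage={STAGE} seed={SEED} data={DATA_DIR}")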