Kubernetes (#103)
- Kubernetes deployment
- Incident detector
- Saving incident request sets into cloud storage using an s3 interface (see the sketch below)
- Streaming challenges directly to ElasticSearch
mazhurin authored Jan 19, 2022
1 parent 433551d commit e48390c
Showing 225 changed files with 90,113 additions and 1,365 deletions.
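
The "s3 interface" bullet in the commit message can be pictured with a short PySpark sketch: writing a request-sets DataFrame to any S3-compatible store through the s3a connector. The endpoint, credentials, bucket path, and the presence of hadoop-aws on the classpath are illustrative assumptions, not Baskerville's actual setup.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("incident-archive-sketch").getOrCreate()

# Point the s3a connector at an S3-compatible endpoint; every value below is
# an illustrative placeholder, not Baskerville's real configuration.
hconf = spark.sparkContext._jsc.hadoopConfiguration()
hconf.set("fs.s3a.endpoint", "s3.example.com")
hconf.set("fs.s3a.access.key", "ACCESS_KEY")
hconf.set("fs.s3a.secret.key", "SECRET_KEY")

# A stand-in for the request sets attached to a detected incident.
request_sets_df = spark.createDataFrame(
    [("1.2.3.4", "example.com", 0.93)], ["ip", "target", "score"]
)

# Append the incident's request sets to cloud storage as parquet.
request_sets_df.write.mode("append").parquet("s3a://incidents/request_sets/")
```
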
12 changes: 12 additions & 0 deletions .dockerignore
@@ -0,0 +1,12 @@
notebooks
container
docs
ip_cache
src/baskerville/logs
data/airflow
data/feature_overview
data/img
data/scripts
data/templates
ip_cache
alembic
17 changes: 10 additions & 7 deletions .github/workflows/unit_tests.yml
@@ -1,13 +1,13 @@
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: Python application
name: Unit tests

on:
push:
branches: [ master ]
branches: [ develop, master ]
pull_request:
branches: [ master ]
branches: [ develop, master ]

jobs:
build:
@@ -24,8 +24,8 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install flake8 pytest
if [ -f requirements_unit_tests.txt ]; then pip install -r requirements_unit_tests.txt; fi
git clone https://github.com/titicaca/spark-iforest.git
pip install -r requirements.txt
git clone -b categorical_features https://github.com/equalitie/spark-iforest.git
cd spark-iforest/python
python setup.py sdist
pip install dist/pyspark-iforest-2.4.0.tar.gz
@@ -37,10 +37,13 @@ jobs:
mkdir ./src/baskerville/logs/
- name: Lint with flake8
run: |
chmod 745 linting.sh
./linting.sh
cd ./src
flake8 . --count --ignore=C901,W503,W504,E226 --max-line-length=127 --statistics
- name: Test with pytest
run: |
export PYTHONPATH="./src:./esretriever/src"
pytest ./tests/unit
- name: License check
run: |
python ./src/baskerville/util/licensing.py
2 changes: 2 additions & 0 deletions .gitignore
@@ -127,3 +127,5 @@ dmypy.json

# Pyre type checker
.pyre/

ip_cache/
320 changes: 222 additions & 98 deletions DEPLOYMENT.md

Large diffs are not rendered by default.

22 changes: 22 additions & 0 deletions Dockerfile
@@ -0,0 +1,22 @@
# Copyright (c) 2020, eQualit.ie inc.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

FROM equalitie/baskerville:worker

WORKDIR /usr/local
RUN apt-get update && apt-get install -y wget
RUN wget https://builds.openlogic.com/downloadJDK/openlogic-openjdk/8u262-b10/openlogic-openjdk-8u262-b10-linux-x64.tar.gz
RUN mkdir jdk262
RUN tar -zxvf openlogic-openjdk-8u262-b10-linux-x64.tar.gz -C jdk262
RUN rm -r $JAVA_HOME/*
RUN mv jdk262/openlogic-openjdk-8u262-b10-linux-x64/* $JAVA_HOME/

COPY ./src /usr/local/baskerville/src
COPY ./data/jars /usr/local/baskerville/data/jars
COPY ./requirements.txt /usr/local/baskerville

WORKDIR /usr/local/baskerville
RUN pip3 install -e .
4 changes: 4 additions & 0 deletions Dockerfile_notebook
@@ -0,0 +1,4 @@
FROM equalitie/baskerville:notebook

COPY ./src /usr/local/baskerville/src
COPY ./data/jars /usr/local/baskerville/data/jars
26 changes: 26 additions & 0 deletions README.md
@@ -538,10 +538,36 @@ Most of the features are `updateable`, which means they **take the past into consideration**
![Baskerville's Request Set Cache](data/img/request_set_cache.png?raw=true "Baskerville's Request Set Cache")
## Building Baskerville image
* build spark image https://levelup.gitconnected.com/spark-on-kubernetes-3d822969f85b
```commandline
wget https://archive.apache.org/dist/spark/spark-2.4.6/spark-2.4.6-bin-hadoop2.7.tgz
mkdir spark
mv spark-2.4.6-bin-hadoop2.7.tgz spark
cd spark
tar -xvzf spark-2.4.6-bin-hadoop2.7.tgz
export SPARK_HOME=/root/spark/spark-2.4.6-bin-hadoop2.7
alias spark-shell="$SPARK_HOME/bin/spark-shell"
$SPARK_HOME/bin/docker-image-tool.sh -r baskerville -t spark2.4.6 -p $SPARK_HOME/kubernetes/dockerfiles/spark/bindings/python/Dockerfile build
docker tag baskerville/spark-py:v2.4.6 equalitie/baskerville:spark246
```
* build Baskerville worker image
```commandline
docker build -t equalitie/baskerville:worker dockerfiles/worker/
```
* build the latest Baskerville image with your local changes
```commandline
docker build -t equalitie/baskerville:latest .
docker push equalitie/baskerville:latest
```
## Related Projects
- ES Retriever: https://github.com/equalitie/esretriever: a Spark wrapper to retrieve data from ElasticSearch
- Deflect Analysis Ecosystem: https://github.com/equalitie/deflect-analysis-ecosystem:
Docker files for all the components Baskerville might need.
- Baskerville client: https://github.com/equalitie/baskerville_client
## TODO
- Implement full suite of unit and entity tests.
21 changes: 0 additions & 21 deletions alembic/versions/0c5cf09f1fc4_initial_revision.py

This file was deleted.

42 changes: 0 additions & 42 deletions alembic/versions/41ff5ba653c6_remove_subsets.py

This file was deleted.

32 changes: 0 additions & 32 deletions alembic/versions/4c5d9065aee2_add_banjax_bans_table.py

This file was deleted.

23 changes: 23 additions & 0 deletions alembic/versions/88eb5854154f_add_id_group_in_request_sets.py
@@ -0,0 +1,23 @@
"""add uuid_request_set in request_sets
Revision ID: 88eb5854154f
Revises:
Create Date: 2020-07-07 11:02:39.321300
"""
from alembic import op
import sqlalchemy as sa

# revision identifiers, used by Alembic.
revision = '88eb5854154f'
down_revision = None
branch_labels = None
depends_on = None


def upgrade():
    op.add_column('request_sets', sa.Column('uuid_request_set', sa.TEXT))


def downgrade():
    op.drop_column('request_sets', 'uuid_request_set')
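
For reference, a minimal sketch of applying this revision programmatically with Alembic's own API; the alembic.ini path is an assumption, and `alembic upgrade head` is the usual CLI equivalent.

```python
# Hedged sketch: apply or roll back the uuid_request_set migration from Python.
from alembic import command
from alembic.config import Config

cfg = Config("alembic.ini")           # assumed location of the Alembic config
command.upgrade(cfg, "88eb5854154f")  # upgrade up to this revision
# command.downgrade(cfg, "-1")        # step back one revision if needed
```
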
57 changes: 41 additions & 16 deletions conf/conf_example_baskerville.yaml
@@ -39,6 +39,8 @@ misp:
misp_verifycert: True

engine:
client_mode: False # used in isac mode: whether the pipeline will run in client mode
id_client: 'some_id' # used in isac mode: the unique client id - assigned by baskerville team
time_bucket: 120 # seconds: NOTE: this is the default value, model training is dependent upon this, this should not be set under normal circumstances
load_test: 10 # multiply the dataset x times and add random ips - only used for load testing, default false, can be omitted.
es_log:
@@ -53,17 +55,40 @@ engine:
- 'path2/log2.json'
- 'path/to/somename*/*' # use * to match folder names and files, spark will load them all in one dataframe if they are of the same format
training: # Optional: only for the Training pipeline
model: 'baskerville.models.anomaly_model.AnomalyModel' # which model to use, see baskerville.util.enums.ModelEnum
data_parameters: # to define the training period, either use training_days or from/ to date
# training_days: 30 # today - training_days
from_date: '2020-03-01 20:59:59'
to_date: '2020-03-05 13:01:01'
model_parameters: # depending on the model chosen, provide the training parameters, as you would to instantiate the class
num_trees: 100 # number of trees
max_samples: 100 # number of samples per tree
categorical_features: ['target'] # the list of categorical features
# contamination: 0.1 # the target portion of anomalies in the dataset. Either use contamination or threshold
threshold: 0.45 # the threshold for anomalies in the dataset. Either use contamination or threshold
model: 'baskerville.models.anomaly_model.AnomalyModel'
data_parameters:
#training_days: 30
from_date: '2020-07-06 18:00:00'
to_date: '2020-07-10 17:00:00'
max_samples_per_host: 5000
model_parameters:
threshold: 0.45
max_samples: 1000
#contamination: 0.1
num_trees: 300
max_depth: 10
max_features: 1.0
#approximate_quantile_relative_error: 0.4
features:
- host
# - country
- request_rate
- css_to_html_ratio
- image_to_html_ratio
- js_to_html_ratio
- path_depth_average
- path_depth_variance
- payload_size_average
- payload_size_log_average
- request_interval_average
- request_interval_variance
- response4xx_to_request_ratio
- top_page_to_request_ratio
- unique_path_rate
- unique_path_to_request_ratio
- unique_query_rate
- unique_query_to_unique_path_ratio
- unique_ua_rate
simulation: # Optional: used only to test the kafka pipeline
sleep: True
verbose: True
@@ -101,12 +126,12 @@ engine:
logpath: /where/to/save/logs.log
log_level: 'ERROR'

# Optional: used only by the Kafka Pipeline
kafka:
bootstrap_servers: '0.0.0.0:9092' # ip: port for kafka
zookeeper: 'localhost:2181' # ip: port for zookeeper
consume_topic: 'incoming.logs' # which should baskerville consume
consume_group: 'baskerville' # a name for the consume group
url: !ENV '${KAFKA_HOST}:9092' # ip: port for kafka
zookeeper: !ENV '${KAFKA_HOST}:2181' # ip: port for zookeeper
logs_topic: 'incoming.logs' # which should baskerville consume
features_topic: 'features' # used in isac mode: where to send the feature vectors
predictions_topic: 'predictions' # used in isac mode: where to send/ listen for predictions

spark:
app_name: 'Baskerville' # the application name - can be changed for two different runs - used by the spark UI
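
The training block above maps almost one-to-one onto the pyspark-iforest estimator that the unit-test workflow installs from the equalitie fork. Below is a rough sketch under that assumption: the real entry point is `baskerville.models.anomaly_model.AnomalyModel`, the fork's `categorical_features` branch is what handles categorical columns such as `host`, and `training_df` is a hypothetical DataFrame, so treat this as an illustration of the parameters rather than the actual training path.

```python
from pyspark.ml.feature import VectorAssembler
from pyspark_iforest.ml.iforest import IForest

# A few of the configured features; the full list is in the YAML above.
feature_cols = ["request_rate", "css_to_html_ratio", "unique_path_rate"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

iforest = IForest(
    numTrees=300,     # num_trees
    maxDepth=10,      # max_depth
    maxSamples=1000,  # max_samples
    maxFeatures=1.0,  # max_features
)
# model = iforest.fit(assembler.transform(training_df))
# The configured threshold (0.45) would then be applied to the model's
# anomaly scores at prediction time.
```
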
12 changes: 8 additions & 4 deletions conf/conf_kafka_example_baskerville.yaml
@@ -25,9 +25,11 @@ database: # Mandatory configuration
template: 'data_archiving.jinja2' # Optional: the template name, default value, can be omitted

engine:
time_bucket: 120 # seconds: NOTE: this is the default value, model training is dependent upon this, this should not be set under normal circumstances
# load_test: 10 # multiply the dataset x times and add random ips - only used for load testing, default false, can be omitted.
simulation: # Optional: used only to test the kafka pipeline
client_mode: False # used in isac mode: whether the pipeline will run in client mode
id_client: 'some_id' # used in isac mode: the unique client id - assigned by baskerville team
time_bucket: 120 # seconds: NOTE: this is the default value, model training is dependent upon this, this should not be set under normal circumstances
# load_test: 10 # multiply the dataset x times and add random ips - only used for load testing, default false, can be omitted.
simulation: # Optional: used only to test the kafka pipeline
sleep: True
log_file: '/path/to/log.json' # the file to chunk and send to kafka
datetime_format: '%Y-%m-%d %H:%M:%S'
@@ -65,7 +67,9 @@ engine:
kafka:
bootstrap_servers: '0.0.0.0:9092' # ip: port for kafka
zookeeper: 'localhost:2181' # ip: port for zookeeper
consume_topic: 'incoming.logs' # which should baskerville consume
logs_topic: 'incoming.logs' # which should baskerville consume
features_topic: 'features' # used in isac mode: where to send the feature vectors
predictions_topic: 'predictions' # used in isac mode: where to send/ listen for predictions
consume_group: 'baskerville' # a name for the consume group

spark:
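
For illustration, a hedged sketch of how the `!ENV`-tagged values and `logs_topic` above could be consumed: a small pyyaml constructor expands `${KAFKA_HOST}` from the environment, and a Spark 2.4 structured stream subscribes to the topic. The constructor, the config path, and the presence of the spark-sql-kafka package on the classpath are all assumptions, not Baskerville's actual consumer.

```python
import os
import re

import yaml
from pyspark.sql import SparkSession

ENV_PATTERN = re.compile(r"\$\{([^}]+)\}")

def env_constructor(loader, node):
    # Expand ${VAR} references in !ENV-tagged scalars from the environment.
    value = loader.construct_scalar(node)
    return ENV_PATTERN.sub(lambda m: os.environ.get(m.group(1), ""), value)

yaml.SafeLoader.add_constructor("!ENV", env_constructor)

with open("conf/conf_kafka_example_baskerville.yaml") as f:
    conf = yaml.load(f, Loader=yaml.SafeLoader)

spark = SparkSession.builder.appName("baskerville-sketch").getOrCreate()
logs = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", conf["kafka"]["bootstrap_servers"])
    .option("subscribe", conf["kafka"]["logs_topic"])
    .load()
)
```
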