Commit

Full project commit (#1)
* Baskerville - complete project
* Set theme jekyll-theme-minimal
* Update Dockerfile
mkaranasou committed May 20, 2020
1 parent f770a4e commit 77235c4
Showing 330 changed files with 79,118 additions and 0 deletions.
726 changes: 726 additions & 0 deletions DEPLOYMENT.md

Large diffs are not rendered by default.

557 changes: 557 additions & 0 deletions README.md

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions _config.yml
@@ -0,0 +1 @@
theme: jekyll-theme-minimal
1 change: 1 addition & 0 deletions alembic/README
@@ -0,0 +1 @@
Generic single-database configuration.
87 changes: 87 additions & 0 deletions alembic/env.py
@@ -0,0 +1,87 @@
from __future__ import with_statement
from alembic import context
from sqlalchemy import engine_from_config, pool
from logging.config import fileConfig

import os

# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config

# read and set current database configuration
db_type = os.environ.get('BASKERVILLE_DB_TYPE', 'postgres')
db_user = os.environ.get('DB_USER')
db_pass = os.environ.get('DB_PASS')
db_host = os.environ.get('DB_HOST')
db_name = os.environ.get('BASKERVILLE_DB')
# or parse baskerville config and use get_db_connection_str
# conf = parse_config(path=conf_options['conf_file'])
if all([db_user, db_host, db_pass, db_name]):
    config.set_main_option(
        'sqlalchemy.url',
        f'{db_type}://{db_user}:{db_pass}@{db_host}/{db_name}'
    )

# Interpret the config file for Python logging.
# This line sets up loggers basically.
fileConfig(config.config_file_name)

# add your model's MetaData object here
# for 'autogenerate' support
# from myapp import mymodel
# target_metadata = mymodel.Base.metadata
target_metadata = None

# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.


def run_migrations_offline():
    """Run migrations in 'offline' mode.
    This configures the context with just a URL
    and not an Engine, though an Engine is acceptable
    here as well. By skipping the Engine creation
    we don't even need a DBAPI to be available.
    Calls to context.execute() here emit the given string to the
    script output.
    """
    url = config.get_main_option("sqlalchemy.url")
    context.configure(
        url=url, target_metadata=target_metadata, literal_binds=True)

    with context.begin_transaction():
        context.run_migrations()


def run_migrations_online():
    """Run migrations in 'online' mode.
    In this scenario we need to create an Engine
    and associate a connection with the context.
    """
    connectable = engine_from_config(
        config.get_section(config.config_ini_section),
        prefix='sqlalchemy.',
        poolclass=pool.NullPool)

    with connectable.connect() as connection:
        context.configure(
            connection=connection,
            target_metadata=target_metadata
        )

        with context.begin_transaction():
            context.run_migrations()


if context.is_offline_mode():
    run_migrations_offline()
else:
    run_migrations_online()
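
Note: a minimal sketch of how this env.py is typically driven. It assumes the database environment variables it reads are exported and that an alembic.ini sits at the project root; the credential values below are placeholders, not part of the repository.

import os

from alembic import command
from alembic.config import Config

# Placeholder credentials; env.py reads these to build sqlalchemy.url.
os.environ.setdefault('BASKERVILLE_DB_TYPE', 'postgres')
os.environ.setdefault('DB_USER', 'user')
os.environ.setdefault('DB_PASS', 'pass')
os.environ.setdefault('DB_HOST', '127.0.0.1')
os.environ.setdefault('BASKERVILLE_DB', 'baskerville')

# Run all migrations up to the latest revision (equivalent to `alembic upgrade head`).
command.upgrade(Config('alembic.ini'), 'head')
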
24 changes: 24 additions & 0 deletions alembic/script.py.mako
@@ -0,0 +1,24 @@
"""${message}

Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}

"""
from alembic import op
import sqlalchemy as sa
${imports if imports else ""}

# revision identifiers, used by Alembic.
revision = ${repr(up_revision)}
down_revision = ${repr(down_revision)}
branch_labels = ${repr(branch_labels)}
depends_on = ${repr(depends_on)}


def upgrade():
    ${upgrades if upgrades else "pass"}


def downgrade():
    ${downgrades if downgrades else "pass"}
24 changes: 24 additions & 0 deletions alembic/versions/0c5cf09f1fc4_initial_revision.py
@@ -0,0 +1,24 @@
"""initial revision
Revision ID: 0c5cf09f1fc4
Revises:
Create Date: 2018-05-28 15:01:48.308789
"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = '0c5cf09f1fc4'
down_revision = None
branch_labels = None
depends_on = None


def upgrade():
    pass


def downgrade():
    pass
42 changes: 42 additions & 0 deletions alembic/versions/41ff5ba653c6_remove_subsets.py
@@ -0,0 +1,42 @@
"""remove subsets
Revision ID: 41ff5ba653c6
Revises: 4c5d9065aee2
Create Date: 2019-03-04 14:11:26.543111
"""
from alembic import op
import sqlalchemy as sa


from baskerville.db.models import utcnow

# revision identifiers, used by Alembic.
revision = '41ff5ba653c6'
down_revision = '4c5d9065aee2'
branch_labels = None
depends_on = None


def upgrade():
    op.drop_table('subsets')


def downgrade():
    op.create_table(
        'subsets',
        sa.Column('id', sa.Integer, primary_key=True),
        sa.Column('target', sa.String(45), nullable=False),
        sa.Column('ip', sa.TEXT(), nullable=False),
        sa.Column('start', sa.DateTime(timezone=True)),
        sa.Column('stop', sa.DateTime(timezone=True)),
        sa.Column('num_requests', sa.Integer(), nullable=False),
        sa.Column('features', sa.JSON()),
        sa.Column('prediction', sa.Integer()),
        sa.Column('row_num', sa.Integer()),
        sa.Column('r', sa.Float()),
        sa.Column('time_bucket', sa.Integer()),
        sa.Column(
            'created_at', sa.DateTime(timezone=True), server_default=utcnow()
        ),
    )
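
Note: the downgrade above imports utcnow from baskerville.db.models, which is not shown in this diff. As a hedged sketch, it is presumably a variant of the standard SQLAlchemy compiled-expression recipe for a server-side UTC timestamp, along these lines (the actual implementation may differ):

from sqlalchemy.ext.compiler import compiles
from sqlalchemy.sql import expression
from sqlalchemy.types import DateTime


class utcnow(expression.FunctionElement):
    # Server-side 'current UTC timestamp' expression, usable as a server_default.
    type = DateTime()


@compiles(utcnow, 'postgresql')
def pg_utcnow(element, compiler, **kw):
    # Assumed PostgreSQL rendering, taken from the SQLAlchemy documentation recipe.
    return "TIMEZONE('utc', CURRENT_TIMESTAMP)"
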
32 changes: 32 additions & 0 deletions alembic/versions/4c5d9065aee2_add_banjax_bans_table.py
@@ -0,0 +1,32 @@
"""add banjax bans table
Revision ID: 4c5d9065aee2
Revises:
Create Date: 2019-02-19 13:12:54.127134
"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = '4c5d9065aee2'
down_revision = '0c5cf09f1fc4'
branch_labels = None
depends_on = None


def upgrade():
    op.create_table(
        'banjax_bans',
        sa.Column('id', sa.Integer, primary_key=True),
        sa.Column('sync_start', sa.DateTime(timezone=True)),
        sa.Column('sync_stop', sa.DateTime(timezone=True)),
        sa.Column('ip', sa.TEXT(), nullable=False)
    )
    op.add_column('request_sets', sa.Column('id_banjax', sa.Integer))


def downgrade():
    op.drop_table('banjax_bans')
    op.drop_column('request_sets', 'id_banjax')
105 changes: 105 additions & 0 deletions conf/conf_es_example_baskerville.yaml
@@ -0,0 +1,105 @@
---
database:  # Mandatory configuration
  name: baskerville  # the database name
  user: user
  password: 'pass'
  type: 'postgres'
  host: 127.0.0.1
  port: 5432
  maintenance:  # Optional, for data partitioning and archiving
    template_folder: '/path/to/template/folder/'  # Optional: by default the data folder, can be omitted
    partition_table: 'request_sets'  # default value
    partition_by: week  # partition by week or month, default is week
    partition_field: created_at  # which field to use for the partitioning, this is the default value, can be omitted
    strict: False  # if False, the start and end dates of a week partition are adjusted to the start and end of the respective weeks; if True, the dates remain unchanged. Be consistent with this setting.
    data_partition:  # Optional: define the period to create partitions for
      since: 2018-01-01  # when to start partitioning
      until: "2018-12-31 23:59:59"  # when to stop partitioning
      index_by:  # which fields to index in the partitions that will be created (only one index is currently supported), default value, can be omitted
        - target
        - ip
      template: 'data_partitioning.jinja2'  # Optional: the template name, default value, can be omitted
    data_archive:  # Optional: define the period to archive
      since: 2017-02-01  # which dates to archive - in non-strict mode the start date is moved to the start of its week
      until: 2017-12-31  # likewise for the end date; in strict mode the end date is moved to the end of the week the until date belongs to
      template: 'data_archiving.jinja2'  # Optional: the template name, default value, can be omitted

# Optional: used only by the Elastic pipeline
elastic:
  user: 'elastic'
  password: 'changeme'
  host: 'url to ES instance'
  base_index: 'some.log'
  index_type: 'some_type'

engine:
  time_bucket: 120  # seconds. NOTE: this is the default value and model training depends on it, so it should not be changed under normal circumstances
  # load_test: 10  # multiply the dataset x times and add random IPs - only used for load testing, default False, can be omitted
  es_log:
    host: somehost  # Optional
    start: 2018-01-01 00:00:00  # Optional
    stop: 2018-01-02 00:00:00  # Optional
    batch_length: 30  # minutes - split start and stop into batch_length periods to avoid overloading the ES cluster
    save_logs_dir: path/to/directory/to/save/logs  # optional
  datetime_format: '%Y-%m-%d %H:%M:%S'
  cache_expire_time: 604800  # sec (604800 = 1 week)
  cross_reference: False  # search MISP for IPs
  model_version_id: n  # optional
  extra_features:  # useful when more features must be calculated than the model requires, or when there is no model
    - 'example_feature_average'
  metrics:
    port: 8998
    performance:
      pipeline:  # list the names of the methods you want to time for performance
        - 'preprocessing'
        - 'group_by'
        - 'feature_calculation'
        - 'label_or_predict'
        - 'save'
      request_set_cache:  # list the names of the methods you want to time for performance
        - 'instantiate_cache'
        - '__getitem__'
        - '__contains__'
        - 'clean'
      features: True  # add a metric to time the features
      progress: True  # add a metric to watch the pipeline progress
  data_config:
    parser: JSONLogSparkParser
    schema: '/path/to/data/samples/sample_log_schema.json'
    group_by_cols:
      - 'client_request_host'
      - 'client_ip'
    timestamp_column: '@timestamp'
  logpath: /where/to/save/logs.log
  log_level: 'ERROR'

spark:
  app_name: 'Baskerville'  # the application name - can differ between runs - shown in the Spark UI
  master: 'local'  # the ip:port of the master node, e.g. spark://someip:7077 to submit to a cluster
  parallelism: -1  # controls the number of tasks; -1 means use all cores - used for a local master
  log_level: 'INFO'  # Spark log level
  storage_level: 'OFF_HEAP'  # which strategy to use for storing dataframes - valid values are the ones listed at https://spark.apache.org/docs/2.4.0/api/python/_modules/pyspark/storagelevel.html, default: OFF_HEAP
  jars: '/path/to/jars/postgresql-42.2.4.jar,/path/to/spark-iforest-2.4.0.jar,/path/to/elasticsearch-spark-20_2.11-5.6.5.jar'  # or /path/to/jars/mysql-connector-java-8.0.11.jar
  session_timezone: 'UTC'
  shuffle_partitions: 14  # depends on your dataset and hardware; usually ~2 * number of cores is a good choice
  executor_instances: 4  # omitted when running locally
  executor_cores: 4  # omitted when running locally
  spark_driver_memory: '6G'  # depends on your dataset and the available RAM; when running locally, 6-8 GB should be a good choice, depending on the amount of data to process
  db_driver: 'org.postgresql.Driver'  # or for MySQL: 'com.mysql.cj.jdbc.Driver'
  metrics_conf: /path/to/data/spark.metrics  # Optional: required only to export Spark metrics
  jar_packages: 'com.banzaicloud:spark-metrics_2.11:2.3-2.0.4,io.prometheus:simpleclient:0.3.0,io.prometheus:simpleclient_dropwizard:0.3.0,io.prometheus:simpleclient_pushgateway:0.3.0,io.dropwizard.metrics:metrics-core:3.1.2'  # required to export Spark metrics
  jar_repositories: 'https://raw.github.com/banzaicloud/spark-metrics/master/maven-repo/releases'  # Optional: required only to export Spark metrics
  event_log: True
  serializer: 'org.apache.spark.serializer.KryoSerializer'
  kryoserializer_buffer_max: '2024m'  # 2024m and 1024k are the max values the KryoSerializer can handle
  kryoserializer_buffer: '1024k'  # it is suggested to omit kryoserializer_buffer_max and kryoserializer_buffer, and set them only if you get serialization errors
  driver_java_options: '-verbose:gc'  # Optional. On a local machine with less than 36GB of RAM, add -XX:+UseCompressedOops
  executor_extra_java_options: '-verbose:gc'  # Optional. On a local machine with less than 36GB of RAM, add -XX:+UseCompressedOops
  # To connect to the JVM for memory profiling and debugging (remove -Dcom.sun.management.jmxremote.port=1098 if there is more than one executor, because it will cause the other executors to fail):
  # -XX:+PrintFlagsFinal -XX:+PrintReferenceGC -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+UnlockDiagnosticVMOptions -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.port=1098
  # Depending on your configuration and resources:
  # -Dio.netty.noPreferDirect=true -Dio.netty.allocator.type=unpooled -XX:+UseCompressedOops -XX:G1HeapRegionSize=10 -XX:+UseG1GC -XX:ParallelGCThreads=8 -XX:ConcGCThreads=2 -XX:InitiatingHeapOccupancyPercent=25
  # UseG1GC is usually the best option
  # the number of ParallelGCThreads cannot go above the number of cores
  # ConcGCThreads=2: two per core is a reasonable option that works well in most cases
  # InitiatingHeapOccupancyPercent=25: start a concurrent GC cycle at 25% heap occupancy - test on your machine to see which percentage works well
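
Note: as a rough illustration of how the database block above maps onto a connection URL (the same shape env.py builds from environment variables), here is a minimal sketch. It assumes the file is plain YAML and that PyYAML is installed; the helper name and file path are placeholders, not Baskerville APIs.

import yaml


def db_url_from_config(path):
    # Placeholder helper: build a SQLAlchemy-style URL from the 'database' block.
    with open(path) as f:
        conf = yaml.safe_load(f)
    db = conf['database']
    return (
        f"{db['type']}://{db['user']}:{db['password']}"
        f"@{db['host']}:{db['port']}/{db['name']}"
    )


# Example usage with a placeholder path:
print(db_url_from_config('conf/conf_es_example_baskerville.yaml'))
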