Commit

Full project commit (#1)
* Baskerville - complete project
* Set theme jekyll-theme-minimal
* Update Dockerfile
mkaranasou committed May 20, 2020
1 parent f770a4e commit 77235c4
Showing 330 changed files with 79,118 additions and 0 deletions.
726 changes: 726 additions & 0 deletions DEPLOYMENT.md

Large diffs are not rendered by default.

557 changes: 557 additions & 0 deletions README.md

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions _config.yml
@@ -0,0 +1 @@
theme: jekyll-theme-minimal
1 change: 1 addition & 0 deletions alembic/README
@@ -0,0 +1 @@
Generic single-database configuration.
87 changes: 87 additions & 0 deletions alembic/env.py
@@ -0,0 +1,87 @@
from __future__ import with_statement
from alembic import context
from sqlalchemy import engine_from_config, pool
from logging.config import fileConfig

import os

# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config

# read and set current database configuration
db_type = os.environ.get('BASKERVILLE_DB_TYPE', 'postgres')
db_user = os.environ.get('DB_USER')
db_pass = os.environ.get('DB_PASS')
db_host = os.environ.get('DB_HOST')
db_name = os.environ.get('BASKERVILLE_DB')
# or parse baskerville config and use get_db_connection_str
# conf = parse_config(path=conf_options['conf_file'])
if all([db_user, db_host, db_pass, db_name]):
    config.set_main_option(
        'sqlalchemy.url',
        f'{db_type}://{db_user}:{db_pass}@{db_host}/{db_name}'
    )

# Interpret the config file for Python logging.
# This line sets up loggers basically.
fileConfig(config.config_file_name)

# add your model's MetaData object here
# for 'autogenerate' support
# from myapp import mymodel
# target_metadata = mymodel.Base.metadata
target_metadata = None

# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.


def run_migrations_offline():
    """Run migrations in 'offline' mode.
    This configures the context with just a URL
    and not an Engine, though an Engine is acceptable
    here as well. By skipping the Engine creation
    we don't even need a DBAPI to be available.
    Calls to context.execute() here emit the given string to the
    script output.
    """
    url = config.get_main_option("sqlalchemy.url")
    context.configure(
        url=url, target_metadata=target_metadata, literal_binds=True)

    with context.begin_transaction():
        context.run_migrations()


def run_migrations_online():
    """Run migrations in 'online' mode.
    In this scenario we need to create an Engine
    and associate a connection with the context.
    """
    connectable = engine_from_config(
        config.get_section(config.config_ini_section),
        prefix='sqlalchemy.',
        poolclass=pool.NullPool)

    with connectable.connect() as connection:
        context.configure(
            connection=connection,
            target_metadata=target_metadata
        )

        with context.begin_transaction():
            context.run_migrations()


if context.is_offline_mode():
    run_migrations_offline()
else:
    run_migrations_online()
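
Note: a minimal sketch of how this env.py is typically driven. It assumes the database environment variables it reads are exported and that an alembic.ini sits at the project root; the credential values below are placeholders, not part of the repository.

import os

from alembic import command
from alembic.config import Config

# Placeholder credentials; env.py reads these to build sqlalchemy.url.
os.environ.setdefault('BASKERVILLE_DB_TYPE', 'postgres')
os.environ.setdefault('DB_USER', 'user')
os.environ.setdefault('DB_PASS', 'pass')
os.environ.setdefault('DB_HOST', '127.0.0.1')
os.environ.setdefault('BASKERVILLE_DB', 'baskerville')

# Run all migrations up to the latest revision (equivalent to `alembic upgrade head`).
command.upgrade(Config('alembic.ini'), 'head')
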
24 changes: 24 additions & 0 deletions alembic/script.py.mako
@@ -0,0 +1,24 @@
"""${message}

Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}

"""
from alembic import op
import sqlalchemy as sa
${imports if imports else ""}

# revision identifiers, used by Alembic.
revision = ${repr(up_revision)}
down_revision = ${repr(down_revision)}
branch_labels = ${repr(branch_labels)}
depends_on = ${repr(depends_on)}


def upgrade():
    ${upgrades if upgrades else "pass"}


def downgrade():
    ${downgrades if downgrades else "pass"}
24 changes: 24 additions & 0 deletions alembic/versions/0c5cf09f1fc4_initial_revision.py
@@ -0,0 +1,24 @@
"""initial revision
Revision ID: 0c5cf09f1fc4
Revises:
Create Date: 2018-05-28 15:01:48.308789
"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = '0c5cf09f1fc4'
down_revision = None
branch_labels = None
depends_on = None


def upgrade():
    pass


def downgrade():
    pass
42 changes: 42 additions & 0 deletions alembic/versions/41ff5ba653c6_remove_subsets.py
@@ -0,0 +1,42 @@
"""remove subsets
Revision ID: 41ff5ba653c6
Revises: 4c5d9065aee2
Create Date: 2019-03-04 14:11:26.543111
"""
from alembic import op
import sqlalchemy as sa


from baskerville.db.models import utcnow

# revision identifiers, used by Alembic.
revision = '41ff5ba653c6'
down_revision = '4c5d9065aee2'
branch_labels = None
depends_on = None


def upgrade():
    op.drop_table('subsets')


def downgrade():
    op.create_table(
        'subsets',
        sa.Column('id', sa.Integer, primary_key=True),
        sa.Column('target', sa.String(45), nullable=False),
        sa.Column('ip', sa.TEXT(), nullable=False),
        sa.Column('start', sa.DateTime(timezone=True)),
        sa.Column('stop', sa.DateTime(timezone=True)),
        sa.Column('num_requests', sa.Integer(), nullable=False),
        sa.Column('features', sa.JSON()),
        sa.Column('prediction', sa.Integer()),
        sa.Column('row_num', sa.Integer()),
        sa.Column('r', sa.Float()),
        sa.Column('time_bucket', sa.Integer()),
        sa.Column(
            'created_at', sa.DateTime(timezone=True), server_default=utcnow()
        ),
    )
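
Note: the downgrade above imports utcnow from baskerville.db.models, which is not shown in this diff. As a hedged sketch, it is presumably a variant of the standard SQLAlchemy compiled-expression recipe for a server-side UTC timestamp, along these lines (the actual implementation may differ):

from sqlalchemy.ext.compiler import compiles
from sqlalchemy.sql import expression
from sqlalchemy.types import DateTime


class utcnow(expression.FunctionElement):
    # Server-side 'current UTC timestamp' expression, usable as a server_default.
    type = DateTime()


@compiles(utcnow, 'postgresql')
def pg_utcnow(element, compiler, **kw):
    # Assumed PostgreSQL rendering, taken from the SQLAlchemy documentation recipe.
    return "TIMEZONE('utc', CURRENT_TIMESTAMP)"
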
32 changes: 32 additions & 0 deletions alembic/versions/4c5d9065aee2_add_banjax_bans_table.py
@@ -0,0 +1,32 @@
"""add banjax bans table
Revision ID: 4c5d9065aee2
Revises:
Create Date: 2019-02-19 13:12:54.127134
"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = '4c5d9065aee2'
down_revision = '0c5cf09f1fc4'
branch_labels = None
depends_on = None


def upgrade():
    op.create_table(
        'banjax_bans',
        sa.Column('id', sa.Integer, primary_key=True),
        sa.Column('sync_start', sa.DateTime(timezone=True)),
        sa.Column('sync_stop', sa.DateTime(timezone=True)),
        sa.Column('ip', sa.TEXT(), nullable=False)
    )
    op.add_column('request_sets', sa.Column('id_banjax', sa.Integer))


def downgrade():
    op.drop_table('banjax_bans')
    op.drop_column('request_sets', 'id_banjax')
105 changes: 105 additions & 0 deletions conf/conf_es_example_baskerville.yaml
@@ -0,0 +1,105 @@
---
database:  # Mandatory configuration
  name: baskerville  # the database name
  user: user
  password: 'pass'
  type: 'postgres'
  host: 127.0.0.1
  port: 5432
  maintenance:  # Optional, for data partitioning and archiving
    template_folder: '/path/to/template/folder/'  # Optional: by default the data folder, can be omitted
    partition_table: 'request_sets'  # default value
    partition_by: week  # partition by week or month, default is week
    partition_field: created_at  # which field to use for the partitioning, this is the default value, can be omitted
    strict: False  # if False, the start and end dates of a week partition are adjusted to the start and end of the respective weeks; if True, the dates remain unchanged. Be consistent with this setting.
    data_partition:  # Optional: define the period to create partitions for
      since: 2018-01-01  # when to start partitioning
      until: "2018-12-31 23:59:59"  # when to stop partitioning
      index_by:  # which fields to index in the partitions that will be created (only one index is currently supported), default value, can be omitted
        - target
        - ip
      template: 'data_partitioning.jinja2'  # Optional: the template name, default value, can be omitted
    data_archive:  # Optional: define the period to archive
      since: 2017-02-01  # which dates to archive - in non-strict mode the start date is moved to the start of its week
      until: 2017-12-31  # likewise for the end date; in strict mode the end date is moved to the end of the week the until date belongs to
      template: 'data_archiving.jinja2'  # Optional: the template name, default value, can be omitted

# Optional: used only by the Elastic pipeline
elastic:
  user: 'elastic'
  password: 'changeme'
  host: 'url to ES instance'
  base_index: 'some.log'
  index_type: 'some_type'

engine:
  time_bucket: 120  # seconds. NOTE: this is the default value and model training depends on it, so it should not be changed under normal circumstances
  # load_test: 10  # multiply the dataset x times and add random IPs - only used for load testing, default False, can be omitted
  es_log:
    host: somehost  # Optional
    start: 2018-01-01 00:00:00  # Optional
    stop: 2018-01-02 00:00:00  # Optional
    batch_length: 30  # minutes - split start and stop into batch_length periods to avoid overloading the ES cluster
    save_logs_dir: path/to/directory/to/save/logs  # optional
  datetime_format: '%Y-%m-%d %H:%M:%S'
  cache_expire_time: 604800  # sec (604800 = 1 week)
  cross_reference: False  # search MISP for IPs
  model_version_id: n  # optional
  extra_features:  # useful when more features must be calculated than the model requires, or when there is no model
    - 'example_feature_average'
  metrics:
    port: 8998
    performance:
      pipeline:  # list the names of the methods you want to time for performance
        - 'preprocessing'
        - 'group_by'
        - 'feature_calculation'
        - 'label_or_predict'
        - 'save'
      request_set_cache:  # list the names of the methods you want to time for performance
        - 'instantiate_cache'
        - '__getitem__'
        - '__contains__'
        - 'clean'
      features: True  # add a metric to time the features
      progress: True  # add a metric to watch the pipeline progress
  data_config:
    parser: JSONLogSparkParser
    schema: '/path/to/data/samples/sample_log_schema.json'
    group_by_cols:
      - 'client_request_host'
      - 'client_ip'
    timestamp_column: '@timestamp'
  logpath: /where/to/save/logs.log
  log_level: 'ERROR'

spark:
  app_name: 'Baskerville'  # the application name - can differ between runs - shown in the Spark UI
  master: 'local'  # the ip:port of the master node, e.g. spark://someip:7077 to submit to a cluster
  parallelism: -1  # controls the number of tasks; -1 means use all cores - used for a local master
  log_level: 'INFO'  # Spark log level
  storage_level: 'OFF_HEAP'  # which strategy to use for storing dataframes - valid values are the ones listed at https://spark.apache.org/docs/2.4.0/api/python/_modules/pyspark/storagelevel.html, default: OFF_HEAP
  jars: '/path/to/jars/postgresql-42.2.4.jar,/path/to/spark-iforest-2.4.0.jar,/path/to/elasticsearch-spark-20_2.11-5.6.5.jar'  # or /path/to/jars/mysql-connector-java-8.0.11.jar
  session_timezone: 'UTC'
  shuffle_partitions: 14  # depends on your dataset and hardware; usually ~2 * number of cores is a good choice
  executor_instances: 4  # omitted when running locally
  executor_cores: 4  # omitted when running locally
  spark_driver_memory: '6G'  # depends on your dataset and the available RAM; when running locally, 6-8 GB should be a good choice, depending on the amount of data to process
  db_driver: 'org.postgresql.Driver'  # or for MySQL: 'com.mysql.cj.jdbc.Driver'
  metrics_conf: /path/to/data/spark.metrics  # Optional: required only to export Spark metrics
  jar_packages: 'com.banzaicloud:spark-metrics_2.11:2.3-2.0.4,io.prometheus:simpleclient:0.3.0,io.prometheus:simpleclient_dropwizard:0.3.0,io.prometheus:simpleclient_pushgateway:0.3.0,io.dropwizard.metrics:metrics-core:3.1.2'  # required to export Spark metrics
  jar_repositories: 'https://raw.github.com/banzaicloud/spark-metrics/master/maven-repo/releases'  # Optional: required only to export Spark metrics
  event_log: True
  serializer: 'org.apache.spark.serializer.KryoSerializer'
  kryoserializer_buffer_max: '2024m'  # 2024m and 1024k are the max values the KryoSerializer can handle
  kryoserializer_buffer: '1024k'  # it is suggested to omit kryoserializer_buffer_max and kryoserializer_buffer, and set them only if you get serialization errors
  driver_java_options: '-verbose:gc'  # Optional. On a local machine with less than 36GB of RAM, add -XX:+UseCompressedOops
  executor_extra_java_options: '-verbose:gc'  # Optional. On a local machine with less than 36GB of RAM, add -XX:+UseCompressedOops
  # To connect to the JVM for memory profiling and debugging (remove -Dcom.sun.management.jmxremote.port=1098 if there is more than one executor, because it will cause the other executors to fail):
  # -XX:+PrintFlagsFinal -XX:+PrintReferenceGC -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+UnlockDiagnosticVMOptions -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.port=1098
  # Depending on your configuration and resources:
  # -Dio.netty.noPreferDirect=true -Dio.netty.allocator.type=unpooled -XX:+UseCompressedOops -XX:G1HeapRegionSize=10 -XX:+UseG1GC -XX:ParallelGCThreads=8 -XX:ConcGCThreads=2 -XX:InitiatingHeapOccupancyPercent=25
  # UseG1GC is usually the best option
  # the number of ParallelGCThreads cannot go above the number of cores
  # ConcGCThreads=2: two per core is a reasonable option that works well in most cases
  # InitiatingHeapOccupancyPercent=25: start a concurrent GC cycle at 25% heap occupancy - test on your machine to see which percentage works well
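
Note: as a rough illustration of how the database block above maps onto a connection URL (the same shape env.py builds from environment variables), here is a minimal sketch. It assumes the file is plain YAML and that PyYAML is installed; the helper name and file path are placeholders, not Baskerville APIs.

import yaml


def db_url_from_config(path):
    # Placeholder helper: build a SQLAlchemy-style URL from the 'database' block.
    with open(path) as f:
        conf = yaml.safe_load(f)
    db = conf['database']
    return (
        f"{db['type']}://{db['user']}:{db['password']}"
        f"@{db['host']}:{db['port']}/{db['name']}"
    )


# Example usage with a placeholder path:
print(db_url_from_config('conf/conf_es_example_baskerville.yaml'))
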