Apache data generator #93

Open · wants to merge 7 commits into base: main
68 changes: 68 additions & 0 deletions integrations/observability/apache/data/generator/README.md
@@ -0,0 +1,68 @@
# Apache Log Processor

This folder contains scripts for generating and processing Apache log files, transforming them into a JSON format suitable for ingestion into an OpenSearch Simple Schema for Observability (SS4O) Apache index.

## Scripts

### `generator.py`

This script simulates live Apache traffic. Each worker process continuously generates log entries in the Apache combined log format and writes them to its own date-stamped file; the line format is sketched below the argument list.

#### Arguments

- `--filename`: Base filename for the generated log files; each worker process writes to `<filename>_process_<id>_<date>.txt`. Default is `apache_logs`.
- `--log-number`: Number of log entries to generate per process. A negative value (the default) generates logs indefinitely.
- `--start-time` / `--end-time`: Optional bounds for the generated timestamps, in `YYYY-MM-DD` format.
- `--num-processes`: Number of generator processes to run in parallel. Default is `4`.
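
For reference, a minimal sketch of how a single entry is assembled, mirroring the format string used in `generator.py` (the values are random, so the commented line is only illustrative):

```python
from faker import Faker
import random

fake = Faker()
log_format = '{ip} - - [{time}] "{method} {url} HTTP/1.1" {status_code} {size} "{referer}" "{user_agent}"'

# Prints something like:
# 203.0.113.7 - - [03/May/2024:10:15:32 +0000] "GET /category/posts HTTP/1.1" 200 1234 "https://example.org/" "Mozilla/5.0 ..."
print(log_format.format(
    ip=fake.ipv4(),
    time=fake.date_time().strftime('%d/%b/%Y:%H:%M:%S +0000'),
    method=random.choice(['GET', 'POST', 'DELETE', 'PUT']),
    url=fake.uri_path(),
    status_code=random.choice([200, 301, 400, 404, 500]),
    size=random.randint(20, 5000),
    referer=fake.uri(),
    user_agent=fake.user_agent(),
))
```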

### `run-etl.py`

This script follows a live Apache log file, transforms each new log entry into a structured JSON document, and appends the output to a file called `output_log.json`; a condensed example of the transformation is shown below the argument list.

#### Arguments

- `--filename`: The filename of the raw Apache log file to process. Default is `apache_logs.txt`.
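
Each line is parsed with a regular expression and mapped into an SS4O-style document; the full field mapping lives in `run-etl.py` below. A condensed sketch of the idea for a single line, producing only a subset of the fields emitted by the real script:

```python
import json
import re
import uuid

line = '203.0.113.7 - - [03/May/2024:10:15:32 +0000] "GET /index.html HTTP/1.1" 200 1043 "https://example.org/" "Mozilla/5.0"'

# Same pattern used by run-etl.py, split across lines for readability
pattern = re.compile(
    r'(?P<ip>\d+\.\d+\.\d+\.\d+) - - \[(?P<timestamp>[^\]]+)\] '
    r'"(?P<method>\w+) (?P<url>[^\s]+) HTTP/(?P<flavor>[^"]+)" '
    r'(?P<status_code>\d+) (?P<bytes>\d+) "(?P<referer>[^"]+)" "(?P<user_agent>[^"]+)"'
)

m = pattern.match(line)
doc = {
    "http": {
        "request": {"method": m.group('method')},
        "response": {"status_code": int(m.group('status_code')), "bytes": int(m.group('bytes'))},
        "url": m.group('url'),
    },
    "communication": {"source": {"ip": m.group('ip')}},
    "body": line,
    "traceId": str(uuid.uuid4()),
}
print(json.dumps(doc, indent=2))
```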

## Usage

### Running
To use these scripts, you will need Python 3 installed on your system along with the packages listed in `requirements.txt`.

The `run.sh` bash script orchestrates the two Python scripts: it installs the dependencies listed in `requirements.txt`, then starts `generator.py` and `run-etl.py` in the background, redirecting their output to `generator.log` and `etl.log` respectively. Any extra arguments are passed through to both scripts:

```bash
./run.sh --filename apache_raw.logs
```
Don't forget to make the script executable by setting the following file permissions:

```bash
chmod +x run.sh
```

### Stopping

Stop the background processes with the `stop.sh` script:

```bash
./stop.sh
```

Don't forget to make the script executable by setting the following file permissions:

```bash
chmod +x stop.sh
```

### Sync the result to an S3 bucket

The `sync_s3.sh` script synchronizes the JSON output file(s) to an Amazon S3 bucket using the AWS Command Line Interface (AWS CLI), based on the provided filename pattern and bucket name. Make sure the AWS CLI is installed and configured with credentials that have permission to write to the target bucket. (An optional Python alternative is sketched after the usage example below.)

Don't forget to make the script executable by setting the following file permissions:

```bash
chmod +x sync_s3.sh
```

You can then run the script by providing the filename (or a filename pattern) and the S3 bucket name. The script prepends `s3://` itself, so pass the bucket name and optional key prefix without the scheme:

```bash
./sync_s3.sh --filename output_log.json --bucket my_data/apache_logs
```
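
If you prefer to run this step from Python instead of the AWS CLI, here is a minimal `boto3` sketch. Note that `boto3` is not listed in `requirements.txt`, and the bucket and key names below are only placeholders:

```python
import boto3

# Uses the same credential chain as the AWS CLI (environment, ~/.aws, instance role, ...)
s3 = boto3.client("s3")

# Upload the transformed log file; bucket and key are placeholders
s3.upload_file(
    Filename="output_log.json",
    Bucket="my-data",
    Key="apache_logs/output_log.json",
)
```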
78 changes: 78 additions & 0 deletions integrations/observability/apache/data/generator/generator.py
@@ -0,0 +1,78 @@
import argparse
from faker import Faker
import random
import time
import multiprocessing
import os
from datetime import datetime

def generate_logs(args, process_id):
fake = Faker()

# Define the log format
log_format = '{ip} - - [{time}] "{method} {url} HTTP/1.1" {status_code} {size} "{referer}" "{user_agent}"'
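    # Example of an emitted line:
    # 198.51.100.24 - - [12/Jun/2024:08:42:10 +0000] "POST /users/login HTTP/1.1" 200 1532 "https://example.com/home" "Mozilla/5.0 ..."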

    # Derive a per-process filename from the base filename, process ID, and current date
    current_date = datetime.now().strftime('%Y-%m-%d')
    filename = f"{args.filename}_process_{process_id}_{current_date}.txt"

# Open the log file
with open(filename, 'w') as log_file:
log_count = 0 # Initialize a counter for the number of logs generated
while args.log_number < 0 or log_count < args.log_number:
            # If a time range is provided, generate timestamps within that range;
            # the YYYY-MM-DD bounds are parsed into datetimes for Faker
            if args.start_time and args.end_time:
                start_dt = datetime.strptime(args.start_time, '%Y-%m-%d')
                end_dt = datetime.strptime(args.end_time, '%Y-%m-%d')
                time_str = fake.date_time_between(start_date=start_dt, end_date=end_dt).strftime('%d/%b/%Y:%H:%M:%S +0000')
            else:
                time_str = fake.date_time().strftime('%d/%b/%Y:%H:%M:%S +0000')

log_entry = log_format.format(
ip=fake.ipv4(),
time=time_str,
method=random.choice(['GET', 'POST', 'DELETE', 'PUT']),
url=fake.uri_path(),
status_code=random.choice([200, 301, 400, 404, 500]),
size=random.randint(20, 5000),
referer=fake.uri(),
user_agent=fake.user_agent()
)
            print(log_entry, file=log_file, flush=True)  # flush so run-etl.py can follow the file in near real time
log_count += 1
# To avoid high CPU usage, add a small delay in the infinite loop
if args.log_number < 0:
time.sleep(0.1) # Sleep for 0.1 second

# Print out how many log entries were generated
if args.log_number < 0:
print(f"Generated logs indefinitely in {filename}")
else:
print(f"Generated {log_count} log entries in {filename}")

def main():
parser = argparse.ArgumentParser(description='Generate Apache log files with Faker.')
parser.add_argument('--filename', type=str, default='', help='Base filename for the generated log files.')
parser.add_argument('--log-number', type=int, default=-1, help='Number of logs to generate. Pass a negative number to generate logs indefinitely.')
parser.add_argument('--start-time', type=str, help='Start time for log generation in YYYY-MM-DD format.')
parser.add_argument('--end-time', type=str, help='End time for log generation in YYYY-MM-DD format.')
parser.add_argument('--num-processes', type=int, default=4, help='Number of processes to run in parallel.')

args = parser.parse_args()

    # If no filename is provided, use a default base name; the per-process,
    # date-stamped filenames are derived from it in generate_logs()
    if not args.filename:
        args.filename = "apache_logs"

# Create a list to hold the process objects
processes = []

for process_id in range(args.num_processes):
process = multiprocessing.Process(target=generate_logs, args=(args, process_id))
processes.append(process)
process.start()

for process in processes:
process.join()

if __name__ == "__main__":
main()
5 changes: 5 additions & 0 deletions integrations/observability/apache/data/generator/requirements.txt
@@ -0,0 +1,5 @@
faker
argparse
pandas
user-agents
python-dateutil
115 changes: 115 additions & 0 deletions integrations/observability/apache/data/generator/run-etl.py
@@ -0,0 +1,115 @@
import argparse
import time
import os
import json
import re
import uuid
import pandas as pd

from user_agents import parse as ua_parse
from faker import Faker

fake = Faker()


# Function to follow the tail of a log file
def follow(filename):
with open(filename, 'r') as f:
# Move to the end of the file
f.seek(0, os.SEEK_END)

while True:
line = f.readline()
if not line:
time.sleep(0.1) # Sleep briefly to avoid busy waiting
continue
yield line

# Define a regular expression pattern for Apache logs
log_pattern = re.compile(
r'(?P<ip>\d+\.\d+\.\d+\.\d+) - - \[(?P<timestamp>[^\]]+)\] "(?P<method>\w+) (?P<url>[^\s]+) HTTP/(?P<flavor>[^"]+)" (?P<status_code>\d+) (?P<bytes>\d+) "(?P<referer>[^"]+)" "(?P<user_agent>[^"]+)"'
)
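# Example of a line the pattern matches (spaces and quotes must appear exactly as in the Apache combined format):
# 192.168.1.10 - - [03/May/2024:10:15:32 +0000] "GET /index.html HTTP/1.1" 200 1043 "https://example.com/" "Mozilla/5.0 (X11; Linux x86_64)"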

# Function to parse and transform a single log line
def parse_and_transform_log_line(line):
match = log_pattern.match(line)
if match:
user_agent = ua_parse(match.group('user_agent'))

# Use Faker to generate country name and country code separately
country_name = fake.country()
country_code = fake.country_code() # This generates a country ISO code

log_json = {
"observedTimestamp": pd.to_datetime(match.group('timestamp'), format='%d/%b/%Y:%H:%M:%S %z').strftime('%Y-%m-%dT%H:%M:%S.000Z'),
"http": {
"response": {
"status_code": int(match.group('status_code')),
"bytes": int(match.group('bytes'))
},
"url": match.group('url'),
"flavor": match.group('flavor'),
"request": {
"method": match.group('method')
},
"user_agent": {
"original": match.group('user_agent'),
"name": user_agent.browser.family,
"version": user_agent.browser.version_string,
"os": {
"name": user_agent.os.family,
"full": user_agent.os.family + " " + user_agent.os.version_string,
"version": user_agent.os.version_string,
"device": {
"name": user_agent.device.family
}
}
}
},
"communication": {
"source": {
"address": match.group('ip'),
"ip": match.group('ip'),
"geo": {
"country": country_name, # Randomly generated country name
"country_iso_code": country_code # Randomly generated country ISO code
}
}
},
"body": line,
"traceId": str(uuid.uuid4()), # Randomly generated UUID for traceId
"spanId": str(uuid.uuid4()), # Randomly generated UUID for spanId
"@timestamp": pd.to_datetime(match.group('timestamp'), format='%d/%b/%Y:%H:%M:%S %z').strftime('%Y-%m-%dT%H:%M:%S.000Z')
}
return log_json
else:
return None
def main(logfile_path):
loglines = follow(logfile_path)

# Open the output file in append mode
with open('output_log.json', 'a') as outfile:
# Process each incoming log line
for line in loglines:
log_json = parse_and_transform_log_line(line)
if log_json:
# Write the JSON to the file
outfile.write(json.dumps(log_json) + '\n')
outfile.flush()

if __name__ == "__main__":
# Initialize the argument parser
parser = argparse.ArgumentParser(description='ETL script for processing log files.')
parser.add_argument('--filename', type=str, default='apache_logs.txt', help='Filename for the raw log file to process.')

# Parse the arguments
args = parser.parse_args()

# Run the main function
main(args.filename)
49 changes: 49 additions & 0 deletions integrations/observability/apache/data/generator/run.sh
@@ -0,0 +1,49 @@
#!/bin/bash

# Define the Python scripts and requirements file
GENERATOR_SCRIPT="generator.py"
ETL_SCRIPT="run-etl.py"
REQUIREMENTS_FILE="requirements.txt"

# Check if the Python scripts and requirements file exist
if [[ ! -f "$GENERATOR_SCRIPT" ]]; then
echo "Python script '$GENERATOR_SCRIPT' does not exist."
exit 1
fi

if [[ ! -f "$ETL_SCRIPT" ]]; then
echo "Python ETL script '$ETL_SCRIPT' does not exist."
exit 1
fi

if [[ ! -f "$REQUIREMENTS_FILE" ]]; then
echo "Requirements file '$REQUIREMENTS_FILE' does not exist."
exit 1
fi

# Install the required Python packages
echo "Installing requirements from '$REQUIREMENTS_FILE'..."
pip install -r "$REQUIREMENTS_FILE"

# Check if pip install succeeded
if [[ $? -ne 0 ]]; then
echo "Failed to install required packages."
exit 1
fi

# Run the generator Python script in the background, redirecting stdout/stderr to generator.log
echo "Running script '$GENERATOR_SCRIPT' in the background (output in generator.log)..."
nohup python "$GENERATOR_SCRIPT" "$@" > generator.log 2>&1 &

# Run the ETL Python script in the background, redirecting stdout/stderr to etl.log
echo "Running ETL script '$ETL_SCRIPT' in the background (output in etl.log)..."
nohup python "$ETL_SCRIPT" "$@" > etl.log 2>&1 &

echo "Both scripts are now running in the background."
21 changes: 21 additions & 0 deletions integrations/observability/apache/data/generator/stop.sh
@@ -0,0 +1,21 @@
#!/bin/bash

# Define the Python scripts
GENERATOR_SCRIPT="generator.py"
ETL_SCRIPT="run-etl.py"

# Function to kill a script given its name
kill_script() {
SCRIPT_NAME=$1
echo "Killing all processes matching $SCRIPT_NAME"
    pkill -f "$SCRIPT_NAME"
if [ $? -eq 0 ]; then
echo "Successfully killed $SCRIPT_NAME"
else
echo "Failed to kill $SCRIPT_NAME or no process found"
fi
}

# Kill the generator and ETL scripts
kill_script "$GENERATOR_SCRIPT"
kill_script "$ETL_SCRIPT"
45 changes: 45 additions & 0 deletions integrations/observability/apache/data/generator/sync_s3.sh
@@ -0,0 +1,45 @@
#!/bin/bash

# Check for AWS CLI and exit if not installed
if ! command -v aws &> /dev/null; then
echo "AWS CLI could not be found. Please install it to use this script."
exit 1
fi

# Parse command line arguments for filename pattern and bucket name
while [[ "$#" -gt 0 ]]; do
case $1 in
-f|--filename) filename_pattern="$2"; shift ;;
-b|--bucket) bucket="$2"; shift ;;
*) echo "Unknown parameter: $1"; exit 1 ;;
esac
shift
done

# Check if the filename pattern and bucket name arguments were provided
if [ -z "$filename_pattern" ] || [ -z "$bucket" ]; then
echo "Usage: $0 --filename FILENAME_PATTERN --bucket BUCKET_NAME"
exit 1
fi

# Find files that match the pattern
matching_files=($(find . -type f -name "$filename_pattern"))

# Check if any matching files were found
if [ ${#matching_files[@]} -eq 0 ]; then
echo "No files found matching the pattern: $filename_pattern"
exit 1
fi

# Sync each matching file to the S3 bucket
for filename in "${matching_files[@]}"; do
echo "Syncing $filename to s3://$bucket/"
aws s3 cp "$filename" "s3://$bucket/"

# Check if AWS CLI command succeeded
if [ $? -ne 0 ]; then
echo "Sync to S3 failed for $filename."
else
echo "File $filename successfully synced to S3."
fi
done