{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Spam Detector"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import accuracy_score"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Retrieve the Data\n",
    "\n",
    "The data is located at [https://static.bc-edx.com/mbc/ai/m4/datasets/spam-data.csv](https://static.bc-edx.com/mbc/ai/m4/datasets/spam-data.csv)\n",
    "\n",
    "Dataset Source: [UCI Machine Learning Library](https://archive-beta.ics.uci.edu/dataset/94/spambase)\n",
    "\n",
    "Import the data using Pandas. Display the resulting DataFrame to confirm the import was successful."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>word_freq_make</th>\n",
       "      <th>word_freq_address</th>\n",
       "      <th>word_freq_all</th>\n",
       "      <th>word_freq_3d</th>\n",
       "      <th>word_freq_our</th>\n",
       "      <th>word_freq_over</th>\n",
       "      <th>word_freq_remove</th>\n",
       "      <th>word_freq_internet</th>\n",
       "      <th>word_freq_order</th>\n",
       "      <th>word_freq_mail</th>\n",
       "      <th>...</th>\n",
       "      <th>char_freq_;</th>\n",
       "      <th>char_freq_(</th>\n",
       "      <th>char_freq_[</th>\n",
       "      <th>char_freq_!</th>\n",
       "      <th>char_freq_$</th>\n",
       "      <th>char_freq_#</th>\n",
       "      <th>capital_run_length_average</th>\n",
       "      <th>capital_run_length_longest</th>\n",
       "      <th>capital_run_length_total</th>\n",
       "      <th>spam</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.00</td>\n",
       "      <td>0.64</td>\n",
       "      <td>0.64</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.32</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>...</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.778</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>3.756</td>\n",
       "      <td>61</td>\n",
       "      <td>278</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.21</td>\n",
       "      <td>0.28</td>\n",
       "      <td>0.50</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.14</td>\n",
       "      <td>0.28</td>\n",
       "      <td>0.21</td>\n",
       "      <td>0.07</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.94</td>\n",
       "      <td>...</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.132</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.372</td>\n",
       "      <td>0.180</td>\n",
       "      <td>0.048</td>\n",
       "      <td>5.114</td>\n",
       "      <td>101</td>\n",
       "      <td>1028</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.06</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.71</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.23</td>\n",
       "      <td>0.19</td>\n",
       "      <td>0.19</td>\n",
       "      <td>0.12</td>\n",
       "      <td>0.64</td>\n",
       "      <td>0.25</td>\n",
       "      <td>...</td>\n",
       "      <td>0.01</td>\n",
       "      <td>0.143</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.276</td>\n",
       "      <td>0.184</td>\n",
       "      <td>0.010</td>\n",
       "      <td>9.821</td>\n",
       "      <td>485</td>\n",
       "      <td>2259</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.63</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.31</td>\n",
       "      <td>0.63</td>\n",
       "      <td>0.31</td>\n",
       "      <td>0.63</td>\n",
       "      <td>...</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.137</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.137</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>3.537</td>\n",
       "      <td>40</td>\n",
       "      <td>191</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.63</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.31</td>\n",
       "      <td>0.63</td>\n",
       "      <td>0.31</td>\n",
       "      <td>0.63</td>\n",
       "      <td>...</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.135</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.135</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>3.537</td>\n",
       "      <td>40</td>\n",
       "      <td>191</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 58 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   word_freq_make  word_freq_address  word_freq_all  word_freq_3d  \\\n",
       "0            0.00               0.64           0.64           0.0   \n",
       "1            0.21               0.28           0.50           0.0   \n",
       "2            0.06               0.00           0.71           0.0   \n",
       "3            0.00               0.00           0.00           0.0   \n",
       "4            0.00               0.00           0.00           0.0   \n",
       "\n",
       "   word_freq_our  word_freq_over  word_freq_remove  word_freq_internet  \\\n",
       "0           0.32            0.00              0.00                0.00   \n",
       "1           0.14            0.28              0.21                0.07   \n",
       "2           1.23            0.19              0.19                0.12   \n",
       "3           0.63            0.00              0.31                0.63   \n",
       "4           0.63            0.00              0.31                0.63   \n",
       "\n",
       "   word_freq_order  word_freq_mail  ...  char_freq_;  char_freq_(  \\\n",
       "0             0.00            0.00  ...         0.00        0.000   \n",
       "1             0.00            0.94  ...         0.00        0.132   \n",
       "2             0.64            0.25  ...         0.01        0.143   \n",
       "3             0.31            0.63  ...         0.00        0.137   \n",
       "4             0.31            0.63  ...         0.00        0.135   \n",
       "\n",
       "   char_freq_[  char_freq_!  char_freq_$  char_freq_#  \\\n",
       "0          0.0        0.778        0.000        0.000   \n",
       "1          0.0        0.372        0.180        0.048   \n",
       "2          0.0        0.276        0.184        0.010   \n",
       "3          0.0        0.137        0.000        0.000   \n",
       "4          0.0        0.135        0.000        0.000   \n",
       "\n",
       "   capital_run_length_average  capital_run_length_longest  \\\n",
       "0                       3.756                          61   \n",
       "1                       5.114                         101   \n",
       "2                       9.821                         485   \n",
       "3                       3.537                          40   \n",
       "4                       3.537                          40   \n",
       "\n",
       "   capital_run_length_total  spam  \n",
       "0                       278     1  \n",
       "1                      1028     1  \n",
       "2                      2259     1  \n",
       "3                       191     1  \n",
       "4                       191     1  \n",
       "\n",
       "[5 rows x 58 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Import the data\n",
    "data = pd.read_csv(\"https://static.bc-edx.com/mbc/ai/m4/datasets/spam-data.csv\")\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Predict Model Performance\n",
    "\n",
    "You will be creating and comparing two models on this data: a Logistic Regression, and a Random Forests Classifier. Before you create, fit, and score the models, make a prediction as to which model you think will perform better. You do not need to be correct! \n",
    "\n",
    "Write down your prediction in the designated cells in your Jupyter Notebook, and provide justification for your educated guess."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*Replace the text in this markdown cell with your predictions, and be sure to provide justification for your guess.*"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Split the Data into Training and Testing Sets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the labels set `y` and features DataFrame `X`\n",
    "X = data.copy()\n",
    "X = X.drop(columns='spam')\n",
    "y = data[\"spam\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    2788\n",
       "1    1813\n",
       "Name: spam, dtype: int64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check the balance of the labels variable (`y`) by using the `value_counts` function.\n",
    "y.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>word_freq_make</th>\n",
       "      <th>word_freq_address</th>\n",
       "      <th>word_freq_all</th>\n",
       "      <th>word_freq_3d</th>\n",
       "      <th>word_freq_our</th>\n",
       "      <th>word_freq_over</th>\n",
       "      <th>word_freq_remove</th>\n",
       "      <th>word_freq_internet</th>\n",
       "      <th>word_freq_order</th>\n",
       "      <th>word_freq_mail</th>\n",
       "      <th>...</th>\n",
       "      <th>word_freq_conference</th>\n",
       "      <th>char_freq_;</th>\n",
       "      <th>char_freq_(</th>\n",
       "      <th>char_freq_[</th>\n",
       "      <th>char_freq_!</th>\n",
       "      <th>char_freq_$</th>\n",
       "      <th>char_freq_#</th>\n",
       "      <th>capital_run_length_average</th>\n",
       "      <th>capital_run_length_longest</th>\n",
       "      <th>capital_run_length_total</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3405</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.54</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.51</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.51</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.158</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.079</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>1.711</td>\n",
       "      <td>15</td>\n",
       "      <td>77</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3026</th>\n",
       "      <td>0.0</td>\n",
       "      <td>1.52</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.76</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.76</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.121</td>\n",
       "      <td>0.365</td>\n",
       "      <td>0.121</td>\n",
       "      <td>0.487</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>1.956</td>\n",
       "      <td>22</td>\n",
       "      <td>90</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3930</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.72</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.467</td>\n",
       "      <td>0.116</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>2.431</td>\n",
       "      <td>12</td>\n",
       "      <td>124</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>143</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.58</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.558</td>\n",
       "      <td>0.279</td>\n",
       "      <td>0.000</td>\n",
       "      <td>3.272</td>\n",
       "      <td>23</td>\n",
       "      <td>36</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3252</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.18</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.18</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.37</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.244</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>1.663</td>\n",
       "      <td>10</td>\n",
       "      <td>168</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1947</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.380</td>\n",
       "      <td>2.307</td>\n",
       "      <td>9</td>\n",
       "      <td>30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4095</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.743</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.371</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.371</td>\n",
       "      <td>1.714</td>\n",
       "      <td>11</td>\n",
       "      <td>24</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4465</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.336</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>2.352</td>\n",
       "      <td>15</td>\n",
       "      <td>40</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2695</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.925</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>1.833</td>\n",
       "      <td>6</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2245</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.277</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>3.000</td>\n",
       "      <td>17</td>\n",
       "      <td>51</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3450 rows × 57 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      word_freq_make  word_freq_address  word_freq_all  word_freq_3d  \\\n",
       "3405             0.0               0.00            0.0           0.0   \n",
       "3026             0.0               1.52            0.0           0.0   \n",
       "3930             0.0               0.00            0.0           0.0   \n",
       "143              0.0               0.00            0.0           0.0   \n",
       "3252             0.0               0.18            0.0           0.0   \n",
       "...              ...                ...            ...           ...   \n",
       "1947             0.0               0.00            0.0           0.0   \n",
       "4095             0.0               0.00            0.0           0.0   \n",
       "4465             0.0               0.00            0.0           0.0   \n",
       "2695             0.0               0.00            0.0           0.0   \n",
       "2245             0.0               0.00            0.0           0.0   \n",
       "\n",
       "      word_freq_our  word_freq_over  word_freq_remove  word_freq_internet  \\\n",
       "3405           1.54             0.0              0.00                0.51   \n",
       "3026           0.76             0.0              0.00                0.00   \n",
       "3930           0.00             0.0              0.00                0.00   \n",
       "143            0.00             0.0              1.58                0.00   \n",
       "3252           0.18             0.0              0.00                0.37   \n",
       "...             ...             ...               ...                 ...   \n",
       "1947           0.00             0.0              0.00                0.00   \n",
       "4095           0.00             0.0              0.00                0.00   \n",
       "4465           0.00             0.0              0.00                0.00   \n",
       "2695           0.00             0.0              0.00                0.00   \n",
       "2245           0.00             0.0              0.00                0.00   \n",
       "\n",
       "      word_freq_order  word_freq_mail  ...  word_freq_conference  char_freq_;  \\\n",
       "3405              0.0            0.51  ...                   0.0        0.000   \n",
       "3026              0.0            0.76  ...                   0.0        0.121   \n",
       "3930              0.0            0.72  ...                   0.0        0.000   \n",
       "143               0.0            0.00  ...                   0.0        0.000   \n",
       "3252              0.0            0.00  ...                   0.0        0.000   \n",
       "...               ...             ...  ...                   ...          ...   \n",
       "1947              0.0            0.00  ...                   0.0        0.000   \n",
       "4095              0.0            0.00  ...                   0.0        0.000   \n",
       "4465              0.0            0.00  ...                   0.0        0.000   \n",
       "2695              0.0            0.00  ...                   0.0        0.000   \n",
       "2245              0.0            0.00  ...                   0.0        0.000   \n",
       "\n",
       "      char_freq_(  char_freq_[  char_freq_!  char_freq_$  char_freq_#  \\\n",
       "3405        0.158        0.000        0.079        0.000        0.000   \n",
       "3026        0.365        0.121        0.487        0.000        0.000   \n",
       "3930        0.467        0.116        0.000        0.000        0.000   \n",
       "143         0.000        0.000        0.558        0.279        0.000   \n",
       "3252        0.244        0.000        0.000        0.000        0.000   \n",
       "...           ...          ...          ...          ...          ...   \n",
       "1947        0.000        0.000        0.000        0.000        0.380   \n",
       "4095        0.743        0.000        0.371        0.000        0.371   \n",
       "4465        0.336        0.000        0.000        0.000        0.000   \n",
       "2695        0.000        0.000        0.925        0.000        0.000   \n",
       "2245        0.000        0.000        0.277        0.000        0.000   \n",
       "\n",
       "      capital_run_length_average  capital_run_length_longest  \\\n",
       "3405                       1.711                          15   \n",
       "3026                       1.956                          22   \n",
       "3930                       2.431                          12   \n",
       "143                        3.272                          23   \n",
       "3252                       1.663                          10   \n",
       "...                          ...                         ...   \n",
       "1947                       2.307                           9   \n",
       "4095                       1.714                          11   \n",
       "4465                       2.352                          15   \n",
       "2695                       1.833                           6   \n",
       "2245                       3.000                          17   \n",
       "\n",
       "      capital_run_length_total  \n",
       "3405                        77  \n",
       "3026                        90  \n",
       "3930                       124  \n",
       "143                         36  \n",
       "3252                       168  \n",
       "...                        ...  \n",
       "1947                        30  \n",
       "4095                        24  \n",
       "4465                        40  \n",
       "2695                        11  \n",
       "2245                        51  \n",
       "\n",
       "[3450 rows x 57 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Split the data into X_train, X_test, y_train, y_test\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y)\n",
    "X_train"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Scale the Features"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Use the `StandardScaler` to scale the features data. Remember that only `X_train` and `X_test` DataFrames should be scaled."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "# Create the StandardScaler instance\n",
    "scaler = StandardScaler()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit the Standard Scaler with the training data\n",
    "X_train_scaled = scaler.fit_transform(X_train)\n",
    "X_test_scaled = scaler.fit_transform(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scale the training data\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create and Fit a Logistic Regression Model\n",
    "\n",
    "Create a Logistic Regression model, fit it to the training data, make predictions with the testing data, and print the model's accuracy score. You may choose any starting settings you like. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>#sk-container-id-2 {color: black;background-color: white;}#sk-container-id-2 pre{padding: 0;}#sk-container-id-2 div.sk-toggleable {background-color: white;}#sk-container-id-2 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-2 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-2 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-2 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-2 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-2 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-2 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-2 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-2 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-2 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-2 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-2 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-2 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-2 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-2 div.sk-item {position: relative;z-index: 1;}#sk-container-id-2 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-2 div.sk-item::before, #sk-container-id-2 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-2 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-2 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-2 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-2 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-2 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-2 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-2 div.sk-label-container {text-align: center;}#sk-container-id-2 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-2 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-2\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>LogisticRegression(random_state=1)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-2\" type=\"checkbox\" checked><label for=\"sk-estimator-id-2\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">LogisticRegression</label><div class=\"sk-toggleable__content\"><pre>LogisticRegression(random_state=1)</pre></div></div></div></div></div>"
      ],
      "text/plain": [
       "LogisticRegression(random_state=1)"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Train a Logistic Regression model and print the model score\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "\n",
    "logistic_regression_model = LogisticRegression(random_state=1)\n",
    "\n",
    "logistic_regression_model.fit(X_train_scaled, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 0, 0, ..., 0, 1, 0], dtype=int64)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Make and save testing predictions with the saved logistic regression model using the test data\n",
    "predictions = logistic_regression_model.predict(X_test_scaled)\n",
    "\n",
    "# Review the predictions\n",
    "predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9105125977410947"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Calculate the accuracy score by evaluating `y_test` vs. `testing_predictions`.\n",
    "accuracy_score(y_test, predictions)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create and Fit a Random Forest Classifier Model\n",
    "\n",
    "Create a Random Forest Classifier model, fit it to the training data, make predictions with the testing data, and print the model's accuracy score. You may choose any starting settings you like. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>#sk-container-id-3 {color: black;background-color: white;}#sk-container-id-3 pre{padding: 0;}#sk-container-id-3 div.sk-toggleable {background-color: white;}#sk-container-id-3 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-3 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-3 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-3 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-3 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-3 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-3 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-3 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-3 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-3 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-3 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-3 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-3 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-3 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-3 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-3 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-3 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-3 div.sk-item {position: relative;z-index: 1;}#sk-container-id-3 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-3 div.sk-item::before, #sk-container-id-3 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-3 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-3 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-3 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-3 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-3 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-3 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-3 div.sk-label-container {text-align: center;}#sk-container-id-3 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-3 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-3\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>RandomForestClassifier(random_state=1)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-3\" type=\"checkbox\" checked><label for=\"sk-estimator-id-3\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">RandomForestClassifier</label><div class=\"sk-toggleable__content\"><pre>RandomForestClassifier(random_state=1)</pre></div></div></div></div></div>"
      ],
      "text/plain": [
       "RandomForestClassifier(random_state=1)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Train a Random Forest Classifier model and print the model score\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "rf_model = RandomForestClassifier(random_state = 1)\n",
    "\n",
    "rf_model = rf_model.fit(X_train_scaled, y_train)\n",
    "\n",
    "rf_model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 0, 0, ..., 0, 1, 1], dtype=int64)"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Make and save testing predictions with the saved logistic regression model using the test data\n",
    "predictions = rf_model.predict(X_test_scaled)\n",
    "\n",
    "# Review the predictions\n",
    "predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9357080799304952"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Calculate the accuracy score by evaluating `y_test` vs. `testing_predictions`.\n",
    "accuracy_score(y_test, predictions)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate the Models\n",
    "\n",
    "Which model performed better? How does that compare to your prediction? Write down your results and thoughts in the following markdown cell."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*Replace the text in this markdown cell with your answers to these questions.*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}