diff --git a/analyses/2019_04_sanchittechnogeek__geolocation/geolocation&language.ipynb b/analyses/2019_04_sanchittechnogeek__geolocation/geolocation&language.ipynb
new file mode 100644
index 0000000..ce24ee0
--- /dev/null
+++ b/analyses/2019_04_sanchittechnogeek__geolocation/geolocation&language.ipynb
@@ -0,0 +1,1101 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Analysis: \n",
+    "\n",
+    "My analysis revolves around checking what percentage of along with which websites and scripts in this dataset are tracking users location (geolocation) and language preferences as well as their country code. So, as to provide them with a customized content based on the users preferences (eg. location, language)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Dataset used: Sample 10 percent\n",
+    " - [sample 10 percent](https://public-data.telemetry.mozilla.org/bigcrawl/sample_10percent.parquet.tar.bz2) - 3.7GB download / 7.4GB on disk"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 1. Importing all the required libraries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "D:\\Anaconda3\\envs\\overscripted\\lib\\site-packages\\dask\\config.py:168: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.\n",
+      "  data = yaml.load(f.read()) or {}\n",
+      "D:\\Anaconda3\\envs\\overscripted\\lib\\site-packages\\distributed\\config.py:20: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.\n",
+      "  defaults = yaml.load(f)\n"
+     ]
+    }
+   ],
+   "source": [
+    "#importing dask\n",
+    "import dask.dataframe as dd\n",
+    "\n",
+    "#importing pandas\n",
+    "import pandas as pd\n",
+    "\n",
+    "#importing Dask.distributed for distributed computing\n",
+    "from dask.distributed import Client, progress\n",
+    "\n",
+    "#importing os\n",
+    "import os\n",
+    "\n",
+    "#importing tldextract\n",
+    "import tldextract"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Extract domain function using tldextract\n",
+    "def extract_domain(url):\n",
+    "    \"\"\"Use tldextract to return the base domain from a url\"\"\"\n",
+    "    try:\n",
+    "        extracted = tldextract.extract(url)\n",
+    "        return '{}.{}.{}'.format(extracted.subdomain, extracted.domain, extracted.suffix)\n",
+    "    except Exception as e:\n",
+    "        return 'ERROR'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<table style=\"border: 2px solid white;\">\n",
+       "<tr>\n",
+       "<td style=\"vertical-align: top; border: 0px solid white\">\n",
+       "<h3>Client</h3>\n",
+       "<ul>\n",
+       "  <li><b>Scheduler: </b>tcp://127.0.0.1:59049\n",
+       "  <li><b>Dashboard: </b><a href='http://127.0.0.1:8787/status' target='_blank'>http://127.0.0.1:8787/status</a>\n",
+       "</ul>\n",
+       "</td>\n",
+       "<td style=\"vertical-align: top; border: 0px solid white\">\n",
+       "<h3>Cluster</h3>\n",
+       "<ul>\n",
+       "  <li><b>Workers: </b>4</li>\n",
+       "  <li><b>Cores: </b>8</li>\n",
+       "  <li><b>Memory: </b>17.12 GB</li>\n",
+       "</ul>\n",
+       "</td>\n",
+       "</tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "<Client: scheduler='tcp://127.0.0.1:59049' processes=4 cores=8>"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# setting up the Dask.distributed client\n",
+    "client=Client()\n",
+    "client"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 2. Loading the required dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# data directory where the data set is stored\n",
+    "DATA_DIR = 'D:\\Outreachy\\Datasets_Compressed\\Datasets'\n",
+    "\n",
+    "dataset = os.path.join(DATA_DIR, '10percent/')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Index(['location', 'script_url', 'operation', 'symbol', 'value'], dtype='object')"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "#loading the dataset and creating ther dask dataframe using dd.read_parquet \n",
+    "df = dd.read_parquet(dataset, engine='pyarrow', columns = ['location', 'script_url', 'operation', 'symbol', 'value'], index = False)\n",
+    "\n",
+    "#checking what columns the dataset has\n",
+    "df.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "30.625854378"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# memory usage statistics\n",
+    "memory_usage = df.memory_usage(deep=True).compute()\n",
+    "memory_usage.sum() / 1e9"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 3. Previewing the dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>location</th>\n",
+       "      <th>script_url</th>\n",
+       "      <th>operation</th>\n",
+       "      <th>symbol</th>\n",
+       "      <th>value</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>https://vk.com/widget_comments.php?app=2297596...</td>\n",
+       "      <td>https://vk.com/js/api/xdm.js?1449919642</td>\n",
+       "      <td>get</td>\n",
+       "      <td>window.name</td>\n",
+       "      <td>fXDcab74</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>https://vk.com/widget_comments.php?app=2297596...</td>\n",
+       "      <td>https://vk.com/js/api/xdm.js?1449919642</td>\n",
+       "      <td>get</td>\n",
+       "      <td>window.name</td>\n",
+       "      <td>fXDcab74</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>https://vk.com/widget_comments.php?app=2297596...</td>\n",
+       "      <td>https://vk.com/js/al/aes_light.js?592436914</td>\n",
+       "      <td>get</td>\n",
+       "      <td>window.navigator.userAgent</td>\n",
+       "      <td>Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>https://pos.baidu.com/s?hei=70&amp;wid=670&amp;di=u313...</td>\n",
+       "      <td>https://cpro.baidustatic.com/cpro/ui/noexpire/...</td>\n",
+       "      <td>get</td>\n",
+       "      <td>window.navigator.userAgent</td>\n",
+       "      <td>Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>http://serienjunkies.org/smilf/smilf-season-1-...</td>\n",
+       "      <td>https://apis.google.com/js/plusone.js?_=151338...</td>\n",
+       "      <td>get</td>\n",
+       "      <td>window.document.cookie</td>\n",
+       "      <td>_ga=GA1.2.1529583939.1513387469; _gid=GA1.2.17...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                            location  \\\n",
+       "0  https://vk.com/widget_comments.php?app=2297596...   \n",
+       "1  https://vk.com/widget_comments.php?app=2297596...   \n",
+       "2  https://vk.com/widget_comments.php?app=2297596...   \n",
+       "3  https://pos.baidu.com/s?hei=70&wid=670&di=u313...   \n",
+       "4  http://serienjunkies.org/smilf/smilf-season-1-...   \n",
+       "\n",
+       "                                          script_url operation  \\\n",
+       "0            https://vk.com/js/api/xdm.js?1449919642       get   \n",
+       "1            https://vk.com/js/api/xdm.js?1449919642       get   \n",
+       "2        https://vk.com/js/al/aes_light.js?592436914       get   \n",
+       "3  https://cpro.baidustatic.com/cpro/ui/noexpire/...       get   \n",
+       "4  https://apis.google.com/js/plusone.js?_=151338...       get   \n",
+       "\n",
+       "                       symbol  \\\n",
+       "0                 window.name   \n",
+       "1                 window.name   \n",
+       "2  window.navigator.userAgent   \n",
+       "3  window.navigator.userAgent   \n",
+       "4      window.document.cookie   \n",
+       "\n",
+       "                                               value  \n",
+       "0                                           fXDcab74  \n",
+       "1                                           fXDcab74  \n",
+       "2  Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...  \n",
+       "3  Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...  \n",
+       "4  _ga=GA1.2.1529583939.1513387469; _gid=GA1.2.17...  "
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# First few rows (head)\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div><strong>Dask DataFrame Structure:</strong></div>\n",
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>location</th>\n",
+       "      <th>script_url</th>\n",
+       "      <th>operation</th>\n",
+       "      <th>symbol</th>\n",
+       "      <th>value</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>npartitions=299</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th></th>\n",
+       "      <td>object</td>\n",
+       "      <td>object</td>\n",
+       "      <td>int8</td>\n",
+       "      <td>int16</td>\n",
+       "      <td>object</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th></th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th></th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th></th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>\n",
+       "<div>Dask Name: read-parquet, 299 tasks</div>"
+      ],
+      "text/plain": [
+       "Dask DataFrame Structure:\n",
+       "                location script_url operation symbol   value\n",
+       "npartitions=299                                             \n",
+       "                  object     object      int8  int16  object\n",
+       "                     ...        ...       ...    ...     ...\n",
+       "...                  ...        ...       ...    ...     ...\n",
+       "                     ...        ...       ...    ...     ...\n",
+       "                     ...        ...       ...    ...     ...\n",
+       "Dask Name: read-parquet, 299 tasks"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# dataframe structure\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Total no. unique websites in this dataset: 205949\n",
+      "Total no. unique script_urls in this dataset: 166862\n"
+     ]
+    }
+   ],
+   "source": [
+    "# dataframe website urls / location\n",
+    "total_location_unique_count = df.location.nunique().compute()\n",
+    "total_script_url_unique_count = df.script_url.nunique().compute()\n",
+    "\n",
+    "# printing no. of entries\n",
+    "print('Total no. unique websites in this dataset:', total_location_unique_count)\n",
+    "print('Total no. unique script_urls in this dataset:', total_script_url_unique_count)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 4. Language preferences\n",
+    "The `NavigatorLanguage.languages` read-only property returns an array of DOMStrings representing the user's preferred languages. The language is described using BCP 47 language tags. In the returned array they are ordered by preference with the most preferred language first.\n",
+    "\n",
+    "The value of `navigator.language` is the first element of the returned array.\n",
+    "\n",
+    "When its value changes, as the user's preferred languages are changed a languagechange event is fired on the Window object.\n",
+    "- Lets see which websites/scripts are checking for user's language preference (window.navigator.language)\n",
+    "- Checking the `value` column can tell us what language the script has captured from its user.\n",
+    "- To see whether the `navigator.language` calls are `get` or not. We can verify the the call type by checking the `operation` column."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "D:\\Anaconda3\\envs\\overscripted\\lib\\site-packages\\pandas\\core\\ops.py:1649: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n",
+      "  result = method(y)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[########################################] | 100% Completed | 26.8s\r"
+     ]
+    }
+   ],
+   "source": [
+    "language_pref_df = df[df.symbol == 'window.navigator.language']\n",
+    "language_pref_df = language_pref_df[['location', 'script_url', 'operation', 'value']].drop_duplicates().persist()\n",
+    "progress(language_pref_df, notebook=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>location</th>\n",
+       "      <th>script_url</th>\n",
+       "      <th>operation</th>\n",
+       "      <th>value</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>46</th>\n",
+       "      <td>https://www.canada.ca/en/services.html</td>\n",
+       "      <td>https://www.google-analytics.com/analytics.js</td>\n",
+       "      <td>get</td>\n",
+       "      <td>en-US</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>253</th>\n",
+       "      <td>https://maniform.world.tmall.com/category-1282...</td>\n",
+       "      <td>https://g.alicdn.com/secdev/sufei_data/3.2.2/i...</td>\n",
+       "      <td>get</td>\n",
+       "      <td>en-US</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>484</th>\n",
+       "      <td>https://www.coches.net/fiat/segunda-mano/</td>\n",
+       "      <td>https://jssdk.pulse.schibsted.com/autoTrackerC...</td>\n",
+       "      <td>get</td>\n",
+       "      <td>en-US</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>507</th>\n",
+       "      <td>https://www.coches.net/fiat/segunda-mano/</td>\n",
+       "      <td>https://www.coches.net/ztkieflaaxcvaiwh121837.js</td>\n",
+       "      <td>get</td>\n",
+       "      <td>en-US</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>573</th>\n",
+       "      <td>https://www.coches.net/fiat/segunda-mano/</td>\n",
+       "      <td>https://script.hotjar.com/modules-526d80f8c014...</td>\n",
+       "      <td>get</td>\n",
+       "      <td>en-US</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                              location  \\\n",
+       "46              https://www.canada.ca/en/services.html   \n",
+       "253  https://maniform.world.tmall.com/category-1282...   \n",
+       "484          https://www.coches.net/fiat/segunda-mano/   \n",
+       "507          https://www.coches.net/fiat/segunda-mano/   \n",
+       "573          https://www.coches.net/fiat/segunda-mano/   \n",
+       "\n",
+       "                                            script_url operation  value  \n",
+       "46       https://www.google-analytics.com/analytics.js       get  en-US  \n",
+       "253  https://g.alicdn.com/secdev/sufei_data/3.2.2/i...       get  en-US  \n",
+       "484  https://jssdk.pulse.schibsted.com/autoTrackerC...       get  en-US  \n",
+       "507   https://www.coches.net/ztkieflaaxcvaiwh121837.js       get  en-US  \n",
+       "573  https://script.hotjar.com/modules-526d80f8c014...       get  en-US  "
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# computing the language_pref_df dataframe\n",
+    "language_pref_df = language_pref_df.compute()\n",
+    "\n",
+    "# previewing the head of language_pref_df dataframe\n",
+    "language_pref_df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "- Here we can see that the value returned is `en-US` which means the user has the language preference for english (US).\n",
+    "- Here the values are being retrieved by `get` method as shown in the `operation` column. \n",
+    "\n",
+    "*Note: The `HTTP GET` method requests a representation of the specified resource. Requests using `GET` should only retrieve data.*\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Total no. of websites checking for users browser language preference: 46482\n",
+      "Total % of websites checking for users browser language preference: 22.57 % \n",
+      "\n",
+      "Total no. of scripts making the call for users browser language preference: 7700\n",
+      "Total % of scripts making the call for users browser language preference: 4.61 %\n"
+     ]
+    }
+   ],
+   "source": [
+    "#len(language_pref_df.value.str.contains('en-US'))\n",
+    "#en_US_df = language_pref_df[language_pref_df.value == 'en-US']\n",
+    "#en_US_df.compute()\n",
+    "\n",
+    "# total websites checking for language preference and country code\n",
+    "lang_pref_location_count = language_pref_df.location.nunique()\n",
+    "print('Total no. of websites checking for users browser language preference:', lang_pref_location_count)\n",
+    "print('Total % of websites checking for users browser language preference:', round(lang_pref_location_count*100/total_location_unique_count, 2),'% \\n')\n",
+    "\n",
+    "# total scripts checking for language preference and country code\n",
+    "lang_pref_script_url_count = language_pref_df.script_url.nunique()\n",
+    "print('Total no. of scripts making the call for users browser language preference:', lang_pref_script_url_count)\n",
+    "print('Total % of scripts making the call for users browser language preference:', round(lang_pref_script_url_count*100/total_script_url_unique_count, 2),'%')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "- we can see that `46482` or `22.57`% websites are checking for language preferences of the user\n",
+    "- we can see that `7700` or `4.61`% scripts are making the calls to check for language preferences of the user"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Unique script domains:\n",
+    "\n",
+    "- Lets now check which domains are actually executing the scripts which look for user's browser language and country code. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "https://www.google-analytics.com/analytics.js                            17246\n",
+       "http://www.google-analytics.com/analytics.js                              5661\n",
+       "https://d31qbv1cthcecs.cloudfront.net/atrk.js                             2288\n",
+       "http://www.google-analytics.com/ga.js                                     2155\n",
+       "https://ssl.google-analytics.com/ga.js                                    1909\n",
+       "https://mc.yandex.ru/metrika/watch.js                                     1906\n",
+       "https://bat.bing.com/bat.js                                               1892\n",
+       "https://securepubads.g.doubleclick.net/gpt/pubads_impl_170.js             1881\n",
+       "https://script.hotjar.com/modules-526d80f8c01454f84b75838f21c8706e.js     1467\n",
+       "http://mc.yandex.ru/metrika/watch.js                                      1298\n",
+       "Name: script_url, dtype: int64"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "language_pref_df.script_url.value_counts().head(10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "- We can see a lot of cdn script providers executing these scripts on various websites. \n",
+    "- We can now extract all the domains along with their subdomains having parameter strings removed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "www.google-analytics.com          25310\n",
+       "hm.baidu.com                       4712\n",
+       "mc.yandex.ru                       3306\n",
+       "bat.bing.com                       2554\n",
+       "d31qbv1cthcecs.cloudfront.net      2301\n",
+       "securepubads.g.doubleclick.net     2112\n",
+       "ssl.google-analytics.com           2034\n",
+       "pagead2.googlesyndication.com      1573\n",
+       "script.hotjar.com                  1467\n",
+       "stats.g.doubleclick.net             902\n",
+       "Name: scripts_domain, dtype: int64"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Applying extract_domain() to script_urls and saving the domains in a new column inside language_pref_df\n",
+    "language_pref_df['scripts_domain'] = language_pref_df.script_url.apply(extract_domain)\n",
+    "language_pref_df.scripts_domain.value_counts().head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "2271"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Calculating no. of unique domains\n",
+    "language_pref_unique_domains = language_pref_df.scripts_domain.nunique()\n",
+    "language_pref_unique_domains"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 5. Geolocation capturing\n",
+    "The Geolocation interface represents an object able to programmatically obtain the position of the device. It gives Web content access to the location of the device. The API itself is agnostic of the underlying location information sources. Common sources of location information include Global Positioning System (GPS) and location inferred from network signals such as IP address, RFID, WiFi and Bluetooth MAC addresses, and GSM/CDMA cell IDs, as well as user input. No guarantee is given that the API returns the device's actual location.\n",
+    "\n",
+    "The API is designed to enable both \"one-shot\" position requests and repeated position updates, as well as the ability to explicitly query the cached positions. This allows a Web site or app to offer customized results based on the user's location.\n",
+    "\n",
+    "An object with this interface is obtained using the `navigator.geolocation` property implemented by the Navigator object.\n",
+    "\n",
+    "\n",
+    "- Lets see which websites/scripts are checking for user's geolocation (window.navigator.geolocation)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[########################################] | 100% Completed | 23.8s\r"
+     ]
+    }
+   ],
+   "source": [
+    "geolocation_df = df[df.symbol == 'window.navigator.geolocation']\n",
+    "geolocation_df = geolocation_df[['location', 'script_url','value']].drop_duplicates().persist()\n",
+    "progress(geolocation_df, notebook=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>location</th>\n",
+       "      <th>script_url</th>\n",
+       "      <th>value</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>4332</th>\n",
+       "      <td>https://www.citilink.ru/catalog/for_gamers/igr...</td>\n",
+       "      <td>https://static.citilink.ru/build/js/commons.bu...</td>\n",
+       "      <td>{}</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5338</th>\n",
+       "      <td>https://www.bunnings.com.au/our-range/brands/a...</td>\n",
+       "      <td>https://www.bunnings.com.au/assets/styleguide/...</td>\n",
+       "      <td>{}</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6160</th>\n",
+       "      <td>http://www.elcorteingles.es/moda/accesorios/es...</td>\n",
+       "      <td>http://www.elcorteingles.es/akam/10/240f2be0</td>\n",
+       "      <td>{}</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6277</th>\n",
+       "      <td>http://www.elcorteingles.es/moda/accesorios/es...</td>\n",
+       "      <td>http://analytics-static.ugc.bazaarvoice.com/pr...</td>\n",
+       "      <td>{}</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12036</th>\n",
+       "      <td>https://gzhls.at/b/wm/C/hdaustria_wonder_set5_...</td>\n",
+       "      <td>https://gzhls.at/b/wm/C/hdaustria_wonder_set5_...</td>\n",
+       "      <td>{}</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                location  \\\n",
+       "4332   https://www.citilink.ru/catalog/for_gamers/igr...   \n",
+       "5338   https://www.bunnings.com.au/our-range/brands/a...   \n",
+       "6160   http://www.elcorteingles.es/moda/accesorios/es...   \n",
+       "6277   http://www.elcorteingles.es/moda/accesorios/es...   \n",
+       "12036  https://gzhls.at/b/wm/C/hdaustria_wonder_set5_...   \n",
+       "\n",
+       "                                              script_url value  \n",
+       "4332   https://static.citilink.ru/build/js/commons.bu...    {}  \n",
+       "5338   https://www.bunnings.com.au/assets/styleguide/...    {}  \n",
+       "6160        http://www.elcorteingles.es/akam/10/240f2be0    {}  \n",
+       "6277   http://analytics-static.ugc.bazaarvoice.com/pr...    {}  \n",
+       "12036  https://gzhls.at/b/wm/C/hdaustria_wonder_set5_...    {}  "
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# computing the geolocation_df dataframe\n",
+    "geolocation_df = geolocation_df.compute()\n",
+    "\n",
+    "# previewing the head of geolocation_df dataframe\n",
+    "geolocation_df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "- We can see here that `value` fields are rather empty implying that the crawler wasn't able to detect the calls which were being executed.\n",
+    "- But whats strange here is that at `index`: `36401` the crawler pickup a call where a `getCurrentPosition()` Function was executed. It's shown below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "location      http://www.cracked.com/article_25101_7-stories...\n",
+       "script_url    http://www.cracked.com/article_25101_7-stories...\n",
+       "value                         {\"getCurrentPosition\":\"FUNCTION\"}\n",
+       "Name: 36401, dtype: object"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# this index location has the required call fuction in the value field but not others\n",
+    "geolocation_df.loc[36401]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "- We can clearly see here that the compared to most of the value fields where the braces are empty this one here `{\"getCurrentPosition\":\"FUNCTION\"}` is not and contains one fuction which was executed. \n",
+    "- More information: [https://www.w3.org/TR/geolocation-API/#geolocation_interface](https://www.w3.org/TR/geolocation-API/#geolocation_interface)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "- Lets calculate the total no. of unique websites & scripts checking for user's location"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Total no. of websites checking for users location: 2216\n",
+      "Total % of websites checking for users location: 1.08 % \n",
+      "\n",
+      "Total no. of scripts making the call to check for users location: 1359\n",
+      "Total % of scripts making the call to check for users location: 0.81 %\n"
+     ]
+    }
+   ],
+   "source": [
+    "# total websites using geolocation api\n",
+    "geolocation_location_count = geolocation_df.location.nunique()\n",
+    "print('Total no. of websites checking for users location:', geolocation_location_count)\n",
+    "print('Total % of websites checking for users location:', round(geolocation_location_count*100/total_location_unique_count, 2),'% \\n')\n",
+    "\n",
+    "# total scripts using geolocation api\n",
+    "geolocation_script_url_count = geolocation_df.script_url.nunique()\n",
+    "print('Total no. of scripts making the call to check for users location:', geolocation_script_url_count)\n",
+    "print('Total % of scripts making the call to check for users location:', round(geolocation_script_url_count*100/total_script_url_unique_count, 2),'%')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "- We can see that `2216` or `1.08`% websites are checking for user location.\n",
+    "- we can see that `1359` or `0.81`% scripts are making the calls to check for users current location"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Unique script domains:\n",
+    "\n",
+    "- Lets now check which domains are actually executing the scripts which look for user's geolocation. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "https://analytics-static.ugc.bazaarvoice.com/prod/static/3/bv-analytics.js                           123\n",
+       "http://analytics-static.ugc.bazaarvoice.com/prod/static/3/bv-analytics.js                            123\n",
+       "http://www.castorama.fr/store/js/main.js?v=16122017                                                   51\n",
+       "https://static.citilink.ru/build/js/commons.bundle.js?1513352112                                      33\n",
+       "https://www.bunnings.com.au/assets/styleguide/v1.0.884/bunnings-main-site/assets/js/siteJs.min.js     30\n",
+       "https://px.wayfair.com/px/client/main.min.js                                                          29\n",
+       "https://cdn4.forter.com/script.js?sn=d379f257f86d                                                     29\n",
+       "https://cdn.perfdrive.com/aperture/aperture.js                                                        21\n",
+       "http://stat.sputnik.ru/cnt.js                                                                         20\n",
+       "http://js.3conline.com/min2/temp/v2/plugin-locate,plugin-locate_auto.js                               19\n",
+       "Name: script_url, dtype: int64"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "geolocation_df.script_url.value_counts().head(10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "- We can see a lot of cdn script providers executing these scripts on various websites. \n",
+    "- We can now extract all the domains along with their subdomains having parameter strings removed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "analytics-static.ugc.bazaarvoice.com    246\n",
+       "www.coupang.com                         169\n",
+       "live.sekindo.com                        164\n",
+       "www.elcorteingles.es                    112\n",
+       "d26opx5dl8t69i.cloudfront.net            86\n",
+       "www.castorama.fr                         59\n",
+       "www.adidas.ca                            51\n",
+       "www.qvc.com                              51\n",
+       "pixel.yabidos.com                        49\n",
+       "www.johnlewis.com                        39\n",
+       "Name: scripts_domain, dtype: int64"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Applying extract_domain() to script_urls and saving the domains in a new column inside geolocation_df\n",
+    "geolocation_df['scripts_domain'] = geolocation_df.script_url.apply(extract_domain)\n",
+    "geolocation_df.scripts_domain.value_counts().head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "400"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Calculating no. of unique domains\n",
+    "geolocation_unique_domains = geolocation_df.scripts_domain.nunique()\n",
+    "geolocation_unique_domains"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Inference:\n",
+    "\n",
+    "### Overall:\n",
+    "+ Out of the total of __205949__ websites/locations in this dataset __46482__ (`22.57`%) websites were found to be checking for preferred language of the user, usually the language of the browser UI, and their subsequent location/scripts can be found in the `language_pref_df` dataframe.\n",
+    "\n",
+    "+ Out of the total of __205949__ websites/locations in this dataset __2216__ (`1.08`%) websites were found to be checking for user's location using the geolocation api, and their subsequent location/scripts can be found in the `geolocation_df` dataframe.\n",
+    "\n",
+    "### Scripts:\n",
+    "\n",
+    "- Out of the total of __166862__ scripts in this dataset __7700__ (`4.61`%) scripts were found to be making the calls to check for preferred language of the user, usually the language of the browser UI, and their subsequent scripts can be found in the `language_pref_df.script_url` dataframe.\n",
+    "\n",
+    "- Out of the total of __166862__ scripts in this dataset __1359__ (`0.81`%) scripts were found to be making the calls to check for user's location using the geolocation api, and their subsequent scripts can be found in the `geolocation_df.script_url` dataframe.\n",
+    "\n",
+    "### Domains:\n",
+    "\n",
+    "- For _geolocation tracking_: scripts were being executed from `400` unique domains (`geolocation_unique_domains`).\n",
+    "- For _language & country code tracking_: scripts were being executed from `2271` unique domains (`language_pref_unique_domains`).\n",
+    "\n",
+    "\n",
+    "Running it on the full dataset can yield an `higher accuracy`.\n",
+    "  \n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

	location	script_url	operation	symbol	value
0	https://vk.com/widget_comments.php?app=2297596...	https://vk.com/js/api/xdm.js?1449919642	get	window.name	fXDcab74
1	https://vk.com/widget_comments.php?app=2297596...	https://vk.com/js/api/xdm.js?1449919642	get	window.name	fXDcab74
2	https://vk.com/widget_comments.php?app=2297596...	https://vk.com/js/al/aes_light.js?592436914	get	window.navigator.userAgent	Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...
3	https://pos.baidu.com/s?hei=70&wid=670&di=u313...	https://cpro.baidustatic.com/cpro/ui/noexpire/...	get	window.navigator.userAgent	Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...
4	http://serienjunkies.org/smilf/smilf-season-1-...	https://apis.google.com/js/plusone.js?_=151338...	get	window.document.cookie	_ga=GA1.2.1529583939.1513387469; _gid=GA1.2.17...
	location	script_url	operation	symbol	value
npartitions=299
	object	object	int8	int16	object
	...	...	...	...	...
...	...	...	...	...	...
	...	...	...	...	...
	...	...	...	...	...
	location	script_url	operation	value
46	https://www.canada.ca/en/services.html	https://www.google-analytics.com/analytics.js	get	en-US
253	https://maniform.world.tmall.com/category-1282...	https://g.alicdn.com/secdev/sufei_data/3.2.2/i...	get	en-US
484	https://www.coches.net/fiat/segunda-mano/	https://jssdk.pulse.schibsted.com/autoTrackerC...	get	en-US
507	https://www.coches.net/fiat/segunda-mano/	https://www.coches.net/ztkieflaaxcvaiwh121837.js	get	en-US
573	https://www.coches.net/fiat/segunda-mano/	https://script.hotjar.com/modules-526d80f8c014...	get	en-US
	location	script_url	value
4332	https://www.citilink.ru/catalog/for_gamers/igr...	https://static.citilink.ru/build/js/commons.bu...	{}
5338	https://www.bunnings.com.au/our-range/brands/a...	https://www.bunnings.com.au/assets/styleguide/...	{}
6160	http://www.elcorteingles.es/moda/accesorios/es...	http://www.elcorteingles.es/akam/10/240f2be0	{}
6277	http://www.elcorteingles.es/moda/accesorios/es...	http://analytics-static.ugc.bazaarvoice.com/pr...	{}
12036	https://gzhls.at/b/wm/C/hdaustria_wonder_set5_...	https://gzhls.at/b/wm/C/hdaustria_wonder_set5_...	{}