Merge pull request #42 from datasciencebr/dataset-interface
Define a dataset interface
cuducos authored May 4, 2017
2 parents 303ce7d + fb91f0e commit 2d6a5ce
Showing 20 changed files with 368 additions and 218 deletions.
83 changes: 57 additions & 26 deletions rosie/chamber_of_deputies/adapter.py
@@ -2,10 +2,18 @@
 
 import numpy as np
 import pandas as pd
-from serenata_toolbox.ceap_dataset import CEAPDataset
+from serenata_toolbox.chamber_of_deputies.chamber_of_deputies_dataset import ChamberOfDeputiesDataset
 from serenata_toolbox.datasets import fetch
+
+
+COLUMNS = {
+    'category': 'subquota_description',
+    'net_value': 'total_net_value',
+    'recipient_id': 'cnpj_cpf',
+    'recipient': 'supplier',
+}
 
 
 class Adapter:
     COMPANIES_DATASET = '2016-09-03-companies.xz'

@@ -15,39 +23,62 @@ def __init__(self, path):
     @property
     def dataset(self):
         self.update_datasets()
-        reimbursements = self.get_reimbursements()
+        self.get_reimbursements()
         companies = self.get_companies()
-        return pd.merge(reimbursements, companies,
-                        how='left',
-                        left_on='cnpj_cpf',
-                        right_on='cnpj')
+        self._dataset = self._dataset.merge(companies,
+                                            how='left',
+                                            left_on='cnpj_cpf',
+                                            right_on='cnpj')
+        self.prepare_dataset()
+        return self._dataset
+
+    def prepare_dataset(self):
+        self.rename_columns()
+        self.rename_categories()
+
+    def rename_columns(self):
+        columns = {v: k for k, v in COLUMNS.items()}
+        self._dataset.rename(columns=columns, inplace=True)
+
+    def rename_categories(self):
+        # There's no documented type for `3`, thus we assume it's an input error
+        self._dataset['document_type'].replace({3: None}, inplace=True)
+        self._dataset['document_type'] = self._dataset['document_type'].astype(
+            'category')
+        types = ['bill_of_sale', 'simple_receipt', 'expense_made_abroad']
+        self._dataset['document_type'].cat.rename_categories(
+            types, inplace=True)
+        # Classifiers expect a more broad category name for meals
+        self._dataset['category'] = self._dataset['category'].replace(
+            {'Congressperson meal': 'Meal'})
+        self._dataset['is_party_expense'] = \
+            self._dataset['congressperson_id'].isnull()
 
     def update_datasets(self):
         os.makedirs(self.path, exist_ok=True)
-        ceap = CEAPDataset(self.path)
-        ceap.fetch()
-        ceap.convert_to_csv()
-        ceap.translate()
-        ceap.clean()
+        chamber_of_deputies = ChamberOfDeputiesDataset(self.path)
+        chamber_of_deputies.fetch()
+        chamber_of_deputies.convert_to_csv()
+        chamber_of_deputies.translate()
+        chamber_of_deputies.clean()
         fetch(self.COMPANIES_DATASET, self.path)
 
     def get_reimbursements(self):
-        dataset = \
-            pd.read_csv(os.path.join(self.path, 'reimbursements.xz'),
-                        dtype={'applicant_id': np.str,
-                               'cnpj_cpf': np.str,
-                               'congressperson_id': np.str,
-                               'subquota_number': np.str},
-                        low_memory=False)
-        dataset['issue_date'] = pd.to_datetime(dataset['issue_date'],
-                                               errors='coerce')
-        return dataset
+        path = os.path.join(self.path, 'reimbursements.xz')
+        self._dataset = pd.read_csv(path,
+                                    dtype={'applicant_id': np.str,
+                                           'cnpj_cpf': np.str,
+                                           'congressperson_id': np.str,
+                                           'subquota_number': np.str},
+                                    low_memory=False)
+        self._dataset['issue_date'] = pd.to_datetime(
+            self._dataset['issue_date'], errors='coerce')
+        return self._dataset
 
     def get_companies(self):
-        dataset = pd.read_csv(os.path.join(self.path, self.COMPANIES_DATASET),
-                              dtype={'cnpj': np.str},
-                              low_memory=False)
+        path = os.path.join(self.path, self.COMPANIES_DATASET)
+        dataset = pd.read_csv(path, dtype={'cnpj': np.str}, low_memory=False)
         dataset['cnpj'] = dataset['cnpj'].str.replace(r'\D', '')
-        dataset['situation_date'] = pd.to_datetime(dataset['situation_date'],
-                                                   errors='coerce')
+        dataset['situation_date'] = pd.to_datetime(
+            dataset['situation_date'], errors='coerce')
         return dataset
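The adapter now builds one shared `_dataset`: reimbursements are merged with company data, columns are renamed to the generic names in `COLUMNS`, and categories are normalized. A minimal sketch of the rename step, with made-up sample rows (only the mapping itself comes from the diff):

```python
import pandas as pd

COLUMNS = {
    'category': 'subquota_description',
    'net_value': 'total_net_value',
    'recipient_id': 'cnpj_cpf',
    'recipient': 'supplier',
}

# Made-up rows using the Chamber of Deputies column names
df = pd.DataFrame({
    'subquota_description': ['Congressperson meal'],
    'total_net_value': [42.0],
    'cnpj_cpf': ['02989654001197'],
    'supplier': ['Restaurante Exemplo'],
})

# Invert the mapping: dataset-specific name -> generic interface name
df = df.rename(columns={v: k for k, v in COLUMNS.items()})
# Classifiers expect the broader 'Meal' label
df['category'] = df['category'].replace({'Congressperson meal': 'Meal'})
print(list(df.columns))  # ['category', 'net_value', 'recipient_id', 'recipient']
```

The inversion matters because `COLUMNS` is declared as generic name to dataset-specific name, while `DataFrame.rename` needs the old name as the key.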
@@ -5,6 +5,17 @@
 
 
 class ElectionExpensesClassifier(TransformerMixin):
+    """
+    Election Expenses classifier.
+
+    Check a `legal_entity` field for the presence of the political candidacy
+    category in the Brazilian Federal Revenue.
+
+    Dataset
+    -------
+    legal_entity : string column
+        Brazilian Federal Revenue category of companies, preceded by its code.
+    """
+
     def fit(self, X):
         return self
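The docstring documents the single column this classifier needs, but the diff does not show its prediction logic. The following is only a hypothetical sketch of how such a check could look; the class name, the category string, and the matching rule are all assumptions:

```python
import pandas as pd
from sklearn.base import TransformerMixin

class ElectionExpensesSketch(TransformerMixin):
    # Hypothetical stand-in, not the real classifier body
    def fit(self, X, y=None):
        return self

    def transform(self, X=None):
        return self

    def predict(self, X):
        # Assumed rule: flag rows whose legal_entity mentions a candidacy category
        return X['legal_entity'].str.contains('candidato', case=False, na=False)

df = pd.DataFrame({'legal_entity': [
    '409-0 - Candidato a cargo político eletivo',  # made-up example value
    '206-2 - Sociedade empresária limitada',
]})
print(ElectionExpensesSketch().predict(df).tolist())  # [True, False]
```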
@@ -5,6 +5,24 @@
 
 
 class IrregularCompaniesClassifier(TransformerMixin):
+    """
+    Irregular Companies classifier.
+
+    Checks the official status of the company with the Brazilian Federal
+    Revenue and reports rows with companies unauthorized to sell products
+    or services.
+
+    Dataset
+    -------
+    issue_date : datetime column
+        Date when the expense was made.
+
+    situation : string column
+        Situation of the company according to the Brazilian Federal Revenue.
+
+    situation_date : datetime column
+        Date when the situation was last updated.
+    """
+
     def fit(self, X):
         return self
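Given the three documented columns, the natural rule is to flag an expense made at a company whose Federal Revenue situation was no longer 'ABERTA' (open) on the issue date. The diff does not show the actual predicate, so this is only a sketch of the idea, reusing the values from the test fixture further down:

```python
import pandas as pd

def is_irregular(X):
    # Assumed rule: company not open, and the situation predates the expense
    return (X['situation'] != 'ABERTA') & (X['situation_date'] <= X['issue_date'])

df = pd.DataFrame({
    'situation': ['ABERTA', 'BAIXADA', 'NULA'],
    'situation_date': pd.to_datetime(['2013-01-03'] * 3),
    'issue_date': pd.to_datetime(['2013-01-30'] * 3),
})
print(is_irregular(df).tolist())  # [False, True, True]
```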
@@ -7,14 +7,31 @@
 
 
 class MealPriceOutlierClassifier(TransformerMixin):
+    """
+    Meal Price Outlier classifier.
+
+    Dataset
+    -------
+    applicant_id : string column
+        A personal identifier code for every person making expenses.
+
+    category : category column
+        Category of the expense. The model will be applied only in rows where
+        the value is equal to "Meal".
+
+    net_value : float column
+        The value of the expense.
+
+    recipient_id : string column
+        A CNPJ (Brazilian company ID) or CPF (Brazilian personal tax ID).
+    """
+
     HOTEL_REGEX = r'hote(?:(?:ls?)|is)'
     CLUSTER_KEYS = ['mean', 'std']
 
     def fit(self, X):
         _X = X[self.__applicable_rows(X)]
-        companies = _X.groupby('cnpj_cpf').apply(self.__company_stats) \
+        companies = _X.groupby('recipient_id').apply(self.__company_stats) \
                       .reset_index()
         companies = companies[self.__applicable_company_rows(companies)]

@@ -34,17 +51,17 @@ def transform(self, X=None):
     def predict(self, X):
         _X = X.copy()
         companies = _X[self.__applicable_rows(_X)] \
-            .groupby('cnpj_cpf').apply(self.__company_stats) \
+            .groupby('recipient_id').apply(self.__company_stats) \
             .reset_index()
         companies['cluster'] = \
             self.cluster_model.predict(companies[self.CLUSTER_KEYS])
         companies = pd.merge(companies,
                              self.clusters[['cluster', 'threshold']],
                              how='left')
-        _X = pd.merge(_X, companies[['cnpj_cpf', 'threshold']], how='left')
+        _X = pd.merge(_X, companies[['recipient_id', 'threshold']], how='left')
         known_companies = companies[self.__applicable_company_rows(companies)]
         known_thresholds = known_companies \
-            .groupby('cnpj_cpf') \
+            .groupby('recipient_id') \
             .apply(lambda x: x['mean'] + 3 * x['std']) \
             .reset_index() \
             .rename(columns={0: 'cnpj_threshold'})
@@ -55,21 +72,21 @@ def predict(self, X):
         _X['y'] = 1
         is_outlier = self.__applicable_rows(_X) & \
             _X['threshold'].notnull() & \
-            (_X['total_net_value'] > _X['threshold'])
+            (_X['net_value'] > _X['threshold'])
         _X.loc[is_outlier, 'y'] = -1
         return _X['y']
 
     def __applicable_rows(self, X):
-        return (X['subquota_description'] == 'Congressperson meal') & \
-            (X['cnpj_cpf'].str.len() == 14) & \
-            (~X['supplier'].apply(self.__normalize_string).str.contains(self.HOTEL_REGEX))
+        return (X['category'] == 'Meal') & \
+            (X['recipient_id'].str.len() == 14) & \
+            (~X['recipient'].apply(self.__normalize_string).str.contains(self.HOTEL_REGEX))
 
     def __applicable_company_rows(self, companies):
        return (companies['congresspeople'] > 3) & (companies['records'] > 20)
 
     def __company_stats(self, X):
-        stats = {'mean': np.mean(X['total_net_value']),
-                 'std': np.std(X['total_net_value']),
+        stats = {'mean': np.mean(X['net_value']),
+                 'std': np.std(X['net_value']),
                  'congresspeople': len(np.unique(X['applicant_id'])),
                  'records': len(X)}
         return pd.Series(stats)
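`predict` keeps the familiar three-sigma rule: for companies with enough records, an expense is an outlier when it exceeds mean + 3 * std of that company's meal prices, now read from the generic `net_value` column. A small self-contained illustration with made-up prices:

```python
import numpy as np
import pandas as pd

# Made-up historical meal prices for one recipient_id
prices = pd.Series([30.0, 35.0, 32.0, 28.0, 31.0])
# Same statistics as __company_stats: np.mean and population std (np.std)
threshold = np.mean(prices) + 3 * np.std(prices)  # about 38.1

# A later expense at the same company is flagged once it crosses the threshold
print(120.0 > threshold)  # True: labeled -1 (outlier) by predict
```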
@@ -6,6 +6,18 @@
 
 
 class MonthlySubquotaLimitClassifier(TransformerMixin):
+    """
+    Monthly Subquota Limit classifier.
+
+    Dataset
+    -------
+    issue_date : datetime column
+        Date when the expense was made.
+
+    net_value : float column
+        The value of the expense.
+    """
+
     KEYS = ['applicant_id', 'month', 'year']
 
     def fit(self, X):
@@ -62,7 +74,7 @@ def predict_proba(self, X=None):
 
 
     def __create_columns(self):
-        self._X['net_value_int'] = (self._X['total_net_value'] * 100).apply(int)
+        self._X['net_value_int'] = (self._X['net_value'] * 100).apply(int)
 
         self._X['coerced_issue_date'] = \
             pd.to_datetime(self._X['issue_date'], errors='coerce')
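`__create_columns` turns `net_value` into integer cents before monthly totals are accumulated, so comparisons against each subquota's limit run on integers rather than floats. A sketch of the aggregation shape (the limit value here is made up; the real limits live in the classifier):

```python
import pandas as pd

df = pd.DataFrame({
    'applicant_id': ['123', '123'],
    'issue_date': pd.to_datetime(['2017-05-01', '2017-05-20']),
    'net_value': [700.25, 350.50],
})

df['net_value_int'] = (df['net_value'] * 100).apply(int)
df['year'] = df['issue_date'].dt.year
df['month'] = df['issue_date'].dt.month

monthly = df.groupby(['applicant_id', 'year', 'month'])['net_value_int'].sum()
LIMIT_CENTS = 100_000  # made-up limit: R$1,000.00 in cents
print((monthly > LIMIT_CENTS).tolist())  # [True]
```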
@@ -8,6 +8,31 @@
 
 
 class TraveledSpeedsClassifier(TransformerMixin):
+    """
+    Traveled Speeds classifier.
+
+    Dataset
+    -------
+    applicant_id : category column
+        A personal identifier code for every person making expenses.
+
+    category : category column
+        Category of the expense. The model will be applied only in rows where
+        the value is equal to "Meal".
+
+    is_party_expense : bool column
+        Whether the row corresponds to a party expense. The model will be
+        applied only in rows where the value is equal to `False`.
+
+    issue_date : datetime column
+        Date when the expense was made.
+
+    latitude : float column
+        Latitude of the place where the expense was made.
+
+    longitude : float column
+        Longitude of the place where the expense was made.
+    """
+
     AGG_KEYS = ['applicant_id', 'issue_date']

@@ -61,10 +86,11 @@ def __classify_dataset(self, X):
         return X
 
     def __applicable_rows(self, X):
-        return (X['subquota_description'] == 'Congressperson meal') & \
+        return (X['category'] == 'Meal') & \
             (-73.992222 < X['longitude']) & (X['longitude'] < -34.7916667) & \
             (-33.742222 < X['latitude']) & (X['latitude'] < 5.2722222) & \
-            X[['congressperson_id', 'latitude', 'longitude']].notnull().all(axis=1)
+            ~X['is_party_expense'] & \
+            X[['latitude', 'longitude']].notnull().all(axis=1)
 
     def __calculate_sum_distances(self, X):
         coordinate_list = X[['latitude', 'longitude']].values
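The applicability filter now reads the generic `category` column and the new `is_party_expense` flag; the hardcoded coordinate limits are a rough bounding box around Brazilian territory. A quick check of that box:

```python
import pandas as pd

def inside_brazil_bbox(X):
    # Same longitude/latitude limits as in __applicable_rows
    return ((-73.992222 < X['longitude']) & (X['longitude'] < -34.7916667) &
            (-33.742222 < X['latitude']) & (X['latitude'] < 5.2722222))

df = pd.DataFrame({'latitude': [-23.5505, 48.8566],
                   'longitude': [-46.6333, 2.3522]})
print(inside_brazil_bbox(df).tolist())  # [True, False] (São Paulo vs. Paris)
```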
2 changes: 0 additions & 2 deletions rosie/chamber_of_deputies/settings.py
@@ -16,5 +16,3 @@
 }
 
 UNIQUE_IDS = ['applicant_id', 'year', 'document_id']
-
-VALUE = 'total_net_value'
@@ -1,4 +1,4 @@
-cnpj_cpf,situation,situation_date,issue_date
+recipient_id,situation,situation_date,issue_date
 02989654001197,ABERTA,2013-01-03,2013-01-30
 02989654001197,BAIXADA,2013-01-03,2013-01-30
 02989654001197,NULA,2013-01-03,2013-01-30
