Merge pull request #42 from datasciencebr/dataset-interface
Define a dataset interface
cuducos authored May 4, 2017
2 parents 303ce7d + fb91f0e commit 2d6a5ce
Showing 20 changed files with 368 additions and 218 deletions.
83 changes: 57 additions & 26 deletions rosie/chamber_of_deputies/adapter.py
@@ -2,10 +2,18 @@
 
 import numpy as np
 import pandas as pd
-from serenata_toolbox.ceap_dataset import CEAPDataset
+from serenata_toolbox.chamber_of_deputies.chamber_of_deputies_dataset import ChamberOfDeputiesDataset
 from serenata_toolbox.datasets import fetch
+
+
+COLUMNS = {
+    'category': 'subquota_description',
+    'net_value': 'total_net_value',
+    'recipient_id': 'cnpj_cpf',
+    'recipient': 'supplier',
+}
 
 
 class Adapter:
     COMPANIES_DATASET = '2016-09-03-companies.xz'

@@ -15,39 +23,62 @@ def __init__(self, path):
     @property
     def dataset(self):
         self.update_datasets()
-        reimbursements = self.get_reimbursements()
+        self.get_reimbursements()
         companies = self.get_companies()
-        return pd.merge(reimbursements, companies,
-                        how='left',
-                        left_on='cnpj_cpf',
-                        right_on='cnpj')
+        self._dataset = self._dataset.merge(companies,
+                                            how='left',
+                                            left_on='cnpj_cpf',
+                                            right_on='cnpj')
+        self.prepare_dataset()
+        return self._dataset
+
+    def prepare_dataset(self):
+        self.rename_columns()
+        self.rename_categories()
+
+    def rename_columns(self):
+        columns = {v: k for k, v in COLUMNS.items()}
+        self._dataset.rename(columns=columns, inplace=True)
+
+    def rename_categories(self):
+        # There's no documented type for `3`, thus we assume it's an input error
+        self._dataset['document_type'].replace({3: None}, inplace=True)
+        self._dataset['document_type'] = self._dataset['document_type'].astype(
+            'category')
+        types = ['bill_of_sale', 'simple_receipt', 'expense_made_abroad']
+        self._dataset['document_type'].cat.rename_categories(
+            types, inplace=True)
+        # Classifiers expect a more broad category name for meals
+        self._dataset['category'] = self._dataset['category'].replace(
+            {'Congressperson meal': 'Meal'})
+        self._dataset['is_party_expense'] = \
+            self._dataset['congressperson_id'].isnull()
 
     def update_datasets(self):
         os.makedirs(self.path, exist_ok=True)
-        ceap = CEAPDataset(self.path)
-        ceap.fetch()
-        ceap.convert_to_csv()
-        ceap.translate()
-        ceap.clean()
+        chamber_of_deputies = ChamberOfDeputiesDataset(self.path)
+        chamber_of_deputies.fetch()
+        chamber_of_deputies.convert_to_csv()
+        chamber_of_deputies.translate()
+        chamber_of_deputies.clean()
         fetch(self.COMPANIES_DATASET, self.path)
 
     def get_reimbursements(self):
-        dataset = \
-            pd.read_csv(os.path.join(self.path, 'reimbursements.xz'),
-                        dtype={'applicant_id': np.str,
-                               'cnpj_cpf': np.str,
-                               'congressperson_id': np.str,
-                               'subquota_number': np.str},
-                        low_memory=False)
-        dataset['issue_date'] = pd.to_datetime(dataset['issue_date'],
-                                               errors='coerce')
-        return dataset
+        path = os.path.join(self.path, 'reimbursements.xz')
+        self._dataset = pd.read_csv(path,
+                                    dtype={'applicant_id': np.str,
+                                           'cnpj_cpf': np.str,
+                                           'congressperson_id': np.str,
+                                           'subquota_number': np.str},
+                                    low_memory=False)
+        self._dataset['issue_date'] = pd.to_datetime(
+            self._dataset['issue_date'], errors='coerce')
+        return self._dataset
 
     def get_companies(self):
-        dataset = pd.read_csv(os.path.join(self.path, self.COMPANIES_DATASET),
-                              dtype={'cnpj': np.str},
-                              low_memory=False)
+        path = os.path.join(self.path, self.COMPANIES_DATASET)
+        dataset = pd.read_csv(path, dtype={'cnpj': np.str}, low_memory=False)
         dataset['cnpj'] = dataset['cnpj'].str.replace(r'\D', '')
-        dataset['situation_date'] = pd.to_datetime(dataset['situation_date'],
-                                                   errors='coerce')
+        dataset['situation_date'] = pd.to_datetime(
+            dataset['situation_date'], errors='coerce')
         return dataset
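The adapter now builds one shared `_dataset`: reimbursements are merged with company data, columns are renamed to the generic names in `COLUMNS`, and categories are normalized. A minimal sketch of the rename step, with made-up sample rows (only the mapping itself comes from the diff):

```python
import pandas as pd

COLUMNS = {
    'category': 'subquota_description',
    'net_value': 'total_net_value',
    'recipient_id': 'cnpj_cpf',
    'recipient': 'supplier',
}

# Made-up rows using the Chamber of Deputies column names
df = pd.DataFrame({
    'subquota_description': ['Congressperson meal'],
    'total_net_value': [42.0],
    'cnpj_cpf': ['02989654001197'],
    'supplier': ['Restaurante Exemplo'],
})

# Invert the mapping: dataset-specific name -> generic interface name
df = df.rename(columns={v: k for k, v in COLUMNS.items()})
# Classifiers expect the broader 'Meal' label
df['category'] = df['category'].replace({'Congressperson meal': 'Meal'})
print(list(df.columns))  # ['category', 'net_value', 'recipient_id', 'recipient']
```

The inversion matters because `COLUMNS` is declared as generic name to dataset-specific name, while `DataFrame.rename` needs the old name as the key.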
@@ -5,6 +5,17 @@
 
 
 class ElectionExpensesClassifier(TransformerMixin):
+    """
+    Election Expenses classifier.
+
+    Check a `legal_entity` field for the presence of the political candidacy
+    category in the Brazilian Federal Revenue.
+
+    Dataset
+    -------
+    legal_entity : string column
+        Brazilian Federal Revenue category of companies, preceded by its code.
+    """
+
     def fit(self, X):
         return self
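The docstring documents the single column this classifier needs, but the diff does not show its prediction logic. The following is only a hypothetical sketch of how such a check could look; the class name, the category string, and the matching rule are all assumptions:

```python
import pandas as pd
from sklearn.base import TransformerMixin

class ElectionExpensesSketch(TransformerMixin):
    # Hypothetical stand-in, not the real classifier body
    def fit(self, X, y=None):
        return self

    def transform(self, X=None):
        return self

    def predict(self, X):
        # Assumed rule: flag rows whose legal_entity mentions a candidacy category
        return X['legal_entity'].str.contains('candidato', case=False, na=False)

df = pd.DataFrame({'legal_entity': [
    '409-0 - Candidato a cargo político eletivo',  # made-up example value
    '206-2 - Sociedade empresária limitada',
]})
print(ElectionExpensesSketch().predict(df).tolist())  # [True, False]
```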
@@ -5,6 +5,24 @@
 
 
 class IrregularCompaniesClassifier(TransformerMixin):
+    """
+    Irregular Companies classifier.
+
+    Checks the official status of the company with the Brazilian Federal
+    Revenue and reports rows with companies unauthorized to sell products
+    or services.
+
+    Dataset
+    -------
+    issue_date : datetime column
+        Date when the expense was made.
+
+    situation : string column
+        Situation of the company according to the Brazilian Federal Revenue.
+
+    situation_date : datetime column
+        Date when the situation was last updated.
+    """
+
     def fit(self, X):
         return self
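Given the three documented columns, the natural rule is to flag an expense made at a company whose Federal Revenue situation was no longer 'ABERTA' (open) on the issue date. The diff does not show the actual predicate, so this is only a sketch of the idea, reusing the values from the test fixture further down:

```python
import pandas as pd

def is_irregular(X):
    # Assumed rule: company not open, and the situation predates the expense
    return (X['situation'] != 'ABERTA') & (X['situation_date'] <= X['issue_date'])

df = pd.DataFrame({
    'situation': ['ABERTA', 'BAIXADA', 'NULA'],
    'situation_date': pd.to_datetime(['2013-01-03'] * 3),
    'issue_date': pd.to_datetime(['2013-01-30'] * 3),
})
print(is_irregular(df).tolist())  # [False, True, True]
```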
@@ -7,14 +7,31 @@
 
 
 class MealPriceOutlierClassifier(TransformerMixin):
+    """
+    Meal Price Outlier classifier.
+
+    Dataset
+    -------
+    applicant_id : string column
+        A personal identifier code for every person making expenses.
+
+    category : category column
+        Category of the expense. The model will be applied only in rows where
+        the value is equal to "Meal".
+
+    net_value : float column
+        The value of the expense.
+
+    recipient_id : string column
+        A CNPJ (Brazilian company ID) or CPF (Brazilian personal tax ID).
+    """
+
     HOTEL_REGEX = r'hote(?:(?:ls?)|is)'
     CLUSTER_KEYS = ['mean', 'std']
 
     def fit(self, X):
         _X = X[self.__applicable_rows(X)]
-        companies = _X.groupby('cnpj_cpf').apply(self.__company_stats) \
+        companies = _X.groupby('recipient_id').apply(self.__company_stats) \
                       .reset_index()
         companies = companies[self.__applicable_company_rows(companies)]

@@ -34,17 +51,17 @@ def transform(self, X=None):
     def predict(self, X):
         _X = X.copy()
         companies = _X[self.__applicable_rows(_X)] \
-            .groupby('cnpj_cpf').apply(self.__company_stats) \
+            .groupby('recipient_id').apply(self.__company_stats) \
             .reset_index()
         companies['cluster'] = \
             self.cluster_model.predict(companies[self.CLUSTER_KEYS])
         companies = pd.merge(companies,
                              self.clusters[['cluster', 'threshold']],
                              how='left')
-        _X = pd.merge(_X, companies[['cnpj_cpf', 'threshold']], how='left')
+        _X = pd.merge(_X, companies[['recipient_id', 'threshold']], how='left')
         known_companies = companies[self.__applicable_company_rows(companies)]
         known_thresholds = known_companies \
-            .groupby('cnpj_cpf') \
+            .groupby('recipient_id') \
             .apply(lambda x: x['mean'] + 3 * x['std']) \
             .reset_index() \
             .rename(columns={0: 'cnpj_threshold'})
@@ -55,21 +72,21 @@ def predict(self, X):
         _X['y'] = 1
         is_outlier = self.__applicable_rows(_X) & \
             _X['threshold'].notnull() & \
-            (_X['total_net_value'] > _X['threshold'])
+            (_X['net_value'] > _X['threshold'])
         _X.loc[is_outlier, 'y'] = -1
         return _X['y']
 
     def __applicable_rows(self, X):
-        return (X['subquota_description'] == 'Congressperson meal') & \
-            (X['cnpj_cpf'].str.len() == 14) & \
-            (~X['supplier'].apply(self.__normalize_string).str.contains(self.HOTEL_REGEX))
+        return (X['category'] == 'Meal') & \
+            (X['recipient_id'].str.len() == 14) & \
+            (~X['recipient'].apply(self.__normalize_string).str.contains(self.HOTEL_REGEX))
 
     def __applicable_company_rows(self, companies):
        return (companies['congresspeople'] > 3) & (companies['records'] > 20)
 
     def __company_stats(self, X):
-        stats = {'mean': np.mean(X['total_net_value']),
-                 'std': np.std(X['total_net_value']),
+        stats = {'mean': np.mean(X['net_value']),
+                 'std': np.std(X['net_value']),
                  'congresspeople': len(np.unique(X['applicant_id'])),
                  'records': len(X)}
         return pd.Series(stats)
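`predict` keeps the familiar three-sigma rule: for companies with enough records, an expense is an outlier when it exceeds mean + 3 * std of that company's meal prices, now read from the generic `net_value` column. A small self-contained illustration with made-up prices:

```python
import numpy as np
import pandas as pd

# Made-up historical meal prices for one recipient_id
prices = pd.Series([30.0, 35.0, 32.0, 28.0, 31.0])
# Same statistics as __company_stats: np.mean and population std (np.std)
threshold = np.mean(prices) + 3 * np.std(prices)  # about 38.1

# A later expense at the same company is flagged once it crosses the threshold
print(120.0 > threshold)  # True: labeled -1 (outlier) by predict
```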
@@ -6,6 +6,18 @@
 
 
 class MonthlySubquotaLimitClassifier(TransformerMixin):
+    """
+    Monthly Subquota Limit classifier.
+
+    Dataset
+    -------
+    issue_date : datetime column
+        Date when the expense was made.
+
+    net_value : float column
+        The value of the expense.
+    """
+
     KEYS = ['applicant_id', 'month', 'year']
 
     def fit(self, X):
@@ -62,7 +74,7 @@ def predict_proba(self, X=None):
 
 
     def __create_columns(self):
-        self._X['net_value_int'] = (self._X['total_net_value'] * 100).apply(int)
+        self._X['net_value_int'] = (self._X['net_value'] * 100).apply(int)
 
         self._X['coerced_issue_date'] = \
             pd.to_datetime(self._X['issue_date'], errors='coerce')
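`__create_columns` turns `net_value` into integer cents before monthly totals are accumulated, so comparisons against each subquota's limit run on integers rather than floats. A sketch of the aggregation shape (the limit value here is made up; the real limits live in the classifier):

```python
import pandas as pd

df = pd.DataFrame({
    'applicant_id': ['123', '123'],
    'issue_date': pd.to_datetime(['2017-05-01', '2017-05-20']),
    'net_value': [700.25, 350.50],
})

df['net_value_int'] = (df['net_value'] * 100).apply(int)
df['year'] = df['issue_date'].dt.year
df['month'] = df['issue_date'].dt.month

monthly = df.groupby(['applicant_id', 'year', 'month'])['net_value_int'].sum()
LIMIT_CENTS = 100_000  # made-up limit: R$1,000.00 in cents
print((monthly > LIMIT_CENTS).tolist())  # [True]
```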
@@ -8,6 +8,31 @@
 
 
 class TraveledSpeedsClassifier(TransformerMixin):
+    """
+    Traveled Speeds classifier.
+
+    Dataset
+    -------
+    applicant_id : category column
+        A personal identifier code for every person making expenses.
+
+    category : category column
+        Category of the expense. The model will be applied only in rows where
+        the value is equal to "Meal".
+
+    is_party_expense : bool column
+        Whether the row corresponds to a party expense. The model will be
+        applied only in rows where the value is equal to `False`.
+
+    issue_date : datetime column
+        Date when the expense was made.
+
+    latitude : float column
+        Latitude of the place where the expense was made.
+
+    longitude : float column
+        Longitude of the place where the expense was made.
+    """
+
     AGG_KEYS = ['applicant_id', 'issue_date']

@@ -61,10 +86,11 @@ def __classify_dataset(self, X):
         return X
 
     def __applicable_rows(self, X):
-        return (X['subquota_description'] == 'Congressperson meal') & \
+        return (X['category'] == 'Meal') & \
             (-73.992222 < X['longitude']) & (X['longitude'] < -34.7916667) & \
             (-33.742222 < X['latitude']) & (X['latitude'] < 5.2722222) & \
-            X[['congressperson_id', 'latitude', 'longitude']].notnull().all(axis=1)
+            ~X['is_party_expense'] & \
+            X[['latitude', 'longitude']].notnull().all(axis=1)
 
     def __calculate_sum_distances(self, X):
         coordinate_list = X[['latitude', 'longitude']].values
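The applicability filter now reads the generic `category` column and the new `is_party_expense` flag; the hardcoded coordinate limits are a rough bounding box around Brazilian territory. A quick check of that box:

```python
import pandas as pd

def inside_brazil_bbox(X):
    # Same longitude/latitude limits as in __applicable_rows
    return ((-73.992222 < X['longitude']) & (X['longitude'] < -34.7916667) &
            (-33.742222 < X['latitude']) & (X['latitude'] < 5.2722222))

df = pd.DataFrame({'latitude': [-23.5505, 48.8566],
                   'longitude': [-46.6333, 2.3522]})
print(inside_brazil_bbox(df).tolist())  # [True, False] (São Paulo vs. Paris)
```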
2 changes: 0 additions & 2 deletions rosie/chamber_of_deputies/settings.py
@@ -16,5 +16,3 @@
 }
 
 UNIQUE_IDS = ['applicant_id', 'year', 'document_id']
-
-VALUE = 'total_net_value'
@@ -1,4 +1,4 @@
-cnpj_cpf,situation,situation_date,issue_date
+recipient_id,situation,situation_date,issue_date
 02989654001197,ABERTA,2013-01-03,2013-01-30
 02989654001197,BAIXADA,2013-01-03,2013-01-30
 02989654001197,NULA,2013-01-03,2013-01-30
