diff --git a/rosie/chamber_of_deputies/adapter.py b/rosie/chamber_of_deputies/adapter.py index 4a57282c2..8c7452c4b 100644 --- a/rosie/chamber_of_deputies/adapter.py +++ b/rosie/chamber_of_deputies/adapter.py @@ -2,10 +2,18 @@ import numpy as np import pandas as pd -from serenata_toolbox.ceap_dataset import CEAPDataset +from serenata_toolbox.chamber_of_deputies.chamber_of_deputies_dataset import ChamberOfDeputiesDataset from serenata_toolbox.datasets import fetch +COLUMNS = { + 'category': 'subquota_description', + 'net_value': 'total_net_value', + 'recipient_id': 'cnpj_cpf', + 'recipient': 'supplier', +} + + class Adapter: COMPANIES_DATASET = '2016-09-03-companies.xz' @@ -15,39 +23,62 @@ def __init__(self, path): @property def dataset(self): self.update_datasets() - reimbursements = self.get_reimbursements() + self.get_reimbursements() companies = self.get_companies() - return pd.merge(reimbursements, companies, - how='left', - left_on='cnpj_cpf', - right_on='cnpj') + self._dataset = self._dataset.merge(companies, + how='left', + left_on='cnpj_cpf', + right_on='cnpj') + self.prepare_dataset() + return self._dataset + + def prepare_dataset(self): + self.rename_columns() + self.rename_categories() + + def rename_columns(self): + columns = {v: k for k, v in COLUMNS.items()} + self._dataset.rename(columns=columns, inplace=True) + + def rename_categories(self): + # There's no documented type for `3`, thus we assume it's an input error + self._dataset['document_type'].replace({3: None}, inplace=True) + self._dataset['document_type'] = self._dataset['document_type'].astype( + 'category') + types = ['bill_of_sale', 'simple_receipt', 'expense_made_abroad'] + self._dataset['document_type'].cat.rename_categories( + types, inplace=True) + # Classifiers expect a more broad category name for meals + self._dataset['category'] = self._dataset['category'].replace( + {'Congressperson meal': 'Meal'}) + self._dataset['is_party_expense'] = \ + 
self._dataset['congressperson_id'].isnull() def update_datasets(self): os.makedirs(self.path, exist_ok=True) - ceap = CEAPDataset(self.path) - ceap.fetch() - ceap.convert_to_csv() - ceap.translate() - ceap.clean() + chamber_of_deputies = ChamberOfDeputiesDataset(self.path) + chamber_of_deputies.fetch() + chamber_of_deputies.convert_to_csv() + chamber_of_deputies.translate() + chamber_of_deputies.clean() fetch(self.COMPANIES_DATASET, self.path) def get_reimbursements(self): - dataset = \ - pd.read_csv(os.path.join(self.path, 'reimbursements.xz'), - dtype={'applicant_id': np.str, - 'cnpj_cpf': np.str, - 'congressperson_id': np.str, - 'subquota_number': np.str}, - low_memory=False) - dataset['issue_date'] = pd.to_datetime(dataset['issue_date'], - errors='coerce') - return dataset + path = os.path.join(self.path, 'reimbursements.xz') + self._dataset = pd.read_csv(path, + dtype={'applicant_id': np.str, + 'cnpj_cpf': np.str, + 'congressperson_id': np.str, + 'subquota_number': np.str}, + low_memory=False) + self._dataset['issue_date'] = pd.to_datetime( + self._dataset['issue_date'], errors='coerce') + return self._dataset def get_companies(self): - dataset = pd.read_csv(os.path.join(self.path, self.COMPANIES_DATASET), - dtype={'cnpj': np.str}, - low_memory=False) + path = os.path.join(self.path, self.COMPANIES_DATASET) + dataset = pd.read_csv(path, dtype={'cnpj': np.str}, low_memory=False) dataset['cnpj'] = dataset['cnpj'].str.replace(r'\D', '') - dataset['situation_date'] = pd.to_datetime(dataset['situation_date'], - errors='coerce') + dataset['situation_date'] = pd.to_datetime( + dataset['situation_date'], errors='coerce') return dataset diff --git a/rosie/chamber_of_deputies/classifiers/election_expenses_classifier.py b/rosie/chamber_of_deputies/classifiers/election_expenses_classifier.py index b625078e7..91a876b05 100644 --- a/rosie/chamber_of_deputies/classifiers/election_expenses_classifier.py +++ 
b/rosie/chamber_of_deputies/classifiers/election_expenses_classifier.py @@ -5,6 +5,17 @@ class ElectionExpensesClassifier(TransformerMixin): + """ + Election Expenses classifier. + + Check a `legal_entity` field for the presence of the political candidacy + category in the Brazilian Federal Revenue. + + Dataset + ------- + legal_entity : string column + Brazilian Federal Revenue category of companies, preceded by its code. + """ def fit(self, X): return self diff --git a/rosie/chamber_of_deputies/classifiers/irregular_companies_classifier.py b/rosie/chamber_of_deputies/classifiers/irregular_companies_classifier.py index 92a117b44..b2574ffc5 100644 --- a/rosie/chamber_of_deputies/classifiers/irregular_companies_classifier.py +++ b/rosie/chamber_of_deputies/classifiers/irregular_companies_classifier.py @@ -5,6 +5,24 @@ class IrregularCompaniesClassifier(TransformerMixin): + """ + Irregular Companies classifier. + + Check for the official state of the company in the + Brazilian Federal Revenue and report rows with companies unauthorized + to sell products or services. + + Dataset + ------- + issue_date : datetime column + Date when the expense was made. + + situation : string column + Situation of the company according to the Brazilian Federal Revenue. + + situation_date : datetime column + Date when the situation was last updated. + """ def fit(self, X): return self diff --git a/rosie/chamber_of_deputies/classifiers/meal_price_outlier_classifier.py b/rosie/chamber_of_deputies/classifiers/meal_price_outlier_classifier.py index 861cfd2f0..862027176 100644 --- a/rosie/chamber_of_deputies/classifiers/meal_price_outlier_classifier.py +++ b/rosie/chamber_of_deputies/classifiers/meal_price_outlier_classifier.py @@ -7,14 +7,31 @@ class MealPriceOutlierClassifier(TransformerMixin): + """ + Meal Price Outlier classifier. + Dataset + ------- + applicant_id : string column + A personal identifier code for every person making expenses. 
+ + category : category column + Category of the expense. The model will be applied just in rows where + the value is equal to "Meal". + + net_value : float column + The value of the expense. + + recipient_id : string column + A CNPJ (Brazilian company ID) or CPF (Brazilian personal tax ID). + """ HOTEL_REGEX = r'hote(?:(?:ls?)|is)' CLUSTER_KEYS = ['mean', 'std'] def fit(self, X): _X = X[self.__applicable_rows(X)] - companies = _X.groupby('cnpj_cpf').apply(self.__company_stats) \ + companies = _X.groupby('recipient_id').apply(self.__company_stats) \ .reset_index() companies = companies[self.__applicable_company_rows(companies)] @@ -34,17 +51,17 @@ def transform(self, X=None): def predict(self, X): _X = X.copy() companies = _X[self.__applicable_rows(_X)] \ - .groupby('cnpj_cpf').apply(self.__company_stats) \ + .groupby('recipient_id').apply(self.__company_stats) \ .reset_index() companies['cluster'] = \ self.cluster_model.predict(companies[self.CLUSTER_KEYS]) companies = pd.merge(companies, self.clusters[['cluster', 'threshold']], how='left') - _X = pd.merge(_X, companies[['cnpj_cpf', 'threshold']], how='left') + _X = pd.merge(_X, companies[['recipient_id', 'threshold']], how='left') known_companies = companies[self.__applicable_company_rows(companies)] known_thresholds = known_companies \ - .groupby('cnpj_cpf') \ + .groupby('recipient_id') \ .apply(lambda x: x['mean'] + 3 * x['std']) \ .reset_index() \ .rename(columns={0: 'cnpj_threshold'}) @@ -55,21 +72,21 @@ def predict(self, X): _X['y'] = 1 is_outlier = self.__applicable_rows(_X) & \ _X['threshold'].notnull() & \ - (_X['total_net_value'] > _X['threshold']) + (_X['net_value'] > _X['threshold']) _X.loc[is_outlier, 'y'] = -1 return _X['y'] def __applicable_rows(self, X): - return (X['subquota_description'] == 'Congressperson meal') & \ - (X['cnpj_cpf'].str.len() == 14) & \ - (~X['supplier'].apply(self.__normalize_string).str.contains(self.HOTEL_REGEX)) + return (X['category'] == 'Meal') & \ + 
(X['recipient_id'].str.len() == 14) & \ + (~X['recipient'].apply(self.__normalize_string).str.contains(self.HOTEL_REGEX)) def __applicable_company_rows(self, companies): return (companies['congresspeople'] > 3) & (companies['records'] > 20) def __company_stats(self, X): - stats = {'mean': np.mean(X['total_net_value']), - 'std': np.std(X['total_net_value']), + stats = {'mean': np.mean(X['net_value']), + 'std': np.std(X['net_value']), 'congresspeople': len(np.unique(X['applicant_id'])), 'records': len(X)} return pd.Series(stats) diff --git a/rosie/chamber_of_deputies/classifiers/monthly_subquota_limit_classifier.py b/rosie/chamber_of_deputies/classifiers/monthly_subquota_limit_classifier.py index 8da52d10c..93b991d5d 100644 --- a/rosie/chamber_of_deputies/classifiers/monthly_subquota_limit_classifier.py +++ b/rosie/chamber_of_deputies/classifiers/monthly_subquota_limit_classifier.py @@ -6,6 +6,18 @@ class MonthlySubquotaLimitClassifier(TransformerMixin): + """ + Monthly Subquota Limit classifier. + + Dataset + ------- + issue_date : datetime column + Date when the expense was made. + + net_value : float column + The value of the expense. + """ + KEYS = ['applicant_id', 'month', 'year'] def fit(self, X): @@ -62,7 +74,7 @@ def predict_proba(self, X=None): def __create_columns(self): - self._X['net_value_int'] = (self._X['total_net_value'] * 100).apply(int) + self._X['net_value_int'] = (self._X['net_value'] * 100).apply(int) self._X['coerced_issue_date'] = \ pd.to_datetime(self._X['issue_date'], errors='coerce') diff --git a/rosie/chamber_of_deputies/classifiers/traveled_speeds_classifier.py b/rosie/chamber_of_deputies/classifiers/traveled_speeds_classifier.py index 9b222db47..2b54c324b 100644 --- a/rosie/chamber_of_deputies/classifiers/traveled_speeds_classifier.py +++ b/rosie/chamber_of_deputies/classifiers/traveled_speeds_classifier.py @@ -8,6 +8,31 @@ class TraveledSpeedsClassifier(TransformerMixin): + """ + Traveled Speeds classifier. 
+ + Dataset + ------- + applicant_id : category column + A personal identifier code for every person making expenses. + + category : category column + Category of the expense. The model will be applied just in rows where + the value is equal to "Meal". + + is_party_expense : bool column + If the row corresponds to a party expense or not. The model will be + applied just in rows where the value is equal to `False`. + + issue_date : datetime column + Date when the expense was made. + + latitude : float column + Latitude of the place where the expense was made. + + longitude : float column + Longitude of the place where the expense was made. + """ AGG_KEYS = ['applicant_id', 'issue_date'] @@ -61,10 +86,11 @@ def __classify_dataset(self, X): return X def __applicable_rows(self, X): - return (X['subquota_description'] == 'Congressperson meal') & \ + return (X['category'] == 'Meal') & \ (-73.992222 < X['longitude']) & (X['longitude'] < -34.7916667) & \ (-33.742222 < X['latitude']) & (X['latitude'] < 5.2722222) & \ - X[['congressperson_id', 'latitude', 'longitude']].notnull().all(axis=1) + ~X['is_party_expense'] & \ + X[['latitude', 'longitude']].notnull().all(axis=1) def __calculate_sum_distances(self, X): coordinate_list = X[['latitude', 'longitude']].values diff --git a/rosie/chamber_of_deputies/settings.py b/rosie/chamber_of_deputies/settings.py index 8d62be196..22769288f 100644 --- a/rosie/chamber_of_deputies/settings.py +++ b/rosie/chamber_of_deputies/settings.py @@ -16,5 +16,3 @@ } UNIQUE_IDS = ['applicant_id', 'year', 'document_id'] - -VALUE = 'total_net_value' diff --git a/rosie/chamber_of_deputies/tests/fixtures/irregular_companies_classifier.csv b/rosie/chamber_of_deputies/tests/fixtures/irregular_companies_classifier.csv index 49f29ea03..e814a2cdf 100644 --- a/rosie/chamber_of_deputies/tests/fixtures/irregular_companies_classifier.csv +++ b/rosie/chamber_of_deputies/tests/fixtures/irregular_companies_classifier.csv @@ -1,4 +1,4 @@ 
-cnpj_cpf,situation,situation_date,issue_date +recipient_id,situation,situation_date,issue_date 02989654001197,ABERTA,2013-01-03,2013-01-30 02989654001197,BAIXADA,2013-01-03,2013-01-30 02989654001197,NULA,2013-01-03,2013-01-30 diff --git a/rosie/chamber_of_deputies/tests/fixtures/meal_price_outlier_classifier.csv b/rosie/chamber_of_deputies/tests/fixtures/meal_price_outlier_classifier.csv index 75b77c525..b9c5f3b0f 100644 --- a/rosie/chamber_of_deputies/tests/fixtures/meal_price_outlier_classifier.csv +++ b/rosie/chamber_of_deputies/tests/fixtures/meal_price_outlier_classifier.csv @@ -1,82 +1,82 @@ -applicant_id,subquota_description,cnpj_cpf,supplier,total_net_value -999,Congressperson meal,47667414122,João da Silva,9999999 -999,Congressperson meal,24624607000100,Hotel X,30 -999,Congressperson meal,24624607000101,Y Hotels,30 -999,Congressperson meal,24624607000102,X hotéis,30 -999,Congressperson meal,24624607000103,Hotel X,9999999 -999,Congressperson meal,24624607000104,Y Hotels,9999999 -999,Congressperson meal,24624607000105,X hotéis,9999999 -111,Congressperson meal,08378940000120,A Restaurant,3 -111,Congressperson meal,08378940000120,A Restaurant,30 -222,Congressperson meal,08378940000120,A Restaurant,40 -333,Congressperson meal,08378940000120,A Restaurant,50 -444,Congressperson meal,08378940000120,A Restaurant,60 -444,Congressperson meal,08378940000120,A Restaurant,70 -444,Congressperson meal,08378940000120,A Restaurant,50 -444,Congressperson meal,08378940000120,A Restaurant,50 -444,Congressperson meal,08378940000120,A Restaurant,50 -444,Congressperson meal,08378940000120,A Restaurant,50 -444,Congressperson meal,08378940000120,A Restaurant,50 -444,Congressperson meal,08378940000120,A Restaurant,50 -444,Congressperson meal,08378940000120,A Restaurant,50 -444,Congressperson meal,08378940000120,A Restaurant,50 -444,Congressperson meal,08378940000120,A Restaurant,50 -444,Congressperson meal,08378940000120,A Restaurant,50 -444,Congressperson meal,08378940000120,A 
Restaurant,50 -444,Congressperson meal,08378940000120,A Restaurant,50 -444,Congressperson meal,08378940000120,A Restaurant,50 -444,Congressperson meal,08378940000120,A Restaurant,50 -444,Congressperson meal,08378940000120,A Restaurant,50 -444,Congressperson meal,08378940000120,A Restaurant,50 -111,Congressperson meal,08378940000120,A Restaurant,900 -111,Congressperson meal,08378940000120,A Restaurant,80 -222,Congressperson meal,08378940000120,A Restaurant,70 -333,Congressperson meal,08378940000120,A Restaurant,60 -444,Congressperson meal,08378940000120,A Restaurant,70 -444,Congressperson meal,08378940000120,A Restaurant,80 -444,Congressperson meal,08378940000120,A Restaurant,90 -111,Congressperson meal,67661714000111,B Restaurant,130 -222,Congressperson meal,67661714000111,B Restaurant,140 -333,Congressperson meal,67661714000111,B Restaurant,150 -444,Congressperson meal,67661714000111,B Restaurant,160 -444,Congressperson meal,67661714000111,B Restaurant,170 -444,Congressperson meal,67661714000111,B Restaurant,150 -444,Congressperson meal,67661714000111,B Restaurant,150 -444,Congressperson meal,67661714000111,B Restaurant,150 -444,Congressperson meal,67661714000111,B Restaurant,150 -444,Congressperson meal,67661714000111,B Restaurant,150 -444,Congressperson meal,67661714000111,B Restaurant,150 -444,Congressperson meal,67661714000111,B Restaurant,150 -444,Congressperson meal,67661714000111,B Restaurant,150 -444,Congressperson meal,67661714000111,B Restaurant,150 -444,Congressperson meal,67661714000111,B Restaurant,150 -444,Congressperson meal,67661714000111,B Restaurant,150 -444,Congressperson meal,67661714000111,B Restaurant,150 -444,Congressperson meal,67661714000111,B Restaurant,150 -444,Congressperson meal,67661714000111,B Restaurant,150 -444,Congressperson meal,67661714000111,B Restaurant,150 -444,Congressperson meal,67661714000111,B Restaurant,150 -111,Congressperson meal,81387409000104,C Restaurant,330 -222,Congressperson meal,81387409000104,C Restaurant,340 
-333,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson meal,81387409000104,C Restaurant,360 -444,Congressperson meal,81387409000104,C Restaurant,370 -444,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson meal,22472225000183,GOL,400 +applicant_id,category,recipient_id,recipient,net_value +999,Meal,47667414122,João da Silva,9999999 +999,Meal,24624607000100,Hotel X,30 +999,Meal,24624607000101,Y Hotels,30 +999,Meal,24624607000102,X hotéis,30 +999,Meal,24624607000103,Hotel X,9999999 +999,Meal,24624607000104,Y Hotels,9999999 +999,Meal,24624607000105,X hotéis,9999999 +111,Meal,08378940000120,A Restaurant,3 +111,Meal,08378940000120,A Restaurant,30 +222,Meal,08378940000120,A Restaurant,40 +333,Meal,08378940000120,A Restaurant,50 +444,Meal,08378940000120,A Restaurant,60 +444,Meal,08378940000120,A Restaurant,70 +444,Meal,08378940000120,A Restaurant,50 +444,Meal,08378940000120,A Restaurant,50 +444,Meal,08378940000120,A Restaurant,50 +444,Meal,08378940000120,A Restaurant,50 +444,Meal,08378940000120,A Restaurant,50 +444,Meal,08378940000120,A Restaurant,50 +444,Meal,08378940000120,A Restaurant,50 
+444,Meal,08378940000120,A Restaurant,50 +444,Meal,08378940000120,A Restaurant,50 +444,Meal,08378940000120,A Restaurant,50 +444,Meal,08378940000120,A Restaurant,50 +444,Meal,08378940000120,A Restaurant,50 +444,Meal,08378940000120,A Restaurant,50 +444,Meal,08378940000120,A Restaurant,50 +444,Meal,08378940000120,A Restaurant,50 +444,Meal,08378940000120,A Restaurant,50 +111,Meal,08378940000120,A Restaurant,900 +111,Meal,08378940000120,A Restaurant,80 +222,Meal,08378940000120,A Restaurant,70 +333,Meal,08378940000120,A Restaurant,60 +444,Meal,08378940000120,A Restaurant,70 +444,Meal,08378940000120,A Restaurant,80 +444,Meal,08378940000120,A Restaurant,90 +111,Meal,67661714000111,B Restaurant,130 +222,Meal,67661714000111,B Restaurant,140 +333,Meal,67661714000111,B Restaurant,150 +444,Meal,67661714000111,B Restaurant,160 +444,Meal,67661714000111,B Restaurant,170 +444,Meal,67661714000111,B Restaurant,150 +444,Meal,67661714000111,B Restaurant,150 +444,Meal,67661714000111,B Restaurant,150 +444,Meal,67661714000111,B Restaurant,150 +444,Meal,67661714000111,B Restaurant,150 +444,Meal,67661714000111,B Restaurant,150 +444,Meal,67661714000111,B Restaurant,150 +444,Meal,67661714000111,B Restaurant,150 +444,Meal,67661714000111,B Restaurant,150 +444,Meal,67661714000111,B Restaurant,150 +444,Meal,67661714000111,B Restaurant,150 +444,Meal,67661714000111,B Restaurant,150 +444,Meal,67661714000111,B Restaurant,150 +444,Meal,67661714000111,B Restaurant,150 +444,Meal,67661714000111,B Restaurant,150 +444,Meal,67661714000111,B Restaurant,150 +111,Meal,81387409000104,C Restaurant,330 +222,Meal,81387409000104,C Restaurant,340 +333,Meal,81387409000104,C Restaurant,350 +444,Meal,81387409000104,C Restaurant,360 +444,Meal,81387409000104,C Restaurant,370 +444,Meal,81387409000104,C Restaurant,350 +444,Meal,81387409000104,C Restaurant,350 +444,Meal,81387409000104,C Restaurant,350 +444,Meal,81387409000104,C Restaurant,350 +444,Meal,81387409000104,C Restaurant,350 +444,Meal,81387409000104,C 
Restaurant,350 +444,Meal,81387409000104,C Restaurant,350 +444,Meal,81387409000104,C Restaurant,350 +444,Meal,81387409000104,C Restaurant,350 +444,Meal,81387409000104,C Restaurant,350 +444,Meal,81387409000104,C Restaurant,350 +444,Meal,81387409000104,C Restaurant,350 +444,Meal,81387409000104,C Restaurant,350 +444,Meal,81387409000104,C Restaurant,350 +444,Meal,81387409000104,C Restaurant,350 +444,Meal,81387409000104,C Restaurant,350 +444,Meal,22472225000183,GOL,400 444,Flight ticket issue,22472225000183,GOL,9999999 444,Flight ticket issue,22472225000183,GOL,5 diff --git a/rosie/chamber_of_deputies/tests/fixtures/monthly_subquota_limit_classifier.csv b/rosie/chamber_of_deputies/tests/fixtures/monthly_subquota_limit_classifier.csv index fd6ae2c02..edd4e550c 100644 --- a/rosie/chamber_of_deputies/tests/fixtures/monthly_subquota_limit_classifier.csv +++ b/rosie/chamber_of_deputies/tests/fixtures/monthly_subquota_limit_classifier.csv @@ -1,4 +1,4 @@ -applicant_id,subquota_number,issue_date,year,month,total_net_value +applicant_id,subquota_number,issue_date,year,month,net_value 1,120,2015-03-01,2015,3,10500 1,120,2015-03-01,2015,3,401 2,120,2015-04-01,2015,4,10500 diff --git a/rosie/chamber_of_deputies/tests/fixtures/reimbursements.xz b/rosie/chamber_of_deputies/tests/fixtures/reimbursements.xz index 06bf968fa..86bf1067b 100644 Binary files a/rosie/chamber_of_deputies/tests/fixtures/reimbursements.xz and b/rosie/chamber_of_deputies/tests/fixtures/reimbursements.xz differ diff --git a/rosie/chamber_of_deputies/tests/fixtures/traveled_speeds_classifier.csv b/rosie/chamber_of_deputies/tests/fixtures/traveled_speeds_classifier.csv index 08ad5f06e..2668c9351 100644 --- a/rosie/chamber_of_deputies/tests/fixtures/traveled_speeds_classifier.csv +++ b/rosie/chamber_of_deputies/tests/fixtures/traveled_speeds_classifier.csv @@ -1,30 +1,30 @@ -applicant_id,congressperson_id,issue_date,subquota_description,cnpj_cpf,latitude,longitude -999,999,2016-01-01,Congressperson 
meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-01,Congressperson meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-01,Congressperson meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-01,Congressperson meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-01,Congressperson meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-01,Congressperson meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-01,Congressperson meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-01,Congressperson meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-01,Congressperson meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-01,Flight ticket issue,08378940000120,-29.2310464,-51.1597365 -999,,2016-01-01,Congressperson meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-01,Congressperson meal,08378940000120,,-67.8248977 -999,999,2016-01-01,Congressperson meal,08378940000120,-9.9753770, -999,999,2016-01-02,Congressperson meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-03,Congressperson meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-03,Congressperson meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-04,Congressperson meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-04,Congressperson meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-04,Congressperson meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-04,Congressperson meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-05,Congressperson meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-05,Congressperson meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-05,Congressperson meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-05,Congressperson meal,14047033000100,-10.6519807,-68.4995996 -999,999,2016-01-05,Congressperson meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-05,Congressperson meal,08378940000120,-9.9753770,-67.8248977 
-999,999,2016-01-05,Congressperson meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-05,Congressperson meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-01,Congressperson meal,08378940000120,6.9753770,-33.8248977 +applicant_id,is_party_expense,issue_date,category,recipient_id,latitude,longitude +999,false,2016-01-01,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-01,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-01,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-01,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-01,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-01,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-01,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-01,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-01,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-01,Flight ticket issue,08378940000120,-29.2310464,-51.1597365 +999,true,2016-01-01,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-01,Meal,08378940000120,,-67.8248977 +999,false,2016-01-01,Meal,08378940000120,-9.9753770, +999,false,2016-01-02,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-03,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-03,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-04,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-04,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-04,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-04,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-05,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-05,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-05,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-05,Meal,14047033000100,-10.6519807,-68.4995996 +999,false,2016-01-05,Meal,08378940000120,-9.9753770,-67.8248977 
+999,false,2016-01-05,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-05,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-05,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-01,Meal,08378940000120,6.9753770,-33.8248977 diff --git a/rosie/chamber_of_deputies/tests/test_adapter.py b/rosie/chamber_of_deputies/tests/test_adapter.py new file mode 100644 index 000000000..c738e9efd --- /dev/null +++ b/rosie/chamber_of_deputies/tests/test_adapter.py @@ -0,0 +1,57 @@ +import shutil +import os +from tempfile import mkdtemp +from unittest import TestCase +from unittest.mock import patch +from shutil import copy2 + +import pandas as pd + +from rosie.chamber_of_deputies.adapter import Adapter as subject_class +from rosie.chamber_of_deputies.adapter import COLUMNS as ADAPTER_COLUMNS + + +class TestAdapter(TestCase): + + def setUp(self): + self.temp_path = mkdtemp() + self.fixtures_path = os.path.join('rosie', 'chamber_of_deputies', 'tests', 'fixtures') + copies = ( + ('companies.xz', subject_class.COMPANIES_DATASET), + ('reimbursements.xz', 'reimbursements.xz') + ) + for source, target in copies: + copy2(os.path.join(self.fixtures_path, source), os.path.join(self.temp_path, target)) + self.subject = subject_class(self.temp_path) + + def tearDown(self): + shutil.rmtree(self.temp_path) + + @patch('rosie.chamber_of_deputies.adapter.ChamberOfDeputiesDataset') + @patch('rosie.chamber_of_deputies.adapter.fetch') + def test_get_performs_a_left_merge_between_reimbursements_and_companies(self, fetch, chamber_of_deputies): + self.assertEqual(6, len(self.subject.dataset)) + self.assertEqual(1, self.subject.dataset['legal_entity'].isnull().sum()) + + @patch('rosie.chamber_of_deputies.adapter.ChamberOfDeputiesDataset') + @patch('rosie.chamber_of_deputies.adapter.fetch') + def test_prepare_dataset(self, fetch, chamber_of_deputies): + """ + * Rename columns. + * Make `document_type` a category column. + * Rename values for `category`. 
+ * Create `is_party_expense` column. + """ + dataset = self.subject.dataset + self.assertTrue(set(ADAPTER_COLUMNS.keys()).issubset(set(dataset.columns))) + document_types = ['bill_of_sale', 'simple_receipt', 'expense_made_abroad'] + self.assertEqual(document_types, + dataset['document_type'].cat.categories.tolist()) + fixture = pd.read_csv(os.path.join(self.fixtures_path, 'reimbursements.xz')) + meal_rows = fixture \ + .query('subquota_description == "Congressperson meal"').index + self.assertEqual(['Meal'], + dataset.loc[meal_rows, 'category'].unique().tolist()) + party_expense_rows = fixture[fixture['congressperson_id'].isnull()].index + self.assertEqual([True], + dataset.loc[party_expense_rows, 'is_party_expense'].unique().tolist()) diff --git a/rosie/chamber_of_deputies/tests/test_chamber_of_deputies.py b/rosie/chamber_of_deputies/tests/test_chamber_of_deputies.py index 69fd23eb4..3bab53a5e 100644 --- a/rosie/chamber_of_deputies/tests/test_chamber_of_deputies.py +++ b/rosie/chamber_of_deputies/tests/test_chamber_of_deputies.py @@ -17,10 +17,6 @@ class TestChamberOfDeputies(TestCase): def setUp(self): row = pd.Series({'applicant_id': 444, 'document_id': 999, - 'subquota_description': 'Congressperson meal', - 'cnpj_cpf': '67661714000111', - 'supplier': 'B Restaurant', - 'total_net_value': 178, 'year': 2016}) self.dataset = pd.DataFrame().append(row, ignore_index=True) self.temp_dir = mkdtemp() diff --git a/rosie/chamber_of_deputies/tests/test_dataset.py b/rosie/chamber_of_deputies/tests/test_dataset.py deleted file mode 100644 index e5c81ee92..000000000 --- a/rosie/chamber_of_deputies/tests/test_dataset.py +++ /dev/null @@ -1,31 +0,0 @@ -import shutil -import os -from tempfile import mkdtemp -from unittest import TestCase -from unittest.mock import patch -from shutil import copy2 - -from rosie.chamber_of_deputies.adapter import Adapter - - -class TestDataset(TestCase): - - def setUp(self): - self.temp_path = mkdtemp() - fixtures = os.path.join('rosie', 
'chamber_of_deputies', 'tests', 'fixtures') - copies = ( - ('companies.xz', Adapter.COMPANIES_DATASET), - ('reimbursements.xz', 'reimbursements.xz') - ) - for source, target in copies: - copy2(os.path.join(fixtures, source), os.path.join(self.temp_path, target)) - self.subject = Adapter(self.temp_path) - - def tearDown(self): - shutil.rmtree(self.temp_path) - - @patch('rosie.chamber_of_deputies.adapter.CEAPDataset') - @patch('rosie.chamber_of_deputies.adapter.fetch') - def test_get_performs_a_left_merge_between_reimbursements_and_companies(self, fetch, ceap): - self.assertEqual(5, len(self.subject.dataset)) - self.assertEqual(1, self.subject.dataset['legal_entity'].isnull().sum()) diff --git a/rosie/chamber_of_deputies/tests/test_meal_price_outlier_classifier.py b/rosie/chamber_of_deputies/tests/test_meal_price_outlier_classifier.py index 91769667a..ce8db8b7e 100644 --- a/rosie/chamber_of_deputies/tests/test_meal_price_outlier_classifier.py +++ b/rosie/chamber_of_deputies/tests/test_meal_price_outlier_classifier.py @@ -12,7 +12,7 @@ class TestMealPriceOutlierClassifier(TestCase): def setUp(self): self.dataset = pd.read_csv('rosie/chamber_of_deputies/tests/fixtures/meal_price_outlier_classifier.csv', - dtype={'cnpj_cpf': np.str}) + dtype={'recipient_id': np.str}) self.subject = MealPriceOutlierClassifier() self.subject.fit(self.dataset) @@ -24,40 +24,40 @@ def test_predict_returns_a_prediction_for_each_observation(self, kmeans_mock): def test_predict_outlier_for_common_cnpjs_when_value_is_greater_than_mean_plus_3_stds(self): row = pd.Series({'applicant_id': 444, - 'subquota_description': 'Congressperson meal', - 'cnpj_cpf': '67661714000111', - 'supplier': 'B Restaurant', - 'total_net_value': 178}) + 'category': 'Meal', + 'recipient_id': '67661714000111', + 'recipient': 'B Restaurant', + 'net_value': 178}) X = pd.DataFrame().append(row, ignore_index=True) prediction = self.subject.predict(X) self.assertEqual(-1, prediction[0]) def 
test_predict_inlier_for_common_cnpjs_when_value_is_less_than_mean_plus_3_stds(self): row = pd.Series({'applicant_id': 444, - 'subquota_description': 'Congressperson meal', - 'cnpj_cpf': '67661714000111', - 'supplier': 'B Restaurant', - 'total_net_value': 177}) + 'category': 'Meal', + 'recipient_id': '67661714000111', + 'recipient': 'B Restaurant', + 'net_value': 177}) X = pd.DataFrame().append(row, ignore_index=True) prediction = self.subject.predict(X) self.assertEqual(1, prediction[0]) def test_predict_outlier_for_non_common_cnpjs_when_value_is_greater_than_mean_plus_4_stds(self): row = pd.Series({'applicant_id': 444, - 'subquota_description': 'Congressperson meal', - 'cnpj_cpf': '22412242000125', - 'supplier': 'D Restaurant', - 'total_net_value': 178}) + 'category': 'Meal', + 'recipient_id': '22412242000125', + 'recipient': 'D Restaurant', + 'net_value': 178}) X = pd.DataFrame().append(row, ignore_index=True) prediction = self.subject.predict(X) self.assertEqual(-1, prediction[0]) def test_predict_inlier_for_non_common_cnpjs_when_value_is_less_than_mean_plus_4_stds(self): row = pd.Series({'applicant_id': 444, - 'subquota_description': 'Congressperson meal', - 'cnpj_cpf': '22412242000125', - 'supplier': 'D Restaurant', - 'total_net_value': 177}) + 'category': 'Meal', + 'recipient_id': '22412242000125', + 'recipient': 'D Restaurant', + 'net_value': 177}) X = pd.DataFrame().append(row, ignore_index=True) prediction = self.subject.predict(X) self.assertEqual(1, prediction[0]) diff --git a/rosie/chamber_of_deputies/tests/test_traveled_speeds_classifier.py b/rosie/chamber_of_deputies/tests/test_traveled_speeds_classifier.py index 3a25bd9bc..d0fab5c33 100644 --- a/rosie/chamber_of_deputies/tests/test_traveled_speeds_classifier.py +++ b/rosie/chamber_of_deputies/tests/test_traveled_speeds_classifier.py @@ -12,7 +12,7 @@ class TestTraveledSpeedsClassifier(TestCase): def setUp(self): self.dataset = 
pd.read_csv('rosie/chamber_of_deputies/tests/fixtures/traveled_speeds_classifier.csv', - dtype={'cnpj_cpf': np.str}) + dtype={'recipient_id': np.str}) self.subject = TraveledSpeedsClassifier() self.subject.fit(self.dataset) diff --git a/rosie/core/classifiers/invalid_cnpj_cpf_classifier.py b/rosie/core/classifiers/invalid_cnpj_cpf_classifier.py index fa2b20b0e..df2a5fb49 100644 --- a/rosie/core/classifiers/invalid_cnpj_cpf_classifier.py +++ b/rosie/core/classifiers/invalid_cnpj_cpf_classifier.py @@ -6,6 +6,20 @@ class InvalidCnpjCpfClassifier(TransformerMixin): + """ + Invalid CNPJ/CPF classifier. + + Validate a `recipient_id` field by calculating its expected check digit + and verifying the authenticity of the provided ones. + + Dataset + ------- + document_type : category column + Validate rows with values 'bill_of_sale' or 'simple_receipt'. + + recipient_id : string column + A CNPJ (Brazilian company ID) or CPF (Brazilian personal tax ID). + """ def fit(self, X): return self @@ -17,4 +31,5 @@ def predict(self, X): return np.r_[X.apply(self.__is_invalid, axis=1)] def __is_invalid(self, row): - return (row['document_type'] in [0, 1]) & (not cpfcnpj.validate(str(row['cnpj_cpf']))) + return (row['document_type'] in ['bill_of_sale', 'simple_receipt']) \ + & (not cpfcnpj.validate(str(row['recipient_id']))) diff --git a/rosie/core/tests/fixtures/invalid_cnpj_cpf_classifier.csv b/rosie/core/tests/fixtures/invalid_cnpj_cpf_classifier.csv index 1fca62d2a..2b74497cc 100644 --- a/rosie/core/tests/fixtures/invalid_cnpj_cpf_classifier.csv +++ b/rosie/core/tests/fixtures/invalid_cnpj_cpf_classifier.csv @@ -1,10 +1,10 @@ -cnpj_cpf,document_type -22472225000183,0 -22472225000180,0 -,0 -,2 -22472225000183,2 -22472225000180,2 -57725723501,0 -11111111111,0 -22472225000180,3 +recipient_id,document_type +22472225000183,bill_of_sale +22472225000180,bill_of_sale +,bill_of_sale +,expense_made_abroad +22472225000183,expense_made_abroad +22472225000180,expense_made_abroad 
+57725723501,bill_of_sale +11111111111,bill_of_sale +22472225000180, diff --git a/rosie/core/tests/test_invalid_cnpj_cpf_classifier.py b/rosie/core/tests/test_invalid_cnpj_cpf_classifier.py index 614e2abc0..ed8eef160 100644 --- a/rosie/core/tests/test_invalid_cnpj_cpf_classifier.py +++ b/rosie/core/tests/test_invalid_cnpj_cpf_classifier.py @@ -10,7 +10,7 @@ class TestInvalidCnpjCpfClassifier(TestCase): def setUp(self): self.dataset = pd.read_csv('rosie/core/tests/fixtures/invalid_cnpj_cpf_classifier.csv', - dtype={'cnpj_cpf': np.str}) + dtype={'recipient_id': np.str}) self.subject = InvalidCnpjCpfClassifier() def test_is_valid_cnpj(self):