From 3afbeca09ce6f272380f2c47b36815c8449aab48 Mon Sep 17 00:00:00 2001 From: Irio Musskopf Date: Sun, 30 Apr 2017 17:50:57 +0200 Subject: [PATCH 01/11] Rename dataset columns intended to be run by Rosie Following a newly defined interface, Rosie's classifiers expect any dataset following some definitions. Must have `net_value`, `recipient_id` and `recipient` columns. --- rosie/chamber_of_deputies/adapter.py | 20 +++++++++++--- .../meal_price_outlier_classifier.py | 18 ++++++------- .../monthly_subquota_limit_classifier.py | 2 +- rosie/chamber_of_deputies/settings.py | 2 -- .../irregular_companies_classifier.csv | 2 +- .../meal_price_outlier_classifier.csv | 2 +- .../monthly_subquota_limit_classifier.csv | 2 +- .../fixtures/traveled_speeds_classifier.csv | 2 +- .../tests/test_chamber_of_deputies.py | 6 ++--- .../test_meal_price_outlier_classifier.py | 26 +++++++++---------- .../tests/test_traveled_speeds_classifier.py | 2 +- .../invalid_cnpj_cpf_classifier.py | 2 +- .../fixtures/invalid_cnpj_cpf_classifier.csv | 2 +- .../tests/test_invalid_cnpj_cpf_classifier.py | 2 +- 14 files changed, 50 insertions(+), 40 deletions(-) diff --git a/rosie/chamber_of_deputies/adapter.py b/rosie/chamber_of_deputies/adapter.py index 4a57282c2..f37b07f82 100644 --- a/rosie/chamber_of_deputies/adapter.py +++ b/rosie/chamber_of_deputies/adapter.py @@ -6,6 +6,12 @@ from serenata_toolbox.datasets import fetch +COLUMNS = { + 'net_value': 'total_net_value', + 'recipient_id': 'cnpj_cpf', + 'recipient': 'supplier', +} + class Adapter: COMPANIES_DATASET = '2016-09-03-companies.xz' @@ -17,10 +23,16 @@ def dataset(self): self.update_datasets() reimbursements = self.get_reimbursements() companies = self.get_companies() - return pd.merge(reimbursements, companies, - how='left', - left_on='cnpj_cpf', - right_on='cnpj') + self._dataset = pd.merge(reimbursements, companies, + how='left', + left_on='cnpj_cpf', + right_on='cnpj') + self.prepare_dataset() + return self._dataset + + def
prepare_dataset(self): + columns = {v: k for k, v in COLUMNS.items()} + self._dataset.rename(columns=columns, inplace=True) def update_datasets(self): os.makedirs(self.path, exist_ok=True) diff --git a/rosie/chamber_of_deputies/classifiers/meal_price_outlier_classifier.py b/rosie/chamber_of_deputies/classifiers/meal_price_outlier_classifier.py index 861cfd2f0..6d6aa2845 100644 --- a/rosie/chamber_of_deputies/classifiers/meal_price_outlier_classifier.py +++ b/rosie/chamber_of_deputies/classifiers/meal_price_outlier_classifier.py @@ -14,7 +14,7 @@ class MealPriceOutlierClassifier(TransformerMixin): def fit(self, X): _X = X[self.__applicable_rows(X)] - companies = _X.groupby('cnpj_cpf').apply(self.__company_stats) \ + companies = _X.groupby('recipient_id').apply(self.__company_stats) \ .reset_index() companies = companies[self.__applicable_company_rows(companies)] @@ -34,17 +34,17 @@ def transform(self, X=None): def predict(self, X): _X = X.copy() companies = _X[self.__applicable_rows(_X)] \ - .groupby('cnpj_cpf').apply(self.__company_stats) \ + .groupby('recipient_id').apply(self.__company_stats) \ .reset_index() companies['cluster'] = \ self.cluster_model.predict(companies[self.CLUSTER_KEYS]) companies = pd.merge(companies, self.clusters[['cluster', 'threshold']], how='left') - _X = pd.merge(_X, companies[['cnpj_cpf', 'threshold']], how='left') + _X = pd.merge(_X, companies[['recipient_id', 'threshold']], how='left') known_companies = companies[self.__applicable_company_rows(companies)] known_thresholds = known_companies \ - .groupby('cnpj_cpf') \ + .groupby('recipient_id') \ .apply(lambda x: x['mean'] + 3 * x['std']) \ .reset_index() \ .rename(columns={0: 'cnpj_threshold'}) @@ -55,21 +55,21 @@ def predict(self, X): _X['y'] = 1 is_outlier = self.__applicable_rows(_X) & \ _X['threshold'].notnull() & \ - (_X['total_net_value'] > _X['threshold']) + (_X['net_value'] > _X['threshold']) _X.loc[is_outlier, 'y'] = -1 return _X['y'] def __applicable_rows(self, X): return 
(X['subquota_description'] == 'Congressperson meal') & \ - (X['cnpj_cpf'].str.len() == 14) & \ - (~X['supplier'].apply(self.__normalize_string).str.contains(self.HOTEL_REGEX)) + (X['recipient_id'].str.len() == 14) & \ + (~X['recipient'].apply(self.__normalize_string).str.contains(self.HOTEL_REGEX)) def __applicable_company_rows(self, companies): return (companies['congresspeople'] > 3) & (companies['records'] > 20) def __company_stats(self, X): - stats = {'mean': np.mean(X['total_net_value']), - 'std': np.std(X['total_net_value']), + stats = {'mean': np.mean(X['net_value']), + 'std': np.std(X['net_value']), 'congresspeople': len(np.unique(X['applicant_id'])), 'records': len(X)} return pd.Series(stats) diff --git a/rosie/chamber_of_deputies/classifiers/monthly_subquota_limit_classifier.py b/rosie/chamber_of_deputies/classifiers/monthly_subquota_limit_classifier.py index 8da52d10c..1d3d4b9c2 100644 --- a/rosie/chamber_of_deputies/classifiers/monthly_subquota_limit_classifier.py +++ b/rosie/chamber_of_deputies/classifiers/monthly_subquota_limit_classifier.py @@ -62,7 +62,7 @@ def predict_proba(self, X=None): def __create_columns(self): - self._X['net_value_int'] = (self._X['total_net_value'] * 100).apply(int) + self._X['net_value_int'] = (self._X['net_value'] * 100).apply(int) self._X['coerced_issue_date'] = \ pd.to_datetime(self._X['issue_date'], errors='coerce') diff --git a/rosie/chamber_of_deputies/settings.py b/rosie/chamber_of_deputies/settings.py index 8d62be196..22769288f 100644 --- a/rosie/chamber_of_deputies/settings.py +++ b/rosie/chamber_of_deputies/settings.py @@ -16,5 +16,3 @@ } UNIQUE_IDS = ['applicant_id', 'year', 'document_id'] - -VALUE = 'total_net_value' diff --git a/rosie/chamber_of_deputies/tests/fixtures/irregular_companies_classifier.csv b/rosie/chamber_of_deputies/tests/fixtures/irregular_companies_classifier.csv index 49f29ea03..e814a2cdf 100644 --- a/rosie/chamber_of_deputies/tests/fixtures/irregular_companies_classifier.csv +++ 
b/rosie/chamber_of_deputies/tests/fixtures/irregular_companies_classifier.csv @@ -1,4 +1,4 @@ -cnpj_cpf,situation,situation_date,issue_date +recipient_id,situation,situation_date,issue_date 02989654001197,ABERTA,2013-01-03,2013-01-30 02989654001197,BAIXADA,2013-01-03,2013-01-30 02989654001197,NULA,2013-01-03,2013-01-30 diff --git a/rosie/chamber_of_deputies/tests/fixtures/meal_price_outlier_classifier.csv b/rosie/chamber_of_deputies/tests/fixtures/meal_price_outlier_classifier.csv index 75b77c525..d9e89d379 100644 --- a/rosie/chamber_of_deputies/tests/fixtures/meal_price_outlier_classifier.csv +++ b/rosie/chamber_of_deputies/tests/fixtures/meal_price_outlier_classifier.csv @@ -1,4 +1,4 @@ -applicant_id,subquota_description,cnpj_cpf,supplier,total_net_value +applicant_id,subquota_description,recipient_id,recipient,net_value 999,Congressperson meal,47667414122,João da Silva,9999999 999,Congressperson meal,24624607000100,Hotel X,30 999,Congressperson meal,24624607000101,Y Hotels,30 diff --git a/rosie/chamber_of_deputies/tests/fixtures/monthly_subquota_limit_classifier.csv b/rosie/chamber_of_deputies/tests/fixtures/monthly_subquota_limit_classifier.csv index fd6ae2c02..edd4e550c 100644 --- a/rosie/chamber_of_deputies/tests/fixtures/monthly_subquota_limit_classifier.csv +++ b/rosie/chamber_of_deputies/tests/fixtures/monthly_subquota_limit_classifier.csv @@ -1,4 +1,4 @@ -applicant_id,subquota_number,issue_date,year,month,total_net_value +applicant_id,subquota_number,issue_date,year,month,net_value 1,120,2015-03-01,2015,3,10500 1,120,2015-03-01,2015,3,401 2,120,2015-04-01,2015,4,10500 diff --git a/rosie/chamber_of_deputies/tests/fixtures/traveled_speeds_classifier.csv b/rosie/chamber_of_deputies/tests/fixtures/traveled_speeds_classifier.csv index 08ad5f06e..8ed0bf4a0 100644 --- a/rosie/chamber_of_deputies/tests/fixtures/traveled_speeds_classifier.csv +++ b/rosie/chamber_of_deputies/tests/fixtures/traveled_speeds_classifier.csv @@ -1,4 +1,4 @@ 
-applicant_id,congressperson_id,issue_date,subquota_description,cnpj_cpf,latitude,longitude +applicant_id,congressperson_id,issue_date,subquota_description,recipient_id,latitude,longitude 999,999,2016-01-01,Congressperson meal,08378940000120,-9.9753770,-67.8248977 999,999,2016-01-01,Congressperson meal,08378940000120,-9.9753770,-67.8248977 999,999,2016-01-01,Congressperson meal,08378940000120,-9.9753770,-67.8248977 diff --git a/rosie/chamber_of_deputies/tests/test_chamber_of_deputies.py b/rosie/chamber_of_deputies/tests/test_chamber_of_deputies.py index 69fd23eb4..6212fdf69 100644 --- a/rosie/chamber_of_deputies/tests/test_chamber_of_deputies.py +++ b/rosie/chamber_of_deputies/tests/test_chamber_of_deputies.py @@ -18,9 +18,9 @@ def setUp(self): row = pd.Series({'applicant_id': 444, 'document_id': 999, 'subquota_description': 'Congressperson meal', - 'cnpj_cpf': '67661714000111', - 'supplier': 'B Restaurant', - 'total_net_value': 178, + 'recipient_id': '67661714000111', + 'recipient': 'B Restaurant', + 'net_value': 178, 'year': 2016}) self.dataset = pd.DataFrame().append(row, ignore_index=True) self.temp_dir = mkdtemp() diff --git a/rosie/chamber_of_deputies/tests/test_meal_price_outlier_classifier.py b/rosie/chamber_of_deputies/tests/test_meal_price_outlier_classifier.py index 91769667a..fd19bc749 100644 --- a/rosie/chamber_of_deputies/tests/test_meal_price_outlier_classifier.py +++ b/rosie/chamber_of_deputies/tests/test_meal_price_outlier_classifier.py @@ -12,7 +12,7 @@ class TestMealPriceOutlierClassifier(TestCase): def setUp(self): self.dataset = pd.read_csv('rosie/chamber_of_deputies/tests/fixtures/meal_price_outlier_classifier.csv', - dtype={'cnpj_cpf': np.str}) + dtype={'recipient_id': np.str}) self.subject = MealPriceOutlierClassifier() self.subject.fit(self.dataset) @@ -25,9 +25,9 @@ def test_predict_returns_a_prediction_for_each_observation(self, kmeans_mock): def test_predict_outlier_for_common_cnpjs_when_value_is_greater_than_mean_plus_3_stds(self): row 
= pd.Series({'applicant_id': 444, 'subquota_description': 'Congressperson meal', - 'cnpj_cpf': '67661714000111', - 'supplier': 'B Restaurant', - 'total_net_value': 178}) + 'recipient_id': '67661714000111', + 'recipient': 'B Restaurant', + 'net_value': 178}) X = pd.DataFrame().append(row, ignore_index=True) prediction = self.subject.predict(X) self.assertEqual(-1, prediction[0]) @@ -35,9 +35,9 @@ def test_predict_outlier_for_common_cnpjs_when_value_is_greater_than_mean_plus_3 def test_predict_inlier_for_common_cnpjs_when_value_is_less_than_mean_plus_3_stds(self): row = pd.Series({'applicant_id': 444, 'subquota_description': 'Congressperson meal', - 'cnpj_cpf': '67661714000111', - 'supplier': 'B Restaurant', - 'total_net_value': 177}) + 'recipient_id': '67661714000111', + 'recipient': 'B Restaurant', + 'net_value': 177}) X = pd.DataFrame().append(row, ignore_index=True) prediction = self.subject.predict(X) self.assertEqual(1, prediction[0]) @@ -45,9 +45,9 @@ def test_predict_inlier_for_common_cnpjs_when_value_is_less_than_mean_plus_3_std def test_predict_outlier_for_non_common_cnpjs_when_value_is_greater_than_mean_plus_4_stds(self): row = pd.Series({'applicant_id': 444, 'subquota_description': 'Congressperson meal', - 'cnpj_cpf': '22412242000125', - 'supplier': 'D Restaurant', - 'total_net_value': 178}) + 'recipient_id': '22412242000125', + 'recipient': 'D Restaurant', + 'net_value': 178}) X = pd.DataFrame().append(row, ignore_index=True) prediction = self.subject.predict(X) self.assertEqual(-1, prediction[0]) @@ -55,9 +55,9 @@ def test_predict_outlier_for_non_common_cnpjs_when_value_is_greater_than_mean_pl def test_predict_inlier_for_non_common_cnpjs_when_value_is_less_than_mean_plus_4_stds(self): row = pd.Series({'applicant_id': 444, 'subquota_description': 'Congressperson meal', - 'cnpj_cpf': '22412242000125', - 'supplier': 'D Restaurant', - 'total_net_value': 177}) + 'recipient_id': '22412242000125', + 'recipient': 'D Restaurant', + 'net_value': 177}) X = 
pd.DataFrame().append(row, ignore_index=True) prediction = self.subject.predict(X) self.assertEqual(1, prediction[0]) diff --git a/rosie/chamber_of_deputies/tests/test_traveled_speeds_classifier.py b/rosie/chamber_of_deputies/tests/test_traveled_speeds_classifier.py index 3a25bd9bc..d0fab5c33 100644 --- a/rosie/chamber_of_deputies/tests/test_traveled_speeds_classifier.py +++ b/rosie/chamber_of_deputies/tests/test_traveled_speeds_classifier.py @@ -12,7 +12,7 @@ class TestTraveledSpeedsClassifier(TestCase): def setUp(self): self.dataset = pd.read_csv('rosie/chamber_of_deputies/tests/fixtures/traveled_speeds_classifier.csv', - dtype={'cnpj_cpf': np.str}) + dtype={'recipient_id': np.str}) self.subject = TraveledSpeedsClassifier() self.subject.fit(self.dataset) diff --git a/rosie/core/classifiers/invalid_cnpj_cpf_classifier.py b/rosie/core/classifiers/invalid_cnpj_cpf_classifier.py index fa2b20b0e..4fa92a935 100644 --- a/rosie/core/classifiers/invalid_cnpj_cpf_classifier.py +++ b/rosie/core/classifiers/invalid_cnpj_cpf_classifier.py @@ -17,4 +17,4 @@ def predict(self, X): return np.r_[X.apply(self.__is_invalid, axis=1)] def __is_invalid(self, row): - return (row['document_type'] in [0, 1]) & (not cpfcnpj.validate(str(row['cnpj_cpf']))) + return (row['document_type'] in [0, 1]) & (not cpfcnpj.validate(str(row['recipient_id']))) diff --git a/rosie/core/tests/fixtures/invalid_cnpj_cpf_classifier.csv b/rosie/core/tests/fixtures/invalid_cnpj_cpf_classifier.csv index 1fca62d2a..0ffccbc8f 100644 --- a/rosie/core/tests/fixtures/invalid_cnpj_cpf_classifier.csv +++ b/rosie/core/tests/fixtures/invalid_cnpj_cpf_classifier.csv @@ -1,4 +1,4 @@ -cnpj_cpf,document_type +recipient_id,document_type 22472225000183,0 22472225000180,0 ,0 diff --git a/rosie/core/tests/test_invalid_cnpj_cpf_classifier.py b/rosie/core/tests/test_invalid_cnpj_cpf_classifier.py index 614e2abc0..ed8eef160 100644 --- a/rosie/core/tests/test_invalid_cnpj_cpf_classifier.py +++ 
b/rosie/core/tests/test_invalid_cnpj_cpf_classifier.py @@ -10,7 +10,7 @@ class TestInvalidCnpjCpfClassifier(TestCase): def setUp(self): self.dataset = pd.read_csv('rosie/core/tests/fixtures/invalid_cnpj_cpf_classifier.csv', - dtype={'cnpj_cpf': np.str}) + dtype={'recipient_id': np.str}) self.subject = InvalidCnpjCpfClassifier() def test_is_valid_cnpj(self): From 8403b3fe04b1dc1006cb5ffccd86d7ab46db31ab Mon Sep 17 00:00:00 2001 From: Irio Musskopf Date: Sun, 30 Apr 2017 18:45:16 +0200 Subject: [PATCH 02/11] Reduce memory usage by reusing the same memory space for operations About the same time to run, but using 1GB less of memory. **Before** In [2]: %time data = Adapter('/Users/irio/Code/serenata/research/data').dataset CPU times: user 37.4 s, sys: 8.73 s, total: 46.2 s Wall time: 46.9 s In [5]: %memit -r 5 data = Adapter('/Users/irio/Code/serenata/research/data').dataset peak memory: 10626.52 MiB, increment: 6682.10 MiB **After** %time data = Adapter('/Users/irio/Code/serenata/research/data').dataset CPU times: user 39 s, sys: 10.8 s, total: 49.9 s Wall time: 51.6 s %memit -r 5 data = Adapter('/Users/irio/Code/serenata/research/data').dataset peak memory: 11398.68 MiB, increment: 7787.70 MiB --- rosie/chamber_of_deputies/adapter.py | 39 ++++++++++++++-------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/rosie/chamber_of_deputies/adapter.py b/rosie/chamber_of_deputies/adapter.py index f37b07f82..76819b754 100644 --- a/rosie/chamber_of_deputies/adapter.py +++ b/rosie/chamber_of_deputies/adapter.py @@ -21,12 +21,12 @@ def __init__(self, path): @property def dataset(self): self.update_datasets() - reimbursements = self.get_reimbursements() + self.get_reimbursements() companies = self.get_companies() - self._dataset = pd.merge(reimbursements, companies, - how='left', - left_on='cnpj_cpf', - right_on='cnpj') + self._dataset = self._dataset.merge(companies, + how='left', + left_on='cnpj_cpf', + right_on='cnpj') self.prepare_dataset() return 
self._dataset @@ -44,22 +44,21 @@ def update_datasets(self): fetch(self.COMPANIES_DATASET, self.path) def get_reimbursements(self): - dataset = \ - pd.read_csv(os.path.join(self.path, 'reimbursements.xz'), - dtype={'applicant_id': np.str, - 'cnpj_cpf': np.str, - 'congressperson_id': np.str, - 'subquota_number': np.str}, - low_memory=False) - dataset['issue_date'] = pd.to_datetime(dataset['issue_date'], - errors='coerce') - return dataset + path = os.path.join(self.path, 'reimbursements.xz') + self._dataset = pd.read_csv(path, + dtype={'applicant_id': np.str, + 'cnpj_cpf': np.str, + 'congressperson_id': np.str, + 'subquota_number': np.str}, + low_memory=False) + self._dataset['issue_date'] = pd.to_datetime( + self._dataset['issue_date'], errors='coerce') + return self._dataset def get_companies(self): - dataset = pd.read_csv(os.path.join(self.path, self.COMPANIES_DATASET), - dtype={'cnpj': np.str}, - low_memory=False) + path = os.path.join(self.path, self.COMPANIES_DATASET) + dataset = pd.read_csv(path, dtype={'cnpj': np.str}, low_memory=False) dataset['cnpj'] = dataset['cnpj'].str.replace(r'\D', '') - dataset['situation_date'] = pd.to_datetime(dataset['situation_date'], - errors='coerce') + dataset['situation_date'] = pd.to_datetime( + dataset['situation_date'], errors='coerce') return dataset From 90f58f337aa33fe368dcdc1f656b31f41be37d50 Mon Sep 17 00:00:00 2001 From: Ana Schwendler Date: Sun, 30 Apr 2017 20:04:22 +0200 Subject: [PATCH 03/11] Defining an interface for invalid cnpj/cpf --- rosie/chamber_of_deputies/adapter.py | 4 ++++ .../tests/fixtures/reimbursements.xz | Bin 884 -> 888 bytes .../invalid_cnpj_cpf_classifier.py | 2 +- .../fixtures/invalid_cnpj_cpf_classifier.csv | 18 +++++++++--------- 4 files changed, 14 insertions(+), 10 deletions(-) diff --git a/rosie/chamber_of_deputies/adapter.py b/rosie/chamber_of_deputies/adapter.py index 76819b754..6c4755395 100644 --- a/rosie/chamber_of_deputies/adapter.py +++ b/rosie/chamber_of_deputies/adapter.py @@ 
-33,6 +33,10 @@ def dataset(self): def prepare_dataset(self): columns = {v: k for k, v in COLUMNS.items()} self._dataset.rename(columns=columns, inplace=True) + self._dataset['document_type'].replace({3: None}, inplace=True) + self._dataset['document_type'] = self._dataset['document_type'].astype('category') + types = ['bill_of_sale', 'simple_receipt', 'expense_made_abroad'] + self._dataset['document_type'].cat.rename_categories(types, inplace=True) def update_datasets(self): os.makedirs(self.path, exist_ok=True) diff --git a/rosie/chamber_of_deputies/tests/fixtures/reimbursements.xz b/rosie/chamber_of_deputies/tests/fixtures/reimbursements.xz index 06bf968fa470701a604158eb9d2dc49543ccafc6..ae2b55f93c48939a854add433e3102b99f95b5ae 100644 GIT binary patch delta 355 zcmV-p0i6Ey2KWY$8UrT-II$fG0)Nho=B8o=_ag{aF@LTOwqTdMsy-wv?z37URVRu> zGeGxAo#WubowLIZhz5v-AWcZb@RX+YVlLYltozRBZ)Ocla4u+X~t?~{1Mp?_Fx;!vWZ z8w<3|b=x((!xy%KbykeOe6u(0OJ7Xo92W4qe%+OpPAKC^H$I8eeu`KGqQ+{Z7aI{R z8BTN25)V>F`ZZ-Nc5Al0_y@7^S9fru{7kHagV*#)ckIw}9`h-386@=#!7aLS78)hj z^XWVa?*DfsnA$S3m(@ukLR6B#O1QIAHZkTcC~Xk&Y8_+H=NVtETZrvgzU99!p2Frj z!4p#`AcQ3Nc|Nb4jMu+Puss+OD8oFaFu_qkB!FaQ0g zv1D6d0bQeL!$5jH>18&8QS444nZ*{v(w`GDH$i2-Ah=yHYQ*3Ch4*%3Fu4KcffoX} zAXdc>Xfd_(UvyjHcC3I7-%uE`08%CVPIhKL`6ON7bg%?2*VLJf x9J?qceT}sRG%bPv000 Date: Mon, 1 May 2017 12:10:10 +0200 Subject: [PATCH 04/11] Rename test file to reflect class under test --- .../tests/{test_dataset.py => test_adapter.py} | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) rename rosie/chamber_of_deputies/tests/{test_dataset.py => test_adapter.py} (80%) diff --git a/rosie/chamber_of_deputies/tests/test_dataset.py b/rosie/chamber_of_deputies/tests/test_adapter.py similarity index 80% rename from rosie/chamber_of_deputies/tests/test_dataset.py rename to rosie/chamber_of_deputies/tests/test_adapter.py index e5c81ee92..c36d4b5ae 100644 --- a/rosie/chamber_of_deputies/tests/test_dataset.py +++ b/rosie/chamber_of_deputies/tests/test_adapter.py @@ -5,21 
+5,21 @@ from unittest.mock import patch from shutil import copy2 -from rosie.chamber_of_deputies.adapter import Adapter +from rosie.chamber_of_deputies.adapter import Adapter as subject_class -class TestDataset(TestCase): +class TestAdapter(TestCase): def setUp(self): self.temp_path = mkdtemp() fixtures = os.path.join('rosie', 'chamber_of_deputies', 'tests', 'fixtures') copies = ( - ('companies.xz', Adapter.COMPANIES_DATASET), + ('companies.xz', subject_class.COMPANIES_DATASET), ('reimbursements.xz', 'reimbursements.xz') ) for source, target in copies: copy2(os.path.join(fixtures, source), os.path.join(self.temp_path, target)) - self.subject = Adapter(self.temp_path) + self.subject = subject_class(self.temp_path) def tearDown(self): shutil.rmtree(self.temp_path) From 81ae4ab9b8a27d223e3f0b1a8bd525b688306c9d Mon Sep 17 00:00:00 2001 From: Irio Musskopf Date: Mon, 1 May 2017 12:47:11 +0200 Subject: [PATCH 05/11] Rename category and values for document_type in chamber_of_deputies --- rosie/chamber_of_deputies/adapter.py | 21 +++++++++++---- .../chamber_of_deputies/tests/test_adapter.py | 26 +++++++++++++++++-- 2 files changed, 40 insertions(+), 7 deletions(-) diff --git a/rosie/chamber_of_deputies/adapter.py b/rosie/chamber_of_deputies/adapter.py index 6c4755395..d471c0d80 100644 --- a/rosie/chamber_of_deputies/adapter.py +++ b/rosie/chamber_of_deputies/adapter.py @@ -7,11 +7,12 @@ COLUMNS = { - 'net_value': 'total_net_value', - 'recipient_id': 'cnpj_cpf', - 'recipient': 'supplier', + 'net_value': 'total_net_value', + 'recipient_id': 'cnpj_cpf', + 'recipient': 'supplier', } + class Adapter: COMPANIES_DATASET = '2016-09-03-companies.xz' @@ -31,12 +32,22 @@ def dataset(self): return self._dataset def prepare_dataset(self): + self.rename_columns() + self.rename_categories() + + def rename_columns(self): columns = {v: k for k, v in COLUMNS.items()} self._dataset.rename(columns=columns, inplace=True) + + def rename_categories(self): 
self._dataset['document_type'].replace({3: None}, inplace=True) - self._dataset['document_type'] = self._dataset['document_type'].astype('category') + self._dataset['document_type'] = self._dataset['document_type'].astype( + 'category') types = ['bill_of_sale', 'simple_receipt', 'expense_made_abroad'] - self._dataset['document_type'].cat.rename_categories(types, inplace=True) + self._dataset['document_type'].cat.rename_categories( + types, inplace=True) + meal_rows = self._dataset['subquota_description'] == 'Congressperson meal' + self._dataset.loc[meal_rows, 'subquota_description'] = 'Meal' def update_datasets(self): os.makedirs(self.path, exist_ok=True) diff --git a/rosie/chamber_of_deputies/tests/test_adapter.py b/rosie/chamber_of_deputies/tests/test_adapter.py index c36d4b5ae..9398526ae 100644 --- a/rosie/chamber_of_deputies/tests/test_adapter.py +++ b/rosie/chamber_of_deputies/tests/test_adapter.py @@ -5,20 +5,23 @@ from unittest.mock import patch from shutil import copy2 +import pandas as pd + from rosie.chamber_of_deputies.adapter import Adapter as subject_class +from rosie.chamber_of_deputies.adapter import COLUMNS as ADAPTER_COLUMNS class TestAdapter(TestCase): def setUp(self): self.temp_path = mkdtemp() - fixtures = os.path.join('rosie', 'chamber_of_deputies', 'tests', 'fixtures') + self.fixtures_path = os.path.join('rosie', 'chamber_of_deputies', 'tests', 'fixtures') copies = ( ('companies.xz', subject_class.COMPANIES_DATASET), ('reimbursements.xz', 'reimbursements.xz') ) for source, target in copies: - copy2(os.path.join(fixtures, source), os.path.join(self.temp_path, target)) + copy2(os.path.join(self.fixtures_path, source), os.path.join(self.temp_path, target)) self.subject = subject_class(self.temp_path) def tearDown(self): @@ -29,3 +32,22 @@ def tearDown(self): def test_get_performs_a_left_merge_between_reimbursements_and_companies(self, fetch, ceap): self.assertEqual(5, len(self.subject.dataset)) self.assertEqual(1, 
self.subject.dataset['legal_entity'].isnull().sum()) + + @patch('rosie.chamber_of_deputies.adapter.CEAPDataset') + @patch('rosie.chamber_of_deputies.adapter.fetch') + def test_prepare_dataset(self, fetch, ceap): + """ + * Rename columns. + * Make `document_type` a category column. + * Rename values for subquota_description. + """ + dataset = self.subject.dataset + self.assertTrue(set(ADAPTER_COLUMNS.keys()).issubset(set(dataset.columns))) + document_types = ['bill_of_sale', 'simple_receipt', 'expense_made_abroad'] + self.assertEqual(document_types, + dataset['document_type'].cat.categories.tolist()) + fixture = pd.read_csv(os.path.join(self.fixtures_path, 'reimbursements.xz')) + meal_rows = fixture \ + .query('subquota_description == "Congressperson meal"')['subquota_description'].index + self.assertEqual(['Meal'], + dataset.loc[meal_rows, 'subquota_description'].unique().tolist()) From 0199ea6a73b1204acea3343d8a94b7c36385d727 Mon Sep 17 00:00:00 2001 From: Irio Musskopf Date: Mon, 1 May 2017 18:21:01 +0200 Subject: [PATCH 06/11] Rename subquota_description column to category --- rosie/chamber_of_deputies/adapter.py | 5 +- .../meal_price_outlier_classifier.py | 2 +- .../classifiers/traveled_speeds_classifier.py | 2 +- .../meal_price_outlier_classifier.csv | 160 +++++++++--------- .../tests/fixtures/reimbursements.xz | Bin 888 -> 888 bytes .../fixtures/traveled_speeds_classifier.csv | 58 +++---- .../chamber_of_deputies/tests/test_adapter.py | 8 +- .../tests/test_chamber_of_deputies.py | 4 - .../test_meal_price_outlier_classifier.py | 8 +- 9 files changed, 122 insertions(+), 125 deletions(-) diff --git a/rosie/chamber_of_deputies/adapter.py b/rosie/chamber_of_deputies/adapter.py index d471c0d80..280ec56e0 100644 --- a/rosie/chamber_of_deputies/adapter.py +++ b/rosie/chamber_of_deputies/adapter.py @@ -7,6 +7,7 @@ COLUMNS = { + 'category': 'subquota_description', 'net_value': 'total_net_value', 'recipient_id': 'cnpj_cpf', 'recipient': 'supplier', @@ -46,8 +47,8 @@ def 
rename_categories(self): types = ['bill_of_sale', 'simple_receipt', 'expense_made_abroad'] self._dataset['document_type'].cat.rename_categories( types, inplace=True) - meal_rows = self._dataset['subquota_description'] == 'Congressperson meal' - self._dataset.loc[meal_rows, 'subquota_description'] = 'Meal' + self._dataset['category'] = self._dataset['category'].replace( + {'Congressperson meal': 'Meal'}) def update_datasets(self): os.makedirs(self.path, exist_ok=True) diff --git a/rosie/chamber_of_deputies/classifiers/meal_price_outlier_classifier.py b/rosie/chamber_of_deputies/classifiers/meal_price_outlier_classifier.py index 6d6aa2845..508053977 100644 --- a/rosie/chamber_of_deputies/classifiers/meal_price_outlier_classifier.py +++ b/rosie/chamber_of_deputies/classifiers/meal_price_outlier_classifier.py @@ -60,7 +60,7 @@ def predict(self, X): return _X['y'] def __applicable_rows(self, X): - return (X['subquota_description'] == 'Congressperson meal') & \ + return (X['category'] == 'Meal') & \ (X['recipient_id'].str.len() == 14) & \ (~X['recipient'].apply(self.__normalize_string).str.contains(self.HOTEL_REGEX)) diff --git a/rosie/chamber_of_deputies/classifiers/traveled_speeds_classifier.py b/rosie/chamber_of_deputies/classifiers/traveled_speeds_classifier.py index 9b222db47..e8f9d0b61 100644 --- a/rosie/chamber_of_deputies/classifiers/traveled_speeds_classifier.py +++ b/rosie/chamber_of_deputies/classifiers/traveled_speeds_classifier.py @@ -61,7 +61,7 @@ def __classify_dataset(self, X): return X def __applicable_rows(self, X): - return (X['subquota_description'] == 'Congressperson meal') & \ + return (X['category'] == 'Meal') & \ (-73.992222 < X['longitude']) & (X['longitude'] < -34.7916667) & \ (-33.742222 < X['latitude']) & (X['latitude'] < 5.2722222) & \ X[['congressperson_id', 'latitude', 'longitude']].notnull().all(axis=1) diff --git a/rosie/chamber_of_deputies/tests/fixtures/meal_price_outlier_classifier.csv 
b/rosie/chamber_of_deputies/tests/fixtures/meal_price_outlier_classifier.csv index d9e89d379..b9c5f3b0f 100644 --- a/rosie/chamber_of_deputies/tests/fixtures/meal_price_outlier_classifier.csv +++ b/rosie/chamber_of_deputies/tests/fixtures/meal_price_outlier_classifier.csv @@ -1,82 +1,82 @@ -applicant_id,subquota_description,recipient_id,recipient,net_value -999,Congressperson meal,47667414122,João da Silva,9999999 -999,Congressperson meal,24624607000100,Hotel X,30 -999,Congressperson meal,24624607000101,Y Hotels,30 -999,Congressperson meal,24624607000102,X hotéis,30 -999,Congressperson meal,24624607000103,Hotel X,9999999 -999,Congressperson meal,24624607000104,Y Hotels,9999999 -999,Congressperson meal,24624607000105,X hotéis,9999999 -111,Congressperson meal,08378940000120,A Restaurant,3 -111,Congressperson meal,08378940000120,A Restaurant,30 -222,Congressperson meal,08378940000120,A Restaurant,40 -333,Congressperson meal,08378940000120,A Restaurant,50 -444,Congressperson meal,08378940000120,A Restaurant,60 -444,Congressperson meal,08378940000120,A Restaurant,70 -444,Congressperson meal,08378940000120,A Restaurant,50 -444,Congressperson meal,08378940000120,A Restaurant,50 -444,Congressperson meal,08378940000120,A Restaurant,50 -444,Congressperson meal,08378940000120,A Restaurant,50 -444,Congressperson meal,08378940000120,A Restaurant,50 -444,Congressperson meal,08378940000120,A Restaurant,50 -444,Congressperson meal,08378940000120,A Restaurant,50 -444,Congressperson meal,08378940000120,A Restaurant,50 -444,Congressperson meal,08378940000120,A Restaurant,50 -444,Congressperson meal,08378940000120,A Restaurant,50 -444,Congressperson meal,08378940000120,A Restaurant,50 -444,Congressperson meal,08378940000120,A Restaurant,50 -444,Congressperson meal,08378940000120,A Restaurant,50 -444,Congressperson meal,08378940000120,A Restaurant,50 -444,Congressperson meal,08378940000120,A Restaurant,50 -444,Congressperson meal,08378940000120,A Restaurant,50 -111,Congressperson 
meal,08378940000120,A Restaurant,900 -111,Congressperson meal,08378940000120,A Restaurant,80 -222,Congressperson meal,08378940000120,A Restaurant,70 -333,Congressperson meal,08378940000120,A Restaurant,60 -444,Congressperson meal,08378940000120,A Restaurant,70 -444,Congressperson meal,08378940000120,A Restaurant,80 -444,Congressperson meal,08378940000120,A Restaurant,90 -111,Congressperson meal,67661714000111,B Restaurant,130 -222,Congressperson meal,67661714000111,B Restaurant,140 -333,Congressperson meal,67661714000111,B Restaurant,150 -444,Congressperson meal,67661714000111,B Restaurant,160 -444,Congressperson meal,67661714000111,B Restaurant,170 -444,Congressperson meal,67661714000111,B Restaurant,150 -444,Congressperson meal,67661714000111,B Restaurant,150 -444,Congressperson meal,67661714000111,B Restaurant,150 -444,Congressperson meal,67661714000111,B Restaurant,150 -444,Congressperson meal,67661714000111,B Restaurant,150 -444,Congressperson meal,67661714000111,B Restaurant,150 -444,Congressperson meal,67661714000111,B Restaurant,150 -444,Congressperson meal,67661714000111,B Restaurant,150 -444,Congressperson meal,67661714000111,B Restaurant,150 -444,Congressperson meal,67661714000111,B Restaurant,150 -444,Congressperson meal,67661714000111,B Restaurant,150 -444,Congressperson meal,67661714000111,B Restaurant,150 -444,Congressperson meal,67661714000111,B Restaurant,150 -444,Congressperson meal,67661714000111,B Restaurant,150 -444,Congressperson meal,67661714000111,B Restaurant,150 -444,Congressperson meal,67661714000111,B Restaurant,150 -111,Congressperson meal,81387409000104,C Restaurant,330 -222,Congressperson meal,81387409000104,C Restaurant,340 -333,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson meal,81387409000104,C Restaurant,360 -444,Congressperson meal,81387409000104,C Restaurant,370 -444,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson 
meal,81387409000104,C Restaurant,350 -444,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson meal,81387409000104,C Restaurant,350 -444,Congressperson meal,22472225000183,GOL,400 +applicant_id,category,recipient_id,recipient,net_value +999,Meal,47667414122,João da Silva,9999999 +999,Meal,24624607000100,Hotel X,30 +999,Meal,24624607000101,Y Hotels,30 +999,Meal,24624607000102,X hotéis,30 +999,Meal,24624607000103,Hotel X,9999999 +999,Meal,24624607000104,Y Hotels,9999999 +999,Meal,24624607000105,X hotéis,9999999 +111,Meal,08378940000120,A Restaurant,3 +111,Meal,08378940000120,A Restaurant,30 +222,Meal,08378940000120,A Restaurant,40 +333,Meal,08378940000120,A Restaurant,50 +444,Meal,08378940000120,A Restaurant,60 +444,Meal,08378940000120,A Restaurant,70 +444,Meal,08378940000120,A Restaurant,50 +444,Meal,08378940000120,A Restaurant,50 +444,Meal,08378940000120,A Restaurant,50 +444,Meal,08378940000120,A Restaurant,50 +444,Meal,08378940000120,A Restaurant,50 +444,Meal,08378940000120,A Restaurant,50 +444,Meal,08378940000120,A Restaurant,50 +444,Meal,08378940000120,A Restaurant,50 +444,Meal,08378940000120,A Restaurant,50 +444,Meal,08378940000120,A Restaurant,50 +444,Meal,08378940000120,A Restaurant,50 +444,Meal,08378940000120,A Restaurant,50 +444,Meal,08378940000120,A Restaurant,50 +444,Meal,08378940000120,A Restaurant,50 
+444,Meal,08378940000120,A Restaurant,50 +444,Meal,08378940000120,A Restaurant,50 +111,Meal,08378940000120,A Restaurant,900 +111,Meal,08378940000120,A Restaurant,80 +222,Meal,08378940000120,A Restaurant,70 +333,Meal,08378940000120,A Restaurant,60 +444,Meal,08378940000120,A Restaurant,70 +444,Meal,08378940000120,A Restaurant,80 +444,Meal,08378940000120,A Restaurant,90 +111,Meal,67661714000111,B Restaurant,130 +222,Meal,67661714000111,B Restaurant,140 +333,Meal,67661714000111,B Restaurant,150 +444,Meal,67661714000111,B Restaurant,160 +444,Meal,67661714000111,B Restaurant,170 +444,Meal,67661714000111,B Restaurant,150 +444,Meal,67661714000111,B Restaurant,150 +444,Meal,67661714000111,B Restaurant,150 +444,Meal,67661714000111,B Restaurant,150 +444,Meal,67661714000111,B Restaurant,150 +444,Meal,67661714000111,B Restaurant,150 +444,Meal,67661714000111,B Restaurant,150 +444,Meal,67661714000111,B Restaurant,150 +444,Meal,67661714000111,B Restaurant,150 +444,Meal,67661714000111,B Restaurant,150 +444,Meal,67661714000111,B Restaurant,150 +444,Meal,67661714000111,B Restaurant,150 +444,Meal,67661714000111,B Restaurant,150 +444,Meal,67661714000111,B Restaurant,150 +444,Meal,67661714000111,B Restaurant,150 +444,Meal,67661714000111,B Restaurant,150 +111,Meal,81387409000104,C Restaurant,330 +222,Meal,81387409000104,C Restaurant,340 +333,Meal,81387409000104,C Restaurant,350 +444,Meal,81387409000104,C Restaurant,360 +444,Meal,81387409000104,C Restaurant,370 +444,Meal,81387409000104,C Restaurant,350 +444,Meal,81387409000104,C Restaurant,350 +444,Meal,81387409000104,C Restaurant,350 +444,Meal,81387409000104,C Restaurant,350 +444,Meal,81387409000104,C Restaurant,350 +444,Meal,81387409000104,C Restaurant,350 +444,Meal,81387409000104,C Restaurant,350 +444,Meal,81387409000104,C Restaurant,350 +444,Meal,81387409000104,C Restaurant,350 +444,Meal,81387409000104,C Restaurant,350 +444,Meal,81387409000104,C Restaurant,350 +444,Meal,81387409000104,C Restaurant,350 +444,Meal,81387409000104,C 
Restaurant,350 +444,Meal,81387409000104,C Restaurant,350 +444,Meal,81387409000104,C Restaurant,350 +444,Meal,81387409000104,C Restaurant,350 +444,Meal,22472225000183,GOL,400 444,Flight ticket issue,22472225000183,GOL,9999999 444,Flight ticket issue,22472225000183,GOL,5 diff --git a/rosie/chamber_of_deputies/tests/fixtures/reimbursements.xz b/rosie/chamber_of_deputies/tests/fixtures/reimbursements.xz index ae2b55f93c48939a854add433e3102b99f95b5ae..86bf1067be0ac9d8c7e2ca9c8fbbd95063361cf1 100644 GIT binary patch delta 503 zcmV=`61zI!q~GS? z5~X>^gYlW(*c9%(D9b9L@{@TJgAa!gL}vUtHpbW4yW*TCp7Jhrg?aKj+cFh*6X~;g z&8rx}x+g*8zWL)U8=rMCbv^_z+q;yZO~%L$)p#CL?c{Yx<@RnlP)olcg)_tNU{29PH%p$bcwzwEagAc__i_f-(a zPhG9Xz5B@SWyY|Bb6Bof?9StkCPy34goJ*4hOAFBlE>B0Hf8gziF!KCVaQeHD=|fx zRTs?ry<4d=w%@oV*2dn9Q-An=aCkh`gH;lmOso9}RLRnD97-iA)KU{C=+%d6{ZBDa z_yUDG)8kuBxEo9~ed-$tIy_#ApLtHzq;?c3IfF9)&;FMlWqU$;4ZV{)oG#vWfGZG& z-rRG-x{(n%&$gVe2XI%?(wbv#W-HnQg20%T2b0e3k8db|8Fa+d$7qasT!6sO8XG(Z zT|VLC+L5%Sl;b18xbEEp5QC^bQLtiL{-`!RN?|PkJJyUr7>L@J;7GCMDe>0O$?EB6 t@$W)-tEP`qPD}rlAQ?&1ix#gWsx!v zH2EzTr4}z)X+cs$LDWX-bm7wh=c}9?z2;ngn6xE83%P2v_%RGeGxAo#Wubow zLIZhz5v-AWcZb@RX+YVlLVudR>%Ph3f=m^{HL%dS`tOr?#GzPh;!vWZ8w<3|b=x(( z!xy%KbykeOe6u(0OJ7Xo92W4qe%+OpPAKC^H$I8eeu`KGqQ+{Z7aI{R8BTN25)V>F z`ZZ-Nc5Al0_y@7^S9fru{7kHagV*#)ckIw}9`h-386@=#!7aLS7HAqJ*YoK-3hw`R zC79YWvX|9KB0`eDO1QIAHZkTcC~Xk&Y8_+H=NVtETZrvgzU99!p2Frj!4p#`Ac Date: Mon, 1 May 2017 18:21:34 +0200 Subject: [PATCH 07/11] Create `is_party_expense` column in chamber_of_deputies --- rosie/chamber_of_deputies/adapter.py | 2 ++ rosie/chamber_of_deputies/tests/test_adapter.py | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/rosie/chamber_of_deputies/adapter.py b/rosie/chamber_of_deputies/adapter.py index 280ec56e0..0a1d9b0b8 100644 --- a/rosie/chamber_of_deputies/adapter.py +++ b/rosie/chamber_of_deputies/adapter.py @@ -49,6 +49,8 @@ def rename_categories(self): types, inplace=True) self._dataset['category'] = self._dataset['category'].replace( 
{'Congressperson meal': 'Meal'}) + self._dataset['is_party_expense'] = \ + self._dataset['congressperson_id'].isnull() def update_datasets(self): os.makedirs(self.path, exist_ok=True) diff --git a/rosie/chamber_of_deputies/tests/test_adapter.py b/rosie/chamber_of_deputies/tests/test_adapter.py index b9f72f6d8..d5d3d123b 100644 --- a/rosie/chamber_of_deputies/tests/test_adapter.py +++ b/rosie/chamber_of_deputies/tests/test_adapter.py @@ -40,6 +40,7 @@ def test_prepare_dataset(self, fetch, ceap): * Rename columns. * Make `document_type` a category column. * Rename values for `category`. + * Create `is_party_expense` column. """ dataset = self.subject.dataset self.assertTrue(set(ADAPTER_COLUMNS.keys()).issubset(set(dataset.columns))) @@ -51,3 +52,6 @@ def test_prepare_dataset(self, fetch, ceap): .query('subquota_description == "Congressperson meal"').index self.assertEqual(['Meal'], dataset.loc[meal_rows, 'category'].unique().tolist()) + party_expense_rows = fixture[fixture['congressperson_id'].isnull()].index + self.assertEqual([True], + dataset.loc[party_expense_rows, 'is_party_expense'].unique().tolist()) From 600171a318b245be087d6efa33ca093ff32dc7cf Mon Sep 17 00:00:00 2001 From: Irio Musskopf Date: Mon, 1 May 2017 19:28:45 +0200 Subject: [PATCH 08/11] Check for a is_party_expense when executing the Traveled Speeds classifier --- .../classifiers/traveled_speeds_classifier.py | 3 +- .../fixtures/traveled_speeds_classifier.csv | 60 +++++++++---------- 2 files changed, 32 insertions(+), 31 deletions(-) diff --git a/rosie/chamber_of_deputies/classifiers/traveled_speeds_classifier.py b/rosie/chamber_of_deputies/classifiers/traveled_speeds_classifier.py index e8f9d0b61..3d3da945f 100644 --- a/rosie/chamber_of_deputies/classifiers/traveled_speeds_classifier.py +++ b/rosie/chamber_of_deputies/classifiers/traveled_speeds_classifier.py @@ -64,7 +64,8 @@ def __applicable_rows(self, X): return (X['category'] == 'Meal') & \ (-73.992222 < X['longitude']) & (X['longitude'] < 
-34.7916667) & \ (-33.742222 < X['latitude']) & (X['latitude'] < 5.2722222) & \ - X[['congressperson_id', 'latitude', 'longitude']].notnull().all(axis=1) + ~X['is_party_expense'] & \ + X[['latitude', 'longitude']].notnull().all(axis=1) def __calculate_sum_distances(self, X): coordinate_list = X[['latitude', 'longitude']].values diff --git a/rosie/chamber_of_deputies/tests/fixtures/traveled_speeds_classifier.csv b/rosie/chamber_of_deputies/tests/fixtures/traveled_speeds_classifier.csv index 721e964d2..2668c9351 100644 --- a/rosie/chamber_of_deputies/tests/fixtures/traveled_speeds_classifier.csv +++ b/rosie/chamber_of_deputies/tests/fixtures/traveled_speeds_classifier.csv @@ -1,30 +1,30 @@ -applicant_id,congressperson_id,issue_date,category,recipient_id,latitude,longitude -999,999,2016-01-01,Meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-01,Meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-01,Meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-01,Meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-01,Meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-01,Meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-01,Meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-01,Meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-01,Meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-01,Flight ticket issue,08378940000120,-29.2310464,-51.1597365 -999,,2016-01-01,Meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-01,Meal,08378940000120,,-67.8248977 -999,999,2016-01-01,Meal,08378940000120,-9.9753770, -999,999,2016-01-02,Meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-03,Meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-03,Meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-04,Meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-04,Meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-04,Meal,08378940000120,-9.9753770,-67.8248977 
-999,999,2016-01-04,Meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-05,Meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-05,Meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-05,Meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-05,Meal,14047033000100,-10.6519807,-68.4995996 -999,999,2016-01-05,Meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-05,Meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-05,Meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-05,Meal,08378940000120,-9.9753770,-67.8248977 -999,999,2016-01-01,Meal,08378940000120,6.9753770,-33.8248977 +applicant_id,is_party_expense,issue_date,category,recipient_id,latitude,longitude +999,false,2016-01-01,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-01,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-01,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-01,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-01,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-01,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-01,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-01,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-01,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-01,Flight ticket issue,08378940000120,-29.2310464,-51.1597365 +999,true,2016-01-01,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-01,Meal,08378940000120,,-67.8248977 +999,false,2016-01-01,Meal,08378940000120,-9.9753770, +999,false,2016-01-02,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-03,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-03,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-04,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-04,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-04,Meal,08378940000120,-9.9753770,-67.8248977 
+999,false,2016-01-04,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-05,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-05,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-05,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-05,Meal,14047033000100,-10.6519807,-68.4995996 +999,false,2016-01-05,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-05,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-05,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-05,Meal,08378940000120,-9.9753770,-67.8248977 +999,false,2016-01-01,Meal,08378940000120,6.9753770,-33.8248977 From 368c0d839cb9631fbe87424ecf01a33a56851fc4 Mon Sep 17 00:00:00 2001 From: Irio Musskopf Date: Mon, 1 May 2017 19:29:48 +0200 Subject: [PATCH 09/11] Document existing classifiers --- .../election_expenses_classifier.py | 11 ++++++++ .../irregular_companies_classifier.py | 18 +++++++++++++ .../meal_price_outlier_classifier.py | 13 ++++++++++ .../monthly_subquota_limit_classifier.py | 12 +++++++++ .../classifiers/traveled_speeds_classifier.py | 25 +++++++++++++++++++ .../invalid_cnpj_cpf_classifier.py | 17 ++++++++++++- 6 files changed, 95 insertions(+), 1 deletion(-) diff --git a/rosie/chamber_of_deputies/classifiers/election_expenses_classifier.py b/rosie/chamber_of_deputies/classifiers/election_expenses_classifier.py index b625078e7..91a876b05 100644 --- a/rosie/chamber_of_deputies/classifiers/election_expenses_classifier.py +++ b/rosie/chamber_of_deputies/classifiers/election_expenses_classifier.py @@ -5,6 +5,17 @@ class ElectionExpensesClassifier(TransformerMixin): + """ + Election Expenses classifier. + + Check a `legal_entity` field for the presence of the political candidacy + category in the Brazilian Federal Revenue. + + Dataset + ------- + legal_entity : string column + Brazilian Federal Revenue category of companies, preceded by its code.
+ """ def fit(self, X): return self diff --git a/rosie/chamber_of_deputies/classifiers/irregular_companies_classifier.py b/rosie/chamber_of_deputies/classifiers/irregular_companies_classifier.py index 92a117b44..b2574ffc5 100644 --- a/rosie/chamber_of_deputies/classifiers/irregular_companies_classifier.py +++ b/rosie/chamber_of_deputies/classifiers/irregular_companies_classifier.py @@ -5,6 +5,24 @@ class IrregularCompaniesClassifier(TransformerMixin): + """ + Irregular Companies classifier. + + Check for the official state of the company in the + Brazilian Federal Revenue and reports for rows with companies unauthorized + to sell products or services. + + Dataset + ------- + issue_date : datetime column + Date when the expense was made. + + situation : string column + Situation of the company according to the Brazilian Federal Revenue. + + situation_date : datetime column + Date when the situation was last updated. + """ def fit(self, X): return self diff --git a/rosie/chamber_of_deputies/classifiers/meal_price_outlier_classifier.py b/rosie/chamber_of_deputies/classifiers/meal_price_outlier_classifier.py index 508053977..f76b31822 100644 --- a/rosie/chamber_of_deputies/classifiers/meal_price_outlier_classifier.py +++ b/rosie/chamber_of_deputies/classifiers/meal_price_outlier_classifier.py @@ -7,7 +7,20 @@ class MealPriceOutlierClassifier(TransformerMixin): + """ + Meal Price Outlier classifier. + Dataset + ------- + applicant_id : string column + A personal identifier code for every person making expenses. + + net_value : float column + The value of the expense. + + recipient_id : string column + A CNPJ (Brazilian company ID) or CPF (Brazilian personal tax ID). 
+ """ HOTEL_REGEX = r'hote(?:(?:ls?)|is)' CLUSTER_KEYS = ['mean', 'std'] diff --git a/rosie/chamber_of_deputies/classifiers/monthly_subquota_limit_classifier.py b/rosie/chamber_of_deputies/classifiers/monthly_subquota_limit_classifier.py index 1d3d4b9c2..93b991d5d 100644 --- a/rosie/chamber_of_deputies/classifiers/monthly_subquota_limit_classifier.py +++ b/rosie/chamber_of_deputies/classifiers/monthly_subquota_limit_classifier.py @@ -6,6 +6,18 @@ class MonthlySubquotaLimitClassifier(TransformerMixin): + """ + Monthly Subquota Limit classifier. + + Dataset + ------- + issue_date : datetime column + Date when the expense was made. + + net_value : float column + The value of the expense. + """ + KEYS = ['applicant_id', 'month', 'year'] def fit(self, X): diff --git a/rosie/chamber_of_deputies/classifiers/traveled_speeds_classifier.py b/rosie/chamber_of_deputies/classifiers/traveled_speeds_classifier.py index 3d3da945f..2b54c324b 100644 --- a/rosie/chamber_of_deputies/classifiers/traveled_speeds_classifier.py +++ b/rosie/chamber_of_deputies/classifiers/traveled_speeds_classifier.py @@ -8,6 +8,31 @@ class TraveledSpeedsClassifier(TransformerMixin): + """ + Traveled Speeds classifier. + + Dataset + ------- + applicant_id : category column + A personal identifier code for every person making expenses. + + category : category column + Category of the expense. The model will be applied just in rows where + the value is equal to "Meal". + + is_party_expense : bool column + If the row corresponds to a party expense or not. The model will be + applied just in rows where the value is equal to `False`. + + issue_date : datetime column + Date when the expense was made. + + latitude : float column + Latitude of the place where the expense was made. + + longitude : float column + Longitude of the place where the expense was made. 
+ """ AGG_KEYS = ['applicant_id', 'issue_date'] diff --git a/rosie/core/classifiers/invalid_cnpj_cpf_classifier.py b/rosie/core/classifiers/invalid_cnpj_cpf_classifier.py index 62a026953..df2a5fb49 100644 --- a/rosie/core/classifiers/invalid_cnpj_cpf_classifier.py +++ b/rosie/core/classifiers/invalid_cnpj_cpf_classifier.py @@ -6,6 +6,20 @@ class InvalidCnpjCpfClassifier(TransformerMixin): + """ + Invalid CNPJ/CPF classifier. + + Validate a `recipient_id` field by calculating its expected check digit + and verifying the authenticity of the provided ones. + + Dataset + ------- + document_type : category column + Validate rows with values 'bill_of_sale' or 'simple_receipt'. + + recipient_id : string column + A CNPJ (Brazilian company ID) or CPF (Brazilian personal tax ID). + """ def fit(self, X): return self @@ -17,4 +31,5 @@ def predict(self, X): return np.r_[X.apply(self.__is_invalid, axis=1)] def __is_invalid(self, row): - return (row['document_type'] in ['bill_of_sale', 'simple_receipt']) & (not cpfcnpj.validate(str(row['recipient_id']))) + return (row['document_type'] in ['bill_of_sale', 'simple_receipt']) \ + & (not cpfcnpj.validate(str(row['recipient_id']))) From 2558920120e5b604de1a7809d2d677e6c4351eb0 Mon Sep 17 00:00:00 2001 From: Ana Schwendler Date: Thu, 4 May 2017 11:09:25 +0200 Subject: [PATCH 10/11] Document cleansing operations happening in adapter --- rosie/chamber_of_deputies/adapter.py | 2 ++ .../classifiers/meal_price_outlier_classifier.py | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/rosie/chamber_of_deputies/adapter.py b/rosie/chamber_of_deputies/adapter.py index 0a1d9b0b8..bfa1bf04c 100644 --- a/rosie/chamber_of_deputies/adapter.py +++ b/rosie/chamber_of_deputies/adapter.py @@ -41,12 +41,14 @@ def rename_columns(self): self._dataset.rename(columns=columns, inplace=True) def rename_categories(self): + # There's no documented type for `3`, thus we assume it's an input error self._dataset['document_type'].replace({3: None}, inplace=True) 
self._dataset['document_type'] = self._dataset['document_type'].astype( 'category') types = ['bill_of_sale', 'simple_receipt', 'expense_made_abroad'] self._dataset['document_type'].cat.rename_categories( types, inplace=True) + # Classifiers expect a broader category name for meals self._dataset['category'] = self._dataset['category'].replace( {'Congressperson meal': 'Meal'}) self._dataset['is_party_expense'] = \ diff --git a/rosie/chamber_of_deputies/classifiers/meal_price_outlier_classifier.py b/rosie/chamber_of_deputies/classifiers/meal_price_outlier_classifier.py index f76b31822..862027176 100644 --- a/rosie/chamber_of_deputies/classifiers/meal_price_outlier_classifier.py +++ b/rosie/chamber_of_deputies/classifiers/meal_price_outlier_classifier.py @@ -15,6 +15,10 @@ class MealPriceOutlierClassifier(TransformerMixin): applicant_id : string column A personal identifier code for every person making expenses. + category : category column + Category of the expense. The model will be applied just in rows where + the value is equal to "Meal". + net_value : float column The value of the expense.
From fb91f0ea3dbebd179aa39a88826aefb6e4e8a6ec Mon Sep 17 00:00:00 2001 From: Ana Schwendler Date: Thu, 4 May 2017 12:11:32 +0200 Subject: [PATCH 11/11] Prepare rosie for the new `serenata-toolbox` structure --- rosie/chamber_of_deputies/adapter.py | 12 ++++++------ rosie/chamber_of_deputies/tests/test_adapter.py | 8 ++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/rosie/chamber_of_deputies/adapter.py b/rosie/chamber_of_deputies/adapter.py index bfa1bf04c..8c7452c4b 100644 --- a/rosie/chamber_of_deputies/adapter.py +++ b/rosie/chamber_of_deputies/adapter.py @@ -2,7 +2,7 @@ import numpy as np import pandas as pd -from serenata_toolbox.ceap_dataset import CEAPDataset +from serenata_toolbox.chamber_of_deputies.chamber_of_deputies_dataset import ChamberOfDeputiesDataset from serenata_toolbox.datasets import fetch @@ -56,11 +56,11 @@ def rename_categories(self): def update_datasets(self): os.makedirs(self.path, exist_ok=True) - ceap = CEAPDataset(self.path) - ceap.fetch() - ceap.convert_to_csv() - ceap.translate() - ceap.clean() + chamber_of_deputies = ChamberOfDeputiesDataset(self.path) + chamber_of_deputies.fetch() + chamber_of_deputies.convert_to_csv() + chamber_of_deputies.translate() + chamber_of_deputies.clean() fetch(self.COMPANIES_DATASET, self.path) def get_reimbursements(self): diff --git a/rosie/chamber_of_deputies/tests/test_adapter.py b/rosie/chamber_of_deputies/tests/test_adapter.py index d5d3d123b..c738e9efd 100644 --- a/rosie/chamber_of_deputies/tests/test_adapter.py +++ b/rosie/chamber_of_deputies/tests/test_adapter.py @@ -27,15 +27,15 @@ def setUp(self): def tearDown(self): shutil.rmtree(self.temp_path) - @patch('rosie.chamber_of_deputies.adapter.CEAPDataset') + @patch('rosie.chamber_of_deputies.adapter.ChamberOfDeputiesDataset') @patch('rosie.chamber_of_deputies.adapter.fetch') - def test_get_performs_a_left_merge_between_reimbursements_and_companies(self, fetch, ceap): + def 
test_get_performs_a_left_merge_between_reimbursements_and_companies(self, fetch, chamber_of_deputies): self.assertEqual(6, len(self.subject.dataset)) self.assertEqual(1, self.subject.dataset['legal_entity'].isnull().sum()) - @patch('rosie.chamber_of_deputies.adapter.CEAPDataset') + @patch('rosie.chamber_of_deputies.adapter.ChamberOfDeputiesDataset') @patch('rosie.chamber_of_deputies.adapter.fetch') - def test_prepare_dataset(self, fetch, ceap): + def test_prepare_dataset(self, fetch, chamber_of_deputies): """ * Rename columns. * Make `document_type` a category column.