From 368c0d839cb9631fbe87424ecf01a33a56851fc4 Mon Sep 17 00:00:00 2001 From: Irio Musskopf Date: Mon, 1 May 2017 19:29:48 +0200 Subject: [PATCH] Document existing classifiers --- .../election_expenses_classifier.py | 11 ++++++++ .../irregular_companies_classifier.py | 18 +++++++++++++ .../meal_price_outlier_classifier.py | 13 ++++++++++ .../monthly_subquota_limit_classifier.py | 12 +++++++++ .../classifiers/traveled_speeds_classifier.py | 25 +++++++++++++++++++ .../invalid_cnpj_cpf_classifier.py | 17 ++++++++++++- 6 files changed, 95 insertions(+), 1 deletion(-) diff --git a/rosie/chamber_of_deputies/classifiers/election_expenses_classifier.py b/rosie/chamber_of_deputies/classifiers/election_expenses_classifier.py index b625078e7..91a876b05 100644 --- a/rosie/chamber_of_deputies/classifiers/election_expenses_classifier.py +++ b/rosie/chamber_of_deputies/classifiers/election_expenses_classifier.py @@ -5,6 +5,17 @@ class ElectionExpensesClassifier(TransformerMixin): + """ + Election Expenses classifier. + + Check a `legal_entity` field for the presence of the political candidacy + category in the Brazilian Federal Revenue. + + Dataset + ------- + legal_entity : string column + Brazilian Federal Revenue category of companies, preceded by its code. + """ def fit(self, X): return self diff --git a/rosie/chamber_of_deputies/classifiers/irregular_companies_classifier.py b/rosie/chamber_of_deputies/classifiers/irregular_companies_classifier.py index 92a117b44..b2574ffc5 100644 --- a/rosie/chamber_of_deputies/classifiers/irregular_companies_classifier.py +++ b/rosie/chamber_of_deputies/classifiers/irregular_companies_classifier.py @@ -5,6 +5,24 @@ class IrregularCompaniesClassifier(TransformerMixin): + """ + Irregular Companies classifier. + + Check for the official state of the company in the + Brazilian Federal Revenue and report rows with companies unauthorized + to sell products or services. 
+ + Dataset + ------- + issue_date : datetime column + Date when the expense was made. + + situation : string column + Situation of the company according to the Brazilian Federal Revenue. + + situation_date : datetime column + Date when the situation was last updated. + """ def fit(self, X): return self diff --git a/rosie/chamber_of_deputies/classifiers/meal_price_outlier_classifier.py b/rosie/chamber_of_deputies/classifiers/meal_price_outlier_classifier.py index 508053977..f76b31822 100644 --- a/rosie/chamber_of_deputies/classifiers/meal_price_outlier_classifier.py +++ b/rosie/chamber_of_deputies/classifiers/meal_price_outlier_classifier.py @@ -7,7 +7,20 @@ class MealPriceOutlierClassifier(TransformerMixin): + """ + Meal Price Outlier classifier. + Dataset + ------- + applicant_id : string column + A personal identifier code for every person making expenses. + + net_value : float column + The value of the expense. + + recipient_id : string column + A CNPJ (Brazilian company ID) or CPF (Brazilian personal tax ID). + """ HOTEL_REGEX = r'hote(?:(?:ls?)|is)' CLUSTER_KEYS = ['mean', 'std'] diff --git a/rosie/chamber_of_deputies/classifiers/monthly_subquota_limit_classifier.py b/rosie/chamber_of_deputies/classifiers/monthly_subquota_limit_classifier.py index 1d3d4b9c2..93b991d5d 100644 --- a/rosie/chamber_of_deputies/classifiers/monthly_subquota_limit_classifier.py +++ b/rosie/chamber_of_deputies/classifiers/monthly_subquota_limit_classifier.py @@ -6,6 +6,18 @@ class MonthlySubquotaLimitClassifier(TransformerMixin): + """ + Monthly Subquota Limit classifier. + + Dataset + ------- + issue_date : datetime column + Date when the expense was made. + + net_value : float column + The value of the expense. 
+ """ + KEYS = ['applicant_id', 'month', 'year'] def fit(self, X): diff --git a/rosie/chamber_of_deputies/classifiers/traveled_speeds_classifier.py b/rosie/chamber_of_deputies/classifiers/traveled_speeds_classifier.py index 3d3da945f..2b54c324b 100644 --- a/rosie/chamber_of_deputies/classifiers/traveled_speeds_classifier.py +++ b/rosie/chamber_of_deputies/classifiers/traveled_speeds_classifier.py @@ -8,6 +8,31 @@ class TraveledSpeedsClassifier(TransformerMixin): + """ + Traveled Speeds classifier. + + Dataset + ------- + applicant_id : category column + A personal identifier code for every person making expenses. + + category : category column + Category of the expense. The model will be applied only to rows where + the value is equal to "Meal". + + is_party_expense : bool column + If the row corresponds to a party expense or not. The model will be + applied only to rows where the value is equal to `False`. + + issue_date : datetime column + Date when the expense was made. + + latitude : float column + Latitude of the place where the expense was made. + + longitude : float column + Longitude of the place where the expense was made. + """ AGG_KEYS = ['applicant_id', 'issue_date'] diff --git a/rosie/core/classifiers/invalid_cnpj_cpf_classifier.py b/rosie/core/classifiers/invalid_cnpj_cpf_classifier.py index 62a026953..df2a5fb49 100644 --- a/rosie/core/classifiers/invalid_cnpj_cpf_classifier.py +++ b/rosie/core/classifiers/invalid_cnpj_cpf_classifier.py @@ -6,6 +6,20 @@ class InvalidCnpjCpfClassifier(TransformerMixin): + """ + Invalid CNPJ/CPF classifier. + + Validate a `recipient_id` field by calculating its expected check digit + and verifying the authenticity of the provided ones. + + Dataset + ------- + document_type : category column + Validate rows with values 'bill_of_sale' or 'simple_receipt'. + + recipient_id : string column + A CNPJ (Brazilian company ID) or CPF (Brazilian personal tax ID). 
+ """ def fit(self, X): return self @@ -17,4 +31,5 @@ def predict(self, X): return np.r_[X.apply(self.__is_invalid, axis=1)] def __is_invalid(self, row): - return (row['document_type'] in ['bill_of_sale', 'simple_receipt']) & (not cpfcnpj.validate(str(row['recipient_id']))) + return (row['document_type'] in ['bill_of_sale', 'simple_receipt']) \ + & (not cpfcnpj.validate(str(row['recipient_id'])))