diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
index ba862cb17..e7b92b538 100644
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -75,13 +75,17 @@ nav:
   - Defining an Experiment: experiments/defining.md
   - Testing Feature Configuration: experiments/feature-testing.md
   - Running an Experiment: experiments/running.md
-  - Upgrading an Experiment: experiments/upgrading.md
+  - Upgrading an Experiment:
+      to v5: experiments/upgrade-to-v5.md
+      to v6: experiments/upgrade-to-v6.md
+      to v7: experiments/upgrade-to-v7.md
   - Temporal Validation Deep Dive: experiments/temporal-validation.md
   - Cohort and Label Deep Dive: experiments/cohort-labels.md
   - Prediction Ranking: experiments/prediction-ranking.md
   - Feature Generation Recipe Book: experiments/features.md
   - Experiment Algorithm: experiments/algorithm.md
   - Experiment Architecture: experiments/architecture.md
+  - Extending Experiment Features: experiments/extending-features.md
   - Model selection: dirtyduck/docs/audition.md
   - Postmodeling: postmodeling/index.md
   - Model governance: dirtyduck/docs/ml_governance.md
diff --git a/docs/sources/experiments/extending-features.md b/docs/sources/experiments/extending-features.md
new file mode 100644
index 000000000..a3c523ad8
--- /dev/null
+++ b/docs/sources/experiments/extending-features.md
@@ -0,0 +1,145 @@
+# Extending Feature Generation
+
+This document describes how to extend Triage's feature generation capabilities by writing new FeatureBlock classes and incorporating them into Experiments.
+
+## What is a FeatureBlock?
+
+A FeatureBlock represents a single feature table in the database and how to generate it. If you're familiar with `collate` parlance, a `SpacetimeAggregation` is similar in scope to a FeatureBlock. A `FeatureBlock` class can be instantiated with whatever arguments it needs, and from there can provide queries to produce its output feature table. Full-size Triage experiments tend to contain multiple feature blocks. These all live in a collection as the `experiment.feature_blocks` property in the Experiment.
+
+## What existing FeatureBlock classes can I use?
+
+Class name | Experiment config key | Use
+------------ | ------------- | ------------
+triage.component.collate.SpacetimeAggregation | spacetime_aggregation | Temporal aggregations of event-based data
+
+## Writing a new FeatureBlock class
+
+The `FeatureBlock` base class defines a set of abstract methods that any child class must implement, as well as a number of initialization arguments that it must take and implement in order to fulfill the expectations Triage users have of feature generators. Triage expects these classes to define the queries they need to run, as opposed to generating the tables themselves, so that Triage can implement scaling by parallelization.
+
+### Abstract methods
+
+Any method here without parentheses afterwards is expected to be a property.
+
+Method | Task | Return Type
+------------ | ------------- | -------------
+feature_columns | The list of feature columns in the final, postimputation table. Should exclude any index columns (e.g. entity id, date) | list
+preinsert_queries | Return all queries that should be run before inserting any data. The creation of your feature table should happen here; the table is expected to have `entity_id(integer)` and `as_of_date(timestamp)` columns. | list
+insert_queries | Return all inserts to populate this data. Each query in this list should be parallelizable, and should be valid after all `preinsert_queries` are run. | list
+postinsert_queries | Return all queries that should be run after inserting all data | list
+imputation_queries | Return all queries that should be run to fill in missing data with imputed values. | list
+
+Any of the query list properties can be empty: for instance, if your implementation doesn't have inserts separate from table creation and is just one big query (e.g. a `CREATE TABLE AS`), you could just define `preinsert_queries` to be that one mega-query and leave the other properties as empty lists.
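+
+For instance, here is a minimal sketch (the class and column names are hypothetical, not part of Triage) of a block whose entire build is one `CREATE TABLE ... AS` query, with every other property left as an empty list:
+
+```python
+from triage.component.architect.feature_block import FeatureBlock
+
+
+class OneQueryFeature(FeatureBlock):
+    """Sketch: the whole feature table is built by a single pre-insert query."""
+
+    @property
+    def feature_columns(self):
+        return ["dummy_feature"]
+
+    @property
+    def preinsert_queries(self):
+        # One CREATE TABLE AS statement does all of the work
+        return [
+            f"create table {self.final_feature_table_name} as "
+            f"select entity_id, as_of_date, 1.0 as dummy_feature from {self.cohort_table}"
+        ]
+
+    @property
+    def insert_queries(self):
+        return []
+
+    @property
+    def postinsert_queries(self):
+        return []
+
+    @property
+    def imputation_queries(self):
+        return []
+```
+
+Because `imputation_queries` is empty, this sketch assumes the query itself can never produce NULL feature values.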
+
+### Properties Provided by Base Class
+
+The base class provides several attributes/properties that can be used within subclass implementations. Triage experiments take care of providing this data at runtime; if you want to instantiate a FeatureBlock object on your own, you'll have to provide them in the constructor.
+
+Name | Type | Purpose
+------------ | ------------- | -------------
+as_of_dates | list | Features are created "as of" specific dates; each of these dates is expected to be populated with a row for each member of the cohort on that date.
+cohort_table | string | The final shape of the feature table should at least include every entity id/date pair in this cohort table.
+final_feature_table_name | string | The name of the final table with all features filled in (no missing values). This is provided by the user in feature config, as the key that corresponds to the configuration section that instantiates the feature block.
+db_engine | sqlalchemy.engine | The engine to use to access the database. Although these instances mostly return queries, the engine may be useful for implementing imputation.
+features_schema_name | string | The database schema where all feature tables should reside. Defaults to None, which ends up in the public schema.
+feature_start_time | string/datetime | A time before which no data should be considered for features. This is generally only applicable if your FeatureBlock is doing temporal aggregations. Defaults to None, which means no data will be excluded.
+features_ignore_cohort | bool | If False (the default), features are only computed for members of the cohort. If True, the final feature table may include rows for entities outside the cohort.
+
+`FeatureBlock` child classes can, and in almost all cases will, take additional configuration at initialization time that is specific to them. They probably also define many more methods to use internally. But as long as they adhere to this interface, they'll work with Triage.
+
+### Making the new FeatureBlock available to experiments
+
+Triage Experiments run on serializable configuration, and although it's possible to take fully generated `FeatureBlock` instances and bypass this (e.g. `experiment.feature_blocks = `), it's not recommended. The last step is to pick a config key for use within the `features` key of experiment configs, add it to `triage.component.architect.feature_block_generators.FEATURE_BLOCK_GENERATOR_LOOKUP`, and point it at a function that instantiates your objects based on config.
+
+## Example
+
+That's a lot of information! Let's see this in action. Let's say that we want to create a very flexible type of feature that simply runs a configured query with a parametrized as-of-date and returns its result as a feature.
+
+```python
+from triage.component.architect.feature_block import FeatureBlock
+
+
+class SimpleQueryFeature(FeatureBlock):
+    def __init__(self, query, *args, **kwargs):
+        self.query = query
+        super().__init__(*args, **kwargs)
+
+    @property
+    def feature_columns(self):
+        return ['myfeature']
+
+    @property
+    def preinsert_queries(self):
+        return [
+            f"create table {self.final_feature_table_name} "
+            "(entity_id bigint, as_of_date timestamp, myfeature float)"
+        ]
+
+    @property
+    def insert_queries(self):
+        if self.features_ignore_cohort:
+            final_query = self.query
+        else:
+            final_query = f"""
+                select * from ({self.query}) raw
+                join {self.cohort_table} using (entity_id, as_of_date)
+            """
+        return [
+            final_query.format(as_of_date=date)
+            for date in self.as_of_dates
+        ]
+
+    @property
+    def postinsert_queries(self):
+        return [f"create index on {self.final_feature_table_name} (entity_id, as_of_date)"]
+
+    @property
+    def imputation_queries(self):
+        return [f"update {self.final_feature_table_name} set myfeature = 0.0 where myfeature is null"]
+```
+
+This class would allow many different uses: basically any query a user can come up with would be a feature. To instantiate this class outside of Triage with a simple query, you could:
+
+```python
+feature_block = SimpleQueryFeature(
+    query="select entity_id, as_of_date, quantity from source_table where date < '{as_of_date}'",
+    as_of_dates=["2016-01-01"],
+    features_table_name="my_features",
+    cohort_table="my_cohort_table",
+    db_engine=triage.create_engine(<..mydbinfo..>)
+)
+
+feature_block.run_preimputation()
+feature_block.run_imputation()
+```
+
+To use it from a Triage experiment, modify `triage.component.architect.feature_block_generators` and submit a pull request:
+
+Before:
+
+```python
+FEATURE_BLOCK_GENERATOR_LOOKUP = {
+    'spacetime_aggregations': generate_spacetime_aggregations
+}
+```
+
+After:
+
+```python
+FEATURE_BLOCK_GENERATOR_LOOKUP = {
+    'spacetime_aggregations': generate_spacetime_aggregations,
+    'simple_query': SimpleQueryFeature,
+}
+```
+
+At this point, you could use it in an experiment configuration by adding a feature table section and specifying the `feature_generator_type` key to be the name you just put in the lookup, `simple_query`. All other keys/values in that config block will be passed to the constructor of your class. Since the class you defined only takes in one extra keyword argument (the query), the only other key you need to specify in config is that query.
+
+An example:
+
+```yaml
+
+features:
+  my_feature_table:
+    feature_generator_type: "simple_query"
+    query: "select entity_id, as_of_date, quantity from source_table where date < '{as_of_date}'"
+  my_other_feature_table:
+    feature_generator_type: "simple_query"
+    query: "select entity_id, as_of_date, other_quantity from other_source_table where date < '{as_of_date}'"
+```
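+
+To make the wiring concrete, here is a rough sketch (illustrative only, not the exact internal code) of what Triage effectively does with the `my_feature_table` block above: `feature_generator_type` selects the entry from `FEATURE_BLOCK_GENERATOR_LOOKUP`, the block's remaining keys become constructor arguments, and the runtime arguments come from the experiment itself.
+
+```python
+block = SimpleQueryFeature(
+    # from the config block (everything except feature_generator_type)
+    query="select entity_id, as_of_date, quantity from source_table where date < '{as_of_date}'",
+    # supplied by the experiment at runtime (concrete values shown here for illustration)
+    features_table_name="my_feature_table",  # the config key names the output table
+    as_of_dates=["2016-01-01"],
+    cohort_table="my_cohort_table",
+    db_engine=db_engine,  # the experiment's SQLAlchemy engine
+    features_schema_name="features",
+)
+```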
diff --git a/docs/sources/experiments/feature-testing.md b/docs/sources/experiments/feature-testing.md
index 19e70b5a8..3e814d121 100644
--- a/docs/sources/experiments/feature-testing.md
+++ b/docs/sources/experiments/feature-testing.md
@@ -2,26 +2,27 @@
 
 Developing features for Triage experiments can be a daunting task. There are a lot of things to configure, a small amount of configuration can result in a ton of SQL, and it can take a long time to validate your feature configuration in the context of an Experiment being run on real data.
 
-To speed up the process of iterating on features, you can run a list of feature aggregations, without imputation, on just one as-of-date. This functionality can be accessed through the `triage` command line tool or called directly from code (say, in a Jupyter notebook) using the `FeatureGenerator` component.
+To speed up the process of iterating on features, you can run a list of feature aggregations, without imputation, on just one as-of-date. This functionality can be accessed through the `triage` command line tool or called directly from code (say, in a Jupyter notebook) using the `feature_blocks_from_config` utility.
 
 ## Using Triage CLI
-![triage featuretest cli help screen](featuretest-cli.png)
 The command-line interface for testing features takes in two arguments:
- - A feature config file. Refer to [example_feature_config.yaml](https://github.com/dssg/triage/blob/master/example/config/feature.yaml). Essentially this is the content of the [example_experiment_config.yaml](https://github.com/dssg/triage/blob/master/example/config/experiment.yaml)'s `feature_aggregations` section. It consists of a YAML list, with one or more feature_aggregation rows present.
- - An as-of-date. This should be in the format `2016-01-01`.
-Example: `triage experiment featuretest example/config/feature.yaml 2016-01-01`
+- An experiment config file. It should have at least a `features` section, and if a `cohort_config` section is present, it will use that to limit the number of feature rows it creates to the cohort at the given date. Other keys can be in there but are ignored. In other words, you can use your experiment config file before or after it's fully completed.
+- An as-of-date. This should be in the format `2016-01-01`.
+
+Example: `triage experiment featuretest example/config/experiment.yaml 2016-01-01`
 
 All given feature aggregations will be processed for the given date. You will see a bunch of queries pass by in your terminal, populating tables in the `features_test` schema which you can inspect afterwards.
 
 ![triage feature test result](featuretest-result.png)
 
 ## Using Python Code
-If you'd like to call this from a notebook or from any other Python code, the arguments look similar but are a bit different. You have to supply your own sqlalchemy database engine to create a 'FeatureGenerator' object, and then call the `create_features_before_imputation` method with your feature config as a list of dictionaries, along with an as-of-date as a string. Make sure your logging level is set to INFO if you want to see all of the queries.
+If you'd like to call this from a notebook or from any other Python code, the arguments look similar but are a bit different. You have to supply the same arguments plus a few others to the `feature_blocks_from_config` function to create a set of feature blocks, and then call the `run_preimputation` method on each feature block. Make sure your logging level is set to INFO if you want to see all of the queries.
+ ``` -from triage.component.architect.feature_generators import FeatureGenerator +from triage.component.architect.feature_block_generators import feature_blocks_from_config from triage.util.db import create_engine import logging import yaml @@ -32,28 +33,37 @@ logging.basicConfig(level=logging.INFO) db_url = 'your db url here' db_engine = create_engine(db_url) -feature_config = [{ - 'prefix': 'aprefix', - 'aggregates': [ - { - 'quantity': 'quantity_one', - 'metrics': ['sum', 'count'], - ], - 'categoricals': [ - { - 'column': 'cat_one', - 'choices': ['good', 'bad'], - 'metrics': ['sum'] - }, - ], - 'groups': ['entity_id', 'zip_code'], - 'intervals': ['all'], - 'knowledge_date_column': 'knowledge_date', - 'from_obj': 'data' -}] - -FeatureGenerator(db_engine, 'features_test').create_features_before_imputation( - feature_aggregation_config=feature_config, - feature_dates=['2016-01-01'] +feature_config = { + 'myfeaturetable': { + 'feature_generator_type': 'spacetime_aggregation', + 'prefix': 'aprefix', + 'aggregates': [ + { + 'quantity': 'quantity_one', + 'metrics': ['sum', 'count'], + } + ], + 'categoricals': [ + { + 'column': 'cat_one', + 'choices': ['good', 'bad'], + 'metrics': ['sum'] + }, + ], + 'groups': ['entity_id', 'zip_code'], + 'intervals': ['all'], + 'knowledge_date_column': 'knowledge_date', + 'from_obj': 'data' + } +} + +feature_blocks = feature_blocks_from_config( + feature_config, + as_of_dates=['2016-01-01'], + cohort_table=None, + db_engine=db_engine, + features_schema_name="features_test", ) +for feature_block in feature_blocks: + feature_block.run_preimputation(verbose=True) ``` diff --git a/docs/sources/experiments/upgrade-to-v7.md b/docs/sources/experiments/upgrade-to-v7.md new file mode 100644 index 000000000..b6cd38176 --- /dev/null +++ b/docs/sources/experiments/upgrade-to-v7.md @@ -0,0 +1,71 @@ +# Upgrading your experiment configuration to v7 + + +This document details the steps needed to update a triage v6 configuration to +v7, mimicking the old behavior. + +Experiment configuration v7 includes only one change from v6: The features are given at a different key. Instead of `feature_aggregations`, to make space for non-collate features to be added in the future, there is now a more generic `features` key. The value of this key is a dictionary, the key of which is the desired output table name for that feature table, and the value of which is the same as the configuration for each feature aggregation from before. There is one change to this. A new key called 'feature_generator_type', to specify which method is being used to generate this feature table. Since non-collate features have not been added yet, there is only one key for this: `spacetime_aggregation`. + +Since the output feature table name is now configurable, there are two things to note: +- Final tables won't necessarily be suffixed with `_aggregation_imputed` as they were before. If you would like to use the old naming system, for instance to avoid having to change postmodeling code that reads features from the database, you can add that suffix to your table name. The example below does set the table name to match what it was before, but there's no reason you have to follow this if you don't want! You can call the table whatever you want. +- The `prefix` key is no longer used to construct the table name. It is still used to prefix column names, if present. If not present, the name of the feature table will be used. 
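+
+For example, using the column-naming pattern collate already produces (`<prefix>_<group>_<interval>_<quantity>_<metric>`), an aggregate with `quantity: 'quantity_one'`, `metrics: ['sum']`, `groups: ['entity_id']`, `intervals: ['all']`, and `prefix: 'aprefix'` yields a column named:
+
+```
+aprefix_entity_id_all_quantity_one_sum
+```
+
+So keeping the same `prefix` keeps your feature column names stable even if you choose a different output table name.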
+ + + +Old: + +``` +feature_aggregations: + - + prefix: 'prefix' + from_obj: 'cool_stuff' + knowledge_date_column: 'open_date' + aggregates_imputation: + all: + type: 'constant' + value: 0 + aggregates: + - + quantity: 'homeless::INT' + metrics: ['count', 'sum'] + intervals: ['1 year', '2 year'] + groups: ['entity_id'] +``` + +New: + +``` +features: + prefix_aggregation_imputed: + feature_generator_type: 'spacetime_aggregation' + prefix: 'prefix' + from_obj: 'cool_stuff' + knowledge_date_column: 'open_date' + aggregates_imputation: + all: + type: 'constant' + value: 0 + aggregates: + - + quantity: 'homeless::INT' + metrics: ['count', 'sum'] + intervals: ['1 year', '2 year'] + groups: ['entity_id'] +``` + +## Upgrading the experiment config version + +At this point, you should be able to bump the top-level experiment config version to v7: + +Old: + +``` +config_version: 'v6' +``` + +New: + +``` +config_version: 'v7' +``` + diff --git a/docs/sources/experiments/upgrading.md b/docs/sources/experiments/upgrading.md deleted file mode 100644 index f08a49ab7..000000000 --- a/docs/sources/experiments/upgrading.md +++ /dev/null @@ -1,5 +0,0 @@ -# Upgrading an Experiment config - -* [v5 → v6](experiments/upgrade-to-v6.md) -* [v3/v4 → v5](experiments/upgrade-to-v5.md) - diff --git a/example/config/experiment.yaml b/example/config/experiment.yaml index def811f5d..aee6634ea 100644 --- a/example/config/experiment.yaml +++ b/example/config/experiment.yaml @@ -5,7 +5,7 @@ # old configuration files are released. Be sure to assign the config version # that matches the triage.experiments.CONFIG_VERSION in the triage release # you are developing against! -config_version: 'v6' +config_version: 'v7' # EXPERIMENT METADATA # model_comment (optional) will end up in the model_comment column of the @@ -75,42 +75,47 @@ label_config: # FEATURE GENERATION -# The aggregate features to generate for each train/test split -# -# Implemented by wrapping collate: https://github.com/dssg/collate -# Most terminology here is taken directly from collate -# -# Each entry describes a collate.SpacetimeAggregation object, and the -# arguments needed to create it. Generally, each of these entries controls -# the features from one source table, though in the case of multiple groups -# may result in multiple output tables -# -# Rules specifying how to handle imputation of null values must be explicitly -# defined in your config file. These can be specified in two places: either -# within each feature or overall for each type of feature (aggregates_imputation, -# categoricals_imputation, array_categoricals_imputation). In either case, a rule must be given for -# each aggregation function (e.g., sum, max, avg, etc) used, or a catch-all -# can be specified with `all`. Aggregation function-specific rules will take -# precedence over the `all` rule and feature-specific rules will take -# precedence over the higher-level rules. Several examples are provided below. -# -# Available Imputation Rules: -# * mean: The average value of the feature (for SpacetimeAggregation the -# mean is taken within-date). -# * constant: Fill with a constant value from a required `value` parameter. -# * zero: Fill with zero. -# * null_category: Only available for categorical features. Just flag null -# values with the null category column. -# * binary_mode: Only available for aggregate column types. Takes the modal -# value for a binary feature. -# * error: Raise an exception if any null values are encountered for this -# feature. 
-feature_aggregations: - - - # prefix given to the resultant tables - prefix: 'prefix' - # from_obj is usually a source table but can be an expression, such as - # a join (ie 'cool_stuff join other_stuff using (stuff_id)') +features: + # Every entry in the features section contains: + # - a key that names the output feature table + # - a value that configures the feature table based on the options + # for one of the available feature generator types + # + my_feature_output_table: # the output from this will go into 'my_feature_output_table' + feature_generator_type: "spacetime_aggregation" + # available types: 'spacetime_aggregation' + # + # SPACETIME_AGGREGATION + # The aggregate features to generate for each train/test split + # + # Implemented by wrapping collate: https://github.com/dssg/collate + # Most terminology here is taken directly from collate + # + # Each entry describes a collate.SpacetimeAggregation object, and the + # arguments needed to create it. Generally, each of these entries controls + # the features from one source table, though in the case of multiple groups + # may result in multiple output tables + # + # Rules specifying how to handle imputation of null values must be explicitly + # defined in your config file. These can be specified in two places: either + # within each feature or overall for each type of feature (aggregates_imputation, + # categoricals_imputation, array_categoricals_imputation). In either case, a rule must be given for + # each aggregation function (e.g., sum, max, avg, etc) used, or a catch-all + # can be specified with `all`. Aggregation function-specific rules will take + # precedence over the `all` rule and feature-specific rules will take + # precedence over the higher-level rules. Several examples are provided below. + # + # Available Imputation Rules: + # * mean: The average value of the feature (for SpacetimeAggregation the + # mean is taken within-date). + # * constant: Fill with a constant value from a required `value` parameter. + # * zero: Fill with zero. + # * null_category: Only available for categorical features. Just flag null + # values with the null category column. + # * binary_mode: Only available for aggregate column types. Takes the modal + # value for a binary feature. + # * error: Raise an exception if any null values are encountered for this + # feature. from_obj: 'cool_stuff' # The date column to use for specifying which records to include # in temporal features. It is important that the column used specifies @@ -210,10 +215,10 @@ feature_aggregations: # feature_group_definition allows you to create groups/subset of your features # by different criteria. 
# for instance, -# - 'tables' allows you to send a list of collate feature tables (collate builds these by appending 'aggregation_imputed' to the prefix) +# - 'tables' allows you to send a list of feature tables # - 'prefix' allows you to specify a list of feature name prefixes feature_group_definition: - tables: ['prefix_aggregation_imputed'] + tables: ['my_feature_output_table'] # strategies for generating combinations of groups # available: all, leave-one-out, leave-one-in, all-combinations diff --git a/example/config/feature.yaml b/example/config/feature.yaml deleted file mode 100644 index 9de66b39d..000000000 --- a/example/config/feature.yaml +++ /dev/null @@ -1,100 +0,0 @@ -### EXAMPLE FEATURE CONFIG -# -### - - - # prefix given to the resultant tables - prefix: 'prefix' - # from_obj is usually a source table but can be an expression, such as - # a join (ie 'cool_stuff join other_stuff using (stuff_id)') - from_obj: 'cool_stuff' - # The date column to use for specifying which records to include - # in temporal features. It is important that the column used specifies - # the date at which the event is known about, which may be different - # from the date the event happened. - knowledge_date_column: 'open_date' - - # top-level imputation rules that will apply to all aggregates functions - # can also specify categoricals_imputation or array_categoricals_imputation - # - # You must specified at least one of the top-level or feature-level imputation - # to cover ever feature being defined. - aggregates_imputation: - # The `all` rule will apply to all aggregation functions, unless over- - # ridden by a more specific one - all: - # every imputation rule must have a `type` parameter, while some - # (like 'constant') have other required parameters (`value` here) - type: 'constant' - value: 0 - # specifying `max` here will take precedence over the `all` rule for - # aggregations using a MAX() function - max: - type: 'mean' - - # aggregates and categoricals define the actual features created. So - # at least one is required - # - # Aggregates of numerical columns. Each quantity is a number of some - # sort, and the list of metrics are applied to each quantity - aggregates: - - - quantity: 'homeless::INT' - # Imputation rules specified at the level of specific features - # will take precedence over the higer-level rules specified - # above. Note that the 'count' and 'sum' metrics will be - # imputed differently here. - imputation: - count: - type: 'mean' - sum: - type: 'constant' - value: 137 - metrics: - - 'count' - - 'sum' - - - # since we're specifying `aggregates_imputation` above, - # a feature-specific imputation rule can be omitted - quantity: 'some_flag' - metrics: - - 'max' - - 'sum' - # Categorical features. The column given can be of any type, but the - # choices must comparable to that type for equality within SQL - # The result will be one feature for each choice/metric combination - categoricals: - - - column: 'color' - # note that we haven't specified a top level `categoricals_imputation` - # set of rules, so we have to include feature-specific imputation - # rules for both of our categoricals here. 
- imputation: - sum: - type: 'null_category' - max: - type: 'mean' - choices: - - 'red' - - 'blue' - - 'green' - metrics: - - 'sum' - - - column: 'shape' - # as with the top-level imputation rules, `all` can be used - # for the feature-level rules to specify the same type of - # imputation for all aggregation functions - imputation: - all: - type: 'zero' - choice_query: 'select distinct shape from cool_stuff' - metrics: - - 'sum' - # The time intervals over which to aggregate features - intervals: - - '1 year' - - '2 years' - - 'all' - # A list of different columns to separately group by - groups: - - 'entity_id' diff --git a/src/tests/architect_tests/test_feature_block_generators.py b/src/tests/architect_tests/test_feature_block_generators.py new file mode 100644 index 000000000..9078eb3aa --- /dev/null +++ b/src/tests/architect_tests/test_feature_block_generators.py @@ -0,0 +1,238 @@ +from datetime import datetime, date + +from triage.component.architect.feature_block_generators import generate_spacetime_aggregation +import triage.component.collate as collate + +import pytest +from unittest.mock import patch + + +def test_spacetime_generation(db_engine): + aggregation_config = { + "aggregates": [ + { + "quantity": "quantity_one", + "metrics": ["sum", "count"], + "imputation": { + "sum": {"type": "constant", "value": 137}, + "count": {"type": "zero"}, + }, + } + ], + "categoricals_imputation": {"all": {"type": "null_category"}}, + "categoricals": [ + {"column": "cat_one", "choices": ["good", "bad"], "metrics": ["sum"]} + ], + "groups": ["entity_id", "zip_code"], + "intervals": ["all"], + "knowledge_date_column": "knowledge_date", + "from_obj": "data", + } + aggregation = generate_spacetime_aggregation( + feature_aggregation_config=aggregation_config, + as_of_dates=["2017-01-02", "2017-02-02"], + cohort_table="my_cohort", + feature_table_name="my_features", + db_engine=db_engine, + features_schema_name="features", + feature_start_time="2011-01-01", + ) + assert isinstance(aggregation, collate.SpacetimeAggregation) + assert aggregation.as_of_dates == ["2017-01-02", "2017-02-02"] + assert aggregation.feature_start_time == "2011-01-01" + assert aggregation.groups == {"entity_id": "entity_id", "zip_code": "zip_code"} + assert aggregation.intervals == {"entity_id": ["all"], "zip_code": ["all"]} + assert str(aggregation.from_obj) == "data" + assert len(aggregation.aggregates) == 2 + for aggregate in aggregation.aggregates: + if isinstance(aggregate, collate.Categorical): + assert aggregate.quantities == { + "cat_one__NULL": ('(cat_one is NULL)::INT',), + "cat_one_bad": ("(cat_one = 'bad')::INT",), + "cat_one_good": ("(cat_one = 'good')::INT",), + } + assert aggregate.functions == ["sum"] + else: + assert aggregate.quantities == {"quantity_one": ("quantity_one",)} + assert aggregate.functions == ["sum", "count"] + + + +INPUT_DATA = [ + # entity_id, knowledge_date, zip_code, cat_one, quantity_one + (1, date(2014, 1, 1), "60120", "good", 10000), + (1, date(2014, 10, 11), "60120", "good", None), + (3, date(2012, 6, 8), "60653", "bad", 342), + (3, date(2014, 12, 21), "60653", "inbetween", 600), + (4, date(2014, 4, 4), "60653", "bad", 1236), +] + +INPUT_STATES = [ + # entity_id, as_of_date + (1, date(2013, 9, 30)), + (1, date(2014, 9, 30)), + (1, date(2015, 1, 1)), + (3, date(2013, 9, 30)), + (3, date(2014, 9, 30)), + (3, date(2015, 1, 1)), + (4, date(2014, 9, 30)), + (4, date(2015, 1, 1)), +] + +@pytest.fixture(name='test_engine', scope='function') +def fixture_test_engine(db_engine): + """Local 
extension to the shared db_engine fixture to set up test + database tables. + + """ + db_engine.execute( + """\ + create table data ( + entity_id int, + knowledge_date date, + zip_code text, + cat_one varchar, + quantity_one float + ) + """ + ) + for row in INPUT_DATA: + db_engine.execute("insert into data values (%s, %s, %s, %s, %s)", row) + + db_engine.execute( + """\ + create table states ( + entity_id int, + as_of_date date + ) + """ + ) + for row in INPUT_STATES: + db_engine.execute("insert into states values (%s, %s)", row) + + return db_engine + + +def test_choice_query(test_engine): + aggregation_config = { + "categoricals": [ + { + "column": "cat_one", + "choice_query": "select distinct(cat_one) from data", + "metrics": ["sum"], + "imputation": {"all": {"type": "null_category"}}, + } + ], + "groups": ["entity_id"], + "intervals": ["all"], + "knowledge_date_column": "knowledge_date", + "from_obj": "data", + } + aggregation = generate_spacetime_aggregation( + feature_aggregation_config=aggregation_config, + as_of_dates=["2017-01-02", "2017-02-02"], + cohort_table="my_cohort", + db_engine=test_engine, + features_schema_name="features", + feature_start_time="2011-01-01", + feature_table_name="aprefix", + ) + assert aggregation.aggregates[0].quantities == { + "cat_one__NULL": ('(cat_one is NULL)::INT',), + "cat_one_bad": ("(cat_one = 'bad')::INT",), + "cat_one_good": ("(cat_one = 'good')::INT",), + "cat_one_inbetween": ("(cat_one = 'inbetween')::INT",), + } + +def test_array_categoricals(test_engine): + aggregation_config = { + "array_categoricals": [ + { + "column": "cat_one", + "choices": ["good", "bad", "inbetween"], + "metrics": ["sum"], + "imputation": {"all": {"type": "null_category"}}, + } + ], + "groups": ["entity_id"], + "intervals": ["all"], + "knowledge_date_column": "knowledge_date", + "from_obj": "data", + } + aggregation = generate_spacetime_aggregation( + feature_aggregation_config=aggregation_config, + as_of_dates=["2017-01-02", "2017-02-02"], + cohort_table="my_cohort", + db_engine=test_engine, + features_schema_name="features", + feature_start_time="2011-01-01", + feature_table_name="aprefix", + ) + + assert aggregation.aggregates[0].quantities == { + "cat_one__NULL": ('(cat_one is NULL)::INT',), + "cat_one_bad": ("(cat_one @> array['bad'::varchar])::INT",), + "cat_one_good": ("(cat_one @> array['good'::varchar])::INT",), + "cat_one_inbetween": ("(cat_one @> array['inbetween'::varchar])::INT",), + } + +def xtest_materialize_off(db_engine): + aggregation_config = { + "categoricals": [ + { + "column": "cat_one", + "choices": ["good", "bad"], + "metrics": ["sum"], + "imputation": {"all": {"type": "null_category"}}, + } + ], + "groups": ["entity_id", "zip_code"], + "intervals": ["all"], + "knowledge_date_column": "knowledge_date", + "from_obj": "data", + } + + with patch("triage.component.architect.feature_block_generators.FromObj") as fromobj_mock: + feature_generator = generate_spacetime_aggregation( + feature_aggregation_config=aggregation_config, + as_of_dates=["2017-01-02", "2017-02-02"], + cohort_table="my_cohort", + db_engine=db_engine, + features_schema_name="features", + materialize_subquery_fromobjs=False, + feature_table_name="aprefix", + ) + assert not fromobj_mock.called + + +def xtest_aggregations_materialize_on(db_engine): + aggregation_config = { + "categoricals": [ + { + "column": "cat_one", + "choices": ["good", "bad"], + "metrics": ["sum"], + "imputation": {"all": {"type": "null_category"}}, + } + ], + "groups": ["entity_id", "zip_code"], + "intervals": 
["all"], + "knowledge_date_column": "knowledge_date", + "from_obj": "data", + } + + with patch("triage.component.architect.feature_block_generators.FromObj") as fromobj_mock: + feature_generator = generate_spacetime_aggregation( + feature_aggregation_config=aggregation_config, + as_of_dates=["2017-01-02", "2017-02-02"], + cohort_table="my_cohort", + db_engine=db_engine, + features_schema_name="features", + materialize_subquery_fromobjs=True, + feature_table_name="aprefix", + ) + fromobj_mock.assert_called_once_with( + from_obj="data", + knowledge_date_column="knowledge_date", + name="features.aprefix" + ) diff --git a/src/tests/architect_tests/test_feature_blocks.py b/src/tests/architect_tests/test_feature_blocks.py new file mode 100644 index 000000000..e8a0130ee --- /dev/null +++ b/src/tests/architect_tests/test_feature_blocks.py @@ -0,0 +1,150 @@ +from triage.component.architect.feature_block import FeatureBlock +import pytest + + +class FeatureBlockExample(FeatureBlock): + """A sample, functional FeatureBlock class + + Implements very simple versions of all of the abstract methods/properties + that allows testing of the concrete methods in the base class + """ + @property + def final_feature_table_name(self): + return "myfeatures" + + @property + def feature_columns(self): + return set(["feature_one", "feature_two"]) + + @property + def preinsert_queries(self): + return [ + "drop table if exists myfeatures", + "create table myfeatures (entity_id int, as_of_date timestamp, f_one int, f_two int)" + ] + + @property + def insert_queries(self): + return [ + "insert into myfeatures values (1, '2016-01-01', 1, 0)", + "insert into myfeatures values (1, '2016-02-01', 0, 0)", + "insert into myfeatures values (2, '2016-01-01', 0, 1)", + "insert into myfeatures values (2, '2016-02-01', 0, NULL)" + ] + + @property + def postinsert_queries(self): + return [ + "create index on myfeatures (as_of_date)" + ] + + @property + def imputation_queries(self): + return [ + "update myfeatures set f_one = 1 where f_one is null", + "update myfeatures set f_two = 1 where f_two is null", + ] + + +def populate_cohort(db_engine): + db_engine.execute("create table mycohort (entity_id int, as_of_date timestamp)") + db_engine.execute("insert into mycohort values (1, '2016-01-01'), " + "(1, '2016-02-01'), (2, '2016-01-01'), (2, '2016-02-01')") + + +def test_FeatureBlock_generate_preimpute_tasks(db_engine): + block = FeatureBlockExample( + db_engine=db_engine, + cohort_table="mycohort", + features_table_name="myfeaturetable", + as_of_dates=['2016-01-01', '2016-02-01'] + ) + block.needs_features = lambda: True + assert block.generate_preimpute_tasks(replace=False) == { + "prepare": block.preinsert_queries, + "inserts": block.insert_queries, + "finalize": block.postinsert_queries + } + block.needs_features = lambda: False + assert block.generate_preimpute_tasks(replace=False) == {} + + assert block.generate_preimpute_tasks(replace=True) == { + "prepare": block.preinsert_queries, + "inserts": block.insert_queries, + "finalize": block.postinsert_queries + } + + +def test_FeatureBlock_generate_impute_tasks(db_engine): + block = FeatureBlockExample( + db_engine=db_engine, + cohort_table="mycohort", + features_table_name="myfeaturetable", + as_of_dates=['2016-01-01', '2016-02-01'] + ) + block.needs_features = lambda: True + assert block.generate_impute_tasks(replace=False) == { + "prepare": block.imputation_queries, + "inserts": [], + "finalize": [] + } + block.needs_features = lambda: False + assert 
block.generate_impute_tasks(replace=False) == {} + + assert block.generate_impute_tasks(replace=True) == { + "prepare": block.imputation_queries, + "inserts": [], + "finalize": [] + } + + +def test_FeatureBlock_log_verbose_task_info(db_engine): + block = FeatureBlockExample( + db_engine=db_engine, + cohort_table="mycohort", + features_table_name="myfeaturetable", + as_of_dates=['2016-01-01', '2016-02-01'] + ) + task = block.generate_impute_tasks(replace=True) + # just want to make sure that the logging doesn't error, no assertions + block.log_verbose_task_info(task) + + +def test_FeatureBlock_needs_features(db_engine): + # needs_features should function as following: + # if there are members of the cohort without features, needs_features should return true + # 1. a freshly created table should definitely need features + block = FeatureBlockExample( + db_engine=db_engine, + cohort_table="mycohort", + features_table_name="myfeaturetable", + as_of_dates=['2016-01-01', '2016-02-01'] + ) + populate_cohort(db_engine) + assert block.needs_features() + block.run_preimputation() + block.run_imputation() + assert not block.needs_features() + + # 2. a table that already has features, but is merely a subset of the cohort, + # should also need features + db_engine.execute("insert into mycohort values (3, '2016-01-01')") + assert block.needs_features() + + +def test_FeatureBlock_verify_nonulls(db_engine): + # verify_no_nulls should function as following: + # if there are members of the cohort without features, needs_features should return true + # 1. a freshly created table should definitely need features + block = FeatureBlockExample( + db_engine=db_engine, + cohort_table="mycohort", + features_table_name="myfeaturetable", + as_of_dates=['2016-01-01', '2016-02-01'] + ) + populate_cohort(db_engine) + block.run_preimputation() + with pytest.raises(ValueError): + block.verify_no_nulls() + block.run_imputation() + block.verify_no_nulls() diff --git a/src/tests/architect_tests/test_feature_dictionary_creator.py b/src/tests/architect_tests/test_feature_dictionary_creator.py deleted file mode 100644 index 5ac63a8eb..000000000 --- a/src/tests/architect_tests/test_feature_dictionary_creator.py +++ /dev/null @@ -1,87 +0,0 @@ -from triage.component.architect.features import FeatureDictionaryCreator -import testing.postgresql -from sqlalchemy import create_engine - - -def test_feature_dictionary_creator(): - with testing.postgresql.Postgresql() as postgresql: - engine = create_engine(postgresql.url()) - engine.execute("create schema features") - engine.execute( - """ - create table features.prefix1_entity_id ( - entity_id int, - as_of_date date, - feature_one float, - feature_two float - ) - """ - ) - engine.execute( - """ - create table features.prefix1_zipcode ( - zipcode text, - as_of_date date, - feature_three float, - feature_four float - ) - """ - ) - engine.execute( - """ - create table features.prefix1_aggregation ( - entity_id int, - as_of_date date, - zipcode text, - feature_one float, - feature_two float, - feature_three float, - feature_four float - ) - """ - ) - engine.execute( - """ - create table features.prefix1_aggregation_imputed ( - entity_id int, - as_of_date date, - zipcode text, - feature_one float, - feature_two float, - feature_three float, - feature_three_imp int, - feature_four float - ) - """ - ) - engine.execute( - """ - create table features.random_other_table ( - another_column float - ) - """ - ) - - creator = FeatureDictionaryCreator( - features_schema_name="features", 
db_engine=engine - ) - feature_dictionary = creator.feature_dictionary( - feature_table_names=[ - "prefix1_entity_id", - "prefix1_zip_code", - "prefix1_aggregation", - "prefix1_aggregation_imputed", - ], - index_column_lookup={ - "prefix1_aggregation_imputed": ["entity_id", "zipcode", "as_of_date"] - }, - ) - assert feature_dictionary == { - "prefix1_aggregation_imputed": [ - "feature_one", - "feature_two", - "feature_three", - "feature_three_imp", - "feature_four", - ] - } diff --git a/src/tests/architect_tests/test_feature_generators.py b/src/tests/architect_tests/test_feature_generators.py deleted file mode 100644 index 338570411..000000000 --- a/src/tests/architect_tests/test_feature_generators.py +++ /dev/null @@ -1,931 +0,0 @@ -import copy -from datetime import date - -import pandas -import pytest -import sqlalchemy -from sqlalchemy import text as t - -from triage.component.architect.feature_generators import FeatureGenerator -from triage.component.collate import Aggregate, Categorical, SpacetimeAggregation - -from unittest.mock import patch - - -INPUT_DATA = [ - # entity_id, knowledge_date, zip_code, cat_one, quantity_one - (1, date(2014, 1, 1), "60120", "good", 10000), - (1, date(2014, 10, 11), "60120", "good", None), - (3, date(2012, 6, 8), "60653", "bad", 342), - (3, date(2014, 12, 21), "60653", "inbetween", 600), - (4, date(2014, 4, 4), "60653", "bad", 1236), -] - -INPUT_STATES = [ - # entity_id, as_of_date - (1, date(2013, 9, 30)), - (1, date(2014, 9, 30)), - (1, date(2015, 1, 1)), - (3, date(2013, 9, 30)), - (3, date(2014, 9, 30)), - (3, date(2015, 1, 1)), - (4, date(2014, 9, 30)), - (4, date(2015, 1, 1)), -] - - -@pytest.fixture(name='test_engine', scope='function') -def fixture_test_engine(db_engine): - """Local extension to the shared db_engine fixture to set up test - database tables. 
- - """ - db_engine.execute( - """\ - create table data ( - entity_id int, - knowledge_date date, - zip_code text, - cat_one varchar, - quantity_one float - ) - """ - ) - for row in INPUT_DATA: - db_engine.execute("insert into data values (%s, %s, %s, %s, %s)", row) - - db_engine.execute( - """\ - create table states ( - entity_id int, - as_of_date date - ) - """ - ) - for row in INPUT_STATES: - db_engine.execute("insert into states values (%s, %s)", row) - - return db_engine - - -def test_feature_generation(test_engine): - aggregate_config = [ - { - "prefix": "aprefix", - "aggregates": [ - { - "quantity": "quantity_one", - "metrics": ["sum", "count"], - "imputation": { - "sum": {"type": "constant", "value": 137}, - "count": {"type": "zero"}, - }, - } - ], - "categoricals_imputation": {"all": {"type": "null_category"}}, - "categoricals": [ - {"column": "cat_one", "choices": ["good", "bad"], "metrics": ["sum"]} - ], - "groups": ["entity_id", "zip_code"], - "intervals": ["all"], - "knowledge_date_column": "knowledge_date", - "from_obj": "data", - } - ] - - expected_output = { - "aprefix_aggregation_imputed": [ - { - "entity_id": 1, - "as_of_date": date(2013, 9, 30), - "zip_code": None, - "aprefix_entity_id_all_quantity_one_sum": 137, - "aprefix_entity_id_all_quantity_one_count": 0, - "aprefix_entity_id_all_cat_one_good_sum": 0, - "aprefix_entity_id_all_cat_one_bad_sum": 0, - "aprefix_entity_id_all_cat_one__NULL_sum": 1, - "aprefix_zip_code_all_quantity_one_sum": 137, - "aprefix_zip_code_all_quantity_one_count": 0, - "aprefix_zip_code_all_cat_one_good_sum": 0, - "aprefix_zip_code_all_cat_one_bad_sum": 0, - "aprefix_zip_code_all_cat_one__NULL_sum": 1, - "aprefix_entity_id_all_quantity_one_imp": 1, - "aprefix_zip_code_all_quantity_one_imp": 1, - }, - { - "entity_id": 1, - "as_of_date": date(2014, 9, 30), - "zip_code": "60120", - "aprefix_entity_id_all_quantity_one_sum": 10000, - "aprefix_entity_id_all_quantity_one_count": 1, - "aprefix_entity_id_all_cat_one_good_sum": 1, - "aprefix_entity_id_all_cat_one_bad_sum": 0, - "aprefix_entity_id_all_cat_one__NULL_sum": 0, - "aprefix_zip_code_all_quantity_one_sum": 10000, - "aprefix_zip_code_all_quantity_one_count": 1, - "aprefix_zip_code_all_cat_one_good_sum": 1, - "aprefix_zip_code_all_cat_one_bad_sum": 0, - "aprefix_zip_code_all_cat_one__NULL_sum": 0, - "aprefix_entity_id_all_quantity_one_imp": 0, - "aprefix_zip_code_all_quantity_one_imp": 0, - }, - { - "entity_id": 3, - "as_of_date": date(2013, 9, 30), - "zip_code": "60653", - "aprefix_entity_id_all_quantity_one_sum": 342, - "aprefix_entity_id_all_quantity_one_count": 1, - "aprefix_entity_id_all_cat_one_good_sum": 0, - "aprefix_entity_id_all_cat_one_bad_sum": 1, - "aprefix_entity_id_all_cat_one__NULL_sum": 0, - "aprefix_zip_code_all_quantity_one_sum": 342, - "aprefix_zip_code_all_quantity_one_count": 1, - "aprefix_zip_code_all_cat_one_good_sum": 0, - "aprefix_zip_code_all_cat_one_bad_sum": 1, - "aprefix_zip_code_all_cat_one__NULL_sum": 0, - "aprefix_entity_id_all_quantity_one_imp": 0, - "aprefix_zip_code_all_quantity_one_imp": 0, - }, - { - "entity_id": 3, - "as_of_date": date(2014, 9, 30), - "zip_code": "60653", - "aprefix_entity_id_all_quantity_one_sum": 342, - "aprefix_entity_id_all_quantity_one_count": 1, - "aprefix_entity_id_all_cat_one_good_sum": 0, - "aprefix_entity_id_all_cat_one_bad_sum": 1, - "aprefix_entity_id_all_cat_one__NULL_sum": 0, - "aprefix_zip_code_all_quantity_one_sum": 1578, - "aprefix_zip_code_all_quantity_one_count": 2, - "aprefix_zip_code_all_cat_one_good_sum": 0, - 
"aprefix_zip_code_all_cat_one_bad_sum": 2, - "aprefix_zip_code_all_cat_one__NULL_sum": 0, - "aprefix_entity_id_all_quantity_one_imp": 0, - "aprefix_zip_code_all_quantity_one_imp": 0, - }, - { - "entity_id": 4, - "as_of_date": date(2014, 9, 30), - "zip_code": "60653", - "aprefix_entity_id_all_quantity_one_sum": 1236, - "aprefix_entity_id_all_quantity_one_count": 1, - "aprefix_entity_id_all_cat_one_good_sum": 0, - "aprefix_entity_id_all_cat_one_bad_sum": 1, - "aprefix_entity_id_all_cat_one__NULL_sum": 0, - "aprefix_zip_code_all_quantity_one_sum": 1578, - "aprefix_zip_code_all_quantity_one_count": 2, - "aprefix_zip_code_all_cat_one_good_sum": 0, - "aprefix_zip_code_all_cat_one_bad_sum": 2, - "aprefix_zip_code_all_cat_one__NULL_sum": 0, - "aprefix_entity_id_all_quantity_one_imp": 0, - "aprefix_zip_code_all_quantity_one_imp": 0, - }, - ] - } - - features_schema_name = "features" - - output_tables = FeatureGenerator( - db_engine=test_engine, - features_schema_name=features_schema_name, - ).create_all_tables( - feature_dates=["2013-09-30", "2014-09-30"], - feature_aggregation_config=aggregate_config, - state_table="states", - ) - - for output_table in output_tables: - records = pandas.read_sql( - "select * from {}.{} order by entity_id, as_of_date".format( - features_schema_name, - output_table, - ), - test_engine, - ).to_dict("records") - - for record, expected_record in zip(records, expected_output[output_table]): - assert record == expected_record - - -def test_index_column_lookup(test_engine): - aggregations = [ - SpacetimeAggregation( - prefix="prefix1", - aggregates=[ - Categorical( - col="cat_one", - function="sum", - choices=["good", "bad", "inbetween"], - impute_rules={"coltype": "categorical", "all": {"type": "zero"}}, - ) - ], - groups=["entity_id"], - intervals=["all"], - date_column="knowledge_date", - output_date_column="as_of_date", - dates=["2013-09-30", "2014-09-30"], - state_table="states", - state_group="entity_id", - schema="features", - from_obj="data", - ), - SpacetimeAggregation( - prefix="prefix2", - aggregates=[ - Aggregate( - quantity="quantity_one", - function="count", - impute_rules={"coltype": "aggregate", "all": {"type": "zero"}}, - ) - ], - groups=["entity_id", "zip_code"], - intervals=["all"], - date_column="knowledge_date", - output_date_column="as_of_date", - dates=["2013-09-30", "2014-09-30"], - state_table="states", - state_group="entity_id", - schema="features", - from_obj="data", - ), - ] - - features_schema_name = "features" - feature_generator = FeatureGenerator( - db_engine=test_engine, - features_schema_name=features_schema_name, - ) - lookup = feature_generator.index_column_lookup(aggregations) - assert lookup == { - "prefix1_aggregation_imputed": ["as_of_date", "entity_id"], - "prefix2_aggregation_imputed": ["as_of_date", "entity_id", "zip_code"], - } - - -def test_feature_generation_feature_start_time(test_engine): - aggregate_config = [ - { - "prefix": "aprefix", - "aggregates_imputation": {"all": {"type": "constant", "value": 7}}, - "aggregates": [{"quantity": "quantity_one", "metrics": ["sum"]}], - "groups": ["entity_id"], - "intervals": ["all"], - "knowledge_date_column": "knowledge_date", - "from_obj": "data", - } - ] - - expected_output = { - "aprefix_aggregation_imputed": [ - { - "entity_id": 1, - "as_of_date": date(2015, 1, 1), - "aprefix_entity_id_all_quantity_one_sum": 10000, - }, - { - "entity_id": 3, - "as_of_date": date(2015, 1, 1), - "aprefix_entity_id_all_quantity_one_sum": 600, - }, - { - "entity_id": 4, - "as_of_date": date(2015, 1, 
1), - "aprefix_entity_id_all_quantity_one_sum": 1236, - }, - ] - } - - features_schema_name = "features" - output_tables = FeatureGenerator( - db_engine=test_engine, - features_schema_name=features_schema_name, - feature_start_time="2013-01-01", - ).create_all_tables( - feature_dates=["2015-01-01"], - feature_aggregation_config=aggregate_config, - state_table="states", - ) - - for output_table in output_tables: - records = pandas.read_sql( - "select * from {}.{} order by as_of_date, entity_id".format( - features_schema_name, - output_table, - ), - test_engine, - ).to_dict("records") - - assert records == expected_output[output_table] - - -def test_dynamic_categoricals(test_engine): - aggregate_config = [ - { - "prefix": "aprefix", - "categoricals": [ - { - "column": "cat_one", - "choice_query": "select distinct(cat_one) from data", - "metrics": ["sum"], - "imputation": {"all": {"type": "null_category"}}, - } - ], - "groups": ["entity_id"], - "intervals": ["all"], - "knowledge_date_column": "knowledge_date", - "from_obj": "data", - } - ] - expected_output = { - "aprefix_aggregation_imputed": [ - { - "entity_id": 1, - "as_of_date": date(2013, 9, 30), - "aprefix_entity_id_all_cat_one_good_sum": 0, - "aprefix_entity_id_all_cat_one_inbetween_sum": 0, - "aprefix_entity_id_all_cat_one_bad_sum": 0, - "aprefix_entity_id_all_cat_one__NULL_sum": 1, - }, - { - "entity_id": 3, - "as_of_date": date(2013, 9, 30), - "aprefix_entity_id_all_cat_one_good_sum": 0, - "aprefix_entity_id_all_cat_one_inbetween_sum": 0, - "aprefix_entity_id_all_cat_one_bad_sum": 1, - "aprefix_entity_id_all_cat_one__NULL_sum": 0, - }, - { - "entity_id": 1, - "as_of_date": date(2014, 9, 30), - "aprefix_entity_id_all_cat_one_good_sum": 1, - "aprefix_entity_id_all_cat_one_inbetween_sum": 0, - "aprefix_entity_id_all_cat_one_bad_sum": 0, - "aprefix_entity_id_all_cat_one__NULL_sum": 0, - }, - { - "entity_id": 3, - "as_of_date": date(2014, 9, 30), - "aprefix_entity_id_all_cat_one_good_sum": 0, - "aprefix_entity_id_all_cat_one_inbetween_sum": 0, - "aprefix_entity_id_all_cat_one_bad_sum": 1, - "aprefix_entity_id_all_cat_one__NULL_sum": 0, - }, - { - "entity_id": 4, - "as_of_date": date(2014, 9, 30), - "aprefix_entity_id_all_cat_one_good_sum": 0, - "aprefix_entity_id_all_cat_one_inbetween_sum": 0, - "aprefix_entity_id_all_cat_one_bad_sum": 1, - "aprefix_entity_id_all_cat_one__NULL_sum": 0, - }, - ] - } - - features_schema_name = "features" - - output_tables = FeatureGenerator( - db_engine=test_engine, - features_schema_name=features_schema_name, - ).create_all_tables( - feature_dates=["2013-09-30", "2014-09-30"], - feature_aggregation_config=aggregate_config, - state_table="states", - ) - - for output_table in output_tables: - records = pandas.read_sql( - "select * from {}.{} order by as_of_date, entity_id".format( - features_schema_name, output_table - ), - test_engine, - ).to_dict("records") - - assert records == expected_output[output_table] - - -def test_array_categoricals(db_engine): - aggregate_config = [ - { - "prefix": "aprefix", - "array_categoricals": [ - { - "column": "cat_one", - "choices": ["good", "bad", "inbetween"], - "metrics": ["sum"], - "imputation": {"all": {"type": "null_category"}}, - } - ], - "groups": ["entity_id"], - "intervals": ["all"], - "knowledge_date_column": "knowledge_date", - "from_obj": "data", - } - ] - expected_output = { - "aprefix_aggregation_imputed": [ - { - "entity_id": 1, - "as_of_date": date(2013, 9, 30), - "aprefix_entity_id_all_cat_one_good_sum": 0, - 
"aprefix_entity_id_all_cat_one_inbetween_sum": 0, - "aprefix_entity_id_all_cat_one_bad_sum": 0, - "aprefix_entity_id_all_cat_one__NULL_sum": 1, - }, - { - "entity_id": 3, - "as_of_date": date(2013, 9, 30), - "aprefix_entity_id_all_cat_one_good_sum": 0, - "aprefix_entity_id_all_cat_one_inbetween_sum": 0, - "aprefix_entity_id_all_cat_one_bad_sum": 1, - "aprefix_entity_id_all_cat_one__NULL_sum": 0, - }, - { - "entity_id": 1, - "as_of_date": date(2014, 9, 30), - "aprefix_entity_id_all_cat_one_good_sum": 1, - "aprefix_entity_id_all_cat_one_inbetween_sum": 0, - "aprefix_entity_id_all_cat_one_bad_sum": 0, - "aprefix_entity_id_all_cat_one__NULL_sum": 0, - }, - { - "entity_id": 3, - "as_of_date": date(2014, 9, 30), - "aprefix_entity_id_all_cat_one_good_sum": 0, - "aprefix_entity_id_all_cat_one_inbetween_sum": 0, - "aprefix_entity_id_all_cat_one_bad_sum": 1, - "aprefix_entity_id_all_cat_one__NULL_sum": 0, - }, - { - "entity_id": 4, - "as_of_date": date(2014, 9, 30), - "aprefix_entity_id_all_cat_one_good_sum": 0, - "aprefix_entity_id_all_cat_one_inbetween_sum": 0, - "aprefix_entity_id_all_cat_one_bad_sum": 1, - "aprefix_entity_id_all_cat_one__NULL_sum": 0, - }, - ] - } - - input_data = [ - # entity_id, knowledge_date, cat_one, quantity_one - (1, date(2014, 1, 1), ["good", "good"], 10000), - (1, date(2014, 10, 11), ["good"], None), - (3, date(2012, 6, 8), ["bad"], 342), - (3, date(2014, 12, 21), ["inbetween"], 600), - (4, date(2014, 4, 4), ["bad"], 1236), - ] - - db_engine.execute( - """\ - create table data ( - entity_id int, - knowledge_date date, - cat_one varchar[], - quantity_one float - ) - """ - ) - for row in input_data: - db_engine.execute("insert into data values (%s, %s, %s, %s)", row) - - db_engine.execute( - """\ - create table states ( - entity_id int, - as_of_date date - ) - """ - ) - for row in INPUT_STATES: - db_engine.execute("insert into states values (%s, %s)", row) - - features_schema_name = "features" - - output_tables = FeatureGenerator( - db_engine=db_engine, - features_schema_name=features_schema_name, - ).create_all_tables( - feature_dates=["2013-09-30", "2014-09-30"], - feature_aggregation_config=aggregate_config, - state_table="states", - ) - - for output_table in output_tables: - records = pandas.read_sql( - "select * from {}.{} order by as_of_date, entity_id".format( - features_schema_name, output_table - ), - db_engine, - ).to_dict("records") - - assert records == expected_output[output_table] - - -def test_generate_table_tasks(test_engine): - test_engine.execute('create schema features') - aggregations = [ - SpacetimeAggregation( - prefix="prefix1", - aggregates=[ - Categorical( - col="cat_one", - function="sum", - choices=["good", "bad", "inbetween"], - impute_rules={"coltype": "categorical", "all": {"type": "zero"}}, - ) - ], - groups=["entity_id"], - intervals=["all"], - date_column="knowledge_date", - output_date_column="as_of_date", - dates=["2013-09-30", "2014-09-30"], - state_table="states", - state_group="entity_id", - schema="features", - from_obj="data", - ), - SpacetimeAggregation( - prefix="prefix2", - aggregates=[ - Aggregate( - quantity="quantity_one", - function="count", - impute_rules={"coltype": "aggregate", "all": {"type": "zero"}}, - ) - ], - groups=["entity_id"], - intervals=["all"], - date_column="knowledge_date", - output_date_column="as_of_date", - dates=["2013-09-30", "2014-09-30"], - state_table="states", - state_group="entity_id", - schema="features", - from_obj="data", - ), - ] - features_schema_name = "features" - - table_tasks = 
FeatureGenerator( - db_engine=test_engine, - features_schema_name=features_schema_name, - ).generate_all_table_tasks(aggregations, task_type="aggregation") - for table_name, task in table_tasks.items(): - assert "DROP TABLE" in task["prepare"][0] - assert "CREATE TABLE" in str(task["prepare"][1]) - assert "CREATE INDEX" in task["finalize"][0] - assert isinstance(task["inserts"], list) - - # build the aggregation tables to check the imputation tasks - FeatureGenerator( - db_engine=test_engine, - features_schema_name=features_schema_name, - ).process_table_tasks(table_tasks) - - table_tasks = FeatureGenerator( - db_engine=test_engine, - features_schema_name=features_schema_name, - ).generate_all_table_tasks(aggregations, task_type="imputation") - - for table_name, task in table_tasks.items(): - assert "DROP TABLE" in task["prepare"][0] - assert "CREATE TABLE" in str(task["prepare"][1]) - assert "CREATE INDEX" in task["finalize"][0] - assert isinstance(task["inserts"], list) - - -def test_aggregations(test_engine): - aggregate_config = [ - { - "prefix": "prefix1", - "categoricals": [ - { - "column": "cat_one", - "choice_query": "select distinct(cat_one) from data", - "metrics": ["sum"], - "imputation": {"all": {"type": "null_category"}}, - } - ], - "groups": ["entity_id"], - "intervals": ["all"], - "knowledge_date_column": "knowledge_date", - "from_obj": "data", - }, - { - "prefix": "prefix2", - "aggregates_imputation": {"all": {"type": "mean"}}, - "aggregates": [{"quantity": "quantity_one", "metrics": ["count"]}], - "groups": ["entity_id"], - "intervals": ["all"], - "knowledge_date_column": "knowledge_date", - "from_obj": "data", - }, - ] - features_schema_name = "features" - - aggregations = FeatureGenerator( - db_engine=test_engine, - features_schema_name=features_schema_name, - ).aggregations( - feature_dates=["2013-09-30", "2014-09-30"], - feature_aggregation_config=aggregate_config, - state_table="states", - ) - for aggregation in aggregations: - assert isinstance(aggregation, SpacetimeAggregation) - - -def test_replace(test_engine): - # test the replace=False functionality, wherein we see if the cohort is fully represented - # in the imputed table and reuse the features if so - aggregate_config = [ - { - "prefix": "aprefix", - "aggregates_imputation": {"all": {"type": "mean"}}, - "aggregates": [{"quantity": "quantity_one", "metrics": ["sum", "count"]}], - "categoricals": [ - { - "column": "cat_one", - "choices": ["good", "bad"], - "metrics": ["sum"], - "imputation": {"all": {"type": "null_category"}}, - } - ], - "groups": ["entity_id"], - "intervals": ["all"], - "knowledge_date_column": "knowledge_date", - "from_obj": "data", - } - ] - - features_schema_name = "features" - feature_tables = FeatureGenerator( - db_engine=test_engine, - features_schema_name=features_schema_name, - replace=False, - ).create_all_tables( - feature_dates=["2013-09-30", "2014-09-30", "2015-01-01"], - feature_aggregation_config=aggregate_config, - state_table="states", - ) - - assert len(feature_tables) == 1 - assert list(feature_tables)[0] == "aprefix_aggregation_imputed" - - # now try and run feature generation with replace=False. 
We should - # be able to see that the entire cohort is there and reuse the features - feature_generator = FeatureGenerator( - db_engine=test_engine, - features_schema_name=features_schema_name, - replace=False, - ) - aggregations = feature_generator.aggregations( - feature_dates=["2013-09-30", "2014-09-30", "2015-01-01"], - feature_aggregation_config=aggregate_config, - state_table="states", - ) - table_tasks = feature_generator.generate_all_table_tasks( - aggregations, - task_type="aggregation", - ) - - assert len(table_tasks["aprefix_entity_id"]) == 0 - assert len(table_tasks["aprefix_aggregation"]) == 0 - - imp_tasks = feature_generator.generate_all_table_tasks( - aggregations, - task_type="imputation", - ) - - assert len(imp_tasks["aprefix_aggregation_imputed"]) == 0 - - # add a new member of the cohort. now we should need to rebuild everything - test_engine.execute("insert into states values (%s, %s)", 999, "2015-01-01") - table_tasks = feature_generator.generate_all_table_tasks( - aggregations, - task_type="aggregation", - ) - assert len(table_tasks["aprefix_entity_id"]) == 3 - assert len(table_tasks["aprefix_aggregation"]) == 3 - feature_generator.process_table_tasks(table_tasks) - imp_tasks = feature_generator.generate_all_table_tasks( - aggregations, - task_type="imputation", - ) - - assert len(imp_tasks["aprefix_aggregation_imputed"]) == 3 - -def test_aggregations_materialize_off(test_engine): - aggregate_config = { - "prefix": "aprefix", - "categoricals": [ - { - "column": "cat_one", - "choices": ["good", "bad"], - "metrics": ["sum"], - "imputation": {"all": {"type": "null_category"}}, - } - ], - "groups": ["entity_id", "zip_code"], - "intervals": ["all"], - "knowledge_date_column": "knowledge_date", - "from_obj": "data", - } - - feature_generator = FeatureGenerator( - db_engine=test_engine, - features_schema_name="features", - materialize_subquery_fromobjs=False - ) - - with patch("triage.component.architect.feature_generators.FromObj") as fromobj_mock: - feature_generator.aggregations([aggregate_config], "2016-01-01", "states") - assert not fromobj_mock.called - - -def test_aggregations_materialize_on(test_engine): - aggregate_config = { - "prefix": "aprefix", - "categoricals": [ - { - "column": "cat_one", - "choices": ["good", "bad"], - "metrics": ["sum"], - "imputation": {"all": {"type": "null_category"}}, - } - ], - "groups": ["entity_id", "zip_code"], - "intervals": ["all"], - "knowledge_date_column": "knowledge_date", - "from_obj": "data", - } - - feature_generator = FeatureGenerator( - db_engine=test_engine, - features_schema_name="features", - ) - - with patch("triage.component.architect.feature_generators.FromObj") as fromobj_mock: - feature_generator.aggregations([aggregate_config], "2016-01-01", "states") - fromobj_mock.assert_called_once_with( - from_obj="data", - knowledge_date_column="knowledge_date", - name="features.aprefix" - ) - - -def test_transaction_error(test_engine): - """Database connections are cleaned up regardless of in-transaction - query errors. 
- - """ - aggregate_config = [ - { - "prefix": "aprefix", - "aggregates": [ - { - "quantity": "quantity_one", - "metrics": ["sum"], - "imputation": { - "sum": {"type": "constant", "value": 137}, - "count": {"type": "zero"}, - }, - } - ], - "groups": ["entity_id"], - "intervals": ["all"], - "knowledge_date_column": "knowledge_date", - "from_obj": "data", - } - ] - - feature_generator = FeatureGenerator( - db_engine=test_engine, - features_schema_name="features", - ) - - with pytest.raises(sqlalchemy.exc.ProgrammingError): - feature_generator.create_all_tables( - feature_dates=["2013-09-30", "2014-09-30"], - feature_aggregation_config=aggregate_config, - state_table="statez", # WRONG! - ) - - ((query_count,),) = test_engine.execute( - t("""\ - select count(1) from pg_stat_activity - where datname = :datname and - query not ilike '%%pg_stat_activity%%' - """), - datname=test_engine.url.database, - ) - - assert query_count == 0 - - -class TestValidations: - - @pytest.fixture - def base_config(self): - return { - "prefix": "aprefix", - "categoricals": [ - { - "column": "cat_one", - "choices": ["good", "bad"], - "metrics": ["sum"], - "imputation": {"all": {"type": "null_category"}}, - } - ], - "groups": ["entity_id", "zip_code"], - "intervals": ["all"], - "knowledge_date_column": "knowledge_date", - "from_obj": "data", - } - - @pytest.fixture - def feature_generator(self, test_engine): - return FeatureGenerator(test_engine, "features") - - def test_correct_keys(self, base_config, feature_generator): - feature_generator.validate([base_config]) - - with pytest.raises(ValueError): - no_group = copy.deepcopy(base_config) - del no_group["groups"] - feature_generator.validate([no_group]) - - with pytest.raises(ValueError): - no_intervals = copy.deepcopy(base_config) - del no_intervals["intervals"] - feature_generator.validate([no_intervals]) - - with pytest.raises(ValueError): - no_kdate = copy.deepcopy(base_config) - del no_kdate["knowledge_date_column"] - feature_generator.validate([no_kdate]) - - with pytest.raises(ValueError): - no_from_obj = copy.deepcopy(base_config) - del no_from_obj["from_obj"] - feature_generator.validate([no_from_obj]) - - with pytest.raises(ValueError): - no_aggs = copy.deepcopy(base_config) - del no_aggs["categoricals"] - feature_generator.validate([no_aggs]) - - with pytest.raises(ValueError): - no_imps = copy.deepcopy(base_config) - del no_imps["categoricals"][0]["imputation"] - feature_generator.validate([no_imps]) - - def test_bad_from_obj(self, base_config, feature_generator): - bad_from_obj = copy.deepcopy(base_config) - bad_from_obj["from_obj"] = "where thing is other_thing" - with pytest.raises(ValueError): - feature_generator.validate([bad_from_obj]) - - def test_bad_interval(self, base_config, feature_generator): - base_config["intervals"] = ["1y", "1fortnight"] - with pytest.raises(ValueError): - feature_generator.validate([base_config]) - - def test_bad_group(self, base_config, feature_generator): - base_config["groups"] = ["zip_code", "otherthing"] - with pytest.raises(ValueError): - feature_generator.validate([base_config]) - - def test_bad_choice_query(self, base_config, feature_generator): - del base_config["categoricals"][0]["choices"] - base_config["categoricals"][0][ - "choice_query" - ] = "select distinct cat_two from data" - with pytest.raises(ValueError): - feature_generator.validate([base_config]) - - def test_wrong_imp_fcn(self, base_config, feature_generator): - del base_config["categoricals"][0]["imputation"]["all"] - 
base_config["categoricals"][0]["imputation"]["max"] = { - "type": "null_category" - } - with pytest.raises(ValueError): - feature_generator.validate([base_config]) - - def test_bad_imp_rule(self, base_config, feature_generator): - base_config["categoricals"][0]["imputation"]["all"] = { - "type": "bad_rule_doesnt_exist" - } - with pytest.raises(ValueError): - feature_generator.validate([base_config]) - - def test_no_imp_rule_type(self, base_config, feature_generator): - base_config["categoricals"][0]["imputation"]["all"] = {"value": "good"} - with pytest.raises(ValueError): - feature_generator.validate([base_config]) - - def test_missing_imp_arg(self, base_config, feature_generator): - # constant value imputation requires a 'value' parameter - base_config["categoricals"][0]["imputation"]["all"] = {"type": "constant"} - with pytest.raises(ValueError): - feature_generator.validate([base_config]) diff --git a/src/tests/architect_tests/test_integration.py b/src/tests/architect_tests/test_integration.py index 174d0427f..c43fe2eeb 100644 --- a/src/tests/architect_tests/test_integration.py +++ b/src/tests/architect_tests/test_integration.py @@ -10,11 +10,11 @@ from triage.component.results_schema import Base from triage.component.timechop import Timechop from triage.component.architect.features import ( - FeatureGenerator, - FeatureDictionaryCreator, FeatureGroupCreator, FeatureGroupMixer, + FeatureDictionary, ) +from triage.component.architect.feature_block_generators import feature_blocks_from_config from triage.component.architect.label_generators import LabelGenerator from triage.component.architect.entity_date_table_generators import EntityDateTableGenerator from triage.component.architect.planner import Planner @@ -170,14 +170,6 @@ def basic_integration_test( db_engine=db_engine, query=sample_config()["label_config"]["query"] ) - feature_generator = FeatureGenerator( - db_engine=db_engine, features_schema_name="features", replace=True - ) - - feature_dictionary_creator = FeatureDictionaryCreator( - db_engine=db_engine, features_schema_name="features" - ) - feature_group_creator = FeatureGroupCreator(feature_group_create_rules) feature_group_mixer = FeatureGroupMixer(feature_group_mix_rules) @@ -227,12 +219,10 @@ def basic_integration_test( label_timespans=["6months"], ) - # create feature table tasks - # we would use FeatureGenerator#create_all_tables but want to use - # the tasks dict directly to create a feature dict - aggregations = feature_generator.aggregations( - feature_aggregation_config=[ - { + feature_blocks = feature_blocks_from_config( + { + 'cat': { + "feature_generator_type": "spacetime_aggregation", "prefix": "cat", "from_obj": "cat_complaints", "knowledge_date_column": "as_of_date", @@ -246,7 +236,8 @@ def basic_integration_test( "intervals": ["1y"], "groups": ["entity_id"], }, - { + 'dog': { + "feature_generator_type": "spacetime_aggregation", "prefix": "dog", "from_obj": "dog_complaints", "knowledge_date_column": "as_of_date", @@ -257,34 +248,23 @@ def basic_integration_test( }, "aggregates": [ {"quantity": "dog_sightings", "metrics": ["count", "avg"]} + ], "intervals": ["1y"], "groups": ["entity_id"], }, - ], - feature_dates=all_as_of_times, - state_table=entity_date_table_generator.entity_date_table_name, - ) - feature_table_agg_tasks = feature_generator.generate_all_table_tasks( - aggregations, task_type="aggregation" - ) - - # create feature aggregation tables - feature_generator.process_table_tasks(feature_table_agg_tasks) - - feature_table_imp_tasks = 
feature_generator.generate_all_table_tasks( - aggregations, task_type="imputation" + }, + as_of_dates=all_as_of_times, + cohort_table=entity_date_table_generator.entity_date_table_name, + db_engine=db_engine, + features_schema_name='features', ) - # create feature imputation tables - feature_generator.process_table_tasks(feature_table_imp_tasks) + for feature_block in feature_blocks: + feature_block.run_preimputation() + feature_block.run_imputation() - # build feature dictionaries from feature tables and - # subsetting config - master_feature_dict = feature_dictionary_creator.feature_dictionary( - feature_table_names=feature_table_imp_tasks.keys(), - index_column_lookup=feature_generator.index_column_lookup(aggregations), - ) + master_feature_dict = FeatureDictionary(feature_blocks) feature_dicts = feature_group_mixer.generate( feature_group_creator.subsets(master_feature_dict) diff --git a/src/tests/collate_tests/test_collate.py b/src/tests/collate_tests/test_collate.py index a4585f20a..eb69c599b 100755 --- a/src/tests/collate_tests/test_collate.py +++ b/src/tests/collate_tests/test_collate.py @@ -4,7 +4,9 @@ Unit tests for `collate` module. """ -from triage.component.collate import Aggregate, Aggregation, Categorical +import testing.postgresql +import sqlalchemy +from triage.component.collate import Aggregate, Categorical def test_aggregate(): agg = Aggregate("*", "count", {}) @@ -116,50 +118,6 @@ def test_aggregate_format_kwargs(): ) == ["min('2012-01-01' - date)"] -def test_aggregation_table_name_no_schema(): - # no schema - assert ( - Aggregation( - [], from_obj="source", groups=[], state_table="tbl" - ).get_table_name() - == '"source_aggregation"' - ) - assert ( - Aggregation([], from_obj="source", groups=[], state_table="tbl").get_table_name( - imputed=True - ) - == '"source_aggregation_imputed"' - ) - - # prefix - assert ( - Aggregation( - [], from_obj="source", prefix="mysource", groups=[], state_table="tbl" - ).get_table_name() - == '"mysource_aggregation"' - ) - assert ( - Aggregation( - [], from_obj="source", prefix="mysource", groups=[], state_table="tbl" - ).get_table_name(imputed=True) - == '"mysource_aggregation_imputed"' - ) - - # schema - assert ( - Aggregation( - [], from_obj="source", schema="schema", groups=[], state_table="tbl" - ).get_table_name() - == '"schema"."source_aggregation"' - ) - assert ( - Aggregation( - [], from_obj="source", schema="schema", groups=[], state_table="tbl" - ).get_table_name(imputed=True) - == '"schema"."source_aggregation_imputed"' - ) - - def test_distinct(): assert list(map(str, Aggregate("distinct x", "count", {}).get_columns())) == [ "count(distinct x)" diff --git a/src/tests/collate_tests/test_imputation_output.py b/src/tests/collate_tests/test_imputation_output.py index dece1c782..a2449d200 100644 --- a/src/tests/collate_tests/test_imputation_output.py +++ b/src/tests/collate_tests/test_imputation_output.py @@ -120,8 +120,9 @@ def test_imputation_output(feat_list, exp_imp_cols, feat_table): feat_sql = "\n".join( [", prefix_entity_id_1y_%s_max int" % f for f in feat_list] ) + engine.execute( - """create table prefix_aggregation ( + """create table myfeatures_aggregation ( entity_id int , as_of_date date %s @@ -129,7 +130,7 @@ def test_imputation_output(feat_list, exp_imp_cols, feat_table): % feat_sql ) ins_sql = ( - "insert into prefix_aggregation values (%s, %s" + "insert into myfeatures_aggregation values (%s, %s" + (", %s" * len(feat_list)) + ")" ) @@ -157,43 +158,25 @@ def test_imputation_output(feat_list, exp_imp_cols, 
feat_table): ] st = SpacetimeAggregation( aggregates=aggs, + db_engine=engine, + features_table_name="myfeatures", from_obj="prefix_events", prefix="prefix", groups=["entity_id"], intervals=["1y"], - dates=["2016-01-01", "2016-02-03", "2016-03-14"], - state_table="states", - state_group="entity_id", + as_of_dates=["2016-01-01", "2016-02-03", "2016-03-14"], + cohort_table="states", + entity_column="entity_id", date_column="as_of_date", - input_min_date="2000-01-01", + feature_start_time="2000-01-01", output_date_column="as_of_date", + drop_interim_tables=False, ) - conn = engine.connect() - - trans = conn.begin() - - # excute query to find columns with null values and create lists of columns - # that do and do not need imputation when creating the imputation table - res = conn.execute(st.find_nulls()) - null_counts = list(zip(res.keys(), res.fetchone())) - impute_cols = [col for col, val in null_counts if val > 0] - nonimpute_cols = [col for col, val in null_counts if val == 0] - - # sql to drop and create the imputation table - drop_imp = st.get_drop(imputed=True) - create_imp = st.get_impute_create( - impute_cols=impute_cols, nonimpute_cols=nonimpute_cols - ) - - # create the imputation table - conn.execute(drop_imp) - conn.execute(create_imp) - - trans.commit() + st.run_imputation() # check the results - df = pd.read_sql("SELECT * FROM prefix_aggregation_imputed", engine) + df = pd.read_sql("SELECT * FROM myfeatures", engine) # we should have a record for every entity/date combo assert df.shape[0] == len(states_table) diff --git a/src/tests/collate_tests/test_integration.py b/src/tests/collate_tests/test_integration.py deleted file mode 100755 index fd089d944..000000000 --- a/src/tests/collate_tests/test_integration.py +++ /dev/null @@ -1,118 +0,0 @@ -# -*- coding: utf-8 -*- -"""Integration tests for `collate` module.""" -import testing.postgresql -from sqlalchemy import create_engine -from sqlalchemy.sql import expression as ex - -from triage.component.collate import Aggregation, Aggregate -from triage.component.collate.spacetime import SpacetimeAggregation - -from . 
import initialize_db - - -IMPUTE_RULES = { - "coltype": "aggregate", - "count": {"type": "mean"}, - "mode": {"type": "mean"}, -} - -Postgresql = testing.postgresql.PostgresqlFactory( - cache_initialized_db=True, on_initialized=initialize_db.handler -) - - -def teardown_module(): - Postgresql.clear_cache() - - -def test_engine(): - with Postgresql() as postgresql: - engine = create_engine(postgresql.url()) - ((result,),) = engine.execute("SELECT COUNT(*) FROM food_inspections") - assert result == 966 - - -def test_st_explicit_execute(): - agg = Aggregate({"F": "results='Fail'"}, ["count"], IMPUTE_RULES) - mode = Aggregate("", "mode", IMPUTE_RULES, order="zip") - st = SpacetimeAggregation( - [agg, agg + agg, mode], - from_obj=ex.table("food_inspections"), - groups={"license": ex.column("license_no"), "zip": ex.column("zip")}, - intervals={"license": ["1 year", "2 years", "all"], "zip": ["1 year"]}, - dates=["2016-08-30", "2015-11-06"], - state_table="inspection_states", - state_group="license_no", - date_column="inspection_date", - prefix="food_inspections", - ) - with Postgresql() as postgresql: - engine = create_engine(postgresql.url()) - st.execute(engine.connect()) - - -def test_st_lazy_execute(): - agg = Aggregate("results='Fail'", ["count"], IMPUTE_RULES) - st = SpacetimeAggregation( - [agg], - from_obj="food_inspections", - groups=["license_no", "zip"], - intervals={"license_no": ["1 year", "2 years", "all"], "zip": ["1 year"]}, - dates=["2016-08-30", "2015-11-06"], - state_table="inspection_states", - state_group="license_no", - date_column='"inspection_date"', - ) - with Postgresql() as postgresql: - engine = create_engine(postgresql.url()) - st.execute(engine.connect()) - - -def test_st_execute_broadcast_intervals(): - agg = Aggregate("results='Fail'", ["count"], IMPUTE_RULES) - st = SpacetimeAggregation( - [agg], - from_obj="food_inspections", - groups=["license_no", "zip"], - intervals=["1 year", "2 years", "all"], - dates=["2016-08-30", "2015-11-06"], - state_table="inspection_states", - state_group="license_no", - date_column='"inspection_date"', - ) - with Postgresql() as postgresql: - engine = create_engine(postgresql.url()) - st.execute(engine.connect()) - - -def test_execute(): - agg = Aggregate("results='Fail'", ["count"], IMPUTE_RULES) - st = Aggregation( - [agg], - from_obj="food_inspections", - groups=["license_no", "zip"], - state_table="all_licenses", - state_group="license_no", - ) - with Postgresql() as postgresql: - engine = create_engine(postgresql.url()) - st.execute(engine.connect()) - - -def test_execute_schema_output_date_column(): - agg = Aggregate("results='Fail'", ["count"], IMPUTE_RULES) - st = SpacetimeAggregation( - [agg], - from_obj="food_inspections", - groups=["license_no", "zip"], - intervals={"license_no": ["1 year", "2 years", "all"], "zip": ["1 year"]}, - dates=["2016-08-30", "2015-11-06"], - state_table="inspection_states_diff_colname", - state_group="license_no", - schema="agg", - date_column='"inspection_date"', - output_date_column="aggregation_date", - ) - with Postgresql() as postgresql: - engine = create_engine(postgresql.url()) - st.execute(engine.connect()) diff --git a/src/tests/collate_tests/test_spacetime.py b/src/tests/collate_tests/test_spacetime.py index 141127bc0..7080c468d 100755 --- a/src/tests/collate_tests/test_spacetime.py +++ b/src/tests/collate_tests/test_spacetime.py @@ -67,19 +67,23 @@ def test_basic_spacetime(): st = SpacetimeAggregation( aggregates=[agg], from_obj="events", + prefix="events", groups=["entity_id"], 
intervals=["1y", "2y", "all"], - dates=["2016-01-01", "2015-01-01"], - state_table="states", - state_group="entity_id", + as_of_dates=["2016-01-01", "2015-01-01"], + features_schema_name="schema", + features_table_name="myfeaturetable", + cohort_table="states", + entity_column="entity_id", date_column="event_date", output_date_column="as_of_date", + db_engine=engine, + drop_interim_tables=False, ) - - st.execute(engine.connect()) - + engine.execute(st.get_create_schema()) + st.run_preimputation() r = engine.execute( - "select * from events_entity_id order by entity_id, as_of_date" + "select * from schema.myfeaturetable_entity_id order by entity_id, as_of_date" ) rows = [x for x in r] assert rows[0]["entity_id"] == 1 @@ -144,9 +148,10 @@ def test_basic_spacetime(): assert rows[6]["events_entity_id_all_outcome::int_avg"] == 0 assert len(rows) == 7 + st.run_imputation() # check some imputation results r = engine.execute( - "select * from events_aggregation_imputed order by entity_id, as_of_date" + "select * from schema.myfeaturetable order by entity_id, as_of_date" ) rows = [x for x in r] assert rows[6]["entity_id"] == 4 @@ -192,8 +197,26 @@ def test_basic_spacetime(): assert rows[7]["events_entity_id_all_outcome::int_stddev_imp"] == 1 assert len(rows) == 8 - -def test_input_min_date(): + assert st.feature_columns == { + "events_entity_id_1y_outcome::int_sum", + "events_entity_id_1y_outcome::int_avg", + "events_entity_id_1y_outcome::int_stddev", + "events_entity_id_1y_outcome::int_imp", + "events_entity_id_1y_outcome::int_stddev_imp", + "events_entity_id_2y_outcome::int_sum", + "events_entity_id_2y_outcome::int_avg", + "events_entity_id_2y_outcome::int_stddev", + "events_entity_id_2y_outcome::int_imp", + "events_entity_id_2y_outcome::int_stddev_imp", + "events_entity_id_all_outcome::int_sum", + "events_entity_id_all_outcome::int_avg", + "events_entity_id_all_outcome::int_stddev", + "events_entity_id_all_outcome::int_imp", + "events_entity_id_all_outcome::int_stddev_imp", + } + + +def test_feature_start_time(): with testing.postgresql.Postgresql() as psql: engine = sqlalchemy.create_engine(psql.url()) engine.execute("create table events (entity_id int, date date, outcome bool)") @@ -217,18 +240,22 @@ def test_input_min_date(): st = SpacetimeAggregation( aggregates=[agg], from_obj="events", + prefix="events", groups=["entity_id"], intervals=["all"], - dates=["2016-01-01"], - state_table="states", - state_group="entity_id", + as_of_dates=["2016-01-01"], + features_table_name="event_features", + cohort_table="states", + entity_column="entity_id", date_column='"date"', - input_min_date="2015-11-10", + feature_start_time="2015-11-10", + db_engine=engine, + drop_interim_tables=False, ) - st.execute(engine.connect()) + st.run_preimputation() - r = engine.execute("select * from events_entity_id order by entity_id") + r = engine.execute("select * from event_features_entity_id order by entity_id") rows = [x for x in r] assert rows[0]["entity_id"] == 1 @@ -245,21 +272,22 @@ def test_input_min_date(): st = SpacetimeAggregation( aggregates=[agg], from_obj="events", + prefix="events", groups=["entity_id"], intervals=["1y", "all"], - dates=["2016-01-01", "2015-01-01"], - state_table="states", - state_group="entity_id", + as_of_dates=["2016-01-01", "2015-01-01"], + features_table_name="event_features", + cohort_table="states", + entity_column="entity_id", date_column='"date"', - input_min_date="2014-11-10", + feature_start_time="2014-11-10", + db_engine=engine ) with pytest.raises(ValueError): 
st.validate(engine.connect()) - with pytest.raises(ValueError): - st.execute(engine.connect()) -def test_join_with_cohort_table(db_engine): +def test_features_ignore_cohort(db_engine): # if we specify joining with the cohort table # only entity_id/date pairs in the cohort table should show up db_engine.execute("create table events (entity_id int, date date, outcome bool)") @@ -278,7 +306,7 @@ def test_join_with_cohort_table(db_engine): for state in smaller_cohort: db_engine.execute("insert into cohort values (%s, %s)", state) - # create our test aggregation with the important 'join_with_cohort_table' flag + # create our test aggregation with the important 'features_ignore_cohort' flag agg = Aggregate( "outcome::int", ["sum", "avg"], @@ -292,18 +320,22 @@ def test_join_with_cohort_table(db_engine): st = SpacetimeAggregation( aggregates=[agg], from_obj="events", + prefix="events", groups=["entity_id"], intervals=["all"], - dates=["2016-01-01", "2015-01-01"], - state_table="cohort", - state_group="entity_id", + as_of_dates=["2016-01-01", "2015-01-01"], + features_table_name="event_features", + cohort_table="cohort", + entity_column="entity_id", date_column='"date"', - join_with_cohort_table=True, + features_ignore_cohort=False, + db_engine=db_engine, + drop_interim_tables=False, ) - st.execute(db_engine.connect()) + st.run_preimputation() - r = db_engine.execute("select * from events_entity_id order by entity_id, date") + r = db_engine.execute("select * from event_features_entity_id order by entity_id, date") rows = [x for x in r] # these rows should be similar to the rows in the basic spacetime test, @@ -327,3 +359,71 @@ def test_join_with_cohort_table(db_engine): assert rows[3]["date"] == date(2016, 1, 1) assert rows[3]["events_entity_id_all_outcome::int_sum"] == 1 assert rows[3]["events_entity_id_all_outcome::int_avg"] == 0.5 + + +def test_aggregation_table_name_no_schema(): + # no schema + assert ( + SpacetimeAggregation( + [], from_obj="source", groups=[], cohort_table="tbl", features_table_name="source_features", db_engine=None, as_of_dates=[], + ).get_table_name() + == '"source_features_aggregation"' + ) + assert ( + SpacetimeAggregation([], from_obj="source", groups=[], cohort_table="tbl", features_table_name="source_features", db_engine=None, as_of_dates=[]).get_table_name( + imputed=True + ) + == '"source_features"' + ) + + # prefix + assert ( + SpacetimeAggregation( + [], from_obj="source", prefix="mysource", groups=[], cohort_table="tbl", features_table_name="source_features", db_engine=None, as_of_dates=[], + ).get_table_name() + == '"source_features_aggregation"' + ) + assert ( + SpacetimeAggregation( + [], from_obj="source", prefix="mysource", groups=[], cohort_table="tbl", features_table_name="source_features", db_engine=None, as_of_dates=[], + ).get_table_name(imputed=True) + == '"source_features"' + ) + + # schema + assert ( + SpacetimeAggregation( + [], from_obj="source", features_schema_name="schema", groups=[], cohort_table="tbl", features_table_name="source_features", db_engine=None, as_of_dates=[], + ).get_table_name() + == '"schema"."source_features_aggregation"' + ) + assert ( + SpacetimeAggregation( + [], from_obj="source", features_schema_name="schema", groups=[], cohort_table="tbl", features_table_name="source_features", db_engine=None, as_of_dates=[], + ).get_table_name(imputed=True) + == '"schema"."source_features"' + ) + + +def test_get_feature_columns(): + with testing.postgresql.Postgresql() as psql: + db_engine = sqlalchemy.create_engine(psql.url()) + n = 
Aggregate("x", "sum", {}) + d = Aggregate("1", "count", {}) + m = Aggregate("y", "avg", {}) + assert SpacetimeAggregation( + aggregates=[n, d, m], + from_obj="source", + features_schema_name="schema", + features_table_name="source_features", + prefix="prefix", + groups=["entity_id"], + cohort_table="tbl", + db_engine=db_engine, + as_of_dates=[], + ).feature_columns == set([ + "prefix_entity_id_all_x_sum", + "prefix_entity_id_all_1_count", + "prefix_entity_id_all_y_avg" + ]) + diff --git a/src/tests/conftest.py b/src/tests/conftest.py index a0a0f007b..613a5c822 100644 --- a/src/tests/conftest.py +++ b/src/tests/conftest.py @@ -130,7 +130,7 @@ def crosstabs_config(): using (model_id, as_of_date)""", "features_query": """ select m.model_id, f1.* - from features.entity_features_aggregation_imputed f1 join + from features.entity_features f1 join models_dates_join_query m using (as_of_date)""", "predictions_query": """ select model_id, diff --git a/src/tests/test_cli.py b/src/tests/test_cli.py index 497059381..fd2453a1f 100644 --- a/src/tests/test_cli.py +++ b/src/tests/test_cli.py @@ -51,7 +51,7 @@ def test_cli_crosstabs(): def test_featuretest(): - with patch('triage.cli.FeatureGenerator', autospec=True) as featuremock: + with patch('triage.cli.feature_blocks_from_config', autospec=True) as featuremock: with patch('triage.cli.EntityDateTableGenerator', autospec=True) as cohortmock: try_command('featuretest', 'example/config/experiment.yaml', '2017-06-06') featuremock.assert_called_once() diff --git a/src/tests/test_experiments.py b/src/tests/test_experiments.py index 879de3d8d..25d0dfc78 100644 --- a/src/tests/test_experiments.py +++ b/src/tests/test_experiments.py @@ -435,8 +435,8 @@ def test_baselines_with_missing_features(experiment_class): } config["feature_group_definition"] = { "tables": [ - "entity_features_aggregation_imputed", - "zip_code_features_aggregation_imputed", + "entity_features", + "zip_code_features", ] } config["feature_group_strategies"] = ["leave-one-in"] diff --git a/src/tests/test_partial_experiments.py b/src/tests/test_partial_experiments.py index 116c021fd..bfce1ff73 100644 --- a/src/tests/test_partial_experiments.py +++ b/src/tests/test_partial_experiments.py @@ -104,7 +104,7 @@ def test_validate_strict(self): class PreimputationFeatures(TestCase): config = { "temporal_config": sample_config()["temporal_config"], - "feature_aggregations": sample_config()["feature_aggregations"], + "features": sample_config()["features"], "config_version": sample_config()["config_version"], "random_seed": sample_config()["random_seed"], } @@ -120,7 +120,7 @@ def test_run(self): if "_aggregation" in table ] - assert len(generated_tables) == len(sample_config()["feature_aggregations"]) + assert len(generated_tables) == len(sample_config()["features"]) for table in generated_tables: table_should_have_data(table, experiment.db_engine) @@ -137,7 +137,7 @@ def test_validate_strict(self): class PostimputationFeatures(TestCase): config = { "temporal_config": sample_config()["temporal_config"], - "feature_aggregations": sample_config()["feature_aggregations"], + "features": sample_config()["features"], "cohort_config": sample_config()["cohort_config"], "config_version": sample_config()["config_version"], "random_seed": sample_config()["random_seed"], @@ -146,17 +146,8 @@ class PostimputationFeatures(TestCase): def test_run(self): with prepare_experiment(self.config) as experiment: experiment.run() - generated_tables = [ - table - for table in schema_tables( - experiment.features_schema_name, 
experiment.db_engine - ).keys() - if "_aggregation_imputed" in table - ] - - assert len(generated_tables) == len(sample_config()["feature_aggregations"]) - for table in generated_tables: - table_should_have_data(table, experiment.db_engine) + for feature_table_name in self.config['features'].keys(): + table_should_have_data("{}.{}".format(experiment.features_schema_name, feature_table_name), experiment.db_engine) def test_validate_nonstrict(self): with prepare_experiment(self.config) as experiment: @@ -171,7 +162,7 @@ def test_validate_strict(self): class Matrices(TestCase): config = { "temporal_config": sample_config()["temporal_config"], - "feature_aggregations": sample_config()["feature_aggregations"], + "features": sample_config()["features"], "cohort_config": sample_config()["cohort_config"], "label_config": sample_config()["label_config"], "config_version": sample_config()["config_version"], diff --git a/src/tests/test_utils_db.py b/src/tests/test_utils_db.py new file mode 100644 index 000000000..c9976d3d9 --- /dev/null +++ b/src/tests/test_utils_db.py @@ -0,0 +1,23 @@ +from triage.util.db import run_statements +import pytest +import sqlalchemy +from sqlalchemy import text as t + + +def test_run_statements(db_engine): + """Test that database connections are cleaned up regardless of in-transaction + query errors. + """ + with pytest.raises(sqlalchemy.exc.ProgrammingError): + run_statements(['insert into blah'], db_engine) + + ((query_count,),) = db_engine.execute( + t("""\ + select count(1) from pg_stat_activity + where datname = :datname and + query not ilike '%%pg_stat_activity%%' + """), + datname=db_engine.url.database, + ) + + assert query_count == 0 diff --git a/src/tests/utils.py b/src/tests/utils.py index 07372034d..9f27f6f78 100644 --- a/src/tests/utils.py +++ b/src/tests/utils.py @@ -356,9 +356,9 @@ def sample_config(): } } - feature_config = [ - { - "prefix": "entity_features", + feature_config = { + "entity_features": { + "feature_generator_type": "spacetime_aggregation", "from_obj": "cat_complaints", "knowledge_date_column": "as_of_date", "aggregates_imputation": {"all": {"type": "constant", "value": 0}}, @@ -366,8 +366,8 @@ def sample_config(): "intervals": ["1year"], "groups": ["entity_id"], }, - { - "prefix": "zip_code_features", + "zip_code_features": { + "feature_generator_type": "spacetime_aggregation", "from_obj": "entity_zip_codes join zip_code_events using (zip_code)", "knowledge_date_column": "as_of_date", "aggregates_imputation": {"all": {"type": "constant", "value": 0}}, @@ -375,7 +375,7 @@ def sample_config(): "intervals": ["1year"], "groups": ["entity_id", "zip_code"], }, - ] + } cohort_config = { "query": "select distinct(entity_id) from events where '{as_of_date}'::date >= outcome_date", @@ -403,7 +403,7 @@ def sample_config(): "entity_column_name": "entity_id", "model_comment": "test2-final-final", "model_group_keys": ["label_name", "label_type", "custom_key"], - "feature_aggregations": feature_config, + "features": feature_config, "cohort_config": cohort_config, "temporal_config": temporal_config, "grid_config": grid_config, diff --git a/src/triage/cli.py b/src/triage/cli.py index 800d5c344..070dfb3f2 100755 --- a/src/triage/cli.py +++ b/src/triage/cli.py @@ -10,7 +10,7 @@ from argcmdr import RootCommand, Command, main, cmdmethod from sqlalchemy.engine.url import URL -from triage.component.architect.feature_generators import FeatureGenerator +from triage.component.architect.feature_block_generators import feature_blocks_from_config from 
triage.component.architect.entity_date_table_generators import EntityDateTableGenerator from triage.component.audition import AuditionRunner from triage.component.results_schema import upgrade_db, stamp_db, REVISION_MAPPING @@ -111,13 +111,13 @@ def configversion(self, args): @Triage.register class FeatureTest(Command): - """Test a feature aggregation by running it for one date""" + """Test features by running them for one date""" def __init__(self, parser): parser.add_argument( - "feature_config_file", + "experiment_config_file", type=argparse.FileType("r"), - help="Feature config YAML file, containing a list of feature_aggregation objects", + help="Experiment config YAML file, containing at least one feature configuration block. Cohort config will be used if present to filter the results. Any other keys will be ignored.", ) parser.add_argument( "as_of_date", @@ -128,8 +128,8 @@ def __init__(self, parser): def __call__(self, args): self.root.setup() # Loading configuration (if exists) db_engine = create_engine(self.root.db_url) - full_config = yaml.load(args.feature_config_file) - feature_config = full_config['feature_aggregations'] + full_config = yaml.load(args.experiment_config_file) + feature_config = full_config['features'] cohort_config = full_config.get('cohort_config', None) if cohort_config: EntityDateTableGenerator( @@ -139,11 +139,18 @@ def __call__(self, args): replace=True ).generate_entity_date_table(as_of_dates=[args.as_of_date]) - FeatureGenerator(db_engine, "features_test").create_features_before_imputation( - feature_aggregation_config=feature_config, - feature_dates=[args.as_of_date], - state_table="features_test.test_cohort" + feature_blocks = feature_blocks_from_config( + feature_config, + as_of_dates=[args.as_of_date], + cohort_table="features_test.test_cohort" if cohort_config else None, + db_engine=db_engine, + features_schema_name="features_test", + materialize_subquery_fromobjs=False, + features_ignore_cohort=bool(cohort_config), ) + for feature_block in feature_blocks: + feature_block.run_preimputation(verbose=True) + logging.info( "Features created for feature_config %s and date %s", feature_config, diff --git a/src/triage/component/architect/README.md b/src/triage/component/architect/README.md index f268b9936..72ef47a40 100644 --- a/src/triage/component/architect/README.md +++ b/src/triage/component/architect/README.md @@ -13,7 +13,7 @@ The Architect addresses these issues with functionality aimed at all tasks betwe ## Components - [LabelGenerator](architect/label_generators.py): Create binary labels suitable for a design matrix by querying a database table containing outcome events. -- [FeatureGenerator](architect/feature_generators.py): Create aggregate features suitable for a design matrix from a set of database tables containing events. Uses [collate](https://github.com/dssg/collate/) to build aggregation SQL queries. +- [FeatureBlockGenerator](architect/feature_block_generators.py): Create features suitable for a design matrix from a set of database tables containing events. Uses [collate](https://github.com/dssg/collate/) to build aggregation SQL queries. - [FeatureGroupCreator](architect/feature_group_creator.py), [FeatureGroupMixer](architect/feature_group_mixer.py): Create groupings of features, and mix them using different strategies (like 'leave one out') to test their effectiveness. 
- [Planner](architect/planner.py), [Builder](architect/builders.py): Build all design matrices needed for an experiment, taking into account different labels, state configurations, and feature groups. diff --git a/src/triage/component/architect/builders.py b/src/triage/component/architect/builders.py index c1d75fb12..b5c8d325c 100644 --- a/src/triage/component/architect/builders.py +++ b/src/triage/component/architect/builders.py @@ -451,7 +451,7 @@ def load_features_data( table=entity_date_table_name, ), # collate imputation shouldn't leave any nulls and we double-check - # the imputed table in FeatureGenerator.create_all_tables() but as + # the imputed table in FeatureBlock.run_imputation() but as # a final check, raise a divide by zero error on export if the # database encounters any during the outer join right_column_selections=[', "{0}"'.format(fn) for fn in feature_names], diff --git a/src/triage/component/architect/database_reflection.py b/src/triage/component/architect/database_reflection.py index f56697379..7d993f94d 100644 --- a/src/triage/component/architect/database_reflection.py +++ b/src/triage/component/architect/database_reflection.py @@ -98,6 +98,20 @@ def table_has_column(table_name, column, db_engine): return column in reflected_table(table_name, db_engine).columns +def table_columns(table_name, db_engine): + """Retrieve a list of columns. + + The table is expected to exist. + + Args: + table_name (string) A table name (with schema) + db_engine (sqlalchemy.engine) + + Returns: (list) Every column currently in the table + """ + return reflected_table(table_name, db_engine).columns + + def column_type(table_name, column, db_engine): """Find the database type of the given column in the given table diff --git a/src/triage/component/architect/feature_block.py b/src/triage/component/architect/feature_block.py new file mode 100644 index 000000000..1e2cd0b02 --- /dev/null +++ b/src/triage/component/architect/feature_block.py @@ -0,0 +1,234 @@ +from abc import ABC, abstractmethod + +import logging +import sqlparse +from triage.database_reflection import table_exists, table_columns +from triage.util.db import run_statements + + +class FeatureBlock(ABC): + def __init__( + self, + db_engine, + cohort_table, + as_of_dates, + features_table_name, + features_schema_name=None, + feature_start_time=None, + features_ignore_cohort=False, + ): + self.db_engine = db_engine + self.cohort_table_name = cohort_table + self.as_of_dates = as_of_dates + self.features_table_name_without_schema = features_table_name + self.features_schema_name = features_schema_name + self.feature_start_time = feature_start_time + self.features_ignore_cohort = features_ignore_cohort + + @property + def final_feature_table_name(self): + "The name of the final table with all features filled in (no missing values)" + schema = '"%s".' % self.features_schema_name if self.features_schema_name else "" + name = f'"{self.features_table_name_without_schema}"' + return "%s%s" % (schema, name) + + @property + @abstractmethod + def feature_columns(self): + """ + The list of feature columns in the final, postimputation table + + Should exclude any index columns (e.g. entity id, date) + """ + pass + + @property + @abstractmethod + def preinsert_queries(self): + """ + Return all queries that should be run before inserting any data. + + Returns a list of queries/executable statements + """ + pass + + @property + @abstractmethod + def insert_queries(self): + """ + Return all inserts to populate this data. 
Each query in this list should be parallelizable. + + Returns a list of queries/executable statements + """ + pass + + @property + @abstractmethod + def postinsert_queries(self): + """ + Return all queries that should be run after inserting all data + + Returns a list of queries/executable statements + """ + pass + + @property + @abstractmethod + def imputation_queries(self): + """ + Return all queries that should be run to fill in missing data with imputed values. + + Returns a list of queries/executable statements + """ + pass + + def preprocess(self): + """Perform any needed preprocessing.""" + pass + + def _cohort_table_sub(self): + """Helper function to ensure we only include state table records + in our set of input dates and after the feature_start_time. + """ + datestr = ", ".join(f"'{dt}'::date" for dt in self.as_of_dates) + mindtstr = ( + f" AND as_of_date >= '{self.feature_start_time}'::date" + if self.feature_start_time is not None + else "" + ) + return f"""( + SELECT * + FROM {self.cohort_table_name} + WHERE as_of_date IN ({datestr}) + {mindtstr})""" + + def verify_no_nulls(self): + """ + Verify that there are no nulls remaining in the imputed table + + Should raise an error if there are any. + """ + + query_template = """ + SELECT {cols} + FROM {state_tbl} t1 + LEFT JOIN {aggs_tbl} t2 USING(entity_id, as_of_date) + """ + cols_sql = ",\n".join( + f"""SUM(CASE WHEN "{column.name}" IS NULL THEN 1 ELSE 0 END) AS "{column.name}" """ + for column in table_columns(self.final_feature_table_name, self.db_engine) + ) + + results = self.db_engine.execute(query_template.format( + cols=cols_sql, + state_tbl=self._cohort_table_sub(), + aggs_tbl=self.final_feature_table_name, + )) + null_counts = results.first().items() + nullcols = [col for (col, val) in null_counts if val > 0] + + if len(nullcols) > 0: + raise ValueError( + "Imputation failed for {} columns. 
Null values remain in: {}".format( + len(nullcols), nullcols + ) + ) + + def needs_features(self): + imputed_table = self.final_feature_table_name + + if table_exists(imputed_table, self.db_engine): + check_query = ( + f"select 1 from {self.cohort_table_name} " + f"left join {imputed_table} " + "using (entity_id, as_of_date) " + f"where {imputed_table}.entity_id is null limit 1" + ) + if self.db_engine.execute(check_query).scalar(): + logging.warning( + "Imputed feature table %s did not contain rows from the " + "entire cohort, need to rebuild features", imputed_table) + return True + else: + logging.warning("Imputed feature table %s did not exist, " + "need to build features", imputed_table) + return True + logging.warning("Imputed feature table %s looks good, " + "skipping feature building!", imputed_table) + return False + + def generate_preimpute_tasks(self, replace): + if not replace and not self.needs_features(): + return {} + return { + "prepare": self.preinsert_queries, + "inserts": self.insert_queries, + "finalize": self.postinsert_queries + } + + def generate_impute_tasks(self, replace): + if not replace and not self.needs_features(): + return {} + return { + "prepare": self.imputation_queries, + "inserts": [], + "finalize": [] + } + + def process_table_task(self, task, verbose=False): + if verbose: + self.log_verbose_task_info(task) + run_statements(task.get("prepare", []), self.db_engine) + run_statements(task.get("inserts", []), self.db_engine) + run_statements(task.get("finalize", []), self.db_engine) + + def run_preimputation(self, verbose=False): + self.process_table_task(self.generate_preimpute_tasks(replace=True), verbose=verbose) + + def run_imputation(self, verbose=False): + self.process_table_task(self.generate_impute_tasks(replace=True), verbose=verbose) + self.verify_no_nulls() + + def log_verbose_task_info(self, task): + prepares = task.get("prepare", []) + inserts = task.get("inserts", []) + finalize = task.get("finalize", []) + logging.info("------------------") + logging.info( + "%s prepare queries, %s insert queries, %s finalize queries", + len(prepares), + len(inserts), + len(finalize), + ) + logging.info("------------------") + logging.info("") + logging.info("------------------") + logging.info("PREPARATION QUERIES") + logging.info("------------------") + for query_num, query in enumerate(prepares, 1): + logging.info("") + logging.info( + "prepare query %s: %s", + query_num, + sqlparse.format(str(query), reindent=True), + ) + logging.info("------------------") + logging.info("INSERT QUERIES") + logging.info("------------------") + for query_num, query in enumerate(inserts, 1): + logging.info("") + logging.info( + "insert query %s: %s", + query_num, + sqlparse.format(str(query), reindent=True), + ) + logging.info("------------------") + logging.info("FINALIZE QUERIES") + logging.info("------------------") + for query_num, query in enumerate(finalize, 1): + logging.info("") + logging.info( + "finalize query %s: %s", + query_num, + sqlparse.format(str(query), reindent=True), + ) diff --git a/src/triage/component/architect/feature_block_generators.py b/src/triage/component/architect/feature_block_generators.py new file mode 100644 index 000000000..c117c8ff9 --- /dev/null +++ b/src/triage/component/architect/feature_block_generators.py @@ -0,0 +1,271 @@ +import copy +import logging + +from triage.component.collate import ( + Aggregate, + Categorical, + Compare, + SpacetimeAggregation, +) + + +def generate_spacetime_aggregation( + feature_aggregation_config, + 
feature_table_name, + as_of_dates, + cohort_table, + db_engine, + features_schema_name, + feature_start_time=None, + materialize_subquery_fromobjs=True, + features_ignore_cohort=False, +): + """Creates collate.SpacetimeAggregations from the given arguments + + Args: + feature_aggregation_config (list) all values, except for feature + date, necessary to instantiate a collate.SpacetimeAggregation + feature_table_name (string) the table in which to put output features + as_of_dates (list) dates to generate features as of + cohort_table (string) schema.table_name for state table with all entity/date pairs + db_engine (sqlalchemy.db.engine) + features_schema_name (string) Name of schema where feature + tables should be written to + feature_start_time (string/datetime, optional) point in time before which + should not be included in features + materialize_subquery_fromobjs (boolean, optional) Whether or not to inspect from_obj + values and create persistent tables out of ones that look like subqueries, for the + purposes of making runs on many as-of-dates faster + features_ignore_cohort (boolean, optional) Whether or not features should be built + independently of the cohort. Takes longer but means that features can be reused + for different cohorts. + + Returns: (list) collate.SpacetimeAggregations + """ + if not cohort_table: + logging.warning("No cohort table passed. Imputation will not be possible.") + features_ignore_cohort = True + + return SpacetimeAggregationGenerator( + db_engine=db_engine, + features_schema_name=features_schema_name, + feature_start_time=feature_start_time, + materialize_subquery_fromobjs=materialize_subquery_fromobjs, + features_ignore_cohort=features_ignore_cohort, + ).aggregation( + feature_aggregation_config, + as_of_dates, + cohort_table, + feature_table_name + ) + + +class SpacetimeAggregationGenerator(object): + def __init__( + self, + db_engine, + features_schema_name, + feature_start_time=None, + materialize_subquery_fromobjs=True, + features_ignore_cohort=False, + ): + """Generates aggregate features using collate + + Args: + db_engine (sqlalchemy.db.engine) + features_schema_name (string) Name of schema where feature + tables should be written to + feature_start_time (string/datetime, optional) point in time before which + should not be included in features + materialize_subquery_fromobjs (boolean, optional) Whether or not to inspect from_obj + values and create persistent tables out of ones that look like subqueries, for the + purposes of making runs on many as-of-dates faster + features_ignore_cohort (boolean, optional) Whether or not features should be built + independently of the cohort. Takes longer but means that features can be reused + for different cohorts. 
+ """ + self.db_engine = db_engine + self.features_schema_name = features_schema_name + self.categorical_cache = {} + self.feature_start_time = feature_start_time + self.materialize_subquery_fromobjs = materialize_subquery_fromobjs + self.features_ignore_cohort = features_ignore_cohort + self.entity_id_column = "entity_id" + self.from_objs = {} + + def _compute_choices(self, choice_query): + if choice_query not in self.categorical_cache: + with self.db_engine.begin() as conn: + self.categorical_cache[choice_query] = [ + row[0] for row in conn.execute(choice_query) + ] + + logging.info( + "Computed list of categoricals: %s for choice query: %s", + self.categorical_cache[choice_query], + choice_query, + ) + + return self.categorical_cache[choice_query] + + def _build_choices(self, categorical): + logging.info( + "Building categorical choices for column %s, metrics %s", + categorical["column"], + categorical["metrics"], + ) + if "choices" in categorical: + logging.info("Found list of configured choices: %s", categorical["choices"]) + return categorical["choices"] + else: + return self._compute_choices(categorical["choice_query"]) + + def _build_categoricals(self, categorical_config, impute_rules): + # TODO: only include null flag where necessary + return [ + Categorical( + col=categorical["column"], + choices=self._build_choices(categorical), + function=categorical["metrics"], + impute_rules=dict( + impute_rules, + coltype="categorical", + **categorical.get("imputation", {}) + ), + include_null=True, + coltype=categorical.get('coltype', None), + ) + for categorical in categorical_config + ] + + def _build_array_categoricals(self, categorical_config, impute_rules): + # TODO: only include null flag where necessary + return [ + Compare( + col=categorical["column"], + op="@>", + choices={ + choice: "array['{}'::varchar]".format(choice) + for choice in self._build_choices(categorical) + }, + function=categorical["metrics"], + impute_rules=dict( + impute_rules, + coltype="array_categorical", + **categorical.get("imputation", {}) + ), + op_in_name=False, + quote_choices=False, + include_null=True, + coltype=categorical.get('coltype', None) + ) + for categorical in categorical_config + ] + + def aggregation(self, aggregation_config, feature_dates, state_table, feature_table_name): + logging.info( + "Building collate.SpacetimeAggregation for config %s and %s as_of_dates", + aggregation_config, + len(feature_dates), + ) + + # read top-level imputation rules from the aggregation config; we'll allow + # these to be overridden by imputation rules at the individual feature + # level as those get parsed as well + agimp = aggregation_config.get("aggregates_imputation", {}) + catimp = aggregation_config.get("categoricals_imputation", {}) + arrcatimp = aggregation_config.get("array_categoricals_imputation", {}) + + aggregates = [ + Aggregate( + aggregate["quantity"], + aggregate["metrics"], + dict(agimp, coltype="aggregate", **aggregate.get("imputation", {})), + coltype=aggregate.get('coltype', None) + ) + for aggregate in aggregation_config.get("aggregates", []) + ] + logging.info("Found %s quantity aggregates", len(aggregates)) + categoricals = self._build_categoricals( + aggregation_config.get("categoricals", []), catimp + ) + logging.info("Found %s categorical aggregates", len(categoricals)) + array_categoricals = self._build_array_categoricals( + aggregation_config.get("array_categoricals", []), arrcatimp + ) + logging.info("Found %s array categorical aggregates", len(array_categoricals)) + return 
SpacetimeAggregation( + aggregates + categoricals + array_categoricals, + from_obj=aggregation_config["from_obj"], + intervals=aggregation_config["intervals"], + groups=aggregation_config["groups"], + as_of_dates=feature_dates, + cohort_table=state_table, + entity_column=self.entity_id_column, + date_column=aggregation_config["knowledge_date_column"], + output_date_column="as_of_date", + db_engine=self.db_engine, + feature_start_time=self.feature_start_time, + features_schema_name=self.features_schema_name, + features_table_name=feature_table_name, + features_ignore_cohort=self.features_ignore_cohort + ) + + +FEATURE_BLOCK_GENERATOR_LOOKUP = { + 'spacetime_aggregation': generate_spacetime_aggregation +} + + +def feature_blocks_from_config( + config, + as_of_dates, + cohort_table, + db_engine, + features_schema_name, + feature_start_time=None, + features_ignore_cohort=False, + **kwargs +): + """ + Create a list of feature blocks from a block of configuration + Args: + config (dict) feature config, consisting of: + a key corresponding to a known feature generator (in FEATURE_BLOCK_GENERATOR_LOOKUP) + a value corresponding to any config needed for that feature generator + as_of_dates (list) dates to generate features as of + cohort_table (string) schema.table_name for cohort table with all entity/date pairs + db_engine (sqlalchemy.db.engine) + features_schema_name (string) Name of schema where feature + tables should be written to + feature_start_time (string/datetime, optional) point in time before which + should not be included in features + features_ignore_cohort (boolean, optional) Whether or not features should be built + independently of the cohort. Takes longer but means that features can be reused + for different cohorts. + + Returns: (list) of FeatureBlock objects + """ + feature_blocks = [] + for feature_table_name, feature_block_configuration in config.items(): + config_to_pass = copy.deepcopy(feature_block_configuration) + feature_generator_type = config_to_pass.pop("feature_generator_type") + feature_block_generator = FEATURE_BLOCK_GENERATOR_LOOKUP.get(feature_generator_type, None) + if not feature_block_generator: + raise ValueError(f"feature generator type {feature_generator_type} does not correspond to a recognized" + " feature generator. Recognized feature generator types:" + f"{FEATURE_BLOCK_GENERATOR_LOOKUP.keys()}") + + feature_block = feature_block_generator( + config_to_pass, + feature_table_name=feature_table_name, + as_of_dates=as_of_dates, + cohort_table=cohort_table, + db_engine=db_engine, + features_schema_name=features_schema_name, + feature_start_time=feature_start_time, + features_ignore_cohort=features_ignore_cohort, + **kwargs + ) + feature_blocks.append(feature_block) + return feature_blocks diff --git a/src/triage/component/architect/feature_dictionary.py b/src/triage/component/architect/feature_dictionary.py new file mode 100644 index 000000000..ea2eea6a5 --- /dev/null +++ b/src/triage/component/architect/feature_dictionary.py @@ -0,0 +1,34 @@ +from triage.component.architect.utils import remove_schema_from_table_name +from triage.util.structs import FeatureNameList +from collections.abc import Iterable + + +class FeatureDictionary(dict): + """A feature dictionary, consisting of table names as keys and column names as values + + If a list of feature_blocks is passed, will initialize the feature dictionary with their data. 
+ """ + def __init__(self, feature_blocks=None, *args, **kwargs): + super().__init__(*args, **kwargs) + for feature_block in feature_blocks: + cleaned_table = remove_schema_from_table_name( + feature_block.final_feature_table_name + ) + self[cleaned_table] = feature_block.feature_columns + + def __setitem__(self, table, feature_names): + if not isinstance(table, str): + raise TypeError("key of FeatureDictionary objects represents a table " + "name and must be a string") + if not isinstance(feature_names, Iterable): + raise TypeError("value of FeatureDictionary objects represents a list of " + "feature names, and therefore must be iterable") + + for feature_name in feature_names: + if not isinstance(feature_name, str): + raise TypeError("A FeatureNameList represents a list of feature names, and therefore" + f"each item must be a string, not: {feature_name!r}") + if isinstance(feature_names, FeatureNameList): + super().__setitem__(table, feature_names) + else: + super().__setitem__(table, FeatureNameList(feature_names)) diff --git a/src/triage/component/architect/feature_dictionary_creator.py b/src/triage/component/architect/feature_dictionary_creator.py deleted file mode 100644 index 4d0082001..000000000 --- a/src/triage/component/architect/feature_dictionary_creator.py +++ /dev/null @@ -1,69 +0,0 @@ -import logging -from triage.component.architect.utils import str_in_sql -from triage.util.structs import FeatureNameList - - -class FeatureDictionaryCreator(object): - def __init__(self, features_schema_name, db_engine): - self.features_schema_name = features_schema_name - self.db_engine = db_engine - - def _tables_to_include(self, feature_table_names): - return [ - feature_table - for feature_table in feature_table_names - if "aggregation_imputed" in feature_table - ] - - def feature_dictionary(self, feature_table_names, index_column_lookup): - """ Create a dictionary of feature names, where keys are feature tables - and values are lists of feature names. - - :return: feature_dictionary - :rtype: dict - """ - feature_dictionary = {} - - # iterate! store each table name + features names as key-value pair - for feature_table_name in self._tables_to_include(feature_table_names): - feature_names = [ - row[0] - for row in self.db_engine.execute( - self._build_feature_names_query( - feature_table_name, index_column_lookup[feature_table_name] - ) - ) - ] - feature_dictionary[feature_table_name] = FeatureNameList(feature_names) - logging.info("Feature dictionary built: %s", feature_dictionary) - return feature_dictionary - - def _build_feature_names_query(self, table_name, index_columns): - """ For a given feature table, get the names of the feature columns. 
- - :param table_name: name of the feature table - :type table_name: str - - :return: names of the feature columns in given table - :rtype: list - """ - # format the query that gets column names, - # excluding indices from result - feature_names_query = """ - SELECT column_name - FROM information_schema.columns - WHERE table_name = '{table}' AND - table_schema = '{schema}' AND - column_name NOT IN ({index_columns}) - """.format( - table=table_name, - schema=self.features_schema_name, - index_columns=str_in_sql(index_columns), - ) - logging.info( - "Extracting all possible feature names for table %s with query %s", - table_name, - feature_names_query, - ) - - return feature_names_query diff --git a/src/triage/component/architect/feature_generators.py b/src/triage/component/architect/feature_generators.py deleted file mode 100644 index d7c386d42..000000000 --- a/src/triage/component/architect/feature_generators.py +++ /dev/null @@ -1,715 +0,0 @@ -import logging -from collections import OrderedDict - -import sqlalchemy -import sqlparse - -from triage.util.conf import convert_str_to_relativedelta -from triage.database_reflection import table_exists - -from triage.component.collate import ( - Aggregate, - Categorical, - Compare, - SpacetimeAggregation, - FromObj -) - - -class FeatureGenerator(object): - def __init__( - self, - db_engine, - features_schema_name, - replace=True, - feature_start_time=None, - materialize_subquery_fromobjs=True, - features_ignore_cohort=False, - ): - """Generates aggregate features using collate - - Args: - db_engine (sqlalchemy.db.engine) - features_schema_name (string) Name of schema where feature - tables should be written to - replace (boolean, optional) Whether or not existing features - should be replaced - feature_start_time (string/datetime, optional) point in time before which - should not be included in features - features_ignore_cohort (boolean, optional) Whether or not features should be built - independently of the cohort. Takes longer but means that features can be reused - for different cohorts. - """ - self.db_engine = db_engine - self.features_schema_name = features_schema_name - self.categorical_cache = {} - self.replace = replace - self.feature_start_time = feature_start_time - self.materialize_subquery_fromobjs = materialize_subquery_fromobjs - self.features_ignore_cohort = features_ignore_cohort - self.entity_id_column = "entity_id" - self.from_objs = {} - - def _validate_keys(self, aggregation_config): - for key in [ - "from_obj", - "intervals", - "groups", - "knowledge_date_column", - "prefix", - ]: - if key not in aggregation_config: - raise ValueError( - "{} required as key: aggregation config: {}".format( - key, aggregation_config - ) - ) - - def _validate_aggregates(self, aggregation_config): - if ( - "aggregates" not in aggregation_config - and "categoricals" not in aggregation_config - and "array_categoricals" not in aggregation_config - ): - raise ValueError( - "Need either aggregates, categoricals, or array_categoricals" - + " in {}".format(aggregation_config) - ) - - def _validate_categoricals(self, categoricals): - for categorical in categoricals: - if "choice_query" in categorical: - logging.info("Validating choice query") - - try: - with self.db_engine.begin() as conn: - conn.execute("explain {}".format(categorical["choice_query"])) - except Exception as exc: - raise ValueError( - "choice query does not run. 
\n" - 'choice query: "{}"\n' - "Full error: {}".format(categorical["choice_query"], exc) - ) - - def _validate_from_obj(self, from_obj): - logging.info("Validating from_obj") - try: - with self.db_engine.begin() as conn: - conn.execute("explain select * from {}".format(from_obj)) - except Exception as exc: - raise ValueError( - "from_obj query does not run. \n" - 'from_obj: "{}"\n' - "Full error: {}".format(from_obj, exc) - ) - - def _validate_time_intervals(self, intervals): - logging.info("Validating time intervals") - for interval in intervals: - if interval != "all": - convert_str_to_relativedelta(interval) - - def _validate_groups(self, groups): - if "entity_id" not in groups: - raise ValueError( - "One of the aggregation groups is required to be entity_id" - ) - - def _validate_imputation_rule(self, aggregate_type, impute_rule): - """Validate the imputation rule for a given aggregation type.""" - # dictionary of imputation type : required parameters - valid_imputations = { - "all": { - "mean": [], - "constant": ["value"], - "zero": [], - "zero_noflag": [], - "error": [], - }, - "aggregates": {"binary_mode": []}, - "categoricals": {"null_category": []}, - } - valid_imputations["array_categoricals"] = valid_imputations["categoricals"] - - # the valid imputation rules for the specific aggregation type being checked - valid_types = dict( - valid_imputations["all"], **valid_imputations[aggregate_type] - ) - - # no imputation rule was specified - if "type" not in impute_rule.keys(): - raise ValueError("Imputation type must be specified") - - # a rule was specified, but not valid for this type of aggregate - if impute_rule["type"] not in valid_types.keys(): - raise ValueError( - "Invalid imputation type %s for %s" - % (impute_rule["type"], aggregate_type) - ) - - # check that all required parameters exist in the keys of the imputation rule - required_params = valid_types[impute_rule["type"]] - for param in required_params: - if param not in impute_rule.keys(): - raise ValueError( - "Missing param %s for %s" % (param, impute_rule["type"]) - ) - - def _validate_imputations(self, aggregation_config): - """Validate the imputation rules in an aggregation config, looping - through all three types of aggregates. Most of the work here is - done by _validate_imputation_rule() to check the requirements of - each imputation rule found - """ - agg_types = ["aggregates", "categoricals", "array_categoricals"] - - for agg_type in agg_types: - # base_imp are the top-level rules, `such as aggregates_imputation` - base_imp = aggregation_config.get(agg_type + "_imputation", {}) - - # loop through the individual aggregates - for agg in aggregation_config.get(agg_type, []): - # combine any aggregate-level imputation rules with top-level ones - imp_dict = dict(base_imp, **agg.get("imputation", {})) - - # imputation rules are metric-specific, so check each metric's rule - for metric in agg["metrics"]: - # metric rules may be defined by the metric name (e.g., 'max') - # or with the 'all' catch-all, with named metrics taking - # precedence. If we fall back to {}, the rule validator will - # error out on no metric found. 
- impute_rule = imp_dict.get(metric, imp_dict.get("all", {})) - self._validate_imputation_rule(agg_type, impute_rule) - - def _validate_aggregation(self, aggregation_config): - logging.info("Validating aggregation config %s", aggregation_config) - self._validate_keys(aggregation_config) - self._validate_aggregates(aggregation_config) - self._validate_categoricals(aggregation_config.get("categoricals", [])) - self._validate_from_obj(aggregation_config["from_obj"]) - self._validate_time_intervals(aggregation_config["intervals"]) - self._validate_groups(aggregation_config["groups"]) - self._validate_imputations(aggregation_config) - - def validate(self, feature_aggregation_config): - """Validate a feature aggregation config applied to this object - - The validations range from basic type checks, key presence checks, - as well as validating the sql in from objects. - - Args: - feature_aggregation_config (list) all values, except for feature - date, necessary to instantiate a collate.SpacetimeAggregation - - Raises: ValueError if any part of the config is found to be invalid - """ - for aggregation in feature_aggregation_config: - self._validate_aggregation(aggregation) - - def _compute_choices(self, choice_query): - if choice_query not in self.categorical_cache: - with self.db_engine.begin() as conn: - self.categorical_cache[choice_query] = [ - row[0] for row in conn.execute(choice_query) - ] - - logging.info( - "Computed list of categoricals: %s for choice query: %s", - self.categorical_cache[choice_query], - choice_query, - ) - - return self.categorical_cache[choice_query] - - def _build_choices(self, categorical): - logging.info( - "Building categorical choices for column %s, metrics %s", - categorical["column"], - categorical["metrics"], - ) - if "choices" in categorical: - logging.info("Found list of configured choices: %s", categorical["choices"]) - return categorical["choices"] - else: - return self._compute_choices(categorical["choice_query"]) - - def _build_categoricals(self, categorical_config, impute_rules): - # TODO: only include null flag where necessary - return [ - Categorical( - col=categorical["column"], - choices=self._build_choices(categorical), - function=categorical["metrics"], - impute_rules=dict( - impute_rules, - coltype="categorical", - **categorical.get("imputation", {}) - ), - include_null=True, - coltype=categorical.get('coltype', None), - ) - for categorical in categorical_config - ] - - def _build_array_categoricals(self, categorical_config, impute_rules): - # TODO: only include null flag where necessary - return [ - Compare( - col=categorical["column"], - op="@>", - choices={ - choice: "array['{}'::varchar]".format(choice) - for choice in self._build_choices(categorical) - }, - function=categorical["metrics"], - impute_rules=dict( - impute_rules, - coltype="array_categorical", - **categorical.get("imputation", {}) - ), - op_in_name=False, - quote_choices=False, - include_null=True, - coltype=categorical.get('coltype', None) - ) - for categorical in categorical_config - ] - - def _aggregation(self, aggregation_config, feature_dates, state_table): - logging.info( - "Building collate.SpacetimeAggregation for config %s and %s as_of_dates", - aggregation_config, - len(feature_dates), - ) - - # read top-level imputation rules from the aggregation config; we'll allow - # these to be overridden by imputation rules at the individual feature - # level as those get parsed as well - agimp = aggregation_config.get("aggregates_imputation", {}) - catimp = 
aggregation_config.get("categoricals_imputation", {}) - arrcatimp = aggregation_config.get("array_categoricals_imputation", {}) - - aggregates = [ - Aggregate( - aggregate["quantity"], - aggregate["metrics"], - dict(agimp, coltype="aggregate", **aggregate.get("imputation", {})), - coltype=aggregate.get('coltype', None) - ) - for aggregate in aggregation_config.get("aggregates", []) - ] - logging.info("Found %s quantity aggregates", len(aggregates)) - categoricals = self._build_categoricals( - aggregation_config.get("categoricals", []), catimp - ) - logging.info("Found %s categorical aggregates", len(categoricals)) - array_categoricals = self._build_array_categoricals( - aggregation_config.get("array_categoricals", []), arrcatimp - ) - logging.info("Found %s array categorical aggregates", len(array_categoricals)) - return SpacetimeAggregation( - aggregates + categoricals + array_categoricals, - from_obj=aggregation_config["from_obj"], - intervals=aggregation_config["intervals"], - groups=aggregation_config["groups"], - dates=feature_dates, - state_table=state_table, - state_group=self.entity_id_column, - date_column=aggregation_config["knowledge_date_column"], - output_date_column="as_of_date", - input_min_date=self.feature_start_time, - schema=self.features_schema_name, - prefix=aggregation_config["prefix"], - join_with_cohort_table=not self.features_ignore_cohort - ) - - def aggregations(self, feature_aggregation_config, feature_dates, state_table): - """Creates collate.SpacetimeAggregations from the given arguments - - Args: - feature_aggregation_config (list) all values, except for feature - date, necessary to instantiate a collate.SpacetimeAggregation - feature_dates (list) dates to generate features as of - state_table (string) schema.table_name for state table with all entity/date pairs - - Returns: (list) collate.SpacetimeAggregations - """ - return [ - self.preprocess_aggregation( - self._aggregation(aggregation_config, feature_dates, state_table) - ) - for aggregation_config in feature_aggregation_config - ] - - def preprocess_aggregation(self, aggregation): - create_schema = aggregation.get_create_schema() - - if create_schema is not None: - with self.db_engine.begin() as conn: - conn.execute(create_schema) - - if self.materialize_subquery_fromobjs: - # materialize from obj - from_obj = FromObj( - from_obj=aggregation.from_obj.text, - name=f"{aggregation.schema}.{aggregation.prefix}", - knowledge_date_column=aggregation.date_column - ) - from_obj.maybe_materialize(self.db_engine) - aggregation.from_obj = from_obj.table - return aggregation - - def generate_all_table_tasks(self, aggregations, task_type): - """Generates SQL commands for creating, populating, and indexing - feature group tables - - Args: - aggregations (list) collate.SpacetimeAggregation objects - type (str) either 'aggregation' or 'imputation' - - Returns: (dict) keys are group table names, values are themselves dicts, - each with keys for different stages of table creation (prepare, inserts, finalize) - and with values being lists of SQL commands - """ - - logging.debug("---------------------") - - # pick the method to use for generating tasks depending on whether we're - # building the aggregations or imputations - if task_type == "aggregation": - task_generator = self._generate_agg_table_tasks_for - logging.debug("---------FEATURE GENERATION------------") - elif task_type == "imputation": - task_generator = self._generate_imp_table_tasks_for - logging.debug("---------FEATURE IMPUTATION------------") - else: - 
raise ValueError("Table task type must be aggregation or imputation") - - logging.debug("---------------------") - - table_tasks = OrderedDict() - for aggregation in aggregations: - table_tasks.update(task_generator(aggregation)) - logging.info("Created %s tables", len(table_tasks.keys())) - return table_tasks - - def create_features_before_imputation( - self, feature_aggregation_config, feature_dates, state_table=None - ): - """Create features before imputation for a set of dates""" - all_tasks = self.generate_all_table_tasks( - self.aggregations( - feature_aggregation_config, feature_dates, state_table=state_table - ), - task_type="aggregation", - ) - logging.info("Generated a total of %s table tasks", len(all_tasks)) - for task_num, task in enumerate(all_tasks.values(), 1): - prepares = task.get("prepare", []) - inserts = task.get("inserts", []) - finalize = task.get("finalize", []) - logging.info("------------------") - logging.info("TASK %s ", task_num) - logging.info( - "%s prepare queries, %s insert queries, %s finalize queries", - len(prepares), - len(inserts), - len(finalize), - ) - logging.info("------------------") - logging.info("") - logging.info("------------------") - logging.info("PREPARATION QUERIES") - logging.info("------------------") - for query_num, query in enumerate(prepares, 1): - logging.info("") - logging.info( - "prepare query %s: %s", - query_num, - sqlparse.format(str(query), reindent=True), - ) - logging.info("------------------") - logging.info("INSERT QUERIES") - logging.info("------------------") - for query_num, query in enumerate(inserts, 1): - logging.info("") - logging.info( - "insert query %s: %s", - query_num, - sqlparse.format(str(query), reindent=True), - ) - logging.info("------------------") - logging.info("FINALIZE QUERIES") - logging.info("------------------") - for query_num, query in enumerate(finalize, 1): - logging.info("") - logging.info( - "finalize query %s: %s", - query_num, - sqlparse.format(str(query), reindent=True), - ) - self.process_table_task(task) - - def create_all_tables(self, feature_aggregation_config, feature_dates, state_table): - """Create all feature tables. - - First builds the aggregation tables, and then performs - imputation on any null values, (requiring a two-step process to - determine which columns contain nulls after the initial - aggregation tables are built). 
- - Args: - feature_aggregation_config (list) all values, except for - feature date, necessary to instantiate a - `collate.SpacetimeAggregation` - feature_dates (list) dates to generate features as of - state_table (string) schema.table_name for state table with - all entity/date pairs - - Returns: (list) table names - - """ - aggs = self.aggregations(feature_aggregation_config, feature_dates, state_table) - - # first, generate and run table tasks for aggregations - table_tasks_aggregate = self.generate_all_table_tasks( - aggs, task_type="aggregation" - ) - self.process_table_tasks(table_tasks_aggregate) - - # second, perform the imputations (this will query the tables - # constructed above to identify features containing nulls) - table_tasks_impute = self.generate_all_table_tasks(aggs, task_type="imputation") - impute_keys = self.process_table_tasks(table_tasks_impute) - - # double-check that the imputation worked and no nulls remain - # in the data: - nullcols = [] - with self.db_engine.begin() as conn: - for agg in aggs: - results = conn.execute(agg.find_nulls(imputed=True)) - null_counts = results.first().items() - nullcols += [col for (col, val) in null_counts if val > 0] - - if len(nullcols) > 0: - raise ValueError( - "Imputation failed for {} columns. Null values remain in: {}".format( - len(nullcols), nullcols - ) - ) - - return impute_keys - - def process_table_task(self, task): - self.run_commands(task.get("prepare", [])) - self.run_commands(task.get("inserts", [])) - self.run_commands(task.get("finalize", [])) - - def process_table_tasks(self, table_tasks): - for table_name, task in table_tasks.items(): - logging.info("Running feature table queries for %s", table_name) - self.process_table_task(task) - return table_tasks.keys() - - def _explain_selects(self, aggregations): - with self.db_engine.begin() as conn: - for aggregation in aggregations: - for selectlist in aggregation.get_selects().values(): - for select in selectlist: - query = "explain " + str(select) - results = list(conn.execute(query)) - logging.debug(str(select)) - logging.debug(results) - - def _clean_table_name(self, table_name): - # remove the schema and quotes from the name - return table_name.split(".")[1].replace('"', "") - - def _table_exists(self, table_name): - try: - with self.db_engine.begin() as conn: - conn.execute( - "select 1 from {}.{} limit 1".format( - self.features_schema_name, table_name - ) - ).first() - except sqlalchemy.exc.ProgrammingError: - return False - else: - return True - - def run_commands(self, command_list): - with self.db_engine.begin() as conn: - for command in command_list: - logging.debug("Executing feature generation query: %s", command) - conn.execute(command) - - def _aggregation_index_query(self, aggregation, imputed=False): - return "CREATE INDEX ON {} ({}, {})".format( - aggregation.get_table_name(imputed=imputed), - self.entity_id_column, - aggregation.output_date_column, - ) - - def _aggregation_index_columns(self, aggregation): - return sorted( - [group for group in aggregation.groups.keys()] - + [aggregation.output_date_column] - ) - - def index_column_lookup(self, aggregations, imputed=True): - return dict( - ( - self._clean_table_name(aggregation.get_table_name(imputed=imputed)), - self._aggregation_index_columns(aggregation), - ) - for aggregation in aggregations - ) - - def _needs_features(self, aggregation): - imputed_table = self._clean_table_name( - aggregation.get_table_name(imputed=True) - ) - - if self._table_exists(imputed_table): - check_query = ( - 
f"select 1 from {aggregation.state_table} " - f"left join {self.features_schema_name}.{imputed_table} " - "using (entity_id, as_of_date) " - f"where {self.features_schema_name}.{imputed_table}.entity_id is null limit 1" - ) - if self.db_engine.execute(check_query).scalar(): - logging.warning( - "Imputed feature table %s did not contain rows from the " - "entire cohort, need to rebuild features", imputed_table) - return True - else: - logging.warning("Imputed feature table %s did not exist, " - "need to build features", imputed_table) - return True - logging.warning("Imputed feature table %s looks good, " - "skipping feature building!", imputed_table) - return False - - def _generate_agg_table_tasks_for(self, aggregation): - """Generates SQL commands for preparing, populating, and finalizing - each feature group table in the given aggregation - - Args: - aggregation (collate.SpacetimeAggregation) - - Returns: (dict) of structure { - 'prepare': list of commands to prepare table for population - 'inserts': list of commands to populate table - 'finalize': list of commands to finalize table after population - } - """ - creates = aggregation.get_creates() - drops = aggregation.get_drops() - indexes = aggregation.get_indexes() - inserts = aggregation.get_inserts() - table_tasks = OrderedDict() - for group in aggregation.groups: - group_table = self._clean_table_name( - aggregation.get_table_name(group=group) - ) - if self.replace or self._needs_features(aggregation): - table_tasks[group_table] = { - "prepare": [drops[group], creates[group]], - "inserts": inserts[group], - "finalize": [indexes[group]], - } - logging.info("Created table tasks for %s", group_table) - else: - logging.info("Skipping feature table creation for %s", group_table) - table_tasks[group_table] = {} - logging.info("Created table tasks for aggregation") - if self.replace or self._needs_features(aggregation): - table_tasks[self._clean_table_name(aggregation.get_table_name())] = { - "prepare": [aggregation.get_drop(), aggregation.get_create()], - "inserts": [], - "finalize": [self._aggregation_index_query(aggregation)], - } - else: - table_tasks[self._clean_table_name(aggregation.get_table_name())] = {} - - return table_tasks - - def _generate_imp_table_tasks_for(self, aggregation, drop_preagg=True): - """Generate SQL statements for preparing, populating, and - finalizing imputations, for each feature group table in the - given aggregation. - - Requires the existance of the underlying feature and aggregation - tables defined in `_generate_agg_table_tasks_for()`. 
- - Args: - aggregation (collate.SpacetimeAggregation) - drop_preagg: boolean to specify dropping pre-imputation - tables - - Returns: (dict) of structure { - 'prepare': list of commands to prepare table for population - 'inserts': list of commands to populate table - 'finalize': list of commands to finalize table after population - } - - """ - table_tasks = OrderedDict() - imp_tbl_name = self._clean_table_name(aggregation.get_table_name(imputed=True)) - - if not self.replace and not self._needs_features(aggregation): - logging.warning("Skipping imputation table creation for %s", imp_tbl_name) - table_tasks[imp_tbl_name] = {} - return table_tasks - - if not aggregation.state_table: - logging.warning( - "No state table defined in aggregation, cannot create imputation table for %s", - imp_tbl_name, - ) - table_tasks[imp_tbl_name] = {} - return table_tasks - - if not table_exists(aggregation.state_table, self.db_engine): - logging.warning( - "State table %s does not exist, cannot create imputation table for %s", - aggregation.state_table, - imp_tbl_name, - ) - table_tasks[imp_tbl_name] = {} - return table_tasks - - # excute query to find columns with null values and create lists of columns - # that do and do not need imputation when creating the imputation table - with self.db_engine.begin() as conn: - results = conn.execute(aggregation.find_nulls()) - null_counts = results.first().items() - impute_cols = [col for (col, val) in null_counts if val > 0] - nonimpute_cols = [col for (col, val) in null_counts if val == 0] - - # table tasks for imputed aggregation table, most of the work is done here - # by collate's get_impute_create() - table_tasks[imp_tbl_name] = { - "prepare": [ - aggregation.get_drop(imputed=True), - aggregation.get_impute_create( - impute_cols=impute_cols, nonimpute_cols=nonimpute_cols - ), - ], - "inserts": [], - "finalize": [self._aggregation_index_query(aggregation, imputed=True)], - } - logging.info("Created table tasks for imputation: %s", imp_tbl_name) - - # do some cleanup: - # drop the group-level and aggregation tables, just leaving the - # imputation table if drop_preagg=True - if drop_preagg: - drops = aggregation.get_drops() - table_tasks[imp_tbl_name]["finalize"] += list(drops.values()) + [ - aggregation.get_drop() - ] - logging.info("Added drop table cleanup tasks: %s", imp_tbl_name) - - return table_tasks diff --git a/src/triage/component/architect/features.py b/src/triage/component/architect/features.py index d297d9b4d..e843b83a7 100644 --- a/src/triage/component/architect/features.py +++ b/src/triage/component/architect/features.py @@ -1,13 +1,10 @@ -from triage.component.architect.feature_generators import FeatureGenerator -from triage.component.architect.feature_dictionary_creator import ( - FeatureDictionaryCreator, -) +from triage.component.architect.feature_dictionary import FeatureDictionary from triage.component.architect.feature_group_creator import FeatureGroupCreator from triage.component.architect.feature_group_mixer import FeatureGroupMixer + __all__ = ( - "FeatureGenerator", - "FeatureDictionaryCreator", + "FeatureDictionary", "FeatureGroupCreator", "FeatureGroupMixer", ) diff --git a/src/triage/component/architect/utils.py b/src/triage/component/architect/utils.py index b13498085..dccabd811 100644 --- a/src/triage/component/architect/utils.py +++ b/src/triage/component/architect/utils.py @@ -38,6 +38,11 @@ def feature_list(feature_dictionary): )) +def remove_schema_from_table_name(table_name): + # remove the schema and quotes from the name + 
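+    # note: this assumes a schema-qualified name like 'features."my_table"';
+    # a bare table name without a schema would raise an IndexError here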
return table_name.split(".")[1].replace('"', "") + + def convert_string_column_to_date(column): return [datetime.datetime.strptime(date, "%Y-%m-%d").date() for date in column] diff --git a/src/triage/component/collate/__init__.py b/src/triage/component/collate/__init__.py index 9bc977b84..33055b035 100644 --- a/src/triage/component/collate/__init__.py +++ b/src/triage/component/collate/__init__.py @@ -1,11 +1,10 @@ # -*- coding: utf-8 -*- -from .collate import available_imputations, Aggregation, Aggregate, Compare, Categorical +from .collate import Aggregate, Compare, Categorical from .from_obj import FromObj -from .spacetime import SpacetimeAggregation +from .spacetime import SpacetimeAggregation, available_imputations __all__ = [ "available_imputations", - "Aggregation", "Aggregate", "FromObj", "Compare", diff --git a/src/triage/component/collate/collate.py b/src/triage/component/collate/collate.py index 06f916c4b..e7d7b632d 100644 --- a/src/triage/component/collate/collate.py +++ b/src/triage/component/collate/collate.py @@ -1,31 +1,10 @@ # -*- coding: utf-8 -*- -import logging from numbers import Number -from itertools import product, chain +from itertools import product import sqlalchemy.sql.expression as ex import re -from descriptors import cachedproperty - -from .sql import make_sql_clause, to_sql_name, CreateTableAs, InsertFromSelect -from .imputations import ( - ImputeMean, - ImputeConstant, - ImputeZero, - ImputeZeroNoFlag, - ImputeNullCategory, - ImputeBinaryMode, - ImputeError, -) - -available_imputations = { - "mean": ImputeMean, - "constant": ImputeConstant, - "zero": ImputeZero, - "zero_noflag": ImputeZeroNoFlag, - "null_category": ImputeNullCategory, - "binary_mode": ImputeBinaryMode, - "error": ImputeError, -} + +from .sql import to_sql_name def make_list(a): @@ -435,412 +414,3 @@ def __init__( op_in_name=op_in_name, **kwargs ) - - -class Aggregation(object): - def __init__( - self, - aggregates, - groups, - from_obj, - state_table, - state_group=None, - prefix=None, - suffix=None, - schema=None, - ): - """ - Args: - aggregates: collection of Aggregate objects. - from_obj: defines the from clause, e.g. the name of the table. can use - groups: a list of expressions to group by in the aggregation or a dictionary - pairs group: expr pairs where group is the alias (used in column names) - state_table: schema.table to query for comprehensive set of state_group entities - regardless of what exists in the from_obj - state_group: the group level found in the state table (e.g., "entity_id") - prefix: prefix for aggregation tables and column names, defaults to from_obj - suffix: suffix for aggregation table, defaults to "aggregation" - schema: schema for aggregation tables - - The from_obj and group expressions are passed directly to the - SQLAlchemy Select object so could be anything supported there. - For details see: - http://docs.sqlalchemy.org/en/latest/core/selectable.html - - Aggregates will have {collate_date} in their quantities substituted with the date - of aggregation. 
- """ - self.aggregates = aggregates - self.from_obj = make_sql_clause(from_obj, ex.text) - self.groups = ( - groups if isinstance(groups, dict) else {str(g): g for g in groups} - ) - self.state_table = state_table - self.state_group = state_group if state_group else "entity_id" - self.prefix = prefix if prefix else str(from_obj) - self.suffix = suffix if suffix else "aggregation" - self.schema = schema - - @cachedproperty - def colname_aggregate_lookup(self): - """A reverse lookup from column name to the source collate.Aggregate - - Will error if the Aggregation contains duplicate column names - """ - lookup = {} - for group, groupby in self.groups.items(): - for agg in self.aggregates: - for col in agg.get_columns(prefix=self._col_prefix(group)): - if col.name in lookup: - raise ValueError("Duplicate feature column name found: ", col.name) - lookup[col.name] = agg - return lookup - - def _col_prefix(self, group): - """ - Helper for creating a column prefix for the group - group: group clause, for naming columns - Returns: string for a common column prefix for columns in that group - """ - return "{prefix}_{group}_".format(prefix=self.prefix, group=group) - - def _get_aggregates_sql(self, group): - """ - Helper for getting aggregates sql - Args: - group: group clause, for naming columns - Returns: collection of aggregate column SQL strings - """ - return chain(*[a.get_columns(prefix=self._col_prefix(group)) for a in self.aggregates]) - - def get_selects(self): - """ - Constructs select queries for this aggregation - - Returns: a dictionary of group : queries pairs where - group are the same keys as groups - queries is a list of Select queries, one for each date in dates - """ - queries = {} - - for group, groupby in self.groups.items(): - columns = [make_sql_clause(groupby, ex.text)] - columns += self._get_aggregates_sql(group) - - gb_clause = make_sql_clause(groupby, ex.literal_column) - query = ex.select(columns=columns, from_obj=make_sql_clause(self.from_obj, ex.text)).group_by( - gb_clause - ) - - queries[group] = [query] - - return queries - - def get_imputation_rules(self): - """ - Constructs a dictionary to lookup an imputation rule from an associated - column name. - - Returns: a dictionary of column : imputation_rule pairs - """ - imprules = {} - for group, groupby in self.groups.items(): - prefix = "{prefix}_{group}_".format(prefix=self.prefix, group=group) - for a in self.aggregates: - imprules.update(a.column_imputation_lookup(prefix=prefix)) - return imprules - - def get_table_name(self, group=None, imputed=False): - """ - Returns name for table for the given group - """ - if group is None and not imputed: - name = '"%s_%s"' % (self.prefix, self.suffix) - elif group is None and imputed: - name = '"%s_%s_%s"' % (self.prefix, self.suffix, "imputed") - elif imputed: - name = '"%s"' % to_sql_name("%s_%s_%s" % (self.prefix, group, "imputed")) - else: - name = '"%s"' % to_sql_name("%s_%s" % (self.prefix, group)) - schema = '"%s".' 
% self.schema if self.schema else "" - return "%s%s" % (schema, name) - - def get_creates(self): - """ - Construct create queries for this aggregation - Args: - selects: the dictionary of select queries to use - if None, use self.get_selects() - this allows you to customize select queries before creation - - Returns: - a dictionary of group : create pairs where - group are the same keys as groups - create is a CreateTableAs object - """ - return { - group: CreateTableAs(self.get_table_name(group), next(iter(sels)).limit(0)) - for group, sels in self.get_selects().items() - } - - def get_inserts(self): - """ - Construct insert queries from this aggregation - Args: - selects: the dictionary of select queries to use - if None, use self.get_selects() - this allows you to customize select queries before creation - - Returns: - a dictionary of group : inserts pairs where - group are the same keys as groups - inserts is a list of InsertFromSelect objects - """ - return { - group: [InsertFromSelect(self.get_table_name(group), sel) for sel in sels] - for group, sels in self.get_selects().items() - } - - def get_drops(self): - """ - Generate drop queries for this aggregation - - Returns: a dictionary of group : drop pairs where - group are the same keys as groups - drop is a raw drop table query for the corresponding table - """ - return { - group: "DROP TABLE IF EXISTS %s;" % self.get_table_name(group) - for group in self.groups - } - - def get_indexes(self): - """ - Generate create index queries for this aggregation - - Returns: a dictionary of group : index pairs where - group are the same keys as groups - index is a raw create index query for the corresponding table - """ - return { - group: "CREATE INDEX ON %s (%s);" % (self.get_table_name(group), groupby) - for group, groupby in self.groups.items() - } - - def get_join_table(self): - """ - Generate a query for a join table - """ - return ex.Select( - columns=[make_sql_clause(group, ex.column) for group in self.groups.values()], - from_obj=self.from_obj - ).group_by( - *self.groups.values() - ) - - def get_create(self, join_table=None): - """ - Generate a single aggregation table creation query by joining - together the results of get_creates() - Returns: a CREATE TABLE AS query - """ - if not join_table: - join_table = "(%s) t1" % self.get_join_table() - - query = "SELECT * FROM %s\n" % join_table - for group, groupby in self.groups.items(): - query += "LEFT JOIN %s USING (%s)" % (self.get_table_name(group), groupby) - - return "CREATE TABLE %s AS (%s);" % (self.get_table_name(), query) - - def get_drop(self, imputed=False): - """ - Generate a drop table statement for the aggregation table - Returns: string sql query - """ - return "DROP TABLE IF EXISTS %s" % self.get_table_name(imputed=imputed) - - def get_create_schema(self): - """ - Generate a create schema statement - """ - if self.schema is not None: - return "CREATE SCHEMA IF NOT EXISTS %s" % self.schema - - def find_nulls(self, imputed=False): - """ - Generate query to count number of nulls in each column in the aggregation table - - Returns: a SQL SELECT statement - """ - query_template = """ - SELECT {cols} - FROM {state_tbl} t1 - LEFT JOIN {aggs_tbl} t2 USING({group}) - """ - cols_sql = ",\n".join( - [ - """SUM(CASE WHEN "{col}" IS NULL THEN 1 ELSE 0 END) AS "{col}" """.format( - col=column - ) - for column in self.get_imputation_rules().keys() - ] - ) - - return query_template.format( - cols=cols_sql, - state_tbl=self.state_table, - aggs_tbl=self.get_table_name(imputed=imputed), - 
group=self.state_group, - ) - - def _get_impute_select(self, impute_cols, nonimpute_cols, partitionby=None): - - imprules = self.get_imputation_rules() - - # check if we're missing any columns relative to the full set and raise an - # exception if we are - missing_cols = set(imprules.keys()) - set(nonimpute_cols + impute_cols) - if len(missing_cols) > 0: - raise ValueError("Missing columns in get_impute_create: %s" % missing_cols) - - # key columns and date column - query = "" - - used_impflags = set() - # pre-sort and iterate through the combined set to ensure column order - for col in sorted(nonimpute_cols + impute_cols): - # just pass through columns that don't require imputation (no nulls found) - if col in nonimpute_cols: - query += '\n,"%s"' % col - - # for columns that do require imputation, include SQL to do the imputation work - # and a flag for whether the value was imputed - if col in impute_cols: - - # we don't want to add redundant imputation flags. for a given source - # column and time interval, all of the functions will have identical - # sets of rows that needed imputation - # to reliably merge these, we lookup the original aggregate that produced - # the function, and see its available functions. we expect exactly one of - # these functions to end the column name and remove it if so - # this is passed to the imputer - if hasattr(self.colname_aggregate_lookup[col], 'functions'): - agg_functions = self.colname_aggregate_lookup[col].functions - used_function = next(funcname for funcname in agg_functions if col.endswith(funcname)) - if used_function in AGGFUNCS_NEED_MULTIPLE_VALUES: - impflag_basecol = col - else: - impflag_basecol = col.rstrip('_' + used_function) - else: - logging.warning("Imputation flag merging is not implemented for " - "AggregateExpression objects that don't define an aggregate " - "function (e.g. composites)") - impflag_basecol = col - impute_rule = imprules[col] - - try: - imputer = available_imputations[impute_rule["type"]] - except KeyError as err: - raise ValueError( - "Invalid imputation type %s for column %s" - % (impute_rule.get("type", ""), col) - ) from err - - imputer = imputer(column=col, column_base_for_impflag=impflag_basecol, partitionby=partitionby, **impute_rule) - - query += "\n,%s" % imputer.to_sql() - if not imputer.noflag: - # Add an imputation flag for non-categorical columns (this is handeled - # for categorical columns with a separate NULL category) - # but only add it if another functionally equivalent impflag hasn't already been added - impflag_select, impflag_alias = imputer.imputed_flag_select_and_alias() - if impflag_alias not in used_impflags: - used_impflags.add(impflag_alias) - query += "\n,%s as \"%s\" " % (impflag_select, impflag_alias) - - return query - - def get_impute_create(self, impute_cols, nonimpute_cols): - """ - Generates the CREATE TABLE query for the aggregation table with imputation. 
- - Args: - impute_cols: a list of column names with null values - nonimpute_cols: a list of column names without null values - - Returns: a CREATE TABLE AS query - """ - - # key columns and date column - query = "SELECT %s" % ", ".join(map(str, self.groups.values())) - - # columns with imputation filling as needed - query += self._get_impute_select(impute_cols, nonimpute_cols) - - # imputation starts from the state table and left joins into the aggregation table - query += "\nFROM %s t1" % self.state_table - query += "\nLEFT JOIN %s t2 USING(%s)" % ( - self.get_table_name(), - self.state_group, - ) - - return "CREATE TABLE %s AS (%s)" % (self.get_table_name(imputed=True), query) - - def execute(self, conn, join_table=None): - """ - Execute all SQL statements to create final aggregation table. - Args: - conn: the SQLAlchemy connection on which to execute - """ - self.validate(conn) - create_schema = self.get_create_schema() - creates = self.get_creates() - drops = self.get_drops() - indexes = self.get_indexes() - inserts = self.get_inserts() - drop = self.get_drop() - create = self.get_create(join_table=join_table) - - trans = conn.begin() - - if create_schema is not None: - conn.execute(create_schema) - - for group in self.groups: - conn.execute(drops[group]) - conn.execute(creates[group]) - for insert in inserts[group]: - conn.execute(insert) - conn.execute(indexes[group]) - - # create the aggregation table - conn.execute(drop) - conn.execute(create) - - # excute query to find columns with null values and create lists of columns - # that do and do not need imputation when creating the imputation table - res = conn.execute(self.find_nulls()) - null_counts = list(zip(res.keys(), res.fetchone())) - impute_cols = [col for col, val in null_counts if val > 0] - nonimpute_cols = [col for col, val in null_counts if val == 0] - res.close() - - # sql to drop and create the imputation table - drop_imp = self.get_drop(imputed=True) - create_imp = self.get_impute_create( - impute_cols=impute_cols, nonimpute_cols=nonimpute_cols - ) - - # create the imputation table - conn.execute(drop_imp) - conn.execute(create_imp) - - trans.commit() - - def validate(self, conn): - """ - Validate the Aggregation to ensure that it will perform as expected. - This is done against an active SQL connection in order to enable - validation of the SQL itself. 
- """ diff --git a/src/triage/component/collate/imputations.py b/src/triage/component/collate/imputations.py index 35ac067b6..24e64ee8d 100644 --- a/src/triage/component/collate/imputations.py +++ b/src/triage/component/collate/imputations.py @@ -1,3 +1,6 @@ +IMPUTATION_COLNAME_SUFFIX = "_imp" + + class BaseImputation(object): """Base class for various imputation methods """ @@ -25,17 +28,17 @@ def _base_sql(self): def imputed_flag_select_and_alias(self): if not self.noflag: - template = """CASE WHEN "{col}" IS NULL THEN 1::SMALLINT ELSE 0::SMALLINT END""" - alias_template = "{base_for_impflag}_imp" + template = """CASE WHEN "{col}" IS NULL THEN 1::SMALLINT ELSE 0::SMALLINT END""" + alias_template = "{base_for_impflag}{suffix}" if self.column_base_for_impflag: return ( template.format(col=self.column), - alias_template.format(base_for_impflag=self.column_base_for_impflag) + alias_template.format(base_for_impflag=self.column_base_for_impflag, suffix=IMPUTATION_COLNAME_SUFFIX) ) else: return ( template.format(col=self.column), - alias_template.format(base_for_impflag=self.column) + alias_template.format(base_for_impflag=self.column, suffix=IMPUTATION_COLNAME_SUFFIX) ) else: diff --git a/src/triage/component/collate/spacetime.py b/src/triage/component/collate/spacetime.py index 38b87bd6d..c6d76e9d6 100644 --- a/src/triage/component/collate/spacetime.py +++ b/src/triage/component/collate/spacetime.py @@ -1,91 +1,357 @@ # -*- coding: utf-8 -*- from itertools import chain import sqlalchemy.sql.expression as ex -from descriptors import cachedproperty - -from .sql import make_sql_clause -from .collate import Aggregation +import logging +from .sql import make_sql_clause, to_sql_name, CreateTableAs, InsertFromSelect +from triage.component.architect.utils import remove_schema_from_table_name +from triage.database_reflection import table_exists +from triage.component.architect.feature_block import FeatureBlock +from .from_obj import FromObj +from descriptors import cachedproperty -class SpacetimeAggregation(Aggregation): +from .imputations import ( + ImputeMean, + ImputeConstant, + ImputeZero, + ImputeZeroNoFlag, + ImputeNullCategory, + ImputeBinaryMode, + ImputeError, + IMPUTATION_COLNAME_SUFFIX +) + +available_imputations = { + "mean": ImputeMean, + "constant": ImputeConstant, + "zero": ImputeZero, + "zero_noflag": ImputeZeroNoFlag, + "null_category": ImputeNullCategory, + "binary_mode": ImputeBinaryMode, + "error": ImputeError, +} + +AGGFUNCS_NEED_MULTIPLE_VALUES = set(['stddev', 'stddev_samp', 'variance', 'var_samp']) + + +class SpacetimeAggregation(FeatureBlock): def __init__( self, aggregates, groups, - intervals, from_obj, - dates, - state_table, - state_group=None, + intervals=None, + entity_column=None, prefix=None, suffix=None, - schema=None, date_column=None, output_date_column=None, - input_min_date=None, - join_with_cohort_table=False, + drop_interim_tables=True, + *args, + **kwargs ): """ Args: + aggregates: collection of Aggregate objects. + from_obj: defines the from clause, e.g. the name of the table. can use + groups: a list of expressions to group by in the aggregation or a dictionary + pairs group: expr pairs where group is the alias (used in column names) + entity_column: the group level found in the state table (e.g., "entity_id") + prefix: prefix for aggregation tables and column names, defaults to from_obj + suffix: suffix for aggregation table, defaults to "aggregation" intervals: the intervals to aggregate over. either a list of datetime intervals, e.g. 
["1 month", "1 year"], or a dictionary of group : intervals pairs where group is a group in groups and intervals is a collection of datetime intervals, e.g. {"address_id": ["1 month", "1 year]} - dates: list of PostgreSQL date strings, - e.g. ["2012-01-01", "2013-01-01"] - state_table: schema.table to query for valid state_group/date combinations - state_group: the group level found in the state table (e.g., "entity_id") + entity_column: the group level found in the cohort table (e.g., "entity_id") date_column: name of date column in from_obj, defaults to "date" output_date_column: name of date column in aggregated output, defaults to "date" - input_min_date: minimum date for which rows shall be included, defaults - to no absolute time restrictions on the minimum date of included rows - - For all other arguments see collate.Aggregation - """ - Aggregation.__init__( - self, - aggregates=aggregates, - from_obj=from_obj, - groups=groups, - state_table=state_table, - state_group=state_group, - prefix=prefix, - suffix=suffix, - schema=schema, + """ + super().__init__(*args, **kwargs) + self.groups = ( + groups if isinstance(groups, dict) else {str(g): g for g in groups} ) - if isinstance(intervals, dict): self.intervals = intervals - else: + elif intervals: self.intervals = {g: intervals for g in self.groups} - self.dates = dates + else: + self.intervals = {g: ["all"] for g in self.groups} + self.date_column = date_column if date_column else "date" self.output_date_column = output_date_column if output_date_column else "date" - self.input_min_date = input_min_date - self.join_with_cohort_table = join_with_cohort_table - - def _state_table_sub(self): - """Helper function to ensure we only include state table records - in our set of input dates and after the input_min_date. - """ - datestr = ", ".join(["'%s'::date" % dt for dt in self.dates]) - mindtstr = ( - " AND %s >= '%s'::date" % (self.output_date_column, self.input_min_date) - if self.input_min_date is not None - else "" + self.aggregates = aggregates + self.from_obj = make_sql_clause(from_obj, ex.text) + self.entity_column = entity_column if entity_column else "entity_id" + self.prefix = prefix if prefix else self.features_table_name_without_schema + self.drop_interim_tables = drop_interim_tables + + def get_table_name(self, group=None, imputed=False): + """ + Returns name for table for the given group + """ + if imputed: + return self.final_feature_table_name + prefix = self.features_table_name_without_schema + if group is None: + name = '"%s_%s"' % (prefix, "aggregation") + else: + name = '"%s"' % to_sql_name("%s_%s" % (prefix, group)) + schema = '"%s".' 
% to_sql_name(self.features_schema_name) if self.features_schema_name else "" + return "%s%s" % (schema, name) + + def get_drops(self): + """ + Generate drop queries for this aggregation + + Returns: a dictionary of group : drop pairs where + group are the same keys as groups + drop is a raw drop table query for the corresponding table + """ + return [ + "DROP TABLE IF EXISTS %s;" % self.get_table_name(group) + for group in self.groups + ] + + def get_drop(self, imputed=False): + """ + Generate a drop table statement for the aggregation table + Returns: string sql query + """ + return "DROP TABLE IF EXISTS %s" % self.get_table_name(imputed=imputed) + + def get_create_schema(self): + """ + Generate a create schema statement + """ + if self.features_schema_name is not None: + return "CREATE SCHEMA IF NOT EXISTS %s" % self.features_schema_name + + def imputed_flag_column_names(self): + # format the query that gets column names, + # excluding indices from result + feature_names_query = """ + SELECT column_name + FROM information_schema.columns + WHERE table_name = '{table}' AND + table_schema = '{schema}' AND + column_name like '%%{suffix}' + """.format( + table=remove_schema_from_table_name(self.get_table_name(imputed=True)), + schema=self.features_schema_name or 'public', + suffix=IMPUTATION_COLNAME_SUFFIX ) - return """( - SELECT * - FROM {st} - WHERE {datecol} IN ({datestr}) - {mindtstr})""".format( - st=self.state_table, - datecol=self.output_date_column, - datestr=datestr, - mindtstr=mindtstr, + feature_names = [ + row[0] + for row in self.db_engine.execute(feature_names_query) + ] + return feature_names + + def _basecol_of_impflag(self, impflag_col): + # we don't want to add redundant imputation flags. for a given source + # column and time interval, all of the functions will have identical + # sets of rows that needed imputation + # to reliably merge these, we lookup the original aggregate that produced + # the function, and see its available functions. we expect exactly one of + # these functions to end the column name and remove it if so + if hasattr(self.colname_aggregate_lookup[impflag_col], 'functions'): + agg_functions = self.colname_aggregate_lookup[impflag_col].functions + used_function = next(funcname for funcname in agg_functions if impflag_col.endswith(funcname)) + if used_function in AGGFUNCS_NEED_MULTIPLE_VALUES: + return impflag_col + else: + return impflag_col.rstrip('_' + used_function) + else: + logging.warning("Imputation flag merging is not implemented for " + "AggregateExpression objects that don't define an aggregate " + "function (e.g. 
composites)") + return impflag_col + + def _get_impute_select(self, impute_cols, nonimpute_cols, partitionby=None): + + imprules = self.get_imputation_rules() + used_impflags = set() + + # check if we're missing any columns relative to the full set and raise an + # exception if we are + missing_cols = set(imprules.keys()) - set(nonimpute_cols + impute_cols) + if len(missing_cols) > 0: + raise ValueError("Missing columns in get_impute_create: %s" % missing_cols) + + # key columns and date column + query = "" + + # pre-sort and iterate through the combined set to ensure column order + for col in sorted(nonimpute_cols + impute_cols): + # just pass through columns that don't require imputation (no nulls found) + if col in nonimpute_cols: + query += '\n,"%s"' % col + + # for columns that do require imputation, include SQL to do the imputation work + # and a flag for whether the value was imputed + if col in impute_cols: + impute_rule = imprules[col] + + try: + imputer = available_imputations[impute_rule["type"]] + except KeyError as err: + raise ValueError( + "Invalid imputation type %s for column %s" + % (impute_rule.get("type", ""), col) + ) from err + + impflag_basecol = self._basecol_of_impflag(col) + imputer = imputer(column=col, column_base_for_impflag=impflag_basecol, partitionby=partitionby, **impute_rule) + + query += "\n,%s" % imputer.to_sql() + if not imputer.noflag: + # Add an imputation flag for non-categorical columns (this is handeled + # for categorical columns with a separate NULL category) + # but only add it if another functionally equivalent impflag hasn't already been added + impflag_select, impflag_alias = imputer.imputed_flag_select_and_alias() + if impflag_alias not in used_impflags: + used_impflags.add(impflag_alias) + query += "\n,%s as \"%s\" " % (impflag_select, impflag_alias) + + return query + + def get_index(self, imputed=False): + return "CREATE INDEX ON {} ({})".format( + self.get_table_name(imputed=imputed), + self.entity_column, ) + def get_creates(self): + return { + group: CreateTableAs(self.get_table_name(group), next(iter(sels)).limit(0)) + for group, sels in self.get_selects().items() + } + + # implement the FeatureBlock interface + @property + def feature_columns(self): + """ + The list of feature columns in the final, postimputation table + + Should exclude any index columns (e.g. entity id, date) + """ + # start with all columns defined in the feature block. + # this is important as we don't want to return columns in the final feature table that + # aren't defined in the feature block (e.g. from an earlier run with more features); + # this will exclude impflag columns as they are decided after initial features are written + feature_columns = self.feature_columns_sans_impflags + impflag_columns = set() + + # our list of imputation flag columns comes from the database, + # but it may contain columns from prior runs that we didn't specify + imputation_flag_feature_cols = self.imputed_flag_column_names() + for feature_column in feature_columns: + impflag_name = self._basecol_of_impflag(feature_column) + IMPUTATION_COLNAME_SUFFIX + if impflag_name in imputation_flag_feature_cols: + impflag_columns.add(impflag_name) + return feature_columns | impflag_columns + + @property + def preinsert_queries(self): + """ + Return all queries that should be run before inserting any data. + + Consists of all queries to drop tables from previous runs, as well as all creates + needed for this run. 
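+
+        As a rough illustrative sketch (assuming a hypothetical block with prefix
+        "myfeatures", a "features" schema, and an "entity_id" group), the list
+        would contain statements along these lines:
+
+            CREATE SCHEMA IF NOT EXISTS features
+            DROP TABLE IF EXISTS "features"."myfeatures_aggregation"
+            DROP TABLE IF EXISTS "features"."myfeatures_entity_id"
+            CREATE TABLE "features"."myfeatures_entity_id" AS (<select ... LIMIT 0>)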
+ + Returns a list of queries/executable statements + """ + preinserts = [self.get_drop()] + self.get_drops() + list(self.get_creates().values()) + create_schema = self.get_create_schema() + if create_schema: + preinserts.insert(0, create_schema) + return preinserts + + @property + def insert_queries(self): + """ + Return all inserts to populate this data. Each query in this list should be parallelizable. + + Returns a list of queries/executable statements + """ + return [ + InsertFromSelect(self.get_table_name(group), sel) + for group, sels in self.get_selects().items() + for sel in sels + ] + + @property + def postinsert_queries(self): + """ + Return all queries that should be run after inserting all data + + Consists of indexing queries for each group table as well as a + query to create the aggregation table that encompasses all groups. + + Returns a list of queries/executable statements + """ + postinserts = [ + "CREATE INDEX ON %s (%s);" % (self.get_table_name(group), groupby) + for group, groupby in self.groups.items() + ] + [self.get_create(), self.get_index()] + if self.drop_interim_tables: + postinserts += self.get_drops() + return postinserts + + @property + def imputation_queries(self): + """ + Return all queries that should be run to fill in missing data with imputed values. + + Returns a list of queries/executable statements + """ + if not self.cohort_table_name: + logging.warning( + "No cohort table defined in feature_block, cannot create imputation table for %s", + self.final_feature_table_name + ) + return [] + + if not table_exists(self.cohort_table_name, self.db_engine): + logging.warning( + "Cohort table %s does not exist, cannot create imputation table for %s", + self.cohort_table_name, + self.final_feature_table_name + ) + return [] + + with self.db_engine.begin() as conn: + results = conn.execute(self.find_nulls()) + null_counts = results.first().items() + impute_cols = [col for (col, val) in null_counts if val > 0] + nonimpute_cols = [col for (col, val) in null_counts if val == 0] + imp_queries = [ + self.get_drop(imputed=True), # clear out old imputed data + self._get_impute_create(impute_cols, nonimpute_cols), # create the imputed table + self.get_index(imputed=True), # index the imputed table + ] + if self.drop_interim_tables: + imp_queries.append(self.get_drop(imputed=False)) # drop the old aggregation table + return imp_queries + + def preprocess(self): + create_schema = self.get_create_schema() + + if create_schema is not None: + with self.db_engine.begin() as conn: + conn.execute(create_schema) + + if self.materialize_subquery_fromobjs: + # materialize from obj + from_obj = FromObj( + from_obj=self.from_obj.text, + name=f"{self.features_schema_name}.{self.prefix}", + knowledge_date_column=self.date_column + ) + from_obj.maybe_materialize(self.db_engine) + self.from_obj = from_obj.table + @cachedproperty def colname_aggregate_lookup(self): """A reverse lookup from column name to the source collate.Aggregate @@ -96,9 +362,8 @@ def colname_aggregate_lookup(self): for group, groupby in self.groups.items(): intervals = self.intervals[group] for interval in intervals: - date = self.dates[0] for agg in self.aggregates: - for col in self._cols_for_aggregate(agg, group, interval, date): + for col in self._cols_for_aggregate(agg, group, interval, None): if col.name in lookup: raise ValueError("Duplicate feature column name found: ", col.name) lookup[col.name] = agg @@ -154,6 +419,19 @@ def _get_aggregates_sql(self, interval, date, group): ] ) + def index_query(self, 
imputed=False): + return "CREATE INDEX ON {} ({}, {})".format( + self.get_table_name(imputed=imputed), + self.entity_column, + self.output_date_column, + ) + + def index_columns(self): + return sorted( + [group for group in self.groups.keys()] + + [self.output_date_column] + ) + def get_selects(self): """ Constructs select queries for this aggregation @@ -167,7 +445,7 @@ def get_selects(self): for group, groupby in self.groups.items(): intervals = self.intervals[group] queries[group] = [] - for date in self.dates: + for date in self.as_of_dates: columns = [ make_sql_clause(groupby, ex.text), ex.literal_column("'%s'::date" % date).label( @@ -181,10 +459,10 @@ def get_selects(self): ) gb_clause = make_sql_clause(groupby, ex.literal_column) - if self.join_with_cohort_table: + if not self.features_ignore_cohort: from_obj = ex.text( f"(select from_obj.* from (" - f"(select * from {self.from_obj}) from_obj join {self.state_table} cohort on ( " + f"(select * from {self.from_obj}) from_obj join {self.cohort_table_name} cohort on ( " "cohort.entity_id = from_obj.entity_id and " f"cohort.{self.output_date_column} = '{date}'::date)" ")) cohorted_from_obj") @@ -240,9 +518,9 @@ def where(self, date, intervals): w += "AND {date_column} >= {min_date}".format( date_column=self.date_column, min_date=min_date ) - if self.input_min_date is not None: + if self.feature_start_time is not None: w += "AND {date_column} >= '{bot}'::date".format( - date_column=self.date_column, bot=self.input_min_date + date_column=self.date_column, bot=self.feature_start_time ) return ex.text(w) @@ -269,7 +547,7 @@ def get_join_table(self): intervals = list(set(chain(*self.intervals.values()))) queries = [] - for date in self.dates: + for date in self.as_of_dates: columns = groups + [ ex.literal_column("'%s'::date" % date).label(self.output_date_column) ] @@ -304,9 +582,9 @@ def validate(self, conn): SpacetimeAggregations ensure that no intervals extend beyond the absolute minimum time. """ - if self.input_min_date is not None: + if self.feature_start_time is not None: all_intervals = set(*self.intervals.values()) - for date in self.dates: + for date in self.as_of_dates: for interval in all_intervals: if interval == "all": continue @@ -314,23 +592,23 @@ def validate(self, conn): # it this way allows for nicer error messages. 
r = conn.execute( "select ('%s'::date - '%s'::interval) < '%s'::date" - % (date, interval, self.input_min_date) + % (date, interval, self.feature_start_time) ) if r.fetchone()[0]: raise ValueError( - "date '%s' - '%s' is before input_min_date ('%s')" - % (date, interval, self.input_min_date) + "date '%s' - '%s' is before feature_start_time ('%s')" + % (date, interval, self.feature_start_time) ) r.close() - for date in self.dates: + for date in self.as_of_dates: r = conn.execute( "select count(*) from %s where %s = '%s'::date" - % (self.state_table, self.output_date_column, date) + % (self.cohort_table_name, self.output_date_column, date) ) if r.fetchone()[0] == 0: raise ValueError( "date '%s' is not present in states table ('%s')" - % (date, self.state_table) + % (date, self.cohort_table_name) ) r.close() @@ -356,13 +634,13 @@ def find_nulls(self, imputed=False): return query_template.format( cols=cols_sql, - state_tbl=self._state_table_sub(), + state_tbl=self._cohort_table_sub(), aggs_tbl=self.get_table_name(imputed=imputed), - group=self.state_group, + group=self.entity_column, date_col=self.output_date_column, ) - def get_impute_create(self, impute_cols, nonimpute_cols): + def _get_impute_create(self, impute_cols, nonimpute_cols): """ Generates the CREATE TABLE query for the aggregation table with imputation. @@ -385,11 +663,22 @@ def get_impute_create(self, impute_cols, nonimpute_cols): ) # imputation starts from the state table and left joins into the aggregation table - query += "\nFROM %s t1" % self._state_table_sub() + query += "\nFROM %s t1" % self._cohort_table_sub() query += "\nLEFT JOIN %s t2 USING(%s, %s)" % ( self.get_table_name(), - self.state_group, + self.entity_column, self.output_date_column, ) return "CREATE TABLE %s AS (%s)" % (self.get_table_name(imputed=True), query) + + @property + def feature_columns_sans_impflags(self): + columns = chain.from_iterable( + chain.from_iterable( + self._get_aggregates_sql(interval, "2016-01-01", group) + for interval in self.intervals[group] + ) + for (group, groupby) in self.groups.items() + ) + return {label.name for label in columns} diff --git a/src/triage/database_reflection.py b/src/triage/database_reflection.py index 406209bbb..4b316c013 100644 --- a/src/triage/database_reflection.py +++ b/src/triage/database_reflection.py @@ -10,7 +10,7 @@ def split_table(table_name): Returns: (tuple) of schema and table name """ - table_parts = table_name.split(".") + table_parts = table_name.replace('"', '').split(".") if len(table_parts) == 2: return tuple(table_parts) elif len(table_parts) == 1: @@ -131,6 +131,20 @@ def column_type(table_name, column, db_engine): return type(reflected_table(table_name, db_engine).columns[column].type) +def table_columns(table_name, db_engine): + """Retrieve a list of columns. + + The table is expected to exist. 
+ + Args: + table_name (string) A table name (with schema) + db_engine (sqlalchemy.engine) + + Returns: (list) Every column currently in the table + """ + return reflected_table(table_name, db_engine).columns + + def schema_tables(schema_name, db_engine): meta = MetaData(schema=schema_name, bind=db_engine) meta.reflect() diff --git a/src/triage/experiments/__init__.py b/src/triage/experiments/__init__.py index 95d3081de..b83f9827f 100644 --- a/src/triage/experiments/__init__.py +++ b/src/triage/experiments/__init__.py @@ -1,5 +1,5 @@ # Avoid circular import (required by base) -CONFIG_VERSION = "v6" # noqa: E402 +CONFIG_VERSION = "v7" # noqa: E402 from .base import ExperimentBase from .multicore import MultiCoreExperiment diff --git a/src/triage/experiments/base.py b/src/triage/experiments/base.py index 97abceca5..2191c97e8 100644 --- a/src/triage/experiments/base.py +++ b/src/triage/experiments/base.py @@ -16,11 +16,12 @@ ) from triage.component.architect.features import ( - FeatureGenerator, - FeatureDictionaryCreator, FeatureGroupCreator, FeatureGroupMixer, + FeatureDictionary, ) + +from triage.component.architect.feature_block_generators import feature_blocks_from_config from triage.component.architect.planner import Planner from triage.component.architect.builders import MatrixBuilder from triage.component.architect.entity_date_table_generators import ( @@ -64,7 +65,7 @@ from triage.database_reflection import table_has_data from triage.util.conf import dt_from_str -from triage.util.db import get_for_update +from triage.util.db import get_for_update, run_statements from triage.util.introspection import bind_kwargs, classpath @@ -224,18 +225,24 @@ def initialize_components(self): "you will not be able to make matrices." ) - self.feature_dictionary_creator = FeatureDictionaryCreator( - features_schema_name=self.features_schema_name, db_engine=self.db_engine - ) + if "features" in self.config: + logging.info("Creating feature blocks from config") + self.feature_blocks = feature_blocks_from_config( + config=self.config["features"], + as_of_dates=self.all_as_of_times, + cohort_table=self.cohort_table_name, + features_schema_name=self.features_schema_name, + db_engine=self.db_engine, + feature_start_time=self.config["temporal_config"]["feature_start_time"], + features_ignore_cohort=self.features_ignore_cohort, + materialize_subquery_fromobjs=self.materialize_subquery_fromobjs, + ) + else: + logging.warning("No feature config is available") + self.feature_blocks = [] - self.feature_generator = FeatureGenerator( - features_schema_name=self.features_schema_name, - replace=self.replace, - db_engine=self.db_engine, - feature_start_time=split_config["feature_start_time"], - materialize_subquery_fromobjs=self.materialize_subquery_fromobjs, - features_ignore_cohort=self.features_ignore_cohort - ) + with self.get_for_update() as experiment: + experiment.feature_blocks = len(self.feature_blocks) self.feature_group_creator = FeatureGroupCreator( self.config.get("feature_group_definition", {"all": [True]}) @@ -277,7 +284,7 @@ def initialize_components(self): replace=self.replace, as_of_times=self.all_as_of_times ) - + self.trainer = ModelTrainer( experiment_hash=self.experiment_hash, model_storage_engine=self.model_storage_engine, @@ -418,63 +425,6 @@ def all_as_of_times(self): experiment.as_of_times = len(distinct_as_of_times) return distinct_as_of_times - @cachedproperty - def collate_aggregations(self): - """Collation of ``Aggregation`` objects used by this experiment. 
- - Returns: (list) of ``collate.Aggregation`` objects - - """ - logging.info("Creating collate aggregations") - if "feature_aggregations" not in self.config: - logging.warning("No feature_aggregation config is available") - return [] - aggregations = self.feature_generator.aggregations( - feature_aggregation_config=self.config["feature_aggregations"], - feature_dates=self.all_as_of_times, - state_table=self.cohort_table_name, - ) - with self.get_for_update() as experiment: - experiment.feature_blocks = len(aggregations) - return aggregations - - - @cachedproperty - def feature_aggregation_table_tasks(self): - """All feature table query tasks specified by this - ``Experiment``. - - Returns: (dict) keys are group table names, values are - themselves dicts, each with keys for different stages of - table creation (prepare, inserts, finalize) and with values - being lists of SQL commands - - """ - logging.info( - "Calculating feature tasks for %s as_of_times", len(self.all_as_of_times) - ) - return self.feature_generator.generate_all_table_tasks( - self.collate_aggregations, task_type="aggregation" - ) - - @cachedproperty - def feature_imputation_table_tasks(self): - """All feature imputation query tasks specified by this - ``Experiment``. - - Returns: (dict) keys are group table names, values are - themselves dicts, each with keys for different stages of - table creation (prepare, inserts, finalize) and with values - being lists of SQL commands - - """ - logging.info( - "Calculating feature tasks for %s as_of_times", len(self.all_as_of_times) - ) - return self.feature_generator.generate_all_table_tasks( - self.collate_aggregations, task_type="imputation" - ) - @cachedproperty def master_feature_dictionary(self): """All possible features found in the database. Not all features @@ -484,12 +434,7 @@ def master_feature_dictionary(self): values being lists of feature names """ - result = self.feature_dictionary_creator.feature_dictionary( - feature_table_names=self.feature_imputation_table_tasks.keys(), - index_column_lookup=self.feature_generator.index_column_lookup( - self.collate_aggregations - ), - ) + result = FeatureDictionary(feature_blocks=self.feature_blocks) logging.info("Computed master feature dictionary: %s", result) with self.get_for_update() as experiment: experiment.total_features = sum(1 for _feature in itertools.chain.from_iterable(result.values())) @@ -605,7 +550,7 @@ def process_train_test_batches(self, train_test_batches): pass @abstractmethod - def process_query_tasks(self, query_tasks): + def process_inserts(self, inserts): pass @abstractmethod @@ -614,19 +559,25 @@ def process_matrix_build_tasks(self, matrix_build_tasks): @experiment_entrypoint def generate_preimputation_features(self): - self.process_query_tasks(self.feature_aggregation_table_tasks) - logging.info( - "Finished running preimputation feature queries. 
The final results are in tables: %s", - ",".join(agg.get_table_name() for agg in self.collate_aggregations), - ) + for feature_block in self.feature_blocks: + tasks = feature_block.generate_preimpute_tasks(self.replace) + run_statements(tasks.get("prepare", []), self.db_engine) + self.process_inserts(tasks.get("inserts", [])) + run_statements(tasks.get("finalize", []), self.db_engine) + logging.info("Finished running preimputation feature queries.") @experiment_entrypoint def impute_missing_features(self): - self.process_query_tasks(self.feature_imputation_table_tasks) + for feature_block in self.feature_blocks: + tasks = feature_block.generate_impute_tasks(self.replace) + run_statements(tasks.get("prepare", []), self.db_engine) + self.process_inserts(tasks.get("inserts", [])) + run_statements(tasks.get("finalize", []), self.db_engine) + logging.info( "Finished running postimputation feature queries. The final results are in tables: %s", ",".join( - agg.get_table_name(imputed=True) for agg in self.collate_aggregations + block.final_feature_table_name for block in self.feature_blocks ), ) diff --git a/src/triage/experiments/multicore.py b/src/triage/experiments/multicore.py index 0371aa326..1f65f44cb 100644 --- a/src/triage/experiments/multicore.py +++ b/src/triage/experiments/multicore.py @@ -4,6 +4,8 @@ from pebble import ProcessPool from multiprocessing.reduction import ForkingPickler +from triage.util.db import run_statements + from triage.component.catwalk.utils import Batch from triage.experiments import ExperimentBase @@ -72,21 +74,16 @@ def process_train_test_batches(self, batches): for serial_task in batch.tasks: self.model_train_tester.process_task(**serial_task) - def process_query_tasks(self, query_tasks): + def process_inserts(self, inserts): logging.info("Processing query tasks with %s processes", self.n_db_processes) - for table_name, tasks in query_tasks.items(): - logging.info("Processing features for %s", table_name) - self.feature_generator.run_commands(tasks.get("prepare", [])) - partial_insert = partial( - insert_into_table, feature_generator=self.feature_generator - ) + partial_insert = partial( + insert_into_table, db_engine=self.db_engine + ) - insert_batches = [ - list(task_batch) for task_batch in Batch(tasks.get("inserts", []), 25) - ] - parallelize(partial_insert, insert_batches, n_processes=self.n_db_processes) - self.feature_generator.run_commands(tasks.get("finalize", [])) - logging.info("%s completed", table_name) + insert_batches = [ + list(task_batch) for task_batch in Batch(inserts, 25) + ] + parallelize(partial_insert, insert_batches, n_processes=self.n_db_processes) def process_matrix_build_tasks(self, matrix_build_tasks): partial_build_matrix = partial( @@ -116,10 +113,10 @@ def process_subset_tasks(self, subset_tasks): ) -def insert_into_table(insert_statements, feature_generator): +def insert_into_table(insert_statements, db_engine): try: logging.info("Beginning insert batch") - feature_generator.run_commands(insert_statements) + run_statements(insert_statements, db_engine) return True except Exception: logging.error("Child error: %s", traceback.format_exc()) diff --git a/src/triage/experiments/rq.py b/src/triage/experiments/rq.py index 8f4e54776..a3db17e42 100644 --- a/src/triage/experiments/rq.py +++ b/src/triage/experiments/rq.py @@ -1,6 +1,7 @@ import logging import time from triage.component.catwalk.utils import Batch +from triage.util.db import run_statements from triage.experiments import ExperimentBase try: @@ -74,51 +75,22 @@ def 
wait_for(self, jobs): logging.info("Sleeping for %s seconds", self.sleep_time) time.sleep(self.sleep_time) - def process_query_tasks(self, query_tasks): - """Run queries by table - - Will run preparation (e.g. create table) and finalize (e.g. create index) tasks - in the main process, - but delegate inserts to rq Jobs in batches of 25 - - Args: query_tasks (dict) - keys should be table names and values should be dicts. - Each inner dict should have up to three keys, each with a list of queries: - 'prepare' (setting up the table), - 'inserts' (insert commands to populate the table), - 'finalize' (finishing table setup after all inserts have run) - - Example: { - 'table_one': { - 'prepare': ['create table table_one (col1 varchar)'], - 'inserts': [ - 'insert into table_one values (\'a\')', - 'insert into table_one values (\'b'\')' - ] - 'finalize': ['create index on table_one (col1)'] - } - } - """ - for table_name, tasks in query_tasks.items(): - logging.info("Processing features for %s", table_name) - self.feature_generator.run_commands(tasks.get("prepare", [])) - - insert_batches = [ - list(task_batch) for task_batch in Batch(tasks.get("inserts", []), 25) - ] - jobs = [ - self.queue.enqueue( - self.feature_generator.run_commands, - insert_batch, - job_timeout=DEFAULT_TIMEOUT, - result_ttl=DEFAULT_TIMEOUT, - ttl=DEFAULT_TIMEOUT, - ) - for insert_batch in insert_batches - ] - self.wait_for(jobs) - - self.feature_generator.run_commands(tasks.get("finalize", [])) - logging.info("%s completed", table_name) + def process_inserts(self, inserts): + insert_batches = [ + list(task_batch) for task_batch in Batch(inserts, 25) + ] + jobs = [ + self.queue.enqueue( + run_statements, + insert_batch, + self.db_engine, + job_timeout=DEFAULT_TIMEOUT, + result_ttl=DEFAULT_TIMEOUT, + ttl=DEFAULT_TIMEOUT, + ) + for insert_batch in insert_batches + ] + self.wait_for(jobs) def process_matrix_build_tasks(self, matrix_build_tasks): """Run matrix build tasks using RQ diff --git a/src/triage/experiments/singlethreaded.py b/src/triage/experiments/singlethreaded.py index 0223c05bd..94c39fd8a 100644 --- a/src/triage/experiments/singlethreaded.py +++ b/src/triage/experiments/singlethreaded.py @@ -1,9 +1,10 @@ from triage.experiments import ExperimentBase +from triage.util.db import run_statements class SingleThreadedExperiment(ExperimentBase): - def process_query_tasks(self, query_tasks): - self.feature_generator.process_table_tasks(query_tasks) + def process_inserts(self, inserts): + run_statements(inserts, self.db_engine) def process_matrix_build_tasks(self, matrix_build_tasks): self.matrix_builder.build_all_matrices(matrix_build_tasks) diff --git a/src/triage/experiments/validate.py b/src/triage/experiments/validate.py index a9932dcc7..26b0d7986 100644 --- a/src/triage/experiments/validate.py +++ b/src/triage/experiments/validate.py @@ -127,7 +127,7 @@ def dt_from_str(dt_str): ) -class FeatureAggregationsValidator(Validator): +class SpacetimeAggregationValidator(Validator): def _validate_keys(self, aggregation_config): for key in [ "from_obj", @@ -140,7 +140,7 @@ def _validate_keys(self, aggregation_config): raise ValueError( dedent( """ - Section: feature_aggregations - + Section: features->spacetime_aggregations - '{} required as key: aggregation config: {}""".format( key, aggregation_config ) @@ -156,7 +156,7 @@ def _validate_aggregates(self, aggregation_config): raise ValueError( dedent( """ - Section: feature_aggregations - + Section: features->spacetime_aggregations - Need either aggregates, categoricals, or 
array_categoricals in {}""".format( aggregation_config @@ -516,7 +516,7 @@ def _validate_prefixes(self, prefix_list): ) ) - def _run(self, feature_group_definition, feature_aggregation_config): + def _run(self, feature_group_definition, feature_table_names): if not isinstance(feature_group_definition, dict): raise ValueError( dedent( @@ -554,30 +554,10 @@ def _run(self, feature_group_definition, feature_aggregation_config): ) if "prefix" in feature_group_definition: - available_prefixes = { - aggregation["prefix"] for aggregation in feature_aggregation_config - } - bad_prefixes = set(feature_group_definition["prefix"]) - available_prefixes - if bad_prefixes: - raise ValueError( - dedent( - """ - Section: feature_group_definition - - The following given feature group prefixes: '{}' - are invalid. Available prefixes from this experiment's feature - aggregations are: '{}' - """.format( - bad_prefixes, available_prefixes - ) - ) - ) self._validate_prefixes(feature_group_definition["prefix"]) if "tables" in feature_group_definition: - available_tables = { - aggregation["prefix"] + "_aggregation_imputed" - for aggregation in feature_aggregation_config - } + available_tables = feature_table_names bad_tables = set(feature_group_definition["tables"]) - available_tables if bad_tables: raise ValueError( @@ -585,8 +565,8 @@ def _run(self, feature_group_definition, feature_aggregation_config): """ Section: feature_group_definition - The following given feature group tables: '{}' - are invalid. Available tables from this experiment's feature - aggregations are: '{}' + are invalid. Available tables from this experiment's features + are: '{}' """.format( bad_tables, available_tables ) @@ -844,13 +824,35 @@ def _run(self, scoring_config): ) +class FeatureValidator(Validator): + def _run(self, feature_config): + feature_lookup = architect.feature_block_generators.FEATURE_BLOCK_GENERATOR_LOOKUP + given_keys = set(feature_block['feature_generator_type'] for feature_block in feature_config.values()) + bad_keys = given_keys - feature_lookup.keys() + if bad_keys: + raise ValueError( + dedent( + f"""\ + Section: features - + The following given feature types '{bad_keys}' are unavailable. 
+                    Available feature types are: '{feature_lookup.keys()}'
+                    """
+                )
+            )
+        if 'spacetime_aggregations' in feature_config:
+            SpacetimeAggregationValidator(
+                self.db_engine,
+                strict=self.strict
+            ).run(feature_config['spacetime_aggregations'])
+
+
 class ExperimentValidator(Validator):
     def run(self, experiment_config):
         TemporalValidator(strict=self.strict).run(
             experiment_config.get("temporal_config", {})
         )
-        FeatureAggregationsValidator(self.db_engine, strict=self.strict).run(
-            experiment_config.get("feature_aggregations", {})
+        FeatureValidator(self.db_engine, strict=self.strict).run(
+            experiment_config.get("features", {})
         )
         LabelConfigValidator(self.db_engine, strict=self.strict).run(
             experiment_config.get("label_config", None)
@@ -860,7 +862,7 @@ def run(self, experiment_config):
         )
         FeatureGroupDefinitionValidator(strict=self.strict).run(
             experiment_config.get("feature_group_definition", {}),
-            experiment_config.get("feature_aggregations", {}),
+            set(experiment_config.get("features", {}).keys())
         )
         FeatureGroupStrategyValidator(strict=self.strict).run(
             experiment_config.get("feature_group_strategies", [])
diff --git a/src/triage/util/db.py b/src/triage/util/db.py
index 585622868..288a68060 100644
--- a/src/triage/util/db.py
+++ b/src/triage/util/db.py
@@ -38,6 +38,12 @@ def __reconstruct__(cls, url, creator, kwargs):
 create_engine = SerializableDbEngine
 
 
+def run_statements(statement_list, db_engine):
+    with db_engine.begin() as conn:
+        for statement in statement_list:
+            conn.execute(statement)
+
+
 @contextmanager
 def scoped_session(db_engine):
     """Provide a transactional scope around a series of operations."""
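
For orientation, here is how the pieces introduced above fit together when driven by hand rather than by an Experiment. This is a minimal sketch, not part of the patch: it assumes a FeatureBlock instance exposing the generate_preimpute_tasks/generate_impute_tasks methods called in experiments/base.py above (each returning a dict with optional "prepare", "inserts", and "finalize" query lists), and it reuses the run_statements helper added to triage/util/db.py.

    # Minimal sketch (not part of the patch): run one FeatureBlock end to end, serially.
    from triage.util.db import run_statements

    def run_feature_block_serially(feature_block, db_engine, replace=True):
        # Preimputation: create the feature table, populate it, then finalize (e.g. indexes).
        tasks = feature_block.generate_preimpute_tasks(replace)
        run_statements(tasks.get("prepare", []), db_engine)
        run_statements(tasks.get("inserts", []), db_engine)  # batched across workers in multicore/rq
        run_statements(tasks.get("finalize", []), db_engine)

        # Imputation: build the final, fully imputed feature table.
        impute_tasks = feature_block.generate_impute_tasks(replace)
        run_statements(impute_tasks.get("prepare", []), db_engine)
        run_statements(impute_tasks.get("inserts", []), db_engine)
        run_statements(impute_tasks.get("finalize", []), db_engine)

In the Experiment classes above, only the "inserts" list goes through process_inserts (and so may be batched across processes or rq workers); the "prepare" and "finalize" lists always run serially in the main process via run_statements.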