diff --git a/core/src/main/java/feast/core/model/Entity.java b/core/src/main/java/feast/core/model/Entity.java index 574abb0c4e..fd26f95afa 100644 --- a/core/src/main/java/feast/core/model/Entity.java +++ b/core/src/main/java/feast/core/model/Entity.java @@ -23,6 +23,9 @@ import lombok.Getter; import lombok.Setter; +/** + * Feast entity object. Contains name, type as well as domain metadata about the entity. + */ @Getter @Setter @javax.persistence.Entity diff --git a/core/src/main/java/feast/core/model/Feature.java b/core/src/main/java/feast/core/model/Feature.java index d21411fbf7..51b74e45af 100644 --- a/core/src/main/java/feast/core/model/Feature.java +++ b/core/src/main/java/feast/core/model/Feature.java @@ -25,6 +25,9 @@ import lombok.Getter; import lombok.Setter; +/** + * Feature belonging to a featureset. Contains name, type as well as domain metadata about the feature. + */ @Getter @Setter @Entity diff --git a/core/src/main/java/feast/core/service/StatsService.java b/core/src/main/java/feast/core/service/StatsService.java index 614fa22a33..933ce3e6a6 100644 --- a/core/src/main/java/feast/core/service/StatsService.java +++ b/core/src/main/java/feast/core/service/StatsService.java @@ -37,7 +37,7 @@ import feast.core.model.FieldId; import feast.storage.api.statistics.FeatureSetStatistics; import feast.storage.api.statistics.StatisticsRetriever; -import feast.storage.connectors.bigquery.stats.BigQueryStatisticsRetriever; +import feast.storage.connectors.bigquery.statistics.BigQueryStatisticsRetriever; import java.io.IOException; import java.time.Instant; import java.util.*; @@ -49,6 +49,7 @@ import org.tensorflow.metadata.v0.*; import org.tensorflow.metadata.v0.FeatureNameStatistics.Builder; +/** Facilitates the retrieval of feature set statistics from historical stores. */ @Slf4j @Service public class StatsService { @@ -67,6 +68,26 @@ public StatsService( this.featureStatisticsRepository = featureStatisticsRepository; } + /** + * Get {@link DatasetFeatureStatistics} for the requested feature set in the provided datasets or + * date range for the store provided. The {@link DatasetFeatureStatistics} will contain a list of + * {@link FeatureNameStatistics} for each feature requested. Results retrieved will be cached + * indefinitely. To force Feast to recompute the statistics, set forceRefresh to true. + * + *

Only one of datasetIds or startDate/endDate should be provided. If both are provided, the + * former will be used over the latter. + * + *

If multiple datasetIds or if the date ranges over a few days, statistics will be retrieved + * for each single unit (dataset id or day) and results aggregated across that set. As a result of + * this, in such a scenario, statistics that cannot be aggregated will be dropped. This includes + * all histograms and quantiles, unique values, and top value counts. + * + * @param request {@link GetFeatureStatisticsRequest} containing feature set name, subset of + * features, dataset ids or date range, and store to retrieve the data from. + * @return {@link GetFeatureStatisticsResponse} containing {@link DatasetFeatureStatistics} with + * the feature statistics requested. + * @throws IOException + */ @Transactional public GetFeatureStatisticsResponse getFeatureStatistics(GetFeatureStatisticsRequest request) throws IOException { @@ -86,7 +107,11 @@ public GetFeatureStatisticsResponse getFeatureStatistics(GetFeatureStatisticsReq while (timestamp < request.getEndDate().getSeconds()) { List featureNameStatistics = getFeatureNameStatisticsByDate( - statisticsRetriever, featureSetSpec, features, timestamp); + statisticsRetriever, + featureSetSpec, + features, + timestamp, + request.getForceRefresh()); featureNameStatisticsList.add(featureNameStatistics); timestamp += 86400; // advance by a day } @@ -95,16 +120,23 @@ public GetFeatureStatisticsResponse getFeatureStatistics(GetFeatureStatisticsReq for (String datasetId : request.getDatasetIdsList()) { List featureNameStatistics = getFeatureNameStatisticsByDataset( - statisticsRetriever, featureSetSpec, features, datasetId); + statisticsRetriever, + featureSetSpec, + features, + datasetId, + request.getForceRefresh()); featureNameStatisticsList.add(featureNameStatistics); } } List featureNameStatistics = mergeStatistics(featureNameStatisticsList); + long totalCount = getTotalCount(featureNameStatistics.get(0)); return GetFeatureStatisticsResponse.newBuilder() .setDatasetFeatureStatisticsList( DatasetFeatureStatisticsList.newBuilder() .addDatasets( - DatasetFeatureStatistics.newBuilder().addAllFeatures(featureNameStatistics))) + DatasetFeatureStatistics.newBuilder() + .setNumExamples(totalCount) + .addAllFeatures(featureNameStatistics))) .build(); } @@ -112,7 +144,8 @@ private List getFeatureNameStatisticsByDataset( StatisticsRetriever statisticsRetriever, FeatureSetSpec featureSetSpec, List features, - String datasetId) + String datasetId, + boolean forceRefresh) throws IOException { List featureNameStatistics = new ArrayList<>(); List featuresMissingStats = new ArrayList<>(); @@ -124,9 +157,12 @@ private List getFeatureNameStatisticsByDataset( featureSetSpec.getName(), featureSetSpec.getVersion(), featureName)); - Optional cachedFeatureStatistics = - featureStatisticsRepository.findFeatureStatisticsByFeatureAndDatasetId( - feature, datasetId); + Optional cachedFeatureStatistics = Optional.empty(); + if (!forceRefresh) { + cachedFeatureStatistics = + featureStatisticsRepository.findFeatureStatisticsByFeatureAndDatasetId( + feature, datasetId); + } if (cachedFeatureStatistics.isPresent()) { featureNameStatistics.add(cachedFeatureStatistics.get().toProto()); } else { @@ -154,7 +190,8 @@ private List getFeatureNameStatisticsByDate( StatisticsRetriever statisticsRetriever, FeatureSetSpec featureSetSpec, List features, - long timestamp) + long timestamp, + boolean forceRefresh) throws IOException { Date date = Date.from(Instant.ofEpochSecond(timestamp)); List featureNameStatistics = new ArrayList<>(); @@ -167,8 +204,11 @@ private List getFeatureNameStatisticsByDate( featureSetSpec.getName(), featureSetSpec.getVersion(), featureName)); - Optional cachedFeatureStatistics = - featureStatisticsRepository.findFeatureStatisticsByFeatureAndDate(feature, date); + Optional cachedFeatureStatistics = Optional.empty(); + if (!forceRefresh) { + cachedFeatureStatistics = + featureStatisticsRepository.findFeatureStatisticsByFeatureAndDate(feature, date); + } if (cachedFeatureStatistics.isPresent()) { featureNameStatistics.add(cachedFeatureStatistics.get().toProto()); } else { @@ -433,4 +473,26 @@ private FeatureNameStatistics mergeByteStatistics( return mergedFeatureNameStatistics.setBytesStats(mergedBytesStatistics).build(); } + + private long getTotalCount(FeatureNameStatistics featureNameStatistics) { + CommonStatistics commonStats; + switch (featureNameStatistics.getType()) { + case STRUCT: + commonStats = featureNameStatistics.getStructStats().getCommonStats(); + break; + case STRING: + commonStats = featureNameStatistics.getStringStats().getCommonStats(); + break; + case BYTES: + commonStats = featureNameStatistics.getBytesStats().getCommonStats(); + break; + case FLOAT: + case INT: + commonStats = featureNameStatistics.getNumStats().getCommonStats(); + break; + default: + throw new RuntimeException("Unable to extract dataset size; Invalid type provided"); + } + return commonStats.getNumNonMissing() + commonStats.getNumMissing(); + } } diff --git a/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/stats/BigQueryStatisticsRetriever.java b/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/statistics/BigQueryStatisticsRetriever.java similarity index 86% rename from storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/stats/BigQueryStatisticsRetriever.java rename to storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/statistics/BigQueryStatisticsRetriever.java index 85d0f2dc13..b63d829ce8 100644 --- a/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/stats/BigQueryStatisticsRetriever.java +++ b/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/statistics/BigQueryStatisticsRetriever.java @@ -14,9 +14,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package feast.storage.connectors.bigquery.stats; +package feast.storage.connectors.bigquery.statistics; -import static feast.storage.connectors.bigquery.stats.StatsUtil.toFeatureNameStatistics; +import static feast.storage.connectors.bigquery.statistics.StatsUtil.toFeatureNameStatistics; import com.google.auto.value.AutoValue; import com.google.cloud.bigquery.BigQuery; @@ -99,30 +99,24 @@ private FeatureSetStatistics getFeatureSetStatistics( featureSetSpec = featureSetSpecBuilder.build(); try { + // Generate SQL for and retrieve non-histogram statistics String getFeatureSetStatsQuery = StatsQueryTemplater.createGetFeatureSetStatsQuery( featureSetStatisticsQueryInfo, projectId(), datasetId()); - String getFeatureSetHistQuery = - StatsQueryTemplater.createGetFeatureSetHistQuery( - featureSetStatisticsQueryInfo, projectId(), datasetId()); QueryJobConfiguration queryJobConfiguration = QueryJobConfiguration.newBuilder(getFeatureSetStatsQuery).build(); TableResult basicStats = bigquery().query(queryJobConfiguration); + + // Generate SQL for and retrieve histogram statistics + String getFeatureSetHistQuery = + StatsQueryTemplater.createGetFeatureSetHistQuery( + featureSetStatisticsQueryInfo, projectId(), datasetId()); queryJobConfiguration = QueryJobConfiguration.newBuilder(getFeatureSetHistQuery).build(); TableResult hist = bigquery().query(queryJobConfiguration); - Map basicStatsValues = - Streams.stream(basicStats.getValues()) - .collect( - Collectors.toMap( - fieldValueList -> fieldValueList.get(0).getStringValue(), - fieldValueList -> fieldValueList)); - Map histValues = - Streams.stream(hist.getValues()) - .collect( - Collectors.toMap( - fieldValueList -> fieldValueList.get(0).getStringValue(), - fieldValueList -> fieldValueList)); + // Convert to map of feature_name:row containing the statistics + Map basicStatsValues = getTableResultByFeatureName(basicStats); + Map histValues = getTableResultByFeatureName(hist); int totalCountIndex = basicStats.getSchema().getFields().getIndex("total_count"); FeatureSetStatistics.Builder featureSetStatisticsBuilder = @@ -130,6 +124,7 @@ private FeatureSetStatistics getFeatureSetStatistics( .setNumExamples( basicStatsValues.get(features.get(0)).get(totalCountIndex).getLongValue()); + // Convert BQ rows to FeatureNameStatistics for (FeatureSpec featureSpec : featureSetSpec.getFeaturesList()) { FeatureNameStatistics featureNameStatistics = toFeatureNameStatistics( @@ -149,4 +144,12 @@ private FeatureSetStatistics getFeatureSetStatistics( e); } } + + private Map getTableResultByFeatureName(TableResult basicStats) { + return Streams.stream(basicStats.getValues()) + .collect( + Collectors.toMap( + fieldValueList -> fieldValueList.get(0).getStringValue(), + fieldValueList -> fieldValueList)); + } } diff --git a/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/stats/FeatureSetStatisticsQueryInfo.java b/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/statistics/FeatureSetStatisticsQueryInfo.java similarity index 94% rename from storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/stats/FeatureSetStatisticsQueryInfo.java rename to storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/statistics/FeatureSetStatisticsQueryInfo.java index 5809db7455..43bf18f0f9 100644 --- a/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/stats/FeatureSetStatisticsQueryInfo.java +++ b/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/statistics/FeatureSetStatisticsQueryInfo.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package feast.storage.connectors.bigquery.stats; +package feast.storage.connectors.bigquery.statistics; import com.google.protobuf.Timestamp; import feast.core.FeatureSetProto.FeatureSpec; @@ -25,6 +25,10 @@ import org.joda.time.format.DateTimeFormat; import org.joda.time.format.DateTimeFormatter; +/** + * Value class for Feature Sets containing information necessary to template stats-retrieving + * queries. + */ public class FeatureSetStatisticsQueryInfo { private final String project; private final String name; diff --git a/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/stats/FeatureStatisticsQueryInfo.java b/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/statistics/FeatureStatisticsQueryInfo.java similarity index 92% rename from storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/stats/FeatureStatisticsQueryInfo.java rename to storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/statistics/FeatureStatisticsQueryInfo.java index 73bbdfbac2..209ea8f41d 100644 --- a/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/stats/FeatureStatisticsQueryInfo.java +++ b/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/statistics/FeatureStatisticsQueryInfo.java @@ -14,11 +14,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package feast.storage.connectors.bigquery.stats; +package feast.storage.connectors.bigquery.statistics; import feast.core.FeatureSetProto.FeatureSpec; import feast.types.ValueProto.ValueType.Enum; +/** + * Value class for Features containing information necessary to template stats-retrieving queries. + */ public class FeatureStatisticsQueryInfo { private final String name; private final String type; diff --git a/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/stats/StatsQueryTemplater.java b/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/statistics/StatsQueryTemplater.java similarity index 98% rename from storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/stats/StatsQueryTemplater.java rename to storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/statistics/StatsQueryTemplater.java index 1552a1f4f5..8103b2cb48 100644 --- a/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/stats/StatsQueryTemplater.java +++ b/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/statistics/StatsQueryTemplater.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package feast.storage.connectors.bigquery.stats; +package feast.storage.connectors.bigquery.statistics; import com.mitchellbosecke.pebble.PebbleEngine; import com.mitchellbosecke.pebble.template.PebbleTemplate; diff --git a/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/stats/StatsUtil.java b/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/statistics/StatsUtil.java similarity index 90% rename from storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/stats/StatsUtil.java rename to storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/statistics/StatsUtil.java index 40dca0d7df..0747e44ad8 100644 --- a/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/stats/StatsUtil.java +++ b/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/statistics/StatsUtil.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package feast.storage.connectors.bigquery.stats; +package feast.storage.connectors.bigquery.statistics; import com.google.cloud.bigquery.FieldList; import com.google.cloud.bigquery.FieldValue; @@ -54,6 +54,22 @@ public class StatsUtil { TFDV_TYPE_MAP.put(Enum.DOUBLE_LIST, Type.STRUCT); } + /** + * Convert BQ-retrieved statistics to the corresponding TFDV {@link FeatureNameStatistics} + * specific to the feature type. + * + * @param featureSpec {@link FeatureSpec} of the feature + * @param basicStatsSchema BigQuery {@link Schema} of the retrieved statistics row for the + * non-histogram statistics. Used to retrieve the column names corresponding to each value in + * the row. + * @param basicStatsValues BigQuery {@link FieldValueList} containing a single row of + * non-histogram statistics retrieved from BigQuery + * @param histSchema BigQuery {@link Schema} of the retrieved statistics row for the histogram + * statistics. Used to retrieve the column names corresponding to each value in the row. + * @param histValues BigQuery {@link FieldValueList} containing a single row of histogram + * statistics retrieved from BigQuery + * @return {@link FeatureNameStatistics} + */ public static FeatureNameStatistics toFeatureNameStatistics( FeatureSpec featureSpec, Schema basicStatsSchema, diff --git a/storage/connectors/bigquery/src/test/java/feast/storage/connectors/bigquery/stats/BigQueryStatisticsRetrieverTest.java b/storage/connectors/bigquery/src/test/java/feast/storage/connectors/bigquery/statistics/BigQueryStatisticsRetrieverTest.java similarity index 99% rename from storage/connectors/bigquery/src/test/java/feast/storage/connectors/bigquery/stats/BigQueryStatisticsRetrieverTest.java rename to storage/connectors/bigquery/src/test/java/feast/storage/connectors/bigquery/statistics/BigQueryStatisticsRetrieverTest.java index 28134251cb..ff3ef368e2 100644 --- a/storage/connectors/bigquery/src/test/java/feast/storage/connectors/bigquery/stats/BigQueryStatisticsRetrieverTest.java +++ b/storage/connectors/bigquery/src/test/java/feast/storage/connectors/bigquery/statistics/BigQueryStatisticsRetrieverTest.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package feast.storage.connectors.bigquery.stats; +package feast.storage.connectors.bigquery.statistics; import com.google.cloud.bigquery.BigQueryOptions; import com.google.protobuf.InvalidProtocolBufferException; diff --git a/storage/connectors/bigquery/src/test/java/feast/storage/connectors/bigquery/stats/StatsUtilTest.java b/storage/connectors/bigquery/src/test/java/feast/storage/connectors/bigquery/statistics/StatsUtilTest.java similarity index 98% rename from storage/connectors/bigquery/src/test/java/feast/storage/connectors/bigquery/stats/StatsUtilTest.java rename to storage/connectors/bigquery/src/test/java/feast/storage/connectors/bigquery/statistics/StatsUtilTest.java index 1ab1778b3f..96f9bce5cf 100644 --- a/storage/connectors/bigquery/src/test/java/feast/storage/connectors/bigquery/stats/StatsUtilTest.java +++ b/storage/connectors/bigquery/src/test/java/feast/storage/connectors/bigquery/statistics/StatsUtilTest.java @@ -14,9 +14,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package feast.storage.connectors.bigquery.stats; +package feast.storage.connectors.bigquery.statistics; -import static feast.storage.connectors.bigquery.stats.StatsUtil.toFeatureNameStatistics; +import static feast.storage.connectors.bigquery.statistics.StatsUtil.toFeatureNameStatistics; import static org.hamcrest.CoreMatchers.equalTo; import static org.hamcrest.MatcherAssert.assertThat;