diff --git a/core/src/main/java/feast/core/model/Entity.java b/core/src/main/java/feast/core/model/Entity.java
index 574abb0c4e..fd26f95afa 100644
--- a/core/src/main/java/feast/core/model/Entity.java
+++ b/core/src/main/java/feast/core/model/Entity.java
@@ -23,6 +23,9 @@
import lombok.Getter;
import lombok.Setter;
+/**
+ * Feast entity object. Contains name, type as well as domain metadata about the entity.
+ */
@Getter
@Setter
@javax.persistence.Entity
diff --git a/core/src/main/java/feast/core/model/Feature.java b/core/src/main/java/feast/core/model/Feature.java
index d21411fbf7..51b74e45af 100644
--- a/core/src/main/java/feast/core/model/Feature.java
+++ b/core/src/main/java/feast/core/model/Feature.java
@@ -25,6 +25,9 @@
import lombok.Getter;
import lombok.Setter;
+/**
+ * Feature belonging to a featureset. Contains name, type as well as domain metadata about the feature.
+ */
@Getter
@Setter
@Entity
diff --git a/core/src/main/java/feast/core/service/StatsService.java b/core/src/main/java/feast/core/service/StatsService.java
index 614fa22a33..933ce3e6a6 100644
--- a/core/src/main/java/feast/core/service/StatsService.java
+++ b/core/src/main/java/feast/core/service/StatsService.java
@@ -37,7 +37,7 @@
import feast.core.model.FieldId;
import feast.storage.api.statistics.FeatureSetStatistics;
import feast.storage.api.statistics.StatisticsRetriever;
-import feast.storage.connectors.bigquery.stats.BigQueryStatisticsRetriever;
+import feast.storage.connectors.bigquery.statistics.BigQueryStatisticsRetriever;
import java.io.IOException;
import java.time.Instant;
import java.util.*;
@@ -49,6 +49,7 @@
import org.tensorflow.metadata.v0.*;
import org.tensorflow.metadata.v0.FeatureNameStatistics.Builder;
+/** Facilitates the retrieval of feature set statistics from historical stores. */
@Slf4j
@Service
public class StatsService {
@@ -67,6 +68,26 @@ public StatsService(
this.featureStatisticsRepository = featureStatisticsRepository;
}
+ /**
+ * Get {@link DatasetFeatureStatistics} for the requested feature set in the provided datasets or
+ * date range for the store provided. The {@link DatasetFeatureStatistics} will contain a list of
+ * {@link FeatureNameStatistics} for each feature requested. Results retrieved will be cached
+ * indefinitely. To force Feast to recompute the statistics, set forceRefresh to true.
+ *
+ *
Only one of datasetIds or startDate/endDate should be provided. If both are provided, the
+ * former will be used over the latter.
+ *
+ *
If multiple datasetIds or if the date ranges over a few days, statistics will be retrieved
+ * for each single unit (dataset id or day) and results aggregated across that set. As a result of
+ * this, in such a scenario, statistics that cannot be aggregated will be dropped. This includes
+ * all histograms and quantiles, unique values, and top value counts.
+ *
+ * @param request {@link GetFeatureStatisticsRequest} containing feature set name, subset of
+ * features, dataset ids or date range, and store to retrieve the data from.
+ * @return {@link GetFeatureStatisticsResponse} containing {@link DatasetFeatureStatistics} with
+ * the feature statistics requested.
+ * @throws IOException
+ */
@Transactional
public GetFeatureStatisticsResponse getFeatureStatistics(GetFeatureStatisticsRequest request)
throws IOException {
@@ -86,7 +107,11 @@ public GetFeatureStatisticsResponse getFeatureStatistics(GetFeatureStatisticsReq
while (timestamp < request.getEndDate().getSeconds()) {
List featureNameStatistics =
getFeatureNameStatisticsByDate(
- statisticsRetriever, featureSetSpec, features, timestamp);
+ statisticsRetriever,
+ featureSetSpec,
+ features,
+ timestamp,
+ request.getForceRefresh());
featureNameStatisticsList.add(featureNameStatistics);
timestamp += 86400; // advance by a day
}
@@ -95,16 +120,23 @@ public GetFeatureStatisticsResponse getFeatureStatistics(GetFeatureStatisticsReq
for (String datasetId : request.getDatasetIdsList()) {
List featureNameStatistics =
getFeatureNameStatisticsByDataset(
- statisticsRetriever, featureSetSpec, features, datasetId);
+ statisticsRetriever,
+ featureSetSpec,
+ features,
+ datasetId,
+ request.getForceRefresh());
featureNameStatisticsList.add(featureNameStatistics);
}
}
List featureNameStatistics = mergeStatistics(featureNameStatisticsList);
+ long totalCount = getTotalCount(featureNameStatistics.get(0));
return GetFeatureStatisticsResponse.newBuilder()
.setDatasetFeatureStatisticsList(
DatasetFeatureStatisticsList.newBuilder()
.addDatasets(
- DatasetFeatureStatistics.newBuilder().addAllFeatures(featureNameStatistics)))
+ DatasetFeatureStatistics.newBuilder()
+ .setNumExamples(totalCount)
+ .addAllFeatures(featureNameStatistics)))
.build();
}
@@ -112,7 +144,8 @@ private List getFeatureNameStatisticsByDataset(
StatisticsRetriever statisticsRetriever,
FeatureSetSpec featureSetSpec,
List features,
- String datasetId)
+ String datasetId,
+ boolean forceRefresh)
throws IOException {
List featureNameStatistics = new ArrayList<>();
List featuresMissingStats = new ArrayList<>();
@@ -124,9 +157,12 @@ private List getFeatureNameStatisticsByDataset(
featureSetSpec.getName(),
featureSetSpec.getVersion(),
featureName));
- Optional cachedFeatureStatistics =
- featureStatisticsRepository.findFeatureStatisticsByFeatureAndDatasetId(
- feature, datasetId);
+ Optional cachedFeatureStatistics = Optional.empty();
+ if (!forceRefresh) {
+ cachedFeatureStatistics =
+ featureStatisticsRepository.findFeatureStatisticsByFeatureAndDatasetId(
+ feature, datasetId);
+ }
if (cachedFeatureStatistics.isPresent()) {
featureNameStatistics.add(cachedFeatureStatistics.get().toProto());
} else {
@@ -154,7 +190,8 @@ private List getFeatureNameStatisticsByDate(
StatisticsRetriever statisticsRetriever,
FeatureSetSpec featureSetSpec,
List features,
- long timestamp)
+ long timestamp,
+ boolean forceRefresh)
throws IOException {
Date date = Date.from(Instant.ofEpochSecond(timestamp));
List featureNameStatistics = new ArrayList<>();
@@ -167,8 +204,11 @@ private List getFeatureNameStatisticsByDate(
featureSetSpec.getName(),
featureSetSpec.getVersion(),
featureName));
- Optional cachedFeatureStatistics =
- featureStatisticsRepository.findFeatureStatisticsByFeatureAndDate(feature, date);
+ Optional cachedFeatureStatistics = Optional.empty();
+ if (!forceRefresh) {
+ cachedFeatureStatistics =
+ featureStatisticsRepository.findFeatureStatisticsByFeatureAndDate(feature, date);
+ }
if (cachedFeatureStatistics.isPresent()) {
featureNameStatistics.add(cachedFeatureStatistics.get().toProto());
} else {
@@ -433,4 +473,26 @@ private FeatureNameStatistics mergeByteStatistics(
return mergedFeatureNameStatistics.setBytesStats(mergedBytesStatistics).build();
}
+
+ private long getTotalCount(FeatureNameStatistics featureNameStatistics) {
+ CommonStatistics commonStats;
+ switch (featureNameStatistics.getType()) {
+ case STRUCT:
+ commonStats = featureNameStatistics.getStructStats().getCommonStats();
+ break;
+ case STRING:
+ commonStats = featureNameStatistics.getStringStats().getCommonStats();
+ break;
+ case BYTES:
+ commonStats = featureNameStatistics.getBytesStats().getCommonStats();
+ break;
+ case FLOAT:
+ case INT:
+ commonStats = featureNameStatistics.getNumStats().getCommonStats();
+ break;
+ default:
+ throw new RuntimeException("Unable to extract dataset size; Invalid type provided");
+ }
+ return commonStats.getNumNonMissing() + commonStats.getNumMissing();
+ }
}
diff --git a/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/stats/BigQueryStatisticsRetriever.java b/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/statistics/BigQueryStatisticsRetriever.java
similarity index 86%
rename from storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/stats/BigQueryStatisticsRetriever.java
rename to storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/statistics/BigQueryStatisticsRetriever.java
index 85d0f2dc13..b63d829ce8 100644
--- a/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/stats/BigQueryStatisticsRetriever.java
+++ b/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/statistics/BigQueryStatisticsRetriever.java
@@ -14,9 +14,9 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package feast.storage.connectors.bigquery.stats;
+package feast.storage.connectors.bigquery.statistics;
-import static feast.storage.connectors.bigquery.stats.StatsUtil.toFeatureNameStatistics;
+import static feast.storage.connectors.bigquery.statistics.StatsUtil.toFeatureNameStatistics;
import com.google.auto.value.AutoValue;
import com.google.cloud.bigquery.BigQuery;
@@ -99,30 +99,24 @@ private FeatureSetStatistics getFeatureSetStatistics(
featureSetSpec = featureSetSpecBuilder.build();
try {
+ // Generate SQL for and retrieve non-histogram statistics
String getFeatureSetStatsQuery =
StatsQueryTemplater.createGetFeatureSetStatsQuery(
featureSetStatisticsQueryInfo, projectId(), datasetId());
- String getFeatureSetHistQuery =
- StatsQueryTemplater.createGetFeatureSetHistQuery(
- featureSetStatisticsQueryInfo, projectId(), datasetId());
QueryJobConfiguration queryJobConfiguration =
QueryJobConfiguration.newBuilder(getFeatureSetStatsQuery).build();
TableResult basicStats = bigquery().query(queryJobConfiguration);
+
+ // Generate SQL for and retrieve histogram statistics
+ String getFeatureSetHistQuery =
+ StatsQueryTemplater.createGetFeatureSetHistQuery(
+ featureSetStatisticsQueryInfo, projectId(), datasetId());
queryJobConfiguration = QueryJobConfiguration.newBuilder(getFeatureSetHistQuery).build();
TableResult hist = bigquery().query(queryJobConfiguration);
- Map basicStatsValues =
- Streams.stream(basicStats.getValues())
- .collect(
- Collectors.toMap(
- fieldValueList -> fieldValueList.get(0).getStringValue(),
- fieldValueList -> fieldValueList));
- Map histValues =
- Streams.stream(hist.getValues())
- .collect(
- Collectors.toMap(
- fieldValueList -> fieldValueList.get(0).getStringValue(),
- fieldValueList -> fieldValueList));
+ // Convert to map of feature_name:row containing the statistics
+ Map basicStatsValues = getTableResultByFeatureName(basicStats);
+ Map histValues = getTableResultByFeatureName(hist);
int totalCountIndex = basicStats.getSchema().getFields().getIndex("total_count");
FeatureSetStatistics.Builder featureSetStatisticsBuilder =
@@ -130,6 +124,7 @@ private FeatureSetStatistics getFeatureSetStatistics(
.setNumExamples(
basicStatsValues.get(features.get(0)).get(totalCountIndex).getLongValue());
+ // Convert BQ rows to FeatureNameStatistics
for (FeatureSpec featureSpec : featureSetSpec.getFeaturesList()) {
FeatureNameStatistics featureNameStatistics =
toFeatureNameStatistics(
@@ -149,4 +144,12 @@ private FeatureSetStatistics getFeatureSetStatistics(
e);
}
}
+
+ private Map getTableResultByFeatureName(TableResult basicStats) {
+ return Streams.stream(basicStats.getValues())
+ .collect(
+ Collectors.toMap(
+ fieldValueList -> fieldValueList.get(0).getStringValue(),
+ fieldValueList -> fieldValueList));
+ }
}
diff --git a/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/stats/FeatureSetStatisticsQueryInfo.java b/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/statistics/FeatureSetStatisticsQueryInfo.java
similarity index 94%
rename from storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/stats/FeatureSetStatisticsQueryInfo.java
rename to storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/statistics/FeatureSetStatisticsQueryInfo.java
index 5809db7455..43bf18f0f9 100644
--- a/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/stats/FeatureSetStatisticsQueryInfo.java
+++ b/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/statistics/FeatureSetStatisticsQueryInfo.java
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package feast.storage.connectors.bigquery.stats;
+package feast.storage.connectors.bigquery.statistics;
import com.google.protobuf.Timestamp;
import feast.core.FeatureSetProto.FeatureSpec;
@@ -25,6 +25,10 @@
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
+/**
+ * Value class for Feature Sets containing information necessary to template stats-retrieving
+ * queries.
+ */
public class FeatureSetStatisticsQueryInfo {
private final String project;
private final String name;
diff --git a/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/stats/FeatureStatisticsQueryInfo.java b/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/statistics/FeatureStatisticsQueryInfo.java
similarity index 92%
rename from storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/stats/FeatureStatisticsQueryInfo.java
rename to storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/statistics/FeatureStatisticsQueryInfo.java
index 73bbdfbac2..209ea8f41d 100644
--- a/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/stats/FeatureStatisticsQueryInfo.java
+++ b/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/statistics/FeatureStatisticsQueryInfo.java
@@ -14,11 +14,14 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package feast.storage.connectors.bigquery.stats;
+package feast.storage.connectors.bigquery.statistics;
import feast.core.FeatureSetProto.FeatureSpec;
import feast.types.ValueProto.ValueType.Enum;
+/**
+ * Value class for Features containing information necessary to template stats-retrieving queries.
+ */
public class FeatureStatisticsQueryInfo {
private final String name;
private final String type;
diff --git a/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/stats/StatsQueryTemplater.java b/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/statistics/StatsQueryTemplater.java
similarity index 98%
rename from storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/stats/StatsQueryTemplater.java
rename to storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/statistics/StatsQueryTemplater.java
index 1552a1f4f5..8103b2cb48 100644
--- a/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/stats/StatsQueryTemplater.java
+++ b/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/statistics/StatsQueryTemplater.java
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package feast.storage.connectors.bigquery.stats;
+package feast.storage.connectors.bigquery.statistics;
import com.mitchellbosecke.pebble.PebbleEngine;
import com.mitchellbosecke.pebble.template.PebbleTemplate;
diff --git a/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/stats/StatsUtil.java b/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/statistics/StatsUtil.java
similarity index 90%
rename from storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/stats/StatsUtil.java
rename to storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/statistics/StatsUtil.java
index 40dca0d7df..0747e44ad8 100644
--- a/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/stats/StatsUtil.java
+++ b/storage/connectors/bigquery/src/main/java/feast/storage/connectors/bigquery/statistics/StatsUtil.java
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package feast.storage.connectors.bigquery.stats;
+package feast.storage.connectors.bigquery.statistics;
import com.google.cloud.bigquery.FieldList;
import com.google.cloud.bigquery.FieldValue;
@@ -54,6 +54,22 @@ public class StatsUtil {
TFDV_TYPE_MAP.put(Enum.DOUBLE_LIST, Type.STRUCT);
}
+ /**
+ * Convert BQ-retrieved statistics to the corresponding TFDV {@link FeatureNameStatistics}
+ * specific to the feature type.
+ *
+ * @param featureSpec {@link FeatureSpec} of the feature
+ * @param basicStatsSchema BigQuery {@link Schema} of the retrieved statistics row for the
+ * non-histogram statistics. Used to retrieve the column names corresponding to each value in
+ * the row.
+ * @param basicStatsValues BigQuery {@link FieldValueList} containing a single row of
+ * non-histogram statistics retrieved from BigQuery
+ * @param histSchema BigQuery {@link Schema} of the retrieved statistics row for the histogram
+ * statistics. Used to retrieve the column names corresponding to each value in the row.
+ * @param histValues BigQuery {@link FieldValueList} containing a single row of histogram
+ * statistics retrieved from BigQuery
+ * @return {@link FeatureNameStatistics}
+ */
public static FeatureNameStatistics toFeatureNameStatistics(
FeatureSpec featureSpec,
Schema basicStatsSchema,
diff --git a/storage/connectors/bigquery/src/test/java/feast/storage/connectors/bigquery/stats/BigQueryStatisticsRetrieverTest.java b/storage/connectors/bigquery/src/test/java/feast/storage/connectors/bigquery/statistics/BigQueryStatisticsRetrieverTest.java
similarity index 99%
rename from storage/connectors/bigquery/src/test/java/feast/storage/connectors/bigquery/stats/BigQueryStatisticsRetrieverTest.java
rename to storage/connectors/bigquery/src/test/java/feast/storage/connectors/bigquery/statistics/BigQueryStatisticsRetrieverTest.java
index 28134251cb..ff3ef368e2 100644
--- a/storage/connectors/bigquery/src/test/java/feast/storage/connectors/bigquery/stats/BigQueryStatisticsRetrieverTest.java
+++ b/storage/connectors/bigquery/src/test/java/feast/storage/connectors/bigquery/statistics/BigQueryStatisticsRetrieverTest.java
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package feast.storage.connectors.bigquery.stats;
+package feast.storage.connectors.bigquery.statistics;
import com.google.cloud.bigquery.BigQueryOptions;
import com.google.protobuf.InvalidProtocolBufferException;
diff --git a/storage/connectors/bigquery/src/test/java/feast/storage/connectors/bigquery/stats/StatsUtilTest.java b/storage/connectors/bigquery/src/test/java/feast/storage/connectors/bigquery/statistics/StatsUtilTest.java
similarity index 98%
rename from storage/connectors/bigquery/src/test/java/feast/storage/connectors/bigquery/stats/StatsUtilTest.java
rename to storage/connectors/bigquery/src/test/java/feast/storage/connectors/bigquery/statistics/StatsUtilTest.java
index 1ab1778b3f..96f9bce5cf 100644
--- a/storage/connectors/bigquery/src/test/java/feast/storage/connectors/bigquery/stats/StatsUtilTest.java
+++ b/storage/connectors/bigquery/src/test/java/feast/storage/connectors/bigquery/statistics/StatsUtilTest.java
@@ -14,9 +14,9 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package feast.storage.connectors.bigquery.stats;
+package feast.storage.connectors.bigquery.statistics;
-import static feast.storage.connectors.bigquery.stats.StatsUtil.toFeatureNameStatistics;
+import static feast.storage.connectors.bigquery.statistics.StatsUtil.toFeatureNameStatistics;
import static org.hamcrest.CoreMatchers.equalTo;
import static org.hamcrest.MatcherAssert.assertThat;