-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* New recipe for 1.2.0 * Added makefile, sample data * Added link to new recipe * Type-o fixed
- Loading branch information
Showing
8 changed files
with
1,398 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
include ../Makefile | ||
|
||
infra: | ||
@docker-compose up | ||
|
||
# batch: copy the sample data inside the pinot-controller container, then launch the ingestion job. | ||
# The copy goes through `sh -c` so that the /data/* glob is expanded inside the container | ||
# rather than by the host shell that make uses to run this recipe. | ||
batch: | ||
@docker exec -it pinot-controller sh -c 'cp /data/* /opt/pinot/data' | ||
@docker exec -it pinot-controller \ | ||
./bin/pinot-admin.sh LaunchDataIngestionJob -jobSpecFile /config/jobspec.yaml | ||
|
||
recipe: infra check_pinot logger batch | ||
|
||
validate: | ||
|
||
clean: | ||
@docker-compose down |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
# Working with nested JSON documents | ||
|
||
> In this recipe we'll learn how to work with JSON documents using regex, isJson and array functions added in Apache Pinot 1.2.0. | ||
<table> | ||
<tr> | ||
<td>Pinot Version</td> | ||
<td>1.2.0</td> | ||
</tr> | ||
<tr> | ||
<td>Schema</td> | ||
<td><a href="config/schema.json">config/schema.json</a></td> | ||
</tr> | ||
<tr> | ||
<td>Table Config</td> | ||
<td><a href="config/table.json">config/table.json</a></td> | ||
</tr> | ||
<tr> | ||
<td>Ingestion Job</td> | ||
<td><a href="config/jobspec.yaml">config/jobspec.yaml</a></td> | ||
</tr> | ||
</table> | ||
|
||
*** | ||
|
||
Clone this repository and navigate to this recipe: | ||
|
||
```bash | ||
git clone git@github.com:startreedata/pinot-recipes.git | ||
cd pinot-recipes/recipes/json-regex | ||
``` | ||
|
||
Spin up a Pinot cluster using Docker Compose: | ||
|
||
```bash | ||
docker-compose up | ||
``` | ||
|
||
Add some data: | ||
|
||
```bash | ||
docker exec -it pinot-controller sh -c 'cp /data/* /opt/pinot/data' | ||
docker exec -it pinot-controller /opt/pinot/bin/pinot-admin.sh LaunchDataIngestionJob -jobSpecFile /config/jobspec.yaml | ||
``` | ||
|
||
Query Pinot: | ||
|
||
```sql | ||
SELECT * FROM github_events WHERE JSON_MATCH(actor_json, 'REGEXP_LIKE("$.login", ''maria(.)*'')') | ||
``` | ||
|
||
```sql | ||
SELECT * FROM github_events WHERE JSON_MATCH(actor_json, '"$.id" > ''35560568''') | ||
``` | ||
|
||
```sql | ||
SELECT payload_commits, isJson(payload_commits) from github_events limit 10 | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
# | ||
# Copyright 2021 StarTree Inc. All rights reserved. Confidential and Proprietary Information of StarTree Inc. | ||
# | ||
|
||
# executionFrameworkSpec: Defines the execution framework and runner classes used to run ingestion jobs. | ||
executionFrameworkSpec: | ||
|
||
# name: execution framework name | ||
name: 'standalone' | ||
|
||
# segmentGenerationJobRunnerClassName: name of the class that implements the org.apache.pinot.spi.batch.ingestion.runner.SegmentGenerationJobRunner interface. | ||
segmentGenerationJobRunnerClassName: 'org.apache.pinot.plugin.ingestion.batch.standalone.SegmentGenerationJobRunner' | ||
|
||
# segmentTarPushJobRunnerClassName: name of the class that implements the org.apache.pinot.spi.batch.ingestion.runner.SegmentTarPushJobRunner interface. | ||
segmentTarPushJobRunnerClassName: 'org.apache.pinot.plugin.ingestion.batch.standalone.SegmentTarPushJobRunner' | ||
|
||
# jobType: Pinot ingestion job type. | ||
# Supported job types are: | ||
# 'SegmentCreation' | ||
# 'SegmentTarPush' | ||
# 'SegmentUriPush' | ||
# 'SegmentCreationAndTarPush' | ||
# 'SegmentCreationAndUriPush' | ||
jobType: SegmentCreationAndTarPush | ||
|
||
# inputDirURI: Root directory of input data, expected to have scheme configured in PinotFS. | ||
inputDirURI: '/opt/pinot/data' | ||
|
||
# includeFileNamePattern: pattern for the file names to include; glob patterns are supported. | ||
# Sample usage: | ||
# 'glob:*.avro' will include all avro files just under the inputDirURI, not sub directories; | ||
# 'glob:**/*.avro' will include all the avro files under inputDirURI recursively. | ||
includeFileNamePattern: 'glob:**/*.json' | ||
|
||
# excludeFileNamePattern: pattern for the file names to exclude; glob patterns are supported. | ||
# Sample usage: | ||
# 'glob:*.avro' will exclude all avro files just under the inputDirURI, not sub directories; | ||
# 'glob:**/*.avro' will exclude all the avro files under inputDirURI recursively. | ||
# _excludeFileNamePattern: '' | ||
|
||
# outputDirURI: Root directory of output segments, expected to have scheme configured in PinotFS. | ||
outputDirURI: '/opt/pinot/data/segments' | ||
|
||
# overwriteOutput: Overwrite output segments if they already exist. | ||
overwriteOutput: true | ||
|
||
# pinotFSSpecs: defines all related Pinot file systems. | ||
pinotFSSpecs: | ||
|
||
- # scheme: used to identify a PinotFS. | ||
# E.g. local, hdfs, dbfs, etc | ||
scheme: file | ||
|
||
# className: Class name used to create the PinotFS instance. | ||
# E.g. | ||
# org.apache.pinot.spi.filesystem.LocalPinotFS is used for local filesystem | ||
# org.apache.pinot.plugin.filesystem.AzurePinotFS is used for Azure Data Lake | ||
# org.apache.pinot.plugin.filesystem.HadoopPinotFS is used for HDFS | ||
className: org.apache.pinot.spi.filesystem.LocalPinotFS | ||
|
||
# recordReaderSpec: defines the record reader configuration. | ||
recordReaderSpec: | ||
|
||
# dataFormat: Record data format, e.g. 'avro', 'parquet', 'orc', 'csv', 'json', 'thrift' etc. | ||
dataFormat: 'json' | ||
|
||
# className: Corresponding RecordReader class name. | ||
# E.g. | ||
# org.apache.pinot.plugin.inputformat.avro.AvroRecordReader | ||
# org.apache.pinot.plugin.inputformat.csv.CSVRecordReader | ||
# org.apache.pinot.plugin.inputformat.parquet.ParquetRecordReader | ||
# org.apache.pinot.plugin.inputformat.json.JSONRecordReader | ||
# org.apache.pinot.plugin.inputformat.orc.ORCRecordReader | ||
# org.apache.pinot.plugin.inputformat.thrift.ThriftRecordReader | ||
className: 'org.apache.pinot.plugin.inputformat.json.JSONRecordReader' | ||
|
||
# tableSpec: defines table name and where to fetch corresponding table config and table schema. | ||
tableSpec: | ||
|
||
# tableName: Table name | ||
tableName: 'github_events' | ||
|
||
# pinotClusterSpecs: defines the Pinot Cluster Access Point. | ||
pinotClusterSpecs: | ||
- # controllerURI: used to fetch table/schema information and data push. | ||
# E.g. http://localhost:9000 | ||
controllerURI: 'http://localhost:9000' | ||
|
||
# pushJobSpec: defines segment push job related configuration. | ||
pushJobSpec: | ||
|
||
# pushAttempts: number of attempts for push job, default is 1, which means no retry. | ||
pushAttempts: 2 | ||
pushParallelism: 2 | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
{ | ||
"schemaName": "github_events", | ||
"dimensionFieldSpecs": [ | ||
{ | ||
"name": "id", | ||
"dataType": "STRING" | ||
}, | ||
{ | ||
"name": "type", | ||
"dataType": "STRING" | ||
}, | ||
{ | ||
"name": "repo_name", | ||
"dataType": "STRING" | ||
}, | ||
{ | ||
"name": "repo_name_fst", | ||
"dataType": "STRING" | ||
}, | ||
{ | ||
"name": "actor_json", | ||
"dataType": "JSON" | ||
}, | ||
{ | ||
"name": "payload_commits", | ||
"dataType": "STRING", | ||
"maxLength": 2147483647 | ||
}, | ||
{ | ||
"name": "payload_pull_request", | ||
"dataType": "STRING", | ||
"maxLength": 2147483647 | ||
}, | ||
{ | ||
"name": "commit_author_names", | ||
"dataType": "STRING", | ||
"singleValueField": false | ||
}, | ||
{ | ||
"name": "label_ids", | ||
"dataType": "LONG", | ||
"singleValueField": false | ||
} | ||
], | ||
"dateTimeFieldSpecs": [ | ||
{ | ||
"name": "created_at", | ||
"dataType": "STRING", | ||
"format": "1:SECONDS:SIMPLE_DATE_FORMAT:yyyy-MM-dd'T'HH:mm:ss'Z'", | ||
"granularity": "1:HOURS" | ||
} | ||
] | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
{ | ||
"tableName": "github_events", | ||
"tableType": "OFFLINE", | ||
"segmentsConfig": { | ||
"retentionTimeUnit": "DAYS", | ||
"retentionTimeValue": "1", | ||
"segmentPushType": "APPEND", | ||
"segmentAssignmentStrategy": "BalanceNumSegmentAssignmentStrategy", | ||
"schemaName": "github_events", | ||
"replication": "1" | ||
}, | ||
"tenants": { | ||
}, | ||
"ingestionConfig": { | ||
"transformConfigs": [ | ||
{ | ||
"columnName": "repo_name", | ||
"transformFunction": "jsonPathString(repo, '$.name', 'null')" | ||
}, | ||
{ | ||
"columnName": "repo_name_fst", | ||
"transformFunction": "jsonPathString(repo, '$.name', 'null')" | ||
}, | ||
{ | ||
"columnName": "actor_json", | ||
"transformFunction": "jsonFormat(actor)" | ||
}, | ||
{ | ||
"columnName": "payload_commits", | ||
"transformFunction": "jsonPathString(payload, '$.commits', 'null')" | ||
}, | ||
{ | ||
"columnName": "payload_pull_request", | ||
"transformFunction": "jsonPathString(payload, '$.pull_request', 'null')" | ||
}, | ||
{ | ||
"columnName": "commit_author_names", | ||
"transformFunction": "jsonPathArrayDefaultEmpty(payload, '$.commits[*].author.name')" | ||
}, | ||
{ | ||
"columnName": "label_ids", | ||
"transformFunction": "jsonPathArrayDefaultEmpty(payload, '$.pull_request.labels[*].id')" | ||
} | ||
] | ||
}, | ||
"fieldConfigList": [ | ||
{ | ||
"name": "payload_commits", | ||
"encodingType": "RAW", | ||
"indexType": "TEXT" | ||
}, | ||
{ | ||
"name": "payload_pull_request", | ||
"encodingType": "RAW", | ||
"indexType": "TEXT" | ||
}, | ||
{ | ||
"name": "repo_name_fst", | ||
"encodingType": "DICTIONARY", | ||
"indexType": "FST" | ||
} | ||
], | ||
"tableIndexConfig": { | ||
"loadMode": "MMAP", | ||
"sortedColumn": [ | ||
"created_at" | ||
], | ||
"invertedIndexColumns": [ | ||
"id", | ||
"type", | ||
"repo_name", | ||
"commit_author_names", | ||
"label_ids" | ||
], | ||
"bloomFilterColumns": [ | ||
"id", | ||
"commit_author_names", | ||
"label_ids" | ||
], | ||
"jsonIndexColumns": [ | ||
"actor_json", | ||
"payload_commits" | ||
], | ||
"noDictionaryColumns": [ | ||
"payload_commits", | ||
"payload_pull_request" | ||
] | ||
}, | ||
"metadata": { | ||
"customConfigs": { | ||
} | ||
} | ||
} |
Large diffs are not rendered by default.
Oops, something went wrong.
Oops, something went wrong.