diff --git a/.github/get-sonatype-credentials.sh b/.github/get-sonatype-credentials.sh new file mode 100755 index 00000000000..f1660f509d5 --- /dev/null +++ b/.github/get-sonatype-credentials.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# Script to retrieve Sonatype credentials from AWS Secrets Manager + +SONATYPE_USERNAME=$(aws secretsmanager get-secret-value --secret-id maven-snapshots-username --query SecretString --output text) +SONATYPE_PASSWORD=$(aws secretsmanager get-secret-value --secret-id maven-snapshots-password --query SecretString --output text) +echo "::add-mask::$SONATYPE_USERNAME" +echo "::add-mask::$SONATYPE_PASSWORD" +echo "SONATYPE_USERNAME=$SONATYPE_USERNAME" >> $GITHUB_ENV +echo "SONATYPE_PASSWORD=$SONATYPE_PASSWORD" >> $GITHUB_ENV \ No newline at end of file diff --git a/.github/workflows/maven-publish.yml b/.github/workflows/maven-publish.yml index 6ea76fd783a..019ff7384eb 100644 --- a/.github/workflows/maven-publish.yml +++ b/.github/workflows/maven-publish.yml @@ -8,6 +8,9 @@ on: - 1.* - 2.* +env: + SNAPSHOT_REPO_URL: https://aws.oss.sonatype.org/content/repositories/snapshots/ + jobs: build-and-publish-snapshots: strategy: @@ -30,33 +33,11 @@ jobs: role-to-assume: ${{ secrets.PUBLISH_SNAPSHOTS_ROLE }} aws-region: us-east-1 - # Create the initial direct-query directory structure - - name: Create direct-query directory structure in repository + - name: get credentials run: | # Get credentials for publishing - export SONATYPE_USERNAME=$(aws secretsmanager get-secret-value --secret-id maven-snapshots-username --query SecretString --output text) - export SONATYPE_PASSWORD=$(aws secretsmanager get-secret-value --secret-id maven-snapshots-password --query SecretString --output text) - echo "::add-mask::$SONATYPE_USERNAME" - echo "::add-mask::$SONATYPE_PASSWORD" - - # Create a placeholder file - TEMP_DIR=$(mktemp -d) - echo "Directory placeholder - $(date)" > "${TEMP_DIR}/.placeholder" - - # Upload the placeholder file to create the directory structure - echo "Creating initial directory structure..." - curl -X PUT -u "${SONATYPE_USERNAME}:${SONATYPE_PASSWORD}" \ - --upload-file "${TEMP_DIR}/.placeholder" \ - "https://aws.oss.sonatype.org/content/repositories/snapshots/org/opensearch/direct-query/.placeholder" - - # Clean up - rm -rf "${TEMP_DIR}" - echo "Directory structure created" + .github/get-sonatype-credentials.sh - name: publish snapshots to maven run: | - export SONATYPE_USERNAME=$(aws secretsmanager get-secret-value --secret-id maven-snapshots-username --query SecretString --output text) - export SONATYPE_PASSWORD=$(aws secretsmanager get-secret-value --secret-id maven-snapshots-password --query SecretString --output text) - echo "::add-mask::$SONATYPE_USERNAME" - echo "::add-mask::$SONATYPE_PASSWORD" - ./gradlew publishPluginZipPublicationToSnapshotsRepository + ./gradlew publishPluginZipPublicationToSnapshotsRepository \ No newline at end of file diff --git a/.github/workflows/publish-snapshots-and-grammar-files.yml b/.github/workflows/publish-snapshots-and-grammar-files.yml new file mode 100644 index 00000000000..7417c5a39ba --- /dev/null +++ b/.github/workflows/publish-snapshots-and-grammar-files.yml @@ -0,0 +1,354 @@ +name: Publish snapshots and grammar files to maven + +on: + workflow_dispatch: + push: + branches: + - main + - 1.* + - 2.* + +env: + SNAPSHOT_REPO_URL: https://aws.oss.sonatype.org/content/repositories/snapshots/ + +jobs: + publish-grammar-files: + strategy: + fail-fast: false + if: github.repository == 'opensearch-project/sql' + runs-on: ubuntu-latest + + permissions: + id-token: write + contents: write + + env: + TARGET_REPO_PATH: org/opensearch/language-grammar + + steps: + - uses: actions/setup-java@v3 + with: + distribution: temurin # Temurin is a distribution of adoptium + java-version: 21 + - uses: actions/checkout@v3 + - uses: aws-actions/configure-aws-credentials@v1.7.0 + with: + role-to-assume: ${{ secrets.PUBLISH_SNAPSHOTS_ROLE }} + aws-region: us-east-1 + + # Get Maven credentials + - name: Setup publishing credentials + id: creds + run: | + .github/get-sonatype-credentials.sh + + # Extract version information directly from build.gradle + - name: Set version + id: set_version + run: | + VERSION=$(grep "version = " ./language-grammar/build.gradle | cut -d "'" -f 2) + echo "VERSION=${VERSION}" >> $GITHUB_OUTPUT + echo "Using version: ${VERSION}" + + # Capture commit ID + - name: Set commit ID + id: set_commit + run: | + COMMIT_ID=$(git log -1 --format='%H') + echo "commit_id=${COMMIT_ID}" >> $GITHUB_OUTPUT + echo "Using commit ID: ${COMMIT_ID}" + + # Create ZIP of grammar files + - name: Package grammar files + run: | + # Create directory for the zip content + mkdir -p grammar_files + + # Copy all .g4 files to the directory + find ./language-grammar/src/main/antlr4 -name "*.g4" -type f -exec cp {} grammar_files/ \; + + # List the files that will be included in the zip + echo "Files to be included in the zip:" + ls -la grammar_files/ + + # Create zip file + cd grammar_files + zip -r ../grammar.zip ./* + cd .. + + # Check the zip file + ls -la grammar.zip + + - name: Prepare for Maven publishing + run: | + # Define constants + ARTIFACT_ID="language-grammar" + GROUP_ID="org.opensearch" + VERSION="${{ steps.set_version.outputs.VERSION }}" + + # Create directory structure for Maven + MAVEN_LOCAL_PATH="${HOME}/.m2/repository/${GROUP_ID//.//}/${ARTIFACT_ID}/${VERSION}" + mkdir -p "${MAVEN_LOCAL_PATH}" + + # Copy the zip file to Maven directory with proper naming + MAVEN_ZIP_NAME="${ARTIFACT_ID}-${VERSION}.zip" + cp grammar.zip "${MAVEN_LOCAL_PATH}/${MAVEN_ZIP_NAME}" + + # Generate POM file + cat > "${MAVEN_LOCAL_PATH}/${ARTIFACT_ID}-${VERSION}.pom" << EOF + + + 4.0.0 + ${GROUP_ID} + ${ARTIFACT_ID} + ${VERSION} + zip + OpenSearch Language Grammar Files + + EOF + + echo "Grammar files prepared for Maven publishing as version ${VERSION}" + + # Generate checksums for the Maven artifacts + - name: Generate checksums + run: | + for i in `find ${HOME}/.m2/repository/org/opensearch/ -name "*.pom" -type f`; do sha512sum "$i" | awk '{print $1}' >> "$i.sha512"; done + for i in `find ${HOME}/.m2/repository/org/opensearch/ -name "*.zip" -type f`; do sha512sum "$i" | awk '{print $1}' >> "$i.sha512"; done + for i in `find ${HOME}/.m2/repository/org/opensearch/ -name "*.pom" -type f`; do sha256sum "$i" | awk '{print $1}' >> "$i.sha256"; done + for i in `find ${HOME}/.m2/repository/org/opensearch/ -name "*.zip" -type f`; do sha256sum "$i" | awk '{print $1}' >> "$i.sha256"; done + + # Checkout build libraries for publishing scripts + - uses: actions/checkout@v4 + with: + repository: 'opensearch-project/opensearch-build-libraries' + path: 'build' + + # Publish to Maven + - name: Publish to Maven + run: | + # Copy local Maven repo to build directory + cd build/resources/publish/ + cp -a $HOME/.m2/repository/* ./ + + # Run the publish script + ./publish-snapshot.sh ./ + + # Update metadata with commit ID + - name: Add commit ID to metadata + run: | + COMMIT_ID="${{ steps.set_commit.outputs.commit_id }}" + ARTIFACT_ID="language-grammar" + VERSION="${{ steps.set_version.outputs.VERSION }}" + + TEMP_DIR=$(mktemp -d) + METADATA_FILE="${TEMP_DIR}/maven-metadata.xml" + + # Download existing metadata + META_URL="${SNAPSHOT_REPO_URL}org/opensearch/${ARTIFACT_ID}/${VERSION}/maven-metadata.xml" + curl -s -u "${SONATYPE_USERNAME}:${SONATYPE_PASSWORD}" -o "${METADATA_FILE}" "${META_URL}" + + if [ -s "${METADATA_FILE}" ]; then + cp "${METADATA_FILE}" "${METADATA_FILE}.bak" + + # Add commit ID to metadata + awk -v commit="${COMMIT_ID}" ' + // { + print $0 + print " " commit "" + next + } + {print} + ' "${METADATA_FILE}.bak" > "${METADATA_FILE}" + + # Upload modified metadata + curl -X PUT -u "${SONATYPE_USERNAME}:${SONATYPE_PASSWORD}" --upload-file "${METADATA_FILE}" "${META_URL}" + + # Update checksums + cd "${TEMP_DIR}" + sha256sum "maven-metadata.xml" | awk '{print $1}' > "maven-metadata.xml.sha256" + sha512sum "maven-metadata.xml" | awk '{print $1}' > "maven-metadata.xml.sha512" + + curl -X PUT -u "${SONATYPE_USERNAME}:${SONATYPE_PASSWORD}" --upload-file "maven-metadata.xml.sha256" "${META_URL}.sha256" + curl -X PUT -u "${SONATYPE_USERNAME}:${SONATYPE_PASSWORD}" --upload-file "maven-metadata.xml.sha512" "${META_URL}.sha512" + + echo "Version metadata updated with commit ID" + else + echo "Failed to download metadata, skipping commit ID addition" + fi + + rm -rf "${TEMP_DIR}" + + publish-async-query-core: + strategy: + fail-fast: false + if: github.repository == 'opensearch-project/sql' + runs-on: ubuntu-latest + + permissions: + id-token: write + contents: write + + steps: + - uses: actions/setup-java@v3 + with: + distribution: temurin + java-version: 21 + - uses: actions/checkout@v3 + - uses: aws-actions/configure-aws-credentials@v1.7.0 + with: + role-to-assume: ${{ secrets.PUBLISH_SNAPSHOTS_ROLE }} + aws-region: us-east-1 + + # Get and mask credentials once in a dedicated step + - name: Setup publishing credentials + id: creds + run: | + .github/get-sonatype-credentials.sh + + # Capture the commit ID for metadata purposes + - name: Set commit ID + id: set_commit + run: | + COMMIT_ID=$(git log -1 --format='%H') + echo "commit_id=${COMMIT_ID}" >> $GITHUB_OUTPUT + echo "Using commit ID: ${COMMIT_ID}" + + # Replace the current "Extract version from build.gradle" step with this: + - name: Extract version from build.gradle + id: extract_version + run: | + # Extract the version directly from the build.gradle file + VERSION=$(grep -m 1 "archiveVersion.set" ./async-query-core/build.gradle | sed -n "s/.*archiveVersion.set('\([^']*\)').*/\1/p") + + # Add -SNAPSHOT suffix for snapshot repository if not already present + if [[ ! $VERSION == *-SNAPSHOT ]]; then + VERSION="${VERSION}-SNAPSHOT" + fi + + echo "VERSION=${VERSION}" >> $GITHUB_OUTPUT + echo "Version: ${VERSION}" + + - name: Build and publish shadow JAR + run: | + # Build the shadow JAR + ./gradlew :async-query-core:shadowJar + + # Define constants + ARTIFACT_ID="async-query-core" + GROUP_PATH="org/opensearch" + VERSION="${{ steps.extract_version.outputs.VERSION }}" + + # Find the generated shadow JAR + SHADOW_JAR=$(find ./async-query-core/build/libs/ -name "*-all.jar" | head -n 1) + + if [ -z "$SHADOW_JAR" ]; then + echo "Error: Shadow JAR not found!" + exit 1 + fi + + # Create directory structure in local Maven repository + MAVEN_LOCAL_PATH="${HOME}/.m2/repository/${GROUP_PATH}/${ARTIFACT_ID}/${VERSION}" + mkdir -p "${MAVEN_LOCAL_PATH}" + + # Copy the shadow JAR to the local Maven repository with proper naming + MAVEN_JAR_NAME="${ARTIFACT_ID}-${VERSION}.jar" + cp "${SHADOW_JAR}" "${MAVEN_LOCAL_PATH}/${MAVEN_JAR_NAME}" + + # Generate a POM file + cat > "${MAVEN_LOCAL_PATH}/${ARTIFACT_ID}-${VERSION}.pom" << EOF + + + 4.0.0 + org.opensearch + ${ARTIFACT_ID} + ${VERSION} + + EOF + + echo "Shadow JAR and POM published to local Maven repository for version ${VERSION}" + + # Checkout opensearch-build-libraries repository for publishing scripts + - uses: actions/checkout@v4 + with: + repository: 'opensearch-project/opensearch-build-libraries' + path: 'build' + + - name: Generate SHA checksums for JAR and POM files + run: | + for i in `find ${HOME}/.m2/repository/org/opensearch/ -name "*.pom" -type f`; do sha512sum "$i" | awk '{print $1}' >> "$i.sha512"; done + for i in `find ${HOME}/.m2/repository/org/opensearch/ -name "*.jar" -type f`; do sha512sum "$i" | awk '{print $1}' >> "$i.sha512"; done + for i in `find ${HOME}/.m2/repository/org/opensearch/ -name "*.pom" -type f`; do sha256sum "$i" | awk '{print $1}' >> "$i.sha256"; done + for i in `find ${HOME}/.m2/repository/org/opensearch/ -name "*.jar" -type f`; do sha256sum "$i" | awk '{print $1}' >> "$i.sha256"; done + + - name: Install XML tools + run: sudo apt-get update && sudo apt-get install -y xmlstarlet + + - name: Publish snapshots to maven + run: | + # Publish snapshots to maven + cd build/resources/publish/ + cp -a $HOME/.m2/repository/* ./ + ./publish-snapshot.sh ./ + + - name: Update version metadata with commit ID + run: | + COMMIT_ID="${{ steps.set_commit.outputs.commit_id }}" + ARTIFACT_ID="async-query-core" + VERSION="${{ steps.extract_version.outputs.VERSION }}" + + # Add commit ID to version-specific metadata file + echo "Processing commit ID for version: ${VERSION}" + + TEMP_DIR=$(mktemp -d) + METADATA_FILE="${TEMP_DIR}/maven-metadata.xml" + + # Download metadata from repository + META_URL="${SNAPSHOT_REPO_URL}org/opensearch/${ARTIFACT_ID}/${VERSION}/maven-metadata.xml" + echo "Downloading metadata from ${META_URL}" + + # Try to download the metadata file + curl -s -u "${SONATYPE_USERNAME}:${SONATYPE_PASSWORD}" -o "${METADATA_FILE}" "${META_URL}" + + # If successful, modify and upload back + if [ -s "${METADATA_FILE}" ]; then + echo "Modifying metadata for ${VERSION}" + cp "${METADATA_FILE}" "${METADATA_FILE}.bak" + + # Apply same awk command from working example + awk -v commit="${COMMIT_ID}" ' + // { + print $0 + print " " commit "" + next + } + {print} + ' "${METADATA_FILE}.bak" > "${METADATA_FILE}" + + # Upload modified file back + echo "Uploading modified metadata to ${META_URL}" + curl -X PUT -u "${SONATYPE_USERNAME}:${SONATYPE_PASSWORD}" --upload-file "${METADATA_FILE}" "${META_URL}" + + # Update the SHA checksums + cd "${TEMP_DIR}" + sha256sum "maven-metadata.xml" | awk '{print $1}' > "maven-metadata.xml.sha256" + sha512sum "maven-metadata.xml" | awk '{print $1}' > "maven-metadata.xml.sha512" + + # Upload the checksums + curl -X PUT -u "${SONATYPE_USERNAME}:${SONATYPE_PASSWORD}" --upload-file "maven-metadata.xml.sha256" "${META_URL}.sha256" + curl -X PUT -u "${SONATYPE_USERNAME}:${SONATYPE_PASSWORD}" --upload-file "maven-metadata.xml.sha512" "${META_URL}.sha512" + cd - + + echo "Updated metadata and checksums for ${VERSION}" + else + echo "Failed to download metadata for ${VERSION} or file is empty" + exit 1 + fi + + # Clean up + rm -rf "${TEMP_DIR}" + + echo "Version metadata updated with commit ID" \ No newline at end of file diff --git a/language-grammar/build.gradle b/language-grammar/build.gradle new file mode 100644 index 00000000000..177cc069011 --- /dev/null +++ b/language-grammar/build.gradle @@ -0,0 +1,13 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +plugins { + id 'java' + id 'java-library' +} + +group = 'org.opensearch' +version = '0.1.0-SNAPSHOT' +description = 'OpenSearch Language Grammar Files' \ No newline at end of file diff --git a/language-grammar/src/main/antlr4/FlintSparkSqlExtensions.g4 b/language-grammar/src/main/antlr4/FlintSparkSqlExtensions.g4 new file mode 100644 index 00000000000..46e814e9f56 --- /dev/null +++ b/language-grammar/src/main/antlr4/FlintSparkSqlExtensions.g4 @@ -0,0 +1,208 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +grammar FlintSparkSqlExtensions; + +import SparkSqlBase; + + +// Flint SQL Syntax Extension + +singleStatement + : statement SEMICOLON* EOF + ; + +statement + : skippingIndexStatement + | coveringIndexStatement + | materializedViewStatement + | indexManagementStatement + | indexJobManagementStatement + ; + +skippingIndexStatement + : createSkippingIndexStatement + | refreshSkippingIndexStatement + | describeSkippingIndexStatement + | alterSkippingIndexStatement + | dropSkippingIndexStatement + | vacuumSkippingIndexStatement + | analyzeSkippingIndexStatement + ; + +createSkippingIndexStatement + : CREATE SKIPPING INDEX (IF NOT EXISTS)? + ON tableName + LEFT_PAREN indexColTypeList RIGHT_PAREN + whereClause? + (WITH LEFT_PAREN propertyList RIGHT_PAREN)? + ; + +refreshSkippingIndexStatement + : REFRESH SKIPPING INDEX ON tableName + ; + +describeSkippingIndexStatement + : (DESC | DESCRIBE) SKIPPING INDEX ON tableName + ; + +alterSkippingIndexStatement + : ALTER SKIPPING INDEX + ON tableName + WITH LEFT_PAREN propertyList RIGHT_PAREN + ; + +dropSkippingIndexStatement + : DROP SKIPPING INDEX ON tableName + ; + +vacuumSkippingIndexStatement + : VACUUM SKIPPING INDEX ON tableName + ; + +coveringIndexStatement + : createCoveringIndexStatement + | refreshCoveringIndexStatement + | showCoveringIndexStatement + | describeCoveringIndexStatement + | alterCoveringIndexStatement + | dropCoveringIndexStatement + | vacuumCoveringIndexStatement + ; + +createCoveringIndexStatement + : CREATE INDEX (IF NOT EXISTS)? indexName + ON tableName + LEFT_PAREN indexColumns=multipartIdentifierPropertyList RIGHT_PAREN + whereClause? + (WITH LEFT_PAREN propertyList RIGHT_PAREN)? + ; + +refreshCoveringIndexStatement + : REFRESH INDEX indexName ON tableName + ; + +showCoveringIndexStatement + : SHOW (INDEX | INDEXES) ON tableName + ; + +describeCoveringIndexStatement + : (DESC | DESCRIBE) INDEX indexName ON tableName + ; + +alterCoveringIndexStatement + : ALTER INDEX indexName + ON tableName + WITH LEFT_PAREN propertyList RIGHT_PAREN + ; + +dropCoveringIndexStatement + : DROP INDEX indexName ON tableName + ; + +vacuumCoveringIndexStatement + : VACUUM INDEX indexName ON tableName + ; + +analyzeSkippingIndexStatement + : ANALYZE SKIPPING INDEX ON tableName + ; + +materializedViewStatement + : createMaterializedViewStatement + | refreshMaterializedViewStatement + | showMaterializedViewStatement + | describeMaterializedViewStatement + | alterMaterializedViewStatement + | dropMaterializedViewStatement + | vacuumMaterializedViewStatement + ; + +createMaterializedViewStatement + : CREATE MATERIALIZED VIEW (IF NOT EXISTS)? mvName=multipartIdentifier + AS query=materializedViewQuery + (WITH LEFT_PAREN propertyList RIGHT_PAREN)? + ; + +refreshMaterializedViewStatement + : REFRESH MATERIALIZED VIEW mvName=multipartIdentifier + ; + +showMaterializedViewStatement + : SHOW MATERIALIZED (VIEW | VIEWS) IN catalogDb=multipartIdentifier + ; + +describeMaterializedViewStatement + : (DESC | DESCRIBE) MATERIALIZED VIEW mvName=multipartIdentifier + ; + +alterMaterializedViewStatement + : ALTER MATERIALIZED VIEW mvName=multipartIdentifier + WITH LEFT_PAREN propertyList RIGHT_PAREN + ; + +dropMaterializedViewStatement + : DROP MATERIALIZED VIEW mvName=multipartIdentifier + ; + +vacuumMaterializedViewStatement + : VACUUM MATERIALIZED VIEW mvName=multipartIdentifier + ; + +indexManagementStatement + : showFlintIndexStatement + ; + +showFlintIndexStatement + : SHOW FLINT (INDEX | INDEXES) + IN catalogDb=multipartIdentifier #showFlintIndex + | SHOW FLINT (INDEX | INDEXES) EXTENDED + IN catalogDb=multipartIdentifier #showFlintIndexExtended + ; + +indexJobManagementStatement + : recoverIndexJobStatement + ; + +recoverIndexJobStatement + : RECOVER INDEX JOB identifier + ; + +/* + * Match all remaining tokens in non-greedy way + * so WITH clause won't be captured by this rule. + */ +materializedViewQuery + : .+? + ; + +whereClause + : WHERE filterCondition + ; + +filterCondition + : .+? + ; + +indexColTypeList + : indexColType (COMMA indexColType)* + ; + +indexColType + : multipartIdentifier skipType=(PARTITION | VALUE_SET | MIN_MAX | BLOOM_FILTER) + (LEFT_PAREN skipParams RIGHT_PAREN)? + ; + +skipParams + : propertyValue (COMMA propertyValue)* + ; + +indexName + : identifier + ; + +tableName + : multipartIdentifier + ; diff --git a/language-grammar/src/main/antlr4/OpenSearchLegacySqlLexer.g4 b/language-grammar/src/main/antlr4/OpenSearchLegacySqlLexer.g4 new file mode 100644 index 00000000000..eb075d12ba5 --- /dev/null +++ b/language-grammar/src/main/antlr4/OpenSearchLegacySqlLexer.g4 @@ -0,0 +1,352 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +/* +MySQL (Positive Technologies) grammar +The MIT License (MIT). +Copyright (c) 2015-2017, Ivan Kochurkin (kvanttt@gmail.com), Positive Technologies. +Copyright (c) 2017, Ivan Khudyashev (IHudyashov@ptsecurity.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +lexer grammar OpenSearchLegacySqlLexer; + +channels { SQLCOMMENT, ERRORCHANNEL } + + +// SKIP + +SPACE: [ \t\r\n]+ -> channel(HIDDEN); +SPEC_SQL_COMMENT: '/*!' .+? '*/' -> channel(SQLCOMMENT); +COMMENT_INPUT: '/*' .*? '*/' -> channel(HIDDEN); +LINE_COMMENT: ( + ('-- ' | '#') ~[\r\n]* ('\r'? '\n' | EOF) + | '--' ('\r'? '\n' | EOF) + ) -> channel(HIDDEN); + + +// Keywords +// Common Keywords + +ALL: 'ALL'; +AND: 'AND'; +AS: 'AS'; +ASC: 'ASC'; +BETWEEN: 'BETWEEN'; +BY: 'BY'; +CASE: 'CASE'; +CAST: 'CAST'; +CROSS: 'CROSS'; +DATETIME: 'DATETIME'; +DESC: 'DESC'; +DESCRIBE: 'DESCRIBE'; +DISTINCT: 'DISTINCT'; +DOUBLE: 'DOUBLE'; +ELSE: 'ELSE'; +EXISTS: 'EXISTS'; +FALSE: 'FALSE'; +FLOAT: 'FLOAT'; +FROM: 'FROM'; +GROUP: 'GROUP'; +HAVING: 'HAVING'; +IN: 'IN'; +INNER: 'INNER'; +INT: 'INT'; +IS: 'IS'; +JOIN: 'JOIN'; +LEFT: 'LEFT'; +LIKE: 'LIKE'; +LIMIT: 'LIMIT'; +LONG: 'LONG'; +MATCH: 'MATCH'; +NATURAL: 'NATURAL'; +NOT: 'NOT'; +NULL_LITERAL: 'NULL'; +ON: 'ON'; +OR: 'OR'; +ORDER: 'ORDER'; +OUTER: 'OUTER'; +REGEXP: 'REGEXP'; +RIGHT: 'RIGHT'; +SELECT: 'SELECT'; +SHOW: 'SHOW'; +STRING: 'STRING'; +THEN: 'THEN'; +TRUE: 'TRUE'; +UNION: 'UNION'; +USING: 'USING'; +WHEN: 'WHEN'; +WHERE: 'WHERE'; + + +// OD SQL special keyword +MISSING: 'MISSING'; +EXCEPT: 'MINUS'; + + +// Group function Keywords + +AVG: 'AVG'; +COUNT: 'COUNT'; +MAX: 'MAX'; +MIN: 'MIN'; +SUM: 'SUM'; + + +// Common function Keywords + +SUBSTRING: 'SUBSTRING'; +TRIM: 'TRIM'; +YEAR: 'YEAR'; +STRCMP: 'STRCMP'; + + +// Keywords, but can be ID +// Common Keywords, but can be ID + +END: 'END'; +FULL: 'FULL'; +OFFSET: 'OFFSET'; + + +// PRIVILEGES + +TABLES: 'TABLES'; + + +// Common function names + +ABS: 'ABS'; +ACOS: 'ACOS'; +ADD: 'ADD'; +ASCII: 'ASCII'; +ASIN: 'ASIN'; +ATAN: 'ATAN'; +ATAN2: 'ATAN2'; +CBRT: 'CBRT'; +CEIL: 'CEIL'; +CONCAT: 'CONCAT'; +CONCAT_WS: 'CONCAT_WS'; +COS: 'COS'; +COSH: 'COSH'; +COT: 'COT'; +CURDATE: 'CURDATE'; +DATE: 'DATE'; +DATE_FORMAT: 'DATE_FORMAT'; +DAYOFMONTH: 'DAYOFMONTH'; +DEGREES: 'DEGREES'; +E: 'E'; +EXP: 'EXP'; +EXPM1: 'EXPM1'; +FLOOR: 'FLOOR'; +IF: 'IF'; +IFNULL: 'IFNULL'; +ISNULL: 'ISNULL'; +LENGTH: 'LENGTH'; +LN: 'LN'; +LOCATE: 'LOCATE'; +LOG: 'LOG'; +LOG10: 'LOG10'; +LOG2: 'LOG2'; +LOWER: 'LOWER'; +LTRIM: 'LTRIM'; +MAKETIME: 'MAKETIME'; +MODULUS: 'MODULUS'; +MONTH: 'MONTH'; +MONTHNAME: 'MONTHNAME'; +MULTIPLY: 'MULTIPLY'; +NOW: 'NOW'; +PI: 'PI'; +POW: 'POW'; +POWER: 'POWER'; +RADIANS: 'RADIANS'; +RAND: 'RAND'; +REPLACE: 'REPLACE'; +RINT: 'RINT'; +ROUND: 'ROUND'; +RTRIM: 'RTRIM'; +SIGN: 'SIGN'; +SIGNUM: 'SIGNUM'; +SIN: 'SIN'; +SINH: 'SINH'; +SQRT: 'SQRT'; +SUBTRACT: 'SUBTRACT'; +TAN: 'TAN'; +TIMESTAMP: 'TIMESTAMP'; +UPPER: 'UPPER'; + +D: 'D'; +T: 'T'; +TS: 'TS'; +LEFT_BRACE: '{'; +RIGHT_BRACE: '}'; + + +// OD SQL special functions +DATE_HISTOGRAM: 'DATE_HISTOGRAM'; +DAY_OF_MONTH: 'DAY_OF_MONTH'; +DAY_OF_YEAR: 'DAY_OF_YEAR'; +DAY_OF_WEEK: 'DAY_OF_WEEK'; +EXCLUDE: 'EXCLUDE'; +EXTENDED_STATS: 'EXTENDED_STATS'; +FIELD: 'FIELD'; +FILTER: 'FILTER'; +GEO_BOUNDING_BOX: 'GEO_BOUNDING_BOX'; +GEO_CELL: 'GEO_CELL'; +GEO_DISTANCE: 'GEO_DISTANCE'; +GEO_DISTANCE_RANGE: 'GEO_DISTANCE_RANGE'; +GEO_INTERSECTS: 'GEO_INTERSECTS'; +GEO_POLYGON: 'GEO_POLYGON'; +HISTOGRAM: 'HISTOGRAM'; +HOUR_OF_DAY: 'HOUR_OF_DAY'; +INCLUDE: 'INCLUDE'; +IN_TERMS: 'IN_TERMS'; +MATCHPHRASE: 'MATCHPHRASE'; +MATCH_PHRASE: 'MATCH_PHRASE'; +MATCHQUERY: 'MATCHQUERY'; +MATCH_QUERY: 'MATCH_QUERY'; +MINUTE_OF_DAY: 'MINUTE_OF_DAY'; +MINUTE_OF_HOUR: 'MINUTE_OF_HOUR'; +MONTH_OF_YEAR: 'MONTH_OF_YEAR'; +MULTIMATCH: 'MULTIMATCH'; +MULTI_MATCH: 'MULTI_MATCH'; +NESTED: 'NESTED'; +PERCENTILES: 'PERCENTILES'; +REGEXP_QUERY: 'REGEXP_QUERY'; +REVERSE_NESTED: 'REVERSE_NESTED'; +QUERY: 'QUERY'; +RANGE: 'RANGE'; +SCORE: 'SCORE'; +SECOND_OF_MINUTE: 'SECOND_OF_MINUTE'; +STATS: 'STATS'; +TERM: 'TERM'; +TERMS: 'TERMS'; +TOPHITS: 'TOPHITS'; +WEEK_OF_YEAR: 'WEEK_OF_YEAR'; +WILDCARDQUERY: 'WILDCARDQUERY'; +WILDCARD_QUERY: 'WILDCARD_QUERY'; + + +// Operators + +// Operators. Arithmetics + +STAR: '*'; +DIVIDE: '/'; +MODULE: '%'; +PLUS: '+'; +MINUS: '-'; +DIV: 'DIV'; +MOD: 'MOD'; + + +// Operators. Comparation + +EQUAL_SYMBOL: '='; +GREATER_SYMBOL: '>'; +LESS_SYMBOL: '<'; +EXCLAMATION_SYMBOL: '!'; + + +// Operators. Bit + +BIT_NOT_OP: '~'; +BIT_OR_OP: '|'; +BIT_AND_OP: '&'; +BIT_XOR_OP: '^'; + + +// Constructors symbols + +DOT: '.'; +LR_BRACKET: '('; +RR_BRACKET: ')'; +COMMA: ','; +SEMI: ';'; +AT_SIGN: '@'; +ZERO_DECIMAL: '0'; +ONE_DECIMAL: '1'; +TWO_DECIMAL: '2'; +SINGLE_QUOTE_SYMB: '\''; +DOUBLE_QUOTE_SYMB: '"'; +REVERSE_QUOTE_SYMB: '`'; +COLON_SYMB: ':'; + + +// Literal Primitives + +START_NATIONAL_STRING_LITERAL: 'N' SQUOTA_STRING; +STRING_LITERAL: SQUOTA_STRING; +DECIMAL_LITERAL: DEC_DIGIT+; +HEXADECIMAL_LITERAL: 'X' '\'' (HEX_DIGIT HEX_DIGIT)+ '\'' + | '0X' HEX_DIGIT+; + +REAL_LITERAL: (DEC_DIGIT+)? '.' DEC_DIGIT+ + | DEC_DIGIT+ '.' EXPONENT_NUM_PART + | (DEC_DIGIT+)? '.' (DEC_DIGIT+ EXPONENT_NUM_PART) + | DEC_DIGIT+ EXPONENT_NUM_PART; +NULL_SPEC_LITERAL: '\\' 'N'; +BIT_STRING: BIT_STRING_L; + + + +// Hack for dotID +// Prevent recognize string: .123somelatin AS ((.123), FLOAT_LITERAL), ((somelatin), ID) +// it must recoginze: .123somelatin AS ((.), DOT), (123somelatin, ID) + +DOT_ID: '.' ID_LITERAL; + + + +// Identifiers + +ID: ID_LITERAL; +// DOUBLE_QUOTE_ID: '"' ~'"'+ '"'; +REVERSE_QUOTE_ID: '`' ~'`'+ '`'; +DOUBLE_QUOTE_ID: DQUOTA_STRING; +BACKTICK_QUOTE_ID: BQUOTA_STRING; +STRING_USER_NAME: ( + SQUOTA_STRING | DQUOTA_STRING + | BQUOTA_STRING | ID_LITERAL + ) '@' + ( + SQUOTA_STRING | DQUOTA_STRING + | BQUOTA_STRING | ID_LITERAL + ); + + +// Fragments for Literal primitives + +fragment EXPONENT_NUM_PART: 'E' [-+]? DEC_DIGIT+; +fragment ID_LITERAL: [A-Z_$0-9@]*?[A-Z_$]+?[A-Z_$\-0-9]*; +fragment DQUOTA_STRING: '"' ( '\\'. | '""' | ~('"'| '\\') )* '"'; +fragment SQUOTA_STRING: '\'' ('\\'. | '\'\'' | ~('\'' | '\\'))* '\''; +fragment BQUOTA_STRING: '`' ( '\\'. | '``' | ~('`'|'\\'))* '`'; +fragment HEX_DIGIT: [0-9A-F]; +fragment DEC_DIGIT: [0-9]; +fragment BIT_STRING_L: 'B' '\'' [01]+ '\''; + + + +// Last tokens must generate Errors + +ERROR_RECOGNITION: . -> channel(ERRORCHANNEL); diff --git a/language-grammar/src/main/antlr4/OpenSearchLegacySqlParser.g4 b/language-grammar/src/main/antlr4/OpenSearchLegacySqlParser.g4 new file mode 100644 index 00000000000..217aa5eaa39 --- /dev/null +++ b/language-grammar/src/main/antlr4/OpenSearchLegacySqlParser.g4 @@ -0,0 +1,561 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +/* +MySQL (Positive Technologies) grammar +The MIT License (MIT). +Copyright (c) 2015-2017, Ivan Kochurkin (kvanttt@gmail.com), Positive Technologies. +Copyright (c) 2017, Ivan Khudyashev (IHudyashov@ptsecurity.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +parser grammar OpenSearchLegacySqlParser; + + +options { tokenVocab = OpenSearchLegacySqlLexer; } +// Top Level Description + +// Root rule +root + : sqlStatement? SEMI? EOF + ; + +// Only SELECT, DELETE, SHOW and DSCRIBE are supported for now +sqlStatement + : dmlStatement + | administrationStatement + | utilityStatement + ; + +dmlStatement + : selectStatement + ; + + // Data Manipulation Language + + // Primary DML Statements +selectStatement + : querySpecification # simpleSelect + | queryExpression # parenthesisSelect + | querySpecification unionStatement+ orderByClause? limitClause? # unionSelect + | querySpecification minusStatement+ orderByClause? limitClause? # minusSelect + ; + + +// Detailed DML Statements + +orderByClause + : ORDER BY orderByExpression (',' orderByExpression)* + ; + +orderByExpression + : expression order = (ASC | DESC)? + ; + +tableSources + : tableSource (',' tableSource)* + ; + +tableSource + : tableSourceItem joinPart* # tableSourceBase + | '(' tableSourceItem joinPart* ')' # tableSourceNested + ; + +tableSourceItem + : tableName (AS? alias = uid)? # atomTableItem + | (selectStatement | '(' parenthesisSubquery = selectStatement ')') AS? alias = uid # subqueryTableItem + | '(' tableSources ')' # tableSourcesItem + ; + +joinPart + : (INNER | CROSS)? JOIN tableSourceItem (ON expression | USING '(' uidList ')')? # innerJoin + | (LEFT | RIGHT) OUTER? JOIN tableSourceItem (ON expression | USING '(' uidList ')')? # outerJoin + | NATURAL ((LEFT | RIGHT) OUTER?)? JOIN tableSourceItem # naturalJoin + ; + +// Select Statement's Details +queryExpression + : '(' querySpecification ')' + | '(' queryExpression ')' + ; + +querySpecification + : SELECT selectSpec* selectElements fromClause orderByClause? limitClause? + ; + +unionStatement + : UNION unionType = (ALL | DISTINCT)? (querySpecification | queryExpression) + ; + +minusStatement + : EXCEPT (querySpecification | queryExpression) + ; + +selectSpec + : (ALL | DISTINCT) + ; + +selectElements + : (star = '*' | selectElement) (',' selectElement)* + ; + +selectElement + : fullId '.' '*' # selectStarElement + | fullColumnName (AS? uid)? # selectColumnElement + | functionCall (AS? uid)? # selectFunctionElement + | expression (AS? uid)? # selectExpressionElement + | NESTED '(' fullId DOT STAR ')' # selectNestedStarElement + ; + +fromClause + : FROM tableSources (WHERE whereExpr = expression)? (GROUP BY groupByItem (',' groupByItem)*)? (HAVING havingExpr = expression)? + ; + +groupByItem + : expression order = (ASC | DESC)? + ; + +limitClause + : LIMIT ((offset = limitClauseAtom ',')? limit = limitClauseAtom | limit = limitClauseAtom OFFSET offset = limitClauseAtom) + ; + +limitClauseAtom + : decimalLiteral + ; + +// SHOW/DESCIRBE statements +administrationStatement + : showStatement + ; + +showStatement + : SHOW showSchemaEntity (schemaFormat = (FROM | IN) uid)? showFilter? + ; + +utilityStatement + : simpleDescribeStatement + ; + +simpleDescribeStatement + : command = DESCRIBE tableName (column = uid | pattern = STRING_LITERAL)? + ; + +showFilter + : LIKE STRING_LITERAL + | WHERE expression + ; + +showSchemaEntity + : FULL? TABLES + ; + +// Common Clauses + +// DB Objects +fullId + : uid (DOT_ID | '.' uid)? + ; + +tableName + : fullId # simpleTableName + | uid STAR # tableNamePattern + | uid DIVIDE uid # tableAndTypeName + ; + +fullColumnName + : uid dottedId* + ; + +uid + : simpleId + | REVERSE_QUOTE_ID + ; + +simpleId + : ID + | DOT_ID // note: the current scope by adding DOT_ID to simpleId is large, move DOT_ID upwards tablename if needed + | DOUBLE_QUOTE_ID + | BACKTICK_QUOTE_ID + | keywordsCanBeId + | functionNameBase + ; + +dottedId + : DOT_ID + | '.' uid + ; + +// Literals +decimalLiteral + : DECIMAL_LITERAL + | ZERO_DECIMAL + | ONE_DECIMAL + | TWO_DECIMAL + ; + +stringLiteral + : (STRING_LITERAL | START_NATIONAL_STRING_LITERAL) STRING_LITERAL+ + | (STRING_LITERAL | START_NATIONAL_STRING_LITERAL) + ; + +booleanLiteral + : TRUE + | FALSE + ; + +nullNotnull + : NOT? (NULL_LITERAL | NULL_SPEC_LITERAL) + ; + +constant + : stringLiteral + | decimalLiteral + | '-' decimalLiteral + | booleanLiteral + | REAL_LITERAL + | BIT_STRING + | NOT? nullLiteral = (NULL_LITERAL | NULL_SPEC_LITERAL) + | LEFT_BRACE dateType = (D | T | TS | DATE | TIME | TIMESTAMP) stringLiteral RIGHT_BRACE + ; + +// Common Lists +uidList + : uid (',' uid)* + ; + +expressions + : expression (',' expression)* + ; + +aggregateFunction + : functionAsAggregatorFunction # functionAsAggregatorFunctionCall + | aggregateWindowedFunction # aggregateWindowedFunctionCall + ; + +scalarFunction + : scalarFunctionName '(' nestedFunctionArgs+ ')' # nestedFunctionCall + | scalarFunctionName '(' functionArgs? ')' # scalarFunctionCall + ; + +functionCall + : aggregateFunction # aggregateFunctionCall + | scalarFunctionName '(' aggregateWindowedFunction ')' # aggregationAsArgFunctionCall + | scalarFunction # scalarFunctionsCall + | specificFunction # specificFunctionCall + | fullId '(' functionArgs? ')' # udfFunctionCall + ; + +specificFunction + : CAST '(' expression AS convertedDataType ')' # dataTypeFunctionCall + | CASE expression caseFuncAlternative+ (ELSE elseArg = functionArg)? END # caseFunctionCall + | CASE caseFuncAlternative+ (ELSE elseArg = functionArg)? END # caseFunctionCall + ; + +caseFuncAlternative + : WHEN condition = functionArg THEN consequent = functionArg + ; + +convertedDataType + : typeName = DATETIME + | typeName = INT + | typeName = DOUBLE + | typeName = LONG + | typeName = FLOAT + | typeName = STRING + ; + +aggregateWindowedFunction + : (AVG | MAX | MIN | SUM) '(' aggregator = (ALL | DISTINCT)? functionArg ')' + | COUNT '(' (starArg = '*' | aggregator = ALL? functionArg) ')' + | COUNT '(' aggregator = DISTINCT functionArgs ')' + ; + +functionAsAggregatorFunction + : (AVG | MAX | MIN | SUM) '(' aggregator = (ALL | DISTINCT)? functionCall ')' + | COUNT '(' aggregator = (ALL | DISTINCT)? functionCall ')' + ; + +scalarFunctionName + : functionNameBase + ; +/* +Separated aggregate to function-aggregator and nonfunction-aggregator aggregations. + +Current related rules: aggregateWindowedFunction, functionAsAggregatorFunction, aggregateFunction, functionCall +Original rules: functionCall (as is shown in below changes), no aggregateWindowFunction, no functionAsAggregatorFunction, + no aggregateFunction + +==== + +Separated function argument rule to nonFunctionCall and functionCall +functions with functionCall arguments are taken as nested functions + +Current related rules: functionArgs, functionArg, nestedFunctionArgs +Original rules: +functionArgs + : (constant | fullColumnName | functionCall | expression) + ( + ',' + (constant | fullColumnName | functionCall | expression) + )* + ; + +functionArg + : constant | fullColumnName | functionCall | expression + ; + +==== + +Accordingly functionCall rule is changed by separating scalar functions +to nested functions and non-nested functons. +Current related rules: functionCall, scalarFunction +Original rule: +functionCall + : specificFunction #specificFunctionCall + | aggregateWindowedFunction #aggregateFunctionCall + | scalarFunctionName '(' functionArgs? ')' #scalarFunctionCall + | fullId '(' functionArgs? ')' #udfFunctionCall + ; +*/ + + +functionArgs + : (constant | fullColumnName | expression) (',' (constant | fullColumnName | expression))* + ; + +functionArg + : constant + | fullColumnName + | expression + ; + +nestedFunctionArgs + : functionCall (',' functionArgs)? + ; + +// Expressions, predicates + +// Simplified approach for expression +expression + : notOperator = (NOT | '!') expression # notExpression + | expression logicalOperator expression # logicalExpression + | predicate IS NOT? testValue = (TRUE | FALSE | MISSING) # isExpression + | predicate # predicateExpression + ; + +predicate + : predicate NOT? IN '(' (selectStatement | expressions) ')' # inPredicate + | predicate IS nullNotnull # isNullPredicate + | left = predicate comparisonOperator right = predicate # binaryComparisonPredicate + | predicate NOT? BETWEEN predicate AND predicate # betweenPredicate + | predicate NOT? LIKE predicate # likePredicate + | predicate NOT? regex = REGEXP predicate # regexpPredicate + | expressionAtom # expressionAtomPredicate + ; + +// Add in ASTVisitor nullNotnull in constant +expressionAtom + : constant # constantExpressionAtom + | fullColumnName # fullColumnNameExpressionAtom + | functionCall # functionCallExpressionAtom + | unaryOperator expressionAtom # unaryExpressionAtom + | '(' expression (',' expression)* ')' # nestedExpressionAtom + | EXISTS '(' selectStatement ')' # existsExpessionAtom + | '(' selectStatement ')' # subqueryExpessionAtom + | left = expressionAtom bitOperator right = expressionAtom # bitExpressionAtom + | left = expressionAtom mathOperator right = expressionAtom # mathExpressionAtom + ; + +unaryOperator + : '!' + | '~' + | '+' + | '-' + | NOT + ; + +comparisonOperator + : '=' + | '>' + | '<' + | '<' '=' + | '>' '=' + | '<' '>' + | '!' '=' + ; + +logicalOperator + : AND + | '&' '&' + | OR + | '|' '|' + ; + +bitOperator + : '<' '<' + | '>' '>' + | '&' + | '^' + | '|' + ; + +mathOperator + : '*' + | '/' + | '%' + | DIV + | MOD + | '+' + | '-' + ; + +// Simple id sets +// (that keyword, which can be id) +keywordsCanBeId + : FULL + | FIELD + | D + | T + | TS // OD SQL and ODBC special + | COUNT + | MIN + | MAX + | AVG + | SUM + ; + +functionNameBase + : openSearchFunctionNameBase + | ABS + | ACOS + | ADD + | ASCII + | ASIN + | ATAN + | ATAN2 + | CBRT + | CEIL + | CONCAT + | CONCAT_WS + | COS + | COSH + | COT + | CURDATE + | DATE + | DATE_FORMAT + | DAYOFMONTH + | DEGREES + | E + | EXP + | EXPM1 + | FLOOR + | IF + | IFNULL + | ISNULL + | LEFT + | LENGTH + | LN + | LOCATE + | LOG + | LOG10 + | LOG2 + | LOWER + | LTRIM + | MAKETIME + | MODULUS + | MONTH + | MONTHNAME + | MULTIPLY + | NOW + | PI + | POW + | POWER + | RADIANS + | RAND + | REPLACE + | RIGHT + | RINT + | ROUND + | RTRIM + | SIGN + | SIGNUM + | SIN + | SINH + | SQRT + | SUBSTRING + | SUBTRACT + | TAN + | TIMESTAMP + | TRIM + | UPPER + | YEAR + | ADDDATE + | ADDTIME + | GREATEST + | LEAST + | STRCMP + ; + +openSearchFunctionNameBase + : DATE_HISTOGRAM + | DAY_OF_MONTH + | DAY_OF_YEAR + | DAY_OF_WEEK + | EXCLUDE + | EXTENDED_STATS + | FILTER + | GEO_BOUNDING_BOX + | GEO_CELL + | GEO_DISTANCE + | GEO_DISTANCE_RANGE + | GEO_INTERSECTS + | GEO_POLYGON + | INCLUDE + | IN_TERMS + | HISTOGRAM + | HOUR_OF_DAY + | MATCHPHRASE + | MATCH_PHRASE + | MATCHQUERY + | MATCH_QUERY + | MINUTE_OF_DAY + | MINUTE_OF_HOUR + | MISSING + | MONTH_OF_YEAR + | MULTIMATCH + | MULTI_MATCH + | NESTED + | PERCENTILES + | QUERY + | RANGE + | REGEXP_QUERY + | REVERSE_NESTED + | SCORE + | SECOND_OF_MINUTE + | STATS + | TERM + | TERMS + | TOPHITS + | WEEK_OF_YEAR + | WILDCARDQUERY + | WILDCARD_QUERY + ; diff --git a/language-grammar/src/main/antlr4/OpenSearchPPLLexer.g4 b/language-grammar/src/main/antlr4/OpenSearchPPLLexer.g4 new file mode 100644 index 00000000000..b7dc4b7286d --- /dev/null +++ b/language-grammar/src/main/antlr4/OpenSearchPPLLexer.g4 @@ -0,0 +1,516 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + + +lexer grammar OpenSearchPPLLexer; + +channels { WHITESPACE, ERRORCHANNEL } +options { caseInsensitive = true; } + +// COMMAND KEYWORDS +SEARCH: 'SEARCH'; +DESCRIBE: 'DESCRIBE'; +SHOW: 'SHOW'; +FROM: 'FROM'; +WHERE: 'WHERE'; +FIELDS: 'FIELDS'; +RENAME: 'RENAME'; +STATS: 'STATS'; +EVENTSTATS: 'EVENTSTATS'; +DEDUP: 'DEDUP'; +SORT: 'SORT'; +EVAL: 'EVAL'; +HEAD: 'HEAD'; +TOP_APPROX: 'TOP_APPROX'; +TOP: 'TOP'; +RARE_APPROX: 'RARE_APPROX'; +RARE: 'RARE'; +PARSE: 'PARSE'; +METHOD: 'METHOD'; +REGEX: 'REGEX'; +PUNCT: 'PUNCT'; +GROK: 'GROK'; +PATTERN: 'PATTERN'; +PATTERNS: 'PATTERNS'; +NEW_FIELD: 'NEW_FIELD'; +KMEANS: 'KMEANS'; +AD: 'AD'; +ML: 'ML'; +FILLNULL: 'FILLNULL'; +EXPAND: 'EXPAND'; +FLATTEN: 'FLATTEN'; +TRENDLINE: 'TRENDLINE'; +APPENDCOL: 'APPENDCOL'; + +//Native JOIN KEYWORDS +JOIN: 'JOIN'; +ON: 'ON'; +INNER: 'INNER'; +OUTER: 'OUTER'; +FULL: 'FULL'; +SEMI: 'SEMI'; +ANTI: 'ANTI'; +CROSS: 'CROSS'; +LEFT_HINT: 'HINT.LEFT'; +RIGHT_HINT: 'HINT.RIGHT'; + +//CORRELATION KEYWORDS +CORRELATE: 'CORRELATE'; +SELF: 'SELF'; +EXACT: 'EXACT'; +APPROXIMATE: 'APPROXIMATE'; +SCOPE: 'SCOPE'; +MAPPING: 'MAPPING'; + +//EXPLAIN KEYWORDS +EXPLAIN: 'EXPLAIN'; +FORMATTED: 'FORMATTED'; +COST: 'COST'; +CODEGEN: 'CODEGEN'; +EXTENDED: 'EXTENDED'; +SIMPLE: 'SIMPLE'; + +// COMMAND ASSIST KEYWORDS +AS: 'AS'; +BY: 'BY'; +SOURCE: 'SOURCE'; +INDEX: 'INDEX'; +D: 'D'; +DESC: 'DESC'; +DATASOURCES: 'DATASOURCES'; +USING: 'USING'; +WITH: 'WITH'; + +// SORT FIELD KEYWORDS +// TODO #963: Implement 'num', 'str', and 'ip' sort syntax +AUTO: 'AUTO'; +STR: 'STR'; +IP: 'IP'; +NUM: 'NUM'; + +// FIELDSUMMARY keywords +FIELDSUMMARY: 'FIELDSUMMARY'; +INCLUDEFIELDS: 'INCLUDEFIELDS'; +NULLS: 'NULLS'; + +//TRENDLINE KEYWORDS +SMA: 'SMA'; +WMA: 'WMA'; + +// APPENDCOL options +OVERRIDE: 'OVERRIDE'; + +// ARGUMENT KEYWORDS +KEEPEMPTY: 'KEEPEMPTY'; +CONSECUTIVE: 'CONSECUTIVE'; +DEDUP_SPLITVALUES: 'DEDUP_SPLITVALUES'; +PARTITIONS: 'PARTITIONS'; +ALLNUM: 'ALLNUM'; +DELIM: 'DELIM'; +CENTROIDS: 'CENTROIDS'; +ITERATIONS: 'ITERATIONS'; +DISTANCE_TYPE: 'DISTANCE_TYPE'; +NUMBER_OF_TREES: 'NUMBER_OF_TREES'; +SHINGLE_SIZE: 'SHINGLE_SIZE'; +SAMPLE_SIZE: 'SAMPLE_SIZE'; +OUTPUT_AFTER: 'OUTPUT_AFTER'; +TIME_DECAY: 'TIME_DECAY'; +ANOMALY_RATE: 'ANOMALY_RATE'; +CATEGORY_FIELD: 'CATEGORY_FIELD'; +TIME_FIELD: 'TIME_FIELD'; +TIME_ZONE: 'TIME_ZONE'; +TRAINING_DATA_SIZE: 'TRAINING_DATA_SIZE'; +ANOMALY_SCORE_THRESHOLD: 'ANOMALY_SCORE_THRESHOLD'; +APPEND: 'APPEND'; + +// COMPARISON FUNCTION KEYWORDS +CASE: 'CASE'; +ELSE: 'ELSE'; +IN: 'IN'; +EXISTS: 'EXISTS'; + +// LOGICAL KEYWORDS +NOT: 'NOT'; +OR: 'OR'; +AND: 'AND'; +XOR: 'XOR'; +TRUE: 'TRUE'; +FALSE: 'FALSE'; +REGEXP: 'REGEXP'; + +// DATETIME, INTERVAL AND UNIT KEYWORDS +CONVERT_TZ: 'CONVERT_TZ'; +DATETIME: 'DATETIME'; +DAY: 'DAY'; +DAY_HOUR: 'DAY_HOUR'; +DAY_MICROSECOND: 'DAY_MICROSECOND'; +DAY_MINUTE: 'DAY_MINUTE'; +DAY_OF_YEAR: 'DAY_OF_YEAR'; +DAY_SECOND: 'DAY_SECOND'; +HOUR: 'HOUR'; +HOUR_MICROSECOND: 'HOUR_MICROSECOND'; +HOUR_MINUTE: 'HOUR_MINUTE'; +HOUR_OF_DAY: 'HOUR_OF_DAY'; +HOUR_SECOND: 'HOUR_SECOND'; +INTERVAL: 'INTERVAL'; +MICROSECOND: 'MICROSECOND'; +MILLISECOND: 'MILLISECOND'; +MINUTE: 'MINUTE'; +MINUTE_MICROSECOND: 'MINUTE_MICROSECOND'; +MINUTE_OF_DAY: 'MINUTE_OF_DAY'; +MINUTE_OF_HOUR: 'MINUTE_OF_HOUR'; +MINUTE_SECOND: 'MINUTE_SECOND'; +MONTH: 'MONTH'; +MONTH_OF_YEAR: 'MONTH_OF_YEAR'; +QUARTER: 'QUARTER'; +SECOND: 'SECOND'; +SECOND_MICROSECOND: 'SECOND_MICROSECOND'; +SECOND_OF_MINUTE: 'SECOND_OF_MINUTE'; +WEEK: 'WEEK'; +WEEK_OF_YEAR: 'WEEK_OF_YEAR'; +YEAR: 'YEAR'; +YEAR_MONTH: 'YEAR_MONTH'; + +// DATASET TYPES +DATAMODEL: 'DATAMODEL'; +LOOKUP: 'LOOKUP'; +SAVEDSEARCH: 'SAVEDSEARCH'; + +// CONVERTED DATA TYPES +INT: 'INT'; +INTEGER: 'INTEGER'; +DOUBLE: 'DOUBLE'; +LONG: 'LONG'; +FLOAT: 'FLOAT'; +STRING: 'STRING'; +BOOLEAN: 'BOOLEAN'; + +// SPECIAL CHARACTERS AND OPERATORS +PIPE: '|'; +COMMA: ','; +DOT: '.'; +EQUAL: '='; +GREATER: '>'; +LESS: '<'; +NOT_GREATER: '<' '='; +NOT_LESS: '>' '='; +NOT_EQUAL: '!' '='; +PLUS: '+'; +MINUS: '-'; +STAR: '*'; +DIVIDE: '/'; +MODULE: '%'; +EXCLAMATION_SYMBOL: '!'; +COLON: ':'; +LT_PRTHS: '('; +RT_PRTHS: ')'; +LT_SQR_PRTHS: '['; +RT_SQR_PRTHS: ']'; +SINGLE_QUOTE: '\''; +DOUBLE_QUOTE: '"'; +BACKTICK: '`'; +ARROW: '->'; + +// Operators. Bit + +BIT_NOT_OP: '~'; +BIT_AND_OP: '&'; +BIT_XOR_OP: '^'; + +// AGGREGATIONS +AVG: 'AVG'; +COUNT: 'COUNT'; +DISTINCT_COUNT: 'DISTINCT_COUNT'; +DISTINCT_COUNT_APPROX: 'DISTINCT_COUNT_APPROX'; +ESTDC: 'ESTDC'; +ESTDC_ERROR: 'ESTDC_ERROR'; +MAX: 'MAX'; +MEAN: 'MEAN'; +MEDIAN: 'MEDIAN'; +MIN: 'MIN'; +MODE: 'MODE'; +RANGE: 'RANGE'; +STDEV: 'STDEV'; +STDEVP: 'STDEVP'; +SUM: 'SUM'; +SUMSQ: 'SUMSQ'; +VAR_SAMP: 'VAR_SAMP'; +VAR_POP: 'VAR_POP'; +STDDEV_SAMP: 'STDDEV_SAMP'; +STDDEV_POP: 'STDDEV_POP'; +PERCENTILE: 'PERCENTILE'; +PERCENTILE_APPROX: 'PERCENTILE_APPROX'; +TAKE: 'TAKE'; +FIRST: 'FIRST'; +LAST: 'LAST'; +LIST: 'LIST'; +VALUES: 'VALUES'; +PER_DAY: 'PER_DAY'; +PER_HOUR: 'PER_HOUR'; +PER_MINUTE: 'PER_MINUTE'; +PER_SECOND: 'PER_SECOND'; +RATE: 'RATE'; +SPARKLINE: 'SPARKLINE'; +C: 'C'; +DC: 'DC'; + +// BASIC FUNCTIONS +ABS: 'ABS'; +CBRT: 'CBRT'; +CEIL: 'CEIL'; +CEILING: 'CEILING'; +CONV: 'CONV'; +CRC32: 'CRC32'; +E: 'E'; +EXP: 'EXP'; +FLOOR: 'FLOOR'; +LN: 'LN'; +LOG: 'LOG'; +LOG10: 'LOG10'; +LOG2: 'LOG2'; +MOD: 'MOD'; +PI: 'PI'; +POSITION: 'POSITION'; +POW: 'POW'; +POWER: 'POWER'; +RAND: 'RAND'; +ROUND: 'ROUND'; +SIGN: 'SIGN'; +SIGNUM: 'SIGNUM'; +SQRT: 'SQRT'; +TRUNCATE: 'TRUNCATE'; + +// TRIGONOMETRIC FUNCTIONS +ACOS: 'ACOS'; +ASIN: 'ASIN'; +ATAN: 'ATAN'; +ATAN2: 'ATAN2'; +COS: 'COS'; +COT: 'COT'; +DEGREES: 'DEGREES'; +RADIANS: 'RADIANS'; +SIN: 'SIN'; +TAN: 'TAN'; + +// CRYPTOGRAPHIC FUNCTIONS +MD5: 'MD5'; +SHA1: 'SHA1'; +SHA2: 'SHA2'; + +// DATE AND TIME FUNCTIONS +ADDDATE: 'ADDDATE'; +ADDTIME: 'ADDTIME'; +CURDATE: 'CURDATE'; +CURRENT_DATE: 'CURRENT_DATE'; +CURRENT_TIME: 'CURRENT_TIME'; +CURRENT_TIMESTAMP: 'CURRENT_TIMESTAMP'; +CURRENT_TIMEZONE: 'CURRENT_TIMEZONE'; +CURTIME: 'CURTIME'; +DATE: 'DATE'; +DATEDIFF: 'DATEDIFF'; +DATE_ADD: 'DATE_ADD'; +DATE_FORMAT: 'DATE_FORMAT'; +DATE_SUB: 'DATE_SUB'; +DAYNAME: 'DAYNAME'; +DAYOFMONTH: 'DAYOFMONTH'; +DAYOFWEEK: 'DAYOFWEEK'; +DAYOFYEAR: 'DAYOFYEAR'; +DAY_OF_MONTH: 'DAY_OF_MONTH'; +DAY_OF_WEEK: 'DAY_OF_WEEK'; +DURATION: 'DURATION'; +EXTRACT: 'EXTRACT'; +FROM_DAYS: 'FROM_DAYS'; +FROM_UNIXTIME: 'FROM_UNIXTIME'; +GET_FORMAT: 'GET_FORMAT'; +LAST_DAY: 'LAST_DAY'; +LOCALTIME: 'LOCALTIME'; +LOCALTIMESTAMP: 'LOCALTIMESTAMP'; +MAKEDATE: 'MAKEDATE'; +MAKE_DATE: 'MAKE_DATE'; +MAKETIME: 'MAKETIME'; +MONTHNAME: 'MONTHNAME'; +NOW: 'NOW'; +PERIOD_ADD: 'PERIOD_ADD'; +PERIOD_DIFF: 'PERIOD_DIFF'; +SEC_TO_TIME: 'SEC_TO_TIME'; +STR_TO_DATE: 'STR_TO_DATE'; +SUBDATE: 'SUBDATE'; +SUBTIME: 'SUBTIME'; +SYSDATE: 'SYSDATE'; +TIME: 'TIME'; +TIMEDIFF: 'TIMEDIFF'; +TIMESTAMP: 'TIMESTAMP'; +TIMESTAMPADD: 'TIMESTAMPADD'; +TIMESTAMPDIFF: 'TIMESTAMPDIFF'; +TIME_FORMAT: 'TIME_FORMAT'; +TIME_TO_SEC: 'TIME_TO_SEC'; +TO_DAYS: 'TO_DAYS'; +TO_SECONDS: 'TO_SECONDS'; +UNIX_TIMESTAMP: 'UNIX_TIMESTAMP'; +UTC_DATE: 'UTC_DATE'; +UTC_TIME: 'UTC_TIME'; +UTC_TIMESTAMP: 'UTC_TIMESTAMP'; +WEEKDAY: 'WEEKDAY'; +YEARWEEK: 'YEARWEEK'; + +// RELATIVE TIME FUNCTIONS +RELATIVE_TIMESTAMP: 'RELATIVE_TIMESTAMP'; +EARLIEST: 'EARLIEST'; +LATEST: 'LATEST'; + +// TEXT FUNCTIONS +SUBSTR: 'SUBSTR'; +SUBSTRING: 'SUBSTRING'; +LTRIM: 'LTRIM'; +RTRIM: 'RTRIM'; +TRIM: 'TRIM'; +TO: 'TO'; +LOWER: 'LOWER'; +UPPER: 'UPPER'; +CONCAT: 'CONCAT'; +CONCAT_WS: 'CONCAT_WS'; +LENGTH: 'LENGTH'; +STRCMP: 'STRCMP'; +RIGHT: 'RIGHT'; +LEFT: 'LEFT'; +ASCII: 'ASCII'; +LOCATE: 'LOCATE'; +REPLACE: 'REPLACE'; +REVERSE: 'REVERSE'; +CAST: 'CAST'; + +// JSON TEXT FUNCTIONS +JSON: 'JSON'; +JSON_OBJECT: 'JSON_OBJECT'; +JSON_ARRAY: 'JSON_ARRAY'; +JSON_ARRAY_LENGTH: 'JSON_ARRAY_LENGTH'; +TO_JSON_STRING: 'TO_JSON_STRING'; +JSON_EXTRACT: 'JSON_EXTRACT'; +JSON_DELETE : 'JSON_DELETE'; +JSON_KEYS: 'JSON_KEYS'; +JSON_VALID: 'JSON_VALID'; +JSON_APPEND: 'JSON_APPEND'; +JSON_EXTEND : 'JSON_EXTEND'; +JSON_SET: 'JSON_SET'; +//JSON_ARRAY_ALL_MATCH: 'JSON_ARRAY_ALL_MATCH'; +//JSON_ARRAY_ANY_MATCH: 'JSON_ARRAY_ANY_MATCH'; +//JSON_ARRAY_FILTER: 'JSON_ARRAY_FILTER'; +//JSON_ARRAY_MAP: 'JSON_ARRAY_MAP'; +//JSON_ARRAY_REDUCE: 'JSON_ARRAY_REDUCE'; + +// COLLECTION FUNCTIONS +ARRAY: 'ARRAY'; +ARRAY_LENGTH: 'ARRAY_LENGTH'; + +// LAMBDA FUNCTIONS +//EXISTS: 'EXISTS'; +FORALL: 'FORALL'; +FILTER: 'FILTER'; +TRANSFORM: 'TRANSFORM'; +REDUCE: 'REDUCE'; + +// BOOL FUNCTIONS +LIKE: 'LIKE'; +ISNULL: 'ISNULL'; +ISNOTNULL: 'ISNOTNULL'; +BETWEEN: 'BETWEEN'; +CIDRMATCH: 'CIDRMATCH'; +ISPRESENT: 'ISPRESENT'; +ISEMPTY: 'ISEMPTY'; +ISBLANK: 'ISBLANK'; + +// FLOWCONTROL FUNCTIONS +IFNULL: 'IFNULL'; +NULLIF: 'NULLIF'; +IF: 'IF'; +TYPEOF: 'TYPEOF'; + +//OTHER CONDITIONAL EXPRESSIONS +COALESCE: 'COALESCE'; + +//GEOLOCATION FUNCTIONS +GEOIP: 'GEOIP'; + +//GEOLOCATION PROPERTIES +COUNTRY_ISO_CODE: 'COUNTRY_ISO_CODE'; +COUNTRY_NAME: 'COUNTRY_NAME'; +CONTINENT_NAME: 'CONTINENT_NAME'; +REGION_ISO_CODE: 'REGION_ISO_CODE'; +REGION_NAME: 'REGION_NAME'; +CITY_NAME: 'CITY_NAME'; +LOCATION: 'LOCATION'; + +// RELEVANCE FUNCTIONS AND PARAMETERS +MATCH: 'MATCH'; +MATCH_PHRASE: 'MATCH_PHRASE'; +MATCH_PHRASE_PREFIX: 'MATCH_PHRASE_PREFIX'; +MATCH_BOOL_PREFIX: 'MATCH_BOOL_PREFIX'; +SIMPLE_QUERY_STRING: 'SIMPLE_QUERY_STRING'; +MULTI_MATCH: 'MULTI_MATCH'; +QUERY_STRING: 'QUERY_STRING'; + +ALLOW_LEADING_WILDCARD: 'ALLOW_LEADING_WILDCARD'; +ANALYZE_WILDCARD: 'ANALYZE_WILDCARD'; +ANALYZER: 'ANALYZER'; +AUTO_GENERATE_SYNONYMS_PHRASE_QUERY:'AUTO_GENERATE_SYNONYMS_PHRASE_QUERY'; +BOOST: 'BOOST'; +CUTOFF_FREQUENCY: 'CUTOFF_FREQUENCY'; +DEFAULT_FIELD: 'DEFAULT_FIELD'; +DEFAULT_OPERATOR: 'DEFAULT_OPERATOR'; +ENABLE_POSITION_INCREMENTS: 'ENABLE_POSITION_INCREMENTS'; +ESCAPE: 'ESCAPE'; +FLAGS: 'FLAGS'; +FUZZY_MAX_EXPANSIONS: 'FUZZY_MAX_EXPANSIONS'; +FUZZY_PREFIX_LENGTH: 'FUZZY_PREFIX_LENGTH'; +FUZZY_TRANSPOSITIONS: 'FUZZY_TRANSPOSITIONS'; +FUZZY_REWRITE: 'FUZZY_REWRITE'; +FUZZINESS: 'FUZZINESS'; +LENIENT: 'LENIENT'; +LOW_FREQ_OPERATOR: 'LOW_FREQ_OPERATOR'; +MAX_DETERMINIZED_STATES: 'MAX_DETERMINIZED_STATES'; +MAX_EXPANSIONS: 'MAX_EXPANSIONS'; +MINIMUM_SHOULD_MATCH: 'MINIMUM_SHOULD_MATCH'; +OPERATOR: 'OPERATOR'; +PHRASE_SLOP: 'PHRASE_SLOP'; +PREFIX_LENGTH: 'PREFIX_LENGTH'; +QUOTE_ANALYZER: 'QUOTE_ANALYZER'; +QUOTE_FIELD_SUFFIX: 'QUOTE_FIELD_SUFFIX'; +REWRITE: 'REWRITE'; +SLOP: 'SLOP'; +TIE_BREAKER: 'TIE_BREAKER'; +TYPE: 'TYPE'; +ZERO_TERMS_QUERY: 'ZERO_TERMS_QUERY'; + +// SPAN KEYWORDS +SPAN: 'SPAN'; +MS: 'MS'; +S: 'S'; +M: 'M'; +H: 'H'; +W: 'W'; +Q: 'Q'; +Y: 'Y'; + + +// LITERALS AND VALUES +//STRING_LITERAL: DQUOTA_STRING | SQUOTA_STRING | BQUOTA_STRING; +ID: ID_LITERAL; +CLUSTER: CLUSTER_PREFIX_LITERAL; +INTEGER_LITERAL: DEC_DIGIT+; +DECIMAL_LITERAL: (DEC_DIGIT+)? '.' DEC_DIGIT+; + +fragment DATE_SUFFIX: ([\-.][*0-9]+)+; +fragment ID_LITERAL: [@*A-Z]+?[*A-Z_\-0-9]*; +fragment CLUSTER_PREFIX_LITERAL: [*A-Z]+?[*A-Z_\-0-9]* COLON; +ID_DATE_SUFFIX: CLUSTER_PREFIX_LITERAL? ID_LITERAL DATE_SUFFIX; +DQUOTA_STRING: '"' ( '\\'. | '""' | ~('"'| '\\') )* '"'; +SQUOTA_STRING: '\'' ('\\'. | '\'\'' | ~('\'' | '\\'))* '\''; +BQUOTA_STRING: '`' ( '\\'. | '``' | ~('`'|'\\'))* '`'; +fragment DEC_DIGIT: [0-9]; + +LINE_COMMENT: '//' ('\\\n' | ~[\r\n])* '\r'? '\n'? -> channel(HIDDEN); +BLOCK_COMMENT: '/*' .*? '*/' -> channel(HIDDEN); + +ERROR_RECOGNITION: . -> channel(ERRORCHANNEL); diff --git a/language-grammar/src/main/antlr4/OpenSearchPPLParser.g4 b/language-grammar/src/main/antlr4/OpenSearchPPLParser.g4 new file mode 100644 index 00000000000..cae57b53181 --- /dev/null +++ b/language-grammar/src/main/antlr4/OpenSearchPPLParser.g4 @@ -0,0 +1,1208 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +parser grammar OpenSearchPPLParser; + + +options { tokenVocab = OpenSearchPPLLexer; } +root + : pplStatement? EOF + ; + +// statement +pplStatement + : dmlStatement + ; + +dmlStatement + : (explainCommand PIPE)? queryStatement + ; + +queryStatement + : pplCommands (PIPE commands)* + ; + +subSearch + : searchCommand (PIPE commands)* + ; + +// commands +pplCommands + : searchCommand + | describeCommand + ; + +commands + : whereCommand + | correlateCommand + | joinCommand + | fieldsCommand + | statsCommand + | dedupCommand + | sortCommand + | headCommand + | topCommand + | rareCommand + | evalCommand + | grokCommand + | parseCommand + | patternsCommand + | lookupCommand + | renameCommand + | fillnullCommand + | fieldsummaryCommand + | flattenCommand + | expandCommand + | trendlineCommand + | appendcolCommand + ; + +commandName + : SEARCH + | DESCRIBE + | SHOW + | AD + | ML + | KMEANS + | WHERE + | CORRELATE + | JOIN + | FIELDS + | STATS + | EVENTSTATS + | DEDUP + | EXPLAIN + | SORT + | HEAD + | TOP + | TOP_APPROX + | RARE + | RARE_APPROX + | EVAL + | GROK + | PARSE + | PATTERNS + | LOOKUP + | RENAME + | EXPAND + | FILLNULL + | FIELDSUMMARY + | FLATTEN + | TRENDLINE + | APPENDCOL + ; + +searchCommand + : (SEARCH)? fromClause # searchFrom + | (SEARCH)? fromClause logicalExpression # searchFromFilter + | (SEARCH)? logicalExpression fromClause # searchFilterFrom + ; + +fieldsummaryCommand + : FIELDSUMMARY (fieldsummaryParameter)* + ; + +fieldsummaryParameter + : INCLUDEFIELDS EQUAL fieldList # fieldsummaryIncludeFields + | NULLS EQUAL booleanLiteral # fieldsummaryNulls + ; + +describeCommand + : DESCRIBE tableSourceClause + ; + +explainCommand + : EXPLAIN explainMode + ; + +explainMode + : FORMATTED + | COST + | CODEGEN + | EXTENDED + | SIMPLE + ; + +showDataSourcesCommand + : SHOW DATASOURCES + ; + +whereCommand + : WHERE logicalExpression + ; + +correlateCommand + : CORRELATE correlationType FIELDS LT_PRTHS fieldList RT_PRTHS (scopeClause)? mappingList + ; + +correlationType + : SELF + | EXACT + | APPROXIMATE + ; + +scopeClause + : SCOPE LT_PRTHS fieldExpression COMMA value = literalValue (unit = timespanUnit)? RT_PRTHS + ; + +mappingList + : MAPPING LT_PRTHS ( mappingClause (COMMA mappingClause)* ) RT_PRTHS + ; + +mappingClause + : left = qualifiedName comparisonOperator right = qualifiedName # mappingCompareExpr + ; + +fieldsCommand + : FIELDS (PLUS | MINUS)? fieldList + ; + +renameCommand + : RENAME renameClasue (COMMA renameClasue)* + ; + +statsCommand + : (STATS | EVENTSTATS) (PARTITIONS EQUAL partitions = integerLiteral)? (ALLNUM EQUAL allnum = booleanLiteral)? (DELIM EQUAL delim = stringLiteral)? statsAggTerm (COMMA statsAggTerm)* (statsByClause)? (DEDUP_SPLITVALUES EQUAL dedupsplit = booleanLiteral)? + ; + +dedupCommand + : DEDUP (number = integerLiteral)? fieldList (KEEPEMPTY EQUAL keepempty = booleanLiteral)? (CONSECUTIVE EQUAL consecutive = booleanLiteral)? + ; + +sortCommand + : SORT sortbyClause + ; + +evalCommand + : EVAL evalClause (COMMA evalClause)* + ; + +headCommand + : HEAD (number = integerLiteral)? (FROM from = integerLiteral)? + ; + +topCommand + : (TOP | TOP_APPROX) (number = integerLiteral)? fieldList (byClause)? + ; + +rareCommand + : (RARE | RARE_APPROX) (number = integerLiteral)? fieldList (byClause)? + ; + +grokCommand + : GROK (source_field = expression) (pattern = stringLiteral) + ; + +parseCommand + : PARSE (source_field = expression) (pattern = stringLiteral) + ; + +patternsCommand + : PATTERNS (patternsParameter)* (source_field = expression) + ; + +patternsParameter + : (NEW_FIELD EQUAL new_field = stringLiteral) + | (PATTERN EQUAL pattern = stringLiteral) + ; + +patternsMethod + : PUNCT + | REGEX + ; + +// lookup +lookupCommand + : LOOKUP tableSource lookupMappingList ((APPEND | REPLACE) outputCandidateList)? + ; + +lookupMappingList + : lookupPair (COMMA lookupPair)* + ; + +outputCandidateList + : lookupPair (COMMA lookupPair)* + ; + + // The lookup pair will generate a K-V pair. + // The format is Key -> Alias(outputFieldName, inputField), Value -> outputField. For example: + // 1. When lookupPair is "name AS cName", the key will be Alias(cName, Field(name)), the value will be Field(cName) + // 2. When lookupPair is "dept", the key is Alias(dept, Field(dept)), value is Field(dept) +lookupPair + : inputField = fieldExpression (AS outputField = fieldExpression)? + ; + +fillnullCommand + : FILLNULL (fillNullWithTheSameValue + | fillNullWithFieldVariousValues) + ; + +fillNullWithTheSameValue + : WITH nullReplacement = valueExpression IN nullableFieldList = fieldList + ; + +fillNullWithFieldVariousValues + : USING nullableReplacementExpression (COMMA nullableReplacementExpression)* + ; + +nullableReplacementExpression + : nullableField = fieldExpression EQUAL nullableReplacement = valueExpression + ; + +expandCommand + : EXPAND fieldExpression (AS alias = qualifiedName)? + ; + +flattenCommand + : FLATTEN fieldExpression (AS alias = identifierSeq)? + ; + +trendlineCommand + : TRENDLINE (SORT sortField)? trendlineClause (trendlineClause)* + ; + +trendlineClause + : trendlineType LT_PRTHS numberOfDataPoints = INTEGER_LITERAL COMMA field = fieldExpression RT_PRTHS (AS alias = qualifiedName)? + ; + +trendlineType + : SMA + | WMA + ; + +appendcolCommand + : APPENDCOL (OVERRIDE EQUAL override = booleanLiteral)? LT_SQR_PRTHS commands (PIPE commands)* RT_SQR_PRTHS + ; + +kmeansCommand + : KMEANS (kmeansParameter)* + ; + +kmeansParameter + : (CENTROIDS EQUAL centroids = integerLiteral) + | (ITERATIONS EQUAL iterations = integerLiteral) + | (DISTANCE_TYPE EQUAL distance_type = stringLiteral) + ; + +adCommand + : AD (adParameter)* + ; + +adParameter + : (NUMBER_OF_TREES EQUAL number_of_trees = integerLiteral) + | (SHINGLE_SIZE EQUAL shingle_size = integerLiteral) + | (SAMPLE_SIZE EQUAL sample_size = integerLiteral) + | (OUTPUT_AFTER EQUAL output_after = integerLiteral) + | (TIME_DECAY EQUAL time_decay = decimalLiteral) + | (ANOMALY_RATE EQUAL anomaly_rate = decimalLiteral) + | (CATEGORY_FIELD EQUAL category_field = stringLiteral) + | (TIME_FIELD EQUAL time_field = stringLiteral) + | (DATE_FORMAT EQUAL date_format = stringLiteral) + | (TIME_ZONE EQUAL time_zone = stringLiteral) + | (TRAINING_DATA_SIZE EQUAL training_data_size = integerLiteral) + | (ANOMALY_SCORE_THRESHOLD EQUAL anomaly_score_threshold = decimalLiteral) + ; + +mlCommand + : ML (mlArg)* + ; + +mlArg + : (argName = ident EQUAL argValue = literalValue) + ; + +// clauses +fromClause + : SOURCE EQUAL tableOrSubqueryClause + | INDEX EQUAL tableOrSubqueryClause + ; + +tableOrSubqueryClause + : LT_SQR_PRTHS subSearch RT_SQR_PRTHS (AS alias = qualifiedName)? + | tableSourceClause + ; + +// One tableSourceClause will generate one Relation node with/without one alias +// even if the relation contains more than one table sources. +// These table sources in one relation will be readed one by one in OpenSearch. +// But it may have different behaivours in different execution backends. +// For example, a Spark UnresovledRelation node only accepts one data source. +tableSourceClause + : tableSource (COMMA tableSource)* (AS alias = qualifiedName)? + ; + +// join +joinCommand + : (joinType) JOIN sideAlias joinHintList? joinCriteria? right = tableOrSubqueryClause + ; + +joinType + : INNER? + | CROSS + | LEFT OUTER? + | RIGHT OUTER? + | FULL OUTER? + | LEFT? SEMI + | LEFT? ANTI + ; + +sideAlias + : (LEFT EQUAL leftAlias = qualifiedName)? COMMA? (RIGHT EQUAL rightAlias = qualifiedName)? + ; + +joinCriteria + : ON logicalExpression + ; + +joinHintList + : hintPair (COMMA? hintPair)* + ; + +hintPair + : leftHintKey = LEFT_HINT DOT ID EQUAL leftHintValue = ident #leftHint + | rightHintKey = RIGHT_HINT DOT ID EQUAL rightHintValue = ident #rightHint + ; + +renameClasue + : orignalField = wcFieldExpression AS renamedField = wcFieldExpression + ; + +byClause + : BY fieldList + ; + +statsByClause + : BY fieldList + | BY bySpanClause + | BY bySpanClause COMMA fieldList + ; + +bySpanClause + : spanClause (AS alias = qualifiedName)? + ; + +spanClause + : SPAN LT_PRTHS fieldExpression COMMA value = literalValue (unit = timespanUnit)? RT_PRTHS + ; + +sortbyClause + : sortField (COMMA sortField)* + ; + +evalClause + : fieldExpression EQUAL expression + | geoipCommand + ; + +geoipCommand + : fieldExpression EQUAL GEOIP LT_PRTHS ipAddress = functionArg (COMMA properties = geoIpPropertyList)? RT_PRTHS + ; + +// aggregation terms +statsAggTerm + : statsFunction (AS alias = wcFieldExpression)? + ; + +// aggregation functions +statsFunction + : statsFunctionName LT_PRTHS valueExpression RT_PRTHS # statsFunctionCall + | COUNT LT_PRTHS RT_PRTHS # countAllFunctionCall + | (DISTINCT_COUNT | DC | DISTINCT_COUNT_APPROX) LT_PRTHS valueExpression RT_PRTHS # distinctCountFunctionCall + | percentileFunctionName = (PERCENTILE | PERCENTILE_APPROX) LT_PRTHS valueExpression COMMA percent = integerLiteral RT_PRTHS # percentileFunctionCall + ; + +statsFunctionName + : AVG + | COUNT + | SUM + | MIN + | MAX + | STDDEV_SAMP + | STDDEV_POP + ; + +// expressions +expression + : logicalExpression + | valueExpression + ; + +logicalExpression + : NOT logicalExpression # logicalNot + | LT_PRTHS logicalExpression RT_PRTHS # parentheticLogicalExpr + | comparisonExpression # comparsion + | left = logicalExpression (AND)? right = logicalExpression # logicalAnd + | left = logicalExpression OR right = logicalExpression # logicalOr + | left = logicalExpression XOR right = logicalExpression # logicalXor + | booleanExpression # booleanExpr + ; + +comparisonExpression + : left = valueExpression comparisonOperator right = valueExpression # compareExpr + | valueExpression NOT? IN valueList # inExpr + | expr1 = functionArg NOT? BETWEEN expr2 = functionArg AND expr3 = functionArg # between + ; + +valueExpressionList + : valueExpression + | LT_PRTHS valueExpression (COMMA valueExpression)* RT_PRTHS + ; + +valueExpression + : left = valueExpression binaryOperator = (STAR | DIVIDE | MODULE) right = valueExpression # binaryArithmetic + | left = valueExpression binaryOperator = (PLUS | MINUS) right = valueExpression # binaryArithmetic + | primaryExpression # valueExpressionDefault + | positionFunction # positionFunctionCall + | caseFunction # caseExpr + | timestampFunction # timestampFunctionCall + | LT_PRTHS valueExpression RT_PRTHS # parentheticValueExpr + | LT_SQR_PRTHS subSearch RT_SQR_PRTHS # scalarSubqueryExpr + | ident ARROW expression # lambda + | LT_PRTHS ident (COMMA ident)+ RT_PRTHS ARROW expression # lambda + ; + +primaryExpression + : evalFunctionCall + | fieldExpression + | literalValue + | dataTypeFunctionCall + ; + +positionFunction + : positionFunctionName LT_PRTHS functionArg IN functionArg RT_PRTHS + ; + +booleanExpression + : booleanFunctionCall # booleanFunctionCallExpr + | valueExpressionList NOT? IN LT_SQR_PRTHS subSearch RT_SQR_PRTHS # inSubqueryExpr + | EXISTS LT_SQR_PRTHS subSearch RT_SQR_PRTHS # existsSubqueryExpr + | cidrMatchFunctionCall # cidrFunctionCallExpr + ; + + caseFunction + : CASE LT_PRTHS logicalExpression COMMA valueExpression (COMMA logicalExpression COMMA valueExpression)* (ELSE valueExpression)? RT_PRTHS + ; + +relevanceExpression + : singleFieldRelevanceFunction + | multiFieldRelevanceFunction + ; + +// Field is a single column +singleFieldRelevanceFunction + : singleFieldRelevanceFunctionName LT_PRTHS field = relevanceField COMMA query = relevanceQuery (COMMA relevanceArg)* RT_PRTHS + ; + +// Field is a list of columns +multiFieldRelevanceFunction + : multiFieldRelevanceFunctionName LT_PRTHS LT_SQR_PRTHS field = relevanceFieldAndWeight (COMMA field = relevanceFieldAndWeight)* RT_SQR_PRTHS COMMA query = relevanceQuery (COMMA relevanceArg)* RT_PRTHS + ; + +// tables +tableSource + : tableQualifiedName + | ID_DATE_SUFFIX + ; + +tableFunction + : qualifiedName LT_PRTHS functionArgs RT_PRTHS + ; + +// fields +fieldList + : fieldExpression (COMMA fieldExpression)* + ; + +wcFieldList + : wcFieldExpression (COMMA wcFieldExpression)* + ; + +sortField + : (PLUS | MINUS)? sortFieldExpression + ; + +sortFieldExpression + : fieldExpression + + // TODO #963: Implement 'num', 'str', and 'ip' sort syntax + | AUTO LT_PRTHS fieldExpression RT_PRTHS + | STR LT_PRTHS fieldExpression RT_PRTHS + | IP LT_PRTHS fieldExpression RT_PRTHS + | NUM LT_PRTHS fieldExpression RT_PRTHS + ; + +fieldExpression + : qualifiedName + ; + +wcFieldExpression + : wcQualifiedName + ; + +// functions +evalFunctionCall + : evalFunctionName LT_PRTHS functionArgs RT_PRTHS + ; + +// cast function +dataTypeFunctionCall + : CAST LT_PRTHS expression AS convertedDataType RT_PRTHS + ; + +// boolean functions +booleanFunctionCall + : conditionFunctionBase LT_PRTHS functionArgs RT_PRTHS + ; + +cidrMatchFunctionCall + : CIDRMATCH LT_PRTHS ipAddress = functionArg COMMA cidrBlock = functionArg RT_PRTHS + ; + +convertedDataType + : typeName = DATE + | typeName = TIME + | typeName = TIMESTAMP + | typeName = INT + | typeName = INTEGER + | typeName = DOUBLE + | typeName = LONG + | typeName = FLOAT + | typeName = STRING + | typeName = BOOLEAN + ; + +evalFunctionName + : mathematicalFunctionName + | dateTimeFunctionName + | textFunctionName + | conditionFunctionBase + | systemFunctionName + | positionFunctionName + | coalesceFunctionName + | cryptographicFunctionName + | jsonFunctionName + | collectionFunctionName + | lambdaFunctionName + ; + +functionArgs + : (functionArg (COMMA functionArg)*)? + ; + +functionArg + : (ident EQUAL)? valueExpression + ; + +relevanceArg + : relevanceArgName EQUAL relevanceArgValue + ; + +relevanceArgName + : ALLOW_LEADING_WILDCARD + | ANALYZER + | ANALYZE_WILDCARD + | AUTO_GENERATE_SYNONYMS_PHRASE_QUERY + | BOOST + | CUTOFF_FREQUENCY + | DEFAULT_FIELD + | DEFAULT_OPERATOR + | ENABLE_POSITION_INCREMENTS + | ESCAPE + | FIELDS + | FLAGS + | FUZZINESS + | FUZZY_MAX_EXPANSIONS + | FUZZY_PREFIX_LENGTH + | FUZZY_REWRITE + | FUZZY_TRANSPOSITIONS + | LENIENT + | LOW_FREQ_OPERATOR + | MAX_DETERMINIZED_STATES + | MAX_EXPANSIONS + | MINIMUM_SHOULD_MATCH + | OPERATOR + | PHRASE_SLOP + | PREFIX_LENGTH + | QUOTE_ANALYZER + | QUOTE_FIELD_SUFFIX + | REWRITE + | SLOP + | TIE_BREAKER + | TIME_ZONE + | TYPE + | ZERO_TERMS_QUERY + ; + +relevanceFieldAndWeight + : field = relevanceField + | field = relevanceField weight = relevanceFieldWeight + | field = relevanceField BIT_XOR_OP weight = relevanceFieldWeight + ; + +relevanceFieldWeight + : integerLiteral + | decimalLiteral + ; + +relevanceField + : qualifiedName + | stringLiteral + ; + +relevanceQuery + : relevanceArgValue + ; + +relevanceArgValue + : qualifiedName + | literalValue + ; + +mathematicalFunctionName + : ABS + | CBRT + | CEIL + | CEILING + | CONV + | CRC32 + | E + | EXP + | FLOOR + | LN + | LOG + | LOG10 + | LOG2 + | MOD + | PI + | POW + | POWER + | RAND + | ROUND + | SIGN + | SIGNUM + | SQRT + | TRUNCATE + | trigonometricFunctionName + ; + +trigonometricFunctionName + : ACOS + | ASIN + | ATAN + | ATAN2 + | COS + | COT + | DEGREES + | RADIANS + | SIN + | TAN + ; + +cryptographicFunctionName + : MD5 + | SHA1 + | SHA2 + ; + +dateTimeFunctionName + : ADDDATE + | ADDTIME + | CONVERT_TZ + | CURDATE + | CURRENT_DATE + | CURRENT_TIME + | CURRENT_TIMESTAMP + | CURRENT_TIMEZONE + | CURTIME + | DATE + | DATEDIFF + | DATETIME + | DATE_ADD + | DATE_FORMAT + | DATE_SUB + | DAY + | DAYNAME + | DAYOFMONTH + | DAYOFWEEK + | DAYOFYEAR + | DAY_OF_MONTH + | DAY_OF_WEEK + | DAY_OF_YEAR + | FROM_DAYS + | FROM_UNIXTIME + | HOUR + | HOUR_OF_DAY + | LAST_DAY + | LOCALTIME + | LOCALTIMESTAMP + | MAKEDATE + | MAKE_DATE + | MAKETIME + | MICROSECOND + | MINUTE + | MINUTE_OF_DAY + | MINUTE_OF_HOUR + | MONTH + | MONTHNAME + | MONTH_OF_YEAR + | NOW + | PERIOD_ADD + | PERIOD_DIFF + | QUARTER + | SECOND + | SECOND_OF_MINUTE + | SEC_TO_TIME + | STR_TO_DATE + | SUBDATE + | SUBTIME + | SYSDATE + | TIME + | TIMEDIFF + | TIMESTAMP + | TIME_FORMAT + | TIME_TO_SEC + | TO_DAYS + | TO_SECONDS + | UNIX_TIMESTAMP + | UTC_DATE + | UTC_TIME + | UTC_TIMESTAMP + | WEEK + | WEEKDAY + | WEEK_OF_YEAR + | YEAR + | YEARWEEK + | relativeTimeFunctionName + ; + +relativeTimeFunctionName + : RELATIVE_TIMESTAMP + | EARLIEST + | LATEST + ; + +getFormatFunction + : GET_FORMAT LT_PRTHS getFormatType COMMA functionArg RT_PRTHS + ; + +getFormatType + : DATE + | DATETIME + | TIME + | TIMESTAMP + ; + +extractFunction + : EXTRACT LT_PRTHS datetimePart FROM functionArg RT_PRTHS + ; + +simpleDateTimePart + : MICROSECOND + | SECOND + | MINUTE + | HOUR + | DAY + | WEEK + | MONTH + | QUARTER + | YEAR + ; + +complexDateTimePart + : SECOND_MICROSECOND + | MINUTE_MICROSECOND + | MINUTE_SECOND + | HOUR_MICROSECOND + | HOUR_SECOND + | HOUR_MINUTE + | DAY_MICROSECOND + | DAY_SECOND + | DAY_MINUTE + | DAY_HOUR + | YEAR_MONTH + ; + +datetimePart + : simpleDateTimePart + | complexDateTimePart + ; + +timestampFunction + : timestampFunctionName LT_PRTHS simpleDateTimePart COMMA firstArg = functionArg COMMA secondArg = functionArg RT_PRTHS + ; + +timestampFunctionName + : TIMESTAMPADD + | TIMESTAMPDIFF + ; + +// condition function return boolean value +conditionFunctionBase + : LIKE + | IF + | ISNULL + | ISNOTNULL + | IFNULL + | NULLIF + | ISPRESENT + | JSON_VALID + | EARLIEST + | LATEST + | ISEMPTY + | ISBLANK + ; + +systemFunctionName + : TYPEOF + ; + +textFunctionName + : SUBSTR + | SUBSTRING + | TRIM + | LTRIM + | RTRIM + | LOWER + | UPPER + | CONCAT + | CONCAT_WS + | LENGTH + | STRCMP + | RIGHT + | LEFT + | ASCII + | LOCATE + | REPLACE + | REVERSE + | ISEMPTY + | ISBLANK + ; + +jsonFunctionName + : JSON + | JSON_OBJECT + | JSON_ARRAY + | JSON_ARRAY_LENGTH + | TO_JSON_STRING + | JSON_EXTRACT + | JSON_DELETE + | JSON_APPEND + | JSON_KEYS + | JSON_VALID + | JSON_EXTEND + | JSON_SET +// | JSON_ARRAY_ALL_MATCH +// | JSON_ARRAY_ANY_MATCH +// | JSON_ARRAY_FILTER +// | JSON_ARRAY_MAP +// | JSON_ARRAY_REDUCE + ; + +collectionFunctionName + : ARRAY + | ARRAY_LENGTH + ; + +lambdaFunctionName + : FORALL + | EXISTS + | FILTER + | TRANSFORM + | REDUCE + ; + +positionFunctionName + : POSITION + ; + +coalesceFunctionName + : COALESCE + ; + +geoIpPropertyList + : geoIpProperty (COMMA geoIpProperty)* + ; + +geoIpProperty + : COUNTRY_ISO_CODE + | COUNTRY_NAME + | CONTINENT_NAME + | REGION_ISO_CODE + | REGION_NAME + | CITY_NAME + | TIME_ZONE + | LOCATION + ; + +// operators + comparisonOperator + : EQUAL + | NOT_EQUAL + | LESS + | NOT_LESS + | GREATER + | NOT_GREATER + | REGEXP + ; + +singleFieldRelevanceFunctionName + : MATCH + | MATCH_PHRASE + | MATCH_BOOL_PREFIX + | MATCH_PHRASE_PREFIX + ; + +multiFieldRelevanceFunctionName + : SIMPLE_QUERY_STRING + | MULTI_MATCH + | QUERY_STRING + ; + +// literals and values +literalValue + : stringLiteral + | integerLiteral + | decimalLiteral + | booleanLiteral + | datetimeLiteral //#datetime + | intervalLiteral + ; + +intervalLiteral + : INTERVAL valueExpression intervalUnit + ; + +stringLiteral + : DQUOTA_STRING + | SQUOTA_STRING + ; + +integerLiteral + : (PLUS | MINUS)? INTEGER_LITERAL + ; + +decimalLiteral + : (PLUS | MINUS)? DECIMAL_LITERAL + ; + +booleanLiteral + : TRUE + | FALSE + ; + +// Date and Time Literal, follow ANSI 92 +datetimeLiteral + : dateLiteral + | timeLiteral + | timestampLiteral + ; + +dateLiteral + : DATE date = stringLiteral + ; + +timeLiteral + : TIME time = stringLiteral + ; + +timestampLiteral + : TIMESTAMP timestamp = stringLiteral + ; + +intervalUnit + : MICROSECOND + | SECOND + | MINUTE + | HOUR + | DAY + | WEEK + | MONTH + | QUARTER + | YEAR + | SECOND_MICROSECOND + | MINUTE_MICROSECOND + | MINUTE_SECOND + | HOUR_MICROSECOND + | HOUR_SECOND + | HOUR_MINUTE + | DAY_MICROSECOND + | DAY_SECOND + | DAY_MINUTE + | DAY_HOUR + | YEAR_MONTH + ; + +timespanUnit + : MS + | S + | M + | H + | D + | W + | Q + | Y + | MILLISECOND + | SECOND + | MINUTE + | HOUR + | DAY + | WEEK + | MONTH + | QUARTER + | YEAR + ; + +valueList + : LT_PRTHS literalValue (COMMA literalValue)* RT_PRTHS + ; + +qualifiedName + : ident (DOT ident)* # identsAsQualifiedName + ; + +identifierSeq + : qualifiedName (COMMA qualifiedName)* # identsAsQualifiedNameSeq + | LT_PRTHS qualifiedName (COMMA qualifiedName)* RT_PRTHS # identsAsQualifiedNameSeq + ; + +tableQualifiedName + : tableIdent (DOT ident)* # identsAsTableQualifiedName + ; + +wcQualifiedName + : wildcard (DOT wildcard)* # identsAsWildcardQualifiedName + ; + +ident + : (DOT)? ID + | BACKTICK ident BACKTICK + | BQUOTA_STRING + | keywordsCanBeId + ; + +tableIdent + : (CLUSTER)? ident + ; + +wildcard + : ident (MODULE ident)* (MODULE)? + | SINGLE_QUOTE wildcard SINGLE_QUOTE + | DOUBLE_QUOTE wildcard DOUBLE_QUOTE + | BACKTICK wildcard BACKTICK + ; + +keywordsCanBeId + : D // OD SQL and ODBC special + | timespanUnit + | SPAN + | evalFunctionName + | relevanceArgName + | intervalUnit + | dateTimeFunctionName + | textFunctionName + | jsonFunctionName + | mathematicalFunctionName + | positionFunctionName + | cryptographicFunctionName + | singleFieldRelevanceFunctionName + | multiFieldRelevanceFunctionName + | commandName + | comparisonOperator + | explainMode + | correlationType + | geoIpProperty + // commands assist keywords + | GEOIP + | OVERRIDE + | ARROW + | IN + | SOURCE + | INDEX + | DESC + | DATASOURCES + | FROM + | PATTERN + | NEW_FIELD + | SCOPE + | MAPPING + | WITH + | USING + | CAST + | GET_FORMAT + | EXTRACT + | INTERVAL + | PLUS + | MINUS + | INCLUDEFIELDS + | NULLS + // ARGUMENT KEYWORDS + | KEEPEMPTY + | CONSECUTIVE + | DEDUP_SPLITVALUES + | PARTITIONS + | ALLNUM + | DELIM + | CENTROIDS + | ITERATIONS + | DISTANCE_TYPE + | NUMBER_OF_TREES + | SHINGLE_SIZE + | SAMPLE_SIZE + | OUTPUT_AFTER + | TIME_DECAY + | ANOMALY_RATE + | CATEGORY_FIELD + | TIME_FIELD + | TIME_ZONE + | TRAINING_DATA_SIZE + | ANOMALY_SCORE_THRESHOLD + // AGGREGATIONS + | statsFunctionName + | DISTINCT_COUNT + | DISTINCT_COUNT_APPROX + | PERCENTILE + | PERCENTILE_APPROX + | ESTDC + | ESTDC_ERROR + | MEAN + | MEDIAN + | MODE + | RANGE + | STDEV + | STDEVP + | SUMSQ + | VAR_SAMP + | VAR_POP + | TAKE + | FIRST + | LAST + | LIST + | VALUES + | PER_DAY + | PER_HOUR + | PER_MINUTE + | PER_SECOND + | RATE + | SPARKLINE + | C + | DC + // JOIN TYPE + | OUTER + | INNER + | CROSS + | LEFT + | RIGHT + | FULL + | SEMI + | ANTI + | BETWEEN + | CIDRMATCH + | trendlineType + // SORT FIELD KEYWORDS + | AUTO + | STR + | IP + | NUM + ; diff --git a/language-grammar/src/main/antlr4/OpenSearchSQLLexer.g4 b/language-grammar/src/main/antlr4/OpenSearchSQLLexer.g4 new file mode 100644 index 00000000000..ba7c5be85ab --- /dev/null +++ b/language-grammar/src/main/antlr4/OpenSearchSQLLexer.g4 @@ -0,0 +1,484 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +/* +MySQL (Positive Technologies) grammar +The MIT License (MIT). +Copyright (c) 2015-2017, Ivan Kochurkin (kvanttt@gmail.com), Positive Technologies. +Copyright (c) 2017, Ivan Khudyashev (IHudyashov@ptsecurity.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +lexer grammar OpenSearchSQLLexer; + +channels { SQLCOMMENT, ERRORCHANNEL } + + +// SKIP + +SPACE: [ \t\r\n]+ -> channel(HIDDEN); +SPEC_SQL_COMMENT: '/*!' .+? '*/' -> channel(SQLCOMMENT); +COMMENT_INPUT: '/*' .*? '*/' -> channel(HIDDEN); +LINE_COMMENT: ( + ('-- ' | '#') ~[\r\n]* ('\r'? '\n' | EOF) + | '--' ('\r'? '\n' | EOF) + ) -> channel(HIDDEN); + + +// Keywords +// Common Keywords + +ALL: 'ALL'; +AND: 'AND'; +AS: 'AS'; +ASC: 'ASC'; +BOOLEAN: 'BOOLEAN'; +BETWEEN: 'BETWEEN'; +BY: 'BY'; +CASE: 'CASE'; +CAST: 'CAST'; +CROSS: 'CROSS'; +COLUMNS: 'COLUMNS'; +DATETIME: 'DATETIME'; +DELETE: 'DELETE'; +DESC: 'DESC'; +DESCRIBE: 'DESCRIBE'; +DISTINCT: 'DISTINCT'; +DOUBLE: 'DOUBLE'; +ELSE: 'ELSE'; +EXISTS: 'EXISTS'; +FALSE: 'FALSE'; +FLOAT: 'FLOAT'; +FIRST: 'FIRST'; +FROM: 'FROM'; +GROUP: 'GROUP'; +HAVING: 'HAVING'; +IN: 'IN'; +INNER: 'INNER'; +INT: 'INT'; +INTEGER: 'INTEGER'; +IS: 'IS'; +JOIN: 'JOIN'; +LAST: 'LAST'; +LEFT: 'LEFT'; +LIKE: 'LIKE'; +LIMIT: 'LIMIT'; +LONG: 'LONG'; +MATCH: 'MATCH'; +NATURAL: 'NATURAL'; +MISSING_LITERAL: 'MISSING'; +NOT: 'NOT'; +NULL_LITERAL: 'NULL'; +NULLS: 'NULLS'; +ON: 'ON'; +OR: 'OR'; +ORDER: 'ORDER'; +OUTER: 'OUTER'; +OVER: 'OVER'; +PARTITION: 'PARTITION'; +REGEXP: 'REGEXP'; +RIGHT: 'RIGHT'; +SELECT: 'SELECT'; +SHOW: 'SHOW'; +STRING: 'STRING'; +THEN: 'THEN'; +TRUE: 'TRUE'; +UNION: 'UNION'; +USING: 'USING'; +WHEN: 'WHEN'; +WHERE: 'WHERE'; + + +// OD SQL special keyword +MISSING: 'MISSING'; +EXCEPT: 'MINUS'; + + +// Group function Keywords + +AVG: 'AVG'; +COUNT: 'COUNT'; +MAX: 'MAX'; +MIN: 'MIN'; +SUM: 'SUM'; +VAR_POP: 'VAR_POP'; +VAR_SAMP: 'VAR_SAMP'; +VARIANCE: 'VARIANCE'; +STD: 'STD'; +STDDEV: 'STDDEV'; +STDDEV_POP: 'STDDEV_POP'; +STDDEV_SAMP: 'STDDEV_SAMP'; + + +// Common function Keywords + +SUBSTRING: 'SUBSTRING'; +TRIM: 'TRIM'; + +// Keywords, but can be ID +// Common Keywords, but can be ID + +END: 'END'; +FULL: 'FULL'; +OFFSET: 'OFFSET'; + +// INTERVAL AND UNIT KEYWORDS +INTERVAL: 'INTERVAL'; +MICROSECOND: 'MICROSECOND'; +SECOND: 'SECOND'; +MINUTE: 'MINUTE'; +HOUR: 'HOUR'; +DAY: 'DAY'; +WEEK: 'WEEK'; +MONTH: 'MONTH'; +QUARTER: 'QUARTER'; +YEAR: 'YEAR'; +SECOND_MICROSECOND: 'SECOND_MICROSECOND'; +MINUTE_MICROSECOND: 'MINUTE_MICROSECOND'; +MINUTE_SECOND: 'MINUTE_SECOND'; +HOUR_MICROSECOND: 'HOUR_MICROSECOND'; +HOUR_SECOND: 'HOUR_SECOND'; +HOUR_MINUTE: 'HOUR_MINUTE'; +DAY_MICROSECOND: 'DAY_MICROSECOND'; +DAY_SECOND: 'DAY_SECOND'; +DAY_MINUTE: 'DAY_MINUTE'; +DAY_HOUR: 'DAY_HOUR'; +YEAR_MONTH: 'YEAR_MONTH'; + + +// PRIVILEGES + +TABLES: 'TABLES'; + + +// Common function names + +ABS: 'ABS'; +ACOS: 'ACOS'; +ADD: 'ADD'; +ADDTIME: 'ADDTIME'; +ASCII: 'ASCII'; +ASIN: 'ASIN'; +ATAN: 'ATAN'; +ATAN2: 'ATAN2'; +CBRT: 'CBRT'; +CEIL: 'CEIL'; +CEILING: 'CEILING'; +CONCAT: 'CONCAT'; +CONCAT_WS: 'CONCAT_WS'; +CONV: 'CONV'; +CONVERT_TZ: 'CONVERT_TZ'; +COS: 'COS'; +COSH: 'COSH'; +COT: 'COT'; +CRC32: 'CRC32'; +CURDATE: 'CURDATE'; +CURTIME: 'CURTIME'; +CURRENT_DATE: 'CURRENT_DATE'; +CURRENT_TIME: 'CURRENT_TIME'; +CURRENT_TIMESTAMP: 'CURRENT_TIMESTAMP'; +DATE: 'DATE'; +DATE_ADD: 'DATE_ADD'; +DATE_FORMAT: 'DATE_FORMAT'; +DATE_SUB: 'DATE_SUB'; +DATEDIFF: 'DATEDIFF'; +DAYNAME: 'DAYNAME'; +DAYOFMONTH: 'DAYOFMONTH'; +DAYOFWEEK: 'DAYOFWEEK'; +DAYOFYEAR: 'DAYOFYEAR'; +DEGREES: 'DEGREES'; +DIVIDE: 'DIVIDE'; +E: 'E'; +EXP: 'EXP'; +EXPM1: 'EXPM1'; +EXTRACT: 'EXTRACT'; +FLOOR: 'FLOOR'; +FROM_DAYS: 'FROM_DAYS'; +FROM_UNIXTIME: 'FROM_UNIXTIME'; +GET_FORMAT: 'GET_FORMAT'; +IF: 'IF'; +IFNULL: 'IFNULL'; +ISNULL: 'ISNULL'; +LAST_DAY: 'LAST_DAY'; +LENGTH: 'LENGTH'; +LN: 'LN'; +LOCALTIME: 'LOCALTIME'; +LOCALTIMESTAMP: 'LOCALTIMESTAMP'; +LOCATE: 'LOCATE'; +LOG: 'LOG'; +LOG10: 'LOG10'; +LOG2: 'LOG2'; +LOWER: 'LOWER'; +LTRIM: 'LTRIM'; +MAKEDATE: 'MAKEDATE'; +MAKETIME: 'MAKETIME'; +MODULUS: 'MODULUS'; +MONTHNAME: 'MONTHNAME'; +MULTIPLY: 'MULTIPLY'; +NOW: 'NOW'; +NULLIF: 'NULLIF'; +PERIOD_ADD: 'PERIOD_ADD'; +PERIOD_DIFF: 'PERIOD_DIFF'; +PI: 'PI'; +POSITION: 'POSITION'; +POW: 'POW'; +POWER: 'POWER'; +RADIANS: 'RADIANS'; +RAND: 'RAND'; +REPLACE: 'REPLACE'; +RINT: 'RINT'; +ROUND: 'ROUND'; +RTRIM: 'RTRIM'; +REVERSE: 'REVERSE'; +SEC_TO_TIME: 'SEC_TO_TIME'; +SIGN: 'SIGN'; +SIGNUM: 'SIGNUM'; +SIN: 'SIN'; +SINH: 'SINH'; +SQRT: 'SQRT'; +STR_TO_DATE: 'STR_TO_DATE'; +SUBDATE: 'SUBDATE'; +SUBTIME: 'SUBTIME'; +SUBTRACT: 'SUBTRACT'; +SYSDATE: 'SYSDATE'; +TAN: 'TAN'; +TIME: 'TIME'; +TIMEDIFF: 'TIMEDIFF'; +TIME_FORMAT: 'TIME_FORMAT'; +TIME_TO_SEC: 'TIME_TO_SEC'; +TIMESTAMP: 'TIMESTAMP'; +TRUNCATE: 'TRUNCATE'; +TO_DAYS: 'TO_DAYS'; +TO_SECONDS: 'TO_SECONDS'; +UNIX_TIMESTAMP: 'UNIX_TIMESTAMP'; +UPPER: 'UPPER'; +UTC_DATE: 'UTC_DATE'; +UTC_TIME: 'UTC_TIME'; +UTC_TIMESTAMP: 'UTC_TIMESTAMP'; + +D: 'D'; +T: 'T'; +TS: 'TS'; +LEFT_BRACE: '{'; +RIGHT_BRACE: '}'; + + +// Window function names +DENSE_RANK: 'DENSE_RANK'; +RANK: 'RANK'; +ROW_NUMBER: 'ROW_NUMBER'; + +// OD SQL special functions +DATE_HISTOGRAM: 'DATE_HISTOGRAM'; +DAY_OF_MONTH: 'DAY_OF_MONTH'; +DAY_OF_YEAR: 'DAY_OF_YEAR'; +DAY_OF_WEEK: 'DAY_OF_WEEK'; +EXCLUDE: 'EXCLUDE'; +EXTENDED_STATS: 'EXTENDED_STATS'; +FIELD: 'FIELD'; +FILTER: 'FILTER'; +GEO_BOUNDING_BOX: 'GEO_BOUNDING_BOX'; +GEO_CELL: 'GEO_CELL'; +GEO_DISTANCE: 'GEO_DISTANCE'; +GEO_DISTANCE_RANGE: 'GEO_DISTANCE_RANGE'; +GEO_INTERSECTS: 'GEO_INTERSECTS'; +GEO_POLYGON: 'GEO_POLYGON'; +HISTOGRAM: 'HISTOGRAM'; +HOUR_OF_DAY: 'HOUR_OF_DAY'; +INCLUDE: 'INCLUDE'; +IN_TERMS: 'IN_TERMS'; +MATCHPHRASE: 'MATCHPHRASE'; +MATCH_PHRASE: 'MATCH_PHRASE'; +MATCHPHRASEQUERY: 'MATCHPHRASEQUERY'; +SIMPLE_QUERY_STRING: 'SIMPLE_QUERY_STRING'; +QUERY_STRING: 'QUERY_STRING'; +MATCH_PHRASE_PREFIX: 'MATCH_PHRASE_PREFIX'; +MATCHQUERY: 'MATCHQUERY'; +MATCH_QUERY: 'MATCH_QUERY'; +MINUTE_OF_DAY: 'MINUTE_OF_DAY'; +MINUTE_OF_HOUR: 'MINUTE_OF_HOUR'; +MONTH_OF_YEAR: 'MONTH_OF_YEAR'; +MULTIMATCH: 'MULTIMATCH'; +MULTI_MATCH: 'MULTI_MATCH'; +MULTIMATCHQUERY: 'MULTIMATCHQUERY'; +NESTED: 'NESTED'; +PERCENTILES: 'PERCENTILES'; +PERCENTILE: 'PERCENTILE'; +PERCENTILE_APPROX: 'PERCENTILE_APPROX'; +REGEXP_QUERY: 'REGEXP_QUERY'; +REVERSE_NESTED: 'REVERSE_NESTED'; +QUERY: 'QUERY'; +RANGE: 'RANGE'; +SCORE: 'SCORE'; +SCOREQUERY: 'SCOREQUERY'; +SCORE_QUERY: 'SCORE_QUERY'; +SECOND_OF_MINUTE: 'SECOND_OF_MINUTE'; +STATS: 'STATS'; +TERM: 'TERM'; +TERMS: 'TERMS'; +TIMESTAMPADD: 'TIMESTAMPADD'; +TIMESTAMPDIFF: 'TIMESTAMPDIFF'; +TOPHITS: 'TOPHITS'; +TYPEOF: 'TYPEOF'; +WEEK_OF_YEAR: 'WEEK_OF_YEAR'; +WEEKOFYEAR: 'WEEKOFYEAR'; +WEEKDAY: 'WEEKDAY'; +WILDCARDQUERY: 'WILDCARDQUERY'; +WILDCARD_QUERY: 'WILDCARD_QUERY'; + +// TEXT FUNCTIONS +SUBSTR: 'SUBSTR'; +STRCMP: 'STRCMP'; + +// DATE AND TIME FUNCTIONS +ADDDATE: 'ADDDATE'; +YEARWEEK: 'YEARWEEK'; + +// RELEVANCE FUNCTIONS AND PARAMETERS +ALLOW_LEADING_WILDCARD: 'ALLOW_LEADING_WILDCARD'; +ANALYZER: 'ANALYZER'; +ANALYZE_WILDCARD: 'ANALYZE_WILDCARD'; +AUTO_GENERATE_SYNONYMS_PHRASE_QUERY:'AUTO_GENERATE_SYNONYMS_PHRASE_QUERY'; +BOOST: 'BOOST'; +CASE_INSENSITIVE: 'CASE_INSENSITIVE'; +CUTOFF_FREQUENCY: 'CUTOFF_FREQUENCY'; +DEFAULT_FIELD: 'DEFAULT_FIELD'; +DEFAULT_OPERATOR: 'DEFAULT_OPERATOR'; +ESCAPE: 'ESCAPE'; +ENABLE_POSITION_INCREMENTS: 'ENABLE_POSITION_INCREMENTS'; +FIELDS: 'FIELDS'; +FLAGS: 'FLAGS'; +FUZZINESS: 'FUZZINESS'; +FUZZY_MAX_EXPANSIONS: 'FUZZY_MAX_EXPANSIONS'; +FUZZY_PREFIX_LENGTH: 'FUZZY_PREFIX_LENGTH'; +FUZZY_REWRITE: 'FUZZY_REWRITE'; +FUZZY_TRANSPOSITIONS: 'FUZZY_TRANSPOSITIONS'; +LENIENT: 'LENIENT'; +LOW_FREQ_OPERATOR: 'LOW_FREQ_OPERATOR'; +MAX_DETERMINIZED_STATES: 'MAX_DETERMINIZED_STATES'; +MAX_EXPANSIONS: 'MAX_EXPANSIONS'; +MINIMUM_SHOULD_MATCH: 'MINIMUM_SHOULD_MATCH'; +OPERATOR: 'OPERATOR'; +PHRASE_SLOP: 'PHRASE_SLOP'; +PREFIX_LENGTH: 'PREFIX_LENGTH'; +QUOTE_ANALYZER: 'QUOTE_ANALYZER'; +QUOTE_FIELD_SUFFIX: 'QUOTE_FIELD_SUFFIX'; +REWRITE: 'REWRITE'; +SLOP: 'SLOP'; +TIE_BREAKER: 'TIE_BREAKER'; +TIME_ZONE: 'TIME_ZONE'; +TYPE: 'TYPE'; +ZERO_TERMS_QUERY: 'ZERO_TERMS_QUERY'; +HIGHLIGHT: 'HIGHLIGHT'; +HIGHLIGHT_PRE_TAGS: 'PRE_TAGS'; +HIGHLIGHT_POST_TAGS: 'POST_TAGS'; + +// RELEVANCE FUNCTIONS +MATCH_BOOL_PREFIX: 'MATCH_BOOL_PREFIX'; +// Operators + +// Operators. Arithmetics + +STAR: '*'; +SLASH: '/'; +MODULE: '%'; +PLUS: '+'; +MINUS: '-'; +DIV: 'DIV'; +MOD: 'MOD'; + + +// Operators. Comparation + +EQUAL_SYMBOL: '='; +GREATER_SYMBOL: '>'; +LESS_SYMBOL: '<'; +EXCLAMATION_SYMBOL: '!'; + + +// Operators. Bit + +BIT_NOT_OP: '~'; +BIT_OR_OP: '|'; +BIT_AND_OP: '&'; +BIT_XOR_OP: '^'; + + +// Constructors symbols + +DOT: '.'; +LR_BRACKET: '('; +RR_BRACKET: ')'; +LT_SQR_PRTHS: '['; +RT_SQR_PRTHS: ']'; +COMMA: ','; +SEMI: ';'; +AT_SIGN: '@'; +ZERO_DECIMAL: '0'; +ONE_DECIMAL: '1'; +TWO_DECIMAL: '2'; +SINGLE_QUOTE_SYMB: '\''; +DOUBLE_QUOTE_SYMB: '"'; +REVERSE_QUOTE_SYMB: '`'; +COLON_SYMB: ':'; + + +// Literal Primitives + +START_NATIONAL_STRING_LITERAL: 'N' SQUOTA_STRING; +STRING_LITERAL: SQUOTA_STRING; +DECIMAL_LITERAL: DEC_DIGIT+; +HEXADECIMAL_LITERAL: 'X' '\'' (HEX_DIGIT HEX_DIGIT)+ '\'' + | '0X' HEX_DIGIT+; + +REAL_LITERAL: (DEC_DIGIT+)? '.' DEC_DIGIT+ + | DEC_DIGIT+ '.' EXPONENT_NUM_PART + | (DEC_DIGIT+)? '.' (DEC_DIGIT+ EXPONENT_NUM_PART) + | DEC_DIGIT+ EXPONENT_NUM_PART; +NULL_SPEC_LITERAL: '\\' 'N'; +BIT_STRING: BIT_STRING_L; + + + +// Identifiers + +ID: ID_LITERAL; +DOUBLE_QUOTE_ID: DQUOTA_STRING; +BACKTICK_QUOTE_ID: BQUOTA_STRING; + + +// Fragments for Literal primitives +fragment EXPONENT_NUM_PART: 'E' [-+]? DEC_DIGIT+; +fragment DQUOTA_STRING: '"' ( '\\'. | '""' | ~('"'| '\\') )* '"'; +fragment SQUOTA_STRING: '\'' ('\\'. | '\'\'' | ~('\'' | '\\'))* '\''; +fragment BQUOTA_STRING: '`' ( '\\'. | '``' | ~('`'|'\\'))* '`'; +fragment HEX_DIGIT: [0-9A-F]; +fragment DEC_DIGIT: [0-9]; +fragment BIT_STRING_L: 'B' '\'' [01]+ '\''; + +// Identifiers cannot start with a single '_' since this an OpenSearch reserved +// metadata field. Two underscores (or more) is acceptable, such as '__field'. +fragment ID_LITERAL: ([@*A-Z_])+?[*A-Z_\-0-9]*; + +// Last tokens must generate Errors + +ERROR_RECOGNITION: . -> channel(ERRORCHANNEL); diff --git a/language-grammar/src/main/antlr4/OpenSearchSQLParser.g4 b/language-grammar/src/main/antlr4/OpenSearchSQLParser.g4 new file mode 100644 index 00000000000..5f7361160b3 --- /dev/null +++ b/language-grammar/src/main/antlr4/OpenSearchSQLParser.g4 @@ -0,0 +1,844 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +/* +MySQL (Positive Technologies) grammar +The MIT License (MIT). +Copyright (c) 2015-2017, Ivan Kochurkin (kvanttt@gmail.com), Positive Technologies. +Copyright (c) 2017, Ivan Khudyashev (IHudyashov@ptsecurity.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +parser grammar OpenSearchSQLParser; + + +options { tokenVocab = OpenSearchSQLLexer; } +// Top Level Description + +// Root rule + +root + : sqlStatement? SEMI? EOF + ; + +// Only SELECT +sqlStatement + : dmlStatement + | adminStatement + ; + +dmlStatement + : selectStatement + ; + +// Data Manipulation Language + +// Primary DML Statements +selectStatement + : querySpecification # simpleSelect + ; + +adminStatement + : showStatement + | describeStatement + ; + +showStatement + : SHOW TABLES tableFilter + ; + +describeStatement + : DESCRIBE TABLES tableFilter columnFilter? + ; + +columnFilter + : COLUMNS LIKE showDescribePattern + ; + +tableFilter + : LIKE showDescribePattern + ; + +showDescribePattern + : stringLiteral + ; + +// Select Statement's Details +querySpecification + : selectClause fromClause? limitClause? + ; + +selectClause + : SELECT selectSpec? selectElements + ; + +selectSpec + : (ALL | DISTINCT) + ; + +selectElements + : (star = STAR | selectElement) (COMMA selectElement)* + ; + +selectElement + : expression (AS? alias)? + ; + +fromClause + : FROM relation (whereClause)? (groupByClause)? (havingClause)? (orderByClause)? // Place it under FROM for now but actually not necessary ex. A UNION B ORDER BY + + ; + +relation + : tableName (AS? alias)? # tableAsRelation + | LR_BRACKET subquery = querySpecification RR_BRACKET AS? alias # subqueryAsRelation + ; + +whereClause + : WHERE expression + ; + +groupByClause + : GROUP BY groupByElements + ; + +groupByElements + : groupByElement (COMMA groupByElement)* + ; + +groupByElement + : expression + ; + +havingClause + : HAVING expression + ; + +orderByClause + : ORDER BY orderByElement (COMMA orderByElement)* + ; + +orderByElement + : expression order = (ASC | DESC)? (NULLS (FIRST | LAST))? + ; + +limitClause + : LIMIT (offset = decimalLiteral COMMA)? limit = decimalLiteral + | LIMIT limit = decimalLiteral OFFSET offset = decimalLiteral + ; + +// Window Function's Details +windowFunctionClause + : function = windowFunction overClause + ; + +windowFunction + : functionName = (ROW_NUMBER | RANK | DENSE_RANK) LR_BRACKET functionArgs? RR_BRACKET # scalarWindowFunction + | aggregateFunction # aggregateWindowFunction + ; + +overClause + : OVER LR_BRACKET partitionByClause? orderByClause? RR_BRACKET + ; + +partitionByClause + : PARTITION BY expression (COMMA expression)* + ; + +// Literals +constant + : stringLiteral # string + | sign? decimalLiteral # signedDecimal + | sign? realLiteral # signedReal + | booleanLiteral # boolean + | datetimeLiteral # datetime + | intervalLiteral # interval + | nullLiteral # null + // Doesn't support the following types for now + //| BIT_STRING + //| NOT? nullLiteral=(NULL_LITERAL | NULL_SPEC_LITERAL) + ; + +decimalLiteral + : DECIMAL_LITERAL + | ZERO_DECIMAL + | ONE_DECIMAL + | TWO_DECIMAL + ; + +numericLiteral + : decimalLiteral + | realLiteral + ; + +stringLiteral + : STRING_LITERAL + | DOUBLE_QUOTE_ID + ; + +booleanLiteral + : TRUE + | FALSE + ; + +realLiteral + : REAL_LITERAL + ; + +sign + : PLUS + | MINUS + ; + +nullLiteral + : NULL_LITERAL + ; + +// Date and Time Literal, follow ANSI 92 +datetimeLiteral + : dateLiteral + | timeLiteral + | timestampLiteral + ; + +dateLiteral + : DATE date = stringLiteral + | LEFT_BRACE (DATE | D) date = stringLiteral RIGHT_BRACE + ; + +timeLiteral + : TIME time = stringLiteral + | LEFT_BRACE (TIME | T) time = stringLiteral RIGHT_BRACE + ; + +timestampLiteral + : TIMESTAMP timestamp = stringLiteral + | LEFT_BRACE (TIMESTAMP | TS) timestamp = stringLiteral RIGHT_BRACE + ; + +// Actually, these constants are shortcuts to the corresponding functions +datetimeConstantLiteral + : CURRENT_DATE + | CURRENT_TIME + | CURRENT_TIMESTAMP + | LOCALTIME + | LOCALTIMESTAMP + | UTC_TIMESTAMP + | UTC_DATE + | UTC_TIME + ; + +intervalLiteral + : INTERVAL expression intervalUnit + ; + +intervalUnit + : MICROSECOND + | SECOND + | MINUTE + | HOUR + | DAY + | WEEK + | MONTH + | QUARTER + | YEAR + | SECOND_MICROSECOND + | MINUTE_MICROSECOND + | MINUTE_SECOND + | HOUR_MICROSECOND + | HOUR_SECOND + | HOUR_MINUTE + | DAY_MICROSECOND + | DAY_SECOND + | DAY_MINUTE + | DAY_HOUR + | YEAR_MONTH + ; + +// predicates + +// Simplified approach for expression +expression + : NOT expression # notExpression + | left = expression AND right = expression # andExpression + | left = expression OR right = expression # orExpression + | predicate # predicateExpression + ; + +predicate + : expressionAtom # expressionAtomPredicate + | left = predicate comparisonOperator right = predicate # binaryComparisonPredicate + | predicate IS nullNotnull # isNullPredicate + | predicate NOT? BETWEEN predicate AND predicate # betweenPredicate + | left = predicate NOT? LIKE right = predicate # likePredicate + | left = predicate REGEXP right = predicate # regexpPredicate + | predicate NOT? IN '(' expressions ')' # inPredicate + ; + +expressions + : expression (',' expression)* + ; + +expressionAtom + : constant # constantExpressionAtom + | columnName # fullColumnNameExpressionAtom + | functionCall # functionCallExpressionAtom + | LR_BRACKET expression RR_BRACKET # nestedExpressionAtom + | left = expressionAtom mathOperator = (STAR | SLASH | MODULE) right = expressionAtom # mathExpressionAtom + | left = expressionAtom mathOperator = (PLUS | MINUS) right = expressionAtom # mathExpressionAtom + ; + +comparisonOperator + : '=' + | '>' + | '<' + | '<' '=' + | '>' '=' + | '<' '>' + | '!' '=' + ; + +nullNotnull + : NOT? NULL_LITERAL + ; + +functionCall + : nestedFunctionName LR_BRACKET allTupleFields RR_BRACKET # nestedAllFunctionCall + | scalarFunctionName LR_BRACKET functionArgs RR_BRACKET # scalarFunctionCall + | specificFunction # specificFunctionCall + | windowFunctionClause # windowFunctionCall + | aggregateFunction # aggregateFunctionCall + | aggregateFunction (orderByClause)? filterClause # filteredAggregationFunctionCall + | scoreRelevanceFunction # scoreRelevanceFunctionCall + | relevanceFunction # relevanceFunctionCall + | highlightFunction # highlightFunctionCall + | positionFunction # positionFunctionCall + | extractFunction # extractFunctionCall + | getFormatFunction # getFormatFunctionCall + | timestampFunction # timestampFunctionCall + ; + +timestampFunction + : timestampFunctionName LR_BRACKET simpleDateTimePart COMMA firstArg = functionArg COMMA secondArg = functionArg RR_BRACKET + ; + +timestampFunctionName + : TIMESTAMPADD + | TIMESTAMPDIFF + ; + +getFormatFunction + : GET_FORMAT LR_BRACKET getFormatType COMMA functionArg RR_BRACKET + ; + +getFormatType + : DATE + | DATETIME + | TIME + | TIMESTAMP + ; + +extractFunction + : EXTRACT LR_BRACKET datetimePart FROM functionArg RR_BRACKET + ; + +simpleDateTimePart + : MICROSECOND + | SECOND + | MINUTE + | HOUR + | DAY + | WEEK + | MONTH + | QUARTER + | YEAR + ; + +complexDateTimePart + : SECOND_MICROSECOND + | MINUTE_MICROSECOND + | MINUTE_SECOND + | HOUR_MICROSECOND + | HOUR_SECOND + | HOUR_MINUTE + | DAY_MICROSECOND + | DAY_SECOND + | DAY_MINUTE + | DAY_HOUR + | YEAR_MONTH + ; + +datetimePart + : simpleDateTimePart + | complexDateTimePart + ; + +highlightFunction + : HIGHLIGHT LR_BRACKET relevanceField (COMMA highlightArg)* RR_BRACKET + ; + +positionFunction + : POSITION LR_BRACKET functionArg IN functionArg RR_BRACKET + ; + +matchQueryAltSyntaxFunction + : field = relevanceField EQUAL_SYMBOL MATCH_QUERY LR_BRACKET query = relevanceQuery RR_BRACKET + ; + +scalarFunctionName + : mathematicalFunctionName + | dateTimeFunctionName + | textFunctionName + | flowControlFunctionName + | systemFunctionName + | nestedFunctionName + ; + +specificFunction + : CASE expression caseFuncAlternative+ (ELSE elseArg = functionArg)? END # caseFunctionCall + | CASE caseFuncAlternative+ (ELSE elseArg = functionArg)? END # caseFunctionCall + | CAST '(' expression AS convertedDataType ')' # dataTypeFunctionCall + ; + +relevanceFunction + : noFieldRelevanceFunction + | singleFieldRelevanceFunction + | multiFieldRelevanceFunction + | altSingleFieldRelevanceFunction + | altMultiFieldRelevanceFunction + ; + +scoreRelevanceFunction + : scoreRelevanceFunctionName LR_BRACKET relevanceFunction (COMMA weight = relevanceFieldWeight)? RR_BRACKET + ; + +noFieldRelevanceFunction + : noFieldRelevanceFunctionName LR_BRACKET query = relevanceQuery (COMMA relevanceArg)* RR_BRACKET + ; + +// Field is a single column +singleFieldRelevanceFunction + : singleFieldRelevanceFunctionName LR_BRACKET field = relevanceField COMMA query = relevanceQuery (COMMA relevanceArg)* RR_BRACKET + ; + +// Field is a list of columns +multiFieldRelevanceFunction + : multiFieldRelevanceFunctionName LR_BRACKET LT_SQR_PRTHS field = relevanceFieldAndWeight (COMMA field = relevanceFieldAndWeight)* RT_SQR_PRTHS COMMA query = relevanceQuery (COMMA relevanceArg)* RR_BRACKET + | multiFieldRelevanceFunctionName LR_BRACKET alternateMultiMatchQuery COMMA alternateMultiMatchField (COMMA relevanceArg)* RR_BRACKET + ; + +altSingleFieldRelevanceFunction + : field = relevanceField EQUAL_SYMBOL altSyntaxFunctionName = altSingleFieldRelevanceFunctionName LR_BRACKET query = relevanceQuery (COMMA relevanceArg)* RR_BRACKET + ; + +altMultiFieldRelevanceFunction + : field = relevanceField EQUAL_SYMBOL altSyntaxFunctionName = altMultiFieldRelevanceFunctionName LR_BRACKET query = relevanceQuery (COMMA relevanceArg)* RR_BRACKET + ; + +convertedDataType + : typeName = DATE + | typeName = TIME + | typeName = TIMESTAMP + | typeName = INT + | typeName = INTEGER + | typeName = DOUBLE + | typeName = LONG + | typeName = FLOAT + | typeName = STRING + | typeName = BOOLEAN + ; + +caseFuncAlternative + : WHEN condition = functionArg THEN consequent = functionArg + ; + +aggregateFunction + : functionName = aggregationFunctionName LR_BRACKET functionArg RR_BRACKET # regularAggregateFunctionCall + | COUNT LR_BRACKET STAR RR_BRACKET # countStarFunctionCall + | COUNT LR_BRACKET DISTINCT functionArg RR_BRACKET # distinctCountFunctionCall + | percentileApproxFunction # percentileApproxFunctionCall + ; + +percentileApproxFunction + : (PERCENTILE | PERCENTILE_APPROX) LR_BRACKET aggField = functionArg + COMMA percent = numericLiteral (COMMA compression = numericLiteral)? RR_BRACKET + ; + +filterClause + : FILTER LR_BRACKET WHERE expression RR_BRACKET + ; + +aggregationFunctionName + : AVG + | COUNT + | SUM + | MIN + | MAX + | VAR_POP + | VAR_SAMP + | VARIANCE + | STD + | STDDEV + | STDDEV_POP + | STDDEV_SAMP + ; + +mathematicalFunctionName + : ABS + | CBRT + | CEIL + | CEILING + | CONV + | CRC32 + | E + | EXP + | EXPM1 + | FLOOR + | LN + | LOG + | LOG10 + | LOG2 + | MOD + | PI + | POW + | POWER + | RAND + | RINT + | ROUND + | SIGN + | SIGNUM + | SQRT + | TRUNCATE + | trigonometricFunctionName + | arithmeticFunctionName + ; + +trigonometricFunctionName + : ACOS + | ASIN + | ATAN + | ATAN2 + | COS + | COSH + | COT + | DEGREES + | RADIANS + | SIN + | SINH + | TAN + ; + +arithmeticFunctionName + : ADD + | SUBTRACT + | MULTIPLY + | DIVIDE + | MOD + | MODULUS + ; + +dateTimeFunctionName + : datetimeConstantLiteral + | ADDDATE + | ADDTIME + | CONVERT_TZ + | CURDATE + | CURTIME + | DATE + | DATE_ADD + | DATE_FORMAT + | DATE_SUB + | DATEDIFF + | DATETIME + | DAY + | DAYNAME + | DAYOFMONTH + | DAY_OF_MONTH + | DAYOFWEEK + | DAYOFYEAR + | DAY_OF_YEAR + | DAY_OF_WEEK + | FROM_DAYS + | FROM_UNIXTIME + | HOUR + | HOUR_OF_DAY + | LAST_DAY + | MAKEDATE + | MAKETIME + | MICROSECOND + | MINUTE + | MINUTE_OF_DAY + | MINUTE_OF_HOUR + | MONTH + | MONTHNAME + | MONTH_OF_YEAR + | NOW + | PERIOD_ADD + | PERIOD_DIFF + | QUARTER + | SEC_TO_TIME + | SECOND + | SECOND_OF_MINUTE + | SUBDATE + | SUBTIME + | SYSDATE + | STR_TO_DATE + | TIME + | TIME_FORMAT + | TIME_TO_SEC + | TIMEDIFF + | TIMESTAMP + | TO_DAYS + | TO_SECONDS + | UNIX_TIMESTAMP + | WEEK + | WEEKDAY + | WEEK_OF_YEAR + | WEEKOFYEAR + | YEAR + | YEARWEEK + ; + +textFunctionName + : SUBSTR + | SUBSTRING + | TRIM + | LTRIM + | RTRIM + | LOWER + | UPPER + | CONCAT + | CONCAT_WS + | SUBSTR + | LENGTH + | STRCMP + | RIGHT + | LEFT + | ASCII + | LOCATE + | REPLACE + | REVERSE + ; + +flowControlFunctionName + : IF + | IFNULL + | NULLIF + | ISNULL + ; + +noFieldRelevanceFunctionName + : QUERY + ; + +systemFunctionName + : TYPEOF + ; + +nestedFunctionName + : NESTED + ; + +scoreRelevanceFunctionName + : SCORE + | SCOREQUERY + | SCORE_QUERY + ; + +singleFieldRelevanceFunctionName + : MATCH + | MATCHQUERY + | MATCH_QUERY + | MATCH_PHRASE + | MATCHPHRASE + | MATCHPHRASEQUERY + | MATCH_BOOL_PREFIX + | MATCH_PHRASE_PREFIX + | WILDCARD_QUERY + | WILDCARDQUERY + ; + +multiFieldRelevanceFunctionName + : MULTI_MATCH + | MULTIMATCH + | MULTIMATCHQUERY + | SIMPLE_QUERY_STRING + | QUERY_STRING + ; + +altSingleFieldRelevanceFunctionName + : MATCH_QUERY + | MATCHQUERY + | MATCH_PHRASE + | MATCHPHRASE + ; + +altMultiFieldRelevanceFunctionName + : MULTI_MATCH + | MULTIMATCH + ; + +functionArgs + : (functionArg (COMMA functionArg)*)? + ; + +functionArg + : expression + ; + +relevanceArg + : relevanceArgName EQUAL_SYMBOL relevanceArgValue + | argName = stringLiteral EQUAL_SYMBOL argVal = relevanceArgValue + ; + +highlightArg + : highlightArgName EQUAL_SYMBOL highlightArgValue + ; + +relevanceArgName + : ALLOW_LEADING_WILDCARD + | ANALYZER + | ANALYZE_WILDCARD + | AUTO_GENERATE_SYNONYMS_PHRASE_QUERY + | BOOST + | CASE_INSENSITIVE + | CUTOFF_FREQUENCY + | DEFAULT_FIELD + | DEFAULT_OPERATOR + | ENABLE_POSITION_INCREMENTS + | ESCAPE + | FIELDS + | FLAGS + | FUZZINESS + | FUZZY_MAX_EXPANSIONS + | FUZZY_PREFIX_LENGTH + | FUZZY_REWRITE + | FUZZY_TRANSPOSITIONS + | LENIENT + | LOW_FREQ_OPERATOR + | MAX_DETERMINIZED_STATES + | MAX_EXPANSIONS + | MINIMUM_SHOULD_MATCH + | OPERATOR + | PHRASE_SLOP + | PREFIX_LENGTH + | QUOTE_ANALYZER + | QUOTE_FIELD_SUFFIX + | REWRITE + | SLOP + | TIE_BREAKER + | TIME_ZONE + | TYPE + | ZERO_TERMS_QUERY + ; + +highlightArgName + : HIGHLIGHT_POST_TAGS + | HIGHLIGHT_PRE_TAGS + ; + +relevanceFieldAndWeight + : field = relevanceField + | field = relevanceField weight = relevanceFieldWeight + | field = relevanceField BIT_XOR_OP weight = relevanceFieldWeight + ; + +relevanceFieldWeight + : numericLiteral + ; + +relevanceField + : qualifiedName + | stringLiteral + ; + +relevanceQuery + : relevanceArgValue + ; + +relevanceArgValue + : qualifiedName + | constant + ; + +highlightArgValue + : stringLiteral + ; + +alternateMultiMatchArgName + : FIELDS + | QUERY + | stringLiteral + ; + +alternateMultiMatchQuery + : argName = alternateMultiMatchArgName EQUAL_SYMBOL argVal = relevanceArgValue + ; + +alternateMultiMatchField + : argName = alternateMultiMatchArgName EQUAL_SYMBOL argVal = relevanceArgValue + | argName = alternateMultiMatchArgName EQUAL_SYMBOL LT_SQR_PRTHS argVal = relevanceArgValue RT_SQR_PRTHS + ; + +// Identifiers +tableName + : qualifiedName + ; + +columnName + : qualifiedName + ; + +allTupleFields + : path = qualifiedName DOT STAR + ; + +alias + : ident + ; + +qualifiedName + : ident (DOT ident)* + ; + +ident + : DOT? ID + | BACKTICK_QUOTE_ID + | keywordsCanBeId + | scalarFunctionName + ; + +keywordsCanBeId + : FULL + | FIELD + | D + | T + | TS // OD SQL and ODBC special + | COUNT + | SUM + | AVG + | MAX + | MIN + | FIRST + | LAST + | TYPE // TODO: Type is keyword required by relevancy function. Remove this when relevancy functions moved out + ; diff --git a/language-grammar/src/main/antlr4/SparkSqlBase.g4 b/language-grammar/src/main/antlr4/SparkSqlBase.g4 new file mode 100644 index 00000000000..c53c61adfde --- /dev/null +++ b/language-grammar/src/main/antlr4/SparkSqlBase.g4 @@ -0,0 +1,246 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * This file contains code from the Apache Spark project (original license below). + * It contains modifications, which are licensed as above: + */ + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +grammar SparkSqlBase; + +// Copy from Spark 3.3.1 SqlBaseParser.g4 and SqlBaseLexer.g4 + +@members { + /** + * When true, parser should throw ParseExcetion for unclosed bracketed comment. + */ + public boolean has_unclosed_bracketed_comment = false; + + /** + * Verify whether current token is a valid decimal token (which contains dot). + * Returns true if the character that follows the token is not a digit or letter or underscore. + * + * For example: + * For char stream "2.3", "2." is not a valid decimal token, because it is followed by digit '3'. + * For char stream "2.3_", "2.3" is not a valid decimal token, because it is followed by '_'. + * For char stream "2.3W", "2.3" is not a valid decimal token, because it is followed by 'W'. + * For char stream "12.0D 34.E2+0.12 " 12.0D is a valid decimal token because it is followed + * by a space. 34.E2 is a valid decimal token because it is followed by symbol '+' + * which is not a digit or letter or underscore. + */ + public boolean isValidDecimal() { + int nextChar = _input.LA(1); + if (nextChar >= 'A' && nextChar <= 'Z' || nextChar >= '0' && nextChar <= '9' || + nextChar == '_') { + return false; + } else { + return true; + } + } + + /** + * This method will be called when we see '/*' and try to match it as a bracketed comment. + * If the next character is '+', it should be parsed as hint later, and we cannot match + * it as a bracketed comment. + * + * Returns true if the next character is '+'. + */ + public boolean isHint() { + int nextChar = _input.LA(1); + if (nextChar == '+') { + return true; + } else { + return false; + } + } + + /** + * This method will be called when the character stream ends and try to find out the + * unclosed bracketed comment. + * If the method be called, it means the end of the entire character stream match, + * and we set the flag and fail later. + */ + public void markUnclosedComment() { + has_unclosed_bracketed_comment = true; + } +} + + +multipartIdentifierPropertyList + : multipartIdentifierProperty (COMMA multipartIdentifierProperty)* + ; + +multipartIdentifierProperty + : multipartIdentifier (options=propertyList)? + ; + +propertyList + : property (COMMA property)* + ; + +property + : key=propertyKey (EQ? value=propertyValue)? + ; + +propertyKey + : identifier (DOT identifier)* + | STRING + ; + +propertyValue + : INTEGER_VALUE + | DECIMAL_VALUE + | booleanValue + | STRING + ; + +booleanValue + : TRUE | FALSE + ; + + +multipartIdentifier + : parts+=identifier (DOT parts+=identifier)* + ; + +identifier + : IDENTIFIER #unquotedIdentifier + | quotedIdentifier #quotedIdentifierAlternative + | nonReserved #unquotedIdentifier + ; + +quotedIdentifier + : BACKQUOTED_IDENTIFIER + ; + +nonReserved + : DROP | SKIPPING | INDEX + ; + + +// Flint lexical tokens + +BLOOM_FILTER: 'BLOOM_FILTER'; +MIN_MAX: 'MIN_MAX'; +SKIPPING: 'SKIPPING'; +VALUE_SET: 'VALUE_SET'; + + +// Spark lexical tokens + +SEMICOLON: ';'; + +LEFT_PAREN: '('; +RIGHT_PAREN: ')'; +COMMA: ','; +DOT: '.'; + + +AS: 'AS'; +ALTER: 'ALTER'; +ANALYZE: 'ANALYZE'; +CREATE: 'CREATE'; +DESC: 'DESC'; +DESCRIBE: 'DESCRIBE'; +DROP: 'DROP'; +EXISTS: 'EXISTS'; +EXTENDED: 'EXTENDED'; +FALSE: 'FALSE'; +FLINT: 'FLINT'; +IF: 'IF'; +IN: 'IN'; +INDEX: 'INDEX'; +INDEXES: 'INDEXES'; +JOB: 'JOB'; +MATERIALIZED: 'MATERIALIZED'; +NOT: 'NOT'; +ON: 'ON'; +PARTITION: 'PARTITION'; +RECOVER: 'RECOVER'; +REFRESH: 'REFRESH'; +SHOW: 'SHOW'; +TRUE: 'TRUE'; +VACUUM: 'VACUUM'; +VIEW: 'VIEW'; +VIEWS: 'VIEWS'; +WHERE: 'WHERE'; +WITH: 'WITH'; + + +EQ : '=' | '=='; +MINUS: '-'; + + +STRING + : '\'' ( ~('\''|'\\') | ('\\' .) )* '\'' + | '"' ( ~('"'|'\\') | ('\\' .) )* '"' + | 'R\'' (~'\'')* '\'' + | 'R"'(~'"')* '"' + ; + +INTEGER_VALUE + : DIGIT+ + ; + +DECIMAL_VALUE + : DECIMAL_DIGITS {isValidDecimal()}? + ; + +IDENTIFIER + : (LETTER | DIGIT | '_')+ + ; + +BACKQUOTED_IDENTIFIER + : '`' ( ~'`' | '``' )* '`' + ; + +fragment DECIMAL_DIGITS + : DIGIT+ '.' DIGIT* + | '.' DIGIT+ + ; + +fragment DIGIT + : [0-9] + ; + +fragment LETTER + : [A-Z] + ; + +SIMPLE_COMMENT + : '--' ('\\\n' | ~[\r\n])* '\r'? '\n'? -> channel(HIDDEN) + ; + +BRACKETED_COMMENT + : '/*' {!isHint()}? ( BRACKETED_COMMENT | . )*? ('*/' | {markUnclosedComment();} EOF) -> channel(HIDDEN) + ; + +WS + : [ \r\n\t]+ -> channel(HIDDEN) + ; + +// Catch-all for anything we can't recognize. +// We use this to be able to ignore and recover all the text +// when splitting statements with DelimiterLexer +UNRECOGNIZED + : . + ; \ No newline at end of file diff --git a/language-grammar/src/main/antlr4/SqlBaseLexer.g4 b/language-grammar/src/main/antlr4/SqlBaseLexer.g4 new file mode 100644 index 00000000000..fb440ef8d37 --- /dev/null +++ b/language-grammar/src/main/antlr4/SqlBaseLexer.g4 @@ -0,0 +1,546 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * This file is an adaptation of Presto's presto-parser/src/main/antlr4/com/facebook/presto/sql/parser/SqlBase.g4 grammar. + */ + +lexer grammar SqlBaseLexer; + +@members { + /** + * When true, parser should throw ParseException for unclosed bracketed comment. + */ + public boolean has_unclosed_bracketed_comment = false; + + /** + * Verify whether current token is a valid decimal token (which contains dot). + * Returns true if the character that follows the token is not a digit or letter or underscore. + * + * For example: + * For char stream "2.3", "2." is not a valid decimal token, because it is followed by digit '3'. + * For char stream "2.3_", "2.3" is not a valid decimal token, because it is followed by '_'. + * For char stream "2.3W", "2.3" is not a valid decimal token, because it is followed by 'W'. + * For char stream "12.0D 34.E2+0.12 " 12.0D is a valid decimal token because it is followed + * by a space. 34.E2 is a valid decimal token because it is followed by symbol '+' + * which is not a digit or letter or underscore. + */ + public boolean isValidDecimal() { + int nextChar = _input.LA(1); + if (nextChar >= 'A' && nextChar <= 'Z' || nextChar >= '0' && nextChar <= '9' || + nextChar == '_') { + return false; + } else { + return true; + } + } + + /** + * This method will be called when we see '/*' and try to match it as a bracketed comment. + * If the next character is '+', it should be parsed as hint later, and we cannot match + * it as a bracketed comment. + * + * Returns true if the next character is '+'. + */ + public boolean isHint() { + int nextChar = _input.LA(1); + if (nextChar == '+') { + return true; + } else { + return false; + } + } + + /** + * This method will be called when the character stream ends and try to find out the + * unclosed bracketed comment. + * If the method be called, it means the end of the entire character stream match, + * and we set the flag and fail later. + */ + public void markUnclosedComment() { + has_unclosed_bracketed_comment = true; + } +} + +SEMICOLON: ';'; + +LEFT_PAREN: '('; +RIGHT_PAREN: ')'; +COMMA: ','; +DOT: '.'; +LEFT_BRACKET: '['; +RIGHT_BRACKET: ']'; + +// NOTE: If you add a new token in the list below, you should update the list of keywords +// and reserved tag in `docs/sql-ref-ansi-compliance.md#sql-keywords`, and +// modify `ParserUtils.toExprAlias()` which assumes all keywords are between `ADD` and `ZONE`. + +//============================ +// Start of the keywords list +//============================ +//--SPARK-KEYWORD-LIST-START +ADD: 'ADD'; +AFTER: 'AFTER'; +ALL: 'ALL'; +ALTER: 'ALTER'; +ALWAYS: 'ALWAYS'; +ANALYZE: 'ANALYZE'; +AND: 'AND'; +ANTI: 'ANTI'; +ANY: 'ANY'; +ANY_VALUE: 'ANY_VALUE'; +ARCHIVE: 'ARCHIVE'; +ARRAY: 'ARRAY'; +AS: 'AS'; +ASC: 'ASC'; +AT: 'AT'; +AUTHORIZATION: 'AUTHORIZATION'; +BETWEEN: 'BETWEEN'; +BIGINT: 'BIGINT'; +BINARY: 'BINARY'; +BOOLEAN: 'BOOLEAN'; +BOTH: 'BOTH'; +BUCKET: 'BUCKET'; +BUCKETS: 'BUCKETS'; +BY: 'BY'; +BYTE: 'BYTE'; +CACHE: 'CACHE'; +CASCADE: 'CASCADE'; +CASE: 'CASE'; +CAST: 'CAST'; +CATALOG: 'CATALOG'; +CATALOGS: 'CATALOGS'; +CHANGE: 'CHANGE'; +CHAR: 'CHAR'; +CHARACTER: 'CHARACTER'; +CHECK: 'CHECK'; +CLEAR: 'CLEAR'; +CLUSTER: 'CLUSTER'; +CLUSTERED: 'CLUSTERED'; +CODEGEN: 'CODEGEN'; +COLLATE: 'COLLATE'; +COLLECTION: 'COLLECTION'; +COLUMN: 'COLUMN'; +COLUMNS: 'COLUMNS'; +COMMENT: 'COMMENT'; +COMMIT: 'COMMIT'; +COMPACT: 'COMPACT'; +COMPACTIONS: 'COMPACTIONS'; +COMPUTE: 'COMPUTE'; +CONCATENATE: 'CONCATENATE'; +CONSTRAINT: 'CONSTRAINT'; +COST: 'COST'; +CREATE: 'CREATE'; +CROSS: 'CROSS'; +CUBE: 'CUBE'; +CURRENT: 'CURRENT'; +CURRENT_DATE: 'CURRENT_DATE'; +CURRENT_TIME: 'CURRENT_TIME'; +CURRENT_TIMESTAMP: 'CURRENT_TIMESTAMP'; +CURRENT_USER: 'CURRENT_USER'; +DAY: 'DAY'; +DAYS: 'DAYS'; +DAYOFYEAR: 'DAYOFYEAR'; +DATA: 'DATA'; +DATE: 'DATE'; +DATABASE: 'DATABASE'; +DATABASES: 'DATABASES'; +DATEADD: 'DATEADD'; +DATE_ADD: 'DATE_ADD'; +DATEDIFF: 'DATEDIFF'; +DATE_DIFF: 'DATE_DIFF'; +DBPROPERTIES: 'DBPROPERTIES'; +DEC: 'DEC'; +DECIMAL: 'DECIMAL'; +DEFAULT: 'DEFAULT'; +DEFINED: 'DEFINED'; +DELETE: 'DELETE'; +DELIMITED: 'DELIMITED'; +DESC: 'DESC'; +DESCRIBE: 'DESCRIBE'; +DFS: 'DFS'; +DIRECTORIES: 'DIRECTORIES'; +DIRECTORY: 'DIRECTORY'; +DISTINCT: 'DISTINCT'; +DISTRIBUTE: 'DISTRIBUTE'; +DIV: 'DIV'; +DOUBLE: 'DOUBLE'; +DROP: 'DROP'; +ELSE: 'ELSE'; +END: 'END'; +ESCAPE: 'ESCAPE'; +ESCAPED: 'ESCAPED'; +EXCEPT: 'EXCEPT'; +EXCHANGE: 'EXCHANGE'; +EXCLUDE: 'EXCLUDE'; +EXISTS: 'EXISTS'; +EXPLAIN: 'EXPLAIN'; +EXPORT: 'EXPORT'; +EXTENDED: 'EXTENDED'; +EXTERNAL: 'EXTERNAL'; +EXTRACT: 'EXTRACT'; +FALSE: 'FALSE'; +FETCH: 'FETCH'; +FIELDS: 'FIELDS'; +FILTER: 'FILTER'; +FILEFORMAT: 'FILEFORMAT'; +FIRST: 'FIRST'; +FLOAT: 'FLOAT'; +FOLLOWING: 'FOLLOWING'; +FOR: 'FOR'; +FOREIGN: 'FOREIGN'; +FORMAT: 'FORMAT'; +FORMATTED: 'FORMATTED'; +FROM: 'FROM'; +FULL: 'FULL'; +FUNCTION: 'FUNCTION'; +FUNCTIONS: 'FUNCTIONS'; +GENERATED: 'GENERATED'; +GLOBAL: 'GLOBAL'; +GRANT: 'GRANT'; +GROUP: 'GROUP'; +GROUPING: 'GROUPING'; +HAVING: 'HAVING'; +BINARY_HEX: 'X'; +HOUR: 'HOUR'; +HOURS: 'HOURS'; +IDENTIFIER_KW: 'IDENTIFIER'; +IF: 'IF'; +IGNORE: 'IGNORE'; +IMPORT: 'IMPORT'; +IN: 'IN'; +INCLUDE: 'INCLUDE'; +INDEX: 'INDEX'; +INDEXES: 'INDEXES'; +INNER: 'INNER'; +INPATH: 'INPATH'; +INPUTFORMAT: 'INPUTFORMAT'; +INSERT: 'INSERT'; +INTERSECT: 'INTERSECT'; +INTERVAL: 'INTERVAL'; +INT: 'INT'; +INTEGER: 'INTEGER'; +INTO: 'INTO'; +IS: 'IS'; +ITEMS: 'ITEMS'; +JOIN: 'JOIN'; +KEYS: 'KEYS'; +LAST: 'LAST'; +LATERAL: 'LATERAL'; +LAZY: 'LAZY'; +LEADING: 'LEADING'; +LEFT: 'LEFT'; +LIKE: 'LIKE'; +ILIKE: 'ILIKE'; +LIMIT: 'LIMIT'; +LINES: 'LINES'; +LIST: 'LIST'; +LOAD: 'LOAD'; +LOCAL: 'LOCAL'; +LOCATION: 'LOCATION'; +LOCK: 'LOCK'; +LOCKS: 'LOCKS'; +LOGICAL: 'LOGICAL'; +LONG: 'LONG'; +MACRO: 'MACRO'; +MAP: 'MAP'; +MATCHED: 'MATCHED'; +MERGE: 'MERGE'; +MICROSECOND: 'MICROSECOND'; +MICROSECONDS: 'MICROSECONDS'; +MILLISECOND: 'MILLISECOND'; +MILLISECONDS: 'MILLISECONDS'; +MINUTE: 'MINUTE'; +MINUTES: 'MINUTES'; +MONTH: 'MONTH'; +MONTHS: 'MONTHS'; +MSCK: 'MSCK'; +NAME: 'NAME'; +NAMESPACE: 'NAMESPACE'; +NAMESPACES: 'NAMESPACES'; +NANOSECOND: 'NANOSECOND'; +NANOSECONDS: 'NANOSECONDS'; +NATURAL: 'NATURAL'; +NO: 'NO'; +NOT: 'NOT' | '!'; +NULL: 'NULL'; +NULLS: 'NULLS'; +NUMERIC: 'NUMERIC'; +OF: 'OF'; +OFFSET: 'OFFSET'; +ON: 'ON'; +ONLY: 'ONLY'; +OPTION: 'OPTION'; +OPTIONS: 'OPTIONS'; +OR: 'OR'; +ORDER: 'ORDER'; +OUT: 'OUT'; +OUTER: 'OUTER'; +OUTPUTFORMAT: 'OUTPUTFORMAT'; +OVER: 'OVER'; +OVERLAPS: 'OVERLAPS'; +OVERLAY: 'OVERLAY'; +OVERWRITE: 'OVERWRITE'; +PARTITION: 'PARTITION'; +PARTITIONED: 'PARTITIONED'; +PARTITIONS: 'PARTITIONS'; +PERCENTILE_CONT: 'PERCENTILE_CONT'; +PERCENTILE_DISC: 'PERCENTILE_DISC'; +PERCENTLIT: 'PERCENT'; +PIVOT: 'PIVOT'; +PLACING: 'PLACING'; +POSITION: 'POSITION'; +PRECEDING: 'PRECEDING'; +PRIMARY: 'PRIMARY'; +PRINCIPALS: 'PRINCIPALS'; +PROPERTIES: 'PROPERTIES'; +PURGE: 'PURGE'; +QUARTER: 'QUARTER'; +QUERY: 'QUERY'; +RANGE: 'RANGE'; +REAL: 'REAL'; +RECORDREADER: 'RECORDREADER'; +RECORDWRITER: 'RECORDWRITER'; +RECOVER: 'RECOVER'; +REDUCE: 'REDUCE'; +REFERENCES: 'REFERENCES'; +REFRESH: 'REFRESH'; +RENAME: 'RENAME'; +REPAIR: 'REPAIR'; +REPEATABLE: 'REPEATABLE'; +REPLACE: 'REPLACE'; +RESET: 'RESET'; +RESPECT: 'RESPECT'; +RESTRICT: 'RESTRICT'; +REVOKE: 'REVOKE'; +RIGHT: 'RIGHT'; +RLIKE: 'RLIKE' | 'REGEXP'; +ROLE: 'ROLE'; +ROLES: 'ROLES'; +ROLLBACK: 'ROLLBACK'; +ROLLUP: 'ROLLUP'; +ROW: 'ROW'; +ROWS: 'ROWS'; +SECOND: 'SECOND'; +SECONDS: 'SECONDS'; +SCHEMA: 'SCHEMA'; +SCHEMAS: 'SCHEMAS'; +SELECT: 'SELECT'; +SEMI: 'SEMI'; +SEPARATED: 'SEPARATED'; +SERDE: 'SERDE'; +SERDEPROPERTIES: 'SERDEPROPERTIES'; +SESSION_USER: 'SESSION_USER'; +SET: 'SET'; +SETMINUS: 'MINUS'; +SETS: 'SETS'; +SHORT: 'SHORT'; +SHOW: 'SHOW'; +SKEWED: 'SKEWED'; +SMALLINT: 'SMALLINT'; +SOME: 'SOME'; +SORT: 'SORT'; +SORTED: 'SORTED'; +SOURCE: 'SOURCE'; +START: 'START'; +STATISTICS: 'STATISTICS'; +STORED: 'STORED'; +STRATIFY: 'STRATIFY'; +STRING: 'STRING'; +STRUCT: 'STRUCT'; +SUBSTR: 'SUBSTR'; +SUBSTRING: 'SUBSTRING'; +SYNC: 'SYNC'; +SYSTEM_TIME: 'SYSTEM_TIME'; +SYSTEM_VERSION: 'SYSTEM_VERSION'; +TABLE: 'TABLE'; +TABLES: 'TABLES'; +TABLESAMPLE: 'TABLESAMPLE'; +TARGET: 'TARGET'; +TBLPROPERTIES: 'TBLPROPERTIES'; +TEMPORARY: 'TEMPORARY' | 'TEMP'; +TERMINATED: 'TERMINATED'; +THEN: 'THEN'; +TIME: 'TIME'; +TIMESTAMP: 'TIMESTAMP'; +TIMESTAMP_LTZ: 'TIMESTAMP_LTZ'; +TIMESTAMP_NTZ: 'TIMESTAMP_NTZ'; +TIMESTAMPADD: 'TIMESTAMPADD'; +TIMESTAMPDIFF: 'TIMESTAMPDIFF'; +TINYINT: 'TINYINT'; +TO: 'TO'; +TOUCH: 'TOUCH'; +TRAILING: 'TRAILING'; +TRANSACTION: 'TRANSACTION'; +TRANSACTIONS: 'TRANSACTIONS'; +TRANSFORM: 'TRANSFORM'; +TRIM: 'TRIM'; +TRUE: 'TRUE'; +TRUNCATE: 'TRUNCATE'; +TRY_CAST: 'TRY_CAST'; +TYPE: 'TYPE'; +UNARCHIVE: 'UNARCHIVE'; +UNBOUNDED: 'UNBOUNDED'; +UNCACHE: 'UNCACHE'; +UNION: 'UNION'; +UNIQUE: 'UNIQUE'; +UNKNOWN: 'UNKNOWN'; +UNLOCK: 'UNLOCK'; +UNPIVOT: 'UNPIVOT'; +UNSET: 'UNSET'; +UPDATE: 'UPDATE'; +USE: 'USE'; +USER: 'USER'; +USING: 'USING'; +VALUES: 'VALUES'; +VARCHAR: 'VARCHAR'; +VERSION: 'VERSION'; +VIEW: 'VIEW'; +VIEWS: 'VIEWS'; +VOID: 'VOID'; +WEEK: 'WEEK'; +WEEKS: 'WEEKS'; +WHEN: 'WHEN'; +WHERE: 'WHERE'; +WINDOW: 'WINDOW'; +WITH: 'WITH'; +WITHIN: 'WITHIN'; +YEAR: 'YEAR'; +YEARS: 'YEARS'; +ZONE: 'ZONE'; +//--SPARK-KEYWORD-LIST-END +//============================ +// End of the keywords list +//============================ + +EQ : '=' | '=='; +NSEQ: '<=>'; +NEQ : '<>'; +NEQJ: '!='; +LT : '<'; +LTE : '<=' | '!>'; +GT : '>'; +GTE : '>=' | '!<'; + +PLUS: '+'; +MINUS: '-'; +ASTERISK: '*'; +SLASH: '/'; +PERCENT: '%'; +TILDE: '~'; +AMPERSAND: '&'; +PIPE: '|'; +CONCAT_PIPE: '||'; +HAT: '^'; +COLON: ':'; +ARROW: '->'; +FAT_ARROW : '=>'; +HENT_START: '/*+'; +HENT_END: '*/'; +QUESTION: '?'; + +STRING_LITERAL + : '\'' ( ~('\''|'\\') | ('\\' .) )* '\'' + | 'R\'' (~'\'')* '\'' + | 'R"'(~'"')* '"' + ; + +DOUBLEQUOTED_STRING + :'"' ( ~('"'|'\\') | ('\\' .) )* '"' + ; + +// NOTE: If you move a numeric literal, you should modify `ParserUtils.toExprAlias()` +// which assumes all numeric literals are between `BIGINT_LITERAL` and `BIGDECIMAL_LITERAL`. + +BIGINT_LITERAL + : DIGIT+ 'L' + ; + +SMALLINT_LITERAL + : DIGIT+ 'S' + ; + +TINYINT_LITERAL + : DIGIT+ 'Y' + ; + +INTEGER_VALUE + : DIGIT+ + ; + +EXPONENT_VALUE + : DIGIT+ EXPONENT + | DECIMAL_DIGITS EXPONENT {isValidDecimal()}? + ; + +DECIMAL_VALUE + : DECIMAL_DIGITS {isValidDecimal()}? + ; + +FLOAT_LITERAL + : DIGIT+ EXPONENT? 'F' + | DECIMAL_DIGITS EXPONENT? 'F' {isValidDecimal()}? + ; + +DOUBLE_LITERAL + : DIGIT+ EXPONENT? 'D' + | DECIMAL_DIGITS EXPONENT? 'D' {isValidDecimal()}? + ; + +BIGDECIMAL_LITERAL + : DIGIT+ EXPONENT? 'BD' + | DECIMAL_DIGITS EXPONENT? 'BD' {isValidDecimal()}? + ; + +IDENTIFIER + : (LETTER | DIGIT | '_')+ + ; + +BACKQUOTED_IDENTIFIER + : '`' ( ~'`' | '``' )* '`' + ; + +fragment DECIMAL_DIGITS + : DIGIT+ '.' DIGIT* + | '.' DIGIT+ + ; + +fragment EXPONENT + : 'E' [+-]? DIGIT+ + ; + +fragment DIGIT + : [0-9] + ; + +fragment LETTER + : [A-Z] + ; + +SIMPLE_COMMENT + : '--' ('\\\n' | ~[\r\n])* '\r'? '\n'? -> channel(HIDDEN) + ; + +BRACKETED_COMMENT + : '/*' {!isHint()}? ( BRACKETED_COMMENT | . )*? ('*/' | {markUnclosedComment();} EOF) -> channel(HIDDEN) + ; + +WS + : [ \r\n\t]+ -> channel(HIDDEN) + ; + +// Catch-all for anything we can't recognize. +// We use this to be able to ignore and recover all the text +// when splitting statements with DelimiterLexer +UNRECOGNIZED + : . + ; diff --git a/language-grammar/src/main/antlr4/SqlBaseParser.g4 b/language-grammar/src/main/antlr4/SqlBaseParser.g4 new file mode 100644 index 00000000000..04128216be0 --- /dev/null +++ b/language-grammar/src/main/antlr4/SqlBaseParser.g4 @@ -0,0 +1,1875 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * This file is an adaptation of Presto's presto-parser/src/main/antlr4/com/facebook/presto/sql/parser/SqlBase.g4 grammar. + */ + +parser grammar SqlBaseParser; + +options { tokenVocab = SqlBaseLexer; } + +@members { + /** + * When false, INTERSECT is given the greater precedence over the other set + * operations (UNION, EXCEPT and MINUS) as per the SQL standard. + */ + public boolean legacy_setops_precedence_enabled = false; + + /** + * When false, a literal with an exponent would be converted into + * double type rather than decimal type. + */ + public boolean legacy_exponent_literal_as_decimal_enabled = false; + + /** + * When true, the behavior of keywords follows ANSI SQL standard. + */ + public boolean SQL_standard_keyword_behavior = false; + + /** + * When true, double quoted literals are identifiers rather than STRINGs. + */ + public boolean double_quoted_identifiers = false; +} + +singleStatement + : statement SEMICOLON* EOF + ; + +singleExpression + : namedExpression EOF + ; + +singleTableIdentifier + : tableIdentifier EOF + ; + +singleMultipartIdentifier + : multipartIdentifier EOF + ; + +singleFunctionIdentifier + : functionIdentifier EOF + ; + +singleDataType + : dataType EOF + ; + +singleTableSchema + : colTypeList EOF + ; + +statement + : query #statementDefault + | ctes? dmlStatementNoWith #dmlStatement + | USE identifierReference #use + | USE namespace identifierReference #useNamespace + | SET CATALOG (identifier | stringLit) #setCatalog + | CREATE namespace (IF NOT EXISTS)? identifierReference + (commentSpec | + locationSpec | + (WITH (DBPROPERTIES | PROPERTIES) propertyList))* #createNamespace + | ALTER namespace identifierReference + SET (DBPROPERTIES | PROPERTIES) propertyList #setNamespaceProperties + | ALTER namespace identifierReference + SET locationSpec #setNamespaceLocation + | DROP namespace (IF EXISTS)? identifierReference + (RESTRICT | CASCADE)? #dropNamespace + | SHOW namespaces ((FROM | IN) multipartIdentifier)? + (LIKE? pattern=stringLit)? #showNamespaces + | createTableHeader (LEFT_PAREN createOrReplaceTableColTypeList RIGHT_PAREN)? tableProvider? + createTableClauses + (AS? query)? #createTable + | CREATE TABLE (IF NOT EXISTS)? target=tableIdentifier + LIKE source=tableIdentifier + (tableProvider | + rowFormat | + createFileFormat | + locationSpec | + (TBLPROPERTIES tableProps=propertyList))* #createTableLike + | replaceTableHeader (LEFT_PAREN createOrReplaceTableColTypeList RIGHT_PAREN)? tableProvider? + createTableClauses + (AS? query)? #replaceTable + | ANALYZE TABLE identifierReference partitionSpec? COMPUTE STATISTICS + (identifier | FOR COLUMNS identifierSeq | FOR ALL COLUMNS)? #analyze + | ANALYZE TABLES ((FROM | IN) identifierReference)? COMPUTE STATISTICS + (identifier)? #analyzeTables + | ALTER TABLE identifierReference + ADD (COLUMN | COLUMNS) + columns=qualifiedColTypeWithPositionList #addTableColumns + | ALTER TABLE identifierReference + ADD (COLUMN | COLUMNS) + LEFT_PAREN columns=qualifiedColTypeWithPositionList RIGHT_PAREN #addTableColumns + | ALTER TABLE table=identifierReference + RENAME COLUMN + from=multipartIdentifier TO to=errorCapturingIdentifier #renameTableColumn + | ALTER TABLE identifierReference + DROP (COLUMN | COLUMNS) (IF EXISTS)? + LEFT_PAREN columns=multipartIdentifierList RIGHT_PAREN #dropTableColumns + | ALTER TABLE identifierReference + DROP (COLUMN | COLUMNS) (IF EXISTS)? + columns=multipartIdentifierList #dropTableColumns + | ALTER (TABLE | VIEW) from=identifierReference + RENAME TO to=multipartIdentifier #renameTable + | ALTER (TABLE | VIEW) identifierReference + SET TBLPROPERTIES propertyList #setTableProperties + | ALTER (TABLE | VIEW) identifierReference + UNSET TBLPROPERTIES (IF EXISTS)? propertyList #unsetTableProperties + | ALTER TABLE table=identifierReference + (ALTER | CHANGE) COLUMN? column=multipartIdentifier + alterColumnAction? #alterTableAlterColumn + | ALTER TABLE table=identifierReference partitionSpec? + CHANGE COLUMN? + colName=multipartIdentifier colType colPosition? #hiveChangeColumn + | ALTER TABLE table=identifierReference partitionSpec? + REPLACE COLUMNS + LEFT_PAREN columns=qualifiedColTypeWithPositionList + RIGHT_PAREN #hiveReplaceColumns + | ALTER TABLE identifierReference (partitionSpec)? + SET SERDE stringLit (WITH SERDEPROPERTIES propertyList)? #setTableSerDe + | ALTER TABLE identifierReference (partitionSpec)? + SET SERDEPROPERTIES propertyList #setTableSerDe + | ALTER (TABLE | VIEW) identifierReference ADD (IF NOT EXISTS)? + partitionSpecLocation+ #addTablePartition + | ALTER TABLE identifierReference + from=partitionSpec RENAME TO to=partitionSpec #renameTablePartition + | ALTER (TABLE | VIEW) identifierReference + DROP (IF EXISTS)? partitionSpec (COMMA partitionSpec)* PURGE? #dropTablePartitions + | ALTER TABLE identifierReference + (partitionSpec)? SET locationSpec #setTableLocation + | ALTER TABLE identifierReference RECOVER PARTITIONS #recoverPartitions + | DROP TABLE (IF EXISTS)? identifierReference PURGE? #dropTable + | DROP VIEW (IF EXISTS)? identifierReference #dropView + | CREATE (OR REPLACE)? (GLOBAL? TEMPORARY)? + VIEW (IF NOT EXISTS)? identifierReference + identifierCommentList? + (commentSpec | + (PARTITIONED ON identifierList) | + (TBLPROPERTIES propertyList))* + AS query #createView + | CREATE (OR REPLACE)? GLOBAL? TEMPORARY VIEW + tableIdentifier (LEFT_PAREN colTypeList RIGHT_PAREN)? tableProvider + (OPTIONS propertyList)? #createTempViewUsing + | ALTER VIEW identifierReference AS? query #alterViewQuery + | CREATE (OR REPLACE)? TEMPORARY? FUNCTION (IF NOT EXISTS)? + identifierReference AS className=stringLit + (USING resource (COMMA resource)*)? #createFunction + | DROP TEMPORARY? FUNCTION (IF EXISTS)? identifierReference #dropFunction + | EXPLAIN (LOGICAL | FORMATTED | EXTENDED | CODEGEN | COST)? + statement #explain + | SHOW TABLES ((FROM | IN) identifierReference)? + (LIKE? pattern=stringLit)? #showTables + | SHOW TABLE EXTENDED ((FROM | IN) ns=identifierReference)? + LIKE pattern=stringLit partitionSpec? #showTableExtended + | SHOW TBLPROPERTIES table=identifierReference + (LEFT_PAREN key=propertyKey RIGHT_PAREN)? #showTblProperties + | SHOW COLUMNS (FROM | IN) table=identifierReference + ((FROM | IN) ns=multipartIdentifier)? #showColumns + | SHOW VIEWS ((FROM | IN) identifierReference)? + (LIKE? pattern=stringLit)? #showViews + | SHOW PARTITIONS identifierReference partitionSpec? #showPartitions + | SHOW identifier? FUNCTIONS ((FROM | IN) ns=identifierReference)? + (LIKE? (legacy=multipartIdentifier | pattern=stringLit))? #showFunctions + | SHOW CREATE TABLE identifierReference (AS SERDE)? #showCreateTable + | SHOW CURRENT namespace #showCurrentNamespace + | SHOW CATALOGS (LIKE? pattern=stringLit)? #showCatalogs + | (DESC | DESCRIBE) FUNCTION EXTENDED? describeFuncName #describeFunction + | (DESC | DESCRIBE) namespace EXTENDED? + identifierReference #describeNamespace + | (DESC | DESCRIBE) TABLE? option=(EXTENDED | FORMATTED)? + identifierReference partitionSpec? describeColName? #describeRelation + | (DESC | DESCRIBE) QUERY? query #describeQuery + | COMMENT ON namespace identifierReference IS + comment #commentNamespace + | COMMENT ON TABLE identifierReference IS comment #commentTable + | REFRESH TABLE identifierReference #refreshTable + | REFRESH FUNCTION identifierReference #refreshFunction + | REFRESH (stringLit | .*?) #refreshResource + | CACHE LAZY? TABLE identifierReference + (OPTIONS options=propertyList)? (AS? query)? #cacheTable + | UNCACHE TABLE (IF EXISTS)? identifierReference #uncacheTable + | CLEAR CACHE #clearCache + | LOAD DATA LOCAL? INPATH path=stringLit OVERWRITE? INTO TABLE + identifierReference partitionSpec? #loadData + | TRUNCATE TABLE identifierReference partitionSpec? #truncateTable + | (MSCK)? REPAIR TABLE identifierReference + (option=(ADD|DROP|SYNC) PARTITIONS)? #repairTable + | op=(ADD | LIST) identifier .*? #manageResource + | SET ROLE .*? #failNativeCommand + | SET TIME ZONE interval #setTimeZone + | SET TIME ZONE timezone #setTimeZone + | SET TIME ZONE .*? #setTimeZone + | SET configKey EQ configValue #setQuotedConfiguration + | SET configKey (EQ .*?)? #setConfiguration + | SET .*? EQ configValue #setQuotedConfiguration + | SET .*? #setConfiguration + | RESET configKey #resetQuotedConfiguration + | RESET .*? #resetConfiguration + | CREATE INDEX (IF NOT EXISTS)? identifier ON TABLE? + identifierReference (USING indexType=identifier)? + LEFT_PAREN columns=multipartIdentifierPropertyList RIGHT_PAREN + (OPTIONS options=propertyList)? #createIndex + | DROP INDEX (IF EXISTS)? identifier ON TABLE? identifierReference #dropIndex + | unsupportedHiveNativeCommands .*? #failNativeCommand + ; + +timezone + : stringLit + | LOCAL + ; + +configKey + : quotedIdentifier + ; + +configValue + : backQuotedIdentifier + ; + +unsupportedHiveNativeCommands + : kw1=CREATE kw2=ROLE + | kw1=DROP kw2=ROLE + | kw1=GRANT kw2=ROLE? + | kw1=REVOKE kw2=ROLE? + | kw1=SHOW kw2=GRANT + | kw1=SHOW kw2=ROLE kw3=GRANT? + | kw1=SHOW kw2=PRINCIPALS + | kw1=SHOW kw2=ROLES + | kw1=SHOW kw2=CURRENT kw3=ROLES + | kw1=EXPORT kw2=TABLE + | kw1=IMPORT kw2=TABLE + | kw1=SHOW kw2=COMPACTIONS + | kw1=SHOW kw2=CREATE kw3=TABLE + | kw1=SHOW kw2=TRANSACTIONS + | kw1=SHOW kw2=INDEXES + | kw1=SHOW kw2=LOCKS + | kw1=CREATE kw2=INDEX + | kw1=DROP kw2=INDEX + | kw1=ALTER kw2=INDEX + | kw1=LOCK kw2=TABLE + | kw1=LOCK kw2=DATABASE + | kw1=UNLOCK kw2=TABLE + | kw1=UNLOCK kw2=DATABASE + | kw1=CREATE kw2=TEMPORARY kw3=MACRO + | kw1=DROP kw2=TEMPORARY kw3=MACRO + | kw1=ALTER kw2=TABLE tableIdentifier kw3=NOT kw4=CLUSTERED + | kw1=ALTER kw2=TABLE tableIdentifier kw3=CLUSTERED kw4=BY + | kw1=ALTER kw2=TABLE tableIdentifier kw3=NOT kw4=SORTED + | kw1=ALTER kw2=TABLE tableIdentifier kw3=SKEWED kw4=BY + | kw1=ALTER kw2=TABLE tableIdentifier kw3=NOT kw4=SKEWED + | kw1=ALTER kw2=TABLE tableIdentifier kw3=NOT kw4=STORED kw5=AS kw6=DIRECTORIES + | kw1=ALTER kw2=TABLE tableIdentifier kw3=SET kw4=SKEWED kw5=LOCATION + | kw1=ALTER kw2=TABLE tableIdentifier kw3=EXCHANGE kw4=PARTITION + | kw1=ALTER kw2=TABLE tableIdentifier kw3=ARCHIVE kw4=PARTITION + | kw1=ALTER kw2=TABLE tableIdentifier kw3=UNARCHIVE kw4=PARTITION + | kw1=ALTER kw2=TABLE tableIdentifier kw3=TOUCH + | kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? kw3=COMPACT + | kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? kw3=CONCATENATE + | kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? kw3=SET kw4=FILEFORMAT + | kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? kw3=REPLACE kw4=COLUMNS + | kw1=START kw2=TRANSACTION + | kw1=COMMIT + | kw1=ROLLBACK + | kw1=DFS + ; + +createTableHeader + : CREATE TEMPORARY? EXTERNAL? TABLE (IF NOT EXISTS)? identifierReference + ; + +replaceTableHeader + : (CREATE OR)? REPLACE TABLE identifierReference + ; + +bucketSpec + : CLUSTERED BY identifierList + (SORTED BY orderedIdentifierList)? + INTO INTEGER_VALUE BUCKETS + ; + +skewSpec + : SKEWED BY identifierList + ON (constantList | nestedConstantList) + (STORED AS DIRECTORIES)? + ; + +locationSpec + : LOCATION stringLit + ; + +commentSpec + : COMMENT stringLit + ; + +query + : ctes? queryTerm queryOrganization + ; + +insertInto + : INSERT OVERWRITE TABLE? identifierReference (partitionSpec (IF NOT EXISTS)?)? ((BY NAME) | identifierList)? #insertOverwriteTable + | INSERT INTO TABLE? identifierReference partitionSpec? (IF NOT EXISTS)? ((BY NAME) | identifierList)? #insertIntoTable + | INSERT INTO TABLE? identifierReference REPLACE whereClause #insertIntoReplaceWhere + | INSERT OVERWRITE LOCAL? DIRECTORY path=stringLit rowFormat? createFileFormat? #insertOverwriteHiveDir + | INSERT OVERWRITE LOCAL? DIRECTORY (path=stringLit)? tableProvider (OPTIONS options=propertyList)? #insertOverwriteDir + ; + +partitionSpecLocation + : partitionSpec locationSpec? + ; + +partitionSpec + : PARTITION LEFT_PAREN partitionVal (COMMA partitionVal)* RIGHT_PAREN + ; + +partitionVal + : identifier (EQ constant)? + | identifier EQ DEFAULT + ; + +namespace + : NAMESPACE + | DATABASE + | SCHEMA + ; + +namespaces + : NAMESPACES + | DATABASES + | SCHEMAS + ; + +describeFuncName + : identifierReference + | stringLit + | comparisonOperator + | arithmeticOperator + | predicateOperator + ; + +describeColName + : nameParts+=identifier (DOT nameParts+=identifier)* + ; + +ctes + : WITH namedQuery (COMMA namedQuery)* + ; + +namedQuery + : name=errorCapturingIdentifier (columnAliases=identifierList)? AS? LEFT_PAREN query RIGHT_PAREN + ; + +tableProvider + : USING multipartIdentifier + ; + +createTableClauses + :((OPTIONS options=expressionPropertyList) | + (PARTITIONED BY partitioning=partitionFieldList) | + skewSpec | + bucketSpec | + rowFormat | + createFileFormat | + locationSpec | + commentSpec | + (TBLPROPERTIES tableProps=propertyList))* + ; + +propertyList + : LEFT_PAREN property (COMMA property)* RIGHT_PAREN + ; + +property + : key=propertyKey (EQ? value=propertyValue)? + ; + +propertyKey + : identifier (DOT identifier)* + | stringLit + ; + +propertyValue + : INTEGER_VALUE + | DECIMAL_VALUE + | booleanValue + | stringLit + ; + +expressionPropertyList + : LEFT_PAREN expressionProperty (COMMA expressionProperty)* RIGHT_PAREN + ; + +expressionProperty + : key=propertyKey (EQ? value=expression)? + ; + +constantList + : LEFT_PAREN constant (COMMA constant)* RIGHT_PAREN + ; + +nestedConstantList + : LEFT_PAREN constantList (COMMA constantList)* RIGHT_PAREN + ; + +createFileFormat + : STORED AS fileFormat + | STORED BY storageHandler + ; + +fileFormat + : INPUTFORMAT inFmt=stringLit OUTPUTFORMAT outFmt=stringLit #tableFileFormat + | identifier #genericFileFormat + ; + +storageHandler + : stringLit (WITH SERDEPROPERTIES propertyList)? + ; + +resource + : identifier stringLit + ; + +dmlStatementNoWith + : insertInto query #singleInsertQuery + | fromClause multiInsertQueryBody+ #multiInsertQuery + | DELETE FROM identifierReference tableAlias whereClause? #deleteFromTable + | UPDATE identifierReference tableAlias setClause whereClause? #updateTable + | MERGE INTO target=identifierReference targetAlias=tableAlias + USING (source=identifierReference | + LEFT_PAREN sourceQuery=query RIGHT_PAREN) sourceAlias=tableAlias + ON mergeCondition=booleanExpression + matchedClause* + notMatchedClause* + notMatchedBySourceClause* #mergeIntoTable + ; + +identifierReference + : IDENTIFIER_KW LEFT_PAREN expression RIGHT_PAREN + | multipartIdentifier + ; + +queryOrganization + : (ORDER BY order+=sortItem (COMMA order+=sortItem)*)? + (CLUSTER BY clusterBy+=expression (COMMA clusterBy+=expression)*)? + (DISTRIBUTE BY distributeBy+=expression (COMMA distributeBy+=expression)*)? + (SORT BY sort+=sortItem (COMMA sort+=sortItem)*)? + windowClause? + (LIMIT (ALL | limit=expression))? + (OFFSET offset=expression)? + ; + +multiInsertQueryBody + : insertInto fromStatementBody + ; + +queryTerm + : queryPrimary #queryTermDefault + | left=queryTerm {legacy_setops_precedence_enabled}? + operator=(INTERSECT | UNION | EXCEPT | SETMINUS) setQuantifier? right=queryTerm #setOperation + | left=queryTerm {!legacy_setops_precedence_enabled}? + operator=INTERSECT setQuantifier? right=queryTerm #setOperation + | left=queryTerm {!legacy_setops_precedence_enabled}? + operator=(UNION | EXCEPT | SETMINUS) setQuantifier? right=queryTerm #setOperation + ; + +queryPrimary + : querySpecification #queryPrimaryDefault + | fromStatement #fromStmt + | TABLE identifierReference #table + | inlineTable #inlineTableDefault1 + | LEFT_PAREN query RIGHT_PAREN #subquery + ; + +sortItem + : expression ordering=(ASC | DESC)? (NULLS nullOrder=(LAST | FIRST))? + ; + +fromStatement + : fromClause fromStatementBody+ + ; + +fromStatementBody + : transformClause + whereClause? + queryOrganization + | selectClause + lateralView* + whereClause? + aggregationClause? + havingClause? + windowClause? + queryOrganization + ; + +querySpecification + : transformClause + fromClause? + lateralView* + whereClause? + aggregationClause? + havingClause? + windowClause? #transformQuerySpecification + | selectClause + fromClause? + lateralView* + whereClause? + aggregationClause? + havingClause? + windowClause? #regularQuerySpecification + ; + +transformClause + : (SELECT kind=TRANSFORM LEFT_PAREN setQuantifier? expressionSeq RIGHT_PAREN + | kind=MAP setQuantifier? expressionSeq + | kind=REDUCE setQuantifier? expressionSeq) + inRowFormat=rowFormat? + (RECORDWRITER recordWriter=stringLit)? + USING script=stringLit + (AS (identifierSeq | colTypeList | (LEFT_PAREN (identifierSeq | colTypeList) RIGHT_PAREN)))? + outRowFormat=rowFormat? + (RECORDREADER recordReader=stringLit)? + ; + +selectClause + : SELECT (hints+=hint)* setQuantifier? namedExpressionSeq + ; + +setClause + : SET assignmentList + ; + +matchedClause + : WHEN MATCHED (AND matchedCond=booleanExpression)? THEN matchedAction + ; +notMatchedClause + : WHEN NOT MATCHED (BY TARGET)? (AND notMatchedCond=booleanExpression)? THEN notMatchedAction + ; + +notMatchedBySourceClause + : WHEN NOT MATCHED BY SOURCE (AND notMatchedBySourceCond=booleanExpression)? THEN notMatchedBySourceAction + ; + +matchedAction + : DELETE + | UPDATE SET ASTERISK + | UPDATE SET assignmentList + ; + +notMatchedAction + : INSERT ASTERISK + | INSERT LEFT_PAREN columns=multipartIdentifierList RIGHT_PAREN + VALUES LEFT_PAREN expression (COMMA expression)* RIGHT_PAREN + ; + +notMatchedBySourceAction + : DELETE + | UPDATE SET assignmentList + ; + +assignmentList + : assignment (COMMA assignment)* + ; + +assignment + : key=multipartIdentifier EQ value=expression + ; + +whereClause + : WHERE booleanExpression + ; + +havingClause + : HAVING booleanExpression + ; + +hint + : HENT_START hintStatements+=hintStatement (COMMA? hintStatements+=hintStatement)* HENT_END + ; + +hintStatement + : hintName=identifier + | hintName=identifier LEFT_PAREN parameters+=primaryExpression (COMMA parameters+=primaryExpression)* RIGHT_PAREN + ; + +fromClause + : FROM relation (COMMA relation)* lateralView* pivotClause? unpivotClause? + ; + +temporalClause + : FOR? (SYSTEM_VERSION | VERSION) AS OF version + | FOR? (SYSTEM_TIME | TIMESTAMP) AS OF timestamp=valueExpression + ; + +aggregationClause + : GROUP BY groupingExpressionsWithGroupingAnalytics+=groupByClause + (COMMA groupingExpressionsWithGroupingAnalytics+=groupByClause)* + | GROUP BY groupingExpressions+=expression (COMMA groupingExpressions+=expression)* ( + WITH kind=ROLLUP + | WITH kind=CUBE + | kind=GROUPING SETS LEFT_PAREN groupingSet (COMMA groupingSet)* RIGHT_PAREN)? + ; + +groupByClause + : groupingAnalytics + | expression + ; + +groupingAnalytics + : (ROLLUP | CUBE) LEFT_PAREN groupingSet (COMMA groupingSet)* RIGHT_PAREN + | GROUPING SETS LEFT_PAREN groupingElement (COMMA groupingElement)* RIGHT_PAREN + ; + +groupingElement + : groupingAnalytics + | groupingSet + ; + +groupingSet + : LEFT_PAREN (expression (COMMA expression)*)? RIGHT_PAREN + | expression + ; + +pivotClause + : PIVOT LEFT_PAREN aggregates=namedExpressionSeq FOR pivotColumn IN LEFT_PAREN pivotValues+=pivotValue (COMMA pivotValues+=pivotValue)* RIGHT_PAREN RIGHT_PAREN + ; + +pivotColumn + : identifiers+=identifier + | LEFT_PAREN identifiers+=identifier (COMMA identifiers+=identifier)* RIGHT_PAREN + ; + +pivotValue + : expression (AS? identifier)? + ; + +unpivotClause + : UNPIVOT nullOperator=unpivotNullClause? LEFT_PAREN + operator=unpivotOperator + RIGHT_PAREN (AS? identifier)? + ; + +unpivotNullClause + : (INCLUDE | EXCLUDE) NULLS + ; + +unpivotOperator + : (unpivotSingleValueColumnClause | unpivotMultiValueColumnClause) + ; + +unpivotSingleValueColumnClause + : unpivotValueColumn FOR unpivotNameColumn IN LEFT_PAREN unpivotColumns+=unpivotColumnAndAlias (COMMA unpivotColumns+=unpivotColumnAndAlias)* RIGHT_PAREN + ; + +unpivotMultiValueColumnClause + : LEFT_PAREN unpivotValueColumns+=unpivotValueColumn (COMMA unpivotValueColumns+=unpivotValueColumn)* RIGHT_PAREN + FOR unpivotNameColumn + IN LEFT_PAREN unpivotColumnSets+=unpivotColumnSet (COMMA unpivotColumnSets+=unpivotColumnSet)* RIGHT_PAREN + ; + +unpivotColumnSet + : LEFT_PAREN unpivotColumns+=unpivotColumn (COMMA unpivotColumns+=unpivotColumn)* RIGHT_PAREN unpivotAlias? + ; + +unpivotValueColumn + : identifier + ; + +unpivotNameColumn + : identifier + ; + +unpivotColumnAndAlias + : unpivotColumn unpivotAlias? + ; + +unpivotColumn + : multipartIdentifier + ; + +unpivotAlias + : AS? identifier + ; + +lateralView + : LATERAL VIEW (OUTER)? qualifiedName LEFT_PAREN (expression (COMMA expression)*)? RIGHT_PAREN tblName=identifier (AS? colName+=identifier (COMMA colName+=identifier)*)? + ; + +setQuantifier + : DISTINCT + | ALL + ; + +relation + : LATERAL? relationPrimary relationExtension* + ; + +relationExtension + : joinRelation + | pivotClause + | unpivotClause + ; + +joinRelation + : (joinType) JOIN LATERAL? right=relationPrimary joinCriteria? + | NATURAL joinType JOIN LATERAL? right=relationPrimary + ; + +joinType + : INNER? + | CROSS + | LEFT OUTER? + | LEFT? SEMI + | RIGHT OUTER? + | FULL OUTER? + | LEFT? ANTI + ; + +joinCriteria + : ON booleanExpression + | USING identifierList + ; + +sample + : TABLESAMPLE LEFT_PAREN sampleMethod? RIGHT_PAREN (REPEATABLE LEFT_PAREN seed=INTEGER_VALUE RIGHT_PAREN)? + ; + +sampleMethod + : negativeSign=MINUS? percentage=(INTEGER_VALUE | DECIMAL_VALUE) PERCENTLIT #sampleByPercentile + | expression ROWS #sampleByRows + | sampleType=BUCKET numerator=INTEGER_VALUE OUT OF denominator=INTEGER_VALUE + (ON (identifier | qualifiedName LEFT_PAREN RIGHT_PAREN))? #sampleByBucket + | bytes=expression #sampleByBytes + ; + +identifierList + : LEFT_PAREN identifierSeq RIGHT_PAREN + ; + +identifierSeq + : ident+=errorCapturingIdentifier (COMMA ident+=errorCapturingIdentifier)* + ; + +orderedIdentifierList + : LEFT_PAREN orderedIdentifier (COMMA orderedIdentifier)* RIGHT_PAREN + ; + +orderedIdentifier + : ident=errorCapturingIdentifier ordering=(ASC | DESC)? + ; + +identifierCommentList + : LEFT_PAREN identifierComment (COMMA identifierComment)* RIGHT_PAREN + ; + +identifierComment + : identifier commentSpec? + ; + +relationPrimary + : identifierReference temporalClause? + sample? tableAlias #tableName + | LEFT_PAREN query RIGHT_PAREN sample? tableAlias #aliasedQuery + | LEFT_PAREN relation RIGHT_PAREN sample? tableAlias #aliasedRelation + | inlineTable #inlineTableDefault2 + | functionTable #tableValuedFunction + ; + +inlineTable + : VALUES expression (COMMA expression)* tableAlias + ; + +functionTableSubqueryArgument + : TABLE identifierReference + | TABLE LEFT_PAREN identifierReference RIGHT_PAREN + | TABLE LEFT_PAREN query RIGHT_PAREN + ; + +functionTableNamedArgumentExpression + : key=identifier FAT_ARROW table=functionTableSubqueryArgument + ; + +functionTableReferenceArgument + : functionTableSubqueryArgument + | functionTableNamedArgumentExpression + ; + +functionTableArgument + : functionTableReferenceArgument + | functionArgument + ; + +functionTable + : funcName=functionName LEFT_PAREN + (functionTableArgument (COMMA functionTableArgument)*)? + RIGHT_PAREN tableAlias + ; + +tableAlias + : (AS? strictIdentifier identifierList?)? + ; + +rowFormat + : ROW FORMAT SERDE name=stringLit (WITH SERDEPROPERTIES props=propertyList)? #rowFormatSerde + | ROW FORMAT DELIMITED + (FIELDS TERMINATED BY fieldsTerminatedBy=stringLit (ESCAPED BY escapedBy=stringLit)?)? + (COLLECTION ITEMS TERMINATED BY collectionItemsTerminatedBy=stringLit)? + (MAP KEYS TERMINATED BY keysTerminatedBy=stringLit)? + (LINES TERMINATED BY linesSeparatedBy=stringLit)? + (NULL DEFINED AS nullDefinedAs=stringLit)? #rowFormatDelimited + ; + +multipartIdentifierList + : multipartIdentifier (COMMA multipartIdentifier)* + ; + +multipartIdentifier + : parts+=errorCapturingIdentifier (DOT parts+=errorCapturingIdentifier)* + ; + +multipartIdentifierPropertyList + : multipartIdentifierProperty (COMMA multipartIdentifierProperty)* + ; + +multipartIdentifierProperty + : multipartIdentifier (OPTIONS options=propertyList)? + ; + +tableIdentifier + : (db=errorCapturingIdentifier DOT)? table=errorCapturingIdentifier + ; + +functionIdentifier + : (db=errorCapturingIdentifier DOT)? function=errorCapturingIdentifier + ; + +namedExpression + : expression (AS? (name=errorCapturingIdentifier | identifierList))? + ; + +namedExpressionSeq + : namedExpression (COMMA namedExpression)* + ; + +partitionFieldList + : LEFT_PAREN fields+=partitionField (COMMA fields+=partitionField)* RIGHT_PAREN + ; + +partitionField + : transform #partitionTransform + | colType #partitionColumn + ; + +transform + : qualifiedName #identityTransform + | transformName=identifier + LEFT_PAREN argument+=transformArgument (COMMA argument+=transformArgument)* RIGHT_PAREN #applyTransform + ; + +transformArgument + : qualifiedName + | constant + ; + +expression + : booleanExpression + ; + +namedArgumentExpression + : key=identifier FAT_ARROW value=expression + ; + +functionArgument + : expression + | namedArgumentExpression + ; + +expressionSeq + : expression (COMMA expression)* + ; + +booleanExpression + : NOT booleanExpression #logicalNot + | EXISTS LEFT_PAREN query RIGHT_PAREN #exists + | valueExpression predicate? #predicated + | left=booleanExpression operator=AND right=booleanExpression #logicalBinary + | left=booleanExpression operator=OR right=booleanExpression #logicalBinary + ; + +predicate + : NOT? kind=BETWEEN lower=valueExpression AND upper=valueExpression + | NOT? kind=IN LEFT_PAREN expression (COMMA expression)* RIGHT_PAREN + | NOT? kind=IN LEFT_PAREN query RIGHT_PAREN + | NOT? kind=RLIKE pattern=valueExpression + | NOT? kind=(LIKE | ILIKE) quantifier=(ANY | SOME | ALL) (LEFT_PAREN RIGHT_PAREN | LEFT_PAREN expression (COMMA expression)* RIGHT_PAREN) + | NOT? kind=(LIKE | ILIKE) pattern=valueExpression (ESCAPE escapeChar=stringLit)? + | IS NOT? kind=NULL + | IS NOT? kind=(TRUE | FALSE | UNKNOWN) + | IS NOT? kind=DISTINCT FROM right=valueExpression + ; + +valueExpression + : primaryExpression #valueExpressionDefault + | operator=(MINUS | PLUS | TILDE) valueExpression #arithmeticUnary + | left=valueExpression operator=(ASTERISK | SLASH | PERCENT | DIV) right=valueExpression #arithmeticBinary + | left=valueExpression operator=(PLUS | MINUS | CONCAT_PIPE) right=valueExpression #arithmeticBinary + | left=valueExpression operator=AMPERSAND right=valueExpression #arithmeticBinary + | left=valueExpression operator=HAT right=valueExpression #arithmeticBinary + | left=valueExpression operator=PIPE right=valueExpression #arithmeticBinary + | left=valueExpression comparisonOperator right=valueExpression #comparison + ; + +datetimeUnit + : YEAR | QUARTER | MONTH + | WEEK | DAY | DAYOFYEAR + | HOUR | MINUTE | SECOND | MILLISECOND | MICROSECOND + ; + +primaryExpression + : name=(CURRENT_DATE | CURRENT_TIMESTAMP | CURRENT_USER | USER) #currentLike + | name=(TIMESTAMPADD | DATEADD | DATE_ADD) LEFT_PAREN (unit=datetimeUnit | invalidUnit=stringLit) COMMA unitsAmount=valueExpression COMMA timestamp=valueExpression RIGHT_PAREN #timestampadd + | name=(TIMESTAMPDIFF | DATEDIFF | DATE_DIFF) LEFT_PAREN (unit=datetimeUnit | invalidUnit=stringLit) COMMA startTimestamp=valueExpression COMMA endTimestamp=valueExpression RIGHT_PAREN #timestampdiff + | CASE whenClause+ (ELSE elseExpression=expression)? END #searchedCase + | CASE value=expression whenClause+ (ELSE elseExpression=expression)? END #simpleCase + | name=(CAST | TRY_CAST) LEFT_PAREN expression AS dataType RIGHT_PAREN #cast + | STRUCT LEFT_PAREN (argument+=namedExpression (COMMA argument+=namedExpression)*)? RIGHT_PAREN #struct + | FIRST LEFT_PAREN expression (IGNORE NULLS)? RIGHT_PAREN #first + | ANY_VALUE LEFT_PAREN expression (IGNORE NULLS)? RIGHT_PAREN #any_value + | LAST LEFT_PAREN expression (IGNORE NULLS)? RIGHT_PAREN #last + | POSITION LEFT_PAREN substr=valueExpression IN str=valueExpression RIGHT_PAREN #position + | constant #constantDefault + | ASTERISK #star + | qualifiedName DOT ASTERISK #star + | LEFT_PAREN namedExpression (COMMA namedExpression)+ RIGHT_PAREN #rowConstructor + | LEFT_PAREN query RIGHT_PAREN #subqueryExpression + | functionName LEFT_PAREN (setQuantifier? argument+=functionArgument + (COMMA argument+=functionArgument)*)? RIGHT_PAREN + (FILTER LEFT_PAREN WHERE where=booleanExpression RIGHT_PAREN)? + (nullsOption=(IGNORE | RESPECT) NULLS)? ( OVER windowSpec)? #functionCall + | identifier ARROW expression #lambda + | LEFT_PAREN identifier (COMMA identifier)+ RIGHT_PAREN ARROW expression #lambda + | value=primaryExpression LEFT_BRACKET index=valueExpression RIGHT_BRACKET #subscript + | identifier #columnReference + | base=primaryExpression DOT fieldName=identifier #dereference + | LEFT_PAREN expression RIGHT_PAREN #parenthesizedExpression + | EXTRACT LEFT_PAREN field=identifier FROM source=valueExpression RIGHT_PAREN #extract + | (SUBSTR | SUBSTRING) LEFT_PAREN str=valueExpression (FROM | COMMA) pos=valueExpression + ((FOR | COMMA) len=valueExpression)? RIGHT_PAREN #substring + | TRIM LEFT_PAREN trimOption=(BOTH | LEADING | TRAILING)? (trimStr=valueExpression)? + FROM srcStr=valueExpression RIGHT_PAREN #trim + | OVERLAY LEFT_PAREN input=valueExpression PLACING replace=valueExpression + FROM position=valueExpression (FOR length=valueExpression)? RIGHT_PAREN #overlay + | name=(PERCENTILE_CONT | PERCENTILE_DISC) LEFT_PAREN percentage=valueExpression RIGHT_PAREN + WITHIN GROUP LEFT_PAREN ORDER BY sortItem RIGHT_PAREN + (FILTER LEFT_PAREN WHERE where=booleanExpression RIGHT_PAREN)? ( OVER windowSpec)? #percentile + ; + +literalType + : DATE + | TIMESTAMP | TIMESTAMP_LTZ | TIMESTAMP_NTZ + | INTERVAL + | BINARY_HEX + | unsupportedType=identifier + ; + +constant + : NULL #nullLiteral + | QUESTION #posParameterLiteral + | COLON identifier #namedParameterLiteral + | interval #intervalLiteral + | literalType stringLit #typeConstructor + | number #numericLiteral + | booleanValue #booleanLiteral + | stringLit+ #stringLiteral + ; + +comparisonOperator + : EQ | NEQ | NEQJ | LT | LTE | GT | GTE | NSEQ + ; + +arithmeticOperator + : PLUS | MINUS | ASTERISK | SLASH | PERCENT | DIV | TILDE | AMPERSAND | PIPE | CONCAT_PIPE | HAT + ; + +predicateOperator + : OR | AND | IN | NOT + ; + +booleanValue + : TRUE | FALSE + ; + +interval + : INTERVAL (errorCapturingMultiUnitsInterval | errorCapturingUnitToUnitInterval) + ; + +errorCapturingMultiUnitsInterval + : body=multiUnitsInterval unitToUnitInterval? + ; + +multiUnitsInterval + : (intervalValue unit+=unitInMultiUnits)+ + ; + +errorCapturingUnitToUnitInterval + : body=unitToUnitInterval (error1=multiUnitsInterval | error2=unitToUnitInterval)? + ; + +unitToUnitInterval + : value=intervalValue from=unitInUnitToUnit TO to=unitInUnitToUnit + ; + +intervalValue + : (PLUS | MINUS)? + (INTEGER_VALUE | DECIMAL_VALUE | stringLit) + ; + +unitInMultiUnits + : NANOSECOND | NANOSECONDS | MICROSECOND | MICROSECONDS | MILLISECOND | MILLISECONDS + | SECOND | SECONDS | MINUTE | MINUTES | HOUR | HOURS | DAY | DAYS | WEEK | WEEKS + | MONTH | MONTHS | YEAR | YEARS + ; + +unitInUnitToUnit + : SECOND | MINUTE | HOUR | DAY | MONTH | YEAR + ; + +colPosition + : position=FIRST | position=AFTER afterCol=errorCapturingIdentifier + ; + +type + : BOOLEAN + | TINYINT | BYTE + | SMALLINT | SHORT + | INT | INTEGER + | BIGINT | LONG + | FLOAT | REAL + | DOUBLE + | DATE + | TIMESTAMP | TIMESTAMP_NTZ | TIMESTAMP_LTZ + | STRING + | CHARACTER | CHAR + | VARCHAR + | BINARY + | DECIMAL | DEC | NUMERIC + | VOID + | INTERVAL + | ARRAY | STRUCT | MAP + | unsupportedType=identifier + ; + +dataType + : complex=ARRAY LT dataType GT #complexDataType + | complex=MAP LT dataType COMMA dataType GT #complexDataType + | complex=STRUCT (LT complexColTypeList? GT | NEQ) #complexDataType + | INTERVAL from=(YEAR | MONTH) (TO to=MONTH)? #yearMonthIntervalDataType + | INTERVAL from=(DAY | HOUR | MINUTE | SECOND) + (TO to=(HOUR | MINUTE | SECOND))? #dayTimeIntervalDataType + | type (LEFT_PAREN INTEGER_VALUE + (COMMA INTEGER_VALUE)* RIGHT_PAREN)? #primitiveDataType + ; + +qualifiedColTypeWithPositionList + : qualifiedColTypeWithPosition (COMMA qualifiedColTypeWithPosition)* + ; + +qualifiedColTypeWithPosition + : name=multipartIdentifier dataType colDefinitionDescriptorWithPosition* + ; + +colDefinitionDescriptorWithPosition + : NOT NULL + | defaultExpression + | commentSpec + | colPosition + ; + +defaultExpression + : DEFAULT expression + ; + +colTypeList + : colType (COMMA colType)* + ; + +colType + : colName=errorCapturingIdentifier dataType (NOT NULL)? commentSpec? + ; + +createOrReplaceTableColTypeList + : createOrReplaceTableColType (COMMA createOrReplaceTableColType)* + ; + +createOrReplaceTableColType + : colName=errorCapturingIdentifier dataType colDefinitionOption* + ; + +colDefinitionOption + : NOT NULL + | defaultExpression + | generationExpression + | commentSpec + ; + +generationExpression + : GENERATED ALWAYS AS LEFT_PAREN expression RIGHT_PAREN + ; + +complexColTypeList + : complexColType (COMMA complexColType)* + ; + +complexColType + : identifier COLON? dataType (NOT NULL)? commentSpec? + ; + +whenClause + : WHEN condition=expression THEN result=expression + ; + +windowClause + : WINDOW namedWindow (COMMA namedWindow)* + ; + +namedWindow + : name=errorCapturingIdentifier AS windowSpec + ; + +windowSpec + : name=errorCapturingIdentifier #windowRef + | LEFT_PAREN name=errorCapturingIdentifier RIGHT_PAREN #windowRef + | LEFT_PAREN + ( CLUSTER BY partition+=expression (COMMA partition+=expression)* + | ((PARTITION | DISTRIBUTE) BY partition+=expression (COMMA partition+=expression)*)? + ((ORDER | SORT) BY sortItem (COMMA sortItem)*)?) + windowFrame? + RIGHT_PAREN #windowDef + ; + +windowFrame + : frameType=RANGE start=frameBound + | frameType=ROWS start=frameBound + | frameType=RANGE BETWEEN start=frameBound AND end=frameBound + | frameType=ROWS BETWEEN start=frameBound AND end=frameBound + ; + +frameBound + : UNBOUNDED boundType=(PRECEDING | FOLLOWING) + | boundType=CURRENT ROW + | expression boundType=(PRECEDING | FOLLOWING) + ; + +qualifiedNameList + : qualifiedName (COMMA qualifiedName)* + ; + +functionName + : IDENTIFIER_KW LEFT_PAREN expression RIGHT_PAREN + | identFunc=IDENTIFIER_KW // IDENTIFIER itself is also a valid function name. + | qualifiedName + | FILTER + | LEFT + | RIGHT + ; + +qualifiedName + : identifier (DOT identifier)* + ; + +// this rule is used for explicitly capturing wrong identifiers such as test-table, which should actually be `test-table` +// replace identifier with errorCapturingIdentifier where the immediate follow symbol is not an expression, otherwise +// valid expressions such as "a-b" can be recognized as an identifier +errorCapturingIdentifier + : identifier errorCapturingIdentifierExtra + ; + +// extra left-factoring grammar +errorCapturingIdentifierExtra + : (MINUS identifier)+ #errorIdent + | #realIdent + ; + +identifier + : strictIdentifier + | {!SQL_standard_keyword_behavior}? strictNonReserved + ; + +strictIdentifier + : IDENTIFIER #unquotedIdentifier + | quotedIdentifier #quotedIdentifierAlternative + | {SQL_standard_keyword_behavior}? ansiNonReserved #unquotedIdentifier + | {!SQL_standard_keyword_behavior}? nonReserved #unquotedIdentifier + ; + +quotedIdentifier + : BACKQUOTED_IDENTIFIER + | {double_quoted_identifiers}? DOUBLEQUOTED_STRING + ; + +backQuotedIdentifier + : BACKQUOTED_IDENTIFIER + ; + +number + : {!legacy_exponent_literal_as_decimal_enabled}? MINUS? EXPONENT_VALUE #exponentLiteral + | {!legacy_exponent_literal_as_decimal_enabled}? MINUS? DECIMAL_VALUE #decimalLiteral + | {legacy_exponent_literal_as_decimal_enabled}? MINUS? (EXPONENT_VALUE | DECIMAL_VALUE) #legacyDecimalLiteral + | MINUS? INTEGER_VALUE #integerLiteral + | MINUS? BIGINT_LITERAL #bigIntLiteral + | MINUS? SMALLINT_LITERAL #smallIntLiteral + | MINUS? TINYINT_LITERAL #tinyIntLiteral + | MINUS? DOUBLE_LITERAL #doubleLiteral + | MINUS? FLOAT_LITERAL #floatLiteral + | MINUS? BIGDECIMAL_LITERAL #bigDecimalLiteral + ; + +alterColumnAction + : TYPE dataType + | commentSpec + | colPosition + | setOrDrop=(SET | DROP) NOT NULL + | SET defaultExpression + | dropDefault=DROP DEFAULT + ; + +stringLit + : STRING_LITERAL + | {!double_quoted_identifiers}? DOUBLEQUOTED_STRING + ; + +comment + : stringLit + | NULL + ; + +version + : INTEGER_VALUE + | stringLit + ; + +// When `SQL_standard_keyword_behavior=true`, there are 2 kinds of keywords in Spark SQL. +// - Reserved keywords: +// Keywords that are reserved and can't be used as identifiers for table, view, column, +// function, alias, etc. +// - Non-reserved keywords: +// Keywords that have a special meaning only in particular contexts and can be used as +// identifiers in other contexts. For example, `EXPLAIN SELECT ...` is a command, but EXPLAIN +// can be used as identifiers in other places. +// You can find the full keywords list by searching "Start of the keywords list" in this file. +// The non-reserved keywords are listed below. Keywords not in this list are reserved keywords. +ansiNonReserved +//--ANSI-NON-RESERVED-START + : ADD + | AFTER + | ALTER + | ALWAYS + | ANALYZE + | ANTI + | ANY_VALUE + | ARCHIVE + | ARRAY + | ASC + | AT + | BETWEEN + | BIGINT + | BINARY + | BINARY_HEX + | BOOLEAN + | BUCKET + | BUCKETS + | BY + | BYTE + | CACHE + | CASCADE + | CATALOG + | CATALOGS + | CHANGE + | CHAR + | CHARACTER + | CLEAR + | CLUSTER + | CLUSTERED + | CODEGEN + | COLLECTION + | COLUMNS + | COMMENT + | COMMIT + | COMPACT + | COMPACTIONS + | COMPUTE + | CONCATENATE + | COST + | CUBE + | CURRENT + | DATA + | DATABASE + | DATABASES + | DATE + | DATEADD + | DATE_ADD + | DATEDIFF + | DATE_DIFF + | DAY + | DAYS + | DAYOFYEAR + | DBPROPERTIES + | DEC + | DECIMAL + | DEFAULT + | DEFINED + | DELETE + | DELIMITED + | DESC + | DESCRIBE + | DFS + | DIRECTORIES + | DIRECTORY + | DISTRIBUTE + | DIV + | DOUBLE + | DROP + | ESCAPED + | EXCHANGE + | EXCLUDE + | EXISTS + | EXPLAIN + | EXPORT + | EXTENDED + | EXTERNAL + | EXTRACT + | FIELDS + | FILEFORMAT + | FIRST + | FLOAT + | FOLLOWING + | FORMAT + | FORMATTED + | FUNCTION + | FUNCTIONS + | GENERATED + | GLOBAL + | GROUPING + | HOUR + | HOURS + | IDENTIFIER_KW + | IF + | IGNORE + | IMPORT + | INCLUDE + | INDEX + | INDEXES + | INPATH + | INPUTFORMAT + | INSERT + | INT + | INTEGER + | INTERVAL + | ITEMS + | KEYS + | LAST + | LAZY + | LIKE + | ILIKE + | LIMIT + | LINES + | LIST + | LOAD + | LOCAL + | LOCATION + | LOCK + | LOCKS + | LOGICAL + | LONG + | MACRO + | MAP + | MATCHED + | MERGE + | MICROSECOND + | MICROSECONDS + | MILLISECOND + | MILLISECONDS + | MINUTE + | MINUTES + | MONTH + | MONTHS + | MSCK + | NAME + | NAMESPACE + | NAMESPACES + | NANOSECOND + | NANOSECONDS + | NO + | NULLS + | NUMERIC + | OF + | OPTION + | OPTIONS + | OUT + | OUTPUTFORMAT + | OVER + | OVERLAY + | OVERWRITE + | PARTITION + | PARTITIONED + | PARTITIONS + | PERCENTLIT + | PIVOT + | PLACING + | POSITION + | PRECEDING + | PRINCIPALS + | PROPERTIES + | PURGE + | QUARTER + | QUERY + | RANGE + | REAL + | RECORDREADER + | RECORDWRITER + | RECOVER + | REDUCE + | REFRESH + | RENAME + | REPAIR + | REPEATABLE + | REPLACE + | RESET + | RESPECT + | RESTRICT + | REVOKE + | RLIKE + | ROLE + | ROLES + | ROLLBACK + | ROLLUP + | ROW + | ROWS + | SCHEMA + | SCHEMAS + | SECOND + | SECONDS + | SEMI + | SEPARATED + | SERDE + | SERDEPROPERTIES + | SET + | SETMINUS + | SETS + | SHORT + | SHOW + | SKEWED + | SMALLINT + | SORT + | SORTED + | SOURCE + | START + | STATISTICS + | STORED + | STRATIFY + | STRING + | STRUCT + | SUBSTR + | SUBSTRING + | SYNC + | SYSTEM_TIME + | SYSTEM_VERSION + | TABLES + | TABLESAMPLE + | TARGET + | TBLPROPERTIES + | TEMPORARY + | TERMINATED + | TIMESTAMP + | TIMESTAMP_LTZ + | TIMESTAMP_NTZ + | TIMESTAMPADD + | TIMESTAMPDIFF + | TINYINT + | TOUCH + | TRANSACTION + | TRANSACTIONS + | TRANSFORM + | TRIM + | TRUE + | TRUNCATE + | TRY_CAST + | TYPE + | UNARCHIVE + | UNBOUNDED + | UNCACHE + | UNLOCK + | UNPIVOT + | UNSET + | UPDATE + | USE + | VALUES + | VARCHAR + | VERSION + | VIEW + | VIEWS + | VOID + | WEEK + | WEEKS + | WINDOW + | YEAR + | YEARS + | ZONE +//--ANSI-NON-RESERVED-END + ; + +// When `SQL_standard_keyword_behavior=false`, there are 2 kinds of keywords in Spark SQL. +// - Non-reserved keywords: +// Same definition as the one when `SQL_standard_keyword_behavior=true`. +// - Strict-non-reserved keywords: +// A strict version of non-reserved keywords, which can not be used as table alias. +// You can find the full keywords list by searching "Start of the keywords list" in this file. +// The strict-non-reserved keywords are listed in `strictNonReserved`. +// The non-reserved keywords are listed in `nonReserved`. +// These 2 together contain all the keywords. +strictNonReserved + : ANTI + | CROSS + | EXCEPT + | FULL + | INNER + | INTERSECT + | JOIN + | LATERAL + | LEFT + | NATURAL + | ON + | RIGHT + | SEMI + | SETMINUS + | UNION + | USING + ; + +nonReserved +//--DEFAULT-NON-RESERVED-START + : ADD + | AFTER + | ALL + | ALTER + | ALWAYS + | ANALYZE + | AND + | ANY + | ANY_VALUE + | ARCHIVE + | ARRAY + | AS + | ASC + | AT + | AUTHORIZATION + | BETWEEN + | BIGINT + | BINARY + | BINARY_HEX + | BOOLEAN + | BOTH + | BUCKET + | BUCKETS + | BY + | BYTE + | CACHE + | CASCADE + | CASE + | CAST + | CATALOG + | CATALOGS + | CHANGE + | CHAR + | CHARACTER + | CHECK + | CLEAR + | CLUSTER + | CLUSTERED + | CODEGEN + | COLLATE + | COLLECTION + | COLUMN + | COLUMNS + | COMMENT + | COMMIT + | COMPACT + | COMPACTIONS + | COMPUTE + | CONCATENATE + | CONSTRAINT + | COST + | CREATE + | CUBE + | CURRENT + | CURRENT_DATE + | CURRENT_TIME + | CURRENT_TIMESTAMP + | CURRENT_USER + | DATA + | DATABASE + | DATABASES + | DATE + | DATEADD + | DATE_ADD + | DATEDIFF + | DATE_DIFF + | DAY + | DAYS + | DAYOFYEAR + | DBPROPERTIES + | DEC + | DECIMAL + | DEFAULT + | DEFINED + | DELETE + | DELIMITED + | DESC + | DESCRIBE + | DFS + | DIRECTORIES + | DIRECTORY + | DISTINCT + | DISTRIBUTE + | DIV + | DOUBLE + | DROP + | ELSE + | END + | ESCAPE + | ESCAPED + | EXCHANGE + | EXCLUDE + | EXISTS + | EXPLAIN + | EXPORT + | EXTENDED + | EXTERNAL + | EXTRACT + | FALSE + | FETCH + | FILTER + | FIELDS + | FILEFORMAT + | FIRST + | FLOAT + | FOLLOWING + | FOR + | FOREIGN + | FORMAT + | FORMATTED + | FROM + | FUNCTION + | FUNCTIONS + | GENERATED + | GLOBAL + | GRANT + | GROUP + | GROUPING + | HAVING + | HOUR + | HOURS + | IDENTIFIER_KW + | IF + | IGNORE + | IMPORT + | IN + | INCLUDE + | INDEX + | INDEXES + | INPATH + | INPUTFORMAT + | INSERT + | INT + | INTEGER + | INTERVAL + | INTO + | IS + | ITEMS + | KEYS + | LAST + | LAZY + | LEADING + | LIKE + | LONG + | ILIKE + | LIMIT + | LINES + | LIST + | LOAD + | LOCAL + | LOCATION + | LOCK + | LOCKS + | LOGICAL + | LONG + | MACRO + | MAP + | MATCHED + | MERGE + | MICROSECOND + | MICROSECONDS + | MILLISECOND + | MILLISECONDS + | MINUTE + | MINUTES + | MONTH + | MONTHS + | MSCK + | NAME + | NAMESPACE + | NAMESPACES + | NANOSECOND + | NANOSECONDS + | NO + | NOT + | NULL + | NULLS + | NUMERIC + | OF + | OFFSET + | ONLY + | OPTION + | OPTIONS + | OR + | ORDER + | OUT + | OUTER + | OUTPUTFORMAT + | OVER + | OVERLAPS + | OVERLAY + | OVERWRITE + | PARTITION + | PARTITIONED + | PARTITIONS + | PERCENTILE_CONT + | PERCENTILE_DISC + | PERCENTLIT + | PIVOT + | PLACING + | POSITION + | PRECEDING + | PRIMARY + | PRINCIPALS + | PROPERTIES + | PURGE + | QUARTER + | QUERY + | RANGE + | REAL + | RECORDREADER + | RECORDWRITER + | RECOVER + | REDUCE + | REFERENCES + | REFRESH + | RENAME + | REPAIR + | REPEATABLE + | REPLACE + | RESET + | RESPECT + | RESTRICT + | REVOKE + | RLIKE + | ROLE + | ROLES + | ROLLBACK + | ROLLUP + | ROW + | ROWS + | SCHEMA + | SCHEMAS + | SECOND + | SECONDS + | SELECT + | SEPARATED + | SERDE + | SERDEPROPERTIES + | SESSION_USER + | SET + | SETS + | SHORT + | SHOW + | SKEWED + | SMALLINT + | SOME + | SORT + | SORTED + | SOURCE + | START + | STATISTICS + | STORED + | STRATIFY + | STRING + | STRUCT + | SUBSTR + | SUBSTRING + | SYNC + | SYSTEM_TIME + | SYSTEM_VERSION + | TABLE + | TABLES + | TABLESAMPLE + | TARGET + | TBLPROPERTIES + | TEMPORARY + | TERMINATED + | THEN + | TIME + | TIMESTAMP + | TIMESTAMP_LTZ + | TIMESTAMP_NTZ + | TIMESTAMPADD + | TIMESTAMPDIFF + | TINYINT + | TO + | TOUCH + | TRAILING + | TRANSACTION + | TRANSACTIONS + | TRANSFORM + | TRIM + | TRUE + | TRUNCATE + | TRY_CAST + | TYPE + | UNARCHIVE + | UNBOUNDED + | UNCACHE + | UNIQUE + | UNKNOWN + | UNLOCK + | UNPIVOT + | UNSET + | UPDATE + | USE + | USER + | VALUES + | VARCHAR + | VERSION + | VIEW + | VIEWS + | VOID + | WEEK + | WEEKS + | WHEN + | WHERE + | WINDOW + | WITH + | WITHIN + | YEAR + | YEARS + | ZONE +//--DEFAULT-NON-RESERVED-END + ; diff --git a/settings.gradle b/settings.gradle index ba38e3aa427..fc4bc43f463 100644 --- a/settings.gradle +++ b/settings.gradle @@ -20,6 +20,7 @@ include 'benchmarks' include 'datasources' include 'async-query-core' include 'async-query' +include 'language-grammar' // exclude integ-test/doctest in case of offline build since they need downloads if (!gradle.startParameter.offline) {