Commit

Merge pull request #1 from coinlist/coinlist-snowflake-binary
works for binary uuids
ondramie authored Dec 15, 2023
2 parents c1dde75 + bd4c06a commit b68de1f
Showing 13 changed files with 525 additions and 27 deletions.
1 change: 1 addition & 0 deletions .github/workflows/CODEOWNERS
@@ -0,0 +1 @@
* @coinlist/data
51 changes: 51 additions & 0 deletions .github/workflows/_cache-dependencies.yml
@@ -0,0 +1,51 @@
name: cache_dependencies

on:
  workflow_call:
    inputs:
      python_version:
        required: true
        type: string
      poetry_version:
        required: true
        type: string
    outputs:
      python_cache_key:
        description: "The key of the primary cache of the python dependencies"
        value: ${{ jobs.python-cache.outputs.key }}


jobs:
  python-cache:
    runs-on: ubuntu-latest
    outputs:
      key: ${{ steps.define-cache-key.outputs.cache_key }}
    steps:
      - uses: actions/checkout@v3

      - name: Setup Python
        uses: actions/setup-python@v4
        id: setup-python
        with:
          python-version: '${{ inputs.python_version }}'

      - name: Install and configure Poetry
        uses: snok/install-poetry@v1
        with:
          version: ${{ inputs.poetry_version }}
          virtualenvs-in-project: true

      - name: Define Cache Key
        id: define-cache-key
        run: |
          echo "cache_key=python-${{ runner.os }}--${{ inputs.python_version }}-${{ inputs.poetry_version }}-${{ hashFiles('**/poetry.lock') }}" >> $GITHUB_OUTPUT
      - name: Cache venv
        id: cached-python
        uses: actions/cache@v3
        with:
          path: .venv
          key: ${{ steps.define-cache-key.outputs.cache_key }}

      - name: Install dependencies
        run: poetry install --no-interaction --no-root
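For reference, the cache key produced by the Define Cache Key step is roughly the following, shown as a small Python sketch (illustrative only; hashlib stands in for GitHub's hashFiles, and the function name is made up):

import hashlib
import platform


def poetry_cache_key(python_version: str, poetry_version: str, lockfile: str = "poetry.lock") -> str:
    # Hashing the lockfile makes the key change whenever dependencies change,
    # mirroring hashFiles('**/poetry.lock') in the workflow above.
    with open(lockfile, "rb") as f:
        lock_hash = hashlib.sha256(f.read()).hexdigest()
    runner_os = platform.system()  # stands in for ${{ runner.os }}, e.g. "Linux"
    return f"python-{runner_os}--{python_version}-{poetry_version}-{lock_hash}"


# e.g. poetry_cache_key("3.8", "1.7.1") -> "python-Linux--3.8-1.7.1-<sha256 of poetry.lock>"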
95 changes: 95 additions & 0 deletions .github/workflows/_publish-data-platform-data-diff.yml
@@ -0,0 +1,95 @@
name: publish_data-platform-data-diff

on:
  workflow_call:
    inputs:
      python_version:
        required: true
        type: string
      poetry_version:
        required: true
        type: string
      python_cache_key:
        required: true
        type: string

permissions:
  id-token: write
  contents: read

jobs:
  build:
    name: 'Publish python data-platform-data-diff'
    runs-on: ubuntu-latest

    steps:
      - name: Setup AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          role-to-assume: ${{ secrets.CROSS_ACCOUNT_ROLE_TO_ASSUME }}
          aws-region: ${{ secrets.AWS_REGION }}
          mask-aws-account-id: 'yes'

      - uses: actions/checkout@v3

      - name: Setup Python
        id: setup-python
        uses: actions/setup-python@v4
        with:
          python-version: '${{ inputs.python_version }}'

      - name: Install and configure Poetry
        uses: snok/install-poetry@v1
        with:
          version: ${{ inputs.poetry_version }}
          virtualenvs-in-project: true

      - name: Restore cached key
        id: cache-restore
        uses: actions/cache/restore@v3
        with:
          path: .venv
          key: ${{ inputs.python_cache_key }}

      - name: Install jq
        run: sudo apt-get update && sudo apt-get install -y jq

      - name: Set env variables
        env:
          AWS_REGION: ${{ secrets.AWS_REGION }}
          CODEARTIFACT_URL: ${{ secrets.CODEARTIFACT_URL }}
        run: |
          # Replace placeholder URL with actual repository URL
          sed -i "s|PLACEHOLDER_URL|$CODEARTIFACT_URL|" pyproject.toml
          VERSION=$(poetry run toml get --toml-path pyproject.toml tool.poetry.version 2>/dev/null) || { echo "FAILED TO GET POETRY VERSION"; exit 1; }
          echo $VERSION > version.txt
          echo "CURRENT_VERSION=$(cat version.txt)" >> $GITHUB_ENV
      - name: Check if version needs to be published
        if: ${{ github.ref_name == 'master' }}
        env:
          AWS_REGION: ${{ secrets.AWS_REGION }}
        id: check_version
        run: |
          if ! aws codeartifact list-package-versions --region $AWS_REGION --domain coinlist --repository data-platform-data-diff --format pypi --package data_diff 2>/dev/null | grep -q "$(cat version.txt | sed 's/\./\\./g')"; then
            echo "skip_publish=false" >> $GITHUB_ENV
          else
            echo "skip_publish=true" >> $GITHUB_ENV
          fi
      - name: Publish dev version
        if: ${{ github.ref_name != 'master' }}
        run: |
          DEV_VERSION="$CURRENT_VERSION-dev+${GITHUB_SHA:0:7}"
          echo $DEV_VERSION > version.txt
          poetry run toml set --toml-path pyproject.toml tool.poetry.version $DEV_VERSION || { echo "Failed to set dev version in pyproject.toml"; exit 1; }
          poetry config repositories.data-platform-data-diff ${{ secrets.CODEARTIFACT_URL }}
          poetry build --format wheel || { echo "Failed to build the wheel"; exit 1; }
          poetry publish --repository data-platform-data-diff --username aws --password $(aws codeartifact --region ${{ secrets.AWS_REGION }} get-authorization-token --domain coinlist --query authorizationToken --output text 2>/dev/null) || { echo "Failed to publish the dev package"; exit 1; }
      - name: Publish new version
        if: ${{ github.ref_name == 'master' && env.skip_publish != 'true' }}
        run: |
          poetry build --format wheel 2>/dev/null || { echo "Failed to build the wheel"; exit 1; }
          poetry publish --repository data-platform-data-diff --username aws --password $(aws codeartifact --region ${{ secrets.AWS_REGION }} get-authorization-token --domain coinlist --query authorizationToken --output text 2>/dev/null) || { echo "Failed to publish the package"; exit 1; }
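The publish gating above reduces to: publish only if the version in pyproject.toml is not already in CodeArtifact, and on non-master branches publish a dev build tagged with the short commit SHA. A hedged Python sketch of that decision (the workflow itself shells out to the AWS CLI; the boto3 call here is only illustrative):

import boto3


def should_publish(version: str, region: str) -> bool:
    # Mirrors the `aws codeartifact list-package-versions ... | grep` check above.
    client = boto3.client("codeartifact", region_name=region)
    resp = client.list_package_versions(
        domain="coinlist",
        repository="data-platform-data-diff",
        format="pypi",
        package="data_diff",
    )
    already_published = {v["version"] for v in resp.get("versions", [])}
    return version not in already_published


def dev_version(current_version: str, github_sha: str) -> str:
    # Mirrors DEV_VERSION="$CURRENT_VERSION-dev+${GITHUB_SHA:0:7}" in the dev publish step.
    return f"{current_version}-dev+{github_sha[:7]}"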
11 changes: 2 additions & 9 deletions .github/workflows/ci.yml
@@ -1,20 +1,13 @@
name: CI-COVER-VERSIONS

on:
-  # push:
-  #   paths:
-  #     - '**.py'
-  #     - '.github/workflows/**'
-  #     - '!dev/**'
-  pull_request:
-    branches: [ master ]
-
  workflow_dispatch:
+  workflow_call:

jobs:
  unit_tests:
    strategy:
-      fail-fast: false
+      fail-fast: true
      matrix:
        os: [ubuntu-latest]
        python-version:
10 changes: 2 additions & 8 deletions .github/workflows/ci_full.yml
@@ -1,14 +1,8 @@
name: CI-COVER-DATABASES

on:
-  # push:
-  #   paths:
-  #     - '**.py'
-  #     - '.github/workflows/**'
-  #     - '!dev/**'
-  pull_request:
-    branches: [ master ]
  workflow_dispatch:
+  workflow_call:

permissions:
  id-token: write # This is required for requesting the JWT
@@ -17,7 +11,7 @@ permissions:
jobs:
  unit_tests:
    strategy:
-      fail-fast: false
+      fail-fast: true
      matrix:
        os: [ubuntu-latest]
        python-version:
8 changes: 3 additions & 5 deletions .github/workflows/formatter.yml
@@ -1,9 +1,7 @@
name: formatter
on:
-  pull_request:
-    branches: [ master ]
-
-  workflow_dispatch:
+  workflow_dispatch:
+  workflow_call:

jobs:
  linter_name:
@@ -19,4 +17,4 @@ jobs:
      - name: Auto commit ruff formatting
        uses: stefanzweifel/git-auto-commit-action@v5
        with:
-          commit_message: 'style fixes by ruff'
+          commit_message: 'style fixes by ruff'
58 changes: 58 additions & 0 deletions .github/workflows/pull-request-checks.yml
@@ -0,0 +1,58 @@
name: PR checks

on:
  pull_request: {}

permissions:
  id-token: write
  contents: read
  actions: write

jobs:
  cancel:
    runs-on: ubuntu-latest
    steps:
      - uses: styfle/cancel-workflow-action@0.12.0
        with:
          access_token: ${{ github.token }}

  setup:
    runs-on: ubuntu-latest
    needs: [cancel]
    outputs:
      python_version: ${{ steps.set_var.outputs.python_version }}
      poetry_version: ${{ steps.set_var.outputs.poetry_version }}
    steps:
      - id: set_var
        run: |
          echo "python_version=3.8" >> $GITHUB_OUTPUT
          echo "poetry_version=1.7.1" >> $GITHUB_OUTPUT
  perform-ruff-formatting:
    needs: [setup]
    uses: ./.github/workflows/formatter.yml

  cache-dependencies:
    needs: [setup, perform-ruff-formatting]
    uses: ./.github/workflows/_cache-dependencies.yml
    secrets: inherit
    with:
      python_version: ${{ needs.setup.outputs.python_version }}
      poetry_version: ${{ needs.setup.outputs.poetry_version }}

  run-unit-test-versions:
    needs: [setup]
    uses: ./.github/workflows/ci_full.yml

  run-unit-test-per-database:
    needs: [setup]
    uses: ./.github/workflows/ci.yml

  publish-data-platform-data-diff:
    needs: [setup, run-unit-test-versions, run-unit-test-per-database, cache-dependencies]
    uses: ./.github/workflows/_publish-data-platform-data-diff.yml
    secrets: inherit
    with:
      python_version: ${{ needs.setup.outputs.python_version }}
      poetry_version: ${{ needs.setup.outputs.poetry_version }}
      python_cache_key: ${{ needs.cache-dependencies.outputs.python_cache_key }}
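The needs chains above determine the job ordering for a pull request; the same graph written out as plain Python data, for reference only:

# Job dependency graph from pull-request-checks.yml (a job starts only after the jobs it needs).
NEEDS = {
    "cancel": [],
    "setup": ["cancel"],
    "perform-ruff-formatting": ["setup"],
    "cache-dependencies": ["setup", "perform-ruff-formatting"],
    "run-unit-test-versions": ["setup"],
    "run-unit-test-per-database": ["setup"],
    "publish-data-platform-data-diff": [
        "setup",
        "run-unit-test-versions",
        "run-unit-test-per-database",
        "cache-dependencies",
    ],
}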
6 changes: 6 additions & 0 deletions data_diff/abcs/database_types.py
@@ -134,6 +134,12 @@ class String_UUID(ColType_UUID, StringType):
    pass


+# Snowflake Binary UUID
+@attrs.define(frozen=True)
+class Binary_UUID(ColType_UUID):
+    python_type = bytes
+
+
@attrs.define(frozen=True)
class String_Alphanum(ColType_Alphanum, StringType):
    @staticmethod
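The new Binary_UUID type models UUIDs stored in BINARY columns as the UTF-8 bytes of the UUID's text form, which is why python_type is bytes and why the Snowflake-side conversions later in this diff use TO_VARCHAR(..., 'UTF-8'). A minimal plain-Python illustration of that assumption:

import uuid

u = uuid.uuid4()
text_form = str(u)                       # what a String_UUID column holds, e.g. '1b4e28ba-...'
binary_form = text_form.encode("utf-8")  # what a BINARY column holds under this scheme (36 bytes)

# Snowflake's TO_VARCHAR(col, 'UTF-8') is the SQL-side inverse of the encode step,
# so both column types normalize to the same comparable string.
assert binary_form.decode("utf-8") == text_form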
5 changes: 5 additions & 0 deletions data_diff/databases/_connect.py
@@ -25,6 +25,8 @@
from data_diff.databases.duckdb import DuckDB
from data_diff.databases.mssql import MsSQL

+from urllib.parse import unquote
+

@attrs.define(frozen=True)
class MatchUriPath:
@@ -196,6 +198,9 @@ def connect_to_uri(self, db_uri: str, thread_count: Optional[int] = 1, **kwargs)
        if dsn.password:
            kw["password"] = dsn.password

+        # The Snowflake connector can handle unquoted values, but data-diff cannot;
+        # this results in an error if the user or password is encoded.
+        # https://github.com/datafold/data-diff/issues/428
        kw = {k: v for k, v in kw.items() if v is not None}

        if issubclass(cls, ThreadedDatabase):
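The new urllib.parse.unquote import goes with the note above about encoded credentials; the hunk that actually uses it is not shown in this diff, so the following is only a sketch of the idea with a made-up URI:

from urllib.parse import unquote, urlparse

# Special characters in a password must be percent-encoded to fit inside a URI.
uri = "snowflake://user:p%40ss%2Fword@account/database/SCHEMA?warehouse=WH"
parsed = urlparse(uri)

# urlparse leaves the percent-encoding in place; unquote restores the real password
# before it is handed to the driver.
password = unquote(parsed.password)  # 'p@ss/word'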
29 changes: 28 additions & 1 deletion data_diff/databases/base.py
@@ -23,6 +23,7 @@
from data_diff.queries.api import Expr, table, Select, SKIP, Explain, Code, this
from data_diff.queries.ast_classes import (
    Alias,
+    BinBoolOp,
    BinOp,
    CaseWhen,
    Cast,
@@ -64,6 +65,7 @@
    Float,
    Native_UUID,
    String_UUID,
+    Binary_UUID,
    String_Alphanum,
    String_VaryingAlphanum,
    TemporalType,
@@ -482,6 +484,22 @@ def render_tableop(self, parent_c: Compiler, elem: TableOp) -> str:
    def render__resolvecolumn(self, c: Compiler, elem: _ResolveColumn) -> str:
        return self.compile(c, elem._get_resolved())

+    def modify_string_where_clause(self, col, where_clause):
+        # NOTE: snowflake specific issue with Binary columns
+        return where_clause.replace(f'"{col}"', f"TO_VARCHAR(\"{col}\", 'UTF-8')")
+
+    def check_for_binary_cols(self, where_exprs):
+        binary_uuid_columns = set()
+        for expr in where_exprs:
+            if isinstance(expr, BinBoolOp):
+                for arg in expr.args:
+                    if isinstance(arg, _ResolveColumn):
+                        resolved_column = arg.resolved
+                        if isinstance(resolved_column, Column) and resolved_column.source_table.schema:
+                            if isinstance(resolved_column.type, Binary_UUID):
+                                binary_uuid_columns.add(resolved_column.name)
+        return binary_uuid_columns
+
    def render_select(self, parent_c: Compiler, elem: Select) -> str:
        c: Compiler = attrs.evolve(parent_c, in_select=True)  # .add_table_context(self.table)
        compile_fn = functools.partial(self.compile, c)
@@ -497,7 +515,13 @@ def render_select(self, parent_c: Compiler, elem: Select) -> str:
        select += f" FROM {self.PLACEHOLDER_TABLE}"

        if elem.where_exprs:
-            select += " WHERE " + " AND ".join(map(compile_fn, elem.where_exprs))
+            where_clause = " WHERE " + " AND ".join(map(compile_fn, elem.where_exprs))
+            # post-processing step for Snowflake Binary_UUID columns
+            if parent_c.dialect.name == "Snowflake":
+                binary_uuids = self.check_for_binary_cols(elem.where_exprs)
+                for binary_uuid in binary_uuids:
+                    where_clause = self.modify_string_where_clause(binary_uuid, where_clause)
+            select += where_clause

        if elem.group_by_exprs:
            select += " GROUP BY " + ", ".join(map(compile_fn, elem.group_by_exprs))
@@ -836,6 +860,9 @@ def normalize_uuid(self, value: str, coltype: ColType_UUID) -> str:
        """Creates an SQL expression, that strips uuids of artifacts like whitespace."""
        if isinstance(coltype, String_UUID):
            return f"TRIM({value})"
+        # converts Binary to VARCHAR for Snowflake
+        elif isinstance(coltype, Binary_UUID):
+            return f"TRIM(TO_VARCHAR({value}, 'UTF-8'))"
        return self.to_string(value)

    def normalize_json(self, value: str, _coltype: JSON) -> str:
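Taken together, the base.py changes make Snowflake compare and checksum Binary_UUID key columns as strings: WHERE-clause references are rewritten to TO_VARCHAR("col", 'UTF-8') and value normalization emits TRIM(TO_VARCHAR(..., 'UTF-8')). A standalone sketch of the WHERE-clause rewrite follows; the column name and boundary values are made up:

def modify_string_where_clause(col: str, where_clause: str) -> str:
    # Same string substitution as the dialect helper added above.
    return where_clause.replace(f'"{col}"', f"TO_VARCHAR(\"{col}\", 'UTF-8')")


where = " WHERE (\"id\" >= 'aaaaaaaa-0000') AND (\"id\" < 'ffffffff-0000')"
print(modify_string_where_clause("id", where))
# WHERE (TO_VARCHAR("id", 'UTF-8') >= 'aaaaaaaa-0000') AND (TO_VARCHAR("id", 'UTF-8') < 'ffffffff-0000')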