Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added new "filters" python file, along with a "hash" filter. This can… #18000

Merged
merged 18 commits into from
Oct 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions airbyte-cdk/python/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Changelog

## 0.1.103

- Low-code: added hash filter to jinja template

## 0.1.102

- Low-code: Fix check for streams that do not define a stream slicer
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#
# Copyright (c) 2022 Airbyte, Inc., all rights reserved.
#

import hashlib


def hash(value, hash_type="md5", salt=None):
    """
    Implementation of a custom Jinja2 hash filter.

    Hash type defaults to 'md5' if one is not specified.

    If you are using this hash function for GDPR compliance, then
    you should probably also pass in a salt as discussed in:
    https://security.stackexchange.com/questions/202022/hashing-email-addresses-for-gdpr-compliance

    This can be used in a low code connector definition under the AddFields transformation.
    For example:

    rates_stream:
      $ref: "*ref(definitions.base_stream)"
      $options:
        name: "rates"
        primary_key: "date"
        path: "/exchangerates_data/latest"
      transformations:
        - type: AddFields
          fields:
            - path: ["some_new_path"]
              value: "{{ record['rates']['CAD'] | hash('md5', 'mysalt') }}"

    :param value: value to be hashed; it is converted with str() and encoded as UTF-8 before hashing
    :param hash_type: name of a hash algorithm that hashlib can construct (e.g. 'md5', 'sha256')
    :param salt: a salt that will be combined with the value to ensure that the hash created for a given value on this system
                 is different from the hash created for that value on other systems.
                 The salt is appended after the value; a falsy salt (None, '') is ignored.
    :return: computed hash as a hexadecimal string
    :raises AttributeError: if hash_type does not name a hashing algorithm
    """
    # Build the hash object through hashlib.new() instead of getattr(hashlib, ...):
    # getattr also matches unrelated module attributes such as "new" or
    # "algorithms_available", which then fail later with a confusing TypeError.
    try:
        hash_obj = hashlib.new(hash_type)
    except ValueError as err:
        # Keep the exception type and message callers already rely on.
        raise AttributeError("No hashing function named {hname}".format(hname=hash_type)) from err

    hash_obj.update(str(value).encode("utf-8"))
    if salt:
        hash_obj.update(str(salt).encode("utf-8"))

    return hash_obj.hexdigest()


# Custom Jinja filters exposed by this module, registered under each function's own name.
_filters_list = [hash]
filters = dict((func.__name__, func) for func in _filters_list)
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import ast
from typing import Optional

from airbyte_cdk.sources.declarative.interpolation.filters import filters
from airbyte_cdk.sources.declarative.interpolation.interpolation import Interpolation
from airbyte_cdk.sources.declarative.interpolation.macros import macros
from airbyte_cdk.sources.declarative.types import Config
Expand Down Expand Up @@ -32,6 +33,7 @@ class JinjaInterpolation(Interpolation):

def __init__(self):
    """Create the Jinja environment and register the custom filters and macros on it."""
    env = Environment()
    # Filters are used with the pipe syntax; macros are exposed as template globals.
    env.filters.update(**filters)
    env.globals.update(**macros)
    self._environment = env

def eval(self, input_str: str, config: Config, default: Optional[str] = None, **additional_options):
Expand Down
2 changes: 1 addition & 1 deletion airbyte-cdk/python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

setup(
name="airbyte-cdk",
version="0.1.102",
version="0.1.103",
description="A framework for writing Airbyte Connectors.",
long_description=README,
long_description_content_type="text/markdown",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#
# Copyright (c) 2022 Airbyte, Inc., all rights reserved.
#

import hashlib

from airbyte_cdk.sources.declarative.interpolation.jinja import JinjaInterpolation

# Single JinjaInterpolation instance shared by all tests in this module.
interpolation = JinjaInterpolation()


def test_hash_md5_no_salt():
    """The hash filter with 'md5' and no salt matches hashlib's MD5 hex digest."""
    raw = "abcd"
    template = "{{ '%s' | hash('md5') }}" % raw
    actual = interpolation.eval(template, config={})

    # Reference digest computed with hashlib directly.
    expected = hashlib.md5(str(raw).encode("utf-8")).hexdigest()

    assert actual == expected


def test_hash_md5_on_numeric_value():
    """A numeric value piped into the hash filter is hashed via its str() form."""
    number = 123.456
    template = "{{ %f | hash('md5') }}" % number
    actual = interpolation.eval(template, config={})

    # Reference digest computed with hashlib directly.
    expected = hashlib.md5(str(number).encode("utf-8")).hexdigest()

    assert actual == expected


def test_hash_md5_with_salt():
    """With a salt, the filter's digest equals MD5 of the value followed by the salt."""
    raw = "test_input_string"
    salt = "test_input_salt"

    template = "{{ '%s' | hash('md5', '%s' ) }}" % (raw, salt)
    actual = interpolation.eval(template, config={})

    # Reference digest computed with hashlib directly on value + salt.
    expected = hashlib.md5(str(raw + salt).encode("utf-8")).hexdigest()

    assert actual == expected