Commit

Repo housekeeping
jtcohen6 committed Oct 2, 2021
1 parent f39169e commit 1134e2f
Showing 9 changed files with 69 additions and 84 deletions.
26 changes: 0 additions & 26 deletions .bumpversion-dbt.cfg

This file was deleted.

1 change: 0 additions & 1 deletion .bumpversion.cfg
@@ -27,4 +27,3 @@ first_value = 1
first_value = 1

[bumpversion:file:dbt/adapters/spark/__version__.py]
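The `[bumpversion:file:...]` section above keeps the release number in a single module that both bumpversion and setup.py read. A minimal sketch of that module, assuming its layout (the version value here is illustrative, not the repo's actual number):

```python
# dbt/adapters/spark/__version__.py -- assumed layout; bumpversion rewrites the
# string in place, and setup.py parses it with the regex shown further down
version = "0.21.0"
```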

1 change: 0 additions & 1 deletion .github/workflows/release.yml
@@ -49,7 +49,6 @@ jobs:
source env/bin/activate
sudo apt-get install libsasl2-dev
pip install -r dev_requirements.txt
bumpversion --config-file .bumpversion-dbt.cfg patch --new-version ${{env.version_number}}
bumpversion --config-file .bumpversion.cfg patch --new-version ${{env.version_number}} --allow-dirty
git status
20 changes: 8 additions & 12 deletions README.md
@@ -1,25 +1,21 @@
<p align="center">
<img src="/etc/dbt-logo-full.svg" alt="dbt logo" width="500"/>
<img src="https://raw.githubusercontent.com/dbt-labs/dbt/ec7dee39f793aa4f7dd3dae37282cc87664813e4/etc/dbt-logo-full.svg" alt="dbt logo" width="500"/>
</p>
<p align="center">
<a href="https://circleci.com/gh/fishtown-analytics/dbt-spark/tree/master">
<a href="https://circleci.com/gh/dbt-labs/dbt-spark/tree/master">
<img src="https://circleci.com/gh/fishtown-analytics/dbt-spark/tree/master.svg?style=svg" alt="CircleCI" />
</a>
<a href="https://community.getdbt.com">
<img src="https://community.getdbt.com/badge.svg" alt="Slack" />
</a>
</p>

# dbt-spark
**[dbt](https://www.getdbt.com/)** enables data analysts and engineers to transform their data using the same practices that software engineers use to build applications.

This plugin ports [dbt](https://getdbt.com) functionality to Spark. It supports running dbt against Spark clusters that are hosted via Databricks (AWS + Azure), Amazon EMR, or Docker.
dbt is the T in ELT. Organize, cleanse, denormalize, filter, rename, and pre-aggregate the raw data in your warehouse so that it's ready for analysis.

We have not tested extensively against older versions of Apache Spark. The plugin uses syntax that requires version 2.2.0 or newer. Some features require Spark 3.0 and/or Delta Lake.
## dbt-spark

### Documentation
For more information on using Spark with dbt, consult the dbt documentation:
- [Spark profile](https://docs.getdbt.com/reference/warehouse-profiles/spark-profile/)
- [Spark specific configs](https://docs.getdbt.com/reference/resource-configs/spark-configs/)
The `dbt-spark` package contains all of the code enabling dbt to work with Apache Spark and Databricks. For more information on using dbt with Spark, consult [the docs](https://docs.getdbt.com/docs/profile-spark).

We have not tested extensively against older versions of Apache Spark. The plugin uses syntax that requires version 2.2.0 or newer. Some features require Spark 3.0 and/or Delta Lake.

### Installation
This plugin can be installed via pip. Depending on your connection method, you need to specify an extra requirement.
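A hedged usage sketch of that install step — the extras names (`ODBC`, `PyHive`, `all`) are assumptions inferred from `odbc_extras`/`pyhive_extras` in setup.py and the `-e.[all]` lines in tox.ini below, not something this diff spells out:

```sh
# pick the extra that matches your connection method (names assumed, see lead-in)
pip install "dbt-spark[ODBC]"     # ODBC driver connections (e.g. Databricks)
pip install "dbt-spark[PyHive]"   # Thrift / HTTP connections
pip install "dbt-spark[all]"      # everything, as the tox test environments do
```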
4 changes: 4 additions & 0 deletions dev_requirements.txt
@@ -1,3 +1,7 @@
# install latest changes in dbt-core
# TODO: how to automate switching from develop to version branches?
git+https://github.com/dbt-labs/dbt.git@develop#egg=dbt-core&subdirectory=core

freezegun==0.3.9
pytest==6.0.2
mock>=1.3.0
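The `git+` line above pins dbt-core to the `develop` branch of the dbt repository using pip's VCS requirement syntax (`#egg=` names the package, `&subdirectory=core` points at the package root inside the monorepo). One way the TODO above might be resolved, sketched with a hypothetical release-branch name rather than anything this commit establishes:

```
# hypothetical: swap the branch ref when cutting a release branch
git+https://github.com/dbt-labs/dbt.git@0.21.latest#egg=dbt-core&subdirectory=core
```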
1 change: 0 additions & 1 deletion etc/dbt-logo-full.svg

This file was deleted.

6 changes: 0 additions & 6 deletions requirements.txt

This file was deleted.

68 changes: 46 additions & 22 deletions setup.py
@@ -1,41 +1,65 @@
#!/usr/bin/env python
from setuptools import find_namespace_packages, setup
import os
import sys
import re

# require python 3.6 or newer
if sys.version_info < (3, 6):
    print('Error: dbt does not support this version of Python.')
    print('Please upgrade to Python 3.6 or higher.')
    sys.exit(1)


# require version of setuptools that supports find_namespace_packages
from setuptools import setup
try:
    from setuptools import find_namespace_packages
except ImportError:
    # the user has a downlevel version of setuptools.
    print('Error: dbt requires setuptools v40.1.0 or higher.')
    print('Please upgrade setuptools with "pip install --upgrade setuptools" '
          'and try again')
    sys.exit(1)


# pull long description from README
this_directory = os.path.abspath(os.path.dirname(__file__))
with open(os.path.join(this_directory, 'README.md')) as f:
    long_description = f.read()


package_name = "dbt-spark"


# get this from a separate file
def _dbt_spark_version():
# get this package's version from dbt/adapters/<name>/__version__.py
def _get_plugin_version_dict():
    _version_path = os.path.join(
        this_directory, 'dbt', 'adapters', 'spark', '__version__.py'
    )
    _version_pattern = r'''version\s*=\s*["'](.+)["']'''
    _semver = r'''(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)'''
    _pre = r'''((?P<prekind>a|b|rc)(?P<pre>\d+))?'''
    _version_pattern = fr'''version\s*=\s*["']{_semver}{_pre}["']'''
    with open(_version_path) as f:
        match = re.search(_version_pattern, f.read().strip())
        if match is None:
            raise ValueError(f'invalid version at {_version_path}')
        return match.group(1)
        return match.groupdict()


package_version = _dbt_spark_version()
description = """The SparkSQL plugin for dbt (data build tool)"""
def _get_plugin_version():
    parts = _get_plugin_version_dict()
    return "{major}.{minor}.{patch}{prekind}{pre}".format(**parts)

dbt_version = '0.20.0rc2'
# the package version should be the dbt version, with maybe some things on the
# ends of it. (0.20.0rc2 vs 0.20.0rc2a1, 0.20.0rc2.1, ...)
if not package_version.startswith(dbt_version):
    raise ValueError(
        f'Invalid setup.py: package_version={package_version} must start with '
        f'dbt_version={dbt_version}'
    )

# require a compatible minor version (~=), prerelease if this is a prerelease
def _get_dbt_core_version():
    parts = _get_plugin_version_dict()
    minor = "{major}.{minor}.0".format(**parts)
    pre = (parts["prekind"]+"1" if parts["prekind"] else "")
    return f"{minor}{pre}"


package_name = "dbt-spark"
package_version = _get_plugin_version()
dbt_core_version = _get_dbt_core_version()
description = """The Apache Spark adapter plugin for dbt"""

odbc_extras = ['pyodbc>=4.0.30']
pyhive_extras = [
@@ -52,14 +76,14 @@ def _dbt_spark_version():
    long_description=long_description,
    long_description_content_type='text/markdown',

    author='Fishtown Analytics',
    author_email='info@fishtownanalytics.com',
    url='https://github.com/fishtown-analytics/dbt-spark',
    author='dbt Labs',
    author_email='info@dbtlabs.com',
    url='https://github.com/dbt-labs/dbt-spark',

    packages=find_namespace_packages(include=['dbt', 'dbt.*']),
    include_package_data=True,
    install_requires=[
        f'dbt-core=={dbt_version}',
        'dbt-core~={}'.format(dbt_core_version),
        'sqlparams>=3.0.0',
    ],
    extras_require={
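Putting the new helpers together: the regex splits the plugin version into semver plus optional prerelease parts, `_get_plugin_version` reassembles it for the package metadata, and `_get_dbt_core_version` derives the compatible-release pin. A standalone sketch of that flow with an illustrative version string (regex copied from the diff; the sample value is an assumption):

```python
import re

_semver = r'''(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)'''
_pre = r'''((?P<prekind>a|b|rc)(?P<pre>\d+))?'''
_version_pattern = fr'''version\s*=\s*["']{_semver}{_pre}["']'''

# illustrative contents of dbt/adapters/spark/__version__.py
sample = 'version = "0.21.0rc2"'

parts = re.search(_version_pattern, sample).groupdict()

# package version: exactly what the version module declares
package_version = "{major}.{minor}.{patch}{prekind}{pre}".format(**parts)  # '0.21.0rc2'

# dbt-core pin: same minor series, first prerelease if this is a prerelease
minor = "{major}.{minor}.0".format(**parts)                                # '0.21.0'
pre = parts["prekind"] + "1" if parts["prekind"] else ""                   # 'rc1'
print(f"dbt-core~={minor}{pre}")                                           # dbt-core~=0.21.0rc1
```

So a prerelease of the plugin (for example `0.21.0rc2`) resolves to `dbt-core~=0.21.0rc1` — any dbt-core in the 0.21.x series at or past its first release candidate — rather than the exact `dbt-core==<version>` pin the removed code enforced.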
26 changes: 11 additions & 15 deletions tox.ini
@@ -8,44 +8,41 @@ basepython = python3
commands = /bin/bash -c '$(which flake8) --select=E,W,F --ignore=W504 dbt/'
passenv = DBT_INVOCATION_ENV
deps =
    -r{toxinidir}/dev_requirements.txt
    -rdev_requirements.txt

[testenv:unit]
basepython = python3
commands = /bin/bash -c '{envpython} -m pytest -v {posargs} test/unit'
passenv = DBT_INVOCATION_ENV
deps =
    -r{toxinidir}/requirements.txt
    -r{toxinidir}/dev_requirements.txt
    -rdev_requirements.txt
    -e.[all]

[testenv:integration-spark-databricks-http]
basepython = python3
commands = /bin/bash -c '{envpython} -m pytest -v test/integration/spark-databricks-http.dbtspec'
passenv = DBT_DATABRICKS_HOST_NAME DBT_DATABRICKS_CLUSTER_NAME DBT_DATABRICKS_TOKEN DBT_INVOCATION_ENV
deps =
    -r{toxinidir}/requirements.txt
    -r{toxinidir}/dev_requirements.txt
    -e.
    -rdev_requirements.txt
    -e.[all]

[testenv:integration-spark-databricks-odbc-cluster]
basepython = python3
commands = /bin/bash -c '{envpython} -m pytest -v test/integration/spark-databricks-odbc-cluster.dbtspec'
           /bin/bash -c '{envpython} -m pytest -v -m profile_databricks_cluster {posargs} -n4 test/custom/*'
passenv = DBT_DATABRICKS_HOST_NAME DBT_DATABRICKS_CLUSTER_NAME DBT_DATABRICKS_TOKEN DBT_INVOCATION_ENV ODBC_DRIVER
deps =
    -r{toxinidir}/requirements.txt
    -r{toxinidir}/dev_requirements.txt
    -e.
    -rdev_requirements.txt
    -e.[all]

[testenv:integration-spark-databricks-odbc-sql-endpoint]
basepython = python3
commands = /bin/bash -c '{envpython} -m pytest -v test/integration/spark-databricks-odbc-sql-endpoint.dbtspec'
           /bin/bash -c '{envpython} -m pytest -v -m profile_databricks_sql_endpoint {posargs} -n4 test/custom/*'
passenv = DBT_DATABRICKS_HOST_NAME DBT_DATABRICKS_ENDPOINT DBT_DATABRICKS_TOKEN DBT_INVOCATION_ENV ODBC_DRIVER
deps =
    -r{toxinidir}/requirements.txt
    -r{toxinidir}/dev_requirements.txt
    -e.
    -rdev_requirements.txt
    -e.[all]


[testenv:integration-spark-thrift]
@@ -54,6 +51,5 @@ commands = /bin/bash -c '{envpython} -m pytest -v test/integration/spark-thrift.
           /bin/bash -c '{envpython} -m pytest -v -m profile_apache_spark {posargs} -n4 test/custom/*'
passenv = DBT_INVOCATION_ENV
deps =
    -r{toxinidir}/requirements.txt
    -r{toxinidir}/dev_requirements.txt
    -e.
    -rdev_requirements.txt
    -e.[all]
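These tox environments separate fast checks from cluster-backed integration runs. A hedged invocation sketch — environment names are taken from the `[testenv:...]` sections above, and the credential variables mirror each `passenv` list with placeholder values:

```sh
pip install tox

# unit tests: no Spark cluster required
tox -e unit

# Databricks ODBC cluster suite: export the variables listed in passenv first
export DBT_DATABRICKS_HOST_NAME=... DBT_DATABRICKS_CLUSTER_NAME=...
export DBT_DATABRICKS_TOKEN=... ODBC_DRIVER=...
tox -e integration-spark-databricks-odbc-cluster
```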
