Commit c81ecc2: first commit
hurcy committed Jan 29, 2024 (0 parents)
Showing 12 changed files with 385 additions and 0 deletions.
117 changes: 117 additions & 0 deletions .github/workflows/publish-to-test-pypi.yml
@@ -0,0 +1,117 @@
name: Publish Python 🐍 distribution 📦 to PyPI and TestPyPI

on: push

jobs:
  build:
    name: Build distribution 📦
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.x"

      - name: Install pypa/build
        run: >-
          python3 -m
          pip install
          build
          --user
      - name: Build a binary wheel and a source tarball
        run: python3 -m build
      - name: Store the distribution packages
        uses: actions/upload-artifact@v3
        with:
          name: python-package-distributions
          path: dist/

  publish-to-pypi:
    name: >-
      Publish Python 🐍 distribution 📦 to PyPI
    if: startsWith(github.ref, 'refs/tags/')  # only publish to PyPI on tag pushes
    needs:
      - build
    runs-on: ubuntu-latest
    environment:
      name: pypi
      url: https://pypi.org/p/sparksql-jupyter
    permissions:
      id-token: write  # IMPORTANT: mandatory for trusted publishing
    steps:
      - name: Download all the dists
        uses: actions/download-artifact@v3
        with:
          name: python-package-distributions
          path: dist/
      - name: Publish distribution 📦 to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1

  github-release:
    name: >-
      Sign the Python 🐍 distribution 📦 with Sigstore
      and upload them to GitHub Release
    needs:
      - publish-to-pypi
    runs-on: ubuntu-latest

    permissions:
      contents: write  # IMPORTANT: mandatory for making GitHub Releases
      id-token: write  # IMPORTANT: mandatory for sigstore

    steps:
      - name: Download all the dists
        uses: actions/download-artifact@v3
        with:
          name: python-package-distributions
          path: dist/
      - name: Sign the dists with Sigstore
        uses: sigstore/gh-action-sigstore-python@v1.2.3
        with:
          inputs: >-
            ./dist/*.tar.gz
            ./dist/*.whl
      - name: Create GitHub Release
        env:
          GITHUB_TOKEN: ${{ github.token }}
        run: >-
          gh release create
          '${{ github.ref_name }}'
          --repo '${{ github.repository }}'
          --notes ""
      - name: Upload artifact signatures to GitHub Release
        env:
          GITHUB_TOKEN: ${{ github.token }}
        # Upload to GitHub Release using the `gh` CLI.
        # `dist/` contains the built packages, and the
        # sigstore-produced signatures and certificates.
        run: >-
          gh release upload
          '${{ github.ref_name }}' dist/**
          --repo '${{ github.repository }}'

  publish-to-testpypi:
    name: Publish Python 🐍 distribution 📦 to TestPyPI
    needs:
      - build
    runs-on: ubuntu-latest

    environment:
      name: testpypi
      url: https://test.pypi.org/p/sparksql-jupyter

    permissions:
      id-token: write  # IMPORTANT: mandatory for trusted publishing

    steps:
      - name: Download all the dists
        uses: actions/download-artifact@v3
        with:
          name: python-package-distributions
          path: dist/
      - name: Publish distribution 📦 to TestPyPI
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          repository-url: https://test.pypi.org/legacy/
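
The `build` job above produces the `dist/` artifacts that every publish job consumes. To reproduce that step locally before pushing a tag, the same invocation can be scripted (a minimal sketch, not part of this commit; it assumes pypa/build is installed, e.g. via `pip install build`):

```python
# Minimal local sketch of the workflow's build step.
# Assumes pypa/build is installed: pip install build
import subprocess
import sys

# Same command as the "Build a binary wheel and a source tarball" step;
# it writes an sdist (.tar.gz) and a wheel (.whl) into dist/.
subprocess.run([sys.executable, "-m", "build"], check=True)
```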
8 changes: 8 additions & 0 deletions .gitignore
@@ -0,0 +1,8 @@
.DS_Store
.vscode/
.idea/
__pycache__/
build/
dist/
*.egg-info/
.python-version
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2019 Cinyoung Hur

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
19 changes: 19 additions & 0 deletions Makefile
@@ -0,0 +1,19 @@
.PHONY: prepare
prepare:
	# pyenv local 3.10.10

.PHONY: test
test: prepare
	pytest

.PHONY: clean
clean:
	rm -rf sparksql_jupyter.egg-info build dist

.PHONY: dist
dist: clean
	python setup.py sdist bdist_wheel

.PHONY: upload
upload: dist
	twine upload dist/*
50 changes: 50 additions & 0 deletions README.md
@@ -0,0 +1,50 @@
# sparksql-jupyter

Spark SQL magic command for Jupyter notebooks.

![Example](screenshots/example.png)

## Prerequisites
- Python >= 3.6
- PySpark >= 2.3.0
- IPython >= 7.4.0
- Jinja2 >= 3.0.0

## Install
```
pip install sparksql-jupyter
```

## Usage

### Load
```
%load_ext sparksql_jupyter
```

### Config
```
%config SparkSql.limit=<INT>
```

|Option|Default|Description|
|---|---|---|
|`SparkSql.limit`|20|The maximum number of rows to display|

### Parameter
```
%%sparksql [-c|--cache] [-e|--eager] [-v|--view VIEW] [-l|--limit LIMIT] [-j|--jinja] [variable]
<QUERY>
```

|Parameter|Description|
|---|---|
|`-c` `--cache`|Cache dataframe|
|`-e` `--eager`|Cache dataframe with eager load|
|`-v VIEW` `--view VIEW`|Create or replace temporary view|
|`-l LIMIT` `--limit LIMIT`|The maximum number of rows to display (Default: `SparkSql.limit`)|
|`variable`|Capture dataframe in a local variable|
|`-j` `--jinja`|Capture dataframe in a local variable from a Jinja2 template|
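
### Example
A hypothetical cell combining several of the options above (`dl.sales`, `month`, and `df_sales` are illustrative names, not part of this package):
```
%%sparksql -v sales_view -l 10 df_sales
SELECT *
FROM dl.sales
WHERE month = {{month}}
```
The `{{month}}` placeholder is rendered from the local variable `month` before execution; the result is registered as the temporary view `sales_view`, captured in `df_sales`, and at most 10 rows are displayed.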



Binary file added screenshots/example.png
2 changes: 2 additions & 0 deletions setup.cfg
@@ -0,0 +1,2 @@
[bdist_wheel]
python-tag = py36
26 changes: 26 additions & 0 deletions setup.py
@@ -0,0 +1,26 @@
from setuptools import find_packages, setup

import sparksql_jupyter

setup(
    name='sparksql-jupyter',
    version=sparksql_jupyter.__version__,
    description='Spark SQL magic command for Jupyter notebooks',
    long_description=open('README.md', 'r').read(),
    long_description_content_type='text/markdown',
    author='Chaerim Yeo',
    author_email='yeochaerim@gmail.com',
    url='https://github.com/cryeo/sparksql-jupyter',
    license='MIT License',
    install_requires=['pyspark>=2.3.0', 'ipython>=7.4.0', 'jinja2>=3.0.0'],
    packages=find_packages(exclude=('tests', 'docs')),
    python_requires='>=3.6',
    classifiers=[
        'Development Status :: 2 - Pre-Alpha',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.6',
        'Topic :: Software Development :: Libraries :: Python Modules',
    ],
)
7 changes: 7 additions & 0 deletions sparksql_magic/__init__.py
@@ -0,0 +1,7 @@
__version__ = '0.0.4'

from .sparksql import SparkSql


def load_ipython_extension(ipython):
    ipython.register_magics(SparkSql)
93 changes: 93 additions & 0 deletions sparksql_magic/sparksql.py
@@ -0,0 +1,93 @@
import re
from html import escape

from IPython.core.display import HTML
from IPython.core.magic import Magics, cell_magic, magics_class, needs_local_scope
from IPython.core.magic_arguments import argument, magic_arguments, parse_argstring
from pyspark.sql import SparkSession
from traitlets import Int
from jinja2 import Environment, StrictUndefined


@magics_class
class SparkSql(Magics):
    limit = Int(20, config=True, help='The maximum number of rows to display')

    @needs_local_scope
    @cell_magic
    @magic_arguments()
    @argument('variable', nargs='?', type=str, help='Capture dataframe in a local variable')
    @argument('-j', '--jinja', nargs='?', type=str, help='Capture dataframe in a local variable with jinja2 template')
    @argument('-c', '--cache', action='store_true', help='Cache dataframe')
    @argument('-e', '--eager', action='store_true', help='Cache dataframe with eager load')
    @argument('-v', '--view', type=str, help='Create or replace temporary view')
    @argument('-l', '--limit', type=int, help='The maximum number of rows to display')
    def sparksql(self, line='', cell='', local_ns=None):
        if local_ns is None:
            local_ns = {}

        user_ns = self.shell.user_ns.copy()
        user_ns.update(local_ns)

        args = parse_argstring(self.sparksql, line)

        spark = get_instantiated_spark_session()

        if spark is None:
            print('no active Spark session found')
            return

        df = spark.sql(bind_variables(cell, user_ns))
        if args.cache or args.eager:
            print('cache dataframe with %s load' % ('eager' if args.eager else 'lazy'))
            df = df.cache()
            if args.eager:
                df.count()
        if args.view:
            print('create temporary view `%s`' % args.view)
            df.createOrReplaceTempView(args.view)
        if args.variable:
            print('capture dataframe to local variable `%s`' % args.variable)
            self.shell.user_ns.update({args.variable: df})

        limit = args.limit or self.limit
        header, contents = get_results(df, limit)
        if len(contents) > limit:
            print('only showing top %d row(s)' % limit)

        html = make_tag('tr',
                        ''.join(map(lambda x: make_tag('td', escape(x), style='font-weight: bold'), header)),
                        style='border-bottom: 1px solid')
        for index, row in enumerate(contents[:limit]):
            html += make_tag('tr', ''.join(map(lambda x: make_tag('td', escape(x)), row)))

        return HTML(make_tag('table', html))


def bind_variables(query, user_ns):
    # Render the query as a Jinja2 template against the user namespace;
    # StrictUndefined makes a missing variable raise instead of rendering ''.
    env = Environment(undefined=StrictUndefined)
    return env.from_string(query).render(user_ns)


def get_results(df, limit):
    def convert_value(value):
        if value is None:
            return 'null'
        return str(value)

    header = df.columns
    # take(limit + 1) fetches one extra row so the caller can tell
    # whether the output was truncated.
    contents = list(map(lambda row: list(map(convert_value, row)), df.take(limit + 1)))

    return header, contents


def make_tag(tag_name, body='', **kwargs):
    attributes = ' '.join(map(lambda x: '%s="%s"' % x, kwargs.items()))
    if attributes:
        return '<%s %s>%s</%s>' % (tag_name, attributes, body, tag_name)
    else:
        return '<%s>%s</%s>' % (tag_name, body, tag_name)


def get_instantiated_spark_session():
    # Private PySpark attribute holding the active session, or None
    # if no SparkSession has been created yet.
    return SparkSession._instantiatedSession
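
The Jinja rendering in `bind_variables` is what makes `{{...}}` placeholders work in queries. A standalone sketch of that behavior (not part of this commit; uses only jinja2, so it runs without Spark):

```python
# Standalone sketch of bind_variables' behavior (jinja2 only, no Spark).
from jinja2 import Environment, StrictUndefined, exceptions

env = Environment(undefined=StrictUndefined)

# Defined placeholders are substituted from the namespace...
print(env.from_string("SELECT * FROM {{schema}}.t").render({"schema": "dl"}))
# -> SELECT * FROM dl.t

# ...while StrictUndefined turns a missing name into an error
# instead of silently rendering an empty string.
try:
    env.from_string("SELECT {{missing}}").render({})
except exceptions.UndefinedError as err:
    print("undefined variable:", err)
```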
Empty file added tests/__init__.py
42 changes: 42 additions & 0 deletions tests/test_sparksql.py
@@ -0,0 +1,42 @@
import pytest
from pyspark.sql import Row
from unittest import mock
from sparksql_jupyter.sparksql import bind_variables, get_results, make_tag
from jinja2 import exceptions

@pytest.fixture
def df():
    return mock.MagicMock(
        columns=['col1', 'col2', 'col3'],
        take=lambda n: [Row(col1=i, col2=str(i), col3=None) for i in range(n)]
    )


@pytest.fixture
def user_ns():
    return {
        'month': 5,
        'day': '10',
        'schema': 'dl',
        'start_var': "'2019-05-01'",
        'end_var': "'2019-05-31'"
    }


def test_get_results(df):
    assert get_results(df, 0) == (['col1', 'col2', 'col3'], [['0', '0', 'null']])
    assert get_results(df, 1) == (['col1', 'col2', 'col3'], [['0', '0', 'null'], ['1', '1', 'null']])


def test_bind_variables(user_ns):
    assert bind_variables('SELECT * FROM table', user_ns) == 'SELECT * FROM table'
    assert bind_variables("SELECT * FROM {{schema}}.table WHERE lsh_dtm between {{start_var}} and {{end_var}}", user_ns) == "SELECT * FROM dl.table WHERE lsh_dtm between '2019-05-01' and '2019-05-31'"

    with pytest.raises(exceptions.UndefinedError):
        bind_variables("SELECT * FROM table WHERE hour='{{hour}}'", user_ns)


def test_make_tag():
    assert make_tag('td') == '<td></td>'
    assert make_tag('td', 'body') == '<td>body</td>'
    assert make_tag('td', 'body', style='font-weight: bold') == '<td style="font-weight: bold">body</td>'