diff --git a/LICENSE.txt b/LICENSE.txt index fc6d123..ecc5369 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,4 +1,5 @@ Copyright (c) 2015 Min RK, Florian Rathgeber, Michael McNeil Forbes +2019 Casper da Costa-Luis Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the diff --git a/README.rst b/README.rst index 331aed7..d72805d 100644 --- a/README.rst +++ b/README.rst @@ -154,6 +154,20 @@ This is the same metadata used by the `init_cell nbextension`__. __ https://github.com/ipython-contrib/jupyter_contrib_nbextensions/tree/master/src/jupyter_contrib_nbextensions/nbextensions/init_cell +Stripping metadata +++++++++++++++++++ + +Extra metadata keys to strip can be configured via ``git config (--global) filter.nbstripout.extrakeys``. +An example would be:: + + git config --global filter.nbstripout.extrakeys ' + metadata.celltoolbar metadata.kernelspec.display_name + metadata.kernelspec.name metadata.language_info.codemirror_mode.version + metadata.language_info.pygments_lexer metadata.language_info.version + metadata.toc metadata.notify_time metadata.varInspector + cell.metadata.heading_collapsed cell.metadata.hidden + cell.metadata.code_folding cell.metadata.tags cell.metadata.init_cell' + Manual filter installation ========================== diff --git a/nbstripout/__init__.py b/nbstripout/__init__.py new file mode 100644 index 0000000..4c1bbab --- /dev/null +++ b/nbstripout/__init__.py @@ -0,0 +1,4 @@ +from ._nbstripout import install, uninstall, status, main +from ._utils import pop_recursive, strip_output +__all__ = ["install", "uninstall", "status", "main", + "pop_recursive", "strip_output"] diff --git a/nbstripout.py b/nbstripout/_nbstripout.py similarity index 79% rename from nbstripout.py rename to nbstripout/_nbstripout.py index 645bf15..0300d91 100755 --- a/nbstripout.py +++ b/nbstripout/_nbstripout.py @@ -85,29 +85,11 @@ *.ipynb diff=ipynb """ - from __future__ import print_function from argparse import 
ArgumentParser, RawDescriptionHelpFormatter import io import sys - -input_stream = None -if sys.version_info < (3, 0): - import codecs - # Use UTF8 reader/writer for stdin/stdout - # http://stackoverflow.com/a/1169209 - if sys.stdin: - input_stream = codecs.getreader('utf8')(sys.stdin) - output_stream = codecs.getwriter('utf8')(sys.stdout) -else: - # Wrap input/output stream in UTF-8 encoded text wrapper - # https://stackoverflow.com/a/16549381 - if sys.stdin: - input_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') - output_stream = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') - -__version__ = '0.3.3' - +from nbstripout._utils import strip_output try: # Jupyter >= 4 from nbformat import read, write, NO_CONVERT @@ -131,67 +113,8 @@ def read(f, as_version): def write(nb, f): return current.write(nb, f, 'json') - -def _cells(nb): - """Yield all cells in an nbformat-insensitive manner""" - if nb.nbformat < 4: - for ws in nb.worksheets: - for cell in ws.cells: - yield cell - else: - for cell in nb.cells: - yield cell - - -def strip_output(nb, keep_output, keep_count): - """ - Strip the outputs, execution count/prompt number and miscellaneous - metadata from a notebook object, unless specified to keep either the outputs - or counts. - """ - - nb.metadata.pop('signature', None) - nb.metadata.pop('widgets', None) - - for cell in _cells(nb): - - keep_output_this_cell = keep_output - - # Keep the output for these cells, but strip count and metadata - if cell.metadata.get('init_cell') or cell.metadata.get('keep_output'): - keep_output_this_cell = True - - # Remove the outputs, unless directed otherwise - if 'outputs' in cell: - - # Default behavior strips outputs. With all outputs stripped, - # there are no counts to keep and keep_count is ignored. - if not keep_output_this_cell: - cell['outputs'] = [] - - # If keep_output_this_cell, but not keep_count, strip the counts - # from the output. 
- if keep_output_this_cell and not keep_count: - for output in cell['outputs']: - if 'execution_count' in output: - output['execution_count'] = None - - # If keep_output_this_cell and keep_count, do nothing. - - # Remove the prompt_number/execution_count, unless directed otherwise - if 'prompt_number' in cell and not keep_count: - cell['prompt_number'] = None - if 'execution_count' in cell and not keep_count: - cell['execution_count'] = None - - # Always remove this metadata - for output_style in ['collapsed', 'scrolled']: - if output_style in cell.metadata: - cell.metadata[output_style] = False - if 'metadata' in cell: - for field in ['collapsed', 'scrolled', 'ExecuteTime']: - cell.metadata.pop(field, None) - return nb +__all__ = ["install", "uninstall", "status", "main"] +__version__ = '0.3.3' def install(attrfile=None): @@ -273,6 +196,10 @@ def status(verbose=False): diff = check_output(['git', 'config', 'diff.ipynb.textconv']).strip() attributes = check_output(['git', 'check-attr', 'filter', '--', '*.ipynb']).strip() diff_attributes = check_output(['git', 'check-attr', 'diff', '--', '*.ipynb']).strip() + try: + extra_keys = check_output(['git', 'config', 'filter.nbstripout.extrakeys']).strip() + except CalledProcessError: + extra_keys = '' if attributes.endswith(b'unspecified'): if verbose: print('nbstripout is not installed in repository', git_dir) @@ -284,6 +211,7 @@ def status(verbose=False): print(' smudge =', smudge) print(' required =', required) print(' diff=', diff) + print(' extrakeys=', extra_keys) print('\nAttributes:\n ', attributes) print('\nDiff Attributes:\n ', diff_attributes) return 0 @@ -294,6 +222,7 @@ def status(verbose=False): def main(): + from subprocess import check_output, CalledProcessError parser = ArgumentParser(epilog=__doc__, formatter_class=RawDescriptionHelpFormatter) task = parser.add_mutually_exclusive_group() task.add_argument('--install', action='store_true', @@ -336,13 +265,33 @@ def main(): print(__version__) sys.exit(0) + 
try:
+        extra_keys = check_output(['git', 'config', 'filter.nbstripout.extrakeys']).strip()
+    except (CalledProcessError, OSError):  # OSError: git is not installed
+        extra_keys = ''
+
+    input_stream = None
+    if sys.version_info < (3, 0):
+        import codecs
+        # Use UTF8 reader/writer for stdin/stdout
+        # http://stackoverflow.com/a/1169209
+        if sys.stdin:
+            input_stream = codecs.getreader('utf8')(sys.stdin)
+        output_stream = codecs.getwriter('utf8')(sys.stdout)
+    else:
+        # Wrap input/output stream in UTF-8 encoded text wrapper
+        # https://stackoverflow.com/a/16549381
+        if sys.stdin:
+            input_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
+        output_stream = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
+
     for filename in args.files:
         if not (args.force or filename.endswith('.ipynb')):
             continue
         try:
             with io.open(filename, 'r', encoding='utf8') as f:
                 nb = read(f, as_version=NO_CONVERT)
-            nb = strip_output(nb, args.keep_output, args.keep_count)
+            nb = strip_output(nb, args.keep_output, args.keep_count, extra_keys)
             if args.textconv:
                 write(nb, output_stream)
                 output_stream.flush()
@@ -360,7 +309,7 @@ def main():
     if not args.files and input_stream:
         try:
             nb = strip_output(read(input_stream, as_version=NO_CONVERT),
-                              args.keep_output, args.keep_count)
+                              args.keep_output, args.keep_count, extra_keys)
             write(nb, output_stream)
             output_stream.flush()
         except NotJSONError:
diff --git a/nbstripout/_utils.py b/nbstripout/_utils.py
new file mode 100644
index 0000000..4343256
--- /dev/null
+++ b/nbstripout/_utils.py
@@ -0,0 +1,118 @@
+import sys
+
+__all__ = ["pop_recursive", "strip_output"]
+
+
+def pop_recursive(d, key, default=None):
+    """dict.pop(key) where `key` is a `.`-delimited list of nested keys.
+
+    Returns `default` (leaving `d` untouched) if any part of the path is
+    missing, or if a value along the path is not dict-like (e.g. a list).
+
+    >>> d = {'a': {'b': 1, 'c': 2}}
+    >>> pop_recursive(d, 'a.c')
+    2
+    >>> d
+    {'a': {'b': 1}}
+    >>> pop_recursive({'a': ['b']}, 'a.b', 'missing')
+    'missing'
+    """
+    nested = key.split('.')
+    current = d
+    for k in nested[:-1]:
+        if hasattr(current, 'get'):
+            current = current.get(k, {})
+        else:
+            return default
+    # Lists and sets also have `pop`, but with a different signature, so
+    # require `get` as well to make sure we only ever pop from mappings.
+    if not (hasattr(current, 'get') and hasattr(current, 'pop')):
+        return default
+    return current.pop(nested[-1], default)
+
+
+def _cells(nb):
+    """Yield all cells in an nbformat-insensitive manner"""
+    if nb.nbformat < 4:
+        for ws in nb.worksheets:
+            for cell in ws.cells:
+                yield cell
+    else:
+        for cell in nb.cells:
+            yield cell
+
+
+def strip_output(nb, keep_output, keep_count, extra_keys=''):
+    """
+    Strip the outputs, execution count/prompt number and miscellaneous
+    metadata from a notebook object, unless specified to keep either the outputs
+    or counts.
+
+    `extra_keys` is a whitespace-separated string (or bytes, as returned by
+    `check_output`) of additional `.`-delimited keys to remove, for example
+    'metadata.foo cell.metadata.bar metadata.baz'. Keys prefixed with
+    `metadata.` are popped from the notebook metadata and keys prefixed with
+    `cell.metadata.` from the metadata of every cell; any other key is
+    ignored with a warning on stderr.
+
+    The notebook is modified in place and returned.
+    """
+    if hasattr(extra_keys, 'decode'):
+        extra_keys = extra_keys.decode()
+    extra_keys = extra_keys.split()
+    keys = {'metadata': [], 'cell': {'metadata': []}}
+    for key in extra_keys:
+        if key.startswith('metadata.'):
+            keys['metadata'].append(key[len('metadata.'):])
+        elif key.startswith('cell.metadata.'):
+            keys['cell']['metadata'].append(key[len('cell.metadata.'):])
+        else:
+            sys.stderr.write('ignoring extra key `%s`\n' % key)
+
+    nb.metadata.pop('signature', None)
+    nb.metadata.pop('widgets', None)
+    for field in keys['metadata']:
+        pop_recursive(nb.metadata, field)
+
+    for cell in _cells(nb):
+        keep_output_this_cell = keep_output
+
+        # Keep the output for these cells, but strip count and metadata
+        if cell.metadata.get('init_cell') or cell.metadata.get('keep_output'):
+            keep_output_this_cell = True
+
+        # Remove the outputs, unless directed otherwise
+        if 'outputs' in cell:
+
+            # Default behavior strips outputs. With all outputs stripped,
+            # there are no counts to keep and keep_count is ignored.
+            if not keep_output_this_cell:
+                cell['outputs'] = []
+
+            # If keep_output_this_cell, but not keep_count, strip the counts
+            # from the output.
+            if keep_output_this_cell and not keep_count:
+                for output in cell['outputs']:
+                    if 'execution_count' in output:
+                        output['execution_count'] = None
+
+            # If keep_output_this_cell and keep_count, do nothing.
+
+        # Remove the prompt_number/execution_count, unless directed otherwise
+        if 'prompt_number' in cell and not keep_count:
+            cell['prompt_number'] = None
+        if 'execution_count' in cell and not keep_count:
+            cell['execution_count'] = None
+
+        # Always remove this metadata
+        for output_style in ['collapsed', 'scrolled']:
+            if output_style in cell.metadata:
+                cell.metadata[output_style] = False
+        if 'metadata' in cell:
+            for field in ['collapsed', 'scrolled', 'ExecuteTime']:
+                cell.metadata.pop(field, None)
+        for (extra, fields) in keys['cell'].items():
+            if extra in cell:
+                for field in fields:
+                    pop_recursive(cell[extra], field)
+    return nb
diff --git a/setup.py b/setup.py
index c30568b..6b6f208 100644
--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,4 @@
-from setuptools import setup
+from setuptools import setup, find_packages
 
 with open('README.rst') as f:
     long_description = f.read()
@@ -32,10 +32,11 @@
       description='Strips outputs from Jupyter and IPython notebooks',
       long_description=long_description,
 
-      py_modules=['nbstripout'],
+      packages=find_packages(),
+      provides=['nbstripout'],
       entry_points={
           'console_scripts': [
-              'nbstripout = nbstripout:main'
+              'nbstripout = nbstripout._nbstripout:main'
           ],
       },
 
diff --git a/tests/test-git.t b/tests/test-git.t
index 5d6674a..57843de 100644
--- a/tests/test-git.t
+++ b/tests/test-git.t
@@ -1,17 +1,43 @@
   $ git init foobar
   Initialized empty Git repository in .* (re)
   $ cd foobar
+  $ git config --local filter.nbstripout.extrakeys ' '
   $ echo -n "*.txt text" >> .git/info/attributes
   $ ${NBSTRIPOUT_EXE:-nbstripout} --is-installed
   [1]
   $ ${NBSTRIPOUT_EXE:-nbstripout} --install
   $ 
${NBSTRIPOUT_EXE:-nbstripout} --is-installed $ git diff --no-index --no-ext-diff --unified=0 --exit-code -a --no-prefix ${TESTDIR}/test_diff.ipynb ${TESTDIR}/test_diff_output.ipynb - $ git diff --no-index ${TESTDIR}/test_diff.ipynb ${TESTDIR}/test_diff_different.ipynb + $ git diff --no-index ${TESTDIR}/test_diff.ipynb ${TESTDIR}/test_diff_different_extrakeys.ipynb (diff --git.*) (re) (index .*) (re) (--- .*test_diff.ipynb) (re) - (\+\+\+ .*test_diff_different.ipynb) (re) + (\+\+\+ .*test_diff_different_extrakeys.ipynb) (re) + @@ -6,15 +6,14 @@ + "metadata": {}, + "outputs": [], + "source": [ + - "print(\"aou\")" + + "print(\"aou now it is different\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + - "language": "python", + - "name": "python2" + + "language": "python" + }, + "language_info": { + "codemirror_mode": { + [1] + $ git config --local filter.nbstripout.extrakeys 'cell.metadata.collapsed metadata.kernelspec.name' + $ git diff --no-index ${TESTDIR}/test_diff.ipynb ${TESTDIR}/test_diff_different_extrakeys.ipynb + (diff --git.*) (re) + (index .*) (re) + (--- .*test_diff.ipynb) (re) + (\+\+\+ .*test_diff_different_extrakeys.ipynb) (re) @@ -6,7 +6,7 @@ "metadata": {}, "outputs": [], diff --git a/tests/test_diff_different_extrakeys.ipynb b/tests/test_diff_different_extrakeys.ipynb new file mode 100644 index 0000000..2dba5e9 --- /dev/null +++ b/tests/test_diff_different_extrakeys.ipynb @@ -0,0 +1,33 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"aou now it is different\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.11" + } + }, + "nbformat": 4, 
+ "nbformat_minor": 0
+}
diff --git a/tests/test_utils.py b/tests/test_utils.py
new file mode 100644
index 0000000..a2daf75
--- /dev/null
+++ b/tests/test_utils.py
@@ -0,0 +1,16 @@
+from nbstripout._utils import pop_recursive
+
+
+def test_pop_recursive():
+    d = {'a': {'b': 1, 'c': 2}}
+    assert pop_recursive(d, 'a.c') == 2
+    assert d == {'a': {'b': 1}}
+
+
+def test_pop_recursive_missing():
+    """Missing paths return the default and leave the dict untouched."""
+    d = {'a': {'b': 1}}
+    assert pop_recursive(d, 'a.x') is None
+    assert pop_recursive(d, 'x.y', 'default') == 'default'
+    assert pop_recursive(d, 'a.b.c', 0) == 0
+    assert d == {'a': {'b': 1}}