diff --git a/compyle/config.py b/compyle/config.py index c3ed6c0..750126f 100644 --- a/compyle/config.py +++ b/compyle/config.py @@ -15,6 +15,7 @@ def __init__(self): self._use_double = None self._omp_schedule = None self._profile = None + self._count_flops = None self._use_local_memory = None self._wgs = None self._suppress_warnings = None @@ -129,6 +130,19 @@ def profile(self, value): def _profile_default(self): return False + @property + def count_flops(self): + if self._count_flops is None: + self._count_flops = self._count_flops_default() + return self._count_flops + + @count_flops.setter + def count_flops(self, value): + self._count_flops = value + + def _count_flops_default(self): + return False + @property def use_local_memory(self): if self._use_local_memory is None: diff --git a/compyle/jit.py b/compyle/jit.py index ee70486..1fc8abb 100644 --- a/compyle/jit.py +++ b/compyle/jit.py @@ -14,7 +14,7 @@ dtype_to_knowntype, annotate) from .extern import Extern from .utils import getsourcelines -from .profile import profile +from .profile import record_flops, profile from . import array from . import parallel @@ -265,13 +265,9 @@ def visit_UnaryOp(self, node): return self.visit(node.operand) def visit_Return(self, node): - if isinstance(node.value, ast.Name) or \ - isinstance(node.value, ast.Subscript) or \ - isinstance(node.value, ast.Num) or \ - isinstance(node.value, ast.BinOp) or \ - isinstance(node.value, ast.Call) or \ - isinstance(node.value, ast.IfExp) or \ - isinstance(node.value, ast.UnaryOp): + valid_return_expr = (ast.Name, ast.Subscript, ast.Num, ast.BinOp, + ast.Call, ast.IfExp, ast.UnaryOp) + if isinstance(node.value, valid_return_expr): result_type = self.visit(node.value) if result_type: self.arg_types['return_'] = result_type @@ -287,11 +283,12 @@ def visit_Return(self, node): class ElementwiseJIT(parallel.ElementwiseBase): def __init__(self, func, backend=None): backend = array.get_backend(backend) - self.tp = Transpiler(backend=backend) + self._config = get_config() + self.tp = Transpiler(backend=backend, + count_flops=self._config.count_flops) self.backend = backend self.name = 'elwise_%s' % func.__name__ self.func = func - self._config = get_config() self.cython_gen = CythonGenerator() self.source = '# Code jitted, call the function to generate the code.' self.all_source = self.source @@ -333,6 +330,10 @@ def _massage_arg(self, x): def __call__(self, *args, **kw): c_func = self._generate_kernel(*args) c_args = [self._massage_arg(x) for x in args] + if self._config.count_flops: + flop_counter = array.zeros(args[0].length, np.int64, + backend=self.backend) + c_args.append(flop_counter.dev) if self.backend == 'cython': size = len(c_args[0]) @@ -347,6 +348,9 @@ def __call__(self, *args, **kw): c_func(*c_args, **kw) event.record() event.synchronize() + if self._config.count_flops: + flops = array.sum(flop_counter) + record_flops(self.name, flops) class ReductionJIT(parallel.ReductionBase): @@ -523,7 +527,7 @@ def __call__(self, **kwargs): c_args_dict = {k: self._massage_arg(x) for k, x in kwargs.items()} if self._get_backend_key() in self.output_func.arg_keys: output_arg_keys = self.output_func.arg_keys[ - self._get_backend_key()] + self._get_backend_key()] else: raise ValueError("No kernel arguments found for backend = %s, " "use_openmp = %s, use_double = %s" % diff --git a/compyle/parallel.py b/compyle/parallel.py index 90a5f1d..460bc17 100644 --- a/compyle/parallel.py +++ b/compyle/parallel.py @@ -13,7 +13,7 @@ import numpy as np from .config import get_config -from .profile import profile +from .profile import record_flops, profile from .cython_generator import get_parallel_range, CythonGenerator from .transpiler import Transpiler, convert_to_float_if_needed from .types import dtype_to_ctype @@ -404,11 +404,12 @@ def get_common_cache_key(obj): class ElementwiseBase(object): def __init__(self, func, backend=None): backend = array.get_backend(backend) - self.tp = Transpiler(backend=backend) + self._config = get_config() + self.tp = Transpiler(backend=backend, + count_flops=self._config.count_flops) self.backend = backend self.name = 'elwise_%s' % func.__name__ self.func = func - self._config = get_config() self.cython_gen = CythonGenerator() self.queue = None # This is the source generated for the user code. @@ -453,11 +454,17 @@ def _generate(self, declarations=None): ctx = get_context() self.queue = get_queue() name = self.func.__name__ + call_args = ', '.join(c_data[1]) + if self._config.count_flops: + call_args += ', cpy_flop_counter' expr = '{func}({args})'.format( func=name, - args=', '.join(c_data[1]) + args=call_args ) arguments = convert_to_float_if_needed(', '.join(c_data[0][1:])) + if self._config.count_flops: + arguments += ', long* cpy_flop_counter' + preamble = convert_to_float_if_needed(self.tp.get_code()) cluda_preamble = Template(text=CLUDA_PREAMBLE).render( double_support=True @@ -483,11 +490,17 @@ def _generate(self, declarations=None): from pycuda.elementwise import ElementwiseKernel from pycuda._cluda import CLUDA_PREAMBLE name = self.func.__name__ + call_args = ', '.join(c_data[1]) + if self._config.count_flops: + call_args += ', cpy_flop_counter' expr = '{func}({args})'.format( func=name, - args=', '.join(c_data[1]) + args=call_args ) arguments = convert_to_float_if_needed(', '.join(c_data[0][1:])) + if self._config.count_flops: + arguments += ', long* cpy_flop_counter' + preamble = convert_to_float_if_needed(self.tp.get_code()) cluda_preamble = Template(text=CLUDA_PREAMBLE).render( double_support=True @@ -519,6 +532,8 @@ def _add_address_space(arg): return arg args = [_add_address_space(arg) for arg in c_data[0]] + if self._config.count_flops: + args.append('GLOBAL_MEM long* cpy_flop_counter') code[:header_idx] = wrap( 'WITHIN_KERNEL void {func}({args})'.format( func=self.func.__name__, @@ -527,6 +542,14 @@ def _add_address_space(arg): width=78, subsequent_indent=' ' * 4, break_long_words=False ) self.tp.blocks[-1].code = '\n'.join(code) + if self._config.count_flops: + for idx, block in enumerate(self.tp.blocks[:-1]): + self.tp.blocks[idx].code = block.code.replace( + '${offset}', '0' + ) + self.tp.blocks[-1].code = self.tp.blocks[-1].code.replace( + '${offset}', 'i' + ) def _massage_arg(self, x): if isinstance(x, array.Array): @@ -539,6 +562,10 @@ def _massage_arg(self, x): @profile def __call__(self, *args, **kw): c_args = [self._massage_arg(x) for x in args] + if self._config.count_flops: + flop_counter = array.zeros(args[0].length, np.int64, + backend=self.backend) + c_args.append(flop_counter.dev) if self.backend == 'cython': size = len(c_args[0]) c_args.insert(0, size) @@ -552,6 +579,9 @@ def __call__(self, *args, **kw): self.c_func(*c_args, **kw) event.record() event.synchronize() + if self._config.count_flops: + flops = array.sum(flop_counter) + record_flops(self.name, flops) class Elementwise(object): diff --git a/compyle/profile.py b/compyle/profile.py index 78c079a..0d99d4a 100644 --- a/compyle/profile.py +++ b/compyle/profile.py @@ -8,6 +8,7 @@ _profile_info = defaultdict(lambda: {'calls': 0, 'time': 0}) +_flops_info = defaultdict(lambda: {'calls': 0, 'flops': 0}) def _record_profile(name, time): @@ -16,6 +17,12 @@ def _record_profile(name, time): _profile_info[name]['calls'] += 1 +def record_flops(name, flops): + global _flops_info + _flops_info[name]['flops'] += flops + _flops_info[name]['calls'] += 1 + + @contextmanager def profile_ctx(name): """ Context manager for profiling @@ -54,6 +61,21 @@ def get_profile_info(): return _profile_info +def get_flops_info(): + global _flops_info + return _flops_info + + +def reset_profile_info(): + global _profile_info + _profile_info = defaultdict(lambda: {'calls': 0, 'time': 0}) + + +def reset_flops_info(): + global _flops_info + _flops_info = defaultdict(lambda: {'calls': 0, 'flops': 0}) + + def print_profile(): global _profile_info profile_data = sorted(_profile_info.items(), key=lambda x: x[1]['time'], @@ -73,6 +95,25 @@ def print_profile(): print("Total profiled time: %g secs" % tot_time) +def print_flops_info(): + global _flops_info + flops_data = sorted(_flops_info.items(), key=lambda x: x[1]['flops'], + reverse=True) + if len(_flops_info) == 0: + print("No flops information available") + return + print("FLOPS info:") + print("{:<40} {:<10} {:<10}".format('Function', 'N calls', 'FLOPS')) + tot_flops = 0 + for kernel, data in flops_data: + print("{:<40} {:<10} {:<10}".format( + kernel, + data['calls'], + data['flops'])) + tot_flops += data['flops'] + print("Total FLOPS: %i" % tot_flops) + + def profile_kernel(kernel, name, backend=None): """For profiling raw PyCUDA/PyOpenCL kernels or cython functions """ diff --git a/compyle/tests/test_parallel.py b/compyle/tests/test_parallel.py index 1dd39a9..0e01e0b 100644 --- a/compyle/tests/test_parallel.py +++ b/compyle/tests/test_parallel.py @@ -9,6 +9,7 @@ from ..types import annotate, declare from ..parallel import Elementwise, Reduction, Scan from ..low_level import atomic_inc +from ..profile import get_flops_info, reset_flops_info from .test_jit import g MY_CONST = 42 @@ -19,6 +20,20 @@ def external(x): return x +@annotate(xi='double', return_='double') +def external_flops(xi): + res = declare('double') + res = 0 + for i in range(10): + res += external_flops2(xi) + return res + + +@annotate(xi='double', return_='double') +def external_flops2(xi): + return xi + 2 + + class ParallelUtilsBase(object): def test_elementwise_works_with_cython(self): self._check_simple_elementwise(backend='cython') @@ -44,6 +59,26 @@ def test_elementwise_works_with_global_constant_cuda(self): importorskip('pycuda') self._check_elementwise_with_constant(backend='cuda') + def test_elementwise_count_flops_opencl(self): + importorskip('pyopencl') + self._check_elementwise_count_flops(backend='opencl') + + def test_elementwise_count_flops_cuda(self): + importorskip('pycuda') + self._check_elementwise_count_flops(backend='cuda') + + def test_elementwise_count_flops_with_external_funcs_opencl(self): + importorskip('pyopencl') + self._check_elementwise_count_flops_with_external_funcs( + backend='opencl' + ) + + def test_elementwise_count_flops_with_external_funcs_cuda(self): + importorskip('pycuda') + self._check_elementwise_count_flops_with_external_funcs( + backend='cuda' + ) + def test_reduction_works_without_map_cython(self): self._check_simple_reduction(backend='cython') @@ -255,6 +290,46 @@ def set_const(i, x): x.pull() np.testing.assert_almost_equal(x.data, MY_CONST) + def _check_elementwise_count_flops(self, backend): + # Given + @annotate(i='int', x='doublep', y='doublep') + def axpb(i, x, y): + y[i] = 2 * x[i] + 3 + + x = np.zeros(100) + y = np.zeros(100) + x, y = wrap(x, y, backend=backend) + + with use_config(count_flops=True): + # When + e = Elementwise(axpb, backend=backend) + e(x, y) + + # Then + flops_info = get_flops_info() + assert flops_info['elwise_axpb']['flops'] == 200 + reset_flops_info() + + def _check_elementwise_count_flops_with_external_funcs(self, backend): + # Given + @annotate(i='int', x='doublep', y='doublep') + def axpb(i, x, y): + y[i] = 2 * external_flops(x[i]) + + x = np.zeros(100) + y = np.zeros(100) + x, y = wrap(x, y, backend=backend) + + with use_config(count_flops=True): + # When + e = Elementwise(axpb, backend=backend) + e(x, y) + + # Then + flops_info = get_flops_info() + assert flops_info['elwise_axpb']['flops'] == 2100 + reset_flops_info() + def _check_simple_reduction(self, backend): x = np.linspace(0, 1, 1000) / 1000 x = wrap(x, backend=backend) @@ -571,6 +646,46 @@ def set_const(i, x): x.pull() np.testing.assert_almost_equal(x.data, MY_CONST) + def _check_elementwise_count_flops(self, backend): + # Given + @annotate + def axpb(i, x, y): + y[i] = 2 * x[i] + 3 + + x = np.zeros(100) + y = np.zeros(100) + x, y = wrap(x, y, backend=backend) + + with use_config(count_flops=True): + # When + e = Elementwise(axpb, backend=backend) + e(x, y) + + # Then + flops_info = get_flops_info() + assert flops_info['elwise_axpb']['flops'] == 200 + reset_flops_info() + + def _check_elementwise_count_flops_with_external_funcs(self, backend): + # Given + @annotate + def axpb(i, x, y): + y[i] = 2 * external_flops(x[i]) + + x = np.zeros(100) + y = np.zeros(100) + x, y = wrap(x, y, backend=backend) + + with use_config(count_flops=True): + # When + e = Elementwise(axpb, backend=backend) + e(x, y) + + # Then + flops_info = get_flops_info() + assert flops_info['elwise_axpb']['flops'] == 2100 + reset_flops_info() + def _check_simple_reduction(self, backend): x = np.linspace(0, 1, 1000) / 1000 x = wrap(x, backend=backend) diff --git a/compyle/tests/test_translator.py b/compyle/tests/test_translator.py index 306f7b9..a3414ef 100644 --- a/compyle/tests/test_translator.py +++ b/compyle/tests/test_translator.py @@ -5,6 +5,7 @@ from ..config import get_config from ..types import annotate, declare +from ..low_level import cast, address, atomic_inc from ..translator import ( CConverter, CodeGenerationError, CStructHelper, KnownType, OpenCLConverter, CUDAConverter, py2c @@ -1214,6 +1215,142 @@ def f(s_idx, s_p, d_idx, d_p, J=0, t=0.0, l=[0,0], xx=(0, 0)): assert code.strip() == expect.strip() +def check_opencl_cuda_conversion_flops(converter_obj): + # Note that LID_0 etc. are predefined symbols when we include the CLUDA + # preamble, therefore should be known. + src = dedent(''' + def f(i, s_p, d_idx, d_p, J=0, t=0.0, l=[0,0], xx=(0, 0)): + s_p[i] = LID_0*GID_0 + ''') + + # When + known_types = {'d_p': KnownType('GLOBAL_MEM int*'), 'i': KnownType('long')} + converter = converter_obj(known_types=known_types, + count_flops=True) + code = converter.convert(src) + print(code) + + # Then + expect = dedent(''' +WITHIN_KERNEL void f(long i, GLOBAL_MEM double* s_p, long d_idx, GLOBAL_MEM + int* d_p, long J, double t, double* l, double* xx, GLOBAL_MEM long* + cpy_flop_counter) +{ + s_p[i] = (LID_0 * GID_0); + cpy_flop_counter[${offset}] += 1; +} + ''') + assert code.strip() == expect.strip() + + +def check_opencl_cuda_conversion_flops_with_return_in_if(converter_obj): + # Note that LID_0 etc. are predefined symbols when we include the CLUDA + # preamble, therefore should be known. + src = dedent(''' + def f(i, s_p, d_idx, d_p, J=0, t=0.0, l=[0,0], xx=(0, 0)): + s_p[i] = LID_0*GID_0 + 5 + if d_idx < 10: + s_p[i] += 5 * LID_0 + return d_idx * s_p[i] + s_p[i] += 5 * GID_0 + return s_p[i] + ''') + + # When + known_types = {'d_p': KnownType('GLOBAL_MEM int*'), 'i': KnownType('long')} + converter = converter_obj(known_types=known_types, + count_flops=True) + code = converter.convert(src) + print(code) + + # Then + expect = dedent(''' +WITHIN_KERNEL double f(long i, GLOBAL_MEM double* s_p, long d_idx, GLOBAL_MEM + int* d_p, long J, double t, double* l, double* xx, GLOBAL_MEM long* + cpy_flop_counter) +{ + s_p[i] = ((LID_0 * GID_0) + 5); + if ((d_idx < 10)) { + s_p[i] += (5 * LID_0); + cpy_flop_counter[${offset}] += 5; + return (d_idx * s_p[i]); + } + s_p[i] += (5 * GID_0); + cpy_flop_counter[${offset}] += 4; + return s_p[i]; +} + ''') + assert code.strip() == expect.strip() + + +def check_opencl_cuda_conversion_flops_with_blocks(converter_obj): + # Note that LID_0 etc. are predefined symbols when we include the CLUDA + # preamble, therefore should be known. + src = dedent(''' + def f(i, s_p, d_idx, d_p, J=0, t=0.0, l=[0,0], xx=(0, 0)): + s_p[i] = LID_0*GID_0 + for j in range(10): + if d_p[i] < 2 * t * J: + s_p[i] += d_idx * t * GDIM_0 + s_p[i] += LDIM_0 * 10 * 20 + ''') + + # When + known_types = {'d_p': KnownType('GLOBAL_MEM int*'), 'i': KnownType('long')} + converter = converter_obj(known_types=known_types, + count_flops=True) + code = converter.convert(src) + print(code) + + # Then + expect = dedent(''' +WITHIN_KERNEL void f(long i, GLOBAL_MEM double* s_p, long d_idx, GLOBAL_MEM + int* d_p, long J, double t, double* l, double* xx, GLOBAL_MEM long* + cpy_flop_counter) +{ + s_p[i] = (LID_0 * GID_0); + for (long j=0; j<10; j+=1) { + if ((d_p[i] < ((2 * t) * J))) { + s_p[i] += ((d_idx * t) * GDIM_0); + cpy_flop_counter[${offset}] += 3; + } + s_p[i] += ((LDIM_0 * 10) * 20); + cpy_flop_counter[${offset}] += 5; + } + cpy_flop_counter[${offset}] += 1; +} + ''') + assert code.strip() == expect.strip() + + +def check_opencl_cuda_conversion_flops_with_return(converter_obj): + # Note that LID_0 etc. are predefined symbols when we include the CLUDA + # preamble, therefore should be known. + src = dedent(''' + def f(i, s_p, d_idx, d_p, J=0, t=0.0, l=[0,0], xx=(0, 0)): + return s_p[i] * d_idx + ''') + + # When + known_types = {'d_p': KnownType('GLOBAL_MEM int*'), 'i': KnownType('long')} + converter = converter_obj(known_types=known_types, + count_flops=True) + code = converter.convert(src) + print(code) + + # Then + expect = dedent(''' +WITHIN_KERNEL double f(long i, GLOBAL_MEM double* s_p, long d_idx, GLOBAL_MEM + int* d_p, long J, double t, double* l, double* xx, GLOBAL_MEM long* + cpy_flop_counter) +{ + cpy_flop_counter[${offset}] += 1; + return (s_p[i] * d_idx); +} + ''') + assert code.strip() == expect.strip() + + def test_cuda_conversion(): check_opencl_cuda_conversion(CUDAConverter) @@ -1222,6 +1359,38 @@ def test_opencl_conversion(): check_opencl_cuda_conversion(OpenCLConverter) +def test_opencl_conversion_flops(): + check_opencl_cuda_conversion_flops(OpenCLConverter) + + +def test_cuda_conversion_flops(): + check_opencl_cuda_conversion_flops(CUDAConverter) + + +def test_opencl_conversion_flops_for(): + check_opencl_cuda_conversion_flops_with_blocks(OpenCLConverter) + + +def test_cuda_conversion_flops_for(): + check_opencl_cuda_conversion_flops_with_blocks(CUDAConverter) + + +def test_opencl_conversion_flops_return(): + check_opencl_cuda_conversion_flops_with_return(OpenCLConverter) + + +def test_cuda_conversion_flops_return(): + check_opencl_cuda_conversion_flops_with_return(CUDAConverter) + + +def test_opencl_conversion_flops_return_in_if(): + check_opencl_cuda_conversion_flops_with_return_in_if(OpenCLConverter) + + +def test_cuda_conversion_flops_return_in_if(): + check_opencl_cuda_conversion_flops_with_return_in_if(CUDAConverter) + + def test_opencl_class(): src = dedent(''' class Foo(object): diff --git a/compyle/translator.py b/compyle/translator.py index f11bac5..be390a7 100644 --- a/compyle/translator.py +++ b/compyle/translator.py @@ -26,6 +26,7 @@ CodeGenerationError, KnownType, Undefined, all_numeric ) from .utils import getsource +from . import transpiler as tp PY_VER = sys.version_info.major @@ -60,8 +61,9 @@ def detect_type(name, value): ) -def py2c(src, detect_type=detect_type, known_types=None): - converter = CConverter(detect_type=detect_type, known_types=known_types) +def py2c(src, detect_type=detect_type, known_types=None, count_flops=False): + converter = CConverter(detect_type=detect_type, known_types=known_types, + count_flops=count_flops) result = converter.convert(src) r = converter.get_declarations() + result print(r) @@ -121,7 +123,8 @@ def get_code(self): class CConverter(ast.NodeVisitor): - def __init__(self, detect_type=detect_type, known_types=None): + def __init__(self, detect_type=detect_type, known_types=None, + count_flops=False): self._declares = {} self._known = set(( 'M_E', 'M_LOG2E', 'M_LOG10E', 'M_LN2', 'M_LN10', @@ -140,11 +143,17 @@ def __init__(self, detect_type=detect_type, known_types=None): self._annotations = {} self._declarations = None self._ignore_methods = [] + self._count_flops = count_flops + self._flop_count = [0] + self._depth = 0 + self._last_for_depth = -1 + self._external_funcs = [] self._replacements = { 'True': '1', 'False': '0', 'None': 'NULL', True: '1', False: '0', None: 'NULL', } self.function_address_space = '' + self.backend = None def _body_has_return(self, body): return re.search(r'\breturn\b', body) is not None @@ -198,6 +207,9 @@ def _get_function_args(self, node): arg, type = self._get_local_arg(arg, type) call_sig.append('{type} {arg}'.format(type=type, arg=arg)) + if self._count_flops: + call_sig.append(self._get_flop_counter_arg()) + return ', '.join(call_sig) def _get_variable_declaration(self, type_str, names): @@ -236,6 +248,9 @@ def _get_local_info(self, obj): def _get_local_declarations(self): return '' + def _get_flop_counter_arg(self): + return 'long* cpy_flop_counter' + def add_known(self, names): '''Add a known name that should not be auto-declared. @@ -250,6 +265,8 @@ def convert(self, src, ignore_methods=None): self._src = src.splitlines() code = ast.parse(src) result = self.visit(code) + if self._depth: + raise ValueError("Non-zero function depth at the end of parsing") self._ignore_methods = [] return result @@ -301,6 +318,10 @@ def parse_instance(self, obj, ignore_methods=None): def parse_function(self, obj, declarations=None): src = dedent(getsource(obj)) fname = obj.__name__ + symbols, implicits, funcs, externs = tp.get_external_symbols_and_calls( + obj, self.backend + ) + self._external_funcs = [f.__name__ for f in funcs] self._declarations = declarations self._annotations[fname] = getattr(obj, '__annotations__', {}) self._local_decl = self._get_local_info(obj) @@ -308,12 +329,35 @@ def parse_function(self, obj, declarations=None): self._local_decl = None self._annotations = {} self._declarations = None + self._external_funcs = [] return code def render_atomic(self, func, arg): raise NotImplementedError( "Atomics only supported by CUDA/OpenCL backends") + def _get_flop_increment(self): + return self._indent_block( + 'cpy_flop_counter[${offset}] += %i;' % + self._flop_count[self._depth] + ) + + def _increment_depth(self): + self._depth += 1 + if len(self._flop_count) <= self._depth: + self._flop_count.append(0) + + def _get_flop_increment_cumulative(self, final_depth): + if not self._count_flops: + return '' + current_depth = self._depth + total_increment = 0 + while current_depth >= final_depth: + total_increment += self._flop_count[current_depth] + current_depth -= 1 + return 'cpy_flop_counter[${offset}] += %i;' % \ + total_increment + '\n' + def visit_LShift(self, node): return '<<' @@ -358,10 +402,12 @@ def visit_Attribute(self, node): return '%s->%s' % (self.visit(node.value), node.attr) def visit_AugAssign(self, node): + self._flop_count[self._depth] += 1 return '%s %s= %s;' % (self.visit(node.target), self.visit(node.op), self.visit(node.value)) def visit_BinOp(self, node): + self._flop_count[self._depth] += 1 if isinstance(node.op, ast.Pow): return 'pow(%s, %s)' % ( self.visit(node.left), self.visit(node.right) @@ -372,11 +418,14 @@ def visit_BinOp(self, node): return '(%s %s %s)' % result def visit_BoolOp(self, node): + self._flop_count[self._depth] += 1 op = ' %s ' % self.visit(node.op) return '(%s)' % (op.join(self.visit(x) for x in node.values)) def visit_Break(self, node): - return 'break;' + flop_increments = self._get_flop_increment_cumulative( + self._last_for_depth) + return flop_increments + 'break;' def visit_Call(self, node): if isinstance(node.func, ast.Name): @@ -387,9 +436,12 @@ def visit_Call(self, node): elif node.func.id == 'cast': return '(%s) (%s)' % (node.args[1].s, self.visit(node.args[0])) else: + args = ', '.join(self.visit(x) for x in node.args) + if self._count_flops and node.func.id in self._external_funcs: + args = args + ', cpy_flop_counter + ${offset}' return '{func}({args})'.format( func=node.func.id, - args=', '.join(self.visit(x) for x in node.args) + args=args ) elif isinstance(node.func, ast.Attribute): if node.func.value.id in self._known_types: @@ -426,7 +478,9 @@ def visit_Compare(self, node): self.visit(node.comparators[0])) def visit_Continue(self, node): - return 'continue;' + flop_increments = self._get_flop_increment_cumulative( + self._last_for_depth) + return flop_increments + 'continue;' def visit_Div(self, node): return '/' @@ -462,6 +516,9 @@ def visit_For(self, node): positive_step = True int_step = True int_stop = True + self._increment_depth() + prev_last_for_depth = self._last_for_depth + self._last_for_depth = self._depth if len(args) == 1: start, stop, incr = 0, self.visit(args[0]), 1 int_stop = simple = self._check_if_integer(stop) @@ -488,19 +545,28 @@ def visit_For(self, node): target_type = '' target = self.visit(node.target) + if simple: + block = '\n'.join(self._indent_block(self.visit(x)) + for x in node.body) + if self._count_flops: + block += '\n' + self._get_flop_increment() + comparator = '<' if positive_step else '>' r = ('for ({type}{i}={start}; {i}{comp}{stop}; {i}+={incr})' ' {{\n{block}\n}}\n').format( i=target, type=target_type, start=start, stop=stop, incr=incr, comp=comparator, - block='\n'.join( - self._indent_block(self.visit(x)) for x in node.body - ) + block=block ) else: count = self._for_count self._for_count += 1 + block = '\n'.join(self._indent_block(self.visit(x)) + for x in node.body) + if self._count_flops: + block += '\n' + self._get_flop_increment() + r = '' if not int_stop: stop_var = '__cpy_stop_{count}'.format(count=count) @@ -514,9 +580,6 @@ def visit_For(self, node): stop = stop_var if int_step: comparator = '<' if positive_step else '>' - block = '\n'.join( - self._indent_block(self.visit(x)) for x in node.body - ) r += ('for ({type}{i}={start}; {i}{comp}{stop}; {i}+={incr})' ' {{\n{block}\n}}\n').format( i=target, type=target_type, @@ -533,9 +596,6 @@ def visit_For(self, node): type=type, step_var=step_var, incr=incr ) incr = step_var - block = '\n'.join( - self._indent_block(self.visit(x)) for x in node.body - ) r += dedent('''\ if ({incr} < 0) {{ for ({type}{i}={start}; {i}>{stop}; {i}+={incr}) {{ @@ -560,6 +620,9 @@ def visit_For(self, node): if local_scope: self._known.remove(node.target.id) + self._flop_count[self._depth] = 0 + self._depth -= 1 + self._last_for_depth = prev_last_for_depth return r def visit_FunctionDef(self, node): @@ -568,6 +631,8 @@ def visit_FunctionDef(self, node): assert node.args.kwarg is None, \ "Functions with kwargs not supported in line %d." % node.lineno + self._depth = 0 + self._flop_count = [0] if self._class_name and (node.name.startswith(('_', 'py_')) or node.name in self._ignore_methods): return '' @@ -581,8 +646,13 @@ def visit_FunctionDef(self, node): self._known.update(x.arg for x in node.args.args) args = self._get_function_args(node) - body = '\n'.join(self._indent_block(self.visit(item)) - for item in self._remove_docstring(node.body)) + bodylines = [self._indent_block(self.visit(item)) + for item in self._remove_docstring(node.body)] + + if self._count_flops and not isinstance(node.body[-1], ast.Return): + bodylines.append(self._get_flop_increment()) + + body = '\n'.join(bodylines) local_decl = self._get_local_declarations() if len(self._class_name) > 0: func_name = self._class_name + '_' + node.name @@ -602,6 +672,7 @@ def visit_FunctionDef(self, node): )) self._known = orig_known self._declares = orig_declares + self._flop_count = [0] return sig + '\n{\n' + local_decl + declares + body + '\n}\n' def visit_Gt(self, node): @@ -611,18 +682,31 @@ def visit_GtE(self, node): return '>=' def visit_If(self, node): - code = 'if ({cond}) {{\n{block}\n}}\n'.format( - cond=self.visit(node.test), - block='\n'.join( + cond = self.visit(node.test) + self._increment_depth() + block = '\n'.join( self._indent_block(self.visit(x)) for x in node.body - ) ) + if self._count_flops and not isinstance( + node.body[-1], (ast.Continue, ast.Break, ast.Return)): + block += '\n' + self._get_flop_increment() + code = 'if ({cond}) {{\n{block}\n}}\n'.format( + cond=cond, + block=block + ) + self._flop_count[self._depth] = 0 if node.orelse: + block = '\n'.join( + self._indent_block(self.visit(x)) for x in node.orelse + ) + if self._count_flops and not isinstance( + node.orelse[-1], (ast.Continue, ast.Break, ast.Return)): + block += '\n' + self._get_flop_increment() code += 'else {{\n{block}\n}}\n'.format( - block='\n'.join( - self._indent_block(self.visit(x)) for x in node.orelse - ) + block=block ) + self._flop_count[self._depth] = 0 + self._depth -= 1 return code def visit_IfExp(self, node): @@ -689,9 +773,12 @@ def visit_Pass(self, node): def visit_Return(self, node): if node.value: - return 'return %s;' % (self.visit(node.value)) + ret_value = self.visit(node.value) + flop_increments = self._get_flop_increment_cumulative(0) + if node.value: + return flop_increments + 'return %s;' % ret_value else: - return 'return;' + return flop_increments + 'return;' def visit_Sub(self, node): return '-' @@ -716,14 +803,20 @@ def visit_USub(self, node): return '-' def visit_While(self, node): + self._increment_depth() if node.orelse: self.error('Does not support while/else clauses.', node.orelse[0]) - return 'while ({cond}) {{\n{block}\n}}\n'.format( - cond=self.visit(node.test), - block='\n'.join( - self._indent_block(self.visit(x)) for x in node.body - ) + block = '\n'.join( + self._indent_block(self.visit(x)) for x in node.body ) + if self._count_flops: + block += '\n' + self._get_flop_increment() + code = 'while ({cond}) {{\n{block}\n}}\n'.format( + cond=self.visit(node.test), block=block + ) + self._flop_count[self._depth] = 0 + self._depth -= 1 + return code def ocl_detect_pointer_base_type(name, value): @@ -761,8 +854,10 @@ def ocl_detect_type(name, value): class OpenCLConverter(CConverter): - def __init__(self, detect_type=ocl_detect_type, known_types=None): - super(OpenCLConverter, self).__init__(detect_type, known_types) + def __init__(self, detect_type=ocl_detect_type, known_types=None, + count_flops=False): + super(OpenCLConverter, self).__init__(detect_type, known_types, + count_flops=count_flops) self.function_address_space = 'WITHIN_KERNEL ' self._known.update(( 'LID_0', 'LID_1', 'LID_2', @@ -770,10 +865,14 @@ def __init__(self, detect_type=ocl_detect_type, known_types=None): 'LDIM_0', 'LDIM_1', 'LDIM_2', 'GDIM_0', 'GDIM_1', 'GDIM_2' )) + self.backend = 'opencl' def _get_self_type(self): return KnownType('GLOBAL_MEM %s*' % self._class_name) + def _get_flop_counter_arg(self): + return 'GLOBAL_MEM long* cpy_flop_counter' + def render_atomic(self, func, arg): if func == 'atomic_inc': return 'atomic_inc(&%s)' % self.visit(arg) @@ -782,9 +881,12 @@ def render_atomic(self, func, arg): class CUDAConverter(OpenCLConverter): - def __init__(self, detect_type=ocl_detect_type, known_types=None): - super(CUDAConverter, self).__init__(detect_type, known_types) + def __init__(self, detect_type=ocl_detect_type, known_types=None, + count_flops=False): + super(CUDAConverter, self).__init__(detect_type, known_types, + count_flops=count_flops) self._local_decl = None + self.backend = 'cuda' def _get_local_arg(self, arg, type): return 'size_%s' % arg, 'int' diff --git a/compyle/transpiler.py b/compyle/transpiler.py index 46663e6..8d5581d 100644 --- a/compyle/transpiler.py +++ b/compyle/transpiler.py @@ -124,7 +124,7 @@ def __eq__(self, other): class Transpiler(object): - def __init__(self, backend='cython', incl_cluda=True): + def __init__(self, backend='cython', incl_cluda=True, count_flops=False): """Constructor. Parameters @@ -159,7 +159,7 @@ def __init__(self, backend='cython', incl_cluda=True): elif backend == 'opencl': from pyopencl._cluda import CLUDA_PREAMBLE - self._cgen = OpenCLConverter() + self._cgen = OpenCLConverter(count_flops=count_flops) cluda = '' if incl_cluda: cluda = Template(text=CLUDA_PREAMBLE).render( @@ -177,7 +177,7 @@ def __init__(self, backend='cython', incl_cluda=True): ''') elif backend == 'cuda': from pycuda._cluda import CLUDA_PREAMBLE - self._cgen = CUDAConverter() + self._cgen = CUDAConverter(count_flops=count_flops) cluda = '' if incl_cluda: cluda = Template(text=CLUDA_PREAMBLE).render( diff --git a/compyle/utils.py b/compyle/utils.py index 41141ac..90016e9 100644 --- a/compyle/utils.py +++ b/compyle/utils.py @@ -2,7 +2,7 @@ import argparse import atexit from compyle.config import get_config -from compyle.profile import print_profile +from compyle.profile import print_profile, print_flops_info def getsourcelines(obj): @@ -61,7 +61,14 @@ def __init__(self, *args, **kwargs): dest='profile', default=False, help='Print profiling info' ) + self.add_argument( + '--count-flops', action='store_true', + dest='count_flops', + default=False, help='Print flops info' + ) + self.profile_registered = False + self.flops_registered = False def _set_config_options(self, options): get_config().use_openmp = options.openmp @@ -75,6 +82,10 @@ def _set_config_options(self, options): get_config().profile = True atexit.register(print_profile) self.profile_registered = True + if options.count_flops and not self.flops_registered: + get_config().count_flops = True + atexit.register(print_flops_info) + self.flops_registered = True def parse_args(self, *args, **kwargs): options = super().parse_args(*args, **kwargs)