From 2d4a7d3a566fc082a55cdea4ee075cec3092edb4 Mon Sep 17 00:00:00 2001
From: Rohan Yadav
Date: Fri, 8 Jan 2021 18:39:51 -0800
Subject: [PATCH] *: add support for windowing of tensors

This commit adds support for windowing of tensors in the existing index
notation DSL. For example:
```
A(i, j) = B(i(1, 4), j) * C(i, j(5, 10))
```
causes `B` to be windowed along its first mode, and `C` to be windowed
along its second mode. Any mix of windowed and non-windowed modes is
supported, along with windowing the same tensor in different ways within
the same expression. The windowing expressions correspond to numpy's `:`
operator for slicing dimensions. Currently, only windowing by integer
bounds is supported.

Windowing is achieved by tying windowing information to particular
`Iterator` objects, since an `Iterator` is created for each
`Tensor`-`IndexVar` pair. When iterating over an `Iterator` that may be
windowed, extra steps are taken to either generate an index into the
windowed space, or to recover an index from a point in the windowed
space.
---
 include/taco/index_notation/index_notation.h |  84 +++-
 .../index_notation/index_notation_nodes.h    |  21 +-
 include/taco/ir/ir.h                         |   3 +-
 include/taco/lower/iterator.h                |  15 +
 include/taco/lower/lowerer_impl.h            |  23 +-
 include/taco/tensor.h                        |  79 ++++
 src/codegen/codegen_c.cpp                    |   8 +-
 src/codegen/codegen_cuda.cpp                 |   9 +-
 src/error/error_checks.cpp                   |   8 +
 src/index_notation/index_notation.cpp        |  94 ++++-
 .../index_notation_rewriter.cpp              |   2 +-
 src/ir/ir.cpp                                |   3 +-
 src/lower/iterator.cpp                       |  39 +-
 src/lower/lowerer_impl.cpp                   | 137 ++++++-
 src/tensor.cpp                               |  33 ++
 test/test.cpp                                |  16 +
 test/test.h                                  |   5 +
 test/tests-windowing.cpp                     | 386 ++++++++++++++++++
 18 files changed, 930 insertions(+), 35 deletions(-)
 create mode 100644 test/tests-windowing.cpp
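As a usage sketch (not part of the patch itself): with this change, a window is expressed by calling an `IndexVar` with integer bounds. A minimal end-to-end example, assuming the standard taco headers; tensor names, formats, and sizes here are illustrative:
```
#include "taco/tensor.h"
using namespace taco;

int main() {
  Tensor<double> A("A", {3, 3}, {Dense, Dense});
  Tensor<double> B("B", {10, 10}, {Dense, Sparse});
  Tensor<double> C("C", {10, 10}, {Dense, Sparse});
  IndexVar i("i"), j("j");
  // i(1, 4) windows B's first mode to rows [1, 4), like B[1:4, 5:8] in numpy.
  A(i, j) = B(i(1, 4), j(5, 8)) * C(i(2, 5), j(4, 7));
  A.evaluate();
}
```
Each window's extent (here 3) must match the dimension the index variable ranges over, which is what the `error_checks.cpp` change below enforces.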
diff --git a/include/taco/index_notation/index_notation.h b/include/taco/index_notation/index_notation.h
index cacd1411c..0646398b0 100644
--- a/include/taco/index_notation/index_notation.h
+++ b/include/taco/index_notation/index_notation.h
@@ -1,6 +1,7 @@
 #ifndef TACO_INDEX_NOTATION_H
 #define TACO_INDEX_NOTATION_H
 
+#include <memory>
 #include <ostream>
 #include <string>
 #include <utility>
@@ -30,6 +31,7 @@ class Format;
 class Schedule;
 
 class IndexVar;
+class WindowedIndexVar;
 class TensorVar;
 
 class IndexExpr;
@@ -37,6 +39,7 @@ class Assignment;
 class Access;
 
 struct AccessNode;
+struct AccessWindow;
 struct LiteralNode;
 struct NegNode;
 struct SqrtNode;
@@ -220,7 +223,8 @@ class Access : public IndexExpr {
   Access() = default;
   Access(const Access&) = default;
   Access(const AccessNode*);
-  Access(const TensorVar& tensorVar, const std::vector<IndexVar>& indices={});
+  Access(const TensorVar &tensorVar, const std::vector<IndexVar> &indices = {},
+         const std::map<int, AccessWindow> &windows = {});
 
   /// Return the Access expression's TensorVar.
   const TensorVar &getTensorVar() const;
@@ -228,6 +232,16 @@ class Access : public IndexExpr {
   /// Returns the index variables used to index into the Access's TensorVar.
   const std::vector<IndexVar>& getIndexVars() const;
 
+  /// hasWindowedModes returns true if any accessed modes are windowed.
+  bool hasWindowedModes() const;
+
+  /// Returns whether or not the input mode (0-indexed) is windowed.
+  bool isModeWindowed(int mode) const;
+
+  /// Return the {lower,upper} bound of the window on the input mode (0-indexed).
+  int getWindowLowerBound(int mode) const;
+  int getWindowUpperBound(int mode) const;
+
   /// Assign the result of an expression to a left-hand-side tensor access.
   /// ```
   /// a(i) = b(i) * c(i);
@@ -800,11 +814,67 @@ class Multi : public IndexStmt {
 /// Create a multi index statement.
 Multi multi(IndexStmt stmt1, IndexStmt stmt2);
 
+/// IndexVarInterface is a marker superclass for IndexVar-like objects.
+/// It is intended to be used in situations where many IndexVar-like objects
+/// must be stored together, like when building an Access AST node where some
+/// of the access variables are windowed. Use cases for IndexVarInterface
+/// will inspect the underlying type of the IndexVarInterface. For the sake
+/// of completeness, the current implementers of IndexVarInterface are:
+/// * IndexVar
+/// * WindowedIndexVar
+/// If this set changes, make sure to update the match function.
+class IndexVarInterface {
+public:
+  virtual ~IndexVarInterface() = default;
+
+  /// match performs a dynamic case analysis of the implementers of
+  /// IndexVarInterface as a utility for handling the different values
+  /// within. It mimics the dynamic type assertion of Go.
+  static void match(
+      std::shared_ptr<IndexVarInterface> ptr,
+      std::function<void(std::shared_ptr<IndexVar>)> ivarFunc,
+      std::function<void(std::shared_ptr<WindowedIndexVar>)> wvarFunc
+  ) {
+    auto iptr = std::dynamic_pointer_cast<IndexVar>(ptr);
+    auto wptr = std::dynamic_pointer_cast<WindowedIndexVar>(ptr);
+    if (iptr != nullptr) {
+      ivarFunc(iptr);
+    } else if (wptr != nullptr) {
+      wvarFunc(wptr);
+    } else {
+      taco_iassert(false) << "IndexVarInterface was not IndexVar or WindowedIndexVar";
+    }
+  }
+};
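A hypothetical caller of `match` dispatches on the concrete type without doing any casting of its own; this mirrors how `AccessTensorNode` in `src/tensor.cpp` (later in this patch) consumes a mixed list of access variables:
```
// Sketch: separate plain and windowed variables out of a mixed list
// (assumes the taco namespace; the variable names are illustrative).
std::vector<std::shared_ptr<IndexVarInterface>> vars;
IndexVar i("i");
vars.push_back(std::make_shared<IndexVar>(i));
vars.push_back(std::make_shared<WindowedIndexVar>(i(1, 4)));
for (auto& var : vars) {
  IndexVarInterface::match(var, [&](std::shared_ptr<IndexVar> ivar) {
    // Plain variable: this mode is not windowed.
  }, [&](std::shared_ptr<WindowedIndexVar> wvar) {
    // Windowed variable: the mode covers
    // [wvar->getLowerBound(), wvar->getUpperBound()).
  });
}
```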
+
+/// WindowedIndexVar represents an IndexVar that has been windowed. For example,
+///   A(i) = B(i(2, 4))
+/// In this case, i(2, 4) is a WindowedIndexVar. WindowedIndexVar is defined
+/// before IndexVar so that IndexVar can return objects of type WindowedIndexVar.
+class WindowedIndexVar : public util::Comparable<WindowedIndexVar>, public IndexVarInterface {
+public:
+  WindowedIndexVar(IndexVar base, int lo = -1, int hi = -1);
+  ~WindowedIndexVar() = default;
+
+  /// getIndexVar returns the underlying IndexVar.
+  IndexVar getIndexVar() const;
+
+  /// get{Lower,Upper}Bound returns the {lower,upper} bound of the window of
+  /// this index variable.
+  int getLowerBound() const;
+  int getUpperBound() const;
+
+private:
+  struct Content;
+  std::shared_ptr<Content> content;
+};
+
 /// Index variables are used to index into tensors in index expressions, and
 /// they represent iteration over the tensor modes they index into.
-class IndexVar : public util::Comparable<IndexVar> {
+class IndexVar : public util::Comparable<IndexVar>, public IndexVarInterface {
 public:
   IndexVar();
+  ~IndexVar() = default;
   IndexVar(const std::string& name);
 
   /// Returns the name of the index variable.
@@ -813,6 +883,8 @@ class IndexVar : public util::Comparable<IndexVar> {
   friend bool operator==(const IndexVar&, const IndexVar&);
   friend bool operator<(const IndexVar&, const IndexVar&);
 
+  /// Indexing into an IndexVar returns a window into it.
+  WindowedIndexVar operator()(int lo, int hi);
 
 private:
   struct Content;
@@ -823,7 +895,15 @@ struct IndexVar::Content {
   std::string name;
 };
 
+struct WindowedIndexVar::Content {
+  IndexVar base;
+  int lo;
+  int hi;
+};
+
+std::ostream& operator<<(std::ostream&, const std::shared_ptr<IndexVarInterface>&);
 std::ostream& operator<<(std::ostream&, const IndexVar&);
+std::ostream& operator<<(std::ostream&, const WindowedIndexVar&);
 
 /// A suchthat statement provides a set of IndexVarRel that constrain
 /// the iteration space for the child concrete index notation
diff --git a/include/taco/index_notation/index_notation_nodes.h b/include/taco/index_notation/index_notation_nodes.h
index 95439cd6b..d18ee4a3d 100644
--- a/include/taco/index_notation/index_notation_nodes.h
+++ b/include/taco/index_notation/index_notation_nodes.h
@@ -13,10 +13,21 @@
 
 namespace taco {
 
+// An AccessNode carries the windowing information for an IndexVar + TensorVar
+// combination. An AccessWindow contains the lower and upper bounds of each
+// windowed mode (0-indexed). AccessWindow is extracted from AccessNode so that
+// it can be referenced externally.
+struct AccessWindow {
+  int lo;
+  int hi;
+  friend bool operator==(const AccessWindow& a, const AccessWindow& b) {
+    return a.lo == b.lo && a.hi == b.hi;
+  }
+};
 
 struct AccessNode : public IndexExprNode {
-  AccessNode(TensorVar tensorVar, const std::vector<IndexVar>& indices)
-    : IndexExprNode(tensorVar.getType().getDataType()), tensorVar(tensorVar), indexVars(indices) {}
+  AccessNode(TensorVar tensorVar, const std::vector<IndexVar>& indices,
+             const std::map<int, AccessWindow>& windows={})
+    : IndexExprNode(tensorVar.getType().getDataType()), tensorVar(tensorVar),
+      indexVars(indices), windowedModes(windows) {}
 
   void accept(IndexExprVisitorStrict* v) const {
     v->visit(this);
@@ -26,6 +37,12 @@ struct AccessNode : public IndexExprNode {
 
   TensorVar tensorVar;
   std::vector<IndexVar> indexVars;
+  std::map<int, AccessWindow> windowedModes;
+
+protected:
+  /// Initialize an AccessNode with just a TensorVar. If this constructor is used,
+  /// then indexVars must be set afterwards.
+  explicit AccessNode(TensorVar tensorVar)
+    : IndexExprNode(tensorVar.getType().getDataType()), tensorVar(tensorVar) {}
 };
 
 struct LiteralNode : public IndexExprNode {
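To make the bookkeeping concrete, here is (illustratively) what the new fields hold for the access `B(i(1, 4), j)` from the commit message; only the windowed mode appears in the map, keyed by its 0-indexed mode number:
```
std::map<int, AccessWindow> windows;
windows[0] = AccessWindow{1, 4};  // mode 0 is windowed to [1, 4); mode 1 is absent
// B(i(1, 4), j) then builds an AccessNode with indexVars == {i, j} and
// windowedModes == windows.
```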
diff --git a/include/taco/ir/ir.h b/include/taco/ir/ir.h
index 0fd9a45b4..f852f26b1 100644
--- a/include/taco/ir/ir.h
+++ b/include/taco/ir/ir.h
@@ -688,9 +688,10 @@ struct Allocate : public StmtNode<Allocate> {
   Expr num_elements;
   Expr old_elements; // used for realloc in CUDA
   bool is_realloc;
+  bool clear; // Whether to use calloc to allocate this memory.
 
   static Stmt make(Expr var, Expr num_elements, bool is_realloc=false,
-                   Expr old_elements=Expr());
+                   Expr old_elements=Expr(), bool clear=false);
 
   static const IRNodeType _type_info = IRNodeType::Allocate;
 };
diff --git a/include/taco/lower/iterator.h b/include/taco/lower/iterator.h
index 1d871ffaa..0fe765653 100644
--- a/include/taco/lower/iterator.h
+++ b/include/taco/lower/iterator.h
@@ -159,6 +159,17 @@ class Iterator : public util::Comparable<Iterator> {
   /// Returns true if the iterator is defined, false otherwise.
   bool defined() const;
 
+  /// Methods for querying and operating on windowed tensor modes.
+
+  /// isWindowed returns true if this iterator is operating over a window
+  /// of a tensor mode.
+  bool isWindowed() const;
+
+  /// getWindow{Lower,Upper}Bound return the {lower,upper} bound of the
+  /// window that this iterator operates over.
+  ir::Expr getWindowLowerBound() const;
+  ir::Expr getWindowUpperBound() const;
+
   friend bool operator==(const Iterator&, const Iterator&);
   friend bool operator<(const Iterator&, const Iterator&);
   friend std::ostream& operator<<(std::ostream&, const Iterator&);
@@ -169,6 +180,10 @@ class Iterator : public util::Comparable<Iterator> {
   Iterator(std::shared_ptr<Content> content);
 
   void setChild(const Iterator& iterator) const;
+
+  friend class Iterators;
+  /// setWindowBounds sets the window bounds of this iterator.
+  void setWindowBounds(ir::Expr lo, ir::Expr hi);
 };
 
 /**
diff --git a/include/taco/lower/lowerer_impl.h b/include/taco/lower/lowerer_impl.h
index 39675d450..afc936145 100644
--- a/include/taco/lower/lowerer_impl.h
+++ b/include/taco/lower/lowerer_impl.h
@@ -375,9 +375,30 @@ class LowererImpl : public util::Uncopyable {
   /// Create an expression to index into a tensor value array.
   ir::Expr generateValueLocExpr(Access access) const;
 
-  /// Expression that evaluates to true if none of the iteratators are exhausted
+  /// Expression that evaluates to true if none of the iterators are exhausted
   ir::Expr checkThatNoneAreExhausted(std::vector<Iterator> iterators);
 
+  /// Expression that returns the beginning of a window to iterate over
+  /// in a compressed iterator. It is used when operating over windows of
+  /// tensors, instead of the full tensor.
+  ir::Expr searchForStartOfWindowPosition(Iterator iterator, ir::Expr start, ir::Expr end);
+
+  /// Statement that guards against going out of bounds of the window that
+  /// the input iterator was configured with.
+  ir::Stmt upperBoundGuardForWindowPosition(Iterator iterator, ir::Expr access);
+
+  /// Expression that recovers a canonical index variable from a position in
+  /// a windowed position iterator. A windowed position iterator iterates over
+  /// values in the range [lo, hi). This expression projects values in that
+  /// range back into the canonical range of [0, n).
+  ir::Expr projectWindowedPositionToCanonicalSpace(Iterator iterator, ir::Expr expr);
+
+  /// projectCanonicalSpaceToWindowedPosition is the opposite of
+  /// projectWindowedPositionToCanonicalSpace. It takes an expression ranging
+  /// over the canonical space of [0, n) and projects it up to the windowed
+  /// range of [lo, hi).
+  ir::Expr projectCanonicalSpaceToWindowedPosition(Iterator iterator, ir::Expr expr);
+
 private:
   bool assemble;
   bool compute;
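The two projections are inverse shifts by the window's lower bound. A standalone sketch over plain ints (the real helpers build `ir::Expr` nodes instead of computing values):
```
// For a window [lo, hi) on a mode of dimension n:
int projectWindowedToCanonical(int lo, int windowed) {
  return windowed - lo;   // maps [lo, hi) onto [0, hi - lo)
}
int projectCanonicalToWindowed(int lo, int canonical) {
  return canonical + lo;  // maps [0, hi - lo) onto [lo, hi)
}
// e.g. for i(2, 6): stored coordinate 4 corresponds to canonical i == 2,
// and canonical i == 2 locates stored coordinate 4.
```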
diff --git a/include/taco/tensor.h b/include/taco/tensor.h
index 313758252..25186c815 100644
--- a/include/taco/tensor.h
+++ b/include/taco/tensor.h
@@ -386,6 +386,9 @@ class TensorBase {
   /// Create an index expression that accesses (reads or writes) this tensor.
   Access operator()(const std::vector<IndexVar>& indices);
 
+  /// Create a possibly windowed index expression that accesses (reads or writes) this tensor.
+  Access operator()(const std::vector<std::shared_ptr<IndexVarInterface>>& indices);
+
   /// Create an index expression that accesses (reads) this (scalar) tensor.
   Access operator()();
 
@@ -621,6 +624,20 @@ class Tensor : public TensorBase {
   template <typename... IndexVars>
   Access operator()(const IndexVars&... indices);
 
+  /// The below two Access methods are used to allow users to access tensors
+  /// with a mix of IndexVars and WindowedIndexVars. This allows natural
+  /// expressions like
+  ///   A(i, j(1, 3)) = B(i(2, 4), j) * C(i(5, 7), j(7, 9))
+  /// to be constructed without adjusting the original API.
+
+  /// Create an index expression that accesses (reads, writes) this tensor.
+  template <typename... IndexVars>
+  Access operator()(const WindowedIndexVar& first, const IndexVars&... indices);
+
+  /// Create an index expression that accesses (reads, writes) this tensor.
+  template <typename... IndexVars>
+  Access operator()(const IndexVar& first, const IndexVars&... indices);
+
   ScalarAccess<CType> operator()(const std::vector<int>& indices);
 
   /// Create an index expression that accesses (reads) this tensor.
@@ -629,6 +646,15 @@ class Tensor : public TensorBase {
 
   /// Assign an expression to a scalar tensor.
   void operator=(const IndexExpr& expr);
+
+private:
+  /// The _access method family is the template level implementation of
+  /// Access() expressions containing mixes of IndexVar and WindowedIndexVar objects.
+  template <typename First, typename... Rest>
+  std::vector<std::shared_ptr<IndexVarInterface>> _access(const First& first, const Rest&... rest);
+  std::vector<std::shared_ptr<IndexVarInterface>> _access();
+  template <typename... Args>
+  Access _access_wrapper(const Args&... args);
 };
 
 template <typename CType>
@@ -1084,6 +1110,59 @@ Access Tensor<CType>::operator()(const IndexVars&... indices) {
   return TensorBase::operator()(std::vector<IndexVar>{indices...});
 }
 
+/// The _access() methods perform primitive recursion on the input variadic template.
+/// This means that each instance of the _access method matches on the first element
+/// of the variadic template parameter pack, performs an "action", then recurses
+/// with the remaining elements in the parameter pack through a recursive call
+/// to _access. Since this is recursion, we need a base case. The empty-argument
+/// instance of _access returns an empty value of the desired type, in this case
+/// a vector of IndexVarInterface.
+template <typename CType>
+std::vector<std::shared_ptr<IndexVarInterface>> Tensor<CType>::_access() {
+  return std::vector<std::shared_ptr<IndexVarInterface>>{};
+}
+
+/// The recursive case of _access matches on the first element, and attempts to
+/// create a shared_ptr out of it. It then makes a recursive call to get a
+/// vector with the rest of the elements. Then, it pushes the first element onto
+/// the back of the vector -- this check ensures that the type First is indeed
+/// a member of IndexVarInterface.
+template <typename CType>
+template <typename First, typename... Rest>
+std::vector<std::shared_ptr<IndexVarInterface>> Tensor<CType>::_access(const First& first, const Rest&... rest) {
+  auto var = std::make_shared<First>(first);
+  auto ret = _access(rest...);
+  ret.push_back(var);
+  return ret;
+}
+
+/// _access_wrapper just calls into _access and reverses the result to get the
+/// initial order of the arguments.
+template <typename CType>
+template <typename... Args>
+Access Tensor<CType>::_access_wrapper(const Args&... args) {
+  auto resultReversed = this->_access(args...);
+  std::vector<std::shared_ptr<IndexVarInterface>> result;
+  result.reserve(resultReversed.size());
+  for (auto& it : util::reverse(resultReversed)) {
+    result.push_back(it);
+  }
+  return TensorBase::operator()(result);
+}
+
+/// We have to case on whether the first argument is an IndexVar or a WindowedIndexVar
+/// so that the template engine can differentiate between the two versions.
+template <typename CType>
+template <typename... IndexVars>
+Access Tensor<CType>::operator()(const IndexVar& first, const IndexVars&... indices) {
+  return this->_access_wrapper(first, indices...);
+}
+
+template <typename CType>
+template <typename... IndexVars>
+Access Tensor<CType>::operator()(const WindowedIndexVar& first, const IndexVars&... indices) {
+  return this->_access_wrapper(first, indices...);
+}
+
 template <typename CType>
 ScalarAccess<CType> Tensor<CType>::operator()(const std::vector<int>& indices) {
   taco_uassert(indices.size() == (size_t)getOrder())
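To see how the recursion unrolls, consider `a(i, j(1, 3))`; the trace below is an illustrative sketch, not patch code:
```
Tensor<double> a("a", {10, 10}, {Dense, Dense});
IndexVar i("i"), j("j");
auto acc = a(i, j(1, 3));
// Unrolls as:
//   _access(i, j(1,3)) -> make_shared<IndexVar>(i), recurse on (j(1,3))
//   _access(j(1,3))    -> make_shared<WindowedIndexVar>(j(1,3)), recurse on ()
//   _access()          -> {}
// push_back on the way out yields {j(1,3), i}; _access_wrapper reverses it
// to {i, j(1,3)} and calls TensorBase::operator().
```
Because `_access` stores `std::make_shared<First>(first)` into a vector of `std::shared_ptr<IndexVarInterface>`, passing any argument that does not implement `IndexVarInterface` fails to compile.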
diff --git a/src/codegen/codegen_c.cpp b/src/codegen/codegen_c.cpp
index a0c6c9591..f48f34f2a 100644
--- a/src/codegen/codegen_c.cpp
+++ b/src/codegen/codegen_c.cpp
@@ -516,7 +516,13 @@ void CodeGen_C::visit(const Allocate* op) {
     stream << ", ";
   }
   else {
-    stream << "malloc(";
+    // If the allocation was requested to clear the allocated memory,
+    // use calloc instead of malloc.
+    if (op->clear) {
+      stream << "calloc(1, ";
+    } else {
+      stream << "malloc(";
+    }
   }
   stream << "sizeof(" << elementType << ")";
   stream << " * ";
diff --git a/src/codegen/codegen_cuda.cpp b/src/codegen/codegen_cuda.cpp
index d0c69ffd8..7210c6cde 100644
--- a/src/codegen/codegen_cuda.cpp
+++ b/src/codegen/codegen_cuda.cpp
@@ -1293,9 +1293,14 @@ void CodeGen_CUDA::visit(const Call* op) {
   stream << op->func << "(";
   parentPrecedence = Precedence::CALL;
 
-  // Need to print cast to type so that arguments match
+  // Need to print cast to type so that arguments match.
   if (op->args.size() > 0) {
-    if (op->type != op->args[0].type() || isa<Var>(op->args[0])) {
+    // However, the binary search functions take an int* as their first
+    // argument. This pointer information isn't carried anywhere in
+    // the argument expressions, so we need to special case and not
+    // emit an invalid cast for that argument.
+    auto opIsBinarySearch = op->func == "taco_binarySearchAfter" || op->func == "taco_binarySearchBefore";
+    if (!opIsBinarySearch && (op->type != op->args[0].type() || isa<Var>(op->args[0]))) {
       stream << "(" << printCUDAType(op->type, false) << ") ";
     }
     op->args[0].accept(this);
diff --git a/src/error/error_checks.cpp b/src/error/error_checks.cpp
index 9fc067d2d..7516ab3d2 100644
--- a/src/error/error_checks.cpp
+++ b/src/error/error_checks.cpp
@@ -53,6 +53,14 @@ std::pair<bool,std::string> dimensionsTypecheck(const std::vector<IndexVar>& resultV
     for (size_t mode = 0; mode < readNode->indexVars.size(); mode++) {
       IndexVar var = readNode->indexVars[mode];
       Dimension dimension = readNode->tensorVar.getType().getShape().getDimension(mode);
+
+      // If this access has windowed modes, use the dimensions of those windows
+      // as the shape, rather than the shape of the underlying tensor.
+      auto a = Access(readNode);
+      if (a.isModeWindowed(mode)) {
+        dimension = Dimension(a.getWindowUpperBound(mode) - a.getWindowLowerBound(mode));
+      }
+
       if (util::contains(indexVarDims,var) && indexVarDims.at(var) != dimension) {
         errors.push_back(addDimensionError(var, indexVarDims.at(var), dimension));
       } else {
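One consequence of this check, sketched below with illustrative sizes: it is the window's extent, not the underlying tensor's dimension, that must line up across an expression:
```
Tensor<double> a("a", {4}, {Dense});
Tensor<double> b("b", {100}, {Dense});
IndexVar i("i");
a(i) = b(i(10, 14));   // OK: extent 14 - 10 == 4 matches a's dimension
// a(i) = b(i(0, 5));  // rejected: extent 5 != 4
```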
diff --git a/src/index_notation/index_notation.cpp b/src/index_notation/index_notation.cpp
index f00557eeb..4b090d84b 100644
--- a/src/index_notation/index_notation.cpp
+++ b/src/index_notation/index_notation.cpp
@@ -185,7 +185,7 @@ struct Isomorphic : public IndexNotationVisitorStrict {
         return;
       }
     }
-    eq = true;
+    eq = anode->windowedModes == bnode->windowedModes;
   }
 
   void visit(const LiteralNode* anode) {
@@ -746,8 +746,8 @@ IndexExpr operator/(const IndexExpr& lhs, const IndexExpr& rhs) {
 Access::Access(const AccessNode* n) : IndexExpr(n) {
 }
 
-Access::Access(const TensorVar& tensor, const std::vector<IndexVar>& indices)
-    : Access(new AccessNode(tensor, indices)) {
+Access::Access(const TensorVar& tensor, const std::vector<IndexVar>& indices, const std::map<int, AccessWindow>& windows)
+    : Access(new AccessNode(tensor, indices, windows)) {
 }
 
 const TensorVar& Access::getTensorVar() const {
@@ -758,11 +758,45 @@ const std::vector<IndexVar>& Access::getIndexVars() const {
   return getNode(*this)->indexVars;
 }
 
+bool Access::hasWindowedModes() const {
+  return !getNode(*this)->windowedModes.empty();
+}
+
+bool Access::isModeWindowed(int mode) const {
+  auto node = getNode(*this);
+  return node->windowedModes.find(mode) != node->windowedModes.end();
+}
+
+int Access::getWindowLowerBound(int mode) const {
+  taco_iassert(this->isModeWindowed(mode));
+  return getNode(*this)->windowedModes.at(mode).lo;
+}
+
+int Access::getWindowUpperBound(int mode) const {
+  taco_iassert(this->isModeWindowed(mode));
+  return getNode(*this)->windowedModes.at(mode).hi;
+}
+
 static void check(Assignment assignment) {
-  auto tensorVar = assignment.getLhs().getTensorVar();
-  auto freeVars = assignment.getLhs().getIndexVars();
+  auto lhs = assignment.getLhs();
+  auto tensorVar = lhs.getTensorVar();
+  auto freeVars = lhs.getIndexVars();
   auto indexExpr = assignment.getRhs();
   auto shape = tensorVar.getType().getShape();
+
+  // If the LHS access has any windowed modes, use the dimensions of those
+  // windows as the shape, rather than the shape of the underlying tensor.
+  if (lhs.hasWindowedModes()) {
+    vector<Dimension> dims(shape.getOrder());
+    for (int i = 0; i < shape.getOrder(); i++) {
+      dims[i] = shape.getDimension(i);
+      if (lhs.isModeWindowed(i)) {
+        dims[i] = Dimension(lhs.getWindowUpperBound(i) - lhs.getWindowLowerBound(i));
+      }
+    }
+    shape = Shape(dims);
+  }
+
   auto typecheck = error::dimensionsTypecheck(freeVars, indexExpr, shape);
   taco_uassert(typecheck.first) << error::expr_dimension_mismatch << " " << typecheck.second;
 }
@@ -1800,6 +1834,10 @@ std::string IndexVar::getName() const {
   return content->name;
 }
 
+WindowedIndexVar IndexVar::operator()(int lo, int hi) {
+  return WindowedIndexVar(*this, lo, hi);
+}
+
 bool operator==(const IndexVar& a, const IndexVar& b) {
   return a.content == b.content;
 }
@@ -1808,10 +1846,42 @@ bool operator<(const IndexVar& a, const IndexVar& b) {
   return a.content < b.content;
 }
 
+std::ostream& operator<<(std::ostream& os, const std::shared_ptr<IndexVarInterface>& var) {
+  std::stringstream ss;
+  IndexVarInterface::match(var, [&](std::shared_ptr<IndexVar> ivar) {
+    ss << *ivar;
+  }, [&](std::shared_ptr<WindowedIndexVar> wvar) {
+    ss << *wvar;
+  });
+  return os << ss.str();
+}
+
 std::ostream& operator<<(std::ostream& os, const IndexVar& var) {
   return os << var.getName();
 }
 
+std::ostream& operator<<(std::ostream& os, const WindowedIndexVar& var) {
+  return os << var.getIndexVar();
+}
+
+WindowedIndexVar::WindowedIndexVar(IndexVar base, int lo, int hi) : content(new Content) {
+  this->content->base = base;
+  this->content->lo = lo;
+  this->content->hi = hi;
+}
+
+IndexVar WindowedIndexVar::getIndexVar() const {
+  return this->content->base;
+}
+
+int WindowedIndexVar::getLowerBound() const {
+  return this->content->lo;
+}
+
+int WindowedIndexVar::getUpperBound() const {
+  return this->content->hi;
+}
+
 // class TensorVar
 struct TensorVar::Content {
   int id;
@@ -1951,6 +2021,20 @@ static bool isValid(Assignment assignment, string* reason) {
   auto result = lhs.getTensorVar();
   auto freeVars = lhs.getIndexVars();
   auto shape = result.getType().getShape();
+
+  // If the LHS access has any windowed modes, use the dimensions of those
+  // windows as the shape, rather than the shape of the underlying tensor.
+  if (lhs.hasWindowedModes()) {
+    vector<Dimension> dims(shape.getOrder());
+    for (int i = 0; i < shape.getOrder(); i++) {
+      dims[i] = shape.getDimension(i);
+      if (lhs.isModeWindowed(i)) {
+        dims[i] = Dimension(lhs.getWindowUpperBound(i) - lhs.getWindowLowerBound(i));
+      }
+    }
+    shape = Shape(dims);
+  }
+
   auto typecheck = error::dimensionsTypecheck(freeVars, rhs, shape);
   if (!typecheck.first) {
     *reason = error::expr_dimension_mismatch + " " + typecheck.second;
diff --git a/src/index_notation/index_notation_rewriter.cpp b/src/index_notation/index_notation_rewriter.cpp
index ac7bd6945..35d111457 100644
--- a/src/index_notation/index_notation_rewriter.cpp
+++ b/src/index_notation/index_notation_rewriter.cpp
@@ -327,7 +327,7 @@ struct ReplaceIndexVars : public IndexNotationRewriter {
     }
   }
   if (modified) {
-    expr = Access(op->tensorVar, indexVars);
+    expr = Access(op->tensorVar, indexVars, op->windowedModes);
   }
   else {
     expr = op;
diff --git a/src/ir/ir.cpp b/src/ir/ir.cpp
index 6c5dd8fcb..2623b27cd 100644
--- a/src/ir/ir.cpp
+++ b/src/ir/ir.cpp
@@ -749,7 +749,7 @@ Stmt Yield::make(std::vector<Expr> coords, Expr val) {
 }
 
 // Allocate
-Stmt Allocate::make(Expr var, Expr num_elements, bool is_realloc, Expr old_elements) {
+Stmt Allocate::make(Expr var, Expr num_elements, bool is_realloc, Expr old_elements, bool clear) {
   taco_iassert(var.as<GetProperty>() || (var.as<Var>() && var.as<Var>()->is_ptr))
       << "Can only allocate memory for a pointer-typed Var";
@@ -761,6 +761,7 @@ Stmt Allocate::make(Expr var, Expr num_elements, bool is_realloc, Expr old_eleme
   alloc->is_realloc = is_realloc;
   taco_iassert(!is_realloc || old_elements.ptr != NULL);
   alloc->old_elements = old_elements;
+  alloc->clear = clear;
   return alloc;
 }
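With `clear` set, the C backend swaps `malloc` for `calloc` (see the `codegen_c.cpp` hunk above), so windowed writes that touch only part of a values array leave the rest zeroed rather than uninitialized. A sketch of the difference in the emitted allocation, with illustrative variable names rather than real generated output:
```
double* c_vals;
int32_t capacity = 256;  // illustrative
// clear == false:
c_vals = (double*)malloc(sizeof(double) * capacity);
// clear == true:
c_vals = (double*)calloc(1, sizeof(double) * capacity);
```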
diff --git a/src/lower/iterator.cpp b/src/lower/iterator.cpp
index 9f5b7dd4b..fe27d5489 100644
--- a/src/lower/iterator.cpp
+++ b/src/lower/iterator.cpp
@@ -28,6 +28,16 @@ struct Iterator::Content {
   ir::Expr segendVar;
   ir::Expr validVar;
   ir::Expr beginVar;
+
+  // Window represents a window (or slice) into a tensor mode, given by
+  // the expressions representing its lower and upper bounds. An iterator
+  // is windowed if window is not NULL.
+  struct Window {
+    ir::Expr lo;
+    ir::Expr hi;
+    Window(ir::Expr _lo, ir::Expr _hi) : lo(_lo), hi(_hi) {}
+  };
+  std::unique_ptr<Window> window;
 };
 
 Iterator::Iterator() : content(nullptr) {
@@ -323,6 +333,24 @@ bool Iterator::defined() const {
   return content != nullptr;
 }
 
+bool Iterator::isWindowed() const {
+  return this->content->window != nullptr;
+}
+
+ir::Expr Iterator::getWindowLowerBound() const {
+  taco_iassert(this->isWindowed());
+  return this->content->window->lo;
+}
+
+ir::Expr Iterator::getWindowUpperBound() const {
+  taco_iassert(this->isWindowed());
+  return this->content->window->hi;
+}
+
+void Iterator::setWindowBounds(ir::Expr lo, ir::Expr hi) {
+  this->content->window = std::make_unique<Content::Window>(Content::Window(lo, hi));
+}
+
 bool operator==(const Iterator& a, const Iterator& b) {
   if (a.isDimensionIterator() && b.isDimensionIterator()) {
     return a.getIndexVar() == b.getIndexVar();
@@ -425,7 +453,7 @@ Iterators::Iterators(IndexStmt stmt, const map<TensorVar, Expr>& tensorVars)
     })
   );
 
-  // Reverse the levelITerators map for fast modeAccess lookup
+  // Reverse the levelIterators map for fast modeAccess lookup
   for (auto& iterator : content->levelIterators) {
     content->modeAccesses.insert({iterator.second, iterator.first});
   }
@@ -472,6 +500,15 @@ Iterators::createAccessIterators(Access access, Format format, Expr tensorIR, Pr
     string name = iteratorIndexVar.getName() + tensorConcrete.getName();
     Iterator iterator(iteratorIndexVar, tensorIR, mode, parent, name, true);
+
+    // If the access that this iterator corresponds to has a window, then
+    // adjust the iterator appropriately.
+    if (access.isModeWindowed(modeNumber)) {
+      auto lo = ir::Literal::make(access.getWindowLowerBound(modeNumber));
+      auto hi = ir::Literal::make(access.getWindowUpperBound(modeNumber));
+      iterator.setWindowBounds(lo, hi);
+    }
+
     content->levelIterators.insert({{access,modeNumber+1}, iterator});
     if (iteratorIndexVar != indexVar) {
       // add to allowing lowering to find correct iterator for this pos variable
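Because the bounds hang off the `Iterator` rather than the `TensorVar`, two accesses of the same tensor get two iterators and therefore two independent windows; this is what allows "windowing the same tensor in different ways" from the commit message (exercised by `sliceMultipleWays` in the new tests). An illustrative sketch:
```
Tensor<int> a("a", {10}, {Dense});
Tensor<int> c("c", {2}, {Dense});
IndexVar i("i");
c(i) = a(i(5, 7)) + a(i(1, 3));  // two Iterators over a, windows [5,7) and [1,3)
```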
diff --git a/src/lower/lowerer_impl.cpp b/src/lower/lowerer_impl.cpp
index 717ae3884..0de713a2c 100644
--- a/src/lower/lowerer_impl.cpp
+++ b/src/lower/lowerer_impl.cpp
@@ -168,17 +168,32 @@ LowererImpl::lower(IndexStmt stmt, string name,
   vector<IndexVar> indexVars = getIndexVars(stmt);
   for (auto& indexVar : indexVars) {
     Expr dimension;
+    // getDimension extracts an Expr that holds the dimension
+    // of a particular tensor mode. This Expr should be used as a loop bound
+    // when iterating over the dimension of the target tensor.
+    auto getDimension = [&](const TensorVar& tv, const Access& a, int mode) {
+      // If the tensor mode is windowed, then the dimension for iteration is the bounds
+      // of the window. Otherwise, it is the actual dimension of the mode.
+      if (a.isModeWindowed(mode)) {
+        // The mode value used to access .levelIterator is 1-indexed, while
+        // the mode input to getDimension is 0-indexed. So, we shift it up by 1.
+        auto iter = iterators.levelIterator(ModeAccess(a, mode+1));
+        return ir::Sub::make(iter.getWindowUpperBound(), iter.getWindowLowerBound());
+      } else {
+        return GetProperty::make(tensorVars.at(tv), TensorProperty::Dimension, mode);
+      }
+    };
     match(stmt,
       function<void(const AssignmentNode*, Matcher*)>([&](
           const AssignmentNode* n, Matcher* m) {
         m->match(n->rhs);
         if (!dimension.defined()) {
           auto ivars = n->lhs.getIndexVars();
+          auto tv = n->lhs.getTensorVar();
           int loc = (int)distance(ivars.begin(),
                                   find(ivars.begin(),ivars.end(), indexVar));
-          if(!util::contains(temporariesSet, n->lhs.getTensorVar())) {
-            dimension = GetProperty::make(tensorVars.at(n->lhs.getTensorVar()),
-                                          TensorProperty::Dimension, loc);
+          if(!util::contains(temporariesSet, tv)) {
+            dimension = getDimension(tv, n->lhs, loc);
           }
         }
       }),
@@ -189,8 +204,7 @@ LowererImpl::lower(IndexStmt stmt, string name,
                                   find(indexVars.begin(),indexVars.end(), indexVar));
           if(!util::contains(temporariesSet, n->tensorVar)) {
-            dimension = GetProperty::make(tensorVars.at(n->tensorVar),
-                                          TensorProperty::Dimension, loc);
+            dimension = getDimension(n->tensorVar, Access(n), loc);
           }
         }
       })
@@ -1002,9 +1016,16 @@ Stmt LowererImpl::lowerForallPosition(Forall forall, Iterator iterator,
 {
   Expr coordinate = getCoordinateVar(forall.getIndexVar());
   Stmt declareCoordinate = Stmt();
+  Stmt boundsGuard = Stmt();
   if (provGraph.isCoordVariable(forall.getIndexVar())) {
     Expr coordinateArray = iterator.posAccess(iterator.getPosVar(),
                                               coordinates(iterator)).getResults()[0];
+    // If the iterator is windowed, we must recover the coordinate index
+    // variable from the windowed space.
+    if (iterator.isWindowed()) {
+      coordinateArray = this->projectWindowedPositionToCanonicalSpace(iterator, coordinateArray);
+      boundsGuard = this->upperBoundGuardForWindowPosition(iterator, coordinate);
+    }
     declareCoordinate = VarDecl::make(coordinate, coordinateArray);
   }
   if (forall.getParallelUnit() != ParallelUnit::NotParallel && forall.getOutputRaceStrategy() == OutputRaceStrategy::Atomics) {
@@ -1038,6 +1059,11 @@ Stmt LowererImpl::lowerForallPosition(Forall forall, Iterator iterator,
     boundsCompute = bounds.compute();
     startBound = bounds[0];
     endBound = bounds[1];
+    // If we have a window on this iterator, then search for the start of
+    // the window rather than starting at the beginning of the level.
+    if (iterator.isWindowed()) {
+      startBound = this->searchForStartOfWindowPosition(iterator, startBound, endBound);
+    }
   } else {
     taco_iassert(iterator.isOrdered() && iterator.getParent().isOrdered());
     taco_iassert(iterator.isCompact() && iterator.getParent().isCompact());
@@ -1059,10 +1085,12 @@ Stmt LowererImpl::lowerForallPosition(Forall forall, Iterator iterator,
       && forall.getOutputRaceStrategy() != OutputRaceStrategy::ParallelReduction && !ignoreVectorize) {
     kind = LoopKind::Runtime;
   }
+
   // Loop with preamble and postamble
-  return Block::blanks(boundsCompute,
+  return Block::blanks(
+                       boundsCompute,
                        For::make(iterator.getPosVar(), startBound, endBound, 1,
-                                 Block::make(declareCoordinate, body),
+                                 Block::make(declareCoordinate, boundsGuard, body),
                                  kind,
                                  ignoreVectorize ? ParallelUnit::NotParallel : forall.getParallelUnit(),
                                  ignoreVectorize ? 0 : forall.getUnrollFactor()),
                        posAppend);
@@ -1321,9 +1349,18 @@ Stmt LowererImpl::resolveCoordinate(std::vector<Iterator> mergers, ir::Expr coor
     // Just one position iterator so it is the resolved coordinate
     ModeFunction posAccess = merger.posAccess(merger.getPosVar(),
                                               coordinates(merger));
-    Stmt resolution = emitVarDecl ?
-        VarDecl::make(coordinate, posAccess[0]) : Assign::make(coordinate, posAccess[0]);
+    auto access = posAccess[0];
+    auto guard = Stmt();
+    // If the iterator is windowed, we must recover the coordinate index
+    // variable from the windowed space.
+    if (merger.isWindowed()) {
+      access = this->projectWindowedPositionToCanonicalSpace(merger, access);
+      guard = this->upperBoundGuardForWindowPosition(merger, coordinate);
+    }
+    Stmt resolution = emitVarDecl ? VarDecl::make(coordinate, access) : Assign::make(coordinate, access);
     return Block::make(posAccess.compute(),
-                       resolution);
+                       resolution,
+                       guard);
   }
   else if (merger.hasCoordIter()) {
     taco_not_supported_yet;
@@ -1975,6 +2012,7 @@ Stmt LowererImpl::initResultArrays(vector<Access> writes,
     Expr tensor = getTensorVar(write.getTensorVar());
     Expr valuesArr = GetProperty::make(tensor, TensorProperty::Values);
+    bool clearValuesAllocation = false;
 
     Expr parentSize = 1;
     if (generateAssembleCode()) {
@@ -2002,6 +2040,8 @@ Stmt LowererImpl::initResultArrays(vector<Access> writes,
         }
         parentSize = size;
+        // Writes into a windowed iterator require the allocation to be cleared.
+        clearValuesAllocation |= iterator.isWindowed();
       }
 
       // Pre-allocate memory for the value array if computing while assembling
@@ -2012,7 +2052,8 @@ Stmt LowererImpl::initResultArrays(vector<Access> writes,
         Expr allocSize = isValue(parentSize, 0)
                          ? DEFAULT_ALLOC_SIZE : parentSize;
         initArrays.push_back(VarDecl::make(capacityVar, allocSize));
-        initArrays.push_back(Allocate::make(valuesArr, capacityVar));
+        initArrays.push_back(Allocate::make(valuesArr, capacityVar, false /* is_realloc */, Expr() /* old_elements */,
+                                            clearValuesAllocation));
       }
 
       taco_iassert(!initArrays.empty());
@@ -2062,6 +2103,7 @@ ir::Stmt LowererImpl::finalizeResultArrays(std::vector<Access> writes) {
     return Stmt();
   }
 
+  bool clearValuesAllocation = false;
   std::vector<Stmt> result;
   for (auto& write : writes) {
     if (write.getTensorVar().getOrder() == 0) continue;
@@ -2085,13 +2127,17 @@ ir::Stmt LowererImpl::finalizeResultArrays(std::vector<Access> writes) {
       }
       result.push_back(finalize);
       parentSize = size;
+      // Writes into a windowed iterator require the allocation to be cleared.
+      clearValuesAllocation |= iterator.isWindowed();
     }
 
     if (!generateComputeCode()) {
       // Allocate memory for values array after assembly if not also computing
       Expr tensor = getTensorVar(write.getTensorVar());
       Expr valuesArr = GetProperty::make(tensor, TensorProperty::Values);
-      result.push_back(Allocate::make(valuesArr, parentSize));
+      result.push_back(Allocate::make(valuesArr, parentSize, false /* is_realloc */, Expr() /* old_elements */,
+                                      clearValuesAllocation));
     }
   }
   return result.empty() ? Stmt() : Block::blanks(result);
@@ -2262,7 +2308,6 @@ Stmt LowererImpl::zeroInitValues(Expr tensor, Expr begin, Expr size) {
   return For::make(p, lower, upper, 1, zeroInit, parallel);
 }
 
-
 Stmt LowererImpl::declLocatePosVars(vector<Iterator> locators) {
   vector<Stmt> result;
   for (Iterator& locator : locators) {
@@ -2284,7 +2329,14 @@ Stmt LowererImpl::declLocatePosVars(vector<Iterator> locators) {
         continue; // these will be recovered with separate procedure
       }
       do {
-        ModeFunction locate = locateIterator.locate(coordinates(locateIterator));
+        auto coords = coordinates(locateIterator);
+        // If this dimension iterator operates over a window, then it needs
+        // to be projected up to the window's iteration space.
+        if (locateIterator.isWindowed()) {
+          auto expr = coords[coords.size() - 1];
+          coords[coords.size() - 1] = this->projectCanonicalSpaceToWindowedPosition(locateIterator, expr);
+        }
+        ModeFunction locate = locateIterator.locate(coords);
         taco_iassert(isValue(locate.getResults()[1], true));
         Stmt declarePosVar = VarDecl::make(locateIterator.getPosVar(),
                                            locate.getResults()[0]);
@@ -2378,6 +2430,11 @@ Stmt LowererImpl::codeToInitializeIteratorVar(Iterator iterator, vector
                                         this->iterators)[coordinateVar][0];
   if (binarySearchTarget != underivedBounds[coordinateVar][0]) {
+    // If we have a window, then we need to project the binary search target
+    // up into the window rather than the beginning of the level.
+    if (iterator.isWindowed()) {
+      binarySearchTarget = this->projectCanonicalSpaceToWindowedPosition(iterator, binarySearchTarget);
+    }
     result.push_back(VarDecl::make(iterator.getBeginVar(), binarySearchTarget));
 
     vector<Expr> binarySearchArgs = {
@@ -2394,7 +2451,13 @@ Stmt LowererImpl::codeToInitializeIteratorVar(Iterator iterator, vector
-    result.push_back(VarDecl::make(iterVar, bounds[0]));
+    auto bound = bounds[0];
+    // If we have a window on this iterator, then search for the start of
+    // the window rather than starting at the beginning of the level.
+    if (iterator.isWindowed()) {
+      bound = this->searchForStartOfWindowPosition(iterator, bounds[0], bounds[1]);
+    }
+    result.push_back(VarDecl::make(iterVar, bound));
   }
 
   result.push_back(VarDecl::make(endVar, bounds[1]));
@@ -2556,13 +2619,22 @@ Stmt LowererImpl::codeToLoadCoordinatesFromPosIterators(vector<Iterator> iterato
       ModeFunction posAccess = posIter.posAccess(posIter.getPosVar(),
                                                  coordinates(posIter));
       loadPosIterCoordinateStmts.push_back(posAccess.compute());
+      auto access = posAccess[0];
+      // If this iterator is windowed, then it needs to be projected down to
+      // recover the coordinate variable.
+      // TODO (rohany): It would be cleaner to move this logic into the
+      //  ModeFunction, rather than having to check in several places.
+      if (posIter.isWindowed()) {
+        access = this->projectWindowedPositionToCanonicalSpace(posIter, access);
+      }
       if (declVars) {
-        loadPosIterCoordinateStmts.push_back(VarDecl::make(posIter.getCoordVar(),
-                                                           posAccess[0]));
+        loadPosIterCoordinateStmts.push_back(VarDecl::make(posIter.getCoordVar(), access));
       }
       else {
-        loadPosIterCoordinateStmts.push_back(Assign::make(posIter.getCoordVar(),
-                                                          posAccess[0]));
+        loadPosIterCoordinateStmts.push_back(Assign::make(posIter.getCoordVar(), access));
+      }
+      if (posIter.isWindowed()) {
+        loadPosIterCoordinateStmts.push_back(this->upperBoundGuardForWindowPosition(posIter, posIter.getCoordVar()));
       }
     }
     loadPosIterCoordinates = Block::make(loadPosIterCoordinateStmts);
@@ -2701,4 +2773,33 @@ Expr LowererImpl::checkThatNoneAreExhausted(std::vector<Iterator> iterators)
          : Lt::make(iterators[0].getIteratorVar(), iterators[0].getEndVar());
 }
 
+Expr LowererImpl::searchForStartOfWindowPosition(Iterator iterator, ir::Expr start, ir::Expr end) {
+    taco_iassert(iterator.isWindowed());
+    vector<Expr> args = {
+        // Search over the `crd` array of the level,
+        iterator.getMode().getModePack().getArray(1),
+        // between the start and end position,
+        start, end,
+        // for the beginning of the window.
+        iterator.getWindowLowerBound(),
+    };
+    return Call::make("taco_binarySearchAfter", args, Datatype::UInt64);
+}
+
+Stmt LowererImpl::upperBoundGuardForWindowPosition(Iterator iterator, ir::Expr access) {
+  taco_iassert(iterator.isWindowed());
+  return ir::IfThenElse::make(
+    ir::Gte::make(access, ir::Sub::make(iterator.getWindowUpperBound(), iterator.getWindowLowerBound())),
+    ir::Break::make()
+  );
+}
+
+Expr LowererImpl::projectWindowedPositionToCanonicalSpace(Iterator iterator, ir::Expr expr) {
+  return ir::Sub::make(expr, iterator.getWindowLowerBound());
+}
+
+Expr LowererImpl::projectCanonicalSpaceToWindowedPosition(Iterator iterator, ir::Expr expr) {
+  return ir::Add::make(expr, iterator.getWindowLowerBound());
+}
+
 }
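Putting the four helpers together, the loop lowered for a windowed compressed mode looks roughly like the following hand-written sketch (illustrative names, not actual taco output): `searchForStartOfWindowPosition` seeds the position loop, `projectWindowedPositionToCanonicalSpace` recovers the canonical coordinate, and `upperBoundGuardForWindowPosition` breaks out once past the window.
```
// Iterating mode 2 of B over the window [lo, hi):
int32_t pB2 = taco_binarySearchAfter(B2_crd, B2_pos[i], B2_pos[i + 1], lo);
for (; pB2 < B2_pos[i + 1]; pB2++) {
  int32_t j = B2_crd[pB2] - lo;   // project [lo, hi) back onto [0, hi - lo)
  if (j >= hi - lo) {
    break;                        // past the window's upper bound
  }
  // ... loop body uses the canonical j ...
}
```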
diff --git a/src/tensor.cpp b/src/tensor.cpp
index cd8efd7d4..ce2e4190d 100644
--- a/src/tensor.cpp
+++ b/src/tensor.cpp
@@ -458,6 +458,31 @@ static inline map<TensorVar,TensorBase> getTensors(const IndexExpr& expr);
 struct AccessTensorNode : public AccessNode {
   AccessTensorNode(TensorBase tensor, const std::vector<IndexVar>& indices)
       :  AccessNode(tensor.getTensorVar(), indices), tensor(tensor) {}
+
+  AccessTensorNode(TensorBase tensor, const std::vector<std::shared_ptr<IndexVarInterface>>& indices)
+      : AccessNode(tensor.getTensorVar()), tensor(tensor) {
+    // Create the vector of IndexVar to assign to this->indexVars.
+    std::vector<IndexVar> ivars(indices.size());
+    for (size_t i = 0; i < indices.size(); i++) {
+      auto var = indices[i];
+      // Match on what the IndexVarInterface actually is.
+      IndexVarInterface::match(var, [&](std::shared_ptr<IndexVar> ivar) {
+        ivars[i] = *ivar;
+      }, [&](std::shared_ptr<WindowedIndexVar> wvar) {
+        ivars[i] = wvar->getIndexVar();
+        auto lo = wvar->getLowerBound();
+        auto hi = wvar->getUpperBound();
+        taco_uassert(lo >= 0) << "slice lower bound must be >= 0";
+        taco_uassert(hi <= tensor.getDimension(i)) <<
+          "slice upper bound must be <= tensor dimension (" << tensor.getDimension(i) << ")";
+        this->windowedModes[i].lo = lo;
+        this->windowedModes[i].hi = hi;
+      });
+    }
+    // Initialize this->indexVars.
+    this->indexVars = std::move(ivars);
+  }
+
   TensorBase tensor;
   virtual void setAssignment(const Assignment& assignment) {
     tensor.syncDependentTensors();
@@ -502,6 +527,14 @@ Access TensorBase::operator()(const std::vector<IndexVar>& indices) {
   return Access(new AccessTensorNode(*this, indices));
 }
 
+Access TensorBase::operator()(const std::vector<std::shared_ptr<IndexVarInterface>>& indices) {
+  taco_uassert(indices.size() == (size_t)getOrder())
+      << "A tensor of order " << getOrder() << " must be indexed with "
+      << getOrder() << " variables, but is indexed with: "
+      << util::join(indices);
+  return Access(new AccessTensorNode(*this, indices));
+}
+
 Access TensorBase::operator()() {
   return this->operator()(std::vector<IndexVar>());
 }
diff --git a/test/test.cpp b/test/test.cpp
index 597ddf2ef..a49f10ff7 100644
--- a/test/test.cpp
+++ b/test/test.cpp
@@ -1,3 +1,5 @@
+#include <functional>
+
 #include "test.h"
 #include "taco/tensor.h"
 
@@ -51,4 +53,18 @@ ostream& operator<<(ostream& os, const NotationTest& test) {
   return os;
 }
 
+void ASSERT_THROWS_EXCEPTION_WITH_ERROR(std::function<void()> f, std::string err) {
+  EXPECT_THROW({
+    try {
+      f();
+    } catch (TacoException& e) {
+      // Catch and inspect the exception to make sure that err is within it.
+      auto s = std::string(e.what());
+      ASSERT_TRUE(s.find(err) != std::string::npos);
+      // Throw the exception back up to gtest.
+      throw;
+    }
+  }, TacoException);
+}
+
 }}
diff --git a/test/test.h b/test/test.h
index 04000bafa..3302bf81f 100644
--- a/test/test.h
+++ b/test/test.h
@@ -3,6 +3,7 @@
 
 #include "gtest/gtest.h"
 
+#include <functional>
 #include <string>
 #include <utility>
 #include <vector>
@@ -93,6 +94,10 @@ void ASSERT_COMPONENTS_EQUALS(vector<pair<vector<int>,double>> expectedIndices,
   ASSERT_ARRAY_EQ(expectedValues, {(double*)storage.getValues().getData(),nnz});
 }
 
+// ASSERT_THROWS_EXCEPTION_WITH_ERROR asserts that the input function throws
+// a TacoException with the input string err contained within the body.
+void ASSERT_THROWS_EXCEPTION_WITH_ERROR(std::function<void()> f, std::string err);
+
 struct NotationTest {
   NotationTest(IndexStmt actual, IndexStmt expected)
       : actual(actual), expected(expected) {}
diff --git a/test/tests-windowing.cpp b/test/tests-windowing.cpp
new file mode 100644
index 000000000..95e75d34e
--- /dev/null
+++ b/test/tests-windowing.cpp
@@ -0,0 +1,386 @@
+#include "test.h"
+#include "taco/tensor.h"
+#include "taco/codegen/module.h"
+#include "taco/index_notation/index_notation.h"
+#include "taco/lower/lower.h"
+
+#include <functional>
+
+using namespace taco;
+
+// mixIndexing is a compilation test to ensure that we can index into a
+// tensor with a mix of IndexVars and WindowedIndexVars.
+TEST(windowing, mixIndexing) {
+  auto dim = 10;
+  Tensor<int> a("a", {dim, dim, dim, dim, dim}, {Dense, Dense, Dense, Dense, Dense});
+  IndexVar i, j, k, l, m;
+  auto w1 = a(i, j(1, 3), k, l(4, 5), m(6, 7));
+  auto w2 = a(i(1, 3), j(2, 4), k, l, m(3, 5));
+}
+
+TEST(windowing, boundsChecks) {
+  Tensor<int> a("a", {5}, {Dense});
+  IndexVar i("i");
+  ASSERT_THROWS_EXCEPTION_WITH_ERROR([&]() { a(i(-1, 4)); }, "slice lower bound");
+  ASSERT_THROWS_EXCEPTION_WITH_ERROR([&]() { a(i(0, 10)); }, "slice upper bound");
+}
+
+// sliceMultipleWays tests that the same tensor can be sliced in different ways
+// in the same expression.
+TEST(windowing, sliceMultipleWays) {
+  auto dim = 10;
+  Tensor<int> a("a", {dim}, {Dense});
+  Tensor<int> b("b", {dim}, {Sparse});
+  Tensor<int> c("c", {dim}, {Dense});
+  Tensor<int> expected("expected", {dim}, {Dense});
+  for (int i = 0; i < dim; i++) {
+    a.insert({i}, i);
+    b.insert({i}, i);
+  }
+  expected.insert({2}, 10);
+  expected.insert({3}, 13);
+  a.pack(); b.pack(); expected.pack();
+  IndexVar i("i"), j("j");
+
+  c(i(2, 4)) = a(i(5, 7)) + a(i(1, 3)) + b(i(4, 6));
+  c.evaluate();
+  ASSERT_TRUE(equals(expected, c));
+}
+
+// The basic test checks windowing behavior, parameterized by a dimension
+// of the input tensors and formats for each of the tensors in the computation.
+struct basic : public TestWithParam<std::tuple<int, ModeFormat, ModeFormat, ModeFormat>> {};
+TEST_P(basic, windowing) {
+  Tensor<int> expectedAdd("expectedAdd", {2, 2}, {Dense, Dense});
+  expectedAdd.insert({0, 0}, 14);
+  expectedAdd.insert({0, 1}, 17);
+  expectedAdd.insert({1, 0}, 17);
+  expectedAdd.insert({1, 1}, 20);
+  expectedAdd.pack();
+  Tensor<int> expectedMul("expectedMul", {2, 2}, {Dense, Dense});
+  expectedMul.insert({0, 0}, 64);
+  expectedMul.insert({0, 1}, 135);
+  expectedMul.insert({1, 0}, 135);
+  expectedMul.insert({1, 1}, 240);
+  expectedMul.pack();
+  Tensor<int> d("d", {2, 2}, {Dense, Dense});
+
+  // The test is parameterized by a dimension, and formats for the different tensors.
+  auto dim = std::get<0>(GetParam());
+  auto x = std::get<1>(GetParam());
+  auto y = std::get<2>(GetParam());
+  auto z = std::get<3>(GetParam());
+  Tensor<int> a("a", {dim, dim}, {Dense, x});
+  Tensor<int> b("b", {dim, dim}, {Dense, y});
+  Tensor<int> c("c", {dim, dim}, {Dense, z});
+  for (int i = 0; i < dim; i++) {
+    for (int j = 0; j < dim; j++) {
+      a.insert({i, j}, i + j);
+      b.insert({i, j}, i + j);
+      c.insert({i, j}, i + j);
+    }
+  }
+
+  a.pack();
+  b.pack();
+  c.pack();
+
+  IndexVar i, j;
+  d(i, j) = a(i(2, 4), j(2, 4)) + b(i(4, 6), j(4, 6)) + c(i(1, 3), j(1, 3));
+  d.evaluate();
+  ASSERT_TRUE(equals(expectedAdd, d))
+    << endl << expectedAdd << endl << endl << d << endl
+    << dim << " " << x << " " << y << " " << z << endl;
+
+  d(i, j) = a(i(2, 4), j(2, 4)) * b(i(4, 6), j(4, 6)) * c(i(1, 3), j(1, 3));
+  d.evaluate();
+  ASSERT_TRUE(equals(expectedMul, d))
+    << endl << expectedMul << endl << endl << d << endl
+    << dim << " " << x << " " << y << " " << z << endl;
+}
+INSTANTIATE_TEST_CASE_P(
+  windowing,
+  basic,
+  // Test on the cartesian product of the chosen dimensions and different
+  // combinations of tensor formats.
+  Combine(Values(6, 20), Values(Dense, Sparse), Values(Dense, Sparse), Values(Dense, Sparse))
+);
+
+// slicedOutput tests that operations can write to a window within an output tensor.
+// The test is parameterized over formats for the used tensors.
+struct slicedOutput : public TestWithParam<std::tuple<ModeFormat, ModeFormat>> {};
+TEST_P(slicedOutput, windowing) {
+  auto dim = 10;
+  Tensor<int> expected("expected", {10, 10}, {Dense, Dense});
+  expected.insert({8, 8}, 12);
+  expected.insert({8, 9}, 14);
+  expected.insert({9, 8}, 14);
+  expected.insert({9, 9}, 16);
+  expected.pack();
+  auto x = std::get<0>(GetParam());
+  auto y = std::get<1>(GetParam());
+  Tensor<int> a("a", {dim, dim}, {Dense, x});
+  Tensor<int> b("b", {dim, dim}, {Dense, y});
+  Tensor<int> c("c", {dim, dim}, {Dense, Dense});
+  for (int i = 0; i < dim; i++) {
+    for (int j = 0; j < dim; j++) {
+      a.insert({i, j}, i + j);
+      b.insert({i, j}, i + j);
+    }
+  }
+  a.pack();
+  b.pack();
+
+  IndexVar i, j;
+  c(i(8, 10), j(8, 10)) = a(i(2, 4), j(2, 4)) + b(i(4, 6), j(4, 6));
+  c.evaluate();
+  ASSERT_TRUE(equals(expected, c))
+    << endl << expected << endl << endl << c << endl
+    << dim << " " << x << " " << y << endl;
+}
+INSTANTIATE_TEST_CASE_P(
+  windowing,
+  slicedOutput,
+  Combine(Values(Dense, Sparse), Values(Dense, Sparse))
+);
+
+// matrixMultiply tests a matrix multiply, and in the process is testing
+// windowing on expressions that contain reductions. The test is parameterized
+// over formats for the used tensors.
+struct matrixMultiply : public TestWithParam<std::tuple<ModeFormat, ModeFormat>> {};
+TEST_P(matrixMultiply, windowing) {
+  auto dim = 10;
+  auto windowDim = 4;
+
+  Tensor<int> a("a", {windowDim, windowDim}, {Dense, Dense});
+  Tensor<int> b("b", {windowDim, windowDim}, {Dense, Dense});
+  Tensor<int> c("c", {windowDim, windowDim}, {Dense, Dense});
+  Tensor<int> expected("expected", {windowDim, windowDim}, {Dense, Dense});
+
+  auto x = std::get<0>(GetParam());
+  auto y = std::get<1>(GetParam());
+  Tensor<int> aw("aw", {dim, dim}, {Dense, x});
+  Tensor<int> bw("bw", {dim, dim}, {Dense, y});
+  for (int i = 0; i < dim; i++) {
+    for (int j = 0; j < dim; j++) {
+      aw.insert({i, j}, i + j);
+      bw.insert({i, j}, i + j);
+    }
+  }
+  aw.pack(); bw.pack();
+
+  IndexVar i("i"), j("j"), k("k");
+  // Evaluate the windowed matrix multiply.
+  c(i, k) = aw(i(4, 8), j(2, 6)) * bw(j(0, 4), k(6, 10));
+  c.evaluate();
+
+  // Copy the windowed portions of aw and bw into separate tensors, and test
+  // that the un-windowed matrix multiplication has the same results.
+  a(i, j) = aw(i(4, 8), j(2, 6));
+  a.evaluate();
+  b(i, j) = bw(i(0, 4), j(6, 10));
+  b.evaluate();
+  expected(i, k) = a(i, j) * b(j, k);
+  expected.evaluate();
+
+  ASSERT_TRUE(equals(expected, c)) << expected << endl << c << endl;
+}
+INSTANTIATE_TEST_CASE_P(
+  windowing,
+  matrixMultiply,
+  Combine(Values(Dense, Sparse), Values(Dense, Sparse))
+);
+
+// workspace tests that workspaces can be assigned to and used in computations
+// that involve windowed tensors. The test is parameterized over formats for
+// the used tensors.
+struct workspace : public TestWithParam<std::tuple<ModeFormat, ModeFormat>> {};
+TEST_P(workspace, windowing) {
+  auto dim = 10;
+  size_t windowDim = 4;
+  Tensor<int> d("d", {static_cast<int>(windowDim)}, {Dense});
+  Tensor<int> expected("expected", {static_cast<int>(windowDim)}, {Dense});
+  expected.insert({0}, 8); expected.insert({1}, 11);
+  expected.insert({2}, 14); expected.insert({3}, 17);
+  expected.pack();
+
+  auto x = std::get<0>(GetParam());
+  auto y = std::get<1>(GetParam());
+  Tensor<int> a("a", {dim}, {x});
+  Tensor<int> b("b", {dim}, {y});
+  Tensor<int> c("c", {dim}, {Dense});
+  for (int i = 0; i < dim; i++) {
+    a.insert({i}, i);
+    b.insert({i}, i);
+    c.insert({i}, i);
+  }
+  a.pack();
+  b.pack();
+  c.pack();
+  IndexVar i("i");
+  TensorVar p("p", Type(Int(), {windowDim}), Dense);
+  auto precomputed = a(i(2, 6)) + b(i(6, 10));
+  d(i) = precomputed + c(i(0, 4));
+  auto stmt = d.getAssignment().concretize();
+  stmt = stmt.precompute(precomputed, i, i, p);
+  d.compile(stmt.concretize());
+  d.evaluate();
+  ASSERT_TRUE(equals(d, expected)) << expected << endl << d << endl;
+}
+INSTANTIATE_TEST_CASE_P(
+  windowing,
+  workspace,
+  Combine(Values(Dense, Sparse), Values(Dense, Sparse))
+);
+
+// transformations tests how windowing interacts with sparse iteration space
+// transformations and different mode formats.
+TEST(windowing, transformations) {
+  auto dim = 10;
+  Tensor<int> expected("expected", {2, 2}, {Dense, Dense});
+  expected.insert({0, 0}, 12);
+  expected.insert({0, 1}, 14);
+  expected.insert({1, 0}, 14);
+  expected.insert({1, 1}, 16);
+  expected.pack();
+
+  IndexVar i("i"), j("j"), i1("i1"), i2("i2");
+  auto testFn = [&](std::function<IndexStmt(IndexStmt)> modifier, std::vector<Format> formats) {
+    for (auto& format : formats) {
+      Tensor<int> a("a", {dim, dim}, format);
+      Tensor<int> b("b", {dim, dim}, format);
+      for (int i = 0; i < dim; i++) {
+        for (int j = 0; j < dim; j++) {
+          a.insert({i, j}, i + j);
+          b.insert({i, j}, i + j);
+        }
+      }
+      a.pack(); b.pack();
+
+      Tensor<int> c("c", {2, 2}, {Dense, Dense});
+      c(i, j) = a(i(2, 4), j(2, 4)) + b(i(4, 6), j(4, 6));
+      auto stmt = c.getAssignment().concretize();
+      c.compile(modifier(stmt));
+      c.evaluate();
+      ASSERT_TRUE(equals(c, expected)) << endl << c << endl << expected << endl << format << endl;
+    }
+  };
+
+  std::vector<Format> allFormats = {{Dense, Dense}, {Dense, Sparse}, {Sparse, Dense}, {Sparse, Sparse}};
+  testFn([&](IndexStmt stmt) {
+    return stmt.split(i, i1, i2, 4).unroll(i2, 4);
+  }, allFormats);
+
+  testFn([&](IndexStmt stmt) {
+    return stmt.reorder(i, j);
+  }, {{Dense, Dense}});
+
+  // We can only (currently) parallelize the outer dimension loop if it is dense.
+  testFn([&](IndexStmt stmt) {
+    return stmt.parallelize(i, taco::ParallelUnit::CPUThread, taco::OutputRaceStrategy::NoRaces);
+  }, {{Dense, Dense}, {Dense, Sparse}});
+}
+
+// assignment tests assignments of and to windows in different combinations.
+// The test is parameterized over formats for the used tensors.
+struct assignment : public TestWithParam<ModeFormat> {};
+TEST_P(assignment, windowing) {
+  auto dim = 10;
+  auto srcFormat = GetParam();
+  Tensor<int> A("A", {dim, dim}, srcFormat);
+
+  for (int i = 0; i < dim; i++) {
+    for (int j = 0; j < dim; j++) {
+      A.insert({i, j}, i + j);
+    }
+  }
+  A.pack();
+
+  IndexVar i, j;
+
+  // First assign a window of A to a window of B.
+  Tensor<int> B("B", {dim, dim}, {Dense, Dense});
+  B(i(2, 4), j(3, 5)) = A(i(4, 6), j(5, 7));
+  B.evaluate();
+  Tensor<int> expected("expected", {dim, dim}, {Dense, Dense});
+  expected.insert({2, 3}, 9); expected.insert({2, 4}, 10);
+  expected.insert({3, 3}, 10); expected.insert({3, 4}, 11);
+  expected.pack();
+  ASSERT_TRUE(equals(B, expected)) << B << std::endl << expected << std::endl;
+
+  // Assign a window of A to B.
+  B = Tensor<int>("B", {2, 2}, {Dense, Dense});
+  B(i, j) = A(i(4, 6), j(5, 7));
+  B.evaluate();
+  expected = Tensor<int>("expected", {2, 2}, {Dense, Dense});
+  expected.insert({0, 0}, 9); expected.insert({0, 1}, 10);
+  expected.insert({1, 0}, 10); expected.insert({1, 1}, 11);
+  expected.pack();
+  ASSERT_TRUE(equals(B, expected)) << B << std::endl << expected << std::endl;
+
+  // Assign A to a window of B.
+  A = Tensor<int>("A", {2, 2}, srcFormat);
+  A.insert({0, 0}, 0); A.insert({0, 1}, 1);
+  A.insert({1, 0}, 1); A.insert({1, 1}, 2);
+  A.pack();
+  B = Tensor<int>("B", {dim, dim}, {Dense, Dense});
+  B(i(4, 6), j(5, 7)) = A(i, j);
+  B.evaluate();
+  expected = Tensor<int>("expected", {dim, dim}, {Dense, Dense});
+  expected.insert({4, 5}, 0); expected.insert({4, 6}, 1);
+  expected.insert({5, 5}, 1); expected.insert({5, 6}, 2);
+  expected.pack();
+  ASSERT_TRUE(equals(B, expected)) << B << std::endl << expected << std::endl;
+}
+INSTANTIATE_TEST_CASE_P(
+  windowing,
+  assignment,
+  Values(Dense, Sparse)
+);
+
+// cuda tests a basic windowing operation when using GPU targeted code.
+// The test is parameterized over formats for the used tensors.
+struct cuda : public TestWithParam<std::tuple<ModeFormat, ModeFormat>> {};
+TEST_P(cuda, windowing) {
+  if (!should_use_CUDA_codegen()) {
+    return;
+  }
+  auto dim = 10;
+  Tensor<int> expected("expected", {2, 2}, {Dense, Dense});
+  expected.insert({0, 0}, 12); expected.insert({0, 1}, 14);
+  expected.insert({1, 0}, 14); expected.insert({1, 1}, 16);
+  expected.pack();
+
+  auto x = std::get<0>(GetParam());
+  auto y = std::get<1>(GetParam());
+  Tensor<int> a("a", {dim, dim}, {Dense, x});
+  Tensor<int> b("b", {dim, dim}, {Dense, y});
+  Tensor<int> c("c", {2, 2}, {Dense, Dense});
+
+  for (int i = 0; i < dim; i++) {
+    for (int j = 0; j < dim; j++) {
+      a.insert({i, j}, i + j);
+      b.insert({i, j}, i + j);
+    }
+  }
+  a.pack(); b.pack();
+
+  IndexVar i("i"), j("j"), i1("i1"), i2("i2"), i3("i3"), i4("i4");
+  c(i, j) = a(i(4, 6), j(4, 6)) + b(i(2, 4), j(2, 4));
+  auto stmt = c.getAssignment().concretize();
+  stmt = stmt.split(i, i1, i2, 512)
+             .split(i2, i3, i4, 32)
+             .parallelize(i1, ParallelUnit::GPUBlock, OutputRaceStrategy::NoRaces)
+             .parallelize(i3, ParallelUnit::GPUWarp, OutputRaceStrategy::NoRaces)
+             .parallelize(i4, ParallelUnit::GPUThread, OutputRaceStrategy::NoRaces);
+
+  c.compile(stmt);
+  c.evaluate();
+  ASSERT_TRUE(equals(c, expected)) << c << endl << expected << endl;
+}
+INSTANTIATE_TEST_CASE_P(
+  windowing,
+  cuda,
+  Combine(Values(Dense, Sparse), Values(Dense, Sparse))
+);