From 2d4a7d3a566fc082a55cdea4ee075cec3092edb4 Mon Sep 17 00:00:00 2001
From: Rohan Yadav
Date: Fri, 8 Jan 2021 18:39:51 -0800
Subject: [PATCH] *: add support for windowing of tensors

This commit adds support for windowing of tensors in the existing index
notation DSL. For example:
```
A(i, j) = B(i(1, 4), j) * C(i, j(5, 10))
```
causes `B` to be windowed along its first mode, and `C` to be windowed
along its second mode. Any mix of windowed and non-windowed modes is
supported, along with windowing the same tensor in different ways within
the same expression. The windowing expressions correspond to numpy's `:`
operator for slicing dimensions. Currently, only windowing by integer
bounds is supported.

Windowing is achieved by tying windowing information to particular
`Iterator` objects, since an `Iterator` is created for each
`Tensor`-`IndexVar` pair. When iterating over an `Iterator` that may be
windowed, extra steps are taken to either generate an index into the
windowed space, or to recover an index from a point in the windowed
space.
---
 include/taco/index_notation/index_notation.h |  84 +++-
 .../index_notation/index_notation_nodes.h    |  21 +-
 include/taco/ir/ir.h                         |   3 +-
 include/taco/lower/iterator.h                |  15 +
 include/taco/lower/lowerer_impl.h            |  23 +-
 include/taco/tensor.h                        |  79 ++++
 src/codegen/codegen_c.cpp                    |   8 +-
 src/codegen/codegen_cuda.cpp                 |   9 +-
 src/error/error_checks.cpp                   |   8 +
 src/index_notation/index_notation.cpp        |  94 ++++-
 .../index_notation_rewriter.cpp              |   2 +-
 src/ir/ir.cpp                                |   3 +-
 src/lower/iterator.cpp                       |  39 +-
 src/lower/lowerer_impl.cpp                   | 137 ++++++-
 src/tensor.cpp                               |  33 ++
 test/test.cpp                                |  16 +
 test/test.h                                  |   5 +
 test/tests-windowing.cpp                     | 386 ++++++++++++++++++
 18 files changed, 930 insertions(+), 35 deletions(-)
 create mode 100644 test/tests-windowing.cpp
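As a usage sketch (not part of the patch itself): with this change, a window is expressed by calling an `IndexVar` with integer bounds. A minimal end-to-end example, assuming the standard taco headers; tensor names, formats, and sizes here are illustrative:
```
#include "taco/tensor.h"
using namespace taco;

int main() {
  Tensor<double> A("A", {3, 3}, {Dense, Dense});
  Tensor<double> B("B", {10, 10}, {Dense, Sparse});
  Tensor<double> C("C", {10, 10}, {Dense, Sparse});
  IndexVar i("i"), j("j");
  // i(1, 4) windows B's first mode to rows [1, 4), like B[1:4, 5:8] in numpy.
  A(i, j) = B(i(1, 4), j(5, 8)) * C(i(2, 5), j(4, 7));
  A.evaluate();
}
```
Each window's extent (here 3) must match the dimension the index variable ranges over, which is what the `error_checks.cpp` change below enforces.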
diff --git a/include/taco/index_notation/index_notation.h b/include/taco/index_notation/index_notation.h
index cacd1411c..0646398b0 100644
--- a/include/taco/index_notation/index_notation.h
+++ b/include/taco/index_notation/index_notation.h
@@ -1,6 +1,7 @@
 #ifndef TACO_INDEX_NOTATION_H
 #define TACO_INDEX_NOTATION_H
 
+#include <memory>
 #include <ostream>
 #include <string>
 #include <utility>
@@ -30,6 +31,7 @@ class Format;
 class Schedule;
 
 class IndexVar;
+class WindowedIndexVar;
 class TensorVar;
 
 class IndexExpr;
@@ -37,6 +39,7 @@ class Assignment;
 class Access;
 
 struct AccessNode;
+struct AccessWindow;
 struct LiteralNode;
 struct NegNode;
 struct SqrtNode;
@@ -220,7 +223,8 @@ class Access : public IndexExpr {
   Access() = default;
   Access(const Access&) = default;
   Access(const AccessNode*);
-  Access(const TensorVar& tensorVar, const std::vector<IndexVar>& indices={});
+  Access(const TensorVar &tensorVar, const std::vector<IndexVar> &indices = {},
+         const std::map<int, AccessWindow> &windows = {});
 
   /// Return the Access expression's TensorVar.
   const TensorVar &getTensorVar() const;
@@ -228,6 +232,16 @@ class Access : public IndexExpr {
   /// Returns the index variables used to index into the Access's TensorVar.
   const std::vector<IndexVar>& getIndexVars() const;
 
+  /// hasWindowedModes returns true if any accessed modes are windowed.
+  bool hasWindowedModes() const;
+
+  /// Returns whether or not the input mode (0-indexed) is windowed.
+  bool isModeWindowed(int mode) const;
+
+  /// Return the {lower,upper} bound of the window on the input mode (0-indexed).
+  int getWindowLowerBound(int mode) const;
+  int getWindowUpperBound(int mode) const;
+
   /// Assign the result of an expression to a left-hand-side tensor access.
   /// ```
   /// a(i) = b(i) * c(i);
@@ -800,11 +814,67 @@ class Multi : public IndexStmt {
 /// Create a multi index statement.
 Multi multi(IndexStmt stmt1, IndexStmt stmt2);
 
+/// IndexVarInterface is a marker superclass for IndexVar-like objects.
+/// It is intended to be used in situations where many IndexVar-like objects
+/// must be stored together, like when building an Access AST node where some
+/// of the access variables are windowed. Use cases for IndexVarInterface
+/// will inspect the underlying type of the IndexVarInterface. For the sake
+/// of completeness, the current implementers of IndexVarInterface are:
+/// * IndexVar
+/// * WindowedIndexVar
+/// If this set changes, make sure to update the match function.
+class IndexVarInterface {
+public:
+  virtual ~IndexVarInterface() = default;
+
+  /// match performs a dynamic case analysis of the implementers of
+  /// IndexVarInterface as a utility for handling the different values
+  /// within. It mimics the dynamic type assertion of Go.
+  static void match(
+      std::shared_ptr<IndexVarInterface> ptr,
+      std::function<void(std::shared_ptr<IndexVar>)> ivarFunc,
+      std::function<void(std::shared_ptr<WindowedIndexVar>)> wvarFunc
+  ) {
+    auto iptr = std::dynamic_pointer_cast<IndexVar>(ptr);
+    auto wptr = std::dynamic_pointer_cast<WindowedIndexVar>(ptr);
+    if (iptr != nullptr) {
+      ivarFunc(iptr);
+    } else if (wptr != nullptr) {
+      wvarFunc(wptr);
+    } else {
+      taco_iassert(false) << "IndexVarInterface was not IndexVar or WindowedIndexVar";
+    }
+  }
+};
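A hypothetical caller of `match` dispatches on the concrete type without doing any casting of its own; this mirrors how `AccessTensorNode` in `src/tensor.cpp` (later in this patch) consumes a mixed list of access variables:
```
// Sketch: separate plain and windowed variables out of a mixed list
// (assumes the taco namespace; the variable names are illustrative).
std::vector<std::shared_ptr<IndexVarInterface>> vars;
IndexVar i("i");
vars.push_back(std::make_shared<IndexVar>(i));
vars.push_back(std::make_shared<WindowedIndexVar>(i(1, 4)));
for (auto& var : vars) {
  IndexVarInterface::match(var, [&](std::shared_ptr<IndexVar> ivar) {
    // Plain variable: this mode is not windowed.
  }, [&](std::shared_ptr<WindowedIndexVar> wvar) {
    // Windowed variable: the mode covers
    // [wvar->getLowerBound(), wvar->getUpperBound()).
  });
}
```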
+
+/// WindowedIndexVar represents an IndexVar that has been windowed. For example,
+///   A(i) = B(i(2, 4))
+/// In this case, i(2, 4) is a WindowedIndexVar. WindowedIndexVar is defined
+/// before IndexVar so that IndexVar can return objects of type WindowedIndexVar.
+class WindowedIndexVar : public util::Comparable<WindowedIndexVar>, public IndexVarInterface {
+public:
+  WindowedIndexVar(IndexVar base, int lo = -1, int hi = -1);
+  ~WindowedIndexVar() = default;
+
+  /// getIndexVar returns the underlying IndexVar.
+  IndexVar getIndexVar() const;
+
+  /// get{Lower,Upper}Bound returns the {lower,upper} bound of the window of
+  /// this index variable.
+  int getLowerBound() const;
+  int getUpperBound() const;
+
+private:
+  struct Content;
+  std::shared_ptr<Content> content;
+};
+
 /// Index variables are used to index into tensors in index expressions, and
 /// they represent iteration over the tensor modes they index into.
-class IndexVar : public util::Comparable<IndexVar> {
+class IndexVar : public util::Comparable<IndexVar>, public IndexVarInterface {
 public:
   IndexVar();
+  ~IndexVar() = default;
   IndexVar(const std::string& name);
 
   /// Returns the name of the index variable.
@@ -813,6 +883,8 @@ class IndexVar : public util::Comparable<IndexVar> {
   friend bool operator==(const IndexVar&, const IndexVar&);
   friend bool operator<(const IndexVar&, const IndexVar&);
 
+  /// Indexing into an IndexVar returns a window into it.
+  WindowedIndexVar operator()(int lo, int hi);
 
 private:
   struct Content;
@@ -823,7 +895,15 @@ struct IndexVar::Content {
   std::string name;
 };
 
+struct WindowedIndexVar::Content {
+  IndexVar base;
+  int lo;
+  int hi;
+};
+
+std::ostream& operator<<(std::ostream&, const std::shared_ptr<IndexVarInterface>&);
 std::ostream& operator<<(std::ostream&, const IndexVar&);
+std::ostream& operator<<(std::ostream&, const WindowedIndexVar&);
 
 /// A suchthat statement provides a set of IndexVarRel that constrain
 /// the iteration space for the child concrete index notation
diff --git a/include/taco/index_notation/index_notation_nodes.h b/include/taco/index_notation/index_notation_nodes.h
index 95439cd6b..d18ee4a3d 100644
--- a/include/taco/index_notation/index_notation_nodes.h
+++ b/include/taco/index_notation/index_notation_nodes.h
@@ -13,10 +13,21 @@
 
 namespace taco {
 
+// An AccessNode carries the windowing information for an IndexVar + TensorVar
+// combination. An AccessWindow contains the lower and upper bounds of each
+// windowed mode (0-indexed). AccessWindow is extracted from AccessNode so that
+// it can be referenced externally.
+struct AccessWindow {
+  int lo;
+  int hi;
+  friend bool operator==(const AccessWindow& a, const AccessWindow& b) {
+    return a.lo == b.lo && a.hi == b.hi;
+  }
+};
 
 struct AccessNode : public IndexExprNode {
-  AccessNode(TensorVar tensorVar, const std::vector<IndexVar>& indices)
-    : IndexExprNode(tensorVar.getType().getDataType()), tensorVar(tensorVar), indexVars(indices) {}
+  AccessNode(TensorVar tensorVar, const std::vector<IndexVar>& indices,
+             const std::map<int, AccessWindow>& windows={})
+    : IndexExprNode(tensorVar.getType().getDataType()), tensorVar(tensorVar),
+      indexVars(indices), windowedModes(windows) {}
 
   void accept(IndexExprVisitorStrict* v) const {
     v->visit(this);
@@ -26,6 +37,12 @@ struct AccessNode : public IndexExprNode {
 
   TensorVar tensorVar;
   std::vector<IndexVar> indexVars;
+  std::map<int, AccessWindow> windowedModes;
+
+protected:
+  /// Initialize an AccessNode with just a TensorVar. If this constructor is used,
+  /// then indexVars must be set afterwards.
+  explicit AccessNode(TensorVar tensorVar)
+    : IndexExprNode(tensorVar.getType().getDataType()), tensorVar(tensorVar) {}
 };
 
 struct LiteralNode : public IndexExprNode {
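To make the bookkeeping concrete, here is (illustratively) what the new fields hold for the access `B(i(1, 4), j)` from the commit message; only the windowed mode appears in the map, keyed by its 0-indexed mode number:
```
std::map<int, AccessWindow> windows;
windows[0] = AccessWindow{1, 4};  // mode 0 is windowed to [1, 4); mode 1 is absent
// B(i(1, 4), j) then builds an AccessNode with indexVars == {i, j} and
// windowedModes == windows.
```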
diff --git a/include/taco/ir/ir.h b/include/taco/ir/ir.h
index 0fd9a45b4..f852f26b1 100644
--- a/include/taco/ir/ir.h
+++ b/include/taco/ir/ir.h
@@ -688,9 +688,10 @@ struct Allocate : public StmtNode<Allocate> {
   Expr num_elements;
   Expr old_elements; // used for realloc in CUDA
   bool is_realloc;
+  bool clear; // Whether to use calloc to allocate this memory.
 
   static Stmt make(Expr var, Expr num_elements, bool is_realloc=false,
-                   Expr old_elements=Expr());
+                   Expr old_elements=Expr(), bool clear=false);
 
   static const IRNodeType _type_info = IRNodeType::Allocate;
 };
diff --git a/include/taco/lower/iterator.h b/include/taco/lower/iterator.h
index 1d871ffaa..0fe765653 100644
--- a/include/taco/lower/iterator.h
+++ b/include/taco/lower/iterator.h
@@ -159,6 +159,17 @@ class Iterator : public util::Comparable<Iterator> {
   /// Returns true if the iterator is defined, false otherwise.
   bool defined() const;
 
+  /// Methods for querying and operating on windowed tensor modes.
+
+  /// isWindowed returns true if this iterator is operating over a window
+  /// of a tensor mode.
+  bool isWindowed() const;
+
+  /// getWindow{Lower,Upper}Bound return the {lower,upper} bound of the
+  /// window that this iterator operates over.
+  ir::Expr getWindowLowerBound() const;
+  ir::Expr getWindowUpperBound() const;
+
   friend bool operator==(const Iterator&, const Iterator&);
   friend bool operator<(const Iterator&, const Iterator&);
   friend std::ostream& operator<<(std::ostream&, const Iterator&);
@@ -169,6 +180,10 @@ class Iterator : public util::Comparable<Iterator> {
   Iterator(std::shared_ptr<Content> content);
 
   void setChild(const Iterator& iterator) const;
+
+  friend class Iterators;
+  /// setWindowBounds sets the window bounds of this iterator.
+  void setWindowBounds(ir::Expr lo, ir::Expr hi);
 };
 
 /**
diff --git a/include/taco/lower/lowerer_impl.h b/include/taco/lower/lowerer_impl.h
index 39675d450..afc936145 100644
--- a/include/taco/lower/lowerer_impl.h
+++ b/include/taco/lower/lowerer_impl.h
@@ -375,9 +375,30 @@ class LowererImpl : public util::Uncopyable {
   /// Create an expression to index into a tensor value array.
   ir::Expr generateValueLocExpr(Access access) const;
 
-  /// Expression that evaluates to true if none of the iteratators are exhausted
+  /// Expression that evaluates to true if none of the iterators are exhausted
   ir::Expr checkThatNoneAreExhausted(std::vector<Iterator> iterators);
 
+  /// Expression that returns the beginning of a window to iterate over
+  /// in a compressed iterator. It is used when operating over windows of
+  /// tensors, instead of the full tensor.
+  ir::Expr searchForStartOfWindowPosition(Iterator iterator, ir::Expr start, ir::Expr end);
+
+  /// Statement that guards against going out of bounds of the window that
+  /// the input iterator was configured with.
+  ir::Stmt upperBoundGuardForWindowPosition(Iterator iterator, ir::Expr access);
+
+  /// Expression that recovers a canonical index variable from a position in
+  /// a windowed position iterator. A windowed position iterator iterates over
+  /// values in the range [lo, hi). This expression projects values in that
+  /// range back into the canonical range of [0, n).
+  ir::Expr projectWindowedPositionToCanonicalSpace(Iterator iterator, ir::Expr expr);
+
+  /// projectCanonicalSpaceToWindowedPosition is the opposite of
+  /// projectWindowedPositionToCanonicalSpace. It takes an expression ranging
+  /// over the canonical space of [0, n) and projects it up to the windowed
+  /// range of [lo, hi).
+  ir::Expr projectCanonicalSpaceToWindowedPosition(Iterator iterator, ir::Expr expr);
+
 private:
   bool assemble;
   bool compute;
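The two projections are inverse shifts by the window's lower bound. A standalone sketch over plain ints (the real helpers build `ir::Expr` nodes instead of computing values):
```
// For a window [lo, hi) on a mode of dimension n:
int projectWindowedToCanonical(int lo, int windowed) {
  return windowed - lo;   // maps [lo, hi) onto [0, hi - lo)
}
int projectCanonicalToWindowed(int lo, int canonical) {
  return canonical + lo;  // maps [0, hi - lo) onto [lo, hi)
}
// e.g. for i(2, 6): stored coordinate 4 corresponds to canonical i == 2,
// and canonical i == 2 locates stored coordinate 4.
```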
diff --git a/include/taco/tensor.h b/include/taco/tensor.h
index 313758252..25186c815 100644
--- a/include/taco/tensor.h
+++ b/include/taco/tensor.h
@@ -386,6 +386,9 @@ class TensorBase {
   /// Create an index expression that accesses (reads or writes) this tensor.
   Access operator()(const std::vector<IndexVar>& indices);
 
+  /// Create a possibly windowed index expression that accesses (reads or writes) this tensor.
+  Access operator()(const std::vector<std::shared_ptr<IndexVarInterface>>& indices);
+
   /// Create an index expression that accesses (reads) this (scalar) tensor.
   Access operator()();
 
@@ -621,6 +624,20 @@ class Tensor : public TensorBase {
   template <typename... IndexVars>
   Access operator()(const IndexVars&... indices);
 
+  /// The below two Access methods are used to allow users to access tensors
+  /// with a mix of IndexVars and WindowedIndexVars. This allows natural
+  /// expressions like
+  ///   A(i, j(1, 3)) = B(i(2, 4), j) * C(i(5, 7), j(7, 9))
+  /// to be constructed without adjusting the original API.
+
+  /// Create an index expression that accesses (reads, writes) this tensor.
+  template <typename... IndexVars>
+  Access operator()(const WindowedIndexVar& first, const IndexVars&... indices);
+
+  /// Create an index expression that accesses (reads, writes) this tensor.
+  template <typename... IndexVars>
+  Access operator()(const IndexVar& first, const IndexVars&... indices);
+
   ScalarAccess<CType> operator()(const std::vector<int>& indices);
 
   /// Create an index expression that accesses (reads) this tensor.
@@ -629,6 +646,15 @@ class Tensor : public TensorBase {
 
   /// Assign an expression to a scalar tensor.
   void operator=(const IndexExpr& expr);
+
+private:
+  /// The _access method family is the template level implementation of
+  /// Access() expressions containing mixes of IndexVar and WindowedIndexVar objects.
+  template <typename First, typename... Rest>
+  std::vector<std::shared_ptr<IndexVarInterface>> _access(const First& first, const Rest&... rest);
+  std::vector<std::shared_ptr<IndexVarInterface>> _access();
+  template <typename... Args>
+  Access _access_wrapper(const Args&... args);
 };
 
 template <typename CType>
@@ -1084,6 +1110,59 @@ Access Tensor<CType>::operator()(const IndexVars&... indices) {
   return TensorBase::operator()(std::vector<IndexVar>{indices...});
 }
 
+/// The _access() methods perform primitive recursion on the input variadic template.
+/// This means that each instance of the _access method matches on the first element
+/// of the variadic template parameter pack, performs an "action", then recurses
+/// with the remaining elements in the parameter pack through a recursive call
+/// to _access. Since this is recursion, we need a base case. The empty-argument
+/// instance of _access returns an empty value of the desired type, in this case
+/// a vector of IndexVarInterface.
+template <typename CType>
+std::vector<std::shared_ptr<IndexVarInterface>> Tensor<CType>::_access() {
+  return std::vector<std::shared_ptr<IndexVarInterface>>{};
+}
+
+/// The recursive case of _access matches on the first element, and attempts to
+/// create a shared_ptr out of it. It then makes a recursive call to get a
+/// vector with the rest of the elements. Then, it pushes the first element onto
+/// the back of the vector -- this check ensures that the type First is indeed
+/// a member of IndexVarInterface.
+template <typename CType>
+template <typename First, typename... Rest>
+std::vector<std::shared_ptr<IndexVarInterface>> Tensor<CType>::_access(const First& first, const Rest&... rest) {
+  auto var = std::make_shared<First>(first);
+  auto ret = _access(rest...);
+  ret.push_back(var);
+  return ret;
+}
+
+/// _access_wrapper just calls into _access and reverses the result to get the
+/// initial order of the arguments.
+template <typename CType>
+template <typename... Args>
+Access Tensor<CType>::_access_wrapper(const Args&... args) {
+  auto resultReversed = this->_access(args...);
+  std::vector<std::shared_ptr<IndexVarInterface>> result;
+  result.reserve(resultReversed.size());
+  for (auto& it : util::reverse(resultReversed)) {
+    result.push_back(it);
+  }
+  return TensorBase::operator()(result);
+}
+
+/// We have to case on whether the first argument is an IndexVar or a WindowedIndexVar
+/// so that the template engine can differentiate between the two versions.
+template <typename CType>
+template <typename... IndexVars>
+Access Tensor<CType>::operator()(const IndexVar& first, const IndexVars&... indices) {
+  return this->_access_wrapper(first, indices...);
+}
+
+template <typename CType>
+template <typename... IndexVars>
+Access Tensor<CType>::operator()(const WindowedIndexVar& first, const IndexVars&... indices) {
+  return this->_access_wrapper(first, indices...);
+}
+
 template <typename CType>
 ScalarAccess<CType> Tensor<CType>::operator()(const std::vector<int>& indices) {
   taco_uassert(indices.size() == (size_t)getOrder())
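To see how the recursion unrolls, consider `a(i, j(1, 3))`; the trace below is an illustrative sketch, not patch code:
```
Tensor<double> a("a", {10, 10}, {Dense, Dense});
IndexVar i("i"), j("j");
auto acc = a(i, j(1, 3));
// Unrolls as:
//   _access(i, j(1,3)) -> make_shared<IndexVar>(i), recurse on (j(1,3))
//   _access(j(1,3))    -> make_shared<WindowedIndexVar>(j(1,3)), recurse on ()
//   _access()          -> {}
// push_back on the way out yields {j(1,3), i}; _access_wrapper reverses it
// to {i, j(1,3)} and calls TensorBase::operator().
```
Because `_access` stores `std::make_shared<First>(first)` into a vector of `std::shared_ptr<IndexVarInterface>`, passing any argument that does not implement `IndexVarInterface` fails to compile.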
diff --git a/src/codegen/codegen_c.cpp b/src/codegen/codegen_c.cpp
index a0c6c9591..f48f34f2a 100644
--- a/src/codegen/codegen_c.cpp
+++ b/src/codegen/codegen_c.cpp
@@ -516,7 +516,13 @@ void CodeGen_C::visit(const Allocate* op) {
     stream << ", ";
   }
   else {
-    stream << "malloc(";
+    // If the allocation was requested to clear the allocated memory,
+    // use calloc instead of malloc.
+    if (op->clear) {
+      stream << "calloc(1, ";
+    } else {
+      stream << "malloc(";
+    }
   }
   stream << "sizeof(" << elementType << ")";
   stream << " * ";
diff --git a/src/codegen/codegen_cuda.cpp b/src/codegen/codegen_cuda.cpp
index d0c69ffd8..7210c6cde 100644
--- a/src/codegen/codegen_cuda.cpp
+++ b/src/codegen/codegen_cuda.cpp
@@ -1293,9 +1293,14 @@ void CodeGen_CUDA::visit(const Call* op) {
   stream << op->func << "(";
   parentPrecedence = Precedence::CALL;
 
-  // Need to print cast to type so that arguments match
+  // Need to print cast to type so that arguments match.
   if (op->args.size() > 0) {
-    if (op->type != op->args[0].type() || isa<Var>(op->args[0])) {
+    // However, the binary search functions take an int* as their first
+    // argument. This pointer information isn't carried anywhere in
+    // the argument expressions, so we need to special case and not
+    // emit an invalid cast for that argument.
+    auto opIsBinarySearch = op->func == "taco_binarySearchAfter" || op->func == "taco_binarySearchBefore";
+    if (!opIsBinarySearch && (op->type != op->args[0].type() || isa<Var>(op->args[0]))) {
       stream << "(" << printCUDAType(op->type, false) << ") ";
     }
     op->args[0].accept(this);
diff --git a/src/error/error_checks.cpp b/src/error/error_checks.cpp
index 9fc067d2d..7516ab3d2 100644
--- a/src/error/error_checks.cpp
+++ b/src/error/error_checks.cpp
@@ -53,6 +53,14 @@ std::pair<bool,std::string> dimensionsTypecheck(const std::vector<IndexVar>& resultV
     for (size_t mode = 0; mode < readNode->indexVars.size(); mode++) {
       IndexVar var = readNode->indexVars[mode];
       Dimension dimension = readNode->tensorVar.getType().getShape().getDimension(mode);
+
+      // If this access has windowed modes, use the dimensions of those windows
+      // as the shape, rather than the shape of the underlying tensor.
+      auto a = Access(readNode);
+      if (a.isModeWindowed(mode)) {
+        dimension = Dimension(a.getWindowUpperBound(mode) - a.getWindowLowerBound(mode));
+      }
+
       if (util::contains(indexVarDims,var) && indexVarDims.at(var) != dimension) {
         errors.push_back(addDimensionError(var, indexVarDims.at(var), dimension));
       } else {
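One consequence of this check, sketched below with illustrative sizes: it is the window's extent, not the underlying tensor's dimension, that must line up across an expression:
```
Tensor<double> a("a", {4}, {Dense});
Tensor<double> b("b", {100}, {Dense});
IndexVar i("i");
a(i) = b(i(10, 14));   // OK: extent 14 - 10 == 4 matches a's dimension
// a(i) = b(i(0, 5));  // rejected: extent 5 != 4
```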
diff --git a/src/index_notation/index_notation.cpp b/src/index_notation/index_notation.cpp
index f00557eeb..4b090d84b 100644
--- a/src/index_notation/index_notation.cpp
+++ b/src/index_notation/index_notation.cpp
@@ -185,7 +185,7 @@ struct Isomorphic : public IndexNotationVisitorStrict {
         return;
       }
     }
-    eq = true;
+    eq = anode->windowedModes == bnode->windowedModes;
   }
 
   void visit(const LiteralNode* anode) {
@@ -746,8 +746,8 @@ IndexExpr operator/(const IndexExpr& lhs, const IndexExpr& rhs) {
 Access::Access(const AccessNode* n) : IndexExpr(n) {
 }
 
-Access::Access(const TensorVar& tensor, const std::vector<IndexVar>& indices)
-    : Access(new AccessNode(tensor, indices)) {
+Access::Access(const TensorVar& tensor, const std::vector<IndexVar>& indices, const std::map<int, AccessWindow>& windows)
+    : Access(new AccessNode(tensor, indices, windows)) {
 }
 
 const TensorVar& Access::getTensorVar() const {
@@ -758,11 +758,45 @@ const std::vector<IndexVar>& Access::getIndexVars() const {
   return getNode(*this)->indexVars;
 }
 
+bool Access::hasWindowedModes() const {
+  return !getNode(*this)->windowedModes.empty();
+}
+
+bool Access::isModeWindowed(int mode) const {
+  auto node = getNode(*this);
+  return node->windowedModes.find(mode) != node->windowedModes.end();
+}
+
+int Access::getWindowLowerBound(int mode) const {
+  taco_iassert(this->isModeWindowed(mode));
+  return getNode(*this)->windowedModes.at(mode).lo;
+}
+
+int Access::getWindowUpperBound(int mode) const {
+  taco_iassert(this->isModeWindowed(mode));
+  return getNode(*this)->windowedModes.at(mode).hi;
+}
+
 static void check(Assignment assignment) {
-  auto tensorVar = assignment.getLhs().getTensorVar();
-  auto freeVars = assignment.getLhs().getIndexVars();
+  auto lhs = assignment.getLhs();
+  auto tensorVar = lhs.getTensorVar();
+  auto freeVars = lhs.getIndexVars();
   auto indexExpr = assignment.getRhs();
   auto shape = tensorVar.getType().getShape();
+
+  // If the LHS access has any windowed modes, use the dimensions of those
+  // windows as the shape, rather than the shape of the underlying tensor.
+  if (lhs.hasWindowedModes()) {
+    vector<Dimension> dims(shape.getOrder());
+    for (int i = 0; i < shape.getOrder(); i++) {
+      dims[i] = shape.getDimension(i);
+      if (lhs.isModeWindowed(i)) {
+        dims[i] = Dimension(lhs.getWindowUpperBound(i) - lhs.getWindowLowerBound(i));
+      }
+    }
+    shape = Shape(dims);
+  }
+
   auto typecheck = error::dimensionsTypecheck(freeVars, indexExpr, shape);
   taco_uassert(typecheck.first) << error::expr_dimension_mismatch << " " << typecheck.second;
 }
@@ -1800,6 +1834,10 @@ std::string IndexVar::getName() const {
   return content->name;
 }
 
+WindowedIndexVar IndexVar::operator()(int lo, int hi) {
+  return WindowedIndexVar(*this, lo, hi);
+}
+
 bool operator==(const IndexVar& a, const IndexVar& b) {
   return a.content == b.content;
 }
@@ -1808,10 +1846,42 @@ bool operator<(const IndexVar& a, const IndexVar& b) {
   return a.content < b.content;
 }
 
+std::ostream& operator<<(std::ostream& os, const std::shared_ptr<IndexVarInterface>& var) {
+  std::stringstream ss;
+  IndexVarInterface::match(var, [&](std::shared_ptr<IndexVar> ivar) {
+    ss << *ivar;
+  }, [&](std::shared_ptr<WindowedIndexVar> wvar) {
+    ss << *wvar;
+  });
+  return os << ss.str();
+}
+
 std::ostream& operator<<(std::ostream& os, const IndexVar& var) {
   return os << var.getName();
 }
 
+std::ostream& operator<<(std::ostream& os, const WindowedIndexVar& var) {
+  return os << var.getIndexVar();
+}
+
+WindowedIndexVar::WindowedIndexVar(IndexVar base, int lo, int hi) : content(new Content) {
+  this->content->base = base;
+  this->content->lo = lo;
+  this->content->hi = hi;
+}
+
+IndexVar WindowedIndexVar::getIndexVar() const {
+  return this->content->base;
+}
+
+int WindowedIndexVar::getLowerBound() const {
+  return this->content->lo;
+}
+
+int WindowedIndexVar::getUpperBound() const {
+  return this->content->hi;
+}
+
 // class TensorVar
 struct TensorVar::Content {
   int id;
@@ -1951,6 +2021,20 @@ static bool isValid(Assignment assignment, string* reason) {
   auto result = lhs.getTensorVar();
   auto freeVars = lhs.getIndexVars();
   auto shape = result.getType().getShape();
+
+  // If the LHS access has any windowed modes, use the dimensions of those
+  // windows as the shape, rather than the shape of the underlying tensor.
+  if (lhs.hasWindowedModes()) {
+    vector<Dimension> dims(shape.getOrder());
+    for (int i = 0; i < shape.getOrder(); i++) {
+      dims[i] = shape.getDimension(i);
+      if (lhs.isModeWindowed(i)) {
+        dims[i] = Dimension(lhs.getWindowUpperBound(i) - lhs.getWindowLowerBound(i));
+      }
+    }
+    shape = Shape(dims);
+  }
+
   auto typecheck = error::dimensionsTypecheck(freeVars, rhs, shape);
   if (!typecheck.first) {
     *reason = error::expr_dimension_mismatch + " " + typecheck.second;
diff --git a/src/index_notation/index_notation_rewriter.cpp b/src/index_notation/index_notation_rewriter.cpp
index ac7bd6945..35d111457 100644
--- a/src/index_notation/index_notation_rewriter.cpp
+++ b/src/index_notation/index_notation_rewriter.cpp
@@ -327,7 +327,7 @@ struct ReplaceIndexVars : public IndexNotationRewriter {
     }
   }
   if (modified) {
-    expr = Access(op->tensorVar, indexVars);
+    expr = Access(op->tensorVar, indexVars, op->windowedModes);
   }
   else {
     expr = op;
diff --git a/src/ir/ir.cpp b/src/ir/ir.cpp
index 6c5dd8fcb..2623b27cd 100644
--- a/src/ir/ir.cpp
+++ b/src/ir/ir.cpp
@@ -749,7 +749,7 @@ Stmt Yield::make(std::vector<Expr> coords, Expr val) {
 }
 
 // Allocate
-Stmt Allocate::make(Expr var, Expr num_elements, bool is_realloc, Expr old_elements) {
+Stmt Allocate::make(Expr var, Expr num_elements, bool is_realloc, Expr old_elements, bool clear) {
   taco_iassert(var.as<GetProperty>() || (var.as<Var>() && var.as<Var>()->is_ptr))
       << "Can only allocate memory for a pointer-typed Var";
@@ -761,6 +761,7 @@ Stmt Allocate::make(Expr var, Expr num_elements, bool is_realloc, Expr old_eleme
   alloc->is_realloc = is_realloc;
   taco_iassert(!is_realloc || old_elements.ptr != NULL);
   alloc->old_elements = old_elements;
+  alloc->clear = clear;
   return alloc;
 }
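With `clear` set, the C backend swaps `malloc` for `calloc` (see the `codegen_c.cpp` hunk above), so windowed writes that touch only part of a values array leave the rest zeroed rather than uninitialized. A sketch of the difference in the emitted allocation, with illustrative variable names rather than real generated output:
```
double* c_vals;
int32_t capacity = 256;  // illustrative
// clear == false:
c_vals = (double*)malloc(sizeof(double) * capacity);
// clear == true:
c_vals = (double*)calloc(1, sizeof(double) * capacity);
```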
diff --git a/src/lower/iterator.cpp b/src/lower/iterator.cpp
index 9f5b7dd4b..fe27d5489 100644
--- a/src/lower/iterator.cpp
+++ b/src/lower/iterator.cpp
@@ -28,6 +28,16 @@ struct Iterator::Content {
   ir::Expr segendVar;
   ir::Expr validVar;
   ir::Expr beginVar;
+
+  // Window represents a window (or slice) into a tensor mode, given by
+  // the expressions representing its lower and upper bounds. An iterator
+  // is windowed if window is not NULL.
+  struct Window {
+    ir::Expr lo;
+    ir::Expr hi;
+    Window(ir::Expr _lo, ir::Expr _hi) : lo(_lo), hi(_hi) {}
+  };
+  std::unique_ptr<Window> window;
 };
 
 Iterator::Iterator() : content(nullptr) {
@@ -323,6 +333,24 @@ bool Iterator::defined() const {
   return content != nullptr;
 }
 
+bool Iterator::isWindowed() const {
+  return this->content->window != nullptr;
+}
+
+ir::Expr Iterator::getWindowLowerBound() const {
+  taco_iassert(this->isWindowed());
+  return this->content->window->lo;
+}
+
+ir::Expr Iterator::getWindowUpperBound() const {
+  taco_iassert(this->isWindowed());
+  return this->content->window->hi;
+}
+
+void Iterator::setWindowBounds(ir::Expr lo, ir::Expr hi) {
+  this->content->window = std::make_unique<Content::Window>(Content::Window(lo, hi));
+}
+
 bool operator==(const Iterator& a, const Iterator& b) {
   if (a.isDimensionIterator() && b.isDimensionIterator()) {
     return a.getIndexVar() == b.getIndexVar();
@@ -425,7 +453,7 @@ Iterators::Iterators(IndexStmt stmt, const map<TensorVar, Expr>& tensorVars)
     })
   );
 
-  // Reverse the levelITerators map for fast modeAccess lookup
+  // Reverse the levelIterators map for fast modeAccess lookup
   for (auto& iterator : content->levelIterators) {
     content->modeAccesses.insert({iterator.second, iterator.first});
   }
@@ -472,6 +500,15 @@ Iterators::createAccessIterators(Access access, Format format, Expr tensorIR, Pr
     string name = iteratorIndexVar.getName() + tensorConcrete.getName();
     Iterator iterator(iteratorIndexVar, tensorIR, mode, parent, name, true);
+
+    // If the access that this iterator corresponds to has a window, then
+    // adjust the iterator appropriately.
+    if (access.isModeWindowed(modeNumber)) {
+      auto lo = ir::Literal::make(access.getWindowLowerBound(modeNumber));
+      auto hi = ir::Literal::make(access.getWindowUpperBound(modeNumber));
+      iterator.setWindowBounds(lo, hi);
+    }
+
     content->levelIterators.insert({{access,modeNumber+1}, iterator});
     if (iteratorIndexVar != indexVar) {
       // add to allowing lowering to find correct iterator for this pos variable
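Because the bounds hang off the `Iterator` rather than the `TensorVar`, two accesses of the same tensor get two iterators and therefore two independent windows; this is what allows "windowing the same tensor in different ways" from the commit message (exercised by `sliceMultipleWays` in the new tests). An illustrative sketch:
```
Tensor<int> a("a", {10}, {Dense});
Tensor<int> c("c", {2}, {Dense});
IndexVar i("i");
c(i) = a(i(5, 7)) + a(i(1, 3));  // two Iterators over a, windows [5,7) and [1,3)
```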
diff --git a/src/lower/lowerer_impl.cpp b/src/lower/lowerer_impl.cpp
index 717ae3884..0de713a2c 100644
--- a/src/lower/lowerer_impl.cpp
+++ b/src/lower/lowerer_impl.cpp
@@ -168,17 +168,32 @@ LowererImpl::lower(IndexStmt stmt, string name,
   vector<IndexVar> indexVars = getIndexVars(stmt);
   for (auto& indexVar : indexVars) {
     Expr dimension;
+    // getDimension extracts an Expr that holds the dimension
+    // of a particular tensor mode. This Expr should be used as a loop bound
+    // when iterating over the dimension of the target tensor.
+    auto getDimension = [&](const TensorVar& tv, const Access& a, int mode) {
+      // If the tensor mode is windowed, then the dimension for iteration is the bounds
+      // of the window. Otherwise, it is the actual dimension of the mode.
+      if (a.isModeWindowed(mode)) {
+        // The mode value used to access .levelIterator is 1-indexed, while
+        // the mode input to getDimension is 0-indexed. So, we shift it up by 1.
+        auto iter = iterators.levelIterator(ModeAccess(a, mode+1));
+        return ir::Sub::make(iter.getWindowUpperBound(), iter.getWindowLowerBound());
+      } else {
+        return GetProperty::make(tensorVars.at(tv), TensorProperty::Dimension, mode);
+      }
+    };
     match(stmt,
       function<void(const AssignmentNode*, Matcher*)>([&](
           const AssignmentNode* n, Matcher* m) {
         m->match(n->rhs);
         if (!dimension.defined()) {
           auto ivars = n->lhs.getIndexVars();
+          auto tv = n->lhs.getTensorVar();
           int loc = (int)distance(ivars.begin(),
                                   find(ivars.begin(),ivars.end(), indexVar));
-          if(!util::contains(temporariesSet, n->lhs.getTensorVar())) {
-            dimension = GetProperty::make(tensorVars.at(n->lhs.getTensorVar()),
-                                          TensorProperty::Dimension, loc);
+          if(!util::contains(temporariesSet, tv)) {
+            dimension = getDimension(tv, n->lhs, loc);
           }
         }
       }),
@@ -189,8 +204,7 @@ LowererImpl::lower(IndexStmt stmt, string name,
                                   find(indexVars.begin(),indexVars.end(), indexVar));
           if(!util::contains(temporariesSet, n->tensorVar)) {
-            dimension = GetProperty::make(tensorVars.at(n->tensorVar),
-                                          TensorProperty::Dimension, loc);
+            dimension = getDimension(n->tensorVar, Access(n), loc);
           }
         }
       })
@@ -1002,9 +1016,16 @@ Stmt LowererImpl::lowerForallPosition(Forall forall, Iterator iterator,
 {
   Expr coordinate = getCoordinateVar(forall.getIndexVar());
   Stmt declareCoordinate = Stmt();
+  Stmt boundsGuard = Stmt();
   if (provGraph.isCoordVariable(forall.getIndexVar())) {
     Expr coordinateArray = iterator.posAccess(iterator.getPosVar(),
                                               coordinates(iterator)).getResults()[0];
+    // If the iterator is windowed, we must recover the coordinate index
+    // variable from the windowed space.
+    if (iterator.isWindowed()) {
+      coordinateArray = this->projectWindowedPositionToCanonicalSpace(iterator, coordinateArray);
+      boundsGuard = this->upperBoundGuardForWindowPosition(iterator, coordinate);
+    }
     declareCoordinate = VarDecl::make(coordinate, coordinateArray);
   }
   if (forall.getParallelUnit() != ParallelUnit::NotParallel && forall.getOutputRaceStrategy() == OutputRaceStrategy::Atomics) {
@@ -1038,6 +1059,11 @@ Stmt LowererImpl::lowerForallPosition(Forall forall, Iterator iterator,
     boundsCompute = bounds.compute();
     startBound = bounds[0];
     endBound = bounds[1];
+    // If we have a window on this iterator, then search for the start of
+    // the window rather than starting at the beginning of the level.
+    if (iterator.isWindowed()) {
+      startBound = this->searchForStartOfWindowPosition(iterator, startBound, endBound);
+    }
   } else {
     taco_iassert(iterator.isOrdered() && iterator.getParent().isOrdered());
     taco_iassert(iterator.isCompact() && iterator.getParent().isCompact());
@@ -1059,10 +1085,12 @@ Stmt LowererImpl::lowerForallPosition(Forall forall, Iterator iterator,
       && forall.getOutputRaceStrategy() != OutputRaceStrategy::ParallelReduction && !ignoreVectorize) {
     kind = LoopKind::Runtime;
   }
+
   // Loop with preamble and postamble
-  return Block::blanks(boundsCompute,
+  return Block::blanks(
+                       boundsCompute,
                        For::make(iterator.getPosVar(), startBound, endBound, 1,
-                                 Block::make(declareCoordinate, body),
+                                 Block::make(declareCoordinate, boundsGuard, body),
                                  kind,
                                  ignoreVectorize ? ParallelUnit::NotParallel : forall.getParallelUnit(),
                                  ignoreVectorize ? 0 : forall.getUnrollFactor()),
                        posAppend);
@@ -1321,9 +1349,18 @@ Stmt LowererImpl::resolveCoordinate(std::vector<Iterator> mergers, ir::Expr coor
     // Just one position iterator so it is the resolved coordinate
     ModeFunction posAccess = merger.posAccess(merger.getPosVar(),
                                               coordinates(merger));
-    Stmt resolution = emitVarDecl ?
-        VarDecl::make(coordinate, posAccess[0]) : Assign::make(coordinate, posAccess[0]);
+    auto access = posAccess[0];
+    auto guard = Stmt();
+    // If the iterator is windowed, we must recover the coordinate index
+    // variable from the windowed space.
+    if (merger.isWindowed()) {
+      access = this->projectWindowedPositionToCanonicalSpace(merger, access);
+      guard = this->upperBoundGuardForWindowPosition(merger, coordinate);
+    }
+    Stmt resolution = emitVarDecl ? VarDecl::make(coordinate, access) : Assign::make(coordinate, access);
     return Block::make(posAccess.compute(),
-                       resolution);
+                       resolution,
+                       guard);
   }
   else if (merger.hasCoordIter()) {
     taco_not_supported_yet;
@@ -1975,6 +2012,7 @@ Stmt LowererImpl::initResultArrays(vector<Access> writes,
     Expr tensor = getTensorVar(write.getTensorVar());
     Expr valuesArr = GetProperty::make(tensor, TensorProperty::Values);
+    bool clearValuesAllocation = false;
 
     Expr parentSize = 1;
     if (generateAssembleCode()) {
@@ -2002,6 +2040,8 @@ Stmt LowererImpl::initResultArrays(vector<Access> writes,
         }
         parentSize = size;
+        // Writes into a windowed iterator require the allocation to be cleared.
+        clearValuesAllocation |= iterator.isWindowed();
       }
 
       // Pre-allocate memory for the value array if computing while assembling
@@ -2012,7 +2052,8 @@ Stmt LowererImpl::initResultArrays(vector<Access> writes,
         Expr allocSize = isValue(parentSize, 0)
                          ? DEFAULT_ALLOC_SIZE : parentSize;
         initArrays.push_back(VarDecl::make(capacityVar, allocSize));
-        initArrays.push_back(Allocate::make(valuesArr, capacityVar));
+        initArrays.push_back(Allocate::make(valuesArr, capacityVar, false /* is_realloc */, Expr() /* old_elements */,
+                                            clearValuesAllocation));
       }
 
       taco_iassert(!initArrays.empty());
@@ -2062,6 +2103,7 @@ ir::Stmt LowererImpl::finalizeResultArrays(std::vector<Access> writes) {
     return Stmt();
   }
 
+  bool clearValuesAllocation = false;
   std::vector<Stmt> result;
   for (auto& write : writes) {
     if (write.getTensorVar().getOrder() == 0) continue;
@@ -2085,13 +2127,17 @@ ir::Stmt LowererImpl::finalizeResultArrays(std::vector<Access> writes) {
       }
       result.push_back(finalize);
       parentSize = size;
+      // Writes into a windowed iterator require the allocation to be cleared.
+      clearValuesAllocation |= iterator.isWindowed();
     }
 
     if (!generateComputeCode()) {
       // Allocate memory for values array after assembly if not also computing
       Expr tensor = getTensorVar(write.getTensorVar());
       Expr valuesArr = GetProperty::make(tensor, TensorProperty::Values);
-      result.push_back(Allocate::make(valuesArr, parentSize));
+      result.push_back(Allocate::make(valuesArr, parentSize, false /* is_realloc */, Expr() /* old_elements */,
+                                      clearValuesAllocation));
     }
   }
   return result.empty() ? Stmt() : Block::blanks(result);
@@ -2262,7 +2308,6 @@ Stmt LowererImpl::zeroInitValues(Expr tensor, Expr begin, Expr size) {
   return For::make(p, lower, upper, 1, zeroInit, parallel);
 }
 
-
 Stmt LowererImpl::declLocatePosVars(vector<Iterator> locators) {
   vector<Stmt> result;
   for (Iterator& locator : locators) {
@@ -2284,7 +2329,14 @@ Stmt LowererImpl::declLocatePosVars(vector<Iterator> locators) {
         continue; // these will be recovered with separate procedure
       }
       do {
-        ModeFunction locate = locateIterator.locate(coordinates(locateIterator));
+        auto coords = coordinates(locateIterator);
+        // If this dimension iterator operates over a window, then it needs
+        // to be projected up to the window's iteration space.
+        if (locateIterator.isWindowed()) {
+          auto expr = coords[coords.size() - 1];
+          coords[coords.size() - 1] = this->projectCanonicalSpaceToWindowedPosition(locateIterator, expr);
+        }
+        ModeFunction locate = locateIterator.locate(coords);
         taco_iassert(isValue(locate.getResults()[1], true));
         Stmt declarePosVar = VarDecl::make(locateIterator.getPosVar(),
                                            locate.getResults()[0]);
@@ -2378,6 +2430,11 @@ Stmt LowererImpl::codeToInitializeIteratorVar(Iterator iterator, vector
                                         this->iterators)[coordinateVar][0];
   if (binarySearchTarget != underivedBounds[coordinateVar][0]) {
+    // If we have a window, then we need to project the binary search target
+    // up into the window rather than the beginning of the level.
+    if (iterator.isWindowed()) {
+      binarySearchTarget = this->projectCanonicalSpaceToWindowedPosition(iterator, binarySearchTarget);
+    }
     result.push_back(VarDecl::make(iterator.getBeginVar(), binarySearchTarget));
 
     vector<Expr> binarySearchArgs = {
@@ -2394,7 +2451,13 @@ Stmt LowererImpl::codeToInitializeIteratorVar(Iterator iterator, vector
-    result.push_back(VarDecl::make(iterVar, bounds[0]));
+    auto bound = bounds[0];
+    // If we have a window on this iterator, then search for the start of
+    // the window rather than starting at the beginning of the level.
+    if (iterator.isWindowed()) {
+      bound = this->searchForStartOfWindowPosition(iterator, bounds[0], bounds[1]);
+    }
+    result.push_back(VarDecl::make(iterVar, bound));
   }
 
   result.push_back(VarDecl::make(endVar, bounds[1]));
@@ -2556,13 +2619,22 @@ Stmt LowererImpl::codeToLoadCoordinatesFromPosIterators(vector<Iterator> iterato
       ModeFunction posAccess = posIter.posAccess(posIter.getPosVar(),
                                                  coordinates(posIter));
       loadPosIterCoordinateStmts.push_back(posAccess.compute());
+      auto access = posAccess[0];
+      // If this iterator is windowed, then it needs to be projected down to
+      // recover the coordinate variable.
+      // TODO (rohany): It would be cleaner to move this logic into the
+      //  ModeFunction, rather than having to check in several places.
+      if (posIter.isWindowed()) {
+        access = this->projectWindowedPositionToCanonicalSpace(posIter, access);
+      }
       if (declVars) {
-        loadPosIterCoordinateStmts.push_back(VarDecl::make(posIter.getCoordVar(),
-                                                           posAccess[0]));
+        loadPosIterCoordinateStmts.push_back(VarDecl::make(posIter.getCoordVar(), access));
       }
       else {
-        loadPosIterCoordinateStmts.push_back(Assign::make(posIter.getCoordVar(),
-                                                          posAccess[0]));
+        loadPosIterCoordinateStmts.push_back(Assign::make(posIter.getCoordVar(), access));
+      }
+      if (posIter.isWindowed()) {
+        loadPosIterCoordinateStmts.push_back(this->upperBoundGuardForWindowPosition(posIter, posIter.getCoordVar()));
       }
     }
     loadPosIterCoordinates = Block::make(loadPosIterCoordinateStmts);
@@ -2701,4 +2773,33 @@ Expr LowererImpl::checkThatNoneAreExhausted(std::vector<Iterator> iterators)
          : Lt::make(iterators[0].getIteratorVar(), iterators[0].getEndVar());
 }
 
+Expr LowererImpl::searchForStartOfWindowPosition(Iterator iterator, ir::Expr start, ir::Expr end) {
+    taco_iassert(iterator.isWindowed());
+    vector<Expr> args = {
+        // Search over the `crd` array of the level,
+        iterator.getMode().getModePack().getArray(1),
+        // between the start and end position,
+        start, end,
+        // for the beginning of the window.
+        iterator.getWindowLowerBound(),
+    };
+    return Call::make("taco_binarySearchAfter", args, Datatype::UInt64);
+}
+
+Stmt LowererImpl::upperBoundGuardForWindowPosition(Iterator iterator, ir::Expr access) {
+  taco_iassert(iterator.isWindowed());
+  return ir::IfThenElse::make(
+    ir::Gte::make(access, ir::Sub::make(iterator.getWindowUpperBound(), iterator.getWindowLowerBound())),
+    ir::Break::make()
+  );
+}
+
+Expr LowererImpl::projectWindowedPositionToCanonicalSpace(Iterator iterator, ir::Expr expr) {
+  return ir::Sub::make(expr, iterator.getWindowLowerBound());
+}
+
+Expr LowererImpl::projectCanonicalSpaceToWindowedPosition(Iterator iterator, ir::Expr expr) {
+  return ir::Add::make(expr, iterator.getWindowLowerBound());
+}
+
 }
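Putting the four helpers together, the loop lowered for a windowed compressed mode looks roughly like the following hand-written sketch (illustrative names, not actual taco output): `searchForStartOfWindowPosition` seeds the position loop, `projectWindowedPositionToCanonicalSpace` recovers the canonical coordinate, and `upperBoundGuardForWindowPosition` breaks out once past the window.
```
// Iterating mode 2 of B over the window [lo, hi):
int32_t pB2 = taco_binarySearchAfter(B2_crd, B2_pos[i], B2_pos[i + 1], lo);
for (; pB2 < B2_pos[i + 1]; pB2++) {
  int32_t j = B2_crd[pB2] - lo;   // project [lo, hi) back onto [0, hi - lo)
  if (j >= hi - lo) {
    break;                        // past the window's upper bound
  }
  // ... loop body uses the canonical j ...
}
```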
diff --git a/src/tensor.cpp b/src/tensor.cpp
index cd8efd7d4..ce2e4190d 100644
--- a/src/tensor.cpp
+++ b/src/tensor.cpp
@@ -458,6 +458,31 @@ static inline map<TensorVar,TensorBase> getTensors(const IndexExpr& expr);
 struct AccessTensorNode : public AccessNode {
   AccessTensorNode(TensorBase tensor, const std::vector<IndexVar>& indices)
       :  AccessNode(tensor.getTensorVar(), indices), tensor(tensor) {}
+
+  AccessTensorNode(TensorBase tensor, const std::vector<std::shared_ptr<IndexVarInterface>>& indices)
+      : AccessNode(tensor.getTensorVar()), tensor(tensor) {
+    // Create the vector of IndexVar to assign to this->indexVars.
+    std::vector<IndexVar> ivars(indices.size());
+    for (size_t i = 0; i < indices.size(); i++) {
+      auto var = indices[i];
+      // Match on what the IndexVarInterface actually is.
+      IndexVarInterface::match(var, [&](std::shared_ptr<IndexVar> ivar) {
+        ivars[i] = *ivar;
+      }, [&](std::shared_ptr<WindowedIndexVar> wvar) {
+        ivars[i] = wvar->getIndexVar();
+        auto lo = wvar->getLowerBound();
+        auto hi = wvar->getUpperBound();
+        taco_uassert(lo >= 0) << "slice lower bound must be >= 0";
+        taco_uassert(hi <= tensor.getDimension(i)) <<
+          "slice upper bound must be <= tensor dimension (" << tensor.getDimension(i) << ")";
+        this->windowedModes[i].lo = lo;
+        this->windowedModes[i].hi = hi;
+      });
+    }
+    // Initialize this->indexVars.
+    this->indexVars = std::move(ivars);
+  }
+
   TensorBase tensor;
   virtual void setAssignment(const Assignment& assignment) {
     tensor.syncDependentTensors();
@@ -502,6 +527,14 @@ Access TensorBase::operator()(const std::vector<IndexVar>& indices) {
   return Access(new AccessTensorNode(*this, indices));
 }
 
+Access TensorBase::operator()(const std::vector<std::shared_ptr<IndexVarInterface>>& indices) {
+  taco_uassert(indices.size() == (size_t)getOrder())
+      << "A tensor of order " << getOrder() << " must be indexed with "
+      << getOrder() << " variables, but is indexed with: "
+      << util::join(indices);
+  return Access(new AccessTensorNode(*this, indices));
+}
+
 Access TensorBase::operator()() {
   return this->operator()(std::vector<IndexVar>());
 }
diff --git a/test/test.cpp b/test/test.cpp
index 597ddf2ef..a49f10ff7 100644
--- a/test/test.cpp
+++ b/test/test.cpp
@@ -1,3 +1,5 @@
+#include <functional>
+
 #include "test.h"
 #include "taco/tensor.h"
 
@@ -51,4 +53,18 @@ ostream& operator<<(ostream& os, const NotationTest& test) {
   return os;
 }
 
+void ASSERT_THROWS_EXCEPTION_WITH_ERROR(std::function<void()> f, std::string err) {
+  EXPECT_THROW({
+    try {
+      f();
+    } catch (TacoException& e) {
+      // Catch and inspect the exception to make sure that err is within it.
+      auto s = std::string(e.what());
+      ASSERT_TRUE(s.find(err) != std::string::npos);
+      // Throw the exception back up to gtest.
+      throw;
+    }
+  }, TacoException);
+}
+
 }}
diff --git a/test/test.h b/test/test.h
index 04000bafa..3302bf81f 100644
--- a/test/test.h
+++ b/test/test.h
@@ -3,6 +3,7 @@
 
 #include "gtest/gtest.h"
 
+#include <functional>
 #include <string>
 #include <utility>
 #include <vector>
@@ -93,6 +94,10 @@ void ASSERT_COMPONENTS_EQUALS(vector<pair<vector<int>,double>> expectedIndices,
   ASSERT_ARRAY_EQ(expectedValues, {(double*)storage.getValues().getData(),nnz});
 }
 
+// ASSERT_THROWS_EXCEPTION_WITH_ERROR asserts that the input function throws
+// a TacoException with the input string err contained within the body.
+void ASSERT_THROWS_EXCEPTION_WITH_ERROR(std::function<void()> f, std::string err);
+
 struct NotationTest {
   NotationTest(IndexStmt actual, IndexStmt expected)
       : actual(actual), expected(expected) {}
diff --git a/test/tests-windowing.cpp b/test/tests-windowing.cpp
new file mode 100644
index 000000000..95e75d34e
--- /dev/null
+++ b/test/tests-windowing.cpp
@@ -0,0 +1,386 @@
+#include "test.h"
+#include "taco/tensor.h"
+#include "taco/codegen/module.h"
+#include "taco/index_notation/index_notation.h"
+#include "taco/lower/lower.h"
+
+#include <functional>
+
+using namespace taco;
+
+// mixIndexing is a compilation test to ensure that we can index into a
+// tensor with a mix of IndexVars and WindowedIndexVars.
+TEST(windowing, mixIndexing) {
+  auto dim = 10;
+  Tensor<int> a("a", {dim, dim, dim, dim, dim}, {Dense, Dense, Dense, Dense, Dense});
+  IndexVar i, j, k, l, m;
+  auto w1 = a(i, j(1, 3), k, l(4, 5), m(6, 7));
+  auto w2 = a(i(1, 3), j(2, 4), k, l, m(3, 5));
+}
+
+TEST(windowing, boundsChecks) {
+  Tensor<int> a("a", {5}, {Dense});
+  IndexVar i("i");
+  ASSERT_THROWS_EXCEPTION_WITH_ERROR([&]() { a(i(-1, 4)); }, "slice lower bound");
+  ASSERT_THROWS_EXCEPTION_WITH_ERROR([&]() { a(i(0, 10)); }, "slice upper bound");
+}
+
+// sliceMultipleWays tests that the same tensor can be sliced in different ways
+// in the same expression.
+TEST(windowing, sliceMultipleWays) {
+  auto dim = 10;
+  Tensor<int> a("a", {dim}, {Dense});
+  Tensor<int> b("b", {dim}, {Sparse});
+  Tensor<int> c("c", {dim}, {Dense});
+  Tensor<int> expected("expected", {dim}, {Dense});
+  for (int i = 0; i < dim; i++) {
+    a.insert({i}, i);
+    b.insert({i}, i);
+  }
+  expected.insert({2}, 10);
+  expected.insert({3}, 13);
+  a.pack(); b.pack(); expected.pack();
+  IndexVar i("i"), j("j");
+
+  c(i(2, 4)) = a(i(5, 7)) + a(i(1, 3)) + b(i(4, 6));
+  c.evaluate();
+  ASSERT_TRUE(equals(expected, c));
+}
+
+// The basic test checks windowing behavior, parameterized by a dimension
+// of the input tensors and formats for each of the tensors in the computation.
+struct basic : public TestWithParam<std::tuple<int, ModeFormat, ModeFormat, ModeFormat>> {};
+TEST_P(basic, windowing) {
+  Tensor<int> expectedAdd("expectedAdd", {2, 2}, {Dense, Dense});
+  expectedAdd.insert({0, 0}, 14);
+  expectedAdd.insert({0, 1}, 17);
+  expectedAdd.insert({1, 0}, 17);
+  expectedAdd.insert({1, 1}, 20);
+  expectedAdd.pack();
+  Tensor<int> expectedMul("expectedMul", {2, 2}, {Dense, Dense});
+  expectedMul.insert({0, 0}, 64);
+  expectedMul.insert({0, 1}, 135);
+  expectedMul.insert({1, 0}, 135);
+  expectedMul.insert({1, 1}, 240);
+  expectedMul.pack();
+  Tensor<int> d("d", {2, 2}, {Dense, Dense});
+
+  // The test is parameterized by a dimension, and formats for the different tensors.
+  auto dim = std::get<0>(GetParam());
+  auto x = std::get<1>(GetParam());
+  auto y = std::get<2>(GetParam());
+  auto z = std::get<3>(GetParam());
+  Tensor<int> a("a", {dim, dim}, {Dense, x});
+  Tensor<int> b("b", {dim, dim}, {Dense, y});
+  Tensor<int> c("c", {dim, dim}, {Dense, z});
+  for (int i = 0; i < dim; i++) {
+    for (int j = 0; j < dim; j++) {
+      a.insert({i, j}, i + j);
+      b.insert({i, j}, i + j);
+      c.insert({i, j}, i + j);
+    }
+  }
+
+  a.pack();
+  b.pack();
+  c.pack();
+
+  IndexVar i, j;
+  d(i, j) = a(i(2, 4), j(2, 4)) + b(i(4, 6), j(4, 6)) + c(i(1, 3), j(1, 3));
+  d.evaluate();
+  ASSERT_TRUE(equals(expectedAdd, d))
+    << endl << expectedAdd << endl << endl << d << endl
+    << dim << " " << x << " " << y << " " << z << endl;
+
+  d(i, j) = a(i(2, 4), j(2, 4)) * b(i(4, 6), j(4, 6)) * c(i(1, 3), j(1, 3));
+  d.evaluate();
+  ASSERT_TRUE(equals(expectedMul, d))
+    << endl << expectedMul << endl << endl << d << endl
+    << dim << " " << x << " " << y << " " << z << endl;
+}
+INSTANTIATE_TEST_CASE_P(
+  windowing,
+  basic,
+  // Test on the cartesian product of the chosen dimensions and different
+  // combinations of tensor formats.
+  Combine(Values(6, 20), Values(Dense, Sparse), Values(Dense, Sparse), Values(Dense, Sparse))
+);
+
+// slicedOutput tests that operations can write to a window within an output tensor.
+// The test is parameterized over formats for the used tensors.
+struct slicedOutput : public TestWithParam<std::tuple<ModeFormat, ModeFormat>> {};
+TEST_P(slicedOutput, windowing) {
+  auto dim = 10;
+  Tensor<int> expected("expected", {10, 10}, {Dense, Dense});
+  expected.insert({8, 8}, 12);
+  expected.insert({8, 9}, 14);
+  expected.insert({9, 8}, 14);
+  expected.insert({9, 9}, 16);
+  expected.pack();
+  auto x = std::get<0>(GetParam());
+  auto y = std::get<1>(GetParam());
+  Tensor<int> a("a", {dim, dim}, {Dense, x});
+  Tensor<int> b("b", {dim, dim}, {Dense, y});
+  Tensor<int> c("c", {dim, dim}, {Dense, Dense});
+  for (int i = 0; i < dim; i++) {
+    for (int j = 0; j < dim; j++) {
+      a.insert({i, j}, i + j);
+      b.insert({i, j}, i + j);
+    }
+  }
+  a.pack();
+  b.pack();
+
+  IndexVar i, j;
+  c(i(8, 10), j(8, 10)) = a(i(2, 4), j(2, 4)) + b(i(4, 6), j(4, 6));
+  c.evaluate();
+  ASSERT_TRUE(equals(expected, c))
+    << endl << expected << endl << endl << c << endl
+    << dim << " " << x << " " << y << endl;
+}
+INSTANTIATE_TEST_CASE_P(
+  windowing,
+  slicedOutput,
+  Combine(Values(Dense, Sparse), Values(Dense, Sparse))
+);
+
+// matrixMultiply tests a matrix multiply, and in the process is testing
+// windowing on expressions that contain reductions. The test is parameterized
+// over formats for the used tensors.
+struct matrixMultiply : public TestWithParam<std::tuple<ModeFormat, ModeFormat>> {};
+TEST_P(matrixMultiply, windowing) {
+  auto dim = 10;
+  auto windowDim = 4;
+
+  Tensor<int> a("a", {windowDim, windowDim}, {Dense, Dense});
+  Tensor<int> b("b", {windowDim, windowDim}, {Dense, Dense});
+  Tensor<int> c("c", {windowDim, windowDim}, {Dense, Dense});
+  Tensor<int> expected("expected", {windowDim, windowDim}, {Dense, Dense});
+
+  auto x = std::get<0>(GetParam());
+  auto y = std::get<1>(GetParam());
+  Tensor<int> aw("aw", {dim, dim}, {Dense, x});
+  Tensor<int> bw("bw", {dim, dim}, {Dense, y});
+  for (int i = 0; i < dim; i++) {
+    for (int j = 0; j < dim; j++) {
+      aw.insert({i, j}, i + j);
+      bw.insert({i, j}, i + j);
+    }
+  }
+  aw.pack(); bw.pack();
+
+  IndexVar i("i"), j("j"), k("k");
+  // Evaluate the windowed matrix multiply.
+  c(i, k) = aw(i(4, 8), j(2, 6)) * bw(j(0, 4), k(6, 10));
+  c.evaluate();
+
+  // Copy the windowed portions of aw and bw into separate tensors, and test
+  // that the un-windowed matrix multiplication has the same results.
+  a(i, j) = aw(i(4, 8), j(2, 6));
+  a.evaluate();
+  b(i, j) = bw(i(0, 4), j(6, 10));
+  b.evaluate();
+  expected(i, k) = a(i, j) * b(j, k);
+  expected.evaluate();
+
+  ASSERT_TRUE(equals(expected, c)) << expected << endl << c << endl;
+}
+INSTANTIATE_TEST_CASE_P(
+  windowing,
+  matrixMultiply,
+  Combine(Values(Dense, Sparse), Values(Dense, Sparse))
+);
+
+// workspace tests that workspaces can be assigned to and used in computations
+// that involve windowed tensors. The test is parameterized over formats for
+// the used tensors.
+struct workspace : public TestWithParam<std::tuple<ModeFormat, ModeFormat>> {};
+TEST_P(workspace, windowing) {
+  auto dim = 10;
+  size_t windowDim = 4;
+  Tensor<int> d("d", {static_cast<int>(windowDim)}, {Dense});
+  Tensor<int> expected("expected", {static_cast<int>(windowDim)}, {Dense});
+  expected.insert({0}, 8); expected.insert({1}, 11);
+  expected.insert({2}, 14); expected.insert({3}, 17);
+  expected.pack();
+
+  auto x = std::get<0>(GetParam());
+  auto y = std::get<1>(GetParam());
+  Tensor<int> a("a", {dim}, {x});
+  Tensor<int> b("b", {dim}, {y});
+  Tensor<int> c("c", {dim}, {Dense});
+  for (int i = 0; i < dim; i++) {
+    a.insert({i}, i);
+    b.insert({i}, i);
+    c.insert({i}, i);
+  }
+  a.pack();
+  b.pack();
+  c.pack();
+  IndexVar i("i");
+  TensorVar p("p", Type(Int(), {windowDim}), Dense);
+  auto precomputed = a(i(2, 6)) + b(i(6, 10));
+  d(i) = precomputed + c(i(0, 4));
+  auto stmt = d.getAssignment().concretize();
+  stmt = stmt.precompute(precomputed, i, i, p);
+  d.compile(stmt.concretize());
+  d.evaluate();
+  ASSERT_TRUE(equals(d, expected)) << expected << endl << d << endl;
+}
+INSTANTIATE_TEST_CASE_P(
+  windowing,
+  workspace,
+  Combine(Values(Dense, Sparse), Values(Dense, Sparse))
+);
+
+// transformations tests how windowing interacts with sparse iteration space
+// transformations and different mode formats.
+TEST(windowing, transformations) {
+  auto dim = 10;
+  Tensor<int> expected("expected", {2, 2}, {Dense, Dense});
+  expected.insert({0, 0}, 12);
+  expected.insert({0, 1}, 14);
+  expected.insert({1, 0}, 14);
+  expected.insert({1, 1}, 16);
+  expected.pack();
+
+  IndexVar i("i"), j("j"), i1("i1"), i2("i2");
+  auto testFn = [&](std::function<IndexStmt(IndexStmt)> modifier, std::vector<Format> formats) {
+    for (auto& format : formats) {
+      Tensor<int> a("a", {dim, dim}, format);
+      Tensor<int> b("b", {dim, dim}, format);
+      for (int i = 0; i < dim; i++) {
+        for (int j = 0; j < dim; j++) {
+          a.insert({i, j}, i + j);
+          b.insert({i, j}, i + j);
+        }
+      }
+      a.pack(); b.pack();
+
+      Tensor<int> c("c", {2, 2}, {Dense, Dense});
+      c(i, j) = a(i(2, 4), j(2, 4)) + b(i(4, 6), j(4, 6));
+      auto stmt = c.getAssignment().concretize();
+      c.compile(modifier(stmt));
+      c.evaluate();
+      ASSERT_TRUE(equals(c, expected)) << endl << c << endl << expected << endl << format << endl;
+    }
+  };
+
+  std::vector<Format> allFormats = {{Dense, Dense}, {Dense, Sparse}, {Sparse, Dense}, {Sparse, Sparse}};
+  testFn([&](IndexStmt stmt) {
+    return stmt.split(i, i1, i2, 4).unroll(i2, 4);
+  }, allFormats);
+
+  testFn([&](IndexStmt stmt) {
+    return stmt.reorder(i, j);
+  }, {{Dense, Dense}});
+
+  // We can only (currently) parallelize the outer dimension loop if it is dense.
+  testFn([&](IndexStmt stmt) {
+    return stmt.parallelize(i, taco::ParallelUnit::CPUThread, taco::OutputRaceStrategy::NoRaces);
+  }, {{Dense, Dense}, {Dense, Sparse}});
+}
+
+// assignment tests assignments of and to windows in different combinations.
+// The test is parameterized over formats for the used tensors.
+struct assignment : public TestWithParam<ModeFormat> {};
+TEST_P(assignment, windowing) {
+  auto dim = 10;
+  auto srcFormat = GetParam();
+  Tensor<int> A("A", {dim, dim}, srcFormat);
+
+  for (int i = 0; i < dim; i++) {
+    for (int j = 0; j < dim; j++) {
+      A.insert({i, j}, i + j);
+    }
+  }
+  A.pack();
+
+  IndexVar i, j;
+
+  // First assign a window of A to a window of B.
+  Tensor<int> B("B", {dim, dim}, {Dense, Dense});
+  B(i(2, 4), j(3, 5)) = A(i(4, 6), j(5, 7));
+  B.evaluate();
+  Tensor<int> expected("expected", {dim, dim}, {Dense, Dense});
+  expected.insert({2, 3}, 9); expected.insert({2, 4}, 10);
+  expected.insert({3, 3}, 10); expected.insert({3, 4}, 11);
+  expected.pack();
+  ASSERT_TRUE(equals(B, expected)) << B << std::endl << expected << std::endl;
+
+  // Assign a window of A to B.
+  B = Tensor<int>("B", {2, 2}, {Dense, Dense});
+  B(i, j) = A(i(4, 6), j(5, 7));
+  B.evaluate();
+  expected = Tensor<int>("expected", {2, 2}, {Dense, Dense});
+  expected.insert({0, 0}, 9); expected.insert({0, 1}, 10);
+  expected.insert({1, 0}, 10); expected.insert({1, 1}, 11);
+  expected.pack();
+  ASSERT_TRUE(equals(B, expected)) << B << std::endl << expected << std::endl;
+
+  // Assign A to a window of B.
+  A = Tensor<int>("A", {2, 2}, srcFormat);
+  A.insert({0, 0}, 0); A.insert({0, 1}, 1);
+  A.insert({1, 0}, 1); A.insert({1, 1}, 2);
+  A.pack();
+  B = Tensor<int>("B", {dim, dim}, {Dense, Dense});
+  B(i(4, 6), j(5, 7)) = A(i, j);
+  B.evaluate();
+  expected = Tensor<int>("expected", {dim, dim}, {Dense, Dense});
+  expected.insert({4, 5}, 0); expected.insert({4, 6}, 1);
+  expected.insert({5, 5}, 1); expected.insert({5, 6}, 2);
+  expected.pack();
+  ASSERT_TRUE(equals(B, expected)) << B << std::endl << expected << std::endl;
+}
+INSTANTIATE_TEST_CASE_P(
+  windowing,
+  assignment,
+  Values(Dense, Sparse)
+);
+
+// cuda tests a basic windowing operation when using GPU targeted code.
+// The test is parameterized over formats for the used tensors.
+struct cuda : public TestWithParam<std::tuple<ModeFormat, ModeFormat>> {};
+TEST_P(cuda, windowing) {
+  if (!should_use_CUDA_codegen()) {
+    return;
+  }
+  auto dim = 10;
+  Tensor<int> expected("expected", {2, 2}, {Dense, Dense});
+  expected.insert({0, 0}, 12); expected.insert({0, 1}, 14);
+  expected.insert({1, 0}, 14); expected.insert({1, 1}, 16);
+  expected.pack();
+
+  auto x = std::get<0>(GetParam());
+  auto y = std::get<1>(GetParam());
+  Tensor<int> a("a", {dim, dim}, {Dense, x});
+  Tensor<int> b("b", {dim, dim}, {Dense, y});
+  Tensor<int> c("c", {2, 2}, {Dense, Dense});
+
+  for (int i = 0; i < dim; i++) {
+    for (int j = 0; j < dim; j++) {
+      a.insert({i, j}, i + j);
+      b.insert({i, j}, i + j);
+    }
+  }
+  a.pack(); b.pack();
+
+  IndexVar i("i"), j("j"), i1("i1"), i2("i2"), i3("i3"), i4("i4");
+  c(i, j) = a(i(4, 6), j(4, 6)) + b(i(2, 4), j(2, 4));
+  auto stmt = c.getAssignment().concretize();
+  stmt = stmt.split(i, i1, i2, 512)
+             .split(i2, i3, i4, 32)
+             .parallelize(i1, ParallelUnit::GPUBlock, OutputRaceStrategy::NoRaces)
+             .parallelize(i3, ParallelUnit::GPUWarp, OutputRaceStrategy::NoRaces)
+             .parallelize(i4, ParallelUnit::GPUThread, OutputRaceStrategy::NoRaces);
+
+  c.compile(stmt);
+  c.evaluate();
+  ASSERT_TRUE(equals(c, expected)) << c << endl << expected << endl;
+}
+INSTANTIATE_TEST_CASE_P(
+  windowing,
+  cuda,
+  Combine(Values(Dense, Sparse), Values(Dense, Sparse))
+);