
Commit

abadams committed Jan 30, 2014
2 parents 3145709 + 80668df commit 16c9175
Showing 14 changed files with 177 additions and 61 deletions.
10 changes: 5 additions & 5 deletions apps/local_laplacian/local_laplacian.cpp
@@ -131,12 +131,12 @@ int main(int argc, char **argv) {
} else {
// cpu schedule
Var yi;
-output.split(y, y, yi, 4).parallel(y).vectorize(x, 4);
-gray.compute_root().split(y, y, yi, 4).parallel(y).vectorize(x, 4);
+output.parallel(y, 4).vectorize(x, 4);
+gray.compute_root().parallel(y, 4).vectorize(x, 4);
for (int j = 0; j < 4; j++) {
-if (j > 0) inGPyramid[j].compute_root().split(y, y, yi, 4).parallel(y).vectorize(x, 4);
-if (j > 0) gPyramid[j].compute_root().split(y, y, yi, 4).parallel(y).vectorize(x, 4);
-outGPyramid[j].compute_root().split(y, y, yi, 4).parallel(y).vectorize(x, 4);
+if (j > 0) inGPyramid[j].compute_root().parallel(y, 4).vectorize(x, 4);
+if (j > 0) gPyramid[j].compute_root().parallel(y, 4).vectorize(x, 4);
+outGPyramid[j].compute_root().parallel(y, 4).vectorize(x, 4);
}
for (int j = 4; j < J; j++) {
inGPyramid[j].compute_root().parallel(y);
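For context, here is a minimal sketch (not part of this commit) of what the new parallel(y, 4) shorthand used in the schedule above does, applied to a made-up one-line pipeline. Per the Func.cpp and Func.h changes below, parallel(var, task_size) is just a split by task_size followed by parallelizing the outer variable.

    #include <Halide.h>
    using namespace Halide;

    int main() {
        // Toy pipeline for illustration only; "blur_x" and "input" are made up.
        ImageParam input(UInt(16), 2);
        Func blur_x("blur_x");
        Var x("x"), y("y");
        blur_x(x, y) = (input(x, y) + input(x + 1, y)) / 2;

        // New shorthand from this commit: split y into tasks of size 4 and
        // parallelize the resulting outer loop.
        blur_x.parallel(y, 4).vectorize(x, 4);

        // Equivalent manual schedule (what the shorthand desugars to, up to
        // the name of the anonymous inner variable):
        //     Var yi;
        //     blur_x.split(y, y, yi, 4).parallel(y).vectorize(x, 4);

        return 0;
    }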
12 changes: 10 additions & 2 deletions src/Bounds.cpp
@@ -730,10 +730,18 @@ void merge_boxes(Box &a, const Box &b) {

for (size_t i = 0; i < a.size(); i++) {
if (!a[i].min.same_as(b[i].min)) {
-a[i].min = min(a[i].min, b[i].min);
+if (a[i].min.defined() && b[i].min.defined()) {
+    a[i].min = min(a[i].min, b[i].min);
+} else {
+    a[i].min = Expr();
+}
}
if (!a[i].max.same_as(b[i].max)) {
-a[i].max = max(a[i].max, b[i].max);
+if (a[i].max.defined() && b[i].max.defined()) {
+    a[i].max = max(a[i].max, b[i].max);
+} else {
+    a[i].max = Expr();
+}
}
}
}
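A toy model (not Halide code) of the merge rule introduced above: an undefined Expr in a Box bound stands for "unbounded in that direction", so merging it with any defined bound has to stay unbounded rather than calling min() or max() on an undefined expression. Here std::optional plays the role of a possibly-undefined Expr.

    #include <algorithm>
    #include <optional>

    // std::nullopt stands in for an undefined Expr, i.e. an unbounded side.
    using Bound = std::optional<int>;

    Bound merge_min(Bound a, Bound b) {
        if (a && b) return std::min(*a, *b);  // both bounded: the union takes the smaller min
        return std::nullopt;                  // either side unbounded: the union is unbounded
    }

    Bound merge_max(Bound a, Bound b) {
        if (a && b) return std::max(*a, *b);  // both bounded: the union takes the larger max
        return std::nullopt;
    }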
77 changes: 37 additions & 40 deletions src/CodeGen.cpp
@@ -1066,10 +1066,7 @@ void CodeGen::visit(const Load *op) {
add_tbaa_metadata(load, op->name);
value = load;
} else {
-int alignment = op->type.bytes();
-if (possibly_misaligned) {
-    alignment = op->type.element_of().bytes();
-}
+int alignment = op->type.bytes(); // The size of a single element
const Ramp *ramp = op->index.as<Ramp>();
const IntImm *stride = ramp ? ramp->stride.as<IntImm>() : NULL;

@@ -1095,49 +1092,49 @@
value = load;
} else if (ramp && stride && stride->value == 2) {
// Load two vectors worth and then shuffle
-// If the base ends in an odd constant, then subtract one
-// and do a different shuffle. This helps expressions like
-// (f(2*x) + f(2*x+1) share loads.
-Expr new_base;
-const Add *add = ramp->base.as<Add>();
-const IntImm *offset = add ? add->b.as<IntImm>() : NULL;
-bool shifted = false;
-if (offset) {
-    if (offset->value == 1) {
-        new_base = add->a;
-        shifted = true;
-    } else if (offset->value & 1) {
-        new_base = add->a + (offset->value - 1);
-        shifted = true;
-    } else {
-        new_base = ramp->base;
-    }
-
-    // Redo alignment analysis
-    if (internal && shifted) {
-        alignment = op->type.bytes();
-        ModulusRemainder mod_rem = modulus_remainder(new_base);
-        alignment *= gcd(gcd(mod_rem.modulus, mod_rem.remainder), 32);
-        if (alignment < 0) alignment = -alignment;
-    }
-} else {
-    new_base = ramp->base;
-}
-
-Value *ptr = codegen_buffer_pointer(op->name, op->type.element_of(), new_base);
-ptr = builder->CreatePointerCast(ptr, llvm_type_of(op->type)->getPointerTo());
-LoadInst *a = builder->CreateAlignedLoad(ptr, alignment);
-add_tbaa_metadata(a, op->name);
-ptr = builder->CreateConstInBoundsGEP1_32(ptr, 1);
-int bytes = (op->type.bits * op->type.width)/8;
-LoadInst *b = builder->CreateAlignedLoad(ptr, gcd(alignment, bytes));
-add_tbaa_metadata(b, op->name);
-
-// Shuffle together the results.
-vector<Constant *> indices(ramp->width);
-for (int i = 0; i < ramp->width; i++) {
-    indices[i] = ConstantInt::get(i32, i*2 + (shifted ? 1 : 0));
-}
-value = builder->CreateShuffleVector(a, b, ConstantVector::get(indices));
+Expr base_a = ramp->base, base_b = ramp->base + ramp->width;
+
+// False indicates we should take the even-numbered lanes
+// from the load, true indicates we should take the
+// odd-numbered-lanes.
+bool shifted_a = false, shifted_b = false;
+
+// Don't read beyond the end of an external buffer.
+if (!internal) {
+    base_b -= 1;
+    shifted_b = true;
+} else {
+    // If the base ends in an odd constant, then subtract one
+    // and do a different shuffle. This helps expressions like
+    // (f(2*x) + f(2*x+1) share loads
+    const Add *add = ramp->base.as<Add>();
+    const IntImm *offset = add ? add->b.as<IntImm>() : NULL;
+    if (offset && offset->value & 1) {
+        base_a -= 1;
+        shifted_a = true;
+        base_b -= 1;
+        shifted_b = true;
+    }
+}
+
+// Do each load.
+Expr ramp_a = Ramp::make(base_a, 1, ramp->width);
+Expr ramp_b = Ramp::make(base_b, 1, ramp->width);
+Expr load_a = Load::make(op->type, op->name, ramp_a, op->image, op->param);
+Expr load_b = Load::make(op->type, op->name, ramp_b, op->image, op->param);
+Value *vec_a = codegen(load_a);
+Value *vec_b = codegen(load_b);
+
+// Shuffle together the results.
+vector<Constant *> indices(ramp->width);
+for (int i = 0; i < (ramp->width + 1)/2; i++) {
+    indices[i] = ConstantInt::get(i32, i*2 + (shifted_a ? 1 : 0));
+}
+for (int i = (ramp->width + 1)/2; i < ramp->width; i++) {
+    indices[i] = ConstantInt::get(i32, i*2 + (shifted_b ? 1 : 0));
+}
+
+value = builder->CreateShuffleVector(vec_a, vec_b, ConstantVector::get(indices));
} else if (ramp && stride && stride->value == -1) {
// Load the vector and then flip it in-place
Expr base = ramp->base - ramp->width + 1;
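A standalone sketch (illustration only, not Halide code) of the lane selection implemented above: a stride-2 load of width elements becomes two contiguous vector loads plus a shuffle, and when one of those loads has been shifted back by one element (as happens for the second load of an external buffer), the odd-numbered lanes of that vector are selected instead of the even ones.

    #include <cstdio>

    int main() {
        const int width = 8;
        // Mirror the external-buffer case above: base_b is pulled back by one
        // element, so the second half of the shuffle takes odd-numbered lanes.
        bool shifted_a = false, shifted_b = true;

        printf("shuffle indices:");
        for (int i = 0; i < (width + 1) / 2; i++) {
            printf(" %d", i * 2 + (shifted_a ? 1 : 0));  // lanes taken from vec_a
        }
        for (int i = (width + 1) / 2; i < width; i++) {
            printf(" %d", i * 2 + (shifted_b ? 1 : 0));  // lanes taken from vec_b
        }
        printf("\n");  // prints: shuffle indices: 0 2 4 6 9 11 13 15
        return 0;
    }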
2 changes: 1 addition & 1 deletion src/CodeGen_C.h
@@ -41,7 +41,7 @@ class CodeGen_C : public IRPrinter {
static void test();

protected:
-/** An for the most recently generated ssa variable */
+/** An ID for the most recently generated ssa variable */
std::string id;

/** A cache of generated values in scope */
12 changes: 12 additions & 0 deletions src/Func.cpp
@@ -547,6 +547,13 @@ ScheduleHandle &ScheduleHandle::unroll(Var var) {
return *this;
}

+ScheduleHandle &ScheduleHandle::parallel(Var var, Expr factor) {
+    Var tmp;
+    split(var, var, tmp, factor);
+    parallel(var);
+    return *this;
+}

ScheduleHandle &ScheduleHandle::vectorize(Var var, int factor) {
Var tmp;
split(var, var, tmp, factor);
@@ -790,6 +797,11 @@ Func &Func::unroll(Var var) {
return *this;
}

+Func &Func::parallel(Var var, Expr factor) {
+    ScheduleHandle(func.schedule()).parallel(var, factor);
+    return *this;
+}

Func &Func::vectorize(Var var, int factor) {
ScheduleHandle(func.schedule()).vectorize(var, factor);
return *this;
18 changes: 14 additions & 4 deletions src/Func.h
@@ -203,6 +203,7 @@ class ScheduleHandle {
EXPORT ScheduleHandle &parallel(Var var);
EXPORT ScheduleHandle &vectorize(Var var);
EXPORT ScheduleHandle &unroll(Var var);
+EXPORT ScheduleHandle &parallel(Var var, Expr task_size);
EXPORT ScheduleHandle &vectorize(Var var, int factor);
EXPORT ScheduleHandle &unroll(Var var, int factor);
EXPORT ScheduleHandle &tile(Var x, Var y, Var xo, Var yo, Var xi, Var yi, Expr xfactor, Expr yfactor);
@@ -499,10 +500,11 @@ class Func {
EXPORT void set_error_handler(void (*handler)(void *, const char *));

/** Set a custom malloc and free for halide to use. Malloc should
- * return 32-byte aligned chunks of memory. If compiling
- * statically, routines with appropriate signatures can be
- * provided directly
-\code
+ * return 32-byte aligned chunks of memory, and it should be safe
+ * for Halide to read slightly out of bounds (up to 8 bytes before
+ * the start or beyond the end). If compiling statically, routines
+ * with appropriate signatures can be provided directly
+\code
extern "C" void *halide_malloc(void *, size_t)
extern "C" void halide_free(void *, void *)
\endcode
@@ -726,6 +728,14 @@
/** Mark a dimension to be traversed in parallel */
EXPORT Func &parallel(Var var);

+/** Split a dimension by the given task_size, and then parallelize the
+ * outer dimension. This creates parallel tasks that have size
+ * task_size. After this call, var refers to the outer dimension of
+ * the split. The inner dimension has a new anonymous name. If you
+ * wish to mutate it, or schedule with respect to it, do the split
+ * manually. */
+EXPORT Func &parallel(Var var, Expr task_size);

/** Mark a dimension to be computed all-at-once as a single
* vector. The dimension should have constant extent -
* e.g. because it is the inner dimension following a split by a
2 changes: 1 addition & 1 deletion src/IRMutator.h
@@ -18,7 +18,7 @@ namespace Internal {
* (e.g. replacing a variable with a value (Substitute.h), or
* constant-folding).
*
- * Your mutate should override the visit methods you can about. Return
+ * Your mutate should override the visit methods you care about. Return
* the new expression by assigning to expr or stmt. The default ones
* recursively mutate their children. To mutate sub-expressions and
* sub-statements you should the mutate method, which will dispatch to
7 changes: 7 additions & 0 deletions src/Lower.cpp
@@ -1305,6 +1305,13 @@ Stmt add_image_checks(Stmt s, Function f) {
string actual_extent_name = name + ".extent." + dim;
Expr actual_min = Variable::make(Int(32), actual_min_name);
Expr actual_extent = Variable::make(Int(32), actual_extent_name);
+if (!touched[j].min.defined() || !touched[j].max.defined()) {
+    std::cerr << "Error: buffer " << name
+              << " may be accessed in an unbounded way in dimension "
+              << j << "\n";
+    assert(false);
+}

Expr min_required = touched[j].min;
Expr extent_required = touched[j].max + 1 - touched[j].min;
string error_msg_extent = error_name + " is accessed beyond the extent in dimension " + dim;
17 changes: 11 additions & 6 deletions src/Simplify.cpp
@@ -1559,12 +1559,6 @@ class Simplify : public IRMutator {
info = var_info.get(op->name);
var_info.pop(op->name);

-if (body.same_as(op->body) &&
-    value.same_as(op->value) &&
-    !new_value.defined()) {
-    return op;
-}

Body result = body;

if (new_value.defined() && info.new_uses > 0) {
@@ -1577,6 +1571,16 @@
result = T::make(op->name, value, result);
}

+// Don't needlessly make a new Let/LetStmt node. (Here's a
+// piece of template syntax I've never needed before).
+const T *new_op = result.template as<T>();
+if (new_op &&
+    new_op->name == op->name &&
+    new_op->body.same_as(op->body) &&
+    new_op->value.same_as(op->value)) {
+    return op;
+}

return result;

}
@@ -2083,6 +2087,7 @@ void simplify_test() {

// Check that dead lets get stripped
check(Let::make("x", 3*y*y*y, 4), 4);
check(Let::make("x", 0, 0), 0);

std::cout << "Simplify test passed" << std::endl;
}
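The template keyword in result.template as<T>() above is required because result has a type that depends on a template parameter, so the compiler must be told that as names a member template (otherwise the < would parse as less-than). A minimal standalone illustration with made-up types:

    #include <iostream>

    // A stand-in for a Halide IR node with a member template as<T>().
    struct Node {
        template<typename T> const T *as() const { return nullptr; }
    };

    // Body is a template parameter, so body's type is dependent and the call
    // must be written body.template as<T>().
    template<typename T, typename Body>
    const T *strip(const Body &body) {
        return body.template as<T>();
    }

    int main() {
        Node n;
        std::cout << (strip<int>(n) == nullptr) << "\n";  // prints 1
        return 0;
    }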
4 changes: 3 additions & 1 deletion src/runtime/HalideRuntime.h
@@ -74,7 +74,9 @@ extern void halide_shutdown_thread_pool();

/** Define halide_malloc and halide_free to replace the default memory
* allocator. See Func::set_custom_allocator. (Specifically note that
- * halide_malloc must return a 32-byte aligned pointer.)
+ * halide_malloc must return a 32-byte aligned pointer, and it must be
+ * safe to read at least 8 bytes before the start and beyond the
+ * end.)
*/
//@{
extern void *halide_malloc(void *user_context, size_t x);
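A minimal sketch, not Halide's built-in allocator, of a replacement pair that satisfies the contract documented above: 32-byte aligned, with at least 8 readable bytes before the start and beyond the end of the returned block. It could be registered via Func::set_custom_allocator (or provided as halide_malloc/halide_free when compiling statically); the exact padding sizes chosen here are my own assumption, not taken from this commit.

    #include <cstdint>
    #include <cstdlib>

    extern "C" void *my_halide_malloc(void * /* user_context */, size_t x) {
        // 8 bytes of head room + up to 31 bytes of alignment slack + 8 bytes
        // of tail room = 47; allocate 48 extra bytes to be safe.
        void *orig = malloc(x + 48);
        if (orig == NULL) return NULL;
        // Round up past the head room to the next 32-byte boundary.
        uintptr_t p = ((uintptr_t)orig + 8 + 31) & ~(uintptr_t)31;
        ((void **)p)[-1] = orig;  // stash the original pointer just before the block
        return (void *)p;
    }

    extern "C" void my_halide_free(void * /* user_context */, void *ptr) {
        free(((void **)ptr)[-1]);
    }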
2 changes: 1 addition & 1 deletion src/runtime/posix_allocator.cpp
@@ -23,7 +23,7 @@ WEAK void *halide_malloc(void *user_context, size_t x) {
if (halide_custom_malloc) {
return halide_custom_malloc(user_context, x);
} else {
-void *orig = malloc(x+32);
+void *orig = malloc(x+40);
if (orig == NULL) {
// Will result in a failed assertion and a call to halide_error
return NULL;
31 changes: 31 additions & 0 deletions test/correctness/strided_load.cpp
@@ -0,0 +1,31 @@
#include <Halide.h>
#include <stdio.h>

using namespace Halide;

int main(int argc, char **argv) {
Image<int8_t> im(1697);

// A strided load with stride two loads a pair of vectors and
// shuffles out the elements like so:
// A0 A1 A2 A3 B0 B1 B2 B3 -> A0 A2 B0 B2

// That technique applied to the following would read beyond the
// input, so the second load actually gets pushed backwards (or
// valgrind would complain).
Func f, g;
Var x;
f(x) = im(2*x);
f.compute_root().vectorize(x, 16).bound(x, 0, 849);

// However, it's safe to apply it to this step, because f is an
// internal allocation, and halide_malloc adds a safety margin.
g(x) = f(2*x);
g.compute_root().vectorize(x, 16).bound(x, 0, 425); // 424 * 2 = 848 < 849

//g.compile_to_assembly("/dev/stdout", std::vector<Argument>(), "g");

g.realize(425);

return 0;
}
21 changes: 21 additions & 0 deletions test/error/unbounded_input.cpp
@@ -0,0 +1,21 @@
#include <Halide.h>
#include <stdio.h>

using namespace Halide;

int main(int argc, char **argv) {
Func f;
Var x, y;

ImageParam in(Float(32), 2);
ImageParam x_coord(Int(32), 2);
ImageParam y_coord(Int(32), 2);

f(x, y) = in(x_coord(x, y), y_coord(x, y));

f.compile_jit();

printf("I should not have reached here\n");

return 0;
}
23 changes: 23 additions & 0 deletions test/error/unbounded_output.cpp
@@ -0,0 +1,23 @@
#include <Halide.h>
#include <stdio.h>

using namespace Halide;

int main(int argc, char **argv) {
Func f;
Var x, y;

ImageParam in(Float(32), 2);
ImageParam x_coord(Int(32), 2);
ImageParam y_coord(Int(32), 2);

f(x, y) = 0.0f;
RDom r(0, 100, 0, 100);
f(x_coord(r.x, r.y), y_coord(r.x, r.y)) += in(r.x, r.y);

f.compile_jit();

printf("I should not have reached here\n");

return 0;
}
