
Commit

abadams committed Jan 30, 2014
2 parents 3145709 + 80668df commit 16c9175
Showing 14 changed files with 177 additions and 61 deletions.
10 changes: 5 additions & 5 deletions apps/local_laplacian/local_laplacian.cpp
@@ -131,12 +131,12 @@ int main(int argc, char **argv) {
} else {
// cpu schedule
Var yi;
-output.split(y, y, yi, 4).parallel(y).vectorize(x, 4);
-gray.compute_root().split(y, y, yi, 4).parallel(y).vectorize(x, 4);
+output.parallel(y, 4).vectorize(x, 4);
+gray.compute_root().parallel(y, 4).vectorize(x, 4);
for (int j = 0; j < 4; j++) {
-if (j > 0) inGPyramid[j].compute_root().split(y, y, yi, 4).parallel(y).vectorize(x, 4);
-if (j > 0) gPyramid[j].compute_root().split(y, y, yi, 4).parallel(y).vectorize(x, 4);
-outGPyramid[j].compute_root().split(y, y, yi, 4).parallel(y).vectorize(x, 4);
+if (j > 0) inGPyramid[j].compute_root().parallel(y, 4).vectorize(x, 4);
+if (j > 0) gPyramid[j].compute_root().parallel(y, 4).vectorize(x, 4);
+outGPyramid[j].compute_root().parallel(y, 4).vectorize(x, 4);
}
for (int j = 4; j < J; j++) {
inGPyramid[j].compute_root().parallel(y);
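For context, here is a minimal sketch (not part of this commit) of what the new parallel(y, 4) shorthand used in the schedule above does, applied to a made-up one-line pipeline. Per the Func.cpp and Func.h changes below, parallel(var, task_size) is just a split by task_size followed by parallelizing the outer variable.

    #include <Halide.h>
    using namespace Halide;

    int main() {
        // Toy pipeline for illustration only; "blur_x" and "input" are made up.
        ImageParam input(UInt(16), 2);
        Func blur_x("blur_x");
        Var x("x"), y("y");
        blur_x(x, y) = (input(x, y) + input(x + 1, y)) / 2;

        // New shorthand from this commit: split y into tasks of size 4 and
        // parallelize the resulting outer loop.
        blur_x.parallel(y, 4).vectorize(x, 4);

        // Equivalent manual schedule (what the shorthand desugars to, up to
        // the name of the anonymous inner variable):
        //     Var yi;
        //     blur_x.split(y, y, yi, 4).parallel(y).vectorize(x, 4);

        return 0;
    }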
12 changes: 10 additions & 2 deletions src/Bounds.cpp
@@ -730,10 +730,18 @@ void merge_boxes(Box &a, const Box &b) {

for (size_t i = 0; i < a.size(); i++) {
if (!a[i].min.same_as(b[i].min)) {
-a[i].min = min(a[i].min, b[i].min);
+if (a[i].min.defined() && b[i].min.defined()) {
+    a[i].min = min(a[i].min, b[i].min);
+} else {
+    a[i].min = Expr();
+}
}
if (!a[i].max.same_as(b[i].max)) {
-a[i].max = max(a[i].max, b[i].max);
+if (a[i].max.defined() && b[i].max.defined()) {
+    a[i].max = max(a[i].max, b[i].max);
+} else {
+    a[i].max = Expr();
+}
}
}
}
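A toy model (not Halide code) of the merge rule introduced above: an undefined Expr in a Box bound stands for "unbounded in that direction", so merging it with any defined bound has to stay unbounded rather than calling min() or max() on an undefined expression. Here std::optional plays the role of a possibly-undefined Expr.

    #include <algorithm>
    #include <optional>

    // std::nullopt stands in for an undefined Expr, i.e. an unbounded side.
    using Bound = std::optional<int>;

    Bound merge_min(Bound a, Bound b) {
        if (a && b) return std::min(*a, *b);  // both bounded: the union takes the smaller min
        return std::nullopt;                  // either side unbounded: the union is unbounded
    }

    Bound merge_max(Bound a, Bound b) {
        if (a && b) return std::max(*a, *b);  // both bounded: the union takes the larger max
        return std::nullopt;
    }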
77 changes: 37 additions & 40 deletions src/CodeGen.cpp
@@ -1066,10 +1066,7 @@ void CodeGen::visit(const Load *op) {
add_tbaa_metadata(load, op->name);
value = load;
} else {
-int alignment = op->type.bytes();
-if (possibly_misaligned) {
-    alignment = op->type.element_of().bytes();
-}
+int alignment = op->type.bytes(); // The size of a single element
const Ramp *ramp = op->index.as<Ramp>();
const IntImm *stride = ramp ? ramp->stride.as<IntImm>() : NULL;

@@ -1095,49 +1092,49 @@
value = load;
} else if (ramp && stride && stride->value == 2) {
// Load two vectors worth and then shuffle
-// If the base ends in an odd constant, then subtract one
-// and do a different shuffle. This helps expressions like
-// (f(2*x) + f(2*x+1) share loads.
-Expr new_base;
-const Add *add = ramp->base.as<Add>();
-const IntImm *offset = add ? add->b.as<IntImm>() : NULL;
-bool shifted = false;
-if (offset) {
-    if (offset->value == 1) {
-        new_base = add->a;
-        shifted = true;
-    } else if (offset->value & 1) {
-        new_base = add->a + (offset->value - 1);
-        shifted = true;
-    } else {
-        new_base = ramp->base;
-    }
-
-    // Redo alignment analysis
-    if (internal && shifted) {
-        alignment = op->type.bytes();
-        ModulusRemainder mod_rem = modulus_remainder(new_base);
-        alignment *= gcd(gcd(mod_rem.modulus, mod_rem.remainder), 32);
-        if (alignment < 0) alignment = -alignment;
-    }
-} else {
-    new_base = ramp->base;
-}
-
-Value *ptr = codegen_buffer_pointer(op->name, op->type.element_of(), new_base);
-ptr = builder->CreatePointerCast(ptr, llvm_type_of(op->type)->getPointerTo());
-LoadInst *a = builder->CreateAlignedLoad(ptr, alignment);
-add_tbaa_metadata(a, op->name);
-ptr = builder->CreateConstInBoundsGEP1_32(ptr, 1);
-int bytes = (op->type.bits * op->type.width)/8;
-LoadInst *b = builder->CreateAlignedLoad(ptr, gcd(alignment, bytes));
-add_tbaa_metadata(b, op->name);
-
-// Shuffle together the results.
-vector<Constant *> indices(ramp->width);
-for (int i = 0; i < ramp->width; i++) {
-    indices[i] = ConstantInt::get(i32, i*2 + (shifted ? 1 : 0));
-}
-value = builder->CreateShuffleVector(a, b, ConstantVector::get(indices));
+Expr base_a = ramp->base, base_b = ramp->base + ramp->width;
+
+// False indicates we should take the even-numbered lanes
+// from the load, true indicates we should take the
+// odd-numbered-lanes.
+bool shifted_a = false, shifted_b = false;
+
+// Don't read beyond the end of an external buffer.
+if (!internal) {
+    base_b -= 1;
+    shifted_b = true;
+} else {
+    // If the base ends in an odd constant, then subtract one
+    // and do a different shuffle. This helps expressions like
+    // (f(2*x) + f(2*x+1) share loads
+    const Add *add = ramp->base.as<Add>();
+    const IntImm *offset = add ? add->b.as<IntImm>() : NULL;
+    if (offset && offset->value & 1) {
+        base_a -= 1;
+        shifted_a = true;
+        base_b -= 1;
+        shifted_b = true;
+    }
+}
+
+// Do each load.
+Expr ramp_a = Ramp::make(base_a, 1, ramp->width);
+Expr ramp_b = Ramp::make(base_b, 1, ramp->width);
+Expr load_a = Load::make(op->type, op->name, ramp_a, op->image, op->param);
+Expr load_b = Load::make(op->type, op->name, ramp_b, op->image, op->param);
+Value *vec_a = codegen(load_a);
+Value *vec_b = codegen(load_b);
+
+// Shuffle together the results.
+vector<Constant *> indices(ramp->width);
+for (int i = 0; i < (ramp->width + 1)/2; i++) {
+    indices[i] = ConstantInt::get(i32, i*2 + (shifted_a ? 1 : 0));
+}
+for (int i = (ramp->width + 1)/2; i < ramp->width; i++) {
+    indices[i] = ConstantInt::get(i32, i*2 + (shifted_b ? 1 : 0));
+}
+
+value = builder->CreateShuffleVector(vec_a, vec_b, ConstantVector::get(indices));
} else if (ramp && stride && stride->value == -1) {
// Load the vector and then flip it in-place
Expr base = ramp->base - ramp->width + 1;
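A standalone sketch (illustration only, not Halide code) of the lane selection implemented above: a stride-2 load of width elements becomes two contiguous vector loads plus a shuffle, and when one of those loads has been shifted back by one element (as happens for the second load of an external buffer), the odd-numbered lanes of that vector are selected instead of the even ones.

    #include <cstdio>

    int main() {
        const int width = 8;
        // Mirror the external-buffer case above: base_b is pulled back by one
        // element, so the second half of the shuffle takes odd-numbered lanes.
        bool shifted_a = false, shifted_b = true;

        printf("shuffle indices:");
        for (int i = 0; i < (width + 1) / 2; i++) {
            printf(" %d", i * 2 + (shifted_a ? 1 : 0));  // lanes taken from vec_a
        }
        for (int i = (width + 1) / 2; i < width; i++) {
            printf(" %d", i * 2 + (shifted_b ? 1 : 0));  // lanes taken from vec_b
        }
        printf("\n");  // prints: shuffle indices: 0 2 4 6 9 11 13 15
        return 0;
    }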
2 changes: 1 addition & 1 deletion src/CodeGen_C.h
@@ -41,7 +41,7 @@ class CodeGen_C : public IRPrinter {
static void test();

protected:
-/** An for the most recently generated ssa variable */
+/** An ID for the most recently generated ssa variable */
std::string id;

/** A cache of generated values in scope */
12 changes: 12 additions & 0 deletions src/Func.cpp
@@ -547,6 +547,13 @@ ScheduleHandle &ScheduleHandle::unroll(Var var) {
return *this;
}

+ScheduleHandle &ScheduleHandle::parallel(Var var, Expr factor) {
+    Var tmp;
+    split(var, var, tmp, factor);
+    parallel(var);
+    return *this;
+}

ScheduleHandle &ScheduleHandle::vectorize(Var var, int factor) {
Var tmp;
split(var, var, tmp, factor);
@@ -790,6 +797,11 @@ Func &Func::unroll(Var var) {
return *this;
}

+Func &Func::parallel(Var var, Expr factor) {
+    ScheduleHandle(func.schedule()).parallel(var, factor);
+    return *this;
+}

Func &Func::vectorize(Var var, int factor) {
ScheduleHandle(func.schedule()).vectorize(var, factor);
return *this;
18 changes: 14 additions & 4 deletions src/Func.h
@@ -203,6 +203,7 @@ class ScheduleHandle {
EXPORT ScheduleHandle &parallel(Var var);
EXPORT ScheduleHandle &vectorize(Var var);
EXPORT ScheduleHandle &unroll(Var var);
+EXPORT ScheduleHandle &parallel(Var var, Expr task_size);
EXPORT ScheduleHandle &vectorize(Var var, int factor);
EXPORT ScheduleHandle &unroll(Var var, int factor);
EXPORT ScheduleHandle &tile(Var x, Var y, Var xo, Var yo, Var xi, Var yi, Expr xfactor, Expr yfactor);
@@ -499,10 +500,11 @@ class Func {
EXPORT void set_error_handler(void (*handler)(void *, const char *));

/** Set a custom malloc and free for halide to use. Malloc should
- * return 32-byte aligned chunks of memory. If compiling
- * statically, routines with appropriate signatures can be
- * provided directly
-\code
+ * return 32-byte aligned chunks of memory, and it should be safe
+ * for Halide to read slightly out of bounds (up to 8 bytes before
+ * the start or beyond the end). If compiling statically, routines
+ * with appropriate signatures can be provided directly
+\code
extern "C" void *halide_malloc(void *, size_t)
extern "C" void halide_free(void *, void *)
\endcode
@@ -726,6 +728,14 @@
/** Mark a dimension to be traversed in parallel */
EXPORT Func &parallel(Var var);

+/** Split a dimension by the given task_size, and then parallelize the
+ * outer dimension. This creates parallel tasks that have size
+ * task_size. After this call, var refers to the outer dimension of
+ * the split. The inner dimension has a new anonymous name. If you
+ * wish to mutate it, or schedule with respect to it, do the split
+ * manually. */
+EXPORT Func &parallel(Var var, Expr task_size);

/** Mark a dimension to be computed all-at-once as a single
* vector. The dimension should have constant extent -
* e.g. because it is the inner dimension following a split by a
2 changes: 1 addition & 1 deletion src/IRMutator.h
@@ -18,7 +18,7 @@ namespace Internal {
* (e.g. replacing a variable with a value (Substitute.h), or
* constant-folding).
*
- * Your mutate should override the visit methods you can about. Return
+ * Your mutate should override the visit methods you care about. Return
* the new expression by assigning to expr or stmt. The default ones
* recursively mutate their children. To mutate sub-expressions and
* sub-statements you should the mutate method, which will dispatch to
7 changes: 7 additions & 0 deletions src/Lower.cpp
@@ -1305,6 +1305,13 @@ Stmt add_image_checks(Stmt s, Function f) {
string actual_extent_name = name + ".extent." + dim;
Expr actual_min = Variable::make(Int(32), actual_min_name);
Expr actual_extent = Variable::make(Int(32), actual_extent_name);
+if (!touched[j].min.defined() || !touched[j].max.defined()) {
+    std::cerr << "Error: buffer " << name
+              << " may be accessed in an unbounded way in dimension "
+              << j << "\n";
+    assert(false);
+}

Expr min_required = touched[j].min;
Expr extent_required = touched[j].max + 1 - touched[j].min;
string error_msg_extent = error_name + " is accessed beyond the extent in dimension " + dim;
17 changes: 11 additions & 6 deletions src/Simplify.cpp
@@ -1559,12 +1559,6 @@ class Simplify : public IRMutator {
info = var_info.get(op->name);
var_info.pop(op->name);

-if (body.same_as(op->body) &&
-    value.same_as(op->value) &&
-    !new_value.defined()) {
-    return op;
-}

Body result = body;

if (new_value.defined() && info.new_uses > 0) {
@@ -1577,6 +1571,16 @@
result = T::make(op->name, value, result);
}

+// Don't needlessly make a new Let/LetStmt node. (Here's a
+// piece of template syntax I've never needed before).
+const T *new_op = result.template as<T>();
+if (new_op &&
+    new_op->name == op->name &&
+    new_op->body.same_as(op->body) &&
+    new_op->value.same_as(op->value)) {
+    return op;
+}

return result;

}
@@ -2083,6 +2087,7 @@ void simplify_test() {

// Check that dead lets get stripped
check(Let::make("x", 3*y*y*y, 4), 4);
check(Let::make("x", 0, 0), 0);

std::cout << "Simplify test passed" << std::endl;
}
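The template keyword in result.template as<T>() above is required because result has a type that depends on a template parameter, so the compiler must be told that as names a member template (otherwise the < would parse as less-than). A minimal standalone illustration with made-up types:

    #include <iostream>

    // A stand-in for a Halide IR node with a member template as<T>().
    struct Node {
        template<typename T> const T *as() const { return nullptr; }
    };

    // Body is a template parameter, so body's type is dependent and the call
    // must be written body.template as<T>().
    template<typename T, typename Body>
    const T *strip(const Body &body) {
        return body.template as<T>();
    }

    int main() {
        Node n;
        std::cout << (strip<int>(n) == nullptr) << "\n";  // prints 1
        return 0;
    }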
4 changes: 3 additions & 1 deletion src/runtime/HalideRuntime.h
@@ -74,7 +74,9 @@ extern void halide_shutdown_thread_pool();

/** Define halide_malloc and halide_free to replace the default memory
* allocator. See Func::set_custom_allocator. (Specifically note that
- * halide_malloc must return a 32-byte aligned pointer.)
+ * halide_malloc must return a 32-byte aligned pointer, and it must be
+ * safe to read at least 8 bytes before the start and beyond the
+ * end.)
*/
//@{
extern void *halide_malloc(void *user_context, size_t x);
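A minimal sketch, not Halide's built-in allocator, of a replacement pair that satisfies the contract documented above: 32-byte aligned, with at least 8 readable bytes before the start and beyond the end of the returned block. It could be registered via Func::set_custom_allocator (or provided as halide_malloc/halide_free when compiling statically); the exact padding sizes chosen here are my own assumption, not taken from this commit.

    #include <cstdint>
    #include <cstdlib>

    extern "C" void *my_halide_malloc(void * /* user_context */, size_t x) {
        // 8 bytes of head room + up to 31 bytes of alignment slack + 8 bytes
        // of tail room = 47; allocate 48 extra bytes to be safe.
        void *orig = malloc(x + 48);
        if (orig == NULL) return NULL;
        // Round up past the head room to the next 32-byte boundary.
        uintptr_t p = ((uintptr_t)orig + 8 + 31) & ~(uintptr_t)31;
        ((void **)p)[-1] = orig;  // stash the original pointer just before the block
        return (void *)p;
    }

    extern "C" void my_halide_free(void * /* user_context */, void *ptr) {
        free(((void **)ptr)[-1]);
    }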
2 changes: 1 addition & 1 deletion src/runtime/posix_allocator.cpp
@@ -23,7 +23,7 @@ WEAK void *halide_malloc(void *user_context, size_t x) {
if (halide_custom_malloc) {
return halide_custom_malloc(user_context, x);
} else {
-void *orig = malloc(x+32);
+void *orig = malloc(x+40);
if (orig == NULL) {
// Will result in a failed assertion and a call to halide_error
return NULL;
31 changes: 31 additions & 0 deletions test/correctness/strided_load.cpp
@@ -0,0 +1,31 @@
#include <Halide.h>
#include <stdio.h>

using namespace Halide;

int main(int argc, char **argv) {
Image<int8_t> im(1697);

// A strided load with stride two loads a pair of vectors and
// shuffles out the elements like so:
// A0 A1 A2 A3 B0 B1 B2 B3 -> A0 A2 B0 B2

// That technique applied to the following would read beyond the
// input, so the second load actually gets pushed backwards (or
// valgrind would complain).
Func f, g;
Var x;
f(x) = im(2*x);
f.compute_root().vectorize(x, 16).bound(x, 0, 849);

// However, it's safe to apply it to this step, because f is an
// internal allocation, and halide_malloc adds a safety margin.
g(x) = f(2*x);
g.compute_root().vectorize(x, 16).bound(x, 0, 425); // 424 * 2 = 848 < 849

//g.compile_to_assembly("/dev/stdout", std::vector<Argument>(), "g");

g.realize(425);

return 0;
}
21 changes: 21 additions & 0 deletions test/error/unbounded_input.cpp
@@ -0,0 +1,21 @@
#include <Halide.h>
#include <stdio.h>

using namespace Halide;

int main(int argc, char **argv) {
Func f;
Var x, y;

ImageParam in(Float(32), 2);
ImageParam x_coord(Int(32), 2);
ImageParam y_coord(Int(32), 2);

f(x, y) = in(x_coord(x, y), y_coord(x, y));

f.compile_jit();

printf("I should not have reached here\n");

return 0;
}
23 changes: 23 additions & 0 deletions test/error/unbounded_output.cpp
@@ -0,0 +1,23 @@
#include <Halide.h>
#include <stdio.h>

using namespace Halide;

int main(int argc, char **argv) {
Func f;
Var x, y;

ImageParam in(Float(32), 2);
ImageParam x_coord(Int(32), 2);
ImageParam y_coord(Int(32), 2);

f(x, y) = 0.0f;
RDom r(0, 100, 0, 100);
f(x_coord(r.x, r.y), y_coord(r.x, r.y)) += in(r.x, r.y);

f.compile_jit();

printf("I should not have reached here\n");

return 0;
}
