Skip to content

Commit c07f3e8

Browse files
Merge branch 'master' into srj-f16
2 parents c02750d + bf7500b commit c07f3e8

17 files changed

+546
-3
lines changed

Makefile

+2
Original file line numberDiff line numberDiff line change
@@ -721,6 +721,7 @@ RUNTIME_CPP_COMPONENTS = \
721721
profiler_inlined \
722722
qurt_allocator \
723723
qurt_hvx \
724+
qurt_hvx_vtcm \
724725
qurt_init_fini \
725726
qurt_threads \
726727
qurt_threads_tsan \
@@ -1816,6 +1817,7 @@ install_qc: install $(HEXAGON_RUNTIME_LIBS)
18161817
ln -sf $(PREFIX)/share/halide/tools/GenGen.cpp $(PREFIX)/tools/GenGen.cpp
18171818
ln -sf $(PREFIX)/lib/v60/hexagon_sim_remote $(PREFIX)/bin/hexagon_sim_remote
18181819
ln -sf $(PREFIX)/lib/v60/libsim_qurt.a $(PREFIX)/lib/libsim_qurt.a
1820+
ln -sf $(PREFIX)/lib/v60/libsim_qurt_vtcm.a $(PREFIX)/lib/libsim_qurt_vtcm.a
18191821

18201822
# We need to capture the system libraries that we'll need to link
18211823
# against, so that downstream consumers of our build rules don't

src/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ set(RUNTIME_CPP
7272
profiler_inlined
7373
qurt_allocator
7474
qurt_hvx
75+
qurt_hvx_vtcm
7576
qurt_init_fini
7677
qurt_threads
7778
qurt_threads_tsan

src/CodeGen_Hexagon.cpp

+58
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,12 @@ void CodeGen_Hexagon::compile_func(const LoweredFunc &f,
309309
body = unpredicate_loads_stores(body);
310310
debug(2) << "Lowering after unpredicating loads/stores:\n" << body << "\n\n";
311311

312+
if (target.has_feature(Target::HVX_v65)) {
313+
// Generate vscatter-vgathers before optimize_hexagon_shuffles.
314+
debug(1) << "Looking for vscatter-vgather...\n";
315+
body = scatter_gather_generator(body);
316+
}
317+
312318
debug(1) << "Optimizing shuffles...\n";
313319
// vlut always indexes 64 bytes of the LUT at a time, even in 128 byte mode.
314320
const int lut_alignment = 64;
@@ -1900,6 +1906,35 @@ void CodeGen_Hexagon::visit(const Call *op) {
19001906
return;
19011907
}
19021908

1909+
if (op->is_intrinsic() && op->name == "gather") {
1910+
internal_assert(op->args.size() == 5);
1911+
internal_assert(op->type.bits() != 8);
1912+
int index_lanes = op->type.lanes();
1913+
int intrin_lanes = native_vector_bits()/op->type.bits();
1914+
1915+
string name = "halide.hexagon.vgather";
1916+
name += (op->type.bits() == 16) ? ".h.h" : ".w.w";
1917+
llvm::Function *fn = module->getFunction(name);
1918+
1919+
Value *dst_buffer = codegen(op->args[0]);
1920+
Value *src_ptr = codegen(op->args[2]);
1921+
Value *size = codegen(op->args[3]);
1922+
Value *index = codegen(op->args[4]);
1923+
1924+
// Cut up the indices into appropriately-sized pieces.
1925+
for (int start = 0; start < index_lanes; start += intrin_lanes) {
1926+
vector<Value *> args;
1927+
Value *new_index = slice_vector(index, start, intrin_lanes);
1928+
args.push_back(dst_buffer);
1929+
args.push_back(codegen(op->args[1] + start));
1930+
args.push_back(src_ptr);
1931+
args.push_back(size);
1932+
args.push_back(new_index);
1933+
value = builder->CreateCall(fn, args);
1934+
}
1935+
return;
1936+
}
1937+
19031938
CodeGen_Posix::visit(op);
19041939
}
19051940

@@ -2025,5 +2060,28 @@ void CodeGen_Hexagon::visit(const NE *op) {
20252060
}
20262061
}
20272062

2063+
void CodeGen_Hexagon::visit(const Allocate *op) {
2064+
if (op->memory_type == MemoryType::VTCM && !op->new_expr.defined()) {
2065+
if (!target.has_feature(Target::HVX_v65)) {
2066+
user_error << "VTCM store_in requires hvx_v65 target feature.\n";
2067+
}
2068+
// Calculate size of allocation.
2069+
Expr size = op->type.bytes();
2070+
for (size_t i = 0; i < op->extents.size(); i++) {
2071+
size *= op->extents[i];
2072+
}
2073+
size += allocation_padding(op->type);
2074+
Expr new_expr = Call::make(Handle(), "halide_vtcm_malloc", {size},
2075+
Call::Extern);
2076+
string free_function = "halide_vtcm_free";
2077+
Stmt new_alloc = Allocate::make(op->name, op->type, op->memory_type,
2078+
op->extents, op->condition, op->body,
2079+
new_expr, free_function);
2080+
new_alloc.accept(this);
2081+
} else {
2082+
CodeGen_Posix::visit(op);
2083+
}
2084+
}
2085+
20282086
} // namespace Internal
20292087
} // namespace Halide

src/CodeGen_Hexagon.h

+1
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ class CodeGen_Hexagon : public CodeGen_Posix {
6565
void visit(const GT *);
6666
void visit(const EQ *);
6767
void visit(const Select *);
68+
void visit(const Allocate *);
6869
///@}
6970

7071
/** We ask for an extra vector on each allocation to enable fast

src/CodeGen_Internal.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,8 @@ bool function_takes_user_context(const std::string &name) {
193193
"halide_qurt_hvx_lock",
194194
"halide_qurt_hvx_unlock",
195195
"halide_qurt_hvx_unlock_as_destructor",
196+
"halide_vtcm_malloc",
197+
"halide_vtcm_free",
196198
"halide_cuda_initialize_kernels",
197199
"halide_opencl_initialize_kernels",
198200
"halide_opengl_initialize_kernels",

src/Expr.h

+6
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,12 @@ enum class MemoryType {
388388
* "local" in OpenCL, and "threadgroup" in metal. Can be shared
389389
* across GPU threads within the same block. */
390390
GPUShared,
391+
392+
/** Vector Tightly Coupled Memory. HVX (Hexagon) local memory available on
393+
* v65+. This memory has higher performance and lower power. Ideal for
394+
* intermediate buffers. Necessary for vgather-vscatter instructions
395+
* on Hexagon */
396+
VTCM,
391397
};
392398

393399
namespace Internal {

src/HexagonOptimize.cpp

+113
Original file line numberDiff line numberDiff line change
@@ -2001,6 +2001,113 @@ class RearrangeExpressions : public IRMutator2 {
20012001
}
20022002
};
20032003

2004+
// Try generating vgathers instead of shuffles.
2005+
// At present, we request VTCM memory with single page allocation flag for all
2006+
// store_in allocations. So it's always safe to generate a vgather.
2007+
// Expressions which generate vgathers are of the form:
2008+
// out(x) = lut(foo(x))
2009+
// For vgathers out and lut should be in VTCM in a single page.
2010+
class ScatterGatherGenerator : public IRMutator2 {
2011+
Scope<Interval> bounds;
2012+
std::unordered_map<string, const Allocate *> allocations;
2013+
2014+
using IRMutator2::visit;
2015+
2016+
template <typename NodeType, typename T>
2017+
NodeType visit_let(const T *op) {
2018+
// We only care about vector lets.
2019+
if (op->value.type().is_vector()) {
2020+
bounds.push(op->name, bounds_of_expr_in_scope(op->value, bounds));
2021+
}
2022+
NodeType node = IRMutator2::visit(op);
2023+
if (op->value.type().is_vector()) {
2024+
bounds.pop(op->name);
2025+
}
2026+
return node;
2027+
}
2028+
2029+
Expr visit(const Let *op) { return visit_let<Expr>(op); }
2030+
2031+
Stmt visit(const LetStmt *op) { return visit_let<Stmt>(op); }
2032+
2033+
Stmt visit(const Allocate *op) {
2034+
// Create a map of the allocation
2035+
allocations[op->name] = op;
2036+
return IRMutator2::visit(op);
2037+
}
2038+
2039+
// Try to match expressions of the form:
2040+
// out(x) = lut(foo(x))
2041+
// to generate vgathers. Here, out and lut should have
2042+
// store_in(MemoryType::VTCM) directive. If a vgather is found return Call
2043+
// Expr to vgather, otherwise Expr().
2044+
Expr make_gather(const Load *op, Expr dst_base, Expr dst_index) {
2045+
Type ty = op->type;
2046+
const Allocate *alloc = allocations[op->name];
2047+
// The lut should be in VTCM.
2048+
if (!alloc || alloc->memory_type != MemoryType::VTCM) {
2049+
return Expr();
2050+
}
2051+
// HVX has only 16 or 32-bit gathers. Predicated vgathers are not
2052+
// supported yet.
2053+
if (op->index.as<Ramp>() || !is_one(op->predicate) || !ty.is_vector() ||
2054+
ty.bits() == 8) {
2055+
return Expr();
2056+
}
2057+
Expr index = mutate(ty.bytes() * op->index);
2058+
Interval index_bounds = bounds_of_expr_in_scope(index, bounds);
2059+
if (ty.bits() == 16 && index_bounds.is_bounded()) {
2060+
Expr index_span = span_of_bounds(index_bounds);
2061+
index_span = common_subexpression_elimination(index_span);
2062+
index_span = simplify(index_span);
2063+
// We need to downcast the index values to 16 bit signed. So all the
2064+
// the indices must be less than 1 << 15.
2065+
if (!can_prove(index_span < std::numeric_limits<int16_t>::max())) {
2066+
return Expr();
2067+
}
2068+
}
2069+
// Calculate the size of the buffer lut in bytes.
2070+
Expr size = ty.bytes();
2071+
for (size_t i = 0; i < alloc->extents.size(); i++) {
2072+
size *= alloc->extents[i];
2073+
}
2074+
Expr src = Variable::make(Handle(), op->name);
2075+
Expr new_index = mutate(cast(ty.with_code(Type::Int), index));
2076+
dst_index = mutate(dst_index);
2077+
2078+
return Call::make(ty, "gather", {dst_base, dst_index, src, size-1, new_index},
2079+
Call::Intrinsic);
2080+
}
2081+
2082+
Stmt visit(const Store *op) {
2083+
// HVX has only 16 or 32-bit gathers. Predicated vgathers are not
2084+
// supported yet.
2085+
Type ty = op->value.type();
2086+
if (!is_one(op->predicate) || !ty.is_vector() || ty.bits() == 8) {
2087+
return IRMutator2::visit(op);
2088+
}
2089+
// To use vgathers, the destination address must be VTCM memory.
2090+
const Allocate *alloc = allocations[op->name];
2091+
if (!alloc || alloc->memory_type != MemoryType::VTCM) {
2092+
return IRMutator2::visit(op);
2093+
}
2094+
// The source for a gather must also be a buffer in VTCM.
2095+
if (op->index.as<Ramp>() && op->value.as<Load>()) {
2096+
// Check for vgathers
2097+
Expr dst_base = Variable::make(Handle(), op->name);
2098+
Expr dst_index = op->index.as<Ramp>()->base;
2099+
Expr value = make_gather(op->value.as<Load>(), dst_base, dst_index);
2100+
if (value.defined()) {
2101+
// Found a vgather.
2102+
// Function make_gather already mutates all the call arguements,
2103+
// so no need to mutate again.
2104+
return Evaluate::make(value);
2105+
}
2106+
}
2107+
return IRMutator2::visit(op);
2108+
}
2109+
};
2110+
20042111
} // namespace
20052112

20062113
Stmt optimize_hexagon_shuffles(Stmt s, int lut_alignment) {
@@ -2017,6 +2124,12 @@ Stmt vtmpy_generator(Stmt s) {
20172124
return s;
20182125
}
20192126

2127+
// Run the ScatterGatherGenerator mutator over s and return the rewritten
// statement. The target is not inspected here; vscatter-vgather instructions
// need target >= v65, so callers should only invoke this for such targets.
Stmt scatter_gather_generator(Stmt s) {
    return ScatterGatherGenerator().mutate(s);
}
2132+
20202133
Stmt optimize_hexagon_instructions(Stmt s, Target t, Scope<ModulusRemainder> &alignment_info) {
20212134
// Convert some expressions to an equivalent form which get better
20222135
// optimized in later stages for hexagon

src/HexagonOptimize.h

+7
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,13 @@ Stmt optimize_hexagon_shuffles(Stmt s, int lut_alignment);
1818
/** Generate vtmpy instruction if possible */
1919
Stmt vtmpy_generator(Stmt s);
2020

21+
/** Generate vscatter-vgather instructions on Hexagon using VTCM memory.
22+
* The pass should be run before generating shuffles.
23+
* Some expressions which generate vscatter-vgathers are:
24+
* 1. out(x) = lut(foo(x)) -> vgather
25+
* 2. out(idx(x)) = foo(x) -> vscatter */
26+
Stmt scatter_gather_generator(Stmt s);
27+
2128
/** Hexagon deinterleaves when performing widening operations, and
2229
* interleaves when performing narrowing operations. This pass
2330
* rewrites widenings/narrowings to be explicit in the IR, and

src/IRPrinter.cpp

+3
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,9 @@ std::ostream &operator<<(std::ostream &out, const MemoryType &t) {
121121
case MemoryType::GPUShared:
122122
out << "GPUShared";
123123
break;
124+
case MemoryType::VTCM:
125+
out << "VTCM";
126+
break;
124127
}
125128
return out;
126129
}

src/LLVM_Runtime_Linker.cpp

+6
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ DECLARE_CPP_INITMOD(profiler)
126126
DECLARE_CPP_INITMOD(profiler_inlined)
127127
DECLARE_CPP_INITMOD(qurt_allocator)
128128
DECLARE_CPP_INITMOD(qurt_hvx)
129+
DECLARE_CPP_INITMOD(qurt_hvx_vtcm)
129130
DECLARE_CPP_INITMOD(qurt_init_fini)
130131
DECLARE_CPP_INITMOD(qurt_threads)
131132
DECLARE_CPP_INITMOD(qurt_threads_tsan)
@@ -853,6 +854,11 @@ std::unique_ptr<llvm::Module> get_initial_module_for_target(Target t, llvm::LLVM
853854
} else if (t.has_feature(Target::HVX_128)) {
854855
modules.push_back(get_initmod_hvx_128_ll(c));
855856
}
857+
if (t.has_feature(Target::HVX_v65)) {
858+
modules.push_back(get_initmod_qurt_hvx_vtcm(c, bits_64,
859+
debug));
860+
}
861+
856862
} else {
857863
modules.push_back(get_initmod_prefetch(c, bits_64, debug));
858864
}

src/runtime/hexagon_remote/Makefile

+11-3
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,8 @@ remotes: \
7979
bin/v60/libhalide_hexagon_remote_skel.so \
8080
bin/v60/signed_by_debug/libhalide_hexagon_remote_skel.so \
8181
bin/v60/hexagon_sim_remote \
82-
bin/v60/libsim_qurt.a
82+
bin/v60/libsim_qurt.a \
83+
bin/v60/libsim_qurt_vtcm.a
8384

8485
bin/src/halide_hexagon_remote.h bin/src/halide_hexagon_remote_skel.c bin/src/halide_hexagon_remote_stub.c: halide_hexagon_remote.idl
8586
mkdir -p $(@D)
@@ -160,6 +161,13 @@ bin/%/sim_qurt.o: sim_qurt.cpp
160161
bin/%/libsim_qurt.a: bin/%/sim_qurt.o
161162
ar rcs $@ $^
162163

164+
bin/%/sim_qurt_vtcm.o: sim_qurt_vtcm.cpp
165+
mkdir -p $(@D)
166+
$(CXX-$*) $(CCFLAGS-$*) -c sim_qurt_vtcm.cpp -o $@
167+
168+
bin/%/libsim_qurt_vtcm.a: bin/%/sim_qurt_vtcm.o
169+
ar rcs $@ $^
170+
163171
CRT0_STANDALONE=$(shell $(CXX-v60) -G0 -print-file-name=crt0_standalone.o)
164172
CRT0 =$(shell $(CXX-v60) -G0 -print-file-name=crt0.o)
165173
INIT =$(shell $(CXX-v60) -G0 -print-file-name=init.o)
@@ -169,10 +177,10 @@ LIB_GCC =$(shell $(CXX-v60) -G0 -print-file-name=libgcc.a)
169177
FINI =$(shell $(CXX-v60) -G0 -print-file-name=fini.o)
170178
LIBDL =$(HEXAGON_TOOLS_ROOT)/Tools/target/hexagon/lib/v60/G0/libdl.a
171179

172-
bin/%/hexagon_sim_remote: bin/%/sim_remote.o bin/%/dlib.o bin/%/known_symbols.o bin/%/libsim_qurt.a
180+
bin/%/hexagon_sim_remote: bin/%/sim_remote.o bin/%/dlib.o bin/%/known_symbols.o bin/%/libsim_qurt.a bin/%/libsim_qurt_vtcm.a
173181
mkdir -p $(@D)
174182
$(LD-$*) -o $@ $(CRT0_STANDALONE) $(CRT0) $(INIT) bin/$*/sim_remote.o bin/$*/dlib.o bin/$*/known_symbols.o bin/$*/libsim_qurt.a $(LIBDL) \
175-
--start-group $(LIB_STANDALONE) --whole-archive $(LIB_C) --no-whole-archive $(LIB_GCC) --end-group $(FINI) \
183+
--start-group $(LIB_STANDALONE) --whole-archive $(LIB_C) bin/$*/libsim_qurt_vtcm.a --no-whole-archive $(LIB_GCC) --end-group $(FINI) \
176184
--dynamic-linker= -E --force-dynamic
177185
#$(CC-$*) -m$* -mG0lib -G0 -ldl -lc -lstandalone -lgcc $^ $(LIBDL) -o $@
178186

0 commit comments

Comments
 (0)