Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions xla/backends/profiler/gpu/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,7 @@ tsl_gpu_library(
"@com_google_absl//absl/container:flat_hash_map",
"@com_google_absl//absl/container:flat_hash_set",
"@com_google_absl//absl/hash",
"@com_google_absl//absl/log",
"@com_google_absl//absl/strings",
"@tsl//tsl/platform:abi",
"@tsl//tsl/platform:mutex",
Expand Down
14 changes: 13 additions & 1 deletion xla/backends/profiler/gpu/cupti_buffer_events.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ limitations under the License.

#include "xla/backends/profiler/gpu/cupti_buffer_events.h"

#include <cstdint>

#include "absl/strings/str_cat.h"
#include "third_party/gpus/cuda/include/cuda.h"
#include "xla/backends/profiler/gpu/cupti_interface.h"
Expand Down Expand Up @@ -164,6 +166,7 @@ void AddKernelActivityEvent(CuptiEventCollectorDelegate &collector,
collector.annotation_map.LookUp(event.device_id, event.correlation_id);
event.annotation = info.annotation;
event.nvtx_range = info.nvtx_range;
event.scope_range_id = info.scope_range_id;
SetEventGraphId(event, kernel);
event.kernel_info.registers_per_thread = kernel->registersPerThread;
event.kernel_info.static_shared_memory_usage = kernel->staticSharedMemory;
Expand Down Expand Up @@ -201,6 +204,7 @@ void AddGraphTraceActivityEvent(CuptiEventCollectorDelegate &collector,
/* .context_id = */ graph_trace->contextId,
/* .stream_id = */ graph_trace->streamId,
/* .graph_id = */ graph_trace->graphId,
/* .scope_range_id = */ info.scope_range_id,
});
}

Expand Down Expand Up @@ -240,6 +244,8 @@ void AddMemcpyActivityEvent(CuptiEventCollectorDelegate &collector,
AnnotationMap::AnnotationInfo info =
collector.annotation_map.LookUp(event.device_id, event.correlation_id);
event.annotation = info.annotation;
event.nvtx_range = info.nvtx_range;
event.scope_range_id = info.scope_range_id;
SetEventGraphId(event, memcpy);
event.memcpy_info.copy_kind = memcpy->copyKind;
event.memcpy_info.num_bytes = memcpy->bytes;
Expand Down Expand Up @@ -270,6 +276,8 @@ void AddMemcpyP2PActivityEvent(CuptiEventCollectorDelegate &collector,
AnnotationMap::AnnotationInfo info =
collector.annotation_map.LookUp(event.device_id, event.correlation_id);
event.annotation = info.annotation;
event.nvtx_range = info.nvtx_range;
event.scope_range_id = info.scope_range_id;
SetEventGraphId(event, memcpy);
event.memcpy_info.copy_kind = CUPTI_ACTIVITY_MEMCPY_KIND_PTOP;
event.memcpy_info.num_bytes = memcpy->bytes;
Expand Down Expand Up @@ -539,7 +547,8 @@ absl::string_view StringDeduper::Dedup(absl::string_view str,

void AnnotationMap::Add(uint32_t device_id, uint32_t correlation_id,
const absl::string_view annotation,
const absl::string_view nvtx_range) {
const absl::string_view nvtx_range,
int64_t scope_range_id) {
if (annotation.empty() && nvtx_range.empty()) return;
VLOG(3) << "Add annotation: device_id: " << device_id
<< " correlation_id: " << correlation_id
Expand All @@ -550,6 +559,7 @@ void AnnotationMap::Add(uint32_t device_id, uint32_t correlation_id,
AnnotationInfo info;
info.annotation = per_device_map.annotation_deduper.Dedup(annotation);
info.nvtx_range = per_device_map.nvtx_range_deduper.Dedup(nvtx_range);
info.scope_range_id = scope_range_id;
per_device_map.correlation_map.emplace(correlation_id, info);
}
}
Expand Down Expand Up @@ -600,6 +610,7 @@ CallbackAnnotationsAndEvents &CallbackAnnotationsAndEvents::operator=(
nvtx_ranges_ = std::move(another.nvtx_ranges_);
num_dropped_events_ = another.num_dropped_events_;
event_queue_ = std::move(another.event_queue_);
scope_range_id_tree_ = std::move(another.scope_range_id_tree_);
another.Clear();
return *this;
}
Expand All @@ -609,6 +620,7 @@ void CallbackAnnotationsAndEvents::Clear() {
nvtx_ranges_.Clear();
num_dropped_events_ = 0;
event_queue_.Clear();
scope_range_id_tree_.clear();
}

} // namespace profiler
Expand Down
11 changes: 10 additions & 1 deletion xla/backends/profiler/gpu/cupti_buffer_events.h
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,7 @@ struct CuptiTracerEvent {
int64_t context_id = kInvalidContextId;
int64_t stream_id = kInvalidStreamId;
uint32_t graph_id = 0;
int64_t scope_range_id = 0;
union {
// For Memcpy API and activities. `type` must be Memcpy*.
MemcpyDetails memcpy_info;
Expand Down Expand Up @@ -266,13 +267,15 @@ class AnnotationMap {
// Annotation data recorded for one (device, correlation id) pair and returned
// by LookUp().
struct AnnotationInfo {
// Annotation text; a non-owning view (filled from a StringDeduper::Dedup()
// result in AnnotationMap::Add).
absl::string_view annotation;
// NVTX range text; a non-owning view obtained the same way as `annotation`.
absl::string_view nvtx_range;
// Id of the scope range associated with the event. Defaults to 0, which
// consumers treat as "no scope range" (the stat is not exported when 0).
int64_t scope_range_id = 0;
};

explicit AnnotationMap(uint64_t max_size, uint32_t num_gpus)
: max_size_(max_size), per_device_map_(num_gpus) {}

void Add(uint32_t device_id, uint32_t correlation_id,
absl::string_view annotation, absl::string_view nvtx_range);
absl::string_view annotation, absl::string_view nvtx_range,
int64_t scope_range_id = 0);

AnnotationInfo LookUp(uint32_t device_id, uint32_t correlation_id) const
ABSL_ATTRIBUTE_LIFETIME_BOUND;
Expand Down Expand Up @@ -300,6 +303,9 @@ struct CuptiEventCollectorDelegate {
: annotation_map(p_annotation_map), receive(std::move(p_receive)) {}
};

// A tree of scope range ids, stored as a flat map from each child scope range
// id to the id of its parent scope range (child_id ==> parent_id).
using ScopeRangeIdTree = absl::flat_hash_map<int64_t, int64_t>;

class CuptiActivityBufferManager {
public:
struct ActivityBufferAndSize {
Expand Down Expand Up @@ -367,6 +373,8 @@ class CallbackAnnotationsAndEvents {

EventQueue& event_queue() { return event_queue_; }

// Returns a mutable reference to the child_id ==> parent_id scope range map
// held by this object, so callers can insert into it or clear it in place.
ScopeRangeIdTree& scope_range_id_tree() { return scope_range_id_tree_; }

size_t NumDroppedEvents() const { return num_dropped_events_; }

void IncNumDroppedEvents() { ++num_dropped_events_; }
Expand All @@ -378,6 +386,7 @@ class CallbackAnnotationsAndEvents {
StringDeduper nvtx_ranges_;
size_t num_dropped_events_ = 0;
EventQueue event_queue_;
ScopeRangeIdTree scope_range_id_tree_;
};

} // namespace profiler
Expand Down
39 changes: 36 additions & 3 deletions xla/backends/profiler/gpu/cupti_collector.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,19 @@ limitations under the License.
#include <optional>
#include <queue>
#include <string>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"
#include "absl/hash/hash.h"
#include "absl/log/log.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_join.h"
#include "third_party/gpus/cuda/extras/CUPTI/include/cupti.h"
#include "third_party/gpus/cuda/extras/CUPTI/include/cupti_activity.h"
#include "third_party/gpus/cuda/include/cuda.h"
#include "third_party/gpus/cuda/include/cuda_occupancy.h"
#include "xla/backends/profiler/gpu/cupti_buffer_events.h"
#include "xla/tsl/profiler/utils/parse_annotation.h"
#include "xla/tsl/profiler/utils/trace_utils.h"
#include "xla/tsl/profiler/utils/xplane_builder.h"
Expand All @@ -52,6 +55,7 @@ using tsl::profiler::FindOrAddMutablePlaneWithName;
using tsl::profiler::GpuPlaneName;
using tsl::profiler::kCuptiDriverApiPlaneName;
using tsl::profiler::kDeviceVendorNvidia;
using tsl::profiler::kScopeRangeIdTreePlaneName;
using tsl::profiler::kThreadIdOverhead;
using tsl::profiler::ParseAnnotationStack;
using tsl::profiler::StatType;
Expand Down Expand Up @@ -180,6 +184,11 @@ class PerDeviceCollector {
GetStatTypeStr(StatType::kCorrelationId)),
event.correlation_id);
}
if (event.scope_range_id) {
xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
GetStatTypeStr(StatType::kScopeRangeId)),
event.scope_range_id);
}
if (!event.nvtx_range.empty()) {
xevent.AddStatValue(
*plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kNVTXRange)),
Expand Down Expand Up @@ -549,7 +558,18 @@ class EventInQueue {
void CuptiTraceCollector::OnTracerCollectedCallbackData(
std::vector<CallbackAnnotationsAndEvents> callback_annotations_and_events,
bool need_callback_events) {
// Build merged annotation first.
// Merge per-thread scope range id tree.
for (auto& annotations_and_events : callback_annotations_and_events) {
auto& per_thread_scope_range_id_tree =
annotations_and_events.scope_range_id_tree();
for (const auto [child_id, parent_id] : per_thread_scope_range_id_tree) {
scope_range_id_tree_.insert({child_id, parent_id});
}
// Release the per-thread tree eagerly; it would otherwise only be freed later.
per_thread_scope_range_id_tree.clear();
}

// Build merged annotation.
std::priority_queue<EventInQueue> min_heap;
for (auto& annotations_and_events : callback_annotations_and_events) {
EventInQueue event_in_queue(annotations_and_events.event_queue());
Expand All @@ -566,11 +586,12 @@ void CuptiTraceCollector::OnTracerCollectedCallbackData(
CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice) {
for (uint32_t device = 0; device < options_.num_gpus; ++device) {
annotation_map_.Add(device, event.correlation_id, event.annotation,
event.nvtx_range);
event.nvtx_range, event.scope_range_id);
}
} else {
annotation_map_.Add(event.device_id, event.correlation_id,
event.annotation, event.nvtx_range);
event.annotation, event.nvtx_range,
event.scope_range_id);
}
// Clear the annotation and nvtx_range of the Callback API events, as they
// are now in the combined AnnotationMap which will be used by the
Expand Down Expand Up @@ -648,6 +669,17 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
}

void Flush() override {}
// Writes the merged scope range id tree into the plane named
// kScopeRangeIdTreePlaneName inside `space`.
//
// The plane carries no registered stat metadata: every child -> parent edge is
// emitted as a plane-level XStat whose metadata id is the child id and whose
// value is the parent id. This transfers the map without breaking any
// existing proto.
void ExportScopeRangeIdTree(XSpace* space) {
  XPlaneBuilder tree_plane(
      FindOrAddMutablePlaneWithName(space, kScopeRangeIdTreePlaneName));
  // A single scratch metadata object is reused; only its id changes per edge.
  tensorflow::profiler::XStatMetadata scratch_metadata;
  for (const auto& edge : scope_range_id_tree_) {
    scratch_metadata.set_id(edge.first);                   // child id
    tree_plane.AddStatValue(scratch_metadata, edge.second);  // parent id
  }
}
// Returns true if some GPU events are captured.
bool Export(XSpace* space, uint64_t end_gpu_ns) override {
LOG(INFO) << " GpuTracer has collected " << num_callback_events_
Expand All @@ -656,6 +688,7 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
LOG(INFO) << " GpuTracer max callback_events: "
<< options_.max_activity_api_events
<< ", max activity events: " << options_.max_activity_api_events;
ExportScopeRangeIdTree(space);
size_t num_events = 0;
XPlaneBuilder host_plane(
FindOrAddMutablePlaneWithName(space, kCuptiDriverApiPlaneName));
Expand Down
6 changes: 4 additions & 2 deletions xla/backends/profiler/gpu/cupti_collector.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ class CuptiTraceCollector {
// After CuptiTracer stops, the per-thread callback data collected from all
// threads is sent here. The default behavior is to: a) create the merged
// annotation map (for later use by activity events), and b) directly add all events by calling
// AddEvent(). If need_callback_events is false, only annotation map
// will be merged, all events will be dropped.
// AddEvent(). If need_callback_events is false, only annotation map and scope
// range id tree will be merged, all events will be dropped.
virtual void OnTracerCollectedCallbackData(
std::vector<CallbackAnnotationsAndEvents> callback_events,
bool need_callback_events);
Expand Down Expand Up @@ -91,6 +91,8 @@ class CuptiTraceCollector {

protected:
CuptiTracerCollectorOptions options_;
// map of child_scope_id -> parent_scope_id
ScopeRangeIdTree scope_range_id_tree_;

private:
AnnotationMap annotation_map_;
Expand Down
16 changes: 16 additions & 0 deletions xla/backends/profiler/gpu/cupti_tracer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -824,6 +824,19 @@ class GuardedCallbackAnnotationsAndEvents {
annotations_and_events_.event_queue().Push(std::move(event));
}

// Records the parent links implied by `sequence`, where each element is the
// parent of the element that follows it.
//
// The sequence is walked from its last element toward its first. For every
// element not yet present in the tree as a child, the edge
// (element ==> preceding element) is inserted. The walk stops at the first
// element that is already a key in the tree, or once the first element is
// reached (the first element gets no parent entry from this call).
// Sequences with fewer than two ids contain no edge and are ignored without
// taking the lock.
void AddScopeRangeIdSequence(absl::Span<const int64_t> sequence) {
  if (sequence.size() <= 1) {
    return;
  }

  tsl::mutex_lock lock(mu_);
  ScopeRangeIdTree &tree = annotations_and_events_.scope_range_id_tree();
  for (auto child = sequence.size() - 1; child > 0; --child) {
    const int64_t child_id = sequence[child];
    if (tree.contains(child_id)) {
      break;
    }
    tree.emplace(child_id, sequence[child - 1]);
  }
}

private:
tsl::mutex mu_;
CallbackAnnotationsAndEvents annotations_and_events_ TF_GUARDED_BY(mu_);
Expand All @@ -850,10 +863,13 @@ absl::Status AddDriverApiCallbackEvent(
return absl::OkStatus();
}
tracer->IncCallbackEventCount();
absl::Span<const int64_t> range_ids = AnnotationStack::GetScopeRangeIds();
guarded_annotations_and_events.AddScopeRangeIdSequence(range_ids);
CuptiTracerEvent event{};
event.correlation_id = cbdata->correlationId;
event.annotation = annotation;
event.nvtx_range = nvtx_range;
event.scope_range_id = range_ids.empty() ? 0 : range_ids.back();
SetCallbackEventUponApiExit(event, cupti_interface, device_id, cbid, cbdata,
start_tsc, end_tsc);
guarded_annotations_and_events.Push(*tracer, std::move(event));
Expand Down
2 changes: 2 additions & 0 deletions xla/tsl/profiler/backends/cpu/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ cc_library(
]),
deps = [
"@com_google_absl//absl/strings",
"@com_google_absl//absl/types:span",
"@tsl//tsl/platform:macros",
"@tsl//tsl/platform:types",
] + if_static([
Expand All @@ -104,6 +105,7 @@ cc_library(
]),
deps = [
"@com_google_absl//absl/strings",
"@com_google_absl//absl/types:span",
"@tsl//tsl/platform:macros",
"@tsl//tsl/platform:types",
],
Expand Down
37 changes: 29 additions & 8 deletions xla/tsl/profiler/backends/cpu/annotation_stack.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,16 @@ limitations under the License.

#include <atomic>
#include <cstddef>
#include <cstdint>
#include <string>
#include <string_view>
#include <utility>
#include <tuple>
#include <vector>

#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"
#include "absl/types/span.h"
#include "tsl/platform/macros.h"
#include "tsl/platform/types.h"

namespace tsl {
Expand All @@ -35,36 +38,54 @@ static auto GetAnnotationData(const std::atomic<int>& atomic) {
int generation = 0;
std::vector<size_t> stack;
std::string string;
std::vector<int64_t> scope_range_id_stack;
} data;
int generation = atomic.load(std::memory_order_acquire);
if (generation != data.generation) {
data = {generation};
}
return std::make_pair(&data.stack, &data.string);
return std::make_tuple(&data.stack, &data.string, &data.scope_range_id_stack);
};

void AnnotationStack::PushAnnotation(std::string_view name) {
auto [stack, string] = GetAnnotationData(generation_);
static std::atomic<int64_t> scope_range_counter = 0;

auto [stack, string, scope_range_id_stack] = GetAnnotationData(generation_);
stack->push_back(string->size());
if (!string->empty()) {
return absl::StrAppend(
absl::StrAppend(
string, "::", absl::string_view(name.data(), name.size()) // NOLINT
);
} else {
string->assign(name);
}
int64_t scope_range_id =
scope_range_counter.fetch_add(1, std::memory_order_relaxed) + 1;
if (TF_PREDICT_FALSE(scope_range_id == 0)) {
scope_range_id =
scope_range_counter.fetch_add(1, std::memory_order_relaxed) + 1;
}
string->assign(name);
scope_range_id_stack->push_back(scope_range_id);
}

void AnnotationStack::PopAnnotation() {
auto [stack, string] = GetAnnotationData(generation_);
auto [stack, string, scope_range_id_stack] = GetAnnotationData(generation_);
if (stack->empty()) {
return string->clear();
string->clear();
scope_range_id_stack->clear();
return;
}
string->resize(stack->back());
stack->pop_back();
scope_range_id_stack->pop_back();
}

const string& AnnotationStack::Get() {
return *std::get<std::string*>(GetAnnotationData(generation_));
return *std::get<1>(GetAnnotationData(generation_));
}

// Returns a read-only view of the scope range id stack kept in the annotation
// data for the current generation.
//
// The span does not own the ids: it aliases the vector that PushAnnotation()
// and PopAnnotation() modify, so it should be consumed before any further
// push or pop.
absl::Span<const int64_t> AnnotationStack::GetScopeRangeIds() {
  const auto annotation_data = GetAnnotationData(generation_);
  const auto* scope_range_ids = std::get<2>(annotation_data);
  return absl::MakeConstSpan(*scope_range_ids);
}

void AnnotationStack::Enable(bool enable) {
Expand Down
Loading