From c322d2833c1a2f1cd56c7c9a18cdb5f8126e76b8 Mon Sep 17 00:00:00 2001 From: Andrew Woloszyn Date: Mon, 15 Jul 2024 11:32:13 -0400 Subject: [PATCH] [hip][cuda] Update event allocation and collection. (#17603) The existing system was not sufficient for graphs, as they can be run out of order and have different behavior for event recording. This does not entirely solve the problem for re-use, if we ever want to simultaneously submit more than one graph at a time, but is much closer. --------- Signed-off-by: Andrew Woloszyn --- .../hal/drivers/cuda/graph_command_buffer.c | 35 +- .../hal/drivers/cuda/graph_command_buffer.h | 5 + .../src/iree/hal/drivers/cuda/nccl_channel.c | 10 +- .../src/iree/hal/drivers/cuda/nccl_channel.h | 1 + .../hal/drivers/cuda/pending_queue_actions.c | 12 +- .../hal/drivers/cuda/stream_command_buffer.c | 43 ++- .../hal/drivers/cuda/stream_command_buffer.h | 6 + runtime/src/iree/hal/drivers/cuda/tracing.c | 331 ++++++++++++----- runtime/src/iree/hal/drivers/cuda/tracing.h | 107 +++--- .../hal/drivers/hip/graph_command_buffer.c | 41 ++- .../hal/drivers/hip/graph_command_buffer.h | 5 + .../hal/drivers/hip/pending_queue_actions.c | 16 +- .../src/iree/hal/drivers/hip/rccl_channel.c | 10 +- .../src/iree/hal/drivers/hip/rccl_channel.h | 1 + .../hal/drivers/hip/stream_command_buffer.c | 43 ++- .../hal/drivers/hip/stream_command_buffer.h | 5 + runtime/src/iree/hal/drivers/hip/tracing.c | 336 +++++++++++++----- runtime/src/iree/hal/drivers/hip/tracing.h | 114 +++--- 18 files changed, 823 insertions(+), 298 deletions(-) diff --git a/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c b/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c index 3350c5cb0b7f..c747c3c699b9 100644 --- a/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c +++ b/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c @@ -33,6 +33,7 @@ typedef struct iree_hal_cuda_graph_command_buffer_t { // Per-stream CUDA tracing context. 
iree_hal_cuda_tracing_context_t* tracing_context; + iree_hal_cuda_tracing_context_event_list_t tracing_event_list; // A resource set to maintain references to all resources used within the // command buffer. @@ -96,10 +97,11 @@ static void iree_cuda_graph_command_buffer_trace_zone_begin_external( &command_buffer->cu_graph_nodes[command_buffer->graph_node_count++]; size_t dependency_count = command_buffer->cu_barrier_node ? 1 : 0; IREE_CUDA_GRAPH_TRACE_ZONE_BEGIN_EXTERNAL( - command_buffer->tracing_context, tracing_event_node, - command_buffer->cu_graph, &command_buffer->cu_barrier_node, - dependency_count, file_name, file_name_length, line, function_name, - function_name_length, name, name_length); + command_buffer->tracing_context, &command_buffer->tracing_event_list, + tracing_event_node, command_buffer->cu_graph, + &command_buffer->cu_barrier_node, dependency_count, file_name, + file_name_length, line, function_name, function_name_length, name, + name_length); // Move the barrier forward to make sure that the tracing event is recorded // before work starts. @@ -121,10 +123,10 @@ static void iree_cuda_graph_command_buffer_trace_zone_end( size_t dependency_count = command_buffer->cu_barrier_node ? 1 : 0; IREE_ASSERT_GT(dependency_count, 0, "ending a zone should at least depend on the beginning"); - IREE_CUDA_GRAPH_TRACE_ZONE_END(command_buffer->tracing_context, - tracing_event_node, command_buffer->cu_graph, - &command_buffer->cu_barrier_node, - dependency_count); + IREE_CUDA_GRAPH_TRACE_ZONE_END( + command_buffer->tracing_context, &command_buffer->tracing_event_list, + tracing_event_node, command_buffer->cu_graph, + &command_buffer->cu_barrier_node, dependency_count); // We need to wait on the tracing end before other work starts. // GPU tracing zones are first-in, last-out. 
@@ -191,6 +193,8 @@ iree_status_t iree_hal_cuda_graph_command_buffer_create( command_buffer->host_allocator = host_allocator; command_buffer->symbols = cuda_symbols; command_buffer->tracing_context = tracing_context; + command_buffer->tracing_event_list.head = NULL; + command_buffer->tracing_event_list.tail = NULL; iree_arena_initialize(block_pool, &command_buffer->arena); command_buffer->cu_context = context; command_buffer->cu_graph = NULL; @@ -224,6 +228,9 @@ static void iree_hal_cuda_graph_command_buffer_destroy( iree_allocator_t host_allocator = command_buffer->host_allocator; IREE_TRACE_ZONE_BEGIN(z0); + iree_hal_cuda_tracing_free(command_buffer->tracing_context, + &command_buffer->tracing_event_list); + // Drop any pending collective batches before we tear things down. iree_hal_collective_batch_clear(&command_buffer->collective_batch); @@ -261,6 +268,18 @@ CUgraphExec iree_hal_cuda_graph_command_buffer_handle( return command_buffer->cu_graph_exec; } +void iree_hal_cuda_graph_tracing_notify_submitted_commands( + iree_hal_command_buffer_t* base_command_buffer) { + iree_hal_cuda_graph_command_buffer_t* command_buffer = + iree_hal_cuda_graph_command_buffer_cast(base_command_buffer); + if (!command_buffer->tracing_context) { + return; + } + + iree_hal_cuda_tracing_notify_submitted(command_buffer->tracing_context, + &command_buffer->tracing_event_list); +} + // Flushes any pending batched collective operations. // Must be called before any other non-collective nodes are added to the graph // or a barrier is encountered. 
diff --git a/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.h b/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.h index e6e3c85c565e..aa0d2cc3b352 100644 --- a/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.h +++ b/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.h @@ -42,6 +42,11 @@ bool iree_hal_cuda_graph_command_buffer_isa( CUgraphExec iree_hal_cuda_graph_command_buffer_handle( iree_hal_command_buffer_t* command_buffer); +// This is to be called after the given |command_buffer| has been submitted +// in order to notify the tracing system that there are events to collect. +void iree_hal_cuda_graph_tracing_notify_submitted_commands( + iree_hal_command_buffer_t* command_buffer); + #ifdef __cplusplus } // extern "C" #endif // __cplusplus diff --git a/runtime/src/iree/hal/drivers/cuda/nccl_channel.c b/runtime/src/iree/hal/drivers/cuda/nccl_channel.c index 5d10ee73047b..e3eb31c16d3b 100644 --- a/runtime/src/iree/hal/drivers/cuda/nccl_channel.c +++ b/runtime/src/iree/hal/drivers/cuda/nccl_channel.c @@ -542,6 +542,7 @@ static iree_status_t iree_hal_cuda_nccl_submit_batch_entry( iree_status_t iree_hal_cuda_nccl_submit_batch( const iree_hal_cuda_nccl_dynamic_symbols_t* symbols, iree_hal_cuda_tracing_context_t* tracing_context, + iree_hal_cuda_tracing_context_event_list_t* tracing_event_list, const iree_hal_collective_batch_t* batch, CUstream stream) { IREE_ASSERT_ARGUMENT(symbols); IREE_ASSERT_ARGUMENT(batch); @@ -558,9 +559,9 @@ iree_status_t iree_hal_cuda_nccl_submit_batch( iree_string_view_t collective_str = iree_hal_collective_op_format(&entry->op, &string_temp); IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( - tracing_context, stream, __FILE__, strlen(__FILE__), (uint32_t)__LINE__, - __FUNCTION__, strlen(__FUNCTION__), collective_str.data, - collective_str.size); + tracing_context, tracing_event_list, stream, __FILE__, strlen(__FILE__), + (uint32_t)__LINE__, __FUNCTION__, strlen(__FUNCTION__), + collective_str.data, collective_str.size); } 
#endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE @@ -577,7 +578,8 @@ iree_status_t iree_hal_cuda_nccl_submit_batch( // End all zones we began above - note that these are just simply nested so // order doesn't matter so long as we end the right number of zones. for (iree_host_size_t i = 0; i < batch->count; ++i) { - IREE_CUDA_STREAM_TRACE_ZONE_END(tracing_context, stream); + IREE_CUDA_STREAM_TRACE_ZONE_END(tracing_context, tracing_event_list, + stream); } #endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE diff --git a/runtime/src/iree/hal/drivers/cuda/nccl_channel.h b/runtime/src/iree/hal/drivers/cuda/nccl_channel.h index cea4911a3296..182bc5ca44e1 100644 --- a/runtime/src/iree/hal/drivers/cuda/nccl_channel.h +++ b/runtime/src/iree/hal/drivers/cuda/nccl_channel.h @@ -49,6 +49,7 @@ iree_status_t iree_hal_cuda_nccl_channel_create( iree_status_t iree_hal_cuda_nccl_submit_batch( const iree_hal_cuda_nccl_dynamic_symbols_t* nccl_symbols, iree_hal_cuda_tracing_context_t* tracing_context, + iree_hal_cuda_tracing_context_event_list_t* tracing_event_list, const iree_hal_collective_batch_t* batch, CUstream stream); #ifdef __cplusplus diff --git a/runtime/src/iree/hal/drivers/cuda/pending_queue_actions.c b/runtime/src/iree/hal/drivers/cuda/pending_queue_actions.c index 86d5762379f2..345608165a90 100644 --- a/runtime/src/iree/hal/drivers/cuda/pending_queue_actions.c +++ b/runtime/src/iree/hal/drivers/cuda/pending_queue_actions.c @@ -22,6 +22,7 @@ #include "iree/hal/drivers/cuda/event_pool.h" #include "iree/hal/drivers/cuda/event_semaphore.h" #include "iree/hal/drivers/cuda/graph_command_buffer.h" +#include "iree/hal/drivers/cuda/stream_command_buffer.h" #include "iree/hal/drivers/utils/semaphore.h" #include "iree/hal/utils/deferred_command_buffer.h" #include "iree/hal/utils/resource_set.h" @@ -729,12 +730,17 @@ static iree_status_t iree_hal_cuda_pending_queue_actions_issue_execution( action->payload.execution.binding_tables 
? action->payload.execution.binding_tables[i] : iree_hal_buffer_binding_table_empty(); - if (iree_hal_cuda_graph_command_buffer_isa(command_buffer)) { + if (iree_hal_cuda_stream_command_buffer_isa(command_buffer)) { + // Notify that the commands were "submitted" so we can + // make sure to clean up our trace events. + iree_hal_cuda_stream_notify_submitted_commands(command_buffer); + } else if (iree_hal_cuda_graph_command_buffer_isa(command_buffer)) { CUgraphExec exec = iree_hal_cuda_graph_command_buffer_handle(command_buffer); IREE_CUDA_RETURN_AND_END_ZONE_IF_ERROR( z0, symbols, cuGraphLaunch(exec, action->dispatch_cu_stream), "cuGraphLaunch"); + iree_hal_cuda_graph_tracing_notify_submitted_commands(command_buffer); } else { iree_hal_command_buffer_t* stream_command_buffer = NULL; iree_hal_command_buffer_mode_t mode = @@ -753,6 +759,10 @@ static iree_status_t iree_hal_cuda_pending_queue_actions_issue_execution( IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_hal_deferred_command_buffer_apply( command_buffer, stream_command_buffer, binding_table)); + iree_hal_cuda_stream_notify_submitted_commands(stream_command_buffer); + // The stream_command_buffer is going to be retained by + // the action->resource_set and deleted after the action + // completes. iree_hal_resource_release(stream_command_buffer); } } diff --git a/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c b/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c index fcc13ef60e92..8f9cd2d074e4 100644 --- a/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c +++ b/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c @@ -23,6 +23,7 @@ typedef struct iree_hal_cuda_stream_command_buffer_t { // Per-stream CUDA tracing context. 
iree_hal_cuda_tracing_context_t* tracing_context; + iree_hal_cuda_tracing_context_event_list_t tracing_event_list; CUstream cu_stream; @@ -98,6 +99,8 @@ iree_status_t iree_hal_cuda_stream_command_buffer_create( command_buffer->cuda_symbols = cuda_symbols; command_buffer->nccl_symbols = nccl_symbols; command_buffer->tracing_context = tracing_context; + command_buffer->tracing_event_list.head = NULL; + command_buffer->tracing_event_list.tail = NULL; command_buffer->cu_stream = stream; iree_arena_initialize(block_pool, &command_buffer->arena); @@ -122,6 +125,9 @@ static void iree_hal_cuda_stream_command_buffer_destroy( iree_allocator_t host_allocator = command_buffer->host_allocator; IREE_TRACE_ZONE_BEGIN(z0); + iree_hal_cuda_tracing_free(command_buffer->tracing_context, + &command_buffer->tracing_event_list); + iree_hal_collective_batch_deinitialize(&command_buffer->collective_batch); iree_hal_resource_set_free(command_buffer->resource_set); iree_arena_deinitialize(&command_buffer->arena); @@ -136,6 +142,18 @@ bool iree_hal_cuda_stream_command_buffer_isa( &iree_hal_cuda_stream_command_buffer_vtable); } +void iree_hal_cuda_stream_notify_submitted_commands( + iree_hal_command_buffer_t* base_command_buffer) { + iree_hal_cuda_stream_command_buffer_t* command_buffer = + iree_hal_cuda_stream_command_buffer_cast(base_command_buffer); + if (!command_buffer->tracing_context) { + return; + } + + iree_hal_cuda_tracing_notify_submitted(command_buffer->tracing_context, + &command_buffer->tracing_event_list); +} + // Flushes any pending batched collective operations. // Must be called before any other non-collective nodes are added to the graph // or a barrier is encountered. 
@@ -151,7 +169,8 @@ static iree_status_t iree_hal_cuda_stream_command_buffer_flush_collectives( IREE_TRACE_ZONE_BEGIN(z0); iree_status_t status = iree_hal_cuda_nccl_submit_batch( command_buffer->nccl_symbols, command_buffer->tracing_context, - &command_buffer->collective_batch, command_buffer->cu_stream); + &command_buffer->tracing_event_list, &command_buffer->collective_batch, + command_buffer->cu_stream); iree_hal_collective_batch_clear(&command_buffer->collective_batch); IREE_TRACE_ZONE_END(z0); return status; @@ -164,7 +183,8 @@ static iree_status_t iree_hal_cuda_stream_command_buffer_begin( (void)command_buffer; IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( - command_buffer->tracing_context, command_buffer->cu_stream, + command_buffer->tracing_context, &command_buffer->tracing_event_list, + command_buffer->cu_stream, /*file_name=*/NULL, 0, /*line=*/0, "iree_hal_cuda_stream_command_buffer", strlen("iree_hal_cuda_stream_command_buffer"), /*name=*/NULL, 0); @@ -200,6 +220,7 @@ static iree_status_t iree_hal_cuda_stream_command_buffer_end( &command_buffer->collective_batch); IREE_CUDA_STREAM_TRACE_ZONE_END(command_buffer->tracing_context, + &command_buffer->tracing_event_list, command_buffer->cu_stream); IREE_TRACE_ZONE_END(z0); @@ -215,10 +236,10 @@ static void iree_hal_cuda_stream_command_buffer_begin_debug_group( (void)command_buffer; IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( - command_buffer->tracing_context, command_buffer->cu_stream, - location ? location->file.data : NULL, location ? location->file.size : 0, - location ? location->line : 0, /*func_name=*/NULL, 0, label.data, - label.size); + command_buffer->tracing_context, &command_buffer->tracing_event_list, + command_buffer->cu_stream, location ? location->file.data : NULL, + location ? location->file.size : 0, location ? location->line : 0, + /*func_name=*/NULL, 0, label.data, label.size); // TODO: pass along to CUPTI if available. 
} @@ -232,6 +253,7 @@ static void iree_hal_cuda_stream_command_buffer_end_debug_group( // TODO: pass along to CUPTI if available. IREE_CUDA_STREAM_TRACE_ZONE_END(command_buffer->tracing_context, + &command_buffer->tracing_event_list, command_buffer->cu_stream); } @@ -528,10 +550,10 @@ static iree_status_t iree_hal_cuda_stream_command_buffer_dispatch( executable, entry_point, &kernel_info)); IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( - command_buffer->tracing_context, command_buffer->cu_stream, - kernel_info.source_filename.data, kernel_info.source_filename.size, - kernel_info.source_line, kernel_info.function_name.data, - kernel_info.function_name.size, + command_buffer->tracing_context, &command_buffer->tracing_event_list, + command_buffer->cu_stream, kernel_info.source_filename.data, + kernel_info.source_filename.size, kernel_info.source_line, + kernel_info.function_name.data, kernel_info.function_name.size, /*name=*/NULL, 0); IREE_RETURN_AND_END_ZONE_IF_ERROR( @@ -614,6 +636,7 @@ static iree_status_t iree_hal_cuda_stream_command_buffer_dispatch( "cuLaunchKernel"); IREE_CUDA_STREAM_TRACE_ZONE_END(command_buffer->tracing_context, + &command_buffer->tracing_event_list, command_buffer->cu_stream); IREE_TRACE_ZONE_END(z0); diff --git a/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.h b/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.h index 5ab4a11f08c4..47a9fdbaaa56 100644 --- a/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.h +++ b/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.h @@ -44,6 +44,12 @@ iree_status_t iree_hal_cuda_stream_command_buffer_create( bool iree_hal_cuda_stream_command_buffer_isa( iree_hal_command_buffer_t* command_buffer); +// This is to be called after a command buffer has been submitted +// in order to notify the tracing system that there are events +// to collect. 
+void iree_hal_cuda_stream_notify_submitted_commands( + iree_hal_command_buffer_t* base_command_buffer); + #ifdef __cplusplus } // extern "C" #endif // __cplusplus diff --git a/runtime/src/iree/hal/drivers/cuda/tracing.c b/runtime/src/iree/hal/drivers/cuda/tracing.c index 02d4b202f74b..68df0521aca6 100644 --- a/runtime/src/iree/hal/drivers/cuda/tracing.c +++ b/runtime/src/iree/hal/drivers/cuda/tracing.c @@ -16,8 +16,36 @@ // To prevent spilling pages we leave some room for the context structure. #define IREE_HAL_CUDA_TRACING_DEFAULT_QUERY_CAPACITY (16 * 1024 - 256) +// iree_hal_cuda_tracing_context_event_t contains a CUevent that is used to +// record timestamps for tracing GPU execution. In this struct, there are also +// two linked lists that the current event may be added to during its lifetime. +// +// --------------------->---Submissions--->---------- +// \ \ \ +// \ \ \ +// command_buffer command_buffer command_buffer +// +// The submission list is owned by the tracing context and elements are +// inserted and removed as command_buffers are submitted and when they +// complete. This is a list of the head elements for each command buffer. +// The command buffer list is owned by the command buffer. It is the list of +// events used to trace command buffer dispatches. +// +// When the event is in the freelist, next_submission should be null, and +// we reuse next_in_command_buffer to track the next free event. +// +// When the event is grabbed from the freelist to track GPU executions, +// it is added to the list in recording command_buffer.
+struct iree_hal_cuda_tracing_context_event_t { + CUevent event; + iree_hal_cuda_tracing_context_event_t* next_in_command_buffer; + iree_hal_cuda_tracing_context_event_t* next_submission; + bool was_submitted; +}; + struct iree_hal_cuda_tracing_context_t { const iree_hal_cuda_dynamic_symbols_t* symbols; + iree_slim_mutex_t event_mutex; CUstream stream; iree_arena_block_pool_t* block_pool; @@ -32,13 +60,32 @@ struct iree_hal_cuda_tracing_context_t { // between events and we need a stable base event. CUevent base_event; - // Indices into |event_pool| defining a ringbuffer. - uint32_t query_head; - uint32_t query_tail; + // Unallocated event list head. next_in_command_buffer points to the next + // available event. + iree_hal_cuda_tracing_context_event_t* event_freelist_head; + + // Submitted events. + iree_hal_cuda_tracing_context_event_list_t submitted_event_list; + uint32_t query_capacity; // Event pool reused to capture tracing timestamps. - CUevent event_pool[IREE_HAL_CUDA_TRACING_DEFAULT_QUERY_CAPACITY]; + // The lifetime of the events are as follows. + // 1) All events are allocated when the tracing context is created. + // 2) When a command_buffer inserts a query via: + // iree_hal_cuda_**_tracing_context_insert_query + // an event is pulled from the event freelist and added to the + // command buffer. + // 3) When a command buffer is dispatched and + // iree_hal_cuda_tracing_notify_submitted is called, the events + // for that command buffer are added to the submitted_event_list. + // 4) When the command buffer completes iree_hal_cuda_tracing_context_collect + // is called, and the events are removed from submitted_event_list as + // we collect their values. + // 5) When the command buffer is destroyed, all events are put at the front + // of event_freelist. 
+ iree_hal_cuda_tracing_context_event_t + event_pool[IREE_HAL_CUDA_TRACING_DEFAULT_QUERY_CAPACITY]; }; static iree_status_t iree_hal_cuda_tracing_context_initial_calibration( @@ -89,6 +136,9 @@ iree_status_t iree_hal_cuda_tracing_context_allocate( context->block_pool = block_pool; context->host_allocator = host_allocator; context->query_capacity = IREE_ARRAYSIZE(context->event_pool); + context->submitted_event_list.head = NULL; + context->submitted_event_list.tail = NULL; + iree_slim_mutex_initialize(&context->event_mutex); } // Pre-allocate all events in the event pool. @@ -97,10 +147,21 @@ iree_status_t iree_hal_cuda_tracing_context_allocate( z_event_pool, "iree_hal_cuda_tracing_context_allocate_event_pool"); IREE_TRACE_ZONE_APPEND_VALUE_I64(z_event_pool, (int64_t)context->query_capacity); + context->event_freelist_head = &context->event_pool[0]; for (iree_host_size_t i = 0; i < context->query_capacity; ++i) { status = IREE_CURESULT_TO_STATUS( - symbols, cuEventCreate(&context->event_pool[i], CU_EVENT_DEFAULT)); + symbols, + cuEventCreate(&context->event_pool[i].event, CU_EVENT_DEFAULT)); if (!iree_status_is_ok(status)) break; + if (i > 0) { + context->event_pool[i - 1].next_in_command_buffer = + &context->event_pool[i]; + } + context->event_pool[i].next_submission = NULL; + context->event_pool[i].was_submitted = false; + if (i + 1 == context->query_capacity) { + context->event_pool[i].next_in_command_buffer = NULL; + } } IREE_TRACE_ZONE_END(z_event_pool); } @@ -149,9 +210,9 @@ void iree_hal_cuda_tracing_context_free( IREE_TRACE_ZONE_BEGIN_NAMED(z_event_pool, "iree_hal_cuda_tracing_context_free_event_pool"); for (iree_host_size_t i = 0; i < context->query_capacity; ++i) { - if (context->event_pool[i]) { + if (context->event_pool[i].event) { IREE_CUDA_IGNORE_ERROR(context->symbols, - cuEventDestroy(context->event_pool[i])); + cuEventDestroy(context->event_pool[i].event)); } } IREE_TRACE_ZONE_END(z_event_pool); @@ -160,6 +221,8 @@ void 
iree_hal_cuda_tracing_context_free( cuEventDestroy(context->base_event)); } + iree_slim_mutex_deinitialize(&context->event_mutex); + iree_allocator_t host_allocator = context->host_allocator; iree_allocator_free(host_allocator, context); @@ -169,31 +232,31 @@ void iree_hal_cuda_tracing_context_free( void iree_hal_cuda_tracing_context_collect( iree_hal_cuda_tracing_context_t* context) { if (!context) return; - if (context->query_tail == context->query_head) { - // No outstanding queries. + iree_slim_mutex_lock(&context->event_mutex); + + // No outstanding queries + if (!context->submitted_event_list.head) { + iree_slim_mutex_unlock(&context->event_mutex); return; } IREE_TRACE_ZONE_BEGIN(z0); - - while (context->query_tail != context->query_head) { - // Compute the contiguous range of queries ready to be read. - // If the ringbuffer wraps around we'll handle that in the next loop. - uint32_t try_query_count = - context->query_head < context->query_tail - ? context->query_capacity - context->query_tail - : context->query_head - context->query_tail; - IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)try_query_count); - - // Scan and feed the times to tracy, stopping when we hit the first - // unavailable query. - uint32_t query_base = context->query_tail; - uint32_t read_query_count = 0; - for (uint32_t i = 0; i < try_query_count; ++i) { - // Ensure the event has completed; will return CUDA_ERROR_NOT_READY if - // recorded but not retired or any other deferred error. - uint16_t query_id = (uint16_t)(query_base + i); - CUevent query_event = context->event_pool[query_id]; - CUresult result = context->symbols->cuEventQuery(query_event); + // submitted_event_list is a list of the head elements for each command + // buffer that has been submitted. Here we loop over all of the events, + // wait for them to complete and gather the results with cuEventQuery. 
+ + iree_hal_cuda_tracing_context_event_t* events = + context->submitted_event_list.head; + uint32_t read_query_count = 0; + // Outer per-command_buffer loop. + while (events) { + iree_hal_cuda_tracing_context_event_t* event = events; + // Inner per-event loop. + while (event) { + uint32_t query_id = (uint32_t)(event - &context->event_pool[0]); + + CUresult result = context->symbols->cuEventSynchronize(event->event); + if (result != CUDA_SUCCESS) break; + result = context->symbols->cuEventQuery(event->event); if (result != CUDA_SUCCESS) break; // Calculate context-relative time and notify tracy. @@ -201,66 +264,157 @@ void iree_hal_cuda_tracing_context_collect( IREE_CUDA_IGNORE_ERROR( context->symbols, cuEventElapsedTime(&relative_millis, context->base_event, - query_event)); + event->event)); int64_t gpu_timestamp = (int64_t)((double)relative_millis * 1000000.0); iree_tracing_gpu_zone_notify(context->id, query_id, gpu_timestamp); - read_query_count = i + 1; + read_query_count += 1; + event = event->next_in_command_buffer; } - IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)read_query_count); + iree_hal_cuda_tracing_context_event_t* next = events->next_submission; + events->was_submitted = true; + events = next; + context->submitted_event_list.head = events; + } + IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)read_query_count); + IREE_TRACE_ZONE_END(z0); + iree_slim_mutex_unlock(&context->event_mutex); +} + +void iree_hal_cuda_tracing_notify_submitted( + iree_hal_cuda_tracing_context_t* context, + iree_hal_cuda_tracing_context_event_list_t* event_list) { + if (!context) return; + IREE_ASSERT_ARGUMENT(event_list); + iree_slim_mutex_lock(&context->event_mutex); + + if (!event_list->head) { + iree_slim_mutex_unlock(&context->event_mutex); + return; + } + + if (!context->submitted_event_list.head) { + context->submitted_event_list.head = event_list->head; + context->submitted_event_list.tail = event_list->head; + } else { + 
context->submitted_event_list.tail->next_submission = event_list->head; + context->submitted_event_list.tail = event_list->head; + } - context->query_tail += read_query_count; - if (context->query_tail >= context->query_capacity) { - context->query_tail = 0; + iree_slim_mutex_unlock(&context->event_mutex); +} + +void iree_hal_cuda_tracing_free( + iree_hal_cuda_tracing_context_t* context, + iree_hal_cuda_tracing_context_event_list_t* event_list) { + if (!context) return; + iree_slim_mutex_lock(&context->event_mutex); + + IREE_ASSERT_ARGUMENT(event_list); + + if (!event_list->head) { + iree_slim_mutex_unlock(&context->event_mutex); + return; + } + + // Free an event list that was previously created. There is some book-keeping + // to keep tracy happy, and then we remove the elements from the + // passed in event_list and add them to the front of the free-list. + + // If this event list has never been submitted we still need to add values to + // the timeline otherwise tracy will not behave correctly. 
+ if (!event_list->head->was_submitted) { + iree_hal_cuda_tracing_context_event_t* event = event_list->head; + while (event) { + uint32_t query_id = (uint32_t)(event - &context->event_pool[0]); + iree_tracing_gpu_zone_notify(context->id, query_id, 0); + event = event->next_in_command_buffer; } } - IREE_TRACE_ZONE_END(z0); + if (!context->event_freelist_head) { + context->event_freelist_head = event_list->head; + iree_slim_mutex_unlock(&context->event_mutex); + return; + } + event_list->head->next_submission = NULL; + event_list->head->was_submitted = false; + event_list->tail->next_in_command_buffer = context->event_freelist_head; + context->event_freelist_head = event_list->head; + + event_list->head = NULL; + event_list->tail = NULL; + iree_slim_mutex_unlock(&context->event_mutex); } +static void iree_hal_cuda_tracing_context_event_list_append_event( + iree_hal_cuda_tracing_context_event_list_t* event_list, + iree_hal_cuda_tracing_context_event_t* event) { + if (!event_list->head) { + event_list->head = event; + event_list->tail = event; + } else { + event_list->tail->next_in_command_buffer = event; + event_list->tail = event; + } +} + +// Grabs the next available query out of the freelist and adds it to +// the event_list that was passed in. Also starts the recording of the +// event. static uint16_t iree_hal_cuda_stream_tracing_context_insert_query( - iree_hal_cuda_tracing_context_t* context, CUstream stream) { + iree_hal_cuda_tracing_context_t* context, + iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream) { + iree_slim_mutex_lock(&context->event_mutex); + IREE_ASSERT_ARGUMENT(event_list); + // Allocate an event from the pool for use by the query. - uint32_t query_id = context->query_head; - context->query_head = (context->query_head + 1) % context->query_capacity; + // TODO: If we have run out of our freelist, then we need to try and recover + // allocate events. 
+ iree_hal_cuda_tracing_context_event_t* event = context->event_freelist_head; + context->event_freelist_head = event->next_in_command_buffer; + uint32_t query_id = event - &context->event_pool[0]; + IREE_ASSERT(event->next_in_command_buffer != NULL); + event->next_in_command_buffer = NULL; - // TODO: check to see if the read and write heads of the ringbuffer have - // overlapped. If they have we could try to collect but it's not guaranteed - // that collection will complete (e.g. we may be reserving events for use in - // graphs that haven't yet been launched). - // - // For now we just allow the overlap and tracing results will be inconsistent. - IREE_ASSERT_NE(context->query_head, context->query_tail); + IREE_CUDA_IGNORE_ERROR(context->symbols, cuEventRecord(event->event, stream)); - CUevent event = context->event_pool[query_id]; - IREE_CUDA_IGNORE_ERROR(context->symbols, cuEventRecord(event, stream)); + iree_hal_cuda_tracing_context_event_list_append_event(event_list, event); + iree_slim_mutex_unlock(&context->event_mutex); return query_id; } +// Grabs the next available query out of the freelist and adds it to +// the event_list that was passed in. Also inserts the event record +// node into the passed in graph. It returns the index of the +// event. static uint16_t iree_hal_cuda_graph_tracing_context_insert_query( - iree_hal_cuda_tracing_context_t* context, CUgraphNode* out_node, - CUgraph graph, CUgraphNode* dependency_nodes, + iree_hal_cuda_tracing_context_t* context, + iree_hal_cuda_tracing_context_event_list_t* event_list, + CUgraphNode* out_node, CUgraph graph, CUgraphNode* dependency_nodes, size_t dependency_nodes_count) { + IREE_ASSERT_ARGUMENT(event_list); + iree_slim_mutex_lock(&context->event_mutex); + // Allocate an event from the pool for use by the query. 
- uint32_t query_id = context->query_head; - context->query_head = (context->query_head + 1) % context->query_capacity; - - // TODO: check to see if the read and write heads of the ringbuffer have - // overlapped. If they have we could try to collect but it's not guaranteed - // that collection will complete (e.g. we may be reserving events for use in - // graphs that haven't yet been launched). - // - // For now we just allow the overlap and tracing results will be inconsistent. - IREE_ASSERT_NE(context->query_head, context->query_tail); - - CUevent event = context->event_pool[query_id]; + // TODO: If we have run out of our freelist, then we need to try and recover + // or allocate more events. + iree_hal_cuda_tracing_context_event_t* event = context->event_freelist_head; + context->event_freelist_head = event->next_in_command_buffer; + uint32_t query_id = event - &context->event_pool[0]; + IREE_ASSERT(event->next_in_command_buffer != NULL); + event->next_in_command_buffer = NULL; + iree_status_t status = IREE_CURESULT_TO_STATUS( context->symbols, cuGraphAddEventRecordNode(out_node, graph, dependency_nodes, - dependency_nodes_count, event)); + dependency_nodes_count, event->event)); IREE_ASSERT(iree_status_is_ok(status)); + iree_hal_cuda_tracing_context_event_list_append_event(event_list, event); + + iree_slim_mutex_unlock(&context->event_mutex); return query_id; } @@ -270,56 +424,63 @@ static uint16_t iree_hal_cuda_graph_tracing_context_insert_query( // using the differences between them. 
void iree_hal_cuda_stream_tracing_zone_begin_impl( - iree_hal_cuda_tracing_context_t* context, CUstream stream, + iree_hal_cuda_tracing_context_t* context, + iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream, const iree_tracing_location_t* src_loc) { IREE_ASSERT_ARGUMENT(context); - uint16_t query_id = - iree_hal_cuda_stream_tracing_context_insert_query(context, stream); + uint16_t query_id = iree_hal_cuda_stream_tracing_context_insert_query( + context, event_list, stream); iree_tracing_gpu_zone_begin(context->id, query_id, src_loc); } void iree_hal_cuda_stream_tracing_zone_begin_external_impl( - iree_hal_cuda_tracing_context_t* context, CUstream stream, + iree_hal_cuda_tracing_context_t* context, + iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream, const char* file_name, size_t file_name_length, uint32_t line, const char* function_name, size_t function_name_length, const char* name, size_t name_length) { IREE_ASSERT_ARGUMENT(context); - uint16_t query_id = - iree_hal_cuda_stream_tracing_context_insert_query(context, stream); + uint16_t query_id = iree_hal_cuda_stream_tracing_context_insert_query( + context, event_list, stream); iree_tracing_gpu_zone_begin_external(context->id, query_id, file_name, file_name_length, line, function_name, function_name_length, name, name_length); } void iree_hal_cuda_graph_tracing_zone_begin_external_impl( - iree_hal_cuda_tracing_context_t* context, CUgraphNode* out_node, - CUgraph graph, CUgraphNode* dependency_nodes, size_t dependency_nodes_count, - const char* file_name, size_t file_name_length, uint32_t line, - const char* function_name, size_t function_name_length, const char* name, - size_t name_length) { + iree_hal_cuda_tracing_context_t* context, + iree_hal_cuda_tracing_context_event_list_t* event_list, + CUgraphNode* out_node, CUgraph graph, CUgraphNode* dependency_nodes, + size_t dependency_nodes_count, const char* file_name, + size_t file_name_length, uint32_t line, const char* 
function_name, + size_t function_name_length, const char* name, size_t name_length) { if (!context) return; uint16_t query_id = iree_hal_cuda_graph_tracing_context_insert_query( - context, out_node, graph, dependency_nodes, dependency_nodes_count); + context, event_list, out_node, graph, dependency_nodes, + dependency_nodes_count); iree_tracing_gpu_zone_begin_external(context->id, query_id, file_name, file_name_length, line, function_name, function_name_length, name, name_length); } void iree_hal_cuda_stream_tracing_zone_end_impl( - iree_hal_cuda_tracing_context_t* context, CUstream stream) { + iree_hal_cuda_tracing_context_t* context, + iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream) { if (!context) return; - uint16_t query_id = - iree_hal_cuda_stream_tracing_context_insert_query(context, stream); + uint16_t query_id = iree_hal_cuda_stream_tracing_context_insert_query( + context, event_list, stream); iree_tracing_gpu_zone_end(context->id, query_id); } void iree_hal_cuda_graph_tracing_zone_end_impl( - iree_hal_cuda_tracing_context_t* context, CUgraphNode* out_node, - CUgraph graph, CUgraphNode* dependency_nodes, + iree_hal_cuda_tracing_context_t* context, + iree_hal_cuda_tracing_context_event_list_t* event_list, + CUgraphNode* out_node, CUgraph graph, CUgraphNode* dependency_nodes, size_t dependency_nodes_count) { if (!context) return; uint16_t query_id = iree_hal_cuda_graph_tracing_context_insert_query( - context, out_node, graph, dependency_nodes, dependency_nodes_count); + context, event_list, out_node, graph, dependency_nodes, + dependency_nodes_count); iree_tracing_gpu_zone_end(context->id, query_id); } @@ -340,4 +501,12 @@ void iree_hal_cuda_tracing_context_free( void iree_hal_cuda_tracing_context_collect( iree_hal_cuda_tracing_context_t* context) {} +void iree_hal_cuda_tracing_notify_submitted( + iree_hal_cuda_tracing_context_t* context, + iree_hal_cuda_tracing_context_event_list_t* event_list) {} + +void iree_hal_cuda_tracing_free( 
+ iree_hal_cuda_tracing_context_t* context, + iree_hal_cuda_tracing_context_event_list_t* event_list) {} + #endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE diff --git a/runtime/src/iree/hal/drivers/cuda/tracing.h b/runtime/src/iree/hal/drivers/cuda/tracing.h index 65630dd132b2..abe468f1c389 100644 --- a/runtime/src/iree/hal/drivers/cuda/tracing.h +++ b/runtime/src/iree/hal/drivers/cuda/tracing.h @@ -42,6 +42,15 @@ extern "C" { // Thread-compatible: external synchronization is required if using from // multiple threads (same as with CUstream itself). typedef struct iree_hal_cuda_tracing_context_t iree_hal_cuda_tracing_context_t; +typedef struct iree_hal_cuda_tracing_context_event_t + iree_hal_cuda_tracing_context_event_t; + +// This is used when tracing is enabled. Calls to dispatch and event related +// functions will update the pointers to keep the list up to date. +typedef struct iree_hal_cuda_tracing_context_event_list_t { + iree_hal_cuda_tracing_context_event_t* head; + iree_hal_cuda_tracing_context_event_t* tail; +} iree_hal_cuda_tracing_context_event_list_t; // Allocates a tracing context for the given CUDA |stream|. // Each context must only be used with the stream it was created for. @@ -62,80 +71,98 @@ void iree_hal_cuda_tracing_context_free( void iree_hal_cuda_tracing_context_collect( iree_hal_cuda_tracing_context_t* context); +// Notifies that the given list of events has been dispatched on to the gpu. +void iree_hal_cuda_tracing_notify_submitted( + iree_hal_cuda_tracing_context_t* context, + iree_hal_cuda_tracing_context_event_list_t* event_list); + +// Frees the events and returns them back into the tracing context. +void iree_hal_cuda_tracing_free( + iree_hal_cuda_tracing_context_t* context, + iree_hal_cuda_tracing_context_event_list_t* event_list); + #if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE // Begins a normal zone derived on the calling |src_loc|.
// Must be perfectly nested and paired with a corresponding zone end. void iree_hal_cuda_stream_tracing_zone_begin_impl( - iree_hal_cuda_tracing_context_t* context, CUstream stream, + iree_hal_cuda_tracing_context_t* context, + iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream, const iree_tracing_location_t* src_loc); // Begins an external zone using the given source information. // The provided strings will be copied into the tracy buffer. void iree_hal_cuda_stream_tracing_zone_begin_external_impl( - iree_hal_cuda_tracing_context_t* context, CUstream stream, + iree_hal_cuda_tracing_context_t* context, + iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream, const char* file_name, size_t file_name_length, uint32_t line, const char* function_name, size_t function_name_length, const char* name, size_t name_length); void iree_hal_cuda_graph_tracing_zone_begin_external_impl( - iree_hal_cuda_tracing_context_t* context, CUgraphNode* out_node, - CUgraph graph, CUgraphNode* dependency_nodes, size_t dependency_nodes_count, - const char* file_name, size_t file_name_length, uint32_t line, - const char* function_name, size_t function_name_length, const char* name, - size_t name_length); + iree_hal_cuda_tracing_context_t* context, + iree_hal_cuda_tracing_context_event_list_t* event_list, + CUgraphNode* out_node, CUgraph graph, CUgraphNode* dependency_nodes, + size_t dependency_nodes_count, const char* file_name, + size_t file_name_length, uint32_t line, const char* function_name, + size_t function_name_length, const char* name, size_t name_length); void iree_hal_cuda_stream_tracing_zone_end_impl( - iree_hal_cuda_tracing_context_t* context, CUstream stream); + iree_hal_cuda_tracing_context_t* context, + iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream); void iree_hal_cuda_graph_tracing_zone_end_impl( - iree_hal_cuda_tracing_context_t* context, CUgraphNode* out_node, - CUgraph graph, CUgraphNode* dependency_nodes, + 
iree_hal_cuda_tracing_context_t* context, + iree_hal_cuda_tracing_context_event_list_t* event_list, + CUgraphNode* out_node, CUgraph graph, CUgraphNode* dependency_nodes, size_t dependency_nodes_count); // Begins a new zone with the parent function name. -#define IREE_CUDA_STREAM_TRACE_ZONE_BEGIN(context, stream) \ +#define IREE_CUDA_STREAM_TRACE_ZONE_BEGIN(context, event_list_begin, \ + event_list_end, stream) \ static const iree_tracing_location_t TracyConcat( \ __tracy_source_location, __LINE__) = {NULL, __FUNCTION__, __FILE__, \ (uint32_t)__LINE__, 0}; \ iree_hal_cuda_stream_tracing_zone_begin_impl( \ - context, stream, &TracyConcat(__tracy_source_location, __LINE__)); + context, event_list_begin, event_list_end, stream, \ + &TracyConcat(__tracy_source_location, __LINE__)); // Begins an externally defined zone with a dynamic source location. // The |file_name|, |function_name|, and optional |name| strings will be copied // into the trace buffer and do not need to persist. -#define IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( \ - context, stream, file_name, file_name_length, line, function_name, \ - function_name_length, name, name_length) \ - iree_hal_cuda_stream_tracing_zone_begin_external_impl( \ - context, stream, file_name, file_name_length, line, function_name, \ - function_name_length, name, name_length) +#define IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( \ + context, event_list, stream, file_name, file_name_length, line, \ + function_name, function_name_length, name, name_length) \ + iree_hal_cuda_stream_tracing_zone_begin_external_impl( \ + context, event_list, stream, file_name, file_name_length, line, \ + function_name, function_name_length, name, name_length) #define IREE_CUDA_GRAPH_TRACE_ZONE_BEGIN_EXTERNAL( \ - context, out_node, graph, dpendency_nodes, dpendency_nodes_count, \ - file_name, file_name_length, line, function_name, function_name_length, \ - name, name_length) \ + context, event_list, out_node, graph, dependency_nodes, \ + 
dependency_nodes_count, file_name, file_name_length, line, function_name, \ + function_name_length, name, name_length) \ iree_hal_cuda_graph_tracing_zone_begin_external_impl( \ - context, out_node, graph, dpendency_nodes, dpendency_nodes_count, \ - file_name, file_name_length, line, function_name, function_name_length, \ - name, name_length) - -#define IREE_CUDA_STREAM_TRACE_ZONE_END(context, stream) \ - iree_hal_cuda_stream_tracing_zone_end_impl(context, stream) -#define IREE_CUDA_GRAPH_TRACE_ZONE_END( \ - context, out_node, graph, dependency_nodes, dependency_nodes_count) \ - iree_hal_cuda_graph_tracing_zone_end_impl( \ - context, out_node, graph, dependency_nodes, dependency_nodes_count) - + context, event_list, out_node, graph, dependency_nodes, \ + dependency_nodes_count, file_name, file_name_length, line, \ + function_name, function_name_length, name, name_length) + +#define IREE_CUDA_STREAM_TRACE_ZONE_END(context, event_list, stream) \ + iree_hal_cuda_stream_tracing_zone_end_impl(context, event_list, stream) +#define IREE_CUDA_GRAPH_TRACE_ZONE_END(context, event_list, out_node, graph, \ + dependency_nodes, \ + dependency_nodes_count) \ + iree_hal_cuda_graph_tracing_zone_end_impl(context, event_list, out_node, \ + graph, dependency_nodes, \ + dependency_nodes_count) #else -#define IREE_CUDA_STREAM_TRACE_ZONE_BEGIN(context, stream) -#define IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( \ - context, stream, file_name, file_name_length, line, function_name, \ +#define IREE_CUDA_STREAM_TRACE_ZONE_BEGIN(context, event_list, stream) +#define IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( \ + context, event_list, stream, file_name, file_name_length, line, \ + function_name, function_name_length, name, name_length) +#define IREE_CUDA_GRAPH_TRACE_ZONE_BEGIN_EXTERNAL( \ + context, event_list, out_node, graph, dependency_nodes, \ + dependency_nodes_count, file_name, file_name_length, line, function_name, \ function_name_length, name, name_length) -#define 
IREE_CUDA_GRAPH_TRACE_ZONE_BEGIN_EXTERNAL( \ - context, out_node, graph, dpendency_nodes, dpendency_nodes_count, \ - file_name, file_name_length, line, function_name, function_name_length, \ - name, name_length) -#define IREE_CUDA_STREAM_TRACE_ZONE_END(context, stream) +#define IREE_CUDA_STREAM_TRACE_ZONE_END(context, event_list, stream) #endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE diff --git a/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c b/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c index bae70fb73a64..65b2c4c9be28 100644 --- a/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c +++ b/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c @@ -34,6 +34,7 @@ typedef struct iree_hal_hip_graph_command_buffer_t { // Per-stream HIP tracing context. iree_hal_hip_tracing_context_t* tracing_context; + iree_hal_hip_tracing_context_event_list_t tracing_event_list; // A resource set to maintain references to all resources used within the // command buffer. @@ -97,10 +98,11 @@ static void iree_hip_graph_command_buffer_trace_zone_begin_external( &command_buffer->hip_graph_nodes[command_buffer->graph_node_count++]; size_t dependency_count = command_buffer->hip_barrier_node ? 1 : 0; IREE_HIP_GRAPH_TRACE_ZONE_BEGIN_EXTERNAL( - command_buffer->tracing_context, tracing_event_node, - command_buffer->hip_graph, &command_buffer->hip_barrier_node, - dependency_count, file_name, file_name_length, line, function_name, - function_name_length, name, name_length); + command_buffer->tracing_context, &command_buffer->tracing_event_list, + tracing_event_node, command_buffer->hip_graph, + &command_buffer->hip_barrier_node, dependency_count, file_name, + file_name_length, line, function_name, function_name_length, name, + name_length); // Move the barrier forward to make sure that the tracing event is recorded // before work starts. 
@@ -122,10 +124,10 @@ static void iree_hip_graph_command_buffer_trace_zone_end( size_t dependency_count = command_buffer->hip_barrier_node ? 1 : 0; IREE_ASSERT_GT(dependency_count, 0, "ending a zone should at least depend on the beginning"); - IREE_HIP_GRAPH_TRACE_ZONE_END(command_buffer->tracing_context, - tracing_event_node, command_buffer->hip_graph, - &command_buffer->hip_barrier_node, - dependency_count); + IREE_HIP_GRAPH_TRACE_ZONE_END( + command_buffer->tracing_context, &command_buffer->tracing_event_list, + tracing_event_node, command_buffer->hip_graph, + &command_buffer->hip_barrier_node, dependency_count); // We need to wait on the tracing end before other work starts. // GPU tracing zones are first-in, last-out. @@ -194,6 +196,8 @@ iree_status_t iree_hal_hip_graph_command_buffer_create( command_buffer->host_allocator = host_allocator; command_buffer->symbols = hip_symbols; command_buffer->tracing_context = tracing_context; + command_buffer->tracing_event_list.head = NULL; + command_buffer->tracing_event_list.tail = NULL; iree_arena_initialize(block_pool, &command_buffer->arena); command_buffer->hip_context = context; command_buffer->hip_graph = NULL; @@ -227,6 +231,9 @@ static void iree_hal_hip_graph_command_buffer_destroy( iree_allocator_t host_allocator = command_buffer->host_allocator; IREE_TRACE_ZONE_BEGIN(z0); + iree_hal_hip_tracing_free(command_buffer->tracing_context, + &command_buffer->tracing_event_list); + // Drop any pending collective batches before we tear things down. 
iree_hal_collective_batch_clear(&command_buffer->collective_batch); @@ -264,6 +271,18 @@ hipGraphExec_t iree_hal_hip_graph_command_buffer_handle( return command_buffer->hip_exec; } +void iree_hal_hip_graph_tracing_notify_submitted_commands( + iree_hal_command_buffer_t* base_command_buffer) { + iree_hal_hip_graph_command_buffer_t* command_buffer = + iree_hal_hip_graph_command_buffer_cast(base_command_buffer); + if (!command_buffer->tracing_context) { + return; + } + + iree_hal_hip_tracing_notify_submitted(command_buffer->tracing_context, + &command_buffer->tracing_event_list); +} + // Flushes any pending batched collective operations. // Must be called before any other non-collective nodes are added to the graph // or a barrier is encountered. @@ -321,7 +340,11 @@ static iree_status_t iree_hal_hip_graph_command_buffer_begin( hipGraphCreate(&command_buffer->hip_graph, /*flags=*/0), "hipGraphCreate"); - IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer); + IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( + command_buffer, + /*file_name=*/NULL, 0, /*line=*/0, "iree_hal_hip_graph_command_buffer", + strlen("iree_hal_hip_graph_command_buffer"), + /*name=*/NULL, 0); return iree_ok_status(); } diff --git a/runtime/src/iree/hal/drivers/hip/graph_command_buffer.h b/runtime/src/iree/hal/drivers/hip/graph_command_buffer.h index 7235d3832cee..9701ee86ea9e 100644 --- a/runtime/src/iree/hal/drivers/hip/graph_command_buffer.h +++ b/runtime/src/iree/hal/drivers/hip/graph_command_buffer.h @@ -45,6 +45,11 @@ bool iree_hal_hip_graph_command_buffer_isa( hipGraphExec_t iree_hal_hip_graph_command_buffer_handle( iree_hal_command_buffer_t* command_buffer); +// This is to be called after the given |command_buffer| has been submitted +// in order to notify the tracing system that there are events to collect. 
+void iree_hal_hip_graph_tracing_notify_submitted_commands( + iree_hal_command_buffer_t* command_buffer); + #ifdef __cplusplus } // extern "C" #endif // __cplusplus diff --git a/runtime/src/iree/hal/drivers/hip/pending_queue_actions.c b/runtime/src/iree/hal/drivers/hip/pending_queue_actions.c index 7fe40b5601c0..fb02bc17d50d 100644 --- a/runtime/src/iree/hal/drivers/hip/pending_queue_actions.c +++ b/runtime/src/iree/hal/drivers/hip/pending_queue_actions.c @@ -730,16 +730,20 @@ static iree_status_t iree_hal_hip_pending_queue_actions_issue_execution( ? action->payload.execution.binding_tables[i] : iree_hal_buffer_binding_table_empty(); if (iree_hal_hip_stream_command_buffer_isa(command_buffer)) { - // Nothing to do for an inline command buffer; all the work has already - // been submitted. When we support semaphores we'll still need to signal - // their completion but do not have to worry about any waits: if there - // were waits we wouldn't have been able to execute inline! + // Nothing much to do for an inline command buffer; all the work has + // already been submitted. When we support semaphores we'll still need to + // signal their completion but do not have to worry about any waits: if + // there were waits we wouldn't have been able to execute inline! We do + // notify that the commands were "submitted" so we can make sure to clean + // up our trace events. 
+ iree_hal_hip_stream_notify_submitted_commands(command_buffer); } else if (iree_hal_hip_graph_command_buffer_isa(command_buffer)) { hipGraphExec_t exec = iree_hal_hip_graph_command_buffer_handle(command_buffer); IREE_HIP_RETURN_AND_END_ZONE_IF_ERROR( z0, symbols, hipGraphLaunch(exec, action->dispatch_hip_stream), "hipGraphLaunch"); + iree_hal_hip_graph_tracing_notify_submitted_commands(command_buffer); } else { iree_hal_command_buffer_t* stream_command_buffer = NULL; iree_hal_command_buffer_mode_t mode = @@ -758,6 +762,10 @@ static iree_status_t iree_hal_hip_pending_queue_actions_issue_execution( IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_hal_deferred_command_buffer_apply( command_buffer, stream_command_buffer, binding_table)); + iree_hal_hip_stream_notify_submitted_commands(stream_command_buffer); + // The stream_command_buffer is going to be retained by + // the action->resource_set and deleted after the action + // completes. iree_hal_resource_release(stream_command_buffer); } } diff --git a/runtime/src/iree/hal/drivers/hip/rccl_channel.c b/runtime/src/iree/hal/drivers/hip/rccl_channel.c index 4854d0fb0677..e3c38a21bf7c 100644 --- a/runtime/src/iree/hal/drivers/hip/rccl_channel.c +++ b/runtime/src/iree/hal/drivers/hip/rccl_channel.c @@ -576,6 +576,7 @@ static iree_status_t iree_hal_hip_nccl_submit_batch_entry( iree_status_t iree_hal_hip_nccl_submit_batch( const iree_hal_hip_nccl_dynamic_symbols_t* symbols, iree_hal_hip_tracing_context_t* tracing_context, + iree_hal_hip_tracing_context_event_list_t* tracing_event_list, const iree_hal_collective_batch_t* batch, hipStream_t stream) { IREE_ASSERT_ARGUMENT(symbols); IREE_ASSERT_ARGUMENT(batch); @@ -592,9 +593,9 @@ iree_status_t iree_hal_hip_nccl_submit_batch( iree_string_view_t collective_str = iree_hal_collective_op_format(&entry->op, &string_temp); IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( - tracing_context, stream, __FILE__, strlen(__FILE__), (uint32_t)__LINE__, - __FUNCTION__, strlen(__FUNCTION__), 
collective_str.data, - collective_str.size); + tracing_context, tracing_event_list, stream, __FILE__, strlen(__FILE__), + (uint32_t)__LINE__, __FUNCTION__, strlen(__FUNCTION__), + collective_str.data, collective_str.size); } #endif // IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE @@ -611,7 +612,8 @@ iree_status_t iree_hal_hip_nccl_submit_batch( // order doesn't matter so long as we end the right number of zones. IREE_TRACE({ for (iree_host_size_t i = 0; i < batch->count; ++i) { - IREE_HIP_STREAM_TRACE_ZONE_END(tracing_context, stream); + IREE_HIP_STREAM_TRACE_ZONE_END(tracing_context, tracing_event_list, + stream); } }); diff --git a/runtime/src/iree/hal/drivers/hip/rccl_channel.h b/runtime/src/iree/hal/drivers/hip/rccl_channel.h index 2421dc17d29f..366cf2fd565d 100644 --- a/runtime/src/iree/hal/drivers/hip/rccl_channel.h +++ b/runtime/src/iree/hal/drivers/hip/rccl_channel.h @@ -49,6 +49,7 @@ iree_status_t iree_hal_hip_nccl_channel_create( iree_status_t iree_hal_hip_nccl_submit_batch( const iree_hal_hip_nccl_dynamic_symbols_t* nccl_symbols, iree_hal_hip_tracing_context_t* tracing_context, + iree_hal_hip_tracing_context_event_list_t* tracing_event_list, const iree_hal_collective_batch_t* batch, hipStream_t stream); #ifdef __cplusplus diff --git a/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c b/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c index ede6d8cfabb9..a250299ca62b 100644 --- a/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c +++ b/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c @@ -25,6 +25,7 @@ typedef struct iree_hal_hip_stream_command_buffer_t { // Per-stream HIP tracing context. 
iree_hal_hip_tracing_context_t* tracing_context; + iree_hal_hip_tracing_context_event_list_t tracing_event_list; hipStream_t hip_stream; @@ -98,6 +99,8 @@ iree_status_t iree_hal_hip_stream_command_buffer_create( command_buffer->hip_symbols = hip_symbols; command_buffer->nccl_symbols = nccl_symbols; command_buffer->tracing_context = tracing_context; + command_buffer->tracing_event_list.head = NULL; + command_buffer->tracing_event_list.tail = NULL; command_buffer->hip_stream = stream; iree_arena_initialize(block_pool, &command_buffer->arena); @@ -122,6 +125,9 @@ static void iree_hal_hip_stream_command_buffer_destroy( iree_allocator_t host_allocator = command_buffer->host_allocator; IREE_TRACE_ZONE_BEGIN(z0); + iree_hal_hip_tracing_free(command_buffer->tracing_context, + &command_buffer->tracing_event_list); + iree_hal_collective_batch_deinitialize(&command_buffer->collective_batch); iree_hal_resource_set_free(command_buffer->resource_set); iree_arena_deinitialize(&command_buffer->arena); @@ -136,6 +142,18 @@ bool iree_hal_hip_stream_command_buffer_isa( &iree_hal_hip_stream_command_buffer_vtable); } +void iree_hal_hip_stream_notify_submitted_commands( + iree_hal_command_buffer_t* base_command_buffer) { + iree_hal_hip_stream_command_buffer_t* command_buffer = + iree_hal_hip_stream_command_buffer_cast(base_command_buffer); + if (!command_buffer->tracing_context) { + return; + } + + iree_hal_hip_tracing_notify_submitted(command_buffer->tracing_context, + &command_buffer->tracing_event_list); +} + // Flushes any pending batched collective operations. // Must be called before any other non-collective nodes are added to the graph // or a barrier is encountered. 
@@ -151,7 +169,8 @@ static iree_status_t iree_hal_hip_stream_command_buffer_flush_collectives( IREE_TRACE_ZONE_BEGIN(z0); iree_status_t status = iree_hal_hip_nccl_submit_batch( command_buffer->nccl_symbols, command_buffer->tracing_context, - &command_buffer->collective_batch, command_buffer->hip_stream); + &command_buffer->tracing_event_list, &command_buffer->collective_batch, + command_buffer->hip_stream); iree_hal_collective_batch_clear(&command_buffer->collective_batch); IREE_TRACE_ZONE_END(z0); return status; @@ -164,7 +183,8 @@ static iree_status_t iree_hal_hip_stream_command_buffer_begin( (void)command_buffer; IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( - command_buffer->tracing_context, command_buffer->hip_stream, + command_buffer->tracing_context, &command_buffer->tracing_event_list, + command_buffer->hip_stream, /*file_name=*/NULL, 0, /*line=*/0, "iree_hal_hip_stream_command_buffer", strlen("iree_hal_hip_stream_command_buffer"), /*name=*/NULL, 0); @@ -194,6 +214,7 @@ static iree_status_t iree_hal_hip_stream_command_buffer_end( &command_buffer->resource_set)); IREE_HIP_STREAM_TRACE_ZONE_END(command_buffer->tracing_context, + &command_buffer->tracing_event_list, command_buffer->hip_stream); IREE_TRACE_ZONE_END(z0); @@ -209,10 +230,10 @@ static void iree_hal_hip_stream_command_buffer_begin_debug_group( (void)command_buffer; IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( - command_buffer->tracing_context, command_buffer->hip_stream, - location ? location->file.data : NULL, location ? location->file.size : 0, - location ? location->line : 0, /*func_name=*/NULL, 0, label.data, - label.size); + command_buffer->tracing_context, &command_buffer->tracing_event_list, + command_buffer->hip_stream, location ? location->file.data : NULL, + location ? location->file.size : 0, location ? 
location->line : 0, + /*func_name=*/NULL, 0, label.data, label.size); } static void iree_hal_hip_stream_command_buffer_end_debug_group( @@ -222,6 +243,7 @@ static void iree_hal_hip_stream_command_buffer_end_debug_group( (void)command_buffer; IREE_HIP_STREAM_TRACE_ZONE_END(command_buffer->tracing_context, + &command_buffer->tracing_event_list, command_buffer->hip_stream); } @@ -519,10 +541,10 @@ static iree_status_t iree_hal_hip_stream_command_buffer_dispatch( executable, entry_point, &kernel_info)); IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( - command_buffer->tracing_context, command_buffer->hip_stream, - kernel_info.source_filename.data, kernel_info.source_filename.size, - kernel_info.source_line, kernel_info.function_name.data, - kernel_info.function_name.size, + command_buffer->tracing_context, &command_buffer->tracing_event_list, + command_buffer->hip_stream, kernel_info.source_filename.data, + kernel_info.source_filename.size, kernel_info.source_line, + kernel_info.function_name.data, kernel_info.function_name.size, /*name=*/NULL, 0); IREE_RETURN_AND_END_ZONE_IF_ERROR( @@ -594,6 +616,7 @@ static iree_status_t iree_hal_hip_stream_command_buffer_dispatch( "hipModuleLaunchKernel"); IREE_HIP_STREAM_TRACE_ZONE_END(command_buffer->tracing_context, + &command_buffer->tracing_event_list, command_buffer->hip_stream); IREE_TRACE_ZONE_END(z0); diff --git a/runtime/src/iree/hal/drivers/hip/stream_command_buffer.h b/runtime/src/iree/hal/drivers/hip/stream_command_buffer.h index e6a1dfc1b4f6..50eddf1d9daa 100644 --- a/runtime/src/iree/hal/drivers/hip/stream_command_buffer.h +++ b/runtime/src/iree/hal/drivers/hip/stream_command_buffer.h @@ -44,6 +44,11 @@ iree_status_t iree_hal_hip_stream_command_buffer_create( bool iree_hal_hip_stream_command_buffer_isa( iree_hal_command_buffer_t* command_buffer); +// This is to be called after a command buffer has been submitted +// in order to notify the tracing system that there are events +// to collect. 
+void iree_hal_hip_stream_notify_submitted_commands( iree_hal_command_buffer_t* base_command_buffer); #ifdef __cplusplus } // extern "C" #endif // __cplusplus diff --git a/runtime/src/iree/hal/drivers/hip/tracing.c b/runtime/src/iree/hal/drivers/hip/tracing.c index ae9d4a11dc49..0fbe2bb2512e 100644 --- a/runtime/src/iree/hal/drivers/hip/tracing.c +++ b/runtime/src/iree/hal/drivers/hip/tracing.c @@ -16,8 +16,36 @@ // To prevent spilling pages we leave some room for the context structure. #define IREE_HAL_HIP_TRACING_DEFAULT_QUERY_CAPACITY (16 * 1024 - 256) +// iree_hal_hip_tracing_context_event_t contains a hipEvent that is used to +// record timestamps for tracing GPU execution. In this struct, there are also +// two linked lists that the current event may be added to during its lifetime. +// +// --------------------->---Submissions--->---------- +// \ \ \ +// \ \ \ +// command_buffer command_buffer command_buffer +// +// The submission list is owned by the tracing context and elements are +// inserted and removed as command_buffers are submitted and when they +// complete. This is a list of the head elements for each command buffer. +// The command buffer list is owned by the command buffer. It is the list of +// events used to trace command buffer dispatches. +// +// When the event is in the freelist, next_submission should be null, and +// we reuse next_in_command_buffer to track the next free event. +// +// When the event is grabbed from the freelist to track GPU executions, +// it is added to the list in the recording command_buffer.
+struct iree_hal_hip_tracing_context_event_t { + hipEvent_t event; + iree_hal_hip_tracing_context_event_t* next_in_command_buffer; + iree_hal_hip_tracing_context_event_t* next_submission; + bool was_submitted; +}; + struct iree_hal_hip_tracing_context_t { const iree_hal_hip_dynamic_symbols_t* symbols; + iree_slim_mutex_t event_mutex; hipStream_t stream; iree_arena_block_pool_t* block_pool; @@ -32,13 +60,32 @@ struct iree_hal_hip_tracing_context_t { // we need a stable base event. hipEvent_t base_event; - // Indices into |event_pool| defining a ringbuffer. - uint32_t query_head; - uint32_t query_tail; + // Unallocated event list head. next_in_command_buffer points to the next + // available event. + iree_hal_hip_tracing_context_event_t* event_freelist_head; + + // Submitted events + iree_hal_hip_tracing_context_event_list_t submitted_event_list; + uint32_t query_capacity; // Event pool reused to capture tracing timestamps. - hipEvent_t event_pool[IREE_HAL_HIP_TRACING_DEFAULT_QUERY_CAPACITY]; + // The lifetime of the events is as follows. + // 1) All events are allocated when the tracing context is created. + // 2) When a command_buffer inserts a query via: + // iree_hal_hip_**_tracing_context_insert_query + // an event is pulled from the event freelist and added to the + // command buffer. + // 3) When a command buffer is dispatched and + // iree_hal_hip_tracing_notify_submitted is called, the events + // for that command buffer are added to the submitted_event_list. + // 4) When the command buffer completes iree_hal_hip_tracing_context_collect + // is called, and the events are removed from submitted_event_list as + // we collect their values. + // 5) When the command buffer is destroyed, all events are put at the front + // of event_freelist.
+ iree_hal_hip_tracing_context_event_t + event_pool[IREE_HAL_HIP_TRACING_DEFAULT_QUERY_CAPACITY]; }; static iree_status_t iree_hal_hip_tracing_context_initial_calibration( @@ -90,6 +137,9 @@ iree_status_t iree_hal_hip_tracing_context_allocate( context->block_pool = block_pool; context->host_allocator = host_allocator; context->query_capacity = IREE_ARRAYSIZE(context->event_pool); + context->submitted_event_list.head = NULL; + context->submitted_event_list.tail = NULL; + iree_slim_mutex_initialize(&context->event_mutex); } // Pre-allocate all events in the event pool. @@ -98,11 +148,21 @@ iree_status_t iree_hal_hip_tracing_context_allocate( z_event_pool, "iree_hal_hip_tracing_context_allocate_event_pool"); IREE_TRACE_ZONE_APPEND_VALUE_I64(z_event_pool, (int64_t)context->query_capacity); + context->event_freelist_head = &context->event_pool[0]; for (iree_host_size_t i = 0; i < context->query_capacity; ++i) { status = IREE_HIP_RESULT_TO_STATUS( - symbols, - hipEventCreateWithFlags(&context->event_pool[i], hipEventDefault)); + symbols, hipEventCreateWithFlags(&context->event_pool[i].event, + hipEventDefault)); if (!iree_status_is_ok(status)) break; + if (i > 0) { + context->event_pool[i - 1].next_in_command_buffer = + &context->event_pool[i]; + } + context->event_pool[i].next_submission = NULL; + context->event_pool[i].was_submitted = false; + if (i + 1 == context->query_capacity) { + context->event_pool[i].next_in_command_buffer = NULL; + } } IREE_TRACE_ZONE_END(z_event_pool); } @@ -152,9 +212,9 @@ void iree_hal_hip_tracing_context_free( IREE_TRACE_ZONE_BEGIN_NAMED(z_event_pool, "iree_hal_hip_tracing_context_free_event_pool"); for (iree_host_size_t i = 0; i < context->query_capacity; ++i) { - if (context->event_pool[i]) { + if (context->event_pool[i].event) { IREE_HIP_IGNORE_ERROR(context->symbols, - hipEventDestroy(context->event_pool[i])); + hipEventDestroy(context->event_pool[i].event)); } } IREE_TRACE_ZONE_END(z_event_pool); @@ -163,6 +223,8 @@ void 
iree_hal_hip_tracing_context_free( hipEventDestroy(context->base_event)); } + iree_slim_mutex_deinitialize(&context->event_mutex); + iree_allocator_t host_allocator = context->host_allocator; iree_allocator_free(host_allocator, context); @@ -172,31 +234,30 @@ void iree_hal_hip_tracing_context_free( void iree_hal_hip_tracing_context_collect( iree_hal_hip_tracing_context_t* context) { if (!context) return; - if (context->query_tail == context->query_head) { - // No outstanding queries. + iree_slim_mutex_lock(&context->event_mutex); + // No outstanding queries + if (!context->submitted_event_list.head) { + iree_slim_mutex_unlock(&context->event_mutex); return; } IREE_TRACE_ZONE_BEGIN(z0); - while (context->query_tail != context->query_head) { - // Compute the contiguous range of queries ready to be read. - // If the ringbuffer wraps around we'll handle that in the next loop. - uint32_t try_query_count = - context->query_head < context->query_tail - ? context->query_capacity - context->query_tail - : context->query_head - context->query_tail; - IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)try_query_count); - - // Scan and feed the times to tracy, stopping when we hit the first - // unavailable query. - uint32_t query_base = context->query_tail; - uint32_t read_query_count = 0; - for (uint32_t i = 0; i < try_query_count; ++i) { - // Ensure the event has completed; will return HIP_ERROR_NOT_READY if - // recorded but not retired or any other deferred error. - uint16_t query_id = (uint16_t)(query_base + i); - hipEvent_t query_event = context->event_pool[query_id]; - hipError_t result = context->symbols->hipEventQuery(query_event); + // submitted_event_list is a list of the head elements for each command + // buffer that has been submitted. Here we loop over all of the events, + // wait for them to complete and gather the results with hipEventQuery. 
+ iree_hal_hip_tracing_context_event_t* events = + context->submitted_event_list.head; + uint32_t read_query_count = 0; + // Outer per-command_buffer loop. + while (events) { + iree_hal_hip_tracing_context_event_t* event = events; + // Inner per-event loop. + while (event) { + uint32_t query_id = (uint32_t)(event - &context->event_pool[0]); + + hipError_t result = context->symbols->hipEventSynchronize(event->event); + if (result != hipSuccess) break; + result = context->symbols->hipEventQuery(event->event); if (result != hipSuccess) break; // Calculate context-relative time and notify tracy. @@ -204,66 +265,156 @@ void iree_hal_hip_tracing_context_collect( IREE_HIP_IGNORE_ERROR( context->symbols, hipEventElapsedTime(&relative_millis, context->base_event, - query_event)); + event->event)); int64_t gpu_timestamp = (int64_t)((double)relative_millis * 1000000.0); - iree_tracing_gpu_zone_notify(context->id, query_id, gpu_timestamp); - read_query_count = i + 1; + iree_tracing_gpu_zone_notify(context->id, query_id, gpu_timestamp); + read_query_count += 1; + event = event->next_in_command_buffer; } - IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)read_query_count); + iree_hal_hip_tracing_context_event_t* next = events->next_submission; + events->was_submitted = true; + events = next; + context->submitted_event_list.head = events; + } + IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)read_query_count); + + IREE_TRACE_ZONE_END(z0); + iree_slim_mutex_unlock(&context->event_mutex); +} + +void iree_hal_hip_tracing_notify_submitted( + iree_hal_hip_tracing_context_t* context, + iree_hal_hip_tracing_context_event_list_t* event_list) { + if (!context) return; + IREE_ASSERT_ARGUMENT(event_list); + iree_slim_mutex_lock(&context->event_mutex); + + if (!event_list->head) { + iree_slim_mutex_unlock(&context->event_mutex); + return; + } + + if (!context->submitted_event_list.head) { + context->submitted_event_list.head = event_list->head; + context->submitted_event_list.tail = 
event_list->head; + } else { + context->submitted_event_list.tail->next_submission = event_list->head; + context->submitted_event_list.tail = event_list->head; + } + + iree_slim_mutex_unlock(&context->event_mutex); +} - context->query_tail += read_query_count; - if (context->query_tail >= context->query_capacity) { - context->query_tail = 0; +void iree_hal_hip_tracing_free( + iree_hal_hip_tracing_context_t* context, + iree_hal_hip_tracing_context_event_list_t* event_list) { + if (!context) return; + iree_slim_mutex_lock(&context->event_mutex); + IREE_ASSERT_ARGUMENT(event_list); + + if (!event_list->head) { + iree_slim_mutex_unlock(&context->event_mutex); + return; + } + // Free an event list that was previously created. There is some book-keeping + // to keep tracy happy, and then we remove the elements from the + // passed in event_list and add them to the front of the free-list. + + // If this event list has never been submitted we still need to add values to + // the timeline otherwise tracy will not behave correctly. 
+ if (!event_list->head->was_submitted) { + iree_hal_hip_tracing_context_event_t* event = event_list->head; + while (event) { + uint32_t query_id = (uint32_t)(event - &context->event_pool[0]); + iree_tracing_gpu_zone_notify(context->id, query_id, 0); + event = event->next_in_command_buffer; } } - IREE_TRACE_ZONE_END(z0); + if (!context->event_freelist_head) { + context->event_freelist_head = event_list->head; + iree_slim_mutex_unlock(&context->event_mutex); + return; + } + event_list->head->next_submission = NULL; + event_list->head->was_submitted = false; + event_list->tail->next_in_command_buffer = context->event_freelist_head; + context->event_freelist_head = event_list->head; + + event_list->head = NULL; + event_list->tail = NULL; + iree_slim_mutex_unlock(&context->event_mutex); } +static void iree_hal_hip_tracing_context_event_list_append_event( + iree_hal_hip_tracing_context_event_list_t* event_list, + iree_hal_hip_tracing_context_event_t* event) { + if (!event_list->head) { + event_list->head = event; + event_list->tail = event; + } else { + event_list->tail->next_in_command_buffer = event; + event_list->tail = event; + } +} + +// Grabs the next available query out of the freelist and adds it to +// the event_list that was passed in. Also starts the recording of the +// event. static uint16_t iree_hal_hip_stream_tracing_context_insert_query( - iree_hal_hip_tracing_context_t* context, hipStream_t stream) { + iree_hal_hip_tracing_context_t* context, + iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream) { + iree_slim_mutex_lock(&context->event_mutex); + IREE_ASSERT_ARGUMENT(event_list); + // Allocate an event from the pool for use by the query. - uint32_t query_id = context->query_head; - context->query_head = (context->query_head + 1) % context->query_capacity; + // TODO: If we have run out of our freelist, then we need to try and recover + // or allocate more events. 
+ iree_hal_hip_tracing_context_event_t* event = context->event_freelist_head; + context->event_freelist_head = event->next_in_command_buffer; + uint32_t query_id = event - &context->event_pool[0]; + IREE_ASSERT(event->next_in_command_buffer != NULL); + event->next_in_command_buffer = NULL; - // TODO: check to see if the read and write heads of the ringbuffer have - // overlapped. If they have we could try to collect but it's not guaranteed - // that collection will complete (e.g. we may be reserving events for use in - // graphs that haven't yet been launched). - // - // For now we just allow the overlap and tracing results will be inconsistent. - IREE_ASSERT_NE(context->query_head, context->query_tail); + IREE_HIP_IGNORE_ERROR(context->symbols, hipEventRecord(event->event, stream)); - hipEvent_t event = context->event_pool[query_id]; - IREE_HIP_IGNORE_ERROR(context->symbols, hipEventRecord(event, stream)); + iree_hal_hip_tracing_context_event_list_append_event(event_list, event); + iree_slim_mutex_unlock(&context->event_mutex); return query_id; } +// Grabs the next available query out of the freelist and adds it to +// the event_list that was passed in. Also inserts the event record +// node into the passed in graph. It returns the index of the +// event. static uint16_t iree_hal_hip_graph_tracing_context_insert_query( - iree_hal_hip_tracing_context_t* context, hipGraphNode_t* out_node, - hipGraph_t graph, hipGraphNode_t* dependency_nodes, - size_t dependency_nodes_count) { + iree_hal_hip_tracing_context_t* context, + iree_hal_hip_tracing_context_event_list_t* event_list, + hipGraphNode_t* out_node, hipGraph_t graph, + hipGraphNode_t* dependency_nodes, size_t dependency_nodes_count) { + IREE_ASSERT_ARGUMENT(event_list); + iree_slim_mutex_lock(&context->event_mutex); // Allocate an event from the pool for use by the query. 
- uint32_t query_id = context->query_head; - context->query_head = (context->query_head + 1) % context->query_capacity; - - // TODO: check to see if the read and write heads of the ringbuffer have - // overlapped. If they have we could try to collect but it's not guaranteed - // that collection will complete (e.g. we may be reserving events for use in - // graphs that haven't yet been launched). - // - // For now we just allow the overlap and tracing results will be inconsistent. - IREE_ASSERT_NE(context->query_head, context->query_tail); - - hipEvent_t event = context->event_pool[query_id]; + // TODO: If we have run out of our freelist, then we need to try and recover + // or + // allocate more events. + iree_hal_hip_tracing_context_event_t* event = context->event_freelist_head; + context->event_freelist_head = event->next_in_command_buffer; + uint32_t query_id = event - &context->event_pool[0]; + IREE_ASSERT(event->next_in_command_buffer != NULL); + event->next_in_command_buffer = NULL; + iree_status_t status = IREE_HIP_RESULT_TO_STATUS( context->symbols, hipGraphAddEventRecordNode(out_node, graph, dependency_nodes, - dependency_nodes_count, event)); + dependency_nodes_count, event->event)); IREE_ASSERT(iree_status_is_ok(status)); + iree_hal_hip_tracing_context_event_list_append_event(event_list, event); + + iree_slim_mutex_unlock(&context->event_mutex); return query_id; } @@ -271,58 +422,65 @@ static uint16_t iree_hal_hip_graph_tracing_context_insert_query( // today we insert 2 events per zone (one for begin and one for end) but in // many cases we could reduce this by inserting events only between zones and // using the differences between them. 
- void iree_hal_hip_stream_tracing_zone_begin_impl( - iree_hal_hip_tracing_context_t* context, hipStream_t stream, + iree_hal_hip_tracing_context_t* context, + iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream, const iree_tracing_location_t* src_loc) { IREE_ASSERT_ARGUMENT(context); - uint16_t query_id = - iree_hal_hip_stream_tracing_context_insert_query(context, stream); + uint16_t query_id = iree_hal_hip_stream_tracing_context_insert_query( + context, event_list, stream); iree_tracing_gpu_zone_begin(context->id, query_id, src_loc); } void iree_hal_hip_stream_tracing_zone_begin_external_impl( - iree_hal_hip_tracing_context_t* context, hipStream_t stream, + iree_hal_hip_tracing_context_t* context, + iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream, const char* file_name, size_t file_name_length, uint32_t line, const char* function_name, size_t function_name_length, const char* name, size_t name_length) { IREE_ASSERT_ARGUMENT(context); - uint16_t query_id = - iree_hal_hip_stream_tracing_context_insert_query(context, stream); + uint16_t query_id = iree_hal_hip_stream_tracing_context_insert_query( + context, event_list, stream); iree_tracing_gpu_zone_begin_external(context->id, query_id, file_name, file_name_length, line, function_name, function_name_length, name, name_length); } void iree_hal_hip_graph_tracing_zone_begin_external_impl( - iree_hal_hip_tracing_context_t* context, hipGraphNode_t* out_node, - hipGraph_t graph, hipGraphNode_t* dependency_nodes, - size_t dependency_nodes_count, const char* file_name, - size_t file_name_length, uint32_t line, const char* function_name, - size_t function_name_length, const char* name, size_t name_length) { + iree_hal_hip_tracing_context_t* context, + iree_hal_hip_tracing_context_event_list_t* event_list, + hipGraphNode_t* out_node, hipGraph_t graph, + hipGraphNode_t* dependency_nodes, size_t dependency_nodes_count, + const char* file_name, size_t file_name_length, uint32_t 
line, + const char* function_name, size_t function_name_length, const char* name, + size_t name_length) { if (!context) return; uint16_t query_id = iree_hal_hip_graph_tracing_context_insert_query( - context, out_node, graph, dependency_nodes, dependency_nodes_count); + context, event_list, out_node, graph, dependency_nodes, + dependency_nodes_count); iree_tracing_gpu_zone_begin_external(context->id, query_id, file_name, file_name_length, line, function_name, function_name_length, name, name_length); } void iree_hal_hip_stream_tracing_zone_end_impl( - iree_hal_hip_tracing_context_t* context, hipStream_t stream) { + iree_hal_hip_tracing_context_t* context, + iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream) { if (!context) return; - uint16_t query_id = - iree_hal_hip_stream_tracing_context_insert_query(context, stream); + uint16_t query_id = iree_hal_hip_stream_tracing_context_insert_query( + context, event_list, stream); iree_tracing_gpu_zone_end(context->id, query_id); } void iree_hal_hip_graph_tracing_zone_end_impl( - iree_hal_hip_tracing_context_t* context, hipGraphNode_t* out_node, - hipGraph_t graph, hipGraphNode_t* dependency_nodes, - size_t dependency_nodes_count) { + iree_hal_hip_tracing_context_t* context, + iree_hal_hip_tracing_context_event_list_t* event_list, + hipGraphNode_t* out_node, hipGraph_t graph, + hipGraphNode_t* dependency_nodes, size_t dependency_nodes_count) { if (!context) return; uint16_t query_id = iree_hal_hip_graph_tracing_context_insert_query( - context, out_node, graph, dependency_nodes, dependency_nodes_count); + context, event_list, out_node, graph, dependency_nodes, + dependency_nodes_count); iree_tracing_gpu_zone_end(context->id, query_id); } @@ -343,4 +501,12 @@ void iree_hal_hip_tracing_context_free( void iree_hal_hip_tracing_context_collect( iree_hal_hip_tracing_context_t* context) {} +void iree_hal_hip_tracing_notify_submitted( + iree_hal_hip_tracing_context_t* context, + 
iree_hal_hip_tracing_context_event_list_t* event_list) {} + +void iree_hal_hip_tracing_free( + iree_hal_hip_tracing_context_t* context, + iree_hal_hip_tracing_context_event_list_t* event_list) {} + #endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE diff --git a/runtime/src/iree/hal/drivers/hip/tracing.h b/runtime/src/iree/hal/drivers/hip/tracing.h index c50ced9e1297..24e12b8f3a83 100644 --- a/runtime/src/iree/hal/drivers/hip/tracing.h +++ b/runtime/src/iree/hal/drivers/hip/tracing.h @@ -42,6 +42,15 @@ extern "C" { // Thread-compatible: external synchronization is required if using from // multiple threads (same as with hipStream_t itself). typedef struct iree_hal_hip_tracing_context_t iree_hal_hip_tracing_context_t; +typedef struct iree_hal_hip_tracing_context_event_t + iree_hal_hip_tracing_context_event_t; + +// This is used when tracing is enabled. Calls to dispatch and event related +// functions will update the pointers to keep the list up to date. +typedef struct iree_hal_hip_tracing_context_event_list_t { + iree_hal_hip_tracing_context_event_t* head; + iree_hal_hip_tracing_context_event_t* tail; +} iree_hal_hip_tracing_context_event_list_t; // Allocates a tracing context for the given HIP |stream|. // Each context must only be used with the stream it was created for. @@ -61,80 +70,101 @@ void iree_hal_hip_tracing_context_free(iree_hal_hip_tracing_context_t* context); void iree_hal_hip_tracing_context_collect( iree_hal_hip_tracing_context_t* context); +// Notifies that the given list of events has been dispatched onto the GPU. +void iree_hal_hip_tracing_notify_submitted( + iree_hal_hip_tracing_context_t* context, + iree_hal_hip_tracing_context_event_list_t* event_list); + +// Frees the events and returns them back into the tracing context.
+void iree_hal_hip_tracing_free( + iree_hal_hip_tracing_context_t* context, + iree_hal_hip_tracing_context_event_list_t* event_list); + #if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE // Begins a normal zone derived on the calling |src_loc|. // Must be perfectly nested and paired with a corresponding zone end. void iree_hal_hip_stream_tracing_zone_begin_impl( - iree_hal_hip_tracing_context_t* context, hipStream_t stream, + iree_hal_hip_tracing_context_t* context, + iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream, const iree_tracing_location_t* src_loc); // Begins an external zone using the given source information. // The provided strings will be copied into the tracy buffer. void iree_hal_hip_stream_tracing_zone_begin_external_impl( - iree_hal_hip_tracing_context_t* context, hipStream_t stream, + iree_hal_hip_tracing_context_t* context, + iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream, const char* file_name, size_t file_name_length, uint32_t line, const char* function_name, size_t function_name_length, const char* name, size_t name_length); + void iree_hal_hip_graph_tracing_zone_begin_external_impl( - iree_hal_hip_tracing_context_t* context, hipGraphNode_t* out_node, - hipGraph_t graph, hipGraphNode_t* dependency_nodes, - size_t dependency_nodes_count, const char* file_name, - size_t file_name_length, uint32_t line, const char* function_name, - size_t function_name_length, const char* name, size_t name_length); + iree_hal_hip_tracing_context_t* context, + iree_hal_hip_tracing_context_event_list_t* event_list, + hipGraphNode_t* out_node, hipGraph_t graph, + hipGraphNode_t* dependency_nodes, size_t dependency_nodes_count, + const char* file_name, size_t file_name_length, uint32_t line, + const char* function_name, size_t function_name_length, const char* name, + size_t name_length); void iree_hal_hip_stream_tracing_zone_end_impl( - iree_hal_hip_tracing_context_t* context, hipStream_t 
stream); + iree_hal_hip_tracing_context_t* context, + iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream); void iree_hal_hip_graph_tracing_zone_end_impl( - iree_hal_hip_tracing_context_t* context, hipGraphNode_t* out_node, - hipGraph_t graph, hipGraphNode_t* dependency_nodes, - size_t dependency_nodes_count); + iree_hal_hip_tracing_context_t* context, + iree_hal_hip_tracing_context_event_list_t* event_list, + hipGraphNode_t* out_node, hipGraph_t graph, + hipGraphNode_t* dependency_nodes, size_t dependency_nodes_count); // Begins a new zone with the parent function name. -#define IREE_HIP_STREAM_TRACE_ZONE_BEGIN(context, stream) \ +#define IREE_HIP_STREAM_TRACE_ZONE_BEGIN(context, event_list, stream) \ static const iree_tracing_location_t TracyConcat( \ __tracy_source_location, __LINE__) = {NULL, __FUNCTION__, __FILE__, \ (uint32_t)__LINE__, 0}; \ iree_hal_hip_stream_tracing_zone_begin_impl( \ - context, stream, &TracyConcat(__tracy_source_location, __LINE__)); + context, event_list, stream, \ + &TracyConcat(__tracy_source_location, __LINE__)); // Begins an externally defined zone with a dynamic source location. // The |file_name|, |function_name|, and optional |name| strings will be copied // into the trace buffer and do not need to persist. 
-#define IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( \ - context, stream, file_name, file_name_length, line, function_name, \ - function_name_length, name, name_length) \ - iree_hal_hip_stream_tracing_zone_begin_external_impl( \ - context, stream, file_name, file_name_length, line, function_name, \ - function_name_length, name, name_length) +#define IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( \ + context, event_list, stream, file_name, file_name_length, line, \ + function_name, function_name_length, name, name_length) \ + iree_hal_hip_stream_tracing_zone_begin_external_impl( \ + context, event_list, stream, file_name, file_name_length, line, \ + function_name, function_name_length, name, name_length) #define IREE_HIP_GRAPH_TRACE_ZONE_BEGIN_EXTERNAL( \ - context, out_node, graph, dpendency_nodes, dpendency_nodes_count, \ - file_name, file_name_length, line, function_name, function_name_length, \ - name, name_length) \ + context, event_list, out_node, graph, dependency_nodes, \ + dependency_nodes_count, file_name, file_name_length, line, function_name, \ + function_name_length, name, name_length) \ iree_hal_hip_graph_tracing_zone_begin_external_impl( \ - context, out_node, graph, dpendency_nodes, dpendency_nodes_count, \ - file_name, file_name_length, line, function_name, function_name_length, \ - name, name_length) - -#define IREE_HIP_STREAM_TRACE_ZONE_END(context, stream) \ - iree_hal_hip_stream_tracing_zone_end_impl(context, stream) -#define IREE_HIP_GRAPH_TRACE_ZONE_END( \ - context, out_node, graph, dependency_nodes, dependency_nodes_count) \ - iree_hal_hip_graph_tracing_zone_end_impl( \ - context, out_node, graph, dependency_nodes, dependency_nodes_count) - + context, event_list, out_node, graph, dependency_nodes, \ + dependency_nodes_count, file_name, file_name_length, line, \ + function_name, function_name_length, name, name_length) + +#define IREE_HIP_STREAM_TRACE_ZONE_END(context, event_list, stream) \ + iree_hal_hip_stream_tracing_zone_end_impl(context, 
event_list, stream) +#define IREE_HIP_GRAPH_TRACE_ZONE_END(context, event_list, out_node, graph, \ + dependency_nodes, \ + dependency_nodes_count) \ + iree_hal_hip_graph_tracing_zone_end_impl(context, event_list, out_node, \ + graph, dependency_nodes, \ + dependency_nodes_count) #else -#define IREE_HIP_STREAM_TRACE_ZONE_BEGIN(context, stream) -#define IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( \ - context, stream, file_name, file_name_length, line, function_name, \ +#define IREE_HIP_STREAM_TRACE_ZONE_BEGIN(context, event_list, stream) +#define IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( \ + context, event_list, stream, file_name, file_name_length, line, \ + function_name, function_name_length, name, name_length) +#define IREE_HIP_GRAPH_TRACE_ZONE_BEGIN_EXTERNAL( \ + context, event_list, out_node, graph, dependency_nodes, \ + dependency_nodes_count, file_name, file_name_length, line, function_name, \ function_name_length, name, name_length) -#define IREE_HIP_GRAPH_TRACE_ZONE_BEGIN_EXTERNAL( \ - context, out_node, graph, dpendency_nodes, dpendency_nodes_count, \ - file_name, file_name_length, line, function_name, function_name_length, \ - name, name_length) -#define IREE_HIP_STREAM_TRACE_ZONE_END(context, stream) - +#define IREE_HIP_STREAM_TRACE_ZONE_END(context, evnet_list, stream) +#define IREE_HIP_GRAPH_TRACE_ZONE_END(context, event_list, out_node, graph, \ + dependency_nodes, \ + dependency_nodes_count) #endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE #ifdef __cplusplus