Skip to content

[Issue]: A Bug in Gin GDAKI Put? #1893

@jywangx

Description

@jywangx

How is this issue impacting you?

Application hang

Share Your Debug Logs

No response

Steps to Reproduce the Issue

I implemented a custom kernel using the GIN API to measure the performance of the Put operation:

// n-ping / 1-pong latency micro-benchmark over the NCCL GIN device API.
// Protocol (requires exactly 2 ranks; guarded below):
//   peer : put+SignalInc -> wait signal -> put+SignalInc
//   root : wait signal -> start timestamp -> n puts (only the last carries
//          SignalInc) -> wait signal -> end timestamp
// Launched with grid_dim = 1, block_dim = 32 (see host code below). Every
// thread reaches the gin.put calls, passing a Coop{} group argument — the
// cooperative semantics of put are defined by the GIN API, not visible here.
template <typename T, typename Coop = ncclCoopWarp>
__global__ void nPing1PongLatKernel(ncclWindow_t sendwin, size_t sendoffset,
                                    ncclWindow_t recvwin, size_t recvoffset,
                                    size_t n, size_t count, int root,
                                    TimingResult *timing_result,
                                    struct ncclDevComm devComm) {
  
  ncclTeam world = ncclTeamWorld(devComm);

  int globalTid = threadIdx.x + blockIdx.x*blockDim.x;
  int globalNthreads = blockDim.x * gridDim.x;

  int globalWid = globalTid / 32;               // unused below
  int globalNwarps = (globalNthreads+31) / 32;  // unused below

  // The ping-pong pairing (rank ^ 1) only makes sense for exactly two ranks.
  if (world.nRanks != 2) return ;

  int ginContext = 0;
  unsigned int signalIndex = 0;

  ncclGin gin { devComm, ginContext };
  // Baseline signal value; all waits below compare against signalValue + 1.
  uint64_t signalValue = gin.readSignal(signalIndex);
  ncclBarrierSession<ncclCoopCta> bar { ncclCoopCta(), ncclTeamTagWorld(), gin, blockIdx.x };
  bar.sync(ncclCoopCta(), cuda::memory_order_relaxed, ncclGinFenceLevel::Relaxed);

  const size_t size = count * sizeof(T);  // bytes per chunk
  int rank = world.rank, peer = world.rank ^ 1;
  if (rank == root) {
    ncclCoopCta().sync();
    // Wait for the peer's first signaled put before starting the clock.
    gin.waitSignal(ncclCoopCta(), signalIndex, signalValue + 1);
    signalValue = gin.readSignal(signalIndex);
    printf("rank %d, tid %d: recv signal %lu\n", rank, globalTid, signalValue);
    if (globalTid == 0) {
      timing_result->start_time = globaltimer();
    }
    int i;
    // NOTE(review): n is size_t, so "n-1" underflows to SIZE_MAX when n == 0,
    // and the int index can overflow for very large n — assumes n >= 1;
    // confirm against callers.
    for (i = 0; i < n-1; i ++) {
      gin.put<ncclGin_None, ncclGin_None, Coop>(
        world, peer,
        recvwin, recvoffset + i * size,
        sendwin, sendoffset + i * size,
        size,
        ncclGin_None{},
        ncclGin_None{},
        Coop{}
      );
    }
    // Only the final chunk carries SignalInc, so the peer observes one signal
    // increment for the whole burst of n puts.
    gin.put<ncclGin_SignalInc, ncclGin_None, Coop>(
      world, peer,
      recvwin, recvoffset + i * size,
      sendwin, sendoffset + i * size,
      size,
      ncclGin_SignalInc{signalIndex},
      ncclGin_None{},
      Coop{}
    );
    printf("rank %d, tid %d: send %lu data\n", rank, globalTid, n);
    // Wait for the peer's pong before stopping the clock.
    gin.waitSignal(ncclCoopCta(), signalIndex, signalValue + 1);
    signalValue = gin.readSignal(signalIndex);
    printf("rank %d, tid %d: recv signal %lu\n", rank, globalTid, signalValue);
    ncclCoopCta().sync();
    if (globalTid == 0) {
      timing_result->end_time = globaltimer();
    }
  } else {
    // Peer side: one signaled put to kick off the measurement ...
    gin.put<ncclGin_SignalInc, ncclGin_None, Coop>(
      world, peer,
      recvwin, recvoffset,
      sendwin, sendoffset,
      size,
      ncclGin_SignalInc{signalIndex},
      ncclGin_None{},
      Coop{}
    );
    printf("rank %d, tid %d: put 1 data\n", rank, globalTid);
    // ... wait for the root's burst (signaled once, by its last put) ...
    gin.waitSignal(ncclCoopCta(), signalIndex, signalValue + 1);
    signalValue = gin.readSignal(signalIndex);
    printf("rank %d, tid %d: recv signal %lu\n", rank, globalTid, signalValue);
    // ... then one signaled pong back to the root.
    gin.put<ncclGin_SignalInc, ncclGin_None, Coop>(
      world, peer,
      recvwin, recvoffset + size,
      sendwin, sendoffset + size,
      size,
      ncclGin_SignalInc{signalIndex},
      ncclGin_None{},
      Coop{}
    );
    printf("rank %d, tid %d: put 1 data\n", rank, globalTid);
  }
  // Drain outstanding puts, then rendezvous with the other rank before exit.
  gin.flush(ncclCoopCta());
  bar.sync(ncclCoopCta(), cuda::memory_order_relaxed, ncclGinFenceLevel::Relaxed);
}

However, when I set NCCL_GIN_TYPE=3 to use GDAKI, the kernel hangs during execution. In contrast, it runs correctly when NCCL_GIN_TYPE=2 is set to use the Proxy mode. (NCCL_DMABUF_ENABLE is always set to 0.)

The log output during the hang is as follows:

rank 0, tid 0: recv signal 1
rank 0, tid 1: recv signal 1
rank 0, tid 2: recv signal 1
rank 0, tid 3: recv signal 1
rank 0, tid 4: recv signal 1
rank 0, tid 5: recv signal 1
rank 0, tid 6: recv signal 1
rank 0, tid 7: recv signal 1
rank 0, tid 8: recv signal 1
rank 0, tid 9: recv signal 1
rank 0, tid 10: recv signal 1
rank 0, tid 11: recv signal 1
rank 0, tid 12: recv signal 1
rank 0, tid 13: recv signal 1
rank 0, tid 14: recv signal 1
rank 0, tid 15: recv signal 1
rank 0, tid 16: recv signal 1
rank 0, tid 17: recv signal 1
rank 0, tid 18: recv signal 1
rank 0, tid 19: recv signal 1
rank 0, tid 20: recv signal 1
rank 0, tid 21: recv signal 1
rank 0, tid 22: recv signal 1
rank 0, tid 23: recv signal 1
rank 0, tid 24: recv signal 1
rank 0, tid 25: recv signal 1
rank 0, tid 26: recv signal 1
rank 0, tid 27: recv signal 1
rank 0, tid 28: recv signal 1
rank 0, tid 29: recv signal 1
rank 0, tid 30: recv signal 1
rank 0, tid 31: recv signal 1
rank 0, tid 1: send 5 data
rank 0, tid 2: send 5 data
rank 0, tid 3: send 5 data
rank 0, tid 4: send 5 data
rank 0, tid 5: send 5 data
rank 0, tid 6: send 5 data
rank 0, tid 7: send 5 data
rank 0, tid 8: send 5 data
rank 0, tid 9: send 5 data
rank 0, tid 10: send 5 data
rank 0, tid 11: send 5 data
rank 0, tid 12: send 5 data
rank 0, tid 13: send 5 data
rank 0, tid 14: send 5 data
rank 0, tid 15: send 5 data
rank 0, tid 16: send 5 data
rank 0, tid 17: send 5 data
rank 0, tid 18: send 5 data
rank 0, tid 19: send 5 data
rank 0, tid 20: send 5 data
rank 0, tid 21: send 5 data
rank 0, tid 22: send 5 data
rank 0, tid 23: send 5 data
rank 0, tid 24: send 5 data
rank 0, tid 25: send 5 data
rank 0, tid 26: send 5 data
rank 0, tid 27: send 5 data
rank 0, tid 28: send 5 data
rank 0, tid 29: send 5 data
rank 0, tid 30: send 5 data
rank 0, tid 31: send 5 data
rank 1, tid 1: put 1 data
rank 1, tid 2: put 1 data
rank 1, tid 3: put 1 data
rank 1, tid 4: put 1 data
rank 1, tid 5: put 1 data
rank 1, tid 6: put 1 data
rank 1, tid 7: put 1 data
rank 1, tid 8: put 1 data
rank 1, tid 9: put 1 data
rank 1, tid 10: put 1 data
rank 1, tid 11: put 1 data
rank 1, tid 12: put 1 data
rank 1, tid 13: put 1 data
rank 1, tid 14: put 1 data
rank 1, tid 15: put 1 data
rank 1, tid 16: put 1 data
rank 1, tid 17: put 1 data
rank 1, tid 18: put 1 data
rank 1, tid 19: put 1 data
rank 1, tid 20: put 1 data
rank 1, tid 21: put 1 data
rank 1, tid 22: put 1 data
rank 1, tid 23: put 1 data
rank 1, tid 24: put 1 data
rank 1, tid 25: put 1 data
rank 1, tid 26: put 1 data
rank 1, tid 27: put 1 data
rank 1, tid 28: put 1 data
rank 1, tid 29: put 1 data
rank 1, tid 30: put 1 data
rank 1, tid 31: put 1 data

Notably, the log from tid 0 is missing.

In addition, when I set Coop to ncclCoopThread and modify the kernel so that only tid=0 issues the Put operation (consistent with the latest nccl-tests implementation of ginAllToAll), the kernel runs normally.

The program is launched as shown below. The complete host-side launch code follows; in short, the kernel is launched with grid_dim=1 and block_dim=32:

/*************************************************************************
 * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#include "cuda_runtime.h"
#include "nccl.h"
#include "nccl_device.h"
#include "utils.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <unistd.h>

/*
 * NCCL Device GIN API Profile Example
 */

// Device API kernel launch configuration
// CTA count must match lsaBarrierCount for proper barrier synchronization
#define NCCL_DEVICE_CTA_COUNT 1
#define NCCL_DEVICE_THREADS_PER_CTA 32

// ==========================================================================
// Device Kernel Implementation
// ==========================================================================
// Read the GPU's global nanosecond timer (PTX %globaltimer) from device code.
__device__ __forceinline__ unsigned long long int globaltimer() {
  unsigned long long int now;
  asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(now));
  return now;
}
// Device-side timestamps (ns, from %globaltimer) written by the kernel's
// globalTid == 0 on the root rank and copied back to the host to compute the
// ping-pong latency.
struct TimingResult {
  unsigned long long int start_time;  // ns, taken just before the put burst
  unsigned long long int end_time;    // ns, taken after the pong is received
};

// n-ping / 1-pong latency micro-benchmark over the NCCL GIN device API.
// Protocol (requires exactly 2 ranks; guarded below):
//   peer : put+SignalInc -> wait signal -> put+SignalInc
//   root : wait signal -> start timestamp -> n puts (only the last carries
//          SignalInc) -> wait signal -> end timestamp
// Launched with grid_dim = 1, block_dim = 32 (see host code below). Every
// thread reaches the gin.put calls, passing a Coop{} group argument — the
// cooperative semantics of put are defined by the GIN API, not visible here.
// Windows must hold at least n * count * sizeof(T) bytes past the offsets.
template <typename T, typename Coop = ncclCoopWarp>
__global__ void nPing1PongLatKernel(ncclWindow_t sendwin, size_t sendoffset,
                                    ncclWindow_t recvwin, size_t recvoffset,
                                    size_t n, size_t count, int root,
                                    TimingResult *timing_result,
                                    struct ncclDevComm devComm) {
  
  ncclTeam world = ncclTeamWorld(devComm);

  int globalTid = threadIdx.x + blockIdx.x*blockDim.x;
  int globalNthreads = blockDim.x * gridDim.x;

  int globalWid = globalTid / 32;               // unused below
  int globalNwarps = (globalNthreads+31) / 32;  // unused below

  // The ping-pong pairing (rank ^ 1) only makes sense for exactly two ranks.
  if (world.nRanks != 2) return ;

  int ginContext = 0;
  unsigned int signalIndex = 0;

  ncclGin gin { devComm, ginContext };
  // Baseline signal value; all waits below compare against signalValue + 1.
  uint64_t signalValue = gin.readSignal(signalIndex);
  ncclBarrierSession<ncclCoopCta> bar { ncclCoopCta(), ncclTeamTagWorld(), gin, blockIdx.x };
  bar.sync(ncclCoopCta(), cuda::memory_order_relaxed, ncclGinFenceLevel::Relaxed);

  const size_t size = count * sizeof(T);  // bytes per chunk
  int rank = world.rank, peer = world.rank ^ 1;
  if (rank == root) {
    ncclCoopCta().sync();
    // Wait for the peer's first signaled put before starting the clock.
    gin.waitSignal(ncclCoopCta(), signalIndex, signalValue + 1);
    signalValue = gin.readSignal(signalIndex);
    printf("rank %d, tid %d: recv signal %lu\n", rank, globalTid, signalValue);
    if (globalTid == 0) {
      timing_result->start_time = globaltimer();
    }
    size_t i;
    // "i + 1 < n" instead of "i < n-1": with size_t n, "n-1" underflows to
    // SIZE_MAX when n == 0 (near-infinite loop / huge offsets), and an int
    // index could overflow for very large n. Identical behavior for n >= 1.
    for (i = 0; i + 1 < n; i ++) {
      gin.put<ncclGin_None, ncclGin_None, Coop>(
        world, peer,
        recvwin, recvoffset + i * size,
        sendwin, sendoffset + i * size,
        size,
        ncclGin_None{},
        ncclGin_None{},
        Coop{}
      );
    }
    // Only the final chunk carries SignalInc, so the peer observes one signal
    // increment for the whole burst of n puts.
    gin.put<ncclGin_SignalInc, ncclGin_None, Coop>(
      world, peer,
      recvwin, recvoffset + i * size,
      sendwin, sendoffset + i * size,
      size,
      ncclGin_SignalInc{signalIndex},
      ncclGin_None{},
      Coop{}
    );
    printf("rank %d, tid %d: send %lu data\n", rank, globalTid, n);
    // Wait for the peer's pong before stopping the clock.
    gin.waitSignal(ncclCoopCta(), signalIndex, signalValue + 1);
    signalValue = gin.readSignal(signalIndex);
    printf("rank %d, tid %d: recv signal %lu\n", rank, globalTid, signalValue);
    ncclCoopCta().sync();
    if (globalTid == 0) {
      timing_result->end_time = globaltimer();
    }
  } else {
    // Peer side: one signaled put to kick off the measurement ...
    gin.put<ncclGin_SignalInc, ncclGin_None, Coop>(
      world, peer,
      recvwin, recvoffset,
      sendwin, sendoffset,
      size,
      ncclGin_SignalInc{signalIndex},
      ncclGin_None{},
      Coop{}
    );
    printf("rank %d, tid %d: put 1 data\n", rank, globalTid);
    // ... wait for the root's burst (signaled once, by its last put) ...
    gin.waitSignal(ncclCoopCta(), signalIndex, signalValue + 1);
    signalValue = gin.readSignal(signalIndex);
    printf("rank %d, tid %d: recv signal %lu\n", rank, globalTid, signalValue);
    // ... then one signaled pong back to the root.
    gin.put<ncclGin_SignalInc, ncclGin_None, Coop>(
      world, peer,
      recvwin, recvoffset + size,
      sendwin, sendoffset + size,
      size,
      ncclGin_SignalInc{signalIndex},
      ncclGin_None{},
      Coop{}
    );
    printf("rank %d, tid %d: put 1 data\n", rank, globalTid);
  }
  // Drain outstanding puts, then rendezvous with the other rank before exit.
  gin.flush(ncclCoopCta());
  bar.sync(ncclCoopCta(), cuda::memory_order_relaxed, ncclGinFenceLevel::Relaxed);
}

/**
 * Launch nPing1PongLatKernel<float> (1 CTA x 32 threads) and return the
 * device-recorded start/end timestamps.
 *
 * Blocks on `stream` until the kernel completes. `comm` is currently unused
 * but kept for interface stability with the other host helpers.
 */
TimingResult measurePingPongLatency(ncclComm_t comm, ncclDevComm devComm, 
                                   ncclWindow_t send_win, ncclWindow_t recv_win,
                                   size_t n, size_t count, int root, 
                                   cudaStream_t stream) {
  TimingResult* d_timing_result;
  CUDACHECK(cudaMalloc(&d_timing_result, sizeof(TimingResult)));
  
  nPing1PongLatKernel<float><<<NCCL_DEVICE_CTA_COUNT, NCCL_DEVICE_THREADS_PER_CTA, 0, stream>>>(
    send_win, 0, recv_win, 0, n, count, root, d_timing_result, devComm
  );
  // Kernel launches do not return a status directly: catch a bad launch
  // configuration here instead of misattributing it to a later CUDA call.
  CUDACHECK(cudaGetLastError());
  
  CUDACHECK(cudaStreamSynchronize(stream));
  
  TimingResult h_timing_result;
  CUDACHECK(cudaMemcpy(&h_timing_result, d_timing_result, sizeof(TimingResult), cudaMemcpyDeviceToHost));
  
  CUDACHECK(cudaFree(d_timing_result));
  
  return h_timing_result;
}

// ==========================================================================
// Host-Side Setup and Device API Initialization
// ==========================================================================

// This function can be called inside an MPI rank or pthread thread. The
// initialization and broadcast are implemented in common/src/utils.cc for
// easier readability. For fully integrated examples using pthreads or MPI see
// the examples in 01_communicators.

// Per-rank test driver: initializes NCCL, allocates/registers symmetric
// windows, creates a device communicator, runs the ping-pong latency kernel,
// prints the result on rank 0, and tears everything down. Runs inside an MPI
// rank or a pthread (see run_example in utils). Returns NULL.
// `devices_per_rank` is currently unused here.
void* test(int my_rank, int total_ranks, int local_device, int devices_per_rank, size_t n, size_t count) {
  ncclComm_t comm;
  ncclUniqueId nccl_unique_id;

  if (my_rank == 0) {
    printf("Starting Device API Test initialization\n");
  }


  // Standard NCCL communicator initialization (same as Host API):
  // rank 0 creates the unique ID, then shares it with the other ranks.
  if (my_rank == 0) {
    NCCLCHECK(ncclGetUniqueId(&nccl_unique_id));
  }

  // Distribute unique ID in case of MPI.
  util_broadcast(0, my_rank, &nccl_unique_id);

  // Set device context for this rank
  CUDACHECK(cudaSetDevice(local_device));
  printf("  Rank %d using GPU device %d\n", my_rank, local_device);

  // ==========================================================================
  // STEP 1: Initialize NCCL Communicator and Allocate Memory
  // ==========================================================================

  // Initialize NCCL communicator (same as Host API)
  NCCLCHECK(ncclCommInitRank(&comm, total_ranks, nccl_unique_id, my_rank));
  printf("  Rank %d initialized NCCL communicator for %d total ranks\n", my_rank, total_ranks);

  // Buffer sizing: the kernel writes up to n chunks of count floats each,
  // so each window needs n * count * sizeof(float) bytes.
  size_t size_bytes = n * count * sizeof(float);

  void* d_sendbuff;
  void* d_recvbuff;
  ncclWindow_t send_win;
  ncclWindow_t recv_win;

  // Device API requires allocation compatible with symmetric memory allocation
  // This ensures memory can be accessed directly by device kernels from all ranks
  NCCLCHECK(ncclMemAlloc(&d_sendbuff, size_bytes));
  NCCLCHECK(ncclMemAlloc(&d_recvbuff, size_bytes));

  // ==========================================================================
  // STEP 2: Register Memory Windows for Device-Side Access
  // ==========================================================================

  // Register symmetric windows for GIN access
  // Windows enable direct peer-to-peer access from device kernels
  NCCLCHECK(ncclCommWindowRegister(comm, d_sendbuff, size_bytes, &send_win, NCCL_WIN_DEFAULT));
  NCCLCHECK(ncclCommWindowRegister(comm, d_recvbuff, size_bytes, &recv_win, NCCL_WIN_DEFAULT));

  // ==========================================================================
  // STEP 3: Create Device Communicator and Configure LSA Barriers
  // ==========================================================================

  // Create stream for kernel execution
  cudaStream_t stream;
  CUDACHECK(cudaStreamCreate(&stream));

  // Create device communicator - this is the key Device API component
  // Requirements specify resources to allocate: one barrier and one GIN
  // signal per CTA (the kernel launches NCCL_DEVICE_CTA_COUNT CTAs and uses
  // signalIndex 0).
  ncclDevComm devComm;
  ncclDevCommRequirements reqs;
  memset(&reqs, 0, sizeof(reqs));
  reqs.barrierCount = NCCL_DEVICE_CTA_COUNT;
  reqs.ginSignalCount = NCCL_DEVICE_CTA_COUNT;
  NCCLCHECK(ncclDevCommCreate(comm, &reqs, &devComm));
  printf("  Rank %d created device communicator with %d barriers and ginSignals\n", my_rank, NCCL_DEVICE_CTA_COUNT);

  // ==========================================================================
  // STEP 4: Launch Device Kernel for Ping-Pong Operation
  // ==========================================================================

  printf("  Rank %d launching ping-pong kernel...\n", my_rank);
  
  // root = 0: rank 0 measures, the other rank echoes.
  TimingResult timing_result = measurePingPongLatency(comm, devComm, send_win, recv_win, n, count, 0, stream);
  
  printf("  Rank %d completed nPing1PongLatKernel kernel execution\n", my_rank);
  
  // Timestamps are only written by the root rank (0), so only report there.
  if (my_rank == 0) {
    double latency_ns = (double)(timing_result.end_time - timing_result.start_time);
    printf("  Ping-Pong Latency: %.2f ns (%.2f us)\n", latency_ns, latency_ns / 1000.0);
    printf("  Start time: %llu, End time: %llu\n", timing_result.start_time, timing_result.end_time);
  }

  // ==========================================================================
  // STEP 5: Verify Results and Cleanup Resources
  // ==========================================================================

  if (my_rank == 0) {
    printf("Ping-Pong latency test completed successfully\n");
  }

  // Device API specific cleanup (reverse order of creation)
  NCCLCHECK(ncclDevCommDestroy(comm, &devComm));
  NCCLCHECK(ncclCommWindowDeregister(comm, send_win));
  NCCLCHECK(ncclCommWindowDeregister(comm, recv_win));
  NCCLCHECK(ncclMemFree(d_sendbuff));
  NCCLCHECK(ncclMemFree(d_recvbuff));

  // Standard NCCL cleanup
  CUDACHECK(cudaStreamDestroy(stream));
  NCCLCHECK(ncclCommFinalize(comm));
  NCCLCHECK(ncclCommDestroy(comm));

  return NULL;
}

// Adapter matching run_example's callback signature: forwards the
// command-line-selected problem size (g_n, g_count) into test().
void* test_wrapper(int my_rank, int total_ranks, int local_device, int devices_per_rank) {
  extern size_t g_n;
  extern size_t g_count;
  size_t chunks = g_n;
  size_t elems_per_chunk = g_count;
  return test(my_rank, total_ranks, local_device, devices_per_rank, chunks, elems_per_chunk);
}

// Problem-size globals, overridable from the command line (see main()).
size_t g_n = 100;      // number of chunks the root sends per measurement
size_t g_count = 1024; // elements (floats) per chunk

/**
 * Entry point. Optional arguments: <n> <count> — number of chunks and floats
 * per chunk for the ping-pong measurement. Invalid or missing values fall
 * back to the defaults. All rank setup/teardown is delegated to run_example()
 * (utils.h), which also receives argc/argv.
 */
int main(int argc, char* argv[]) {
  if (argc >= 3) {
    // strtoll + endptr/positivity validation instead of atoi: atoi silently
    // turns negative or non-numeric input into a value that would wrap to a
    // huge size_t.
    char* end = NULL;
    long long n = strtoll(argv[1], &end, 10);
    if (end != argv[1] && *end == '\0' && n > 0) {
      g_n = (size_t)n;
    } else {
      printf("Invalid <n> '%s', using default %zu\n", argv[1], g_n);
    }
    end = NULL;
    long long c = strtoll(argv[2], &end, 10);
    if (end != argv[2] && *end == '\0' && c > 0) {
      g_count = (size_t)c;
    } else {
      printf("Invalid <count> '%s', using default %zu\n", argv[2], g_count);
    }
  } else if (argc == 2) {
    // A single argument is ambiguous: show usage and keep the defaults.
    printf("Usage: %s <n> <count>\n", argv[0]);
    printf("Using default values: n=%zu, count=%zu\n", g_n, g_count);
  }
  
  printf("Running with n=%zu, count=%zu\n", g_n, g_count);
  
  // Run example using the provided utility framework
  return run_example(argc, argv, test_wrapper);
}

NCCL Version

2.28.7+cuda12.2

Your platform details

No response

Error Message & Behavior

No response

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions