Merge pull request #103 from Project-HAMi/fix_v2.6.0

archlitchi · web-flow · commit 1aa6bdf32b7b · 2025-07-30T12:19:55.000+08:00
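Fix v2.6.0

Summary of the changes in this diff:
- allocator: remove_chunk_async now subtracts the freed chunk's size from the list limit; add_chunk_async caps the accounted size at the requested size, records that size as the entry length, and adds the entry to the list only after the memory-pool queries succeed.
- context: context_size becomes size_t; ctx_activate[dev] is set only when context_size > 0; cuDevicePrimaryCtxRelease_v2 removes the recorded usage before calling the real release.
- cuda hook: when the symbol is found, cuGetProcAddress_v2 returns the result of the real cuGetProcAddress_v2 call instead of an unconditional CUDA_SUCCESS.
- libvgpu: the CUDA_REDIRECT environment variable, when non-empty, selects the library passed to dlopen (falling back to /usr/local/vgpu/libvgpu.so); the dlsym hook list is reordered.
- utilization watcher: usedGpuMemory and smUtil are stored on each matching process slot instead of assigning the summed values to a single slot after the loops.
- nvml hook: nvmlDeviceGetHandleByIndex forwards to the nvmlDeviceGetHandleByIndex entry rather than the _v2 entry.
- utils: both loops in set_task_pid break after the first mergepid call.
- logging: rm_gpu_device_memory_usage logs the usage and the post-removal total; the cuDeviceGetAttribute debug log is commented out.
- test: test_runtime_launch.cu adds computeKernel and a repeated-launch loop.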
diff --git a/src/allocator/allocator.c b/src/allocator/allocator.c
@@ -209,7 +209,7 @@ int remove_chunk_async(allocated_list *a_list, CUdeviceptr dptr, CUstream hStrea
             t_size=val->entry->length;
             CUDA_OVERRIDE_CALL(cuda_library_entry,cuMemFreeAsync,dptr,hStream);
             LIST_REMOVE(a_list,val);
-
+            a_list->limit-=t_size;
             CUdevice dev;
             cuCtxGetDevice(&dev);
             rm_gpu_device_memory_usage(getpid(),dev,t_size,2);
@@ -242,7 +242,6 @@ int add_chunk_async(CUdeviceptr *address,size_t size, CUstream hStream){
         LOG_ERROR("cuMemoryAllocate failed res=%d",res);
         return res;
     }
-    LIST_ADD(device_allocasync,e);
     *address = e->entry->address;
     CUmemoryPool pool;
     res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuDeviceGetMemPool,&pool,dev);
@@ -257,11 +256,13 @@ int add_chunk_async(CUdeviceptr *address,size_t size, CUstream hStream){
         return res;
     }
     if ((poollimit!=0) && (poollimit> device_allocasync->limit)) {
-        allocsize = poollimit-device_allocasync->limit;
+        allocsize = (poollimit-device_allocasync->limit < size)? poollimit-device_allocasync->limit : size;
         cuCtxGetDevice(&dev);
         add_gpu_device_memory_usage(getpid(),dev,allocsize,2);
-        device_allocasync->limit=poollimit;
+        device_allocasync->limit=device_allocasync->limit+allocsize;
+        e->entry->length=allocsize;
     }
+    LIST_ADD(device_allocasync,e);
     return 0;
 }
 
diff --git a/src/cuda/context.c b/src/cuda/context.c
@@ -1,7 +1,7 @@
 #include "include/libcuda_hook.h"
 #include "multiprocess/multiprocess_memory_limit.h"
 
-extern int context_size;
+extern size_t context_size;
 extern int ctx_activate[16];
 
 
@@ -12,13 +12,15 @@ CUresult cuDevicePrimaryCtxGetState( CUdevice dev, unsigned int* flags, int* act
 }
 
 CUresult cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev){
-    LOG_INFO("dev=%d context_size=%d",dev,context_size);
+    LOG_INFO("dev=%d context_size=%ld",dev,context_size);
     //for Initialization only
     CUresult res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuDevicePrimaryCtxRetain,pctx,dev);
     if (ctx_activate[dev] == 0) {
         add_gpu_device_memory_usage(getpid(),dev,context_size,0); 
     }
-    ctx_activate[dev] = 1;
+    if (context_size>0) {
+        ctx_activate[dev] = 1;
+    }
     return res;
 }
 
@@ -29,11 +31,11 @@ CUresult cuDevicePrimaryCtxSetFlags_v2( CUdevice dev, unsigned int  flags ){
 }
 
 CUresult cuDevicePrimaryCtxRelease_v2( CUdevice dev ){
-    CUresult res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuDevicePrimaryCtxRelease_v2,dev);
     if (ctx_activate[dev] == 1) {
         rm_gpu_device_memory_usage(getpid(),dev,context_size,0);
     }
     ctx_activate[dev] = 0;
+    CUresult res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuDevicePrimaryCtxRelease_v2,dev);
     return res;
 }
 
@@ -119,7 +121,7 @@ CUresult cuCtxSetCacheConfig ( CUfunc_cache config ){
 CUresult cuCtxSetCurrent ( CUcontext ctx ){
     CUresult res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuCtxSetCurrent,ctx);
     if (res!=CUDA_SUCCESS){
-        LOG_ERROR("cuCtxSetCurrent failed res=%d",res);
+        LOG_ERROR("cuCtxSetCurrent111 failed res=%d ctx=%p",res,ctx);
     }
     return res;
 }
diff --git a/src/cuda/device.c b/src/cuda/device.c
@@ -7,9 +7,9 @@
 #include "allocator/allocator.h"
 #include "include/memory_limit.h"
 
-CUresult cuDeviceGetAttribute ( int* pi, CUdevice_attribute attrib, CUdevice dev ) {
+CUresult CUDAAPI cuDeviceGetAttribute ( int* pi, CUdevice_attribute attrib, CUdevice dev ) {
     CUresult res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuDeviceGetAttribute,pi,attrib,dev);
-    LOG_DEBUG("[%d]cuDeviceGetAttribute dev=%d attrib=%d %d",res,dev,(int)attrib,*pi);
+    //LOG_DEBUG("[%d]cuDeviceGetAttribute dev=%d attrib=%d %d",res,dev,(int)attrib,*pi);
     return res;
 }
 
diff --git a/src/cuda/hook.c b/src/cuda/hook.c
@@ -325,6 +325,7 @@ void *find_symbols_in_table(const char *symbol) {
 void *find_symbols_in_table_by_cudaversion(const char *symbol,int  cudaVersion) {
   void *pfn;
   const char *real_symbol;
+  int i;
   real_symbol = get_real_func_name(symbol,cudaVersion);
   if (real_symbol == NULL) {
     // if not find in mulit func version def, use origin logic
@@ -398,6 +399,7 @@ CUresult cuGetProcAddress_v2(const char *symbol, void **pfn, int cudaVersion, cu
         return res;
     }else{
         LOG_DEBUG("found symbol %s",symbol);
-        return CUDA_SUCCESS;
+        void *optr;
+        return CUDA_OVERRIDE_CALL(cuda_library_entry,cuGetProcAddress_v2,symbol,&optr,cudaVersion,flags,symbolStatus);
     } 
 }
diff --git a/src/libvgpu.c b/src/libvgpu.c
@@ -34,7 +34,7 @@ extern int pidfound;
 extern int env_utilization_switch;
 
 /* context size for a certain task, we need to add it into device-memory usage*/
-extern int context_size;
+extern size_t context_size;
 
 /* This is the symbol search function */
 fp_dlsym real_dlsym = NULL;
@@ -75,7 +75,12 @@ FUNC_ATTR_VISIBLE void* dlsym(void* handle, const char* symbol) {
     pthread_once(&dlsym_init_flag,init_dlsym);
     if (real_dlsym == NULL) {
         real_dlsym = dlvsym(RTLD_NEXT,"dlsym","GLIBC_2.2.5");
-        vgpulib = dlopen("/usr/local/vgpu/libvgpu.so",RTLD_LAZY);
+        char *path_search=getenv("CUDA_REDIRECT");
+        if ((path_search!=NULL) && (strlen(path_search)>0)){
+            vgpulib = dlopen(path_search,RTLD_LAZY);
+        }else{
+            vgpulib = dlopen("/usr/local/vgpu/libvgpu.so",RTLD_LAZY);
+        }
         if (real_dlsym == NULL) {
             LOG_ERROR("real dlsym not found");
             real_dlsym = _dl_sym(RTLD_NEXT, "dlsym", dlsym);
@@ -127,8 +132,11 @@ void* __dlsym_hook_section(void* handle, const char* symbol) {
             }
         }
     }
+    DLSYM_HOOK_FUNC(cuInit);
+    DLSYM_HOOK_FUNC(cuGetProcAddress);
+    DLSYM_HOOK_FUNC(cuGetProcAddress_v2);
     //Context
-    DLSYM_HOOK_FUNC(cuCtxGetDevice);
+    //DLSYM_HOOK_FUNC(cuCtxGetDevice);
     DLSYM_HOOK_FUNC(cuCtxCreate_v2);
     DLSYM_HOOK_FUNC(cuCtxCreate_v3);
     DLSYM_HOOK_FUNC(cuDevicePrimaryCtxGetState);
@@ -139,12 +147,10 @@ void* __dlsym_hook_section(void* handle, const char* symbol) {
     DLSYM_HOOK_FUNC(cuDeviceGetTexture1DLinearMaxWidth);
     DLSYM_HOOK_FUNC(cuDeviceSetMemPool);
     DLSYM_HOOK_FUNC(cuFlushGPUDirectRDMAWrites);
-
     DLSYM_HOOK_FUNC(cuCtxDestroy_v2);
     DLSYM_HOOK_FUNC(cuCtxGetApiVersion);
     DLSYM_HOOK_FUNC(cuCtxGetCacheConfig);
     DLSYM_HOOK_FUNC(cuCtxGetCurrent);
-    DLSYM_HOOK_FUNC(cuCtxGetDevice);
     DLSYM_HOOK_FUNC(cuCtxGetFlags);
     DLSYM_HOOK_FUNC(cuCtxGetLimit);
     DLSYM_HOOK_FUNC(cuCtxGetSharedMemConfig);
@@ -158,9 +164,6 @@ void* __dlsym_hook_section(void* handle, const char* symbol) {
     DLSYM_HOOK_FUNC(cuCtxSynchronize);
     //DLSYM_HOOK_FUNC(cuCtxEnablePeerAccess);
     //DLSYM_HOOK_FUNC(cuGetExportTable);
-
-
-    DLSYM_HOOK_FUNC(cuInit);
     DLSYM_HOOK_FUNC(cuArray3DCreate_v2);
     DLSYM_HOOK_FUNC(cuArrayCreate_v2);
     DLSYM_HOOK_FUNC(cuArrayDestroy);
@@ -178,6 +181,8 @@ void* __dlsym_hook_section(void* handle, const char* symbol) {
     DLSYM_HOOK_FUNC(cuStreamCreate);
     DLSYM_HOOK_FUNC(cuStreamDestroy_v2);
     DLSYM_HOOK_FUNC(cuStreamSynchronize);
+    DLSYM_HOOK_FUNC(cuDeviceGet);
+    DLSYM_HOOK_FUNC(cuCtxGetDevice);
     DLSYM_HOOK_FUNC(cuDeviceGetAttribute);
     DLSYM_HOOK_FUNC(cuDeviceGetCount);
     DLSYM_HOOK_FUNC(cuDeviceGet);
@@ -191,7 +196,6 @@ void* __dlsym_hook_section(void* handle, const char* symbol) {
     DLSYM_HOOK_FUNC(cuDeviceGetUuid);
     DLSYM_HOOK_FUNC(cuDeviceGetMemPool);
     DLSYM_HOOK_FUNC(cuDeviceTotalMem_v2);
-
     DLSYM_HOOK_FUNC(cuPointerGetAttributes);
     DLSYM_HOOK_FUNC(cuPointerGetAttribute);
     DLSYM_HOOK_FUNC(cuPointerSetAttribute);
@@ -231,7 +235,6 @@ void* __dlsym_hook_section(void* handle, const char* symbol) {
     DLSYM_HOOK_FUNC(cuMemsetD8_v2);
     DLSYM_HOOK_FUNC(cuMemsetD8Async);
     DLSYM_HOOK_FUNC(cuMemAdvise);
-
     DLSYM_HOOK_FUNC(cuEventCreate);
     DLSYM_HOOK_FUNC(cuEventDestroy_v2);
     DLSYM_HOOK_FUNC(cuModuleLoad);
@@ -248,14 +251,11 @@ void* __dlsym_hook_section(void* handle, const char* symbol) {
     DLSYM_HOOK_FUNC(cuLinkAddFile_v2);
     DLSYM_HOOK_FUNC(cuLinkComplete);
     DLSYM_HOOK_FUNC(cuLinkDestroy);
-
     DLSYM_HOOK_FUNC(cuMemAddressReserve);
     DLSYM_HOOK_FUNC(cuMemCreate);
     DLSYM_HOOK_FUNC(cuMemMap);
     DLSYM_HOOK_FUNC(cuMemAllocAsync);
-    DLSYM_HOOK_FUNC(cuGetProcAddress);
-    DLSYM_HOOK_FUNC(cuGetProcAddress_v2);
-    /* cuda 11.7 new memory ops */
+    // cuda 11.7 new memory ops
     DLSYM_HOOK_FUNC(cuMemHostGetDevicePointer_v2);
     DLSYM_HOOK_FUNC(cuMemHostGetFlags);
     DLSYM_HOOK_FUNC(cuMemPoolTrimTo);
@@ -279,7 +279,7 @@ void* __dlsym_hook_section(void* handle, const char* symbol) {
     DLSYM_HOOK_FUNC(cuMemPrefetchAsync);
     DLSYM_HOOK_FUNC(cuMemRangeGetAttribute);
     DLSYM_HOOK_FUNC(cuMemRangeGetAttributes);
-    /* cuda 11.7 external resource interoperability */
+    // cuda 11.7 external resource interoperability
     DLSYM_HOOK_FUNC(cuImportExternalMemory);
     DLSYM_HOOK_FUNC(cuExternalMemoryGetMappedBuffer);
     DLSYM_HOOK_FUNC(cuExternalMemoryGetMappedMipmappedArray);
@@ -288,7 +288,7 @@ void* __dlsym_hook_section(void* handle, const char* symbol) {
     DLSYM_HOOK_FUNC(cuSignalExternalSemaphoresAsync);
     DLSYM_HOOK_FUNC(cuWaitExternalSemaphoresAsync);
     DLSYM_HOOK_FUNC(cuDestroyExternalSemaphore);
-    /* cuda Graph */
+    // cuda Graph 
     DLSYM_HOOK_FUNC(cuGraphCreate);
     DLSYM_HOOK_FUNC(cuGraphAddKernelNode_v2);
     DLSYM_HOOK_FUNC(cuGraphKernelNodeGetParams_v2);
diff --git a/src/multiprocess/multiprocess_memory_limit.c b/src/multiprocess/multiprocess_memory_limit.c
@@ -46,7 +46,7 @@ static shared_region_info_t region_info = {0, -1, PTHREAD_ONCE_INIT, NULL, 0};
 //size_t initial_offset=117440512;
 int env_utilization_switch;
 int enable_active_oom_killer;
-int context_size;
+size_t context_size;
 size_t initial_offset=0;
 //lock for record kernel time
 pthread_mutex_t _kernel_mutex;
@@ -409,7 +409,7 @@ int add_gpu_device_memory_usage(int32_t pid,int cudadev,size_t usage,int type){
 }
 
 int rm_gpu_device_memory_usage(int32_t pid,int cudadev,size_t usage,int type){
-    LOG_INFO("rm_gpu_device_memory:%d %d->%d %lu",pid,cudadev,cuda_to_nvml_map(cudadev),type);
+    LOG_INFO("rm_gpu_device_memory:%d %d->%d %lu:%lu",pid,cudadev,cuda_to_nvml_map(cudadev),type,usage);
     int dev = cuda_to_nvml_map(cudadev);
     ensure_initialized();
     lock_shrreg();
@@ -430,6 +430,7 @@ int rm_gpu_device_memory_usage(int32_t pid,int cudadev,size_t usage,int type){
                     region_info.shared_region->procs[i].used[dev].data_size -= usage;
                 }
             }
+            LOG_INFO("after delete:%lu",region_info.shared_region->procs[i].used[dev].total);
         }
     }
     unlock_shrreg();
diff --git a/src/multiprocess/multiprocess_utilization_watcher.c b/src/multiprocess/multiprocess_utilization_watcher.c
@@ -122,7 +122,7 @@ int get_used_gpu_utilization(int *userutil,int *sysprocnum) {
     struct timeval cur;
     size_t microsec;
 
-    int i,sum=0;
+    int i;
     unsigned int infcount;
     nvmlProcessInfo_v1_t infos[SHARED_REGION_MAX_PROCESS_NUM];
 
@@ -133,7 +133,6 @@ int get_used_gpu_utilization(int *userutil,int *sysprocnum) {
     int devi,cudadev;
     for (devi=0;devi<nvmlCounts;devi++){
       uint64_t sum=0;
-      uint64_t usedGpuMemory=0;
       infcount = SHARED_REGION_MAX_PROCESS_NUM;
       shrreg_proc_slot_t *proc;
       cudadev = nvml_to_cuda_map((unsigned int)(devi));
@@ -149,7 +148,7 @@ int get_used_gpu_utilization(int *userutil,int *sysprocnum) {
         for (i=0; i<infcount; i++){
           proc = find_proc_by_hostpid(infos[i].pid);
           if (proc != NULL){
-              usedGpuMemory += infos[i].usedGpuMemory;
+              proc->monitorused[cudadev] = infos[i].usedGpuMemory;
           }
         }
       }
@@ -164,17 +163,12 @@ int get_used_gpu_utilization(int *userutil,int *sysprocnum) {
           proc = find_proc_by_hostpid(processes_sample[i].pid);
           if (proc != NULL){
               sum += processes_sample[i].smUtil;
+              proc->device_util[cudadev].sm_util = processes_sample[i].smUtil;
           }
         }
       }
       if (sum < 0)
         sum = 0;
-      if (usedGpuMemory < 0)
-        usedGpuMemory = 0;
-      if (proc != NULL) {
-        proc->device_util[cudadev].sm_util = sum;
-        proc->monitorused[cudadev] = usedGpuMemory;
-      }
       userutil[cudadev] = sum;
     }
     unlock_shrreg();
diff --git a/src/nvml/hook.c b/src/nvml/hook.c
@@ -385,13 +385,17 @@ nvmlReturn_t nvmlDeviceGetNvLinkRemotePciInfo ( nvmlDevice_t device, unsigned in
 }
 
 nvmlReturn_t nvmlDeviceGetHandleByIndex ( unsigned int  index, nvmlDevice_t* device ){
+    nvmlReturn_t res;
     LOG_DEBUG("nvmlDeviceGetHandleByIndex index=%u",index); 
-    return NVML_OVERRIDE_CALL_NO_LOG(nvml_library_entry,nvmlDeviceGetHandleByIndex_v2,index,device);
+    res = NVML_OVERRIDE_CALL_NO_LOG(nvml_library_entry,nvmlDeviceGetHandleByIndex,index,device);
+    return res;
 }
 
 nvmlReturn_t nvmlDeviceGetHandleByIndex_v2 ( unsigned int  index, nvmlDevice_t* device ){
+    nvmlReturn_t res;
     LOG_DEBUG("nvmlDeviceGetHandleByIndex_v2 index=%u",index); 
-    return NVML_OVERRIDE_CALL_NO_LOG(nvml_library_entry,nvmlDeviceGetHandleByIndex_v2,index,device);
+    res = NVML_OVERRIDE_CALL_NO_LOG(nvml_library_entry,nvmlDeviceGetHandleByIndex_v2,index,device);
+    return res;
 }
 
 nvmlReturn_t nvmlDeviceGetHandleByPciBusId_v2 ( const char* pciBusId, nvmlDevice_t* device ) {
diff --git a/src/utils.c b/src/utils.c
@@ -13,7 +13,7 @@
 
 const char* unified_lock="/tmp/vgpulock/lock";
 const int retry_count=20;
-extern int context_size;
+extern size_t context_size;
 extern int cuda_to_nvml_map_array[16];
 
 // 0 unified_lock lock success
@@ -105,7 +105,7 @@ nvmlReturn_t set_task_pid() {
     nvmlDevice_t device;
     nvmlReturn_t res;
     CUcontext pctx;
-    int i;
+    int i,t;
     CHECK_NVML_API(nvmlInit());
     CHECK_NVML_API(nvmlDeviceGetHandleByIndex(0, &device));
     
@@ -127,6 +127,7 @@ nvmlReturn_t set_task_pid() {
             }
         }while(res==NVML_ERROR_INSUFFICIENT_SIZE); 
         mergepid(&previous,&merged_num,(nvmlProcessInfo_t1 *)tmp_pids_on_device,pre_pids_on_device);
+        break;
     }
     previous = merged_num;
     merged_num = 0;
@@ -146,6 +147,7 @@ nvmlReturn_t set_task_pid() {
             }
         }while(res == NVML_ERROR_INSUFFICIENT_SIZE);
         mergepid(&running_processes,&merged_num,(nvmlProcessInfo_t1 *)tmp_pids_on_device,pids_on_device);
+        break;
     }
     running_processes = merged_num;
     LOG_INFO("current processes num = %u %u",previous,running_processes);
diff --git a/test/test_runtime_launch.cu b/test/test_runtime_launch.cu
@@ -11,6 +11,14 @@ __global__ void add(float* a, float* b, float* c) {
     c[idx] = a[idx] + b[idx];
 }
 
+__global__ void computeKernel(double* data, int N, int iterations) {
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid < N) {
+        double temp = 0.0;
+        temp += sin(data[tid]) * cos(data[tid]);
+        data[tid] = temp;
+    }
+}
 
 int main() {
     float *a, *b, *c;
@@ -19,5 +27,26 @@ int main() {
     CHECK_RUNTIME_API(cudaMalloc(&c, 1024 * sizeof(float)));
 
     add<<<1, 1024>>>(a, b, c);
+
+    int N = 1 << 27; 
+    double* d_data;
+
+    cudaMalloc(&d_data, N * sizeof(double));
+
+    int threadsPerBlock = 256;
+    int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;
+
+    int iterations = 1000000; 
+    int num_launches = 100; 
+
+    for (int i = 0; i < num_launches; ++i) {
+        computeKernel<<<blocks, threadsPerBlock>>>(d_data, N, iterations);
+        cudaDeviceSynchronize();  
+    }
+
+    cudaFree(d_data);
+
+    sleep(100);
+    printf("completed");
     return 0;
 }

Original file line number	Diff line number	Diff line change
`@@ -1,7 +1,7 @@`
`1`	`1`	`#include "include/libcuda_hook.h"`
`2`	`2`	`#include "multiprocess/multiprocess_memory_limit.h"`
`3`	`3`
`4`		`-extern int context_size;`
	`4`	`+extern size_t context_size;`
`5`	`5`	`extern int ctx_activate[16];`
`6`	`6`
`7`	`7`
`@@ -12,13 +12,15 @@ CUresult cuDevicePrimaryCtxGetState( CUdevice dev, unsigned int* flags, int* act`
`12`	`12`	`}`
`13`	`13`
`14`	`14`	`CUresult cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev){`
`15`		`- LOG_INFO("dev=%d context_size=%d",dev,context_size);`
	`15`	`+ LOG_INFO("dev=%d context_size=%ld",dev,context_size);`
`16`	`16`	`//for Initialization only`
`17`	`17`	`CUresult res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuDevicePrimaryCtxRetain,pctx,dev);`
`18`	`18`	`if (ctx_activate[dev] == 0) {`
`19`	`19`	`add_gpu_device_memory_usage(getpid(),dev,context_size,0);`
`20`	`20`	`}`
`21`		`- ctx_activate[dev] = 1;`
	`21`	`+ if (context_size>0) {`
	`22`	`+ ctx_activate[dev] = 1;`
	`23`	`+ }`
`22`	`24`	`return res;`
`23`	`25`	`}`
`24`	`26`
`@@ -29,11 +31,11 @@ CUresult cuDevicePrimaryCtxSetFlags_v2( CUdevice dev, unsigned int flags ){`
`29`	`31`	`}`
`30`	`32`
`31`	`33`	`CUresult cuDevicePrimaryCtxRelease_v2( CUdevice dev ){`
`32`		`- CUresult res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuDevicePrimaryCtxRelease_v2,dev);`
`33`	`34`	`if (ctx_activate[dev] == 1) {`
`34`	`35`	`rm_gpu_device_memory_usage(getpid(),dev,context_size,0);`
`35`	`36`	`}`
`36`	`37`	`ctx_activate[dev] = 0;`
	`38`	`+ CUresult res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuDevicePrimaryCtxRelease_v2,dev);`
`37`	`39`	`return res;`
`38`	`40`	`}`
`39`	`41`
`@@ -119,7 +121,7 @@ CUresult cuCtxSetCacheConfig ( CUfunc_cache config ){`
`119`	`121`	`CUresult cuCtxSetCurrent ( CUcontext ctx ){`
`120`	`122`	`CUresult res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuCtxSetCurrent,ctx);`
`121`	`123`	`if (res!=CUDA_SUCCESS){`
`122`		`- LOG_ERROR("cuCtxSetCurrent failed res=%d",res);`
	`124`	`+ LOG_ERROR("cuCtxSetCurrent111 failed res=%d ctx=%p",res,ctx);`
`123`	`125`	`}`
`124`	`126`	`return res;`
`125`	`127`	`}`
Original file line number	Diff line number	Diff line change
`@@ -46,7 +46,7 @@ static shared_region_info_t region_info = {0, -1, PTHREAD_ONCE_INIT, NULL, 0};`
`46`	`46`	`//size_t initial_offset=117440512;`
`47`	`47`	`int env_utilization_switch;`
`48`	`48`	`int enable_active_oom_killer;`
`49`		`-int context_size;`
	`49`	`+size_t context_size;`
`50`	`50`	`size_t initial_offset=0;`
`51`	`51`	`//lock for record kernel time`
`52`	`52`	`pthread_mutex_t _kernel_mutex;`
`@@ -409,7 +409,7 @@ int add_gpu_device_memory_usage(int32_t pid,int cudadev,size_t usage,int type){`
`409`	`409`	`}`
`410`	`410`
`411`	`411`	`int rm_gpu_device_memory_usage(int32_t pid,int cudadev,size_t usage,int type){`
`412`		`- LOG_INFO("rm_gpu_device_memory:%d %d->%d %lu",pid,cudadev,cuda_to_nvml_map(cudadev),type);`
	`412`	`+ LOG_INFO("rm_gpu_device_memory:%d %d->%d %lu:%lu",pid,cudadev,cuda_to_nvml_map(cudadev),type,usage);`
`413`	`413`	`int dev = cuda_to_nvml_map(cudadev);`
`414`	`414`	`ensure_initialized();`
`415`	`415`	`lock_shrreg();`
`@@ -430,6 +430,7 @@ int rm_gpu_device_memory_usage(int32_t pid,int cudadev,size_t usage,int type){`
`430`	`430`	`region_info.shared_region->procs[i].used[dev].data_size -= usage;`
`431`	`431`	`}`
`432`	`432`	`}`
	`433`	`+ LOG_INFO("after delete:%lu",region_info.shared_region->procs[i].used[dev].total);`
`433`	`434`	`}`
`434`	`435`	`}`
`435`	`436`	`unlock_shrreg();`
Original file line number	Diff line number	Diff line change
`@@ -385,13 +385,17 @@ nvmlReturn_t nvmlDeviceGetNvLinkRemotePciInfo ( nvmlDevice_t device, unsigned in`
`385`	`385`	`}`
`386`	`386`
`387`	`387`	`nvmlReturn_t nvmlDeviceGetHandleByIndex ( unsigned int index, nvmlDevice_t* device ){`
	`388`	`+ nvmlReturn_t res;`
`388`	`389`	`LOG_DEBUG("nvmlDeviceGetHandleByIndex index=%u",index);`
`389`		`- return NVML_OVERRIDE_CALL_NO_LOG(nvml_library_entry,nvmlDeviceGetHandleByIndex_v2,index,device);`
	`390`	`+ res = NVML_OVERRIDE_CALL_NO_LOG(nvml_library_entry,nvmlDeviceGetHandleByIndex,index,device);`
	`391`	`+ return res;`
`390`	`392`	`}`
`391`	`393`
`392`	`394`	`nvmlReturn_t nvmlDeviceGetHandleByIndex_v2 ( unsigned int index, nvmlDevice_t* device ){`
	`395`	`+ nvmlReturn_t res;`
`393`	`396`	`LOG_DEBUG("nvmlDeviceGetHandleByIndex_v2 index=%u",index);`
`394`		`- return NVML_OVERRIDE_CALL_NO_LOG(nvml_library_entry,nvmlDeviceGetHandleByIndex_v2,index,device);`
	`397`	`+ res = NVML_OVERRIDE_CALL_NO_LOG(nvml_library_entry,nvmlDeviceGetHandleByIndex_v2,index,device);`
	`398`	`+ return res;`
`395`	`399`	`}`
`396`	`400`
`397`	`401`	`nvmlReturn_t nvmlDeviceGetHandleByPciBusId_v2 ( const char* pciBusId, nvmlDevice_t* device ) {`