Skip to content

Commit 1aa6bdf

Browse files
authored
Merge pull request #103 from Project-HAMi/fix_v2.6.0
Fix v2.6.0
2 parents eb26b57 + a7ba17b commit 1aa6bdf

File tree

10 files changed

+78
-43
lines changed

10 files changed

+78
-43
lines changed

src/allocator/allocator.c

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ int remove_chunk_async(allocated_list *a_list, CUdeviceptr dptr, CUstream hStrea
209209
t_size=val->entry->length;
210210
CUDA_OVERRIDE_CALL(cuda_library_entry,cuMemFreeAsync,dptr,hStream);
211211
LIST_REMOVE(a_list,val);
212-
212+
a_list->limit-=t_size;
213213
CUdevice dev;
214214
cuCtxGetDevice(&dev);
215215
rm_gpu_device_memory_usage(getpid(),dev,t_size,2);
@@ -242,7 +242,6 @@ int add_chunk_async(CUdeviceptr *address,size_t size, CUstream hStream){
242242
LOG_ERROR("cuMemoryAllocate failed res=%d",res);
243243
return res;
244244
}
245-
LIST_ADD(device_allocasync,e);
246245
*address = e->entry->address;
247246
CUmemoryPool pool;
248247
res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuDeviceGetMemPool,&pool,dev);
@@ -257,11 +256,13 @@ int add_chunk_async(CUdeviceptr *address,size_t size, CUstream hStream){
257256
return res;
258257
}
259258
if ((poollimit!=0) && (poollimit> device_allocasync->limit)) {
260-
allocsize = poollimit-device_allocasync->limit;
259+
allocsize = (poollimit-device_allocasync->limit < size)? poollimit-device_allocasync->limit : size;
261260
cuCtxGetDevice(&dev);
262261
add_gpu_device_memory_usage(getpid(),dev,allocsize,2);
263-
device_allocasync->limit=poollimit;
262+
device_allocasync->limit=device_allocasync->limit+allocsize;
263+
e->entry->length=allocsize;
264264
}
265+
LIST_ADD(device_allocasync,e);
265266
return 0;
266267
}
267268

src/cuda/context.c

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#include "include/libcuda_hook.h"
22
#include "multiprocess/multiprocess_memory_limit.h"
33

4-
extern int context_size;
4+
extern size_t context_size;
55
extern int ctx_activate[16];
66

77

@@ -12,13 +12,15 @@ CUresult cuDevicePrimaryCtxGetState( CUdevice dev, unsigned int* flags, int* act
1212
}
1313

1414
CUresult cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev){
15-
LOG_INFO("dev=%d context_size=%d",dev,context_size);
15+
LOG_INFO("dev=%d context_size=%ld",dev,context_size);
1616
//for Initialization only
1717
CUresult res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuDevicePrimaryCtxRetain,pctx,dev);
1818
if (ctx_activate[dev] == 0) {
1919
add_gpu_device_memory_usage(getpid(),dev,context_size,0);
2020
}
21-
ctx_activate[dev] = 1;
21+
if (context_size>0) {
22+
ctx_activate[dev] = 1;
23+
}
2224
return res;
2325
}
2426

@@ -29,11 +31,11 @@ CUresult cuDevicePrimaryCtxSetFlags_v2( CUdevice dev, unsigned int flags ){
2931
}
3032

3133
CUresult cuDevicePrimaryCtxRelease_v2( CUdevice dev ){
32-
CUresult res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuDevicePrimaryCtxRelease_v2,dev);
3334
if (ctx_activate[dev] == 1) {
3435
rm_gpu_device_memory_usage(getpid(),dev,context_size,0);
3536
}
3637
ctx_activate[dev] = 0;
38+
CUresult res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuDevicePrimaryCtxRelease_v2,dev);
3739
return res;
3840
}
3941

@@ -119,7 +121,7 @@ CUresult cuCtxSetCacheConfig ( CUfunc_cache config ){
119121
CUresult cuCtxSetCurrent ( CUcontext ctx ){
120122
CUresult res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuCtxSetCurrent,ctx);
121123
if (res!=CUDA_SUCCESS){
122-
LOG_ERROR("cuCtxSetCurrent failed res=%d",res);
124+
LOG_ERROR("cuCtxSetCurrent111 failed res=%d ctx=%p",res,ctx);
123125
}
124126
return res;
125127
}

src/cuda/device.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@
77
#include "allocator/allocator.h"
88
#include "include/memory_limit.h"
99

10-
CUresult cuDeviceGetAttribute ( int* pi, CUdevice_attribute attrib, CUdevice dev ) {
10+
CUresult CUDAAPI cuDeviceGetAttribute ( int* pi, CUdevice_attribute attrib, CUdevice dev ) {
1111
CUresult res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuDeviceGetAttribute,pi,attrib,dev);
12-
LOG_DEBUG("[%d]cuDeviceGetAttribute dev=%d attrib=%d %d",res,dev,(int)attrib,*pi);
12+
//LOG_DEBUG("[%d]cuDeviceGetAttribute dev=%d attrib=%d %d",res,dev,(int)attrib,*pi);
1313
return res;
1414
}
1515

src/cuda/hook.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,7 @@ void *find_symbols_in_table(const char *symbol) {
325325
void *find_symbols_in_table_by_cudaversion(const char *symbol,int cudaVersion) {
326326
void *pfn;
327327
const char *real_symbol;
328+
int i;
328329
real_symbol = get_real_func_name(symbol,cudaVersion);
329330
if (real_symbol == NULL) {
330331
// if not find in mulit func version def, use origin logic
@@ -398,6 +399,7 @@ CUresult cuGetProcAddress_v2(const char *symbol, void **pfn, int cudaVersion, cu
398399
return res;
399400
}else{
400401
LOG_DEBUG("found symbol %s",symbol);
401-
return CUDA_SUCCESS;
402+
void *optr;
403+
return CUDA_OVERRIDE_CALL(cuda_library_entry,cuGetProcAddress_v2,symbol,&optr,cudaVersion,flags,symbolStatus);
402404
}
403405
}

src/libvgpu.c

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ extern int pidfound;
3434
extern int env_utilization_switch;
3535

3636
/* context size for a certain task, we need to add it into device-memory usage*/
37-
extern int context_size;
37+
extern size_t context_size;
3838

3939
/* This is the symbol search function */
4040
fp_dlsym real_dlsym = NULL;
@@ -75,7 +75,12 @@ FUNC_ATTR_VISIBLE void* dlsym(void* handle, const char* symbol) {
7575
pthread_once(&dlsym_init_flag,init_dlsym);
7676
if (real_dlsym == NULL) {
7777
real_dlsym = dlvsym(RTLD_NEXT,"dlsym","GLIBC_2.2.5");
78-
vgpulib = dlopen("/usr/local/vgpu/libvgpu.so",RTLD_LAZY);
78+
char *path_search=getenv("CUDA_REDIRECT");
79+
if ((path_search!=NULL) && (strlen(path_search)>0)){
80+
vgpulib = dlopen(path_search,RTLD_LAZY);
81+
}else{
82+
vgpulib = dlopen("/usr/local/vgpu/libvgpu.so",RTLD_LAZY);
83+
}
7984
if (real_dlsym == NULL) {
8085
LOG_ERROR("real dlsym not found");
8186
real_dlsym = _dl_sym(RTLD_NEXT, "dlsym", dlsym);
@@ -127,8 +132,11 @@ void* __dlsym_hook_section(void* handle, const char* symbol) {
127132
}
128133
}
129134
}
135+
DLSYM_HOOK_FUNC(cuInit);
136+
DLSYM_HOOK_FUNC(cuGetProcAddress);
137+
DLSYM_HOOK_FUNC(cuGetProcAddress_v2);
130138
//Context
131-
DLSYM_HOOK_FUNC(cuCtxGetDevice);
139+
//DLSYM_HOOK_FUNC(cuCtxGetDevice);
132140
DLSYM_HOOK_FUNC(cuCtxCreate_v2);
133141
DLSYM_HOOK_FUNC(cuCtxCreate_v3);
134142
DLSYM_HOOK_FUNC(cuDevicePrimaryCtxGetState);
@@ -139,12 +147,10 @@ void* __dlsym_hook_section(void* handle, const char* symbol) {
139147
DLSYM_HOOK_FUNC(cuDeviceGetTexture1DLinearMaxWidth);
140148
DLSYM_HOOK_FUNC(cuDeviceSetMemPool);
141149
DLSYM_HOOK_FUNC(cuFlushGPUDirectRDMAWrites);
142-
143150
DLSYM_HOOK_FUNC(cuCtxDestroy_v2);
144151
DLSYM_HOOK_FUNC(cuCtxGetApiVersion);
145152
DLSYM_HOOK_FUNC(cuCtxGetCacheConfig);
146153
DLSYM_HOOK_FUNC(cuCtxGetCurrent);
147-
DLSYM_HOOK_FUNC(cuCtxGetDevice);
148154
DLSYM_HOOK_FUNC(cuCtxGetFlags);
149155
DLSYM_HOOK_FUNC(cuCtxGetLimit);
150156
DLSYM_HOOK_FUNC(cuCtxGetSharedMemConfig);
@@ -158,9 +164,6 @@ void* __dlsym_hook_section(void* handle, const char* symbol) {
158164
DLSYM_HOOK_FUNC(cuCtxSynchronize);
159165
//DLSYM_HOOK_FUNC(cuCtxEnablePeerAccess);
160166
//DLSYM_HOOK_FUNC(cuGetExportTable);
161-
162-
163-
DLSYM_HOOK_FUNC(cuInit);
164167
DLSYM_HOOK_FUNC(cuArray3DCreate_v2);
165168
DLSYM_HOOK_FUNC(cuArrayCreate_v2);
166169
DLSYM_HOOK_FUNC(cuArrayDestroy);
@@ -178,6 +181,8 @@ void* __dlsym_hook_section(void* handle, const char* symbol) {
178181
DLSYM_HOOK_FUNC(cuStreamCreate);
179182
DLSYM_HOOK_FUNC(cuStreamDestroy_v2);
180183
DLSYM_HOOK_FUNC(cuStreamSynchronize);
184+
DLSYM_HOOK_FUNC(cuDeviceGet);
185+
DLSYM_HOOK_FUNC(cuCtxGetDevice);
181186
DLSYM_HOOK_FUNC(cuDeviceGetAttribute);
182187
DLSYM_HOOK_FUNC(cuDeviceGetCount);
183188
DLSYM_HOOK_FUNC(cuDeviceGet);
@@ -191,7 +196,6 @@ void* __dlsym_hook_section(void* handle, const char* symbol) {
191196
DLSYM_HOOK_FUNC(cuDeviceGetUuid);
192197
DLSYM_HOOK_FUNC(cuDeviceGetMemPool);
193198
DLSYM_HOOK_FUNC(cuDeviceTotalMem_v2);
194-
195199
DLSYM_HOOK_FUNC(cuPointerGetAttributes);
196200
DLSYM_HOOK_FUNC(cuPointerGetAttribute);
197201
DLSYM_HOOK_FUNC(cuPointerSetAttribute);
@@ -231,7 +235,6 @@ void* __dlsym_hook_section(void* handle, const char* symbol) {
231235
DLSYM_HOOK_FUNC(cuMemsetD8_v2);
232236
DLSYM_HOOK_FUNC(cuMemsetD8Async);
233237
DLSYM_HOOK_FUNC(cuMemAdvise);
234-
235238
DLSYM_HOOK_FUNC(cuEventCreate);
236239
DLSYM_HOOK_FUNC(cuEventDestroy_v2);
237240
DLSYM_HOOK_FUNC(cuModuleLoad);
@@ -248,14 +251,11 @@ void* __dlsym_hook_section(void* handle, const char* symbol) {
248251
DLSYM_HOOK_FUNC(cuLinkAddFile_v2);
249252
DLSYM_HOOK_FUNC(cuLinkComplete);
250253
DLSYM_HOOK_FUNC(cuLinkDestroy);
251-
252254
DLSYM_HOOK_FUNC(cuMemAddressReserve);
253255
DLSYM_HOOK_FUNC(cuMemCreate);
254256
DLSYM_HOOK_FUNC(cuMemMap);
255257
DLSYM_HOOK_FUNC(cuMemAllocAsync);
256-
DLSYM_HOOK_FUNC(cuGetProcAddress);
257-
DLSYM_HOOK_FUNC(cuGetProcAddress_v2);
258-
/* cuda 11.7 new memory ops */
258+
// cuda 11.7 new memory ops
259259
DLSYM_HOOK_FUNC(cuMemHostGetDevicePointer_v2);
260260
DLSYM_HOOK_FUNC(cuMemHostGetFlags);
261261
DLSYM_HOOK_FUNC(cuMemPoolTrimTo);
@@ -279,7 +279,7 @@ void* __dlsym_hook_section(void* handle, const char* symbol) {
279279
DLSYM_HOOK_FUNC(cuMemPrefetchAsync);
280280
DLSYM_HOOK_FUNC(cuMemRangeGetAttribute);
281281
DLSYM_HOOK_FUNC(cuMemRangeGetAttributes);
282-
/* cuda 11.7 external resource interoperability */
282+
// cuda 11.7 external resource interoperability
283283
DLSYM_HOOK_FUNC(cuImportExternalMemory);
284284
DLSYM_HOOK_FUNC(cuExternalMemoryGetMappedBuffer);
285285
DLSYM_HOOK_FUNC(cuExternalMemoryGetMappedMipmappedArray);
@@ -288,7 +288,7 @@ void* __dlsym_hook_section(void* handle, const char* symbol) {
288288
DLSYM_HOOK_FUNC(cuSignalExternalSemaphoresAsync);
289289
DLSYM_HOOK_FUNC(cuWaitExternalSemaphoresAsync);
290290
DLSYM_HOOK_FUNC(cuDestroyExternalSemaphore);
291-
/* cuda Graph */
291+
// cuda Graph
292292
DLSYM_HOOK_FUNC(cuGraphCreate);
293293
DLSYM_HOOK_FUNC(cuGraphAddKernelNode_v2);
294294
DLSYM_HOOK_FUNC(cuGraphKernelNodeGetParams_v2);

src/multiprocess/multiprocess_memory_limit.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ static shared_region_info_t region_info = {0, -1, PTHREAD_ONCE_INIT, NULL, 0};
4646
//size_t initial_offset=117440512;
4747
int env_utilization_switch;
4848
int enable_active_oom_killer;
49-
int context_size;
49+
size_t context_size;
5050
size_t initial_offset=0;
5151
//lock for record kernel time
5252
pthread_mutex_t _kernel_mutex;
@@ -409,7 +409,7 @@ int add_gpu_device_memory_usage(int32_t pid,int cudadev,size_t usage,int type){
409409
}
410410

411411
int rm_gpu_device_memory_usage(int32_t pid,int cudadev,size_t usage,int type){
412-
LOG_INFO("rm_gpu_device_memory:%d %d->%d %lu",pid,cudadev,cuda_to_nvml_map(cudadev),type);
412+
LOG_INFO("rm_gpu_device_memory:%d %d->%d %lu:%lu",pid,cudadev,cuda_to_nvml_map(cudadev),type,usage);
413413
int dev = cuda_to_nvml_map(cudadev);
414414
ensure_initialized();
415415
lock_shrreg();
@@ -430,6 +430,7 @@ int rm_gpu_device_memory_usage(int32_t pid,int cudadev,size_t usage,int type){
430430
region_info.shared_region->procs[i].used[dev].data_size -= usage;
431431
}
432432
}
433+
LOG_INFO("after delete:%lu",region_info.shared_region->procs[i].used[dev].total);
433434
}
434435
}
435436
unlock_shrreg();

src/multiprocess/multiprocess_utilization_watcher.c

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ int get_used_gpu_utilization(int *userutil,int *sysprocnum) {
122122
struct timeval cur;
123123
size_t microsec;
124124

125-
int i,sum=0;
125+
int i;
126126
unsigned int infcount;
127127
nvmlProcessInfo_v1_t infos[SHARED_REGION_MAX_PROCESS_NUM];
128128

@@ -133,7 +133,6 @@ int get_used_gpu_utilization(int *userutil,int *sysprocnum) {
133133
int devi,cudadev;
134134
for (devi=0;devi<nvmlCounts;devi++){
135135
uint64_t sum=0;
136-
uint64_t usedGpuMemory=0;
137136
infcount = SHARED_REGION_MAX_PROCESS_NUM;
138137
shrreg_proc_slot_t *proc;
139138
cudadev = nvml_to_cuda_map((unsigned int)(devi));
@@ -149,7 +148,7 @@ int get_used_gpu_utilization(int *userutil,int *sysprocnum) {
149148
for (i=0; i<infcount; i++){
150149
proc = find_proc_by_hostpid(infos[i].pid);
151150
if (proc != NULL){
152-
usedGpuMemory += infos[i].usedGpuMemory;
151+
proc->monitorused[cudadev] = infos[i].usedGpuMemory;
153152
}
154153
}
155154
}
@@ -164,17 +163,12 @@ int get_used_gpu_utilization(int *userutil,int *sysprocnum) {
164163
proc = find_proc_by_hostpid(processes_sample[i].pid);
165164
if (proc != NULL){
166165
sum += processes_sample[i].smUtil;
166+
proc->device_util[cudadev].sm_util = processes_sample[i].smUtil;
167167
}
168168
}
169169
}
170170
if (sum < 0)
171171
sum = 0;
172-
if (usedGpuMemory < 0)
173-
usedGpuMemory = 0;
174-
if (proc != NULL) {
175-
proc->device_util[cudadev].sm_util = sum;
176-
proc->monitorused[cudadev] = usedGpuMemory;
177-
}
178172
userutil[cudadev] = sum;
179173
}
180174
unlock_shrreg();

src/nvml/hook.c

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -385,13 +385,17 @@ nvmlReturn_t nvmlDeviceGetNvLinkRemotePciInfo ( nvmlDevice_t device, unsigned in
385385
}
386386

387387
nvmlReturn_t nvmlDeviceGetHandleByIndex ( unsigned int index, nvmlDevice_t* device ){
388+
nvmlReturn_t res;
388389
LOG_DEBUG("nvmlDeviceGetHandleByIndex index=%u",index);
389-
return NVML_OVERRIDE_CALL_NO_LOG(nvml_library_entry,nvmlDeviceGetHandleByIndex_v2,index,device);
390+
res = NVML_OVERRIDE_CALL_NO_LOG(nvml_library_entry,nvmlDeviceGetHandleByIndex,index,device);
391+
return res;
390392
}
391393

392394
nvmlReturn_t nvmlDeviceGetHandleByIndex_v2 ( unsigned int index, nvmlDevice_t* device ){
395+
nvmlReturn_t res;
393396
LOG_DEBUG("nvmlDeviceGetHandleByIndex_v2 index=%u",index);
394-
return NVML_OVERRIDE_CALL_NO_LOG(nvml_library_entry,nvmlDeviceGetHandleByIndex_v2,index,device);
397+
res = NVML_OVERRIDE_CALL_NO_LOG(nvml_library_entry,nvmlDeviceGetHandleByIndex_v2,index,device);
398+
return res;
395399
}
396400

397401
nvmlReturn_t nvmlDeviceGetHandleByPciBusId_v2 ( const char* pciBusId, nvmlDevice_t* device ) {

src/utils.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414
const char* unified_lock="/tmp/vgpulock/lock";
1515
const int retry_count=20;
16-
extern int context_size;
16+
extern size_t context_size;
1717
extern int cuda_to_nvml_map_array[16];
1818

1919
// 0 unified_lock lock success
@@ -105,7 +105,7 @@ nvmlReturn_t set_task_pid() {
105105
nvmlDevice_t device;
106106
nvmlReturn_t res;
107107
CUcontext pctx;
108-
int i;
108+
int i,t;
109109
CHECK_NVML_API(nvmlInit());
110110
CHECK_NVML_API(nvmlDeviceGetHandleByIndex(0, &device));
111111

@@ -127,6 +127,7 @@ nvmlReturn_t set_task_pid() {
127127
}
128128
}while(res==NVML_ERROR_INSUFFICIENT_SIZE);
129129
mergepid(&previous,&merged_num,(nvmlProcessInfo_t1 *)tmp_pids_on_device,pre_pids_on_device);
130+
break;
130131
}
131132
previous = merged_num;
132133
merged_num = 0;
@@ -146,6 +147,7 @@ nvmlReturn_t set_task_pid() {
146147
}
147148
}while(res == NVML_ERROR_INSUFFICIENT_SIZE);
148149
mergepid(&running_processes,&merged_num,(nvmlProcessInfo_t1 *)tmp_pids_on_device,pids_on_device);
150+
break;
149151
}
150152
running_processes = merged_num;
151153
LOG_INFO("current processes num = %u %u",previous,running_processes);

test/test_runtime_launch.cu

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,14 @@ __global__ void add(float* a, float* b, float* c) {
1111
c[idx] = a[idx] + b[idx];
1212
}
1313

14+
__global__ void computeKernel(double* data, int N, int iterations) {
15+
int tid = blockIdx.x * blockDim.x + threadIdx.x;
16+
if (tid < N) {
17+
double temp = 0.0;
18+
temp += sin(data[tid]) * cos(data[tid]);
19+
data[tid] = temp;
20+
}
21+
}
1422

1523
int main() {
1624
float *a, *b, *c;
@@ -19,5 +27,26 @@ int main() {
1927
CHECK_RUNTIME_API(cudaMalloc(&c, 1024 * sizeof(float)));
2028

2129
add<<<1, 1024>>>(a, b, c);
30+
31+
int N = 1 << 27;
32+
double* d_data;
33+
34+
cudaMalloc(&d_data, N * sizeof(double));
35+
36+
int threadsPerBlock = 256;
37+
int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;
38+
39+
int iterations = 1000000;
40+
int num_launches = 100;
41+
42+
for (int i = 0; i < num_launches; ++i) {
43+
computeKernel<<<blocks, threadsPerBlock>>>(d_data, N, iterations);
44+
cudaDeviceSynchronize();
45+
}
46+
47+
cudaFree(d_data);
48+
49+
sleep(100);
50+
printf("completed");
2251
return 0;
2352
}

0 commit comments

Comments
 (0)