diff --git a/src/allocator/allocator.c b/src/allocator/allocator.c index 99215b1..ba264ba 100755 --- a/src/allocator/allocator.c +++ b/src/allocator/allocator.c @@ -11,7 +11,7 @@ size_t OVERSIZE = 134217728; region_list *r_list; allocated_list *device_overallocated; -allocated_list *array_list; +allocated_list *device_allocasync; #define ALIGN 2097152 #define MULTI_PARAM 1 @@ -93,7 +93,9 @@ void allocator_init(){ device_overallocated = malloc(sizeof(allocated_list)); LIST_INIT(device_overallocated); - + device_allocasync=malloc(sizeof(allocated_list)); + LIST_INIT(device_allocasync); + pthread_mutex_init(&mutex,NULL); } @@ -196,13 +198,6 @@ int free_raw(CUdeviceptr dptr){ return tmp; } -int free_raw_async(CUdeviceptr dptr, CUstream hStream){ - pthread_mutex_lock(&mutex); - unsigned int tmp = remove_chunk_async(device_overallocated,dptr,hStream); - pthread_mutex_unlock(&mutex); - return tmp; -} - int remove_chunk_async(allocated_list *a_list, CUdeviceptr dptr, CUstream hStream){ size_t t_size; if (a_list->length==0) { @@ -224,10 +219,9 @@ int remove_chunk_async(allocated_list *a_list, CUdeviceptr dptr, CUstream hStrea return -1; } -int allocate_async_raw(CUdeviceptr *dptr, size_t size, CUstream hStream){ - int tmp; +int free_raw_async(CUdeviceptr dptr, CUstream hStream){ pthread_mutex_lock(&mutex); - tmp = add_chunk_async(dptr,size,hStream); + unsigned int tmp = remove_chunk_async(device_allocasync,dptr,hStream); pthread_mutex_unlock(&mutex); return tmp; } @@ -248,11 +242,33 @@ int add_chunk_async(CUdeviceptr *address,size_t size, CUstream hStream){ LOG_ERROR("cuMemoryAllocate failed res=%d",res); return res; } - LIST_ADD(device_overallocated,e); - //uint64_t t_size; + LIST_ADD(device_allocasync,e); *address = e->entry->address; - allocsize = size; - cuCtxGetDevice(&dev); - add_gpu_device_memory_usage(getpid(),dev,allocsize,2); + CUmemoryPool pool; + res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuDeviceGetMemPool,&pool,dev); + if (res!=CUDA_SUCCESS){ + LOG_ERROR("cuDeviceGetMemPool failed res=%d",res); + return res; + } + size_t poollimit; + res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuMemPoolGetAttribute,pool,CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH,&poollimit); + if (res!=CUDA_SUCCESS) { + LOG_ERROR("cuMemPoolGetAttribute failed res=%d",res); + return res; + } + if ((poollimit!=0) && (poollimit> device_allocasync->limit)) { + allocsize = poollimit-device_allocasync->limit; + cuCtxGetDevice(&dev); + add_gpu_device_memory_usage(getpid(),dev,allocsize,2); + device_allocasync->limit=poollimit; + } return 0; } + +int allocate_async_raw(CUdeviceptr *dptr, size_t size, CUstream hStream){ + int tmp; + pthread_mutex_lock(&mutex); + tmp = add_chunk_async(dptr,size,hStream); + pthread_mutex_unlock(&mutex); + return tmp; +} diff --git a/src/allocator/allocator.h b/src/allocator/allocator.h index c1218fd..2064ee3 100755 --- a/src/allocator/allocator.h +++ b/src/allocator/allocator.h @@ -16,7 +16,7 @@ struct allocated_device_memory_struct{ typedef struct allocated_device_memory_struct allocated_device_memory; struct allocated_list_entry_struct{ - allocated_device_memory *entry;; + allocated_device_memory *entry; struct allocated_list_entry_struct *next,*prev; }; typedef struct allocated_list_entry_struct allocated_list_entry; @@ -25,6 +25,7 @@ struct allocated_list_struct{ allocated_list_entry *head; allocated_list_entry *tail; size_t length; + size_t limit; }; typedef struct allocated_list_struct allocated_list; @@ -55,13 +56,14 @@ typedef struct region_list_struct region_list; extern region_list *r_list; extern allocated_list *device_overallocated; -extern allocated_list *array_list; +extern allocated_list *device_allocasync; extern pthread_mutex_t mutex; #define LIST_INIT(list) { \ list->head=NULL; \ list->tail=NULL; \ list->length=0; \ + list->limit=0; \ } #define __LIST_INIT(list) LIST_INIT(list) diff --git a/src/cuda/context.c b/src/cuda/context.c index 4e4a0ff..fdbfa5d 100755 --- a/src/cuda/context.c +++ b/src/cuda/context.c @@ -2,7 +2,6 @@ #include "multiprocess/multiprocess_memory_limit.h" extern int context_size; -extern int cuda_to_nvml_map[16]; extern int ctx_activate[16]; diff --git a/src/cuda/device.c b/src/cuda/device.c index e870add..2ce1aab 100755 --- a/src/cuda/device.c +++ b/src/cuda/device.c @@ -7,8 +7,6 @@ #include "allocator/allocator.h" #include "include/memory_limit.h" -extern int cuda_to_nvml_map[16]; - CUresult cuDeviceGetAttribute ( int* pi, CUdevice_attribute attrib, CUdevice dev ) { CUresult res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuDeviceGetAttribute,pi,attrib,dev); LOG_DEBUG("[%d]cuDeviceGetAttribute dev=%d attrib=%d %d",res,dev,(int)attrib,*pi); diff --git a/src/cuda/memory.c b/src/cuda/memory.c index 14673a8..1a4bb8a 100755 --- a/src/cuda/memory.c +++ b/src/cuda/memory.c @@ -619,8 +619,9 @@ CUresult cuMemPoolSetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void } CUresult cuMemPoolGetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void *value) { - LOG_DEBUG("cuMemPoolGetAttribute"); - return CUDA_OVERRIDE_CALL(cuda_library_entry,cuMemPoolGetAttribute,pool,attr,value); + CUresult res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuMemPoolGetAttribute,pool,attr,value); + LOG_INFO("cuMemPoolGetAttribute %d %ld",attr,*(long *)value); + return res; } CUresult cuMemPoolSetAccess(CUmemoryPool pool, const CUmemAccessDesc *map, size_t count) { diff --git a/src/include/log_utils.h b/src/include/log_utils.h index 2779a4f..16bafea 100755 --- a/src/include/log_utils.h +++ b/src/include/log_utils.h @@ -8,39 +8,73 @@ #include #include -#ifdef MEMORY_LIMIT_DEBUG -#define LOG_DEBUG(msg, ...) fprintf(stderr, msg"\n", ##__VA_ARGS__); -#else +FILE *fp1; + +#ifdef FILEDEBUG #define LOG_DEBUG(msg, ...) { \ - if ((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=4)) \ - fprintf(stderr, "[HAMI-core Debug(%d:%ld:%s:%d)]: "msg"\n",getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \ + if ((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=4)) {\ + if (fp1==NULL) fp1 = fopen ("/tmp/vgpulog", "a"); \ + fprintf(fp1, "[HAMI-core Debug(%d:%ld:%s:%d)]: "msg"\n",getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \ + }\ } -#endif - #define LOG_INFO(msg, ...) { \ if ( \ /*(getenv("LIBCUDA_LOG_LEVEL")==NULL) || */\ - (getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=3)) \ - fprintf(stderr, "[HAMI-core Info(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \ + (getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=3)) {\ + if (fp1==NULL) fp1 = fopen ("/tmp/vgpulog", "a"); \ + fprintf(fp1, "[HAMI-core Info(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \ + }\ } - #define LOG_WARN(msg, ...) { \ if ( \ (getenv("LIBCUDA_LOG_LEVEL")==NULL) || \ - ((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=2))) \ - fprintf(stderr, "[HAMI-core Warn(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \ + ((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=2))) {\ + if (fp1==NULL) fp1 = fopen ("/tmp/vgpulog", "a"); \ + fprintf(fp1, "[HAMI-core Warn(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \ + }\ } - #define LOG_MSG(msg, ...) { \ if ( \ (getenv("LIBCUDA_LOG_LEVEL")==NULL) || \ - ((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=2))) \ - fprintf(stderr, "[HAMI-core Msg(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \ + ((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=2))) {\ + if (fp1==NULL) fp1 = fopen ("/tmp/vgpulog", "a"); \ + fprintf(fp1, "[HAMI-core Msg(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \ + }\ + } +#define LOG_ERROR(msg, ...) { \ + if (fp1==NULL) fp1 = fopen ("/tmp/vgpulog", "a"); \ + fprintf(fp1, "[HAMI-core ERROR (pid:%d thread=%ld %s:%d)]: "msg"\n", getpid(), pthread_self(), basename(__FILE__),__LINE__, ##__VA_ARGS__); \ +} +#else +#define LOG_DEBUG(msg, ...) { \ + if ((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=4)) {\ + fprintf(stderr, "[HAMI-core Debug(%d:%ld:%s:%d)]: "msg"\n",getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \ + }\ + } +#define LOG_INFO(msg, ...) { \ + if ( \ + (getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=3)) {\ + fprintf(stderr, "[HAMI-core Info(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \ + }\ + } +#define LOG_WARN(msg, ...) { \ + if ( \ + (getenv("LIBCUDA_LOG_LEVEL")==NULL) || \ + ((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=2))) {\ + fprintf(stderr, "[HAMI-core Warn(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \ + }\ + } +#define LOG_MSG(msg, ...) { \ + if ( \ + (getenv("LIBCUDA_LOG_LEVEL")==NULL) || \ + ((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=2))) {\ + fprintf(stderr, "[HAMI-core Msg(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \ + }\ } - #define LOG_ERROR(msg, ...) { \ fprintf(stderr, "[HAMI-core ERROR (pid:%d thread=%ld %s:%d)]: "msg"\n", getpid(), pthread_self(), basename(__FILE__),__LINE__, ##__VA_ARGS__); \ } +#endif #define CHECK_DRV_API(f) { \ CUresult status = (f); \ diff --git a/src/libvgpu.c b/src/libvgpu.c index f645eaf..48bbd01 100644 --- a/src/libvgpu.c +++ b/src/libvgpu.c @@ -30,10 +30,6 @@ pthread_once_t dlsym_init_flag = PTHREAD_ONCE_INIT; where to find its core utilization */ extern int pidfound; -/* cuda_to_nvml_map indicates cuda_visible_devices, we need to map it into nvml_visible_devices, -to let device-memory be counted successfully*/ -extern int cuda_to_nvml_map[16]; - /* used to switch on/off the core utilization limitation*/ extern int env_utilization_switch; @@ -848,14 +844,11 @@ void preInit(){ } void postInit(){ - map_cuda_visible_devices(); allocator_init(); try_lock_unified_lock(); nvmlReturn_t res = set_task_pid(); try_unlock_unified_lock(); - - LOG_MSG("Initialized"); if (res!=NVML_SUCCESS){ LOG_WARN("SET_TASK_PID FAILED."); @@ -863,6 +856,8 @@ void postInit(){ }else{ pidfound=1; } + + map_cuda_visible_devices(); //add_gpu_device_memory_usage(getpid(),0,context_size,0); env_utilization_switch = set_env_utilization_switch(); init_utilization_watcher(); diff --git a/src/multiprocess/multiprocess_memory_limit.c b/src/multiprocess/multiprocess_memory_limit.c index 4b94030..b8a2619 100755 --- a/src/multiprocess/multiprocess_memory_limit.c +++ b/src/multiprocess/multiprocess_memory_limit.c @@ -380,8 +380,9 @@ uint64_t nvml_get_device_memory_usage(const int dev) { return usage; } -int add_gpu_device_memory_usage(int32_t pid,int dev,size_t usage,int type){ - LOG_INFO("add_gpu_device_memory:%d %d %lu",pid,dev,usage); +int add_gpu_device_memory_usage(int32_t pid,int cudadev,size_t usage,int type){ + LOG_INFO("add_gpu_device_memory:%d %d->%d %lu",pid,cudadev,cuda_to_nvml_map(cudadev),usage); + int dev = cuda_to_nvml_map(cudadev); ensure_initialized(); lock_shrreg(); int i; @@ -408,8 +409,9 @@ int add_gpu_device_memory_usage(int32_t pid,int dev,size_t usage,int type){ return 0; } -int rm_gpu_device_memory_usage(int32_t pid,int dev,size_t usage,int type){ - LOG_INFO("rm_gpu_device_memory:%d %d %lu",pid,dev,usage); +int rm_gpu_device_memory_usage(int32_t pid,int cudadev,size_t usage,int type){ + LOG_INFO("rm_gpu_device_memory:%d %d->%d %lu",pid,cudadev,cuda_to_nvml_map(cudadev),type); + int dev = cuda_to_nvml_map(cudadev); ensure_initialized(); lock_shrreg(); int i; diff --git a/src/multiprocess/multiprocess_memory_limit.h b/src/multiprocess/multiprocess_memory_limit.h index 228469f..bb2e09c 100755 --- a/src/multiprocess/multiprocess_memory_limit.h +++ b/src/multiprocess/multiprocess_memory_limit.h @@ -178,6 +178,7 @@ int load_env_from_file(char *filename); int comparelwr(const char *s1,char *s2); int put_device_info(); unsigned int nvml_to_cuda_map(unsigned int nvmldev); +unsigned int cuda_to_nvml_map(unsigned int cudadev); #endif // __MULTIPROCESS_MEMORY_LIMIT_H__ diff --git a/src/multiprocess/multiprocess_utilization_watcher.c b/src/multiprocess/multiprocess_utilization_watcher.c index 0fad5fd..91d2b72 100644 --- a/src/multiprocess/multiprocess_utilization_watcher.c +++ b/src/multiprocess/multiprocess_utilization_watcher.c @@ -29,7 +29,7 @@ static int g_max_thread_per_sm; static volatile long g_cur_cuda_cores = 0; static volatile long g_total_cuda_cores = 0; extern int pidfound; -int cuda_to_nvml_map[16]; +int cuda_to_nvml_map_array[16]; void rate_limiter(int grids, int blocks) { long before_cuda_cores = 0; @@ -100,12 +100,16 @@ unsigned int nvml_to_cuda_map(unsigned int nvmldev){ CHECK_NVML_API(nvmlDeviceGetCount_v2(&devcount)); int i=0; for (i=0;imonitorused[cudadev] += infos[i].usedGpuMemory; } } @@ -161,7 +165,7 @@ int get_used_gpu_utilization(int *userutil,int *sysprocnum) { proc = find_proc_by_hostpid(processes_sample[i].pid); if (proc != NULL){ sum += processes_sample[i].smUtil; - LOG_DEBUG("pid=%u smUtil=%d\n", processes_sample[i].pid, processes_sample[i].smUtil); + //LOG_DEBUG("pid=%u smUtil=%d\n", processes_sample[i].pid, processes_sample[i].smUtil); proc->device_util[cudadev].sm_util += processes_sample[i].smUtil; } } diff --git a/src/nvml/hook.c b/src/nvml/hook.c index 7d8bfb9..0d0894e 100644 --- a/src/nvml/hook.c +++ b/src/nvml/hook.c @@ -260,7 +260,7 @@ pthread_once_t init_virtual_map_post_flag = PTHREAD_ONCE_INIT; typedef void* (*fp_dlsym)(void*, const char*); extern fp_dlsym real_dlsym; extern int virtual_nvml_devices; -extern int cuda_to_nvml_map[16]; +extern int cuda_to_nvml_map_array[16]; nvmlReturn_t nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int *index) { return NVML_OVERRIDE_CALL(nvml_library_entry, nvmlDeviceGetIndex, device, index); @@ -307,7 +307,7 @@ void nvml_preInit() { load_nvml_libraries(); int i; for (i=0; i<16; i++) { - cuda_to_nvml_map[i] = i; + cuda_to_nvml_map_array[i] = i; } } diff --git a/src/utils.c b/src/utils.c index e4e8906..47a1067 100755 --- a/src/utils.c +++ b/src/utils.c @@ -14,7 +14,7 @@ const char* unified_lock="/tmp/vgpulock/lock"; const int retry_count=20; extern int context_size; -extern int cuda_to_nvml_map[16]; +extern int cuda_to_nvml_map_array[16]; // 0 unified_lock lock success // -1 unified_lock lock fail @@ -177,20 +177,20 @@ int parse_cuda_visible_env() { char *s = getenv("CUDA_VISIBLE_DEVICES"); count = 0; for (i=0; i<16; i++) { - cuda_to_nvml_map[i] = i; + cuda_to_nvml_map_array[i] = i; } if (need_cuda_virtualize()) { for (i=0; i %d",i,cuda_to_nvml_map[i]); + LOG_INFO("device %d -> %d",i,cuda_to_nvml_map(i)); } LOG_DEBUG("get default cuda from %s",getenv("CUDA_VISIBLE_DEVICES")); return count;