Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 33 additions & 17 deletions src/allocator/allocator.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ size_t OVERSIZE = 134217728;

/* Allocator-wide bookkeeping state shared by the functions in this file. */
region_list *r_list;
/* Allocated and reset with LIST_INIT in allocator_init(). */
allocated_list *device_overallocated;
allocated_list *array_list;
/* Chunks added by add_chunk_async(); allocated and reset in allocator_init().
 * Its `limit` field is raised by add_chunk_async() to the memory-pool
 * reserved-high value it has already reported to the usage accounting. */
allocated_list *device_allocasync;

/* 2097152 = 2 MiB. NOTE(review): presumably the allocation alignment -- confirm against its users. */
#define ALIGN 2097152
#define MULTI_PARAM 1
Expand Down Expand Up @@ -93,7 +93,9 @@ void allocator_init(){

device_overallocated = malloc(sizeof(allocated_list));
LIST_INIT(device_overallocated);

device_allocasync=malloc(sizeof(allocated_list));
LIST_INIT(device_allocasync);

pthread_mutex_init(&mutex,NULL);
}

Expand Down Expand Up @@ -196,13 +198,6 @@ int free_raw(CUdeviceptr dptr){
return tmp;
}

/*
 * Release a chunk that was allocated asynchronously.
 *
 * Holds the global allocator mutex while remove_chunk_async() looks up
 * `dptr` in device_overallocated and releases it on stream `hStream`.
 *
 * Returns the value produced by remove_chunk_async() unchanged; that
 * function has at least one path that returns -1.
 */
int free_raw_async(CUdeviceptr dptr, CUstream hStream){
    pthread_mutex_lock(&mutex);
    /* Keep the result signed: remove_chunk_async() returns int and can
     * return -1, so an unsigned temporary would round-trip through an
     * implementation-defined conversion on the way out. */
    int tmp = remove_chunk_async(device_overallocated,dptr,hStream);
    pthread_mutex_unlock(&mutex);
    return tmp;
}

int remove_chunk_async(allocated_list *a_list, CUdeviceptr dptr, CUstream hStream){
size_t t_size;
if (a_list->length==0) {
Expand All @@ -224,10 +219,9 @@ int remove_chunk_async(allocated_list *a_list, CUdeviceptr dptr, CUstream hStrea
return -1;
}

int allocate_async_raw(CUdeviceptr *dptr, size_t size, CUstream hStream){
int tmp;
int free_raw_async(CUdeviceptr dptr, CUstream hStream){
pthread_mutex_lock(&mutex);
tmp = add_chunk_async(dptr,size,hStream);
unsigned int tmp = remove_chunk_async(device_allocasync,dptr,hStream);
pthread_mutex_unlock(&mutex);
return tmp;
}
Expand All @@ -248,11 +242,33 @@ int add_chunk_async(CUdeviceptr *address,size_t size, CUstream hStream){
LOG_ERROR("cuMemoryAllocate failed res=%d",res);
return res;
}
LIST_ADD(device_overallocated,e);
//uint64_t t_size;
LIST_ADD(device_allocasync,e);
*address = e->entry->address;
allocsize = size;
cuCtxGetDevice(&dev);
add_gpu_device_memory_usage(getpid(),dev,allocsize,2);
CUmemoryPool pool;
res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuDeviceGetMemPool,&pool,dev);
if (res!=CUDA_SUCCESS){
LOG_ERROR("cuDeviceGetMemPool failed res=%d",res);
return res;
}
size_t poollimit;
res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuMemPoolGetAttribute,pool,CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH,&poollimit);
if (res!=CUDA_SUCCESS) {
LOG_ERROR("cuMemPoolGetAttribute failed res=%d",res);
return res;
}
if ((poollimit!=0) && (poollimit> device_allocasync->limit)) {
allocsize = poollimit-device_allocasync->limit;
cuCtxGetDevice(&dev);
add_gpu_device_memory_usage(getpid(),dev,allocsize,2);
device_allocasync->limit=poollimit;
}
return 0;
}

/*
 * Mutex-protected entry point for asynchronous allocation.
 *
 * Runs add_chunk_async() for `size` on `hStream` while the global allocator
 * mutex is held and passes its status code back to the caller. On success
 * add_chunk_async() stores the new address through `dptr`.
 */
int allocate_async_raw(CUdeviceptr *dptr, size_t size, CUstream hStream){
    pthread_mutex_lock(&mutex);
    const int status = add_chunk_async(dptr,size,hStream);
    pthread_mutex_unlock(&mutex);
    return status;
}
6 changes: 4 additions & 2 deletions src/allocator/allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ struct allocated_device_memory_struct{
typedef struct allocated_device_memory_struct allocated_device_memory;

/*
 * Node of an allocated_list. `entry` points at the tracked allocation
 * record; `next` and `prev` link the node to its neighbours.
 */
struct allocated_list_entry_struct{
    allocated_device_memory *entry;
    struct allocated_list_entry_struct *next,*prev;
};
typedef struct allocated_list_entry_struct allocated_list_entry;
Expand All @@ -25,6 +25,7 @@ struct allocated_list_struct{
allocated_list_entry *head;
allocated_list_entry *tail;
size_t length;
size_t limit;
};
typedef struct allocated_list_struct allocated_list;

Expand Down Expand Up @@ -55,13 +56,14 @@ typedef struct region_list_struct region_list;

/* Allocator bookkeeping lists shared across translation units. */
extern region_list *r_list;
/* Allocated and reset in allocator_init(). */
extern allocated_list *device_overallocated;
extern allocated_list *array_list;
/* Chunks added by add_chunk_async(); allocated and reset in allocator_init(). */
extern allocated_list *device_allocasync;
/* Initialised in allocator_init(); held around the raw allocate/free helpers. */
extern pthread_mutex_t mutex;

/*
 * Reset a list structure to its empty state: NULL head and tail, zero
 * length and zero limit.
 *
 * `list` is a pointer expression. It is parenthesised in the expansion so
 * that arguments such as `&obj` bind correctly. The argument is evaluated
 * once per field, so it must not have side effects.
 */
#define LIST_INIT(list) { \
    (list)->head=NULL; \
    (list)->tail=NULL; \
    (list)->length=0; \
    (list)->limit=0; \
}
#define __LIST_INIT(list) LIST_INIT(list)

Expand Down
1 change: 0 additions & 1 deletion src/cuda/context.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
#include "multiprocess/multiprocess_memory_limit.h"

extern int context_size;
extern int cuda_to_nvml_map[16];
extern int ctx_activate[16];


Expand Down
2 changes: 0 additions & 2 deletions src/cuda/device.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@
#include "allocator/allocator.h"
#include "include/memory_limit.h"

extern int cuda_to_nvml_map[16];

CUresult cuDeviceGetAttribute ( int* pi, CUdevice_attribute attrib, CUdevice dev ) {
CUresult res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuDeviceGetAttribute,pi,attrib,dev);
LOG_DEBUG("[%d]cuDeviceGetAttribute dev=%d attrib=%d %d",res,dev,(int)attrib,*pi);
Expand Down
5 changes: 3 additions & 2 deletions src/cuda/memory.c
Original file line number Diff line number Diff line change
Expand Up @@ -619,8 +619,9 @@ CUresult cuMemPoolSetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void
}

/*
 * Pass-through hook for cuMemPoolGetAttribute.
 *
 * Forwards the call to the real driver entry point and returns its result
 * unchanged; on success the driver has written the attribute to `value`.
 *
 * Only the attribute id and the result code are logged. The storage behind
 * `value` is deliberately not read here: its width depends on `attr` (some
 * pool attributes are int-sized, others 64-bit), and it is not guaranteed
 * to be valid when the driver call fails or when the caller passed NULL.
 */
CUresult cuMemPoolGetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void *value) {
    CUresult res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuMemPoolGetAttribute,pool,attr,value);
    LOG_INFO("cuMemPoolGetAttribute attr=%d res=%d",(int)attr,(int)res);
    return res;
}

CUresult cuMemPoolSetAccess(CUmemoryPool pool, const CUmemAccessDesc *map, size_t count) {
Expand Down
66 changes: 50 additions & 16 deletions src/include/log_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,39 +8,73 @@
#include <stdlib.h>
#include <stdio.h>

#ifdef MEMORY_LIMIT_DEBUG
#define LOG_DEBUG(msg, ...) fprintf(stderr, msg"\n", ##__VA_ARGS__);
#else
FILE *fp1;

#ifdef FILEDEBUG
#define LOG_DEBUG(msg, ...) { \
if ((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=4)) \
fprintf(stderr, "[HAMI-core Debug(%d:%ld:%s:%d)]: "msg"\n",getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \
if ((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=4)) {\
if (fp1==NULL) fp1 = fopen ("/tmp/vgpulog", "a"); \
fprintf(fp1, "[HAMI-core Debug(%d:%ld:%s:%d)]: "msg"\n",getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \
}\
}
#endif

#define LOG_INFO(msg, ...) { \
if ( \
/*(getenv("LIBCUDA_LOG_LEVEL")==NULL) || */\
(getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=3)) \
fprintf(stderr, "[HAMI-core Info(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \
(getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=3)) {\
if (fp1==NULL) fp1 = fopen ("/tmp/vgpulog", "a"); \
fprintf(fp1, "[HAMI-core Info(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \
}\
}

#define LOG_WARN(msg, ...) { \
if ( \
(getenv("LIBCUDA_LOG_LEVEL")==NULL) || \
((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=2))) \
fprintf(stderr, "[HAMI-core Warn(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \
((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=2))) {\
if (fp1==NULL) fp1 = fopen ("/tmp/vgpulog", "a"); \
fprintf(fp1, "[HAMI-core Warn(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \
}\
}

#define LOG_MSG(msg, ...) { \
if ( \
(getenv("LIBCUDA_LOG_LEVEL")==NULL) || \
((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=2))) \
fprintf(stderr, "[HAMI-core Msg(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \
((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=2))) {\
if (fp1==NULL) fp1 = fopen ("/tmp/vgpulog", "a"); \
fprintf(fp1, "[HAMI-core Msg(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \
}\
}
#define LOG_ERROR(msg, ...) { \
if (fp1==NULL) fp1 = fopen ("/tmp/vgpulog", "a"); \
fprintf(fp1, "[HAMI-core ERROR (pid:%d thread=%ld %s:%d)]: "msg"\n", getpid(), pthread_self(), basename(__FILE__),__LINE__, ##__VA_ARGS__); \
}
#else
/*
 * stderr logging macros. Each line is prefixed with the level name, the
 * process id, pthread_self(), the basename of the source file and the line
 * number. Verbosity is read from the LIBCUDA_LOG_LEVEL environment variable
 * on every invocation.
 */
/* Debug: emitted only when LIBCUDA_LOG_LEVEL is set and >= 4. */
#define LOG_DEBUG(msg, ...) { \
if ((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=4)) {\
fprintf(stderr, "[HAMI-core Debug(%d:%ld:%s:%d)]: "msg"\n",getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \
}\
}
/* Info: emitted only when LIBCUDA_LOG_LEVEL is set and >= 3. */
#define LOG_INFO(msg, ...) { \
if ( \
(getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=3)) {\
fprintf(stderr, "[HAMI-core Info(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \
}\
}
/* Warn: emitted when LIBCUDA_LOG_LEVEL is unset, or set and >= 2. */
#define LOG_WARN(msg, ...) { \
if ( \
(getenv("LIBCUDA_LOG_LEVEL")==NULL) || \
((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=2))) {\
fprintf(stderr, "[HAMI-core Warn(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \
}\
}
/* Msg: same condition as LOG_WARN (unset, or set and >= 2). */
#define LOG_MSG(msg, ...) { \
if ( \
(getenv("LIBCUDA_LOG_LEVEL")==NULL) || \
((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=2))) {\
fprintf(stderr, "[HAMI-core Msg(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \
}\
}

/* Error: always emitted, regardless of LIBCUDA_LOG_LEVEL. */
#define LOG_ERROR(msg, ...) { \
fprintf(stderr, "[HAMI-core ERROR (pid:%d thread=%ld %s:%d)]: "msg"\n", getpid(), pthread_self(), basename(__FILE__),__LINE__, ##__VA_ARGS__); \
}
#endif

#define CHECK_DRV_API(f) { \
CUresult status = (f); \
Expand Down
9 changes: 2 additions & 7 deletions src/libvgpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,6 @@ pthread_once_t dlsym_init_flag = PTHREAD_ONCE_INIT;
where to find its core utilization */
extern int pidfound;

/* cuda_to_nvml_map indicates cuda_visible_devices, we need to map it into nvml_visible_devices,
to let device-memory be counted successfully*/
extern int cuda_to_nvml_map[16];

/* used to switch on/off the core utilization limitation*/
extern int env_utilization_switch;

Expand Down Expand Up @@ -848,21 +844,20 @@ void preInit(){
}

void postInit(){
map_cuda_visible_devices();
allocator_init();

try_lock_unified_lock();
nvmlReturn_t res = set_task_pid();
try_unlock_unified_lock();


LOG_MSG("Initialized");
if (res!=NVML_SUCCESS){
LOG_WARN("SET_TASK_PID FAILED.");
pidfound=0;
}else{
pidfound=1;
}

map_cuda_visible_devices();
//add_gpu_device_memory_usage(getpid(),0,context_size,0);
env_utilization_switch = set_env_utilization_switch();
init_utilization_watcher();
Expand Down
10 changes: 6 additions & 4 deletions src/multiprocess/multiprocess_memory_limit.c
Original file line number Diff line number Diff line change
Expand Up @@ -380,8 +380,9 @@ uint64_t nvml_get_device_memory_usage(const int dev) {
return usage;
}

int add_gpu_device_memory_usage(int32_t pid,int dev,size_t usage,int type){
LOG_INFO("add_gpu_device_memory:%d %d %lu",pid,dev,usage);
int add_gpu_device_memory_usage(int32_t pid,int cudadev,size_t usage,int type){
LOG_INFO("add_gpu_device_memory:%d %d->%d %lu",pid,cudadev,cuda_to_nvml_map(cudadev),usage);
int dev = cuda_to_nvml_map(cudadev);
ensure_initialized();
lock_shrreg();
int i;
Expand All @@ -408,8 +409,9 @@ int add_gpu_device_memory_usage(int32_t pid,int dev,size_t usage,int type){
return 0;
}

int rm_gpu_device_memory_usage(int32_t pid,int dev,size_t usage,int type){
LOG_INFO("rm_gpu_device_memory:%d %d %lu",pid,dev,usage);
int rm_gpu_device_memory_usage(int32_t pid,int cudadev,size_t usage,int type){
LOG_INFO("rm_gpu_device_memory:%d %d->%d %lu",pid,cudadev,cuda_to_nvml_map(cudadev),type);
int dev = cuda_to_nvml_map(cudadev);
ensure_initialized();
lock_shrreg();
int i;
Expand Down
1 change: 1 addition & 0 deletions src/multiprocess/multiprocess_memory_limit.h
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@ int load_env_from_file(char *filename);
int comparelwr(const char *s1,char *s2);
int put_device_info();
/* Reverse lookup: returns the index i for which cuda_to_nvml_map(i) equals
 * `nvmldev`, or -1 (converted to unsigned int) when none matches. */
unsigned int nvml_to_cuda_map(unsigned int nvmldev);
/* Returns the cuda_to_nvml_map_array entry for `cudadev`. */
unsigned int cuda_to_nvml_map(unsigned int cudadev);

#endif // __MULTIPROCESS_MEMORY_LIMIT_H__

12 changes: 8 additions & 4 deletions src/multiprocess/multiprocess_utilization_watcher.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ static int g_max_thread_per_sm;
static volatile long g_cur_cuda_cores = 0;
static volatile long g_total_cuda_cores = 0;
extern int pidfound;
int cuda_to_nvml_map[16];
int cuda_to_nvml_map_array[16];

void rate_limiter(int grids, int blocks) {
long before_cuda_cores = 0;
Expand Down Expand Up @@ -100,12 +100,16 @@ unsigned int nvml_to_cuda_map(unsigned int nvmldev){
CHECK_NVML_API(nvmlDeviceGetCount_v2(&devcount));
int i=0;
for (i=0;i<devcount;i++){
if (cuda_to_nvml_map[i]==nvmldev)
if (cuda_to_nvml_map(i)==nvmldev)
return i;
}
return -1;
}

/*
 * Translate a CUDA device ordinal into the corresponding NVML device index
 * by looking it up in cuda_to_nvml_map_array.
 *
 * Returns the mapped index. When `cudadev` lies outside the table the
 * function returns (unsigned int)-1 -- the same "not found" value that
 * nvml_to_cuda_map() produces -- instead of reading past the array.
 */
unsigned int cuda_to_nvml_map(unsigned int cudadev){
    const unsigned int table_len =
        sizeof(cuda_to_nvml_map_array) / sizeof(cuda_to_nvml_map_array[0]);

    if (cudadev >= table_len) {
        return (unsigned int)-1;
    }
    return cuda_to_nvml_map_array[cudadev];
}

int setspec() {
CHECK_NVML_API(nvmlInit());
CHECK_CU_RESULT(cuDeviceGetAttribute(&g_sm_num,CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,0));
Expand Down Expand Up @@ -144,7 +148,7 @@ int get_used_gpu_utilization(int *userutil,int *sysprocnum) {
for (i=0; i<infcount; i++){
proc = find_proc_by_hostpid(infos[i].pid);
if (proc != NULL){
LOG_DEBUG("pid=%u monitor=%lld\n", infos[i].pid, infos[i].usedGpuMemory);
//LOG_DEBUG("pid=%u monitor=%lld\n", infos[i].pid, infos[i].usedGpuMemory);
proc->monitorused[cudadev] += infos[i].usedGpuMemory;
}
}
Expand All @@ -161,7 +165,7 @@ int get_used_gpu_utilization(int *userutil,int *sysprocnum) {
proc = find_proc_by_hostpid(processes_sample[i].pid);
if (proc != NULL){
sum += processes_sample[i].smUtil;
LOG_DEBUG("pid=%u smUtil=%d\n", processes_sample[i].pid, processes_sample[i].smUtil);
//LOG_DEBUG("pid=%u smUtil=%d\n", processes_sample[i].pid, processes_sample[i].smUtil);
proc->device_util[cudadev].sm_util += processes_sample[i].smUtil;
}
}
Expand Down
4 changes: 2 additions & 2 deletions src/nvml/hook.c
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ pthread_once_t init_virtual_map_post_flag = PTHREAD_ONCE_INIT;
typedef void* (*fp_dlsym)(void*, const char*);
extern fp_dlsym real_dlsym;
extern int virtual_nvml_devices;
extern int cuda_to_nvml_map[16];
extern int cuda_to_nvml_map_array[16];

/*
 * Hooked nvmlDeviceGetIndex: hands `device` and `index` to the real NVML
 * implementation through NVML_OVERRIDE_CALL and returns its status as-is.
 */
nvmlReturn_t nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int *index) {
    nvmlReturn_t status = NVML_OVERRIDE_CALL(nvml_library_entry, nvmlDeviceGetIndex, device, index);
    return status;
}
load_nvml_libraries();
int i;
for (i=0; i<16; i++) {
cuda_to_nvml_map[i] = i;
cuda_to_nvml_map_array[i] = i;
}
}

Expand Down
Loading