Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 17 additions & 5 deletions src/multiprocess/multiprocess_memory_limit.c
Original file line number Diff line number Diff line change
Expand Up @@ -119,16 +119,16 @@ size_t get_limit_from_env(const char* env_name) {
return scaled_res;
}

int put_device_info() {
int init_device_info() {
unsigned int i,nvmlDevicesCount;
CHECK_NVML_API(nvmlDeviceGetCount_v2(&nvmlDevicesCount));
LOG_INFO("put_device_info finished %d",nvmlDevicesCount);
region_info.shared_region->num=nvmlDevicesCount;
region_info.shared_region->device_num=nvmlDevicesCount;
nvmlDevice_t dev;
for(i=0;i<nvmlDevicesCount;i++){
CHECK_NVML_API(nvmlDeviceGetHandleByIndex(i, &dev));
CHECK_NVML_API(nvmlDeviceGetUUID(dev,region_info.shared_region->uuids[i],96));
}
LOG_INFO("put_device_info finished %d",nvmlDevicesCount);
return 0;
}

Expand Down Expand Up @@ -332,13 +332,14 @@ int set_gpu_device_sm_utilization(int32_t pid,int dev, unsigned int smUtil){ //
return 1;
}

int init_gpu_device_sm_utilization(){
int init_gpu_device_utilization(){
int i,dev;
ensure_initialized();
lock_shrreg();
for (i=0;i<region_info.shared_region->proc_num;i++){
for (dev=0;dev<CUDA_DEVICE_MAX_COUNT;dev++){
region_info.shared_region->procs[i].device_util[dev].sm_util = 0;
region_info.shared_region->procs[i].monitorused[dev] = 0;
break;
}
}
Expand Down Expand Up @@ -722,11 +723,11 @@ void try_create_shrreg() {
if (lockf(fd, F_LOCK, SHARED_REGION_SIZE_MAGIC) != 0) {
LOG_ERROR("Fail to lock shrreg %s: errno=%d", shr_reg_file, errno);
}
put_device_info();
if (region->initialized_flag !=
MULTIPROCESS_SHARED_REGION_MAGIC_FLAG) {
region->major_version = MAJOR_VERSION;
region->minor_version = MINOR_VERSION;
init_device_info();
do_init_device_memory_limits(
region->limit, CUDA_DEVICE_MAX_COUNT);
do_init_device_sm_limits(
Expand Down Expand Up @@ -955,6 +956,17 @@ shrreg_proc_slot_t *find_proc_by_hostpid(int hostpid) {
return NULL;
}

/*
 * Dump the per-process bookkeeping held in the shared region to the log.
 *
 * Logs the number of registered processes first, then one line per
 * (process slot, device index) pair for every device index in
 * [0, CUDA_DEVICE_MAX_COUNT). Each line carries, in order: the slot's pid,
 * its hostpid, device_util[dev].sm_util ("sm"), monitorused[dev] ("memory")
 * and used[dev].total ("record"). The device index itself is not printed.
 *
 * This function takes no lock itself; proc_num is re-read from the shared
 * region on every outer iteration.
 *
 * NOTE(review): every field is formatted with %d. The field types are not
 * visible from here; monitorused is accumulated from NVML's usedGpuMemory
 * elsewhere, which suggests a wider integer type -- confirm the specifiers.
 */
void print_all() {
    int slot = 0;

    LOG_INFO("Total process: %d",region_info.shared_region->proc_num);

    while (slot < region_info.shared_region->proc_num) {
        int gpu;

        /* One log line per device index for this process slot. */
        for (gpu = 0; gpu < CUDA_DEVICE_MAX_COUNT; ++gpu) {
            LOG_INFO("Process %d hostPid: %d, sm: %d, memory: %d, record: %d",
                     region_info.shared_region->procs[slot].pid,
                     region_info.shared_region->procs[slot].hostpid,
                     region_info.shared_region->procs[slot].device_util[gpu].sm_util,
                     region_info.shared_region->procs[slot].monitorused[gpu],
                     region_info.shared_region->procs[slot].used[gpu].total);
        }
        ++slot;
    }
}

int comparelwr(const char *s1,char *s2){
if ((s1==NULL) || (s2==NULL))
return 1;
Expand Down
5 changes: 3 additions & 2 deletions src/multiprocess/multiprocess_memory_limit.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ typedef struct {
int32_t sm_init_flag;
size_t owner_pid;
sem_t sem;
uint64_t num;
uint64_t device_num;
uuid uuids[CUDA_DEVICE_MAX_COUNT];
uint64_t limit[CUDA_DEVICE_MAX_COUNT];
uint64_t sm_limit[CUDA_DEVICE_MAX_COUNT];
Expand Down Expand Up @@ -143,7 +143,7 @@ int set_env_utilization_switch();

int set_gpu_device_memory_monitor(int32_t pid,int dev,size_t monitor);
int set_gpu_device_sm_utilization(int32_t pid,int dev, unsigned int smUtil);
int init_gpu_device_sm_utilization();
int init_gpu_device_utilization();
int add_gpu_device_memory_usage(int32_t pid,int dev,size_t usage,int type);
int rm_gpu_device_memory_usage(int32_t pid,int dev,size_t usage,int type);

Expand Down Expand Up @@ -172,6 +172,7 @@ void suspend_all();
void resume_all();
int wait_status_self(int status);
int wait_status_all(int status);
void print_all();

int load_env_from_file(char *filename);
int comparelwr(const char *s1,char *s2);
Expand Down
60 changes: 25 additions & 35 deletions src/multiprocess/multiprocess_utilization_watcher.c
Original file line number Diff line number Diff line change
Expand Up @@ -120,70 +120,58 @@ int get_used_gpu_utilization(int *userutil,int *sysprocnum) {

int i,sum=0;
unsigned int infcount;
size_t summonitor=0;
nvmlProcessInfo_v1_t infos[SHARED_REGION_MAX_PROCESS_NUM];

unsigned int nvmlCounts;
CHECK_NVML_API(nvmlDeviceGetCount(&nvmlCounts));

lock_shrreg();

int devi,cudadev;
for (devi=0;devi<nvmlCounts;devi++){
sum=0;
summonitor=0;
infcount = SHARED_REGION_MAX_PROCESS_NUM;
shrreg_proc_slot_t *proc;
cudadev = nvml_to_cuda_map((unsigned int)(devi));
if (cudadev<0)
continue;
nvmlDevice_t device;
char uuid[NVML_DEVICE_UUID_BUFFER_SIZE];
CHECK_NVML_API(nvmlDeviceGetHandleByIndex(cudadev, &device));
// Get device UUID
CHECK_NVML_API(nvmlDeviceGetUUID(device, uuid, NVML_DEVICE_UUID_BUFFER_SIZE));

// Get Memory for container
nvmlReturn_t res = nvmlDeviceGetComputeRunningProcesses(device,&infcount,infos);
if (res==NVML_ERROR_INSUFFICIENT_SIZE){
continue;
if (res == NVML_SUCCESS) {
for (i=0; i<infcount; i++){
proc = find_proc_by_hostpid(infos[i].pid);
if (proc != NULL){
LOG_DEBUG("pid=%u monitor=%lld\n", infos[i].pid, infos[i].usedGpuMemory);
proc->monitorused[cudadev] += infos[i].usedGpuMemory;
}
}
}

// Get SM util for container
gettimeofday(&cur,NULL);
microsec = (cur.tv_sec - 1) * 1000UL * 1000UL + cur.tv_usec;
nvmlProcessUtilizationSample_t processes_sample[SHARED_REGION_MAX_PROCESS_NUM];
unsigned int processes_num = SHARED_REGION_MAX_PROCESS_NUM;
res = nvmlDeviceGetProcessUtilization(device,processes_sample,&processes_num,microsec);
LOG_DEBUG("processes_num=%d\n",processes_num);
LOG_DEBUG("Device UUID: %s\n", uuid);
if ((res==NVML_ERROR_INSUFFICIENT_SIZE) || (res==NVML_ERROR_NOT_FOUND)){
userutil[cudadev] = 0;
for (i=0; i<infcount; i++){
proc = find_proc_by_hostpid(infos[i].pid);
if (proc != NULL){
LOG_DEBUG("pid=%u monitor=%lld\n",infos[i].pid,infos[i].usedGpuMemory);
summonitor += infos[i].usedGpuMemory;
}
set_gpu_device_memory_monitor(infos[i].pid,cudadev,summonitor);
set_gpu_device_sm_utilization(infos[i].pid,cudadev,0);
}
continue;
}
for (i=0; i<processes_num; i++){
//if (processes_sample[i].timeStamp >= microsec){
if (res == NVML_SUCCESS) {
for (i=0; i<processes_num; i++){
proc = find_proc_by_hostpid(processes_sample[i].pid);
if (proc != NULL){
//LOG_WARN("pid=%u num=%d\n",processes_sample[i].pid,processes_num);
//proc = find_proc_by_hostpid(processes_sample[i].pid);
//if (proc!=NULL) {
// printf("inner pid=%u\n",proc->pid);
sum += processes_sample[i].smUtil;
summonitor += infos[i].usedGpuMemory;
//LOG_WARN("monitorused=%lld %d %d %d",infos[i].usedGpuMemory,proc->hostpid,proc->pid,pidfound);
//LOG_WARN("smutil=%d %d %lu %u %u %u\n",virtual_map[devi],devi,summonitor,processes_sample[i].smUtil,processes_sample[i].encUtil,processes_sample[i].decUtil);
//}
LOG_DEBUG("pid=%u smUtil=%d\n", processes_sample[i].pid, processes_sample[i].smUtil);
proc->device_util[cudadev].sm_util += processes_sample[i].smUtil;
}
set_gpu_device_memory_monitor(processes_sample[i].pid,cudadev,summonitor);
set_gpu_device_sm_utilization(processes_sample[i].pid,cudadev,processes_sample[i].smUtil);
}
}

if (sum < 0)
sum = 0;
userutil[cudadev] = sum;
}
unlock_shrreg();
return 0;
}

Expand All @@ -193,6 +181,8 @@ void* utilization_watcher() {
int sysprocnum;
int share = 0;
int upper_limit = get_current_device_sm_limit(0);

ensure_initialized();
LOG_DEBUG("upper_limit=%d\n",upper_limit);
while (1){
nanosleep(&g_wait, NULL);
Expand All @@ -201,7 +191,7 @@ void* utilization_watcher() {
if (pidfound==0)
continue;
}
init_gpu_device_sm_utilization();
init_gpu_device_utilization();
get_used_gpu_utilization(userutil,&sysprocnum);
//if (sysprocnum == 1 &&
// userutil < upper_limit / 10) {
Expand Down
9 changes: 9 additions & 0 deletions src/multiprocess/shrreg_tool.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@ void create_new() {
}


/*
 * Handler for the tool's "--print" option: calls ensure_initialized() and
 * then print_all(), in that order.
 */
void print_shared_region()
{
    /* Must run before print_all(), matching the other handlers in this tool. */
    ensure_initialized();

    print_all();
}


void send_stop_signal(){
ensure_initialized();
suspend_all();
Expand Down Expand Up @@ -65,6 +71,9 @@ int main(int argc, char* argv[]) {
if (strcmp(arg, "--resume") == 0){
send_resume_signal();
}
if (strcmp(arg, "--print") == 0){
print_shared_region();
}
if (strcmp(arg, "--version") == 0){
printf("shrreg size: %ld, version %d.%d\n",
sizeof(shared_region_t),
Expand Down