diff --git a/src/allocator/allocator.c b/src/allocator/allocator.c
index 99215b1..ba264ba 100755
--- a/src/allocator/allocator.c
+++ b/src/allocator/allocator.c
@@ -11,7 +11,7 @@ size_t OVERSIZE = 134217728;
 
 region_list *r_list;
 allocated_list *device_overallocated;
-allocated_list *array_list;
+allocated_list *device_allocasync; /* chunks added by add_chunk_async(); free_raw_async() removes entries from this list */
 
 #define ALIGN       2097152
 #define MULTI_PARAM 1
@@ -93,7 +93,9 @@ void allocator_init(){
     
     device_overallocated = malloc(sizeof(allocated_list));
     LIST_INIT(device_overallocated);
-    
+    device_allocasync=malloc(sizeof(allocated_list)); /* NOTE(review): malloc result is not checked before LIST_INIT writes through it */
+    LIST_INIT(device_allocasync);
+
     pthread_mutex_init(&mutex,NULL);
 }
 
@@ -196,13 +198,6 @@ int free_raw(CUdeviceptr dptr){
     return tmp;
 }
 
-int free_raw_async(CUdeviceptr dptr, CUstream hStream){
-    pthread_mutex_lock(&mutex);
-    unsigned int tmp = remove_chunk_async(device_overallocated,dptr,hStream);
-    pthread_mutex_unlock(&mutex);
-    return tmp;
-}
-
 int remove_chunk_async(allocated_list *a_list, CUdeviceptr dptr, CUstream hStream){
     size_t t_size;
     if (a_list->length==0) {
@@ -224,10 +219,9 @@ int remove_chunk_async(allocated_list *a_list, CUdeviceptr dptr, CUstream hStrea
     return -1;
 }
 
-int allocate_async_raw(CUdeviceptr *dptr, size_t size, CUstream hStream){
-    int tmp;
+int free_raw_async(CUdeviceptr dptr, CUstream hStream){
     pthread_mutex_lock(&mutex);
-    tmp = add_chunk_async(dptr,size,hStream);
+    int tmp = remove_chunk_async(device_allocasync,dptr,hStream); /* int, matching remove_chunk_async's return type, so its -1 result is passed through unchanged */
     pthread_mutex_unlock(&mutex);
     return tmp;
 }
@@ -248,11 +242,33 @@ int add_chunk_async(CUdeviceptr *address,size_t size, CUstream hStream){
         LOG_ERROR("cuMemoryAllocate failed res=%d",res);
         return res;
     }
-    LIST_ADD(device_overallocated,e);
-    //uint64_t t_size;
+    LIST_ADD(device_allocasync,e);
     *address = e->entry->address;
-    allocsize = size;
-    cuCtxGetDevice(&dev);
-    add_gpu_device_memory_usage(getpid(),dev,allocsize,2);
+    CUmemoryPool pool;
+    res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuDeviceGetMemPool,&pool,dev); /* NOTE(review): dev is read here, but cuCtxGetDevice(&dev) is only called further down - confirm dev is set earlier in this function */
+    if (res!=CUDA_SUCCESS){
+        LOG_ERROR("cuDeviceGetMemPool failed res=%d",res);
+        return res; /* NOTE(review): the chunk was already added to device_allocasync and *address set when this error is returned */
+    }
+    size_t poollimit;
+    res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuMemPoolGetAttribute,pool,CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH,&poollimit); /* reads the pool's CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH value into poollimit */
+    if (res!=CUDA_SUCCESS) {
+        LOG_ERROR("cuMemPoolGetAttribute failed res=%d",res);
+        return res; /* NOTE(review): same as above - the chunk is already in the list at this point */
+    }
+    if ((poollimit!=0) && (poollimit> device_allocasync->limit)) { /* only a rise above the previously recorded value is accounted */
+        allocsize = poollimit-device_allocasync->limit; /* growth since the last recorded value */
+        cuCtxGetDevice(&dev);
+        add_gpu_device_memory_usage(getpid(),dev,allocsize,2); /* charge the growth to this process; the meaning of type 2 is defined elsewhere */
+        device_allocasync->limit=poollimit; /* remember the value that has now been accounted */
+    }
     return 0;
 }
+
+/* Locked entry point: runs add_chunk_async() while holding the global mutex and returns its result unchanged. */
+int allocate_async_raw(CUdeviceptr *dptr, size_t size, CUstream hStream){
+    pthread_mutex_lock(&mutex);
+    const int status = add_chunk_async(dptr,size,hStream);
+    pthread_mutex_unlock(&mutex);
+    return status;
+}
diff --git a/src/allocator/allocator.h b/src/allocator/allocator.h
index c1218fd..2064ee3 100755
--- a/src/allocator/allocator.h
+++ b/src/allocator/allocator.h
@@ -16,7 +16,7 @@ struct allocated_device_memory_struct{
 typedef struct allocated_device_memory_struct allocated_device_memory;
 
 struct allocated_list_entry_struct{
-    allocated_device_memory *entry;;
+    allocated_device_memory *entry;
     struct allocated_list_entry_struct *next,*prev;
 };
 typedef struct allocated_list_entry_struct allocated_list_entry;
@@ -25,6 +25,7 @@ struct allocated_list_struct{
     allocated_list_entry *head;
     allocated_list_entry *tail;
     size_t length;
+    size_t limit; /* zeroed by LIST_INIT; add_chunk_async stores the last accounted memory-pool reserved-high value here for device_allocasync */
 };
 typedef struct allocated_list_struct allocated_list;
 
@@ -55,13 +56,14 @@ typedef struct region_list_struct region_list;
 
 extern region_list *r_list;
 extern allocated_list *device_overallocated;
-extern allocated_list *array_list;
+extern allocated_list *device_allocasync;
 extern pthread_mutex_t mutex;
 
 #define LIST_INIT(list) {   \
     list->head=NULL;         \
     list->tail=NULL;         \
     list->length=0;          \
+    list->limit=0;           \
     }
 #define __LIST_INIT(list) LIST_INIT(list)
 
diff --git a/src/cuda/context.c b/src/cuda/context.c
index 4e4a0ff..fdbfa5d 100755
--- a/src/cuda/context.c
+++ b/src/cuda/context.c
@@ -2,7 +2,6 @@
 #include "multiprocess/multiprocess_memory_limit.h"
 
 extern int context_size;
-extern int cuda_to_nvml_map[16];
 extern int ctx_activate[16];
 
 
diff --git a/src/cuda/device.c b/src/cuda/device.c
index e870add..2ce1aab 100755
--- a/src/cuda/device.c
+++ b/src/cuda/device.c
@@ -7,8 +7,6 @@
 #include "allocator/allocator.h"
 #include "include/memory_limit.h"
 
-extern int cuda_to_nvml_map[16];
-
 CUresult cuDeviceGetAttribute ( int* pi, CUdevice_attribute attrib, CUdevice dev ) {
     CUresult res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuDeviceGetAttribute,pi,attrib,dev);
     LOG_DEBUG("[%d]cuDeviceGetAttribute dev=%d attrib=%d %d",res,dev,(int)attrib,*pi);
diff --git a/src/cuda/memory.c b/src/cuda/memory.c
index 14673a8..1a4bb8a 100755
--- a/src/cuda/memory.c
+++ b/src/cuda/memory.c
@@ -619,8 +619,9 @@ CUresult cuMemPoolSetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void
 }
 
 CUresult cuMemPoolGetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void *value) {
-    LOG_DEBUG("cuMemPoolGetAttribute");
-    return CUDA_OVERRIDE_CALL(cuda_library_entry,cuMemPoolGetAttribute,pool,attr,value);
+    CUresult res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuMemPoolGetAttribute,pool,attr,value);
+    LOG_INFO("cuMemPoolGetAttribute attr=%d res=%d",(int)attr,(int)res); /* *value is deliberately not read: its size depends on attr (int for some attributes, 64-bit for others) and it is not meaningful when the call fails */
+    return res;
 }
 
 CUresult cuMemPoolSetAccess(CUmemoryPool pool, const CUmemAccessDesc *map, size_t count) {
diff --git a/src/include/log_utils.h b/src/include/log_utils.h
index 2779a4f..16bafea 100755
--- a/src/include/log_utils.h
+++ b/src/include/log_utils.h
@@ -8,39 +8,73 @@
 #include <stdlib.h>
 #include <stdio.h>
 
-#ifdef MEMORY_LIMIT_DEBUG
-#define LOG_DEBUG(msg, ...) fprintf(stderr, msg"\n", ##__VA_ARGS__);
-#else
+/* Log file handle used by the FILEDEBUG logging macros. Internal linkage: a plain definition in a header yields one definition per including .c file and fails to link under -fno-common; with static, each translation unit lazily opens its own append-mode stream. */
+static FILE *fp1 __attribute__((unused));
+#ifdef FILEDEBUG
 #define LOG_DEBUG(msg, ...) { \
-    if ((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=4)) \
-       fprintf(stderr, "[HAMI-core Debug(%d:%ld:%s:%d)]: "msg"\n",getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \
+    if ((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=4)) {\
+        if (fp1==NULL) fp1 = fopen ("/tmp/vgpulog", "a"); /* lazily open the log file on first use */ \
+        fprintf((fp1!=NULL)?fp1:stderr, "[HAMI-core Debug(%d:%ld:%s:%d)]: "msg"\n",getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); /* stderr when the file could not be opened */ \
+        }\
     }
-#endif
-
 #define LOG_INFO(msg, ...) { \
     if ( \
          /*(getenv("LIBCUDA_LOG_LEVEL")==NULL) || */\
-         (getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=3)) \
-       fprintf(stderr, "[HAMI-core Info(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \
+         (getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=3)) {\
+        if (fp1==NULL) fp1 = fopen ("/tmp/vgpulog", "a"); /* lazily open the log file on first use */ \
+        fprintf((fp1!=NULL)?fp1:stderr, "[HAMI-core Info(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); /* stderr when the file could not be opened */ \
+         }\
     }
-
 #define LOG_WARN(msg, ...) { \
     if ( \
         (getenv("LIBCUDA_LOG_LEVEL")==NULL) || \
-        ((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=2))) \
-       fprintf(stderr, "[HAMI-core Warn(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \
+        ((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=2))) {\
+        if (fp1==NULL) fp1 = fopen ("/tmp/vgpulog", "a"); /* lazily open the log file on first use */ \
+        fprintf((fp1!=NULL)?fp1:stderr, "[HAMI-core Warn(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); /* stderr when the file could not be opened */ \
+        }\
     }
-
 #define LOG_MSG(msg, ...) { \
     if ( \
         (getenv("LIBCUDA_LOG_LEVEL")==NULL) || \
-        ((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=2))) \
-       fprintf(stderr, "[HAMI-core Msg(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \
+        ((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=2))) {\
+        if (fp1==NULL) fp1 = fopen ("/tmp/vgpulog", "a"); /* lazily open the log file on first use */ \
+        fprintf((fp1!=NULL)?fp1:stderr, "[HAMI-core Msg(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); /* stderr when the file could not be opened */ \
+         }\
+    }
+#define LOG_ERROR(msg, ...) { \
+    if (fp1==NULL) fp1 = fopen ("/tmp/vgpulog", "a"); /* lazily open the log file on first use */ \
+    fprintf((fp1!=NULL)?fp1:stderr, "[HAMI-core ERROR (pid:%d thread=%ld %s:%d)]: "msg"\n", getpid(), pthread_self(), basename(__FILE__),__LINE__, ##__VA_ARGS__); /* errors still reach stderr when the file could not be opened */ \
+}
+#else
+#define LOG_DEBUG(msg, ...) { \
+    if ((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=4)) {\
+        fprintf(stderr, "[HAMI-core Debug(%d:%ld:%s:%d)]: "msg"\n",getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \
+         }\
+    }
+#define LOG_INFO(msg, ...) { \
+    if ( \
+        (getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=3)) {\
+        fprintf(stderr, "[HAMI-core Info(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \
+        }\
+    }
+#define LOG_WARN(msg, ...) { \
+    if ( \
+        (getenv("LIBCUDA_LOG_LEVEL")==NULL) || \
+        ((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=2))) {\
+        fprintf(stderr, "[HAMI-core Warn(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \
+        }\
+    }
+#define LOG_MSG(msg, ...) { \
+    if ( \
+        (getenv("LIBCUDA_LOG_LEVEL")==NULL) || \
+        ((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=2))) {\
+        fprintf(stderr, "[HAMI-core Msg(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \
+         }\
     }
-
 #define LOG_ERROR(msg, ...) { \
     fprintf(stderr, "[HAMI-core ERROR (pid:%d thread=%ld %s:%d)]: "msg"\n", getpid(), pthread_self(), basename(__FILE__),__LINE__, ##__VA_ARGS__); \
 }
+#endif
 
 #define CHECK_DRV_API(f)  {                   \
     CUresult status = (f);                    \
diff --git a/src/libvgpu.c b/src/libvgpu.c
index f645eaf..48bbd01 100644
--- a/src/libvgpu.c
+++ b/src/libvgpu.c
@@ -30,10 +30,6 @@ pthread_once_t dlsym_init_flag = PTHREAD_ONCE_INIT;
  where to find its core utilization */
 extern int pidfound;
 
-/* cuda_to_nvml_map indicates cuda_visible_devices, we need to map it into nvml_visible_devices, 
-to let device-memory be counted successfully*/
-extern int cuda_to_nvml_map[16];
-
 /* used to switch on/off the core utilization limitation*/
 extern int env_utilization_switch;
 
@@ -848,14 +844,11 @@ void preInit(){
 }
 
 void postInit(){
-    map_cuda_visible_devices();
     allocator_init();
 
     try_lock_unified_lock();
     nvmlReturn_t res = set_task_pid();
     try_unlock_unified_lock();
-
-
     LOG_MSG("Initialized");
     if (res!=NVML_SUCCESS){
         LOG_WARN("SET_TASK_PID FAILED.");
@@ -863,6 +856,8 @@ void postInit(){
     }else{
         pidfound=1;
     }
+
+    map_cuda_visible_devices();
     //add_gpu_device_memory_usage(getpid(),0,context_size,0);
     env_utilization_switch = set_env_utilization_switch();
     init_utilization_watcher();
diff --git a/src/multiprocess/multiprocess_memory_limit.c b/src/multiprocess/multiprocess_memory_limit.c
index 4b94030..b8a2619 100755
--- a/src/multiprocess/multiprocess_memory_limit.c
+++ b/src/multiprocess/multiprocess_memory_limit.c
@@ -380,8 +380,9 @@ uint64_t nvml_get_device_memory_usage(const int dev) {
     return usage;
 }
 
-int add_gpu_device_memory_usage(int32_t pid,int dev,size_t usage,int type){
-    LOG_INFO("add_gpu_device_memory:%d %d %lu",pid,dev,usage);
+int add_gpu_device_memory_usage(int32_t pid,int cudadev,size_t usage,int type){
+    LOG_INFO("add_gpu_device_memory:%d %d->%d %lu",pid,cudadev,cuda_to_nvml_map(cudadev),usage);
+    int dev = cuda_to_nvml_map(cudadev); /* from here on dev is the value cuda_to_nvml_map() returns for the caller's CUDA device number */
     ensure_initialized();
     lock_shrreg();
     int i;
@@ -408,8 +409,9 @@ int add_gpu_device_memory_usage(int32_t pid,int dev,size_t usage,int type){
     return 0;
 }
 
-int rm_gpu_device_memory_usage(int32_t pid,int dev,size_t usage,int type){
-    LOG_INFO("rm_gpu_device_memory:%d %d %lu",pid,dev,usage);
+int rm_gpu_device_memory_usage(int32_t pid,int cudadev,size_t usage,int type){
+    LOG_INFO("rm_gpu_device_memory:%d %d->%d %lu",pid,cudadev,cuda_to_nvml_map(cudadev),usage); /* log the byte count (usage), matching %lu and the add_ counterpart */
+    int dev = cuda_to_nvml_map(cudadev); /* from here on dev is the value cuda_to_nvml_map() returns for the caller's CUDA device number */
     ensure_initialized();
     lock_shrreg();
     int i;
diff --git a/src/multiprocess/multiprocess_memory_limit.h b/src/multiprocess/multiprocess_memory_limit.h
index 228469f..bb2e09c 100755
--- a/src/multiprocess/multiprocess_memory_limit.h
+++ b/src/multiprocess/multiprocess_memory_limit.h
@@ -178,6 +178,7 @@ int load_env_from_file(char *filename);
 int comparelwr(const char *s1,char *s2);
 int put_device_info();
 unsigned int nvml_to_cuda_map(unsigned int nvmldev);
+unsigned int cuda_to_nvml_map(unsigned int cudadev); /* counterpart of nvml_to_cuda_map(): looks up the NVML-side index for a CUDA device number */
 
 #endif  // __MULTIPROCESS_MEMORY_LIMIT_H__
 
diff --git a/src/multiprocess/multiprocess_utilization_watcher.c b/src/multiprocess/multiprocess_utilization_watcher.c
index 0fad5fd..91d2b72 100644
--- a/src/multiprocess/multiprocess_utilization_watcher.c
+++ b/src/multiprocess/multiprocess_utilization_watcher.c
@@ -29,7 +29,7 @@ static int g_max_thread_per_sm;
 static volatile long g_cur_cuda_cores = 0;
 static volatile long g_total_cuda_cores = 0;
 extern int pidfound;
-int cuda_to_nvml_map[16];
+int cuda_to_nvml_map_array[16];
 
 void rate_limiter(int grids, int blocks) {
   long before_cuda_cores = 0;
@@ -100,12 +100,16 @@ unsigned int nvml_to_cuda_map(unsigned int nvmldev){
     CHECK_NVML_API(nvmlDeviceGetCount_v2(&devcount));
     int i=0;
     for (i=0;i<devcount;i++){
-        if (cuda_to_nvml_map[i]==nvmldev)
+        if (cuda_to_nvml_map(i)==nvmldev)
           return i;
     }
     return -1;
 }
 
+/* Look up cudadev in cuda_to_nvml_map_array. Indices beyond the table are returned unchanged instead of reading past the array. */
+unsigned int cuda_to_nvml_map(unsigned int cudadev){
+    return (cudadev < sizeof(cuda_to_nvml_map_array)/sizeof(cuda_to_nvml_map_array[0])) ? (unsigned int)cuda_to_nvml_map_array[cudadev] : cudadev;
+}
 int setspec() {
     CHECK_NVML_API(nvmlInit());
     CHECK_CU_RESULT(cuDeviceGetAttribute(&g_sm_num,CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,0));
@@ -144,7 +148,7 @@ int get_used_gpu_utilization(int *userutil,int *sysprocnum) {
         for (i=0; i<infcount; i++){
           proc = find_proc_by_hostpid(infos[i].pid);
           if (proc != NULL){
-              LOG_DEBUG("pid=%u monitor=%lld\n", infos[i].pid, infos[i].usedGpuMemory);
+              //LOG_DEBUG("pid=%u monitor=%lld\n", infos[i].pid, infos[i].usedGpuMemory);
               proc->monitorused[cudadev] += infos[i].usedGpuMemory;
           }
         }
@@ -161,7 +165,7 @@ int get_used_gpu_utilization(int *userutil,int *sysprocnum) {
           proc = find_proc_by_hostpid(processes_sample[i].pid);
           if (proc != NULL){
               sum += processes_sample[i].smUtil;
-              LOG_DEBUG("pid=%u smUtil=%d\n", processes_sample[i].pid, processes_sample[i].smUtil);
+              //LOG_DEBUG("pid=%u smUtil=%d\n", processes_sample[i].pid, processes_sample[i].smUtil);
               proc->device_util[cudadev].sm_util += processes_sample[i].smUtil;
           }
         }
diff --git a/src/nvml/hook.c b/src/nvml/hook.c
index 7d8bfb9..0d0894e 100644
--- a/src/nvml/hook.c
+++ b/src/nvml/hook.c
@@ -260,7 +260,7 @@ pthread_once_t init_virtual_map_post_flag = PTHREAD_ONCE_INIT;
 typedef void* (*fp_dlsym)(void*, const char*);
 extern fp_dlsym real_dlsym;
 extern int virtual_nvml_devices;
-extern int cuda_to_nvml_map[16];
+extern int cuda_to_nvml_map_array[16];
 
 nvmlReturn_t nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int *index) {
     return NVML_OVERRIDE_CALL(nvml_library_entry, nvmlDeviceGetIndex, device, index);
@@ -307,7 +307,7 @@ void nvml_preInit() {
     load_nvml_libraries();
     int i;
     for (i=0; i<16; i++) {
-        cuda_to_nvml_map[i] = i;
+        cuda_to_nvml_map_array[i] = i;
     }   
 }
 
diff --git a/src/utils.c b/src/utils.c
index e4e8906..47a1067 100755
--- a/src/utils.c
+++ b/src/utils.c
@@ -14,7 +14,7 @@
 const char* unified_lock="/tmp/vgpulock/lock";
 const int retry_count=20;
 extern int context_size;
-extern int cuda_to_nvml_map[16];
+extern int cuda_to_nvml_map_array[16];
 
 // 0 unified_lock lock success
 // -1 unified_lock lock fail
@@ -177,20 +177,20 @@ int parse_cuda_visible_env() {
     char *s = getenv("CUDA_VISIBLE_DEVICES");
     count = 0;
     for (i=0; i<16; i++) {
-        cuda_to_nvml_map[i] = i;
+        cuda_to_nvml_map_array[i] = i;
     }   
 
     if (need_cuda_virtualize()) {
         for (i=0; i<strlen(s); i++){
             if ((s[i] == ',') || (i == 0)){
                 tmp = (i==0) ? atoi(s) : atoi(s + i +1);
-                cuda_to_nvml_map[count] = tmp; 
+                if ((size_t)count < sizeof(cuda_to_nvml_map_array)/sizeof(cuda_to_nvml_map_array[0])) cuda_to_nvml_map_array[count] = tmp; /* entries beyond the table size are ignored rather than written out of bounds */
                 count++;
             }
         } 
     }
     for (i=0;i<16;i++){
-        LOG_INFO("device %d -> %d",i,cuda_to_nvml_map[i]);
+        LOG_INFO("device %d -> %d",i,cuda_to_nvml_map(i));
     }
     LOG_DEBUG("get default cuda from %s",getenv("CUDA_VISIBLE_DEVICES"));
     return count;