def load_model(self) -> None:
    """Load model weights, optionally inside the sleep-mode memory pool.

    When sleep mode is enabled, weights are allocated through the
    CuMemAllocator under the "weights" tag so they can later be
    offloaded by Worker.sleep(); otherwise loading proceeds normally.
    """
    if self.vllm_config.model_config.enable_sleep_mode:
        allocator = CuMemAllocator.get_instance()
        # The allocator is process-global, so only one engine instance
        # per process may use sleep mode.
        assert allocator.get_current_usage() == 0, (
            "Sleep mode can only be "
            "used for one instance per process.")
        # Tag weight allocations so sleep(level=1) offloads them to CPU.
        context = allocator.use_memory_pool(tag="weights")
    else:
        from contextlib import nullcontext
        context = nullcontext()
    with context:
        self.model_runner.load_model()


class Worker():

    def sleep(self, level: int = 1) -> None:
        """Release GPU memory held by the allocator.

        level 1 offloads "weights"-tagged tensors to CPU and discards the
        rest; any other level discards everything (nothing is offloaded).
        """
        free_bytes_before_sleep = torch.cuda.mem_get_info()[0]
        allocator = CuMemAllocator.get_instance()
        allocator.sleep(
            offload_tags=("weights",) if level == 1 else tuple())
        free_bytes_after_sleep, total = torch.cuda.mem_get_info()
        # How much GPU memory sleeping actually returned to the driver.
        freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep
        used_bytes = total - free_bytes_after_sleep
        assert freed_bytes >= 0, "Memory usage increased after sleeping."
        logger.info(
            "Sleep mode freed %.2f GiB memory, "
            "%.2f GiB memory is still in use.", freed_bytes / GiB_bytes,
            used_bytes / GiB_bytes)

    def wake_up(self, tags: Optional[list[str]] = None) -> None:
        """Re-allocate GPU memory for the given tags (all tags if None)."""
        allocator = CuMemAllocator.get_instance()
        allocator.wake_up(tags)
当调用 sleep 方法时,所有具有指定标签的张量都将被卸载(offload)到 CPU 内存中,其余张量将被丢弃(discard)。
当我们调用 wake_up 时,之前卸载(offload)的所有张量都将被拷贝回 GPU 内存;其余被丢弃的张量则会重新分配显存,但内容为空(未被恢复)。
def sleep(
        self,
        offload_tags: Optional[Union[tuple[str, ...], str]] = None) -> None:
    """Put the allocator to sleep, releasing all GPU memory it manages.

    Tensors whose allocation tag is in ``offload_tags`` are first backed
    up to pinned CPU memory (so wake_up can restore them); every other
    tensor's memory is simply released and its contents discarded.

    Args:
        offload_tags: tag(s) selecting which allocations to offload.
            None means the allocator's default tag; a single string is
            treated as a one-element tuple.
    """
    # Normalize offload_tags into a tuple of tag strings.
    if offload_tags is None:
        # By default, allocated tensors are offloaded
        # when the allocator sleeps.
        offload_tags = (CuMemAllocator.default_tag,)
    elif isinstance(offload_tags, str):
        offload_tags = (offload_tags,)
    assert isinstance(offload_tags, tuple)

    for ptr, data in self.pointer_to_data.items():
        handle = data.handle
        if data.tag in offload_tags:
            # Copy the GPU region into a pinned CPU byte buffer before
            # releasing it, so wake_up can copy it back later.
            size_in_bytes = handle[1]
            cpu_backup_tensor = torch.empty(
                size_in_bytes,
                dtype=torch.uint8,
                device='cpu',
                pin_memory=is_pin_memory_available())
            cpu_ptr = cpu_backup_tensor.data_ptr()
            libcudart.cudaMemcpy(cpu_ptr, ptr, size_in_bytes)
            data.cpu_backup_tensor = cpu_backup_tensor
        # Every handle is unmapped and released, offloaded or not.
        unmap_and_release(handle)

    gc.collect()
    torch.cuda.empty_cache()
wake_up 方法则是调用 create_and_map(handle) 重新分配显存并映射到原来的虚拟地址上;对于之前 offload 的张量,再将备份数据从 CPU 拷贝回 GPU 中。