structllama_mmap{void*addr;size_tsize;// list of mapped fragments (first_offset, last_offset)std::vector<std::pair<size_t,size_t>>mapped_fragments;llama_mmap(structllama_file*file,size_tprefetch=(size_t)-1/* -1 = max value */,boolnuma=false){size=file->size;intfd=fileno(file->fp);intflags=MAP_SHARED;// prefetch/readahead impairs performance on NUMA systemsif(numa){prefetch=0;}#ifdef __linux__// advise the kernel to read the file sequentially (increases readahead)if(posix_fadvise(fd,0,0,POSIX_FADV_SEQUENTIAL)){LLAMA_LOG_WARN("warning: posix_fadvise(.., POSIX_FADV_SEQUENTIAL) failed: %s\n",strerror(errno));}if(prefetch){flags|=MAP_POPULATE;}#endifaddr=mmap(NULL,file->size,PROT_READ,flags,fd,0);if(addr==MAP_FAILED){// NOLINTthrowstd::runtime_error(format("mmap failed: %s",strerror(errno)));}if(prefetch>0){// advise the kernel to preload the mapped memoryif(posix_madvise(addr,std::min(file->size,prefetch),POSIX_MADV_WILLNEED)){LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",strerror(errno));}}if(numa){// advise the kernel not to use readahead// (because the next page might not belong on the same node)if(posix_madvise(addr,file->size,POSIX_MADV_RANDOM)){LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",strerror(errno));}}// initialize list of mapped_fragmentsmapped_fragments.emplace_back(0,file->size);}}usingllama_mmaps=std::vector<std::unique_ptr<llama_mmap>>;
structllama_file{// use FILE * so we don't have to re-open the file to mmapFILE*fp;// 文件流指针size_tsize;// 文件sizellama_file(constchar*fname,constchar*mode){// ggml_fopen封装了win32和linux api的区别,在linux实现这里直接是fopen(fname, mode)fp=ggml_fopen(fname,mode);if(fp==NULL){throwstd::runtime_error(format("failed to open %s: %s",fname,strerror(errno)));}seek(0,SEEK_END);size=tell();seek(0,SEEK_SET);}}
// llama.cppvoidinit_mappings(boolprefetch=true,llama_mlocks*mlock_mmaps=nullptr){if(use_mmap){mappings.reserve(files.size());mmaps_used.reserve(files.size());for(constauto&file:files){std::unique_ptr<llama_mmap>mapping(newllama_mmap(file.get(),prefetch?-1:0,ggml_is_numa()));mmaps_used.emplace_back(mapping->size,0);if(mlock_mmaps){std::unique_ptr<llama_mlock>mlock_mmap(newllama_mlock());mlock_mmap->init(mapping->addr);mlock_mmaps->emplace_back(std::move(mlock_mmap));}mappings.emplace_back(std::move(mapping));}}// compute the total size of all tensors for progress reportingfor(auto&w:weights){size_data+=ggml_nbytes(w.tensor);}}
// Represents some region of memory being locked using mlock or VirtualLock;// will automatically unlock on destruction.structllama_mlock{void*addr=NULL;size_tsize=0;~llama_mlock(){if(size){// 调用了munlock函数raw_unlock(addr,size);}}voidinit(void*ptr){// NOLINT注释,如果编码者确认没问题,是让一些静态代码分析工具不报警GGML_ASSERT(addr==NULL&&size==0);// NOLINTaddr=ptr;}// 在llama_load_all 函数中调用voidgrow_to(size_ttarget_size){GGML_ASSERT(addr);if(failed_already){return;}// 获取pagesizesize_tgranularity=lock_granularity();// 将target_size按照page_size对齐,这是一种常用的写法,比如将数字7按照8对齐,对齐结果就是8target_size=(target_size+granularity-1)&~(granularity-1);if(target_size>size){// 调用mlockif(raw_lock((uint8_t*)addr+size,target_size-size)){size=target_size;}else{failed_already=true;}}}boolraw_lock(constvoid*addr,size_tsize)const{if(!mlock(addr,size)){returntrue;}...// 如果内存不足,通过ulimit -l进行查看,通过ulimit设置更大的数值returnfalse;}}
// Loads the data of every tensor in `ctx` into its backend buffer, using the
// prepared mmaps when use_mmap is enabled.
// NOTE(review): this excerpt is elided in places ("......" / "....") —
// n_size, check_tensors and validation_result are defined in the elided
// portions; confirm against the full source.
bool load_all_data(
        struct ggml_context * ctx,
        llama_buf_map & bufs_mmap,
        llama_mlocks * lmlocks,
        llama_progress_callback progress_callback,
        void * progress_callback_user_data) {
    // iterate over every tensor in the context
    for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
        const auto * weight = get_weight(ggml_get_name(cur));
        ......
        if (use_mmap) {
            // fetch the mmap that holds this tensor's weight data
            const auto & mapping = mappings.at(weight->idx);
            ggml_backend_buffer_t buf_mmap = nullptr;
            if (bufs_mmap.count(weight->idx)) {
                buf_mmap = bufs_mmap.at(weight->idx);
            }
            // locate the weight inside the mapped file via its stored offset
            uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
            if (check_tensors) {
                // validate asynchronously so loading can continue
                validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
                    return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
                }));
            }
            GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
            if (buf_mmap && cur->data == nullptr) {
                // point the tensor directly at the mmap'd data (zero-copy)
                ggml_backend_tensor_alloc(buf_mmap, cur, data);
                // pin the pages holding this weight in RAM
                if (lmlocks) {
                    const auto & lmlock = lmlocks->at(weight->idx);
                    lmlock->grow_to(weight->offs + n_size);
                }
                // widen the used range recorded for this mapping
                auto & mmap_used = mmaps_used[weight->idx];
                mmap_used.first = std::min(mmap_used.first, weight->offs);
                mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
            } else {
                // tensor already has backing memory: copy the bytes into it
                ggml_backend_tensor_set(cur, data, 0, n_size);
            }
        }
        ....
}