_,hf_config=get_model_arch(model_path)kwargs=dict(model_path=model_path,with_llm=with_llm,max_memory=max_memory,hf_config=hf_config,backend=backend)forname,moduleinVISION_MODELS.module_dict.items():try:ifmodule.match(hf_config):logger.info(f'matching vision model: {name}')model=module(**kwargs)model.build_preprocessor()# build the vision part of a VLM model when backend is# turbomind, or load the whole VLM model when `with_llm==True`ifbackend=='turbomind'orwith_llm:model.build_model()returnmodelexceptExceptionase:logger.error(f'build vision model {name} failed, {e}')raise
def _preprocess_v1_5(self, image, params=None):
    """Preprocess ``image`` into stacked pixel-value tensors.

    Args:
        image: the input image (passed through to ``dynamic_preprocess``).
        params (dict | None): optional request parameters. Recognized keys:
            ``max_dynamic_patch`` (int) — explicit upper bound on dynamic
            patches; ``detail`` (str) — one of ``'low'``/``'medium'``/
            ``'high'`` mapping to 6/12/24 patches. Falls back to
            ``self.config.max_dynamic_patch`` otherwise.

    Returns:
        torch.Tensor: stacked patch tensors of shape (patch, c, h, w).
    """
    # Fix: the original dereferenced `params.get(...)` even though the
    # default is None, raising AttributeError on a plain call.
    if params is None:
        params = {}
    image_res = {'low': 6, 'medium': 12, 'high': 24}
    max_num = params.get('max_dynamic_patch')
    # Ignore a missing or non-int override and derive the bound from the
    # 'detail' level instead (unknown levels use the config default).
    # `None` is not an int, so a single isinstance check covers both cases.
    if not isinstance(max_num, int):
        res_key = params.get('detail', 'default')
        max_num = image_res.get(res_key, self.config.max_dynamic_patch)
    out = dynamic_preprocess(
        image,
        min_num=self.config.min_dynamic_patch,
        max_num=max_num,
        image_size=self.config.vision_config.image_size,
        use_thumbnail=self.config.use_thumbnail)
    pixel_values = [self.transform(x) for x in out]  # (patch) x c x h x w
    pixel_values = torch.stack(pixel_values)
    return pixel_values