vlm
vit
将图片按照patch大小进行切分,比如3 * 48 * 48的图片,patch size为 16 * 16。
# Minimal ViT patch-embedding demo: a Conv2d whose kernel size equals its
# stride cuts the image into non-overlapping 16x16 patches and projects each
# patch to a 768-dimensional vector in a single operation.
import torch

# The expected output below has a leading dimension of 1, so use one image.
batch_size = 1

layer = torch.nn.Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
image = torch.rand(batch_size, 3, 48, 48)
# 48 / 16 = 3 patches per side, so the result is (batch_size, 768, 3, 3).
projected_patches = layer(image)
# Merge the 3x3 patch grid into one axis of 9 tokens, then move the channel
# axis last to get (batch_size, num_patches, embed_dim).
print(projected_patches.flatten(-2).transpose(-1, -2).shape)
# This prints
# torch.Size([1, 9, 768])
再举一个例子,原始图像: 96×96×3
↓ 分成6×6个patch
每个patch: 16×16×3
↓ 卷积 (每个patch→768维向量)
输出: 6×6×768 (36个token,每个768维)
然后还要加上position embedding,将每个token的位置信息加入到token中。
CLIP(Contrastive Language-Image Pre-training)是一种基于对比文本-图像对的预训练方法/模型。
训练数据是文本-图像对,即一张图像和它对应的文本描述;这里希望通过对比学习,让模型学习到文本与图像之间的匹配关系。
CLIP包括两个模型:Text Encoder和Image Encoder
- Text Encoder用来提取文本的特征,可以采用NLP中常用的text transformer模型;
- Image Encoder用来提取图像的特征,可以采用常用CNN模型或者vision transformer
连接器(Projector/Connector)
Connector(模态适配器)将 ViT 输出的视觉特征映射到语言模型能理解的语义空间,是多模态模型中用于弥合视觉与语言模态间“鸿沟”的关键组件。其核心作用是将视觉特征(如 CNN 或 ViT 提取的图像向量)转换为与语言模型(如 BERT、GPT)输入空间兼容的语义表示。通过全连接层、Transformer 编码器或可学习的非线性映射,Connector 对齐跨模态的语义和统计分布。
import torch
from transformers import AutoConfig, AutoModelForImageTextToText
from accelerate import init_empty_weights
# Print the module hierarchy of Qwen3-VL without allocating real weights.

# Path to a local Qwen3-VL-2B-Instruct checkpoint directory.
model_name = "./models/models/Qwen3-VL-2B-Instruct"
# Load the model configuration from that directory.
config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)

# Use the init_empty_weights context manager: a model created inside this
# context has parameters that do not occupy real memory. The model must
# therefore be built INSIDE the `with` body.
with init_empty_weights():
    model = AutoModelForImageTextToText.from_config(config, trust_remote_code=True)

print("\n--- 模型结构(层级打印)---")
print(model)
--- 模型结构(层级打印)---
Qwen3VLForConditionalGeneration(
(model): Qwen3VLModel(
(visual): Qwen3VLVisionModel(
(patch_embed): Qwen3VLVisionPatchEmbed(
(proj): Conv3d(3, 1024, kernel_size=(2, 16, 16), stride=(2, 16, 16))
)
(pos_embed): Embedding(2304, 1024)
(rotary_pos_emb): Qwen3VLVisionRotaryEmbedding()
(blocks): ModuleList(
(0-23): 24 x Qwen3VLVisionBlock(
(norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
(norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
(attn): Qwen3VLVisionAttention(
(qkv): Linear(in_features=1024, out_features=3072, bias=True)
(proj): Linear(in_features=1024, out_features=1024, bias=True)
)
(mlp): Qwen3VLVisionMLP(
(linear_fc1): Linear(in_features=1024, out_features=4096, bias=True)
(linear_fc2): Linear(in_features=4096, out_features=1024, bias=True)
(act_fn): GELUTanh()
)
)
)
(merger): Qwen3VLVisionPatchMerger(
(norm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
(linear_fc1): Linear(in_features=4096, out_features=4096, bias=True)
(act_fn): GELU(approximate='none')
(linear_fc2): Linear(in_features=4096, out_features=2048, bias=True)
)
(deepstack_merger_list): ModuleList(
(0-2): 3 x Qwen3VLVisionPatchMerger(
(norm): LayerNorm((4096,), eps=1e-06, elementwise_affine=True)
(linear_fc1): Linear(in_features=4096, out_features=4096, bias=True)
(act_fn): GELU(approximate='none')
(linear_fc2): Linear(in_features=4096, out_features=2048, bias=True)
)
)
)
(language_model): Qwen3VLTextModel(
(embed_tokens): Embedding(151936, 2048)
(layers): ModuleList(
(0-27): 28 x Qwen3VLTextDecoderLayer(
(self_attn): Qwen3VLTextAttention(
(q_proj): Linear(in_features=2048, out_features=2048, bias=False)
(k_proj): Linear(in_features=2048, out_features=1024, bias=False)
(v_proj): Linear(in_features=2048, out_features=1024, bias=False)
(o_proj): Linear(in_features=2048, out_features=2048, bias=False)
(q_norm): Qwen3VLTextRMSNorm((128,), eps=1e-06)
(k_norm): Qwen3VLTextRMSNorm((128,), eps=1e-06)
)
(mlp): Qwen3VLTextMLP(
(gate_proj): Linear(in_features=2048, out_features=6144, bias=False)
(up_proj): Linear(in_features=2048, out_features=6144, bias=False)
(down_proj): Linear(in_features=6144, out_features=2048, bias=False)
(act_fn): SiLUActivation()
)
(input_layernorm): Qwen3VLTextRMSNorm((2048,), eps=1e-06)
(post_attention_layernorm): Qwen3VLTextRMSNorm((2048,), eps=1e-06)
)
)
(norm): Qwen3VLTextRMSNorm((2048,), eps=1e-06)
(rotary_emb): Qwen3VLTextRotaryEmbedding()
)
)
(lm_head): Linear(in_features=2048, out_features=151936, bias=False)
)