Skip to main content

moe_tbo

# Baseline launch: serve Qwen1.5-MoE-A2.7B-Chat with --tp 4 and --ep 4,
# without any of the two-batch-overlap flags used further below.
python3 -m sglang.launch_server --model-path /data_workspace/st/models/Qwen1.5-MoE-A2.7B-Chat --tp 4 --ep 4 --trust-remote-code --mem-fraction-static 0.66

# Smoke test: POST a single-turn chat request to the OpenAI-style
# /v1/chat/completions endpoint on localhost:30000.
# NOTE(review): the "model" field names deepseek-r1-distill-qwen-1.5b, which is
# not the model launched above — confirm whether the server checks this field.
curl http://localhost:30000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "deepseek-r1-distill-qwen-1.5b",
"messages": [
{"role": "user", "content": "你是谁?"}
]
}'


# Profile a two-batch-overlap (TBO) launch of Qwen1.5-MoE-A2.7B-Chat under nsys.
#
# The exports must come BEFORE `nsys launch`: that line ends in a backslash, so
# whatever follows it is joined onto the nsys command line. With the exports
# placed after it, nsys is asked to run a program called `export` and the
# variables are never set for the server.
export SGLANG_TBO_DEBUG=1
export SGLANG_OPERATIONS_ENABLE_PROFILE=1
# Qwen2Moe does not work here; only Qwen3Moe supports the TBO feature (sglang v0.5.6.post2).
nsys launch --trace=cublas,cuda,cudnn,nvtx --show-output=true --cuda-graph-trace node --cuda-memory-usage=true --python-backtrace=cuda --trace-fork-before-exec=true --session-new prefill \
python3 -m sglang.launch_server --model-path /data_workspace/wt/models/Qwen1.5-MoE-A2.7B-Chat --tp 4 --ep 4 --trust-remote-code --mem-fraction-static 0.66 \
--decode-log-interval 1 \
--enable-dp-attention \
--enable-dp-lm-head \
--moe-dense-tp-size 1 \
--disable-cuda-graph \
--enable-two-batch-overlap \
--enable-layerwise-nvtx-marker


# NOTE(review): dangling flag fragment — as written these lines are not attached
# to any command. Presumably extra launch_server options to append to a launch
# command (the same four flags appear in the DeepSeek-R1 launch script at the
# end of these notes); confirm before use.
--attention-backend cutlass_mla \
--moe-a2a-backend deepep \
--moe-runner-backend deep_gemm \
--deepep-mode auto
# Fetch the Qwen3-30B-A3B checkpoint from ModelScope into ./models/Qwen3-30B-A3B.
modelscope download --model Qwen/Qwen3-30B-A3B --local_dir ./models/Qwen3-30B-A3B

export SGLANG_TBO_DEBUG=1
export SGLANG_OPERATIONS_ENABLE_PROFILE=1
# Prepend the torch and /usr/local library directories. The ${VAR:+:...} form
# appends the previous value only when it is non-empty; a plain
# "...:$LD_LIBRARY_PATH" leaves a trailing ':' when the variable is unset, and
# the dynamic loader treats that empty entry as the current directory.
export LD_LIBRARY_PATH="/usr/local/lib/python3.12/dist-packages/torch/lib/:/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
# Launch Qwen3-30B-A3B with two-batch overlap enabled, the triton MoE runner,
# and mooncake as both the MoE all-to-all and the elastic-EP backend.
python3 -m sglang.launch_server --model-path /data_workspace/wt/models/Qwen3-30B-A3B --tp 4 --ep 4 --trust-remote-code --mem-fraction-static 0.66 \
--decode-log-interval 1 \
--enable-dp-attention \
--enable-dp-lm-head \
--moe-dense-tp-size 1 \
--disable-cuda-graph \
--enable-two-batch-overlap \
--enable-layerwise-nvtx-marker \
--moe-runner-backend triton \
--moe-a2a-backend mooncake \
--elastic-ep-backend mooncake
# Build Mooncake (with EP support) from the v0.3.6.post1 source archive and
# install the resulting wheel.
wget https://github.com/kvcache-ai/Mooncake/archive/refs/tags/v0.3.6.post1.tar.gz
tar -zxvf v0.3.6.post1.tar.gz
# GitHub tag archives unpack to <repo>-<tag without the leading "v">.
cd Mooncake-0.3.6.post1
export BUILD_WITH_EP=1
# A100 (CUDA architecture 80)
cmake -B build -S . -DUSE_CUDA=true -DWITH_EP=ON -DCMAKE_CUDA_ARCHITECTURES="80"
cmake --build build -- -j16

# Before building the wheel, comment out the torch==2.8 version requirement in
# mooncake-integration/ep/ep_py.cpp (e.g. with vim).
./scripts/build_wheel.sh
# NOTE(review): the wheel filename says 0.3.7.post2 while the downloaded tag is
# v0.3.6.post1 — adjust the path to whatever build_wheel.sh actually produces.
python3 -m pip install ./mooncake-wheel/dist/mooncake_transfer_engine-0.3.7.post2-cp312-cp312-manylinux_2_17_x86_64.whl


报错:

  File "/sgl-workspace/sglang/python/sglang/srt/distributed/parallel_state.py", line 1523, in init_distributed_environment
torch.distributed.init_process_group(
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py", line 81, in wrapper
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py", line 95, in wrapper
func_return = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/distributed_c10d.py", line 1769, in init_process_group
default_pg, _ = _new_process_group_helper(
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/distributed_c10d.py", line 2074, in _new_process_group_helper
backend_class = creator_fn(dist_backend_opts, backend_options)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: Failed to register local memory.

WITH_NVIDIA_PEERMEM 是一个编译选项,用于启用 NVIDIA GPUDirect RDMA (Remote Direct Memory Access) 功能,允许第三方PCIe设备(如Mellanox InfiniBand网卡)直接访问GPU显存,绕过CPU和系统内存,大幅降低延迟并提升带宽。

在mooncake中:

// Registers the buffer starting at addr with the RDMA device (through pd_) and
// stores the buffer address and the resulting memory-region handle in mrMeta.
//
// Parameters:
//   addr   - start address of the buffer; may be host or CUDA device memory.
//   length - buffer size in bytes; clamped to globalConfig().max_mr_size.
//   access - access flags forwarded to ibv_reg_mr() / ibv_reg_dmabuf_mr().
//   mrMeta - output: receives addr and the registered memory-region handle.
//
// Returns 0 on success, or ERR_CONTEXT when the dmabuf export or the
// registration itself fails.
int RdmaContext::registerMemoryRegionInternal(void *addr, size_t length,
int access,
MemoryRegionMeta &mrMeta) {
// Oversized requests are not rejected: the length is shrunk to max_mr_size and
// only that many bytes are registered.
if (length > (size_t)globalConfig().max_mr_size) {
PLOG(WARNING) << "The buffer length exceeds device max_mr_size, "
<< "shrink it to " << globalConfig().max_mr_size;
length = (size_t)globalConfig().max_mr_size;
}
// This branch is compiled only when WITH_NVIDIA_PEERMEM is NOT defined and
// USE_CUDA is defined.
#if !defined(WITH_NVIDIA_PEERMEM) && defined(USE_CUDA)
// Implement register memory in a way that does not assume the presence of
// nvidia-peermem. If memory is on CPU call ibv_reg_mr() as usual. If memory
// is on GPU then use ibv_reg_dmabuf_mr() instead which does not require
// nvidia-peermem.
CUmemorytype memType;
CUresult result = cuPointerGetAttribute(
&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)addr);

// Register memory depending on whether memory is on host or GPU.
// A failed attribute query is handled the same way as host memory.
if (result != CUDA_SUCCESS || memType == CU_MEMORYTYPE_HOST) {
mrMeta.addr = addr;
mrMeta.mr = ibv_reg_mr(pd_, addr, length, access);
// NOTE(review): a memory type that is neither HOST nor DEVICE matches no
// branch, so mrMeta.mr is not assigned here — confirm what the caller passes in.
} else if (memType == CU_MEMORYTYPE_DEVICE) {
size_t allocSize;
// NOTE(review): the return value of this query is not checked.
cuPointerGetAttribute(&allocSize, CU_POINTER_ATTRIBUTE_RANGE_SIZE,
(CUdeviceptr)addr);
int dmabuf_fd;
// Export the device range as a dma-buf file descriptor.
result = cuMemGetHandleForAddressRange(
&dmabuf_fd, (CUdeviceptr)addr, allocSize,
CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0);
if (result != CUDA_SUCCESS) {
const char *errStr;
cuGetErrorString(result, &errStr);
LOG(ERROR) << "Failed to retrieve dmabuf for " << (uintptr_t)addr
<< " cuda error=" << errStr;
return ERR_CONTEXT;
}
mrMeta.addr = addr;
// NOTE(review): dmabuf_fd is not closed anywhere in this function — confirm
// whether ownership is handed off or the descriptor leaks.
mrMeta.mr = ibv_reg_dmabuf_mr(pd_, 0 /* offset */, length,
(uintptr_t)addr, dmabuf_fd, access);
}
#else
// WITH_NVIDIA_PEERMEM defined, or no CUDA: every address goes through plain
// ibv_reg_mr().
mrMeta.addr = addr;
mrMeta.mr = ibv_reg_mr(pd_, addr, length, access);
#endif
// A null handle from any of the registration paths is reported as ERR_CONTEXT.
if (!mrMeta.mr) {
PLOG(ERROR) << "Failed to register memory " << addr;
return ERR_CONTEXT;
}
return 0;
}

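解决办法:上面的代码中,只有在未定义 WITH_NVIDIA_PEERMEM(且定义了 USE_CUDA)时,GPU 显存才会通过 ibv_reg_dmabuf_mr() 注册,不依赖 nvidia-peermem;否则一律走 ibv_reg_mr()。因此编译mooncake时,加上:-DWITH_NVIDIA_PEERMEM=OFF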

# Rebuild Mooncake with -DWITH_NVIDIA_PEERMEM=OFF added to the cmake configure
# step, then reinstall the wheel over the existing installation.
export BUILD_WITH_EP=1
# A100 (CUDA architecture 80)
cmake -B build -S . -DUSE_CUDA=true -DWITH_EP=ON -DCMAKE_CUDA_ARCHITECTURES="80" -DWITH_NVIDIA_PEERMEM=OFF
cmake --build build -- -j16

# Before building the wheel, comment out the torch==2.8 version requirement in
# mooncake-integration/ep/ep_py.cpp (e.g. with vim).
./scripts/build_wheel.sh
# --force-reinstall replaces the already-installed wheel; --no-deps leaves the
# other installed packages untouched.
python3 -m pip install ./mooncake-wheel/dist/mooncake_transfer_engine-0.3.7.post2-cp312-cp312-manylinux_2_17_x86_64.whl --force-reinstall --no-deps

#!/bin/bash
#
# Launch one node of a multi-node SGLang server for DeepSeek-R1-0528 inside a
# new nsys session named "prefill", with DP attention, DeepEP all-to-all and
# two-batch overlap enabled.
#
# Environment:
#   NODE_RANK  rank of this node (default 0); set it per node instead of
#              editing the script.
set -euo pipefail

local_ip=0.0.0.0
MASTER_IP=192.168.1.2
model_path=/pfs/pfs-OqLB2M/hefeixiang/models/DeepSeek-R1-0528
#expert_location=prefill_in4096.json
TP_SIZE=16
DP_SIZE=16
EP_SIZE=16
NNODES=2
NODE_RANK=${NODE_RANK:-0}
#export NCCL_DEBUG=INFO
#export NCCL_DEBUG_SUBSYS=ALL
export NCCL_IB_HCA=mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_9,mlx5_10,mlx5_11,mlx5_12
export NVSHMEM_HCA_LIST=mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_9,mlx5_10,mlx5_11,mlx5_12

# force mooncake to use MNNVL
#export MC_FORCE_MNNVL=True

# environment copied from GB200 blog2 Low-prec decode
# https://github.com/sgl-project/sglang/issues/10903
#export SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN=1
#export SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2=1
export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
export NCCL_MNNVL_ENABLE=1
export NCCL_CUMEM_ENABLE=1
export PYTHONUNBUFFERED=1
export SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0
#export SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1
export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256

# debug & profile
export SGLANG_TBO_DEBUG=1
export SGLANG_OPERATIONS_ENABLE_PROFILE=1

# Server arguments are kept in an array so every expansion stays quoted and the
# list can be edited or commented line by line without breaking a chain of
# backslash continuations.
server_args=(
  --dist-init-addr "${MASTER_IP}:6676"
  --nnodes "${NNODES}" --node-rank "${NODE_RANK}"
  --model-path "${model_path}"
  --host "${local_ip}"
  --port 30000
  --decode-log-interval 1
  --max-running-requests 2048
  --disable-radix-cache
  --disable-shared-experts-fusion
  --watchdog-timeout 1000000
  --tp-size "${TP_SIZE}"
  --dp-size "${DP_SIZE}"
  --ep-size "${EP_SIZE}"
  --enable-dp-attention
  --enable-dp-lm-head
  --moe-dense-tp-size 1
  --chunked-prefill-size 262144
  --max-prefill-tokens 1048576
  --trust-remote-code
  --disable-cuda-graph
  --attention-backend cutlass_mla
  --enable-two-batch-overlap
  --enable-layerwise-nvtx-marker
  --moe-a2a-backend deepep
  --moe-runner-backend deep_gemm
  --deepep-mode auto
  --tokenizer-worker-num 16
  --mem-fraction-static 0.8
)

nsys launch \
  --trace=cublas,cuda,cudnn,nvtx \
  --show-output=true \
  --cuda-graph-trace node \
  --cuda-memory-usage=true \
  --python-backtrace=cuda \
  --trace-fork-before-exec=true \
  --session-new prefill \
  python3 -m sglang.launch_server "${server_args[@]}"