moe_tbo

# Baseline launch: serve Qwen1.5-MoE-A2.7B-Chat with --tp 4 / --ep 4.
python3 -m sglang.launch_server \
  --model-path /data_workspace/st/models/Qwen1.5-MoE-A2.7B-Chat \
  --tp 4 \
  --ep 4 \
  --trust-remote-code \
  --mem-fraction-static 0.66

# Smoke test: POST a single-turn chat request to the local server on port 30000.
# NOTE(review): the "model" field names deepseek-r1-distill-qwen-1.5b while the
# server above is launched with Qwen1.5-MoE-A2.7B-Chat; confirm the server
# accepts a mismatching model name.
curl \
    --header "Content-Type: application/json" \
    --data '{
        "model": "deepseek-r1-distill-qwen-1.5b",
        "messages": [
            {"role": "user", "content": "你是谁?"}
        ]
    }' \
    http://localhost:30000/v1/chat/completions


# Launch the Qwen1.5-MoE server under Nsight Systems with two-batch overlap
# (TBO) enabled.
#
# The debug/profile variables are exported BEFORE the `nsys launch` line: that
# line ends with a backslash, so whatever follows it becomes the program nsys
# runs. It must be the python3 command, not an `export` statement.
export SGLANG_TBO_DEBUG=1
export SGLANG_OPERATIONS_ENABLE_PROFILE=1
# Qwen2Moe does not work; only Qwen3Moe supports the TBO feature
# (sglang v0.5.6.post2).
nsys launch --trace=cublas,cuda,cudnn,nvtx --show-output=true --cuda-graph-trace node --cuda-memory-usage=true --python-backtrace=cuda --trace-fork-before-exec=true --session-new prefill \
python3 -m sglang.launch_server --model-path /data_workspace/wt/models/Qwen1.5-MoE-A2.7B-Chat --tp 4 --ep 4 --trust-remote-code --mem-fraction-static 0.66 \
--decode-log-interval 1 \
--enable-dp-attention \
--enable-dp-lm-head \
--moe-dense-tp-size 1 \
--disable-cuda-graph \
--enable-two-batch-overlap \
--enable-layerwise-nvtx-marker


# NOTE(review): the four flag lines below are not attached to any command (the
# launch command before them ends without a line continuation). Presumably they
# are optional extras to append to a launch_server invocation; confirm before
# pasting, since on their own the shell would try to run them as commands.
--attention-backend cutlass_mla \
--moe-a2a-backend deepep \
--moe-runner-backend deep_gemm \
--deepep-mode auto 

# Download the Qwen/Qwen3-30B-A3B checkpoint into ./models/Qwen3-30B-A3B.
modelscope download \
  --model Qwen/Qwen3-30B-A3B \
  --local_dir ./models/Qwen3-30B-A3B

# Qwen3-30B-A3B launch with two-batch overlap, the triton MoE runner and the
# mooncake a2a / elastic-EP backends.

# Put the torch shared libraries and /usr/local/lib ahead of the existing
# library search path.
export LD_LIBRARY_PATH=/usr/local/lib/python3.12/dist-packages/torch/lib/:/usr/local/lib:$LD_LIBRARY_PATH
# Debug / profiling switches read by sglang.
export SGLANG_TBO_DEBUG=1 SGLANG_OPERATIONS_ENABLE_PROFILE=1

python3 -m sglang.launch_server \
  --model-path /data_workspace/wt/models/Qwen3-30B-A3B \
  --tp 4 \
  --ep 4 \
  --trust-remote-code \
  --mem-fraction-static 0.66 \
  --decode-log-interval 1 \
  --enable-dp-attention \
  --enable-dp-lm-head \
  --moe-dense-tp-size 1 \
  --disable-cuda-graph \
  --enable-two-batch-overlap \
  --enable-layerwise-nvtx-marker \
  --moe-runner-backend triton \
  --moe-a2a-backend mooncake \
  --elastic-ep-backend mooncake

# Build Mooncake from the v0.3.6.post1 source tarball with EP support, then
# build and install the Python wheel.
wget https://github.com/kvcache-ai/Mooncake/archive/refs/tags/v0.3.6.post1.tar.gz
# wget names the download after the last URL path component.
tar -zxvf v0.3.6.post1.tar.gz
# GitHub tag archives unpack into <repo>-<tag without the leading "v">.
cd Mooncake-0.3.6.post1
export BUILD_WITH_EP=1
# A100 (compute capability 8.0)
cmake -B build -S . -DUSE_CUDA=true -DWITH_EP=ON -DCMAKE_CUDA_ARCHITECTURES="80"
cmake --build build -- -j16

# Before building the wheel, comment out the torch==2.8 version requirement in
# mooncake-integration/ep/ep_py.cpp (e.g. with vim).
./scripts/build_wheel.sh
# NOTE(review): the wheel filename says 0.3.7.post2 while the tarball above is
# v0.3.6.post1; confirm the actual filename under ./mooncake-wheel/dist/.
python3 -m pip install ./mooncake-wheel/dist/mooncake_transfer_engine-0.3.7.post2-cp312-cp312-manylinux_2_17_x86_64.whl

报错:

  File "/sgl-workspace/sglang/python/sglang/srt/distributed/parallel_state.py", line 1523, in init_distributed_environment
    torch.distributed.init_process_group(
  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py", line 81, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py", line 95, in wrapper
    func_return = func(*args, **kwargs)
                  ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/distributed_c10d.py", line 1769, in init_process_group
    default_pg, _ = _new_process_group_helper(
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/distributed/distributed_c10d.py", line 2074, in _new_process_group_helper
    backend_class = creator_fn(dist_backend_opts, backend_options)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: Failed to register local memory.

WITH_NVIDIA_PEERMEM 是一个编译选项，用于启用 NVIDIA GPUDirect RDMA (Remote Direct Memory Access) 功能，允许第三方PCIe设备（如Mellanox InfiniBand网卡）直接访问GPU显存，绕过CPU和系统内存，大幅降低延迟并提升带宽。

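在 Mooncake 中，内存注册的实现如下：定义了 WITH_NVIDIA_PEERMEM（或未启用 USE_CUDA）时走 #else 分支，对任何地址都直接调用 ibv_reg_mr；未定义 WITH_NVIDIA_PEERMEM 且启用 USE_CUDA 时，GPU 显存改用 ibv_reg_dmabuf_mr 注册：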

// Registers the buffer starting at `addr` with the protection domain `pd_` and
// stores the address and the resulting memory-region handle in `mrMeta`.
//
// Returns 0 on success, or ERR_CONTEXT when the dmabuf handle cannot be
// retrieved or when no memory region was produced.
int RdmaContext::registerMemoryRegionInternal(void *addr, size_t length,
                                              int access,
                                              MemoryRegionMeta &mrMeta) {
    // Requests larger than the configured max_mr_size are clamped (with a
    // warning) rather than rejected.
    if (length > (size_t)globalConfig().max_mr_size) {
        PLOG(WARNING) << "The buffer length exceeds device max_mr_size, "
                      << "shrink it to " << globalConfig().max_mr_size;
        length = (size_t)globalConfig().max_mr_size;
    }
#if !defined(WITH_NVIDIA_PEERMEM) && defined(USE_CUDA)
    // Implement register memory in a way that does not assume the presence of
    // nvidia-peermem. If memory is on CPU call ibv_reg_mr() as usual. If memory
    // is on GPU then use ibv_reg_dmabuf_mr() instead which does not require
    // nvidia-peermem.
    CUmemorytype memType;
    CUresult result = cuPointerGetAttribute(
        &memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)addr);

    // Register memory depending on whether memory is on host or GPU.
    // A failed attribute query is handled the same way as host memory.
    // NOTE(review): memory types other than HOST and DEVICE match neither
    // branch, so mrMeta.mr is not assigned here in that case; confirm the
    // caller initialises it.
    if (result != CUDA_SUCCESS || memType == CU_MEMORYTYPE_HOST) {
        mrMeta.addr = addr;
        mrMeta.mr = ibv_reg_mr(pd_, addr, length, access);
    } else if (memType == CU_MEMORYTYPE_DEVICE) {
        // Query CU_POINTER_ATTRIBUTE_RANGE_SIZE for addr; the return value of
        // this call is not checked.
        size_t allocSize;
        cuPointerGetAttribute(&allocSize, CU_POINTER_ATTRIBUTE_RANGE_SIZE,
                              (CUdeviceptr)addr);
        // Ask CUDA for a dmabuf file descriptor covering allocSize bytes
        // starting at addr.
        int dmabuf_fd;
        result = cuMemGetHandleForAddressRange(
            &dmabuf_fd, (CUdeviceptr)addr, allocSize,
            CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0);
        if (result != CUDA_SUCCESS) {
            const char *errStr;
            cuGetErrorString(result, &errStr);
            LOG(ERROR) << "Failed to retrieve dmabuf for " << (uintptr_t)addr
                       << " cuda error=" << errStr;
            return ERR_CONTEXT;
        }
        // Register through the dmabuf fd; only `length` bytes are registered,
        // not the full allocSize.
        mrMeta.addr = addr;
        mrMeta.mr = ibv_reg_dmabuf_mr(pd_, 0 /* offset */, length,
                                      (uintptr_t)addr, dmabuf_fd, access);
    }
#else
    // WITH_NVIDIA_PEERMEM is defined or USE_CUDA is not: every address goes
    // through ibv_reg_mr().
    mrMeta.addr = addr;
    mrMeta.mr = ibv_reg_mr(pd_, addr, length, access);
#endif
    // A null handle means the registration call failed.
    if (!mrMeta.mr) {
        PLOG(ERROR) << "Failed to register memory " << addr;
        return ERR_CONTEXT;
    }
    return 0;
}

解决办法：编译 Mooncake 时，在 cmake 命令中加上 -DWITH_NVIDIA_PEERMEM=OFF：

# Rebuild Mooncake with WITH_NVIDIA_PEERMEM turned off, then reinstall the wheel.
export BUILD_WITH_EP=1
# Configure for A100.
cmake -B build -S . \
  -DUSE_CUDA=true \
  -DWITH_EP=ON \
  -DCMAKE_CUDA_ARCHITECTURES="80" \
  -DWITH_NVIDIA_PEERMEM=OFF
cmake --build build -- -j16

# Before building the wheel, comment out the torch==2.8 version requirement in
# mooncake-integration/ep/ep_py.cpp (e.g. with vim).
./scripts/build_wheel.sh
# Replace the already-installed wheel without touching its dependencies.
python3 -m pip install \
  ./mooncake-wheel/dist/mooncake_transfer_engine-0.3.7.post2-cp312-cp312-manylinux_2_17_x86_64.whl \
  --force-reinstall \
  --no-deps

#!/bin/bash
# Launch one node of a multi-node DeepSeek-R1-0528 sglang server under
# `nsys launch`, with two-batch overlap and the DeepEP / DeepGEMM MoE backends.
# Edit NODE_RANK (and MASTER_IP) per node before running.

# Address the HTTP server binds to (passed to --host).
local_ip=0.0.0.0
# Address used for --dist-init-addr (port 6676).
MASTER_IP=192.168.1.2
model_path=/pfs/pfs-OqLB2M/hefeixiang/models/DeepSeek-R1-0528
#expert_location=prefill_in4096.json
TP_SIZE=16
DP_SIZE=16
EP_SIZE=16
NNODES=2
NODE_RANK=0
#export NCCL_DEBUG=INFO
#export NCCL_DEBUG_SUBSYS=ALL
export NCCL_IB_HCA=mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_9,mlx5_10,mlx5_11,mlx5_12
export NVSHMEM_HCA_LIST=mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_9,mlx5_10,mlx5_11,mlx5_12

# force mooncake to use MNNVL
#export MC_FORCE_MNNVL=True

# environment copied from GB200 blog2 Low-prec decode
# https://github.com/sgl-project/sglang/issues/10903
#export SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN=1
#export SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2=1
export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
export NCCL_MNNVL_ENABLE=1
export NCCL_CUMEM_ENABLE=1
export PYTHONUNBUFFERED=1
export SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0
#export SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK=1
export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256

# debug & profile
export SGLANG_TBO_DEBUG=1
export SGLANG_OPERATIONS_ENABLE_PROFILE=1

# All expansions are quoted so that an empty or space-containing value cannot
# split into extra arguments or silently drop one.
nsys launch --trace=cublas,cuda,cudnn,nvtx --show-output=true --cuda-graph-trace node --cuda-memory-usage=true --python-backtrace=cuda --trace-fork-before-exec=true --session-new prefill \
python3 -m sglang.launch_server \
    --dist-init-addr "${MASTER_IP}:6676" \
    --nnodes "${NNODES}" --node-rank "${NODE_RANK}" \
    --model-path "${model_path}" \
    --host "${local_ip}" \
    --port 30000 \
    --decode-log-interval 1 \
    --max-running-requests 2048 \
    --disable-radix-cache \
    --disable-shared-experts-fusion \
    --watchdog-timeout 1000000 \
    --tp-size "${TP_SIZE}" \
    --dp-size "${DP_SIZE}" \
    --ep-size "${EP_SIZE}" \
    --enable-dp-attention \
    --enable-dp-lm-head \
    --moe-dense-tp-size 1 \
    --chunked-prefill-size 262144 \
    --max-prefill-tokens 1048576 \
    --trust-remote-code \
    --disable-cuda-graph \
    --attention-backend cutlass_mla \
    --enable-two-batch-overlap \
    --enable-layerwise-nvtx-marker \
    --moe-a2a-backend deepep \
    --moe-runner-backend deep_gemm \
    --deepep-mode auto \
    --tokenizer-worker-num 16 \
    --mem-fraction-static 0.8