Skip to main content

部署

docker pull docker.m.daocloud.io/lmsysorg/sglang:latest
docker pull docker.1ms.run/lmsysorg/sglang:v0.5.6.post2

docker run --gpus all --ipc=host --network=host -v /home/ken:/workspace -v /ssd2:/data_workspace --name st_sglang --cap-add SYS_NICE --cap-add IPC_LOCK --entrypoint bash -it docker.m.daocloud.io/lmsysorg/sglang

# --ulimit memlock=-1: 允许容器内的进程锁定任意多的内存,不让它们被交换到磁盘,这对性能敏感的应用重要
# --ulimit stack=67108864:设置每个线程的栈空间上限为 64MB,防止栈溢出或过度占用内存


IB_DEVICES=$(find /dev/infiniband/* -maxdepth 1 -not -type d | xargs -I{} echo '--device {}:{}')
docker run --gpus all ${IB_DEVICES} --device /dev/gdrdrv:/dev/gdrdrv -v /etc/topo:/etc/topo -v /nfs:/nfs --ipc=host --network=host -v /home/ken:/workspace -v /ssd2:/data_workspace --name st_sglang --cap-add SYS_NICE --cap-add IPC_LOCK --entrypoint bash -it docker.1ms.run/lmsysorg/sglang:v0.5.6.post2

$ pip list
sglang 0.4.9.post2 /sgl-workspace/sglang/python

$ grep -r "pip install vllm" /sgl-workspace/sglang/
/sgl-workspace/sglang/benchmark/benchmark_vllm_060/README.md:pip install vllm==0.6.0
/sgl-workspace/sglang/benchmark/blog_v0_2/README.md:pip install vllm==0.5.2
/sgl-workspace/sglang/python/sglang/srt/layers/quantization/__init__.py: "Please install vllm by `pip install vllm==0.9.0.1`"
/sgl-workspace/sglang/python/sglang/srt/layers/quantization/__init__.py: "Please increase the version of your vllm. Try `pip install vllm==0.9.0.1`"
huggingface-cli download deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --local-dir /data_workspace/st/models/DeepSeek-R1-Distill-Qwen-1.5B

python3 -m sglang.launch_server --model-path /data_workspace/st/models/DeepSeek-R1-Distill-Qwen-1.5B --tp 2 --trust-remote-code --mem-fraction-static 0.66

curl http://localhost:30000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "deepseek-r1-distill-qwen-1.5b",
"messages": [
{"role": "user", "content": "你是谁?"}
]
}'

curl http://localhost:30000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "deepseek-r1-distill-qwen-1.5b",
"messages": [
{"role": "user", "content": "计算330 * 220 + 100 = ?"}
]
}'

单机部署pd测试

# sglang version
sglang 0.4.9.post2 /sgl-workspace/sglang/python


CUDA_VISIBLE_DEVICES=0 python -m sglang.launch_server \
--model-path /data_workspace/st/models/DeepSeek-R1-Distill-Qwen-1.5B \
--port 7000 \
--host 0.0.0.0 \
--disaggregation-mode prefill \
--disaggregation-bootstrap-port 8998 \
--disaggregation-transfer-backend mooncake

CUDA_VISIBLE_DEVICES=1 python -m sglang.launch_server \
--model-path /data_workspace/st/models/DeepSeek-R1-Distill-Qwen-1.5B \
--port 7001 \
--host 0.0.0.0 \
--disaggregation-mode decode \
--disaggregation-bootstrap-port 8999 \
--disaggregation-transfer-backend mooncake


python -m sglang.srt.disaggregation.mini_lb \
--prefill http://127.0.0.1:7000 \
--prefill-bootstrap-ports 8998 \
--decode http://127.0.0.1:7001 \
--host 0.0.0.0 \
--port 8111

curl http://localhost:8111/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "deepseek-r1-distill-qwen-1.5b",
"messages": [
{"role": "user", "content": "你是谁?"}
]
}'

没有检测到RDMA,采用了tcp。

4892 transfer_engine.cpp:366] Metrics reporting is disabled (set MC_TE_METRIC=1 to enable)
4892 transfer_engine.cpp:44] Transfer Engine starting. Server: 10.213.76.75, Metadata: P2PHANDSHAKE, ip_or_host_name: , rpc_port: 0
4892 transfer_engine.cpp:100] Transfer Engine RPC using P2P handshake, listening on 10.213.76.75:16332
4892 transfer_engine.cpp:112] Auto-discovering topology...
4892 topology.cpp:58] No RDMA devices found, check your device installation
4892 transfer_engine.cpp:127] Topology discovery complete. Found 0 HCAs.
4892 tcp_transport.cpp:241] TcpTransport: listen on port 16333
prefill:

[2026-03-08 23:21:25] The server is fired up and ready to roll!
[2026-03-08 23:27:18] Prefill batch. #new-seq: 1, #new-token: 8, #cached-token: 0, #token: 0, token usage: 0.00, #unbootstrapped-req: 0, #queue-req: 0, #transferring-req: 0, input throughput (token/s): 0.01, timestamp: 2026-03-08T23:27:18.508506
E0308 23:27:18.575773 4613 tcp_transport.cpp:136] Session::writeBody failed. Error: Bad address (value: 14), total_transferred_bytes_: 0, current transferred_bytes: 0
E0308 23:27:18.575831 4613 tcp_transport.cpp:136] Session::writeBody failed. Error: Bad address (value: 14), total_transferred_bytes_: 0, current transferred_bytes: 0
E0308 23:27:18.576057 4613 tcp_transport.cpp:136] Session::writeBody failed. Error: Bad address (value: 14), total_transferred_bytes_: 0, current transferred_bytes: 0
E0308 23:27:18.576133 4613 tcp_transport.cpp:136] Session::writeBody failed. Error: Bad address (value: 14), total_transferred_bytes_: 0, current transferred_bytes: 0
E0308 23:27:18.576243 4613 tcp_transport.cpp:136] Session::writeBody failed. Error: Bad address (value: 14), total_transferred_bytes_: 0, current transferred_bytes: 0
[2026-03-08 23:27:18] Session 10.213.76.75:16332 failed.
[2026-03-08 23:27:18] Prefill transfer failed for request rank=0 req.rid='c35967e34c5841f4b5a2a830e81898fc' req.bootstrap_room=866697408905544338 with exception KVTransferError(bootstrap_room=866697408905544338): Failed to send kv chunk of 866697408905544338 to 10.213.76.75:11519
[2026-03-08 23:27:18] INFO: 127.0.0.1:58074 - "POST /v1/chat/completions HTTP/1.1" 200 OK
decode:
[2026-03-08 23:25:06] The server is fired up and ready to roll!
E0308 23:27:18.575848 5532 tcp_transport.cpp:170] Session::readBody failed. Error: End of file (value: 2), total_transferred_bytes_: 0, current transferred_bytes: 0
E0308 23:27:18.575906 5532 tcp_transport.cpp:170] Session::readBody failed. Error: End of file (value: 2), total_transferred_bytes_: 0, current transferred_bytes: 0
E0308 23:27:18.576079 5532 tcp_transport.cpp:170] Session::readBody failed. Error: End of file (value: 2), total_transferred_bytes_: 0, current transferred_bytes: 0
E0308 23:27:18.576151 5532 tcp_transport.cpp:170] Session::readBody failed. Error: End of file (value: 2), total_transferred_bytes_: 0, current transferred_bytes: 0
E0308 23:27:18.576262 5532 tcp_transport.cpp:170] Session::readBody failed. Error: End of file (value: 2), total_transferred_bytes_: 0, current transferred_bytes: 0
[2026-03-08 23:27:18] Decode transfer failed for request rank=0 decode_req.req.rid='55d72d6c60284419b84b1aceb9197c2e' decode_req.req.bootstrap_room=866697408905544338 with exception KVTransferError(bootstrap_room=866697408905544338): Failed to get kvcache from prefill instance, it might be dead
[2026-03-08 23:27:18] INFO: 127.0.0.1:51136 - "POST /v1/chat/completions HTTP/1.1" 200 OK

Mooncake传输引擎没有检测到RDMA设备,自动回退到了TCP传输模式,而TCP模式在Prefill和Decode实例之间传输GPU显存(VRAM)数据时失败了。

TCP的致命缺陷:TCP传输只支持CPU内存(DRAM) 之间的数据传输,不支持直接访问GPU显存(VRAM)。当它试图写入或读取GPU上的KV Cache时,就触发了这个地址错误

服务启动后,发送请求,prefill 节点都报错 Failed to send kv chunk 在 sglang 社区中也找到同样的问题: https://github.com/sgl-project/sglang/issues/7118

根本原因是 mooncake 的 TCP 传输原先不支持 VRAM 数据的传输,补丁 https://github.com/kvcache-ai/Mooncake/pull/702 增加了这一支持;因此需要把 mooncake 更新到最新的 0.3.6 版本。

python3 -m pip list | grep moon
mooncake-transfer-engine 0.3.4.post2

更新0.3.6后出现了段错误

Fatal Python error: Segmentation fault

Thread 0x00007f748fffe640 (most recent call first):
File "/sgl-workspace/sglang/python/sglang/srt/disaggregation/mooncake/conn.py", line 706 in heartbeat_checker
File "/usr/lib/python3.10/threading.py", line 953 in run
File "/usr/lib/python3.10/threading.py", line 1016 in _bootstrap_inner
File "/usr/lib/python3.10/threading.py", line 973 in _bootstrap

Thread 0x00007f7493fff640 (most recent call first):
File "/usr/local/lib/python3.10/dist-packages/zmq/sugar/socket.py", line 799 in recv_multipart
File "/sgl-workspace/sglang/python/sglang/srt/disaggregation/mooncake/conn.py", line 677 in decode_thread
File "/usr/lib/python3.10/threading.py", line 953 in run
File "/usr/lib/python3.10/threading.py", line 1016 in _bootstrap_inner
File "/usr/lib/python3.10/threading.py", line 973 in _bootstrap

Thread 0x00007f75cbffe640 (most recent call first):
File "/sgl-workspace/sglang/python/sglang/srt/managers/scheduler.py", line 2041 in watchdog_thread
File "/usr/lib/python3.10/threading.py", line 953 in run
File "/usr/lib/python3.10/threading.py", line 1016 in _bootstrap_inner
File "/usr/lib/python3.10/threading.py", line 973 in _bootstrap

Thread 0x00007f75cffff640 (most recent call first):
File "/usr/lib/python3.10/threading.py", line 320 in wait
File "/usr/lib/python3.10/queue.py", line 171 in get
File "/sgl-workspace/sglang/python/sglang/srt/managers/tp_worker_overlap_thread.py", line 141 in forward_thread_func_
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116 in decorate_context
File "/sgl-workspace/sglang/python/sglang/srt/managers/tp_worker_overlap_thread.py", line 129 in forward_thread_func
File "/usr/lib/python3.10/threading.py", line 953 in run
File "/usr/lib/python3.10/threading.py", line 1016 in _bootstrap_inner
File "/usr/lib/python3.10/threading.py", line 973 in _bootstrap

Thread 0x00007f7657fff640 (most recent call first):
File "/usr/lib/python3.10/threading.py", line 324 in wait
File "/usr/lib/python3.10/threading.py", line 607 in wait
File "/usr/local/lib/python3.10/dist-packages/tqdm/_monitor.py", line 60 in run
File "/usr/lib/python3.10/threading.py", line 1016 in _bootstrap_inner
File "/usr/lib/python3.10/threading.py", line 973 in _bootstrap

Thread 0x00007f7d9bfff640 (most recent call first):
File "/usr/lib/python3.10/threading.py", line 324 in wait
File "/usr/lib/python3.10/threading.py", line 607 in wait
File "/usr/local/lib/python3.10/dist-packages/tqdm/_monitor.py", line 60 in run
File "/usr/lib/python3.10/threading.py", line 1016 in _bootstrap_inner
File "/usr/lib/python3.10/threading.py", line 973 in _bootstrap

Thread 0x00007f7fa841c640 (most recent call first):
File "/usr/local/lib/python3.10/dist-packages/torch/_inductor/compile_worker/subproc_pool.py", line 55 in _recv_msg
File "/usr/local/lib/python3.10/dist-packages/torch/_inductor/compile_worker/subproc_pool.py", line 191 in _read_thread
File "/usr/lib/python3.10/threading.py", line 953 in run
File "/usr/lib/python3.10/threading.py", line 1016 in _bootstrap_inner
File "/usr/lib/python3.10/threading.py", line 973 in _bootstrap

Thread 0x00007f83a3877480 (most recent call first):
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 999 in _rank_not_in_group
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 2815 in all_reduce
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 81 in wrapper
File "/sgl-workspace/sglang/python/sglang/srt/disaggregation/utils.py", line 56 in poll_and_all_reduce
File "/sgl-workspace/sglang/python/sglang/srt/disaggregation/decode.py", line 555 in pop_transferred
File "/sgl-workspace/sglang/python/sglang/srt/disaggregation/decode.py", line 875 in process_decode_queue
File "/sgl-workspace/sglang/python/sglang/srt/disaggregation/decode.py", line 706 in event_loop_overlap_disagg_decode
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116 in decorate_context
File "/sgl-workspace/sglang/python/sglang/srt/managers/scheduler.py", line 2778 in run_scheduler_process
File "/usr/lib/python3.10/multiprocessing/process.py", line 108 in run
File "/usr/lib/python3.10/multiprocessing/process.py", line 314 in _bootstrap
File "/usr/lib/python3.10/multiprocessing/spawn.py", line 129 in _main
File "/usr/lib/python3.10/multiprocessing/spawn.py", line 116 in spawn_main
File "<string>", line 1 in <module>

Extension modules: numpy._core._multiarray_umath, numpy.linalg._umath_linalg, pybase64._pybase64, charset_normalizer.md, requests.packages.charset_normalizer.md, requests.packages.chardet.md, multidict._multidict, yarl._quoting_c, propcache._helpers_c, aiohttp._http_writer, aiohttp._http_parser, aiohttp._websocket.mask, aiohttp._websocket.reader_c, frozenlist._frozenlist, uvloop.loop, torch._C, torch._C._dynamo.autograd_compiler, torch._C._dynamo.eval_frame, torch._C._dynamo.guards, torch._C._dynamo.utils, torch._C._fft, torch._C._linalg, torch._C._nested, torch._C._nn, torch._C._sparse, torch._C._special, psutil._psutil_linux, psutil._psutil_posix, zmq.backend.cython._zmq, PIL._imaging, setproctitle._setproctitle, yaml._yaml, regex._regex, markupsafe._speedups, PIL._imagingft, numpy.random._common, numpy.random.bit_generator, numpy.random._bounded_integers, numpy.random._mt19937, numpy.random.mtrand, numpy.random._philox, numpy.random._pcg64, numpy.random._sfc64, numpy.random._generator, scipy._lib._ccallback_c, scipy.linalg._fblas, scipy.linalg._flapack, scipy.linalg.cython_lapack, scipy.linalg._cythonized_array_utils, scipy.linalg._solve_toeplitz, scipy.linalg._decomp_lu_cython, scipy.linalg._matfuncs_sqrtm_triu, scipy.linalg._matfuncs_expm, scipy.linalg._linalg_pythran, scipy.linalg.cython_blas, scipy.linalg._decomp_update, scipy.sparse._sparsetools, _csparsetools, scipy.sparse._csparsetools, scipy.sparse.linalg._dsolve._superlu, scipy.sparse.linalg._eigen.arpack._arpack, scipy.sparse.linalg._propack._spropack, scipy.sparse.linalg._propack._dpropack, scipy.sparse.linalg._propack._cpropack, scipy.sparse.linalg._propack._zpropack, scipy.sparse.csgraph._tools, scipy.sparse.csgraph._shortest_path, scipy.sparse.csgraph._traversal, scipy.sparse.csgraph._min_spanning_tree, scipy.sparse.csgraph._flow, scipy.sparse.csgraph._matching, scipy.sparse.csgraph._reordering, scipy.optimize._group_columns, scipy._lib.messagestream, scipy.optimize._trlib._trlib, 
scipy.optimize._lbfgsb, _moduleTNC, scipy.optimize._moduleTNC, scipy.optimize._cobyla, scipy.optimize._slsqp, scipy.optimize._minpack, scipy.optimize._lsq.givens_elimination, scipy.optimize._zeros, scipy.optimize._cython_nnls, scipy._lib._uarray._uarray, scipy.special._ufuncs_cxx, scipy.special._ufuncs, scipy.special._specfun, scipy.special._comb, scipy.special._ellip_harm_2, scipy.linalg._decomp_interpolative, scipy.optimize._bglu_dense, scipy.optimize._lsap, scipy.spatial._ckdtree, scipy.spatial._qhull, scipy.spatial._voronoi, scipy.spatial._distance_wrap, scipy.spatial._hausdorff, scipy.spatial.transform._rotation, scipy.optimize._direct, sentencepiece._sentencepiece, cuda.bindings._lib.utils, cuda.bindings._bindings.cydriver, cuda.bindings.cydriver, cuda.bindings.driver, cuda.bindings._bindings.cynvrtc, cuda.bindings.cynvrtc, cuda.bindings.nvrtc, msgspec._core, cuda_utils, __triton_launcher (total: 111)
Child process unexpectedly failed with exitcode=139. pid=8699

这里主要是在做 transfer_sync、调用 mooncake 底层接口时出现了段错误;生成对应的 core 文件,并结合上面增加支持的 patch https://github.com/kvcache-ai/Mooncake/pull/702 进行分析:

char *dram_buffer = addr + total_transferred_bytes_;

#ifdef USE_CUDA
if (isCudaMemory(addr)) {
dram_buffer = new char[buffer_size];
cudaMemcpy(dram_buffer, addr + total_transferred_bytes_,
buffer_size, cudaMemcpyDefault);
}
#endif

dram_buffer 就是传进来的地址 addr,而不是 new 新申请的缓冲区,说明 USE_CUDA 这个宏没有使能,导致这里直接访问 VRAM 地址,没有先拷贝到 DRAM,因此出现 segmentation fault;

在社区中也查到,社区 release 的版本默认不开启 USE_CUDA 这个宏,因此需要我们自己打开这个宏、手动编译一个版本来替换;用开启 USE_CUDA 宏的新版本替换后,不再出错,可以正常处理请求;

从社区下载安装 0.3.6.post1 版本后,替换 /usr/local/lib/python3.xx/dist-packages/mooncake/engine.so 这个文件

python3 -m pip install mooncake-transfer-engine==0.3.6.post1 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple

wget https://github.com/kvcache-ai/Mooncake/archive/refs/tags/v0.3.6.post1.tar.gz

编译mooncake

wget https://github.com/kvcache-ai/Mooncake/archive/refs/tags/v0.3.6.post1.tar.gz
tar -zxvf v0.3.6.post1.tar.gz
cd Mooncake-0.3.6.post1
cmake -B build -S . -DUSE_CUDA=true -DBUILD_SHARED_LIBS=ON
cmake --build build -- -j16

依赖:
https://github.com/alibaba/yalantinglibs/blob/main/website/docs/zh/guide/what_is_yalantinglibs.md
find build/ -name '*.so'
build/mooncake-integration/store.cpython-310-x86_64-linux-gnu.so
build/mooncake-integration/engine.cpython-310-x86_64-linux-gnu.so
build/mooncake-common/src/libmooncake_common.so
build/mooncake-transfer-engine/src/libtransfer_engine.so
build/mooncake-store/src/libmooncake_store.so
cp build/mooncake-integration/engine.cpython-310-x86_64-linux-gnu.so /usr/local/lib/python3.10/dist-packages/mooncake/engine.so

# 或者
卸载 mooncake-transfer-engine,然后通过 make install 安装
# 解决找不到so,export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH

重新运行,成功解决问题!!!!pd分离成功。

单机网络拓扑查看

rdma link show
link mlx5_0/1 state ACTIVE physical_state LINK_UP netdev xgbe2
link mlx5_1/1 state DOWN physical_state DISABLED netdev xgbe3

nvidia-smi topo -m

mooncake中是如何找rdma设备的?

// ~/mooncake-transfer-engine/src/topology.cpp
static std::vector<InfinibandDevice> listInfiniBandDevices(
const std::vector<std::string> &filter) {
int num_devices = 0;
std::vector<InfinibandDevice> devices;

struct ibv_device **device_list = ibv_get_device_list(&num_devices);
if (!device_list) {
LOG(WARNING) << "No RDMA devices found, check your device installation";
return {};
}
if (device_list && num_devices <= 0) {
LOG(WARNING) << "No RDMA devices found, check your device installation";
ibv_free_device_list(device_list);
return {};
}
...
}

测试:

#include <stdio.h>
#include <infiniband/verbs.h>

/*
 * Enumerate the RDMA devices visible through libibverbs and print their names.
 *
 * Returns 0 once the result has been printed, or -1 when the device list
 * could not be obtained.
 */
int main() {
    int device_count;

    /* Step 1: fetch the device list. */
    struct ibv_device **devices = ibv_get_device_list(&device_count);
    if (devices == NULL) {
        perror("ibv_get_device_list failed");
        return -1;
    }

    /* Step 2: report either the device names or the absence of devices. */
    if (device_count != 0) {
        int idx = 0;

        printf("Found %d RDMA device(s):\n", device_count);
        while (idx < device_count) {
            /* Step 3: print each device name (e.g. mlx5_0, mlx5_1). */
            printf(" [%d] %s\n", idx, ibv_get_device_name(devices[idx]));
            ++idx;
        }
    } else {
        printf("No RDMA devices found in system.\n");
        printf("This explains why Mooncake reported 'No RDMA devices found'\n");
    }

    /* Step 4: the list must be released once we are done with it. */
    ibv_free_device_list(devices);

    return 0;
}

gcc -o rdma_check rdma_check.c -libverbs
./rdma_check

原因:没把 RDMA 设备映射进容器。加上 --device 映射后重新启动容器:

IB_DEVICES=$(find /dev/infiniband/* -maxdepth 1 -not -type d | xargs -I{} echo '--device {}:{}')
docker run --gpus all ${IB_DEVICES} --device /dev/gdrdrv:/dev/gdrdrv -v /etc/topo:/etc/topo -v /nfs:/nfs --ipc=host --network=host -v /home/ken:/workspace -v /ssd2:/data_workspace --name st_sglang --cap-add SYS_NICE --cap-add IPC_LOCK --entrypoint bash -it docker.m.daocloud.io/lmsysorg/sglang
Found 2 RDMA device(s):
[0] mlx5_0
[1] mlx5_1