# Run target/release/infer under `nsys profile` and store the report (plus a
# SQLite export) at
# docs/trace-artifacts/2026-05-15-dsv4-deepep/nsys-single-decode-token-current-user/trace.
#
# The VAR=value prefixes are scoped to this one command; they are not exported
# into the calling shell.
#   CUDA_VISIBLE_DEVICES / INFER_CUDA_DEVICES : device ids 0 through 7.
#   ARLE_DSV4_* : consumed by the profiled program; their exact meaning is not
#                 visible from this command (presumably feature toggles --
#                 TODO confirm against the infer sources).
#
# Collection is gated by --capture-range=cudaProfilerApi with
# --capture-range-end=stop, and --kill=none is passed so nsys does not signal
# the target when collection ends.
#
# NOTE(review): every path except --model-path is relative, so this presumably
# has to be started from the repository root -- confirm.
CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' \
INFER_CUDA_DEVICES='0,1,2,3,4,5,6,7' \
ARLE_DSV4_MOE_BACKEND=deepep \
ARLE_DSV4_INCREMENTAL_KV=1 \
ARLE_DSV4_FUSED_DISPATCH_PAYLOAD=1 \
nsys profile \
  --trace 'cuda,nvtx,osrt' \
  --capture-range=cudaProfilerApi \
  --capture-range-end=stop \
  --export=sqlite \
  --kill=none \
  --force-overwrite=true \
  --output docs/trace-artifacts/2026-05-15-dsv4-deepep/nsys-single-decode-token-current-user/trace \
  target/release/infer \
    --model-path /root/DeepSeek-V4-Flash \
    --port 18217 \
    --num-slots 1 \
    --max-seq-len 4096 \
    --mem-fraction-static 0.10 \
    --kv-cache-dtype fp8 \
    --deepseek-distributed-layers 43
