# Launch the release `infer` binary under `nsys profile`.
#
# nsys side: the report is written to the `trace` output path below (existing
# output is overwritten via --force-overwrite=true), the cuda, nvtx and osrt
# trace sources are requested, and the result is exported with --export=sqlite.
# Capture is gated with --capture-range=cudaProfilerApi and
# --capture-range-end=stop; --kill=none is passed as well.
# NOTE(review): presumably nothing is recorded unless the target calls the
# CUDA profiler start/stop API, and --kill=none leaves the target running
# after capture ends — confirm against the nsys CLI documentation.
#
# Target side: everything after the binary path is forwarded to `infer`
# (model path, port 18110, one slot, 4096 max sequence length, fp8 KV cache,
# 0.10 static memory fraction, 43 DeepSeek distributed layers).
nsys profile \
  --output /root/arle-nsys-one-token-c89d3457/docs/trace-artifacts/2026-05-14-dsv4-deepep/nsys-one-token-current/trace \
  --force-overwrite=true \
  --trace cuda,nvtx,osrt \
  --capture-range=cudaProfilerApi \
  --capture-range-end=stop \
  --export=sqlite \
  --kill=none \
  /root/arle/target/release/infer \
    --model-path /root/DeepSeek-V4-Flash \
    --port 18110 \
    --num-slots 1 \
    --max-seq-len 4096 \
    --mem-fraction-static 0.10 \
    --kv-cache-dtype fp8 \
    --deepseek-distributed-layers 43
