Processing [/root/arle-decode-hidden-scratch/docs/trace-artifacts/2026-05-15-dsv4-deepep/nsys-single-token-padded-peer-combine/trace.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/cuda_api_sum.py]...

 ** CUDA API Summary (cuda_api_sum):

 Time (%)  Total Time (ns)  Num Calls   Avg (ns)     Med (ns)    Min (ns)  Max (ns)   StdDev (ns)                 Name
 --------  ---------------  ---------  -----------  -----------  --------  ---------  -----------  ----------------------------------
     35.4    1,163,962,638     37,235     31,259.9      2,510.0       271    958,698     68,367.8  cuMemFreeAsync
     23.8      781,508,350     37,235     20,988.5      6,108.0       143    404,642     31,130.7  cuMemAllocAsync
     12.7      418,369,243     38,791     10,785.2      8,017.0     2,861    101,270      7,262.4  cudaLaunchKernel
     11.8      386,874,377     39,619      9,764.9      6,800.0       111     96,789      7,703.3  cuMemsetD8Async
      9.3      304,449,576      1,044    291,618.4    136,671.5    12,062  5,621,339    420,146.9  cuMemcpyDtoHAsync_v2
      1.4       44,520,347      6,896      6,456.0      3,462.0       496     86,973      7,159.5  cudaEventRecord
      1.2       38,966,928      6,192      6,293.1      3,245.5       577     68,826      7,069.3  cudaStreamWaitEvent
      1.0       33,532,244      3,096     10,830.8      7,925.0     2,905     74,372      7,479.8  cuLaunchKernelEx
      0.9       31,176,880      2,084     14,960.1     11,108.0     4,493     84,269     10,076.8  cuMemcpyDtoDAsync_v2
      0.8       27,156,432      2,076     13,081.1      9,757.5     2,875     60,057      9,248.5  cuMemcpyHtoDAsync_v2
      0.7       23,386,039      6,616      3,534.8      1,608.0       246     44,936      4,769.0  cudaStreamGetCaptureInfo_v2_v11030
      0.6       19,035,361         16  1,189,710.1  1,138,570.5   145,682  2,356,165    944,247.5  cuStreamSynchronize
      0.2        7,890,441      3,096      2,548.6        701.0       131     64,103      4,337.3  cudaGetFuncBySymbol_v11000
      0.1        4,673,256        344     13,585.0     11,274.5     4,452    117,835      9,155.6  cudaMemsetAsync
      0.1        4,093,488          8    511,686.0    480,811.0    69,192    972,811    432,662.0  cuMemGetInfo_v2
      0.0           25,506          2     12,753.0     12,753.0     7,616     17,890      7,264.8  cuCtxSynchronize
      0.0            5,493          1      5,493.0      5,493.0     5,493      5,493          0.0  cuProfilerStart
      0.0            2,339          1      2,339.0      2,339.0     2,339      2,339          0.0  cuProfilerStop
