Processing [/root/arle-decode-hidden-scratch/docs/trace-artifacts/2026-05-15-dsv4-deepep/nsys-single-token-padded-dispatch/trace.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/cuda_api_sum.py]...

 ** CUDA API Summary (cuda_api_sum):

 Time (%)  Total Time (ns)  Num Calls   Avg (ns)     Med (ns)    Min (ns)  Max (ns)   StdDev (ns)                 Name
 --------  ---------------  ---------  -----------  -----------  --------  ---------  -----------  ----------------------------------
     33.9    1,261,893,636     37,235     33,890.0      3,200.0       385    993,410     71,173.0  cuMemFreeAsync
     26.4      982,457,075     37,235     26,385.3     10,863.0       168    489,109     35,288.4  cuMemAllocAsync
     12.6      469,333,861     38,791     12,099.0      8,722.0     2,731    142,145      8,860.7  cudaLaunchKernel
     11.8      440,153,219     39,963     11,014.0      7,468.0       111    305,232      9,038.5  cuMemsetD8Async
      7.8      290,620,577      1,044    278,372.2    174,814.5    11,810  6,663,680    387,520.3  cuMemcpyDtoHAsync_v2
      1.4       54,016,580      6,896      7,833.0      4,145.0       483    133,044      8,709.5  cudaEventRecord
      1.3       47,825,020      6,192      7,723.7      3,930.5       540     85,790      8,737.5  cudaStreamWaitEvent
      1.0       38,438,017      3,096     12,415.4      8,903.5     3,309     77,022      8,788.6  cuLaunchKernelEx
      1.0       35,979,703      2,084     17,264.7     13,555.5     4,641     94,393     10,974.7  cuMemcpyDtoDAsync_v2
      0.9       32,136,663      2,076     15,480.1     11,721.5     2,896     94,396     11,034.6  cuMemcpyHtoDAsync_v2
      0.8       28,180,483      6,616      4,259.4      1,837.0       244    100,103      6,069.0  cudaStreamGetCaptureInfo_v2_v11030
      0.7       25,221,431         16  1,576,339.4  1,481,083.5    28,354  3,420,539  1,449,634.3  cuStreamSynchronize
      0.3        9,853,666      3,096      3,182.7        839.0       135     57,672      5,368.7  cudaGetFuncBySymbol_v11000
      0.1        5,050,269        344     14,681.0     10,750.0     4,114     53,889      9,077.5  cudaMemsetAsync
      0.1        4,579,074          8    572,384.3    557,583.0   416,414    737,006    116,309.4  cuMemGetInfo_v2
      0.0           21,986          2     10,993.0     10,993.0     5,526     16,460      7,731.5  cuCtxSynchronize
      0.0            6,010          1      6,010.0      6,010.0     6,010      6,010          0.0  cuProfilerStart
      0.0            1,900          1      1,900.0      1,900.0     1,900      1,900          0.0  cuProfilerStop
