{
  "source_model": "/home/ckl/.cache/modelscope/hub/Qwen/Qwen3___5-9B",
  "tq_model": "/home/ckl/.cache/modelscope/hub/Qwen/Qwen3___5-9B-TQ4",
  "tensor_base": "model.language_model.layers.1.mlp.gate_proj",
  "row_start": 0,
  "row_count": 8,
  "shape": [
    8,
    4096
  ],
  "bits": 4,
  "group_size": 128,
  "centroids_config_vs_cuda": {
    "max_abs": 1.4901161193847656e-08,
    "mean_abs": 3.3760443329811096e-09,
    "config": [
      -0.23777656257152557,
      -0.18096588551998138,
      -0.14193911850452423,
      -0.11041539162397385,
      -0.08293881267309189,
      -0.05785765498876572,
      -0.03420550003647804,
      -0.011320935562252998,
      0.011320935562252998,
      0.03420550003647804,
      0.05785765498876572,
      0.08293881267309189,
      0.11041539162397385,
      0.14193911850452423,
      0.18096588551998138,
      0.23777656257152557
    ],
    "cuda": [
      -0.23777654767036438,
      -0.18096588551998138,
      -0.14193911850452423,
      -0.11041539162397385,
      -0.08293882012367249,
      -0.057857658714056015,
      -0.03420550003647804,
      -0.011320936493575573,
      0.011320936493575573,
      0.03420550003647804,
      0.057857658714056015,
      0.08293882012367249,
      0.11041539162397385,
      0.14193911850452423,
      0.18096588551998138,
      0.23777654767036438
    ]
  },
  "metrics": [
    {
      "lhs": "python_faithful_dequant",
      "rhs": "bf16_source",
      "elements": 32768,
      "max_abs": 0.005407883320003748,
      "mean_abs": 0.0008539303671568632,
      "rmse": 0.0010728667257353663,
      "rhs_rms": 0.011161223985254765,
      "rmse_over_rhs_rms": 0.09612446870995012,
      "max_rel": 1238.516845703125,
      "mean_rel": 0.5199573040008545,
      "rel_p50": 0.0958661288022995,
      "rel_p90": 0.5863989591598511,
      "rel_p95": 1.1582262516021729,
      "rel_p99": 5.039450645446777,
      "rel_p999": 45.884220123291016,
      "top1pct_rel_mean": 27.755062103271484,
      "top1pct_rel_max": 1238.516845703125
    },
    {
      "lhs": "arle_cuda_dequant",
      "rhs": "bf16_source",
      "elements": 32768,
      "max_abs": 0.0054073333740234375,
      "mean_abs": 0.0008538661058992147,
      "rmse": 0.001073163584806025,
      "rhs_rms": 0.011161223985254765,
      "rmse_over_rhs_rms": 0.09615106606800429,
      "max_rel": 1235.123779296875,
      "mean_rel": 0.519916832447052,
      "rel_p50": 0.09589041024446487,
      "rel_p90": 0.5870600342750549,
      "rel_p95": 1.158634901046753,
      "rel_p99": 5.039076805114746,
      "rel_p999": 45.783775329589844,
      "top1pct_rel_mean": 27.751319885253906,
      "top1pct_rel_max": 1235.123779296875
    },
    {
      "lhs": "arle_cuda_dequant",
      "rhs": "python_faithful_dequant",
      "elements": 32768,
      "max_abs": 0.00012020766735076904,
      "mean_abs": 1.2454496754799038e-05,
      "rmse": 1.8410868506180122e-05,
      "rhs_rms": 0.011117699556052685,
      "rmse_over_rhs_rms": 0.001655996225959974,
      "max_rel": 0.003881394863128662,
      "mean_rel": 0.001406661351211369,
      "rel_p50": 0.0013481031637638807,
      "rel_p90": 0.0026339932810515165,
      "rel_p95": 0.00296353199519217,
      "rel_p99": 0.00345249124802649,
      "rel_p999": 0.003757251426577568,
      "top1pct_rel_mean": 0.0036028441973030567,
      "top1pct_rel_max": 0.003881394863128662
    }
  ],
  "fwht_bug_control": {
    "cuda_vs_good_python_max_abs": 0.00012020766735076904,
    "cuda_vs_good_python_rmse": 1.8410868506180122e-05,
    "cuda_vs_bug_emulation_max_abs": 0.09718066453933716,
    "cuda_vs_bug_emulation_rmse": 0.015668567270040512,
    "cuda_vs_bug_emulation_bf16_max_abs": 0.09716796875,
    "cuda_vs_bug_emulation_bf16_rmse": 0.01566854678094387,
    "interpretation": "Compare ARLE CUDA against the faithful Python dequant and the pre-fix FWHT sign-bug emulation. A fixed kernel should match faithful Python after BF16 rounding and differ from the bug emulation."
  },
  "decision": "cuda_dequant_matches_python_with_bf16_rounding"
}
