1bit.systems NPU Live
$ npu_engine 9 8
=== NPU Engine i8 + Attention ===
Init 8 contexts (4 GEMM + 4 attention).
Dequant+pack: 4.3s
=== Prefill 9 ===
1/9 4/9 7/9 Done. 176ms (19 ms/tok)
=== Generate ===
Waiting for NPU output...
prefill: {
title: "Batched Prefill (9 tokens, 176ms)",
data: [151643, 872, 198, 11852, 151644, 198, 151643, 77091, 198]
},
decode: {
title: "Decode Output (8 tokens, 243 ms/tok)",
data: [92850, 26686, 111383, 104068, 126203, 2541, 90103, 87567]
},
bench: {
title: "Scaling Benchmarks",
data: {
prefill_1: "1 tok → 164ms",
prefill_4: "4 tok → 177ms (44ms/tok)",
prefill_9: "9 tok → 176ms (19ms/tok)",
prefill_128: "128 tok → ~200ms est (1.6ms/tok)",
decode: "243ms/tok (stable)",
gemm_d: "D proj: 55.7 TFLOPS (INT8)",
gemm_o: "O proj: 39.7 TFLOPS (INT8)",
config2: "Config2 BFP16: 31.4 TFLOPS"
}
},
log: {
title: "Raw Engine Output",
data: [
"=== NPU Engine i8 + Attention ===",
"Init 8 contexts...",
" [QKV] ready [O] ready [GU] ready [D] ready",
" [ATTN w0] ready [ATTN w1] ready [ATTN w2] ready [ATTN w3] ready",
"Dequant+pack... 4291ms",
"=== Prefill 9 ===",
" L0 batched L7 batched L14 batched L21 batched",
"Prefill done: 176ms (19 ms/tok)",
"",
"=== Generate ===",
" [0] 92850 (263ms)",
" [1] 26686 (242ms)",
" [2] 111383 (240ms)",
" [3] 104068 (240ms)",
" [4] 126203 (242ms)",
" [5] 2541 (243ms)",
" [6] 90103 (241ms)",
" [7] 87567 (242ms)",
"",
"=== 244 ms/tok ===",
"exit: 0"
]
}
};
let activeTab = 'prefill';

/**
 * Activate one of the four dashboard tabs and render its panel.
 *
 * The tab button is located by position with `.tab:nth-child(N)`, where N is
 * the 1-based position of the key in the order prefill, decode, bench, log.
 * An unrecognised key falls back to the prefill tab for the stored state, the
 * highlighted button and the rendered panel alike, so they never disagree.
 *
 * @param {string} t - Tab key: 'prefill', 'decode', 'bench' or 'log'.
 */
function showTab(t) {
  // Order matters: a panel's position here is its tab button's nth-child index.
  const panels = [
    ['prefill', showPrefill],
    ['decode', showDecode],
    ['bench', showBench],
    ['log', showLog],
  ];
  const found = panels.findIndex(([key]) => key === t);
  const index = found === -1 ? 0 : found;
  const [key, render] = panels[index];

  activeTab = key;
  document.querySelectorAll('.tab').forEach((el) => el.classList.remove('active'));

  // A missing tab button must not prevent the panel itself from rendering.
  const button = document.querySelector(`.tab:nth-child(${index + 1})`);
  if (button) {
    button.classList.add('active');
  }
  render();
}
/**
 * Render the batched-prefill panel.
 *
 * Writes a report into `#term` made of the prefill title and prompt token ids
 * taken from `TOKENS.prefill`, followed by fixed timing and GEMM lines, hides
 * `#tokens-display`, and updates the `#speed`, `#tokens` and `#bar` widgets
 * with the fixed prefill figures.
 */
function showPrefill() {
  const byId = (id) => document.getElementById(id);
  const terminal = byId('term');
  const { title, data } = TOKENS.prefill;

  // The trailing empty entry yields the final newline of the report.
  terminal.innerHTML = [
    '$ npu_engine 9 8 --prefill-only',
    `${title}`,
    `Prompt tokens: [${data.join(', ')}]`,
    '--- Batched through 28 layers ---',
    'Total prefill time: 176ms',
    'Per-token: 19ms/tok',
    '--- GEMM utilization at M=9 ---',
    'QKV: 9×1024×4096 INT8 → 3.2ms',
    'O: 9×2048×1024 INT8 → 1.1ms',
    'GU: 9×1024×6144 INT8 → 4.8ms',
    'D: 9×3072×1024 INT8 → 1.7ms',
    '',
  ].join('\n');

  byId('tokens-display').style.display = 'none';
  byId('speed').textContent = '19';
  byId('tokens').textContent = '9';

  const bar = byId('bar');
  bar.style.width = '19%';
  bar.textContent = '19 ms/tok pref';
}
/**
 * Render the decode panel.
 *
 * Lists every token id from `TOKENS.decode.data` in `#term`, each with a
 * pseudo-random latency between 240 and 259 ms (generated here, not measured),
 * shows the token ids in `#tokens-display`, and sets the `#speed`, `#tokens`
 * and `#bar` widgets to the fixed decode figures.
 */
function showDecode() {
  const byId = (id) => document.getElementById(id);
  const terminal = byId('term');
  const { title, data } = TOKENS.decode;

  // One row per token; each row carries its own trailing newline.
  const rows = [];
  data.forEach((token, idx) => {
    const latencyMs = 240 + Math.floor(Math.random() * 20);
    rows.push(`[${idx}] ${token} ${latencyMs}ms\n`);
  });
  const body = rows.join('\n');

  // The trailing empty entry yields the final newline of the report.
  terminal.innerHTML = [
    '$ npu_engine 9 8',
    `${title}`,
    body,
    '=== 244 ms/tok ===',
    'exit: 0',
    '',
  ].join('\n');

  const chips = byId('tokens-display');
  chips.style.display = 'flex';
  chips.innerHTML = data.map((token) => `${token}`).join('');

  byId('speed').textContent = '244';
  byId('tokens').textContent = '8';

  const bar = byId('bar');
  bar.style.width = '4.1%';
  bar.textContent = '4.1 tok/s';
}
/**
 * Render the benchmark summary panel.
 *
 * The report text is entirely static, so this function does not read the
 * token data object at all (the original fetched `TOKENS.bench.data` into a
 * local that was never used, which made the panel fail if that data was
 * missing). Also hides `#tokens-display` and updates the `#speed`, `#tokens`,
 * `#tf` and `#bar` widgets with the fixed headline figures.
 */
function showBench() {
  const byId = (id) => document.getElementById(id);

  byId('term').innerHTML = `=== 1bit.systems Benchmarks ===
Raw GEMM (NPU silicon):
D projection: 55.7 TFLOPS (1024×3072×1024 INT8)
O projection: 39.7 TFLOPS (1024×2048×1024 INT8)
Config2 BFP16: 31.4 TFLOPS (3072×4096×1536)
Inference:
Prefill M=1: 164ms (164 ms/tok)
Prefill M=4: 177ms (44 ms/tok)
Prefill M=9: 176ms (19 ms/tok) ★
Prefill M=128: ~200ms est (1.6 ms/tok) → 50 TOPS territory
Decode: 244ms (single token, CPU softmax bottleneck)
GPU Engine (ZINC):
Decode: 27 µs (Q4_K, 6912×2560, Vulkan)
Prefill: 21.9 TFlops (2560×6912×2560, Vulkan)
=== Hardware: AMD Ryzen AI Max+ 395, Strix Halo ===
`;

  byId('tokens-display').style.display = 'none';
  byId('speed').textContent = '19';
  byId('tokens').textContent = '9';
  byId('tf').textContent = '55.7';

  const bar = byId('bar');
  bar.style.width = '55%';
  bar.textContent = '55.7 TFLOPS peak';
}
/**
 * Render the raw engine log panel.
 *
 * Every entry of `TOKENS.log.data` gets its own trailing newline and the
 * entries are then joined with another newline before being written to
 * `#term`. `#tokens-display` is hidden.
 */
function showLog() {
  const terminal = document.getElementById('term');
  const { data: entries } = TOKENS.log;
  const withNewline = (entry) => `${entry}\n`;

  terminal.innerHTML = entries.map((entry) => withNewline(entry)).join('\n');

  const chips = document.getElementById('tokens-display');
  chips.style.display = 'none';
}
/**
 * Show the static decode snapshot.
 *
 * Calls `showDecode()` and then writes the fixed headline figures into the
 * `#speed`, `#tokens`, `#tf` and `#bar` widgets.
 */
function loadStatic() {
  showDecode();

  // Element id -> text shown in that widget, written in this order.
  const labels = [
    ['speed', '244'],
    ['tokens', '8'],
    ['tf', '55.7'],
  ];
  labels.forEach(([id, text]) => {
    document.getElementById(id).textContent = text;
  });

  const bar = document.getElementById('bar');
  bar.style.width = '4.1%';
  bar.textContent = '4.1 tok/s';
}
/**
 * Show a "running" placeholder, then fall back to the static snapshot.
 *
 * Writes a three-line notice into `#term`, resets the `#speed` and `#bar`
 * widgets, and schedules `loadStatic` to run after 2000 ms. No engine is
 * started here; the notice only describes what a run would do.
 */
function loadRun() {
  const el = document.getElementById('term');

  // Built from explicit lines: a quoted string literal cannot contain raw line
  // breaks, which made the original multi-line literal a syntax error.
  // The trailing empty entry yields the final newline of the notice.
  el.innerHTML = [
    ' NPU engine running...',
    'This would trigger: npu_engine 9 8',
    'Expected: ~30s runtime, diverse tokens, 244 ms/tok',
    '',
  ].join('\n');

  document.getElementById('speed').textContent = '...';
  document.getElementById('bar').style.width = '0%';
  setTimeout(loadStatic, 2000);
}
// Render the static decode snapshot as soon as this script is evaluated.
loadStatic();