# 1bit.systems NPU backend for Ollama
# Usage: ollama create qwen3-npu -f Modelfile
# Then:   ollama run qwen3-npu

# Base model this Modelfile builds on. FROM must resolve to something Ollama
# can load (a model name or a path to a model file).
# NOTE(review): "llama.cpp" is assumed to be a model available locally under
# that name — confirm, otherwise `ollama create` cannot resolve the base.
FROM llama.cpp

# Default sampling settings applied to every request for this model.
PARAMETER temperature 0.7
PARAMETER top_k 40
PARAMETER top_p 0.9

# Stop at the ChatML turn delimiters used by TEMPLATE below, so generation
# ends with the assistant turn instead of running on into a new one.
PARAMETER stop "<|im_start|>"
PARAMETER stop "<|im_end|>"

# NPU HTTP API
# Start the NPU server first: ./1bit-server 8081
# NOTE(review): `api_base` is not one of the parameters documented for the
# Modelfile PARAMETER instruction, and `ollama create` is expected to reject
# unknown parameter names, so the original line is disabled below. The
# Modelfile format has no documented instruction for forwarding inference to
# an external HTTP endpoint; clients that need the NPU server should be
# pointed at its URL directly — confirm the intended setup.
# PARAMETER api_base http://localhost:8081/v1

# ChatML prompt layout: optional system turn, optional user turn, then an open
# assistant turn for the model to complete.
TEMPLATE """{{ if .System }}<|im_start|>system
{{ .System }}<|im_end|>
{{ end }}{{ if .Prompt }}<|im_start|>user
{{ .Prompt }}<|im_end|>
{{ end }}<|im_start|>assistant
"""
