```yaml
services:
  ollama:
    # 0.11.6 0.12.3
    image: docker.m.daocloud.io/ollama/ollama:0.13.0
    container_name: ollama
    networks:
      - service
    ports:
      - 11434:11434
    volumes:
      - /data/home/.ollama/:/root/.ollama/
    restart: unless-stopped
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [ gpu ]
    environment:
      # qwen3-vl supports up to 256K context
      #OLLAMA_CONTEXT_LENGTH: 16384 # 16k
      #OLLAMA_CONTEXT_LENGTH: 32768 # 32k
      # qwen3-vl:32b 256K 206.6 GiB
      # qwen3-vl:32b 64K 42.7 GiB
      OLLAMA_CONTEXT_LENGTH: 64000
      OLLAMA_DEBUG: 1
      #OLLAMA_KV_CACHE_TYPE: q4_0
      OLLAMA_FLASH_ATTENTION: 1
      #OLLAMA_NEW_ENGINE: true
      OLLAMA_NUM_PARALLEL: 2

  llama:
    image: ghcr.io/mostlygeek/llama-swap:v172-cuda-b7062
    container_name: llama
    command:
      - -watch-config
      - -config
      - /llama/llama-swap.yaml
    volumes:
      - ./models:/models
      - ./models:/root/.cache/llama.cpp
      # the user is app, home is /app, uid=10001
      - ./models:/app/.cache/llama.cpp
      - ./llama:/llama
    ports:
      - 11435:8080
    environment:
      HF_ENDPOINT: https://modelscope.cn
    sysctls:
      net.ipv6.conf.all.disable_ipv6: 1
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [ gpu ]
    extra_hosts:
      - "host.docker.internal:host-gateway"
```
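
The llama-swap container reads its model definitions from the `/llama/llama-swap.yaml` file mounted above, which isn't shown here. A minimal sketch of what it could contain, assuming a hypothetical GGUF placed in the mounted `./models` directory; the model name, file path, `llama-server` location inside the image, and flags are placeholders, and `${PORT}` is substituted by llama-swap itself:

```yaml
# llama/llama-swap.yaml — illustrative example, not the author's actual config.
models:
  "qwen3-vl":
    # llama-swap picks a free port, replaces ${PORT}, and proxies requests to it.
    # /app/llama-server and the model path are assumptions for this sketch.
    cmd: |
      /app/llama-server
      --model /models/qwen3-vl.gguf
      --ctx-size 65536
      --port ${PORT}
    ttl: 300   # stop the llama-server process after 5 minutes of inactivity
```

With something like this in place, OpenAI-style requests to port 11435 that name `"model": "qwen3-vl"` start the matching llama-server process on demand, and `-watch-config` lets llama-swap pick up edits to the file without restarting the container.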