llama-gpt/api/run.sh

#!/bin/bash
# Run the repo Makefile's build target before starting the server
make build
# Get the number of available threads on the system
n_threads=$(grep -c ^processor /proc/cpuinfo)
# Define context window
n_ctx=4096
# Run entirely on the CPU (no layers offloaded to the GPU)
n_gpu_layers=0
# Define batch size
n_batch=2096
# If total RAM is below ~8 GB (MemTotal is reported in kB), drop the batch size to 1024
total_ram=$(awk '/MemTotal/ {print $2}' /proc/meminfo)
if [ "$total_ram" -lt 8000000 ]; then
  n_batch=1024
fi
echo "Initializing server with:"
echo "Batch size: $n_batch"
echo "Number of CPU threads: $n_threads"
echo "Number of GPU layers: $n_gpu_layers"
echo "Context window: $n_ctx"
exec python3 -m llama_cpp.server --n_ctx "$n_ctx" --n_threads "$n_threads" --n_gpu_layers "$n_gpu_layers" --n_batch "$n_batch"
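
# Usage sketch: llama_cpp.server exposes an OpenAI-compatible HTTP API, by
# default on port 8000. Assuming the defaults, and that the model path is
# supplied elsewhere (e.g. via a MODEL environment variable or a --model
# flag), a completion request could look like:
#
#   curl http://localhost:8000/v1/completions \
#     -H "Content-Type: application/json" \
#     -d '{"prompt": "Hello", "max_tokens": 16}'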