# Runtime configuration for the model-serving application.
# One KEY=VALUE per line so the file works both when sourced by a shell
# and when read by dotenv-style loaders (which expect one entry per line
# and cannot parse a single space-separated run of assignments).

# --- HTTP server ---
SERVER_HOST=127.0.0.1
SERVER_PORT=6060
LOG_LEVEL=info

# --- Response cache ---
MAX_CACHE_SIZE=16384
# NOTE(review): presumably seconds — confirm against the consumer.
MAX_CACHE_TTL=3600

# --- Model backend selection ---
MODEL_PATH=/path/to/model
USE_LLAMA_CPP=false
# llama.cpp-specific tuning (presumably only read when USE_LLAMA_CPP=true — verify).
LLAMA_CPP_MAX_CONTEXT=2048
LLAMA_CPP_BATCH_TOKENS=2048
LLAMA_RAM_CACHE_MB=512

# --- Hardware placement / quantization ---
ENABLE_CUDA=true
DEVICE_MAP_AUTO=false
TARGET_GPU_INDEX=0
LOAD_IN_8BIT=false
LOAD_IN_4BIT=false

# --- Tokenizer / model identity ---
# U+2581 LOWER ONE EIGHTH BLOCK — looks like a SentencePiece-style space
# marker; verify against the tokenizer in use. Quoted defensively because
# the value is non-ASCII (quoting does not change the value in bash, and
# dotenv parsers strip double quotes).
SPACE_TOKEN_CHAR="▁"
MODEL_SEED=42
MODEL_LOADER=AutoModelForCausalLM

# --- Default generation parameters ---
MODEL_DEFAULT_NUM_BEAMS=1
MODEL_DEFAULT_DO_SAMPLE=true
MODEL_DEFAULT_TEMPERATURE=1.0
MODEL_DEFAULT_TOP_P=1.0
MODEL_DEFAULT_TOP_K=50
MODEL_DEFAULT_MAX_NEW_TOKENS=2048
MODEL_DEFAULT_REPETITION_PENALTY=1.0
MODEL_DEFAULT_LENGTH_PENALTY=1.0