Environment Variables
vLLM Spyre uses the following environment variables to configure the system.
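Each entry maps a variable name to a zero-argument callable, so values are read from the environment lazily, at access time rather than at import time. Below is a minimal sketch of how such a mapping is typically exposed; the module-level __getattr__ hook (PEP 562) is illustrative and may not match the exact accessor vllm-spyre uses:

import os
from typing import Any

def __getattr__(name: str) -> Any:
    # Hypothetical accessor: resolve `envs.SOME_VAR` by invoking the
    # corresponding lambda, so the environment is consulted on every access.
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

The full mapping is: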
environment_variables: dict[str, Callable[[], Any]] = {
# Defines the prompt lengths the Spyre accelerator should be prepared
# for, formatted as a comma-separated list. Only applicable in static
# batching mode (VLLM_SPYRE_USE_CB=0).
"VLLM_SPYRE_WARMUP_PROMPT_LENS":
lambda: [
int(p) for p in os.getenv(key='VLLM_SPYRE_WARMUP_PROMPT_LENS',
default='64').split(',')
],
# Defines the maximum numbers of output tokens the Spyre accelerator should
# be prepared for, formatted as a comma-separated list. Only applicable in
# static batching mode (VLLM_SPYRE_USE_CB=0).
"VLLM_SPYRE_WARMUP_NEW_TOKENS":
lambda: [
int(d) for d in os.getenv(key='VLLM_SPYRE_WARMUP_NEW_TOKENS',
default='20').split(',')
],
# Defines the batch sizes the Spyre accelerator should be prepared
# for, formatted as a comma-separated list. Only applicable in static
# batching mode (VLLM_SPYRE_USE_CB=0).
"VLLM_SPYRE_WARMUP_BATCH_SIZES":
lambda: [
int(b) for b in os.getenv(key='VLLM_SPYRE_WARMUP_BATCH_SIZES',
default='1').split(',')
],
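# Example with illustrative values: the three warmup lists above are read
# together. For instance, setting
#   VLLM_SPYRE_WARMUP_PROMPT_LENS=64,128
#   VLLM_SPYRE_WARMUP_NEW_TOKENS=20,20
#   VLLM_SPYRE_WARMUP_BATCH_SIZES=1,4
# parses to [64, 128], [20, 20] and [1, 4] respectively.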
# Defines the backend that torch.compile will use with Spyre.
# Available options:
# - "sendnn": Compile for execution on Spyre hardware
# - "inductor": Compile for execution on CPU (for debug and testing)
# - "eager": Skip compile entirely (for debug and testing)
#
# - "sendnn_decoder": Deprecated in favor of "sendnn"
"VLLM_SPYRE_DYNAMO_BACKEND":
_backend_backwards_compat,
# If set, use the V1 continuous batching implementation. Otherwise, static
# batching mode will be enabled.
"VLLM_SPYRE_USE_CB":
lambda: bool(int(os.getenv("VLLM_SPYRE_USE_CB", "0"))),
# Enable performance metric logging. This captures startup information
# such as warmup and loading times. It is turned off by default.
"VLLM_SPYRE_PERF_METRIC_LOGGING_ENABLED":
lambda: int(os.getenv("VLLM_SPYRE_PERF_METRIC_LOGGING_ENABLED", 0)),
# Directory to write performance metric logging files. By default,
# logs are written to /tmp.
"VLLM_SPYRE_PERF_METRIC_LOGGING_DIR":
lambda: os.getenv("VLLM_SPYRE_PERF_METRIC_LOGGING_DIR", "/tmp"),
# If set, overrides the signal handlers for vllm-spyre on
# vLLM V1 with the torch_sendnn backend so that the engine
# can shut down gracefully.
"VLLM_SPYRE_OVERRIDE_SIGNALS_HANDLER":
lambda: bool(int(os.getenv("VLLM_SPYRE_OVERRIDE_SIGNALS_HANDLER", "1"))),
# Whether the `prompt_logprobs` sampling parameter is enabled.
# prompt_logprobs is not currently supported, so this is always False.
"VLLM_SPYRE_ENABLE_PROMPT_LOGPROBS":
lambda: False,
# If set, allows a new sequence to join the decode batch even if its prompt
# length exceeds the tkv of the current decode batch. Because this shifts
# all sequences in the decode batch to the right (increasing the tkv), it
# can also incur a performance penalty.
"VLLM_SPYRE_ENABLE_PREFILL_OPTIMIZATION":
lambda: bool(int(os.getenv("VLLM_SPYRE_ENABLE_PREFILL_OPTIMIZATION", "1"))
),
# Scheduling heuristic: prefill vs. decode prioritization.
# Prefills using up to VLLM_SPYRE_N_TOKENS_PREFILL_PRIO tokens are always
# prioritized; once this limit is exceeded, decodes are prioritized.
"VLLM_SPYRE_N_TOKENS_PREFILL_PRIO":
lambda: int(os.getenv("VLLM_SPYRE_N_TOKENS_PREFILL_PRIO", "-1")),
# Allow vllm-spyre to update env vars related to multi-threading (e.g. OMP)
# based on the detected CPU cores and server configuration.
"VLLM_SPYRE_UPDATE_THREAD_CONFIG":
lambda: bool(int(os.getenv("VLLM_SPYRE_UPDATE_THREAD_CONFIG", "1"))),
# If set, limits the number of concurrent processes loading/compiling
# large models or models with long context lengths, in order to bound
# memory usage.
# Set to 0 to allow any number of processes.
"VLLM_SPYRE_MAX_LOAD_PROCESSES":
lambda: int(os.getenv("VLLM_SPYRE_MAX_LOAD_PROCESSES", "0")),
# If set, redirects all stdout and stderr from worker processes to files
# within this directory. This is useful for debugging card-specific errors
# in multi-AIU setups, but should never be enabled in production settings,
# as it removes all worker-process output from stdout and stderr.
"VLLM_SPYRE_WORKER_LOG_REDIRECT_DIR":
lambda: os.getenv("VLLM_SPYRE_WORKER_LOG_REDIRECT_DIR", ""),
# If set, overrides the timeout for torch.distributed.init_process_group
# (the torch default is 30 minutes; vllm-spyre defaults this to 60).
"VLLM_SPYRE_GLOO_TIMEOUT_MINUTES":
lambda: int(os.getenv("VLLM_SPYRE_GLOO_TIMEOUT_MINUTES", "60")),
# If set, requires the use of pre-compiled models and
# disables compilation for decoders.
"VLLM_SPYRE_REQUIRE_PRECOMPILED_DECODERS":
lambda: bool(int(os.getenv("VLLM_SPYRE_REQUIRE_PRECOMPILED_DECODERS", "0"))
),
# Simple compile backend for some dynamically compiled operations, like
# gathering logprobs in the sampler.
# Defaults to "inductor", which requires Python headers and a compiler to
# be available; set to "eager" to skip compilation of these operations.
"VLLM_SPYRE_SIMPLE_COMPILE_BACKEND":
lambda: os.getenv("VLLM_SPYRE_SIMPLE_COMPILE_BACKEND", "inductor"),
# Configures the number of CPUs used when determining multi-threading
# configurations.
# Set to 0 to have vllm-spyre attempt to detect the CPU count.
"VLLM_SPYRE_NUM_CPUS":
lambda: int(os.getenv("VLLM_SPYRE_NUM_CPUS", "0")),
}
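As an end-to-end illustration, the static-batching warmup variables can be set in the environment before starting vLLM. The values below are hypothetical; only the variable names and the "eager" debug backend come from the table above:

import os

# Illustrative static-batching configuration with two warmup shapes.
os.environ["VLLM_SPYRE_USE_CB"] = "0"              # static batching mode
os.environ["VLLM_SPYRE_WARMUP_PROMPT_LENS"] = "64,128"
os.environ["VLLM_SPYRE_WARMUP_NEW_TOKENS"] = "20,20"
os.environ["VLLM_SPYRE_WARMUP_BATCH_SIZES"] = "1,4"
os.environ["VLLM_SPYRE_DYNAMO_BACKEND"] = "eager"  # CPU debug run

# The corresponding lambdas then evaluate to:
#   VLLM_SPYRE_WARMUP_PROMPT_LENS -> [64, 128]
#   VLLM_SPYRE_WARMUP_NEW_TOKENS  -> [20, 20]
#   VLLM_SPYRE_WARMUP_BATCH_SIZES -> [1, 4]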