
Environment Variables

vLLM Spyre uses the following environment variables to configure the system:

environment_variables: dict[str, Callable[[], Any]] = {
    # Defines the prompt lengths the Spyre accelerator should be prepared
    # for, formatted as a comma-separated list. Only applicable in static batching
    # mode (VLLM_SPYRE_USE_CB=0).
    "VLLM_SPYRE_WARMUP_PROMPT_LENS":
    lambda: [
        int(p) for p in os.getenv(key='VLLM_SPYRE_WARMUP_PROMPT_LENS',
                                  default='64').split(',')
    ],
    # Defines the max output tokens the Spyre accelerator should be prepared
    # for, formatted as a comma-separated list. Only applicable in static batching
    # mode (VLLM_SPYRE_USE_CB=0).
    "VLLM_SPYRE_WARMUP_NEW_TOKENS":
    lambda: [
        int(d) for d in os.getenv(key='VLLM_SPYRE_WARMUP_NEW_TOKENS',
                                  default='20').split(',')
    ],
    # Defines the batch sizes the Spyre accelerator should be prepared
    # for, formatted as a comma-separated list. Only applicable in static batching
    # mode (VLLM_SPYRE_USE_CB=0).
    "VLLM_SPYRE_WARMUP_BATCH_SIZES":
    lambda: [
        int(b) for b in os.getenv(key='VLLM_SPYRE_WARMUP_BATCH_SIZES',
                                  default='1').split(',')
    ],

    # Defines the backend that torch.compile will use when using Spyre
    # Available options:
    # - "sendnn": Compile for execution on Spyre hardware
    # - "inductor": Compile for execution on CPU (for debug and testing)
    # - "eager": Skip compile entirely (for debug and testing)
    #
    # - "sendnn_decoder": Deprecated in favor of "sendnn"
    "VLLM_SPYRE_DYNAMO_BACKEND":
    _backend_backwards_compat,

    # If set, use the V1 continuous batching implementation. Otherwise, static
    # batching mode will be enabled.
    "VLLM_SPYRE_USE_CB":
    lambda: bool(int(os.getenv("VLLM_SPYRE_USE_CB", "0"))),

    # Enable performance metric logging. This captures startup information
    # such as warmup and loading times. It is turned off by default.
    "VLLM_SPYRE_PERF_METRIC_LOGGING_ENABLED":
    lambda: int(os.getenv("VLLM_SPYRE_PERF_METRIC_LOGGING_ENABLED", 0)),

    # Directory to write performance metric logging files. By default,
    # logs are written to /tmp.
    "VLLM_SPYRE_PERF_METRIC_LOGGING_DIR":
    lambda: os.getenv("VLLM_SPYRE_PERF_METRIC_LOGGING_DIR", "/tmp"),

    # If set, override the signal handler for vllm-spyre on
    # vLLM V1 + torch_sendnn backend to be able to gracefully
    # shut down the engine.
    "VLLM_SPYRE_OVERRIDE_SIGNALS_HANDLER":
    lambda: bool(int(os.getenv("VLLM_SPYRE_OVERRIDE_SIGNALS_HANDLER", "1"))),

    # If set, enables the `prompt_logprobs` sampling parameter.
    # Currently, prompt_logprobs are not supported, so this is always disabled.
    "VLLM_SPYRE_ENABLE_PROMPT_LOGPROBS":
    lambda: False,

    # If set, allows a new sequence to join the decode batch even if its
    # prompt length exceeds the tkv of the current decode batch. Because this
    # shifts all sequences in the decode batch to the right (increasing the
    # tkv), it can also incur a performance penalty.
    "VLLM_SPYRE_ENABLE_PREFILL_OPTIMIZATION":
    lambda: bool(int(os.getenv("VLLM_SPYRE_ENABLE_PREFILL_OPTIMIZATION", "1"))
                 ),

    # Scheduling heuristic: prefill vs. decode prioritization.
    # Prefills using up to VLLM_SPYRE_N_TOKENS_PREFILL_PRIO tokens will always
    # be prioritized. If the limit is exceeded, decodes are prioritized.
    "VLLM_SPYRE_N_TOKENS_PREFILL_PRIO":
    lambda: int(os.getenv("VLLM_SPYRE_N_TOKENS_PREFILL_PRIO", "-1")),

    # Allow vllm-spyre to update env vars related to multi-threading (e.g. OMP)
    # based on the detected CPU cores and server configuration.
    "VLLM_SPYRE_UPDATE_THREAD_CONFIG":
    lambda: bool(int(os.getenv("VLLM_SPYRE_UPDATE_THREAD_CONFIG", "1"))),

    # If set, limits the number of concurrent processes loading/compiling
    # large models or models with larger context lengths, in order to bound
    # memory usage.
    # Set to 0 to allow any number of processes.
    "VLLM_SPYRE_MAX_LOAD_PROCESSES":
    lambda: int(os.getenv("VLLM_SPYRE_MAX_LOAD_PROCESSES", "0")),

    # If set, redirects all stdout and stderr from worker processes to files
    # within this directory. This is useful for debugging card-specific errors
    # in multi-AIU setups, but should never be enabled in production settings.
    # This removes all output from stdout and stderr for the worker processes.
    "VLLM_SPYRE_WORKER_LOG_REDIRECT_DIR":
    lambda: os.getenv("VLLM_SPYRE_WORKER_LOG_REDIRECT_DIR", ""),

    # If set, overrides the default (30 minutes) timeout for
    # torch.distributed.init_process_group.
    "VLLM_SPYRE_GLOO_TIMEOUT_MINUTES":
    lambda: int(os.getenv("VLLM_SPYRE_GLOO_TIMEOUT_MINUTES", "60")),

    # If set, this will require the use of pre-compiled models and
    # disable compilation for decoders.
    "VLLM_SPYRE_REQUIRE_PRECOMPILED_DECODERS":
    lambda: bool(int(os.getenv("VLLM_SPYRE_REQUIRE_PRECOMPILED_DECODERS", "0"))
                 ),

    # Simple compile backend for some dynamically compiled operations, like
    # gathering logprobs in the sampler.
    # Defaults to eager; inductor can be used if Python headers and a compiler
    # are available.
    "VLLM_SPYRE_SIMPLE_COMPILE_BACKEND":
    lambda: os.getenv("VLLM_SPYRE_SIMPLE_COMPILE_BACKEND", "inductor"),

    # Configures the number of CPUs used when determining multi-threading
    # configurations
    # Set to 0 to have vllm-spyre attempt to detect the CPU count
    "VLLM_SPYRE_NUM_CPUS":
    lambda: int(os.getenv("VLLM_SPYRE_NUM_CPUS", "0")),
}
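
In static batching mode, the three VLLM_SPYRE_WARMUP_* variables are read as parallel lists: the i-th prompt length, new-token count, and batch size together describe one warmup shape. Below is a minimal sketch of setting them from Python before vLLM is imported; the model path and shape values are placeholders, not recommendations.

import os

# Two hypothetical warmup shapes: (prompt 64, 20 new tokens, batch 4) and
# (prompt 128, 20 new tokens, batch 2). VLLM_SPYRE_USE_CB defaults to 0,
# so static batching mode applies.
os.environ["VLLM_SPYRE_WARMUP_PROMPT_LENS"] = "64,128"
os.environ["VLLM_SPYRE_WARMUP_NEW_TOKENS"] = "20,20"
os.environ["VLLM_SPYRE_WARMUP_BATCH_SIZES"] = "4,2"
os.environ["VLLM_SPYRE_DYNAMO_BACKEND"] = "sendnn"

from vllm import LLM, SamplingParams  # import after the variables are set

llm = LLM(model="/models/example-model")  # placeholder model path
outputs = llm.generate(["Hello, Spyre!"], SamplingParams(max_tokens=20))
print(outputs[0].outputs[0].text)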
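
With continuous batching enabled (VLLM_SPYRE_USE_CB=1), the warmup shapes above are not used. A similar sketch, again with placeholder values for the model path and sequence limits:

import os

# Enable the V1 continuous batching implementation; the VLLM_SPYRE_WARMUP_*
# variables only apply to static batching and are ignored here.
os.environ["VLLM_SPYRE_USE_CB"] = "1"
os.environ["VLLM_SPYRE_DYNAMO_BACKEND"] = "sendnn"

from vllm import LLM

llm = LLM(model="/models/example-model",  # placeholder model path
          max_model_len=2048,             # placeholder context length
          max_num_seqs=4)                 # placeholder max batch size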