vLLM Engine Schema
Engine version: 0.7.3
Discovered at: 2026-05-06T22:57:22+02:00
Discovery method: dataclasses.fields(EngineArgs) + msgspec.json.schema(SamplingParams)
Schema version: 1.0.0
Summary: 104 engine parameters, 31 sampling parameters.
Discovery limitations
- sampling_params — constraints (e.g. temperature >= 0, top_p in (0, 1]) live in the imperative _verify_args() and are not introspectable from field metadata
- engine_params — per-field descriptions unavailable (vLLM EngineArgs has only a class docstring)
Engine Parameters
| Field | Type | Default | Description |
|---|---|---|---|
model | str | facebook/opt-125m | |
served_model_name | `str | list[str] | None` | — |
tokenizer | `str | None` | — |
task | Literal['auto', 'generate', 'embedding', 'embed', 'classify', 'score', 'reward', 'transcription'] | auto | |
skip_tokenizer_init | bool | false | |
tokenizer_mode | str | auto | |
trust_remote_code | bool | false | |
allowed_local_media_path | str | "" | |
download_dir | `str | None` | — |
load_format | str | auto | |
config_format | ConfigFormat | auto | |
dtype | str | auto | |
kv_cache_dtype | str | auto | |
seed | int | 0 | |
max_model_len | `int | None` | — |
distributed_executor_backend | `str | type[ExecutorBase] | None` | — |
pipeline_parallel_size | int | 1 | |
tensor_parallel_size | int | 1 | |
max_parallel_loading_workers | `int | None` | — |
block_size | `int | None` | — |
enable_prefix_caching | `bool | None` | — |
disable_sliding_window | bool | false | |
use_v2_block_manager | bool | true | |
swap_space | float | 4 | |
cpu_offload_gb | float | 0 | |
gpu_memory_utilization | float | 0.9 | |
max_num_batched_tokens | `int | None` | — |
max_num_partial_prefills | `int | None` | 1 |
max_long_partial_prefills | `int | None` | 1 |
long_prefill_token_threshold | `int | None` | 0 |
max_num_seqs | `int | None` | — |
max_logprobs | int | 20 | |
disable_log_stats | bool | false | |
revision | `str | None` | — |
code_revision | `str | None` | — |
rope_scaling | `dict[str, Any] | None` | — |
rope_theta | `float | None` | — |
hf_overrides | `dict[str, Any] | Callable[[PretrainedConfig], PretrainedConfig] | None` | — |
tokenizer_revision | `str | None` | — |
quantization | `str | None` | — |
enforce_eager | `bool | None` | — |
max_seq_len_to_capture | int | 8192 | |
disable_custom_all_reduce | bool | false | |
tokenizer_pool_size | int | 0 | |
tokenizer_pool_type | `str | type[ForwardRef('BaseTokenizerGroup')]` | ray |
tokenizer_pool_extra_config | `dict[str, Any] | None` | — |
limit_mm_per_prompt | `Mapping[str, int] | None` | — |
mm_processor_kwargs | `dict[str, Any] | None` | — |
disable_mm_preprocessor_cache | bool | false | |
enable_lora | bool | false | |
enable_lora_bias | bool | false | |
max_loras | int | 1 | |
max_lora_rank | int | 16 | |
enable_prompt_adapter | bool | false | |
max_prompt_adapters | int | 1 | |
max_prompt_adapter_token | int | 0 | |
fully_sharded_loras | bool | false | |
lora_extra_vocab_size | int | 256 | |
long_lora_scaling_factors | `tuple[float] | None` | — |
lora_dtype | `str | dtype | None` | — |
max_cpu_loras | `int | None` | — |
device | str | auto | |
num_scheduler_steps | int | 1 | |
multi_step_stream_outputs | bool | true | |
ray_workers_use_nsight | bool | false | |
num_gpu_blocks_override | `int | None` | — |
num_lookahead_slots | int | 0 | |
model_loader_extra_config | `dict | None` | — |
ignore_patterns | `str | list[str] | None` | — |
preemption_mode | `str | None` | — |
scheduler_delay_factor | float | 0.0 | |
enable_chunked_prefill | `bool | None` | — |
guided_decoding_backend | str | xgrammar | |
logits_processor_pattern | `str | None` | — |
speculative_model | `str | None` | — |
speculative_model_quantization | `str | None` | — |
speculative_draft_tensor_parallel_size | `int | None` | — |
num_speculative_tokens | `int | None` | — |
speculative_disable_mqa_scorer | `bool | None` | false |
speculative_max_model_len | `int | None` | — |
speculative_disable_by_batch_size | `int | None` | — |
ngram_prompt_lookup_max | `int | None` | — |
ngram_prompt_lookup_min | `int | None` | — |
spec_decoding_acceptance_method | str | rejection_sampler | |
typical_acceptance_sampler_posterior_threshold | `float | None` | — |
typical_acceptance_sampler_posterior_alpha | `float | None` | — |
qlora_adapter_name_or_path | `str | None` | — |
disable_logprobs_during_spec_decoding | `bool | None` | — |
otlp_traces_endpoint | `str | None` | — |
collect_detailed_traces | `str | None` | — |
disable_async_output_proc | bool | false | |
scheduling_policy | Literal['fcfs', 'priority'] | fcfs | |
scheduler_cls | `str | type[object]` | vllm.core.scheduler.Scheduler |
override_neuron_config | `dict[str, Any] | None` | — |
override_pooler_config | `PoolerConfig | None` | — |
compilation_config | `CompilationConfig | None` | — |
worker_cls | str | auto | |
kv_transfer_config | `KVTransferConfig | None` | — |
generation_config | `str | None` | — |
override_generation_config | `dict[str, Any] | None` | — |
enable_sleep_mode | bool | false | |
model_impl | str | auto | |
calculate_kv_scales | `bool | None` | — |
additional_config | `dict[str, Any] | None` | — |
Sampling Parameters
| Field | Type | Default | Description |
|---|---|---|---|
n | integer | 1 | |
best_of | unknown | — | |
_real_n | unknown | — | |
presence_penalty | number | 0.0 | |
frequency_penalty | number | 0.0 | |
repetition_penalty | number | 1.0 | |
temperature | number | 1.0 | |
top_p | number | 1.0 | |
top_k | integer | -1 | |
min_p | number | 0.0 | |
seed | unknown | — | |
stop | unknown | — | |
stop_token_ids | unknown | — | |
bad_words | unknown | — | |
ignore_eos | boolean | false | |
max_tokens | unknown | 16 | |
min_tokens | integer | 0 | |
logprobs | unknown | — | |
prompt_logprobs | unknown | — | |
detokenize | boolean | true | |
skip_special_tokens | boolean | true | |
spaces_between_special_tokens | boolean | true | |
logits_processors | unknown | — | |
include_stop_str_in_output | boolean | false | |
truncate_prompt_tokens | unknown | — | |
output_kind | unknown | 0 | |
output_text_buffer_length | integer | 0 | |
_all_stop_token_ids | array | [] | |
guided_decoding | unknown | — | |
logit_bias | unknown | — | |
allowed_token_ids | unknown | — |