# vLLM Parameter Curation
- Engine version: 0.19.1
- Discovered at: 2026-06-22 16:20:19.046420+00:00
- Discovery method: `dataclasses.fields(EngineArgs)` + `msgspec.json.schema(SamplingParams)` + `config/*.py` declarative-constraint overlay
- Summary: 37/220 parameters curated (185 engine + 35 sampling discovered)
## Engine Parameters
| Field | Type | Default | Curated? |
|---|---|---|---|
| _api_process_count | int | 1 | - |
| _api_process_rank | int | 0 | - |
| additional_config | dict[str, Any] | {} | - |
| aggregate_engine_logging | bool | False | - |
| all2all_backend | str | allgather_reducescatter | - |
| allow_deprecated_quantization | bool | False | - |
| allowed_local_media_path | str | `""` | - |
| allowed_media_domains | list[str] \| None | null | - |
| async_scheduling | bool \| None | null | - |
| attention_backend | AttentionBackendEnum \| None | null | - |
| attention_config | unknown | null | - |
| block_size | int \| None | null | yes |
| calculate_kv_scales | bool | False | - |
| code_revision | str \| None | null | yes |
| collect_detailed_traces | list[Literal['model', 'worker', 'all']] \| None | null | - |
| compilation_config | unknown | null | - |
| config_format | str | auto | - |
| convert | str | auto | - |
| cp_kv_cache_interleave_size | int | 1 | - |
| cpu_offload_gb | float | 0 | yes |
| cpu_offload_params | set[str] | [] | - |
| cudagraph_capture_sizes | list[int] \| None | null | yes |
| cudagraph_metrics | bool | False | - |
| data_parallel_address | str \| None | null | - |
| data_parallel_backend | str | mp | - |
| data_parallel_external_lb | bool | False | - |
| data_parallel_hybrid_lb | bool | False | - |
| data_parallel_rank | int \| None | null | - |
| data_parallel_rpc_port | int \| None | null | - |
| data_parallel_size | int | 1 | - |
| data_parallel_size_local | int \| None | null | - |
| data_parallel_start_rank | int \| None | null | - |
| dbo_decode_token_threshold | int | 32 | - |
| dbo_prefill_token_threshold | int | 512 | - |
| dcp_comm_backend | str | ag_rs | - |
| dcp_kv_cache_interleave_size | int | 1 | - |
| decode_context_parallel_size | int | 1 | - |
| default_mm_loras | dict[str, str] \| None | null | - |
| disable_cascade_attn | bool | True | - |
| disable_chunked_mm_input | bool | False | - |
| disable_custom_all_reduce | bool | False | yes |
| disable_hybrid_kv_cache_manager | bool \| None | null | - |
| disable_log_stats | bool | False | - |
| disable_nccl_for_dp_synchronization | bool \| None | null | - |
| disable_sliding_window | bool | False | - |
| distributed_executor_backend | str \| Literal['ray', 'mp', 'uni', 'external_launcher'] \| type[Any] \| None | null | yes |
| distributed_timeout_seconds | int \| None | null | - |
| download_dir | str \| None | null | - |
| dtype | str | auto | yes |
| ec_transfer_config | unknown | null | - |
| enable_chunked_prefill | bool \| None | null | yes |
| enable_dbo | bool | False | - |
| enable_elastic_ep | bool | False | - |
| enable_ep_weight_filter | bool | False | - |
| enable_eplb | bool | False | - |
| enable_expert_parallel | bool | False | - |
| enable_flashinfer_autotune | bool | null | - |
| enable_layerwise_nvtx_tracing | bool | False | - |
| enable_logging_iteration_details | bool | False | - |
| enable_lora | bool | False | - |
| enable_mfu_metrics | bool | False | - |
| enable_mm_embeds | bool | False | - |
| enable_mm_processor_stats | bool | False | - |
| enable_prefix_caching | bool \| None | null | yes |
| enable_prompt_embeds | bool | False | - |
| enable_return_routed_experts | bool | False | - |
| enable_sleep_mode | bool | False | - |
| enable_tower_connector_lora | bool | False | - |
| enforce_eager | bool | False | yes |
| eplb_config | unknown | null | - |
| expert_placement_strategy | str | linear | - |
| fail_on_environ_validation | bool | False | - |
| fully_sharded_loras | bool | False | - |
| gdn_prefill_backend | str | null | - |
| generation_config | str | auto | - |
| gpu_memory_utilization | float | 0.9 | yes |
| hf_config_path | str \| None | null | - |
| hf_overrides | dict[str, Any] \| Callable[[typing.Any], Any] | {} | - |
| hf_token | bool \| str \| None | null | - |
| ignore_patterns | str \| list[str] | ['original/**/*'] | - |
| interleave_mm_strings | bool | False | - |
| io_processor_plugin | str \| None | null | - |
| kernel_config | unknown | null | - |
| kv_cache_dtype | str | auto | yes |
| kv_cache_dtype_skip_layers | list[str] | [] | - |
| kv_cache_memory_bytes | int \| None | null | yes |
| kv_cache_metrics | bool | False | - |
| kv_cache_metrics_sample | float | 0.01 | - |
| kv_events_config | unknown | null | - |
| kv_offloading_backend | str | native | - |
| kv_offloading_size | float \| None | null | - |
| kv_sharing_fast_prefill | bool | False | - |
| kv_transfer_config | unknown | null | - |
| language_model_only | bool | False | - |
| limit_mm_per_prompt | dict[str, int \| dict[str, int]] | {} | - |
| load_format | str \| Any | auto | - |
| logits_processors | list[str \| type[LogitsProcessor]] \| None | null | - |
| logprobs_mode | str | raw_logprobs | - |
| long_prefill_token_threshold | int | 0 | - |
| lora_dtype | str \| dtype \| None | auto | - |
| lora_target_modules | list[str] \| None | null | - |
| mamba_block_size | int \| None | null | - |
| mamba_cache_dtype | str | auto | - |
| mamba_cache_mode | str | none | - |
| mamba_ssm_cache_dtype | str | auto | - |
| master_addr | str | 127.0.0.1 | - |
| master_port | int | 29501 | - |
| max_cpu_loras | int \| None | null | - |
| max_cudagraph_capture_size | int \| None | null | yes |
| max_logprobs | int | 20 | - |
| max_long_partial_prefills | int | 1 | - |
| max_lora_rank | int | 16 | - |
| max_loras | int | 1 | - |
| max_model_len | int | null | yes |
| max_num_batched_tokens | int \| None | null | yes |
| max_num_partial_prefills | int | 1 | - |
| max_num_seqs | int \| None | null | yes |
| max_parallel_loading_workers | int \| None | null | - |
| media_io_kwargs | dict[str, dict[str, Any]] | {} | - |
| mm_encoder_attn_backend | AttentionBackendEnum \| str \| None | null | - |
| mm_encoder_only | bool | False | - |
| mm_encoder_tp_mode | str | weights | - |
| mm_processor_cache_gb | float | 4 | - |
| mm_processor_cache_type | str | lru | - |
| mm_processor_kwargs | dict[str, Any] \| None | null | - |
| mm_shm_cache_max_object_size_mb | int | 128 | - |
| mm_tensor_ipc | str | direct_rpc | - |
| model | str | Qwen/Qwen3-0.6B | yes |
| model_impl | str | auto | - |
| model_loader_extra_config | dict | {} | - |
| model_weights | str | `""` | - |
| moe_backend | str | auto | yes |
| nnodes | int | 1 | - |
| node_rank | int | 0 | - |
| num_gpu_blocks_override | int \| None | null | - |
| offload_backend | str | auto | - |
| offload_group_size | int | 0 | yes |
| offload_num_in_group | int | 1 | yes |
| offload_params | set[str] | [] | yes |
| offload_prefetch_step | int | 1 | yes |
| optimization_level | OptimizationLevel | 2 | - |
| otlp_traces_endpoint | str \| None | null | - |
| override_attention_dtype | str \| None | null | - |
| override_generation_config | dict[str, Any] | {} | - |
| performance_mode | str | balanced | - |
| pipeline_parallel_size | int | 1 | yes |
| pooler_config | unknown | null | - |
| prefill_context_parallel_size | int | 1 | - |
| prefix_caching_hash_algo | str | sha256 | - |
| profiler_config | unknown | null | - |
| pt_load_map_location | str \| dict[str, str] | cpu | - |
| quantization | Any \| str \| None | null | yes |
| ray_workers_use_nsight | bool | False | - |
| reasoning_config | unknown | null | - |
| reasoning_parser | str | `""` | - |
| reasoning_parser_plugin | str \| None | null | - |
| renderer_num_workers | int | 1 | - |
| revision | str \| None | null | yes |
| runner | str | auto | - |
| safetensors_load_strategy | str \| None | null | - |
| scheduler_cls | str \| type[object] \| None | null | - |
| scheduler_reserve_full_isl | bool | True | - |
| scheduling_policy | str | fcfs | - |
| seed | int | 0 | - |
| served_model_name | str \| list[str] \| None | null | - |
| show_hidden_metrics_for_version | str \| None | null | - |
| shutdown_timeout | int | 0 | - |
| skip_mm_profiling | bool | False | - |
| skip_tokenizer_init | bool | False | - |
| specialize_active_lora | bool | False | - |
| speculative_config | unknown | null | - |
| stream_interval | int | 1 | - |
| structured_outputs_config | unknown | null | - |
| tensor_parallel_size | int | 1 | yes |
| tokenizer | str \| None | null | - |
| tokenizer_mode | Literal['auto', 'hf', 'slow', 'mistral', 'deepseek_v32'] \| str | auto | - |
| tokenizer_revision | str \| None | null | - |
| tokens_only | bool | False | - |
| trust_remote_code | bool | False | - |
| ubatch_size | int | 0 | - |
| use_tqdm_on_load | bool | True | - |
| video_pruning_rate | float \| None | null | - |
| weight_transfer_config | unknown | null | - |
| worker_cls | str | auto | - |
| worker_extension_cls | str | `""` | - |
## Sampling Parameters
| Field | Type | Default | Curated? |
|---|---|---|---|
| _all_stop_token_ids | array | [] | - |
| _bad_words_token_ids | list[list[int]] \| None | null | - |
| _eos_token_id | int \| None | null | - |
| allowed_token_ids | list[int] \| None | null | - |
| bad_words | list[str] \| None | null | - |
| detokenize | boolean | True | - |
| extra_args | dict[str, Any] \| None | null | - |
| flat_logprobs | boolean | False | - |
| frequency_penalty | number | 0.0 | yes |
| ignore_eos | boolean | False | yes |
| include_stop_str_in_output | boolean | False | - |
| logit_bias | dict[int, float] \| None | null | - |
| logprobs | int \| None | null | - |
| max_tokens | int \| None | 16 | - |
| min_p | number | 0.0 | yes |
| min_tokens | integer | 0 | yes |
| n | integer | 1 | yes |
| output_kind | RequestOutputKind | 0 | - |
| output_text_buffer_length | integer | 0 | - |
| presence_penalty | number | 0.0 | yes |
| prompt_logprobs | int \| None | null | - |
| repetition_detection | RepetitionDetectionParams \| None | null | - |
| repetition_penalty | number | 1.0 | yes |
| seed | int \| None | null | - |
| skip_clone | boolean | False | - |
| skip_reading_prefix_cache | bool \| None | null | - |
| skip_special_tokens | boolean | True | - |
| spaces_between_special_tokens | boolean | True | - |
| stop | str \| list[str] \| None | null | - |
| stop_token_ids | list[int] \| None | null | - |
| structured_outputs | StructuredOutputsParams \| None | null | - |
| temperature | number | 1.0 | yes |
| thinking_token_budget | int \| None | null | - |
| top_k | integer | 0 | yes |
| top_p | number | 1.0 | yes |