vllm.entrypoints.openai.chat_completion.protocol

_LONG_INFO module-attribute

_LONG_INFO = iinfo(long)

logger module-attribute

logger = init_logger(__name__)

ChatCompletionLogProb

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/chat_completion/protocol.py
class ChatCompletionLogProb(OpenAIBaseModel):
    token: str
    logprob: float = -9999.0
    bytes: list[int] | None = None

bytes class-attribute instance-attribute

bytes: list[int] | None = None

logprob class-attribute instance-attribute

logprob: float = -9999.0

token instance-attribute

token: str

ChatCompletionLogProbs

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/chat_completion/protocol.py
class ChatCompletionLogProbs(OpenAIBaseModel):
    content: list[ChatCompletionLogProbsContent] | None = None

content class-attribute instance-attribute

content: list[ChatCompletionLogProbsContent] | None = None

ChatCompletionLogProbsContent

Bases: ChatCompletionLogProb

Source code in vllm/entrypoints/openai/chat_completion/protocol.py
class ChatCompletionLogProbsContent(ChatCompletionLogProb):
    # Workaround: redefine fields name cache so that it's not
    # shared with the super class.
    field_names: ClassVar[set[str] | None] = None
    top_logprobs: list[ChatCompletionLogProb] = Field(default_factory=list)

field_names class-attribute

field_names: set[str] | None = None

top_logprobs class-attribute instance-attribute

top_logprobs: list[ChatCompletionLogProb] = Field(
    default_factory=list
)
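
The content entries mirror the OpenAI logprobs response shape: one ChatCompletionLogProbsContent per sampled token, each carrying its own top_logprobs alternatives. A minimal construction sketch, assuming OpenAIBaseModel is a pydantic v2 model (as the use of Field and model_validator on this page suggests) and using the module path shown in the page title:

# Sketch: building a logprobs payload by hand.
from vllm.entrypoints.openai.chat_completion.protocol import (
    ChatCompletionLogProb,
    ChatCompletionLogProbs,
    ChatCompletionLogProbsContent,
)

entry = ChatCompletionLogProbsContent(
    token="Hello",
    logprob=-0.12,
    bytes=[72, 101, 108, 108, 111],
    top_logprobs=[
        ChatCompletionLogProb(token="Hello", logprob=-0.12),
        ChatCompletionLogProb(token="Hi", logprob=-2.3),
    ],
)
logprobs = ChatCompletionLogProbs(content=[entry])
print(logprobs.model_dump_json())  # pydantic v2 serialization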

ChatCompletionNamedFunction

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/chat_completion/protocol.py
class ChatCompletionNamedFunction(OpenAIBaseModel):
    name: str

name instance-attribute

name: str

ChatCompletionNamedToolChoiceParam

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/chat_completion/protocol.py
class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel):
    function: ChatCompletionNamedFunction
    type: Literal["function"] = "function"

function instance-attribute

function: ChatCompletionNamedFunction

type class-attribute instance-attribute

type: Literal['function'] = 'function'
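
This is the named-tool form of `tool_choice` in request payloads. A small validation sketch, assuming pydantic v2 and the module path shown in the page title:

# Sketch: validating a named tool_choice payload.
from vllm.entrypoints.openai.chat_completion.protocol import (
    ChatCompletionNamedToolChoiceParam,
)

payload = {"type": "function", "function": {"name": "get_weather"}}
tool_choice = ChatCompletionNamedToolChoiceParam.model_validate(payload)
assert tool_choice.function.name == "get_weather"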

ChatCompletionRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/chat_completion/protocol.py
class ChatCompletionRequest(OpenAIBaseModel):
    # Ordered by official OpenAI API documentation
    # https://platform.openai.com/docs/api-reference/chat/create
    messages: list[ChatCompletionMessageParam]
    model: str | None = None
    frequency_penalty: float | None = 0.0
    logit_bias: dict[str, float] | None = None
    logprobs: bool | None = False
    top_logprobs: int | None = 0
    max_tokens: int | None = Field(
        default=None,
        deprecated="max_tokens is deprecated in favor of "
        "the max_completion_tokens field",
    )
    max_completion_tokens: int | None = None
    n: int | None = 1
    presence_penalty: float | None = 0.0
    response_format: AnyResponseFormat | None = None
    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
    stop: str | list[str] | None = []
    stream: bool | None = False
    stream_options: StreamOptions | None = None
    temperature: float | None = None
    top_p: float | None = None
    tools: list[ChatCompletionToolsParam] | None = None
    tool_choice: (
        Literal["none"]
        | Literal["auto"]
        | Literal["required"]
        | ChatCompletionNamedToolChoiceParam
        | None
    ) = "none"
    reasoning_effort: Literal["low", "medium", "high"] | None = None
    include_reasoning: bool = True
    parallel_tool_calls: bool | None = True

    # NOTE this will be ignored by vLLM
    user: str | None = None

    # --8<-- [start:chat-completion-sampling-params]
    use_beam_search: bool = False
    top_k: int | None = None
    min_p: float | None = None
    repetition_penalty: float | None = None
    length_penalty: float = 1.0
    stop_token_ids: list[int] | None = []
    include_stop_str_in_output: bool = False
    ignore_eos: bool = False
    min_tokens: int = 0
    skip_special_tokens: bool = True
    spaces_between_special_tokens: bool = True
    truncate_prompt_tokens: Annotated[int, Field(ge=-1, le=_LONG_INFO.max)] | None = (
        None
    )
    prompt_logprobs: int | None = None
    allowed_token_ids: list[int] | None = None
    bad_words: list[str] = Field(default_factory=list)
    # --8<-- [end:chat-completion-sampling-params]

    # --8<-- [start:chat-completion-extra-params]
    echo: bool = Field(
        default=False,
        description=(
            "If true, the new message will be prepended with the last message "
            "if they belong to the same role."
        ),
    )
    add_generation_prompt: bool = Field(
        default=True,
        description=(
            "If true, the generation prompt will be added to the chat template. "
            "This is a parameter used by chat template in tokenizer config of the "
            "model."
        ),
    )
    continue_final_message: bool = Field(
        default=False,
        description=(
            "If this is set, the chat will be formatted so that the final "
            "message in the chat is open-ended, without any EOS tokens. The "
            "model will continue this message rather than starting a new one. "
            'This allows you to "prefill" part of the model\'s response for it. '
            "Cannot be used at the same time as `add_generation_prompt`."
        ),
    )
    add_special_tokens: bool = Field(
        default=False,
        description=(
            "If true, special tokens (e.g. BOS) will be added to the prompt "
            "on top of what is added by the chat template. "
            "For most models, the chat template takes care of adding the "
            "special tokens so this should be set to false (as is the "
            "default)."
        ),
    )
    documents: list[dict[str, str]] | None = Field(
        default=None,
        description=(
            "A list of dicts representing documents that will be accessible to "
            "the model if it is performing RAG (retrieval-augmented generation)."
            " If the template does not support RAG, this argument will have no "
            "effect. We recommend that each document should be a dict containing "
            '"title" and "text" keys.'
        ),
    )
    chat_template: str | None = Field(
        default=None,
        description=(
            "A Jinja template to use for this conversion. "
            "As of transformers v4.44, default chat template is no longer "
            "allowed, so you must provide a chat template if the tokenizer "
            "does not define one."
        ),
    )
    chat_template_kwargs: dict[str, Any] | None = Field(
        default=None,
        description=(
            "Additional keyword args to pass to the template renderer. "
            "Will be accessible by the chat template."
        ),
    )
    mm_processor_kwargs: dict[str, Any] | None = Field(
        default=None,
        description=("Additional kwargs to pass to the HF processor."),
    )
    structured_outputs: StructuredOutputsParams | None = Field(
        default=None,
        description="Additional kwargs for structured outputs",
    )
    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."
        ),
    )
    request_id: str = Field(
        default_factory=random_uuid,
        description=(
            "The request_id related to this request. If the caller does "
            "not set it, a random_uuid will be generated. This id is used "
            "through out the inference process and return in response."
        ),
    )
    logits_processors: LogitsProcessors | None = Field(
        default=None,
        description=(
            "A list of either qualified names of logits processors, or "
            "constructor objects, to apply when sampling. A constructor is "
            "a JSON object with a required 'qualname' field specifying the "
            "qualified name of the processor class/factory, and optional "
            "'args' and 'kwargs' fields containing positional and keyword "
            "arguments. For example: {'qualname': "
            "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
            "{'param': 'value'}}."
        ),
    )
    return_tokens_as_token_ids: bool | None = Field(
        default=None,
        description=(
            "If specified with 'logprobs', tokens are represented "
            " as strings of the form 'token_id:{token_id}' so that tokens "
            "that are not JSON-encodable can be identified."
        ),
    )
    return_token_ids: bool | None = Field(
        default=None,
        description=(
            "If specified, the result will include token IDs alongside the "
            "generated text. In streaming mode, prompt_token_ids is included "
            "only in the first chunk, and token_ids contains the delta tokens "
            "for each chunk. This is useful for debugging or when you "
            "need to map generated text back to input tokens."
        ),
    )
    cache_salt: str | None = Field(
        default=None,
        description=(
            "If specified, the prefix cache will be salted with the provided "
            "string to prevent an attacker to guess prompts in multi-user "
            "environments. The salt should be random, protected from "
            "access by 3rd parties, and long enough to be "
            "unpredictable (e.g., 43 characters base64-encoded, corresponding "
            "to 256 bit)."
        ),
    )
    kv_transfer_params: dict[str, Any] | None = Field(
        default=None,
        description="KVTransfer parameters used for disaggregated serving.",
    )

    vllm_xargs: dict[str, str | int | float | list[str | int | float]] | None = Field(
        default=None,
        description=(
            "Additional request parameters with (list of) string or "
            "numeric values, used by custom extensions."
        ),
    )

    # --8<-- [end:chat-completion-extra-params]

    # Default sampling parameters for chat completion requests
    _DEFAULT_SAMPLING_PARAMS: dict = {
        "repetition_penalty": 1.0,
        "temperature": 1.0,
        "top_p": 1.0,
        "top_k": 0,
        "min_p": 0.0,
    }

    def to_beam_search_params(
        self, max_tokens: int, default_sampling_params: dict
    ) -> BeamSearchParams:
        n = self.n if self.n is not None else 1
        if (temperature := self.temperature) is None:
            temperature = default_sampling_params.get(
                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
            )

        return BeamSearchParams(
            beam_width=n,
            max_tokens=max_tokens,
            ignore_eos=self.ignore_eos,
            temperature=temperature,
            length_penalty=self.length_penalty,
            include_stop_str_in_output=self.include_stop_str_in_output,
        )

    def to_sampling_params(
        self,
        max_tokens: int,
        logits_processor_pattern: str | None,
        default_sampling_params: dict,
    ) -> SamplingParams:
        # Default parameters
        if (repetition_penalty := self.repetition_penalty) is None:
            repetition_penalty = default_sampling_params.get(
                "repetition_penalty",
                self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
            )
        if (temperature := self.temperature) is None:
            temperature = default_sampling_params.get(
                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
            )
        if (top_p := self.top_p) is None:
            top_p = default_sampling_params.get(
                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]
            )
        if (top_k := self.top_k) is None:
            top_k = default_sampling_params.get(
                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"]
            )
        if (min_p := self.min_p) is None:
            min_p = default_sampling_params.get(
                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"]
            )

        prompt_logprobs = self.prompt_logprobs
        if prompt_logprobs is None and self.echo:
            prompt_logprobs = self.top_logprobs

        response_format = self.response_format
        if response_format is not None:
            # If structured outputs wasn't already enabled,
            # we must enable it for these features to work
            if self.structured_outputs is None:
                self.structured_outputs = StructuredOutputsParams()

            # Set structured output params for response format
            if response_format.type == "json_object":
                self.structured_outputs.json_object = True
            elif response_format.type == "json_schema":
                json_schema = response_format.json_schema
                assert json_schema is not None
                self.structured_outputs.json = json_schema.json_schema
            elif response_format.type == "structural_tag":
                structural_tag = response_format
                assert structural_tag is not None and isinstance(
                    structural_tag,
                    (
                        LegacyStructuralTagResponseFormat,
                        StructuralTagResponseFormat,
                    ),
                )
                s_tag_obj = structural_tag.model_dump(by_alias=True)
                self.structured_outputs.structural_tag = json.dumps(s_tag_obj)

        extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
        if self.kv_transfer_params:
            # Pass in kv_transfer_params via extra_args
            extra_args["kv_transfer_params"] = self.kv_transfer_params
        return SamplingParams.from_optional(
            n=self.n,
            presence_penalty=self.presence_penalty,
            frequency_penalty=self.frequency_penalty,
            repetition_penalty=repetition_penalty,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            min_p=min_p,
            seed=self.seed,
            stop=self.stop,
            stop_token_ids=self.stop_token_ids,
            logprobs=self.top_logprobs if self.logprobs else None,
            prompt_logprobs=prompt_logprobs,
            ignore_eos=self.ignore_eos,
            max_tokens=max_tokens,
            min_tokens=self.min_tokens,
            skip_special_tokens=self.skip_special_tokens,
            spaces_between_special_tokens=self.spaces_between_special_tokens,
            logits_processors=get_logits_processors(
                self.logits_processors, logits_processor_pattern
            ),
            include_stop_str_in_output=self.include_stop_str_in_output,
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            output_kind=RequestOutputKind.DELTA
            if self.stream
            else RequestOutputKind.FINAL_ONLY,
            structured_outputs=self.structured_outputs,
            logit_bias=self.logit_bias,
            bad_words=self.bad_words,
            allowed_token_ids=self.allowed_token_ids,
            extra_args=extra_args or None,
            skip_clone=True,  # Created fresh per request, safe to skip clone
        )

    @model_validator(mode="before")
    @classmethod
    def validate_stream_options(cls, data):
        if data.get("stream_options") and not data.get("stream"):
            raise VLLMValidationError(
                "Stream options can only be defined when `stream=True`.",
                parameter="stream_options",
            )

        return data

    @model_validator(mode="before")
    @classmethod
    def check_logprobs(cls, data):
        if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
            if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1):
                raise VLLMValidationError(
                    "`prompt_logprobs` are not available when `stream=True`.",
                    parameter="prompt_logprobs",
                )

            if prompt_logprobs < 0 and prompt_logprobs != -1:
                raise VLLMValidationError(
                    "`prompt_logprobs` must be a positive value or -1.",
                    parameter="prompt_logprobs",
                    value=prompt_logprobs,
                )
        if (top_logprobs := data.get("top_logprobs")) is not None:
            if top_logprobs < 0 and top_logprobs != -1:
                raise VLLMValidationError(
                    "`top_logprobs` must be a positive value or -1.",
                    parameter="top_logprobs",
                    value=top_logprobs,
                )

            if (top_logprobs == -1 or top_logprobs > 0) and not data.get("logprobs"):
                raise VLLMValidationError(
                    "when using `top_logprobs`, `logprobs` must be set to true.",
                    parameter="top_logprobs",
                )

        return data

    @model_validator(mode="before")
    @classmethod
    def check_structured_outputs_count(cls, data):
        if isinstance(data, ValueError):
            raise data

        if data.get("structured_outputs", None) is None:
            return data

        structured_outputs_kwargs = data["structured_outputs"]
        count = sum(
            structured_outputs_kwargs.get(k) is not None
            for k in ("json", "regex", "choice")
        )
        # you can only use one kind of constraints for structured outputs
        if count > 1:
            raise ValueError(
                "You can only use one kind of constraints for structured "
                "outputs ('json', 'regex' or 'choice')."
            )
        # you can only either use structured outputs or tools, not both
        if count > 1 and data.get("tool_choice", "none") not in (
            "none",
            "auto",
            "required",
        ):
            raise ValueError(
                "You can only either use constraints for structured outputs "
                "or tools, not both."
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_tool_usage(cls, data):
        # if "tool_choice" is not specified but tools are provided,
        # default to "auto" tool_choice
        if "tool_choice" not in data and data.get("tools"):
            data["tool_choice"] = "auto"

        # if "tool_choice" is "none" -- no validation is needed for tools
        if "tool_choice" in data and data["tool_choice"] == "none":
            return data

        # if "tool_choice" is specified -- validation
        if "tool_choice" in data and data["tool_choice"] is not None:
            # ensure that if "tool choice" is specified, tools are present
            if "tools" not in data or data["tools"] is None:
                raise ValueError("When using `tool_choice`, `tools` must be set.")

            # make sure that tool choice is either a named tool
            # OR that it's set to "auto" or "required"
            if data["tool_choice"] not in ["auto", "required"] and not isinstance(
                data["tool_choice"], dict
            ):
                raise ValueError(
                    f"Invalid value for `tool_choice`: {data['tool_choice']}! "
                    'Only named tools, "none", "auto" or "required" '
                    "are supported."
                )

            # if tool_choice is "required" but the "tools" list is empty,
            # override the data to behave like "none" to align with
            # OpenAI’s behavior.
            if (
                data["tool_choice"] == "required"
                and isinstance(data["tools"], list)
                and len(data["tools"]) == 0
            ):
                data["tool_choice"] = "none"
                del data["tools"]
                return data

            # ensure that if "tool_choice" is specified as an object,
            # it matches a valid tool
            correct_usage_message = (
                'Correct usage: `{"type": "function",'
                ' "function": {"name": "my_function"}}`'
            )
            if isinstance(data["tool_choice"], dict):
                valid_tool = False
                function = data["tool_choice"].get("function")
                if not isinstance(function, dict):
                    raise ValueError(
                        f"Invalid value for `function`: `{function}` in "
                        f"`tool_choice`! {correct_usage_message}"
                    )
                if "name" not in function:
                    raise ValueError(
                        f"Expected field `name` in `function` in "
                        f"`tool_choice`! {correct_usage_message}"
                    )
                function_name = function["name"]
                if not isinstance(function_name, str) or len(function_name) == 0:
                    raise ValueError(
                        f"Invalid `name` in `function`: `{function_name}`"
                        f" in `tool_choice`! {correct_usage_message}"
                    )
                for tool in data["tools"]:
                    if tool["function"]["name"] == function_name:
                        valid_tool = True
                        break
                if not valid_tool:
                    raise ValueError(
                        "The tool specified in `tool_choice` does not match any"
                        " of the specified `tools`"
                    )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_generation_prompt(cls, data):
        if data.get("continue_final_message") and data.get("add_generation_prompt"):
            raise ValueError(
                "Cannot set both `continue_final_message` and "
                "`add_generation_prompt` to True."
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_cache_salt_support(cls, data):
        if data.get("cache_salt") is not None and (
            not isinstance(data["cache_salt"], str) or not data["cache_salt"]
        ):
            raise ValueError(
                "Parameter 'cache_salt' must be a non-empty string if provided."
            )
        return data
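
A minimal usage sketch: validating a plain request body against this model, which also runs the mode="before" validators defined above (tool usage, logprobs, stream options, and so on). The model name is illustrative; the import path follows the page title.

# Sketch: building a ChatCompletionRequest from a plain request body.
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest

body = {
    "model": "my-model",  # illustrative model name
    "messages": [{"role": "user", "content": "Hello!"}],
    "temperature": 0.7,
    "max_completion_tokens": 64,
}
request = ChatCompletionRequest.model_validate(body)
print(request.tool_choice)  # "none" (default, since no tools were given)
print(request.request_id)   # auto-generated via random_uuid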

_DEFAULT_SAMPLING_PARAMS class-attribute instance-attribute

_DEFAULT_SAMPLING_PARAMS: dict = {
    "repetition_penalty": 1.0,
    "temperature": 1.0,
    "top_p": 1.0,
    "top_k": 0,
    "min_p": 0.0,
}

add_generation_prompt class-attribute instance-attribute

add_generation_prompt: bool = Field(
    default=True,
    description="If true, the generation prompt will be added to the chat template. This is a parameter used by chat template in tokenizer config of the model.",
)

add_special_tokens class-attribute instance-attribute

add_special_tokens: bool = Field(
    default=False,
    description="If true, special tokens (e.g. BOS) will be added to the prompt on top of what is added by the chat template. For most models, the chat template takes care of adding the special tokens so this should be set to false (as is the default).",
)

allowed_token_ids class-attribute instance-attribute

allowed_token_ids: list[int] | None = None

bad_words class-attribute instance-attribute

bad_words: list[str] = Field(default_factory=list)

cache_salt class-attribute instance-attribute

cache_salt: str | None = Field(
    default=None,
    description="If specified, the prefix cache will be salted with the provided string to prevent an attacker to guess prompts in multi-user environments. The salt should be random, protected from access by 3rd parties, and long enough to be unpredictable (e.g., 43 characters base64-encoded, corresponding to 256 bit).",
)
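
A sketch of producing a salt with the properties the description asks for; secrets.token_urlsafe(32) returns a 43-character URL-safe base64 string, which corresponds to 256 bits of randomness:

import secrets

# 32 random bytes -> 43 URL-safe base64 characters (~256 bits of entropy).
cache_salt = secrets.token_urlsafe(32)

body = {
    "model": "my-model",  # illustrative
    "messages": [{"role": "user", "content": "Hello!"}],
    "cache_salt": cache_salt,
}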

chat_template class-attribute instance-attribute

chat_template: str | None = Field(
    default=None,
    description="A Jinja template to use for this conversion. As of transformers v4.44, default chat template is no longer allowed, so you must provide a chat template if the tokenizer does not define one.",
)

chat_template_kwargs class-attribute instance-attribute

chat_template_kwargs: dict[str, Any] | None = Field(
    default=None,
    description="Additional keyword args to pass to the template renderer. Will be accessible by the chat template.",
)

continue_final_message class-attribute instance-attribute

continue_final_message: bool = Field(
    default=False,
    description='If this is set, the chat will be formatted so that the final message in the chat is open-ended, without any EOS tokens. The model will continue this message rather than starting a new one. This allows you to "prefill" part of the model\'s response for it. Cannot be used at the same time as `add_generation_prompt`.',
)

documents class-attribute instance-attribute

documents: list[dict[str, str]] | None = Field(
    default=None,
    description='A list of dicts representing documents that will be accessible to the model if it is performing RAG (retrieval-augmented generation). If the template does not support RAG, this argument will have no effect. We recommend that each document should be a dict containing "title" and "text" keys.',
)

echo class-attribute instance-attribute

echo: bool = Field(
    default=False,
    description="If true, the new message will be prepended with the last message if they belong to the same role.",
)

frequency_penalty class-attribute instance-attribute

frequency_penalty: float | None = 0.0

ignore_eos class-attribute instance-attribute

ignore_eos: bool = False

include_reasoning class-attribute instance-attribute

include_reasoning: bool = True

include_stop_str_in_output class-attribute instance-attribute

include_stop_str_in_output: bool = False

kv_transfer_params class-attribute instance-attribute

kv_transfer_params: dict[str, Any] | None = Field(
    default=None,
    description="KVTransfer parameters used for disaggregated serving.",
)

length_penalty class-attribute instance-attribute

length_penalty: float = 1.0

logit_bias class-attribute instance-attribute

logit_bias: dict[str, float] | None = None

logits_processors class-attribute instance-attribute

logits_processors: LogitsProcessors | None = Field(
    default=None,
    description="A list of either qualified names of logits processors, or constructor objects, to apply when sampling. A constructor is a JSON object with a required 'qualname' field specifying the qualified name of the processor class/factory, and optional 'args' and 'kwargs' fields containing positional and keyword arguments. For example: {'qualname': 'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': {'param': 'value'}}.",
)
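
A sketch of the constructor-object form described above. The qualified name is hypothetical; a real request would need a processor that is actually installed on the server and permitted by `logits_processor_pattern`:

# Hypothetical logits processor constructor object, per the description above.
body = {
    "model": "my-model",  # illustrative
    "messages": [{"role": "user", "content": "Hello!"}],
    "logits_processors": [
        {
            "qualname": "my_module.MyLogitsProcessor",  # hypothetical qualname
            "args": [1, 2],
            "kwargs": {"param": "value"},
        }
    ],
}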

logprobs class-attribute instance-attribute

logprobs: bool | None = False

max_completion_tokens class-attribute instance-attribute

max_completion_tokens: int | None = None

max_tokens class-attribute instance-attribute

max_tokens: int | None = Field(
    default=None,
    deprecated="max_tokens is deprecated in favor of the max_completion_tokens field",
)

messages instance-attribute

messages: list[ChatCompletionMessageParam]

min_p class-attribute instance-attribute

min_p: float | None = None

min_tokens class-attribute instance-attribute

min_tokens: int = 0

mm_processor_kwargs class-attribute instance-attribute

mm_processor_kwargs: dict[str, Any] | None = Field(
    default=None,
    description="Additional kwargs to pass to the HF processor.",
)

model class-attribute instance-attribute

model: str | None = None

n class-attribute instance-attribute

n: int | None = 1

parallel_tool_calls class-attribute instance-attribute

parallel_tool_calls: bool | None = True

presence_penalty class-attribute instance-attribute

presence_penalty: float | None = 0.0

priority class-attribute instance-attribute

priority: int = Field(
    default=0,
    description="The priority of the request (lower means earlier handling; default: 0). Any priority other than 0 will raise an error if the served model does not use priority scheduling.",
)

prompt_logprobs class-attribute instance-attribute

prompt_logprobs: int | None = None

reasoning_effort class-attribute instance-attribute

reasoning_effort: (
    Literal["low", "medium", "high"] | None
) = None

repetition_penalty class-attribute instance-attribute

repetition_penalty: float | None = None

request_id class-attribute instance-attribute

request_id: str = Field(
    default_factory=random_uuid,
    description="The request_id related to this request. If the caller does not set it, a random_uuid will be generated. This id is used through out the inference process and return in response.",
)

response_format class-attribute instance-attribute

response_format: AnyResponseFormat | None = None

return_token_ids class-attribute instance-attribute

return_token_ids: bool | None = Field(
    default=None,
    description="If specified, the result will include token IDs alongside the generated text. In streaming mode, prompt_token_ids is included only in the first chunk, and token_ids contains the delta tokens for each chunk. This is useful for debugging or when you need to map generated text back to input tokens.",
)

return_tokens_as_token_ids class-attribute instance-attribute

return_tokens_as_token_ids: bool | None = Field(
    default=None,
    description="If specified with 'logprobs', tokens are represented  as strings of the form 'token_id:{token_id}' so that tokens that are not JSON-encodable can be identified.",
)

seed class-attribute instance-attribute

seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)

skip_special_tokens class-attribute instance-attribute

skip_special_tokens: bool = True

spaces_between_special_tokens class-attribute instance-attribute

spaces_between_special_tokens: bool = True

stop class-attribute instance-attribute

stop: str | list[str] | None = []

stop_token_ids class-attribute instance-attribute

stop_token_ids: list[int] | None = []

stream class-attribute instance-attribute

stream: bool | None = False

stream_options class-attribute instance-attribute

stream_options: StreamOptions | None = None

structured_outputs class-attribute instance-attribute

structured_outputs: StructuredOutputsParams | None = Field(
    default=None,
    description="Additional kwargs for structured outputs",
)

temperature class-attribute instance-attribute

temperature: float | None = None

tool_choice class-attribute instance-attribute

tool_choice: (
    Literal["none"]
    | Literal["auto"]
    | Literal["required"]
    | ChatCompletionNamedToolChoiceParam
    | None
) = "none"

tools class-attribute instance-attribute

tools: list[ChatCompletionToolsParam] | None = None

top_k class-attribute instance-attribute

top_k: int | None = None

top_logprobs class-attribute instance-attribute

top_logprobs: int | None = 0

top_p class-attribute instance-attribute

top_p: float | None = None

truncate_prompt_tokens class-attribute instance-attribute

truncate_prompt_tokens: (
    Annotated[int, Field(ge=-1, le=_LONG_INFO.max)] | None
) = None

use_beam_search class-attribute instance-attribute

use_beam_search: bool = False

user class-attribute instance-attribute

user: str | None = None

vllm_xargs class-attribute instance-attribute

vllm_xargs: (
    dict[str, str | int | float | list[str | int | float]]
    | None
) = Field(
    default=None,
    description="Additional request parameters with (list of) string or numeric values, used by custom extensions.",
)
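
A sketch of passing extension parameters; per `to_sampling_params` above, these values are forwarded as SamplingParams extra_args. The keys here are hypothetical:

# vllm_xargs accepts string/numeric values or lists of them.
body = {
    "model": "my-model",  # illustrative
    "messages": [{"role": "user", "content": "Hello!"}],
    "vllm_xargs": {
        "my_extension_flag": 1,             # hypothetical extension keys
        "my_extension_tags": ["a", "b"],
    },
}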

check_cache_salt_support classmethod

check_cache_salt_support(data)
Source code in vllm/entrypoints/openai/chat_completion/protocol.py
@model_validator(mode="before")
@classmethod
def check_cache_salt_support(cls, data):
    if data.get("cache_salt") is not None and (
        not isinstance(data["cache_salt"], str) or not data["cache_salt"]
    ):
        raise ValueError(
            "Parameter 'cache_salt' must be a non-empty string if provided."
        )
    return data

check_generation_prompt classmethod

check_generation_prompt(data)
Source code in vllm/entrypoints/openai/chat_completion/protocol.py
@model_validator(mode="before")
@classmethod
def check_generation_prompt(cls, data):
    if data.get("continue_final_message") and data.get("add_generation_prompt"):
        raise ValueError(
            "Cannot set both `continue_final_message` and "
            "`add_generation_prompt` to True."
        )
    return data

check_logprobs classmethod

check_logprobs(data)
Source code in vllm/entrypoints/openai/chat_completion/protocol.py
@model_validator(mode="before")
@classmethod
def check_logprobs(cls, data):
    if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
        if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1):
            raise VLLMValidationError(
                "`prompt_logprobs` are not available when `stream=True`.",
                parameter="prompt_logprobs",
            )

        if prompt_logprobs < 0 and prompt_logprobs != -1:
            raise VLLMValidationError(
                "`prompt_logprobs` must be a positive value or -1.",
                parameter="prompt_logprobs",
                value=prompt_logprobs,
            )
    if (top_logprobs := data.get("top_logprobs")) is not None:
        if top_logprobs < 0 and top_logprobs != -1:
            raise VLLMValidationError(
                "`top_logprobs` must be a positive value or -1.",
                parameter="top_logprobs",
                value=top_logprobs,
            )

        if (top_logprobs == -1 or top_logprobs > 0) and not data.get("logprobs"):
            raise VLLMValidationError(
                "when using `top_logprobs`, `logprobs` must be set to true.",
                parameter="top_logprobs",
            )

    return data
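
In short: `top_logprobs` requires `logprobs=true`, and `prompt_logprobs` cannot be combined with streaming. A sketch of one accepted and one rejected combination, assuming pydantic v2 validation:

from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest

ok = ChatCompletionRequest.model_validate({
    "messages": [{"role": "user", "content": "Hi"}],
    "logprobs": True,
    "top_logprobs": 5,
})
assert ok.top_logprobs == 5

try:
    ChatCompletionRequest.model_validate({
        "messages": [{"role": "user", "content": "Hi"}],
        "top_logprobs": 5,  # logprobs not set -> rejected by check_logprobs
    })
except Exception as exc:  # VLLMValidationError, possibly wrapped by pydantic
    print(exc)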

check_structured_outputs_count classmethod

check_structured_outputs_count(data)
Source code in vllm/entrypoints/openai/chat_completion/protocol.py
@model_validator(mode="before")
@classmethod
def check_structured_outputs_count(cls, data):
    if isinstance(data, ValueError):
        raise data

    if data.get("structured_outputs", None) is None:
        return data

    structured_outputs_kwargs = data["structured_outputs"]
    count = sum(
        structured_outputs_kwargs.get(k) is not None
        for k in ("json", "regex", "choice")
    )
    # you can only use one kind of constraints for structured outputs
    if count > 1:
        raise ValueError(
            "You can only use one kind of constraints for structured "
            "outputs ('json', 'regex' or 'choice')."
        )
    # you can only either use structured outputs or tools, not both
    if count > 1 and data.get("tool_choice", "none") not in (
        "none",
        "auto",
        "required",
    ):
        raise ValueError(
            "You can only either use constraints for structured outputs "
            "or tools, not both."
        )
    return data

check_tool_usage classmethod

check_tool_usage(data)
Source code in vllm/entrypoints/openai/chat_completion/protocol.py
@model_validator(mode="before")
@classmethod
def check_tool_usage(cls, data):
    # if "tool_choice" is not specified but tools are provided,
    # default to "auto" tool_choice
    if "tool_choice" not in data and data.get("tools"):
        data["tool_choice"] = "auto"

    # if "tool_choice" is "none" -- no validation is needed for tools
    if "tool_choice" in data and data["tool_choice"] == "none":
        return data

    # if "tool_choice" is specified -- validation
    if "tool_choice" in data and data["tool_choice"] is not None:
        # ensure that if "tool choice" is specified, tools are present
        if "tools" not in data or data["tools"] is None:
            raise ValueError("When using `tool_choice`, `tools` must be set.")

        # make sure that tool choice is either a named tool
        # OR that it's set to "auto" or "required"
        if data["tool_choice"] not in ["auto", "required"] and not isinstance(
            data["tool_choice"], dict
        ):
            raise ValueError(
                f"Invalid value for `tool_choice`: {data['tool_choice']}! "
                'Only named tools, "none", "auto" or "required" '
                "are supported."
            )

        # if tool_choice is "required" but the "tools" list is empty,
        # override the data to behave like "none" to align with
        # OpenAI’s behavior.
        if (
            data["tool_choice"] == "required"
            and isinstance(data["tools"], list)
            and len(data["tools"]) == 0
        ):
            data["tool_choice"] = "none"
            del data["tools"]
            return data

        # ensure that if "tool_choice" is specified as an object,
        # it matches a valid tool
        correct_usage_message = (
            'Correct usage: `{"type": "function",'
            ' "function": {"name": "my_function"}}`'
        )
        if isinstance(data["tool_choice"], dict):
            valid_tool = False
            function = data["tool_choice"].get("function")
            if not isinstance(function, dict):
                raise ValueError(
                    f"Invalid value for `function`: `{function}` in "
                    f"`tool_choice`! {correct_usage_message}"
                )
            if "name" not in function:
                raise ValueError(
                    f"Expected field `name` in `function` in "
                    f"`tool_choice`! {correct_usage_message}"
                )
            function_name = function["name"]
            if not isinstance(function_name, str) or len(function_name) == 0:
                raise ValueError(
                    f"Invalid `name` in `function`: `{function_name}`"
                    f" in `tool_choice`! {correct_usage_message}"
                )
            for tool in data["tools"]:
                if tool["function"]["name"] == function_name:
                    valid_tool = True
                    break
            if not valid_tool:
                raise ValueError(
                    "The tool specified in `tool_choice` does not match any"
                    " of the specified `tools`"
                )
    return data
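
A sketch of the behavior this validator enforces: providing `tools` without an explicit `tool_choice` defaults it to "auto", and a named `tool_choice` must refer to one of the supplied tools. The tool definition is illustrative:

from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest

# Illustrative tool definition (only the function name matters for this check).
weather_tool = {"type": "function", "function": {"name": "get_weather"}}

# tools given without tool_choice -> tool_choice defaults to "auto".
req = ChatCompletionRequest.model_validate({
    "messages": [{"role": "user", "content": "Weather in Paris?"}],
    "tools": [weather_tool],
})
assert req.tool_choice == "auto"

# A named tool_choice must match one of the entries in `tools`.
req = ChatCompletionRequest.model_validate({
    "messages": [{"role": "user", "content": "Weather in Paris?"}],
    "tools": [weather_tool],
    "tool_choice": {"type": "function", "function": {"name": "get_weather"}},
})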

to_beam_search_params

to_beam_search_params(
    max_tokens: int, default_sampling_params: dict
) -> BeamSearchParams
Source code in vllm/entrypoints/openai/chat_completion/protocol.py
def to_beam_search_params(
    self, max_tokens: int, default_sampling_params: dict
) -> BeamSearchParams:
    n = self.n if self.n is not None else 1
    if (temperature := self.temperature) is None:
        temperature = default_sampling_params.get(
            "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
        )

    return BeamSearchParams(
        beam_width=n,
        max_tokens=max_tokens,
        ignore_eos=self.ignore_eos,
        temperature=temperature,
        length_penalty=self.length_penalty,
        include_stop_str_in_output=self.include_stop_str_in_output,
    )
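
A conversion sketch for the beam search path: `n` becomes the beam width and an unset temperature falls back through `default_sampling_params` to `_DEFAULT_SAMPLING_PARAMS`. Values here are illustrative:

from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest

req = ChatCompletionRequest.model_validate({
    "messages": [{"role": "user", "content": "Hello!"}],
    "n": 4,
    "use_beam_search": True,
})
params = req.to_beam_search_params(max_tokens=64, default_sampling_params={})
# beam_width=4; temperature falls back to _DEFAULT_SAMPLING_PARAMS["temperature"]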

to_sampling_params

to_sampling_params(
    max_tokens: int,
    logits_processor_pattern: str | None,
    default_sampling_params: dict,
) -> SamplingParams
Source code in vllm/entrypoints/openai/chat_completion/protocol.py
def to_sampling_params(
    self,
    max_tokens: int,
    logits_processor_pattern: str | None,
    default_sampling_params: dict,
) -> SamplingParams:
    # Default parameters
    if (repetition_penalty := self.repetition_penalty) is None:
        repetition_penalty = default_sampling_params.get(
            "repetition_penalty",
            self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
        )
    if (temperature := self.temperature) is None:
        temperature = default_sampling_params.get(
            "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
        )
    if (top_p := self.top_p) is None:
        top_p = default_sampling_params.get(
            "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]
        )
    if (top_k := self.top_k) is None:
        top_k = default_sampling_params.get(
            "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"]
        )
    if (min_p := self.min_p) is None:
        min_p = default_sampling_params.get(
            "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"]
        )

    prompt_logprobs = self.prompt_logprobs
    if prompt_logprobs is None and self.echo:
        prompt_logprobs = self.top_logprobs

    response_format = self.response_format
    if response_format is not None:
        # If structured outputs wasn't already enabled,
        # we must enable it for these features to work
        if self.structured_outputs is None:
            self.structured_outputs = StructuredOutputsParams()

        # Set structured output params for response format
        if response_format.type == "json_object":
            self.structured_outputs.json_object = True
        elif response_format.type == "json_schema":
            json_schema = response_format.json_schema
            assert json_schema is not None
            self.structured_outputs.json = json_schema.json_schema
        elif response_format.type == "structural_tag":
            structural_tag = response_format
            assert structural_tag is not None and isinstance(
                structural_tag,
                (
                    LegacyStructuralTagResponseFormat,
                    StructuralTagResponseFormat,
                ),
            )
            s_tag_obj = structural_tag.model_dump(by_alias=True)
            self.structured_outputs.structural_tag = json.dumps(s_tag_obj)

    extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
    if self.kv_transfer_params:
        # Pass in kv_transfer_params via extra_args
        extra_args["kv_transfer_params"] = self.kv_transfer_params
    return SamplingParams.from_optional(
        n=self.n,
        presence_penalty=self.presence_penalty,
        frequency_penalty=self.frequency_penalty,
        repetition_penalty=repetition_penalty,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        min_p=min_p,
        seed=self.seed,
        stop=self.stop,
        stop_token_ids=self.stop_token_ids,
        logprobs=self.top_logprobs if self.logprobs else None,
        prompt_logprobs=prompt_logprobs,
        ignore_eos=self.ignore_eos,
        max_tokens=max_tokens,
        min_tokens=self.min_tokens,
        skip_special_tokens=self.skip_special_tokens,
        spaces_between_special_tokens=self.spaces_between_special_tokens,
        logits_processors=get_logits_processors(
            self.logits_processors, logits_processor_pattern
        ),
        include_stop_str_in_output=self.include_stop_str_in_output,
        truncate_prompt_tokens=self.truncate_prompt_tokens,
        output_kind=RequestOutputKind.DELTA
        if self.stream
        else RequestOutputKind.FINAL_ONLY,
        structured_outputs=self.structured_outputs,
        logit_bias=self.logit_bias,
        bad_words=self.bad_words,
        allowed_token_ids=self.allowed_token_ids,
        extra_args=extra_args or None,
        skip_clone=True,  # Created fresh per request, safe to skip clone
    )
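
A conversion sketch: fields left unset on the request fall back to the server-supplied `default_sampling_params` and then to `_DEFAULT_SAMPLING_PARAMS`. The defaults dict below is illustrative:

from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest

req = ChatCompletionRequest.model_validate({
    "messages": [{"role": "user", "content": "Hello!"}],
    "top_p": 0.9,  # explicitly set -> used as-is
})

sampling_params = req.to_sampling_params(
    max_tokens=128,
    logits_processor_pattern=None,
    default_sampling_params={"temperature": 0.6},  # illustrative server default
)
# temperature falls back to 0.6, top_p stays 0.9, and top_k/min_p use the
# _DEFAULT_SAMPLING_PARAMS fallbacks shown above.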

validate_stream_options classmethod

validate_stream_options(data)
Source code in vllm/entrypoints/openai/chat_completion/protocol.py
@model_validator(mode="before")
@classmethod
def validate_stream_options(cls, data):
    if data.get("stream_options") and not data.get("stream"):
        raise VLLMValidationError(
            "Stream options can only be defined when `stream=True`.",
            parameter="stream_options",
        )

    return data
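
So `stream_options` is only accepted together with `stream=true`, for example (the include_usage field is assumed to exist on StreamOptions, matching the usual OpenAI shape):

from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest

req = ChatCompletionRequest.model_validate({
    "messages": [{"role": "user", "content": "Hello!"}],
    "stream": True,
    "stream_options": {"include_usage": True},  # assumed StreamOptions field
})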

ChatCompletionResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/chat_completion/protocol.py
class ChatCompletionResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
    object: Literal["chat.completion"] = "chat.completion"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: list[ChatCompletionResponseChoice]
    service_tier: Literal["auto", "default", "flex", "scale", "priority"] | None = None
    system_fingerprint: str | None = None
    usage: UsageInfo

    # vLLM-specific fields that are not in OpenAI spec
    prompt_logprobs: list[dict[int, Logprob] | None] | None = None
    prompt_token_ids: list[int] | None = None
    kv_transfer_params: dict[str, Any] | None = Field(
        default=None, description="KVTransfer parameters."
    )

choices instance-attribute

choices: list[ChatCompletionResponseChoice]

created class-attribute instance-attribute

created: int = Field(default_factory=lambda: int(time()))

id class-attribute instance-attribute

id: str = Field(
    default_factory=lambda: f"chatcmpl-{random_uuid()}"
)

kv_transfer_params class-attribute instance-attribute

kv_transfer_params: dict[str, Any] | None = Field(
    default=None, description="KVTransfer parameters."
)

model instance-attribute

model: str

object class-attribute instance-attribute

object: Literal['chat.completion'] = 'chat.completion'

prompt_logprobs class-attribute instance-attribute

prompt_logprobs: list[dict[int, Logprob] | None] | None = (
    None
)

prompt_token_ids class-attribute instance-attribute

prompt_token_ids: list[int] | None = None

service_tier class-attribute instance-attribute

service_tier: (
    Literal["auto", "default", "flex", "scale", "priority"]
    | None
) = None

system_fingerprint class-attribute instance-attribute

system_fingerprint: str | None = None

usage instance-attribute

usage: UsageInfo

ChatCompletionResponseChoice

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/chat_completion/protocol.py
class ChatCompletionResponseChoice(OpenAIBaseModel):
    index: int
    message: ChatMessage
    logprobs: ChatCompletionLogProbs | None = None
    # per OpenAI spec this is the default
    finish_reason: str | None = "stop"
    # not part of the OpenAI spec but included in vLLM for legacy reasons
    stop_reason: int | str | None = None
    # not part of the OpenAI spec but is useful for tracing the tokens
    # in agent scenarios
    token_ids: list[int] | None = None

finish_reason class-attribute instance-attribute

finish_reason: str | None = 'stop'

index instance-attribute

index: int

logprobs class-attribute instance-attribute

logprobs: ChatCompletionLogProbs | None = None

message instance-attribute

message: ChatMessage

stop_reason class-attribute instance-attribute

stop_reason: int | str | None = None

token_ids class-attribute instance-attribute

token_ids: list[int] | None = None

ChatCompletionResponseStreamChoice

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/chat_completion/protocol.py
class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
    index: int
    delta: DeltaMessage
    logprobs: ChatCompletionLogProbs | None = None
    finish_reason: str | None = None
    stop_reason: int | str | None = None
    # not part of the OpenAI spec but for tracing the tokens
    token_ids: list[int] | None = None

delta instance-attribute

delta: DeltaMessage

finish_reason class-attribute instance-attribute

finish_reason: str | None = None

index instance-attribute

index: int

logprobs class-attribute instance-attribute

logprobs: ChatCompletionLogProbs | None = None

stop_reason class-attribute instance-attribute

stop_reason: int | str | None = None

token_ids class-attribute instance-attribute

token_ids: list[int] | None = None

ChatCompletionStreamResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/chat_completion/protocol.py
class ChatCompletionStreamResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
    object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: list[ChatCompletionResponseStreamChoice]
    usage: UsageInfo | None = Field(default=None)
    # not part of the OpenAI spec but for tracing the tokens
    prompt_token_ids: list[int] | None = None

choices instance-attribute

choices: list[ChatCompletionResponseStreamChoice]

created class-attribute instance-attribute

created: int = Field(default_factory=lambda: int(time()))

id class-attribute instance-attribute

id: str = Field(
    default_factory=lambda: f"chatcmpl-{random_uuid()}"
)

model instance-attribute

model: str

object class-attribute instance-attribute

object: Literal["chat.completion.chunk"] = (
    "chat.completion.chunk"
)

prompt_token_ids class-attribute instance-attribute

prompt_token_ids: list[int] | None = None

usage class-attribute instance-attribute

usage: UsageInfo | None = Field(default=None)

ChatCompletionToolsParam

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/chat_completion/protocol.py
class ChatCompletionToolsParam(OpenAIBaseModel):
    type: Literal["function"] = "function"
    function: FunctionDefinition

function instance-attribute

function: FunctionDefinition

type class-attribute instance-attribute

type: Literal['function'] = 'function'

ChatMessage

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/chat_completion/protocol.py
class ChatMessage(OpenAIBaseModel):
    role: str
    content: str | None = None
    refusal: str | None = None
    annotations: OpenAIAnnotation | None = None
    audio: OpenAIChatCompletionAudio | None = None
    function_call: FunctionCall | None = None
    tool_calls: list[ToolCall] = Field(default_factory=list)

    # vLLM-specific fields that are not in OpenAI spec
    reasoning: str | None = None
    reasoning_content: str | None = None
    """Deprecated: use `reasoning` instead."""

    @model_validator(mode="after")
    def handle_deprecated_reasoning_content(self):
        """Copy reasoning to reasoning_content for backward compatibility."""
        self.reasoning_content = self.reasoning
        return self

annotations class-attribute instance-attribute

annotations: Annotation | None = None

audio class-attribute instance-attribute

audio: ChatCompletionAudio | None = None

content class-attribute instance-attribute

content: str | None = None

function_call class-attribute instance-attribute

function_call: FunctionCall | None = None

reasoning class-attribute instance-attribute

reasoning: str | None = None

reasoning_content class-attribute instance-attribute

reasoning_content: str | None = None

Deprecated: use reasoning instead.

refusal class-attribute instance-attribute

refusal: str | None = None

role instance-attribute

role: str

tool_calls class-attribute instance-attribute

tool_calls: list[ToolCall] = Field(default_factory=list)

handle_deprecated_reasoning_content

handle_deprecated_reasoning_content()

Copy reasoning to reasoning_content for backward compatibility.

Source code in vllm/entrypoints/openai/chat_completion/protocol.py
@model_validator(mode="after")
def handle_deprecated_reasoning_content(self):
    """Copy reasoning to reasoning_content for backward compatibility."""
    self.reasoning_content = self.reasoning
    return self
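
A sketch of the backward-compatibility behavior: after validation, `reasoning_content` always mirrors `reasoning`.

from vllm.entrypoints.openai.chat_completion.protocol import ChatMessage

msg = ChatMessage(role="assistant", content="42", reasoning="Chain of thought...")
assert msg.reasoning_content == msg.reasoning  # copied by the validator above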