vllm.v1.worker.gpu.states

NO_LORA_ID module-attribute

NO_LORA_ID = 0

ExtraData dataclass

Source code in vllm/v1/worker/gpu/states.py
@dataclass
class ExtraData:
    lora_request: LoRARequest | None
    in_progress_prompt_logprobs: list[LogprobsTensors] = field(default_factory=list)

in_progress_prompt_logprobs class-attribute instance-attribute

in_progress_prompt_logprobs: list[LogprobsTensors] = field(
    default_factory=list
)

lora_request instance-attribute

lora_request: LoRARequest | None

__init__

__init__(
    lora_request: LoRARequest | None,
    in_progress_prompt_logprobs: list[
        LogprobsTensors
    ] = list(),
) -> None
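
A minimal usage sketch: RequestState.add_request (below) constructs ExtraData with only the lora_request argument, letting in_progress_prompt_logprobs fall back to its empty-list default.

# Sketch only; the import path follows this module's location.
from vllm.v1.worker.gpu.states import ExtraData

extra = ExtraData(lora_request=None)            # request with no LoRA adapter attached
assert extra.in_progress_prompt_logprobs == []  # default_factory yields a fresh list per instance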

RequestState

Source code in vllm/v1/worker/gpu/states.py
class RequestState:
    def __init__(
        self,
        max_num_reqs: int,
        max_model_len: int,
        max_num_batched_tokens: int,
        num_speculative_steps: int,
        vocab_size: int,
        device: torch.device,
    ):
        self.max_num_reqs = max_num_reqs
        self.max_model_len = max_model_len
        self.max_num_batched_tokens = max_num_batched_tokens
        self.num_speculative_steps = num_speculative_steps
        self.vocab_size = vocab_size
        self.device = device

        self.req_id_to_index: dict[str, int] = {}
        self.index_to_req_id: dict[int, str] = {}
        self.free_indices = list(range(max_num_reqs))
        self.extra_data: dict[str, ExtraData] = {}

        self.prompt_len = np.zeros(self.max_num_reqs, dtype=np.int32)
        # NOTE(woosuk): This tensor can be extremely large (e.g., several GBs)
        # depending on the configured max_num_reqs and max_model_len.
        # To save GPU memory, we use UVA instead of GPU for this tensor.
        self.prefill_token_ids = StagedWriteTensor(
            (self.max_num_reqs, self.max_model_len),
            dtype=torch.int32,
            device=device,
            uva_instead_of_gpu=True,
        )
        self.prefill_len = UvaBackedTensor(self.max_num_reqs, dtype=torch.int32)

        # Number of computed tokens.
        self.num_computed_prefill_tokens = np.zeros(self.max_num_reqs, dtype=np.int32)
        self.num_computed_tokens = StagedWriteTensor(
            self.max_num_reqs, dtype=torch.int32, device=device
        )

        # Last sampled tokens.
        self.last_sampled_tokens = torch.zeros(
            self.max_num_reqs,
            1,
            dtype=torch.int64,
            device=device,
        )

        # Draft tokens.
        self.draft_tokens = torch.zeros(
            self.max_num_reqs,
            self.num_speculative_steps,
            dtype=torch.int64,
            device=device,
        )
        self.next_prefill_tokens = torch.zeros(
            self.max_num_reqs, dtype=torch.int32, device=device
        )

        # LoRA.
        self.lora_ids = np.zeros(self.max_num_reqs, dtype=np.int32)
        self.lora_ids.fill(NO_LORA_ID)

        self.needs_prompt_logprobs = np.zeros(self.max_num_reqs, dtype=bool)

    @property
    def num_reqs(self) -> int:
        return len(self.req_id_to_index)

    def add_request(
        self,
        req_id: str,
        prompt_len: int,
        prefill_token_ids: list[int],
        num_computed_tokens: int,
        sampling_params: SamplingParams,
        lora_request: LoRARequest | None,
    ) -> None:
        assert len(self.free_indices) > 0, "No free indices"
        req_idx = self.free_indices.pop()
        self.req_id_to_index[req_id] = req_idx
        self.index_to_req_id[req_idx] = req_id
        self.extra_data[req_id] = ExtraData(lora_request)

        self.prompt_len[req_idx] = prompt_len
        prefill_len = len(prefill_token_ids)
        assert prefill_len >= prompt_len, (
            f"prefill_len {prefill_len} < prompt_len {prompt_len}"
        )
        self.prefill_len.np[req_idx] = prefill_len
        self.prefill_token_ids.stage_write(req_idx, 0, prefill_token_ids)
        self.num_computed_prefill_tokens[req_idx] = num_computed_tokens
        self.num_computed_tokens.stage_write_elem(req_idx, num_computed_tokens)

        if lora_request is not None:
            self.lora_ids[req_idx] = lora_request.lora_int_id
        else:
            self.lora_ids[req_idx] = NO_LORA_ID

        # For now, only support prompt logprobs for the prompt tokens.
        needs_prompt_logprobs = sampling_params.prompt_logprobs is not None
        self.needs_prompt_logprobs[req_idx] = needs_prompt_logprobs

    def apply_staged_writes(self) -> None:
        self.prefill_len.copy_to_uva()
        self.prefill_token_ids.apply_write()
        self.num_computed_tokens.apply_write()

    def remove_request(self, req_id: str) -> None:
        self.extra_data.pop(req_id, None)
        req_idx = self.req_id_to_index.pop(req_id, None)
        if req_idx is None:
            # Request not found.
            return
        self.index_to_req_id.pop(req_idx, None)
        self.free_indices.append(req_idx)

    def make_lora_inputs(
        self,
        req_ids: list[str],
        idx_mapping: np.ndarray,
        num_scheduled_tokens: np.ndarray,
    ) -> tuple[tuple[int, ...], tuple[int, ...], set[LoRARequest]]:
        lora_ids = self.lora_ids[idx_mapping]
        prompt_lora_mapping = tuple(lora_ids)
        token_lora_mapping = tuple(lora_ids.repeat(num_scheduled_tokens))

        active_lora_requests: set[LoRARequest] = set()
        for req_id in req_ids:
            lora_request = self.extra_data[req_id].lora_request
            if lora_request is not None:
                active_lora_requests.add(lora_request)
        return prompt_lora_mapping, token_lora_mapping, active_lora_requests
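
A hedged end-to-end sketch of the intended lifecycle, pieced together from the methods above; the configuration values are illustrative assumptions, not vLLM defaults.

import torch

from vllm import SamplingParams
from vllm.v1.worker.gpu.states import RequestState

# Illustrative sizes; real values come from the engine and model config.
states = RequestState(
    max_num_reqs=4,
    max_model_len=2048,
    max_num_batched_tokens=8192,
    num_speculative_steps=0,
    vocab_size=32000,
    device=torch.device("cuda:0"),
)

states.add_request(
    req_id="req-0",
    prompt_len=3,
    prefill_token_ids=[101, 102, 103],
    num_computed_tokens=0,
    sampling_params=SamplingParams(),
    lora_request=None,
)
states.apply_staged_writes()    # flush staged writes to the UVA/GPU-backed tensors
assert states.num_reqs == 1

states.remove_request("req-0")  # frees the slot index for reuse
assert states.num_reqs == 0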

device instance-attribute

device = device

draft_tokens instance-attribute

draft_tokens = zeros(
    max_num_reqs,
    num_speculative_steps,
    dtype=int64,
    device=device,
)

extra_data instance-attribute

extra_data: dict[str, ExtraData] = {}

free_indices instance-attribute

free_indices = list(range(max_num_reqs))

index_to_req_id instance-attribute

index_to_req_id: dict[int, str] = {}

last_sampled_tokens instance-attribute

last_sampled_tokens = zeros(
    max_num_reqs, 1, dtype=int64, device=device
)

lora_ids instance-attribute

lora_ids = zeros(max_num_reqs, dtype=int32)

max_model_len instance-attribute

max_model_len = max_model_len

max_num_batched_tokens instance-attribute

max_num_batched_tokens = max_num_batched_tokens

max_num_reqs instance-attribute

max_num_reqs = max_num_reqs

needs_prompt_logprobs instance-attribute

needs_prompt_logprobs = zeros(max_num_reqs, dtype=bool)

next_prefill_tokens instance-attribute

next_prefill_tokens = zeros(
    max_num_reqs, dtype=int32, device=device
)

num_computed_prefill_tokens instance-attribute

num_computed_prefill_tokens = zeros(
    max_num_reqs, dtype=int32
)

num_computed_tokens instance-attribute

num_computed_tokens = StagedWriteTensor(
    max_num_reqs, dtype=int32, device=device
)

num_reqs property

num_reqs: int

num_speculative_steps instance-attribute

num_speculative_steps = num_speculative_steps

prefill_len instance-attribute

prefill_len = UvaBackedTensor(max_num_reqs, dtype=int32)

prefill_token_ids instance-attribute

prefill_token_ids = StagedWriteTensor(
    (max_num_reqs, max_model_len),
    dtype=int32,
    device=device,
    uva_instead_of_gpu=True,
)

prompt_len instance-attribute

prompt_len = zeros(max_num_reqs, dtype=int32)

req_id_to_index instance-attribute

req_id_to_index: dict[str, int] = {}

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    max_num_reqs: int,
    max_model_len: int,
    max_num_batched_tokens: int,
    num_speculative_steps: int,
    vocab_size: int,
    device: device,
)
Source code in vllm/v1/worker/gpu/states.py
def __init__(
    self,
    max_num_reqs: int,
    max_model_len: int,
    max_num_batched_tokens: int,
    num_speculative_steps: int,
    vocab_size: int,
    device: torch.device,
):
    self.max_num_reqs = max_num_reqs
    self.max_model_len = max_model_len
    self.max_num_batched_tokens = max_num_batched_tokens
    self.num_speculative_steps = num_speculative_steps
    self.vocab_size = vocab_size
    self.device = device

    self.req_id_to_index: dict[str, int] = {}
    self.index_to_req_id: dict[int, str] = {}
    self.free_indices = list(range(max_num_reqs))
    self.extra_data: dict[str, ExtraData] = {}

    self.prompt_len = np.zeros(self.max_num_reqs, dtype=np.int32)
    # NOTE(woosuk): This tensor can be extremely large (e.g., several GBs)
    # depending on the configured max_num_reqs and max_model_len.
    # To save GPU memory, we use UVA instead of GPU for this tensor.
    self.prefill_token_ids = StagedWriteTensor(
        (self.max_num_reqs, self.max_model_len),
        dtype=torch.int32,
        device=device,
        uva_instead_of_gpu=True,
    )
    self.prefill_len = UvaBackedTensor(self.max_num_reqs, dtype=torch.int32)

    # Number of computed tokens.
    self.num_computed_prefill_tokens = np.zeros(self.max_num_reqs, dtype=np.int32)
    self.num_computed_tokens = StagedWriteTensor(
        self.max_num_reqs, dtype=torch.int32, device=device
    )

    # Last sampled tokens.
    self.last_sampled_tokens = torch.zeros(
        self.max_num_reqs,
        1,
        dtype=torch.int64,
        device=device,
    )

    # Draft tokens.
    self.draft_tokens = torch.zeros(
        self.max_num_reqs,
        self.num_speculative_steps,
        dtype=torch.int64,
        device=device,
    )
    self.next_prefill_tokens = torch.zeros(
        self.max_num_reqs, dtype=torch.int32, device=device
    )

    # LoRA.
    self.lora_ids = np.zeros(self.max_num_reqs, dtype=np.int32)
    self.lora_ids.fill(NO_LORA_ID)

    self.needs_prompt_logprobs = np.zeros(self.max_num_reqs, dtype=bool)
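
To make the NOTE above concrete: prefill_token_ids stores max_num_reqs * max_model_len int32 entries, so its footprint is roughly max_num_reqs * max_model_len * 4 bytes. A back-of-the-envelope check with illustrative (non-default) values:

max_num_reqs = 1024
max_model_len = 1_048_576    # a 1M-token context window
bytes_per_token = 4          # torch.int32

size_gib = max_num_reqs * max_model_len * bytes_per_token / 2**30
print(size_gib)              # 4.0 GiB, which is why UVA is used instead of GPU memory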

add_request

add_request(
    req_id: str,
    prompt_len: int,
    prefill_token_ids: list[int],
    num_computed_tokens: int,
    sampling_params: SamplingParams,
    lora_request: LoRARequest | None,
) -> None
Source code in vllm/v1/worker/gpu/states.py
def add_request(
    self,
    req_id: str,
    prompt_len: int,
    prefill_token_ids: list[int],
    num_computed_tokens: int,
    sampling_params: SamplingParams,
    lora_request: LoRARequest | None,
) -> None:
    assert len(self.free_indices) > 0, "No free indices"
    req_idx = self.free_indices.pop()
    self.req_id_to_index[req_id] = req_idx
    self.index_to_req_id[req_idx] = req_id
    self.extra_data[req_id] = ExtraData(lora_request)

    self.prompt_len[req_idx] = prompt_len
    prefill_len = len(prefill_token_ids)
    assert prefill_len >= prompt_len, (
        f"prefill_len {prefill_len} < prompt_len {prompt_len}"
    )
    self.prefill_len.np[req_idx] = prefill_len
    self.prefill_token_ids.stage_write(req_idx, 0, prefill_token_ids)
    self.num_computed_prefill_tokens[req_idx] = num_computed_tokens
    self.num_computed_tokens.stage_write_elem(req_idx, num_computed_tokens)

    if lora_request is not None:
        self.lora_ids[req_idx] = lora_request.lora_int_id
    else:
        self.lora_ids[req_idx] = NO_LORA_ID

    # For now, only support prompt logprobs for the prompt tokens.
    needs_prompt_logprobs = sampling_params.prompt_logprobs is not None
    self.needs_prompt_logprobs[req_idx] = needs_prompt_logprobs

apply_staged_writes

apply_staged_writes() -> None
Source code in vllm/v1/worker/gpu/states.py
def apply_staged_writes(self) -> None:
    self.prefill_len.copy_to_uva()
    self.prefill_token_ids.apply_write()
    self.num_computed_tokens.apply_write()
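
A sketch of how this is meant to be called, assuming the common pattern of batching additions per scheduling step (the scheduled_new_reqs iterable and its fields are hypothetical); writes made in add_request stay staged until this single flush.

# Hypothetical driver loop; the field names on `req` are assumptions.
for req in scheduled_new_reqs:
    states.add_request(
        req_id=req.req_id,
        prompt_len=req.prompt_len,
        prefill_token_ids=req.prefill_token_ids,
        num_computed_tokens=req.num_computed_tokens,
        sampling_params=req.sampling_params,
        lora_request=req.lora_request,
    )
states.apply_staged_writes()  # one flush covers every staged write above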

make_lora_inputs

make_lora_inputs(
    req_ids: list[str],
    idx_mapping: ndarray,
    num_scheduled_tokens: ndarray,
) -> tuple[
    tuple[int, ...], tuple[int, ...], set[LoRARequest]
]
Source code in vllm/v1/worker/gpu/states.py
def make_lora_inputs(
    self,
    req_ids: list[str],
    idx_mapping: np.ndarray,
    num_scheduled_tokens: np.ndarray,
) -> tuple[tuple[int, ...], tuple[int, ...], set[LoRARequest]]:
    lora_ids = self.lora_ids[idx_mapping]
    prompt_lora_mapping = tuple(lora_ids)
    token_lora_mapping = tuple(lora_ids.repeat(num_scheduled_tokens))

    active_lora_requests: set[LoRARequest] = set()
    for req_id in req_ids:
        lora_request = self.extra_data[req_id].lora_request
        if lora_request is not None:
            active_lora_requests.add(lora_request)
    return prompt_lora_mapping, token_lora_mapping, active_lora_requests
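
The token-level mapping is simply each request's LoRA id repeated once per scheduled token. A small NumPy demonstration of the lora_ids.repeat(num_scheduled_tokens) expansion, with made-up values:

import numpy as np

lora_ids = np.array([2, 0, 1], dtype=np.int32)              # per-request LoRA ids (0 == NO_LORA_ID)
num_scheduled_tokens = np.array([3, 1, 2], dtype=np.int32)  # tokens scheduled per request

token_lora_mapping = tuple(lora_ids.repeat(num_scheduled_tokens))
assert token_lora_mapping == (2, 2, 2, 0, 1, 1)  # one LoRA id per scheduled token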

remove_request

remove_request(req_id: str) -> None
Source code in vllm/v1/worker/gpu/states.py
def remove_request(self, req_id: str) -> None:
    self.extra_data.pop(req_id, None)
    req_idx = self.req_id_to_index.pop(req_id, None)
    if req_idx is None:
        # Request not found.
        return
    self.index_to_req_id.pop(req_idx, None)
    self.free_indices.append(req_idx)