class LogitBiasState:
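    """Per-request state for logit-bias-style sampling constraints.

    Tracks three features in fixed-size rows, one per request slot:
    allowed token IDs, explicit logit biases, and min-tokens (stop
    tokens suppressed until a minimum length is reached). Small
    per-request counters live in UVA-backed tensors; the wide token
    buffers are staged on the host and flushed to the device in bulk
    by apply_staged_writes().
    """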
def __init__(
self,
max_num_reqs: int,
device: torch.device,
):
self.max_num_reqs = max_num_reqs
# Allowed token IDs.
self.num_allowed_token_ids = UvaBackedTensor(
self.max_num_reqs, dtype=torch.int32
)
self.allowed_token_ids = StagedWriteTensor(
(self.max_num_reqs, MAX_NUM_ALLOWED_TOKEN_IDS),
dtype=torch.int32,
device=device,
)
# Logit bias.
self.num_logit_bias = UvaBackedTensor(self.max_num_reqs, dtype=torch.int32)
self.logit_bias_token_ids = StagedWriteTensor(
(self.max_num_reqs, MAX_NUM_LOGIT_BIAS_TOKENS),
dtype=torch.int32,
device=device,
)
self.logit_bias = StagedWriteTensor(
(self.max_num_reqs, MAX_NUM_LOGIT_BIAS_TOKENS),
dtype=torch.float32,
device=device,
)
        # Min tokens: minimum total length, plus the stop tokens to
        # suppress until that length is reached.
self.min_lens = UvaBackedTensor(self.max_num_reqs, dtype=torch.int32)
self.num_stop_token_ids = UvaBackedTensor(self.max_num_reqs, dtype=torch.int32)
self.stop_token_ids = StagedWriteTensor(
(self.max_num_reqs, MAX_NUM_STOP_TOKEN_IDS),
dtype=torch.int32,
device=device,
)

    def add_request(
self,
req_idx: int,
prompt_len: int,
sampling_params: SamplingParams,
) -> None:
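        """Stage the constraints of sampling_params into row req_idx.

        Writes go to host-side buffers only; apply_staged_writes() must
        run before the values are visible on the GPU.
        """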
# Allowed token IDs.
allowed_token_ids = sampling_params.allowed_token_ids
if allowed_token_ids:
num_allowed_token_ids = len(allowed_token_ids)
if num_allowed_token_ids > MAX_NUM_ALLOWED_TOKEN_IDS:
raise ValueError(
f"Too many allowed token IDs: {num_allowed_token_ids}. "
f"The max size is {MAX_NUM_ALLOWED_TOKEN_IDS}."
)
self.num_allowed_token_ids.np[req_idx] = num_allowed_token_ids
self.allowed_token_ids.stage_write(req_idx, 0, allowed_token_ids)
else:
self.num_allowed_token_ids.np[req_idx] = 0
# Logit bias.
logit_bias = sampling_params.logit_bias
if logit_bias:
num_logit_bias = len(logit_bias)
if num_logit_bias > MAX_NUM_LOGIT_BIAS_TOKENS:
raise ValueError(
f"Too many logit bias tokens: {num_logit_bias}. "
f"The max size is {MAX_NUM_LOGIT_BIAS_TOKENS}."
)
self.num_logit_bias.np[req_idx] = num_logit_bias
            # Materialize the dict views; insertion order keeps keys and
            # values aligned.
            self.logit_bias_token_ids.stage_write(
                req_idx, 0, list(logit_bias.keys())
            )
            self.logit_bias.stage_write(req_idx, 0, list(logit_bias.values()))
else:
self.num_logit_bias.np[req_idx] = 0
# Min tokens.
min_tokens = sampling_params.min_tokens
min_len = prompt_len + min_tokens
self.min_lens.np[req_idx] = min_len
        # Stop token IDs (suppressed while shorter than min_lens).
        stop_token_ids = sampling_params.all_stop_token_ids
if stop_token_ids:
num_stop_token_ids = len(stop_token_ids)
if num_stop_token_ids > MAX_NUM_STOP_TOKEN_IDS:
raise ValueError(
f"Too many stop tokens: {num_stop_token_ids}. "
f"The max size is {MAX_NUM_STOP_TOKEN_IDS}."
)
self.num_stop_token_ids.np[req_idx] = num_stop_token_ids
            # all_stop_token_ids may be a set; materialize it for the write.
            self.stop_token_ids.stage_write(req_idx, 0, list(stop_token_ids))
else:
self.num_stop_token_ids.np[req_idx] = 0

    def apply_staged_writes(self) -> None:
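        """Flush all staged host buffers to their GPU tensors."""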
self.num_allowed_token_ids.copy_to_uva()
self.allowed_token_ids.apply_write()
self.num_logit_bias.copy_to_uva()
self.logit_bias_token_ids.apply_write()
self.logit_bias.apply_write()
self.min_lens.copy_to_uva()
self.num_stop_token_ids.copy_to_uva()
self.stop_token_ids.apply_write()

    def apply_logit_bias(
self,
logits: torch.Tensor,
idx_mapping: torch.Tensor,
pos: torch.Tensor,
) -> None:
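        """Apply all staged constraints to logits in place.

        idx_mapping maps each row of logits to its request slot, and pos
        holds each row's sequence position (compared against min_lens
        for the min-tokens rule).
        """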
apply_logit_bias(
logits,
idx_mapping,
pos,
self.num_allowed_token_ids.gpu,
self.allowed_token_ids.gpu,
self.num_logit_bias.gpu,
self.logit_bias_token_ids.gpu,
self.logit_bias.gpu,
self.min_lens.gpu,
self.num_stop_token_ids.gpu,
self.stop_token_ids.gpu,
)
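

# Illustrative usage sketch (not part of this module): the intended call
# order is add_request() for each new request, a single
# apply_staged_writes() per scheduling step, then apply_logit_bias() on
# the batch logits. The SamplingParams values below are hypothetical;
# only the attribute names read above are assumed.
#
#   state = LogitBiasState(max_num_reqs=256, device=torch.device("cuda"))
#   state.add_request(req_idx=0, prompt_len=12, sampling_params=params)
#   state.apply_staged_writes()
#   state.apply_logit_bias(logits, idx_mapping, pos)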