Models — TensorRT-LLM
class tensorrt_llm.models.BaichuanForCausalLM(*args, **kwargs)[source]#
Bases: DecoderModelForCausalLM
config_class#
alias of BaichuanConfig
classmethod from_hugging_face(
hf_model_or_dir: str | transformers.PreTrainedModel,
dtype: str = 'auto',
mapping: Mapping | None = None,
quant_config: QuantConfig | None = None,
**kwargs,
Create a BaichuanForCausalLM object from the given parameters
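A minimal usage sketch (the checkpoint id, dtype, and output path below are illustrative assumptions, not part of this reference):

# Hypothetical sketch: build a BaichuanForCausalLM from a Hugging Face checkpoint.
from tensorrt_llm import Mapping
from tensorrt_llm.models import BaichuanForCausalLM

model = BaichuanForCausalLM.from_hugging_face(
    "baichuan-inc/Baichuan2-7B-Chat",  # assumed local dir or HF repo id
    dtype="float16",
    mapping=Mapping(world_size=1, rank=0, tp_size=1, pp_size=1),
)
model.save_checkpoint("./baichuan2-trtllm-ckpt")  # assumed output directory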
classmethod quantize(
hf_model_dir: str,
output_dir: str,
dtype: str = 'auto',
mapping: Mapping | None = None,
quant_config: QuantConfig | None = None,
*,
device: str = 'cuda',
calib_dataset: str = 'cnn_dailymail',
calib_batches: int = 512,
calib_batch_size: int = 1,
calib_max_seq_length: int = 512,
random_seed: int = 1234,
tokenizer_max_seq_length: int = 2048,
**kwargs,
class tensorrt_llm.models.BertForQuestionAnswering(*args, **kwargs)[source]#
Bases: BertBase
forward(
input_ids=None,
input_lengths=None,
token_type_ids=None,
position_ids=None,
hidden_states=None,
max_input_length=None,
class tensorrt_llm.models.BertForSequenceClassification(*args, **kwargs)[source]#
Bases: BertBase
forward(
input_ids,
input_lengths,
token_type_ids=None,
position_ids=None,
hidden_states=None,
max_input_length=None,
class tensorrt_llm.models.BertModel(*args, **kwargs)[source]#
Bases: BertBase
forward(
input_ids=None,
input_lengths=None,
position_ids=None,
token_type_ids=None,
hidden_states=None,
max_input_length=None,
class tensorrt_llm.models.BloomForCausalLM(*args, **kwargs)[source]#
Bases: DecoderModelForCausalLM
class tensorrt_llm.models.BloomModel(
config: PretrainedConfig,
Bases: Module
forward(
input_ids: Tensor,
position_ids=None,
use_cache=False,
attention_mask=None,
kv_cache_params=None,
prompt_embedding_table=None,
prompt_tasks=None,
prompt_vocab_size=None,
attention_params=None,
class tensorrt_llm.models.CLIPVisionTransformer(
image_size,
num_channels,
patch_size,
hidden_size,
num_attention_heads,
max_position_embeddings,
norm_epsilon,
intermediate_size,
hidden_act,
num_hidden_layers,
require_ln_f,
mapping: Mapping,
dtype,
Bases: Module
forward(pixel_values)[source]#
class tensorrt_llm.models.ChatGLMConfig(
*,
chatglm_version: str = 'chatglm3',
add_bias_linear: bool = False,
add_qkv_bias: bool = True,
apply_query_key_layer_scaling: bool = False,
apply_residual_connection_post_layernorm: bool = False,
rmsnorm: bool = True,
rotary_pct: float = 0.5,
rotary_base: float = 10000.0,
rotary_scaling: dict | None = None,
**kwargs,
Bases: PretrainedConfig
classmethod from_hugging_face(
hf_config_or_dir: str | transformers.PretrainedConfig,
dtype: str = 'auto',
mapping: Mapping | None = None,
quant_config: QuantConfig | None = None,
**kwargs,
class tensorrt_llm.models.ChatGLMForCausalLM(*args, **kwargs)[source]#
Bases: DecoderModelForCausalLM
config_class#
alias of ChatGLMConfig
classmethod from_hugging_face(
hf_model_or_dir: str | transformers.PreTrainedModel,
dtype: str = 'auto',
mapping: Mapping | None = None,
quant_config: QuantConfig | None = None,
**kwargs,
Create a ChatGLMForCausalLM object from the given parameters
prepare_inputs(*args, **kwargs)[source]#
See PretrainedModel.prepare_inputs for the detailed parameter list.
classmethod quantize(
hf_model_dir: str,
output_dir: str,
dtype: str = 'auto',
mapping: Mapping | None = None,
quant_config: QuantConfig | None = None,
*,
device: str = 'cuda',
calib_dataset: str = 'cnn_dailymail',
calib_batches: int = 512,
calib_batch_size: int = 1,
calib_max_seq_length: int = 512,
random_seed: int = 1234,
tokenizer_max_seq_length: int = 2048,
**kwargs,
class tensorrt_llm.models.ChatGLMModel(
config: ChatGLMConfig,
Bases: Module
forward(
input_ids: Tensor = None,
position_ids: Tensor = None,
use_cache: bool = False,
attention_mask: Tensor = None,
kv_cache_params: KeyValueCacheParams = None,
attention_params: AttentionParams = None,
class tensorrt_llm.models.CogVLMConfig(
*,
mlp_bias: bool = False,
attn_bias: bool = False,
rotary_base: float = 10000.0,
rotary_scaling: dict | None = None,
**kwargs,
Bases: PretrainedConfig
class tensorrt_llm.models.CogVLMForCausalLM(*args, **kwargs)[source]#
Bases: DecoderModelForCausalLM, TopModelMixin
config_class#
alias of CogVLMConfig
default_plugin_config(**kwargs)[source]#
Return the default plugin config for this model when the plugin_config value is not given in the to_trt() call. If users need different plugin configs, they can start from the returned object and modify it.
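For example, a caller might start from the defaults and override individual fields; the gemm_plugin attribute below is an assumption about the returned plugin config object:

# Hypothetical: tweak the defaults before building (model is a previously
# constructed CogVLMForCausalLM; the attribute name is assumed)
plugin_config = model.default_plugin_config()
plugin_config.gemm_plugin = "float16"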
classmethod from_hugging_face(
hf_model_dir,
dtype='float16',
mapping: Mapping | None = None,
quant_mode: QuantMode | None = None,
**kwargs,
Create an LLM object and load weights from Hugging Face.
:param hf_model_dir: the Hugging Face model directory
:param dtype: str, the default weights data type when loading from the Hugging Face model
:param mapping: Mapping, the multi-GPU parallel strategy; when None, a single GPU is used
classmethod quantize(
hf_model_dir,
output_dir,
quant_config: QuantConfig,
*,
dtype='float16',
mapping: Mapping | None = None,
calib_batches=512,
calib_batch_size=1,
random_seed=1234,
tokenizer_max_seq_length=2048,
**kwargs,
class tensorrt_llm.models.CohereForCausalLM(*args, **kwargs)[source]#
Bases: DecoderModelForCausalLM
config_class#
alias of CohereConfig
classmethod from_hugging_face(
hf_model_or_dir: str,
dtype: str = 'auto',
mapping: Mapping | None = None,
quant_config: QuantConfig | None = None,
**kwargs,
Create a CohereForCausalLM object from the given parameters
class tensorrt_llm.models.DbrxConfig(
*,
bias: bool = False,
clip_qkv: float | None = None,
rotary_base: float = 500000.0,
rotary_scaling: dict | None = None,
moe: MoeConfig | dict | None = None,
**kwargs,
Bases: PretrainedConfig
class tensorrt_llm.models.DbrxForCausalLM(*args, **kwargs)[source]#
Bases: DecoderModelForCausalLM
config_class#
alias of DbrxConfig
class tensorrt_llm.models.DecoderModel(*args, **kwargs)[source]#
Bases: PretrainedModel
check_config(
config: PretrainedConfig,
forward(
decoder_input_ids: Tensor,
encoder_output: Tensor,
position_ids=None,
token_type_ids=None,
use_cache=False,
attention_mask_params=None,
last_token_ids=None,
kv_cache_params=None,
attention_params=None,
hidden_states=None,
lora_params: LoraParams = None,
cross_kv_cache_gen: Tensor | None = None,
cross_kv_reuse: Tensor | None = None,
language_adapter_routings: Tensor | None = None,
precompute_relative_attention_bias(build_config)[source]#
prepare_inputs(
max_batch_size,
max_beam_width,
max_decoder_input_len,
max_seq_len,
max_encoder_input_len,
gather_context_logits: bool = False,
lora_target_modules: List[str] = None,
use_cache=True,
*args,
**kwargs,
@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.
@return: a list containing values which can be fed into self.forward()
use_lora(
lora_config: LoraConfig,
Load LoRA weights from the given config into the module.
:param lora_config: the LoRA config
class tensorrt_llm.models.DeepseekForCausalLM(*args, **kwargs)[source]#
Bases: DecoderModelForCausalLM
config_class#
alias of DeepSeekV1Config
classmethod from_hugging_face(
model_dir,
dtype: str = 'auto',
mapping: Mapping | None = None,
override_fields={},
**kwargs,
Create an LLM object and load weights from Hugging Face.
:param hf_model_dir: the Hugging Face model directory
:param dtype: str, the default weights data type when loading from the Hugging Face model
:param mapping: Mapping, the multi-GPU parallel strategy; when None, a single GPU is used
class tensorrt_llm.models.DeepseekV2ForCausalLM(*args, **kwargs)[source]#
Bases: DecoderModelForCausalLM
config_class#
alias of DeepSeekV2Config
classmethod from_hugging_face(
model_dir,
dtype: str = 'auto',
hf_model: PreTrainedModel | None = None,
use_preloading: bool = False,
use_safetensors_loading: bool = False,
mapping: Mapping | None = None,
override_fields={},
**kwargs,
Create an LLM object and load weights from Hugging Face.
:param hf_model_dir: the Hugging Face model directory
:param dtype: str, the default weights data type when loading from the Hugging Face model
:param mapping: Mapping, the multi-GPU parallel strategy; when None, a single GPU is used
class tensorrt_llm.models.DiT(*args, **kwargs)[source]#
Bases: PretrainedModel
check_config(
config: PretrainedConfig,
forward(latent, timestep, label)[source]#
Forward pass of DiT.
latent: (N, C, H, W)
timestep: (N,)
label: (N,)
forward_with_cfg(x, t, y)[source]#
Forward pass with classifier-free guidance.
forward_without_cfg(x, t, y)[source]#
Forward pass without classifier-free guidance.
prepare_inputs(max_batch_size, **kwargs)[source]#
@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.
@return: a list containing values which can be fed into self.forward()
unpatchify(x: Tensor)[source]#
class tensorrt_llm.models.EagleForCausalLM(*args, **kwargs)[source]#
Bases: LLaMAForCausalLM
config_class#
alias of EagleConfig
forward(*args, **kwargs)[source]#
classmethod from_hugging_face(
hf_model_or_dir: str | transformers.PreTrainedModel,
dtype: str = 'auto',
mapping: Mapping | None = None,
quant_config: QuantConfig | None = None,
**kwargs,
Create an EagleForCausalLM object from the given parameters
prepare_inputs(*args, **kwargs)[source]#
Inputs needed:
    device_request_types: [bs]
    draft_tokens: [bs, max_draft_len]
    draft_lens: [bs]
    spec_decoding_generation_lengths: [bs]
    spec_decoding_position_offsets: [bs, max_gen_tokens]
    spec_decoding_packed_mask: [bs, max_draft_len, packed_length] **
    eagle_temperature: [bs]
    rand_data_validation: [bs, max_draft_tokens]
** The boolean mask needs to be packed at runtime, so the last dim is:
    packed_length = ceil((max_draft_tokens + 1) / 32)
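A tiny numeric sketch of that sizing rule (the value is illustrative):

import math

max_draft_tokens = 63                                   # illustrative
packed_length = math.ceil((max_draft_tokens + 1) / 32)  # -> 2 int32 words per mask row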
class tensorrt_llm.models.EncoderModel(*args, **kwargs)[source]#
Bases: PretrainedModel
check_config(
config: PretrainedConfig,
forward(
input_ids: Tensor,
input_lengths=None,
position_ids=None,
token_type_ids=None,
hidden_states=None,
max_input_length=None,
prompt_embedding_table=None,
prompt_tasks=None,
prompt_vocab_size=None,
attention_mask=None,
lora_params: LoraParams = None,
language_adapter_routings: Tensor | None = None,
precompute_relative_attention_bias(build_config)[source]#
prepare_inputs(
max_batch_size,
max_input_len,
prompt_embedding_table_size: int = 0,
lora_target_modules: List[str] = None,
*args,
**kwargs,
@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.
@return: a list containing values which can be fed into self.forward()
use_lora(
lora_config: LoraConfig,
Load LoRA weights from the given config into the module.
:param lora_config: the LoRA config
Enable p-tuning when building the TRT engine; call this before to_trt
class tensorrt_llm.models.FalconConfig(
*,
bias: bool = False,
parallel_attention: bool = False,
num_ln_in_parallel_attn: int | None = None,
new_decoder_architecture: bool = False,
rotary_base: float = 10000.0,
**kwargs,
Bases: PretrainedConfig
classmethod from_hugging_face(
hf_config_or_dir: str | transformers.PretrainedConfig,
dtype: str = 'auto',
mapping: Mapping | None = None,
quant_config: QuantConfig | None = None,
**kwargs,
class tensorrt_llm.models.FalconForCausalLM(*args, **kwargs)[source]#
Bases: DecoderModelForCausalLM
config_class#
alias of FalconConfig
classmethod from_hugging_face(
hf_model_or_dir: str | transformers.PreTrainedModel,
dtype: str = 'auto',
mapping: Mapping | None = None,
quant_config: QuantConfig | None = None,
**kwargs,
Create a FalconForCausalLM object from the given parameters
class tensorrt_llm.models.FalconModel(config: FalconConfig)[source]#
Bases: Module
forward(
input_ids: Tensor,
position_ids=None,
use_cache=False,
attention_mask=None,
kv_cache_params=None,
attention_params=None,
hidden_states=None,
class tensorrt_llm.models.GPTConfig(
*,
gpt_variant: str = 'gpt2',
bias: bool = True,
q_scaling: float = 1.0,
embedding_scale: float | None = None,
apply_query_key_layer_scaling: bool = False,
rotary_pct: float = 1.0,
rotary_base: float = 10000.0,
rotary_scaling: dict | None = None,
inner_layernorm: bool = False,
norm_before_bmm1: bool = False,
moe: MoeConfig | dict | None = None,
**kwargs,
Bases: PretrainedConfig
classmethod from_hugging_face(
hf_config_or_dir: str | transformers.PretrainedConfig,
dtype: str = 'auto',
mapping: Mapping | None = None,
quant_config: QuantConfig | None = None,
**kwargs,
classmethod from_nemo(
nemo_ckpt_dir: str,
dtype: str = 'auto',
mapping: Mapping | None = None,
quant_config: QuantConfig | None = None,
**kwargs,
class tensorrt_llm.models.GPTForCausalLM(*args, **kwargs)[source]#
Bases: DecoderModelForCausalLM
config_class#
alias of GPTConfig
classmethod from_hugging_face(
hf_model_or_dir: str | transformers.PreTrainedModel,
dtype: str = 'auto',
mapping: Mapping | None = None,
quant_config: QuantConfig | None = None,
**kwargs,
Create a GPTForCausalLM object from the given parameters
classmethod from_nemo(
nemo_ckpt_dir: str,
dtype: str = 'auto',
mapping: Mapping | None = None,
quant_config: QuantConfig | None = None,
**kwargs,
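A hedged conversion sketch for the NeMo path (the checkpoint and output paths are assumptions):

# Hypothetical: build a GPTForCausalLM from a NeMo checkpoint directory
from tensorrt_llm.models import GPTForCausalLM

gpt = GPTForCausalLM.from_nemo("./nemo_gpt_ckpt", dtype="bfloat16")  # assumed path
gpt.save_checkpoint("./gpt-trtllm-ckpt")                             # assumed path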
classmethod quantize(
hf_model_dir: str,
output_dir: str,
dtype: str = 'auto',
mapping: Mapping | None = None,
quant_config: QuantConfig | None = None,
*,
device: str = 'cuda',
calib_dataset: str = 'cnn_dailymail',
calib_batches: int = 512,
calib_batch_size: int = 1,
calib_max_seq_length: int = 512,
random_seed: int = 1234,
tokenizer_max_seq_length: int = 2048,
**kwargs,
use_lora(
lora_config: LoraConfig,
Load LoRA weights from the given config into the module.
:param lora_config: the LoRA config
class tensorrt_llm.models.GPTJConfig(*, rotary_dim: int = 64, **kwargs)[source]#
Bases: PretrainedConfig
This is the configuration class to store the configuration of GPTJ model.
classmethod from_hugging_face(
hf_config_or_dir: str | transformers.PretrainedConfig,
dtype: str = 'auto',
mapping: Mapping | None = None,
quant_config: QuantConfig | None = None,
**kwargs,
class tensorrt_llm.models.GPTJForCausalLM(*args, **kwargs)[source]#
Bases: DecoderModelForCausalLM
config_class#
alias of GPTJConfig
classmethod from_hugging_face(
hf_model_or_dir: str | transformers.PreTrainedModel,
dtype: str = 'auto',
mapping: Mapping | None = None,
quant_config=None,
**kwargs,
Create an LLM object and load weights from Hugging Face.
:param hf_model_dir: the Hugging Face model directory
:param dtype: str, the default weights data type when loading from the Hugging Face model
:param mapping: Mapping, the multi-GPU parallel strategy; when None, a single GPU is used
class tensorrt_llm.models.GPTJModel(config: GPTJConfig)[source]#
Bases: Module
forward(
input_ids: Tensor,
position_ids=None,
use_cache=False,
attention_mask=None,
kv_cache_params=None,
attention_params=None,
class tensorrt_llm.models.GPTModel(config: GPTConfig)[source]#
Bases: Module
forward(
input_ids,
position_ids,
use_cache=False,
attention_mask=None,
kv_cache_params=None,
attention_params=None,
hidden_states=None,
prompt_embedding_table=None,
prompt_tasks=None,
prompt_vocab_size=None,
lora_params=None,
spec_decoding_params=None,
class tensorrt_llm.models.GPTNeoXForCausalLM(*args, **kwargs)[source]#
Bases: DecoderModelForCausalLM
class tensorrt_llm.models.GPTNeoXModel(
config: PretrainedConfig,
Bases: Module
forward(
input_ids: Tensor,
position_ids=None,
use_cache=False,
attention_mask=None,
kv_cache_params=None,
attention_params=None,
class tensorrt_llm.models.GemmaConfig(
*,
architecture: str,
rotary_base: float = 10000.0,
rotary_scaling: dict | None = None,
attn_bias: bool = False,
mlp_bias: bool = False,
position_embedding_type: PositionEmbeddingType = PositionEmbeddingType.rope_gpt_neox,
query_pre_attn_scalar: int | None = None,
final_logit_softcapping: float | None = None,
attn_logit_softcapping: float | None = None,
mapping: Mapping | dict | None = None,
sliding_window_pattern: int = None,
rope_local_base_freq: int = None,
sliding_window: int = None,
**kwargs,
Bases: PretrainedConfig
GEMMA2_ADDED_FIELDS = {'attn_logit_softcapping', 'final_logit_softcapping', 'query_pre_attn_scalar'}#
GEMMA3_ADDED_FIELDS = {'final_logit_softcapping', 'query_pre_attn_scalar', 'rope_local_base_freq', 'sliding_window', 'sliding_window_pattern'}#
GEMMA_ADDED_FIELDS = {'attn_bias', 'inter_layernorms', 'mlp_bias', 'rotary_base', 'rotary_scaling'}#
VERBATIM = {'attn_logit_softcapping', 'final_logit_softcapping', 'hidden_act', 'hidden_size', 'intermediate_size', 'max_position_embeddings', 'num_attention_heads', 'num_hidden_layers', 'query_pre_attn_scalar', 'rope_local_base_freq', 'sliding_window', 'sliding_window_pattern', 'use_parallel_embedding', 'vocab_size'}#
classmethod from_hugging_face(
hf_config_or_dir: HfConfigOrDir,
dtype: str = 'auto',
mapping: Mapping | None = None,
quant_config: QuantConfig | None = None,
**kwargs,
static get_hf_config(config_dir: str | PathLike)[source]#
property is_gemma_2: bool#
property is_gemma_3: bool#
Serialize the fields added in GemmaConfig
class tensorrt_llm.models.GemmaForCausalLM(*args, **kwargs)[source]#
Bases: DecoderModelForCausalLM
NATIVE_QUANT_FLOW = {QuantAlgo.W4A16, QuantAlgo.W8A16, QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN, QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN, QuantAlgo.W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN, QuantAlgo.W8A8_SQ_PER_TENSOR_PLUGIN}#
classmethod assert_valid_quant_algo(
quant_algo: QuantAlgo | None,
config_class#
alias of GemmaConfig
classmethod from_hugging_face(
hf_model_dir: HfConfigOrDir,
dtype='float16',
mapping: Mapping | None = None,
quant_config: QuantConfig | None = None,
load_model_on_cpu: bool = True,
**kwargs,
Create an LLM object and load weights from Hugging Face.
:param hf_model_dir: the Hugging Face model directory
:param dtype: str, the default weights data type when loading from the Hugging Face model
:param mapping: Mapping, the multi-GPU parallel strategy; when None, a single GPU is used
classmethod quantize(
hf_model_dir: str,
output_dir: str,
dtype: str = 'float16',
mapping: Mapping | None = None,
quant_config: QuantConfig | None = None,
*,
gemma_config_kwargs: Dict[str, Any] = None,
**quantize_kwargs: Dict[str, Any],
use_lora(
lora_config: LoraConfig,
Load LoRA weights from the given config into the module.
:param lora_config: the LoRA config
class tensorrt_llm.models.LLaMAConfig(
*,
mlp_bias: bool = False,
attn_bias: bool = False,
rotary_base: float = 10000.0,
rotary_scaling: dict | None = None,
residual_mlp: bool = False,
disable_weight_only_quant_plugin: bool = False,
moe: MoeConfig | dict | None = None,
remove_duplicated_kv_heads: bool = False,
embedding_multiplier: float = 1.0,
attention_multiplier: float = 1.0,
residual_multiplier: float = 1.0,
output_multiplier_scale: float = 1.0,
**kwargs,
Bases: PretrainedConfig
classmethod from_hugging_face(
hf_config_or_dir: str | transformers.PretrainedConfig,
dtype: str = 'auto',
mapping: Mapping | None = None,
quant_config: QuantConfig | None = None,
**kwargs,
classmethod from_meta_ckpt(
meta_ckpt_dir: str,
dtype: str = 'auto',
mapping: Mapping | None = None,
quant_config: QuantConfig | None = None,
**kwargs,
class tensorrt_llm.models.LLaMAForCausalLM(*args, **kwargs)[source]#
Bases: DecoderModelForCausalLM
config_class#
alias of LLaMAConfig
default_plugin_config(**kwargs)[source]#
Return the default plugin config for this model when the plugin_config value is not given in the to_trt() call. If users need different plugin configs, they can start from the returned object and modify it.
classmethod from_hugging_face(
hf_model_or_dir: str | PreTrainedModel,
dtype: str = 'auto',
mapping: Mapping | None = None,
quant_config: QuantConfig | None = None,
**kwargs,
Create a LLaMAForCausalLM object from the given parameters
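A typical conversion sketch, assuming a LLaMA checkpoint and 2-way tensor parallelism (the repo id and paths are illustrative):

# Hypothetical: convert a Hugging Face LLaMA checkpoint with tp_size=2
from tensorrt_llm import Mapping
from tensorrt_llm.models import LLaMAForCausalLM

for rank in range(2):
    model = LLaMAForCausalLM.from_hugging_face(
        "meta-llama/Llama-2-7b-hf",  # assumed repo id or local dir
        dtype="float16",
        mapping=Mapping(world_size=2, rank=rank, tp_size=2, pp_size=1),
    )
    model.save_checkpoint("./llama2-7b-tp2-ckpt")  # one rank file per iteration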
classmethod from_meta_ckpt(
meta_ckpt_dir: str,
dtype: str = 'auto',
mapping: Mapping | None = None,
quant_config: QuantConfig | None = None,
**kwargs,
classmethod quantize(
hf_model_dir: str,
output_dir: str,
dtype: str = 'auto',
mapping: Mapping | None = None,
quant_config: QuantConfig | None = None,
*,
device: str = 'cuda',
calib_dataset: str = 'cnn_dailymail',
calib_batches: int = 512,
calib_batch_size: int = 1,
calib_max_seq_length: int = 512,
random_seed: int = 1234,
tokenizer_max_seq_length: int = 2048,
**kwargs,
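A hedged post-training quantization sketch; the QuantConfig/QuantAlgo import paths are assumptions that can vary across releases:

# Hypothetical: FP8 PTQ of a LLaMA checkpoint via the quantize() classmethod
from tensorrt_llm.models import LLaMAForCausalLM
from tensorrt_llm.models.modeling_utils import QuantConfig  # assumed import path
from tensorrt_llm.quantization import QuantAlgo

LLaMAForCausalLM.quantize(
    hf_model_dir="meta-llama/Llama-2-7b-hf",  # assumed
    output_dir="./llama2-7b-fp8-ckpt",
    dtype="float16",
    quant_config=QuantConfig(quant_algo=QuantAlgo.FP8,
                             kv_cache_quant_algo=QuantAlgo.FP8),
    calib_batches=64,
)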
use_lora(
lora_config: LoraConfig,
Load LoRA weights from the given config into the module.
:param lora_config: the LoRA config
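A minimal LoRA sketch, assuming LoraConfig lives in tensorrt_llm.lora_manager and that lora_dir/lora_target_modules are the relevant fields:

# Hypothetical: attach LoRA weights to an already-constructed model
from tensorrt_llm.lora_manager import LoraConfig  # assumed import path

lora_config = LoraConfig(
    lora_dir=["./my-llama-lora"],                       # assumed adapter directory
    lora_target_modules=["attn_q", "attn_k", "attn_v"],
)
model.use_lora(lora_config)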
class tensorrt_llm.models.LLaMAModel(config: LLaMAConfig)[source]#
Bases: Module
forward(
input_ids,
position_ids=None,
use_cache=False,
attention_mask=None,
spec_decoding_params=None,
kv_cache_params=None,
attention_params=None,
hidden_states=None,
hidden_states_for_embed=None,
prompt_embedding_table: Tensor | None = None,
prompt_tasks: Tensor | None = None,
prompt_vocab_size: Tensor | None = None,
lora_params=None,
class tensorrt_llm.models.LlavaNextVisionConfig(
*,
image_size: int,
patch_size: int,
text_hidden_size: int,
projector_hidden_act: str = 'gelu',
num_channels: int = 3,
vision_model_type: str = 'clip_vision_model',
**kwargs,
Bases: PretrainedConfig
classmethod from_hugging_face(
hf_config_or_dir: str | transformers.PretrainedConfig,
dtype: str = 'auto',
mapping: Mapping | None = None,
quant_config: QuantConfig | None = None,
**kwargs,
class tensorrt_llm.models.LlavaNextVisionWrapper(*args, **kwargs)[source]#
Bases: PretrainedModel
forward(pixel_values, position_ids=None)[source]#
classmethod from_hugging_face(
hf_model_dir: str,
dtype: str = 'auto',
mapping: Mapping | None = None,
quant_config: QuantConfig | None = None,
**kwargs,
Create a LlavaNextVisionWrapper object from the given parameters
prepare_inputs(max_batch_size, **kwargs)[source]#
@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.
@return: a list containing values which can be fed into self.forward()
save_checkpoint(output_dir, save_config=True)[source]#
class tensorrt_llm.models.MLLaMAForCausalLM(*args, **kwargs)[source]#
Bases: PretrainedModel
config_class#
alias of MLLaMAConfig
forward(
decoder_input_ids: Tensor,
encoder_output: Tensor,
use_cache=False,
attention_mask_params=None,
last_token_ids=None,
kv_cache_params=None,
attention_params=None,
hidden_states=None,
lora_params: LoraParams = None,
cross_kv_cache_gen: Tensor | None = None,
cross_kv_reuse: Tensor | None = None,
prompt_embedding_table: Tensor | None = None,
prompt_tasks: Tensor | None = None,
prompt_vocab_size: Tensor | None = None,
skip_cross_attn_blocks: Tensor | None = None,
classmethod from_hugging_face(
hf_model_or_dir: str | transformers.PreTrainedModel,
dtype: str = 'auto',
mapping: Mapping | None = None,
quant_config: QuantConfig | None = None,
**kwargs,
Create an MLLaMAForCausalLM object from the given parameters
prepare_inputs(
max_batch_size,
max_beam_width,
max_decoder_input_len,
max_seq_len,
max_encoder_input_len,
gather_context_logits: bool = False,
gather_generation_logits: bool = False,
lora_target_modules: List[str] = None,
prompt_embedding_table_size: int = 0,
use_cache=True,
*args,
**kwargs,
@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.
@return: a list containing values which can be fed into self.forward()
use_lora(
lora_config: LoraConfig,
Load LoRA weights from the given config into the module.
:param lora_config: the LoRA config
class tensorrt_llm.models.MPTForCausalLM(*args, **kwargs)[source]#
Bases: DecoderModelForCausalLM
class tensorrt_llm.models.MPTModel(config: PretrainedConfig)[source]#
Bases: Module
forward(
input_ids,
position_ids,
use_cache=False,
attention_mask=None,
kv_cache_params=None,
attention_params=None,
class tensorrt_llm.models.MambaForCausalLM(*args, **kwargs)[source]#
Bases: PretrainedModel
config_class#
alias of MambaConfig
forward(
input_ids,
conv_states,
ssm_states,
host_request_types,
last_token_ids,
last_token_ids_for_logits,
host_context_lengths,
slot_mapping: Tensor | None = None,
classmethod from_hugging_face(
hf_model_or_dir: str | transformers.PreTrainedModel,
dtype: str = 'auto',
mapping: Mapping | None = None,
quant_config: QuantConfig | None = None,
**kwargs,
Create an LLM object and load weights from Hugging Face.
:param hf_model_dir: the Hugging Face model directory
:param dtype: str, the default weights data type when loading from the Hugging Face model
:param mapping: Mapping, the multi-GPU parallel strategy; when None, a single GPU is used
prepare_inputs(
max_batch_size,
max_input_len,
max_seq_len,
max_num_tokens,
use_cache,
max_beam_width: int = 1,
opt_num_tokens: int = None,
opt_batch_size: int = 0,
prompt_embedding_table_size: int = 0,
max_draft_len: int = 0,
gather_context_logits: bool = False,
lora_target_modules: List[str] = None,
speculative_decoding_draft_tokens_external: bool = False,
@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.
@return: a list containing values which can be fed into self.forward()
class tensorrt_llm.models.MedusaConfig(
*,
num_medusa_heads: int = 4,
num_medusa_layers: int = 1,
max_draft_len: int = 63,
**kwargs,
Bases: PretrainedConfig
classmethod from_hugging_face(
hf_config_or_dir: str | transformers.PretrainedConfig,
dtype: str = 'auto',
mapping: Mapping | None = None,
quant_config: QuantConfig | None = None,
**kwargs,
class tensorrt_llm.models.MedusaForCausalLm(*args, **kwargs)[source]#
Bases: PretrainedModel
config_class#
alias of MedusaConfig
classmethod from_hugging_face(
hf_model_or_dir: str | transformers.PreTrainedModel,
dtype: str = 'auto',
mapping: Mapping | None = None,
quant_config: QuantConfig | None = None,
**kwargs,
Create an LLM object and load weights from Hugging Face.
:param hf_model_dir: the Hugging Face model directory
:param dtype: str, the default weights data type when loading from the Hugging Face model
:param mapping: Mapping, the multi-GPU parallel strategy; when None, a single GPU is used
class tensorrt_llm.models.OPTForCausalLM(*args, **kwargs)[source]#
Bases: DecoderModelForCausalLM
class tensorrt_llm.models.OPTModel(config: PretrainedConfig)[source]#
Bases: Module
forward(
input_ids: Tensor,
position_ids=None,
use_cache=False,
attention_mask=None,
kv_cache_params=None,
attention_params=None,
prompt_embedding_table=None,
prompt_tasks=None,
prompt_vocab_size=None,
**kwargs,
class tensorrt_llm.models.Phi3ForCausalLM(*args, **kwargs)[source]#
Bases: DecoderModelForCausalLM
config_class#
alias of Phi3Config
classmethod from_hugging_face(
hf_model_or_dir: str | transformers.PreTrainedModel,
dtype: str = 'auto',
mapping: Mapping | None = None,
quant_config: QuantConfig | None = None,
**kwargs,
Create an LLM object and load weights from Hugging Face.
:param hf_model_dir: the Hugging Face model directory
:param dtype: str, the default weights data type when loading from the Hugging Face model
:param mapping: Mapping, the multi-GPU parallel strategy; when None, a single GPU is used
use_lora(
lora_config: LoraConfig,
Load LoRA weights from the given config into the module.
:param lora_config: the LoRA config
class tensorrt_llm.models.Phi3Model(
config: PretrainedConfig,
Bases: Module
forward(
input_ids: Tensor,
position_ids=None,
use_cache=False,
attention_mask=None,
kv_cache_params=None,
attention_params=None,
prompt_embedding_table=None,
prompt_tasks=None,
prompt_vocab_size=None,
lora_params=None,
class tensorrt_llm.models.PhiForCausalLM(*args, **kwargs)[source]#
Bases: DecoderModelForCausalLM
config_class#
alias of PhiConfig
classmethod from_hugging_face(
hf_model_or_dir: str | transformers.PreTrainedModel,
dtype: str = 'auto',
mapping: Mapping | None = None,
quant_config: QuantConfig | None = None,
**kwargs,
Create an LLM object and load weights from Hugging Face.
:param hf_model_dir: the Hugging Face model directory
:param dtype: str, the default weights data type when loading from the Hugging Face model
:param mapping: Mapping, the multi-GPU parallel strategy; when None, a single GPU is used
use_lora(
lora_config: LoraConfig,
Load LoRA weights from the given config into the module.
:param lora_config: the LoRA config
class tensorrt_llm.models.PhiModel(config: PretrainedConfig)[source]#
Bases: Module
forward(
input_ids: Tensor,
position_ids=None,
use_cache=False,
attention_mask=None,
kv_cache_params=None,
attention_params=None,
prompt_embedding_table=None,
prompt_tasks=None,
prompt_vocab_size=None,
lora_params=None,
class tensorrt_llm.models.PretrainedConfig(
*,
architecture: str,
dtype: str,
hidden_size: int,
num_hidden_layers: int,
num_attention_heads: int,
vocab_size: int | None = None,
hidden_act: str = 'gelu',
logits_dtype: str = 'float32',
norm_epsilon: float = 1e-05,
position_embedding_type: PositionEmbeddingType | str = PositionEmbeddingType.learned_absolute,
max_position_embeddings: int | None = None,
rotary_embedding_dim: int | None = None,
num_key_value_heads: int | None = None,
intermediate_size: int | None = None,
mapping: Mapping | dict | None = None,
quantization: QuantConfig | dict | None = None,
use_parallel_embedding: bool = False,
embedding_sharding_dim: int = 0,
head_size: int | None = None,
qk_layernorm: bool = False,
runtime_defaults: RuntimeDefaultsIn = None,
**kwargs,
Bases: object
static create_runtime_defaults(
defaults: RuntimeDefaultsIn = None,
) → RuntimeDefaults | None[source]#
for_each_rank() → Generator[Self, None, None][source]#
classmethod from_checkpoint(ckpt_dir: str)[source]#
classmethod from_dict(config: dict)[source]#
classmethod from_json_file(config_file: str)[source]#
get_config_group(group_cls: Type[CG]) → CG[source]#
has_config_group(group_cls: Type[CG]) → bool[source]#
property kv_dtype#
property quant_algo#
property quant_mode#
set_if_not_exist(key, value)[source]#
to_json_file(config_file: str)[source]#
to_layer_quant_config(config_file: str)[source]#
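A small sketch of round-tripping a converted checkpoint's config (the paths are assumptions):

# Hypothetical: load, inspect, and re-serialize a checkpoint config
from tensorrt_llm.models import PretrainedConfig

config = PretrainedConfig.from_json_file("./llama2-7b-tp2-ckpt/config.json")  # assumed path
print(config.quant_algo, config.kv_dtype)
config.to_json_file("./llama2-7b-tp2-ckpt/config_copy.json")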
class tensorrt_llm.models.PretrainedModel(*args, **kwargs)[source]#
Bases: Module, GenerationMixin, TopModelMixin
classmethod from_checkpoint(
ckpt_dir: str,
rank: int | None = None,
config: PretrainedConfig | None = None,
*,
preprocess_weights_hook: Callable[[Dict[str, Tensor]], Dict[str, Tensor]] | None = None,
classmethod from_config(
config: PretrainedConfig,
load(weights, from_pruned=False)[source]#
prepare_inputs(
max_batch_size,
max_input_len,
max_seq_len,
max_num_tokens,
use_cache,
max_beam_width: int = 1,
opt_num_tokens: int = None,
prompt_embedding_table_size: int = 0,
position_encoding_2d: bool = False,
max_draft_len: int = 0,
speculative_decoding_draft_tokens_external: bool = False,
spec_decoding_is_generation_length_variable: bool = False,
gather_context_logits: bool = False,
lora_target_modules: List[str] = None,
opt_batch_size: int = 0,
num_hidden_layers: int = None,
mrope_rotary_cos_sin_size: int = None,
@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.
@return: a list containing values which can be fed into self.forward()
classmethod quantize(
hf_model_dir: str,
output_dir: str,
dtype: str = 'auto',
mapping: Mapping | None = None,
quant_config: QuantConfig | None = None,
*,
device: str = 'cuda',
calib_dataset: str = 'cnn_dailymail',
calib_batches: int = 512,
calib_batch_size: int = 1,
calib_max_seq_length: int = 512,
random_seed: int = 1234,
tokenizer_max_seq_length: int = 2048,
**kwargs,
save_checkpoint(output_dir, save_config=True)[source]#
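For instance, a converted checkpoint can be reloaded per rank through a concrete subclass (paths assumed):

# Hypothetical: reload rank 0 of a previously converted checkpoint
from tensorrt_llm.models import LLaMAForCausalLM

model = LLaMAForCausalLM.from_checkpoint("./llama2-7b-tp2-ckpt", rank=0)
model.save_checkpoint("./llama2-7b-ckpt-copy", save_config=True)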
class tensorrt_llm.models.ReDrafterForCausalLM(*args, **kwargs)[source]#
Bases: LLaMAForCausalLM
forward(*args, **kwargs)[source]#
- run the base model and get logits and hidden_states
prepare_inputs(*args, **kwargs)[source]#
Inputs needed:
Assuming max_gen_tokens = 1 + nb*(bl - 1), counting the true token:
    device_request_types: [bs]
    draft_tokens: [bs, nb, bl]
    draft_indices: [bs, nb, bl]
    draft_probs: [bs, nb, bl-1, V]
    spec_decoding_generation_lengths: [bs]
    spec_decoding_position_offsets: [bs, max_gen_tokens]
    spec_decoding_packed_mask: [bs, max_gen_tokens, packed_length] **
    redrafter_inverted_temperature: [bs]
    rand_data_sample: [bs]
    rand_data_validation: [bs, nb, bl-1]
** The boolean mask needs to be packed at runtime, so the last dim is:
    packed_length = ceil(max_gen_tokens / 32)
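A tiny numeric sketch of the sizes described above (nb and bl are illustrative):

import math

nb, bl = 3, 4                                   # illustrative beam count and length
max_gen_tokens = 1 + nb * (bl - 1)              # counting the true token -> 10
packed_length = math.ceil(max_gen_tokens / 32)  # -> 1 int32 word per mask row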
class tensorrt_llm.models.RecurrentGemmaForCausalLM(*args, **kwargs)[source]#
Bases: PretrainedModel
forward(
input_ids,
position_ids=None,
use_cache=False,
attention_mask=None,
kv_cache_params=None,
attention_params=None,
conv_states=None,
rnn_states=None,
host_request_types=None,
last_token_ids=None,
last_token_ids_for_logits=None,
host_context_lengths=None,
slot_mapping=None,
prepare_inputs(
max_batch_size,
max_input_len,
max_seq_len,
max_num_tokens,
use_cache,
max_beam_width: int = 1,
opt_num_tokens: int = None,
opt_batch_size: int = 0,
prompt_embedding_table_size: int = 0,
max_draft_len: int = 0,
gather_context_logits: bool = False,
lora_target_modules: List[str] = None,
speculative_decoding_draft_tokens_external: bool = False,
@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.
@return: a list containing values which can be fed into self.forward()
prepare_recurrent_inputs(
max_batch_size,
num_profiles,
mapping,
tensorrt_llm.models.RobertaForQuestionAnswering#
alias of BertForQuestionAnswering
tensorrt_llm.models.RobertaForSequenceClassification#
alias of BertForSequenceClassification
tensorrt_llm.models.RobertaModel#
alias of BertModel
class tensorrt_llm.models.SD3Transformer2DModel(*args, **kwargs)[source]#
Bases: PretrainedModel
property attn_processors#
config_class#
alias of SD3Transformer2DModelConfig
disable_forward_chunking()[source]#
enable_forward_chunking(
chunk_size: int | None = None,
dim: int = 0,
forward(
hidden_states: Tensor,
encoder_hidden_states: Tensor | None = None,
pooled_projections: Tensor | None = None,
timestep: Tensor | None = None,
block_controlnet_hidden_states: List[Tensor] = None,
joint_attention_kwargs: Dict[str, Any] | None = None,
classmethod from_pretrained(
pretrained_model_name_or_path: str,
dtype='float16',
mapping=<tensorrt_llm.mapping.Mapping object>,
**kwargs,
fuse_qkv_projections()[source]#
load(weights, from_pruned=False)[source]#
prepare_inputs(max_batch_size, **kwargs)[source]#
@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.
@return: a list containing values which can be fed into self.forward()
set_attn_processor(processor)[source]#
unfuse_qkv_projections()[source]#
class tensorrt_llm.models.SpeculativeDecodingMode(
value,
names=<not given>,
*values,
module=None,
qualname=None,
type=None,
start=1,
boundary=None,
Bases: IntFlag
DRAFT_TOKENS_EXTERNAL = 2#
EAGLE = 32#
EXPLICIT_DRAFT_TOKENS = 16#
LOOKAHEAD_DECODING = 8#
MEDUSA = 4#
NGRAM = 64#
NONE = 1#
static from_arguments(args: Namespace)[source]#
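Since this is an IntFlag, modes can be combined and tested with bitwise operators; a minimal sketch:

from tensorrt_llm.models import SpeculativeDecodingMode

mode = SpeculativeDecodingMode.MEDUSA
assert mode & SpeculativeDecodingMode.MEDUSA
assert not (mode & SpeculativeDecodingMode.EAGLE)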
class tensorrt_llm.models.WhisperEncoder(*args, **kwargs)[source]#
Bases: PretrainedModel
forward(
input_features: Tensor,
input_lengths=None,
position_ids=None,
precompute_relative_attention_bias(build_config)[source]#
prepare_inputs(max_batch_size=16)[source]#
@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.
@return: a list containing values which can be fed into self.forward()