Layers#
Activation#
class tensorrt_llm.layers.activation.Mish[source]#
Bases: Module
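For reference, Mish is the smooth, self-gated activation x · tanh(softplus(x)). The sketch below is a minimal NumPy reference of the element-wise function only, not the TensorRT graph operator built by this module.

```python
import numpy as np

def mish_reference(x: np.ndarray) -> np.ndarray:
    # Mish(x) = x * tanh(softplus(x)), with softplus(x) = log(1 + exp(x)).
    # np.logaddexp(0, x) is a numerically stable softplus.
    return x * np.tanh(np.logaddexp(0.0, x))
```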
Attention#
class tensorrt_llm.layers.attention.Attention(*, local_layer_idx, hidden_size, num_attention_heads, num_kv_heads=None, max_position_embeddings=1024, num_layers=1, apply_query_key_layer_scaling=False, attention_head_size=None, qk_layernorm=False, layernorm_type=LayerNormType.LayerNorm, layernorm_share=True, inner_layernorm=False, eps=1e-05, attention_mask_type=AttentionMaskType.padding, bias=True, dtype=None, position_embedding_type=PositionEmbeddingType.learned_absolute, rotary_embedding_base=10000.0, rotary_embedding_base_local=1.0, rotary_embedding_scaling=None, rotary_embedding_percentage=1.0, rope_scaling_short_factors=None, rope_scaling_long_factors=None, rope_scaling_short_mscale=None, rope_scaling_long_mscale=None, original_max_position_embeddings=1024, tp_group=None, tp_size=1, tp_rank=0, quant_mode: ~tensorrt_llm.quantization.mode.QuantMode = <QuantMode: 0>, q_scaling=1.0, cross_attention=False, relative_attention=False, max_distance=0, num_buckets=0, dense_bias=None, clip_qkv=None, alibi_bias_max=8, skip_cross_kv=False, max_attn_value=0.0, block_sparse_params=None, use_implicit_relative_attention=False, reorder=False, enable_qkv=True, cp_group=[0], cp_size=1, cp_rank=0, max_seqlen_for_logn_scaling=8192, use_logn_scaling=False, is_local=False)[source]#
Bases: Module
static create_attention_const_params(model_cls, config)[source]#
static fill_attention_params(model_cls, attention_params)[source]#
forward(
hidden_states: Tensor,
attention_mask=None,
attention_packed_mask=None,
use_cache=False,
spec_decoding_params=None,
mrope_params=None,
kv_cache_params=None,
attention_params=None,
encoder_output: Tensor | None = None,
position_embedding=None,
norm_before_bmm1=False,
lora_layer_params=None,
cross_kv_cache_gen: Tensor | None = None,
cross_kv_reuse: Tensor | None = None,
all_reduce_params: AllReduceParams | None = None,
skip_attn=None,
postprocess(tllm_key, weights, **kwargs)[source]#
set_rel_attn_table(
max_seq_len,
precomputed_relative_attention,
class tensorrt_llm.layers.attention.AttentionMaskParams(
self_attention_mask: Tensor = None,
self_attention_packed_mask: Tensor = None,
cross_attention_mask: Tensor = None,
cross_attention_packed_mask: Tensor = None,
Bases: object
class tensorrt_llm.layers.attention.AttentionParams(
sequence_length: Tensor = None,
context_lengths: Tensor = None,
host_context_lengths: Tensor = None,
max_context_length: int = None,
host_request_types: Tensor = None,
encoder_input_lengths: Tensor = None,
encoder_max_input_length: Tensor = None,
host_runtime_perf_knobs: Tensor = None,
host_context_progress: Tensor = None,
Bases: object
fill_attention_const_params_for_long_rope(
embed_positions,
long_rope_embed_positions,
rotary_inv_freq,
long_rope_rotary_inv_freq,
embed_positions_for_gpt_attention,
long_rope_embed_positions_for_gpt_attention,
short_mscale,
long_mscale,
fill_attention_const_params_for_rope(
embed_positions: Tensor = None,
rotary_inv_freq: Tensor = None,
embed_positions_for_gpt_attention: Tensor = None,
embed_positions_local: Tensor = None,
rotary_inv_freq_local: Tensor = None,
embed_positions_for_gpt_attention_local: Tensor = None,
is_valid(
gpt_attention_plugin,
remove_input_padding,
use_kv_cache,
is_valid_cross_attn(do_cross_attention)[source]#
class tensorrt_llm.layers.attention.BertAttention(
hidden_size,
num_attention_heads,
max_position_embeddings=1024,
num_layers=1,
attention_head_size=None,
num_kv_heads=None,
q_scaling=1.0,
apply_query_key_layer_scaling=False,
bias=True,
dtype=None,
tp_group=None,
tp_size=1,
tp_rank=0,
cp_group=None,
cp_size=1,
relative_attention=False,
max_distance=0,
num_buckets=0,
quant_mode=<QuantMode: 0>,
Bases: Module
forward(
hidden_states: Tensor,
attention_mask=None,
input_lengths=None,
max_input_length=None,
lora_layer_params=None,
class tensorrt_llm.layers.attention.BlockSparseAttnParams(
block_size: int = 64,
homo_head_pattern: bool = False,
num_local_blocks: int = 16,
vertical_stride: int = 8,
Bases: object
class tensorrt_llm.layers.attention.CogVLMAttention(
*,
local_layer_idx,
hidden_size,
num_attention_heads,
num_kv_heads=None,
max_position_embeddings=1024,
attention_mask_type=AttentionMaskType.causal,
bias=True,
dtype=None,
position_embedding_type=PositionEmbeddingType.learned_absolute,
rotary_embedding_base=10000.0,
rotary_embedding_scaling=None,
tp_group=None,
tp_size=1,
tp_rank=0,
quant_mode: ~tensorrt_llm.quantization.mode.QuantMode = <QuantMode: 0>,
dense_bias=None,
Bases: Attention
forward(
hidden_states: Tensor,
use_cache=False,
kv_cache_params=None,
attention_params=None,
vision_token_mask=None,
position_embedding=None,
class tensorrt_llm.layers.attention.DeepseekV2Attention(
*,
local_layer_idx,
hidden_size,
num_attention_heads,
q_lora_rank,
kv_lora_rank,
qk_nope_head_dim=None,
qk_rope_head_dim=None,
v_head_dim=None,
eps=1e-06,
attention_mask_type=AttentionMaskType.causal,
dtype=None,
position_embedding_type=PositionEmbeddingType.learned_absolute,
max_position_embeddings=1024,
rotary_embedding_base=10000.0,
rotary_embedding_scaling=None,
rotary_embedding_beta_fast=32,
rotary_embedding_beta_slow=1,
rotary_embedding_mscale=1,
rotary_embedding_mscale_all_dim=0,
rotary_embedding_origin_max_position=4096,
rotary_scaling=None,
tp_group=None,
tp_size=1,
tp_rank=0,
quant_mode: ~tensorrt_llm.quantization.mode.QuantMode = <QuantMode: 0>,
Bases: Attention
forward(
hidden_states: Tensor,
use_cache=False,
spec_decoding_params=None,
kv_cache_params=None,
attention_params=None,
postprocess(tllm_key, weights, **kwargs)[source]#
weight_loader(
mapping: Mapping,
param: Parameter,
loaded_weight: Tensor,
class tensorrt_llm.layers.attention.DiffusersAttention(
*,
query_dim: int,
cross_attention_dim: int | None = None,
heads: int = 8,
kv_heads: int | None = None,
dim_head: int = 64,
dropout: float = 0.0,
bias: bool = False,
upcast_attention: bool = False,
upcast_softmax: bool = False,
cross_attention_norm: str | None = None,
cross_attention_norm_num_groups: int = 32,
qk_norm: str | None = None,
added_kv_proj_dim: int | None = None,
added_proj_bias: bool | None = True,
norm_num_groups: int | None = None,
spatial_norm_dim: int | None = None,
out_bias: bool = True,
scale_qk: bool = True,
only_cross_attention: bool = False,
eps: float = 1e-05,
rescale_output_factor: float = 1.0,
residual_connection: bool = False,
out_dim: int = None,
out_context_dim: int = None,
context_pre_only=None,
pre_only=False,
elementwise_affine: bool = True,
is_causal: bool = False,
attn_forward_funcname: str = 'joint_attn_forward',
mapping=<tensorrt_llm.mapping.Mapping object>,
dtype=None,
Bases: Module
forward(
hidden_states: Tensor,
encoder_hidden_states: Tensor | None = None,
attention_mask: Tensor | None = None,
max_input_length: Tensor | None = None,
*args,
**kwargs,
joint_attn_forward(
hidden_states: Tensor,
encoder_hidden_states: Tensor | None = None,
attention_mask: Tensor | None = None,
max_input_length: Tensor | None = None,
*args,
**kwargs,
class tensorrt_llm.layers.attention.KeyValueCacheParams(
past_key_value: List[Tensor] = None,
host_past_key_value_lengths: Tensor = None,
host_max_attention_window_sizes: Tensor = None,
host_sink_token_length: Tensor = None,
kv_cache_block_offsets: Tensor = None,
host_kv_cache_block_offsets: Tensor = None,
host_kv_cache_pool_pointers: Tensor = None,
host_kv_cache_pool_mapping: Tensor = None,
cache_indirection: Tensor = None,
past_key_value_length: Tensor = None,
cross_kv_cache_block_offsets: Tensor = None,
host_cross_kv_cache_block_offsets: Tensor = None,
host_cross_kv_cache_pool_pointers: Tensor = None,
host_cross_kv_cache_pool_mapping: Tensor = None,
Bases: object
fill_none_tensor_list(list_size)[source]#
get_first_past_key_value()[source]#
is_valid(gpt_attention_plugin)[source]#
class tensorrt_llm.layers.attention.MropeParams(
mrope_rotary_cos_sin: Tensor = None,
mrope_position_deltas: Tensor = None,
Bases: object
class tensorrt_llm.layers.attention.SpecDecodingParams(
spec_decoding_is_generation_length_variable: bool = False,
spec_decoding_max_generation_length: int = 1,
spec_decoding_generation_lengths: Tensor = None,
spec_decoding_position_offsets: Tensor = None,
spec_decoding_packed_mask: Tensor = None,
spec_decoding_use: Tensor = None,
Bases: object
tensorrt_llm.layers.attention.compute_relative_bias(
query_length,
key_length,
num_buckets,
max_distance,
bidirectional,
rel_attn_table,
tp_size=1,
tp_group=None,
tp_rank=None,
tensorrt_llm.layers.attention.make_causal_mask(bsz, tgt_len, past_key_values_length, dtype)[source]#
Cast#
class tensorrt_llm.layers.cast.Cast(output_dtype: str = 'float32')[source]#
Bases: Module
Conv#
class tensorrt_llm.layers.conv.Conv1d(
in_channels: int,
out_channels: int,
kernel_size: int,
stride: int = 1,
padding: int = 0,
dilation: int = 1,
groups: int = 1,
bias: bool = True,
padding_mode: str = 'zeros',
dtype=None,
Bases: Module
class tensorrt_llm.layers.conv.Conv2d(
in_channels: int,
out_channels: int,
kernel_size: Tuple[int, int],
stride: Tuple[int, int] = (1, 1),
padding: Tuple[int, int] = (0, 0),
dilation: Tuple[int, int] = (1, 1),
groups: int = 1,
bias: bool = True,
padding_mode: str = 'zeros',
dtype=None,
Bases: Module
class tensorrt_llm.layers.conv.Conv3d(
in_channels: int,
out_channels: int,
kernel_size: Tuple[int, int, int],
stride: Tuple[int, int, int] = (1, 1, 1),
padding: Tuple[int, int, int] = (0, 0, 0),
dilation: Tuple[int, int, int] = (1, 1, 1),
groups: int = 1,
bias: bool = True,
padding_mode: str = 'zeros',
dtype=None,
Bases: Module
class tensorrt_llm.layers.conv.ConvTranspose2d(
in_channels: int,
out_channels: int,
kernel_size: Tuple[int, int],
stride: Tuple[int, int] = (1, 1),
padding: Tuple[int, int] = (0, 0),
output_padding: Tuple[int, int] = (0, 0),
dilation: Tuple[int, int] = (1, 1),
groups: int = 1,
bias: bool = True,
padding_mode: str = 'zeros',
dtype=None,
Bases: Module
forward(input, output_size=None)[source]#
Embedding#
class tensorrt_llm.layers.embedding.CombinedTimestepLabelEmbeddings(
num_classes,
embedding_dim,
class_dropout_prob=0.0,
mapping=<tensorrt_llm.mapping.Mapping object>,
dtype=None,
Bases: Module
forward(
timestep: Tensor,
class_labels: Tensor,
hidden_dtype: str | None = 'float32',
class tensorrt_llm.layers.embedding.CombinedTimestepTextProjEmbeddings(
embedding_dim,
pooled_projection_dim,
mapping=<tensorrt_llm.mapping.Mapping object>,
dtype=None,
Bases: Module
forward(
timestep: Tensor,
pooled_projection: Tensor,
class tensorrt_llm.layers.embedding.Embedding(
num_embeddings: int,
embedding_dim: int,
dtype: str | None = None,
tp_size: int = 1,
tp_group: list | None = None,
sharding_dim: int = 0,
tp_rank: int | None = None,
Bases: Module
The embedding layer takes input indices (x) and the embedding lookup table (weight) as input and outputs the embeddings corresponding to those indices. The weight has shape [num_embeddings, embedding_dim].
Four parameters (tp_size, tp_group, sharding_dim, tp_rank) control tensor parallelism; it is enabled only when tp_size > 1 and tp_group is not None.
When sharding_dim == 0, the weight is sharded along the vocabulary dimension.
tp_rank must be set when sharding_dim == 0.
When sharding_dim == 1, the weight is sharded along the hidden dimension.
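As a hedged illustration of the parameters above (the vocabulary size, hidden size, and two-rank group are hypothetical), a vocabulary-sharded embedding on a 2-way tensor-parallel group could be constructed as follows.

```python
from tensorrt_llm.layers.embedding import Embedding

# Hypothetical sizes and ranks; this rank holds half of the
# [num_embeddings, embedding_dim] lookup table along the vocab dimension.
vocab_parallel_embedding = Embedding(
    num_embeddings=32000,
    embedding_dim=4096,
    dtype="float16",
    tp_size=2,          # tensor parallelism requires tp_size > 1 ...
    tp_group=[0, 1],    # ... and a non-None tp_group
    sharding_dim=0,     # shard along the vocabulary dimension
    tp_rank=0,          # required when sharding_dim == 0
)
```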
postprocess(tllm_key, weights, **kwargs)[source]#
weight_loader(
mapping: Mapping,
param: Parameter,
loaded_weight: Tensor,
class tensorrt_llm.layers.embedding.LabelEmbedding(
num_classes: int,
hidden_size: int,
dropout_prob: float = 0.0,
mapping=<tensorrt_llm.mapping.Mapping object>,
dtype=None,
Bases: Module
forward(
labels: Tensor,
force_drop_ids: Tensor | None = None,
token_drop(
labels: Tensor,
force_drop_ids: Tensor,
class tensorrt_llm.layers.embedding.PixArtAlphaTextProjection(
in_features,
hidden_size,
out_features=None,
act_fn='gelu_tanh',
mapping=None,
dtype=None,
Bases: Module
Projects caption embeddings. Also handles dropout for classifier-free guidance.
Adapted from PixArt-alpha/PixArt-alpha
class tensorrt_llm.layers.embedding.PromptTuningEmbedding(
num_embeddings,
embedding_dim,
vocab_size=None,
dtype=None,
tp_size=1,
tp_group=None,
sharding_dim=0,
tp_rank=0,
Bases: Embedding
PromptTuningEmbedding handles fine-tuned prompts with virtual tokens. At runtime, a supplementary embedding dictionary is passed. Tokens whose ids are >= vocab_size are embedded with that additional dictionary. The prompt tuning dictionary holds multiple tasks, and each sequence is assigned a given task; prompt-tuned tokens from a given sequence use the corresponding task dictionary, as selected by the tasks input.
forward(
tokens,
prompt_embedding_table,
tasks,
task_vocab_size,
Passes all tokens through both the normal and the prompt embedding tables. Tokens are masked so that the “normal” embedding only sees “normal” tokens, and likewise for the “prompt” embedding. The two results are then combined based on whether each token was “normal” or “prompt-tuned”.
Parameters:
- tokens – Tensor the ids to embed, size [batch_size, seq_len]
- prompt_embedding_table – Tensor the additional embedding table for prompt-tuned tokens, size [num_tasks * num_tokens_per_task, hidden_size]
- tasks – Tensor the task required by each token, size [batch_size, seq_len]
- task_vocab_size – Tensor the number of tokens used for each task, should be equal to prompt_embedding_table’s num_tokens_per_task, size [1]
Returns:
Tokens’ embedding
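A hedged sketch of the calling convention implied by the shapes above; the tensor names are illustrative, and in practice these are tensors created while building the TensorRT-LLM network rather than eager framework tensors.

```python
# Illustrative shapes:
#   tokens:                 [batch_size, seq_len]  ids; ids >= vocab_size use the prompt table
#   prompt_embedding_table: [num_tasks * num_tokens_per_task, hidden_size]
#   tasks:                  [batch_size, seq_len]  task id assigned to each token
#   task_vocab_size:        [1]                    equals num_tokens_per_task
embeddings = prompt_tuning_embedding.forward(
    tokens,
    prompt_embedding_table,
    tasks,
    task_vocab_size,
)
# embeddings combines normal and prompt-tuned lookups per token
# (e.g. [batch_size, seq_len, hidden_size] for padded inputs).
```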
class tensorrt_llm.layers.embedding.SD3PatchEmbed(
height: int = 224,
width: int = 224,
patch_size: int = 16,
in_channels: int = 3,
embed_dim: int = 768,
layer_norm: bool = False,
flatten: bool = True,
bias: bool = True,
interpolation_scale: int = 1,
pos_embed_type: str = 'sincos',
pos_embed_max_size: int | None = None,
dtype=None,
Bases: Module
2D Image to Patch Embedding with support for SD3 cropping.
cropped_pos_embed(height, width)[source]#
Crops positional embeddings for SD3 compatibility.
class tensorrt_llm.layers.embedding.TimestepEmbedding(
in_channels: int,
time_embed_dim: int,
act_fn: str = 'silu',
out_dim: int = None,
post_act_fn: str | None = None,
cond_proj_dim=None,
sample_proj_bias=True,
mapping=None,
dtype=None,
Bases: Module
forward(sample, condition=None)[source]#
class tensorrt_llm.layers.embedding.Timesteps(
num_channels: int,
flip_sin_to_cos: bool,
downscale_freq_shift: float,
scale: int = 1,
Bases: Module
forward(timesteps) → Tensor[source]#
tensorrt_llm.layers.embedding.get_1d_sincos_pos_embed_from_grid(
embed_dim: int,
pos: Tensor,
tensorrt_llm.layers.embedding.get_2d_sincos_pos_embed(
embed_dim: int,
grid_size: int | Sequence[int],
cls_token: bool = False,
extra_tokens: int = 0,
interpolation_scale: float = 1.0,
base_size: int = 16,
tensorrt_llm.layers.embedding.get_2d_sincos_pos_embed_from_grid(
embed_dim: int,
grid: Sequence[Tensor],
tensorrt_llm.layers.embedding.get_timestep_embedding(
timesteps: Tensor,
embedding_dim: int,
flip_sin_to_cos: bool = False,
downscale_freq_shift: float = 1,
scale: float = 1,
max_period: int = 10000,
Creates sinusoidal timestep embeddings, matching the implementation in Denoising Diffusion Probabilistic Models.
Args:
timesteps (Tensor):
a 1-D Tensor of N indices, one per batch element. These may be fractional.
embedding_dim (int):
the dimension of the output.
flip_sin_to_cos (bool):
Whether the embedding order should be cos, sin (if True) or sin, cos (if False)
downscale_freq_shift (float):
Controls the delta between frequencies across dimensions.
scale (float):
Scaling factor applied to the embeddings.
max_period (int):
Controls the maximum frequency of the embeddings
Returns:
Tensor: an [N x dim] Tensor of positional embeddings.
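A minimal NumPy reference of the documented behaviour, assuming an even embedding_dim (this is a sketch of the standard DDPM-style sinusoidal embedding, not the graph-level TensorRT-LLM implementation).

```python
import numpy as np

def timestep_embedding_reference(timesteps, embedding_dim, flip_sin_to_cos=False,
                                 downscale_freq_shift=1.0, scale=1.0, max_period=10000):
    half_dim = embedding_dim // 2
    # Geometrically spaced frequencies from 1 down to ~1/max_period.
    exponent = -np.log(max_period) * np.arange(half_dim) / (half_dim - downscale_freq_shift)
    freqs = np.exp(exponent)                                     # [half_dim]
    args = scale * np.asarray(timesteps, np.float32)[:, None] * freqs[None, :]
    emb = np.concatenate([np.sin(args), np.cos(args)], axis=-1)  # sin, cos order
    if flip_sin_to_cos:                                          # cos, sin order instead
        emb = np.concatenate([emb[:, half_dim:], emb[:, :half_dim]], axis=-1)
    return emb                                                   # [N, embedding_dim]
```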
Linear#
tensorrt_llm.layers.linear.ColumnLinear#
alias of Linear
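As a hedged sketch of the usual column/row pairing for tensor parallelism (the feature sizes and the two-rank group below are hypothetical): a column-parallel projection keeps its sharded output (gather_output=False) and feeds a row-parallel projection, whose partial results are reduced across the group.

```python
from tensorrt_llm.layers.linear import ColumnLinear, RowLinear

tp_group, tp_size = [0, 1], 2   # hypothetical 2-way tensor-parallel group

# Column-parallel: weight split along out_features; keep the local shard.
fc = ColumnLinear(4096, 16384, bias=True, dtype="float16",
                  tp_group=tp_group, tp_size=tp_size, gather_output=False)

# Row-parallel: weight split along in_features; outputs are reduced across ranks.
proj = RowLinear(16384, 4096, bias=True, dtype="float16",
                 tp_group=tp_group, tp_size=tp_size)
```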
class tensorrt_llm.layers.linear.Linear(
in_features,
out_features,
bias=True,
dtype=None,
tp_group=None,
tp_size=1,
gather_output=True,
share_weight=None,
strict_dtype=False,
pad_lda=0,
pad_ldc=0,
prefer_managed_weight=True,
is_qkv=False,
Bases: LinearBase
collect_and_bias(x, **kwargs)[source]#
postprocess(tllm_key, weights, **kwargs)[source]#
classmethod tp_split_dim() → int[source]#
class tensorrt_llm.layers.linear.LinearBase(
local_in_features,
local_out_features,
bias=True,
dtype=None,
tp_group=None,
tp_size=1,
share_weight=None,
strict_dtype=False,
pad_lda=0,
pad_ldc=0,
prefer_managed_weight=True,
Bases: Module
abstract collect_and_bias(
x: Tensor,
forward(
x,
lora_runtime_params: LoraRuntimeParams | None = None,
lora_hidden_state: Tensor | None = None,
**kwargs,
get_weight() → Tensor[source]#
multiply_and_lora(
x,
weight,
gemm_plugin: str | None = None,
low_latency_gemm_plugin: str | None = None,
use_fp8: bool = False,
alpha: ndarray | None = None,
lora_runtime_params: LoraRuntimeParams | None = None,
lora_hidden_state: Tensor | None = None,
multiply_collect(
x,
weight,
gemm_plugin: str | None = None,
low_latency_gemm_plugin: str | None = None,
use_fp8: bool = False,
alpha: ndarray | None = None,
lora_runtime_params: LoraRuntimeParams | None = None,
lora_hidden_state: Tensor | None = None,
**kwargs,
abstract classmethod tp_split_dim() → int[source]#
weight_loader(
mapping: Mapping,
param: Parameter,
loaded_weight: Tensor,
class tensorrt_llm.layers.linear.RowLinear(
in_features,
out_features,
bias=True,
dtype=None,
tp_group=None,
tp_size=1,
strict_dtype: bool = False,
pad_lda=0,
prefer_managed_weight=True,
is_expert=False,
Bases: LinearBase
collect_and_bias(x, **kwargs)[source]#
multiply_collect(
x,
weight,
gemm_plugin: str | None = None,
low_latency_gemm_plugin: str | None = None,
use_fp8: bool = False,
alpha: ndarray | None = None,
lora_runtime_params: LoraRuntimeParams | None = None,
lora_hidden_state: Tensor | None = None,
**kwargs,
classmethod tp_split_dim() → int[source]#
MLP#
class tensorrt_llm.layers.mlp.FusedGatedMLP(
hidden_size,
ffn_hidden_size,
hidden_act,
bias=True,
dtype=None,
tp_group=None,
tp_size=1,
quant_mode=<QuantMode: 0>,
inner_layernorm=False,
eps=1e-05,
is_expert=False,
Bases: Module
fc_gate(hidden_states, lora_layer_params=None)[source]#
fc_gate_plugin(hidden_states, lora_layer_params=None)[source]#
forward(
hidden_states,
lora_layer_params=None,
all_reduce_params: AllReduceParams | None = None,
class tensorrt_llm.layers.mlp.GatedMLP(
hidden_size,
ffn_hidden_size,
hidden_act,
bias=True,
dtype=None,
tp_group=None,
tp_size=1,
quant_mode=<QuantMode: 0>,
inner_layernorm=False,
eps=1e-05,
is_expert=False,
Bases: MLP
forward(
hidden_states,
lora_layer_params=None,
all_reduce_params: AllReduceParams | None = None,
class tensorrt_llm.layers.mlp.LinearActivation(
dim_in: int,
dim_out: int,
bias: bool = True,
activation: str = 'silu',
mapping=<tensorrt_llm.mapping.Mapping object>,
dtype=None,
Bases: Module
forward(hidden_states)[source]#
class tensorrt_llm.layers.mlp.LinearApproximateGELU(
dim_in: int,
dim_out: int,
bias: bool = True,
mapping=<tensorrt_llm.mapping.Mapping object>,
dtype=None,
Bases: Module
class tensorrt_llm.layers.mlp.LinearGEGLU(
dim_in: int,
dim_out: int,
approximate: str = 'tanh',
bias: bool = True,
mapping=<tensorrt_llm.mapping.Mapping object>,
dtype=None,
Bases: Module
forward(hidden_states)[source]#
class tensorrt_llm.layers.mlp.LinearGELU(
dim_in: int,
dim_out: int,
approximate: str = 'tanh',
bias: bool = True,
mapping=<tensorrt_llm.mapping.Mapping object>,
dtype=None,
Bases: Module
forward(hidden_states)[source]#
class tensorrt_llm.layers.mlp.LinearSwiGLU(
dim_in: int,
dim_out: int,
bias: bool = True,
mapping=<tensorrt_llm.mapping.Mapping object>,
dtype=None,
Bases: Module
forward(hidden_states)[source]#
class tensorrt_llm.layers.mlp.MLP(
hidden_size,
ffn_hidden_size,
hidden_act,
bias=True,
dtype=None,
tp_group=None,
tp_size=1,
quant_mode=<QuantMode: 0>,
inner_layernorm=False,
eps=1e-05,
is_expert=False,
Bases: Module
forward(hidden_states, lora_layer_params=None, gegelu_limit=None)[source]#
tensorrt_llm.layers.mlp.fc_gate_dora(
hidden_states,
dora,
fused_gate_up_dora,
lora_layer_params,
tensorrt_llm.layers.mlp.fc_gate_lora(
hidden_states,
lora,
fused_gate_up_lora,
lora_layer_params,
Normalization#
class tensorrt_llm.layers.normalization.AdaLayerNorm(
embedding_dim: int,
num_embeddings: int | None = None,
output_dim: int | None = None,
norm_elementwise_affine: bool = False,
norm_eps: float = 1e-05,
chunk_dim: int = 0,
mapping=<tensorrt_llm.mapping.Mapping object>,
dtype=None,
Bases: Module
forward(
x: Tensor,
timestep: Tensor | None = None,
temb: Tensor | None = None,
class tensorrt_llm.layers.normalization.AdaLayerNormContinuous(
embedding_dim: int,
conditioning_embedding_dim: int,
elementwise_affine: bool = True,
eps: float = 1e-05,
bias: bool = True,
norm_type: str = 'layer_norm',
mapping=<tensorrt_llm.mapping.Mapping object>,
dtype=None,
Bases: Module
forward(
x: Tensor,
conditioning_embedding: Tensor,
class tensorrt_llm.layers.normalization.AdaLayerNormZero(
embedding_dim: int,
num_embeddings: int | None = None,
norm_type: str = 'layer_norm',
bias: bool = True,
mapping=<tensorrt_llm.mapping.Mapping object>,
dtype=None,
Bases: Module
forward(
x: Tensor,
timestep: Tensor | None = None,
class_labels: Tensor | None = None,
hidden_dtype: str = None,
emb: Tensor | None = None,
class tensorrt_llm.layers.normalization.AdaLayerNormZeroSingle(
embedding_dim: int,
norm_type: str = 'layer_norm',
bias: bool = True,
mapping=<tensorrt_llm.mapping.Mapping object>,
dtype=None,
Bases: Module
forward(
x: Tensor,
emb: Tensor | None = None,
class tensorrt_llm.layers.normalization.GroupNorm(
num_groups,
num_channels,
eps=1e-05,
affine=True,
dtype=None,
Bases: Module
class tensorrt_llm.layers.normalization.LayerNorm(
normalized_shape,
eps=1e-05,
elementwise_affine=True,
bias=True,
dtype=None,
tp_size=1,
tp_dim=-1,
Bases: Module
forward(x, normalized_shape=None)[source]#
class tensorrt_llm.layers.normalization.RmsNorm(
normalized_shape,
num_groups=1,
eps=1e-06,
elementwise_affine=True,
dtype=None,
Bases: Module
forward(x, normalized_shape=None)[source]#
class tensorrt_llm.layers.normalization.SD35AdaLayerNormZeroX(
embedding_dim: int,
norm_type: str = 'layer_norm',
bias: bool = True,
mapping=<tensorrt_llm.mapping.Mapping object>,
dtype=None,
Bases: Module
forward(
hidden_states: Tensor,
emb: Tensor,
Pooling#
class tensorrt_llm.layers.pooling.AvgPool2d(
kernel_size: Tuple[int],
stride: Tuple[int] | None = None,
padding: Tuple[int] | None = (0, 0),
ceil_mode: bool = False,
count_include_pad: bool = True,
Bases: Module