
BSBR Extras API

bsbr_extras.standard_transformer.StandardTransformerModel

Bases: Module

Full Standard Transformer model stacking multiple Standard Transformer layers.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `vocab_size` | `int` | Vocabulary size for embedding layer | required |
| `hidden_dim` | `int` | Hidden dimension size | required |
| `num_layers` | `int` | Number of transformer layers | required |
| `num_heads` | `int` | Number of attention heads | required |
| `ff_dim` | `int` | Feed-forward intermediate dimension | required |
| `dropout` | `float` | Dropout probability | `0.1` |

Source code in src/bsbr_extras/standard_transformer.py
class StandardTransformerModel(nn.Module):
    """
    Full Standard Transformer model stacking multiple Standard Transformer layers.

    Args:
        vocab_size (int): Vocabulary size for embedding layer
        hidden_dim (int): Hidden dimension size
        num_layers (int): Number of transformer layers
        num_heads (int): Number of attention heads
        ff_dim (int): Feed-forward intermediate dimension
        dropout (float): Dropout probability
    """
    def __init__(
        self,
        vocab_size: int,
        hidden_dim: int,
        num_layers: int,
        num_heads: int,
        ff_dim: int,
        dropout: float = 0.1
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_dim)
        self.pos_encoding = PositionalEncoding(hidden_dim, dropout)

        self.layers = nn.ModuleList([
            StandardTransformerLayer(
                hidden_dim=hidden_dim,
                num_heads=num_heads,
                ff_dim=ff_dim,
                dropout=dropout
            )
            for _ in range(num_layers)
        ])

        self.layer_norm = nn.LayerNorm(hidden_dim)

    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        """
        Forward pass for the full Standard Transformer model.

        Args:
            input_ids: Token IDs of shape [batch_size, seq_len]
            attention_mask: Optional attention mask of shape [batch_size, seq_len]

        Returns:
            output: Processed tensor of shape [batch_size, seq_len, hidden_dim]
        """
        hidden_states = self.embedding(input_ids)
        hidden_states = self.pos_encoding(hidden_states)

        for layer in self.layers:
            hidden_states = layer(hidden_states, attention_mask)

        hidden_states = self.layer_norm(hidden_states)
        return hidden_states

forward(input_ids, attention_mask=None)

Forward pass for the full Standard Transformer model.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `input_ids` | `LongTensor` | Token IDs of shape `[batch_size, seq_len]` | required |
| `attention_mask` | `Optional[Tensor]` | Optional attention mask of shape `[batch_size, seq_len]` | `None` |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| `output` | `Tensor` | Processed tensor of shape `[batch_size, seq_len, hidden_dim]` |

Source code in src/bsbr_extras/standard_transformer.py
def forward(
    self,
    input_ids: torch.LongTensor,
    attention_mask: Optional[torch.Tensor] = None
) -> torch.Tensor:
    """
    Forward pass for the full Standard Transformer model.

    Args:
        input_ids: Token IDs of shape [batch_size, seq_len]
        attention_mask: Optional attention mask of shape [batch_size, seq_len]

    Returns:
        output: Processed tensor of shape [batch_size, seq_len, hidden_dim]
    """
    hidden_states = self.embedding(input_ids)
    hidden_states = self.pos_encoding(hidden_states)

    for layer in self.layers:
        hidden_states = layer(hidden_states, attention_mask)

    hidden_states = self.layer_norm(hidden_states)
    return hidden_states
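
A minimal usage sketch for this model (assuming `StandardTransformerModel` is importable from `bsbr_extras.standard_transformer`, per the source path above; the hyperparameters, vocabulary size, and the 1-for-attend mask convention are illustrative assumptions):

```python
import torch
from bsbr_extras.standard_transformer import StandardTransformerModel

# Illustrative configuration; hidden_dim is typically required to be
# divisible by num_heads for multi-head attention.
model = StandardTransformerModel(
    vocab_size=1000,
    hidden_dim=64,
    num_layers=2,
    num_heads=4,
    ff_dim=128,
    dropout=0.1,
)

input_ids = torch.randint(0, 1000, (2, 16))   # [batch_size, seq_len]
attention_mask = torch.ones(2, 16)            # assumed convention: 1 = attend
output = model(input_ids, attention_mask)     # [2, 16, 64]
print(output.shape)
```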

bsbr_extras.linear_transformer.LinearTransformerModel

Bases: Module

Full Linear Transformer model stacking multiple Linear Transformer layers.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `vocab_size` | `int` | Vocabulary size for embedding layer | required |
| `hidden_dim` | `int` | Hidden dimension size | required |
| `num_layers` | `int` | Number of LinearTransformer layers | required |
| `num_heads` | `int` | Number of attention heads | required |
| `ff_dim` | `int` | Feed-forward intermediate dimension | required |
| `dropout` | `float` | Dropout probability | `0.1` |

Source code in src/bsbr_extras/linear_transformer.py
class LinearTransformerModel(nn.Module):
    """
    Full Linear Transformer model stacking multiple Linear Transformer layers.

    Args:
        vocab_size (int): Vocabulary size for embedding layer
        hidden_dim (int): Hidden dimension size
        num_layers (int): Number of LinearTransformer layers
        num_heads (int): Number of attention heads
        ff_dim (int): Feed-forward intermediate dimension
        dropout (float): Dropout probability
    """
    def __init__(
        self,
        vocab_size: int,
        hidden_dim: int,
        num_layers: int,
        num_heads: int,
        ff_dim: int,
        dropout: float = 0.1
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_dim)
        self.pos_encoding = PositionalEncoding(hidden_dim, dropout)

        self.layers = nn.ModuleList([
            LinearTransformerLayer(
                hidden_dim=hidden_dim,
                num_heads=num_heads,
                ff_dim=ff_dim,
                dropout=dropout
            )
            for _ in range(num_layers)
        ])

        self.layer_norm = nn.LayerNorm(hidden_dim)
        self.num_layers = num_layers

    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None,
        states: Optional[list] = None
    ) -> Tuple[torch.Tensor, list]:
        """
        Forward pass for the full Linear Transformer model.

        Args:
            input_ids: Token IDs of shape [batch_size, seq_len]
            attention_mask: Optional attention mask of shape [batch_size, seq_len]
            states: Optional previous state list for each layer

        Returns:
            output: Processed tensor of shape [batch_size, seq_len, hidden_dim]
            new_states: Updated state list for each layer
        """
        hidden_states = self.embedding(input_ids)
        hidden_states = self.pos_encoding(hidden_states)

        # Initialize states if not provided
        if states is None:
            states = [None] * self.num_layers

        new_states = []

        for i, layer in enumerate(self.layers):
            hidden_states, new_state = layer(hidden_states, attention_mask, states[i])
            new_states.append(new_state)

        hidden_states = self.layer_norm(hidden_states)

        return hidden_states, new_states

forward(input_ids, attention_mask=None, states=None)

Forward pass for the full Linear Transformer model.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `input_ids` | `LongTensor` | Token IDs of shape `[batch_size, seq_len]` | required |
| `attention_mask` | `Optional[Tensor]` | Optional attention mask of shape `[batch_size, seq_len]` | `None` |
| `states` | `Optional[list]` | Optional previous state list for each layer | `None` |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| `output` | `Tensor` | Processed tensor of shape `[batch_size, seq_len, hidden_dim]` |
| `new_states` | `list` | Updated state list for each layer |

Source code in src/bsbr_extras/linear_transformer.py
def forward(
    self,
    input_ids: torch.LongTensor,
    attention_mask: Optional[torch.Tensor] = None,
    states: Optional[list] = None
) -> Tuple[torch.Tensor, list]:
    """
    Forward pass for the full Linear Transformer model.

    Args:
        input_ids: Token IDs of shape [batch_size, seq_len]
        attention_mask: Optional attention mask of shape [batch_size, seq_len]
        states: Optional previous state list for each layer

    Returns:
        output: Processed tensor of shape [batch_size, seq_len, hidden_dim]
        new_states: Updated state list for each layer
    """
    hidden_states = self.embedding(input_ids)
    hidden_states = self.pos_encoding(hidden_states)

    # Initialize states if not provided
    if states is None:
        states = [None] * self.num_layers

    new_states = []

    for i, layer in enumerate(self.layers):
        hidden_states, new_state = layer(hidden_states, attention_mask, states[i])
        new_states.append(new_state)

    hidden_states = self.layer_norm(hidden_states)

    return hidden_states, new_states
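
Because `forward` returns per-layer states alongside the output, a long input can be processed in segments with the recurrent state carried across calls. A minimal sketch (import path per the source path above; the configuration is illustrative, and `PositionalEncoding` is applied to each segment independently in this sketch):

```python
import torch
from bsbr_extras.linear_transformer import LinearTransformerModel

model = LinearTransformerModel(
    vocab_size=1000,
    hidden_dim=64,
    num_layers=2,
    num_heads=4,
    ff_dim=128,
)

# Split a longer sequence into segments and carry the per-layer states forward.
segments = torch.randint(0, 1000, (1, 48)).split(16, dim=1)   # three [1, 16] segments
states = None
for segment in segments:
    output, states = model(segment, states=states)            # output: [1, 16, 64]

print(output.shape, len(states))                              # states: one entry per layer
```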

bsbr_extras.delta_net.DeltaNetModel

Bases: Module

Full DeltaNet model stacking multiple DeltaNet layers.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `vocab_size` | `int` | Vocabulary size for embedding layer | required |
| `hidden_dim` | `int` | Hidden dimension size | required |
| `num_layers` | `int` | Number of DeltaNet layers | required |
| `num_heads` | `int` | Number of attention heads | required |
| `ff_dim` | `int` | Feed-forward intermediate dimension | required |
| `beta` | `float` | Forgetting/update rate parameter (β in the paper) | `0.9` |
| `dropout` | `float` | Dropout probability | `0.1` |

Source code in src/bsbr_extras/delta_net.py
class DeltaNetModel(nn.Module):
    """
    Full DeltaNet model stacking multiple DeltaNet layers.

    Args:
        vocab_size (int): Vocabulary size for embedding layer
        hidden_dim (int): Hidden dimension size
        num_layers (int): Number of DeltaNet layers
        num_heads (int): Number of attention heads
        ff_dim (int): Feed-forward intermediate dimension
        beta (float): Forgetting/update rate parameter (β in the paper)
        dropout (float): Dropout probability
    """
    def __init__(
        self,
        vocab_size: int,
        hidden_dim: int,
        num_layers: int,
        num_heads: int,
        ff_dim: int,
        beta: float = 0.9,
        dropout: float = 0.1
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_dim)
        self.pos_encoding = PositionalEncoding(hidden_dim, dropout)

        self.layers = nn.ModuleList([
            DeltaNetLayer(
                hidden_dim=hidden_dim,
                num_heads=num_heads,
                ff_dim=ff_dim,
                beta=beta,
                dropout=dropout
            )
            for _ in range(num_layers)
        ])

        self.layer_norm = nn.LayerNorm(hidden_dim)
        self.num_layers = num_layers

    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None,
        states: Optional[list] = None
    ) -> Tuple[torch.Tensor, list]:
        """
        Forward pass for the full DeltaNet model.

        Args:
            input_ids: Token IDs of shape [batch_size, seq_len]
            attention_mask: Optional attention mask of shape [batch_size, seq_len]
            states: Optional previous state list for each layer

        Returns:
            output: Processed tensor of shape [batch_size, seq_len, hidden_dim]
            new_states: Updated state list for each layer
        """
        hidden_states = self.embedding(input_ids)
        hidden_states = self.pos_encoding(hidden_states)

        # Initialize states if not provided
        if states is None:
            states = [None] * self.num_layers

        new_states = []

        for i, layer in enumerate(self.layers):
            hidden_states, new_state = layer(hidden_states, attention_mask, states[i])
            new_states.append(new_state)

        hidden_states = self.layer_norm(hidden_states)

        return hidden_states, new_states

forward(input_ids, attention_mask=None, states=None)

Forward pass for the full DeltaNet model.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `input_ids` | `LongTensor` | Token IDs of shape `[batch_size, seq_len]` | required |
| `attention_mask` | `Optional[Tensor]` | Optional attention mask of shape `[batch_size, seq_len]` | `None` |
| `states` | `Optional[list]` | Optional previous state list for each layer | `None` |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| `output` | `Tensor` | Processed tensor of shape `[batch_size, seq_len, hidden_dim]` |
| `new_states` | `list` | Updated state list for each layer |

Source code in src/bsbr_extras/delta_net.py
def forward(
    self,
    input_ids: torch.LongTensor,
    attention_mask: Optional[torch.Tensor] = None,
    states: Optional[list] = None
) -> Tuple[torch.Tensor, list]:
    """
    Forward pass for the full DeltaNet model.

    Args:
        input_ids: Token IDs of shape [batch_size, seq_len]
        attention_mask: Optional attention mask of shape [batch_size, seq_len]
        states: Optional previous state list for each layer

    Returns:
        output: Processed tensor of shape [batch_size, seq_len, hidden_dim]
        new_states: Updated state list for each layer
    """
    hidden_states = self.embedding(input_ids)
    hidden_states = self.pos_encoding(hidden_states)

    # Initialize states if not provided
    if states is None:
        states = [None] * self.num_layers

    new_states = []

    for i, layer in enumerate(self.layers):
        hidden_states, new_state = layer(hidden_states, attention_mask, states[i])
        new_states.append(new_state)

    hidden_states = self.layer_norm(hidden_states)

    return hidden_states, new_states
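
A minimal sketch of stateful use (import path per the source path above; the hyperparameters, including `beta`, are illustrative):

```python
import torch
from bsbr_extras.delta_net import DeltaNetModel

model = DeltaNetModel(
    vocab_size=1000,
    hidden_dim=64,
    num_layers=2,
    num_heads=4,
    ff_dim=128,
    beta=0.9,      # forgetting/update rate, as documented above
)

first_ids = torch.randint(0, 1000, (2, 16))
output, states = model(first_ids)                 # states: one entry per layer

# Continue from the returned states on the next segment of the stream.
next_ids = torch.randint(0, 1000, (2, 16))
output, states = model(next_ids, states=states)   # output: [2, 16, 64]
```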

bsbr_extras.gau.GAUModel

Bases: Module

Full Gated Attention Unit model stacking multiple GAU layers.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `vocab_size` | `int` | Vocabulary size for embedding layer | required |
| `hidden_dim` | `int` | Hidden dimension size | required |
| `num_layers` | `int` | Number of GAU layers | required |
| `chunk_size` | `int` | Size of chunks for parallel processing | required |
| `ff_dim` | `int` | Feed-forward intermediate dimension | required |
| `expansion_factor` | `int` | Expansion factor for GAU | `2` |
| `dropout` | `float` | Dropout probability | `0.1` |

Source code in src/bsbr_extras/gau.py
class GAUModel(nn.Module):
    """
    Full Gated Attention Unit model stacking multiple GAU layers.

    Args:
        vocab_size (int): Vocabulary size for embedding layer
        hidden_dim (int): Hidden dimension size
        num_layers (int): Number of GAU layers
        chunk_size (int): Size of chunks for parallel processing
        ff_dim (int): Feed-forward intermediate dimension
        expansion_factor (int): Expansion factor for GAU
        dropout (float): Dropout probability
    """
    def __init__(
        self,
        vocab_size: int,
        hidden_dim: int,
        num_layers: int,
        chunk_size: int,
        ff_dim: int,
        expansion_factor: int = 2,
        dropout: float = 0.1
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_dim)
        self.pos_encoding = PositionalEncoding(hidden_dim, dropout)

        self.layers = nn.ModuleList([
            GAULayer(
                hidden_dim=hidden_dim,
                chunk_size=chunk_size,
                ff_dim=ff_dim,
                expansion_factor=expansion_factor,
                dropout=dropout
            )
            for _ in range(num_layers)
        ])

        self.layer_norm = nn.LayerNorm(hidden_dim)

    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        """
        Forward pass for the full GAU model.

        Args:
            input_ids: Token IDs of shape [batch_size, seq_len]
            attention_mask: Optional attention mask of shape [batch_size, seq_len]

        Returns:
            output: Processed tensor of shape [batch_size, seq_len, hidden_dim]
        """
        hidden_states = self.embedding(input_ids)
        hidden_states = self.pos_encoding(hidden_states)

        for layer in self.layers:
            hidden_states = layer(hidden_states, attention_mask)

        hidden_states = self.layer_norm(hidden_states)
        return hidden_states

forward(input_ids, attention_mask=None)

Forward pass for the full GAU model.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `input_ids` | `LongTensor` | Token IDs of shape `[batch_size, seq_len]` | required |
| `attention_mask` | `Optional[Tensor]` | Optional attention mask of shape `[batch_size, seq_len]` | `None` |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| `output` | `Tensor` | Processed tensor of shape `[batch_size, seq_len, hidden_dim]` |

Source code in src/bsbr_extras/gau.py
def forward(
    self,
    input_ids: torch.LongTensor,
    attention_mask: Optional[torch.Tensor] = None
) -> torch.Tensor:
    """
    Forward pass for the full GAU model.

    Args:
        input_ids: Token IDs of shape [batch_size, seq_len]
        attention_mask: Optional attention mask of shape [batch_size, seq_len]

    Returns:
        output: Processed tensor of shape [batch_size, seq_len, hidden_dim]
    """
    hidden_states = self.embedding(input_ids)
    hidden_states = self.pos_encoding(hidden_states)

    for layer in self.layers:
        hidden_states = layer(hidden_states, attention_mask)

    hidden_states = self.layer_norm(hidden_states)
    return hidden_states
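
A minimal usage sketch (import path per the source path above; the configuration is illustrative, and the sequence length is chosen as a multiple of `chunk_size` since `GAULayer` processes the sequence in chunks and may expect this):

```python
import torch
from bsbr_extras.gau import GAUModel

model = GAUModel(
    vocab_size=1000,
    hidden_dim=64,
    num_layers=2,
    chunk_size=8,
    ff_dim=128,
    expansion_factor=2,
)

input_ids = torch.randint(0, 1000, (2, 32))   # seq_len = 32, a multiple of chunk_size = 8
output = model(input_ids)                     # [2, 32, 64]
print(output.shape)
```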

bsbr_extras.hopfield_network.HopfieldNetworkModel

Bases: Module

Full Hopfield Network model stacking multiple Hopfield Network layers.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `vocab_size` | `int` | Vocabulary size for embedding layer | required |
| `hidden_dim` | `int` | Hidden dimension size | required |
| `num_layers` | `int` | Number of Hopfield Network layers | required |
| `num_heads` | `int` | Number of attention heads | required |
| `ff_dim` | `int` | Feed-forward intermediate dimension | required |
| `temperature` | `float` | Temperature parameter for the Hopfield energy function | `1.0` |
| `dropout` | `float` | Dropout probability | `0.1` |

Source code in src/bsbr_extras/hopfield_network.py
class HopfieldNetworkModel(nn.Module):
    """
    Full Hopfield Network model stacking multiple Hopfield Network layers.

    Args:
        vocab_size (int): Vocabulary size for embedding layer
        hidden_dim (int): Hidden dimension size
        num_layers (int): Number of Hopfield Network layers
        num_heads (int): Number of attention heads
        ff_dim (int): Feed-forward intermediate dimension
        temperature (float): Temperature parameter for the Hopfield energy function
        dropout (float): Dropout probability
    """
    def __init__(
        self,
        vocab_size: int,
        hidden_dim: int,
        num_layers: int,
        num_heads: int,
        ff_dim: int,
        temperature: float = 1.0,
        dropout: float = 0.1
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_dim)
        self.pos_encoding = PositionalEncoding(hidden_dim, dropout)

        self.layers = nn.ModuleList([
            HopfieldNetworkLayer(
                hidden_dim=hidden_dim,
                num_heads=num_heads,
                ff_dim=ff_dim,
                temperature=temperature,
                dropout=dropout
            )
            for _ in range(num_layers)
        ])

        self.layer_norm = nn.LayerNorm(hidden_dim)
        self.num_layers = num_layers

    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None,
        states: Optional[list] = None
    ) -> Tuple[torch.Tensor, list]:
        """
        Forward pass for the full Hopfield Network model.

        Args:
            input_ids: Token IDs of shape [batch_size, seq_len]
            attention_mask: Optional attention mask of shape [batch_size, seq_len]
            states: Optional previous states list for each layer

        Returns:
            output: Processed tensor of shape [batch_size, seq_len, hidden_dim]
            new_states: Updated states list for each layer
        """
        hidden_states = self.embedding(input_ids)
        hidden_states = self.pos_encoding(hidden_states)

        # Initialize states if not provided
        if states is None:
            states = [None] * self.num_layers

        new_states = []

        for i, layer in enumerate(self.layers):
            hidden_states, new_state = layer(hidden_states, attention_mask, states[i])
            new_states.append(new_state)

        hidden_states = self.layer_norm(hidden_states)

        return hidden_states, new_states

forward(input_ids, attention_mask=None, states=None)

Forward pass for the full Hopfield Network model.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `input_ids` | `LongTensor` | Token IDs of shape `[batch_size, seq_len]` | required |
| `attention_mask` | `Optional[Tensor]` | Optional attention mask of shape `[batch_size, seq_len]` | `None` |
| `states` | `Optional[list]` | Optional previous states list for each layer | `None` |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| `output` | `Tensor` | Processed tensor of shape `[batch_size, seq_len, hidden_dim]` |
| `new_states` | `list` | Updated states list for each layer |

Source code in src/bsbr_extras/hopfield_network.py
def forward(
    self,
    input_ids: torch.LongTensor,
    attention_mask: Optional[torch.Tensor] = None,
    states: Optional[list] = None
) -> Tuple[torch.Tensor, list]:
    """
    Forward pass for the full Hopfield Network model.

    Args:
        input_ids: Token IDs of shape [batch_size, seq_len]
        attention_mask: Optional attention mask of shape [batch_size, seq_len]
        states: Optional previous states list for each layer

    Returns:
        output: Processed tensor of shape [batch_size, seq_len, hidden_dim]
        new_states: Updated states list for each layer
    """
    hidden_states = self.embedding(input_ids)
    hidden_states = self.pos_encoding(hidden_states)

    # Initialize states if not provided
    if states is None:
        states = [None] * self.num_layers

    new_states = []

    for i, layer in enumerate(self.layers):
        hidden_states, new_state = layer(hidden_states, attention_mask, states[i])
        new_states.append(new_state)

    hidden_states = self.layer_norm(hidden_states)

    return hidden_states, new_states
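
A minimal usage sketch (import path per the source path above; the hyperparameters, including `temperature`, are illustrative):

```python
import torch
from bsbr_extras.hopfield_network import HopfieldNetworkModel

model = HopfieldNetworkModel(
    vocab_size=1000,
    hidden_dim=64,
    num_layers=2,
    num_heads=4,
    ff_dim=128,
    temperature=1.0,
)

input_ids = torch.randint(0, 1000, (2, 16))
output, states = model(input_ids)             # [2, 16, 64], plus one state per layer

# The returned states can be fed back in for a subsequent segment.
output, states = model(torch.randint(0, 1000, (2, 16)), states=states)
```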

bsbr_extras.sliding_window_transformer.SlidingWindowTransformerModel

Bases: Module

Full Sliding Window Transformer model stacking multiple transformer layers.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `vocab_size` | `int` | Vocabulary size for embedding layer | required |
| `hidden_dim` | `int` | Hidden dimension size | required |
| `num_layers` | `int` | Number of transformer layers | required |
| `num_heads` | `int` | Number of attention heads | required |
| `window_size` | `int` | Size of the attention window | required |
| `ff_dim` | `int` | Feed-forward intermediate dimension | required |
| `dropout` | `float` | Dropout probability | `0.1` |

Source code in src/bsbr_extras/sliding_window_transformer.py
class SlidingWindowTransformerModel(nn.Module):
    """
    Full Sliding Window Transformer model stacking multiple transformer layers.

    Args:
        vocab_size (int): Vocabulary size for embedding layer
        hidden_dim (int): Hidden dimension size
        num_layers (int): Number of transformer layers
        num_heads (int): Number of attention heads
        window_size (int): Size of the attention window
        ff_dim (int): Feed-forward intermediate dimension
        dropout (float): Dropout probability
    """
    def __init__(
        self,
        vocab_size: int,
        hidden_dim: int,
        num_layers: int,
        num_heads: int,
        window_size: int,
        ff_dim: int,
        dropout: float = 0.1
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_dim)
        self.pos_encoding = PositionalEncoding(hidden_dim, dropout)

        self.layers = nn.ModuleList([
            SlidingWindowTransformerLayer(
                hidden_dim=hidden_dim,
                num_heads=num_heads,
                window_size=window_size,
                ff_dim=ff_dim,
                dropout=dropout
            )
            for _ in range(num_layers)
        ])

        self.layer_norm = nn.LayerNorm(hidden_dim)

    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        """
        Forward pass for the full Sliding Window Transformer model.

        Args:
            input_ids: Token IDs of shape [batch_size, seq_len]
            attention_mask: Optional attention mask of shape [batch_size, seq_len]

        Returns:
            output: Processed tensor of shape [batch_size, seq_len, hidden_dim]
        """
        hidden_states = self.embedding(input_ids)
        hidden_states = self.pos_encoding(hidden_states)

        for layer in self.layers:
            hidden_states = layer(hidden_states, attention_mask)

        hidden_states = self.layer_norm(hidden_states)
        return hidden_states

forward(input_ids, attention_mask=None)

Forward pass for the full Sliding Window Transformer model.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `input_ids` | `LongTensor` | Token IDs of shape `[batch_size, seq_len]` | required |
| `attention_mask` | `Optional[Tensor]` | Optional attention mask of shape `[batch_size, seq_len]` | `None` |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| `output` | `Tensor` | Processed tensor of shape `[batch_size, seq_len, hidden_dim]` |

Source code in src/bsbr_extras/sliding_window_transformer.py
def forward(
    self,
    input_ids: torch.LongTensor,
    attention_mask: Optional[torch.Tensor] = None
) -> torch.Tensor:
    """
    Forward pass for the full Sliding Window Transformer model.

    Args:
        input_ids: Token IDs of shape [batch_size, seq_len]
        attention_mask: Optional attention mask of shape [batch_size, seq_len]

    Returns:
        output: Processed tensor of shape [batch_size, seq_len, hidden_dim]
    """
    hidden_states = self.embedding(input_ids)
    hidden_states = self.pos_encoding(hidden_states)

    for layer in self.layers:
        hidden_states = layer(hidden_states, attention_mask)

    hidden_states = self.layer_norm(hidden_states)
    return hidden_states
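
A minimal usage sketch (import path per the source path above; the configuration and the 1-for-attend mask convention are illustrative assumptions):

```python
import torch
from bsbr_extras.sliding_window_transformer import SlidingWindowTransformerModel

model = SlidingWindowTransformerModel(
    vocab_size=1000,
    hidden_dim=64,
    num_layers=2,
    num_heads=4,
    window_size=8,     # each token attends within a local window of this size
    ff_dim=128,
)

input_ids = torch.randint(0, 1000, (2, 32))
attention_mask = torch.ones(2, 32)            # assumed convention: 1 = attend
output = model(input_ids, attention_mask)     # [2, 32, 64]
print(output.shape)
```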