Quick Start Guide
This guide will help you get started with BSBR quickly. We'll cover basic usage, model configuration, and common patterns.
Basic Usage
Creating a BSBR Model
import torch
from bsbr import BSBRModel
# Create a model with default settings
model = BSBRModel(
    vocab_size=10000,        # Size of your vocabulary
    hidden_dim=512,          # Hidden dimension of the model
    num_layers=4,            # Number of transformer layers
    num_heads=8,             # Number of attention heads
    chunk_size=128,          # Size of attention chunks
    ff_dim=2048,             # Feed-forward network dimension
    dropout=0.1,             # Dropout rate
    compression_factor=4     # Optional compression factor
)
# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
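As a quick sanity check, you can print the model's parameter count. This small illustrative snippet uses only standard PyTorch and makes no assumptions about BSBR internals:
# Count trainable parameters (plain PyTorch, works for any nn.Module)
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable parameters: {num_params:,}")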
Processing Input
# Create sample input
batch_size = 2
seq_length = 256
input_ids = torch.randint(0, 10000, (batch_size, seq_length))
attention_mask = torch.ones(batch_size, seq_length)
# Move inputs to the same device as the model
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
# Forward pass
outputs = model(input_ids, attention_mask)
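A quick way to see what the forward pass produces is to print the output's shape. This assumes the model returns a single tensor, which is also what the training example below relies on:
# Inspect the output shape; for a language-modeling setup the last dimension
# is expected to match the vocabulary size.
print(outputs.shape)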
Advanced Configuration
Customizing Attention
from bsbr import BSBRModel, BSBRAttention
# Create a custom attention layer
attention = BSBRAttention(
    hidden_dim=512,
    num_heads=8,
    chunk_size=128,
    compression_factor=4,
    dropout=0.1
)
# Use it in a model
model = BSBRModel(
    vocab_size=10000,
    hidden_dim=512,
    num_layers=4,
    num_heads=8,
    chunk_size=128,
    ff_dim=2048,
    dropout=0.1,
    attention_layer=attention  # Use custom attention
)
Using Different Models
BSBR provides several attention variants for comparison:
from bsbr_extras import (
    LinearTransformer,
    DeltaNet,
    SlidingWindowTransformer,
    HopfieldNetwork,
    GAU
)
# Linear Transformer
linear_model = LinearTransformer(
    vocab_size=10000,
    hidden_dim=512,
    num_layers=4,
    num_heads=8
)
# DeltaNet
deltanet_model = DeltaNet(
    vocab_size=10000,
    hidden_dim=512,
    num_layers=4,
    num_heads=8
)
# Sliding Window Transformer
window_model = SlidingWindowTransformer(
    vocab_size=10000,
    hidden_dim=512,
    num_layers=4,
    num_heads=8,
    window_size=128
)
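HopfieldNetwork and GAU are imported above but not shown. Assuming they follow the same constructor pattern as the other variants (an assumption to verify against the API Reference, since each variant may take additional options), they can be created the same way:
# Hopfield Network (constructor arguments assumed to mirror the variants above)
hopfield_model = HopfieldNetwork(
    vocab_size=10000,
    hidden_dim=512,
    num_layers=4,
    num_heads=8
)
# Gated Attention Unit (same assumption about the shared constructor pattern)
gau_model = GAU(
    vocab_size=10000,
    hidden_dim=512,
    num_layers=4,
    num_heads=8
)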
Training Example
Here's a basic training loop:
import torch.nn as nn
from torch.optim import Adam

vocab_size = 10000  # must match the vocabulary size used by the model and tokenizer

# Create model and move to device
model = BSBRModel(
    vocab_size=vocab_size,
    hidden_dim=512,
    num_layers=4,
    num_heads=8
).to(device)

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)
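# The loop below assumes `dataloader` yields (input_ids, attention_mask, labels)
# batches and that `num_epochs` is set. As a purely illustrative stand-in, you can
# build a toy dataloader of random tokens (replace it with your real dataset):
from torch.utils.data import DataLoader, TensorDataset

num_epochs = 2
dummy_inputs = torch.randint(0, vocab_size, (64, 256))
dummy_masks = torch.ones(64, 256)
dummy_labels = torch.randint(0, vocab_size, (64, 256))
dataloader = DataLoader(
    TensorDataset(dummy_inputs, dummy_masks, dummy_labels),
    batch_size=8
)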
# Training loop
model.train()
for epoch in range(num_epochs):
    for batch in dataloader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs.view(-1, vocab_size), labels.view(-1))

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
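After training, switch the model to evaluation mode and disable gradient tracking for inference. This is standard PyTorch usage and does not depend on anything BSBR-specific:
# Run inference without building the autograd graph
model.eval()
with torch.no_grad():
    outputs = model(input_ids, attention_mask)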
Evaluation
BSBR provides tools for evaluating different models:
from evals.compare_models import compare_models
# Compare models across different sequence lengths
results = compare_models(
    models=["BSBR", "Linear", "Hopfield", "GAU"],
    seq_lengths=[64, 128, 256, 512, 1024]
)
# Analyze results
from evals.analyze_results import analyze_results
analysis = analyze_results(results)
Next Steps
- Explore the User Guide for detailed explanations
- Check out Examples for more use cases
- Read the API Reference for complete documentation