Quick Start¶
Graft looped attention onto CogVideoX-2B¶
from canvas_engineering import graft_looped_blocks, CurriculumScheduler
from diffusers import CogVideoXTransformer3DModel
import torch
# Load pretrained video diffusion model
transformer = CogVideoXTransformer3DModel.from_pretrained(
"THUDM/CogVideoX-2b", subfolder="transformer", torch_dtype=torch.bfloat16
)
# Graft 3-loop attention onto all 30 frozen DiT blocks
looped_blocks, action_head = graft_looped_blocks(
transformer,
max_loops=3, # 3 is optimal (empirically validated)
freeze="full", # freeze backbone, train only loop params
action_dim=7, # 6DOF end-effector + gripper
)
# Only 350K params to optimize
optimizer = torch.optim.AdamW(
[p for b in looped_blocks for p in b.parameters() if p.requires_grad]
+ list(action_head.parameters()),
lr=1e-4,
)
# Curriculum: gradually ramp from 1 to 3 loops during training
scheduler = CurriculumScheduler(max_loops=3, total_steps=5000)
That's it. The frozen 1.69B-parameter backbone now loops its computation 3 times per forward pass, with learned iteration embeddings that cost 0.02% of the model.
Define a canvas layout¶
from canvas_engineering import CanvasLayout, RegionSpec, SpatiotemporalCanvas
layout = CanvasLayout(
T=5, H=8, W=8, d_model=256,
regions={
"visual": (0, 5, 0, 6, 0, 6), # 180 positions — video patches
"action": RegionSpec(
bounds=(0, 5, 6, 7, 0, 1),
loss_weight=2.0, # emphasize action accuracy
),
"reward": RegionSpec(
bounds=(2, 3, 7, 8, 0, 1),
period=5, # low-frequency
),
},
t_current=2,
)
canvas = SpatiotemporalCanvas(layout)
batch = canvas.create_empty(batch_size=4) # (4, 320, 256)
batch = canvas.place(batch, visual_embs, "visual") # write video patches
actions = canvas.extract(batch, "action") # read predictions
Define a topology¶
from canvas_engineering import Connection, CanvasTopology
topology = CanvasTopology(connections=[
Connection(src="visual", dst="visual"), # self-attention
Connection(src="action", dst="visual"), # action reads visual
Connection(src="action", dst="action"), # action self-attention
Connection(src="reward", dst="visual", fn="pooling"), # cheap summary
Connection(src="reward", dst="action", fn="gated"), # optional conditioning
])
# Compile to attention mask
mask = topology.to_attention_mask(layout) # (320, 320) float tensor
Compile with process semantics (v2)¶
Use compile_program() to get typed process semantics with auto-wired operators:
from dataclasses import dataclass
from canvas_engineering import Field, compile_program
@dataclass
class Robot:
camera: Field = Field(6, 6, family="observation")
belief: Field = Field(4, 4, family="state", tags=("belief",))
action: Field = Field(1, 4, family="action", loss_weight=2.0)
bound, program = compile_program(Robot(), T=8, d_model=256)
# program.regions has typed metadata per field:
print(program.regions["camera"].family) # "observation"
print(program.regions["belief"].carrier) # "deterministic"
# Auto-wired operators on connections:
for c in program.schema.topology.connections:
if c.operator != "attend":
print(f"{c.src} → {c.dst}: {c.operator}")
# camera → belief: observe
# belief → camera: predict
# belief → action: act
compile_schema() still works unchanged — compile_program() wraps it and adds the program layer.