part("6) pack unpack")
# 6) pack/unpack: run heterogeneous token groups through one module in a single pass.
B, Cemb = 2, 128
# Three token groups sharing batch and embed dims but with different sequence lengths.
# NOTE(review): `machine` looks like a mistranslated `device` handle defined earlier in
# the file — confirm it is a torch.device.
class_token = torch.randn(B, 1, Cemb, device=machine)
image_tokens = torch.randn(B, 196, Cemb, device=machine)
text_tokens = torch.randn(B, 32, Cemb, device=machine)
show_shape("class_token", class_token)
show_shape("image_tokens", image_tokens)
show_shape("text_tokens", text_tokens)
# pack concatenates along the `*` axis and records each group's length in `ps`.
packed, ps = pack([class_token, image_tokens, text_tokens], "b * c")
show_shape("packed", packed)
print("packed_shapes (ps):", ps)
# Token-wise MLP applied once to the whole packed sequence.
mixer = nn.Sequential(
    nn.LayerNorm(Cemb),
    nn.Linear(Cemb, 4 * Cemb),
    nn.GELU(),
    nn.Linear(4 * Cemb, Cemb),
).to(machine)
blended = mixer(packed)
show_shape("blended", blended)
# unpack splits the result back into the original groups using the recorded lengths.
class_out, image_out, text_out = unpack(blended, ps, "b * c")
show_shape("class_out", class_out)
show_shape("image_out", image_out)
show_shape("text_out", text_out)
# Round-trip check: each group keeps its original shape.  (fix: `.form` -> `.shape`)
assert class_out.shape == class_token.shape
assert image_out.shape == image_tokens.shape
assert text_out.shape == text_tokens.shape
part("7) layers")
class PatchEmbed(nn.Module):
    """Split an image into non-overlapping patches and linearly embed each one.

    Input:  (B, in_channels, H, W) with H and W divisible by ``patch``.
    Output: (B, num_patches, emb_dim), one token per patch.
    """

    def __init__(self, in_channels=3, emb_dim=192, patch=8):
        super().__init__()  # fix: was the mistranslated `tremendous()`
        self.patch = patch
        # (b, c, h*p1, w*p2) -> (b, h*w, p1*p2*c): flatten each patch into one vector.
        self.to_patches = Rearrange("b c (h p1) (w p2) -> b (h w) (p1 p2 c)", p1=patch, p2=patch)
        self.proj = nn.Linear(in_channels * patch * patch, emb_dim)

    def forward(self, x):  # fix: nn.Module dispatches to `forward`, not `ahead`
        x = self.to_patches(x)
        return self.proj(x)
class SimpleVisionHead(nn.Module):
    """Mean-pool tokens over the sequence axis, then project to class logits.

    Input:  (B, T, emb_dim) tokens.
    Output: (B, num_classes) logits.
    """

    def __init__(self, emb_dim=192, num_classes=10):
        super().__init__()  # fix: was the mistranslated `tremendous()`
        # fix: `Scale back(..., discount="imply")` was a syntax error; the einops layer
        # is `Reduce` with `reduction="mean"` — (b, t, c) -> (b, c) by averaging.
        self.pool = Reduce("b t c -> b c", reduction="mean")
        self.classifier = nn.Linear(emb_dim, num_classes)

    def forward(self, tokens):  # fix: nn.Module dispatches to `forward`, not `ahead`
        x = self.pool(tokens)
        return self.classifier(x)
# 7) Use the layers: patchify a small batch of images and classify.
patch_embed = PatchEmbed(in_channels=3, emb_dim=192, patch=8).to(machine)
head = SimpleVisionHead(emb_dim=192, num_classes=10).to(machine)
imgs = torch.randn(4, 3, 32, 32, device=machine)  # fix: torch kwarg is `device=`
tokens = patch_embed(imgs)
logits = head(tokens)
show_shape("tokens", tokens)
show_shape("logits", logits)
part("8) sensible")
# Hand-rolled GroupNorm: split channels into g groups, normalize within each group.
x = torch.randn(2, 32, 16, 16, device=machine)
g = 8
xg = rearrange(x, "b (g cg) h w -> (b g) cg h w", g=g)
show_shape("x", x)
show_shape("xg", xg)
# fix: `cut back(..., "imply")` was a syntax error; einops function is `reduce`
# with reduction string "mean".
mean = reduce(xg, "bg cg h w -> bg 1 1 1", "mean")
var = reduce((xg - mean) ** 2, "bg cg h w -> bg 1 1 1", "mean")
xg_norm = (xg - mean) / torch.sqrt(var + 1e-5)  # 1e-5 eps avoids divide-by-zero
x_norm = rearrange(xg_norm, "(b g) cg h w -> b (g cg) h w", b=2, g=g)
show_shape("x_norm", x_norm)
# Flatten / unflatten round-trip must be exact.
z = torch.randn(3, 64, 20, 30, device=machine)
z_flat = rearrange(z, "b c h w -> b c (h w)")
z_unflat = rearrange(z_flat, "b c (h w) -> b c h w", h=20, w=30)
assert (z - z_unflat).abs().max().item() < 1e-6  # fix: `.merchandise()` -> `.item()`
show_shape("z_flat", z_flat)
part("9) views")
# rearrange returns a non-contiguous view when possible; `_base` points at the source.
a = torch.randn(2, 3, 4, 5, device=machine)
b = rearrange(a, "b c h w -> b h w c")
print("a.is_contiguous():", a.is_contiguous())
print("b.is_contiguous():", b.is_contiguous())
print("b._base is a:", getattr(b, "_base", None) is a)
part("Executed ✅ You now have reusable einops patterns for imaginative and prescient, consideration, and multimodal token packing")
