Part 1 covered the math — convolution, weight sharing, pooling, hierarchical features.
Now we translate that into code. This post walks through a pure PyTorch implementation: a custom Conv2D layer, pooling operations, and three CNN architectures of increasing depth.
The Convolutional Layer
Our Conv2D layer wraps the sliding-window operation with learnable kernels, delegating the actual convolution to F.conv2d and using Xavier initialization for stable early training.
import torch
import torch.nn as nn
import torch.nn.functional as F

class Conv2D(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size,
                 stride=1, padding=0, bias=True):
        super().__init__()
        self.stride = stride
        self.padding = padding
        # Learnable convolution kernels
        # Shape: (out_channels, in_channels, kernel_size, kernel_size)
        self.weight = nn.Parameter(
            torch.randn(out_channels, in_channels, kernel_size, kernel_size)
        )
        # Bias is optional; keep the attribute defined either way so forward() works
        if bias:
            self.bias = nn.Parameter(torch.zeros(out_channels))
        else:
            self.bias = None
        # Initialize weights using Xavier initialization
        nn.init.xavier_uniform_(self.weight)

    def forward(self, x):
        if self.padding > 0:
            x = F.pad(x, (self.padding,) * 4)   # zero-pad left/right/top/bottom
        return F.conv2d(x, self.weight, self.bias, stride=self.stride)
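As a quick sanity check (an addition of mine, not from the original walkthrough), the output spatial size should follow (H + 2p - k) / s + 1:

conv = Conv2D(in_channels=1, out_channels=8, kernel_size=3, stride=1, padding=1)
x = torch.randn(4, 1, 28, 28)    # batch of four 28x28 grayscale images
print(conv(x).shape)             # torch.Size([4, 8, 28, 28]): (28 + 2*1 - 3) // 1 + 1 = 28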
Pooling Layers
We implement both MaxPool2D and AveragePool2D for spatial downsampling: max pooling keeps the strongest activation in each window, while average pooling smooths over it.
class MaxPool2D(nn.Module):
    """Reduces spatial dimensions by taking the maximum in each window."""
    def __init__(self, kernel_size, stride=None, padding=0):
        super().__init__()
        self.kernel_size = kernel_size
        self.stride = stride if stride is not None else kernel_size
        self.padding = padding

    def forward(self, x):
        return F.max_pool2d(x, self.kernel_size, self.stride, self.padding)

class AveragePool2D(nn.Module):
    """Reduces spatial dimensions by taking the average in each window."""
    def __init__(self, kernel_size, stride=None, padding=0):
        super().__init__()
        self.kernel_size = kernel_size
        self.stride = stride if stride is not None else kernel_size
        self.padding = padding

    def forward(self, x):
        return F.avg_pool2d(x, self.kernel_size, self.stride, self.padding)
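A tiny example (mine, for illustration) makes the difference concrete on a single 2x2 window:

x = torch.tensor([[[[1.0, 2.0],
                    [3.0, 4.0]]]])          # shape (1, 1, 2, 2)
print(MaxPool2D(kernel_size=2)(x))          # tensor([[[[4.]]]])     keeps the peak
print(AveragePool2D(kernel_size=2)(x))      # tensor([[[[2.5000]]]]) blends the window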
Architecture 1: LeNet-5 (1998)
Yann LeCun's LeNet-5 for digit recognition. It uses tanh activations and average pooling — conventions of the pre-ReLU era.
class LeNet5(nn.Module):
    def __init__(self, num_classes=10, in_channels=1):
        super().__init__()
        # Feature extractor
        self.conv1 = Conv2D(in_channels, 6, kernel_size=5, padding=2)
        self.pool1 = AveragePool2D(kernel_size=2, stride=2)
        self.conv2 = Conv2D(6, 16, kernel_size=5)
        self.pool2 = AveragePool2D(kernel_size=2, stride=2)
        # Classifier (16 feature maps of 5x5 for a 28x28 input)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, num_classes)

    def forward(self, x):
        x = torch.tanh(self.conv1(x))   # Conv -> Tanh: 28x28 -> 28x28
        x = self.pool1(x)               # AvgPool: 28x28 -> 14x14
        x = torch.tanh(self.conv2(x))   # Conv -> Tanh: 14x14 -> 10x10
        x = self.pool2(x)               # AvgPool: 10x10 -> 5x5
        x = torch.flatten(x, 1)         # Flatten for FC layers
        x = torch.tanh(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        return self.fc3(x)
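A quick forward pass (a sanity check I'm adding here, assuming 28x28 MNIST-style inputs) confirms the shapes and the classic parameter count:

model = LeNet5()
x = torch.randn(2, 1, 28, 28)                       # batch of two grayscale digits
print(model(x).shape)                               # torch.Size([2, 10])
print(sum(p.numel() for p in model.parameters()))   # 61706 learnable parameters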
Architecture 2: SimpleCNN
The standard modern pattern — Conv → ReLU → Pool — with dropout:
class SimpleCNN(nn.Module):
    def __init__(self, num_classes=10, in_channels=1):
        super().__init__()
        self.conv1 = Conv2D(in_channels, 32, kernel_size=3, padding=1)
        self.conv2 = Conv2D(32, 64, kernel_size=3, padding=1)
        self.pool = MaxPool2D(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)   # 28x28 input -> 7x7 after two pools
        self.fc2 = nn.Linear(128, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = F.relu(self.conv1(x))       # 28x28 -> 28x28
        x = self.pool(x)                # 28x28 -> 14x14
        x = F.relu(self.conv2(x))       # 14x14 -> 14x14
        x = self.pool(x)                # 14x14 -> 7x7
        x = torch.flatten(x, 1)         # 64 * 7 * 7 = 3136
        x = self.dropout(F.relu(self.fc1(x)))
        return self.fc2(x)
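One practical aside (not from the original post): nn.Dropout only perturbs activations while the model is in training mode, so switch to eval mode before measuring accuracy:

model = SimpleCNN()

model.train()                    # training mode: dropout zeroes roughly half of fc1's activations each pass
out_train = model(torch.randn(1, 1, 28, 28))

model.eval()                     # eval mode: dropout disabled, forward passes are deterministic
with torch.no_grad():
    out_eval = model(torch.randn(1, 1, 28, 28))
print(out_eval.shape)            # torch.Size([1, 10])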
Architecture 3: DeepCNN (VGG-Style)
A deeper network built from reusable CNN blocks — Conv → BatchNorm → ReLU → Pool — doubling channels at each stage:
class CNNBlock(nn.Module):
    """Reusable Conv -> BatchNorm -> ReLU -> Pool block."""
    def __init__(self, in_channels, out_channels, kernel_size=3, pool=True):
        super().__init__()
        self.conv = Conv2D(in_channels, out_channels, kernel_size,
                           padding=kernel_size // 2)
        self.bn = nn.BatchNorm2d(out_channels)
        self.pool = MaxPool2D(kernel_size=2, stride=2) if pool else nn.Identity()

    def forward(self, x):
        return self.pool(F.relu(self.bn(self.conv(x))))

class DeepCNN(nn.Module):
    """VGG-inspired architecture with stacked CNN blocks."""
    def __init__(self, num_classes=10, in_channels=1,
                 num_blocks=4, initial_channels=32):
        super().__init__()
        blocks = []
        prev_channels = in_channels
        channels = initial_channels
        for i in range(num_blocks):
            blocks.append(CNNBlock(
                prev_channels, channels,
                pool=(i < num_blocks - 1)
            ))
            prev_channels = channels
            if i < num_blocks - 1:
                channels *= 2   # Double channels: 32 -> 64 -> 128 -> 256
        self.features = nn.Sequential(*blocks)
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Dropout(0.5),
            nn.Linear(channels * 3 * 3, 256),   # 28x28 input -> 3x3 after three pools
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        return self.classifier(self.features(x))
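And a matching sanity check for DeepCNN (again assuming 28x28 inputs, as elsewhere in this post): the three pooled stages shrink 28 -> 14 -> 7 -> 3, so the classifier sees 256 x 3 x 3 = 2304 features:

model = DeepCNN()
x = torch.randn(2, 1, 28, 28)
print(model.features(x).shape)   # torch.Size([2, 256, 3, 3])
print(model(x).shape)            # torch.Size([2, 10])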
Training Strategy
We train SimpleCNN on MNIST with:
- Cross-entropy loss for multi-class classification
- Adam optimizer at learning rate $0.001$
- StepLR scheduler — halves LR every 5 epochs
- Dropout ($p=0.5$) after the hidden FC layer
model = SimpleCNN(num_classes=10, in_channels=1).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

for epoch in range(1, EPOCHS + 1):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
    test_loss, test_acc = evaluate(model, test_loader, criterion, device)
    scheduler.step()
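The loop relies on train_epoch and evaluate helpers that aren't shown here. A minimal sketch of what they could look like, assuming the standard pattern of accumulating loss and accuracy over each loader (my illustration, not the post's exact code):

def train_epoch(model, loader, criterion, optimizer, device):
    """One pass over the training set; returns mean loss and accuracy."""
    model.train()
    total_loss, correct, count = 0.0, 0, 0
    for images, labels in loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        logits = model(images)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * images.size(0)
        correct += (logits.argmax(dim=1) == labels).sum().item()
        count += images.size(0)
    return total_loss / count, correct / count

def evaluate(model, loader, criterion, device):
    """Evaluation pass with dropout disabled; returns mean loss and accuracy."""
    model.eval()
    total_loss, correct, count = 0.0, 0, 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            logits = model(images)
            total_loss += criterion(logits, labels).item() * images.size(0)
            correct += (logits.argmax(dim=1) == labels).sum().item()
            count += images.size(0)
    return total_loss / count, correct / count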
Up Next
That covers the implementation — custom convolutional layers through three complete architectures. Part 3 trains this on MNIST, visualizes the learned filters, and analyzes feature maps at each layer.