In Part 1, we explored the mathematical foundations of convolutions — the sliding window operation, weight sharing, pooling, and hierarchical feature learning.
Today, we take that math and translate it into a pure PyTorch implementation. We will build every component from scratch: a custom Conv2D layer, pooling operations, activation functions, and three complete CNN architectures of increasing depth.
The Convolutional Layer
Our custom Conv2D layer implements the sliding window operation with learnable kernels. We use Xavier initialization to ensure stable gradient flow at the start of training.
class Conv2D(nn.Module):
    """2D convolution with learnable kernels, applied via F.conv2d.

    Weight shape: (out_channels, in_channels, kernel_size, kernel_size),
    Xavier-uniform initialized for stable gradient flow at the start of
    training.
    """

    def __init__(self, in_channels, out_channels, kernel_size,
                 stride=1, padding=0, bias=True):
        super().__init__()
        # Bug fix: stride/padding were never stored, but forward() reads
        # self.stride and self.padding.
        self.stride = stride
        self.padding = padding
        # Learnable convolution kernels
        self.weight = nn.Parameter(
            torch.randn(out_channels, in_channels, kernel_size, kernel_size)
        )
        if bias:
            self.bias = nn.Parameter(torch.zeros(out_channels))
        else:
            # Bug fix: forward() passes self.bias unconditionally; register
            # None so bias=False does not raise AttributeError.
            self.register_parameter("bias", None)
        # Initialize weights using Xavier initialization
        nn.init.xavier_uniform_(self.weight)

    def forward(self, x):
        if self.padding > 0:
            # Pad left/right/top/bottom symmetrically.
            x = F.pad(x, (self.padding,) * 4)
        return F.conv2d(x, self.weight, self.bias, stride=self.stride)
Pooling Layers
We implement both MaxPool2D and AveragePool2D for spatial downsampling. Max pooling preserves the strongest activations while average pooling smooths the representation.
class MaxPool2D(nn.Module):
    """Reduces spatial dimensions by taking the maximum in each window."""

    def __init__(self, kernel_size, stride=None, padding=0):
        super().__init__()
        self.kernel_size = kernel_size
        # Default stride equals the window size (non-overlapping pooling).
        self.stride = stride if stride is not None else kernel_size
        # Bug fix: padding was read in forward() but never stored.
        self.padding = padding

    def forward(self, x):
        return F.max_pool2d(x, self.kernel_size, self.stride, self.padding)
class AveragePool2D(nn.Module):
    """Reduces spatial dimensions by taking the average in each window."""

    def __init__(self, kernel_size, stride=None, padding=0):
        super().__init__()
        self.kernel_size = kernel_size
        # Default stride equals the window size (non-overlapping pooling).
        self.stride = stride if stride is not None else kernel_size
        # Bug fix: padding was read in forward() but never stored.
        self.padding = padding

    def forward(self, x):
        return F.avg_pool2d(x, self.kernel_size, self.stride, self.padding)
Architecture 1: LeNet-5 (1998)
One of the earliest successful CNNs, designed by Yann LeCun for handwritten digit recognition. LeNet-5 uses tanh activations and average pooling — reflecting the conventions of the era before ReLU.
class LeNet5(nn.Module):
    """LeNet-5 (LeCun, 1998): tanh activations and average pooling.

    Sized for 28x28 single-channel input; conv1's padding=2 reproduces the
    32x32 receptive field used in the original paper.
    """

    def __init__(self, num_classes=10, in_channels=1):
        super().__init__()
        # Feature extractor
        self.conv1 = Conv2D(in_channels, 6, kernel_size=5, padding=2)  # 28x28 -> 28x28
        self.pool1 = AveragePool2D(kernel_size=2, stride=2)            # -> 14x14
        self.conv2 = Conv2D(6, 16, kernel_size=5)                      # -> 10x10
        self.pool2 = AveragePool2D(kernel_size=2, stride=2)            # -> 5x5
        # Classifier. Bug fix: the final feature map is 16 x 5 x 5 = 400
        # features (not 16 x 6 x 6), matching the classic LeNet-5 layout.
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, num_classes)

    def forward(self, x):
        x = torch.tanh(self.conv1(x))  # Conv -> Tanh
        x = self.pool1(x)              # AvgPool
        x = torch.tanh(self.conv2(x))  # Conv -> Tanh
        x = self.pool2(x)              # AvgPool
        # Bug fix: flatten from dim 1 so the batch dimension survives.
        x = torch.flatten(x, 1)
        x = torch.tanh(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        return self.fc3(x)
Architecture 2: SimpleCNN
A modern architecture demonstrating the classic Conv → ReLU → Pool pattern with dropout regularization:
class SimpleCNN(nn.Module):
    """Two Conv -> ReLU -> MaxPool stages plus a dropout-regularized
    classifier head. Sized for 28x28 single-channel input (e.g. MNIST).
    """

    def __init__(self, num_classes=10, in_channels=1):
        super().__init__()
        self.conv1 = Conv2D(in_channels, 32, kernel_size=3, padding=1)
        self.conv2 = Conv2D(32, 64, kernel_size=3, padding=1)
        self.pool = MaxPool2D(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)  # 28x28 input -> 7x7 after two pools
        self.fc2 = nn.Linear(128, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = F.relu(self.conv1(x))  # 28x28 -> 28x28
        x = self.pool(x)           # 28x28 -> 14x14
        x = F.relu(self.conv2(x))  # 14x14 -> 14x14
        x = self.pool(x)           # 14x14 -> 7x7
        # Bug fix: flatten from dim 1 to keep the batch dimension intact.
        x = torch.flatten(x, 1)    # -> (N, 64 * 7 * 7) = (N, 3136)
        x = self.dropout(F.relu(self.fc1(x)))
        return self.fc2(x)
Architecture 3: DeepCNN (VGG-Style)
A deeper network using reusable CNN blocks with BatchNorm. Each block follows the pattern Conv → BatchNorm → ReLU → Pool, doubling channels at each stage:
class CNNBlock(nn.Module):
    """Reusable Conv -> BatchNorm -> ReLU -> Pool block."""

    def __init__(self, in_channels, out_channels, kernel_size=3, pool=True):
        super().__init__()
        # Same-padding convolution so only pooling changes the spatial size.
        self.conv = Conv2D(in_channels, out_channels, kernel_size,
                           padding=kernel_size // 2)
        self.bn = nn.BatchNorm2d(out_channels)
        self.pool = MaxPool2D(kernel_size=2, stride=2) if pool else nn.Identity()

    def forward(self, x):
        # Bug fix: qualify the activation as F.relu (the bare name `relu`
        # is undefined), consistent with the file's other F.* calls.
        return self.pool(F.relu(self.bn(self.conv(x))))
class DeepCNN(nn.Module):
    """VGG-inspired architecture with stacked CNN blocks.

    Channels double at every stage except the last; the last block skips
    pooling so a 28x28 input reaches the classifier as a 3x3 feature map
    (28 -> 14 -> 7 -> 3 with the default four blocks).
    """

    def __init__(self, num_classes=10, in_channels=1,
                 num_blocks=4, initial_channels=32):
        super().__init__()
        blocks = []
        channels = initial_channels
        for i in range(num_blocks):
            blocks.append(CNNBlock(
                in_channels if i == 0 else channels, channels,
                pool=(i < num_blocks - 1)  # final block keeps spatial size
            ))
            if i < num_blocks - 1:
                channels *= 2  # Double channels per stage: 32 -> 64 -> 128 -> 256
        self.features = nn.Sequential(*blocks)
        self.classifier = nn.Sequential(
            nn.Flatten(),  # Bug fix: bare `Flatten()` is undefined; use nn.Flatten
            nn.Dropout(0.5),
            nn.Linear(channels * 3 * 3, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        # Bug fix: forward() was missing entirely, making the model uncallable.
        return self.classifier(self.features(x))
Training Strategy
We train our SimpleCNN on MNIST with:
- Cross-entropy loss for multi-class classification
- Adam optimizer with learning rate $0.001$
- StepLR scheduler halving the learning rate every 5 epochs
- Dropout regularization ($p=0.5$) in fully connected layers
# Training setup: cross-entropy loss, Adam at lr=1e-3, and a StepLR
# schedule that halves the learning rate every 5 epochs.
model = SimpleCNN(num_classes=10, in_channels=1).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

for epoch in range(1, EPOCHS + 1):
    # One pass over the training set, then a held-out evaluation.
    tr_loss, tr_acc = train_epoch(model, train_loader, criterion, optimizer, device)
    val_loss, val_acc = evaluate(model, test_loader, criterion, device)
    scheduler.step()  # decay the learning rate once per epoch
Next Steps: Visualizing What CNNs Learn
We now have a complete implementation — from custom convolutional layers to three full architectures. In Part 3, we will train on MNIST, visualize the learned convolutional filters, and analyze the feature map activations to understand what the network actually detects at each layer.