Back to CNNs Hub

Deconstructing CNNs from Scratch

Part 2: Pure PyTorch Implementation

Introduction

Part 1 covered the math — convolution, weight sharing, pooling, hierarchical features.

Now we translate that into code. This post walks through a pure PyTorch implementation: a custom Conv2D layer, pooling operations, and three CNN architectures of increasing depth.

View Full Setup on GitHub

The Convolutional Layer

Our Conv2D wraps the sliding window operation with learnable kernels and Xavier initialization for stable early training.

class Conv2D(nn.Module):
    """2D convolution with learnable kernels and Xavier-initialized weights.

    Args:
        in_channels: Number of channels in the input feature map.
        out_channels: Number of kernels (output channels).
        kernel_size: Side length of the square kernel.
        stride: Step of the sliding window (default 1).
        padding: Zero-padding added to all four sides (default 0).
        bias: If True, add a learnable per-channel bias (default True).
    """
    def __init__(self, in_channels, out_channels, kernel_size,
                 stride=1, padding=0, bias=True):
        super().__init__()
        # Bug fix: stride/padding were read in forward() but never stored,
        # raising AttributeError on the first call.
        self.stride = stride
        self.padding = padding

        # Learnable convolution kernels
        # Shape: (out_channels, in_channels, kernel_size, kernel_size)
        self.weight = nn.Parameter(
            torch.randn(out_channels, in_channels, kernel_size, kernel_size)
        )
        if bias:
            self.bias = nn.Parameter(torch.zeros(out_channels))
        else:
            # Bug fix: register bias as None so forward() (and state_dict)
            # work when bias=False instead of raising AttributeError.
            self.register_parameter('bias', None)

        # Initialize weights using Xavier initialization
        nn.init.xavier_uniform_(self.weight)

    def forward(self, x):
        # Explicit zero-padding on all four sides, then the strided convolution.
        if self.padding > 0:
            x = F.pad(x, (self.padding,) * 4)
        return F.conv2d(x, self.weight, self.bias, stride=self.stride)

Pooling Layers

Both MaxPool2D and AveragePool2D for spatial downsampling. Max pooling keeps the strongest activations; average pooling smooths them.

class MaxPool2D(nn.Module):
    """Reduces spatial dimensions by taking the maximum in each window.

    Args:
        kernel_size: Side length of the pooling window.
        stride: Window step; defaults to kernel_size (non-overlapping windows).
        padding: Implicit padding added before pooling (default 0).
    """
    def __init__(self, kernel_size, stride=None, padding=0):
        super().__init__()
        self.kernel_size = kernel_size
        self.stride = stride if stride is not None else kernel_size
        # Bug fix: padding was read in forward() but never stored,
        # raising AttributeError on the first call.
        self.padding = padding

    def forward(self, x):
        return F.max_pool2d(x, self.kernel_size, self.stride, self.padding)


class AveragePool2D(nn.Module):
    """Reduces spatial dimensions by taking the average in each window.

    Args:
        kernel_size: Side length of the pooling window.
        stride: Window step; defaults to kernel_size (non-overlapping windows).
        padding: Implicit zero-padding added before pooling (default 0).
    """
    def __init__(self, kernel_size, stride=None, padding=0):
        super().__init__()
        self.kernel_size = kernel_size
        self.stride = stride if stride is not None else kernel_size
        # Bug fix: padding was read in forward() but never stored,
        # raising AttributeError on the first call.
        self.padding = padding

    def forward(self, x):
        return F.avg_pool2d(x, self.kernel_size, self.stride, self.padding)

Architecture 1: LeNet-5 (1998)

Yann LeCun's LeNet-5 for digit recognition. It uses tanh activations and average pooling — conventions of the pre-ReLU era.

class LeNet5(nn.Module):
    """LeNet-5 (LeCun, 1998): tanh activations + average pooling.

    Sized for 28x28 inputs (MNIST). padding=2 on conv1 keeps the first
    feature map at 28x28, mirroring the original 32x32 design.

    Args:
        num_classes: Number of output classes (default 10).
        in_channels: Input image channels (default 1, grayscale).
    """
    def __init__(self, num_classes=10, in_channels=1):
        super().__init__()
        # Feature extractor
        self.conv1 = Conv2D(in_channels, 6, kernel_size=5, padding=2)
        self.pool1 = AveragePool2D(kernel_size=2, stride=2)
        self.conv2 = Conv2D(6, 16, kernel_size=5)
        self.pool2 = AveragePool2D(kernel_size=2, stride=2)

        # Classifier. Bug fix: for a 28x28 input the spatial flow is
        # 28 -> (pool) 14 -> (5x5 conv, no pad) 10 -> (pool) 5, so the
        # flattened size is 16 * 5 * 5 = 400, not 16 * 6 * 6.
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, num_classes)

    def forward(self, x):
        x = torch.tanh(self.conv1(x))  # Conv -> Tanh, 28x28 -> 28x28
        x = self.pool1(x)              # AvgPool, 28x28 -> 14x14
        x = torch.tanh(self.conv2(x))  # Conv -> Tanh, 14x14 -> 10x10
        x = self.pool2(x)              # AvgPool, 10x10 -> 5x5
        x = torch.flatten(x, 1)        # keep the batch dim: (N, 400)
        x = torch.tanh(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        return self.fc3(x)             # raw logits for CrossEntropyLoss

Architecture 2: SimpleCNN

The standard modern pattern — Conv → ReLU → Pool — with dropout:

class SimpleCNN(nn.Module):
    """Modern Conv -> ReLU -> Pool stack with dropout, sized for 28x28 inputs.

    Args:
        num_classes: Number of output classes (default 10).
        in_channels: Input image channels (default 1, grayscale).
    """
    def __init__(self, num_classes=10, in_channels=1):
        super().__init__()
        self.conv1 = Conv2D(in_channels, 32, kernel_size=3, padding=1)
        self.conv2 = Conv2D(32, 64, kernel_size=3, padding=1)
        self.pool = MaxPool2D(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)  # 28x28 input
        self.fc2 = nn.Linear(128, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = F.relu(self.conv1(x))   # 28x28 -> 28x28
        x = self.pool(x)            # 28x28 -> 14x14
        x = F.relu(self.conv2(x))   # 14x14 -> 14x14
        x = self.pool(x)            # 14x14 -> 7x7
        # Explicit start_dim=1 keeps the batch dimension: (N, 64*7*7) = (N, 3136),
        # which is exactly what fc1 expects.
        x = torch.flatten(x, 1)
        x = self.dropout(F.relu(self.fc1(x)))  # dropout only on the FC features
        return self.fc2(x)                     # raw logits for CrossEntropyLoss

Architecture 3: DeepCNN (VGG-Style)

A deeper network built from reusable CNN blocks — Conv → BatchNorm → ReLU → Pool — doubling channels at each stage:

class CNNBlock(nn.Module):
    """Reusable Conv -> BatchNorm -> ReLU -> Pool block.

    Args:
        in_channels: Channels entering the block.
        out_channels: Channels produced by the convolution.
        kernel_size: Convolution kernel size; `same` padding is derived from it.
        pool: When True, finish with 2x2 max pooling; otherwise pass through.
    """
    def __init__(self, in_channels, out_channels, kernel_size=3, pool=True):
        super().__init__()
        # kernel_size // 2 padding preserves spatial size for odd kernels.
        self.conv = Conv2D(in_channels, out_channels, kernel_size,
                          padding=kernel_size // 2)
        self.bn = nn.BatchNorm2d(out_channels)
        if pool:
            self.pool = MaxPool2D(kernel_size=2, stride=2)
        else:
            self.pool = nn.Identity()

    def forward(self, x):
        out = self.conv(x)
        out = self.bn(out)
        out = relu(out)
        return self.pool(out)


class DeepCNN(nn.Module):
    """VGG-inspired architecture with stacked CNN blocks.

    Channels double between stages (32 -> 64 -> 128 -> 256 with the defaults)
    and every block except the last halves the spatial size via max pooling.

    Args:
        num_classes: Number of output classes (default 10).
        in_channels: Input image channels (default 1, grayscale).
        num_blocks: Number of stacked CNNBlocks (default 4).
        initial_channels: Output channels of the first block (default 32).
    """
    def __init__(self, num_classes=10, in_channels=1,
                 num_blocks=4, initial_channels=32):
        super().__init__()
        blocks = []
        # Bug fix: each block must consume the previous block's output
        # channels. The original passed the already-doubled `channels` as
        # the input width of every block after the first, so block 1
        # expected 64 input channels while block 0 produced 32.
        prev_channels = in_channels
        channels = initial_channels
        for i in range(num_blocks):
            blocks.append(CNNBlock(
                prev_channels, channels,
                pool=(i < num_blocks - 1)
            ))
            prev_channels = channels
            if i < num_blocks - 1:
                channels *= 2  # Double channels: 32 -> 64 -> 128 -> 256

        self.features = nn.Sequential(*blocks)
        # NOTE(review): `channels * 3 * 3` assumes a 28x28 input with the
        # default 3 pooling stages (28 -> 14 -> 7 -> 3) — confirm for other
        # input sizes or num_blocks values.
        self.classifier = nn.Sequential(
            Flatten(),
            nn.Dropout(0.5),
            nn.Linear(channels * 3 * 3, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        # Bug fix: the original defined no forward(), so calling the model
        # raised NotImplementedError from nn.Module.
        return self.classifier(self.features(x))

Training Strategy

We train SimpleCNN on MNIST with:

# Standard MNIST training setup: Adam with a step-decay LR schedule.
model = SimpleCNN(num_classes=10, in_channels=1).to(device)
criterion = nn.CrossEntropyLoss()  # expects raw logits; applies log-softmax internally
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Halve the learning rate every 5 epochs.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

# train_epoch / evaluate are helpers defined elsewhere in this project;
# each returns an (average loss, accuracy) pair for the epoch.
for epoch in range(1, EPOCHS + 1):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
    test_loss, test_acc = evaluate(model, test_loader, criterion, device)
    scheduler.step()  # decay the LR once per epoch, after training

Up Next

That covers the implementation — custom convolutional layers through three complete architectures. Part 3 trains this on MNIST, visualizes the learned filters, and analyzes feature maps at each layer.