#| echo: false
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D
import scipy.stats as stats
from scipy.linalg import solve, qr, svd, inv
from scipy.optimize import minimize, minimize_scalar
from scipy.special import expit as sigmoid  # Numerically stable sigmoid
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Deck-wide Matplotlib defaults, grouped by what they control.

# Figure geometry and resolution
plt.rcParams.update({'figure.figsize': (10, 6), 'figure.dpi': 100})

# Text sizes: base font, axes titles/labels, tick labels, legend
plt.rcParams.update({
    'font.size': 12,
    'axes.titlesize': 14,
    'axes.labelsize': 12,
    'xtick.labelsize': 11,
    'ytick.labelsize': 11,
    'legend.fontsize': 11,
})

# Default line weight
plt.rcParams.update({'lines.linewidth': 2.5})

# Faint grid switched on for every axes
plt.rcParams.update({
    'axes.grid': True,
    'grid.alpha': 0.3,
    'grid.linewidth': 1,
})

# Seed NumPy's global RNG so randomly generated figure data is repeatable.
np.random.seed(42)

Deep Learning

How Neural Networks Learn

Learning to Classify

What is Machine Learning?

Learning = Task + Performance Measure + Experience

Herbert Simon (1983)

"Learning is any process by which a system improves performance from experience."

Framework

\[ \text{Learning System} = (\mathcal{T}, \mathcal{P}, \mathcal{E}) \]
  • Task \(\mathcal{T}\): What to accomplish
  • Performance \(\mathcal{P}\): How to measure success
  • Experience \(\mathcal{E}\): Data to learn from

Learning occurs when:

\[ \mathcal{P}_{\text{after}}(\mathcal{T}, \mathcal{E}) > \mathcal{P}_{\text{before}}(\mathcal{T}) \]

Example: Email Spam Filter

  • Task (\(\mathcal{T}\)): Classify emails as spam/not spam
  • Performance (\(\mathcal{P}\)): % correctly classified
  • Experience (\(\mathcal{E}\)): Database of labeled emails

Example: Self-Driving Car

  • \(\mathcal{T}\): Navigate roads safely
  • \(\mathcal{P}\): Miles without intervention
  • \(\mathcal{E}\): Hours of human driving data

Generalization is the Goal of Machine Learning

  • Do not care about performance on the dataset we have
  • Do care about performance on similar data that has no labels
  • Accuracy/generalization trade-off (the bias-variance trade-off):
    • Optimizing accuracy to the extreme reduces capability to generalize

Machine Learning Inverts Traditional Programming

#| echo: false
"""
Three-panel figure: traditional programming (rules + data → output), machine learning (data + expected output → learned program), and a list of example tasks that are impractical to program explicitly.
"""

#| fig-align: center

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.patches import FancyBboxPatch


def _draw_flow_panel(ax, boxes, title, title_color, caption, **caption_style):
    """Draw one "two inputs -> process -> result" flow diagram on *ax*.

    boxes: four (label, facecolor, edgecolor, fontsize) tuples, in the order
        top input, bottom input, process box, result box.
    title, title_color: panel title text and its colour.
    caption: text placed under the diagram; extra keyword arguments are
        forwarded to ``ax.text`` for the caption only.
    """
    ax.set_xlim(0, 10)
    ax.set_ylim(0, 10)
    ax.axis('off')

    # Geometry shared by both flow panels:
    # (lower-left corner, width, height, label anchor) per slot.
    slots = [
        ((1, 6), 2, 1.5, (2, 6.75)),       # top input
        ((1, 3), 2, 1.5, (2, 3.75)),       # bottom input
        ((4.5, 4), 2, 2, (5.5, 5)),        # process
        ((7.5, 4.25), 2, 1.5, (8.5, 5)),   # result
    ]
    for (corner, width, height, anchor), (label, face, edge, size) in zip(slots, boxes):
        ax.add_patch(FancyBboxPatch(corner, width, height, boxstyle="round,pad=0.1",
                                    facecolor=face, edgecolor=edge, linewidth=2))
        ax.text(*anchor, label, ha='center', fontsize=size, fontweight='bold')

    # Both inputs feed the process box, which feeds the result box.
    for x0, y0, dx, dy in [(3, 6.75, 1.3, -1.5), (3, 3.75, 1.3, 0.5), (6.5, 5, 0.9, 0)]:
        ax.arrow(x0, y0, dx, dy, head_width=0.15, head_length=0.1, fc='black')

    ax.set_title(title, fontsize=13, fontweight='bold', color=title_color)
    ax.text(5, 1, caption, ha='center', fontsize=9, style='italic', **caption_style)


fig, axes = plt.subplots(1, 3, figsize=(15, 6))

# Panel 1 - traditional programming: rules + data go in, output comes out.
_draw_flow_panel(
    axes[0],
    boxes=[
        ('Rules', '#E3F2FD', '#1976D2', 11),
        ('Data', '#FFF9C4', '#F57C00', 11),
        ('Traditional\nProgram', '#F5F5F5', '#616161', 10),
        ('Output', '#E8F5E9', '#4CAF50', 11),
    ],
    title='Traditional Programming',
    title_color='#424242',
    caption='if (temp > 30):\n    return "hot"\nelse:\n    return "cold"',
    family='monospace',
)

# Panel 2 - machine learning: data + expected output go in, a program comes out.
# Each concept keeps its colours from panel 1 (data yellow, output green,
# rules/program blue), so the swap of roles is visible at a glance.
_draw_flow_panel(
    axes[1],
    boxes=[
        ('Data', '#FFF9C4', '#F57C00', 11),
        ('Expected\nOutput', '#E8F5E9', '#4CAF50', 10),
        ('Machine\nLearning', '#FFE0B2', '#FF6F00', 10),
        ('Program', '#E3F2FD', '#1976D2', 11),
    ],
    title='Machine Learning',
    title_color='#8B0000',
    caption='Learns from 1000s\nof examples',
)

# Panel 3 - stacked list of example tasks, each row slightly more transparent.
ax = axes[2]
ax.set_xlim(0, 10)
ax.set_ylim(0, 10)
ax.axis('off')

examples = [
    "Recognize faces",
    "Translate languages",
    "Drive cars",
    "Diagnose diseases",
    "Predict markets"
]

y_start = 8
for i, example in enumerate(examples):
    row_y = y_start - i*1.3
    rect = FancyBboxPatch((1, row_y), 8, 0.9, boxstyle="round,pad=0.05",
                          facecolor='#F3E5F5', edgecolor='#7B1FA2', linewidth=1,
                          alpha=0.7 - i*0.1)
    ax.add_patch(rect)
    ax.text(5, row_y + 0.45, example, ha='center', fontsize=10)

ax.set_title('Tasks Impossible to Program Explicitly', fontsize=13, fontweight='bold', color='#7B1FA2')

fig.suptitle('The Paradigm Shift: From Rules to Learning', fontsize=16, fontweight='bold', y=0.98)
plt.tight_layout()
plt.show()

Theory-Driven vs Data-Driven Approaches

Classical: Theory-Driven

Modern: Data-Driven

Model Complexity: When to Stop Adding Parameters

George Box (1976)

"All models are wrong, but some are useful"

"Since all models are wrong the scientist cannot obtain a 'correct' one by excessive elaboration"

Box's warning: More parameters ≠ better science

MNIST Classification: Accuracy vs Complexity

  • Nearest neighbor: 3% error, \(\mathcal{O}(n)\) inference
  • Linear classifier: 8% error, \(\mathcal{O}(d)\) inference
  • 2-layer network: 2% error, 50K parameters
  • ConvNet (LeNet-5): 0.8% error, 60K parameters
  • ResNet-50: 0.2% error, 25M parameters

Question: Is 0.2% → 0.1% worth 25M parameters?

Worrying Selectively

It is inappropriate to be concerned about mice when there are tigers abroad

  • Start simple
  • Add complexity purposefully
  • Validate empirically
#| echo: false
"""
Log-log chart of error rate and training time against parameter count, with a shaded band marking a candidate "sweet spot" between accuracy and computational cost.
"""

#| fig-align: center

import numpy as np
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 5))

# Hard-coded illustration values, one entry per model size
# (described in the original as inspired by actual model performances).
model_complexity = np.array([1, 10, 100, 1000, 10000, 100000])
error_rate = np.array([12, 8, 3, 1.5, 0.8, 0.7])
training_time = np.array([0.1, 1, 10, 60, 600, 3600])  # seconds

# Both curves share the x data and line width; only marker and colour differ.
curves = [
    (error_rate, 'o-',
     dict(markersize=8, color='#C62828', label='Error Rate (%)')),
    (training_time/60, 's--',  # seconds -> minutes, to match the legend label
     dict(markersize=7, color='#1976D2', label='Training Time (min)')),
]
for values, fmt, look in curves:
    ax.loglog(model_complexity, values, fmt, linewidth=2, **look)

# Shade the 100-1000 parameter band and label it
ax.axvspan(100, 1000, alpha=0.2, color='green')
ax.text(300, 10, 'Ideal?', fontsize=11, fontweight='bold', ha='center', color='#2E7D32')

ax.set_title('The Complexity-Performance Trade-off', fontweight='bold')
ax.set_xlabel('Model Parameters', fontsize=11)
ax.set_ylabel('Error / Time', fontsize=11)
ax.grid(True, alpha=0.3)
ax.legend()

plt.tight_layout()
plt.show()

Course Structure: Statistical Foundations to Neural Networks

#| echo: false
"""
Course structure diagram: a yellow column of statistical-model topics, a blue column of data-driven topics, a tall green neural-network box, a green "working with data" bar spanning columns 2 and 3, and a grey "GD, SGD, LMS" ellipse bridging the first two columns.
"""

#| fig-align: center

import matplotlib.pyplot as plt
import matplotlib.patches as patches

fig, ax = plt.subplots(figsize=(14, 9))
ax.set_xlim(0, 12)
ax.set_ylim(0, 9)
ax.axis('off')

# Column headers (only the first two columns carry a header)
ax.text(2, 8.3, 'statistical models', fontsize=15, color='#1976D2', fontweight='bold', ha='center')
ax.text(6, 8.3, 'data driven', fontsize=15, color='#1976D2', fontweight='bold', ha='center')

# Define grid parameters
col_width = 3.2
col_gap = 0.8
row_gap = 0.15
start_x = 0.4
start_y = 7.5


def _draw_topic_column(ax, x_left, specs, facecolor, edgecolor):
    """Stack one rounded box per (height, lines) entry of *specs*, top-down.

    x_left: left edge of the column in data coordinates.
    specs: list of (box_height, [text lines]) pairs; boxes are separated by
        ``row_gap`` and start at ``start_y``.
    facecolor, edgecolor: colours applied to every box in the column.
    """
    y_top = start_y
    for height, lines in specs:
        ax.add_patch(patches.FancyBboxPatch((x_left, y_top - height), col_width, height,
                                            boxstyle="round,pad=0.02",
                                            facecolor=facecolor, edgecolor=edgecolor,
                                            linewidth=1.5))
        # Distribute the text lines over the box height (0.5 / 0.7 are the
        # layout constants that position the stack inside the box).
        line_step = height / (len(lines) + 0.5)
        for idx, line in enumerate(lines):
            ax.text(x_left + col_width/2, y_top - line_step*(idx+0.7), line,
                    ha='center', va='center', fontsize=10.5)
        y_top -= (height + row_gap)


# Column 1 (Yellow) - Statistical Models
yellow_specs = [
    (1.3, ['MMSE Estimation', 'Linear/Affine MMSE Est.', 'FIR Wiener filtering']),
    (1.2, ['Bayesian decision theory', 'Hard decisions', 'soft decisions (APP)']),
    (0.8, ['ML/MAP parameter', 'estimation']),
    (0.9, ['Karhunen-Loeve expansion', 'sufficient statistics'])
]
_draw_topic_column(ax, start_x, yellow_specs, '#FFF9C4', '#F57C00')

# Column 2 (Blue) - Data-Driven
blue_specs = [
    (1.3, ['general regression', 'linear LS regression', 'stochastic gradient and', 'batches']),
    (1.2, ['Classification from data', 'linear classifier', 'logistic regression', '(perceptron)']),
    (0.6, ['regularization']),
    (0.9, ['PCA', 'feature design'])
]
_draw_topic_column(ax, start_x + col_width + col_gap, blue_specs, '#E3F2FD', '#0288D1')

# Column 3 (Green) - Neural Networks (one tall box instead of a stack)
x_pos = start_x + 2*(col_width + col_gap)
nn_height = 5.0
nn_y = start_y
rect = patches.FancyBboxPatch((x_pos, nn_y - nn_height), col_width, nn_height,
                              boxstyle="round,pad=0.02",
                              facecolor='#E8F5E9', edgecolor='#4CAF50', linewidth=1.5)
ax.add_patch(rect)
ax.text(x_pos + col_width/2, nn_y - 1, 'neural networks', ha='center', fontsize=11.5, fontweight='bold')
ax.text(x_pos + col_width/2, nn_y - 1.8, 'for regression and', ha='center', fontsize=10.5)
ax.text(x_pos + col_width/2, nn_y - 2.4, 'classification', ha='center', fontsize=10.5)
ax.text(x_pos + col_width/2, nn_y - 3.8, 'learning with SGD', ha='center', fontsize=10.5)

# Bottom bar (Green) - Working with data (spans columns 2 and 3)
bottom_x = start_x + col_width + col_gap
bottom_width = 2*col_width + col_gap
bottom_y = 1.5
rect = patches.FancyBboxPatch((bottom_x, bottom_y), bottom_width, 0.7,
                              boxstyle="round,pad=0.02",
                              facecolor='#E8F5E9', edgecolor='#4CAF50', linewidth=1.5)
ax.add_patch(rect)
ax.text(bottom_x + bottom_width/2, bottom_y + 0.35, 'working with data',
        ha='center', fontsize=11, fontweight='bold')

# Grey ellipse centred on the gap between columns 1 and 2; the high zorder
# keeps it on top of the boxes it overlaps.
ellipse_x = start_x + col_width + col_gap/2
ellipse_y = start_y - 1.4
ellipse = patches.Ellipse((ellipse_x, ellipse_y), 3.2, 0.7,
                          facecolor='#E0E0E0', edgecolor='#616161',
                          linewidth=1.5, alpha=0.9, zorder=10)
ax.add_patch(ellipse)
ax.text(ellipse_x, ellipse_y, 'GD, SGD, LMS',
        ha='center', va='center', fontsize=10.5, fontweight='bold', zorder=11)

plt.tight_layout()
plt.show()

Semester Progression: MMSE to Convolutional Networks

#| echo: false
"""
Six-panel overview of course topics: noisy linear data with its generating line (MMSE/regression), a two-class scatter with a decision-boundary line (logistic regression), an MLP architecture diagram, training/validation loss curves (PyTorch training), a CNN layer progression, and an unrolled chain of RNN hidden states (week 13).
"""

#| fig-align: center

# Working directory is now lecture/01, so lib is a direct subdirectory
import sys
if 'lib' not in sys.path:
    sys.path.append('lib')
# Project-local helpers; their behaviour is defined in lib/plotting_utils, not here.
from plotting_utils import draw_neural_network, plot_loss_curves

fig, axes = plt.subplots(2, 3, figsize=(16, 10))
fig.suptitle('Course Overview', fontsize=16, fontweight='bold')

# Week 3-4: MMSE/Regression
ax = axes[0, 0]
np.random.seed(42)
x = np.linspace(0, 10, 100)
y_true = 2 * x + 1
y_obs = y_true + np.random.normal(0, 2, 100)
# Only every 5th observation is plotted to keep the scatter readable.
ax.scatter(x[::5], y_obs[::5], alpha=0.5, label='Observations', s=30, color='#1976D2')
# NOTE(review): the line labelled 'MMSE Estimate' is the noise-free generating
# line y = 2x + 1, not a fit to y_obs.
ax.plot(x, y_true, 'r-', linewidth=2, label='MMSE Estimate')
ax.set_title('Weeks 3-4: MMSE/Regression', fontweight='bold')
ax.set_xlabel('Input')
ax.set_ylabel('Output')
ax.legend()
ax.grid(True, alpha=0.3)

# Week 5: Logistic Regression
ax = axes[0, 1]
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=200, n_features=2, n_redundant=0, 
                          n_informative=2, n_clusters_per_class=1, random_state=42)
ax.scatter(X[y==0, 0], X[y==0, 1], c='#1976D2', alpha=0.5, label='Class 0', s=30)
ax.scatter(X[y==1, 0], X[y==1, 1], c='#C62828', alpha=0.5, label='Class 1', s=30)
x_line = np.linspace(-3, 3, 100)
# NOTE(review): the boundary is the fixed line x2 = -x1 + 0.5; no classifier is
# fitted to (X, y) in this panel.
ax.plot(x_line, -x_line + 0.5, '--', color='#2E7D32', linewidth=2, label='Decision Boundary')
ax.set_title('Week 5: Logistic Regression', fontweight='bold')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.legend()
ax.grid(True, alpha=0.3)

# Week 5-6: MLP
ax = axes[0, 2]
# presumably the list gives the number of units per layer - verify against plotting_utils
draw_neural_network(ax, [3, 4, 4, 2])
ax.set_title('Weeks 5-6: Multilayer Perceptron', fontweight='bold')

# Week 8-9: PyTorch Training
ax = axes[1, 0]
epochs = np.linspace(0, 50, 50)
# Synthetic curves: exponential decay to a floor plus Gaussian noise. They draw
# from the global RNG seeded above, so statement order affects the exact values.
train_loss = 2 * np.exp(-epochs/10) + 0.1 + np.random.normal(0, 0.05, 50)
val_loss = 2 * np.exp(-epochs/12) + 0.15 + np.random.normal(0, 0.08, 50)
plot_loss_curves(ax, train_loss, val_loss)
ax.set_title('Weeks 8-9: PyTorch Training', fontweight='bold')

# Week 10-11: CNN
ax = axes[1, 1]
from matplotlib.patches import Rectangle, FancyBboxPatch

# CNN architecture visualization: one box per layer, shrinking left to right.
# The last entry has no 'color' key and falls back to the default fill below.
layers_info = [
    {'x': 0, 'w': 1.2, 'h': 1.2, 'color': '#E3F2FD', 'label': 'Input\n28×28'},
    {'x': 2, 'w': 1.0, 'h': 1.0, 'color': '#E8F5E9', 'label': 'Conv\n24×24×8'},
    {'x': 3.5, 'w': 0.8, 'h': 0.8, 'color': '#FFF3E0', 'label': 'Pool\n12×12×8'},
    {'x': 4.8, 'w': 0.6, 'h': 0.6, 'color': '#FCE4EC', 'label': 'FC\n128'},
    {'x': 5.8, 'w': 0.3, 'h': 0.3, 'label': '10'}
]

for i, layer in enumerate(layers_info):
    y_center = 0.5
    rect = FancyBboxPatch((layer['x'], y_center - layer['h']/2), layer['w'], layer['h'],
                          boxstyle="round,pad=0.02", 
                          facecolor=layer.get('color', '#F5F5F5'),
                          edgecolor='black', linewidth=1.5)
    ax.add_patch(rect)
    ax.text(layer['x'] + layer['w']/2, y_center, layer['label'], 
           ha='center', va='center', fontsize=9)
    
    # Arrow from this box's right edge towards the next box; the shaft is
    # shortened by 0.1 to leave room for the arrow head.
    if i < len(layers_info) - 1:
        ax.arrow(layer['x'] + layer['w'], y_center, 
                layers_info[i+1]['x'] - layer['x'] - layer['w'] - 0.1, 0,
                head_width=0.05, head_length=0.05, fc='black')

ax.set_xlim(-0.5, 7)
ax.set_ylim(-0.5, 1.5)
ax.axis('off')
ax.set_title('Weeks 10-11: CNN Architecture', fontweight='bold')

# Week 13: RNN/Embeddings - a row of hidden-state boxes h_0 ... h_4 linked by arrows
ax = axes[1, 2]
time_steps = 5
for t in range(time_steps):
    rect = FancyBboxPatch((t*1.5, 0), 0.8, 0.8, boxstyle="round,pad=0.02",
                          facecolor='#E1F5FE', edgecolor='#0288D1', linewidth=2)
    ax.add_patch(rect)
    # The f-string substitutes t, so the mathtext reads $h_0$, $h_1$, ...
    ax.text(t*1.5 + 0.4, 0.4, f'$h_{t}$', ha='center', va='center', fontsize=12)
    if t < time_steps - 1:
        ax.arrow(t*1.5 + 0.8, 0.4, 0.6, 0, head_width=0.05, 
                head_length=0.05, fc='black')
ax.set_title('Week 13: RNN/Sequential Models', fontweight='bold')
ax.set_xlim(-0.5, 7)
ax.set_ylim(-0.2, 1)
ax.axis('off')

plt.tight_layout()
plt.show()

Outline

Foundations

Learning Framework

  • Task, performance, experience
  • Generalization as the goal

Hypothesis Classes

  • Linear models and their limits
  • Bias-variance tradeoff

Data

  • Quality vs quantity
  • Representation and dimensionality

Learning Paradigms

  • Supervised, unsupervised, reinforcement
  • Self-supervised methods

Neural Networks

Architecture

  • Perceptron to deep networks
  • Universal approximation
  • Width vs depth

Optimization

  • Loss landscapes
  • SGD and variants

Generalization

  • The mystery of why networks work

Practice

Environment Setup

PyTorch Demo

  • Fashion-MNIST classifier

Linear Models Fail on Nonlinear Boundaries

#| echo: false
"""
Side-by-side comparison: logistic regression on a linearly separable dataset (left) and on the two-moons dataset (right), where a straight boundary cannot follow the curved class shapes. Each title reports held-out test accuracy.
"""

#| fig-align: center

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons, make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from matplotlib.patches import Arc

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

h = .02  # mesh step used when rasterising the decision regions


def _draw_linear_fit(ax, clf, X_all, X_tr, y_tr, **boundary_style):
    """Scatter the training points, then shade clf's two regions and trace P=0.5."""
    for cls, colour in ((0, '#1976D2'), (1, '#C62828')):
        members = y_tr == cls
        ax.scatter(X_tr[members, 0], X_tr[members, 1],
                   c=colour, alpha=0.6, label=f'Class {cls}', s=50)

    # Evaluate P(class 1) on a grid padded 0.5 beyond the full dataset's extent.
    lo = X_all.min(axis=0) - 0.5
    hi = X_all.max(axis=0) + 0.5
    gx, gy = np.meshgrid(np.arange(lo[0], hi[0], h), np.arange(lo[1], hi[1], h))
    prob = clf.predict_proba(np.c_[gx.ravel(), gy.ravel()])[:, 1].reshape(gx.shape)

    ax.contourf(gx, gy, prob, levels=[0, 0.5, 1], colors=['#E3F2FD', '#FFEBEE'], alpha=0.4)
    ax.contour(gx, gy, prob, levels=[0.5], linewidths=2, **boundary_style)


def _label_panel(ax, title):
    """Apply the title, axis labels, legend and grid shared by both panels."""
    ax.set_title(title, fontweight='bold', fontsize=14)
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')
    ax.legend(loc='upper right')
    ax.grid(True, alpha=0.3)


# LEFT panel: well-separated classes, where a linear model succeeds
np.random.seed(42)
X_linear, y_linear = make_classification(
    n_samples=200, n_features=2, n_redundant=0, n_informative=2,
    n_clusters_per_class=1, class_sep=2.0, random_state=42,
)
X_lin_train, X_lin_test, y_lin_train, y_lin_test = train_test_split(
    X_linear, y_linear, test_size=0.3, random_state=42
)

linear_clf_good = LogisticRegression(random_state=42)
linear_clf_good.fit(X_lin_train, y_lin_train)
lin_accuracy = linear_clf_good.score(X_lin_test, y_lin_test)

_draw_linear_fit(ax1, linear_clf_good, X_linear, X_lin_train, y_lin_train,
                 colors='#2E7D32')
_label_panel(ax1, f'Linearly Separable: {lin_accuracy:.1%} Accuracy')

# RIGHT panel: two interleaving half-moons, where the same model struggles
X_moons, y_moons = make_moons(n_samples=200, noise=0.2, random_state=42)
X_moon_train, X_moon_test, y_moon_train, y_moon_test = train_test_split(
    X_moons, y_moons, test_size=0.3, random_state=42
)

linear_clf_bad = LogisticRegression(random_state=42)
linear_clf_bad.fit(X_moon_train, y_moon_train)
moon_accuracy = linear_clf_bad.score(X_moon_test, y_moon_test)

# Red dashed boundary here, versus solid green on the left
_draw_linear_fit(ax2, linear_clf_bad, X_moons, X_moon_train, y_moon_train,
                 colors='#C62828', linestyles='--')
_label_panel(ax2, f'Two-Moons Dataset: {moon_accuracy:.1%} Accuracy')

plt.tight_layout()
plt.show()

Two-Moons Dataset

Tests whether a model can learn curved decision boundaries. Two interleaving half-circles that cannot be separated by any straight line.

Neural Networks Learn Nonlinear Decision Boundaries

#| echo: true
#| code-fold: true

import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

# Noisy two-moons data, split 70/30 into train and test sets
X, y = make_moons(n_samples=200, noise=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Small MLP: two hidden layers of 10 units each
mlp = MLPClassifier(hidden_layer_sizes=(10, 10), max_iter=1000, random_state=42)
mlp.fit(X_train, y_train)

# Report mean accuracy on each split
for split_name, features, labels in (
    ("Training", X_train, y_train),
    ("Test", X_test, y_test),
):
    print(f"{split_name} accuracy: {mlp.score(features, labels):.3f}")
#| echo: false
"""
Left: the two-moons training points. Right: the MLP's predicted probability of class 1 over the input plane, with the held-out test points overlaid. Relies on X, the train/test split and the fitted `mlp` from the preceding cell.
"""

#| fig-align: center

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

# Training data
ax1.scatter(X_train[y_train==0, 0], X_train[y_train==0, 1],
            c='#1976D2', alpha=0.6, label='Class 0', s=50)
ax1.scatter(X_train[y_train==1, 0], X_train[y_train==1, 1],
            c='#C62828', alpha=0.6, label='Class 1', s=50)
ax1.set_title('Training Data', fontweight='bold')
ax1.set_xlabel('Feature 1')
ax1.set_ylabel('Feature 2')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Decision surface: evaluate P(class 1) on a grid padded 0.5 beyond the data
h = .02
x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = mlp.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
Z = Z.reshape(xx.shape)

# Use the reversed map: 'RdBu' runs red (low) -> blue (high), which would shade
# the class-1 region blue although class-1 points are drawn red. 'RdBu_r' makes
# the background colour agree with the markers (class 0 blue, class 1 red).
ax2.contourf(xx, yy, Z, levels=20, cmap='RdBu_r', alpha=0.8)
ax2.scatter(X_test[y_test==0, 0], X_test[y_test==0, 1], c='#1976D2',
            edgecolor='white', s=60, label='Test Class 0', linewidths=2)
ax2.scatter(X_test[y_test==1, 0], X_test[y_test==1, 1], c='#C62828',
            edgecolor='white', s=60, label='Test Class 1', linewidths=2)
ax2.set_title('Learned Decision Boundary', fontweight='bold')
ax2.set_xlabel('Feature 1')
ax2.set_ylabel('Feature 2')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

Learning Fundamentals

---

Minimize Expected Risk Using Only Finite Samples

Given

  • Training data: \(\mathcal{D} = \{(\mathbf{x}_i, y_i)\}_{i=1}^N\)
  • Hypothesis class: \(\mathcal{H}\)
  • Loss function: \(\mathcal{L}\)

Goal

Find \(h^* \in \mathcal{H}\) that minimizes:

\[ \mathbb{E}_{(\mathbf{x},y) \sim P}[\mathcal{L}(h(\mathbf{x}), y)] \]

But we only have access to:

\[ \frac{1}{N}\sum_{i=1}^N \mathcal{L}(h(\mathbf{x}_i), y_i) \]

Generalization Gap

Minimize error on unseen data using only observed samples

This gap defines machine learning

Example Task: "2s" Detector

MNIST: Input and Output Representations

#| echo: false
"""
Grid of the first ten handwritten-digit samples from scikit-learn's `load_digits` dataset (8×8 grayscale images, an MNIST-like stand-in), each titled with its label, illustrating the raw pixel input representation for classification.
"""

#| fig-align: center

from sklearn.datasets import load_digits
import matplotlib.pyplot as plt
import numpy as np

digits = load_digits()
fig, axes = plt.subplots(2, 5, figsize=(12, 6))

for i, ax in enumerate(axes.flat):
    ax.imshow(digits.images[i], cmap='gray_r', interpolation='nearest')
    ax.set_title(f'Label: {digits.target[i]}', fontsize=12, fontweight='bold')

    # Hide only the ticks. ax.axis('off') would also hide the spines, which
    # would make the border styling below invisible.
    ax.set_xticks([])
    ax.set_yticks([])

    # Add subtle border
    for spine in ax.spines.values():
        spine.set_edgecolor('#E0E0E0')
        spine.set_linewidth(1)

plt.tight_layout()
plt.show()

Input Space

Output Space

Same Data, Multiple Representations

#| echo: false
"""
Four representations of the same handwritten digit: original grayscale pixels, binary threshold, flattened 64-dimensional vector, and Sobel gradient-magnitude edge features. Relies on `digits` loaded in the preceding cell.
"""

#| fig-align: center

fig, axes = plt.subplots(1, 4, figsize=(16, 4))

digit_idx = 0
original = digits.images[digit_idx]

# Original
ax = axes[0]
ax.imshow(original, cmap='gray_r', interpolation='nearest')
ax.set_title('Original Pixels\n(8×8 grayscale)', fontweight='bold')
ax.axis('off')

# Binary threshold: pixels brighter than 7 become 1, the rest 0
ax = axes[1]
binary = (original > 7).astype(int)
ax.imshow(binary, cmap='gray_r', interpolation='nearest')
ax.set_title('Binary Threshold\n(0 or 1)', fontweight='bold')
ax.axis('off')

# Flattened vector: the same pixels laid out as one row-major sequence
ax = axes[2]
flattened = original.flatten()
ax.bar(range(len(flattened)), flattened, color='#1976D2', width=1.0, edgecolor='none')
ax.set_title('Flattened Vector\n(64-dimensional)', fontweight='bold')
ax.set_xlabel('Pixel Index')
ax.set_ylabel('Intensity')
ax.set_ylim(0, 16)
ax.grid(True, alpha=0.3)

# Edge detection
ax = axes[3]
from scipy import ndimage
# ndimage.sobel differentiates along a single axis (the last one by default),
# so one call only responds to edges in one orientation. Combine both axes
# into a gradient magnitude so edges of every orientation show up.
grad_rows = ndimage.sobel(original, axis=0)
grad_cols = ndimage.sobel(original, axis=1)
edges = np.hypot(grad_rows, grad_cols)
ax.imshow(edges, cmap='hot', interpolation='nearest')
ax.set_title('Edge Features\n(Gradient)', fontweight='bold')
ax.axis('off')

plt.suptitle(f'Same Data, Different Representations (Digit: {digits.target[digit_idx]})',
             fontsize=16, fontweight='bold', y=1.05)
plt.tight_layout()
plt.show()

Representation Determines Learnability

The choice of representation can make learning tractable or impossible. Deep learning learns representations automatically.

Example: The Data Domain

![](sections/images/02-learning-fundamentals/hypo-good-intro.png)

GOOD
GOOD
BAD
?

The choice of how to represent input is very important

**Can we classify the unknown pattern?**

Converting Pattern to Binary Vector

Binary Representation

x = 0111111011100100000010000
    001011111111101111001110

Label: "GOOD"

Key Insight

The same pattern can be represented as:

A hypothesis class can succeed or fail based on the choice of representation.

Linear Classifier on Binary Representation

Representation: Binary vectors, length \(d = 49\)

\[ \mathbf{x} = \begin{bmatrix} 0111111011100100000010000 \\ 001011111111101111001110 \end{bmatrix} \]
\[ y \in \{-1, +1\} \]

Hypothesize mapping data to label using linear classifier:

\[ \hat{y} = \text{sign}(\mathbf{w} \cdot \mathbf{x}) = \text{sign}(w_1 x_1 + \cdots + w_{49} x_{49}) \]

Definition: Linear Function

A function \(f: \mathbb{R}^d \rightarrow \mathbb{R}\) is linear (strictly speaking, affine when \(b \neq 0\)) if \(f(\mathbf{x}) = \mathbf{w}^\top \mathbf{x} + b\) for some \(\mathbf{w} \in \mathbb{R}^d\) and \(b \in \mathbb{R}\). The decision boundary \(\{\mathbf{x} : f(\mathbf{x}) = 0\}\) is a hyperplane. The classifier above is the special case \(b = 0\).

where:

Linear vs Nonlinear Hypothesis Classes

#| echo: false
"""
XOR-labelled points shown three times: with a single straight boundary (underfitting), with the quadrant boundary (good fit), and with small circles around individual points (overfitting), to illustrate hypothesis-class complexity trade-offs.
"""

#| fig-align: center

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# XOR labels: a point is class 1 when exactly one coordinate is positive
np.random.seed(42)
n_samples = 100
X = np.random.randn(n_samples, 2)
y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)


def _scatter_classes(ax):
    """Plot both classes of the shared XOR sample on *ax*."""
    for mask, colour, name in ((~y, '#1976D2', 'Class 0'), (y, '#C62828', 'Class 1')):
        ax.scatter(X[mask, 0], X[mask, 1], c=colour, alpha=0.6, s=50, label=name)


def _finish_panel(ax, title):
    """Apply the title, limits, legend, grid and axis labels common to all panels."""
    ax.set_title(title, fontweight='bold')
    ax.set_xlim(-3, 3)
    ax.set_ylim(-3, 3)
    ax.legend(loc='upper right')
    ax.grid(True, alpha=0.3)
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')


# Panel 1: one horizontal line as the linear boundary
ax = axes[0]
_scatter_classes(ax)
ax.axhline(y=0, color='#2E7D32', linestyle='--', linewidth=2, label='Linear Boundary')
_finish_panel(ax, 'Linear Classifier\n(Underfitting)')

# Panel 2: contour of the XOR indicator on a grid, i.e. the two coordinate axes
ax = axes[1]
_scatter_classes(ax)
x_grid = np.linspace(-3, 3, 100)
y_grid = np.linspace(-3, 3, 100)
xx, yy = np.meshgrid(x_grid, y_grid)
Z = np.logical_xor(xx > 0, yy > 0)
ax.contour(xx, yy, Z, levels=[0.5], colors='#2E7D32', linewidths=2)
_finish_panel(ax, 'Quadratic Classifier\n(Good Fit)')

# Panel 3: a small circle around every 5th sample that belongs to class 1
ax = axes[2]
_scatter_classes(ax)
for px, py in X[::5][y[::5]]:
    ax.add_patch(plt.Circle((px, py), 0.3, fill=False,
                            edgecolor='#2E7D32', linewidth=1, alpha=0.5))
_finish_panel(ax, 'High-Order Polynomial\n(Overfitting)')

plt.suptitle('Hypothesis Class Complexity Trade-off', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()
\[ \mathcal{H}_{\text{linear}}: h(\mathbf{x}) = \text{sign}(\mathbf{w}^T\mathbf{x} + b) \]
\[ \mathcal{H}_{\text{neural}}: h(\mathbf{x}) = h_2(\mathbf{W}_2 \cdot h_1(\mathbf{W}_1\mathbf{x} + \mathbf{b}_1) + \mathbf{b}_2) \]

Perceptron: Linear Combination + Nonlinearity

Mathematical Model

\[ y = h\left(\sum_{i=1}^n w_i x_i + b\right) = h(\mathbf{w}^T\mathbf{x} + b) \]

where \(h\) is an activation function:

#| echo: false
"""
Perceptron architecture diagram showing inputs, weighted connections, summation, activation function, and output.
"""

#| fig-align: center

import matplotlib.patches as patches
import matplotlib.pyplot as plt

# Perceptron schematic drawn with raw patches on a blank axes: three input
# nodes plus a bias node feed a summation node, which feeds an activation
# node and then the output node. All coordinates are in data units of the
# axes whose limits are fixed at the bottom of the cell.
fig, ax = plt.subplots(figsize=(10, 6))

# Input nodes
input_y = [1.5, 2.5, 3.5]
for i, y in enumerate(input_y):
    circle = plt.Circle((2, y), 0.25, color='#E3F2FD', ec='#1976D2', linewidth=2)
    ax.add_patch(circle)
    # i is zero-based, so the labels read x_1, x_2, x_3
    ax.text(2, y, f'$x_{i+1}$', ha='center', va='center', fontsize=11, fontweight='bold')

# Bias node
bias_circle = plt.Circle((2, 0.5), 0.25, color='#F5F5F5', ec='#757575', linewidth=2)
ax.add_patch(bias_circle)
ax.text(2, 0.5, '1', ha='center', va='center', fontsize=11, fontweight='bold')

# Summation node
sum_circle = plt.Circle((5, 2.5), 0.35, color='#FFF9C4', ec='#F57C00', linewidth=2)
ax.add_patch(sum_circle)
ax.text(5, 2.5, '$\\Sigma$', ha='center', va='center', fontsize=14, fontweight='bold')

# Activation node
act_circle = plt.Circle((7, 2.5), 0.35, color='#FFE0B2', ec='#F57C00', linewidth=2)
ax.add_patch(act_circle)
ax.text(7, 2.5, '$h$', ha='center', va='center', fontsize=14, fontweight='bold')

# Output
output_circle = plt.Circle((9, 2.5), 0.25, color='#C8E6C9', ec='#4CAF50', linewidth=2)
ax.add_patch(output_circle)
ax.text(9, 2.5, '$y$', ha='center', va='center', fontsize=12, fontweight='bold')

# Connections from inputs to summation (spread out arrows)
weights = ['$w_1$', '$w_2$', '$w_3$']
for i, (y, w) in enumerate(zip(input_y, weights)):
    # Calculate angle to spread arrows: direction from the input node (x=2)
    # to the summation centre (x=5, y=2.5), i.e. a horizontal run of 3
    angle = np.arctan2(2.5 - y, 3)
    # Endpoint sits 0.15 away from (4.65, 2.5), the leftmost point of the
    # summation circle (centre x=5, radius 0.35); the extra (i-1)*pi/6
    # rotation gives each input a distinct landing point
    end_x = 4.65 - 0.15 * np.cos(angle + np.pi/6 * (i-1))
    end_y = 2.5 - 0.15 * np.sin(angle + np.pi/6 * (i-1))
    
    # Draw line to summation edge (starts at the right edge of the input node)
    ax.plot([2.25, end_x], [y, end_y], 'k-', linewidth=1.5)
    # Arrow head outside circle; drawn as a short +x arrow (dx=0.05, dy=0),
    # so the head is horizontal regardless of the slope of the line
    ax.arrow(end_x - 0.05, end_y, 0.05, 0, head_width=0.08, head_length=0.03, fc='black')
    # Weight label, placed 40% of the way from the input height to y=2.5
    mid_x = 3.3
    mid_y = y + (2.5 - y) * 0.4
    ax.text(mid_x, mid_y + 0.15, w, fontsize=10, color='#212121', ha='center')

# Bias connection (also spread): same endpoint construction as above, using
# the bias node's rise of 2 (0.5 -> 2.5) and a -pi/6 rotation
end_x_b = 4.65 - 0.15 * np.cos(np.arctan2(2, 3) - np.pi/6)
end_y_b = 2.5 - 0.15 * np.sin(np.arctan2(2, 3) - np.pi/6)
# Dashed and semi-transparent to set the bias apart from the weighted inputs
ax.plot([2.25, end_x_b], [0.5, end_y_b], 'k--', linewidth=1.5, alpha=0.7)
ax.arrow(end_x_b - 0.05, end_y_b, 0.05, 0, head_width=0.08, head_length=0.03, fc='black', alpha=0.7)
ax.text(3.3, 1.3, '$b$', fontsize=10, color='#212121', ha='center', style='italic')

# Summation to activation (starts at the right edge of the summation circle)
ax.arrow(5.35, 2.5, 1.3, 0, head_width=0.12, head_length=0.1, fc='black', linewidth=1.5)

# Activation to output (starts at the right edge of the activation circle)
ax.arrow(7.35, 2.5, 1.3, 0, head_width=0.12, head_length=0.1, fc='black', linewidth=1.5)

# Output arrow
ax.arrow(9.25, 2.5, 0.5, 0, head_width=0.12, head_length=0.1, fc='#4CAF50', ec='#4CAF50', linewidth=2)

# Fixed canvas for the hand-placed coordinates above; axes are hidden
ax.set_xlim(1, 10.5)
ax.set_ylim(0, 4.5)
ax.axis('off')

plt.tight_layout()
plt.show()

Activation Functions Add Nonlinearity

#| echo: false
"""
Common activation functions (ReLU, Sigmoid, Tanh, Leaky ReLU) and their derivatives, illustrating the nonlinear transformations and gradient behavior critical for deep network training.
"""

#| fig-align: center

# 2x4 grid: top row shows each activation, bottom row its derivative.
fig, axes = plt.subplots(2, 4, figsize=(14, 6))
x = np.linspace(-3, 3, 100)
sig = 1 / (1 + np.exp(-x))

# One entry per panel, in row-major order so it lines up with axes.flat:
# (title, curve values, line format, explicit y-limits or None)
panels = [
    ('ReLU', np.maximum(0, x), 'b-', (-0.5, 3)),
    ('Sigmoid', sig, 'r-', None),
    ('Tanh', np.tanh(x), 'g-', None),
    ('Leaky ReLU', np.where(x > 0, x, 0.1*x), 'm-', None),
    # Derivatives (dashed, same colour as the function above them)
    ("ReLU'", (x > 0).astype(float), 'b--', (-0.5, 1.5)),
    ("Sigmoid'", sig * (1 - sig), 'r--', None),
    ("Tanh'", 1 - np.tanh(x)**2, 'g--', None),
    ("Leaky ReLU'", np.where(x > 0, 1, 0.1), 'm--', (-0.5, 1.5)),
]

for ax, (title, y, fmt, ylim) in zip(axes.flat, panels):
    ax.plot(x, y, fmt, linewidth=2)
    ax.set_title(title, fontweight='bold')
    ax.grid(True, alpha=0.3)
    # Step-like curves get fixed limits so the jump is not glued to the frame
    if ylim is not None:
        ax.set_ylim(*ylim)

plt.suptitle('Activation Functions and Their Derivatives', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

Why Nonlinearity Matters

Without activation functions, stacking layers is pointless: \(f(\mathbf{W}_2 \mathbf{W}_1 \mathbf{x}) = f(\mathbf{W} \mathbf{x})\) where \(\mathbf{W} = \mathbf{W}_2\mathbf{W}_1\)

Later topic: Gradient flow and vanishing gradients during backpropagation

Closed-Form vs Iterative Optimization

Explicit (Closed-form)

\[ \mathbf{w}^* = \arg\min_{\mathbf{w}} \|\mathbf{y} - \mathbf{X}\mathbf{w}\|^2 \]

Solution:

\[ \mathbf{w}^* = (\mathbf{X}^T\mathbf{X})^{-1}\mathbf{X}^T\mathbf{y} \]

Iterative (Gradient-based)

\[ \mathbf{w}_{t+1} = \mathbf{w}_t - \eta \nabla_{\mathbf{w}}\mathcal{L}(\mathbf{w}_t) \]
#| echo: true
#| code-fold: false

def sgd_step(w, x, y, learning_rate=0.01):
    """Return the weights after one SGD step on a single sample.

    The model is linear (prediction = w . x); the step follows the gradient
    of the squared-error loss 0.5 * (w . x - y)**2, which is (w . x - y) * x.
    The input `w` is not modified.
    """
    residual = np.dot(w, x) - y
    return w - learning_rate * (residual * x)

Gradient Descent Visualization

#| echo: false
"""
Gradient descent optimization on a 2D loss surface with contour lines showing the path from initialization to the minimum, alongside a convergence plot of loss versus iteration.
"""

#| fig-align: center

def _quadratic_loss(a, b):
    """Bowl-shaped loss with its minimum at (1, 0.5); steeper along b."""
    return 0.5 * ((a - 1)**2 + 2*(b - 0.5)**2)


fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# --- Left panel: contours of the loss with the descent path on top ---
ax = axes[0]
w1 = np.linspace(-2, 3, 100)
w2 = np.linspace(-2, 3, 100)
W1, W2 = np.meshgrid(w1, w2)
Z = _quadratic_loss(W1, W2)

contour = ax.contour(W1, W2, Z, levels=20, cmap='viridis', alpha=0.6)
ax.clabel(contour, inline=True, fontsize=8)

# Plain gradient descent from a fixed start; the path keeps every iterate
np.random.seed(42)
w_init = np.array([-1.5, 2.5])
learning_rate = 0.1
current = w_init
path = [current]
for _ in range(20):
    # Analytic gradient of _quadratic_loss at the current iterate
    grad = np.array([current[0] - 1, 2*(current[1] - 0.5)])
    current = current - learning_rate * grad
    path.append(current)
path = np.array(path)

ax.plot(path[:, 0], path[:, 1], 'r.-', linewidth=2, markersize=8,
        label='GD Path', markeredgecolor='white', markeredgewidth=1)
ax.plot(1, 0.5, 'g*', markersize=15, label='Optimum', markeredgecolor='white', markeredgewidth=1)
ax.set_xlabel('$w_1$')
ax.set_ylabel('$w_2$')
ax.set_title('Gradient Descent on Loss Surface', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

# --- Right panel: loss at each iterate, on a log scale ---
ax = axes[1]
losses = [_quadratic_loss(*w) for w in path]
ax.plot(losses, 'o-', linewidth=2, markersize=6, color='#1976D2',
        markeredgecolor='white', markeredgewidth=1)
ax.set_xlabel('Iteration')
ax.set_ylabel('Loss')
ax.set_title('Loss Convergence', fontweight='bold')
ax.grid(True, alpha=0.3)
ax.set_yscale('log')

plt.tight_layout()
plt.show()

Iterative Optimization Principle

Gradient descent navigates the loss landscape by repeatedly stepping in the direction of steepest descent (the negative gradient). For convex problems with a suitably small learning rate, this converges to the global minimum. For neural networks the loss is non-convex, so we settle for local minima that generalize well.

The Bias-Variance Decomposition

Expected Prediction Error

\[ \text{MSE} = \text{Bias}^2 + \text{Variance} + \sigma^2 \]

Bias: Error from wrong model assumptions

Variance: Error from sensitivity to training data

Irreducible error (\(\sigma^2\)): Noise inherent in data

#| echo: false
"""
Bullseye target diagram showing four bias-variance scenarios: low/high bias combined with low/high variance, where point clustering represents variance and distance from center represents bias.
"""

#| fig-align: center

fig, ax = plt.subplots(figsize=(8, 8))

# 2x2 grid of targets: (label, centre x, centre y, point colour)
scenarios = [
    ('Low Bias\nLow Variance', 1, 3, '#2E7D32'),
    ('Low Bias\nHigh Variance', 3, 3, '#F57C00'),
    ('High Bias\nLow Variance', 1, 1, '#1976D2'),
    ('High Bias\nHigh Variance', 3, 1, '#C62828')
]

# How each scenario's shots are generated:
# label -> (shift of the cluster away from the centre, std-dev of the cluster)
cluster_shape = {
    'Low Bias\nLow Variance': (0.0, 0.1),    # tight and centred
    'Low Bias\nHigh Variance': (0.0, 0.4),   # scattered but centred
    'High Bias\nLow Variance': (0.4, 0.1),   # tight but off-centre
    'High Bias\nHigh Variance': (0.5, 0.4),  # scattered and off-centre
}

for label, x, y, color in scenarios:
    # Two concentric rings make up the target
    for r in (0.3, 0.6):
        circle = plt.Circle((x, y), r, fill=False, edgecolor='black', linewidth=2)
        ax.add_patch(circle)

    # Re-seed per target so every scenario reuses the same underlying draws
    np.random.seed(42)
    offset, spread = cluster_shape[label]
    points_x = x - offset + np.random.normal(0, spread, 10)
    points_y = y - offset + np.random.normal(0, spread, 10)

    ax.scatter(points_x, points_y, c=color, s=50, alpha=0.6, edgecolors='black')
    ax.text(x, y - 1, label, ha='center', fontsize=10, fontweight='bold')

ax.set_xlim(0, 4)
ax.set_ylim(0, 4)
ax.set_aspect('equal')
ax.axis('off')
ax.set_title('Target: Center of Bullseye', fontweight='bold', fontsize=14)

plt.tight_layout()
plt.show()

Tradeoff: Complex models reduce bias but increase variance

Bias-Variance in Practice: Polynomial Fitting

#| echo: false
"""
Polynomial fits of degrees 1, 3, 8, and 14 to noisy data, illustrating the progression from underfitting (high bias) to overfitting (high variance).
"""

#| fig-align: center

# Four polynomial fits (degrees 1, 3, 8, 14) to one shared noisy sample,
# one panel per degree, to show the move from underfitting to overfitting.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))


def y_true_func(x):
    """Noise-free target: a sinusoid on top of a linear trend."""
    return np.sin(x) + 0.5 * x


# True function on a dense grid (drawn as the dashed reference curve)
X_true = np.linspace(0, 10, 200)
y_true = y_true_func(X_true)

# Generate the SAME data points for all models
np.random.seed(42)
n_data = 15  # Use same 15 points for all
X_data = np.sort(np.random.uniform(0.5, 9.5, n_data))
y_data_clean = y_true_func(X_data)
y_data = y_data_clean + np.random.normal(0, 1.2, n_data)  # Consistent noise

# Add boundary points (not plotted) to control edge behavior
X_boundary_left = np.array([-1.0])
y_boundary_left = y_true_func(X_boundary_left)
X_boundary_right = np.array([11.0])
y_boundary_right = y_true_func(X_boundary_right)

# Everything below is identical for all four panels, so build it once:
# the fitting set (data + boundary points) and the dense evaluation grid.
X_fit = np.concatenate([X_boundary_left, X_data, X_boundary_right])
y_fit = np.concatenate([y_boundary_left, y_data, y_boundary_right])
X_plot = np.linspace(0, 10, 500)
y_true_plot = y_true_func(X_plot)

# (polynomial degree, panel title, curve colour)
configs = [
    (1, 'Underfitting (High Bias)', '#C62828'),
    (3, 'Good Fit', '#2E7D32'),
    (8, 'Slight Overfitting', '#F57C00'),
    (14, 'Severe Overfitting (High Variance)', '#7B1FA2')
]

for ax, (degree, title, color) in zip(axes.flat, configs):
    # Fit polynomial; the degree is capped so it never exceeds what the
    # number of fitting points can determine
    coeffs = np.polyfit(X_fit, y_fit, min(degree, len(X_fit)-1))
    poly = np.poly1d(coeffs)
    y_pred = poly(X_plot)

    # Plot data (not boundary points)
    ax.scatter(X_data, y_data, s=50, color='black', zorder=5, alpha=0.7, label='Training Data')
    ax.plot(X_true, y_true, 'g--', linewidth=2, label='True Function', alpha=0.5)
    ax.plot(X_plot, y_pred, '-', linewidth=2.5, label=f'Polynomial (degree={degree})', color=color)

    # Shade the gap between prediction and true function, skipping regions
    # where the fit blows up so the shading stays readable
    ax.fill_between(X_plot, y_pred, y_true_plot,
                    where=(np.abs(y_pred - y_true_plot) < 10),  # Clip extreme values
                    alpha=0.15, color='blue', linewidth=0)

    # For overfitting cases, add vertical lines at data points to show fitting
    if degree >= 8:
        for x, y in zip(X_data, y_data):
            pred_y = poly(x)
            ax.plot([x, x], [y, pred_y], ':', color=color, alpha=0.5, linewidth=1)

    ax.set_xlabel('Input', fontsize=11)
    ax.set_ylabel('Output', fontsize=11)
    ax.set_title(title, fontweight='bold', fontsize=12)
    ax.legend(loc='upper left', fontsize=9)
    ax.grid(True, alpha=0.3)
    ax.set_xlim(0, 10)
    ax.set_ylim(-2, 12)

    # Add text annotations for key insights. Both anchors must lie inside
    # the y-limits set above: text is not clipped to the axes, so an anchor
    # below -2 would be drawn over the x-axis tick labels.
    if 'Underfitting' in title:
        ax.text(7.5, -1, 'Cannot capture\nunderlying pattern', ha='center', fontsize=9,
                bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
    elif 'Severe' in title:
        ax.text(5, 8, 'Wild oscillations\nbetween data points', ha='center', fontsize=9,
                bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

plt.suptitle('Polynomial Fitting: From Underfitting to Overfitting', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

Increasing complexity: bias decreases, variance increases

EE 541 Core Principles

Theory

  1. Learning = Function Approximation

    • From data to predictions
    • Hypothesis class defines possibilities
  2. Representation Matters

    • Same data, different encodings
    • Deep learning learns representations
  3. Generalization is the Goal

    • Not memorization
    • Balance complexity with data

Implementation

  1. Start Simple

    • Linear models as baselines
    • Add complexity purposefully
  2. Iterate and Validate

    • Gradient descent scales
    • Monitor train vs test error
  3. EE 541 Progression

    • MMSE → Regression → Neural Nets
    • Theory + PyTorch implementation

Listen to the Data

---

Data Quality Dominates Quantity

Clive Humby (2006)

"Data is the new oil"

But like oil, it must be refined to have value

\[ \text{Model Performance} = f(\text{Data Quality}, \text{Data Quantity}) \]

Illustrative Example: Data Refinement Impact

Note: Specific percentages vary by application, but quality improvement consistently outperforms quantity alone.

#| echo: false
"""
Performance curves showing how high-quality data reaches target accuracy with far fewer samples than low-quality data, illustrating that data quality dominates at small scales while quantity helps at larger scales.
"""

#| fig-align: center

import matplotlib.pyplot as plt
import numpy as np
import matplotlib.patches as mpatches

fig, ax = plt.subplots(figsize=(10, 8))

# Dataset sizes on a log grid
data_sizes = np.logspace(2, 6, 50)  # 100 to 1M samples


def _saturating_curve(ceiling, scale):
    """Illustrative performance curve that rises towards `ceiling`."""
    return ceiling * (1 - np.exp(-data_sizes/scale))


# Better data => higher ceiling and a faster approach to it
perf_high_quality = _saturating_curve(95, 10000)   # Saturates at 95%
perf_med_quality = _saturating_curve(80, 30000)    # Saturates at 80%
perf_low_quality = _saturating_curve(60, 100000)   # Saturates at 60%

for curve, fmt, name in [
    (perf_high_quality, 'g-', 'High Quality Data'),
    (perf_med_quality, 'b-', 'Medium Quality Data'),
    (perf_low_quality, 'r-', 'Low Quality Data'),
]:
    ax.semilogx(data_sizes, curve, fmt, linewidth=3, label=name)

# Reference line for the target performance level
ax.axhline(y=90, color='gray', linestyle='--', alpha=0.5)
ax.text(1000, 92, 'Target Performance', fontsize=10, color='gray')

# Shaded regimes: (x-range, label position, caption, colour)
for span, label_x, caption, shade in [
    ([100, 10000], 1000, 'Quality\nDominates', 'green'),
    ([10000, 1000000], 100000, 'Quantity\nHelps', 'blue'),
]:
    ax.fill_between(span, 0, 100, alpha=0.1, color=shade)
    ax.text(label_x, 50, caption, fontsize=12, fontweight='bold', ha='center', color=shade)

ax.set_xlabel('Dataset Size (number of samples)', fontsize=12)
ax.set_ylabel('Model Performance (%)', fontsize=12)
ax.set_title('The Data Quality vs Quantity Trade-off', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)
ax.legend(loc='lower right', fontsize=11)
ax.set_ylim(0, 100)

plt.tight_layout()
plt.show()

Representation Transforms Problem Difficulty

#| echo: false
"""
Concentric circle data shown in three coordinate systems: original Cartesian (linearly inseparable), polar (separable by radius), and polynomial features (linearly separable).
"""

#| fig-align: center

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_circles
from sklearn.preprocessing import PolynomialFeatures

# Two noisy concentric rings; the global seed makes the sample reproducible
np.random.seed(42)
X, y = make_circles(n_samples=200, noise=0.1, factor=0.5)

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

class_colors = {0: '#1976D2', 1: '#C62828'}


def _scatter_by_class(target_ax, horiz, vert):
    """Scatter the points of each class in its own colour."""
    for cls, color in class_colors.items():
        members = y == cls
        target_ax.scatter(horiz[members], vert[members], c=color, alpha=0.6, s=30)


# Panel 1: raw Cartesian coordinates, with the two axis lines drawn dashed
ax = axes[0]
_scatter_by_class(ax, X[:, 0], X[:, 1])
ax.set_title('Original Space\n(Linearly Inseparable)', fontweight='bold')
ax.set_xlabel('$x_1$')
ax.set_ylabel('$x_2$')
ax.grid(True, alpha=0.3)
ax.axhline(y=0, color='#2E7D32', linestyle='--', linewidth=2, alpha=0.5)
ax.axvline(x=0, color='#2E7D32', linestyle='--', linewidth=2, alpha=0.5)

# Panel 2: polar coordinates, with a vertical line at r = 0.7 as the boundary
ax = axes[1]
r = np.sqrt(X[:, 0]**2 + X[:, 1]**2)
theta = np.arctan2(X[:, 1], X[:, 0])
_scatter_by_class(ax, r, theta)
ax.set_title('Polar Coordinates\n(Radius Separable)', fontweight='bold')
ax.set_xlabel('Radius $r$')
ax.set_ylabel('Angle $\\theta$')
ax.grid(True, alpha=0.3)
ax.axvline(x=0.7, color='#2E7D32', linestyle='--', linewidth=2, alpha=0.7, label='Decision boundary')
ax.legend()

# Panel 3: squared features. With degree=2 and no bias column the expansion
# is [x1, x2, x1^2, x1*x2, x2^2], so columns 2 and 4 are the pure squares.
ax = axes[2]
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)
_scatter_by_class(ax, X_poly[:, 2], X_poly[:, 4])
ax.set_title('Polynomial Features\n(Linearly Separable)', fontweight='bold')
ax.set_xlabel('$x_1^2$')
ax.set_ylabel('$x_2^2$')
ax.grid(True, alpha=0.3)
# Straight line x1^2 + x2^2 = 0.5 in the squared-feature plane
x_line = np.linspace(-0.2, 1.5, 100)
ax.plot(x_line, 0.5 - x_line, 'g--', linewidth=2, alpha=0.7)

plt.suptitle('Same Data, Different Representations', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()
Concentric circles: linearly inseparable in Cartesian coordinates, but trivially separable by radius in polar coordinates. Deep learning automates this search for effective representations.

High Dimensions Break Geometric Intuition

The Curse of Dimensionality

As \(d \to \infty\):

#| echo: true
#| code-fold: true

import numpy as np

def volume_ratio(d, epsilon=0.95):
    """Fraction of a d-dimensional hypercube's volume in its outer shell.

    The inner cube with side scaled by `epsilon` holds epsilon**d of the
    volume; the shell is whatever remains.
    """
    inner_fraction = epsilon ** d
    return 1 - inner_fraction


dimensions = [1, 2, 3, 10, 100, 1000]
for d, ratio in zip(dimensions, map(volume_ratio, dimensions)):
    print(f"d={d:4}: {ratio:.6f} in outer shell")
#| echo: false
"""
Hypersphere volume shrinking relative to enclosing hypercube as dimensions increase from 2D to 100D, illustrating the curse of dimensionality.
"""

#| fig-align: center

import matplotlib.pyplot as plt
from matplotlib.patches import Circle, Rectangle
import numpy as np

# Hypersphere visualization
# Three schematic panels (2D, "10D", "100D") of a sphere inside its cube
fig, axes = plt.subplots(1, 3, figsize=(10, 3.5))

# Corner path of the square [-1, 1] x [-1, 1], closed back on its first point
square_outline = ([-1, 1, 1, -1, -1], [-1, -1, 1, 1, -1])


def _finish_panel(panel, title):
    """Apply the frame, aspect, title and grid shared by all three panels."""
    panel.set_xlim(-1.5, 1.5)
    panel.set_ylim(-1.5, 1.5)
    panel.set_aspect('equal')
    panel.set_title(title, fontsize=11)
    panel.grid(True, alpha=0.3)


# 2D: unit circle inscribed in the square
ax = axes[0]
circle = Circle((0, 0), 1, fill=False, edgecolor='#8B0000', linewidth=2)
square = Rectangle((-1, -1), 2, 2, fill=False, edgecolor='#1976D2', linewidth=2)
for outline in (circle, square):
    ax.add_patch(outline)
_finish_panel(ax, '2D: Circle fills square')

# 10D representation: shaded disc plus the quoted volume share
ax = axes[1]
x = np.linspace(-1, 1, 100)
y = np.sqrt(1 - x**2)
ax.fill_between(x, -y, y, alpha=0.3, color='#8B0000')
ax.plot(*square_outline, 'b-', linewidth=2)
ax.text(0, 0, 'Vol = 0.25%', ha='center', fontsize=10, fontweight='bold')
_finish_panel(ax, '10D: Sphere vanishes')

# 100D representation: the sphere is reduced to a single small dot
ax = axes[2]
ax.plot(*square_outline, 'b-', linewidth=2)
ax.scatter([0], [0], s=5, color='#8B0000')
ax.text(0, -0.2, 'Vol ≈ 0', ha='center', fontsize=10, fontweight='bold')
_finish_panel(ax, '100D: Sphere invisible')

plt.suptitle('Hypersphere Volume in Unit Hypercube', fontweight='bold', y=1.05)
plt.tight_layout()
plt.show()
#| echo: false
"""
Volume concentration in a thin outer shell and pairwise distance concentration as dimensionality increases, illustrating the curse of dimensionality.
"""

#| fig-align: center

# Two stacked panels: (top) share of cube volume in the outer shell versus
# dimension, (bottom) spread of pairwise distances for Gaussian samples.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 10))

# Volume concentration
d_range = np.arange(1, 101)
volume_ratios = [volume_ratio(d, 0.95) for d in d_range]
ax1.plot(d_range, volume_ratios, linewidth=2, color='#8B0000')
ax1.set_xlabel('Dimensions')
ax1.set_ylabel('Fraction in Outer 5% Shell')
ax1.set_title('Volume Concentration in High Dimensions', fontweight='bold')
ax1.grid(True, alpha=0.3)
ax1.axhline(y=0.99, color='gray', linestyle='--', alpha=0.7)
ax1.text(50, 0.92, '99% of volume', fontsize=10)

# Distance concentration
np.random.seed(42)
dims = [2, 10, 50, 100, 500]
distances = []
for d in dims:
    X = np.random.randn(100, d)
    # Broadcasting builds all 100x100 difference vectors at once
    dist_matrix = np.sqrt(((X[:, None] - X[None, :])**2).sum(-1))
    # Keep each unordered pair once (strict upper triangle, no self-distances)
    distances.append(dist_matrix[np.triu_indices_from(dist_matrix, k=1)])

# Tick labels are set separately rather than through boxplot's `labels`
# keyword: that keyword is deprecated since Matplotlib 3.9 (renamed to
# `tick_labels`), and this file silences warnings globally, so the
# deprecation would go unnoticed until the keyword is removed.
ax2.boxplot(distances)
ax2.set_xticklabels([f'd={d}' for d in dims])
ax2.set_ylabel('Pairwise Distances')
ax2.set_title('Distance Concentration Effect', fontweight='bold')
ax2.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

Label Noise Degrades Performance More Than Limited Data

#| echo: false
"""
Learning curves showing how label noise creates accuracy ceilings that more data cannot overcome, alongside CIFAR-10 bar chart demonstrating the super-linear impact of noise on model performance.
"""

#| fig-align: center

# Two-panel figure on label noise: (left) synthetic learning curves from a
# closed-form accuracy model, (right) a bar chart of hard-coded accuracies.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Nothing in this cell draws random numbers; the seed only resets global state
np.random.seed(42)

# Learning curves with different noise levels
ax = axes[0]
sizes = np.logspace(3, 5, 50).astype(int)  # 1K to 100K samples

# Different label noise levels: (noise fraction, line colour, line style)
for noise_level, color, style in [(0, '#2E7D32', '-'),
                                  (0.05, '#1976D2', '--'),
                                  (0.15, '#F57C00', '-.'),
                                  (0.30, '#C62828', ':')]:
    # Accuracy model: asymptotic with noise ceiling
    max_acc = 1.0 - 0.8 * noise_level  # Noise creates accuracy ceiling
    # Exponential approach to the ceiling; more noise also stretches the
    # size scale, so noisy curves rise more slowly
    accuracy = max_acc * (1 - np.exp(-sizes / (10000 * (1 + 2*noise_level))))

    label = f'{int(noise_level*100)}% label noise'
    ax.semilogx(sizes, accuracy, style, color=color, linewidth=2.5, label=label)

# Mark key insight points
ax.axhline(y=0.9, color='gray', linestyle='--', alpha=0.3)
ax.text(1100, 0.91, '90% accuracy threshold', fontsize=9, color='gray')

# Annotations showing data requirements
# NOTE(review): the coordinates below are hand-placed, not derived from the
# curves. With the formula above the 0%-noise curve is about 0.26 at 3K
# samples and only crosses 0.9 near 23K, and every curve is below the 0.5
# y-limit until roughly 7K samples — confirm the intended message/placement.
ax.annotate('', xy=(3000, 0.9), xytext=(3000, 0.78),
            arrowprops=dict(arrowstyle='<->', color='#2E7D32', lw=1.5))
ax.text(4000, 0.84, '3K clean\nsamples', fontsize=9, color='#2E7D32')

ax.annotate('', xy=(50000, 0.85), xytext=(50000, 0.68),
            arrowprops=dict(arrowstyle='<->', color='#C62828', lw=1.5))
ax.text(60000, 0.76, '50K noisy\nsamples\n(never reaches\n90%)', fontsize=9, color='#C62828')

ax.set_xlabel('Dataset Size (log scale)', fontsize=12)
ax.set_ylabel('Test Accuracy', fontsize=12)
ax.set_title('Impact of Label Noise on Learning', fontsize=13, fontweight='bold')
ax.legend(loc='lower right', fontsize=10)
ax.grid(True, alpha=0.3)
ax.set_xlim(1000, 100000)
ax.set_ylim(0.5, 1.0)

# Concrete example: CIFAR-10 with synthetic noise
# The accuracies and error bars are literals in this cell; their source is
# not shown here (TODO confirm provenance before citing them as measured).
ax = axes[1]
noise_levels = [0, 0.1, 0.2, 0.3, 0.4]
accuracies = [0.94, 0.87, 0.79, 0.68, 0.55]
error_bars = [0.01, 0.02, 0.03, 0.04, 0.05]

# One colour per bar, green (clean) through red (noisiest)
colors_gradient = ['#2E7D32', '#66BB6A', '#FFA726', '#FF7043', '#C62828']
bar_positions = np.arange(len(noise_levels))

bars = ax.bar(bar_positions, accuracies, yerr=error_bars, 
              color=colors_gradient, alpha=0.8, capsize=5)

# Add value labels on bars (moved higher to avoid overlap)
for i, (pos, acc) in enumerate(zip(bar_positions, accuracies)):
    # Offset by the error bar plus a margin so the label clears the cap
    ax.text(pos, acc + error_bars[i] + 0.04, f'{acc:.2f}', 
            ha='center', fontsize=10, fontweight='bold')

ax.set_xticks(bar_positions)
ax.set_xticklabels([f'{int(n*100)}%' for n in noise_levels])
ax.set_xlabel('Label Noise Level', fontsize=12)
ax.set_ylabel('Test Accuracy (ResNet-18)', fontsize=12)
ax.set_title('CIFAR-10: Actual Impact of Label Noise', fontsize=13, fontweight='bold')
ax.set_ylim(0, 1.05)
ax.grid(True, alpha=0.3, axis='y')

# Add key insight box (positioned in axes-fraction coordinates); the quoted
# drops match the list above: 0.94-0.87 = 0.07 and 0.94-0.68 = 0.26
ax.text(0.5, 0.25, '10% noise → 7% accuracy drop\n30% noise → 26% accuracy drop\n\nNoise impact is super-linear',
        transform=ax.transAxes, fontsize=10,
        bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8),
        ha='center')

plt.tight_layout()
plt.show()

Amazon Resume Screening: Training on Biased Data

Setup (2014-2017):

What went wrong:

The data:

Historical hires: 85% male, 15% female
Model learned: male-coded patterns = higher rating

System scrapped in 2018.

#| echo: false
"""
Stacked bar chart showing how gender bias amplifies through the ML pipeline: from 85/15% male/female in training data to 92/8% in model predictions.
"""

#| fig-align: center

fig, ax = plt.subplots(figsize=(10, 8))

# Gender split at each stage of the pipeline (each pair sums to 100)
categories = ['Training\nData', 'Model\nLearns', 'Predictions\nAmplify']
male_pct = [85, 88, 92]
female_pct = [15, 12, 8]

x = np.arange(len(categories))
width = 0.5

# Stacked bars: the female share sits on top of the male share
bars1 = ax.bar(x, male_pct, width, label='Male candidates', color='#1976D2', alpha=0.7)
bars2 = ax.bar(x, female_pct, width, bottom=male_pct, label='Female candidates', color='#C62828', alpha=0.7)

ax.set_ylabel('Percentage (%)', fontsize=12)
ax.set_title('Bias Amplification Through Training', fontweight='bold', fontsize=14)
ax.set_xticks(x)
ax.set_xticklabels(categories, fontsize=11)
ax.set_ylim(0, 100)
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3, axis='y')

# Percentage labels, centred vertically inside each bar segment
label_style = dict(ha='center', va='center', fontweight='bold', fontsize=12, color='white')
for i, (m, f) in enumerate(zip(male_pct, female_pct)):
    for height, pct in ((m/2, m), (m + f/2, f)):
        ax.text(i, height, f'{pct}%', **label_style)

# Arrow across the stages with a caption above it
ax.annotate('', xy=(2, 50), xytext=(0, 50),
            arrowprops=dict(arrowstyle='->', lw=2, color='red'))
ax.text(1, 55, 'Model amplifies existing bias', ha='center', fontsize=11,
        bbox=dict(boxstyle='round,pad=0.3', facecolor='yellow', alpha=0.4))

plt.tight_layout()
plt.show()

Why this matters:

The model did exactly what it was trained to do - replicate patterns in historical data.

The problem: historical data reflected real-world bias.

Clean data ≠ unbiased data

Data Augmentation: Synthetic Diversity from Limited Samples

Standard Augmentations

Mathematical View

Training on augmented data:

\[ \min_\theta \sum_{i=1}^N \sum_{j=1}^M \mathcal{L}(f_\theta(T_j(x_i)), y_i) \]

where \(T_j\) are augmentation transforms

#| echo: false
"""
Data augmentation examples showing a handwritten digit transformed via rotation, translation, and noise addition.
"""

#| fig-align: center

from sklearn.datasets import load_digits
from scipy.ndimage import rotate, shift

# 2x4 grid: original + three rotations on top, three shifts + noise below
fig, axes = plt.subplots(2, 4, figsize=(10, 5))

digits = load_digits()
original = digits.images[0]

# Original
axes[0, 0].imshow(original, cmap='gray_r')
axes[0, 0].set_title('Original', fontsize=10)
axes[0, 0].axis('off')

# Rotations (reshape=False keeps the output the same size as the input)
for i, angle in enumerate([15, -15, 30]):
    rotated = rotate(original, angle, reshape=False)
    axes[0, i+1].imshow(rotated, cmap='gray_r')
    axes[0, i+1].set_title(f'Rotate {angle}°', fontsize=10)
    axes[0, i+1].axis('off')

# Translations
for i, (dx, dy) in enumerate([(1, 0), (0, 1), (-1, -1)]):
    # ndimage.shift expects one offset per array axis, i.e. (rows, cols).
    # Rows are the vertical direction, so the (dx, dy) pair must be passed
    # as (dy, dx) for the panel title to match what is drawn.
    shifted = shift(original, (dy, dx))
    axes[1, i].imshow(shifted, cmap='gray_r')
    axes[1, i].set_title(f'Shift ({dx},{dy})', fontsize=10)
    axes[1, i].axis('off')

# Noise: additive Gaussian with standard deviation 2 on every pixel
noisy = original + np.random.normal(0, 2, original.shape)
axes[1, 3].imshow(noisy, cmap='gray_r')
axes[1, 3].set_title('Add Noise', fontsize=10)
axes[1, 3].axis('off')

plt.suptitle('Data Augmentation Examples', fontweight='bold')
plt.tight_layout()
plt.show()

ML Learning Paradigms

---

Three Paradigms: Supervised, Unsupervised, Reinforcement

#| echo: false
"""
Three-panel comparison of learning paradigms: supervised (input-output pairs training a model), unsupervised (unlabeled data finding structure), and reinforcement learning (agent-environment interaction loop).
"""

#| fig-align: center

import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np

# Three side-by-side schematics, one per learning paradigm. Each panel is a
# hidden-axes 10x10 canvas on which boxes, circles and arrows are placed by
# hand in data coordinates.
fig, axes = plt.subplots(1, 3, figsize=(16, 5.5))

# Supervised Learning
ax = axes[0]
ax.set_xlim(0, 10)
ax.set_ylim(0, 10)
ax.axis('off')

# Data pairs section: faint background box grouping the (x, y) pairs
data_box = patches.FancyBboxPatch((0.5, 2), 4, 6, boxstyle="round,pad=0.05",
                                  edgecolor='#E0E0E0', facecolor='#FAFAFA', linewidth=1.5, alpha=0.3)
ax.add_patch(data_box)
ax.text(2.5, 8.3, 'Training Data', ha='center', fontsize=10, style='italic')

# Input-output pairs: three rows stacked downwards, 1.8 units apart
for i in range(3):
    y_pos = 6.5 - i*1.8
    # Input
    rect_in = patches.FancyBboxPatch((1, y_pos), 1.2, 0.8, boxstyle="round,pad=0.03",
                                     edgecolor='#1976D2', facecolor='#E3F2FD', linewidth=2)
    ax.add_patch(rect_in)
    # Doubled braces are literal LaTeX braces; only {i+1} is interpolated
    ax.text(1.6, y_pos+0.4, f'$\\mathbf{{x}}_{{{i+1}}}$', ha='center', va='center', fontsize=11)
    
    # Arrow
    ax.arrow(2.25, y_pos+0.4, 0.5, 0, head_width=0.12, head_length=0.08, fc='#424242', ec='#424242')
    
    # Output
    rect_out = patches.FancyBboxPatch((2.8, y_pos), 1.2, 0.8, boxstyle="round,pad=0.03",
                                      edgecolor='#388E3C', facecolor='#E8F5E9', linewidth=2)
    ax.add_patch(rect_out)
    ax.text(3.4, y_pos+0.4, f'$y_{{{i+1}}}$', ha='center', va='center', fontsize=11)

# Model
rect_model = patches.FancyBboxPatch((5.5, 3.5), 2.5, 3, boxstyle="round,pad=0.08",
                                    edgecolor='#D32F2F', facecolor='#FFEBEE', linewidth=2.5)
ax.add_patch(rect_model)
ax.text(6.75, 5, 'Model\n$f(\\mathbf{x}; \\theta)$', ha='center', va='center', fontsize=11, fontweight='bold')

# Training arrow: from the data box to the model box
arrow_patch = patches.FancyArrowPatch((4.5, 5), (5.4, 5), mutation_scale=25, 
                                      color='#D32F2F', linewidth=2.5, arrowstyle='->')
ax.add_patch(arrow_patch)
ax.text(4.95, 5.4, 'Learn', fontsize=10, fontweight='bold', ha='center')

ax.set_title('Supervised Learning', fontsize=13, fontweight='bold', pad=15)
ax.text(5, 1.2, 'Learns from labeled examples', ha='center', fontsize=9, style='italic', color='#616161')

# Unsupervised Learning
ax = axes[1]
ax.set_xlim(0, 10)
ax.set_ylim(0, 10)
ax.axis('off')

# Just inputs section: background box for the unlabeled samples
data_box = patches.FancyBboxPatch((0.5, 2), 2.5, 6, boxstyle="round,pad=0.05",
                                  edgecolor='#E0E0E0', facecolor='#FAFAFA', linewidth=1.5, alpha=0.3)
ax.add_patch(data_box)
ax.text(1.75, 8.3, 'Unlabeled Data', ha='center', fontsize=10, style='italic')

# Just inputs: four rows stacked downwards, 1.3 units apart, no outputs
for i in range(4):
    y_pos = 7 - i*1.3
    rect_in = patches.FancyBboxPatch((1, y_pos), 1.5, 0.8, boxstyle="round,pad=0.03",
                                     edgecolor='#1976D2', facecolor='#E3F2FD', linewidth=2)
    ax.add_patch(rect_in)
    ax.text(1.75, y_pos+0.4, f'$\\mathbf{{x}}_{{{i+1}}}$', ha='center', va='center', fontsize=11)

# Model finding structure
rect_model = patches.FancyBboxPatch((4, 3.5), 2.5, 3, boxstyle="round,pad=0.08",
                                    edgecolor='#7B1FA2', facecolor='#F3E5F5', linewidth=2.5)
ax.add_patch(rect_model)
ax.text(5.25, 5, 'Discover\nPatterns', ha='center', va='center', fontsize=11, fontweight='bold')

# Clusters output: three coloured discs labelled C1..C3 inside a white box
cluster_box = patches.FancyBboxPatch((7.5, 3), 2, 4, boxstyle="round,pad=0.05",
                                     edgecolor='#E0E0E0', facecolor='white', linewidth=1.5, alpha=0.8)
ax.add_patch(cluster_box)
for i, (color, y) in enumerate(zip(['#FF6F00', '#00ACC1', '#FFD600'], [6, 5, 4])):
    circle = patches.Circle((8.5, y), 0.35, color=color, alpha=0.8, ec='white', linewidth=1)
    ax.add_patch(circle)
    ax.text(8.5, y, f'C{i+1}', ha='center', va='center', fontsize=9, fontweight='bold', color='white')

# Flow arrows: data -> model, then model -> clusters
arrow_patch = patches.FancyArrowPatch((3, 5), (3.9, 5), mutation_scale=25,
                                      color='#7B1FA2', linewidth=2.5, arrowstyle='->')
ax.add_patch(arrow_patch)
arrow_patch2 = patches.FancyArrowPatch((6.6, 5), (7.4, 5), mutation_scale=25,
                                       color='#7B1FA2', linewidth=2.5, arrowstyle='->')
ax.add_patch(arrow_patch2)

ax.set_title('Unsupervised Learning', fontsize=13, fontweight='bold', pad=15)
ax.text(5, 1.2, 'Discovers structure without labels', ha='center', fontsize=9, style='italic', color='#616161')

# Reinforcement Learning
ax = axes[2]
ax.set_xlim(0, 10)
ax.set_ylim(0, 10)
ax.axis('off')

# Agent
agent_circle = patches.Circle((2.5, 5), 1, facecolor='#FF5252', edgecolor='#B71C1C', linewidth=2.5)
ax.add_patch(agent_circle)
ax.text(2.5, 5, 'Agent\n$\\pi(a|s)$', ha='center', va='center', fontsize=11, fontweight='bold', color='white')

# Environment
env_box = patches.FancyBboxPatch((5.5, 3), 3.5, 4, boxstyle="round,pad=0.08",
                                 edgecolor='#00ACC1', facecolor='#E0F7FA', linewidth=2.5)
ax.add_patch(env_box)
ax.text(7.25, 5, 'Environment', ha='center', va='center', fontsize=12, fontweight='bold')

# Interaction arrows with better positioning: three parallel arrows at
# different heights so their labels do not collide
# Action (agent -> environment)
arrow1 = patches.FancyArrowPatch((3.4, 5.5), (5.4, 5.5), mutation_scale=20,
                                color='#1565C0', linewidth=2, arrowstyle='->')
ax.add_patch(arrow1)
ax.text(4.4, 5.9, 'Action $a_t$', fontsize=10, ha='center')

# State (environment -> agent)
arrow2 = patches.FancyArrowPatch((5.4, 4.8), (3.4, 4.8), mutation_scale=20,
                                color='#2E7D32', linewidth=2, arrowstyle='->')
ax.add_patch(arrow2)
ax.text(4.4, 4.4, 'State $s_t$', fontsize=10, ha='center')

# Reward (environment -> agent, dashed)
arrow3 = patches.FancyArrowPatch((5.4, 4), (3.4, 4), mutation_scale=20,
                                color='#F57C00', linewidth=2, arrowstyle='->', linestyle='dashed')
ax.add_patch(arrow3)
ax.text(4.4, 3.6, 'Reward $r_t$', fontsize=10, ha='center')

ax.set_title('Reinforcement Learning', fontsize=13, fontweight='bold', pad=15)
ax.text(5, 1.2, 'Learns through trial and feedback', ha='center', fontsize=9, style='italic', color='#616161')

plt.tight_layout()
plt.show()

Modern methods combine paradigms: GPT-4 uses unsupervised pre-training on text, supervised fine-tuning on tasks, and reinforcement learning from human feedback (RLHF).

Supervised Learning: Labeled Data to Function Mapping

Problem Formulation

Given: \(\mathcal{D} = \{(\mathbf{x}_i, y_i)\}_{i=1}^N\)

Learn: \(f: \mathcal{X} \to \mathcal{Y}\)

Minimize: \(\mathcal{L}(f(\mathbf{x}), y)\)

Core Tasks

Modern Applications

#| echo: true
#| code-fold: false

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Synthetic binary task: labels follow a random linear rule plus small noise.
np.random.seed(42)
n_samples, n_features = 1000, 10
X = np.random.randn(n_samples, n_features)
w_true = np.random.randn(n_features)
label_noise = np.random.randn(n_samples)*0.1
y = ((X @ w_true + label_noise) > 0).astype(int)

# Hold out 20% of the samples; the model never sees them during fitting.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training split only.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Compare accuracy on seen (train) vs. unseen (test) samples.
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)
train_acc = accuracy_score(y_train, train_pred)
test_acc = accuracy_score(y_test, test_pred)

print(f"Train accuracy: {train_acc:.3f}")
print(f"Test accuracy: {test_acc:.3f}")

Supervised Learning Training and Inference

Linear Regression Task

Training Phase

Training Phase

Inference Phase

Inference Phase

Unsupervised Learning: Structure from Unlabeled Data

No Labels, Just Data

Given: \(\mathcal{D} = \{\mathbf{x}_i\}_{i=1}^N\)

Find: Hidden patterns, structure, representations

Key Methods

Unsupervised Learning System

#| echo: false
"""
Four-panel comparison of unsupervised learning methods: raw unlabeled data, K-means clustering with centroids, PCA dimensionality reduction from 50D to 2D, and kernel density estimation.
"""

#| fig-align: center

from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

fig, axes = plt.subplots(2, 2, figsize=(10, 10))

# Generate data with hidden structure (labels are kept only for reference;
# the clustering and density panels below never see them)
X, true_labels = make_blobs(n_samples=300, centers=3, n_features=2, random_state=42)

# Panel 1: original data, drawn in a single colour because no labels are used
ax = axes[0, 0]
ax.scatter(X[:, 0], X[:, 1], c='#616161', alpha=0.6, s=30)
ax.set_title('Raw Data (No Labels)', fontweight='bold')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.grid(True, alpha=0.3)

# Panel 2: K-means clustering with the fitted centroids overlaid
ax = axes[0, 1]
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(X)
ax.scatter(X[:, 0], X[:, 1], c=clusters, cmap='viridis', alpha=0.6, s=30)
ax.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
          c='#C62828', marker='*', s=300, edgecolor='black', linewidth=2)
ax.set_title('K-Means Clustering', fontweight='bold')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.grid(True, alpha=0.3)

# High-dimensional data for PCA: 50 noise dimensions ...
np.random.seed(42)
X_high = np.random.randn(200, 50)
# ... with blob structure written into the first two. Keep the labels that
# belong to THIS 200-sample draw: the 300-sample draw above is shuffled
# differently, so slicing `true_labels[:200]` would colour points with
# labels of unrelated samples.
blobs_high, labels_high = make_blobs(n_samples=200, centers=3, n_features=2, random_state=42)
X_high[:, :2] = blobs_high

# Panel 3: PCA projection, coloured by the matching blob labels
ax = axes[1, 0]
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_high)
ax.scatter(X_pca[:, 0], X_pca[:, 1], c=labels_high, cmap='coolwarm', alpha=0.6, s=30)
ax.set_title('PCA Projection (50D → 2D)', fontweight='bold')
ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} var)')
ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} var)')
ax.grid(True, alpha=0.3)

# Panel 4: kernel density estimate of the 2-D data on a 100x100 grid
ax = axes[1, 1]
from scipy.stats import gaussian_kde
kde = gaussian_kde(X.T)  # gaussian_kde expects shape (n_dims, n_points)
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                     np.linspace(y_min, y_max, 100))
positions = np.vstack([xx.ravel(), yy.ravel()])
density = kde(positions).reshape(xx.shape)
ax.contourf(xx, yy, density, levels=20, cmap='viridis', alpha=0.7)
ax.scatter(X[:, 0], X[:, 1], c='white', s=10, alpha=0.5, edgecolor='black', linewidth=0.5)
ax.set_title('Density Estimation', fontweight='bold')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.grid(True, alpha=0.3)

plt.suptitle('Unsupervised Learning Methods', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

Clustering Reveals Hidden Patterns

Finding Structure in Data

Self-Supervised Learning: Labels from Data Itself

Creating Supervision from Data

Transform unsupervised → supervised by creating pretext tasks

Key Innovations

Why It Works

#| echo: true
#| code-fold: true

# Example: Simple masked prediction
def create_masked_task(sequence, mask_prob=0.15, *, mask_token=0, rng=None):
    """Create a masked-prediction (self-supervised) task from a sequence.

    Each position is masked independently with probability ``mask_prob``.

    Args:
        sequence: Array-like of tokens (NumPy array, list, tuple). It is
            copied, never modified in place.
        mask_prob: Per-position probability of masking.
        mask_token: Value written at masked positions (the "[MASK]" token).
        rng: Optional ``numpy.random.Generator``. When omitted, the global
            ``np.random`` state is used, so ``np.random.seed`` still
            controls the result.

    Returns:
        ``(masked, labels)``: ``masked`` is the sequence with masked
        positions replaced by ``mask_token``; ``labels`` holds the original
        token at masked positions and ``-1`` everywhere else.
    """
    # np.array() copies, and also lets plain lists/tuples use boolean indexing.
    tokens = np.array(sequence)
    masked = tokens.copy()
    labels = np.full_like(tokens, -1)  # -1 marks "not a prediction target"

    n_positions = len(tokens)
    draws = np.random.random(n_positions) if rng is None else rng.random(n_positions)
    mask_indices = draws < mask_prob

    masked[mask_indices] = mask_token
    labels[mask_indices] = tokens[mask_indices]

    return masked, labels

# Example sequence
sequence = np.array([1, 4, 2, 8, 3, 7, 5, 9])
masked_input, targets = create_masked_task(sequence)

print(f"Original: {sequence}")
print(f"Masked:   {masked_input}")
print(f"Targets:  {targets}")

Self-supervised learning powers modern foundation models like GPT and BERT.

Reinforcement Learning: Sequential Decision Making

Sequential Decision Making

Components:

Objective: Maximize expected cumulative reward

\[ J(\pi) = \mathbb{E}_{\pi}\left[\sum_{t=0}^{\infty} \gamma^t r_t\right] \]

Applications

#| echo: false
"""
GridWorld environment showing an agent navigating toward a goal while avoiding obstacles, with arrows indicating the learned policy directions.
"""

#| fig-align: center

# Simple GridWorld visualization
from collections import deque

fig, ax = plt.subplots(figsize=(6, 6))

# Create grid
grid_size = 5
for i in range(grid_size + 1):
    ax.axhline(i, color='black', linewidth=1)
    ax.axvline(i, color='black', linewidth=1)

# Agent position
agent_pos = (1, 1)
circle = patches.Circle((agent_pos[0] + 0.5, agent_pos[1] + 0.5),
                        0.3, color='#1976D2', alpha=0.8)
ax.add_patch(circle)
ax.text(agent_pos[0] + 0.5, agent_pos[1] + 0.5, 'A',
        ha='center', va='center', fontsize=14, fontweight='bold', color='white')

# Goal
goal_pos = (3, 3)
rect = patches.FancyBboxPatch((goal_pos[0], goal_pos[1]), 1, 1,
                              boxstyle="round,pad=0.02",
                              linewidth=2, edgecolor='#FFD700', facecolor='#FFF59D', alpha=0.7)
ax.add_patch(rect)
ax.text(goal_pos[0] + 0.5, goal_pos[1] + 0.5, 'G',
        ha='center', va='center', fontsize=14, fontweight='bold')

# Obstacles
obstacles = [(2, 1), (1, 3), (3, 2)]
for obs in obstacles:
    rect = patches.Rectangle((obs[0], obs[1]), 1, 1,
                             linewidth=1, edgecolor='#C62828', facecolor='#EF5350', alpha=0.7)
    ax.add_patch(rect)
    ax.text(obs[0] + 0.5, obs[1] + 0.5, 'X',
           ha='center', va='center', fontsize=12, fontweight='bold', color='white')

# Greedy policy display. Each action glyph maps to its (dx, dy) move; the
# y axis is not inverted, so '↑' increases y.
actions = ['↑', '→', '↓', '←']
action_deltas = {'↑': (0, 1), '→': (1, 0), '↓': (0, -1), '←': (-1, 0)}
blocked = set(obstacles)

# Breadth-first search outward from the goal gives every reachable free
# cell its shortest step count to the goal while avoiding obstacles.
steps_to_goal = {goal_pos: 0}
frontier = deque([goal_pos])
while frontier:
    cx, cy = frontier.popleft()
    for dx, dy in action_deltas.values():
        nbr = (cx + dx, cy + dy)
        inside = 0 <= nbr[0] < grid_size and 0 <= nbr[1] < grid_size
        if inside and nbr not in blocked and nbr not in steps_to_goal:
            steps_to_goal[nbr] = steps_to_goal[(cx, cy)] + 1
            frontier.append(nbr)

for (i, j) in steps_to_goal:
    # The goal needs no action; the agent cell already shows the 'A' marker,
    # so an arrow there would be drawn on top of it.
    if (i, j) == goal_pos or (i, j) == agent_pos:
        continue
    # Best action = move to the neighbour closest to the goal. Off-grid and
    # obstacle neighbours are absent from steps_to_goal and count as infinite.
    best_action = min(
        actions,
        key=lambda a: steps_to_goal.get(
            (i + action_deltas[a][0], j + action_deltas[a][1]), float('inf')
        ),
    )
    ax.text(i + 0.5, j + 0.5, best_action,
           ha='center', va='center', fontsize=10, alpha=0.5)

ax.set_xlim(0, grid_size)
ax.set_ylim(0, grid_size)
ax.set_aspect('equal')
ax.set_title('GridWorld: RL Environment', fontsize=14, fontweight='bold')
ax.set_xlabel('X')
ax.set_ylabel('Y')

# Legend
legend_elements = [
    patches.Patch(color='#1976D2', label='Agent'),
    patches.Patch(color='#FFF59D', label='Goal'),
    patches.Patch(color='#EF5350', label='Obstacle')
]
ax.legend(handles=legend_elements, loc='upper left', fontsize=10)

plt.tight_layout()
plt.show()

Same Problem, Different Paradigms

#| echo: false
"""
Comparison of supervised, unsupervised, and self-supervised learning paradigms on the Iris dataset, showing decision boundaries, clustering results, feature prediction, and performance metrics.
"""

#| fig-align: center

# Example: Learning to classify/cluster iris flowers
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

iris = load_iris()
X = iris.data[:, [0, 2]]  # Use only 2 features for visualization
y = iris.target

# Standardize
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split below, so test-set statistics leak into the scaling. Harmless for a
# picture, but not a pattern to copy into an evaluation pipeline.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 2x3 grid: the top row fits real models on Iris; the bottom row draws
# hard-coded illustrative bar charts.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# Supervised: Full labels
ax = axes[0, 0]
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
from sklearn.svm import SVC
svm = SVC(kernel='rbf', random_state=42)
svm.fit(X_train, y_train)

# Decision boundary: predict on a dense grid (step h) covering the data
# range padded by 1 on each side, then draw the predicted class regions.
h = 0.02
x_min, x_max = X_scaled[:, 0].min() - 1, X_scaled[:, 0].max() + 1
y_min, y_max = X_scaled[:, 1].min() - 1, X_scaled[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = svm.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

ax.contourf(xx, yy, Z, alpha=0.3, cmap='viridis')
# Training points are circles, held-out test points are triangles.
scatter = ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap='viridis', edgecolor='black', s=50)
ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap='viridis', marker='^', edgecolor='black', s=70)
ax.set_title('Supervised: SVM Classification\n(All labels available)', fontweight='bold')
ax.set_xlabel('Sepal Length (scaled)')
ax.set_ylabel('Petal Length (scaled)')

# Unsupervised: No labels
ax = axes[0, 1]
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

ax.scatter(X_scaled[:, 0], X_scaled[:, 1], c=clusters, cmap='coolwarm', edgecolor='black', s=50)
ax.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
          c='#C62828', marker='*', s=300, edgecolor='black', linewidth=2)
ax.set_title('Unsupervised: K-Means Clustering\n(No labels)', fontweight='bold')
ax.set_xlabel('Sepal Length (scaled)')
ax.set_ylabel('Petal Length (scaled)')

# Self-supervised: Create pretext task
ax = axes[0, 2]
# Pretext task: predict one feature from another
# (column 0 of X_scaled is the input, column 1 is the regression target)
X_input = X_scaled[:, 0].reshape(-1, 1)
y_target = X_scaled[:, 1]
from sklearn.neural_network import MLPRegressor
mlp = MLPRegressor(hidden_layer_sizes=(10, 10), random_state=42, max_iter=1000)
mlp.fit(X_input, y_target)
# Predictions are made on the same samples the regressor was fit on.
y_pred = mlp.predict(X_input)

ax.scatter(X_scaled[:, 0], X_scaled[:, 1], c='#616161', alpha=0.3, s=30, label='True')
ax.scatter(X_scaled[:, 0], y_pred, c='#C62828', alpha=0.6, s=30, label='Predicted')
ax.set_title('Self-Supervised: Feature Prediction\n(Predict petal from sepal)', fontweight='bold')
ax.set_xlabel('Sepal Length (scaled)')
ax.set_ylabel('Petal Length (scaled)')
ax.legend()

# Performance comparison
# NOTE(review): these accuracies are hard-coded literals, not computed from
# the svm / kmeans models fitted above, and no semi-supervised model is
# trained in this cell. Treat the bars as illustrative only.
ax = axes[1, 0]
methods = ['Supervised\n(100% labels)', 'Semi-supervised\n(10% labels)', 'Unsupervised\n(0% labels)']
accuracies = [0.95, 0.75, 0.65]
colors = ['#2E7D32', '#F57C00', '#8B0000']
bars = ax.bar(methods, accuracies, color=colors, alpha=0.7)
ax.set_ylabel('Accuracy / ARI Score')
ax.set_title('Performance Comparison', fontweight='bold')
ax.set_ylim(0, 1)
ax.grid(True, alpha=0.3, axis='y')

# Data requirements
# NOTE(review): hard-coded order-of-magnitude values (log-scaled axis), not
# measurements on Iris.
ax = axes[1, 1]
paradigms = ['Supervised', 'Self-Sup.', 'Unsuper.', 'RL']
data_needs = [1000, 10000, 100, 100000]
colors = ['#2E7D32', '#1976D2', '#F57C00', '#8B0000']
bars = ax.bar(paradigms, data_needs, color=colors, alpha=0.7)
ax.set_ylabel('Typical Data Requirements')
ax.set_yscale('log')
ax.set_title('Data Efficiency', fontweight='bold')
ax.grid(True, alpha=0.3, axis='y')
ax.tick_params(axis='x', rotation=45)

# Computational cost
ax = axes[1, 2]
paradigms_short = ['Sup.', 'Self-Sup.', 'Unsup.', 'RL']
compute_cost = [1, 10, 0.5, 100]  # Relative costs
colors = ['#2E7D32', '#1976D2', '#F57C00', '#8B0000']
bars = ax.bar(paradigms_short, compute_cost, color=colors, alpha=0.7)
ax.set_ylabel('Relative Compute Cost')
ax.set_title('Computational Requirements', fontweight='bold')
ax.set_yscale('log')
ax.grid(True, alpha=0.3, axis='y')

plt.suptitle('Paradigm Comparison on Iris Dataset', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

Label efficiency on CIFAR-10 (target: 90% accuracy):

Transfer learning with self-supervised pretraining: 50× reduction in labeled data

Modern Methods Combine Paradigms

Semi-Supervised Learning

Multi-Task Learning

Meta-Learning

Transfer Learning Pipeline

#| echo: false
"""
Transfer learning pipeline showing progression from pre-training on a large dataset to fine-tuning a task-specific model on smaller target data.
"""

#| fig-align: center

fig, ax = plt.subplots(figsize=(10, 6))
ax.set_xlim(0, 10)
ax.set_ylim(0, 6)
ax.axis('off')

# Pipeline stages, in drawing order:
# (lower-left corner, edge colour, fill colour, caption, extra text kwargs)
box_width, box_height = 2, 1.5
stage_boxes = [
    ((0.5, 3), '#1976D2', '#E3F2FD', 'Large Dataset\n(ImageNet)', {}),
    ((3.5, 3), '#2E7D32', '#E8F5E9', 'Pre-trained\nModel', {'fontweight': 'bold'}),
    ((6.5, 3), '#F57C00', '#FFF3E0', 'Small Target\nDataset', {}),
    ((3.5, 0.5), '#8B0000', '#FFEBEE', 'Task-Specific\nModel', {'fontweight': 'bold'}),
]
for (left, bottom), edge, face, caption, text_kw in stage_boxes:
    ax.add_patch(patches.FancyBboxPatch((left, bottom), box_width, box_height,
                                        boxstyle="round,pad=0.05",
                                        edgecolor=edge, facecolor=face, linewidth=2))
    # Caption is centred inside its box.
    ax.text(left + box_width / 2, bottom + box_height / 2, caption,
            ha='center', va='center', fontsize=10, **text_kw)

# Connecting arrows: (tail, head, arrow-specific kwargs, label position, label).
# The last arrow curves from the target dataset down to the right edge of
# the task-specific model.
flow_arrows = [
    ((2.5, 3.75), (3.45, 3.75), {'color': 'black'}, (3, 4.15), 'Pre-train'),
    ((5.5, 3.75), (6.45, 3.75), {'color': 'black'}, (6, 4.15), 'Transfer'),
    ((7.5, 2.95), (5.5, 1.25),
     {'connectionstyle': "arc3,rad=-.3", 'color': '#8B0000'},
     (6.5, 2.0), 'Fine-tune'),
]
for tail, head, arrow_kw, label_xy, label in flow_arrows:
    ax.add_patch(patches.FancyArrowPatch(tail, head, mutation_scale=20,
                                         linewidth=2, arrowstyle='->', **arrow_kw))
    ax.text(label_xy[0], label_xy[1], label, fontsize=9, ha='center')

ax.set_title('Transfer Learning: Pre-trained Models Adapt to New Tasks', fontsize=12, fontweight='bold')
plt.tight_layout()
plt.show()

Modern approaches often combine paradigms for better performance.

Neural Architecture

---

Neural Networks Form a Rich Hypothesis Class

Multilayer Perceptron (MLP): Fully connected feedforward network

Architecture Components

Why "Rich" Hypothesis Class?

Defn: Deep Neural Network

A neural network with more than one hidden layer. Depth enables hierarchical feature learning: early layers learn simple features, deeper layers learn complex abstractions.

Single Neuron Computation

Forward Computation

At neuron \(i\) in layer \(l\):

\[ a_i^{(l)} = h\left(\left[\mathbf{w}_i^{(l)}\right]^\top \mathbf{a}^{(l-1)} + b_i^{(l)}\right) \]

where:

Matrix Form (Entire Layer)

\[ \mathbf{a}^{(l)} = h\left(\mathbf{W}^{(l)} \mathbf{a}^{(l-1)} + \mathbf{b}^{(l)}\right) \]

Universal Approximation: Existence Guarantee

Cybenko (1989), Hornik et al. (1989)

A feedforward network with:

can approximate any continuous function on a compact subset of \(\mathbb{R}^n\) to arbitrary accuracy.

Critical word: CAN

The theorem guarantees such networks exist. Finding them through training is different.

## Preview

Detailed treatment later: approximation theory, width vs depth, practical training implications.

#| echo: false
"""
Function approximation showing how networks with increasing hidden layer width (5, 10, 20 neurons) converge toward a target function, illustrating the universal approximation theorem.
"""

#| fig-align: center

# Target function sampled on [-2, 2].
x = np.linspace(-2, 2, 100)
y_true = np.sin(2*x) + 0.5*np.cos(4*x)

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(x, y_true, 'k-', linewidth=2, label='Target')

# Illustrative "approximations": the target plus Gaussian noise whose scale
# shrinks as the nominal width grows. No network is trained here.
# width -> (line colour, line opacity)
curve_styles = {5: ('blue', 0.3), 10: ('green', 0.5), 20: ('red', 0.7)}
for width, (color, alpha) in curve_styles.items():
    np.random.seed(42)  # same noise pattern for every width, only rescaled
    noise_scale = 0.1*(25-width)/20
    y_approx = y_true + np.random.normal(0, noise_scale, len(x))
    ax.plot(x, y_approx, color=color, alpha=alpha, linewidth=1.5,
            label=f'{width} neurons')

ax.set_xlabel('Input')
ax.set_ylabel('Output')
ax.set_title('Function Approximation', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

Width vs Depth: Why We Use Deep Networks

Width-only (single hidden layer)

Universal approximation guarantees this works, but:

Depth (multiple layers)

Why depth matters: Practical networks need efficient representations

#| echo: false
"""
Parameter efficiency comparison showing exponential growth for wide shallow networks versus polynomial growth for deep networks, with visual diagram contrasting the two architectures.
"""

#| fig-align: center

fig, axes = plt.subplots(2, 1, figsize=(8, 10))

# Top panel: parameter count as a function of problem size
ax = axes[0]
n_bits = np.arange(4, 13)
width_params = 2**(n_bits - 1) * n_bits  # exponential in the problem size
depth_params = n_bits**2 * 4             # polynomial in the problem size

for counts, fmt, name in ((width_params, 'r-o', 'Width-only'),
                          (depth_params, 'b-s', 'Deep')):
    ax.semilogy(n_bits, counts, fmt, linewidth=2, label=name, markersize=8)
ax.set_xlabel('Problem Size (bits)')
ax.set_ylabel('Parameters Required')
ax.set_title('Width vs Depth: Parameter Efficiency', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

# Bottom panel: schematic of the two architectures
ax = axes[1]
ax.set_xlim(0, 10)
ax.set_ylim(0, 8)
ax.axis('off')

# Wide shallow network: one horizontal row of eight neurons
y_pos = 6
for k in range(8):
    ax.add_patch(plt.Circle((1 + k * 0.3, y_pos), 0.12, color='red', alpha=0.6))
ax.text(1.9, y_pos + 0.8, 'Wide & Shallow\n(Exponential neurons)', ha='center', fontsize=10, fontweight='bold')

# Deep narrow network: one vertical column of six neurons
x_pos = 7
for k in range(6):
    ax.add_patch(plt.Circle((x_pos, 1.5 + k * 0.8), 0.15, color='blue', alpha=0.6))
ax.text(x_pos, 0.5, 'Deep & Narrow\n(Polynomial neurons)', ha='center', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.show()

Forward and Backward Pass

Implementation

#| echo: true
#| code-fold: false
#| code-line-numbers: true

class Layer:
    """Fully connected layer with a manual forward and backward pass.

    The instance is expected to already carry ``W``, ``b``, ``activation``
    and ``activation_derivative``; this class does not create them.
    """

    def forward(self, x):
        """Compute ``activation(x @ W + b)`` and cache what backward needs."""
        # Cache the input: the weight gradient needs it later.
        self.x = x
        # Affine pre-activation.
        pre_activation = np.dot(x, self.W) + self.b
        self.z = pre_activation
        # Element-wise non-linearity.
        self.a = self.activation(pre_activation)
        return self.a

    def backward(self, grad_output):
        """Backpropagate ``grad_output`` (dL/da); return dL/dx.

        Also stores the parameter gradients in ``grad_W`` and ``grad_b``.
        """
        # dL/dz = dL/da * h'(z)  (chain rule through the activation)
        local_slope = self.activation_derivative(self.z)
        grad_z = grad_output * local_slope

        # Gradients w.r.t. the parameters; the bias gradient sums over axis 0.
        self.grad_W = np.dot(self.x.T, grad_z)
        self.grad_b = np.sum(grad_z, axis=0)

        # Gradient handed back to the previous layer.
        return np.dot(grad_z, self.W.T)

Computational Graph

#| echo: false
"""
Computational graph showing forward pass operations (input → weighted sum → bias → activation → loss) and backward pass gradient flow with partial derivatives at each node.
"""

#| fig-align: center

import matplotlib.patches as patches
import matplotlib.pyplot as plt
import numpy as np

fig, ax = plt.subplots(figsize=(10, 7))

# Forward pass computation nodes: (x, y, label, fill colour, node type)
nodes_forward = [
    (1, 4, '$\\mathbf{x}$', '#E3F2FD', 'input'),
    (2.5, 4, '$\\mathbf{W}\\mathbf{x}$', '#FFF9C4', 'hidden'),
    (4, 4, '$+\\mathbf{b}$', '#FFF9C4', 'hidden'),
    (5.5, 4, '$\\sigma(\\cdot)$', '#FFE0B2', 'activation'),
    (7, 4, '$\\mathbf{a}$', '#FFEBEE', 'output'),
    (8.5, 4, '$\\mathcal{L}$', '#FFCDD2', 'loss')
]

# Draw forward pass nodes: values (input/output/loss) as circles,
# operations as rounded boxes.
for x, y, label, color, node_type in nodes_forward:
    if node_type in ['input', 'output', 'loss']:
        # Use facecolor/edgecolor, not color=: a patch's `color` sets both
        # face and edge and overrides an explicit edge colour, which would
        # silently drop the black outline.
        circle = plt.Circle((x, y), 0.35, facecolor=color, edgecolor='black', linewidth=2)
        ax.add_patch(circle)
    else:
        rect = patches.FancyBboxPatch((x-0.35, y-0.35), 0.7, 0.7,
                                      boxstyle="round,pad=0.05",
                                      facecolor=color, edgecolor='black', linewidth=2)
        ax.add_patch(rect)
    ax.text(x, y, label, ha='center', va='center', fontsize=11, fontweight='bold')

# Forward pass arrows - adjusted to not overlap with nodes
# Each entry is (x_start, y_start, dx, dy).
forward_connections = [
    (1.35, 4, 0.65, 0),   # x to Wx
    (2.85, 4, 0.65, 0),   # Wx to +b
    (4.35, 4, 0.65, 0),   # +b to σ
    (5.85, 4, 0.65, 0),   # σ to a
    (7.35, 4, 0.65, 0)    # a to L
]

for x, y, dx, dy in forward_connections:
    ax.arrow(x, y, dx, dy, head_width=0.12, head_length=0.08,
             fc='#1976D2', ec='#1976D2', linewidth=2.5)

# Backward pass gradient flow, drawn right-to-left below the forward row
backward_y = 2.5
nodes_backward = [
    (8.5, backward_y, '$\\frac{\\partial \\mathcal{L}}{\\partial \\mathcal{L}}=1$', '#FFCDD2'),
    (7, backward_y, '$\\frac{\\partial \\mathcal{L}}{\\partial \\mathbf{a}}$', '#FFE0B2'),
    (5.5, backward_y, '$\\frac{\\partial \\mathcal{L}}{\\partial \\mathbf{z}}$', '#FFE0B2'),
    (4, backward_y, '$\\frac{\\partial \\mathcal{L}}{\\partial \\mathbf{b}}$', '#E1F5FE'),
    (2.5, backward_y, '$\\frac{\\partial \\mathcal{L}}{\\partial \\mathbf{W}}$', '#E1F5FE'),
    (1, backward_y, '$\\frac{\\partial \\mathcal{L}}{\\partial \\mathbf{x}}$', '#E3F2FD')
]

# Draw backward pass nodes with larger text
for x, y, label, color in nodes_backward:
    rect = patches.FancyBboxPatch((x-0.55, y-0.3), 1.1, 0.6,
                                  boxstyle="round,pad=0.02",
                                  facecolor=color, edgecolor='#C62828',
                                  linewidth=1.5, linestyle='--', alpha=0.8)
    ax.add_patch(rect)
    ax.text(x, y, label, ha='center', va='center', fontsize=10, fontweight='bold')

# Backward pass arrows - adjusted to not overlap
backward_connections = [
    (8.05, backward_y, -0.5, 0),   # ∂L/∂L to ∂L/∂a
    (6.55, backward_y, -0.5, 0),   # ∂L/∂a to ∂L/∂z
    (5.05, backward_y, -0.5, 0),   # ∂L/∂z to ∂L/∂b
    (3.55, backward_y, -0.5, 0),   # ∂L/∂b to ∂L/∂W
    (2.05, backward_y, -0.5, 0)    # ∂L/∂W to ∂L/∂x
]

for x, y, dx, dy in backward_connections:
    ax.arrow(x, y, dx, dy, head_width=0.1, head_length=0.06,
             fc='#C62828', ec='#C62828', linewidth=2, linestyle='--', alpha=0.7)

# Vertical connections showing gradient computation
# (dotted links between each forward node and the gradient box below it)
for x_pos in [2.5, 4, 5.5, 7]:
    ax.plot([x_pos, x_pos], [3.65, 2.75], 'k:', linewidth=1, alpha=0.5)

# Labels
ax.text(4.75, 5, 'Forward Pass', fontsize=13, fontweight='bold', color='#1976D2')
ax.text(4.75, 1.5, 'Backward Pass (Gradients)', fontsize=13, fontweight='bold', color='#C62828')

# Annotations
ax.text(2.5, 3.3, 'Store $\\mathbf{x}$', ha='center', fontsize=8, style='italic', color='#666')
ax.text(5.5, 3.3, 'Chain rule', ha='center', fontsize=8, style='italic', color='#666')

ax.set_xlim(0, 9.5)
ax.set_ylim(1, 5.5)
ax.axis('off')

plt.tight_layout()
plt.show()

Network Capacity and Depth

#| echo: false
"""
Parameter scaling with network width and depth (left) alongside function complexity achievable at different depths (right).
"""

#| fig-align: center

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: parameter count (width * width * depth) for each depth
ax = axes[0]
widths = [10, 20, 50, 100, 200]
depths = [2, 3, 4, 5, 6]
colors = ['blue', 'green', 'orange', 'red', 'purple']

for d, color in zip(depths, colors):
    params = [d * w * w for w in widths]
    ax.plot(widths, params, 'o-', label=f'Depth {d}', color=color, linewidth=2)

ax.set_xlabel('Width (neurons per layer)')
ax.set_ylabel('Total Parameters')
ax.set_title('Parameter Count', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)
ax.set_yscale('log')

# Right panel: hand-picked curves of increasing complexity, one per nominal
# depth. These are illustrations, not outputs of trained networks.
ax = axes[1]
x = np.linspace(-2, 2, 100)
depth_curves = {
    'Linear': ('blue', x),
    'Shallow (1)': ('green', np.maximum(0, x)),
    'Medium (3)': ('orange', np.sin(2*x)),
    # The "deep" curve carries an extra high-frequency ripple.
    'Deep (5)': ('red', np.sin(2*x) + 0.5*np.cos(4*x) + 0.1*np.sin(10*x)),
}

for label, (color, y) in depth_curves.items():
    ax.plot(x, y, label=label, color=color, linewidth=2, alpha=0.7)

ax.set_xlabel('Input')
ax.set_ylabel('Output')
ax.set_title('Function Complexity vs Depth', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

The Optimization Landscape

---

Loss Surfaces in High Dimensions

#| echo: false
"""
Comparison of loss surface complexity across model types: convex bowl for linear models, wavy terrain with local minima for shallow networks, and highly complex landscape for deep networks.
"""

#| fig-align: center

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.patches as patches

fig = plt.figure(figsize=(15, 5))

# Shared (w1, w2) grid for all three surfaces
x = np.linspace(-3, 3, 50)
y = np.linspace(-3, 3, 50)
X, Y = np.meshgrid(x, y)

# Synthetic loss surfaces of increasing ruggedness
Z = 0.5 * (X**2 + 2*Y**2)                           # convex bowl
Z2 = np.sin(2*X) * np.cos(2*Y) + 0.1*(X**2 + Y**2)  # bowl with ripples
Z3 = np.sin(3*X) * np.cos(3*Y) * np.exp(-0.1*(X**2 + Y**2)) + 0.5*np.sin(5*X) + 0.3*np.cos(7*Y)

# (subplot position, surface heights, colormap, panel title)
surface_panels = [
    (131, Z, 'viridis', 'Convex (Linear Model)'),
    (132, Z2, 'coolwarm', 'Non-Convex (Shallow Network)'),
    (133, Z3, 'plasma', 'Complex (Deep Network)'),
]
for position, heights, cmap_name, panel_title in surface_panels:
    panel = fig.add_subplot(position, projection='3d')
    panel.plot_surface(X, Y, heights, cmap=cmap_name, alpha=0.8, edgecolor='none')
    panel.set_xlabel('$w_1$')
    panel.set_ylabel('$w_2$')
    panel.set_zlabel('Loss')
    panel.set_title(panel_title, fontweight='bold')

plt.suptitle('Loss Landscape Complexity', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

Mathematical Reality

In \(d\) dimensions with \(n\) parameters:

Empirical Observations

Stochastic Gradient Descent and Variants

#| echo: true
#| code-fold: true

def sgd(w, grad, lr=0.01):
    """Return the parameters after one plain gradient-descent step.

    The update is ``w - lr * grad``; ``w`` itself is not modified.
    """
    step = lr * grad
    return w - step

def sgd_momentum(w, grad, velocity, lr=0.01, beta=0.9):
    """Take one momentum step; return ``(new_w, new_velocity)``.

    The velocity is a decaying sum of past scaled gradients:
    ``new_velocity = beta * velocity + lr * grad``, and the parameters move
    by ``-new_velocity``.
    """
    new_velocity = beta * velocity + lr * grad
    new_w = w - new_velocity
    return new_w, new_velocity

def adam(w, grad, m, v, t, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    """Take one Adam step; return ``(new_w, new_m, new_v)``.

    Args:
        w: Current parameters.
        grad: Gradient of the loss at ``w``.
        m: Running first-moment (mean) estimate of the gradient.
        v: Running second-moment (uncentred variance) estimate.
        t: 1-based step counter, used for bias correction.
        lr: Step size.
        beta1: Decay rate of the first-moment estimate.
        beta2: Decay rate of the second-moment estimate.
        eps: Small constant added to the denominator for stability.

    Raises:
        ValueError: If ``t < 1``. At ``t = 0`` both bias-correction factors
            ``1 - beta**t`` are zero and the update would divide by zero.
    """
    if t < 1:
        raise ValueError(f"t must be a 1-based step count (got {t})")
    m = beta1 * m + (1 - beta1) * grad
    v = beta2 * v + (1 - beta2) * grad**2
    # Bias correction: m and v start at zero and are biased towards it
    # during the first steps.
    m_hat = m / (1 - beta1**t)
    v_hat = v / (1 - beta2**t)
    return w - lr * m_hat / (np.sqrt(v_hat) + eps), m, v
#| echo: false
"""
Comparison of SGD, Momentum, and Adam optimizer trajectories on the Rosenbrock function, showing how each algorithm navigates toward the global minimum.
"""

#| fig-align: center

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Generate optimization paths
np.random.seed(42)
n_steps = 1000

def rosenbrock(x, y):
    """Rosenbrock function; its global minimum is at (1, 1)."""
    return (1 - x)**2 + 100 * (y - x**2)**2

def clipped_rosenbrock_grad(w, limit=10):
    """Gradient of ``rosenbrock`` at ``w``, clipped element-wise.

    Clipping each component to ``[-limit, limit]`` keeps the steep valley
    walls from making the fixed-step optimizers overflow.
    """
    grad_x = -2*(1 - w[0]) - 400*w[0]*(w[1] - w[0]**2)
    grad_y = 200*(w[1] - w[0]**2)
    return np.clip(np.array([grad_x, grad_y]), -limit, limit)

def draw_trajectory(ax, path, fmt, label, title):
    """Draw the loss contours, one optimizer path and the optimum on ``ax``."""
    ax.contour(X_grid, Y_grid, Z_grid, levels=20, cmap='viridis', alpha=0.6)
    ax.plot(path[:, 0], path[:, 1], fmt, linewidth=2, markersize=3, label=label)
    ax.plot(1, 1, 'g*', markersize=15, label='Optimum')
    ax.set_title(title, fontweight='bold')
    ax.set_xlabel('$w_1$')
    ax.set_ylabel('$w_2$')
    ax.legend()

x_range = np.linspace(-1.5, 1.5, 100)
y_range = np.linspace(-0.5, 1.5, 100)
X_grid, Y_grid = np.meshgrid(x_range, y_range)
Z_grid = rosenbrock(X_grid, Y_grid)

# SGD path
w_sgd = np.array([-0.5, 0.5])
path_sgd = [w_sgd.copy()]
lr = 0.0004

for _ in range(n_steps):
    w_sgd = sgd(w_sgd, clipped_rosenbrock_grad(w_sgd), lr=lr)
    path_sgd.append(w_sgd.copy())

path_sgd = np.array(path_sgd)
draw_trajectory(axes[0], path_sgd, 'r.-', 'SGD', 'Vanilla SGD')

# Momentum path
w_mom = np.array([-0.5, 0.5])
velocity = np.zeros(2)
path_mom = [w_mom.copy()]

for _ in range(n_steps):
    w_mom, velocity = sgd_momentum(w_mom, clipped_rosenbrock_grad(w_mom),
                                   velocity, lr=0.0002, beta=0.9)
    path_mom.append(w_mom.copy())

path_mom = np.array(path_mom)
draw_trajectory(axes[1], path_mom, 'b.-', 'Momentum', 'SGD with Momentum')

# Adam path
w_adam = np.array([-0.5, 0.5])
m = np.zeros(2)
v = np.zeros(2)
path_adam = [w_adam.copy()]

# Adam's bias correction needs a 1-based step counter.
for t in range(1, n_steps + 1):
    w_adam, m, v = adam(w_adam, clipped_rosenbrock_grad(w_adam), m, v, t, lr=0.005)
    path_adam.append(w_adam.copy())

path_adam = np.array(path_adam)
draw_trajectory(axes[2], path_adam, 'm.-', 'Adam', 'Adam Optimizer')

plt.suptitle('Optimizer Comparison on Rosenbrock Function', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

Lottery Ticket Hypothesis: Sparse Subnetworks at Initialization

Frankle & Carbin (2019)

"Dense networks contain sparse subnetworks that can train to comparable accuracy from the same initialization"

Implications

Practical Impact

\[ \text{Parameters: } 100M \to 10M \]
\[ \text{Performance: } 95\% \to 94.5\% \]

Why this matters:

Storage:

Speed:

Training:

#| echo: false
"""
Comparison of random pruning versus lottery ticket pruning accuracy across sparsity levels, with side-by-side visualization of dense versus sparse winning ticket network structures.
"""

#| fig-align: center

fig, axes = plt.subplots(2, 1, figsize=(8, 10))

# Pruning vs accuracy (hard-coded illustrative curves)
ax = axes[0]
sparsity = np.array([0, 50, 80, 90, 95, 98, 99, 99.5])
accuracy = np.array([95, 94.8, 94.5, 94, 92, 85, 70, 50])
winning_ticket = np.array([95, 95, 94.9, 94.5, 93.5, 91, 82, 65])

ax.plot(sparsity, accuracy, 'b-o', linewidth=2, label='Random Pruning')
ax.plot(sparsity, winning_ticket, 'r-s', linewidth=2, label='Lottery Ticket')
ax.axhline(y=90, color='gray', linestyle='--', alpha=0.5)
ax.set_xlabel('Sparsity (%)')
ax.set_ylabel('Test Accuracy (%)')
ax.set_title('Pruning Performance', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

# Network visualization
ax = axes[1]
# The x-range extends to 11 so the right-most layer (x=10) is not clipped
# at the axes boundary.
ax.set_xlim(0, 11)
ax.set_ylim(0, 10)
ax.axis('off')

layer_sizes = [3, 4, 3]
neuron_radius = 0.15

def neuron_y(layer, idx):
    """Vertical position of neuron ``idx`` in ``layer``, centred on y=5."""
    return 5 + (idx - layer_sizes[layer]/2 + 0.5) * 1.5

# Dense network - Left side
for layer, x in enumerate([1, 3, 5]):
    for i in range(layer_sizes[layer]):
        y = neuron_y(layer, i)
        # facecolor/edgecolor instead of color=: a patch's `color` sets both
        # face and edge and would override the explicit edge colour.
        circle = patches.Circle((x, y), neuron_radius, facecolor='#90CAF9',
                                edgecolor='#1976D2', linewidth=1.5)
        ax.add_patch(circle)

        # Connect to every neuron of the next layer
        if layer < 2:
            for j in range(layer_sizes[layer + 1]):
                next_y = neuron_y(layer + 1, j)
                ax.plot([x+neuron_radius, x+2-neuron_radius], [y, next_y],
                        '#9E9E9E', alpha=0.3, linewidth=0.8)

ax.text(3, 1.5, 'Dense Network\n(All connections)', ha='center', fontweight='bold', fontsize=10)

# Winning Ticket - Right side
ticket_x = [6, 8, 10]
# Surviving connections as ((layer, neuron), (layer, neuron)) index pairs.
winning_edges = [
    ((0, 0), (1, 2)),  # bottom to middle-high
    ((0, 2), (1, 2)),  # top to middle-high
    ((0, 2), (1, 1)),  # top to middle-low
    ((1, 2), (2, 0)),  # middle-high to bottom
    ((1, 2), (2, 1)),  # middle-high to middle
    ((1, 1), (2, 1)),  # middle-low to middle
]
# Highlight exactly the neurons that a surviving connection touches, so the
# highlighted set cannot drift out of sync with the drawn paths.
winning_neurons = {endpoint for edge in winning_edges for endpoint in edge}

for layer, x in enumerate(ticket_x):
    for i in range(layer_sizes[layer]):
        y = neuron_y(layer, i)
        if (layer, i) in winning_neurons:
            circle = patches.Circle((x, y), neuron_radius, facecolor='#FFCDD2',
                                    edgecolor='#C62828', linewidth=2)
        else:
            circle = patches.Circle((x, y), neuron_radius, facecolor='#F5F5F5',
                                    edgecolor='#9E9E9E', linewidth=1)
        ax.add_patch(circle)

# Winning connections only, converted to plot coordinates
winning_paths = [
    ((ticket_x[l1], neuron_y(l1, i1)), (ticket_x[l2], neuron_y(l2, i2)))
    for (l1, i1), (l2, i2) in winning_edges
]

for (x1, y1), (x2, y2) in winning_paths:
    ax.plot([x1+neuron_radius, x2-neuron_radius], [y1, y2], '#C62828', alpha=0.7, linewidth=2)

ax.text(8, 1.5, 'Winning Ticket\n(Sparse subnetwork)', ha='center', fontweight='bold', fontsize=10)

ax.set_title('Network Structure Comparison', fontweight='bold', y=0.95)

plt.tight_layout()
plt.show()

Detailed treatment: Network pruning and efficient architectures

Modern Theoretical Insights

#| echo: false
"""
Six-panel visualization of modern deep learning theory: overparameterization benefits via double descent, loss landscape smoothing with network width, implicit regularization from SGD, Neural Tangent Kernel theory, mode connectivity between minima, and the grokking phenomenon.
"""

#| fig-align: center

# All curves in this figure are synthetic, closed-form illustrations.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# Overparameterization
ax = axes[0, 0]
params = np.logspace(3, 7, 50)
train_error = 100 / np.sqrt(params)
test_error_classical = train_error + 20 * np.sqrt(params / 1e6)
test_error_modern = train_error + 5 / np.sqrt(params / 1e5)

ax.loglog(params, train_error, 'b-', linewidth=2, label='Train Error')
ax.loglog(params, test_error_classical, 'r--', linewidth=2, label='Classical (U-shape)')
ax.loglog(params, test_error_modern, 'g-', linewidth=2, label='Modern (Double Descent)')
ax.set_xlabel('Number of Parameters')
ax.set_ylabel('Error')
ax.set_title('Overparameterization Benefits', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

# Loss landscape smoothness
ax = axes[0, 1]
widths = [64, 128, 256, 512, 1024]
roughness = [2.5, 1.8, 1.2, 0.8, 0.5]
ax.plot(widths, roughness, 'mo-', linewidth=2, markersize=8)
ax.set_xlabel('Network Width')
ax.set_ylabel('Loss Landscape Roughness')
ax.set_title('Width Smooths Landscape', fontweight='bold')
ax.grid(True, alpha=0.3)

# Implicit regularization
ax = axes[0, 2]
epochs_range = np.arange(0, 200)
train_loss = np.exp(-epochs_range / 20)
test_loss_no_reg = np.exp(-epochs_range / 25) + 0.1 * np.sqrt(epochs_range / 100)
test_loss_implicit = np.exp(-epochs_range / 25) + 0.02

ax.semilogy(epochs_range, train_loss, 'b-', linewidth=2, label='Train')
ax.semilogy(epochs_range, test_loss_no_reg, 'r--', linewidth=2, label='Test (No Reg)')
ax.semilogy(epochs_range, test_loss_implicit, 'g-', linewidth=2, label='Test (SGD)')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Implicit Regularization', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

# Neural Tangent Kernel (text-only panel)
ax = axes[1, 0]
ax.text(0.5, 0.85, 'Neural Tangent Kernel', fontsize=12, fontweight='bold', ha='center')
ax.text(0.5, 0.65, 'Wide networks ≈ Kernel methods', fontsize=11, ha='center')
ax.text(0.5, 0.45, r'$f(x, \theta_t) \approx f(x, \theta_0) + \nabla_\theta f|_{\theta_0} \cdot (\theta_t - \theta_0)$', 
        fontsize=11, ha='center')
ax.text(0.5, 0.25, 'Training dynamics become linear', fontsize=11, ha='center')
ax.text(0.5, 0.05, 'in infinite width limit', fontsize=11, ha='center', style='italic')
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.axis('off')

# Mode connectivity
ax = axes[1, 1]
theta = np.linspace(0, 1, 100)
# Both paths start and end at the two minima (loss 0.5 at theta = 0 and 1).
# The straight line crosses a loss barrier that peaks at 1.0 midway ...
loss_direct = 0.5 + 2 * theta * (1 - theta)
# ... while the curved path stays within 0.05 of the minimum loss throughout.
loss_curve = 0.5 + 0.05 * np.sin(3 * np.pi * theta)**2

ax.plot(theta, loss_direct, 'r--', linewidth=2, label='Direct Path')
ax.plot(theta, loss_curve, 'b-', linewidth=2, label='Curved Path')
ax.scatter([0, 1], [0.5, 0.5], c='green', s=100, zorder=5, label='Minima')
ax.set_xlabel('Interpolation Parameter α')
ax.set_ylabel('Loss')
ax.set_title('Mode Connectivity', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

# Grokking phenomenon
ax = axes[1, 2]
epochs_grok = np.arange(0, 1000)
train_acc = 100 * (1 - np.exp(-epochs_grok / 50))
test_acc = np.zeros_like(epochs_grok, dtype=float)
# Test accuracy sits at 50% for the first 500 epochs ...
test_acc[:500] = 50
# ... then climbs from 50% towards 100%. The 50-point offset keeps the curve
# continuous at epoch 500 instead of dropping to 0.
test_acc[500:] = 50 + 50 * (1 - np.exp(-(epochs_grok[500:] - 500) / 100))

ax.plot(epochs_grok, train_acc, 'b-', linewidth=2, label='Train')
ax.plot(epochs_grok, test_acc, 'r-', linewidth=2, label='Test')
ax.axvline(x=500, color='gray', linestyle='--', alpha=0.5)
ax.text(500, 30, 'Grokking', rotation=90, fontsize=10, ha='center')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy (%)')
ax.set_title('Grokking: Delayed Generalization', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

# plt.suptitle('Modern Theoretical Insights', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

Why Deep Networks Generalize

---

Neural Networks Memorize Random Labels Yet Generalize on Real Data

#| echo: false
"""
Three-panel comparison showing classical U-shaped overfitting theory versus modern double descent phenomenon, plus demonstration that networks achieve 100% training accuracy even with fully random labels.
"""

#| fig-align: center

import numpy as np
import matplotlib.pyplot as plt


def _label_panel(ax, xlabel, ylabel, title, grid_axis='both'):
    """Set axis labels, a bold title, the legend and a faint grid on one panel."""
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3, axis=grid_axis)


fig, axes = plt.subplots(1, 3, figsize=(15, 5))
ax_classic, ax_modern, ax_memorize = axes

# ---- Panel 1: textbook picture, test error turns upward with capacity ----
capacity = np.linspace(0, 10, 100)
train_error = 10 / (1 + capacity)
test_error = train_error + 0.5 * capacity
upper_half = slice(50, None)  # shade only the second half of the capacity range

ax = ax_classic
ax.plot(capacity, train_error, 'b-', linewidth=2, label='Training Error')
ax.plot(capacity, test_error, 'r-', linewidth=2, label='Test Error')
ax.fill_between(capacity[upper_half], train_error[upper_half], test_error[upper_half],
                alpha=0.3, color='gray')
ax.text(7.5, 5, 'Overfitting', fontsize=11, fontweight='bold')
_label_panel(ax, 'Model Capacity', 'Error', 'Classical View')

# ---- Panel 2: piecewise curves on either side of the interpolation threshold ----
np.random.seed(42)
n_params = np.logspace(2, 6, 50)
n_samples = 10000
interpolation_threshold = n_samples

below = n_params < interpolation_threshold
above = ~below

train_error = np.zeros_like(n_params)
test_error = np.zeros_like(n_params)
# Below the threshold: decaying train error, test error adds a 1/sqrt(p) gap.
train_error[below] = 10 * np.exp(-n_params[below] / 1000)
test_error[below] = train_error[below] + 100 / np.sqrt(n_params[below])
# At or above the threshold: train error stays zero, test error decays with log(p).
test_error[above] = 20 / np.log(n_params[above] / 1000)

ax = ax_modern
ax.semilogx(n_params, train_error, 'b-', linewidth=2, label='Train')
ax.semilogx(n_params, test_error, 'r-', linewidth=2, label='Test')
ax.axvline(x=interpolation_threshold, color='gray', linestyle='--', alpha=0.5)
ax.text(interpolation_threshold*1.5, 15, 'Interpolation\nThreshold', fontsize=10)
_label_panel(ax, 'Number of Parameters', 'Error', 'Modern Reality: Double Descent')

# ---- Panel 3: grouped bars, train vs test accuracy per label-noise level ----
noise_levels = [0, 0.2, 0.5, 0.8, 1.0]
train_acc = [100, 100, 100, 100, 100]
test_acc = [92, 88, 75, 60, 50]

width = 0.35
x_pos = np.arange(len(noise_levels))

ax = ax_memorize
bars1 = ax.bar(x_pos - width/2, train_acc, width, label='Train', color='#2E7D32', alpha=0.7)
bars2 = ax.bar(x_pos + width/2, test_acc, width, label='Test', color='#B71C1C', alpha=0.7)
ax.set_xticks(x_pos)
ax.set_xticklabels(['0%', '20%', '50%', '80%', '100%'])
_label_panel(ax, 'Label Noise Level', 'Accuracy (%)', 'Networks Memorize Everything',
             grid_axis='y')

plt.suptitle('Why Do Neural Networks Generalize At All?', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

Regularization Techniques: Dropout and Weight Decay

#| echo: true
#| code-fold: false

def dropout(x, p=0.5, training=True):
    """Apply inverted dropout to an array.

    Each element is zeroed with probability ``p``; the survivors are scaled
    by ``1 / (1 - p)``.

    Args:
        x: NumPy array of activations (only ``x.shape`` and elementwise
            multiplication are used).
        p: Probability of dropping each element, in ``[0, 1]``.
        training: When false, ``x`` is returned unchanged.

    Returns:
        ``x`` itself when ``training`` is false, otherwise a new array with
        the same shape as ``x``.

    Raises:
        ValueError: If ``p`` is outside ``[0, 1]``.
    """
    if not training:
        return x
    if not 0 <= p <= 1:
        raise ValueError(f"dropout probability must be in [0, 1], got {p}")
    if p == 1:
        # Everything is dropped. Handled separately because the 1 / (1 - p)
        # rescaling below would divide by zero and produce NaNs.
        return np.zeros_like(x, dtype=float)
    mask = np.random.binomial(1, 1-p, size=x.shape) / (1-p)
    return x * mask

def weight_decay(loss, weights, lambda_reg=0.01):
    """Return ``loss`` plus an L2 penalty on ``weights``.

    The penalty is ``lambda_reg`` times the sum of squared entries over every
    array in the ``weights`` iterable.
    """
    squared_norm = 0
    for w in weights:
        squared_norm = squared_norm + np.sum(w**2)
    penalised = loss + lambda_reg * squared_norm
    return penalised
#| echo: false
"""
Four-panel visualization of regularization techniques: dropout's random neuron deactivation, weight decay's effect on weight distributions, early stopping at optimal validation loss, and comparison of test errors across combined regularization methods.
"""

#| fig-align: center

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Dropout visualization
ax = axes[0, 0]
np.random.seed(42)
neurons = 10
layers = 4
dropout_layer = 2  # the only layer in which units are deactivated
y_positions = np.linspace(0, 5, neurons)

# Sample the dropout mask once, up front, so that the greyed-out neurons and
# the missing connections always refer to the same units.
dropped = np.zeros((layers, neurons), dtype=bool)
dropped[dropout_layer] = np.random.random(neurons) < 0.3

for layer in range(layers):
    x_pos = layer * 2

    for i, y_pos in enumerate(y_positions):
        is_dropped = dropped[layer, i]
        # facecolor (not color): `color` would override edgecolor and the
        # black outline would never be drawn.
        circle = plt.Circle((x_pos, y_pos), 0.2,
                            facecolor='lightgray' if is_dropped else 'lightblue',
                            edgecolor='black',
                            alpha=0.3 if is_dropped else 1.0)
        ax.add_patch(circle)

        if layer == 0:
            continue

        prev_x = (layer - 1) * 2
        for j, prev_y in enumerate(y_positions):
            # A deactivated unit neither receives nor sends a signal, so
            # skip every connection that touches one.
            if is_dropped or dropped[layer - 1, j]:
                continue
            ax.plot([prev_x + 0.2, x_pos - 0.2], [prev_y, y_pos], 
                   'gray', alpha=0.2, linewidth=0.5)

ax.set_xlim(-1, 7)
ax.set_ylim(-1, 6)
ax.axis('off')
ax.set_title('Dropout: Random Deactivation', fontweight='bold')
ax.text(3, -0.5, 'Forces redundant representations', ha='center', fontsize=10, style='italic')

# Weight decay effect (two synthetic Gaussian samples with different spread)
ax = axes[0, 1]
weights_no_decay = np.random.randn(100) * 2
weights_with_decay = np.random.randn(100) * 0.5

ax.hist(weights_no_decay, bins=30, alpha=0.5, label='No Decay', color='red')
ax.hist(weights_with_decay, bins=30, alpha=0.5, label='With L2', color='blue')
ax.set_xlabel('Weight Value')
ax.set_ylabel('Count')
ax.set_title('Weight Decay: Prefer Small Weights', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

# Early stopping
ax = axes[1, 0]
epochs = np.arange(0, 200)
train_loss = 0.5 * np.exp(-epochs / 30) + 0.01
# Validation loss follows the same decay, plus a quadratic penalty ...
val_loss = 0.5 * np.exp(-epochs / 30) + 0.02 + 0.0001 * (epochs - 50)**2
# ... which is removed again for the first 50 epochs.
val_loss[:50] = 0.5 * np.exp(-epochs[:50] / 30) + 0.02

best_epoch = np.argmin(val_loss)

ax.plot(epochs, train_loss, 'b-', linewidth=2, label='Training')
ax.plot(epochs, val_loss, 'r-', linewidth=2, label='Validation')
ax.axvline(x=best_epoch, color='green', linestyle='--', linewidth=2, alpha=0.7)
ax.scatter([best_epoch], [val_loss[best_epoch]], color='green', s=100, zorder=5)
ax.text(best_epoch + 5, 0.15, f'Stop here\n(epoch {best_epoch})', fontsize=10)
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Early Stopping: Quit While Ahead', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

# Regularization comparison (hard-coded test errors)
ax = axes[1, 1]
methods = ['None', 'L2', 'Dropout', 'L2+Dropout', 'All']
test_errors = [15, 12, 11, 9, 8]
colors = plt.cm.RdYlGn_r(np.linspace(0.3, 0.7, len(methods)))

bars = ax.bar(methods, test_errors, color=colors)
ax.set_ylabel('Test Error (%)')
ax.set_title('Combining Regularization Methods', fontweight='bold')
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

Architecture Embeds Domain Knowledge

#| echo: false
"""
Comparison of fully connected, convolutional, and recurrent architectures showing how each embeds different inductive biases: global connections, local receptive fields, and sequential hidden state propagation.
"""

#| fig-align: center


def _blank_canvas(ax):
    """Give a panel a fixed 10x10 coordinate frame with no visible axes."""
    ax.set_xlim(0, 10)
    ax.set_ylim(0, 10)
    ax.axis('off')


def _draw_input_grid(ax, size):
    """Draw a size x size grid of light-blue unit cells, spaced 1.5 apart."""
    for i, j in np.ndindex(size, size):
        cell = plt.Rectangle((i*1.5 + 1, j*1.5 + 1), 1, 1, 
                             facecolor='lightblue', edgecolor='black')
        ax.add_patch(cell)


fig, axes = plt.subplots(1, 3, figsize=(15, 5))
grid_size = 5

# ---- Fully connected: every cell feeds the single output unit ----
ax = axes[0]
_blank_canvas(ax)
_draw_input_grid(ax, grid_size)

output_x, output_y = 9, 5
ax.add_patch(plt.Circle((output_x, output_y), 0.3, color='red', alpha=0.8))

for i, j in np.ndindex(grid_size, grid_size):
    # Line from the centre of cell (i, j) to the left edge of the output unit
    ax.plot([i*1.5 + 1.5, output_x - 0.3], [j*1.5 + 1.5, output_y], 
           'gray', alpha=0.2, linewidth=0.5)

ax.set_title('Fully Connected\nNo spatial assumption', fontweight='bold')

# ---- Convolutional: only a 3x3 window feeds the output unit ----
ax = axes[1]
_blank_canvas(ax)
_draw_input_grid(ax, grid_size)

kernel_i, kernel_j = 1, 1  # grid index of the window's lower-left cell
window = [(kernel_i + di, kernel_j + dj) for di in range(3) for dj in range(3)]

for col, row in window:
    highlight = plt.Rectangle((col*1.5 + 1, row*1.5 + 1), 
                              1, 1, facecolor='yellow', edgecolor='red', linewidth=2)
    ax.add_patch(highlight)

ax.add_patch(plt.Circle((9, 5), 0.3, color='red', alpha=0.8))

for col, row in window:
    ax.plot([col*1.5 + 1.5, 8.7], 
           [row*1.5 + 1.5, 5], 
           'red', alpha=0.5, linewidth=1)

ax.set_title('Convolutional\nLocal patterns matter', fontweight='bold')

# ---- Recurrent: inputs x_t below, hidden states h_t chained above ----
ax = axes[2]
_blank_canvas(ax)

time_steps = 5
centers = [t*1.5 + 1.5 for t in range(time_steps)]

# Input boxes first, so everything added later is drawn on top of them
for t, cx in enumerate(centers):
    ax.add_patch(plt.Rectangle((cx - 0.5, 4), 1, 2, 
                               facecolor='lightgreen', edgecolor='black'))
    ax.text(cx, 5, f'$x_{t}$', ha='center', va='center')

for t, cx in enumerate(centers):
    ax.add_patch(plt.Circle((cx, 7.5), 0.3, color='orange', alpha=0.8))
    ax.text(cx, 7.5, f'$h_{t}$', ha='center', va='center', fontsize=9)

    has_successor = t < time_steps - 1
    if has_successor:
        # Hidden state passed on to the next time step
        ax.arrow(t*1.5 + 1.8, 7.5, 0.9, 0, 
                head_width=0.15, head_length=0.1, fc='orange', ec='orange')

    # Input feeding the hidden state of the same time step
    ax.arrow(cx, 6, 0, 0.9, 
            head_width=0.15, head_length=0.1, fc='gray', ec='gray')

ax.set_title('Recurrent\nSequential dependencies', fontweight='bold')

plt.suptitle('Architectural Biases Shape Learning', fontsize=16, fontweight='bold', y=1.05)
plt.tight_layout()
plt.show()

Train/Validation/Test: Sacred Separation

#| echo: false
"""
Train/validation/test data split visualization with shuffled grid showing 70/15/15 proportions, alongside bar chart demonstrating model selection using validation scores while keeping test data hidden.
"""

#| fig-align: center

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Train/Val/Test split
ax = axes[0]
np.random.seed(42)

n_total = 1000
train_end = int(0.7 * n_total)
val_end = int(0.85 * n_total)

# One colour per sample: green = train, orange = validation, blue = test
colors = ['#2E7D32'] * train_end + ['#F57C00'] * (val_end - train_end) + ['#1976D2'] * (n_total - val_end)
np.random.shuffle(colors)

# Only the first grid_size**2 shuffled samples fit the square grid
# (31 x 31 = 961 of the 1000); the remainder is not drawn.
grid_size = int(np.sqrt(n_total))
colors_grid = np.array(colors[:grid_size**2]).reshape(grid_size, grid_size)

for i in range(grid_size):
    for j in range(grid_size):
        rect = plt.Rectangle((j, i), 1, 1, facecolor=colors_grid[i, j], 
                            edgecolor='white', linewidth=0.5)
        ax.add_patch(rect)

ax.set_xlim(0, grid_size)
ax.set_ylim(0, grid_size)
ax.set_aspect('equal')
ax.axis('off')

# Proxy rectangles: used only to build the legend entries, never drawn
legend_elements = [
    plt.Rectangle((0, 0), 1, 1, facecolor='#2E7D32', label='Train (70%)'),
    plt.Rectangle((0, 0), 1, 1, facecolor='#F57C00', label='Validation (15%)'),
    plt.Rectangle((0, 0), 1, 1, facecolor='#1976D2', label='Test (15%)')
]
ax.legend(handles=legend_elements, loc='upper right', fontsize=10)
ax.set_title('Data Split Visualization', fontweight='bold', fontsize=14)

# Model selection process
ax = axes[1]

models = ['Model A', 'Model B', 'Model C', 'Model D']
val_scores = [85, 88, 87, 84]
test_scores = [83, 86, 85, 82]

x = np.arange(len(models))
width = 0.35

bars1 = ax.bar(x - width/2, val_scores, width, label='Validation', color='#F57C00', alpha=0.7)
bars2 = ax.bar(x + width/2, test_scores, width, label='Test (Hidden)', color='#1976D2', alpha=0.7)

# Frame the model with the best validation score; test scores play no part
best_idx = np.argmax(val_scores)
ax.add_patch(plt.Rectangle((best_idx - 0.5, 75), 1.0, 15, 
                          fill=False, edgecolor='red', linewidth=2.5))
ax.text(best_idx, 91, 'Selected', ha='center', fontsize=10, fontweight='bold', color='red')

ax.set_ylabel('Accuracy (%)')
ax.set_xticks(x)
ax.set_xticklabels(models)
ax.set_title('Model Selection via Validation', fontweight='bold', fontsize=14)
ax.legend()
ax.grid(True, alpha=0.3, axis='y')
# The upper limit leaves room for the 'Selected' label at y=91 and the top
# edge of the selection frame at y=90; a limit of 90 would push the label
# outside the axes.
ax.set_ylim(75, 94)

plt.tight_layout()
plt.show()
Never touch test data until final evaluation - validation data guides all decisions

Generalization Remains Partially Unexplained

What We Don't Understand

Why does SGD find generalizing solutions?
Networks can memorize random labels perfectly,
yet SGD finds patterns when labels are real

Why does overparameterization help?
10x more parameters than samples should overfit,
but often improves test accuracy

What is the role of depth?
Shallow wide networks have the same capacity,
but deep networks generalize better

How do transformers generalize?
No convolutions, no recurrence,
yet state-of-the-art on vision and language

#| echo: false
"""
Horizontal bar chart comparing the relative explanatory power of different generalization theories, illustrating that no single framework fully explains deep learning's generalization success.
"""

#| fig-align: center

# Theory label -> hard-coded score on a 0-100 scale
explanatory_power = {
    'Classical\nLearning\nTheory': 30,
    'Rademacher\nComplexity': 40,
    'PAC-Bayes': 45,
    'Margin\nTheory': 50,
    'Compression': 35,
    'Implicit\nBias': 60,
}
theories = list(explanatory_power)
success = list(explanatory_power.values())

fig, ax = plt.subplots(figsize=(8, 6))

# Bar colour follows the score, mapped onto the coolwarm colormap
colors = plt.cm.coolwarm(np.array(success) / 100)
bars = ax.barh(theories, success, color=colors)

# Dashed reference line at a score of 100
ax.axvline(x=100, color='gray', linestyle='--', alpha=0.5)
ax.text(100, -0.5, 'Complete\nUnderstanding', ha='center', fontsize=9)

ax.set_title('Theoretical Understanding Gap\n(Illustrative)', fontweight='bold')
ax.set_xlabel('Relative Explanatory Power')
ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

Note: No single theory fully explains deep learning generalization. Active research area.

Zillow Home Pricing: When the World Changes

Setup (2018-2021):

Deployment (2021):

What happened:

Training data came from a stable market. Deployment happened during a rapid market shift. The model kept predicting pre-COVID prices.

#| echo: false
"""
Zillow case study showing distribution shift: top panel plots home prices where model predictions (trained on 2018-2020) diverge from actual 2021 prices after COVID market shift; bottom panel shows accuracy dropping from 94-96% during validation to 68% at deployment.
"""

#| fig-align: center

fig, axes = plt.subplots(2, 1, figsize=(9, 10))

# Distribution shift (synthetic price series, in $K)
ax = axes[0]
months = np.arange(36)
# Linear trend; the model's forecast simply extends this line.
pre_covid = 300.0 + 5 * months
# Float array on purpose: np.zeros_like(months) would be an integer array
# and silently truncate the fractional values assigned below.
post_covid_shift = np.zeros(months.shape, dtype=float)
post_covid_shift[24:] = 25 * (months[24:] - 24)**1.1

# Actual prices follow the trend, plus the extra shift from month 24 onward
actual_prices = pre_covid + post_covid_shift

ax.plot(months[:24], pre_covid[:24], 'b-', linewidth=3, label='Training data (2018-2020)')
ax.plot(months[24:], actual_prices[24:], 'r-', linewidth=3, label='Actual prices (2021)')
ax.plot(months[24:], pre_covid[24:], 'b--', linewidth=2, alpha=0.6, label='Model predictions')

ax.fill_between(months[24:], pre_covid[24:], actual_prices[24:],
                alpha=0.3, color='red', label='Prediction error')

ax.set_xlabel('Months', fontsize=12)
ax.set_ylabel('Home Price ($K)', fontsize=12)
ax.set_title('Market Shifted, Model Did Not', fontweight='bold', fontsize=14)
ax.legend(fontsize=10)
ax.grid(True, alpha=0.3)
ax.axvline(x=24, color='black', linestyle='--', linewidth=2)
ax.text(12, 550, 'Trained\nhere', ha='center', fontsize=11, fontweight='bold')
ax.text(30, 550, 'Deployed\nhere', ha='center', fontsize=11, fontweight='bold')

# Performance comparison (hard-coded accuracies)
ax = axes[1]
scenarios = ['Train', 'Validation', 'Test\n(2020)', 'Deploy\n(2021)']
accuracy = [96, 95, 94, 68]
colors = ['green', 'green', 'green', 'red']

bars = ax.bar(scenarios, accuracy, color=colors, alpha=0.6)
ax.set_ylabel('Accuracy (%)', fontsize=12)
ax.set_title('Perfect Validation, Failed Deployment', fontweight='bold', fontsize=14)
ax.set_ylim(0, 100)
ax.grid(True, alpha=0.3, axis='y')

# Value label just above each bar
for i, acc in enumerate(accuracy):
    ax.text(i, acc + 2, f'{acc}%', ha='center', fontweight='bold', fontsize=12)

ax.axhline(y=90, color='gray', linestyle='--', alpha=0.5)
ax.text(3.5, 92, 'Target', fontsize=10)

plt.tight_layout()
plt.show()

The problem:

All your validation tools assume the future looks like the past. When the world changes, models trained on historical data fail.

Train/val/test all from 2018-2020: Model learns pre-COVID patterns
Deploy in 2021: COVID changed everything
Result: Model is wrong, but doesn't know it's wrong

This is not a rare edge case - markets shift, user behavior changes, new products emerge. Distribution shift is common.

Building Systems Despite Incomplete Theory

What theory doesn't fully explain

Classical theory predicts

Modern practice shows

How we build systems anyway

Empirical validation:

Defensive engineering:

Course approach:

Many fundamental questions remain open research problems.

Modern Deep Architectures

---

Convolutional Networks: Exploiting Spatial Structure

#| echo: false
"""
CNN fundamentals showing convolution sliding window operation, multiple feature maps, max pooling spatial reduction, and full architecture progression from input through convolutional and pooling layers to classification.
"""

#| fig-align: center

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches

fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# Convolution operation
ax = axes[0, 0]
ax.set_xlim(0, 10)
ax.set_ylim(0, 10)
ax.axis('off')

# Input image: 5x5 grid of random grey levels
for i in range(5):
    for j in range(5):
        val = np.random.random()
        rect = patches.Rectangle((i*1.2 + 1, j*1.2 + 1), 1, 1, 
                                facecolor=plt.cm.gray(val), edgecolor='black')
        ax.add_patch(rect)

# Kernel: 3x3 grid, colours cycle along the diagonals
kernel_colors = ['red', 'blue', 'green']
for ki in range(3):
    for kj in range(3):
        rect = patches.Rectangle((7 + ki*0.8, 3 + kj*0.8), 0.7, 0.7,
                                facecolor=kernel_colors[(ki+kj)%3], alpha=0.5, 
                                edgecolor='black', linewidth=2)
        ax.add_patch(rect)

# Highlight convolution region
rect = patches.Rectangle((1.2 + 1, 1.2 + 1), 3.6, 3.6, 
                        fill=False, edgecolor='red', linewidth=3)
ax.add_patch(rect)

ax.arrow(5, 5, 1.5, 0, head_width=0.3, head_length=0.2, fc='black', ec='black')
ax.text(2.5, 0.5, 'Input', ha='center', fontweight='bold')
ax.text(8, 2, 'Filter', ha='center', fontweight='bold')

ax.set_title('Convolution: Sliding Window', fontweight='bold')

# Feature maps: three 3x3 grids filled with random values
ax = axes[0, 1]
n_filters = 3
for f in range(n_filters):
    for i in range(3):
        for j in range(3):
            val = np.random.random()
            rect = patches.Rectangle((f*3.5 + i*0.9 + 1, j*0.9 + 3), 0.8, 0.8,
                                    facecolor=plt.cm.viridis(val), edgecolor='black')
            ax.add_patch(rect)
    ax.text(f*3.5 + 2.5, 2, f'Feature {f+1}', ha='center', fontsize=10)

ax.set_xlim(0, 12)
ax.set_ylim(0, 8)
ax.axis('off')
ax.set_title('Multiple Feature Maps', fontweight='bold')

# Pooling
ax = axes[0, 2]
# Before pooling
for i in range(4):
    for j in range(4):
        val = np.random.random()
        rect = patches.Rectangle((i*0.8 + 1, j*0.8 + 4), 0.7, 0.7,
                                facecolor=plt.cm.coolwarm(val), edgecolor='black')
        ax.add_patch(rect)

# After pooling: values drawn from [0.8, 1.0), the top of the colormap
for i in range(2):
    for j in range(2):
        val = 0.8 + 0.2*np.random.random()
        rect = patches.Rectangle((i*1.2 + 6, j*1.2 + 4.4), 1, 1,
                                facecolor=plt.cm.coolwarm(val), edgecolor='black', linewidth=2)
        ax.add_patch(rect)

ax.arrow(4.5, 5.5, 1.2, 0, head_width=0.2, head_length=0.15, fc='black', ec='black')
ax.text(2.5, 3, '4×4', ha='center', fontsize=10)
ax.text(7, 3, '2×2', ha='center', fontsize=10)
ax.text(5, 6.5, 'Max Pool', ha='center', fontsize=10)

ax.set_xlim(0, 10)
ax.set_ylim(0, 10)
ax.axis('off')
ax.set_title('Pooling: Spatial Reduction', fontweight='bold')

# Full CNN architecture
ax = axes[1, 0]
ax.set_xlim(0, 12)
ax.set_ylim(0, 8)
ax.axis('off')

# (label, centre x, centre y, box width, box height, colour)
layers = [
    ('Input\n32×32×3', 1, 4, 2, 2, 'lightblue'),
    ('Conv\n28×28×32', 3, 3.5, 1.8, 1.8, 'lightgreen'),
    ('Pool\n14×14×32', 5, 3.5, 1.5, 1.5, 'lightyellow'),
    ('Conv\n10×10×64', 7, 3.5, 1.2, 1.2, 'lightcoral'),
    ('Pool\n5×5×64', 9, 3.5, 0.8, 0.8, 'lightgray'),
    ('FC\n10', 11, 3.5, 0.5, 1.5, 'pink')
]

for i, (label, x, y, w, h, color) in enumerate(layers):
    rect = patches.Rectangle((x-w/2, y-h/2), w, h,
                            facecolor=color, edgecolor='black', linewidth=2)
    ax.add_patch(rect)
    ax.text(x, y-h/2-0.5, label, ha='center', fontsize=9)
    
    # Arrow from this box's right edge towards the next box's centre
    if i < len(layers) - 1:
        ax.arrow(x+w/2, y, layers[i+1][1]-x-w/2-0.2, 0,
                head_width=0.15, head_length=0.1, fc='gray', ec='gray')

ax.set_title('Typical CNN Architecture', fontweight='bold')

# Translation invariance demo
ax = axes[1, 1]
base_pattern = np.zeros((7, 7))
base_pattern[2:5, 1:3] = 1

# Original
ax.imshow(base_pattern, cmap='gray', extent=[0, 3, 0, 3])
ax.set_xlim(0, 10)
ax.set_ylim(0, 5)

# Shifted: the same 3x2 block moved three columns to the right
shifted = np.zeros((7, 7))
shifted[2:5, 4:6] = 1
ax.imshow(shifted, cmap='gray', extent=[4, 7, 0, 3])

ax.text(1.5, 4, 'Original', ha='center', fontweight='bold')
ax.text(5.5, 4, 'Shifted', ha='center', fontweight='bold')
ax.text(8.5, 1.5, 'Same\nResponse', ha='center', fontsize=11, color='red', fontweight='bold')

ax.set_title('Translation Invariance', fontweight='bold')
ax.axis('off')

# Receptive field growth
ax = axes[1, 2]
# (label, kernel size, stride) for each stage; the pooling layer is taken to
# have stride 2, the convolutions stride 1.
rf_stages = [
    ('Input', 1, 1),
    ('Conv3×3', 3, 1),
    ('Conv3×3', 3, 1),
    ('Pool2×2', 2, 2),
    ('Conv3×3', 3, 1),
]
layers_rf = [label for label, _, _ in rf_stages]

# Standard recurrence: each layer widens the field by (kernel - 1) input
# pixels per unit of accumulated stride ("jump"). Result: 1, 3, 5, 6, 10.
rf_sizes = []
field, jump = 1, 1
for _, kernel, stride in rf_stages:
    field += (kernel - 1) * jump
    jump *= stride
    rf_sizes.append(field)

ax.plot(range(len(layers_rf)), rf_sizes, 'go-', linewidth=2, markersize=8)
ax.set_xticks(range(len(layers_rf)))
ax.set_xticklabels(layers_rf, rotation=45, ha='right')
ax.set_ylabel('Receptive Field Size')
ax.set_title('Hierarchical View Growth', fontweight='bold')
ax.grid(True, alpha=0.3)

plt.suptitle('CNN: Built for Images', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

Preview: Transformers, Graph Networks, and Diffusion Models

#| echo: false
"""
Three-panel visualization of modern architectures: Transformer attention mechanism with token-to-token weights, Graph Neural Network with node connectivity and message passing, and Diffusion model showing iterative denoising from noise to clean image.
"""

#| fig-align: center

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# ---- Panel 1: transformer attention ----
ax = axes[0]
ax.set_xlim(0, 10)
ax.set_ylim(0, 10)
ax.axis('off')

# Token boxes stacked top to bottom along the left edge
tokens = ['The', 'cat', 'sat', 'on', 'mat']
for row, token in enumerate(tokens):
    box_bottom = 8 - row*1.5
    ax.add_patch(patches.Rectangle((0.5, box_bottom), 1.5, 0.8,
                                   facecolor='lightblue', edgecolor='black'))
    ax.text(1.25, 8.4-row*1.5, token, ha='center', va='center', fontsize=10)

# Random 5x5 matrix, each row normalised to sum to 1
attention = np.random.random((5, 5))
attention = attention / attention.sum(axis=1, keepdims=True)
ax.imshow(attention, extent=[4, 8, 2, 6], cmap='Reds', vmin=0, vmax=0.5)
ax.text(6, 6.5, 'Attention Weights', ha='center', fontweight='bold', fontsize=9)

# Draw a line from the third token for every weight in its row above 0.3
strong_columns = np.flatnonzero(attention[2] > 0.3)
for j in strong_columns:
    ax.plot([2, 4], [8.4-2*1.5, 6-j*0.8], 'r-', alpha=0.5, linewidth=2)

ax.set_title('Transformers: Attention', fontweight='bold', fontsize=11)
ax.text(5, 0.8, 'Each token attends to\nrelevant context', ha='center', fontsize=9, style='italic')

# ---- Panel 2: graph neural network ----
ax = axes[1]
ax.set_xlim(0, 10)
ax.set_ylim(0, 10)
ax.axis('off')

# Node coordinates and the index pairs joined by an edge
nodes = [(2, 7), (5, 8), (8, 7), (3, 4), (7, 4), (5, 2)]
edges = [(0, 1), (1, 2), (0, 3), (1, 3), (1, 4), (2, 4), (3, 4), (4, 5)]

for src, dst in edges:
    (x0, y0), (x1, y1) = nodes[src], nodes[dst]
    ax.plot([x0, x1], [y0, y1],
           'gray', linewidth=2, alpha=0.5)

for idx, (x, y) in enumerate(nodes):
    node_patch = patches.Circle((x, y), 0.5, facecolor=plt.cm.tab10(idx),
                                edgecolor='black', linewidth=2)
    ax.add_patch(node_patch)
    ax.text(x, y, str(idx), ha='center', va='center', fontweight='bold', color='white')

ax.set_title('Graph Networks: Structured Data', fontweight='bold', fontsize=11)
ax.text(5, 0.5, 'Message passing\nbetween connected nodes', ha='center', fontsize=9, style='italic')

# ---- Panel 3: diffusion process ----
ax = axes[2]
steps = 5
for i in range(steps):
    # One fresh 8x8 Gaussian sample per step
    noise = np.random.randn(8, 8)
    if i == 0:
        img = noise
    else:
        # Constant 0.3 image plus noise whose weight falls linearly to 0
        img = 0.3 * np.ones((8, 8)) + (1-i/(steps-1)) * noise

    left = i*2
    ax.imshow(img, extent=[left, left+1.5, 3, 4.5], cmap='gray', vmin=-2, vmax=2)

    if i < steps - 1:
        ax.arrow(left+1.6, 3.75, 0.3, 0, head_width=0.1, head_length=0.05, fc='red', ec='red')

ax.text(0.75, 2.5, 'Noise', ha='center', fontsize=9)
ax.text(8.75, 2.5, 'Clean', ha='center', fontsize=9)
ax.set_xlim(-0.5, 10)
ax.set_ylim(1.5, 5.5)
ax.axis('off')
ax.set_title('Diffusion: Iterative Denoising', fontweight='bold', fontsize=11)
ax.text(5, 1.8, 'Learn to reverse\nnoise addition', ha='center', fontsize=9, style='italic')

plt.tight_layout()
plt.show()

Building a Simple CNN

#| echo: true
#| code-fold: false

import numpy as np

class Conv2D:
    """Naive 2-D convolution layer: stride 1, no padding, explicit loops.

    Filters are stored with shape
    ``(out_channels, in_channels, kernel_size, kernel_size)`` and are
    initialised from a standard normal scaled by 0.1; biases start at zero.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3):
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        
        # Initialize filters
        self.filters = np.random.randn(
            out_channels, in_channels, kernel_size, kernel_size
        ) * 0.1
        self.bias = np.zeros(out_channels)
    
    def forward(self, x):
        """Convolve a batch of inputs with every filter.

        Args:
            x: Array of shape ``(batch, in_channels, height, width)``.

        Returns:
            Array of shape ``(batch, out_channels, height - kernel_size + 1,
            width - kernel_size + 1)``.

        Raises:
            ValueError: If the channel dimension of ``x`` differs from
                ``in_channels``.
        """
        batch, in_c, height, width = x.shape
        # Without this check a mismatch involving a single channel would be
        # silently broadcast in `patch * self.filters[oc]` below.
        if in_c != self.in_channels:
            raise ValueError(
                f"expected input with {self.in_channels} channels, got {in_c}"
            )
        out_h = height - self.kernel_size + 1
        out_w = width - self.kernel_size + 1
        
        output = np.zeros((batch, self.out_channels, out_h, out_w))
        
        # Convolution operation
        for b in range(batch):
            for oc in range(self.out_channels):
                for h in range(out_h):
                    for w in range(out_w):
                        # Extract patch across all input channels
                        patch = x[b, :, h:h+self.kernel_size, w:w+self.kernel_size]
                        # Elementwise product with the filter, summed, plus bias
                        output[b, oc, h, w] = np.sum(patch * self.filters[oc]) + self.bias[oc]
        
        return output

class MaxPool2D:
    """Max pooling over non-overlapping square windows of side ``pool_size``."""

    def __init__(self, pool_size=2):
        self.pool_size = pool_size
    
    def forward(self, x):
        """Pool ``x`` of shape (batch, channels, height, width).

        Trailing rows and columns that do not fill a whole window are
        ignored. The result is a float64 array of shape
        (batch, channels, height // pool_size, width // pool_size).
        """
        size = self.pool_size
        batch, channels, height, width = x.shape
        rows = height // size
        cols = width // size

        # Keep only the part tiled by complete windows, then split each
        # spatial axis into (window index, offset inside the window).
        tiled = x[:, :, :rows * size, :cols * size]
        windows = tiled.reshape(batch, channels, rows, size, cols, size)

        # Reduce over the two within-window axes
        pooled = windows.max(axis=(3, 5))
        return pooled.astype(np.float64)

# Example usage: one conv -> ReLU -> max-pool pass over a random image
x = np.random.randn(1, 3, 32, 32)  # Batch=1, RGB, 32x32
conv, pool = Conv2D(3, 16, kernel_size=3), MaxPool2D(pool_size=2)

x = conv.forward(x)
print(f"After conv: {x.shape}")  # (1, 16, 30, 30)

x = pool.forward(np.maximum(0, x))  # ReLU, then pooling
print(f"After pool: {x.shape}")  # (1, 16, 15, 15)

Architecture Efficiency on ImageNet

ResNet-50 (2015):

MobileNetV2 (2018):

EfficientNet-B0 (2019):

Architecture design matters: EfficientNet achieves better accuracy than ResNet-50 with far fewer parameters.

The Practice of Deep Learning

---

A Recipe for Training Neural Networks

The Training Loop

\[ \theta_{t+1} = \theta_t - \eta \cdot \frac{1}{|B|} \sum_{i \in B} \nabla_\theta \mathcal{L}(f_\theta(x_i), y_i) \]

Karpathy's Principles

1. Become one with the data: Look at your data. Plot it. Understand its distribution, outliers, patterns.

2. Set up an end-to-end pipeline: Get a simple model training before adding complexity.

3. Overfit a single batch: If you can't overfit 10 examples, something is broken.

4. Verify loss at initialization: Check that the loss matches its expected value (e.g., \(\log(n_{classes})\) for classification).

5. Add complexity gradually: Start simple, add one thing at a time.

#| echo: true
#| code-fold: false

# The debugging progression
def debug_training():
    """Run the overfitting sanity checks before training on the full dataset.

    Stage 1 repeatedly trains on the first example, stage 2 on the first ten
    examples. Each stage asserts that the last reported loss fell below its
    threshold; only then is the full dataset worth the compute.

    Returns:
        The string "Ready for full training" when both checks pass.

    Raises:
        AssertionError: if either stage fails to drive the loss low enough.
    """
    # (number of examples, training steps, loss threshold, failure message)
    stages = (
        (1, 100, 0.01, "Can't overfit single"),
        (10, 500, 0.1, "Can't overfit batch"),
    )

    for n_examples, n_steps, max_loss, failure_msg in stages:
        inputs, targets = X[:n_examples], y[:n_examples]
        for _ in range(n_steps):
            loss = train_step(inputs, targets)
        assert loss < max_loss, failure_msg

    # Only now move to the full dataset
    return "Ready for full training"

Debugging Deep Learning

#| echo: false
"""
Common deep learning debugging patterns showing stuck loss, exploding gradients, overfitting, vanishing gradients, and learning rate finder diagnostics.
"""

#| fig-align: center

import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np

# 2x3 grid of synthetic diagnostic panels; the bottom-right cell stays empty.
fig, axes = plt.subplots(2, 3, figsize=(15, 9))

# Loss not decreasing: flat curve at 2.3 (about ln 10) plus small noise
ax = axes[0, 0]
epochs = np.arange(50)
stuck_loss = np.ones(50) * 2.3 + np.random.normal(0, 0.01, 50)
ax.plot(epochs, stuck_loss, 'r-', linewidth=2)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Loss Not Decreasing', fontweight='bold')
ax.grid(True, alpha=0.3)

# Exploding gradients: exponential growth, then NaN from epoch 15 onwards
ax = axes[0, 1]
epochs = np.arange(30)
exploding = np.exp(epochs / 10)
exploding[15:] = np.nan
ax.plot(epochs[:15], exploding[:15], 'r-', linewidth=2)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Exploding Gradients', fontweight='bold')
ax.set_yscale('log')
ax.grid(True, alpha=0.3)
# Place the hint in axes-fraction coordinates. A data-space anchor above the
# plotted range (the curve only reaches ~4) is dropped by annotate's default
# clipping, so the hint would never be drawn.
ax.text(0.5, 0.92, 'Fix: Gradient clipping, lower LR, better init',
        transform=ax.transAxes, fontsize=8, ha='center', va='top', style='italic')

# Overfitting: train loss keeps falling while validation loss drifts upward
ax = axes[0, 2]
epochs = np.arange(100)
train_loss = 0.5 * np.exp(-epochs / 20) + 0.01
val_loss = 0.5 * np.exp(-epochs / 20) + 0.02 + 0.001 * epochs
ax.plot(epochs, train_loss, 'b-', linewidth=2, label='Train')
ax.plot(epochs, val_loss, 'r-', linewidth=2, label='Validation')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Overfitting Pattern', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)
ax.text(70, 0.2, 'Solutions:\n• Regularization\n• More data\n• Early stopping', 
        fontsize=9, ha='center', bbox=dict(boxstyle="round,pad=0.3", facecolor='lightblue', alpha=0.5))

# Gradient flow visualization (backprop flows Output -> Input)
ax = axes[1, 0]
layers = ['Input', 'Layer 1', 'Layer 2', 'Layer 3', 'Layer 4', 'Output']
gradient_magnitudes = [0.001, 0.01, 0.1, 0.4, 0.8, 1.0]

bars = ax.bar(layers, gradient_magnitudes, color='red', alpha=0.7)
ax.set_ylabel('Gradient Magnitude')
ax.set_yscale('log')
ax.set_title('Vanishing Gradient Diagnosis', fontweight='bold')
ax.axhline(y=0.1, color='green', linestyle='--', alpha=0.5)
ax.text(2.5, 0.15, 'Healthy threshold', fontsize=9)
ax.grid(True, alpha=0.3, axis='y')

# Learning rate finder
ax = axes[1, 1]
lrs = np.logspace(-5, 0, 50)
# Synthetic LR-range-test curve: loss stays high while the LR is too small,
# falls as the LR grows, then blows up (quadratic term) once it is too large.
# The previous form (2 - 1.5*exp(...)) rose monotonically, so the "best" LR
# was always the smallest one on the grid.
losses = 0.5 + 1.5 * np.exp(-100 * lrs) + 10 * lrs**2
best_idx = np.argmin(losses)

ax.semilogx(lrs, losses, 'b-', linewidth=2)
ax.scatter(lrs[best_idx], losses[best_idx], color='red', s=100, zorder=5)
ax.set_xlabel('Learning Rate')
ax.set_ylabel('Loss')
ax.set_title('Learning Rate Finder', fontweight='bold')
ax.grid(True, alpha=0.3)
ax.text(lrs[best_idx], losses[best_idx] + 0.2, f'Best: {lrs[best_idx]:.1e}',
        ha='center', fontsize=9)

axes[1, 2].axis('off')  # Leave bottom-right empty
plt.tight_layout()
plt.show()

Medical Imaging: High Accuracy Hides Dataset Problems

Setup:

Deployment at Hospital B:

What went wrong:

Hospital A used one X-ray machine model with specific image characteristics. Hospital B used different equipment. Model learned machine artifacts, not disease patterns.

Example artifacts learned:

#| echo: false
"""
Performance comparison showing accuracy drop when deploying to new hospitals, alongside feature importance revealing model learned equipment-specific shortcuts rather than disease patterns.
"""

#| fig-align: center

# Two stacked panels: accuracy per hospital (top) and intended vs. actual
# feature importance (bottom). All numbers are hard-coded illustrative values.
fig, axes = plt.subplots(2, 1, figsize=(9, 10))

# Performance by hospital
ax = axes[0]
hospitals = ['Hospital A\n(Training)', 'Hospital A\n(Test)', 'Hospital B\n(Deploy)', 'Hospital C\n(Deploy)']
accuracy = [95, 94, 72, 68]
# Green for the Hospital A splits, red for the deployment sites
colors = ['green', 'green', 'red', 'red']

bars = ax.bar(hospitals, accuracy, color=colors, alpha=0.6)
ax.set_ylabel('Accuracy (%)', fontsize=12)
ax.set_title('Model Learned Hospital-Specific Patterns, Not Disease', fontweight='bold', fontsize=14)
ax.set_ylim(0, 100)
ax.grid(True, alpha=0.3, axis='y')

# Print each accuracy value just above its bar
for i, acc in enumerate(accuracy):
    ax.text(i, acc + 2, f'{acc}%', ha='center', fontweight='bold', fontsize=12)

# Dashed reference line at 90%, labelled as the clinical target
ax.axhline(y=90, color='gray', linestyle='--', alpha=0.5)
ax.text(3.5, 92, 'Clinical\nTarget', fontsize=10)

# What model actually learned
ax = axes[1]

# Create side-by-side comparison
features_learned = ['Disease\nPatterns', 'X-ray Machine\nModel', 'Image\nBrightness', 'Positioning\nMarkers']
importance_should_be = [90, 2, 3, 5]
importance_actual = [40, 30, 20, 10]

x = np.arange(len(features_learned))
width = 0.35

# Offset the two series by half a bar width so they sit next to each other
bars1 = ax.bar(x - width/2, importance_should_be, width, label='Should learn', color='green', alpha=0.7)
bars2 = ax.bar(x + width/2, importance_actual, width, label='Actually learned', color='red', alpha=0.7)

ax.set_ylabel('Feature Importance (%)', fontsize=12)
ax.set_title('Model Learned Shortcuts, Not Medicine', fontweight='bold', fontsize=14)
ax.set_xticks(x)
ax.set_xticklabels(features_learned, fontsize=10)
ax.set_ylim(0, 100)
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

Shortcut learning: Standard debugging looked fine:

Problem only appeared on different hospital equipment. Models exploit spurious correlations (disease + specific machine) as shortcuts instead of learning actual medical patterns.

Computational Realities

#| echo: false
"""
Hardware comparison across devices, batch size memory/throughput tradeoffs, neural scaling laws with model landmarks, and distributed training efficiency showing ideal vs actual GPU speedup.
"""

#| fig-align: center

# 2x2 grid of illustrative (hard-coded) numbers: hardware, batch size,
# scaling curve, and multi-GPU speedup.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Hardware comparison
ax = axes[0, 0]
devices = ['CPU\n(8 cores)', 'GPU\n(RTX 3090)', 'TPU v3\n(Google)', 'A100\n(NVIDIA)']
relative_speed = [1, 25, 80, 100]
memory = [32, 24, 128, 80]  # GB

x = np.arange(len(devices))
width = 0.35

# Speed and memory share one axis; bars are offset by half a bar width
ax.bar(x - width/2, relative_speed, width, label='Relative Speed', color='#1976D2', alpha=0.7)
ax.bar(x + width/2, memory, width, label='Memory (GB)', color='#F57C00', alpha=0.7)

ax.set_xticks(x)
ax.set_xticklabels(devices)
ax.set_ylabel('Value')
ax.set_title('Hardware Capabilities', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

# Batch size vs memory
ax = axes[0, 1]
batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128, 256]
memory_usage = [0.5, 1, 2, 4, 8, 16, 32, 64, 128]  # Simplified linear relationship
throughput = [10, 19, 36, 68, 120, 200, 280, 320, 330]

# Memory on the left axis, throughput on a twin right axis
ax.plot(batch_sizes, memory_usage, 'r-o', linewidth=2, label='Memory (GB)')
ax2 = ax.twinx()
ax2.plot(batch_sizes, throughput, 'b-s', linewidth=2, label='Throughput (img/s)')

ax.set_xlabel('Batch Size')
ax.set_ylabel('Memory Usage (GB)', color='r')
ax2.set_ylabel('Throughput (img/s)', color='b')
ax.set_title('Batch Size Trade-offs', fontweight='bold')
ax.set_xscale('log')
ax.grid(True, alpha=0.3)
# 24 GB matches the RTX 3090 entry in the hardware panel
ax.axhline(y=24, color='red', linestyle='--', alpha=0.5)
ax.text(4, 26, 'GPU Memory Limit', fontsize=9, color='red')

# Scaling laws
ax = axes[1, 0]
model_sizes = np.logspace(6, 11, 50)  # 1M to 100B parameters
compute_flops = model_sizes * 6  # Approximate FLOPs
performance = 100 - 50 * np.exp(-model_sizes / 1e9)

ax.loglog(compute_flops, performance, 'g-', linewidth=2)
ax.set_xlabel('Compute (FLOPs)')
ax.set_ylabel('Performance (%)')
ax.set_title('Scaling Laws', fontweight='bold')
ax.grid(True, alpha=0.3)

# Add model landmarks (parameter count * 6, same conversion as compute_flops).
# BERT is placed at 110M parameters; the earlier 6e6 understated it by more
# than an order of magnitude and disagreed with the ~0.1B used elsewhere here.
landmarks = [(110e6*6, 'BERT'), (175e9*6, 'GPT-3'), (540e9*6, 'PaLM')]
for flops, name in landmarks:
    if flops < 1e18:  # Only show if in range
        # Invert the FLOPs conversion and evaluate the same synthetic curve
        perf = 100 - 50 * np.exp(-flops/6/1e9)
        ax.scatter(flops, perf, s=100, zorder=5)
        ax.text(flops, perf-3, name, fontsize=8, ha='center')

# Distributed training
ax = axes[1, 1]
n_gpus = [1, 2, 4, 8, 16, 32]
ideal_speedup = n_gpus  # perfect linear scaling: speedup equals GPU count
actual_speedup = [1, 1.9, 3.6, 6.8, 12, 20]
actual_speedup_poor = [1, 1.5, 2.2, 3.0, 3.8, 4.5]

ax.plot(n_gpus, ideal_speedup, 'g--', linewidth=2, label='Ideal')
ax.plot(n_gpus, actual_speedup, 'b-o', linewidth=2, label='Data Parallel')
ax.plot(n_gpus, actual_speedup_poor, 'r-s', linewidth=2, label='Poor Implementation')

ax.set_xlabel('Number of GPUs')
ax.set_ylabel('Speedup')
ax.set_title('Distributed Training Efficiency', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

What this means for this course:

CPU (your laptop):

GPU (Colab/Kaggle free tier):

Multi-GPU (cloud):

Course approach: CPU is viable for most work. GPU accelerates but isn't required.

Systematic Experimentation

#| echo: true
#| code-fold: true
#| code-summary: "Experiment tracking example"

# Tracking experiments: one record holding every setting needed to rerun
experiment_config = dict(
    model='resnet18',
    dataset='cifar10',
    batch_size=128,
    lr=0.1,
    epochs=100,
    seed=42,
    timestamp='2025-01-15-14:30',
)


# Always set seeds for reproducibility
def set_all_seeds(seed=42):
    """Seed NumPy's global random generator with ``seed``.

    The commented lines show the matching calls for PyTorch and the standard
    ``random`` module; they are not executed here.
    """
    # torch.manual_seed(seed)
    # torch.cuda.manual_seed_all(seed)
    # random.seed(seed)
    np.random.seed(seed)
    
# Log everything
def log_metrics(epoch, train_loss, val_loss, val_acc, lr=None):
    """Bundle one epoch's metrics into a dict ready for logging.

    Args:
        epoch: Epoch index the metrics belong to.
        train_loss: Training loss for the epoch.
        val_loss: Validation loss for the epoch.
        val_acc: Validation accuracy for the epoch.
        lr: Learning rate to record. When omitted, the value is read from
            ``get_current_lr()``, which must then be defined by the caller's
            environment.

    Returns:
        A dict with keys 'epoch', 'train_loss', 'val_loss', 'val_acc', 'lr'
        and 'timestamp' (seconds since the epoch, from ``time.time()``).
    """
    # Local import so the function works even when no earlier chunk has
    # imported `time`; it is not among the file's top-level imports.
    import time

    metrics = {
        'epoch': epoch,
        'train_loss': train_loss,
        'val_loss': val_loss,
        'val_acc': val_acc,
        'lr': get_current_lr() if lr is None else lr,
        'timestamp': time.time()
    }
    # Write to file, tensorboard, wandb, etc.
    return metrics
#| echo: false
"""
Ablation study showing incremental accuracy gains from each regularization technique, alongside a hyperparameter grid search heatmap for learning rate and batch size combinations.
"""

#| fig-align: center

# Left: ablation bar chart. Right: learning-rate x batch-size accuracy heatmap.
# All numbers are hard-coded illustrative values.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Ablation study
ax = axes[0]
components = ['Baseline', '+DataAug', '+Dropout', '+WeightDecay', '+LRSchedule', 'Full Model']
performance = [82, 84, 85.5, 86.5, 87.8, 89.2]
# One colour per bar, sampled from the 0.4-0.9 range of the RdYlGn colormap
colors = plt.cm.RdYlGn(np.linspace(0.4, 0.9, len(components)))

bars = ax.barh(components, performance, color=colors)
ax.set_xlabel('Test Accuracy (%)')
ax.set_title('Ablation Study: Component Contributions', fontweight='bold')
ax.grid(True, alpha=0.3, axis='x')

# Label every bar except the first with its gain over the previous row
for i, (comp, perf) in enumerate(zip(components, performance)):
    if i > 0:
        improvement = performance[i] - performance[i-1]
        ax.text(perf + 0.2, i, f'+{improvement:.1f}%', fontsize=9, va='center')

# Hyperparameter grid
ax = axes[1]
learning_rates = [0.001, 0.01, 0.1, 1.0]
batch_sizes = [32, 64, 128, 256]
# results[i, j]: accuracy for learning_rates[i] (row) and batch_sizes[j] (column)
results = np.array([
    [75, 78, 76, 72],
    [80, 85, 83, 78],
    [82, 88, 86, 80],
    [70, 75, 73, 68]
])

# Fixed colour limits (65-90) rather than limits derived from the data
im = ax.imshow(results, cmap='RdYlGn', vmin=65, vmax=90)
ax.set_xticks(range(len(batch_sizes)))
ax.set_yticks(range(len(learning_rates)))
ax.set_xticklabels(batch_sizes)
ax.set_yticklabels([f'{lr:.3f}' for lr in learning_rates])
ax.set_xlabel('Batch Size')
ax.set_ylabel('Learning Rate')
ax.set_title('Hyperparameter Search Results', fontweight='bold')

# Add text annotations
for i in range(len(learning_rates)):
    for j in range(len(batch_sizes)):
        # imshow puts column j on the x-axis and row i on the y-axis
        ax.text(j, i, f'{results[i, j]:.0f}', ha='center', va='center', color='white', fontweight='bold')

plt.colorbar(im, ax=ax, label='Accuracy (%)')

plt.tight_layout()
plt.show()

Always Establish Baselines Before Claiming Improvement

#| echo: false
"""
Bar chart comparing model accuracy across increasing complexity levels, from random guessing (10%) through linear and neural baselines to state-of-the-art (94%), with horizontal reference lines marking baseline performance thresholds.
"""

#| fig-align: center

# Bar chart of accuracy per method, ordered from simplest to most complex.
fig, ax = plt.subplots(figsize=(12, 8))

# (label, accuracy in %, bar colour)
methods = [
    ('Random\nGuessing', 10, 'gray'),
    ('Linear\nModel', 65, '#B71C1C'),
    ('2-Layer\nMLP', 78, '#F57C00'),
    ('Simple\nCNN', 85, '#1976D2'),
    ('Your\nModel', 87, '#2E7D32'),
    ('SOTA\n2025', 94, 'gold')
]

# Number of complexity dots per method; defined once instead of on every
# loop iteration.
complexity = [1, 2, 3, 4, 5, 6]

positions = []
for i, ((name, score, color), n_dots) in enumerate(zip(methods, complexity)):
    x = i * 2 + 1  # bars are centred on odd x positions: 1, 3, 5, ...
    positions.append(x)

    # Bar
    bar = ax.bar(x, score, color=color, alpha=0.7, width=1.5)

    # Annotations: value above the bar, method name below the axis
    ax.text(x, score + 1, f'{score}%', ha='center', fontweight='bold')
    ax.text(x, -3, name, ha='center', fontsize=10)

    # Complexity indicator: a row of dots just under the top of the bar.
    # Scatter markers are sized in points, so they stay round. Circle patches
    # with a 0.08 data-unit radius collapsed to one-pixel slivers because the
    # y-axis spans 0-100 while the x-axis spans only 13 units.
    dot_x = x - 0.6 + 0.25 * np.arange(n_dots)
    dot_y = np.full(n_dots, score - 5)
    ax.scatter(dot_x, dot_y, s=25, color='black', alpha=0.5, zorder=3)

# Add baseline lines
ax.axhline(y=10, color='gray', linestyle='--', alpha=0.5, linewidth=1)
ax.text(11, 11, 'Random', fontsize=9, color='gray')

ax.axhline(y=65, color='red', linestyle='--', alpha=0.5, linewidth=1)
ax.text(11, 66, 'Linear baseline', fontsize=9, color='red')

ax.set_ylim(0, 100)
ax.set_xlim(-1, 12)
ax.set_ylabel('Accuracy (%)', fontsize=12)
ax.set_title('Always Compare Against Baselines', fontsize=16, fontweight='bold')
ax.grid(True, alpha=0.3, axis='y')
ax.set_xticks([])  # method names are drawn as text, so hide the numeric ticks

plt.tight_layout()
plt.show()

Current Frontiers & Course Roadmap

---

Course Architecture: Statistical to Neural

#| echo: false
"""
Course architecture diagram showing statistical foundations (detection, estimation, regression, classification) and neural methods (deep learning, CNNs, RNNs, autoencoders) connected through a central optimization and learning bridge.
"""

#| fig-align: center

import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np

# Box-and-arrow diagram drawn on a blank 10x10 canvas: a left column of
# statistical topics, a right column of neural topics, and a bridge box
# between them.
fig, ax = plt.subplots(figsize=(10, 8))

# Statistical vs Data-Driven perspective
ax.set_xlim(0, 10)
ax.set_ylim(0, 10)
ax.axis('off')

# Statistical foundations (left side)
# Each entry is (x centre, y centre, label, fill colour)
stat_components = [
    (2, 8, 'Detection\nEstimation', '#E3F2FD'),
    (2, 6.5, 'MMSE\nEstimation', '#E3F2FD'),
    (2, 5, 'ML/MAP\nParameter Est.', '#E3F2FD'),
    (2, 3.5, 'Regression', '#E3F2FD'),
    (2, 2, 'Classification', '#E3F2FD')
]

for x, y, label, color in stat_components:
    # 1.6 x 0.8 rounded box centred on (x, y)
    rect = patches.FancyBboxPatch((x-0.8, y-0.4), 1.6, 0.8,
                                  boxstyle="round,pad=0.05",
                                  facecolor=color, edgecolor='#1976D2', linewidth=2)
    ax.add_patch(rect)
    ax.text(x, y, label, ha='center', va='center', fontsize=9, fontweight='bold')

# Data-driven methods (right side)
# Same (x centre, y centre, label, fill colour) layout as the left column
ml_components = [
    (8, 8, 'Neural\nNetworks', '#FFEBEE'),
    (8, 6.5, 'Deep\nLearning', '#FFEBEE'),
    (8, 5, 'CNNs', '#FFEBEE'),
    (8, 3.5, 'RNNs', '#FFEBEE'),
    (8, 2, 'Autoencoders', '#FFEBEE')
]

for x, y, label, color in ml_components:
    rect = patches.FancyBboxPatch((x-0.8, y-0.4), 1.6, 0.8,
                                  boxstyle="round,pad=0.05",
                                  facecolor=color, edgecolor='#C62828', linewidth=2)
    ax.add_patch(rect)
    ax.text(x, y, label, ha='center', va='center', fontsize=9, fontweight='bold')

# Bridge in the middle
bridge_y = 5
rect = patches.FancyBboxPatch((4, bridge_y-0.5), 2, 1,
                              boxstyle="round,pad=0.05",
                              facecolor='#E8F5E9', edgecolor='#2E7D32', linewidth=3)
ax.add_patch(rect)
ax.text(5, bridge_y, 'Optimization\n& Learning', ha='center', va='center', 
        fontsize=10, fontweight='bold')

# Arrows showing progression
# Each arrow runs from just below one box (y - 0.45) to just above the next (y + 0.45)
for i in range(len(stat_components)-1):
    y1 = stat_components[i][1]
    y2 = stat_components[i+1][1]
    arrow = patches.FancyArrowPatch((2, y1-0.45), (2, y2+0.45),
                                   mutation_scale=15, arrowstyle='->', 
                                   color='#1976D2', alpha=0.5, linewidth=2)
    ax.add_patch(arrow)

for i in range(len(ml_components)-1):
    y1 = ml_components[i][1]
    y2 = ml_components[i+1][1]
    arrow = patches.FancyArrowPatch((8, y1-0.45), (8, y2+0.45),
                                   mutation_scale=15, arrowstyle='->', 
                                   color='#C62828', alpha=0.5, linewidth=2)
    ax.add_patch(arrow)

# Connecting arrows
# Horizontal arrows at y=5: left column -> bridge, then bridge -> right column
arrow_left = patches.FancyArrowPatch((2.85, 5), (3.95, 5),
                                    mutation_scale=20, arrowstyle='->', 
                                    color='#2E7D32', linewidth=2)
ax.add_patch(arrow_left)

arrow_right = patches.FancyArrowPatch((6.05, 5), (7.15, 5),
                                     mutation_scale=20, arrowstyle='->', 
                                     color='#2E7D32', linewidth=2)
ax.add_patch(arrow_right)

# Column headings
ax.text(1, 9, 'Classical/Statistical', fontsize=12, fontweight='bold', color='#1976D2')
ax.text(7, 9, 'Modern/Data-Driven', fontsize=12, fontweight='bold', color='#C62828')

plt.tight_layout()
plt.show()

Scaling Laws and Emergent Capabilities

#| echo: false
"""
Four-panel visualization showing emergent abilities appearing at different model scales, in-context learning improvements with model size, compute-optimal scaling laws, and the exponential growth of model parameters from 2018-2025.
"""

#| fig-align: center

# 2x2 grid of synthetic curves: emergent abilities, in-context learning,
# compute scaling, and model-size timeline.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Emergent abilities
ax = axes[0, 0]
model_sizes = np.logspace(7, 11, 50)

# Different tasks emerge at different scales
# Each entry is (task name, parameter threshold, saturation level in %, colour)
tasks = [
    ('Arithmetic', 1e8, 80, '#1976D2'),
    ('Translation', 1e9, 85, '#2E7D32'),
    ('Reasoning', 10e9, 90, '#F57C00'),
    ('Code Generation', 50e9, 92, '#7B1FA2')
]

for task, threshold, max_perf, color in tasks:
    # Zero below the threshold, then exponential saturation towards max_perf
    perf = np.zeros_like(model_sizes)
    mask = model_sizes > threshold
    perf[mask] = max_perf * (1 - np.exp(-(model_sizes[mask] - threshold) / threshold))
    ax.semilogx(model_sizes, perf, linewidth=2, label=task, color=color)

ax.set_xlabel('Model Size (Parameters)')
ax.set_ylabel('Task Performance (%)')
ax.set_title('Emergent Abilities with Scale', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

# In-context learning
ax = axes[0, 1]
n_examples = [0, 1, 2, 4, 8, 16]
small_model = [50, 52, 53, 54, 54, 54]
large_model = [50, 65, 75, 82, 87, 90]

ax.plot(n_examples, small_model, 'o-', linewidth=2, label='Small Model (1B)', 
        color='#C62828', markersize=8)
ax.plot(n_examples, large_model, 's-', linewidth=2, label='Large Model (100B)', 
        color='#1976D2', markersize=8)
ax.set_xlabel('Number of In-Context Examples')
ax.set_ylabel('Task Accuracy (%)')
ax.set_title('In-Context Learning', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

# Scaling laws
ax = axes[1, 0]
compute = np.logspace(15, 25, 50)
# Power-law decay in compute (straight lines on log-log axes). The earlier
# log-linear form fell below zero across this range (down to -3.5), which is
# not a valid loss value.
relative_compute = compute / 1e15
loss_small_data = 2.5 * relative_compute ** -0.03
loss_large_data = 2.0 * relative_compute ** -0.04
loss_optimal = 1.5 * relative_compute ** -0.05

ax.loglog(compute, loss_small_data, '--', linewidth=2, label='Data Limited', color='#C62828')
ax.loglog(compute, loss_large_data, '--', linewidth=2, label='Parameter Limited', color='#1976D2')
ax.loglog(compute, loss_optimal, '-', linewidth=2, label='Optimal Scaling', color='#2E7D32')
ax.set_xlabel('Compute Budget (FLOPs)')
ax.set_ylabel('Loss')
ax.set_title('Scaling Laws (Chinchilla)', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

# Model timeline
ax = axes[1, 1]
years = np.array([2018, 2019, 2020, 2021, 2022, 2023])
model_sizes = np.array([0.1, 1.5, 175, 530, 540, 1000])  # Billions

ax.semilogy(years, model_sizes, 'o-', linewidth=2, markersize=8, color='#7B1FA2')
models = ['BERT', 'GPT-2', 'GPT-3', 'MT-NLG', 'PaLM', 'Estimated\nFrontier']
for year, size, name in zip(years, model_sizes, models):
    # Multiplicative offset keeps labels a constant distance above on a log axis
    ax.text(year, size*1.3, name, ha='center', fontsize=8)

# Add projection line
years_future = np.array([2023, 2025])
sizes_future = np.array([1000, 2500])
ax.semilogy(years_future, sizes_future, '--', linewidth=2, markersize=8,
           color='#7B1FA2', alpha=0.5, label='Projected trend')

ax.set_xlabel('Year')
ax.set_ylabel('Model Size (Billion Parameters)')
ax.set_title('The Race to Scale', fontweight='bold')
ax.legend()  # without this the 'Projected trend' label is never displayed
ax.grid(True, alpha=0.3)
ax.set_xlim(2017.5, 2025.5)

plt.suptitle('Large Models: New Capabilities from Scale', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

What these scales cost:

GPT-2 (1.5B params, 2019):

GPT-3 (175B params, 2020):

PaLM (540B params, 2022):

Scale is not just about bigger numbers - it's about fundamentally different resource requirements.

Model Compression and Acceleration

#| echo: false
"""
Model compression techniques comparing quantization bit-width tradeoffs, knowledge distillation from teacher to student models, unstructured (random weight) network pruning, and efficient architecture designs.
"""

#| fig-align: center

# Imported here so this chunk does not rely on an earlier chunk having
# brought `patches` into the session.
import matplotlib.patches as patches

# 2x3 grid of illustrative (hard-coded) panels; five are used.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes[1, 2].axis('off')  # Leave bottom-right empty

# Quantization
ax = axes[0, 0]
bit_widths = [32, 16, 8, 4, 2, 1]
model_size = [100, 50, 25, 12.5, 6.25, 3.125]  # halves with each halving of bit width
accuracy = [95, 94.8, 94.5, 93, 88, 75]

# Model size on the left axis, accuracy on a twin right axis
ax.plot(bit_widths, model_size, 'o-', linewidth=2, label='Model Size (MB)', color='#1976D2')
ax2 = ax.twinx()
ax2.plot(bit_widths, accuracy, 's-', linewidth=2, label='Accuracy (%)', color='#C62828')

ax.set_xlabel('Bit Width')
ax.set_ylabel('Model Size (MB)', color='#1976D2')
ax2.set_ylabel('Accuracy (%)', color='#C62828')
ax.set_title('Quantization Trade-offs', fontweight='bold')
ax.grid(True, alpha=0.3)
ax.invert_xaxis()  # read left to right as 32-bit -> 1-bit

# Distillation
ax = axes[0, 1]
models = ['Teacher\n(BERT)', 'Student 1\n(50%)', 'Student 2\n(25%)', 'Student 3\n(10%)']
params = [340, 170, 85, 34]
performance = [95, 92, 88, 82]

x = np.arange(len(models))
width = 0.35

bars1 = ax.bar(x - width/2, params, width, label='Parameters (M)', color='#1976D2', alpha=0.7)
bars2 = ax.bar(x + width/2, performance, width, label='Performance (%)', color='#2E7D32', alpha=0.7)

ax.set_xticks(x)
ax.set_xticklabels(models, fontsize=8)
ax.set_ylabel('Value')
ax.set_title('Knowledge Distillation', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

# Pruning patterns
ax = axes[0, 2]
ax.set_xlim(0, 10)
ax.set_ylim(0, 10)
ax.axis('off')

# Neurons: two columns of five on the left, three columns of five on the right
for i in range(5):
    for j in range(5):
        if i < 2:  # Input layer
            x, y = 1 + i*0.4, 3 + j*1
        else:  # Output layer
            x, y = 8 + (i-2)*0.4, 3 + j*1
        circle = patches.Circle((x, y), 0.15, facecolor='#E3F2FD', 
                               edgecolor='black', alpha=0.8)
        ax.add_patch(circle)

# Connections (sparse)
# NOTE: this reseeds NumPy's global RNG, so later chunks that draw random
# numbers are affected by it as well.
np.random.seed(42)
for i in range(5):
    for j in range(5):
        if np.random.random() > 0.7:  # 70% pruned
            x1, y1 = 1.6, 3 + i*1
            x2, y2 = 8, 3 + j*1
            ax.plot([x1, x2], [y1, y2], color='#C62828', alpha=0.3, linewidth=1)

# Individual connections are dropped at random, which is unstructured pruning.
# Structured pruning removes whole neurons, channels or filters.
ax.text(5, 1, 'Unstructured Pruning\n70% weights removed', 
        ha='center', fontsize=10, fontweight='bold')
ax.set_title('Network Pruning', fontweight='bold')

# Efficient architectures
ax = axes[1, 0]
architectures = ['ResNet50', 'MobileNet', 'EfficientNet', 'Vision\nTransformer']
params_arch = [25, 4, 5, 86]
accuracy_arch = [92, 88, 93, 95]
inference_time = [10, 3, 4, 25]  # ms

colors_arch = ['#C62828', '#2E7D32', '#1976D2', '#F57C00']
# Marker area encodes inference time
ax.scatter(params_arch, accuracy_arch, s=[t*20 for t in inference_time], 
          c=colors_arch, alpha=0.6)

for i, arch in enumerate(architectures):
    ax.annotate(arch, (params_arch[i], accuracy_arch[i]), 
               xytext=(5, 5), textcoords='offset points', fontsize=8)

ax.set_xlabel('Parameters (M)')
ax.set_ylabel('Accuracy (%)')
ax.set_title('Efficient Architecture Design', fontweight='bold')
ax.grid(True, alpha=0.3)

# Mixed precision training
ax = axes[1, 1]
training_methods = ['FP32', 'FP16', 'Mixed\nPrecision', 'BF16']
memory_usage = [100, 50, 60, 50]
training_speed = [1, 1.8, 1.7, 1.9]
stability = [100, 70, 95, 90]

x = np.arange(len(training_methods))
width = 0.25

bars1 = ax.bar(x - width, memory_usage, width, label='Memory (%)', color='#1976D2', alpha=0.7)
# Speed multipliers are scaled by 50 so they are visible next to the percentages
bars2 = ax.bar(x, np.array(training_speed)*50, width, label='Speed (x50)', color='#2E7D32', alpha=0.7)
bars3 = ax.bar(x + width, stability, width, label='Stability (%)', color='#C62828', alpha=0.7)

ax.set_xticks(x)
ax.set_xticklabels(training_methods)
ax.set_ylabel('Relative Value')
ax.set_title('Mixed Precision Training', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

Python Environment Setup

---

Environment Management: Why It Matters

Dependency Conflicts

# System Python - Don't do this
pip install torch
# Error: requires numpy>=1.19
pip install numpy==1.20
# Breaks: opencv requires numpy==1.18

Conflicts are inevitable. Each project needs:

Virtual Environments

#| echo: false
"""
Diagram showing system Python connecting to three isolated virtual environments (ee541, research, web_dev), each with its own Python version and packages.
"""

#| fig-align: center

import matplotlib.pyplot as plt
import matplotlib.patches as patches

# Box diagram on a blank 10x8 canvas: one system box on top, three
# environment boxes below, with an arrow from the system box to each.
fig, ax = plt.subplots(figsize=(8, 6))
ax.set_xlim(0, 10)
ax.set_ylim(0, 8)
ax.axis('off')

# System Python
system = patches.Rectangle((3.5, 6), 3, 1.2, facecolor='#FFCDD2', alpha=0.8, 
                          edgecolor='#C62828', linewidth=2)
ax.add_patch(system)
ax.text(5, 6.6, 'System Python', ha='center', va='center', fontweight='bold', fontsize=10)

# Virtual environments
# Each entry is (x centre, y centre, label, fill colour)
envs = [
    (2, 3.5, 'ee541\nPyTorch 2.0\nPython 3.11', '#E3F2FD'),
    (5, 3.5, 'research\nTF 2.15\nPython 3.10', '#E8F5E9'),
    (8, 3.5, 'web_dev\nDjango 4.2\nPython 3.12', '#FFF9C4')
]

for x, y, label, color in envs:
    # 2 x 1.6 box centred on (x, y); its top edge is at y + 0.8
    rect = patches.Rectangle((x-1, y-0.8), 2, 1.6, facecolor=color, alpha=0.9, 
                            edgecolor='#212121', linewidth=2)
    ax.add_patch(rect)
    ax.text(x, y, label, ha='center', va='center', fontsize=9)
    
    # Arrow from system: starts just under the system box and stops with its
    # tip on the environment box's top edge. The previous offset (y - 5.3)
    # pushed the tip 0.3 units into the box.
    ax.arrow(5, 5.8, x-5, (y+0.8)-5.8, head_width=0.1, head_length=0.05, 
            length_includes_head=True,
            fc='#9E9E9E', ec='#9E9E9E', alpha=0.5, linestyle='--', linewidth=1)

ax.text(5, 1.5, 'Isolated Environments', ha='center', fontsize=11, fontweight='bold')
ax.text(5, 0.8, 'No conflicts, reproducible, deletable', ha='center', fontsize=9, style='italic')

plt.tight_layout()
plt.show()

Conda: Package and Environment Management

Installation

# Download Miniconda (minimal) or Anaconda (full)
# miniconda.anaconda.com

# After installation, verify:
conda --version
conda info

# Update conda itself
conda update -n base conda

Create Course Environment

# Create environment with Python 3.11
conda create -n ee541 python=3.11

# Activate environment
conda activate ee541

# Your prompt changes:
(ee541) $ 

# Deactivate when done
conda deactivate

Why Conda for Deep Learning

Binary package management

Cross-platform

Channel system

Environment files

Essential Package Installation

# Activate your environment first
conda activate ee541

# Core scientific stack
conda install numpy scipy matplotlib pandas

# Jupyter for notebooks
conda install jupyter ipykernel

# Register kernel for Jupyter
python -m ipykernel install --user --name ee541 --display-name "Python (ee541)"

# PyTorch - SELECT BASED ON YOUR SYSTEM
# CPU only
conda install pytorch torchvision torchaudio cpuonly -c pytorch

# CUDA 11.8 (NVIDIA GPU)
conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia

# Mac M1/M2/M3 (Metal Performance Shaders)
conda install pytorch torchvision torchaudio -c pytorch

# Additional ML tools
conda install scikit-learn
conda install -c conda-forge tensorboard

GPU Configuration Check

  • NVIDIA: Check CUDA version with nvidia-smi
  • AMD: ROCm support limited, use CPU fallback
  • Mac (MPS): Automatic detection in PyTorch 1.12+
# Standard device selection
import torch

# Preference order: CUDA GPU first, then Apple MPS, otherwise the CPU.
if torch.cuda.is_available():
    backend = "cuda"
elif torch.backends.mps.is_available():
    backend = "mps"
else:
    backend = "cpu"
device = torch.device(backend)

print(f"Using device: {device}")

Jupyter Notebooks: Interactive Development

Starting Jupyter

# From terminal with environment active
(ee541) $ jupyter notebook

# Opens browser at localhost:8888
# Navigate to your work directory

Notebook Structure

Key Shortcuts

Jupyter Notebook Interface

Real-World Example: Trading Algorithm

Jupyter Project Example

Project Management

Version Control

git init
git add .
git commit -m "Initial commit"

# But exclude:
# .gitignore contents:
*.pyc
__pycache__/
.ipynb_checkpoints/
data/
*.pt
*.pth

Reproducibility

# Save environment
conda env export > environment.yml

# Recreate elsewhere
conda env create -f environment.yml

# Save pip requirements
pip freeze > requirements.txt

Verifying Your Environment

#| echo: true
#| code-fold: false

import importlib
import platform
import sys

print(f"Python: {sys.version}")
print(f"Platform: {platform.platform()}")

# Import name -> conda package name (None when the two are the same).
# torch is published on conda as "pytorch", sklearn as "scikit-learn".
packages = {
    'numpy': None,
    'torch': 'pytorch',
    'torchvision': None,
    'matplotlib': None,
    'jupyter': None,
    'sklearn': 'scikit-learn'
}

for import_name, conda_name in packages.items():
    # Keep the try body to the import itself so that only a missing package
    # is reported as "Not installed".
    try:
        module = importlib.import_module(import_name)
    except ImportError:
        package = conda_name or import_name
        print(f"✗ {import_name}: Not installed")
        print(f"  Install with: conda install {package}")
        continue

    version = getattr(module, '__version__', 'installed')
    print(f"✓ {import_name}: {version}")

    # Special check for PyTorch GPU
    if import_name == 'torch':
        torch = module
        # torch.backends.mps is absent on older PyTorch builds; treat that
        # the same as "MPS not available".
        mps = getattr(torch.backends, 'mps', None)
        if torch.cuda.is_available():
            print(f"  GPU: CUDA ({torch.version.cuda})")
            print(f"  Device: {torch.cuda.get_device_name(0)}")
        elif mps is not None and mps.is_available():
            print("  GPU: MPS (Mac)")
        else:
            print("  GPU: Not available")

Environment Management Commands

Conda Essentials

# List environments
conda env list

# Create from file
conda env create -f environment.yml

# Clone environment
conda create --name ee541_backup --clone ee541

# Remove environment
conda env remove -n ee541

# Update all packages
conda update --all

# Clean cache (free space)
conda clean --all

Package Management

# Search for package
conda search pytorch

# Install specific version
conda install pytorch=2.0.1

# List installed packages
conda list

# Check for updates
conda update --dry-run --all

# Channel priority
conda config --add channels conda-forge
conda config --set channel_priority strict

PyTorch Demo

---

Fashion-MNIST: 87% Accuracy in Four Epochs

What We're Building

Task: Classify clothing items into 10 categories

Architecture: Simple 2-layer network

Training: 4 epochs, Adam optimizer

Files:

![](sections/images/12-pytorch-demo/fashion_0_tshirt.png) **T-shirt**
![](sections/images/12-pytorch-demo/fashion_1_trouser.png) **Trouser**
![](sections/images/12-pytorch-demo/fashion_2_pullover.png) **Pullover**
![](sections/images/12-pytorch-demo/fashion_3_dress.png) **Dress**
![](sections/images/12-pytorch-demo/fashion_4_coat.png) **Coat**
![](sections/images/12-pytorch-demo/fashion_5_sandal.png) **Sandal**
![](sections/images/12-pytorch-demo/fashion_6_shirt.png) **Shirt**
![](sections/images/12-pytorch-demo/fashion_7_sneaker.png) **Sneaker**
![](sections/images/12-pytorch-demo/fashion_8_bag.png) **Bag**
![](sections/images/12-pytorch-demo/fashion_9_boot.png) **Boot**

Core Training Loop Structure

# Minimal.ipynb - Key components

# 1. Data Loading
# Batches of 100; only the training set is shuffled, the test order is fixed.
# NOTE(review): DataLoader, train_set and test_set are defined outside this excerpt.
train_loader = DataLoader(train_set, batch_size=100, shuffle=True)
test_loader = DataLoader(test_set, batch_size=100, shuffle=False)

# 2. Model Definition
class Net(nn.Module):
    """Two-layer fully connected classifier: 784 inputs -> 128 ReLU units -> 10 scores."""

    def __init__(self):
        super().__init__()
        # Layers are created in the same order as before so parameter
        # initialization consumes the RNG identically.
        self.hidden = nn.Linear(in_features=784, out_features=128)
        self.output = nn.Linear(in_features=128, out_features=10)

    def forward(self, x):
        """Return the output layer's raw scores for inputs whose last dimension is 784."""
        activations = F.relu(self.hidden(x))
        scores = self.output(activations)
        return scores

# 3. Training Loop
for epoch in range(num_epochs):
    for images, labels in train_loader:
        # Clear gradients accumulated by the previous step
        optimizer.zero_grad()

        # Forward pass on images flattened to 784-element rows
        flat_images = images.view(-1, 784)
        outputs = model(flat_images)
        loss = loss_func(outputs, labels)

        # Backward pass, then parameter update
        loss.backward()
        optimizer.step()
Training completes in ~5 minutes on CPU, under 1 minute on GPU

Training Dynamics

#| echo: false
#| fig-align: center
"""
Training dynamics showing loss decay and accuracy improvement over four epochs, with characteristic rapid initial learning followed by diminishing returns.
"""

# NOTE: both curves are synthetic (closed-form decay plus noise), not logged
# training data. Cell options (`#|`) are kept together at the top of the cell
# so that Quarto parses `fig-align` as an option rather than a plain comment.

# Headline accuracy quoted on the slide; used for the plateau, the reference
# line and the label so the three cannot drift apart.
final_acc = 87

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

iterations = np.arange(0, 240, 1)
epochs = iterations / 60  # 60 plotted iterations per epoch -> 4 epochs

# Loss curve: exponential decay towards a floor of 0.3 with a small ripple
ax = axes[0]
loss = 2.3 * np.exp(-epochs/2) + 0.3 + 0.05*np.sin(iterations/5)
ax.plot(iterations, loss, 'b-', linewidth=2)
ax.set_xlabel('Iteration')
ax.set_ylabel('Cross-Entropy Loss')
ax.set_title('Training Loss', fontweight='bold')
ax.grid(True, alpha=0.3)

# Mark the start of each epoch and label its midpoint
for ep in range(4):
    ax.axvline(x=ep*60, color='gray', linestyle='--', alpha=0.3)
    ax.text(ep*60 + 30, 2.2, f'Epoch {ep+1}', ha='center', fontsize=9)

# Accuracy curve: saturating rise plus Gaussian noise
ax = axes[1]
accuracy = 100 * (1 - np.exp(-epochs/1.5)) + np.random.normal(0, 0.5, len(epochs))
# Cap at the quoted final accuracy (previously 89, which left the plateau
# two points above the "Final: ~87%" reference line).
accuracy = np.clip(accuracy, 0, final_acc)
ax.plot(iterations[::5], accuracy[::5], 'g-', linewidth=2, marker='o', markersize=3)
ax.set_xlabel('Iteration')
ax.set_ylabel('Test Accuracy (%)')
ax.set_title('Model Performance', fontweight='bold')
ax.grid(True, alpha=0.3)
ax.set_ylim(0, 100)

ax.axhline(y=final_acc, color='red', linestyle='--', alpha=0.5)
ax.text(200, final_acc + 2, f'Final: ~{final_acc}%', fontsize=10, fontweight='bold')

plt.suptitle('Typical Training Progress', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

Observations:

TensorBoard Visualization

Launch TensorBoard

# Run in a terminal
tensorboard --logdir=runs
# Then open http://localhost:6006 in a browser

Available Visualizations

#| echo: false
#| fig-align: center
"""
PCA embedding visualization showing 10 classes forming distinct clusters in 2D feature space.
"""

# NOTE: the points are synthetic Gaussian blobs, not a PCA of real features.
# Cell options (`#|`) are kept together at the top of the cell so that Quarto
# parses `fig-align` as an option rather than a plain comment.

fig, ax = plt.subplots(figsize=(8, 8))

np.random.seed(42)
n_points = 500
n_classes = 10
points_per_class = n_points // n_classes

embeddings = []
labels = []
colors = plt.cm.tab10(np.arange(n_classes))

# One blob per class: a random centre (spread 3) with tight scatter (spread 0.5).
# The draw order (centre, then points) is kept so the figure is unchanged.
for i in range(n_classes):
    center = np.random.randn(2) * 3
    points = center + np.random.randn(points_per_class, 2) * 0.5
    embeddings.append(points)
    labels.extend([i] * points_per_class)

embeddings = np.vstack(embeddings)
class_ids = np.asarray(labels)  # built once instead of on every loop pass

for i in range(n_classes):
    mask = class_ids == i
    ax.scatter(embeddings[mask, 0], embeddings[mask, 1],
               c=[colors[i]], s=20, alpha=0.6, label=f'Class {i}')

ax.set_xlabel('First Principal Component')
ax.set_ylabel('Second Principal Component')
ax.set_title('Embedding Visualization (PCA)', fontweight='bold', fontsize=12)
ax.grid(True, alpha=0.3)
ax.legend(ncol=2, fontsize=8, loc='upper right')

plt.tight_layout()
plt.show()

Embedding Insight: Classes form distinct clusters in feature space

Model Architecture Inspection

#| echo: false
#| fig-align: center
"""
Neural network architecture diagram showing 784→128→10 layer structure alongside parameter summary table with 101K total parameters.
"""

# Cell options (`#|`) are kept together at the top of the cell so that Quarto
# parses `fig-align` as an option rather than a plain comment.
# Circles are created through `plt.Circle` (pyplot's re-export of
# matplotlib.patches.Circle) so the cell does not rely on a separate
# `patches` import being present elsewhere in the document.

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Architecture diagram (left panel), drawn on a 10 x 10 canvas without axes
ax = axes[0]
ax.set_xlim(0, 10)
ax.set_ylim(0, 10)
ax.axis('off')

# Input layer: 8 nodes drawn at x=2 to stand in for the 784 inputs
for i in range(8):
    y = 2 + i * 0.8
    circle = plt.Circle((2, y), 0.2, facecolor='lightblue', edgecolor='black')
    ax.add_patch(circle)
ax.text(2, 1, '784 inputs\n(flattened)', ha='center', fontsize=9)
ax.text(2, 8, '...', ha='center', fontsize=12)

# Hidden layer: 6 nodes drawn at x=5 to stand in for the 128 hidden units
for i in range(6):
    y = 3 + i * 0.8
    circle = plt.Circle((5, y), 0.25, facecolor='lightgreen', edgecolor='black')
    ax.add_patch(circle)
ax.text(5, 2, '128 hidden\n(ReLU)', ha='center', fontsize=9)
ax.text(5, 7.5, '...', ha='center', fontsize=12)

# Output layer: 5 nodes drawn at x=8 to stand in for the 10 class outputs
for i in range(5):
    y = 3.5 + i * 0.6
    circle = plt.Circle((8, y), 0.2, facecolor='lightcoral', edgecolor='black')
    ax.add_patch(circle)
ax.text(8, 2.5, '10 outputs\n(classes)', ha='center', fontsize=9)

# Connections (subset): only the lowest 3 nodes of each layer are linked,
# from the right edge of one circle to the left edge of the next
for i in range(3):
    for j in range(3):
        ax.plot([2.2, 4.75], [2 + i*0.8, 3 + j*0.8], 'gray', alpha=0.4, linewidth=0.7)
        ax.plot([5.25, 7.8], [3 + i*0.8, 3.5 + j*0.6], 'gray', alpha=0.4, linewidth=0.7)

ax.set_title('Network Architecture', fontweight='bold', fontsize=12)

# Parameter summary (right panel), rendered as monospace text
ax = axes[1]
ax.axis('off')

# Counts: 784*128 + 128 = 100,480 and 128*10 + 10 = 1,290
summary_text = """Model Summary (torchinfo output):
==================================================
Layer (type)         Output Shape         Param #
==================================================
Linear-1             [-1, 128]            100,480
ReLU-2               [-1, 128]            0
Linear-3             [-1, 10]             1,290
==================================================
Total params: 101,770
Trainable params: 101,770
Non-trainable params: 0
==================================================

Storage: model.pth (~400 KB)
Inference time: <1ms per image"""

ax.text(0.1, 0.9, summary_text, fontsize=10, family='monospace',
        verticalalignment='top', transform=ax.transAxes)
ax.set_title('Model Details', fontweight='bold', fontsize=12)

plt.tight_layout()
plt.show()

87% Accuracy Achieved with Simple Architecture

Implementation Results

Not Addressed (Future Topics)

Main Files

1-fashion-mnist.ipynb      # Dataset exploration
2-minimal-pytorch.ipynb    # Core training
3-feature-visualization.ipynb  # TensorBoard
#| echo: false
#| fig-align: center
"""
Synthetic confusion matrix for Fashion-MNIST classification: cells are annotated with prediction counts across 10 clothing categories and shaded by the row-normalized percentage.
"""

# Cell options (`#|`) are kept together at the top of the cell so that Quarto
# parses `fig-align` as an option rather than a plain comment.

fig, ax = plt.subplots(figsize=(8, 8))

categories_short = ['Tshirt', 'Trouser', 'Pullover', 'Dress', 'Coat',
                    'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Boot']

# Placeholder data: 80 on the diagonal plus uniform random integers in [0, 10)
# everywhere, so no particular pair of classes is singled out as confused.
np.random.seed(42)
conf_matrix = np.eye(10) * 80 + np.random.randint(0, 10, (10, 10))
conf_matrix = conf_matrix.astype(int)

# Each row rescaled to percentages of that row's total
conf_matrix_norm = conf_matrix / conf_matrix.sum(axis=1, keepdims=True) * 100

im = ax.imshow(conf_matrix_norm, cmap='Blues', vmin=0, vmax=100)

# Annotate every cell with its raw count; use white text on dark cells
for i in range(10):
    for j in range(10):
        ax.text(j, i, f'{conf_matrix[i, j]}',
                ha='center', va='center',
                color='white' if conf_matrix_norm[i, j] > 50 else 'black',
                fontsize=8)

ax.set_xticks(range(10))
ax.set_yticks(range(10))
ax.set_xticklabels(categories_short, rotation=45, ha='right', fontsize=9)
ax.set_yticklabels(categories_short, fontsize=9)
ax.set_xlabel('Predicted', fontsize=10)
ax.set_ylabel('Actual', fontsize=10)
ax.set_title('Confusion Matrix', fontsize=12, fontweight='bold')

# The colour scale encodes the row-normalized percentages passed to imshow,
# not the raw counts printed in the cells.
plt.colorbar(im, ax=ax, label='Share of actual class (%)')
plt.tight_layout()
plt.show()

Common Confusions: Shirt ↔ T-shirt, Pullover ↔ Coat

Python and NumPy for Neural Networks

Next week: Array operations and automatic differentiation