Minimize error on unseen data using only observed samples
This gap defines machine learning


#| echo: false
"""
Grid of ten handwritten digit samples (0-9) from scikit-learn's 8x8 digits
dataset (load_digits, not MNIST), illustrating the raw pixel input
representation for classification.
"""
#| fig-align: center
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt
import numpy as np

digits = load_digits()

fig, axes = plt.subplots(2, 5, figsize=(12, 6))

for i, ax in enumerate(axes.flat):
    ax.imshow(digits.images[i], cmap='gray_r', interpolation='nearest')
    ax.set_title(f'Label: {digits.target[i]}', fontsize=12, fontweight='bold')

    # Hide only the ticks. ax.axis('off') would also hide the spines, so the
    # border styled below would never be drawn.
    ax.set_xticks([])
    ax.set_yticks([])

    # Add subtle border
    for spine in ax.spines.values():
        spine.set_edgecolor('#E0E0E0')
        spine.set_linewidth(1)

plt.tight_layout()
plt.show()
#| echo: false
"""
Four representations of the same handwritten digit: original grayscale pixels,
binary threshold, flattened 64-dimensional vector, and edge-detected gradient
features.
"""
#| fig-align: center
from scipy import ndimage

# Relies on `digits`, `plt` and `np` defined by an earlier cell.
fig, axes = plt.subplots(1, 4, figsize=(16, 4))

digit_idx = 0
original = digits.images[digit_idx]

# Original
ax = axes[0]
ax.imshow(original, cmap='gray_r', interpolation='nearest')
ax.set_title('Original Pixels\n(8×8 grayscale)', fontweight='bold')
ax.axis('off')

# Binary threshold
ax = axes[1]
binary = (original > 7).astype(int)
ax.imshow(binary, cmap='gray_r', interpolation='nearest')
ax.set_title('Binary Threshold\n(0 or 1)', fontweight='bold')
ax.axis('off')

# Flattened vector
ax = axes[2]
flattened = original.flatten()
ax.bar(range(len(flattened)), flattened, color='#1976D2', width=1.0,
       edgecolor='none')
ax.set_title('Flattened Vector\n(64-dimensional)', fontweight='bold')
ax.set_xlabel('Pixel Index')
ax.set_ylabel('Intensity')
ax.set_ylim(0, 16)
ax.grid(True, alpha=0.3)

# Edge detection
ax = axes[3]
# sobel() differentiates along a single axis (the last one by default), so
# combine both directions to show the full gradient magnitude.
grad_rows = ndimage.sobel(original, axis=0)
grad_cols = ndimage.sobel(original, axis=1)
edges = np.hypot(grad_rows, grad_cols)
ax.imshow(edges, cmap='hot', interpolation='nearest')
ax.set_title('Edge Features\n(Gradient)', fontweight='bold')
ax.axis('off')

plt.suptitle(
    f'Same Data, Different Representations (Digit: {digits.target[digit_idx]})',
    fontsize=16, fontweight='bold', y=1.05)
plt.tight_layout()
plt.show()
The choice of representation can make learning tractable or impossible. Deep learning learns representations automatically.




x = 0111111011100100000010000
001011111111101111001110
Label: "GOOD"
The same pattern can be represented as:
A hypothesis class can succeed or fail based on the choice of representation.
A function \(f: \mathbb{R}^d \rightarrow \mathbb{R}\) is linear if \(f(\mathbf{x}) = \mathbf{w}^\top \mathbf{x} + b\) for some \(\mathbf{w} \in \mathbb{R}^d\) and \(b \in \mathbb{R}\). The decision boundary \(\{\mathbf{x} : f(\mathbf{x}) = 0\}\) is a hyperplane.
where:
#| echo: false
"""
XOR classification problem showing underfitting (linear boundary), good fit
(quadratic boundary), and overfitting (high-order polynomial) to illustrate
hypothesis class complexity trade-offs.
"""
#| fig-align: center
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

np.random.seed(42)
n_samples = 100
X = np.random.randn(n_samples, 2)
y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)

# Every panel shows the same two point clouds; only the boundary differs.
class_styles = [(~y, '#1976D2', 'Class 0'), (y, '#C62828', 'Class 1')]
panel_titles = [
    'Linear Classifier\n(Underfitting)',
    'Quadratic Classifier\n(Good Fit)',
    'High-Order Polynomial\n(Overfitting)',
]

for panel, (ax, panel_title) in enumerate(zip(axes, panel_titles)):
    for mask, colour, class_name in class_styles:
        ax.scatter(X[mask, 0], X[mask, 1], c=colour, alpha=0.6, s=50,
                   label=class_name)

    if panel == 0:
        # Linear boundary
        ax.axhline(y=0, color='#2E7D32', linestyle='--', linewidth=2,
                   label='Linear Boundary')
    elif panel == 1:
        # Quadratic boundary: contour of the XOR rule evaluated on a grid
        x_grid = np.linspace(-3, 3, 100)
        y_grid = np.linspace(-3, 3, 100)
        xx, yy = np.meshgrid(x_grid, y_grid)
        Z = np.logical_xor(xx > 0, yy > 0)
        ax.contour(xx, yy, Z, levels=[0.5], colors='#2E7D32', linewidths=2)
    else:
        # Complex boundary: small circles around every fifth sample when it
        # belongs to class 1
        for point, is_positive in zip(X[::5], y[::5]):
            if not is_positive:
                continue
            circle = plt.Circle((point[0], point[1]), 0.3, fill=False,
                                edgecolor='#2E7D32', linewidth=1, alpha=0.5)
            ax.add_patch(circle)

    ax.set_title(panel_title, fontweight='bold')
    ax.set_xlim(-3, 3)
    ax.set_ylim(-3, 3)
    ax.legend(loc='upper right')
    ax.grid(True, alpha=0.3)
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')

plt.suptitle('Hypothesis Class Complexity Trade-off', fontsize=16,
             fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()
where \(h\) is an activation function:
#| echo: false
"""
Perceptron architecture diagram showing inputs, weighted connections,
summation, activation function, and output.
"""
#| fig-align: center
import matplotlib.patches as patches
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 6))

input_y = [1.5, 2.5, 3.5]

# Node table: (centre, radius, face colour, edge colour, label, font size)
nodes = [
    ((2, level), 0.25, '#E3F2FD', '#1976D2', f'$x_{k}$', 11)
    for k, level in enumerate(input_y, start=1)
]
nodes += [
    ((2, 0.5), 0.25, '#F5F5F5', '#757575', '1', 11),          # bias
    ((5, 2.5), 0.35, '#FFF9C4', '#F57C00', '$\\Sigma$', 14),  # summation
    ((7, 2.5), 0.35, '#FFE0B2', '#F57C00', '$h$', 14),        # activation
    ((9, 2.5), 0.25, '#C8E6C9', '#4CAF50', '$y$', 12),        # output
]
for centre, radius, face, edge, caption, size in nodes:
    ax.add_patch(plt.Circle(centre, radius, color=face, ec=edge, linewidth=2))
    ax.text(centre[0], centre[1], caption, ha='center', va='center',
            fontsize=size, fontweight='bold')

# Connections from inputs to summation (spread out arrows).
# `slot` runs over -1, 0, 1 and fans the arrow tips around the summation node.
for slot, level in enumerate(input_y, start=-1):
    phase = np.arctan2(2.5 - level, 3) + np.pi/6 * slot
    end_x = 4.65 - 0.15 * np.cos(phase)
    end_y = 2.5 - 0.15 * np.sin(phase)

    # Line to the summation edge, then a short arrow head just outside it
    ax.plot([2.25, end_x], [level, end_y], 'k-', linewidth=1.5)
    ax.arrow(end_x - 0.05, end_y, 0.05, 0, head_width=0.08,
             head_length=0.03, fc='black')

    # Weight label, placed 40% of the way towards the summation height
    label_y = level + (2.5 - level) * 0.4
    ax.text(3.3, label_y + 0.15, f'$w_{slot + 2}$', fontsize=10,
            color='#212121', ha='center')

# Bias connection (also spread)
bias_phase = np.arctan2(2, 3) - np.pi/6
end_x_b = 4.65 - 0.15 * np.cos(bias_phase)
end_y_b = 2.5 - 0.15 * np.sin(bias_phase)
ax.plot([2.25, end_x_b], [0.5, end_y_b], 'k--', linewidth=1.5, alpha=0.7)
ax.arrow(end_x_b - 0.05, end_y_b, 0.05, 0, head_width=0.08,
         head_length=0.03, fc='black', alpha=0.7)
ax.text(3.3, 1.3, '$b$', fontsize=10, color='#212121', ha='center',
        style='italic')

# Summation -> activation, then activation -> output
for start_x in (5.35, 7.35):
    ax.arrow(start_x, 2.5, 1.3, 0, head_width=0.12, head_length=0.1,
             fc='black', linewidth=1.5)

# Output arrow
ax.arrow(9.25, 2.5, 0.5, 0, head_width=0.12, head_length=0.1,
         fc='#4CAF50', ec='#4CAF50', linewidth=2)

ax.set_xlim(1, 10.5)
ax.set_ylim(0, 4.5)
ax.axis('off')
plt.tight_layout()
plt.show()
#| echo: false
"""
Common activation functions (ReLU, Sigmoid, Tanh, Leaky ReLU) and their
derivatives, illustrating the nonlinear transformations and gradient behavior
critical for deep network training.
"""
#| fig-align: center
fig, axes = plt.subplots(2, 4, figsize=(14, 6))

x = np.linspace(-3, 3, 100)


def _sigmoid(z):
    """Logistic sigmoid."""
    return 1 / (1 + np.exp(-z))


def _sigmoid_prime(z):
    """Derivative of the logistic sigmoid."""
    sig = _sigmoid(z)
    return sig * (1 - sig)


# One column per activation:
# (name, colour code, function, derivative, y-limits top row, y-limits bottom row)
# A y-limit of None leaves matplotlib's automatic range in place.
activations = [
    ('ReLU', 'b',
     lambda z: np.maximum(0, z),
     lambda z: (z > 0).astype(float),
     (-0.5, 3), (-0.5, 1.5)),
    ('Sigmoid', 'r', _sigmoid, _sigmoid_prime, None, None),
    ('Tanh', 'g',
     np.tanh,
     lambda z: 1 - np.tanh(z)**2,
     None, None),
    ('Leaky ReLU', 'm',
     lambda z: np.where(z > 0, z, 0.1*z),
     lambda z: np.where(z > 0, 1, 0.1),
     None, (-0.5, 1.5)),
]

for column, (name, colour, func, deriv, top_ylim, bottom_ylim) in enumerate(activations):
    # Top row: the activation itself (solid line)
    ax = axes[0, column]
    ax.plot(x, func(x), f'{colour}-', linewidth=2)
    ax.set_title(name, fontweight='bold')
    ax.grid(True, alpha=0.3)
    if top_ylim is not None:
        ax.set_ylim(*top_ylim)

    # Bottom row: its derivative (dashed line)
    ax = axes[1, column]
    ax.plot(x, deriv(x), f'{colour}--', linewidth=2)
    ax.set_title(f"{name}'", fontweight='bold')
    ax.grid(True, alpha=0.3)
    if bottom_ylim is not None:
        ax.set_ylim(*bottom_ylim)

plt.suptitle('Activation Functions and Their Derivatives', fontsize=14,
             fontweight='bold')
plt.tight_layout()
plt.show()
Without nonlinear activation functions, stacking layers is pointless: the composition collapses to a single linear map, \(f(\mathbf{W}_2 \mathbf{W}_1 \mathbf{x}) = f(\mathbf{W} \mathbf{x})\) where \(\mathbf{W} = \mathbf{W}_2\mathbf{W}_1\).
Later topic: Gradient flow and vanishing gradients during backpropagation
Solution:
#| echo: true
#| code-fold: false
def sgd_step(w, x, y, learning_rate=0.01):
    """Take one stochastic gradient descent step for a linear model.

    The prediction is the dot product ``w . x``; the update direction
    ``(w . x - y) * x`` is the gradient of ``0.5 * (w . x - y)**2`` with
    respect to ``w``.

    Args:
        w: Current weight vector.
        x: Feature vector of a single sample (same length as ``w``).
        y: Target value for that sample.
        learning_rate: Step size applied to the gradient.

    Returns:
        The updated weight vector. ``w`` itself is not modified.
    """
    prediction = np.dot(w, x)
    error = prediction - y
    gradient = error * x
    w_new = w - learning_rate * gradient
    return w_new
#| echo: false
"""
Gradient descent optimization on a 2D loss surface with contour lines showing
the path from initialization to the minimum, alongside a convergence plot of
loss versus iteration.
"""
#| fig-align: center
def _quadratic_loss(a, b):
    """Elliptical bowl with its minimum at (1, 0.5)."""
    return 0.5 * ((a - 1)**2 + 2*(b - 0.5)**2)


def _quadratic_grad(w):
    """Gradient of `_quadratic_loss` at the point `w`."""
    return np.array([w[0] - 1, 2*(w[1] - 0.5)])


fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Loss surface
ax = axes[0]
w1 = np.linspace(-2, 3, 100)
w2 = np.linspace(-2, 3, 100)
W1, W2 = np.meshgrid(w1, w2)
Z = _quadratic_loss(W1, W2)

contour = ax.contour(W1, W2, Z, levels=20, cmap='viridis', alpha=0.6)
ax.clabel(contour, inline=True, fontsize=8)

# Gradient descent path
np.random.seed(42)  # nothing below draws random numbers; kept so the global RNG state is unchanged
w_init = np.array([-1.5, 2.5])
learning_rate = 0.1

current = w_init
path = [current]
for _ in range(20):
    current = current - learning_rate * _quadratic_grad(current)
    path.append(current)
path = np.array(path)

ax.plot(path[:, 0], path[:, 1], 'r.-', linewidth=2, markersize=8,
        label='GD Path', markeredgecolor='white', markeredgewidth=1)
ax.plot(1, 0.5, 'g*', markersize=15, label='Optimum',
        markeredgecolor='white', markeredgewidth=1)
ax.set_xlabel('$w_1$')
ax.set_ylabel('$w_2$')
ax.set_title('Gradient Descent on Loss Surface', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

# Convergence plot
ax = axes[1]
losses = [_quadratic_loss(w[0], w[1]) for w in path]
ax.plot(losses, 'o-', linewidth=2, markersize=6, color='#1976D2',
        markeredgecolor='white', markeredgewidth=1)
ax.set_xlabel('Iteration')
ax.set_ylabel('Loss')
ax.set_title('Loss Convergence', fontweight='bold')
ax.grid(True, alpha=0.3)
ax.set_yscale('log')

plt.tight_layout()
plt.show()
Gradient descent navigates the loss landscape by repeatedly moving in the direction of steepest descent. For convex problems, a suitably small step size guarantees convergence to the global minimum. For neural networks, whose loss surfaces are non-convex, we settle for local minima that generalize well.
Bias: Error from wrong model assumptions
Variance: Error from sensitivity to training data
Irreducible error (\(\sigma^2\)): Noise inherent in data
#| echo: false
"""
Bullseye target diagram showing four bias-variance scenarios: low/high bias
combined with low/high variance, where point clustering represents variance
and distance from center represents bias.
"""
#| fig-align: center
fig, ax = plt.subplots(figsize=(8, 8))

# 2x2 grid showing bias-variance combinations.
# (label, target centre x, target centre y, colour, bias shift, scatter std-dev)
# The bias shift is subtracted from both coordinates, pulling the cluster
# off-centre; the std-dev controls how widely the ten shots are spread.
scenarios = [
    ('Low Bias\nLow Variance', 1, 3, '#2E7D32', 0.0, 0.1),
    ('Low Bias\nHigh Variance', 3, 3, '#F57C00', 0.0, 0.4),
    ('High Bias\nLow Variance', 1, 1, '#1976D2', 0.4, 0.1),
    ('High Bias\nHigh Variance', 3, 1, '#C62828', 0.5, 0.4),
]

for label, cx, cy, color, bias_shift, spread in scenarios:
    # Draw target circles
    for radius in (0.3, 0.6):
        ax.add_patch(plt.Circle((cx, cy), radius, fill=False,
                                edgecolor='black', linewidth=2))

    # Draw points; re-seeding per target gives every panel the same
    # underlying random draws, scaled by its own spread.
    np.random.seed(42)
    points_x = cx - bias_shift + np.random.normal(0, spread, 10)
    points_y = cy - bias_shift + np.random.normal(0, spread, 10)

    ax.scatter(points_x, points_y, c=color, s=50, alpha=0.6,
               edgecolors='black')
    ax.text(cx, cy - 1, label, ha='center', fontsize=10, fontweight='bold')

ax.set_xlim(0, 4)
ax.set_ylim(0, 4)
ax.set_aspect('equal')
ax.axis('off')
ax.set_title('Target: Center of Bullseye', fontweight='bold', fontsize=14)
plt.tight_layout()
plt.show()
Tradeoff: Complex models reduce bias but increase variance
#| echo: false
"""
Polynomial fits of degrees 1, 3, 8, and 14 to noisy data, illustrating the
progression from underfitting (high bias) to overfitting (high variance).
"""
#| fig-align: center
fig, axes = plt.subplots(2, 2, figsize=(14, 10))


def y_true_func(x):
    """Noise-free target: a sine wave riding on a linear trend."""
    return np.sin(x) + 0.5 * x


# Generate true function
X_true = np.linspace(0, 10, 200)
y_true = y_true_func(X_true)

# Generate the SAME data points for all models
np.random.seed(42)
n_data = 15  # Use same 15 points for all
X_data = np.sort(np.random.uniform(0.5, 9.5, n_data))
y_data_clean = y_true_func(X_data)
y_data = y_data_clean + np.random.normal(0, 1.2, n_data)  # Consistent noise

# Add boundary points (not plotted) to control edge behavior
X_boundary_left = np.array([-1.0])
y_boundary_left = y_true_func(X_boundary_left)
X_boundary_right = np.array([11.0])
y_boundary_right = y_true_func(X_boundary_right)

# Combine data with boundary points for fitting (identical for every degree)
X_fit = np.concatenate([X_boundary_left, X_data, X_boundary_right])
y_fit = np.concatenate([y_boundary_left, y_data, y_boundary_right])

# Different polynomial degrees for each case
configs = [
    (1, 'Underfitting (High Bias)', '#C62828'),
    (3, 'Good Fit', '#2E7D32'),
    (8, 'Slight Overfitting', '#F57C00'),
    (14, 'Severe Overfitting (High Variance)', '#7B1FA2'),
]

annotation_box = dict(boxstyle='round', facecolor='white', alpha=0.8)

for ax, (degree, title, color) in zip(axes.flat, configs):
    # Fit polynomial; the degree is capped so the fit never has more
    # coefficients than there are points
    coeffs = np.polyfit(X_fit, y_fit, min(degree, len(X_fit) - 1))
    poly = np.poly1d(coeffs)

    X_plot = np.linspace(0, 10, 500)
    y_pred = poly(X_plot)

    # Plot data (not boundary points)
    ax.scatter(X_data, y_data, s=50, color='black', zorder=5, alpha=0.7,
               label='Training Data')
    ax.plot(X_true, y_true, 'g--', linewidth=2, label='True Function',
            alpha=0.5)
    ax.plot(X_plot, y_pred, '-', linewidth=2.5,
            label=f'Polynomial (degree={degree})', color=color)

    # Add error shading between prediction and true function
    y_true_plot = y_true_func(X_plot)
    ax.fill_between(X_plot, y_pred, y_true_plot,
                    where=(np.abs(y_pred - y_true_plot) < 10),  # Clip extreme values
                    alpha=0.15, color='blue', linewidth=0)

    # For overfitting cases, add vertical lines at data points to show fitting
    if degree >= 8:
        for x_i, y_i in zip(X_data, y_data):
            ax.plot([x_i, x_i], [y_i, poly(x_i)], ':', color=color,
                    alpha=0.5, linewidth=1)

    ax.set_xlabel('Input', fontsize=11)
    ax.set_ylabel('Output', fontsize=11)
    ax.set_title(title, fontweight='bold', fontsize=12)
    ax.legend(loc='upper left', fontsize=9)
    ax.grid(True, alpha=0.3)
    ax.set_xlim(0, 10)
    ax.set_ylim(-2, 12)

    # Add text annotations for key insights
    if 'Underfitting' in title:
        # Kept inside the (-2, 12) y-range: the previous y=-3 placed the box
        # below the axes, on top of the x-axis label.
        ax.text(5, -1, 'Cannot capture\nunderlying pattern', ha='center',
                va='center', fontsize=9, bbox=annotation_box)
    elif 'Severe' in title:
        ax.text(5, 8, 'Wild oscillations\nbetween data points', ha='center',
                fontsize=9, bbox=annotation_box)

plt.suptitle('Polynomial Fitting: From Underfitting to Overfitting',
             fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()
Increasing complexity: bias decreases, variance increases
Learning = Function Approximation
Representation Matters
Generalization is the Goal
Start Simple
Iterate and Validate
EE 541 Progression
"Data is the new oil"
But like oil, it must be refined to have value
Note: Specific percentages vary by application, but quality improvement consistently outperforms quantity alone.
#| echo: false
"""
Performance curves showing how high-quality data reaches target accuracy with
far fewer samples than low-quality data, illustrating that data quality
dominates at small scales while quantity helps at larger scales.
"""
#| fig-align: center
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.patches as mpatches

fig, ax = plt.subplots(figsize=(10, 8))

# Data showing impact of data quality and quantity on model performance
data_sizes = np.logspace(2, 6, 50)  # 100 to 1M samples

# Saturating-exponential curves, one per data quality:
# (ceiling in %, sample scale of the exponential, line format, legend label)
quality_curves = [
    (95, 10000, 'g-', 'High Quality Data'),
    (80, 30000, 'b-', 'Medium Quality Data'),
    (60, 100000, 'r-', 'Low Quality Data'),
]
for ceiling, scale, line_format, legend_label in quality_curves:
    performance = ceiling * (1 - np.exp(-data_sizes/scale))
    ax.semilogx(data_sizes, performance, line_format, linewidth=3,
                label=legend_label)

# Add annotations
ax.axhline(y=90, color='gray', linestyle='--', alpha=0.5)
ax.text(1000, 92, 'Target Performance', fontsize=10, color='gray')

# Highlight key insight: shaded regimes
# (x-extent, colour, caption x position, caption)
regimes = [
    ([100, 10000], 'green', 1000, 'Quality\nDominates'),
    ([10000, 1000000], 'blue', 100000, 'Quantity\nHelps'),
]
for extent, shade, caption_x, caption in regimes:
    ax.fill_between(extent, 0, 100, alpha=0.1, color=shade)
    ax.text(caption_x, 50, caption, fontsize=12, fontweight='bold',
            ha='center', color=shade)

ax.set_xlabel('Dataset Size (number of samples)', fontsize=12)
ax.set_ylabel('Model Performance (%)', fontsize=12)
ax.set_title('The Data Quality vs Quantity Trade-off', fontsize=14,
             fontweight='bold')
ax.grid(True, alpha=0.3)
ax.legend(loc='lower right', fontsize=11)
ax.set_ylim(0, 100)

plt.tight_layout()
plt.show()
#| echo: false
"""
Concentric circle data shown in three coordinate systems: original Cartesian
(linearly inseparable), polar (separable by radius), and polynomial features
(linearly separable).
"""
#| fig-align: center
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_circles
from sklearn.preprocessing import PolynomialFeatures

np.random.seed(42)
X, y = make_circles(n_samples=200, noise=0.1, factor=0.5)

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Class label -> marker colour, drawn in this order in every panel
class_colours = [(0, '#1976D2'), (1, '#C62828')]


def _scatter_by_class(ax, horizontal, vertical):
    """Scatter the two classes of `y` using the given coordinate arrays."""
    for class_id, colour in class_colours:
        members = y == class_id
        ax.scatter(horizontal[members], vertical[members], c=colour,
                   alpha=0.6, s=30)


# Original space
ax = axes[0]
_scatter_by_class(ax, X[:, 0], X[:, 1])
ax.set_title('Original Space\n(Linearly Inseparable)', fontweight='bold')
ax.set_xlabel('$x_1$')
ax.set_ylabel('$x_2$')
ax.grid(True, alpha=0.3)
ax.axhline(y=0, color='#2E7D32', linestyle='--', linewidth=2, alpha=0.5)
ax.axvline(x=0, color='#2E7D32', linestyle='--', linewidth=2, alpha=0.5)

# Polar coordinates
ax = axes[1]
r = np.sqrt(X[:, 0]**2 + X[:, 1]**2)
theta = np.arctan2(X[:, 1], X[:, 0])
_scatter_by_class(ax, r, theta)
ax.set_title('Polar Coordinates\n(Radius Separable)', fontweight='bold')
ax.set_xlabel('Radius $r$')
ax.set_ylabel('Angle $\\theta$')
ax.grid(True, alpha=0.3)
ax.axvline(x=0.7, color='#2E7D32', linestyle='--', linewidth=2, alpha=0.7,
           label='Decision boundary')
ax.legend()

# Polynomial features
ax = axes[2]
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)
# Columns 2 and 4 are plotted as the squared terms (see the axis labels)
_scatter_by_class(ax, X_poly[:, 2], X_poly[:, 4])
ax.set_title('Polynomial Features\n(Linearly Separable)', fontweight='bold')
ax.set_xlabel('$x_1^2$')
ax.set_ylabel('$x_2^2$')
ax.grid(True, alpha=0.3)
x_line = np.linspace(-0.2, 1.5, 100)
ax.plot(x_line, 0.5 - x_line, 'g--', linewidth=2, alpha=0.7)

plt.suptitle('Same Data, Different Representations', fontsize=16,
             fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()
As \(d \to \infty\):
#| echo: true
#| code-fold: true
import numpy as np


def volume_ratio(d, epsilon=0.95):
    """Fraction of hypercube volume in outer shell.

    Args:
        d: Number of dimensions.
        epsilon: Side length of the inner cube relative to the full cube;
            everything outside that inner cube counts as the shell.

    Returns:
        ``1 - epsilon**d``, the share of the volume lying in the shell.
    """
    return 1 - epsilon**d


dimensions = [1, 2, 3, 10, 100, 1000]
for d in dimensions:
    ratio = volume_ratio(d)
    print(f"d={d:4}: {ratio:.6f} in outer shell")
#| echo: false
"""
Hypersphere volume shrinking relative to enclosing hypercube as dimensions
increase from 2D to 100D, illustrating the curse of dimensionality.
"""
#| fig-align: center
import matplotlib.pyplot as plt
from matplotlib.patches import Circle, Rectangle
import numpy as np

# Closed outline of the square [-1, 1] x [-1, 1]
outline_x = [-1, 1, 1, -1, -1]
outline_y = [-1, -1, 1, 1, -1]


def _finish_panel(ax, heading):
    """Apply the view limits, aspect, title and grid shared by all panels."""
    ax.set_xlim(-1.5, 1.5)
    ax.set_ylim(-1.5, 1.5)
    ax.set_aspect('equal')
    ax.set_title(heading, fontsize=11)
    ax.grid(True, alpha=0.3)


# Hypersphere visualization
fig, axes = plt.subplots(1, 3, figsize=(10, 3.5))
panel_2d, panel_10d, panel_100d = axes

# 2D: circle outline inscribed in the square outline
panel_2d.add_patch(Circle((0, 0), 1, fill=False, edgecolor='#8B0000',
                          linewidth=2))
panel_2d.add_patch(Rectangle((-1, -1), 2, 2, fill=False,
                             edgecolor='#1976D2', linewidth=2))
_finish_panel(panel_2d, '2D: Circle fills square')

# 10D representation: shaded disc inside the square, annotated with the ratio
chord_x = np.linspace(-1, 1, 100)
half_chord = np.sqrt(1 - chord_x**2)
panel_10d.fill_between(chord_x, -half_chord, half_chord, alpha=0.3,
                       color='#8B0000')
panel_10d.plot(outline_x, outline_y, 'b-', linewidth=2)
panel_10d.text(0, 0, 'Vol = 0.25%', ha='center', fontsize=10,
               fontweight='bold')
_finish_panel(panel_10d, '10D: Sphere vanishes')

# 100D representation: the sphere is reduced to a single small dot
panel_100d.plot(outline_x, outline_y, 'b-', linewidth=2)
panel_100d.scatter([0], [0], s=5, color='#8B0000')
panel_100d.text(0, -0.2, 'Vol ≈ 0', ha='center', fontsize=10,
                fontweight='bold')
_finish_panel(panel_100d, '100D: Sphere invisible')

plt.suptitle('Hypersphere Volume in Unit Hypercube', fontweight='bold',
             y=1.05)
plt.tight_layout()
plt.show()
#| echo: false
"""
Volume concentration in a thin outer shell and pairwise distance concentration
as dimensionality increases, illustrating the curse of dimensionality.
"""
#| fig-align: center
# Relies on `volume_ratio`, `np` and `plt` defined by earlier cells.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 10))

# Volume concentration
d_range = np.arange(1, 101)
volume_ratios = [volume_ratio(d, 0.95) for d in d_range]
ax1.plot(d_range, volume_ratios, linewidth=2, color='#8B0000')
ax1.set_xlabel('Dimensions')
ax1.set_ylabel('Fraction in Outer 5% Shell')
ax1.set_title('Volume Concentration in High Dimensions', fontweight='bold')
ax1.grid(True, alpha=0.3)
ax1.axhline(y=0.99, color='gray', linestyle='--', alpha=0.7)
ax1.text(50, 0.92, '99% of volume', fontsize=10)

# Distance concentration
np.random.seed(42)
dims = [2, 10, 50, 100, 500]
distances = []
for d in dims:
    X = np.random.randn(100, d)
    # All pairwise Euclidean distances via broadcasting
    dist_matrix = np.sqrt(((X[:, None] - X[None, :])**2).sum(-1))
    # Keep each unordered pair once (strict upper triangle)
    distances.append(dist_matrix[np.triu_indices_from(dist_matrix, k=1)])

# boxplot's `labels` keyword is deprecated since Matplotlib 3.9 (renamed
# `tick_labels`); setting the tick labels afterwards works on every version.
ax2.boxplot(distances)
ax2.set_xticklabels([f'd={d}' for d in dims])
ax2.set_ylabel('Pairwise Distances')
ax2.set_title('Distance Concentration Effect', fontweight='bold')
ax2.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()
#| echo: false
"""
Learning curves showing how label noise creates accuracy ceilings that more
data cannot overcome, alongside CIFAR-10 bar chart demonstrating the
super-linear impact of noise on model performance.
"""
#| fig-align: center
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

np.random.seed(42)

# Learning curves with different noise levels
ax = axes[0]
sizes = np.logspace(3, 5, 50).astype(int)  # 1K to 100K samples

# (label-noise fraction, line colour, line style)
noise_settings = [
    (0, '#2E7D32', '-'),
    (0.05, '#1976D2', '--'),
    (0.15, '#F57C00', '-.'),
    (0.30, '#C62828', ':'),
]
for noise_level, color, style in noise_settings:
    # Accuracy model: saturating exponential whose ceiling drops with noise
    # and whose sample scale grows with noise
    max_acc = 1.0 - 0.8 * noise_level
    sample_scale = 10000 * (1 + 2*noise_level)
    accuracy = max_acc * (1 - np.exp(-sizes / sample_scale))
    ax.semilogx(sizes, accuracy, style, color=color, linewidth=2.5,
                label=f'{int(noise_level*100)}% label noise')

# Mark key insight points
ax.axhline(y=0.9, color='gray', linestyle='--', alpha=0.3)
ax.text(1100, 0.91, '90% accuracy threshold', fontsize=9, color='gray')

# Annotations showing data requirements.
# NOTE(review): these positions are hand-placed. The 0%-noise curve above
# evaluates to roughly 0.26 at 3,000 samples, so the "3K clean samples"
# marker does not sit on the plotted curve - confirm the intended numbers.
# (arrow x, arrow top, arrow bottom, colour, text x, text y, text)
callouts = [
    (3000, 0.9, 0.78, '#2E7D32', 4000, 0.84, '3K clean\nsamples'),
    (50000, 0.85, 0.68, '#C62828', 60000, 0.76,
     '50K noisy\nsamples\n(never reaches\n90%)'),
]
for arrow_x, top, bottom, colour, text_x, text_y, message in callouts:
    ax.annotate('', xy=(arrow_x, top), xytext=(arrow_x, bottom),
                arrowprops=dict(arrowstyle='<->', color=colour, lw=1.5))
    ax.text(text_x, text_y, message, fontsize=9, color=colour)

ax.set_xlabel('Dataset Size (log scale)', fontsize=12)
ax.set_ylabel('Test Accuracy', fontsize=12)
ax.set_title('Impact of Label Noise on Learning', fontsize=13,
             fontweight='bold')
ax.legend(loc='lower right', fontsize=10)
ax.grid(True, alpha=0.3)
ax.set_xlim(1000, 100000)
ax.set_ylim(0.5, 1.0)

# Concrete example: CIFAR-10 with synthetic noise
ax = axes[1]
noise_levels = [0, 0.1, 0.2, 0.3, 0.4]
accuracies = [0.94, 0.87, 0.79, 0.68, 0.55]
error_bars = [0.01, 0.02, 0.03, 0.04, 0.05]
colors_gradient = ['#2E7D32', '#66BB6A', '#FFA726', '#FF7043', '#C62828']

bar_positions = np.arange(len(noise_levels))
ax.bar(bar_positions, accuracies, yerr=error_bars, color=colors_gradient,
       alpha=0.8, capsize=5)

# Add value labels on bars (moved higher to avoid overlap)
for pos, acc, err in zip(bar_positions, accuracies, error_bars):
    ax.text(pos, acc + err + 0.04, f'{acc:.2f}', ha='center', fontsize=10,
            fontweight='bold')

ax.set_xticks(bar_positions)
ax.set_xticklabels([f'{int(n*100)}%' for n in noise_levels])
ax.set_xlabel('Label Noise Level', fontsize=12)
ax.set_ylabel('Test Accuracy (ResNet-18)', fontsize=12)
ax.set_title('CIFAR-10: Actual Impact of Label Noise', fontsize=13,
             fontweight='bold')
ax.set_ylim(0, 1.05)
ax.grid(True, alpha=0.3, axis='y')

# Add key insight box
ax.text(0.5, 0.25,
        '10% noise → 7% accuracy drop\n30% noise → 26% accuracy drop\n\nNoise impact is super-linear',
        transform=ax.transAxes, fontsize=10,
        bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8),
        ha='center')

plt.tight_layout()
plt.show()
Setup (2014-2017):
What went wrong:
The data:
Historical hires: 85% male, 15% female
Model learned: male-coded patterns = higher rating
System scrapped in 2018.
#| echo: false
"""
Stacked bar chart showing how gender bias amplifies through the ML pipeline:
from 85/15% male/female in training data to 92/8% in model predictions.
"""
#| fig-align: center
fig, ax = plt.subplots(figsize=(10, 8))

# Show the feedback loop: (stage name, male share %, female share %)
stages = [
    ('Training\nData', 85, 15),
    ('Model\nLearns', 88, 12),
    ('Predictions\nAmplify', 92, 8),
]
categories, male_pct, female_pct = (list(column) for column in zip(*stages))

x = np.arange(len(categories))
width = 0.5

# Female bars are stacked on top of the male bars
ax.bar(x, male_pct, width, label='Male candidates', color='#1976D2',
       alpha=0.7)
ax.bar(x, female_pct, width, bottom=male_pct, label='Female candidates',
       color='#C62828', alpha=0.7)

ax.set_ylabel('Percentage (%)', fontsize=12)
ax.set_title('Bias Amplification Through Training', fontweight='bold',
             fontsize=14)
ax.set_xticks(x)
ax.set_xticklabels(categories, fontsize=11)
ax.set_ylim(0, 100)
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3, axis='y')

# Percentage labels centred vertically inside each bar segment
segment_text = dict(ha='center', va='center', fontweight='bold',
                    fontsize=12, color='white')
for position, (_, male, female) in enumerate(stages):
    ax.text(position, male/2, f'{male}%', **segment_text)
    ax.text(position, male + female/2, f'{female}%', **segment_text)

ax.annotate('', xy=(2, 50), xytext=(0, 50),
            arrowprops=dict(arrowstyle='->', lw=2, color='red'))
ax.text(1, 55, 'Model amplifies existing bias', ha='center', fontsize=11,
        bbox=dict(boxstyle='round,pad=0.3', facecolor='yellow', alpha=0.4))

plt.tight_layout()
plt.show()
Why this matters:
The model did exactly what it was trained to do - replicate patterns in historical data.
The problem: historical data reflected real-world bias.
Clean data ≠ unbiased data
Training on augmented data:
where \(T_j\) are augmentation transforms
#| echo: false
"""
Data augmentation examples showing a handwritten digit transformed via
rotation, translation, and noise addition.
"""
#| fig-align: center
from sklearn.datasets import load_digits
from scipy.ndimage import rotate, shift

# Relies on `plt` and `np` imported by an earlier cell.
fig, axes = plt.subplots(2, 4, figsize=(10, 5))

digits = load_digits()
original = digits.images[0]

# Original
axes[0, 0].imshow(original, cmap='gray_r')
axes[0, 0].set_title('Original', fontsize=10)
axes[0, 0].axis('off')

# Rotations (reshape=False keeps the output the same size as the input)
for i, angle in enumerate([15, -15, 30]):
    rotated = rotate(original, angle, reshape=False)
    axes[0, i+1].imshow(rotated, cmap='gray_r')
    axes[0, i+1].set_title(f'Rotate {angle}°', fontsize=10)
    axes[0, i+1].axis('off')

# Translations
for i, (dx, dy) in enumerate([(1, 0), (0, 1), (-1, -1)]):
    shifted = shift(original, (dx, dy))
    axes[1, i].imshow(shifted, cmap='gray_r')
    axes[1, i].set_title(f'Shift ({dx},{dy})', fontsize=10)
    axes[1, i].axis('off')

# Noise: seeded like the other figures so the rendered example is
# reproducible between runs
np.random.seed(42)
noisy = original + np.random.normal(0, 2, original.shape)
axes[1, 3].imshow(noisy, cmap='gray_r')
axes[1, 3].set_title('Add Noise', fontsize=10)
axes[1, 3].axis('off')

plt.suptitle('Data Augmentation Examples', fontweight='bold')
plt.tight_layout()
plt.show()
#| echo: false
"""
Three-panel comparison of learning paradigms: supervised (input-output pairs
training a model), unsupervised (unlabeled data finding structure), and
reinforcement learning (agent-environment interaction loop).
"""
#| fig-align: center
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np


def _rounded_box(ax, corner, width, height, pad, edge, face, lw, **extra):
    """Add a rounded box to `ax`; `pad` is the boxstyle padding as text."""
    ax.add_patch(patches.FancyBboxPatch(
        corner, width, height, boxstyle=f"round,pad={pad}",
        edgecolor=edge, facecolor=face, linewidth=lw, **extra))


def _connector(ax, tail, head, colour, scale, lw, **extra):
    """Add a '->' arrow patch from `tail` to `head`."""
    ax.add_patch(patches.FancyArrowPatch(
        tail, head, mutation_scale=scale, color=colour, linewidth=lw,
        arrowstyle='->', **extra))


def _open_panel(ax):
    """Give the panel a 10x10 drawing canvas with no visible axes."""
    ax.set_xlim(0, 10)
    ax.set_ylim(0, 10)
    ax.axis('off')


def _close_panel(ax, heading, caption):
    """Add the panel title and the italic one-line caption underneath."""
    ax.set_title(heading, fontsize=13, fontweight='bold', pad=15)
    ax.text(5, 1.2, caption, ha='center', fontsize=9, style='italic',
            color='#616161')


fig, axes = plt.subplots(1, 3, figsize=(16, 5.5))

# ---------------------------------------------------------------- Supervised
ax = axes[0]
_open_panel(ax)

# Data pairs section
_rounded_box(ax, (0.5, 2), 4, 6, '0.05', '#E0E0E0', '#FAFAFA', 1.5, alpha=0.3)
ax.text(2.5, 8.3, 'Training Data', ha='center', fontsize=10, style='italic')

# Input-output pairs, stacked top to bottom
for k in (1, 2, 3):
    y_pos = 6.5 - (k - 1)*1.8
    # Input
    _rounded_box(ax, (1, y_pos), 1.2, 0.8, '0.03', '#1976D2', '#E3F2FD', 2)
    ax.text(1.6, y_pos+0.4, f'$\\mathbf{{x}}_{{{k}}}$', ha='center',
            va='center', fontsize=11)
    # Arrow
    ax.arrow(2.25, y_pos+0.4, 0.5, 0, head_width=0.12, head_length=0.08,
             fc='#424242', ec='#424242')
    # Output
    _rounded_box(ax, (2.8, y_pos), 1.2, 0.8, '0.03', '#388E3C', '#E8F5E9', 2)
    ax.text(3.4, y_pos+0.4, f'$y_{{{k}}}$', ha='center', va='center',
            fontsize=11)

# Model
_rounded_box(ax, (5.5, 3.5), 2.5, 3, '0.08', '#D32F2F', '#FFEBEE', 2.5)
ax.text(6.75, 5, 'Model\n$f(\\mathbf{x}; \\theta)$', ha='center',
        va='center', fontsize=11, fontweight='bold')

# Training arrow
_connector(ax, (4.5, 5), (5.4, 5), '#D32F2F', 25, 2.5)
ax.text(4.95, 5.4, 'Learn', fontsize=10, fontweight='bold', ha='center')

_close_panel(ax, 'Supervised Learning', 'Learns from labeled examples')

# -------------------------------------------------------------- Unsupervised
ax = axes[1]
_open_panel(ax)

# Just inputs section
_rounded_box(ax, (0.5, 2), 2.5, 6, '0.05', '#E0E0E0', '#FAFAFA', 1.5,
             alpha=0.3)
ax.text(1.75, 8.3, 'Unlabeled Data', ha='center', fontsize=10,
        style='italic')

# Just inputs
for k in (1, 2, 3, 4):
    y_pos = 7 - (k - 1)*1.3
    _rounded_box(ax, (1, y_pos), 1.5, 0.8, '0.03', '#1976D2', '#E3F2FD', 2)
    ax.text(1.75, y_pos+0.4, f'$\\mathbf{{x}}_{{{k}}}$', ha='center',
            va='center', fontsize=11)

# Model finding structure
_rounded_box(ax, (4, 3.5), 2.5, 3, '0.08', '#7B1FA2', '#F3E5F5', 2.5)
ax.text(5.25, 5, 'Discover\nPatterns', ha='center', va='center',
        fontsize=11, fontweight='bold')

# Clusters output
_rounded_box(ax, (7.5, 3), 2, 4, '0.05', '#E0E0E0', 'white', 1.5, alpha=0.8)
cluster_colours = ['#FF6F00', '#00ACC1', '#FFD600']
cluster_heights = [6, 5, 4]
for k, (color, y) in enumerate(zip(cluster_colours, cluster_heights), start=1):
    ax.add_patch(patches.Circle((8.5, y), 0.35, color=color, alpha=0.8,
                                ec='white', linewidth=1))
    ax.text(8.5, y, f'C{k}', ha='center', va='center', fontsize=9,
            fontweight='bold', color='white')

# Data -> model -> clusters
_connector(ax, (3, 5), (3.9, 5), '#7B1FA2', 25, 2.5)
_connector(ax, (6.6, 5), (7.4, 5), '#7B1FA2', 25, 2.5)

_close_panel(ax, 'Unsupervised Learning',
             'Discovers structure without labels')

# ------------------------------------------------------------- Reinforcement
ax = axes[2]
_open_panel(ax)

# Agent
ax.add_patch(patches.Circle((2.5, 5), 1, facecolor='#FF5252',
                            edgecolor='#B71C1C', linewidth=2.5))
ax.text(2.5, 5, 'Agent\n$\\pi(a|s)$', ha='center', va='center', fontsize=11,
        fontweight='bold', color='white')

# Environment
_rounded_box(ax, (5.5, 3), 3.5, 4, '0.08', '#00ACC1', '#E0F7FA', 2.5)
ax.text(7.25, 5, 'Environment', ha='center', va='center', fontsize=12,
        fontweight='bold')

# Interaction arrows: (tail, head, colour, label, label y, extra arrow options)
signals = [
    ((3.4, 5.5), (5.4, 5.5), '#1565C0', 'Action $a_t$', 5.9, {}),
    ((5.4, 4.8), (3.4, 4.8), '#2E7D32', 'State $s_t$', 4.4, {}),
    ((5.4, 4), (3.4, 4), '#F57C00', 'Reward $r_t$', 3.6,
     {'linestyle': 'dashed'}),
]
for tail, head, colour, caption, caption_y, options in signals:
    _connector(ax, tail, head, colour, 20, 2, **options)
    ax.text(4.4, caption_y, caption, fontsize=10, ha='center')

_close_panel(ax, 'Reinforcement Learning',
             'Learns through trial and feedback')

plt.tight_layout()
plt.show()
Modern methods combine paradigms: GPT-4 uses unsupervised pre-training on text, supervised fine-tuning on tasks, and reinforcement learning from human feedback (RLHF).
Given: \(\mathcal{D} = \{(\mathbf{x}_i, y_i)\}_{i=1}^N\)
Learn: \(f: \mathcal{X} \to \mathcal{Y}\)
Minimize: \(\mathcal{L}(f(\mathbf{x}), y)\)
#| echo: true
#| code-fold: false
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Synthetic binary problem: the label is the sign of a noisy linear score.
# The three random draws happen in the order features, weights, noise.
np.random.seed(42)
X = np.random.randn(1000, 10)
w_true = np.random.randn(10)
noise = np.random.randn(1000)*0.1
y = (X @ w_true + noise > 0).astype(int)

# Hold out 20% of the samples, fit on the rest, then score both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)
train_acc = accuracy_score(y_train, train_predictions)
test_acc = accuracy_score(y_test, test_predictions)

print(f"Train accuracy: {train_acc:.3f}")
print(f"Test accuracy: {test_acc:.3f}")



Given: \(\mathcal{D} = \{\mathbf{x}_i\}_{i=1}^N\)
Find: Hidden patterns, structure, representations

#| echo: false
"""
Four-panel comparison of unsupervised learning methods: raw unlabeled data, K-means clustering with centroids, PCA dimensionality reduction from 50D to 2D, and kernel density estimation.
"""
#| fig-align: center
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

fig, axes = plt.subplots(2, 2, figsize=(10, 10))

# Generate data with hidden structure
X, true_labels = make_blobs(n_samples=300, centers=3, n_features=2, random_state=42)

# Original data
ax = axes[0, 0]
ax.scatter(X[:, 0], X[:, 1], c='#616161', alpha=0.6, s=30)
ax.set_title('Raw Data (No Labels)', fontweight='bold')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.grid(True, alpha=0.3)

# K-means clustering
ax = axes[0, 1]
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(X)
ax.scatter(X[:, 0], X[:, 1], c=clusters, cmap='viridis', alpha=0.6, s=30)
ax.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
           c='#C62828', marker='*', s=300, edgecolor='black', linewidth=2)
ax.set_title('K-Means Clustering', fontweight='bold')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.grid(True, alpha=0.3)

# High-dimensional data for PCA: 50 noise dimensions, with blob structure
# written into the first two.
np.random.seed(42)
X_high = np.random.randn(200, 50)
# Add structure. Keep the labels returned by THIS make_blobs call: the
# 300-sample call above yields a different sample order, so slicing its
# labels (true_labels[:200]) would colour the points with unrelated classes.
blob_features, high_dim_labels = make_blobs(n_samples=200, centers=3, n_features=2, random_state=42)
X_high[:, :2] = blob_features

# PCA
ax = axes[1, 0]
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_high)
ax.scatter(X_pca[:, 0], X_pca[:, 1], c=high_dim_labels, cmap='coolwarm', alpha=0.6, s=30)
ax.set_title('PCA Projection (50D → 2D)', fontweight='bold')
ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} var)')
ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} var)')
ax.grid(True, alpha=0.3)

# Density estimation
ax = axes[1, 1]
from scipy.stats import gaussian_kde
kde = gaussian_kde(X.T)  # gaussian_kde expects shape (n_dims, n_samples)
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100))
positions = np.vstack([xx.ravel(), yy.ravel()])
density = kde(positions).reshape(xx.shape)
ax.contourf(xx, yy, density, levels=20, cmap='viridis', alpha=0.7)
ax.scatter(X[:, 0], X[:, 1], c='white', s=10, alpha=0.5, edgecolor='black', linewidth=0.5)
ax.set_title('Density Estimation', fontweight='bold')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.grid(True, alpha=0.3)

plt.suptitle('Unsupervised Learning Methods', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

Transform unsupervised → supervised by creating pretext tasks
#| echo: true
#| code-fold: true
# Example: Simple masked prediction
def create_masked_task(sequence, mask_prob=0.15, mask_token=0,
                       ignore_index=-1, rng=None):
    """Create a self-supervised masked-prediction task from a sequence.

    Each position is masked independently with probability ``mask_prob``.
    Masked positions are replaced by ``mask_token`` in the input, and the
    original value becomes the prediction target at that position.

    Args:
        sequence: 1-D array-like of tokens (NumPy array or plain list).
            It is never modified.
        mask_prob: Probability of masking each position.
        mask_token: Value written at masked positions ([MASK] token).
        ignore_index: Target value at positions that were not masked.
        rng: Optional ``numpy.random.Generator`` for reproducible masks.
            When ``None`` the global ``np.random`` state is used.

    Returns:
        Tuple ``(masked, labels)`` of arrays shaped like ``sequence``.
    """
    # Lists cannot be indexed with a boolean mask, so normalise to an array.
    sequence = np.asarray(sequence)
    masked = sequence.copy()
    labels = np.full_like(sequence, ignore_index)
    if rng is None:
        draws = np.random.random(len(sequence))
    else:
        draws = rng.random(len(sequence))
    mask_indices = draws < mask_prob
    masked[mask_indices] = mask_token  # [MASK] token
    labels[mask_indices] = sequence[mask_indices]
    return masked, labels


# Example sequence
sequence = np.array([1, 4, 2, 8, 3, 7, 5, 9])
masked_input, targets = create_masked_task(sequence)
print(f"Original: {sequence}")
print(f"Masked: {masked_input}")
print(f"Targets: {targets}")
Components:
Objective: Maximize expected cumulative reward
#| echo: false
"""
GridWorld environment showing an agent navigating toward a goal while avoiding obstacles, with arrows indicating the learned policy directions.
"""
#| fig-align: center
from collections import deque


def _shortest_path_policy(grid_size, goal, blocked):
    """Return {cell: arrow} pointing along a shortest obstacle-free path to goal.

    A breadth-first search from ``goal`` gives every reachable cell its step
    distance. Each cell's arrow points to a neighbour one step closer, so no
    arrow leads into an obstacle or off the grid. Cells are ``(x, y)`` tuples
    with y increasing upward, matching the plot axes.
    """
    moves = {'↑': (0, 1), '→': (1, 0), '↓': (0, -1), '←': (-1, 0)}
    distance = {goal: 0}
    frontier = deque([goal])
    while frontier:
        cx, cy = frontier.popleft()
        for dx, dy in moves.values():
            nxt = (cx + dx, cy + dy)
            inside = 0 <= nxt[0] < grid_size and 0 <= nxt[1] < grid_size
            if inside and nxt not in blocked and nxt not in distance:
                distance[nxt] = distance[(cx, cy)] + 1
                frontier.append(nxt)

    policy = {}
    for (cx, cy), steps in distance.items():
        if (cx, cy) == goal:
            continue
        for arrow, (dx, dy) in moves.items():
            if distance.get((cx + dx, cy + dy)) == steps - 1:
                policy[(cx, cy)] = arrow
                break
    return policy


# Simple GridWorld visualization
fig, ax = plt.subplots(figsize=(6, 6))

# Create grid
grid_size = 5
for i in range(grid_size + 1):
    ax.axhline(i, color='black', linewidth=1)
    ax.axvline(i, color='black', linewidth=1)

# Agent position
agent_pos = (1, 1)
circle = patches.Circle((agent_pos[0] + 0.5, agent_pos[1] + 0.5), 0.3,
                        color='#1976D2', alpha=0.8)
ax.add_patch(circle)
ax.text(agent_pos[0] + 0.5, agent_pos[1] + 0.5, 'A', ha='center', va='center',
        fontsize=14, fontweight='bold', color='white')

# Goal
goal_pos = (3, 3)
rect = patches.FancyBboxPatch((goal_pos[0], goal_pos[1]), 1, 1, boxstyle="round,pad=0.02",
                              linewidth=2, edgecolor='#FFD700', facecolor='#FFF59D', alpha=0.7)
ax.add_patch(rect)
ax.text(goal_pos[0] + 0.5, goal_pos[1] + 0.5, 'G', ha='center', va='center',
        fontsize=14, fontweight='bold')

# Obstacles
obstacles = [(2, 1), (1, 3), (3, 2)]
for obs in obstacles:
    rect = patches.Rectangle((obs[0], obs[1]), 1, 1, linewidth=1,
                             edgecolor='#C62828', facecolor='#EF5350', alpha=0.7)
    ax.add_patch(rect)
    ax.text(obs[0] + 0.5, obs[1] + 0.5, 'X', ha='center', va='center',
            fontsize=12, fontweight='bold', color='white')

# Policy arrows: best action per free cell, taken from shortest paths to the
# goal. The agent's own cell is skipped so the 'A' marker stays legible.
policy = _shortest_path_policy(grid_size, goal_pos, set(obstacles))
for (i, j), best_action in policy.items():
    if (i, j) == agent_pos:
        continue
    ax.text(i + 0.5, j + 0.5, best_action, ha='center', va='center', fontsize=10, alpha=0.5)

ax.set_xlim(0, grid_size)
ax.set_ylim(0, grid_size)
ax.set_aspect('equal')
ax.set_title('GridWorld: RL Environment', fontsize=14, fontweight='bold')
ax.set_xlabel('X')
ax.set_ylabel('Y')

# Legend
legend_elements = [
    patches.Patch(color='#1976D2', label='Agent'),
    patches.Patch(color='#FFF59D', label='Goal'),
    patches.Patch(color='#EF5350', label='Obstacle')
]
ax.legend(handles=legend_elements, loc='upper left', fontsize=10)
plt.tight_layout()
plt.show()

#| echo: false
"""
Comparison of supervised, unsupervised, and self-supervised learning paradigms on the Iris dataset, showing decision boundaries, clustering results, feature prediction, and performance metrics.
"""
#| fig-align: center
# Example: Learning to classify/cluster iris flowers
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

iris = load_iris()
X = iris.data[:, [0, 2]]  # Use only 2 features for visualization
y = iris.target

# Standardize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Top row: three models fitted on the data; bottom row: three bar charts.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# Supervised: Full labels
ax = axes[0, 0]
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
from sklearn.svm import SVC
svm = SVC(kernel='rbf', random_state=42)
svm.fit(X_train, y_train)

# Decision boundary: classify every point of a dense grid (step h) that
# covers the data range plus a margin of 1 on each side.
h = 0.02
x_min, x_max = X_scaled[:, 0].min() - 1, X_scaled[:, 0].max() + 1
y_min, y_max = X_scaled[:, 1].min() - 1, X_scaled[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = svm.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
ax.contourf(xx, yy, Z, alpha=0.3, cmap='viridis')
# Training points are drawn with the default marker, test points as triangles.
scatter = ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap='viridis', edgecolor='black', s=50)
ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap='viridis', marker='^', edgecolor='black', s=70)
ax.set_title('Supervised: SVM Classification\n(All labels available)', fontweight='bold')
ax.set_xlabel('Sepal Length (scaled)')
ax.set_ylabel('Petal Length (scaled)')

# Unsupervised: No labels
ax = axes[0, 1]
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(X_scaled)
ax.scatter(X_scaled[:, 0], X_scaled[:, 1], c=clusters, cmap='coolwarm', edgecolor='black', s=50)
ax.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
           c='#C62828', marker='*', s=300, edgecolor='black', linewidth=2)
ax.set_title('Unsupervised: K-Means Clustering\n(No labels)', fontweight='bold')
ax.set_xlabel('Sepal Length (scaled)')
ax.set_ylabel('Petal Length (scaled)')

# Self-supervised: Create pretext task
ax = axes[0, 2]
# Pretext task: predict one feature from another
X_input = X_scaled[:, 0].reshape(-1, 1)
y_target = X_scaled[:, 1]
from sklearn.neural_network import MLPRegressor
mlp = MLPRegressor(hidden_layer_sizes=(10, 10), random_state=42, max_iter=1000)
mlp.fit(X_input, y_target)
# Predictions are made on the same samples the regressor was fitted on.
y_pred = mlp.predict(X_input)
ax.scatter(X_scaled[:, 0], X_scaled[:, 1], c='#616161', alpha=0.3, s=30, label='True')
ax.scatter(X_scaled[:, 0], y_pred, c='#C62828', alpha=0.6, s=30, label='Predicted')
ax.set_title('Self-Supervised: Feature Prediction\n(Predict petal from sepal)', fontweight='bold')
ax.set_xlabel('Sepal Length (scaled)')
ax.set_ylabel('Petal Length (scaled)')
ax.legend()

# NOTE(review): the three bar charts below plot hard-coded literals. None of
# the values is computed from the models fitted above.

# Performance comparison
ax = axes[1, 0]
methods = ['Supervised\n(100% labels)', 'Semi-supervised\n(10% labels)', 'Unsupervised\n(0% labels)']
accuracies = [0.95, 0.75, 0.65]
colors = ['#2E7D32', '#F57C00', '#8B0000']
bars = ax.bar(methods, accuracies, color=colors, alpha=0.7)
ax.set_ylabel('Accuracy / ARI Score')
ax.set_title('Performance Comparison', fontweight='bold')
ax.set_ylim(0, 1)
ax.grid(True, alpha=0.3, axis='y')

# Data requirements (log-scaled y axis)
ax = axes[1, 1]
paradigms = ['Supervised', 'Self-Sup.', 'Unsuper.', 'RL']
data_needs = [1000, 10000, 100, 100000]
colors = ['#2E7D32', '#1976D2', '#F57C00', '#8B0000']
bars = ax.bar(paradigms, data_needs, color=colors, alpha=0.7)
ax.set_ylabel('Typical Data Requirements')
ax.set_yscale('log')
ax.set_title('Data Efficiency', fontweight='bold')
ax.grid(True, alpha=0.3, axis='y')
ax.tick_params(axis='x', rotation=45)

# Computational cost (log-scaled y axis)
ax = axes[1, 2]
paradigms_short = ['Sup.', 'Self-Sup.', 'Unsup.', 'RL']
compute_cost = [1, 10, 0.5, 100]  # Relative costs
colors = ['#2E7D32', '#1976D2', '#F57C00', '#8B0000']
bars = ax.bar(paradigms_short, compute_cost, color=colors, alpha=0.7)
ax.set_ylabel('Relative Compute Cost')
ax.set_title('Computational Requirements', fontweight='bold')
ax.set_yscale('log')
ax.grid(True, alpha=0.3, axis='y')

plt.suptitle('Paradigm Comparison on Iris Dataset', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()
Label efficiency on CIFAR-10 (target: 90% accuracy):
Transfer learning with self-supervised pretraining: 50× reduction in labeled data
#| echo: false
"""
Transfer learning pipeline showing progression from pre-training on a large dataset to fine-tuning a task-specific model on smaller target data.
"""
#| fig-align: center
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_xlim(0, 10)
ax.set_ylim(0, 6)
ax.axis('off')

# Pipeline stages, drawn in order. Every box is 2 wide and 1.5 tall with its
# caption centred. Tuple: (lower-left corner, edge colour, fill colour,
# caption, extra text styling).
stages = [
    ((0.5, 3), '#1976D2', '#E3F2FD', 'Large Dataset\n(ImageNet)', {}),
    ((3.5, 3), '#2E7D32', '#E8F5E9', 'Pre-trained\nModel', {'fontweight': 'bold'}),
    ((6.5, 3), '#F57C00', '#FFF3E0', 'Small Target\nDataset', {}),
    ((3.5, 0.5), '#8B0000', '#FFEBEE', 'Task-Specific\nModel', {'fontweight': 'bold'}),
]
for (left, bottom), edge, fill, caption, text_style in stages:
    ax.add_patch(patches.FancyBboxPatch((left, bottom), 2, 1.5, boxstyle="round,pad=0.05",
                                        edgecolor=edge, facecolor=fill, linewidth=2))
    ax.text(left + 1, bottom + 0.75, caption, ha='center', va='center',
            fontsize=10, **text_style)

# Straight arrows along the top row, each with a caption above it
top_row_arrows = [
    ((2.5, 3.75), (3.45, 3.75), (3, 4.15), 'Pre-train'),
    ((5.5, 3.75), (6.45, 3.75), (6, 4.15), 'Transfer'),
]
for tail, head, (label_x, label_y), caption in top_row_arrows:
    ax.add_patch(patches.FancyArrowPatch(tail, head, mutation_scale=20, color='black',
                                         linewidth=2, arrowstyle='->'))
    ax.text(label_x, label_y, caption, fontsize=9, ha='center')

# Curved arrow from target dataset to right edge of task-specific model
fine_tune_arrow = patches.FancyArrowPatch((7.5, 2.95), (5.5, 1.25),
                                          connectionstyle="arc3,rad=-.3",
                                          mutation_scale=20, color='#8B0000',
                                          linewidth=2, arrowstyle='->')
ax.add_patch(fine_tune_arrow)
ax.text(6.5, 2.0, 'Fine-tune', fontsize=9, ha='center')

ax.set_title('Transfer Learning: Pre-trained Models Adapt to New Tasks',
             fontsize=12, fontweight='bold')
plt.tight_layout()
plt.show()

Multilayer Perceptron (MLP): Fully connected feedforward network
A neural network with more than one hidden layer. Depth enables hierarchical feature learning: early layers learn simple features, deeper layers learn complex abstractions.
At neuron \(i\) in layer \(l\):
where:

A feedforward network with:
can approximate any continuous function on a compact subset of \(\mathbb{R}^n\) to arbitrary accuracy
Critical word: CAN
The theorem guarantees such networks exist. Finding them through training is different.
#| echo: false
"""
Function approximation showing how networks with increasing hidden layer width (5, 10, 20 neurons) converge toward a target function, illustrating the universal approximation theorem.
"""
#| fig-align: center
x = np.linspace(-2, 2, 100)
y_true = np.sin(2*x) + 0.5*np.cos(4*x)

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(x, y_true, 'k-', linewidth=2, label='Target')

# Approximations with different widths. Each curve is the target plus
# Gaussian noise whose spread shrinks as the width grows. The RNG is re-seeded
# per curve, so all three share one noise pattern at different scales.
curve_specs = [(5, 0.3, 'blue'), (10, 0.5, 'green'), (20, 0.7, 'red')]
for width, alpha, color in curve_specs:
    np.random.seed(42)
    noise_std = 0.1*(25-width)/20
    y_approx = y_true + np.random.normal(0, noise_std, len(x))
    ax.plot(x, y_approx, color=color, alpha=alpha, linewidth=1.5,
            label=f'{width} neurons')

ax.set(xlabel='Input', ylabel='Output')
ax.set_title('Function Approximation', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
Universal approximation guarantees this works, but:
Why depth matters: Practical networks need efficient representations
#| echo: false
"""
Parameter efficiency comparison showing exponential growth for wide shallow networks versus polynomial growth for deep networks, with visual diagram contrasting the two architectures.
"""
#| fig-align: center
fig, axes = plt.subplots(2, 1, figsize=(8, 10))

# Top panel: parameter count against problem size on a log y axis
ax = axes[0]
n_bits = np.arange(4, 13)
width_params = 2**(n_bits - 1) * n_bits  # Exponential in width
depth_params = n_bits**2 * 4  # Polynomial in depth
for counts, fmt, name in ((width_params, 'r-o', 'Width-only'),
                          (depth_params, 'b-s', 'Deep')):
    ax.semilogy(n_bits, counts, fmt, linewidth=2, label=name, markersize=8)
ax.set(xlabel='Problem Size (bits)', ylabel='Parameters Required')
ax.set_title('Width vs Depth: Parameter Efficiency', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

# Bottom panel: schematic of the two architectures on a hidden 10x8 canvas
ax = axes[1]
ax.set(xlim=(0, 10), ylim=(0, 8))
ax.axis('off')

# Wide shallow network: one horizontal row of eight neurons
y_pos = 6
for i in range(8):
    x = 1 + i * 0.3
    ax.add_patch(plt.Circle((x, y_pos), 0.12, color='red', alpha=0.6))
ax.text(1.9, y_pos + 0.8, 'Wide & Shallow\n(Exponential neurons)',
        ha='center', fontsize=10, fontweight='bold')

# Deep narrow network: one vertical column of six neurons
x_pos = 7
for i in range(6):
    y = 1.5 + i * 0.8
    ax.add_patch(plt.Circle((x_pos, y), 0.15, color='blue', alpha=0.6))
ax.text(x_pos, 0.5, 'Deep & Narrow\n(Polynomial neurons)',
        ha='center', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.show()
#| echo: true
#| code-fold: false
#| code-line-numbers: true
class Layer:
    """One fully connected layer with a forward and a backward pass.

    The class defines no constructor. ``W``, ``b``, ``activation`` and
    ``activation_derivative`` must already be set on the instance before
    ``forward`` or ``backward`` is called.
    """

    def forward(self, x):
        """Compute ``activation(x @ W + b)`` and cache what backward needs."""
        self.x = x  # kept for the weight gradient
        self.z = np.dot(x, self.W) + self.b  # pre-activation
        self.a = self.activation(self.z)
        return self.a

    def backward(self, grad_output):
        """Store ``grad_W`` / ``grad_b`` and return the gradient w.r.t. the input.

        ``grad_output`` is the loss gradient with respect to this layer's
        output ``a``.
        """
        # Chain rule through the activation: dL/dz = dL/da * act'(z)
        local_slope = self.activation_derivative(self.z)
        delta = grad_output * local_slope

        # Parameter gradients; the bias gradient sums over axis 0
        self.grad_W = np.dot(self.x.T, delta)
        self.grad_b = np.sum(delta, axis=0)

        # Gradient handed back to the previous layer
        return np.dot(delta, self.W.T)
#| echo: false
"""
Computational graph showing forward pass operations (input → weighted sum → bias → activation → loss) and backward pass gradient flow with partial derivatives at each node.
"""
#| fig-align: center
import matplotlib.patches as patches
import matplotlib.pyplot as plt
import numpy as np

fig, ax = plt.subplots(figsize=(10, 7))

# Forward pass computation nodes: (x, y, label, fill colour, node type).
# The node type only selects the shape drawn below.
nodes_forward = [
    (1, 4, '$\\mathbf{x}$', '#E3F2FD', 'input'),
    (2.5, 4, '$\\mathbf{W}\\mathbf{x}$', '#FFF9C4', 'hidden'),
    (4, 4, '$+\\mathbf{b}$', '#FFF9C4', 'hidden'),
    (5.5, 4, '$\\sigma(\\cdot)$', '#FFE0B2', 'activation'),
    (7, 4, '$\\mathbf{a}$', '#FFEBEE', 'output'),
    (8.5, 4, '$\\mathcal{L}$', '#FFCDD2', 'loss')
]

# Draw forward pass nodes: circles for values (input/output/loss),
# rounded boxes for operations.
for x, y, label, color, node_type in nodes_forward:
    if node_type in ['input', 'output', 'loss']:
        circle = plt.Circle((x, y), 0.35, color=color, ec='black', linewidth=2)
        ax.add_patch(circle)
    else:
        rect = patches.FancyBboxPatch((x-0.35, y-0.35), 0.7, 0.7, boxstyle="round,pad=0.05",
                                      facecolor=color, edgecolor='black', linewidth=2)
        ax.add_patch(rect)
    ax.text(x, y, label, ha='center', va='center', fontsize=11, fontweight='bold')

# Forward pass arrows - adjusted to not overlap with nodes
# Each entry is (start x, start y, dx, dy).
forward_connections = [
    (1.35, 4, 0.65, 0),  # x to Wx
    (2.85, 4, 0.65, 0),  # Wx to +b
    (4.35, 4, 0.65, 0),  # +b to σ
    (5.85, 4, 0.65, 0),  # σ to a
    (7.35, 4, 0.65, 0)   # a to L
]
for x, y, dx, dy in forward_connections:
    ax.arrow(x, y, dx, dy, head_width=0.12, head_length=0.08,
             fc='#1976D2', ec='#1976D2', linewidth=2.5)

# Backward pass gradient flow: one gradient box under each forward node,
# listed right to left (from the loss back to the input).
backward_y = 2.5
nodes_backward = [
    (8.5, backward_y, '$\\frac{\\partial \\mathcal{L}}{\\partial \\mathcal{L}}=1$', '#FFCDD2'),
    (7, backward_y, '$\\frac{\\partial \\mathcal{L}}{\\partial \\mathbf{a}}$', '#FFE0B2'),
    (5.5, backward_y, '$\\frac{\\partial \\mathcal{L}}{\\partial \\mathbf{z}}$', '#FFE0B2'),
    (4, backward_y, '$\\frac{\\partial \\mathcal{L}}{\\partial \\mathbf{b}}$', '#E1F5FE'),
    (2.5, backward_y, '$\\frac{\\partial \\mathcal{L}}{\\partial \\mathbf{W}}$', '#E1F5FE'),
    (1, backward_y, '$\\frac{\\partial \\mathcal{L}}{\\partial \\mathbf{x}}$', '#E3F2FD')
]

# Draw backward pass nodes with larger text
for x, y, label, color in nodes_backward:
    rect = patches.FancyBboxPatch((x-0.55, y-0.3), 1.1, 0.6, boxstyle="round,pad=0.02",
                                  facecolor=color, edgecolor='#C62828', linewidth=1.5,
                                  linestyle='--', alpha=0.8)
    ax.add_patch(rect)
    ax.text(x, y, label, ha='center', va='center', fontsize=10, fontweight='bold')

# Backward pass arrows - adjusted to not overlap
backward_connections = [
    (8.05, backward_y, -0.5, 0),  # ∂L/∂L to ∂L/∂a
    (6.55, backward_y, -0.5, 0),  # ∂L/∂a to ∂L/∂z
    (5.05, backward_y, -0.5, 0),  # ∂L/∂z to ∂L/∂b
    (3.55, backward_y, -0.5, 0),  # ∂L/∂b to ∂L/∂W
    (2.05, backward_y, -0.5, 0)   # ∂L/∂W to ∂L/∂x
]
for x, y, dx, dy in backward_connections:
    ax.arrow(x, y, dx, dy, head_width=0.1, head_length=0.06,
             fc='#C62828', ec='#C62828', linewidth=2, linestyle='--', alpha=0.7)

# Vertical connections showing gradient computation
# (dotted lines joining a forward node to the gradient box beneath it)
for x_pos in [2.5, 4, 5.5, 7]:
    ax.plot([x_pos, x_pos], [3.65, 2.75], 'k:', linewidth=1, alpha=0.5)

# Labels
ax.text(4.75, 5, 'Forward Pass', fontsize=13, fontweight='bold', color='#1976D2')
ax.text(4.75, 1.5, 'Backward Pass (Gradients)', fontsize=13, fontweight='bold', color='#C62828')

# Annotations
ax.text(2.5, 3.3, 'Store $\\mathbf{x}$', ha='center', fontsize=8, style='italic', color='#666')
ax.text(5.5, 3.3, 'Chain rule', ha='center', fontsize=8, style='italic', color='#666')

ax.set_xlim(0, 9.5)
ax.set_ylim(1, 5.5)
ax.axis('off')
plt.tight_layout()
plt.show()
#| echo: false
"""
Parameter scaling with network width and depth (left) alongside function complexity achievable at different depths (right).
"""
#| fig-align: center
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: width * width * depth plotted against width, one line per depth
ax = axes[0]
widths = [10, 20, 50, 100, 200]
depths = [2, 3, 4, 5, 6]
colors = ['blue', 'green', 'orange', 'red', 'purple']
for idx, d in enumerate(depths):
    params = [w * w * d for w in widths]
    ax.plot(widths, params, 'o-', label=f'Depth {d}', color=colors[idx], linewidth=2)
ax.set(xlabel='Width (neurons per layer)', ylabel='Total Parameters')
ax.set_title('Parameter Count', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)
ax.set_yscale('log')

# Right panel: progressively wigglier curves, labelled by depth
ax = axes[1]
x = np.linspace(-2, 2, 100)
functions = [
    (lambda x: x, 'Linear', 'blue'),
    (lambda x: np.maximum(0, x), 'Shallow (1)', 'green'),
    (lambda x: np.sin(2*x), 'Medium (3)', 'orange'),
    (lambda x: np.sin(2*x) + 0.5*np.cos(4*x), 'Deep (5)', 'red')
]
for func, label, color in functions:
    y = func(x)
    # Only the curve labelled "Deep" gets the extra high-frequency ripple.
    if label.startswith('Deep'):
        y = y + 0.1*np.sin(10*x)
    ax.plot(x, y, label=label, color=color, linewidth=2, alpha=0.7)
ax.set(xlabel='Input', ylabel='Output')
ax.set_title('Function Complexity vs Depth', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
#| echo: false
"""
Comparison of loss surface complexity across model types: convex bowl for linear models, wavy terrain with local minima for shallow networks, and highly complex landscape for deep networks.
"""
#| fig-align: center
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.patches as patches

fig = plt.figure(figsize=(15, 5))

# All three surfaces share one 50x50 grid over [-3, 3] x [-3, 3].
x = np.linspace(-3, 3, 50)
y = np.linspace(-3, 3, 50)
X, Y = np.meshgrid(x, y)

# Simple convex loss
Z = 0.5 * (X**2 + 2*Y**2)
# Non-convex with local minima
Z2 = np.sin(2*X) * np.cos(2*Y) + 0.1*(X**2 + Y**2)
# High-dimensional projection
Z3 = np.sin(3*X) * np.cos(3*Y) * np.exp(-0.1*(X**2 + Y**2)) + 0.5*np.sin(5*X) + 0.3*np.cos(7*Y)

# (subplot position, surface heights, colormap, panel title), left to right
panels = [
    (131, Z, 'viridis', 'Convex (Linear Model)'),
    (132, Z2, 'coolwarm', 'Non-Convex (Shallow Network)'),
    (133, Z3, 'plasma', 'Complex (Deep Network)'),
]
for position, heights, cmap_name, panel_title in panels:
    panel = fig.add_subplot(position, projection='3d')
    panel.plot_surface(X, Y, heights, cmap=cmap_name, alpha=0.8, edgecolor='none')
    panel.set_xlabel('$w_1$')
    panel.set_ylabel('$w_2$')
    panel.set_zlabel('Loss')
    panel.set_title(panel_title, fontweight='bold')

plt.suptitle('Loss Landscape Complexity', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()
In \(d\) dimensions with \(n\) parameters:
#| echo: true
#| code-fold: true
def sgd(w, grad, lr=0.01):
    """Return the parameters after one plain gradient-descent step.

    Args:
        w: Current parameters (scalar or array).
        grad: Gradient of the loss at ``w``.
        lr: Learning rate.
    """
    return w - lr * grad


def sgd_momentum(w, grad, velocity, lr=0.01, beta=0.9):
    """Take one gradient step with momentum.

    Args:
        w: Current parameters.
        grad: Gradient of the loss at ``w``.
        velocity: Running velocity from the previous step (zeros at start).
        lr: Learning rate applied to the incoming gradient.
        beta: Fraction of the previous velocity that is kept.

    Returns:
        Tuple ``(new_w, new_velocity)``.
    """
    velocity = beta * velocity + lr * grad
    return w - velocity, velocity


def adam(w, grad, m, v, t, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    """Take one Adam step.

    Args:
        w: Current parameters.
        grad: Gradient of the loss at ``w``.
        m: Running first-moment estimate (zeros at start).
        v: Running second-moment estimate (zeros at start).
        t: 1-based step counter used for bias correction.
        lr: Learning rate.
        beta1: Decay rate of the first-moment average.
        beta2: Decay rate of the second-moment average.
        eps: Small constant added to the denominator.

    Returns:
        Tuple ``(new_w, new_m, new_v)``.

    Raises:
        ValueError: If ``t < 1``. At ``t == 0`` the bias-correction
            denominators ``1 - beta**t`` are exactly zero.
    """
    if t < 1:
        raise ValueError(f"t must be a step count >= 1, got {t}")
    m = beta1 * m + (1 - beta1) * grad
    v = beta2 * v + (1 - beta2) * grad**2
    # Undo the bias toward zero caused by initialising m and v at zero.
    m_hat = m / (1 - beta1**t)
    v_hat = v / (1 - beta2**t)
    return w - lr * m_hat / (np.sqrt(v_hat) + eps), m, v
#| echo: false
"""
Comparison of SGD, Momentum, and Adam optimizer trajectories on the Rosenbrock function, showing how each algorithm navigates toward the global minimum.
"""
#| fig-align: center
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Generate optimization paths
np.random.seed(42)
n_steps = 1000


def rosenbrock(x, y):
    return (1 - x)**2 + 100 * (y - x**2)**2


def _clipped_rosenbrock_grad(w):
    """Gradient of ``rosenbrock`` at ``w``, clipped per component to [-10, 10]."""
    grad_x = -2*(1 - w[0]) - 400*w[0]*(w[1] - w[0]**2)
    grad_y = 200*(w[1] - w[0]**2)
    # Clip gradients to prevent overflow
    return np.clip(np.array([grad_x, grad_y]), -10, 10)


def _finish_panel(panel, path, fmt, label, title):
    """Draw one trajectory, mark the optimum at (1, 1) and label the panel."""
    panel.plot(path[:, 0], path[:, 1], fmt, linewidth=2, markersize=3, label=label)
    panel.plot(1, 1, 'g*', markersize=15, label='Optimum')
    panel.set_title(title, fontweight='bold')
    panel.set_xlabel('$w_1$')
    panel.set_ylabel('$w_2$')
    panel.legend()


x_range = np.linspace(-1.5, 1.5, 100)
y_range = np.linspace(-0.5, 1.5, 100)
X_grid, Y_grid = np.meshgrid(x_range, y_range)
Z_grid = rosenbrock(X_grid, Y_grid)

# SGD path
ax = axes[0]
contour = ax.contour(X_grid, Y_grid, Z_grid, levels=20, cmap='viridis', alpha=0.6)
w_sgd = np.array([-0.5, 0.5])
path_sgd = [w_sgd.copy()]
lr = 0.0004
for _ in range(n_steps):
    w_sgd = sgd(w_sgd, _clipped_rosenbrock_grad(w_sgd), lr=lr)
    path_sgd.append(w_sgd.copy())
path_sgd = np.array(path_sgd)
_finish_panel(ax, path_sgd, 'r.-', 'SGD', 'Vanilla SGD')

# Momentum path
ax = axes[1]
ax.contour(X_grid, Y_grid, Z_grid, levels=20, cmap='viridis', alpha=0.6)
w_mom = np.array([-0.5, 0.5])
velocity = np.zeros(2)
path_mom = [w_mom.copy()]
for _ in range(n_steps):
    w_mom, velocity = sgd_momentum(w_mom, _clipped_rosenbrock_grad(w_mom), velocity,
                                   lr=0.0002, beta=0.9)
    path_mom.append(w_mom.copy())
path_mom = np.array(path_mom)
_finish_panel(ax, path_mom, 'b.-', 'Momentum', 'SGD with Momentum')

# Adam path (t starts at 1 for the bias correction)
ax = axes[2]
ax.contour(X_grid, Y_grid, Z_grid, levels=20, cmap='viridis', alpha=0.6)
w_adam = np.array([-0.5, 0.5])
m = np.zeros(2)
v = np.zeros(2)
path_adam = [w_adam.copy()]
for t in range(1, n_steps + 1):
    w_adam, m, v = adam(w_adam, _clipped_rosenbrock_grad(w_adam), m, v, t, lr=0.005)
    path_adam.append(w_adam.copy())
path_adam = np.array(path_adam)
_finish_panel(ax, path_adam, 'm.-', 'Adam', 'Adam Optimizer')

plt.suptitle('Optimizer Comparison on Rosenbrock Function', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()
"Dense networks contain sparse subnetworks that can train to comparable accuracy from the same initialization"
Why this matters:
Storage:
Speed:
Training:
#| echo: false
"""
Comparison of random pruning versus lottery ticket pruning accuracy across sparsity levels, with side-by-side visualization of dense versus sparse winning ticket network structures.
"""
#| fig-align: center
fig, axes = plt.subplots(2, 1, figsize=(8, 10))

# Pruning vs accuracy (hard-coded illustrative curves)
ax = axes[0]
sparsity = np.array([0, 50, 80, 90, 95, 98, 99, 99.5])
accuracy = np.array([95, 94.8, 94.5, 94, 92, 85, 70, 50])
winning_ticket = np.array([95, 95, 94.9, 94.5, 93.5, 91, 82, 65])
ax.plot(sparsity, accuracy, 'b-o', linewidth=2, label='Random Pruning')
ax.plot(sparsity, winning_ticket, 'r-s', linewidth=2, label='Lottery Ticket')
ax.axhline(y=90, color='gray', linestyle='--', alpha=0.5)
ax.set_xlabel('Sparsity (%)')
ax.set_ylabel('Test Accuracy (%)')
ax.set_title('Pruning Performance', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

# Network visualization
ax = axes[1]
# Right limit is 11, not 10: the last winning-ticket layer sits at x=10 and
# would otherwise be clipped in half by the axes boundary.
ax.set_xlim(0, 11)
ax.set_ylim(0, 10)
ax.axis('off')

layer_sizes = [3, 4, 3]


def _neuron_y(n_neurons, idx):
    """Vertical position of neuron ``idx`` in a layer centred on y=5."""
    return 5 + (idx - n_neurons/2 + 0.5) * 1.5


# Dense network - Left side
for layer, x in enumerate([1, 3, 5]):
    n_neurons = layer_sizes[layer]
    for i in range(n_neurons):
        y = _neuron_y(n_neurons, i)
        # facecolor, not color: passing color= together with edgecolor= makes
        # matplotlib override the edge colour.
        circle = patches.Circle((x, y), 0.15, facecolor='#90CAF9',
                                edgecolor='#1976D2', linewidth=1.5)
        ax.add_patch(circle)
        # Connect to next layer
        if layer < 2:
            next_n = layer_sizes[layer + 1]
            for j in range(next_n):
                next_y = _neuron_y(next_n, j)
                ax.plot([x+0.15, x+2-0.15], [y, next_y], '#9E9E9E', alpha=0.3, linewidth=0.8)
ax.text(3, 1.5, 'Dense Network\n(All connections)', ha='center', fontweight='bold', fontsize=10)

# Winning Ticket - Right side
ticket_x = [6, 8, 10]
# Surviving connections as (source layer, source neuron, target neuron).
# Highlighted neurons are derived from this list so the highlights and the
# drawn connections cannot disagree.
winning_edges = [
    (0, 0, 2),  # bottom to middle-high
    (0, 2, 2),  # top to middle-high
    (0, 2, 1),  # top to middle-low
    (1, 2, 0),  # middle-high to bottom
    (1, 2, 1),  # middle-high to middle
    (1, 1, 1),  # middle-low to middle
]
winning_neurons = ({(layer, src) for layer, src, _ in winning_edges}
                   | {(layer + 1, dst) for layer, _, dst in winning_edges})

for layer, x in enumerate(ticket_x):
    n_neurons = layer_sizes[layer]
    for i in range(n_neurons):
        y = _neuron_y(n_neurons, i)
        # Highlight winning neurons
        if (layer, i) in winning_neurons:
            circle = patches.Circle((x, y), 0.15, facecolor='#FFCDD2',
                                    edgecolor='#C62828', linewidth=2)
        else:
            circle = patches.Circle((x, y), 0.15, facecolor='#F5F5F5',
                                    edgecolor='#9E9E9E', linewidth=1)
        ax.add_patch(circle)

# Winning connections only
for layer, src, dst in winning_edges:
    x1, x2 = ticket_x[layer], ticket_x[layer + 1]
    y1 = _neuron_y(layer_sizes[layer], src)
    y2 = _neuron_y(layer_sizes[layer + 1], dst)
    ax.plot([x1+0.15, x2-0.15], [y1, y2], '#C62828', alpha=0.7, linewidth=2)
ax.text(8, 1.5, 'Winning Ticket\n(Sparse subnetwork)', ha='center', fontweight='bold', fontsize=10)

ax.set_title('Network Structure Comparison', fontweight='bold', y=0.95)
plt.tight_layout()
plt.show()
Detailed treatment: Network pruning and efficient architectures
#| echo: false
"""
Six-panel visualization of modern deep learning theory: overparameterization benefits via double descent, loss landscape smoothing with network width, implicit regularization from SGD, Neural Tangent Kernel theory, mode connectivity between minima, and the grokking phenomenon.
"""
#| fig-align: center
# All curves in this figure are synthetic, hand-shaped illustrations.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# Overparameterization
ax = axes[0, 0]
params = np.logspace(3, 7, 50)
train_error = 100 / np.sqrt(params)
test_error_classical = train_error + 20 * np.sqrt(params / 1e6)
test_error_modern = train_error + 5 / np.sqrt(params / 1e5)
ax.loglog(params, train_error, 'b-', linewidth=2, label='Train Error')
ax.loglog(params, test_error_classical, 'r--', linewidth=2, label='Classical (U-shape)')
ax.loglog(params, test_error_modern, 'g-', linewidth=2, label='Modern (Double Descent)')
ax.set_xlabel('Number of Parameters')
ax.set_ylabel('Error')
ax.set_title('Overparameterization Benefits', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

# Loss landscape smoothness
ax = axes[0, 1]
widths = [64, 128, 256, 512, 1024]
roughness = [2.5, 1.8, 1.2, 0.8, 0.5]
ax.plot(widths, roughness, 'mo-', linewidth=2, markersize=8)
ax.set_xlabel('Network Width')
ax.set_ylabel('Loss Landscape Roughness')
ax.set_title('Width Smooths Landscape', fontweight='bold')
ax.grid(True, alpha=0.3)

# Implicit regularization
ax = axes[0, 2]
epochs_range = np.arange(0, 200)
train_loss = np.exp(-epochs_range / 20)
test_loss_no_reg = np.exp(-epochs_range / 25) + 0.1 * np.sqrt(epochs_range / 100)
test_loss_implicit = np.exp(-epochs_range / 25) + 0.02
ax.semilogy(epochs_range, train_loss, 'b-', linewidth=2, label='Train')
ax.semilogy(epochs_range, test_loss_no_reg, 'r--', linewidth=2, label='Test (No Reg)')
ax.semilogy(epochs_range, test_loss_implicit, 'g-', linewidth=2, label='Test (SGD)')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Implicit Regularization', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

# Neural Tangent Kernel (text-only panel)
ax = axes[1, 0]
ax.text(0.5, 0.85, 'Neural Tangent Kernel', fontsize=12, fontweight='bold', ha='center')
ax.text(0.5, 0.65, 'Wide networks ≈ Kernel methods', fontsize=11, ha='center')
ax.text(0.5, 0.45,
        r'$f(x, \theta_t) \approx f(x, \theta_0) + \nabla_\theta f|_{\theta_0} \cdot (\theta_t - \theta_0)$',
        fontsize=11, ha='center')
ax.text(0.5, 0.25, 'Training dynamics become linear', fontsize=11, ha='center')
ax.text(0.5, 0.05, 'in infinite width limit', fontsize=11, ha='center', style='italic')
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.axis('off')

# Mode connectivity
ax = axes[1, 1]
theta = np.linspace(0, 1, 100)
minima_loss = 0.5
# Both paths must start and end on the plotted minima (loss 0.5 at alpha=0
# and alpha=1). The straight-line interpolation climbs over a loss barrier
# in between; the curved path stays close to the minima's loss throughout.
loss_direct = minima_loss + 2 * (0.25 - (theta - 0.5)**2)
loss_curve = minima_loss + 0.05 * np.sin(np.pi * theta)**2
ax.plot(theta, loss_direct, 'r--', linewidth=2, label='Direct Path')
ax.plot(theta, loss_curve, 'b-', linewidth=2, label='Curved Path')
ax.scatter([0, 1], [minima_loss, minima_loss], c='green', s=100, zorder=5, label='Minima')
ax.set_xlabel('Interpolation Parameter α')
ax.set_ylabel('Loss')
ax.set_title('Mode Connectivity', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

# Grokking phenomenon
ax = axes[1, 2]
epochs_grok = np.arange(0, 1000)
grok_epoch = 500
chance_acc = 50
train_acc = 100 * (1 - np.exp(-epochs_grok / 50))
test_acc = np.zeros_like(epochs_grok, dtype=float)
test_acc[:grok_epoch] = chance_acc
# After the grokking point, test accuracy rises from the plateau towards
# 100%. Starting the rise at the plateau value keeps the curve continuous
# (it must not fall back to 0% at the transition).
test_acc[grok_epoch:] = chance_acc + (100 - chance_acc) * (
    1 - np.exp(-(epochs_grok[grok_epoch:] - grok_epoch) / 100)
)
ax.plot(epochs_grok, train_acc, 'b-', linewidth=2, label='Train')
ax.plot(epochs_grok, test_acc, 'r-', linewidth=2, label='Test')
ax.axvline(x=grok_epoch, color='gray', linestyle='--', alpha=0.5)
ax.text(grok_epoch, 30, 'Grokking', rotation=90, fontsize=10, ha='center')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy (%)')
ax.set_title('Grokking: Delayed Generalization', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

# plt.suptitle('Modern Theoretical Insights', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()
#| echo: false
"""
Three-panel comparison showing classical U-shaped overfitting theory versus modern double descent phenomenon, plus demonstration that networks achieve 100% training accuracy even with fully random labels.
"""
#| fig-align: center
import numpy as np
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# --- Panel 1: the textbook picture, test error turns upward with capacity ---
ax = axes[0]
capacity = np.linspace(0, 10, 100)
train_error = 10 / (1 + capacity)
test_error = train_error + 0.5 * capacity
for curve, fmt, name in ((train_error, 'b-', 'Training Error'),
                         (test_error, 'r-', 'Test Error')):
    ax.plot(capacity, curve, fmt, linewidth=2, label=name)
# Shade the train/test gap over the upper half of the capacity range.
upper_half = slice(50, None)
ax.fill_between(capacity[upper_half], train_error[upper_half], test_error[upper_half],
                alpha=0.3, color='gray')
ax.text(7.5, 5, 'Overfitting', fontsize=11, fontweight='bold')
ax.set_xlabel('Model Capacity')
ax.set_ylabel('Error')
ax.set_title('Classical View', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

# --- Panel 2: synthetic curves with a jump at the interpolation threshold ---
ax = axes[1]
np.random.seed(42)
n_params = np.logspace(2, 6, 50)
n_samples = 10000
interpolation_threshold = n_samples
# Piecewise definition, selected per element by which side of the
# threshold the parameter count falls on.
under = n_params < interpolation_threshold
train_error = np.where(under, 10 * np.exp(-n_params/1000), 0.0)
test_error = np.where(under,
                      train_error + 100/np.sqrt(n_params),
                      20 / np.log(n_params/1000))
ax.semilogx(n_params, train_error, 'b-', linewidth=2, label='Train')
ax.semilogx(n_params, test_error, 'r-', linewidth=2, label='Test')
ax.axvline(x=interpolation_threshold, color='gray', linestyle='--', alpha=0.5)
ax.text(interpolation_threshold*1.5, 15, 'Interpolation\nThreshold', fontsize=10)
ax.set_xlabel('Number of Parameters')
ax.set_ylabel('Error')
ax.set_title('Modern Reality: Double Descent', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

# --- Panel 3: train accuracy stays at 100% while test accuracy falls with noise ---
ax = axes[2]
noise_levels = [0, 0.2, 0.5, 0.8, 1.0]
train_acc = [100, 100, 100, 100, 100]
test_acc = [92, 88, 75, 60, 50]
x_pos = np.arange(len(noise_levels))
width = 0.35
bars1 = ax.bar(x_pos - width/2, train_acc, width, label='Train', color='#2E7D32', alpha=0.7)
bars2 = ax.bar(x_pos + width/2, test_acc, width, label='Test', color='#B71C1C', alpha=0.7)
ax.set_xlabel('Label Noise Level')
ax.set_ylabel('Accuracy (%)')
ax.set_xticks(x_pos)
ax.set_xticklabels(['0%', '20%', '50%', '80%', '100%'])
ax.set_title('Networks Memorize Everything', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

plt.suptitle('Why Do Neural Networks Generalize At All?', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()
#| echo: true
#| code-fold: false
def dropout(x, p=0.5, training=True):
    """Apply inverted dropout to ``x``.

    During training each element is zeroed independently with probability
    ``p`` and the survivors are scaled by ``1 / (1 - p)``. With
    ``training=False`` the input is returned untouched.

    Args:
        x: NumPy array of activations.
        p: Drop probability in ``[0, 1]``.
        training: Whether dropout is active.

    Returns:
        ``x`` itself when ``training`` is false, otherwise a new array with
        the same shape as ``x``.

    Raises:
        ValueError: If ``training`` is true and ``p`` is outside ``[0, 1]``.
    """
    if not training:
        return x
    if not 0.0 <= p <= 1.0:
        raise ValueError(f"dropout probability must be in [0, 1], got {p}")
    if p == 1.0:
        # Every unit is dropped. Return zeros directly: the general formula
        # below would compute 0 / 0 and fill the output with NaN.
        return np.zeros_like(x, dtype=np.result_type(x, float))
    mask = np.random.binomial(1, 1-p, size=x.shape) / (1-p)
    return x * mask


def weight_decay(loss, weights, lambda_reg=0.01):
    """Return ``loss`` plus an L2 penalty on ``weights``.

    Args:
        loss: Base loss value.
        weights: Iterable of NumPy arrays; the penalty is the sum of their
            squared entries.
        lambda_reg: Multiplier applied to the penalty.

    Returns:
        ``loss + lambda_reg * sum of squared weights``.
    """
    l2_penalty = sum(np.sum(w**2) for w in weights)
    return loss + lambda_reg * l2_penalty
#| echo: false
"""
Four-panel visualization of regularization techniques: dropout's random neuron deactivation, weight decay's effect on weight distributions, early stopping at optimal validation loss, and comparison of test errors across combined regularization methods.
"""
#| fig-align: center
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# ---- Dropout visualization ----
ax = axes[0, 0]
np.random.seed(42)
neurons = 10
layers = 4
dropout_layer = 2
drop_prob = 0.3
y_positions = np.linspace(0, 5, neurons)

# Decide once which neurons of the dropout layer are switched off, so that the
# greyed-out neurons and the missing connections always agree with each other.
active = np.ones((layers, neurons), dtype=bool)
active[dropout_layer] = np.random.random(neurons) >= drop_prob

for layer in range(layers):
    x_pos = layer * 2
    for i, y_pos in enumerate(y_positions):
        is_active = active[layer, i]
        # facecolor (not color) so the black outline is actually drawn;
        # color= would override edgecolor and trigger a matplotlib warning.
        circle = plt.Circle((x_pos, y_pos), 0.2,
                            facecolor='lightblue' if is_active else 'lightgray',
                            alpha=1.0 if is_active else 0.3,
                            edgecolor='black')
        ax.add_patch(circle)
        if layer == 0 or not is_active:
            # First layer has no inputs; a dropped neuron receives none.
            continue
        prev_x = (layer - 1) * 2
        for j, prev_y in enumerate(y_positions):
            if not active[layer - 1, j]:
                # A dropped neuron sends nothing forward either.
                continue
            ax.plot([prev_x + 0.2, x_pos - 0.2], [prev_y, y_pos],
                    'gray', alpha=0.2, linewidth=0.5)
ax.set_xlim(-1, 7)
ax.set_ylim(-1, 6)
ax.axis('off')
ax.set_title('Dropout: Random Deactivation', fontweight='bold')
ax.text(3, -0.5, 'Forces redundant representations', ha='center', fontsize=10, style='italic')

# ---- Weight decay effect (synthetic weight samples with different spread) ----
ax = axes[0, 1]
weights_no_decay = np.random.randn(100) * 2
weights_with_decay = np.random.randn(100) * 0.5
ax.hist(weights_no_decay, bins=30, alpha=0.5, label='No Decay', color='red')
ax.hist(weights_with_decay, bins=30, alpha=0.5, label='With L2', color='blue')
ax.set_xlabel('Weight Value')
ax.set_ylabel('Count')
ax.set_title('Weight Decay: Prefer Small Weights', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

# ---- Early stopping ----
ax = axes[1, 0]
epochs = np.arange(0, 200)
train_loss = 0.5 * np.exp(-epochs / 30) + 0.01
# Validation loss follows the training decay, plus a quadratic penalty that
# only switches on from epoch 50 onwards.
overfit_penalty = 0.0001 * np.clip(epochs - 50, 0, None)**2
val_loss = 0.5 * np.exp(-epochs / 30) + 0.02 + overfit_penalty
best_epoch = np.argmin(val_loss)
ax.plot(epochs, train_loss, 'b-', linewidth=2, label='Training')
ax.plot(epochs, val_loss, 'r-', linewidth=2, label='Validation')
ax.axvline(x=best_epoch, color='green', linestyle='--', linewidth=2, alpha=0.7)
ax.scatter([best_epoch], [val_loss[best_epoch]], color='green', s=100, zorder=5)
ax.text(best_epoch + 5, 0.15, f'Stop here\n(epoch {best_epoch})', fontsize=10)
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Early Stopping: Quit While Ahead', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

# ---- Regularization comparison (illustrative numbers) ----
ax = axes[1, 1]
methods = ['None', 'L2', 'Dropout', 'L2+Dropout', 'All']
test_errors = [15, 12, 11, 9, 8]
colors = plt.cm.RdYlGn_r(np.linspace(0.3, 0.7, len(methods)))
ax.bar(methods, test_errors, color=colors)
ax.set_ylabel('Test Error (%)')
ax.set_title('Combining Regularization Methods', fontweight='bold')
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()
#| echo: false
"""
Comparison of fully connected, convolutional, and recurrent architectures showing how each embeds different inductive biases: global connections, local receptive fields, and sequential hidden state propagation.
"""
#| fig-align: center
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Every panel is a frameless 10x10 drawing canvas.
for panel in axes:
    panel.set_xlim(0, 10)
    panel.set_ylim(0, 10)
    panel.axis('off')

grid_size = 5

# ---- Fully connected: every input cell feeds the single output unit ----
ax = axes[0]
for i, j in np.ndindex(grid_size, grid_size):
    ax.add_patch(plt.Rectangle((i*1.5 + 1, j*1.5 + 1), 1, 1,
                               facecolor='lightblue', edgecolor='black'))
output_x, output_y = 9, 5
ax.add_patch(plt.Circle((output_x, output_y), 0.3, color='red', alpha=0.8))
for i, j in np.ndindex(grid_size, grid_size):
    ax.plot([i*1.5 + 1.5, output_x - 0.3], [j*1.5 + 1.5, output_y],
            'gray', alpha=0.2, linewidth=0.5)
ax.set_title('Fully Connected\nNo spatial assumption', fontweight='bold')

# ---- Convolutional: only a 3x3 window feeds the output unit ----
ax = axes[1]
for i, j in np.ndindex(grid_size, grid_size):
    ax.add_patch(plt.Rectangle((i*1.5 + 1, j*1.5 + 1), 1, 1,
                               facecolor='lightblue', edgecolor='black'))
kernel_i, kernel_j = 1, 1
for di, dj in np.ndindex(3, 3):
    ax.add_patch(plt.Rectangle(((kernel_i + di)*1.5 + 1, (kernel_j + dj)*1.5 + 1), 1, 1,
                               facecolor='yellow', edgecolor='red', linewidth=2))
ax.add_patch(plt.Circle((9, 5), 0.3, color='red', alpha=0.8))
for di, dj in np.ndindex(3, 3):
    ax.plot([(kernel_i + di)*1.5 + 1.5, 8.7], [(kernel_j + dj)*1.5 + 1.5, 5],
            'red', alpha=0.5, linewidth=1)
ax.set_title('Convolutional\nLocal patterns matter', fontweight='bold')

# ---- Recurrent: inputs below, hidden states above, chained left to right ----
ax = axes[2]
time_steps = 5
for t in range(time_steps):
    ax.add_patch(plt.Rectangle((t*1.5 + 1, 4), 1, 2,
                               facecolor='lightgreen', edgecolor='black'))
    ax.text(t*1.5 + 1.5, 5, f'$x_{t}$', ha='center', va='center')
for t in range(time_steps):
    ax.add_patch(plt.Circle((t*1.5 + 1.5, 7.5), 0.3, color='orange', alpha=0.8))
    ax.text(t*1.5 + 1.5, 7.5, f'$h_{t}$', ha='center', va='center', fontsize=9)
    has_successor = t < time_steps - 1
    if has_successor:
        # Hidden state handed on to the next time step.
        ax.arrow(t*1.5 + 1.8, 7.5, 0.9, 0, head_width=0.15, head_length=0.1,
                 fc='orange', ec='orange')
    # Input block feeding upward into its hidden state.
    ax.arrow(t*1.5 + 1.5, 6, 0, 0.9, head_width=0.15, head_length=0.1,
             fc='gray', ec='gray')
ax.set_title('Recurrent\nSequential dependencies', fontweight='bold')

plt.suptitle('Architectural Biases Shape Learning', fontsize=16, fontweight='bold', y=1.05)
plt.tight_layout()
plt.show()
#| echo: false
"""
Train/validation/test data split visualization with shuffled grid showing 70/15/15 proportions, alongside bar chart demonstrating model selection using validation scores while keeping test data hidden.
"""
#| fig-align: center
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# ---- Left: one coloured cell per sample, shuffled across the grid ----
ax = axes[0]
np.random.seed(42)
n_total = 1000
train_end = int(0.7 * n_total)
val_end = int(0.85 * n_total)
# (legend label, colour, number of samples) for each split, in order.
splits = [
    ('Train (70%)', '#2E7D32', train_end),
    ('Validation (15%)', '#F57C00', val_end - train_end),
    ('Test (15%)', '#1976D2', n_total - val_end),
]
colors = [shade for _, shade, count in splits for _ in range(count)]
np.random.shuffle(colors)
# Only the largest full square of samples is drawn.
grid_size = int(np.sqrt(n_total))
colors_grid = np.array(colors[:grid_size**2]).reshape(grid_size, grid_size)
for i, j in np.ndindex(grid_size, grid_size):
    ax.add_patch(plt.Rectangle((j, i), 1, 1, facecolor=colors_grid[i, j],
                               edgecolor='white', linewidth=0.5))
ax.set_xlim(0, grid_size)
ax.set_ylim(0, grid_size)
ax.set_aspect('equal')
ax.axis('off')
legend_elements = [
    plt.Rectangle((0, 0), 1, 1, facecolor=shade, label=label)
    for label, shade, _ in splits
]
ax.legend(handles=legend_elements, loc='upper right', fontsize=10)
ax.set_title('Data Split Visualization', fontweight='bold', fontsize=14)

# ---- Right: pick the model with the best validation score ----
ax = axes[1]
models = ['Model A', 'Model B', 'Model C', 'Model D']
val_scores = [85, 88, 87, 84]
test_scores = [83, 86, 85, 82]
x = np.arange(len(models))
width = 0.35
bars1 = ax.bar(x - width/2, val_scores, width, label='Validation', color='#F57C00', alpha=0.7)
bars2 = ax.bar(x + width/2, test_scores, width, label='Test (Hidden)', color='#1976D2', alpha=0.7)
best_idx = np.argmax(val_scores)
# Red frame and caption mark the model chosen on validation score.
selection_box = plt.Rectangle((best_idx - 0.5, 75), 1.0, 15, fill=False,
                              edgecolor='red', linewidth=2.5)
ax.add_patch(selection_box)
ax.text(best_idx, 91, 'Selected', ha='center', fontsize=10, fontweight='bold', color='red')
ax.set_ylabel('Accuracy (%)')
ax.set_xticks(x)
ax.set_xticklabels(models)
ax.set_title('Model Selection via Validation', fontweight='bold', fontsize=14)
ax.legend()
ax.grid(True, alpha=0.3, axis='y')
ax.set_ylim(75, 90)

plt.tight_layout()
plt.show()
Why does SGD find generalizing solutions?
Networks can memorize random labels perfectly,
yet SGD finds patterns when labels are real
Why does overparameterization help?
10x more parameters than samples should overfit,
but often improves test accuracy
What is the role of depth?
Shallow wide networks have the same capacity,
but deep networks generalize better
How do transformers generalize?
No convolutions, no recurrence,
yet state-of-the-art on vision and language
#| echo: false
"""
Horizontal bar chart comparing the relative explanatory power of different generalization theories, illustrating that no single framework fully explains deep learning's generalization success.
"""
#| fig-align: center
fig, ax = plt.subplots(figsize=(8, 6))

# Illustrative score (out of 100) per theory, listed bottom bar first.
explanatory_power = {
    'Classical\nLearning\nTheory': 30,
    'Rademacher\nComplexity': 40,
    'PAC-Bayes': 45,
    'Margin\nTheory': 50,
    'Compression': 35,
    'Implicit\nBias': 60,
}
theories = list(explanatory_power)
success = list(explanatory_power.values())

# Bar colour encodes the score on the coolwarm colormap.
colors = plt.cm.coolwarm(np.array(success) / 100)
bars = ax.barh(theories, success, color=colors)

# Reference line marking a (hypothetical) complete explanation.
ax.axvline(x=100, color='gray', linestyle='--', alpha=0.5)
ax.text(100, -0.5, 'Complete\nUnderstanding', ha='center', fontsize=9)

ax.set_xlabel('Relative Explanatory Power')
ax.set_title('Theoretical Understanding Gap\n(Illustrative)', fontweight='bold')
ax.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()
Note: No single theory fully explains deep learning generalization. Active research area.
Setup (2018-2021):
Deployment (2021):
What happened:
Training data came from a stable market. Deployment happened during a rapid market shift. The model kept predicting pre-COVID prices.
#| echo: false
"""
Zillow case study showing distribution shift: top panel plots home prices where model predictions (trained on 2018-2020) diverge from actual 2021 prices after COVID market shift; bottom panel shows accuracy dropping from 94-96% during validation to 68% at deployment.
"""
#| fig-align: center
fig, axes = plt.subplots(2, 1, figsize=(9, 10))

# ---- Distribution shift (synthetic price series) ----
ax = axes[0]
deploy_start = 24  # first month of the deployment period
months = np.arange(36)
# Work in floats throughout: np.zeros_like(months) would be an integer array
# and silently truncate the fractional part of the shift below.
pre_covid = 300.0 + 5.0 * months
post_covid_shift = np.zeros(months.shape, dtype=float)
post_covid_shift[deploy_start:] = 25 * (months[deploy_start:] - deploy_start)**1.1
actual_prices = pre_covid + post_covid_shift

ax.plot(months[:deploy_start], pre_covid[:deploy_start], 'b-', linewidth=3,
        label='Training data (2018-2020)')
ax.plot(months[deploy_start:], actual_prices[deploy_start:], 'r-', linewidth=3,
        label='Actual prices (2021)')
# The "model" simply extrapolates the pre-shift linear trend.
ax.plot(months[deploy_start:], pre_covid[deploy_start:], 'b--', linewidth=2, alpha=0.6,
        label='Model predictions')
ax.fill_between(months[deploy_start:], pre_covid[deploy_start:], actual_prices[deploy_start:],
                alpha=0.3, color='red', label='Prediction error')
ax.set_xlabel('Months', fontsize=12)
ax.set_ylabel('Home Price ($K)', fontsize=12)
ax.set_title('Market Shifted, Model Did Not', fontweight='bold', fontsize=14)
ax.legend(fontsize=10)
ax.grid(True, alpha=0.3)
ax.axvline(x=deploy_start, color='black', linestyle='--', linewidth=2)
ax.text(12, 550, 'Trained\nhere', ha='center', fontsize=11, fontweight='bold')
ax.text(30, 550, 'Deployed\nhere', ha='center', fontsize=11, fontweight='bold')

# ---- Performance comparison (illustrative numbers) ----
ax = axes[1]
scenarios = ['Train', 'Validation', 'Test\n(2020)', 'Deploy\n(2021)']
accuracy = [96, 95, 94, 68]
colors = ['green', 'green', 'green', 'red']
ax.bar(scenarios, accuracy, color=colors, alpha=0.6)
ax.set_ylabel('Accuracy (%)', fontsize=12)
ax.set_title('Perfect Validation, Failed Deployment', fontweight='bold', fontsize=14)
ax.set_ylim(0, 100)
ax.grid(True, alpha=0.3, axis='y')
# Print each bar's value just above it.
for i, acc in enumerate(accuracy):
    ax.text(i, acc + 2, f'{acc}%', ha='center', fontweight='bold', fontsize=12)
ax.axhline(y=90, color='gray', linestyle='--', alpha=0.5)
ax.text(3.5, 92, 'Target', fontsize=10)

plt.tight_layout()
plt.show()
The problem:
All your validation tools assume the future looks like the past. When the world changes, models trained on historical data fail.
Train/val/test all from 2018-2020: the model learns pre-COVID patterns. Deploy in 2021: COVID changed everything. Result: the model is wrong, but doesn't know it's wrong.
This is not a rare edge case - markets shift, user behavior changes, new products emerge. Distribution shift is common.
Empirical validation:
Defensive engineering:
Course approach:
Many fundamental questions remain open research problems.
#| echo: false
"""
CNN fundamentals showing convolution sliding window operation, multiple feature maps, max pooling spatial reduction, and full architecture progression from input through convolutional and pooling layers to classification.
"""
#| fig-align: center
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches

fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# ---- Convolution operation ----
ax = axes[0, 0]
ax.set_xlim(0, 10)
ax.set_ylim(0, 10)
ax.axis('off')
# Input image: 5x5 grid of random grey levels
for i in range(5):
    for j in range(5):
        val = np.random.random()
        rect = patches.Rectangle((i*1.2 + 1, j*1.2 + 1), 1, 1,
                                 facecolor=plt.cm.gray(val), edgecolor='black')
        ax.add_patch(rect)
# Kernel: 3x3 grid, colours cycling along the anti-diagonals
kernel_colors = ['red', 'blue', 'green']
for ki in range(3):
    for kj in range(3):
        rect = patches.Rectangle((7 + ki*0.8, 3 + kj*0.8), 0.7, 0.7,
                                 facecolor=kernel_colors[(ki+kj) % 3], alpha=0.5,
                                 edgecolor='black', linewidth=2)
        ax.add_patch(rect)
# Highlight the 3x3 input window currently under the kernel
rect = patches.Rectangle((1.2 + 1, 1.2 + 1), 3.6, 3.6, fill=False,
                         edgecolor='red', linewidth=3)
ax.add_patch(rect)
ax.arrow(5, 5, 1.5, 0, head_width=0.3, head_length=0.2, fc='black', ec='black')
ax.text(2.5, 0.5, 'Input', ha='center', fontweight='bold')
ax.text(8, 2, 'Filter', ha='center', fontweight='bold')
ax.set_title('Convolution: Sliding Window', fontweight='bold')

# ---- Feature maps ----
ax = axes[0, 1]
n_filters = 3
for f in range(n_filters):
    for i in range(3):
        for j in range(3):
            val = np.random.random()
            rect = patches.Rectangle((f*3.5 + i*0.9 + 1, j*0.9 + 3), 0.8, 0.8,
                                     facecolor=plt.cm.viridis(val), edgecolor='black')
            ax.add_patch(rect)
    ax.text(f*3.5 + 2.5, 2, f'Feature {f+1}', ha='center', fontsize=10)
ax.set_xlim(0, 12)
ax.set_ylim(0, 8)
ax.axis('off')
ax.set_title('Multiple Feature Maps', fontweight='bold')

# ---- Pooling ----
ax = axes[0, 2]
# Before pooling: 4x4 map
for i in range(4):
    for j in range(4):
        val = np.random.random()
        rect = patches.Rectangle((i*0.8 + 1, j*0.8 + 4), 0.7, 0.7,
                                 facecolor=plt.cm.coolwarm(val), edgecolor='black')
        ax.add_patch(rect)
# After pooling: 2x2 map, drawn with values in [0.8, 1.0]
for i in range(2):
    for j in range(2):
        val = 0.8 + 0.2*np.random.random()
        rect = patches.Rectangle((i*1.2 + 6, j*1.2 + 4.4), 1, 1,
                                 facecolor=plt.cm.coolwarm(val), edgecolor='black',
                                 linewidth=2)
        ax.add_patch(rect)
ax.arrow(4.5, 5.5, 1.2, 0, head_width=0.2, head_length=0.15, fc='black', ec='black')
ax.text(2.5, 3, '4×4', ha='center', fontsize=10)
ax.text(7, 3, '2×2', ha='center', fontsize=10)
ax.text(5, 6.5, 'Max Pool', ha='center', fontsize=10)
ax.set_xlim(0, 10)
ax.set_ylim(0, 10)
ax.axis('off')
ax.set_title('Pooling: Spatial Reduction', fontweight='bold')

# ---- Full CNN architecture ----
ax = axes[1, 0]
ax.set_xlim(0, 12)
ax.set_ylim(0, 8)
ax.axis('off')
# (label, centre x, centre y, box width, box height, colour)
layers = [
    ('Input\n32×32×3', 1, 4, 2, 2, 'lightblue'),
    ('Conv\n28×28×32', 3, 3.5, 1.8, 1.8, 'lightgreen'),
    ('Pool\n14×14×32', 5, 3.5, 1.5, 1.5, 'lightyellow'),
    ('Conv\n10×10×64', 7, 3.5, 1.2, 1.2, 'lightcoral'),
    ('Pool\n5×5×64', 9, 3.5, 0.8, 0.8, 'lightgray'),
    ('FC\n10', 11, 3.5, 0.5, 1.5, 'pink'),
]
for i, (label, x, y, w, h, color) in enumerate(layers):
    rect = patches.Rectangle((x-w/2, y-h/2), w, h, facecolor=color,
                             edgecolor='black', linewidth=2)
    ax.add_patch(rect)
    ax.text(x, y-h/2-0.5, label, ha='center', fontsize=9)
    if i < len(layers) - 1:
        # Draw the arrow from this box's right edge to the NEXT box's left
        # edge. Ending at the next box's centre would leave the arrowhead
        # hidden underneath that box, which is drawn afterwards.
        next_x, next_w = layers[i+1][1], layers[i+1][3]
        start_x = x + w/2
        gap = (next_x - next_w/2) - start_x
        ax.arrow(start_x, y, gap, 0, head_width=0.15, head_length=0.1,
                 length_includes_head=True, fc='gray', ec='gray')
ax.set_title('Typical CNN Architecture', fontweight='bold')

# ---- Translation invariance demo ----
ax = axes[1, 1]
base_pattern = np.zeros((7, 7))
base_pattern[2:5, 1:3] = 1
# Original
ax.imshow(base_pattern, cmap='gray', extent=[0, 3, 0, 3])
ax.set_xlim(0, 10)
ax.set_ylim(0, 5)
# Shifted: same blob moved three columns to the right
shifted = np.zeros((7, 7))
shifted[2:5, 4:6] = 1
ax.imshow(shifted, cmap='gray', extent=[4, 7, 0, 3])
ax.text(1.5, 4, 'Original', ha='center', fontweight='bold')
ax.text(5.5, 4, 'Shifted', ha='center', fontweight='bold')
ax.text(8.5, 1.5, 'Same\nResponse', ha='center', fontsize=11, color='red',
        fontweight='bold')
ax.set_title('Translation Invariance', fontweight='bold')
ax.axis('off')

# ---- Receptive field growth ----
ax = axes[1, 2]
# (tick label, kernel size, stride) for each layer stacked on the input.
rf_ops = [('Conv3×3', 3, 1), ('Conv3×3', 3, 1), ('Pool2×2', 2, 2), ('Conv3×3', 3, 1)]
layers_rf = ['Input']
rf_sizes = [1]
jump = 1  # distance, in input pixels, between neighbouring units of the current layer
for layer_name, kernel, stride in rf_ops:
    # Standard recurrence: each layer widens the field by (kernel - 1) steps
    # of the current jump; striding then multiplies the jump.
    rf_sizes.append(rf_sizes[-1] + (kernel - 1) * jump)
    jump *= stride
    layers_rf.append(layer_name)
# rf_sizes is now [1, 3, 5, 6, 10]
ax.plot(range(len(layers_rf)), rf_sizes, 'go-', linewidth=2, markersize=8)
ax.set_xticks(range(len(layers_rf)))
ax.set_xticklabels(layers_rf, rotation=45, ha='right')
ax.set_ylabel('Receptive Field Size')
ax.set_title('Hierarchical View Growth', fontweight='bold')
ax.grid(True, alpha=0.3)

plt.suptitle('CNN: Built for Images', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()
#| echo: false
"""
Three-panel visualization of modern architectures: Transformer attention mechanism with token-to-token weights, Graph Neural Network with node connectivity and message passing, and Diffusion model showing iterative denoising from noise to clean image.
"""
#| fig-align: center
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# ---- Transformer attention ----
ax = axes[0]
ax.set_xlim(0, 10)
ax.set_ylim(0, 10)
ax.axis('off')
# Token boxes stacked top to bottom on the left
tokens = ['The', 'cat', 'sat', 'on', 'mat']
for i, token in enumerate(tokens):
    ax.add_patch(patches.Rectangle((0.5, 8-i*1.5), 1.5, 0.8,
                                   facecolor='lightblue', edgecolor='black'))
    ax.text(1.25, 8.4-i*1.5, token, ha='center', va='center', fontsize=10)
# Random attention matrix, each row normalised to sum to 1
attention = np.random.random((5, 5))
attention = attention / attention.sum(axis=1, keepdims=True)
ax.imshow(attention, extent=[4, 8, 2, 6], cmap='Reds', vmin=0, vmax=0.5)
ax.text(6, 6.5, 'Attention Weights', ha='center', fontweight='bold', fontsize=9)
# Link the third token to the matrix for each of its weights above 0.3
query = 2
for j in np.flatnonzero(attention[query] > 0.3):
    ax.plot([2, 4], [8.4-query*1.5, 6-j*0.8], 'r-', alpha=0.5, linewidth=2)
ax.set_title('Transformers: Attention', fontweight='bold', fontsize=11)
ax.text(5, 0.8, 'Each token attends to\nrelevant context', ha='center',
        fontsize=9, style='italic')

# ---- Graph Neural Network ----
ax = axes[1]
ax.set_xlim(0, 10)
ax.set_ylim(0, 10)
ax.axis('off')
# Node centres and the index pairs joined by an edge
nodes = [(2, 7), (5, 8), (8, 7), (3, 4), (7, 4), (5, 2)]
edges = [(0, 1), (1, 2), (0, 3), (1, 3), (1, 4), (2, 4), (3, 4), (4, 5)]
for a, b in edges:
    (ax_x, ax_y), (bx_x, bx_y) = nodes[a], nodes[b]
    ax.plot([ax_x, bx_x], [ax_y, bx_y], 'gray', linewidth=2, alpha=0.5)
for i, (x, y) in enumerate(nodes):
    ax.add_patch(patches.Circle((x, y), 0.5, facecolor=plt.cm.tab10(i),
                                edgecolor='black', linewidth=2))
    ax.text(x, y, str(i), ha='center', va='center', fontweight='bold', color='white')
ax.set_title('Graph Networks: Structured Data', fontweight='bold', fontsize=11)
ax.text(5, 0.5, 'Message passing\nbetween connected nodes', ha='center',
        fontsize=9, style='italic')

# ---- Diffusion process ----
ax = axes[2]
steps = 5
for i in range(steps):
    noise = np.random.randn(8, 8)
    if i == 0:
        # First frame is pure noise
        img = noise
    else:
        # Later frames: constant image plus noise that fades to zero
        img = 0.3 * np.ones((8, 8)) + (1-i/(steps-1)) * noise
    ax.imshow(img, extent=[i*2, i*2+1.5, 3, 4.5], cmap='gray', vmin=-2, vmax=2)
    if i < steps - 1:
        ax.arrow(i*2+1.6, 3.75, 0.3, 0, head_width=0.1, head_length=0.05,
                 fc='red', ec='red')
ax.text(0.75, 2.5, 'Noise', ha='center', fontsize=9)
ax.text(8.75, 2.5, 'Clean', ha='center', fontsize=9)
ax.set_xlim(-0.5, 10)
ax.set_ylim(1.5, 5.5)
ax.axis('off')
ax.set_title('Diffusion: Iterative Denoising', fontweight='bold', fontsize=11)
ax.text(5, 1.8, 'Learn to reverse\nnoise addition', ha='center', fontsize=9,
        style='italic')

plt.tight_layout()
plt.show()
#| echo: true
#| code-fold: false
import numpy as np


class Conv2D:
    """Naive 2-D convolution layer (no padding, stride 1), forward pass only."""

    def __init__(self, in_channels, out_channels, kernel_size=3):
        """Create the layer with small random filters and zero biases.

        Args:
            in_channels: Number of channels expected in the input.
            out_channels: Number of filters, i.e. channels in the output.
            kernel_size: Side length of the square kernel.
        """
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        # Initialize filters: (out_channels, in_channels, k, k), scaled by 0.1
        self.filters = np.random.randn(
            out_channels, in_channels, kernel_size, kernel_size
        ) * 0.1
        self.bias = np.zeros(out_channels)

    def forward(self, x):
        """Convolve ``x`` with every filter.

        Args:
            x: Array of shape ``(batch, in_channels, height, width)``.

        Returns:
            Array of shape ``(batch, out_channels, height - k + 1,
            width - k + 1)`` where ``k`` is the kernel size.

        Raises:
            ValueError: If the channel count of ``x`` differs from
                ``in_channels``.
        """
        batch, in_c, height, width = x.shape
        if in_c != self.in_channels:
            # Without this check a 1-channel input would silently broadcast
            # against multi-channel filters and produce a wrong result.
            raise ValueError(
                f"expected {self.in_channels} input channels, got {in_c}"
            )
        out_h = height - self.kernel_size + 1
        out_w = width - self.kernel_size + 1
        output = np.zeros((batch, self.out_channels, out_h, out_w))

        # Convolution operation
        for b in range(batch):
            for oc in range(self.out_channels):
                for h in range(out_h):
                    for w in range(out_w):
                        # Extract patch: all channels, k x k window at (h, w)
                        patch = x[b, :, h:h+self.kernel_size, w:w+self.kernel_size]
                        # Convolve with filter: elementwise product, summed
                        output[b, oc, h, w] = np.sum(patch * self.filters[oc]) + self.bias[oc]
        return output


class MaxPool2D:
    """Non-overlapping max pooling over square windows, forward pass only."""

    def __init__(self, pool_size=2):
        """Store the window side length, which is also used as the stride."""
        self.pool_size = pool_size

    def forward(self, x):
        """Take the maximum over each ``pool_size`` x ``pool_size`` window.

        Args:
            x: Array of shape ``(batch, channels, height, width)``.

        Returns:
            Array of shape ``(batch, channels, height // pool_size,
            width // pool_size)``. Rows/columns that do not fill a whole
            window are ignored.
        """
        batch, channels, height, width = x.shape
        out_h = height // self.pool_size
        out_w = width // self.pool_size
        output = np.zeros((batch, channels, out_h, out_w))

        for h in range(out_h):
            for w in range(out_w):
                h_start = h * self.pool_size
                w_start = w * self.pool_size
                pool_region = x[:, :, h_start:h_start+self.pool_size,
                                w_start:w_start+self.pool_size]
                # Reduce over the two spatial axes, keeping batch and channel
                output[:, :, h, w] = np.max(pool_region, axis=(2, 3))
        return output


# Example usage
x = np.random.randn(1, 3, 32, 32)  # Batch=1, RGB, 32x32
conv = Conv2D(3, 16, kernel_size=3)
pool = MaxPool2D(pool_size=2)

x = conv.forward(x)
print(f"After conv: {x.shape}")  # (1, 16, 30, 30)
x = np.maximum(0, x)  # ReLU
x = pool.forward(x)
print(f"After pool: {x.shape}")  # (1, 16, 15, 15)
ResNet-50 (2015):
MobileNetV2 (2018):
EfficientNet-B0 (2019):
Architecture design matters: EfficientNet achieves better accuracy than ResNet-50 with far fewer parameters.
1. Become one with the data: Look at your data. Plot it. Understand its distribution, outliers, patterns.
2. Set up an end-to-end pipeline: Get a simple model training before adding complexity.
3. Overfit a single batch: If you can't overfit 10 examples, something is broken.
4. Verify loss at initialization: Check that the loss matches the expected value (e.g., \(\log(n_{classes})\) for classification).
5. Add complexity gradually: Start simple, add one thing at a time.
#| echo: true
#| code-fold: false
# The debugging progression
def debug_training():
    """Check that training can overfit one example, then a batch of ten.

    Uses the module-level ``X``, ``y`` and ``train_step``. Raises
    ``AssertionError`` if the final loss of either stage is not below its
    threshold; otherwise returns a short status string.
    """
    def fit(inputs, targets, steps):
        # Repeat the update on the same slice; report the last loss seen.
        for _ in range(steps):
            last_loss = train_step(inputs, targets)
        return last_loss

    # Stage 1: a single example, 100 updates
    loss = fit(X[0:1], y[0:1], 100)
    assert loss < 0.01, "Can't overfit single"

    # Stage 2: the first ten examples, 500 updates
    loss = fit(X[0:10], y[0:10], 500)
    assert loss < 0.1, "Can't overfit batch"

    # Stage 3: both checks passed, so the full dataset is next
    return "Ready for full training"
#| echo: false
"""
Common deep learning debugging patterns showing stuck loss, exploding gradients, overfitting, vanishing gradients, and learning rate finder diagnostics.
"""
#| fig-align: center
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np

# All curves below are synthetic illustrations of each failure pattern.
fig, axes = plt.subplots(2, 3, figsize=(15, 9))

# ---- Loss not decreasing: flat at 2.3 with a little noise ----
ax = axes[0, 0]
epochs = np.arange(50)
stuck_loss = np.ones(50) * 2.3 + np.random.normal(0, 0.01, 50)
ax.plot(epochs, stuck_loss, 'r-', linewidth=2)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Loss Not Decreasing', fontweight='bold')
ax.grid(True, alpha=0.3)

# ---- Exploding gradients: exponential growth, then NaN from epoch 15 ----
ax = axes[0, 1]
epochs = np.arange(30)
exploding = np.exp(epochs / 10)
exploding[15:] = np.nan
ax.plot(epochs[:15], exploding[:15], 'r-', linewidth=2)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Exploding Gradients', fontweight='bold')
ax.set_yscale('log')
ax.grid(True, alpha=0.3)
# Placed in axes coordinates: a data-coordinate annotation anchored at y=8
# lies above the plotted range (max about 4) and matplotlib does not draw
# annotations whose anchor point falls outside the axes.
ax.text(0.5, 0.92, 'Fix: Gradient clipping, lower LR, better init',
        transform=ax.transAxes, fontsize=8, ha='center', style='italic')

# ---- Overfitting: validation loss turns upward while training loss falls ----
ax = axes[0, 2]
epochs = np.arange(100)
train_loss = 0.5 * np.exp(-epochs / 20) + 0.01
val_loss = 0.5 * np.exp(-epochs / 20) + 0.02 + 0.001 * epochs
ax.plot(epochs, train_loss, 'b-', linewidth=2, label='Train')
ax.plot(epochs, val_loss, 'r-', linewidth=2, label='Validation')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Overfitting Pattern', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)
ax.text(70, 0.2, 'Solutions:\n• Regularization\n• More data\n• Early stopping',
        fontsize=9, ha='center',
        bbox=dict(boxstyle="round,pad=0.3", facecolor='lightblue', alpha=0.5))

# ---- Gradient flow visualization (backprop flows Output -> Input) ----
ax = axes[1, 0]
layers = ['Input', 'Layer 1', 'Layer 2', 'Layer 3', 'Layer 4', 'Output']
gradient_magnitudes = [0.001, 0.01, 0.1, 0.4, 0.8, 1.0]
ax.bar(layers, gradient_magnitudes, color='red', alpha=0.7)
ax.set_ylabel('Gradient Magnitude')
ax.set_yscale('log')
ax.set_title('Vanishing Gradient Diagnosis', fontweight='bold')
ax.axhline(y=0.1, color='green', linestyle='--', alpha=0.5)
ax.text(2.5, 0.15, 'Healthy threshold', fontsize=9)
ax.grid(True, alpha=0.3, axis='y')

# ---- Learning rate finder ----
ax = axes[1, 1]
lrs = np.logspace(-5, 0, 50)
# Loss starts high (2.0) at tiny learning rates, falls as the rate becomes
# useful, then blows up quadratically once the rate is too large. The
# "1 - exp(...)" form is what produces the initial descent; without it the
# curve only ever increases and the minimum lands on the smallest rate.
losses = 2 - 1.5 * (1 - np.exp(-100 * lrs)) + 10 * lrs**2
best_idx = np.argmin(losses)
ax.semilogx(lrs, losses, 'b-', linewidth=2)
ax.scatter(lrs[best_idx], losses[best_idx], color='red', s=100, zorder=5)
ax.set_xlabel('Learning Rate')
ax.set_ylabel('Loss')
ax.set_title('Learning Rate Finder', fontweight='bold')
ax.grid(True, alpha=0.3)
# Offset in points so the label clears the marker regardless of the y-range.
ax.annotate(f'Best: {lrs[best_idx]:.1e}', xy=(lrs[best_idx], losses[best_idx]),
            xytext=(0, 12), textcoords='offset points', ha='center', fontsize=9)

axes[1, 2].axis('off')  # Leave bottom-right empty

plt.tight_layout()
plt.show()
Setup:
Deployment at Hospital B:
What went wrong:
Hospital A used one X-ray machine model with specific image characteristics. Hospital B used different equipment. The model learned machine artifacts, not disease patterns.
Example artifacts learned:
#| echo: false
"""
Performance comparison showing accuracy drop when deploying to new hospitals, alongside feature importance revealing model learned equipment-specific shortcuts rather than disease patterns.
"""
#| fig-align: center
fig, (perf_ax, feat_ax) = plt.subplots(2, 1, figsize=(9, 10))

# --- Top panel: accuracy per site (green = training hospital, red = new sites)
site_labels = ['Hospital A\n(Training)', 'Hospital A\n(Test)',
               'Hospital B\n(Deploy)', 'Hospital C\n(Deploy)']
site_accuracy = [95, 94, 72, 68]
site_colors = ['green', 'green', 'red', 'red']

perf_ax.bar(site_labels, site_accuracy, color=site_colors, alpha=0.6)
perf_ax.set_ylabel('Accuracy (%)', fontsize=12)
perf_ax.set_title('Model Learned Hospital-Specific Patterns, Not Disease',
                  fontweight='bold', fontsize=14)
perf_ax.set_ylim(0, 100)
perf_ax.grid(True, alpha=0.3, axis='y')

# Print each value just above its bar.
for bar_index, value in enumerate(site_accuracy):
    perf_ax.text(bar_index, value + 2, f'{value}%', ha='center',
                 fontweight='bold', fontsize=12)

# Dashed reference line at 90% with its label.
perf_ax.axhline(y=90, color='gray', linestyle='--', alpha=0.5)
perf_ax.text(3.5, 92, 'Clinical\nTarget', fontsize=10)

# --- Bottom panel: grouped bars, intended vs. actual feature importance
feature_names = ['Disease\nPatterns', 'X-ray Machine\nModel',
                 'Image\nBrightness', 'Positioning\nMarkers']
expected_importance = [90, 2, 3, 5]
observed_importance = [40, 30, 20, 10]

slots = np.arange(len(feature_names))
bar_width = 0.35
feat_ax.bar(slots - bar_width/2, expected_importance, bar_width,
            label='Should learn', color='green', alpha=0.7)
feat_ax.bar(slots + bar_width/2, observed_importance, bar_width,
            label='Actually learned', color='red', alpha=0.7)

feat_ax.set_ylabel('Feature Importance (%)', fontsize=12)
feat_ax.set_title('Model Learned Shortcuts, Not Medicine',
                  fontweight='bold', fontsize=14)
feat_ax.set_xticks(slots)
feat_ax.set_xticklabels(feature_names, fontsize=10)
feat_ax.set_ylim(0, 100)
feat_ax.legend(fontsize=11)
feat_ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()
Shortcut learning: Standard debugging looked fine:
The problem only appeared on different hospital equipment. Models exploit spurious correlations (disease + specific machine) as shortcuts instead of learning actual medical patterns.
#| echo: false
"""
Hardware comparison across devices, batch size memory/throughput tradeoffs, neural scaling laws with model landmarks, and distributed training efficiency showing ideal vs actual GPU speedup.
"""
#| fig-align: center
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Hardware comparison (illustrative numbers)
ax = axes[0, 0]
devices = ['CPU\n(8 cores)', 'GPU\n(RTX 3090)', 'TPU v3\n(Google)', 'A100\n(NVIDIA)']
relative_speed = [1, 25, 80, 100]
memory = [32, 24, 128, 80]  # GB
x = np.arange(len(devices))
width = 0.35
ax.bar(x - width/2, relative_speed, width, label='Relative Speed', color='#1976D2', alpha=0.7)
ax.bar(x + width/2, memory, width, label='Memory (GB)', color='#F57C00', alpha=0.7)
ax.set_xticks(x)
ax.set_xticklabels(devices)
ax.set_ylabel('Value')
ax.set_title('Hardware Capabilities', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

# Batch size vs memory (left axis) and throughput (right axis)
ax = axes[0, 1]
batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128, 256]
memory_usage = [0.5, 1, 2, 4, 8, 16, 32, 64, 128]  # Simplified linear relationship
throughput = [10, 19, 36, 68, 120, 200, 280, 320, 330]
ax.plot(batch_sizes, memory_usage, 'r-o', linewidth=2, label='Memory (GB)')
ax2 = ax.twinx()
ax2.plot(batch_sizes, throughput, 'b-s', linewidth=2, label='Throughput (img/s)')
ax.set_xlabel('Batch Size')
ax.set_ylabel('Memory Usage (GB)', color='r')
ax2.set_ylabel('Throughput (img/s)', color='b')
ax.set_title('Batch Size Trade-offs', fontweight='bold')
ax.set_xscale('log')
ax.grid(True, alpha=0.3)
ax.axhline(y=24, color='red', linestyle='--', alpha=0.5)
ax.text(4, 26, 'GPU Memory Limit', fontsize=9, color='red')

# Scaling laws
ax = axes[1, 0]
flops_per_param = 6  # Approximate FLOPs; the same factor places the landmarks


def saturating_performance(n_params):
    """Synthetic performance curve: 50% for tiny models, saturating at 100%."""
    return 100 - 50 * np.exp(-n_params / 1e9)


# 1M to 1T parameters. The upper bound is 1T so that the GPT-3 and PaLM
# landmarks fall on the plotted curve rather than beyond its right edge.
model_sizes = np.logspace(6, 12, 50)
compute_flops = model_sizes * flops_per_param
performance = saturating_performance(model_sizes)
ax.loglog(compute_flops, performance, 'g-', linewidth=2)
ax.set_xlabel('Compute (FLOPs)')
ax.set_ylabel('Performance (%)')
ax.set_title('Scaling Laws', fontweight='bold')
ax.grid(True, alpha=0.3)

# Add model landmarks, given as parameter counts. BERT is entered as 110M
# parameters, which matches the 0.1B used for it in the model timeline figure.
landmarks = [(110e6, 'BERT'), (175e9, 'GPT-3'), (540e9, 'PaLM')]
for n_params, name in landmarks:
    flops = n_params * flops_per_param
    # Only show if in range: compare against the curve's actual x-extent.
    if compute_flops[0] <= flops <= compute_flops[-1]:
        perf = saturating_performance(n_params)
        ax.scatter(flops, perf, s=100, zorder=5)
        ax.text(flops, perf - 3, name, fontsize=8, ha='center')

# Distributed training
ax = axes[1, 1]
n_gpus = [1, 2, 4, 8, 16, 32]
ideal_speedup = n_gpus  # perfect linear scaling
actual_speedup = [1, 1.9, 3.6, 6.8, 12, 20]
actual_speedup_poor = [1, 1.5, 2.2, 3.0, 3.8, 4.5]
ax.plot(n_gpus, ideal_speedup, 'g--', linewidth=2, label='Ideal')
ax.plot(n_gpus, actual_speedup, 'b-o', linewidth=2, label='Data Parallel')
ax.plot(n_gpus, actual_speedup_poor, 'r-s', linewidth=2, label='Poor Implementation')
ax.set_xlabel('Number of GPUs')
ax.set_ylabel('Speedup')
ax.set_title('Distributed Training Efficiency', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
What this means for this course:
CPU (your laptop):
GPU (Colab/Kaggle free tier):
Multi-GPU (cloud):
Course approach: CPU is viable for most work. GPU accelerates but isn't required.
#| echo: true
#| code-fold: true
#| code-summary: "Experiment tracking example"
import random
import sys
import time

import numpy as np

# Tracking experiments
experiment_config = {
    'model': 'resnet18',
    'dataset': 'cifar10',
    'batch_size': 128,
    'lr': 0.1,
    'epochs': 100,
    'seed': 42,
    'timestamp': '2025-01-15-14:30'
}


# Always set seeds for reproducibility
def set_all_seeds(seed=42):
    """Seed every random number generator this process is using.

    Seeds Python's ``random`` module and NumPy's global generator. PyTorch
    (CPU and all CUDA devices) is seeded as well, but only when ``torch`` has
    already been imported by the program, so this helper neither requires
    PyTorch nor pays its import cost.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch = sys.modules.get('torch')
    if torch is not None:
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)


# Log everything
def log_metrics(epoch, train_loss, val_loss, val_acc, lr=None):
    """Bundle one epoch's metrics into a dictionary ready to be logged.

    Args:
        epoch: Epoch index the metrics belong to.
        train_loss: Training loss for the epoch.
        val_loss: Validation loss for the epoch.
        val_acc: Validation accuracy for the epoch.
        lr: Learning rate in effect. When omitted, ``get_current_lr()`` is
            called; that helper is not defined in this snippet and must be
            supplied by the surrounding training script.

    Returns:
        Dict with keys ``epoch``, ``train_loss``, ``val_loss``, ``val_acc``,
        ``lr`` and ``timestamp`` (seconds since the epoch, from ``time.time()``).
    """
    if lr is None:
        lr = get_current_lr()
    metrics = {
        'epoch': epoch,
        'train_loss': train_loss,
        'val_loss': val_loss,
        'val_acc': val_acc,
        'lr': lr,
        'timestamp': time.time()
    }
    # Write to file, tensorboard, wandb, etc.
    return metrics
#| echo: false
"""
Ablation study showing incremental accuracy gains from each regularization technique, alongside a hyperparameter grid search heatmap for learning rate and batch size combinations.
"""
#| fig-align: center
fig, (ablation_ax, grid_ax) = plt.subplots(1, 2, figsize=(14, 6))

# --- Left: horizontal bars, one per cumulative training configuration
stages = ['Baseline', '+DataAug', '+Dropout', '+WeightDecay', '+LRSchedule', 'Full Model']
stage_accuracy = [82, 84, 85.5, 86.5, 87.8, 89.2]
stage_colors = plt.cm.RdYlGn(np.linspace(0.4, 0.9, len(stages)))

ablation_ax.barh(stages, stage_accuracy, color=stage_colors)
ablation_ax.set_xlabel('Test Accuracy (%)')
ablation_ax.set_title('Ablation Study: Component Contributions', fontweight='bold')
ablation_ax.grid(True, alpha=0.3, axis='x')

# Every bar except the baseline is annotated with its gain over the bar before it.
consecutive = zip(stage_accuracy, stage_accuracy[1:])
for row, (previous, current) in enumerate(consecutive, start=1):
    gain = current - previous
    ablation_ax.text(current + 0.2, row, f'+{gain:.1f}%', fontsize=9, va='center')

# --- Right: accuracy heatmap, rows = learning rates, columns = batch sizes
lr_values = [0.001, 0.01, 0.1, 1.0]
batch_values = [32, 64, 128, 256]
accuracy_grid = np.array([
    [75, 78, 76, 72],
    [80, 85, 83, 78],
    [82, 88, 86, 80],
    [70, 75, 73, 68]
])

heatmap = grid_ax.imshow(accuracy_grid, cmap='RdYlGn', vmin=65, vmax=90)
grid_ax.set_xticks(range(len(batch_values)))
grid_ax.set_yticks(range(len(lr_values)))
grid_ax.set_xticklabels(batch_values)
grid_ax.set_yticklabels([f'{lr:.3f}' for lr in lr_values])
grid_ax.set_xlabel('Batch Size')
grid_ax.set_ylabel('Learning Rate')
grid_ax.set_title('Hyperparameter Search Results', fontweight='bold')

# Write the accuracy value into every cell.
for (row, col), value in np.ndenumerate(accuracy_grid):
    grid_ax.text(col, row, f'{value:.0f}', ha='center', va='center',
                 color='white', fontweight='bold')

plt.colorbar(heatmap, ax=grid_ax, label='Accuracy (%)')

plt.tight_layout()
plt.show()
#| echo: false
"""
Bar chart comparing model accuracy across increasing complexity levels, from random guessing (10%) through linear and neural baselines to state-of-the-art (94%), with horizontal reference lines marking baseline performance thresholds.
"""
#| fig-align: center
fig, ax = plt.subplots(figsize=(12, 8))

# (label, accuracy in %, bar colour), ordered from simplest to most complex.
methods = [
    ('Random\nGuessing', 10, 'gray'),
    ('Linear\nModel', 65, '#B71C1C'),
    ('2-Layer\nMLP', 78, '#F57C00'),
    ('Simple\nCNN', 85, '#1976D2'),
    ('Your\nModel', 87, '#2E7D32'),
    ('SOTA\n2025', 94, 'gold')
]

positions = []
for i, (name, score, color) in enumerate(methods):
    x = i * 2 + 1
    positions.append(x)

    # Bar
    ax.bar(x, score, color=color, alpha=0.7, width=1.5)

    # Annotations: value above the bar, method name below the axis
    ax.text(x, score + 1, f'{score}%', ha='center', fontweight='bold')
    ax.text(x, -3, name, ha='center', fontsize=10)

    # Complexity indicator: i + 1 dots inside the top of the bar.
    # Scatter markers are sized in points, so they stay round. Circle patches
    # in data coordinates get squashed here because the x-axis spans 13 units
    # and the y-axis 100.
    n_dots = i + 1
    dot_x = x - 0.6 + 0.25 * np.arange(n_dots)
    dot_y = np.full(n_dots, score - 5)
    ax.scatter(dot_x, dot_y, s=30, color='black', alpha=0.5, zorder=3)

# Add baseline lines
ax.axhline(y=10, color='gray', linestyle='--', alpha=0.5, linewidth=1)
ax.text(11, 11, 'Random', fontsize=9, color='gray')
ax.axhline(y=65, color='red', linestyle='--', alpha=0.5, linewidth=1)
ax.text(11, 66, 'Linear baseline', fontsize=9, color='red')

# Limits are set explicitly after plotting, so the scatter markers do not
# influence the final view.
ax.set_ylim(0, 100)
ax.set_xlim(-1, 12)
ax.set_ylabel('Accuracy (%)', fontsize=12)
ax.set_title('Always Compare Against Baselines', fontsize=16, fontweight='bold')
ax.grid(True, alpha=0.3, axis='y')
ax.set_xticks([])

plt.tight_layout()
plt.show()
#| echo: false
"""
Course architecture diagram showing statistical foundations (detection, estimation, regression, classification) and neural methods (deep learning, CNNs, RNNs, autoencoders) connected through a central optimization and learning bridge.
"""
#| fig-align: center
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np

fig, ax = plt.subplots(figsize=(10, 8))

# Statistical vs Data-Driven perspective, drawn on a bare 10x10 canvas
ax.set_xlim(0, 10)
ax.set_ylim(0, 10)
ax.axis('off')


def add_boxes(components, edge_color):
    """Draw one rounded, labelled box per (x, y, label, fill) entry."""
    for cx, cy, label, fill in components:
        ax.add_patch(patches.FancyBboxPatch(
            (cx-0.8, cy-0.4), 1.6, 0.8,
            boxstyle="round,pad=0.05",
            facecolor=fill, edgecolor=edge_color, linewidth=2))
        ax.text(cx, cy, label, ha='center', va='center',
                fontsize=9, fontweight='bold')


def add_chain_arrows(components, column_x, color):
    """Link each box to the one below it with a downward arrow."""
    heights = [entry[1] for entry in components]
    for upper, lower in zip(heights, heights[1:]):
        ax.add_patch(patches.FancyArrowPatch(
            (column_x, upper-0.45), (column_x, lower+0.45),
            mutation_scale=15, arrowstyle='->',
            color=color, alpha=0.5, linewidth=2))


# Statistical foundations (left side)
stat_components = [
    (2, 8, 'Detection\nEstimation', '#E3F2FD'),
    (2, 6.5, 'MMSE\nEstimation', '#E3F2FD'),
    (2, 5, 'ML/MAP\nParameter Est.', '#E3F2FD'),
    (2, 3.5, 'Regression', '#E3F2FD'),
    (2, 2, 'Classification', '#E3F2FD')
]
add_boxes(stat_components, '#1976D2')

# Data-driven methods (right side)
ml_components = [
    (8, 8, 'Neural\nNetworks', '#FFEBEE'),
    (8, 6.5, 'Deep\nLearning', '#FFEBEE'),
    (8, 5, 'CNNs', '#FFEBEE'),
    (8, 3.5, 'RNNs', '#FFEBEE'),
    (8, 2, 'Autoencoders', '#FFEBEE')
]
add_boxes(ml_components, '#C62828')

# Bridge in the middle
bridge_y = 5
ax.add_patch(patches.FancyBboxPatch(
    (4, bridge_y-0.5), 2, 1,
    boxstyle="round,pad=0.05",
    facecolor='#E8F5E9', edgecolor='#2E7D32', linewidth=3))
ax.text(5, bridge_y, 'Optimization\n& Learning', ha='center', va='center',
        fontsize=10, fontweight='bold')

# Arrows showing progression down each column
add_chain_arrows(stat_components, 2, '#1976D2')
add_chain_arrows(ml_components, 8, '#C62828')

# Connecting arrows: left column -> bridge -> right column
for tail, head in [((2.85, 5), (3.95, 5)), ((6.05, 5), (7.15, 5))]:
    ax.add_patch(patches.FancyArrowPatch(
        tail, head, mutation_scale=20, arrowstyle='->',
        color='#2E7D32', linewidth=2))

# Column headings
ax.text(1, 9, 'Classical/Statistical', fontsize=12, fontweight='bold', color='#1976D2')
ax.text(7, 9, 'Modern/Data-Driven', fontsize=12, fontweight='bold', color='#C62828')

plt.tight_layout()
plt.show()
#| echo: false
"""
Four-panel visualization showing emergent abilities appearing at different model scales, in-context learning improvements with model size, compute-optimal scaling laws, and the exponential growth of model parameters from 2018-2025.
"""
#| fig-align: center
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Emergent abilities
ax = axes[0, 0]
model_sizes = np.logspace(7, 11, 50)
# Different tasks emerge at different scales:
# (task, parameter threshold, saturation performance in %, colour)
tasks = [
    ('Arithmetic', 1e8, 80, '#1976D2'),
    ('Translation', 1e9, 85, '#2E7D32'),
    ('Reasoning', 10e9, 90, '#F57C00'),
    ('Code Generation', 50e9, 92, '#7B1FA2')
]
for task, threshold, max_perf, color in tasks:
    # Zero below the threshold, then a saturating rise towards max_perf.
    perf = np.zeros_like(model_sizes)
    mask = model_sizes > threshold
    perf[mask] = max_perf * (1 - np.exp(-(model_sizes[mask] - threshold) / threshold))
    ax.semilogx(model_sizes, perf, linewidth=2, label=task, color=color)
ax.set_xlabel('Model Size (Parameters)')
ax.set_ylabel('Task Performance (%)')
ax.set_title('Emergent Abilities with Scale', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

# In-context learning
ax = axes[0, 1]
n_examples = [0, 1, 2, 4, 8, 16]
small_model = [50, 52, 53, 54, 54, 54]
large_model = [50, 65, 75, 82, 87, 90]
ax.plot(n_examples, small_model, 'o-', linewidth=2, label='Small Model (1B)',
        color='#C62828', markersize=8)
ax.plot(n_examples, large_model, 's-', linewidth=2, label='Large Model (100B)',
        color='#1976D2', markersize=8)
ax.set_xlabel('Number of In-Context Examples')
ax.set_ylabel('Task Accuracy (%)')
ax.set_title('In-Context Learning', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

# Scaling laws
ax = axes[1, 0]
compute = np.logspace(15, 25, 50)
relative_compute = compute / 1e15
# Each curve is a power law in compute plus a floor it cannot go below, so
# the loss stays positive over the whole range. A loss that is linear in
# log10(compute) would drop below zero well before 1e25 FLOPs. All three
# curves start at 3.5; the optimal one has the lowest floor and steepest decay.
loss_small_data = 2.0 + 1.5 * relative_compute ** -0.10
loss_large_data = 1.7 + 1.8 * relative_compute ** -0.12
loss_optimal = 1.5 + 2.0 * relative_compute ** -0.15
ax.semilogx(compute, loss_small_data, '--', linewidth=2, label='Data Limited', color='#C62828')
ax.semilogx(compute, loss_large_data, '--', linewidth=2, label='Parameter Limited', color='#1976D2')
ax.semilogx(compute, loss_optimal, '-', linewidth=2, label='Optimal Scaling', color='#2E7D32')
ax.set_xlabel('Compute Budget (FLOPs)')
ax.set_ylabel('Loss')
ax.set_title('Scaling Laws (Chinchilla)', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

# Model timeline
ax = axes[1, 1]
years = np.array([2018, 2019, 2020, 2021, 2022, 2023])
model_sizes = np.array([0.1, 1.5, 175, 530, 540, 1000])  # Billions
ax.semilogy(years, model_sizes, 'o-', linewidth=2, markersize=8, color='#7B1FA2')
models = ['BERT', 'GPT-2', 'GPT-3', 'MT-NLG', 'PaLM', 'Estimated\nFrontier']
for year, size, name in zip(years, model_sizes, models):
    ax.text(year, size*1.3, name, ha='center', fontsize=8)
# Add projection line
years_future = np.array([2023, 2025])
sizes_future = np.array([1000, 2500])
ax.semilogy(years_future, sizes_future, '--', linewidth=2, markersize=8,
            color='#7B1FA2', alpha=0.5, label='Projected trend')
ax.set_xlabel('Year')
ax.set_ylabel('Model Size (Billion Parameters)')
ax.set_title('The Race to Scale', fontweight='bold')
# The projection's label is only visible once a legend is drawn.
ax.legend(loc='lower right')
ax.grid(True, alpha=0.3)
ax.set_xlim(2017.5, 2025.5)

plt.suptitle('Large Models: New Capabilities from Scale', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()
What these scales cost:
GPT-2 (1.5B params, 2019):
GPT-3 (175B params, 2020):
PaLM (540B params, 2022):
Scale is not just about bigger numbers - it's about fundamentally different resource requirements.
#| echo: false
"""
Model compression techniques comparing quantization bit-width tradeoffs, knowledge distillation from teacher to student models, unstructured network pruning, efficient architecture designs, and mixed precision training.
"""
#| fig-align: center
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes[1, 2].axis('off')  # Leave bottom-right empty

# Quantization: model size (left axis) and accuracy (right axis) vs bit width
ax = axes[0, 0]
bit_widths = [32, 16, 8, 4, 2, 1]
model_size = [100, 50, 25, 12.5, 6.25, 3.125]
accuracy = [95, 94.8, 94.5, 93, 88, 75]
ax.plot(bit_widths, model_size, 'o-', linewidth=2, label='Model Size (MB)', color='#1976D2')
ax2 = ax.twinx()
ax2.plot(bit_widths, accuracy, 's-', linewidth=2, label='Accuracy (%)', color='#C62828')
ax.set_xlabel('Bit Width')
ax.set_ylabel('Model Size (MB)', color='#1976D2')
ax2.set_ylabel('Accuracy (%)', color='#C62828')
ax.set_title('Quantization Trade-offs', fontweight='bold')
ax.grid(True, alpha=0.3)
ax.invert_xaxis()  # read left to right as "fewer bits"

# Distillation
ax = axes[0, 1]
models = ['Teacher\n(BERT)', 'Student 1\n(50%)', 'Student 2\n(25%)', 'Student 3\n(10%)']
params = [340, 170, 85, 34]
performance = [95, 92, 88, 82]
x = np.arange(len(models))
width = 0.35
bars1 = ax.bar(x - width/2, params, width, label='Parameters (M)', color='#1976D2', alpha=0.7)
bars2 = ax.bar(x + width/2, performance, width, label='Performance (%)', color='#2E7D32', alpha=0.7)
ax.set_xticks(x)
ax.set_xticklabels(models, fontsize=8)
ax.set_ylabel('Value')
ax.set_title('Knowledge Distillation', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

# Pruning patterns
ax = axes[0, 2]
ax.set_xlim(0, 10)
ax.set_ylim(0, 10)
ax.axis('off')
# Dense network: node columns on the left (i < 2) and on the right (i >= 2)
for i in range(5):
    for j in range(5):
        if i < 2:  # Input layer
            x, y = 1 + i*0.4, 3 + j*1
        else:  # Output layer
            x, y = 8 + (i-2)*0.4, 3 + j*1
        circle = patches.Circle((x, y), 0.15, facecolor='#E3F2FD', edgecolor='black', alpha=0.8)
        ax.add_patch(circle)
# Connections (sparse): each connection is kept independently with
# probability 0.3; the fixed seed keeps the drawn pattern stable.
np.random.seed(42)
for i in range(5):
    for j in range(5):
        if np.random.random() > 0.7:  # 70% pruned
            x1, y1 = 1.6, 3 + i*1
            x2, y2 = 8, 3 + j*1
            ax.plot([x1, x2], [y1, y2], color='#C62828', alpha=0.3, linewidth=1)
# Individual weights are dropped at random, which is unstructured pruning.
# Structured pruning would remove whole neurons or channels.
ax.text(5, 1, 'Unstructured Pruning\n70% weights removed', ha='center', fontsize=10, fontweight='bold')
ax.set_title('Network Pruning', fontweight='bold')

# Efficient architectures: marker area encodes inference time
ax = axes[1, 0]
architectures = ['ResNet50', 'MobileNet', 'EfficientNet', 'Vision\nTransformer']
params_arch = [25, 4, 5, 86]
accuracy_arch = [92, 88, 93, 95]
inference_time = [10, 3, 4, 25]  # ms
colors_arch = ['#C62828', '#2E7D32', '#1976D2', '#F57C00']
ax.scatter(params_arch, accuracy_arch, s=[t*20 for t in inference_time], c=colors_arch, alpha=0.6)
for i, arch in enumerate(architectures):
    ax.annotate(arch, (params_arch[i], accuracy_arch[i]), xytext=(5, 5),
                textcoords='offset points', fontsize=8)
ax.set_xlabel('Parameters (M)')
ax.set_ylabel('Accuracy (%)')
ax.set_title('Efficient Architecture Design', fontweight='bold')
ax.grid(True, alpha=0.3)

# Mixed precision training
ax = axes[1, 1]
training_methods = ['FP32', 'FP16', 'Mixed\nPrecision', 'BF16']
memory_usage = [100, 50, 60, 50]
training_speed = [1, 1.8, 1.7, 1.9]
stability = [100, 70, 95, 90]
x = np.arange(len(training_methods))
width = 0.25
bars1 = ax.bar(x - width, memory_usage, width, label='Memory (%)', color='#1976D2', alpha=0.7)
# Speed-up factors are multiplied by 50 so they share the 0-100 axis.
bars2 = ax.bar(x, np.array(training_speed)*50, width, label='Speed (x50)', color='#2E7D32', alpha=0.7)
bars3 = ax.bar(x + width, stability, width, label='Stability (%)', color='#C62828', alpha=0.7)
ax.set_xticks(x)
ax.set_xticklabels(training_methods)
ax.set_ylabel('Relative Value')
ax.set_title('Mixed Precision Training', fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()
# System Python - Don't do this
# Installing straight into the system interpreter forces every project to
# share one version of each dependency.
pip install torch
# Error: requires numpy>=1.19
pip install numpy==1.20
# Breaks: opencv requires numpy==1.18
Conflicts are inevitable. Each project needs:
#| echo: false
"""
Diagram showing system Python connecting to three isolated virtual environments (ee541, research, web_dev), each with its own Python version and packages.
"""
#| fig-align: center
import matplotlib.pyplot as plt
import matplotlib.patches as patches

fig, ax = plt.subplots(figsize=(8, 6))
ax.set_xlim(0, 10)
ax.set_ylim(0, 8)
ax.axis('off')

# System Python: one box centred at the top of the canvas
ax.add_patch(patches.Rectangle((3.5, 6), 3, 1.2, facecolor='#FFCDD2', alpha=0.8,
                               edgecolor='#C62828', linewidth=2))
ax.text(5, 6.6, 'System Python', ha='center', va='center',
        fontweight='bold', fontsize=10)

# Virtual environments: (centre x, centre y, caption, fill colour)
envs = [
    (2, 3.5, 'ee541\nPyTorch 2.0\nPython 3.11', '#E3F2FD'),
    (5, 3.5, 'research\nTF 2.15\nPython 3.10', '#E8F5E9'),
    (8, 3.5, 'web_dev\nDjango 4.2\nPython 3.12', '#FFF9C4')
]

for cx, cy, caption, fill in envs:
    ax.add_patch(patches.Rectangle((cx-1, cy-0.8), 2, 1.6, facecolor=fill, alpha=0.9,
                                   edgecolor='#212121', linewidth=2))
    ax.text(cx, cy, caption, ha='center', va='center', fontsize=9)
    # Arrow from system: starts just below the system box at (5, 5.8) and
    # points towards this environment's box.
    ax.arrow(5, 5.8, cx-5, cy-5.3, head_width=0.1, head_length=0.05,
             fc='#9E9E9E', ec='#9E9E9E', alpha=0.5, linestyle='--', linewidth=1)

# Captions underneath the environments
ax.text(5, 1.5, 'Isolated Environments', ha='center', fontsize=11, fontweight='bold')
ax.text(5, 0.8, 'No conflicts, reproducible, deletable', ha='center',
        fontsize=9, style='italic')

plt.tight_layout()
plt.show()
# Download Miniconda (minimal) or Anaconda (full) # miniconda.anaconda.com # After installation, verify: conda --version conda info # Update conda itself conda update -n base conda
# Create environment with Python 3.11 conda create -n ee541 python=3.11 # Activate environment conda activate ee541 # Your prompt changes: (ee541) $ # Deactivate when done conda deactivate
Binary package management
Cross-platform
Channel system
conda-forge: Community packages
pytorch: Official PyTorch builds
nvidia: CUDA toolkit
Environment files
environment.yml for reproducibility

# Activate your environment first
conda activate ee541

# Core scientific stack
conda install numpy scipy matplotlib pandas

# Jupyter for notebooks
conda install jupyter ipykernel

# Register kernel for Jupyter
python -m ipykernel install --user --name ee541 --display-name "Python (ee541)"

# PyTorch - SELECT BASED ON YOUR SYSTEM
# CPU only
conda install pytorch torchvision torchaudio cpuonly -c pytorch

# CUDA 11.8 (NVIDIA GPU)
conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia

# Mac M1/M2/M3 (Metal Performance Shaders)
conda install pytorch torchvision torchaudio -c pytorch

# Additional ML tools
conda install scikit-learn
conda install -c conda-forge tensorboard
nvidia-smi

# Standard device selection
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")
# From terminal with environment active (ee541) $ jupyter notebook # Opens browser at localhost:8888 # Navigate to your work directory
Shift+Enter: Run cell, move to next
Ctrl+Enter: Run cell, stay
Esc: Command mode
Enter: Edit mode
A/B: Insert cell above/below

Version Control
git init git add . git commit -m "Initial commit" # But exclude: # .gitignore contents: *.pyc __pycache__/ .ipynb_checkpoints/ data/ *.pt *.pth
Reproducibility
# Save environment conda env export > environment.yml # Recreate elsewhere conda env create -f environment.yml # Save pip requirements pip freeze > requirements.txt
#| echo: true
#| code-fold: false
import sys
import platform

print(f"Python: {sys.version}")
print(f"Platform: {platform.platform()}")

# Import name -> conda package name. None means the two are identical.
# PyTorch is imported as ``torch`` but installed with ``conda install pytorch``.
packages = {
    'numpy': None,
    'torch': 'pytorch',
    'torchvision': None,
    'matplotlib': None,
    'jupyter': None,
    'sklearn': 'scikit-learn'
}

for import_name, conda_name in packages.items():
    # Only the import itself sits in the try block, so an unrelated error in
    # the reporting code below is not mistaken for a missing package.
    try:
        module = __import__(import_name)
    except ImportError:
        package = conda_name or import_name
        print(f"✗ {import_name}: Not installed")
        print(f" Install with: conda install {package}")
        continue

    version = getattr(module, '__version__', 'installed')
    print(f"✓ {import_name}: {version}")

    # Special check for PyTorch GPU
    if import_name == 'torch':
        # Look up the MPS backend defensively so a build that lacks
        # ``torch.backends.mps`` reports "Not available" instead of raising.
        mps = getattr(module.backends, 'mps', None)
        if module.cuda.is_available():
            print(f" GPU: CUDA ({module.version.cuda})")
            print(f" Device: {module.cuda.get_device_name(0)}")
        elif mps is not None and mps.is_available():
            print(" GPU: MPS (Mac)")
        else:
            print(" GPU: Not available")
# List environments conda env list # Create from file conda env create -f environment.yml # Clone environment conda create --name ee541_backup --clone ee541 # Remove environment conda env remove -n ee541 # Update all packages conda update --all # Clean cache (free space) conda clean --all
# Search for package conda search pytorch # Install specific version conda install pytorch=2.0.1 # List installed packages conda list # Check for updates conda update --dry-run --all # Channel priority conda config --add channels conda-forge conda config --set channel_priority strict
Task: Classify clothing items into 10 categories
Architecture: Simple 2-layer network
Training: 4 epochs, Adam optimizer
Files:
1-fashion-mnist.ipynb: Dataset exploration
2-minimal-pytorch.ipynb: Core training loop
3-feature-visualization.ipynb: TensorBoard monitoring

# Minimal.ipynb - Key components

# 1. Data Loading
train_loader = DataLoader(train_set, batch_size=100, shuffle=True)
test_loader = DataLoader(test_set, batch_size=100, shuffle=False)

# 2. Model Definition
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.hidden = nn.Linear(784, 128)
        self.output = nn.Linear(128, 10)

    def forward(self, x):
        x = F.relu(self.hidden(x))
        return self.output(x)

# 3. Training Loop
for epoch in range(num_epochs):
    for images, labels in train_loader:
        # Forward pass
        outputs = model(images.view(-1, 784))
        loss = loss_func(outputs, labels)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
#| echo: false
"""
Training dynamics showing loss decay and accuracy improvement over four epochs, with characteristic rapid initial learning followed by diminishing returns.
"""
#| fig-align: center
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(14, 6))

# 240 iterations at 60 iterations per epoch = 4 epochs
iterations = np.arange(0, 240, 1)
epochs = iterations / 60

# --- Loss curve: exponential decay towards 0.3 with a small sinusoidal ripple
loss = 2.3 * np.exp(-epochs/2) + 0.3 + 0.05*np.sin(iterations/5)
loss_ax.plot(iterations, loss, 'b-', linewidth=2)
loss_ax.set_xlabel('Iteration')
loss_ax.set_ylabel('Cross-Entropy Loss')
loss_ax.set_title('Training Loss', fontweight='bold')
loss_ax.grid(True, alpha=0.3)

# Mark where each epoch starts and label it at the middle of the epoch.
for ep in range(4):
    epoch_start = ep*60
    loss_ax.axvline(x=epoch_start, color='gray', linestyle='--', alpha=0.3)
    loss_ax.text(epoch_start + 30, 2.2, f'Epoch {ep+1}', ha='center', fontsize=9)

# --- Accuracy curve: saturating rise plus Gaussian noise, clipped to [0, 89]
noise = np.random.normal(0, 0.5, len(epochs))
accuracy = np.clip(100 * (1 - np.exp(-epochs/1.5)) + noise, 0, 89)

# Every fifth iteration is plotted.
acc_ax.plot(iterations[::5], accuracy[::5], 'g-', linewidth=2, marker='o', markersize=3)
acc_ax.set_xlabel('Iteration')
acc_ax.set_ylabel('Test Accuracy (%)')
acc_ax.set_title('Model Performance', fontweight='bold')
acc_ax.grid(True, alpha=0.3)
acc_ax.set_ylim(0, 100)
acc_ax.axhline(y=87, color='red', linestyle='--', alpha=0.5)
acc_ax.text(200, 89, 'Final: ~87%', fontsize=10, fontweight='bold')

plt.suptitle('Typical Training Progress', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()
Observations:
# From terminal tensorboard --logdir runs # Navigate to http://localhost:6006
#| echo: false
"""
PCA embedding visualization showing 10 classes forming distinct clusters in 2D feature space.
"""
#| fig-align: center
fig, ax = plt.subplots(figsize=(8, 8))

np.random.seed(42)
n_points = 500
n_classes = 10
points_per_class = n_points//n_classes
colors = plt.cm.tab10(np.arange(10))

# Synthetic clusters: for each class draw a random centre, then scatter
# Gaussian points around it. The draw order (centre first, then the points)
# is what the fixed seed makes reproducible.
clusters = []
for _ in range(n_classes):
    center = np.random.randn(2) * 3
    clusters.append(center + np.random.randn(points_per_class, 2) * 0.5)

# One scatter call per class so that each class gets its own legend entry.
for class_id, cluster in enumerate(clusters):
    ax.scatter(cluster[:, 0], cluster[:, 1], c=[colors[class_id]],
               s=20, alpha=0.6, label=f'Class {class_id}')

ax.set_xlabel('First Principal Component')
ax.set_ylabel('Second Principal Component')
ax.set_title('Embedding Visualization (PCA)', fontweight='bold', fontsize=12)
ax.grid(True, alpha=0.3)
ax.legend(ncol=2, fontsize=8, loc='upper right')

plt.tight_layout()
plt.show()
Embedding Insight: Classes form distinct clusters in feature space
#| echo: false
"""
Neural network architecture diagram showing 784→128→10 layer structure alongside parameter summary table with 101K total parameters.
"""
#| fig-align: center
fig, (diagram_ax, summary_ax) = plt.subplots(1, 2, figsize=(14, 6))

# --- Architecture diagram on a bare 10x10 canvas
diagram_ax.set_xlim(0, 10)
diagram_ax.set_ylim(0, 10)
diagram_ax.axis('off')

# Input layer: 8 representative nodes, with "..." standing in for the rest
for node in range(8):
    diagram_ax.add_patch(patches.Circle((2, 2 + node * 0.8), 0.2,
                                        facecolor='lightblue', edgecolor='black'))
diagram_ax.text(2, 1, '784 inputs\n(flattened)', ha='center', fontsize=9)
diagram_ax.text(2, 8, '...', ha='center', fontsize=12)

# Hidden layer: 6 representative nodes
for node in range(6):
    diagram_ax.add_patch(patches.Circle((5, 3 + node * 0.8), 0.25,
                                        facecolor='lightgreen', edgecolor='black'))
diagram_ax.text(5, 2, '128 hidden\n(ReLU)', ha='center', fontsize=9)
diagram_ax.text(5, 7.5, '...', ha='center', fontsize=12)

# Output layer: 5 representative nodes
for node in range(5):
    diagram_ax.add_patch(patches.Circle((8, 3.5 + node * 0.6), 0.2,
                                        facecolor='lightcoral', edgecolor='black'))
diagram_ax.text(8, 2.5, '10 outputs\n(classes)', ha='center', fontsize=9)

# Connections (subset): only the lowest three nodes of each layer are linked
for src in range(3):
    for dst in range(3):
        diagram_ax.plot([2.2, 4.75], [2 + src*0.8, 3 + dst*0.8],
                        'gray', alpha=0.4, linewidth=0.7)
        diagram_ax.plot([5.25, 7.8], [3 + src*0.8, 3.5 + dst*0.6],
                        'gray', alpha=0.4, linewidth=0.7)

diagram_ax.set_title('Network Architecture', fontweight='bold', fontsize=12)

# --- Parameter summary, rendered as monospace text
summary_ax.axis('off')
rule = "=" * 50
summary_text = "\n".join([
    "Model Summary (torchinfo output):",
    rule,
    "Layer (type)          Output Shape        Param #",
    rule,
    "Linear-1              [-1, 128]           100,480",
    "ReLU-2                [-1, 128]                 0",
    "Linear-3              [-1, 10]              1,290",
    rule,
    "Total params: 101,770",
    "Trainable params: 101,770",
    "Non-trainable params: 0",
    rule,
    "Storage: model.pth (~400 KB)",
    "Inference time: <1ms per image",
])
summary_ax.text(0.1, 0.9, summary_text, fontsize=10, family='monospace',
                verticalalignment='top', transform=summary_ax.transAxes)
summary_ax.set_title('Model Details', fontweight='bold', fontsize=12)

plt.tight_layout()
plt.show()
1-fashion-mnist.ipynb          # Dataset exploration
2-minimal-pytorch.ipynb        # Core training
3-feature-visualization.ipynb  # TensorBoard
#| echo: false
"""
Confusion matrix for Fashion-MNIST classification showing prediction counts across 10 clothing categories, with common misclassifications between similar items like Shirt/T-shirt and Pullover/Coat.
"""
#| fig-align: center
fig, ax = plt.subplots(figsize=(8, 8))

categories_short = ['Tshirt', 'Trouser', 'Pullover', 'Dress', 'Coat',
                    'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Boot']

# Synthetic counts: a strong diagonal plus small random off-diagonal noise.
np.random.seed(42)
conf_matrix = np.eye(10) * 80 + np.random.randint(0, 10, (10, 10))
conf_matrix = conf_matrix.astype(int)

# Random noise alone would show no particular confusion, so the pairs the
# figure is meant to illustrate are boosted explicitly, in both directions.
confusable_pairs = [('Shirt', 'Tshirt'), ('Pullover', 'Coat')]
for first, second in confusable_pairs:
    a = categories_short.index(first)
    b = categories_short.index(second)
    conf_matrix[a, b] += 20
    conf_matrix[b, a] += 20

# Cell colour encodes the share of each actual class (rows sum to 100%);
# the number printed in each cell is the raw count.
conf_matrix_norm = conf_matrix / conf_matrix.sum(axis=1, keepdims=True) * 100

im = ax.imshow(conf_matrix_norm, cmap='Blues', vmin=0, vmax=100)

for i in range(10):
    for j in range(10):
        # White text on dark cells, black text on light ones.
        ax.text(j, i, f'{conf_matrix[i, j]}', ha='center', va='center',
                color='white' if conf_matrix_norm[i, j] > 50 else 'black',
                fontsize=8)

ax.set_xticks(range(10))
ax.set_yticks(range(10))
ax.set_xticklabels(categories_short, rotation=45, ha='right', fontsize=9)
ax.set_yticklabels(categories_short, fontsize=9)
ax.set_xlabel('Predicted', fontsize=10)
ax.set_ylabel('Actual', fontsize=10)
ax.set_title('Confusion Matrix', fontsize=12, fontweight='bold')

# The colour scale shows row-normalised percentages, not counts.
plt.colorbar(im, ax=ax, label='Share of actual class (%)')

plt.tight_layout()
plt.show()
Common Confusions: Shirt ↔ T-shirt, Pullover ↔ Coat
Next week: Array operations and automatic differentiation