From 6c1fc0a53a2948d44572f058c060594b55418f60 Mon Sep 17 00:00:00 2001
From: Sondre Wold <sondrewold@proton.me>
Date: Sat, 29 Nov 2025 16:06:05 +0100
Subject: [PATCH] [refactor] create model factory, separate into files

---
 main.py           | 75 -----------------------------------------------
 src/__init__.py   |  0
 src/__main__.py   |  5 ++++
 src/experiment.py | 44 +++++++++++++++++++++++++++
 src/models.py     | 37 +++++++++++++++++++++++
 src/utils.py      | 41 ++++++++++++++++++++++++++
 6 files changed, 127 insertions(+), 75 deletions(-)
 delete mode 100644 main.py
 create mode 100644 src/__init__.py
 create mode 100644 src/__main__.py
 create mode 100644 src/experiment.py
 create mode 100644 src/models.py
 create mode 100644 src/utils.py

diff --git a/main.py b/main.py
deleted file mode 100644
index 03908b8..0000000
--- a/main.py
+++ /dev/null
@@ -1,75 +0,0 @@
-from sklearn.datasets import load_wine
-from sklearn.model_selection import train_test_split
-import matplotlib.pyplot as plt
-import numpy as np
-import torch
-
-
-class Model(torch.nn.Module):
-    def __init__(self, n_features: int, n_classes: int) -> None:
-        super(Model, self).__init__()
-        self.fc1 = torch.nn.Linear(n_features, n_features * 4)
-        self.fc2 = torch.nn.Linear(n_features * 4, n_classes)
-        self.act = torch.nn.Tanh()
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = self.fc1(x)
-        x = self.act(x)
-        return self.fc2(x)
-
-
-def get_grad_dist(model: torch.nn.Module) -> None:
-    grads = []
-    for param in model.parameters():
-        if param.grad is not None:
-            grads += param.grad.view(-1).tolist()
-    return grads
-
-
-def main():
-    data = load_wine()
-    X = torch.Tensor(data.data)
-    y = torch.LongTensor(data.target)
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1337)
-    model = Model(len(X[0]), 3)
-    optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
-    criterion = torch.nn.CrossEntropyLoss()
-    grads = []
-    for epoch in range(0, 5):
-        model.train()
-        epoch_loss = 0.0
-        for X, y in zip(X_train, y_train):
-            optimizer.zero_grad()
-            y_hat = model(X)
-            loss = criterion(y_hat, y)
-            epoch_loss += loss.item()
-            loss.backward()
-            optimizer.step()
-            grads += get_grad_dist(model)
-
-        print(f"Loss: {epoch_loss / len(X_train)}")
-    
-    grads_abs = np.abs(np.array(grads))
-    grads_abs = grads_abs[grads_abs > 0]
-    print(f"Min magnitude: {min(grads_abs)}, Max: {max(grads_abs)}, Mean: {np.mean(grads_abs)}")
-    log_min = np.log10(grads_abs.min())
-    log_max = np.log10(grads_abs.max())
-    bins = np.logspace(log_min, log_max, 50)
-    counts, bin_edges = np.histogram(grads_abs, bins=bins)
-    #bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
-    bin_centers = np.sqrt(bin_edges[:-1] * bin_edges[1:])
-    bin_widths = np.diff(bin_edges)
-    probability_density = counts / (np.sum(counts) * bin_widths)
-    fig, ax = plt.subplots(figsize=(12, 6))
-    ax.loglog(bin_centers, probability_density, linewidth=3)
-    ax.set_xlabel('Gradient Magnitude', fontsize=12)
-    ax.set_ylabel('Probability density', fontsize=12)
-    ax.set_title('Distribution of Gradient Magnitudes', fontsize=14)
-    ax.grid(True, which='both', alpha=0.3)
-    plt.tight_layout()
-    integral = np.sum(probability_density * bin_widths)
-    print(f"Integral of PDF: {integral}")  # Should be close to 1
-    plt.show()
-
-if __name__ == "__main__":
-    main()
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/__main__.py b/src/__main__.py
new file mode 100644
index 0000000..e79f874
--- /dev/null
+++ b/src/__main__.py
@@ -0,0 +1,5 @@
+def main() -> None:  # Entry point of the package; currently only prints a banner.
+    print("Experiment")
+
+if __name__ == "__main__":  # True when run as a script or via ``python -m src``.
+    main()
diff --git a/src/experiment.py b/src/experiment.py
new file mode 100644
index 0000000..b403fcf
--- /dev/null
+++ b/src/experiment.py
@@ -0,0 +1,78 @@
+"""Train a registered model once per activation function and plot its gradients."""
+
+import argparse
+
+import torch
+from sklearn.datasets import load_wine
+from sklearn.model_selection import train_test_split
+
+from src.models import ModelFactory
+from src.utils import get_grad_dist, plot_distribution
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command-line options of the experiment.
+
+    Returns:
+        Namespace with ``model`` (name of a model registered in
+        ``ModelFactory``), ``epochs`` and ``lr``.
+    """
+    # Passed as ``description``: the first positional parameter of
+    # ``ArgumentParser`` is ``prog``, which replaces the program name in the
+    # usage line instead of describing the tool.
+    parser = argparse.ArgumentParser(description="How are gradients distributed?")
+    # Without a default, omitting the flag leaves ``args.model`` as ``None``
+    # and the factory lookup fails with ``KeyError: None``.
+    parser.add_argument(
+        "--model", type=str, choices=sorted(ModelFactory._factory), default="MLP"
+    )
+    parser.add_argument("--epochs", type=int, default=5)
+    parser.add_argument("--lr", type=float, default=1e-3)
+    return parser.parse_args()
+
+
+def main() -> None:
+    """Train one model per activation function and plot its gradients.
+
+    Every model is trained on the wine data with AdamW, one sample per
+    optimisation step; the gradient values of all steps are collected and
+    passed to ``plot_distribution`` once training of that model is done.
+    """
+    args = parse_args()
+    data = load_wine()
+    features = torch.Tensor(data.data)
+    targets = torch.LongTensor(data.target)
+    # Only the training split is used; the held-out 10% is never evaluated.
+    X_train, _X_test, y_train, _y_test = train_test_split(
+        features, targets, test_size=0.1, random_state=1337
+    )
+    n_features = features.shape[1]
+    criterion = torch.nn.CrossEntropyLoss()
+    acts = [torch.nn.ReLU(), torch.nn.Tanh(), torch.nn.Sigmoid(), torch.nn.GELU()]
+    for act in acts:
+        # Label the run so the loss lines of the four models can be told apart.
+        print(f"Activation: {type(act).__name__}")
+        model: torch.nn.Module = ModelFactory.get(
+            args.model, n_features=n_features, n_classes=3, activation_function=act
+        )
+        optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr)
+        grads: list[float] = []
+        for _epoch in range(args.epochs):
+            model.train()
+            epoch_loss = 0.0
+            for X, y in zip(X_train, y_train):
+                optimizer.zero_grad()
+                y_hat = model(X)
+                loss = criterion(y_hat, y)
+                epoch_loss += loss.item()
+                loss.backward()
+                optimizer.step()
+                # Keep the gradients computed by backward() for this sample.
+                grads.extend(get_grad_dist(model))
+
+            print(f"Loss: {epoch_loss / len(X_train)}")
+        plot_distribution(grads)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/models.py b/src/models.py
new file mode 100644
index 0000000..fc9bb2e
--- /dev/null
+++ b/src/models.py
@@ -0,0 +1,81 @@
+"""Model registry and the model architectures it provides."""
+
+from __future__ import annotations
+
+from typing import Any
+from typing import Callable
+from typing import Optional
+
+import torch
+
+
+class ModelFactory:
+    """Registry that maps a name to a ``torch.nn.Module`` subclass.
+
+    Used as a class decorator, ``@ModelFactory("name")`` stores the decorated
+    class under ``name``; ``ModelFactory.get("name", ...)`` instantiates it.
+    """
+
+    # Class-level dict, so every decorator instance writes into the same registry.
+    _factory: dict[str, type[torch.nn.Module]] = {}
+
+    def __init__(self, name: str) -> None:
+        """Remember the name under which the decorated class is registered."""
+        self._name = name
+
+    def __call__(self, model: type[torch.nn.Module]) -> type[torch.nn.Module]:
+        """Register ``model`` under the stored name and return it unchanged."""
+        self._factory[self._name] = model
+        return model
+
+    # The ``type[ModelFactory]`` annotation relies on the lazy annotations
+    # enabled at the top of the file: evaluated eagerly it raises ``NameError``,
+    # because the class name is not bound while the class body executes.
+    @classmethod
+    def get(cls: type[ModelFactory], name: str, *args: Any, **kwargs: Any) -> torch.nn.Module:
+        """Instantiate the model registered under ``name``.
+
+        Args:
+            name: Name given to the ``ModelFactory`` decorator.
+            *args: Positional arguments forwarded to the model constructor.
+            **kwargs: Keyword arguments forwarded to the model constructor.
+
+        Returns:
+            A new instance of the registered model class.
+
+        Raises:
+            KeyError: If no model is registered under ``name``.
+        """
+        return cls._factory[name](*args, **kwargs)
+
+
+@ModelFactory("MLP")
+class MLP(torch.nn.Module):
+    """Basic MLP with an activation function in-between layers.
+
+    Args:
+        n_features: Size of the input; the hidden layer is four times as wide.
+        n_classes: Size of the output.
+        activation_function: Callable applied between the two linear layers.
+            ``None`` (the default) creates a new ``torch.nn.ReLU``.
+    """
+
+    def __init__(
+        self,
+        n_features: int,
+        n_classes: int,
+        activation_function: Optional[Callable] = None,
+    ) -> None:
+        super().__init__()
+        self.fc1 = torch.nn.Linear(n_features, n_features * 4)
+        self.fc2 = torch.nn.Linear(n_features * 4, n_classes)
+        # Created per instance: a module built in the signature would be a
+        # single object shared by every MLP that relies on the default.
+        self.act = torch.nn.ReLU() if activation_function is None else activation_function
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply the first linear layer, the activation and the second linear layer."""
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.fc2(x)
+        return x
diff --git a/src/utils.py b/src/utils.py
new file mode 100644
index 0000000..4f5593d
--- /dev/null
+++ b/src/utils.py
@@ -0,0 +1,67 @@
+"""Helpers for collecting and plotting gradient values."""
+
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+
+
+def get_grad_dist(model: torch.nn.Module) -> list[float]:
+    """Collect the current gradient values of ``model`` in one flat list.
+
+    Args:
+        model: Module whose parameters are inspected.
+
+    Returns:
+        The elements of every parameter gradient that is not ``None``,
+        flattened and concatenated in ``model.parameters()`` order.
+    """
+    grads: list[float] = []
+    for param in model.parameters():
+        if param.grad is not None:
+            grads.extend(param.grad.view(-1).tolist())
+    return grads
+
+
+def plot_distribution(gradients: list[float], geometric_mean: bool = False) -> None:
+    """Plot the probability density of the gradient magnitudes on log-log axes.
+
+    Zero values are dropped, the remaining magnitudes are counted in 49
+    logarithmically spaced bins and the counts are normalised so that the
+    density times the bin width sums to 1. The figure is shown with
+    ``plt.show()``.
+
+    Args:
+        gradients: Flat list of gradient values, e.g. from ``get_grad_dist``.
+        geometric_mean: Place each point at the geometric mean of its bin
+            edges instead of the arithmetic mean.
+
+    Raises:
+        ValueError: If ``gradients`` holds no non-zero value.
+    """
+    grads_abs = np.abs(np.array(gradients))
+    grads_abs = grads_abs[grads_abs > 0]
+    if grads_abs.size == 0:
+        # The min/max reductions below fail on an empty array with a message
+        # that does not name the cause.
+        raise ValueError("plot_distribution needs at least one non-zero gradient")
+    smallest = grads_abs.min()
+    largest = grads_abs.max()
+    print(f"Min magnitude: {smallest}, Max: {largest}, Mean: {np.mean(grads_abs)}")
+    # 50 edges spaced evenly in log10 between the extremes, i.e. 49 bins.
+    bins = np.logspace(np.log10(smallest), np.log10(largest), 50)
+    counts, bin_edges = np.histogram(grads_abs, bins=bins)
+    if geometric_mean:
+        bin_centers = np.sqrt(bin_edges[:-1] * bin_edges[1:])
+    else:
+        # Linear mean
+        bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
+    bin_widths = np.diff(bin_edges)
+    probability_density = counts / (np.sum(counts) * bin_widths)
+    _fig, ax = plt.subplots(figsize=(12, 6))
+    ax.loglog(bin_centers, probability_density, linewidth=3)
+    ax.set_xlabel('Gradient Magnitude', fontsize=12)
+    ax.set_ylabel('Probability density', fontsize=12)
+    ax.set_title('Distribution of Gradient Magnitudes', fontsize=14)
+    ax.grid(True, which='both', alpha=0.3)
+    plt.tight_layout()
+    plt.show()
-- 
2.39.5