From 6c1fc0a53a2948d44572f058c060594b55418f60 Mon Sep 17 00:00:00 2001
From: Sondre Wold
Date: Sat, 29 Nov 2025 16:06:05 +0100
Subject: [PATCH] [refactor] create model factory, separate into files

---
Review note: the added files below were edited by hand after this patch was
generated, so the post-image blob ids on their "index" lines are stale. Hunk
line counts and the diffstat have been updated to match the edited content.

 main.py           | 75 -----------------------------------------------
 src/__init__.py   |  0
 src/__main__.py   |  5 ++++
 src/experiment.py | 50 +++++++++++++++++++++++++++++++
 src/models.py     | 45 ++++++++++++++++++++++++++++
 src/utils.py      | 43 +++++++++++++++++++++++++++
 6 files changed, 143 insertions(+), 75 deletions(-)
 delete mode 100644 main.py
 create mode 100644 src/__init__.py
 create mode 100644 src/__main__.py
 create mode 100644 src/experiment.py
 create mode 100644 src/models.py
 create mode 100644 src/utils.py

diff --git a/main.py b/main.py
deleted file mode 100644
index 03908b8..0000000
--- a/main.py
+++ /dev/null
@@ -1,75 +0,0 @@
-from sklearn.datasets import load_wine
-from sklearn.model_selection import train_test_split
-import matplotlib.pyplot as plt
-import numpy as np
-import torch
-
-
-class Model(torch.nn.Module):
-    def __init__(self, n_features: int, n_classes: int) -> None:
-        super(Model, self).__init__()
-        self.fc1 = torch.nn.Linear(n_features, n_features * 4)
-        self.fc2 = torch.nn.Linear(n_features * 4, n_classes)
-        self.act = torch.nn.Tanh()
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = self.fc1(x)
-        x = self.act(x)
-        return self.fc2(x)
-
-
-def get_grad_dist(model: torch.nn.Module) -> None:
-    grads = []
-    for param in model.parameters():
-        if param.grad is not None:
-            grads += param.grad.view(-1).tolist()
-    return grads
-
-
-def main():
-    data = load_wine()
-    X = torch.Tensor(data.data)
-    y = torch.LongTensor(data.target)
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1337)
-    model = Model(len(X[0]), 3)
-    optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
-    criterion = torch.nn.CrossEntropyLoss()
-    grads = []
-    for epoch in range(0, 5):
-        model.train()
-        epoch_loss = 0.0
-        for X, y in zip(X_train, y_train):
-            optimizer.zero_grad()
-            y_hat = model(X)
-            loss = criterion(y_hat, y)
-            epoch_loss += loss.item()
-            loss.backward()
-            optimizer.step()
-            grads += get_grad_dist(model)
-
-        print(f"Loss: {epoch_loss / len(X_train)}")
-
-    grads_abs = np.abs(np.array(grads))
-    grads_abs = grads_abs[grads_abs > 0]
-    print(f"Min magnitude: {min(grads_abs)}, Max: {max(grads_abs)}, Mean: {np.mean(grads_abs)}")
-    log_min = np.log10(grads_abs.min())
-    log_max = np.log10(grads_abs.max())
-    bins = np.logspace(log_min, log_max, 50)
-    counts, bin_edges = np.histogram(grads_abs, bins=bins)
-    #bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
-    bin_centers = np.sqrt(bin_edges[:-1] * bin_edges[1:])
-    bin_widths = np.diff(bin_edges)
-    probability_density = counts / (np.sum(counts) * bin_widths)
-    fig, ax = plt.subplots(figsize=(12, 6))
-    ax.loglog(bin_centers, probability_density, linewidth=3)
-    ax.set_xlabel('Gradient Magnitude', fontsize=12)
-    ax.set_ylabel('Probability density', fontsize=12)
-    ax.set_title('Distribution of Gradient Magnitudes', fontsize=14)
-    ax.grid(True, which='both', alpha=0.3)
-    plt.tight_layout()
-    integral = np.sum(probability_density * bin_widths)
-    print(f"Integral of PDF: {integral}") # Should be close to 1
-    plt.show()
-
-if __name__ == "__main__":
-    main()
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/__main__.py b/src/__main__.py
new file mode 100644
index 0000000..e79f874
--- /dev/null
+++ b/src/__main__.py
@@ -0,0 +1,5 @@
+def main() -> None:
+    print("Experiment")
+
+if __name__ == "__main__":
+    main()
diff --git a/src/experiment.py b/src/experiment.py
new file mode 100644
index 0000000..b403fcf
--- /dev/null
+++ b/src/experiment.py
@@ -0,0 +1,50 @@
+"""Gradient distribution experiment on the scikit-learn wine dataset."""
+from sklearn.datasets import load_wine
+from sklearn.model_selection import train_test_split
+import torch
+import argparse
+from src.models import ModelFactory
+from src.utils import get_grad_dist, plot_distribution
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command-line options of the experiment."""
+    parser = argparse.ArgumentParser("How are gradients distributed?")
+    # Default to the registered "MLP" model: without a default, omitting
+    # --model passes None to ModelFactory.get(), which raises KeyError.
+    parser.add_argument("--model", type=str, choices=ModelFactory._factory, default="MLP")
+    parser.add_argument("--epochs", type=int, default=5)
+    parser.add_argument("--lr", type=float, default=1e-3)
+    return parser.parse_args()
+
+def main():
+    """Train one model per activation function and plot its gradients."""
+    args = parse_args()
+    data = load_wine()
+    features = torch.Tensor(data.data)
+    targets = torch.LongTensor(data.target)
+    X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.1, random_state=1337)
+    acts = [torch.nn.ReLU(), torch.nn.Tanh(), torch.nn.Sigmoid(), torch.nn.GELU()]
+    for act in acts:
+        model: torch.nn.Module = ModelFactory.get(args.model, n_features=len(features[0]), n_classes=3, activation_function=act)
+        optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr)
+        criterion = torch.nn.CrossEntropyLoss()
+        # Gradients are collected separately for each activation function.
+        grads = []
+        for epoch in range(0, args.epochs):
+            model.train()
+            epoch_loss = 0.0
+            for X, y in zip(X_train, y_train):
+                optimizer.zero_grad()
+                y_hat = model(X)
+                loss = criterion(y_hat, y)
+                epoch_loss += loss.item()
+                loss.backward()
+                optimizer.step()
+                grads += get_grad_dist(model)
+
+            print(f"Loss: {epoch_loss / len(X_train)}")
+        plot_distribution(grads)
+
+if __name__ == "__main__":
+    main()
diff --git a/src/models.py b/src/models.py
new file mode 100644
index 0000000..fc9bb2e
--- /dev/null
+++ b/src/models.py
@@ -0,0 +1,45 @@
+import torch
+from typing import Callable
+from typing import Any
+
+
+class ModelFactory:
+    """
+    Registry of model classes; an instance is used as a class decorator.
+    """
+
+    _factory: dict[str, type[torch.nn.Module]] = {}
+
+    def __init__(self, name: str) -> None:
+        self._name = name
+
+    def __call__(self, model: type[torch.nn.Module]) -> type[torch.nn.Module]:
+        # Register the decorated class under the chosen name and hand it back unchanged.
+        self._factory[self._name] = model
+        return model
+
+    @classmethod
+    def get(cls: "type[ModelFactory]", name: str, *args: Any, **kwargs: Any) -> torch.nn.Module:
+        """Instantiate the class registered as ``name`` with the given arguments."""
+        # The annotation on cls is quoted because ModelFactory is not bound yet
+        # while its own class body executes; a bare name raises NameError on
+        # interpreters that evaluate annotations eagerly.
+        return cls._factory[name](*args, **kwargs)
+
+
+@ModelFactory("MLP")
+class MLP(torch.nn.Module):
+    """
+    Basic MLP with an activation function in-between layers.
+    """
+    def __init__(self, n_features: int, n_classes: int, activation_function: Callable = torch.nn.ReLU()) -> None:
+        super(MLP, self).__init__()
+        self.fc1 = torch.nn.Linear(n_features, n_features * 4)
+        self.fc2 = torch.nn.Linear(n_features * 4, n_classes)
+        self.act = activation_function
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.fc2(x)
+        return x
diff --git a/src/utils.py b/src/utils.py
new file mode 100644
index 0000000..4f5593d
--- /dev/null
+++ b/src/utils.py
@@ -0,0 +1,43 @@
+import torch
+import numpy as np
+import matplotlib.pyplot as plt
+
+def get_grad_dist(model: torch.nn.Module) -> list[float]:
+    """Return all parameter gradients of model, flattened into one list of floats."""
+    grads = []
+    for param in model.parameters():
+        if param.grad is not None:
+            grads += param.grad.view(-1).tolist()
+    return grads
+
+
+def plot_distribution(gradients: list[float], geometric_mean: bool = False) -> None:
+    """Plot the density of non-zero gradient magnitudes on log-log axes."""
+    grads_abs = np.abs(np.array(gradients))
+    grads_abs = grads_abs[grads_abs > 0]
+    if grads_abs.size == 0:
+        # Nothing to bin; .min()/.max() below would raise on an empty array.
+        print("No non-zero gradients to plot.")
+        return
+    print(f"Min magnitude: {grads_abs.min()}, Max: {grads_abs.max()}, Mean: {np.mean(grads_abs)}")
+    log_min = np.log10(grads_abs.min())
+    log_max = np.log10(grads_abs.max())
+    bins = np.logspace(log_min, log_max, 50)
+    counts, bin_edges = np.histogram(grads_abs, bins=bins)
+    if geometric_mean:
+        bin_centers = np.sqrt(bin_edges[:-1] * bin_edges[1:])
+    else:
+        # Linear mean
+        bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
+    bin_widths = np.diff(bin_edges)
+    probability_density = counts / (np.sum(counts) * bin_widths)
+    fig, ax = plt.subplots(figsize=(12, 6))
+    ax.loglog(bin_centers, probability_density, linewidth=3)
+    ax.set_xlabel('Gradient Magnitude', fontsize=12)
+    ax.set_ylabel('Probability density', fontsize=12)
+    ax.set_title('Distribution of Gradient Magnitudes', fontsize=14)
+    ax.grid(True, which='both', alpha=0.3)
+    plt.tight_layout()
+    integral = np.sum(probability_density * bin_widths)
+    #print(f"Integral of PDF: {integral}") # Should be close to 1
+    plt.show()
-- 
2.39.5