+++ /dev/null
-from sklearn.datasets import load_wine
-from sklearn.model_selection import train_test_split
-import matplotlib.pyplot as plt
-import numpy as np
-import torch
-
-
class Model(torch.nn.Module):
    """Two-layer perceptron with a Tanh non-linearity between the layers.

    The hidden layer is four times as wide as the input vector.
    """

    def __init__(self, n_features: int, n_classes: int) -> None:
        """Create the two linear layers and the activation.

        Args:
            n_features: Size of the input feature vector.
            n_classes: Number of output logits.
        """
        super().__init__()
        hidden_width = n_features * 4
        self.fc1 = torch.nn.Linear(n_features, hidden_width)
        self.fc2 = torch.nn.Linear(hidden_width, n_classes)
        self.act = torch.nn.Tanh()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the raw (un-normalised) class logits for ``x``."""
        hidden = self.act(self.fc1(x))
        return self.fc2(hidden)
-
-
def get_grad_dist(model: torch.nn.Module) -> list[float]:
    """Collect every gradient value currently stored on ``model``.

    Parameters whose ``.grad`` is ``None`` (for example before the first
    backward pass) are skipped.

    Args:
        model: Module whose parameter gradients are read.

    Returns:
        A flat list of gradient values, concatenated in
        ``model.parameters()`` order. Empty when no parameter has a gradient.
    """
    grads: list[float] = []
    for param in model.parameters():
        if param.grad is None:
            continue
        # flatten() also copes with non-contiguous gradients, where view(-1) raises.
        grads.extend(param.grad.detach().flatten().tolist())
    return grads
-
-
def main() -> None:
    """Train ``Model`` on the wine dataset and plot its gradient magnitudes.

    The model is trained for five epochs with plain SGD, one sample per step.
    The absolute values of all collected gradients are then binned on a
    log-spaced grid, normalised to a probability density and drawn on
    log-log axes.
    """
    data = load_wine()
    features = torch.as_tensor(data.data, dtype=torch.float32)
    targets = torch.as_tensor(data.target, dtype=torch.long)
    # The held-out split is not used by this experiment.
    X_train, _X_test, y_train, _y_test = train_test_split(
        features, targets, test_size=0.2, random_state=1337
    )
    model = Model(features.shape[1], 3)
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
    criterion = torch.nn.CrossEntropyLoss()

    grads: list[float] = []
    for _ in range(5):
        model.train()
        epoch_loss = 0.0
        # Distinct loop names: the original reused X / y and shadowed the dataset.
        for sample, label in zip(X_train, y_train):
            optimizer.zero_grad()
            loss = criterion(model(sample), label)
            epoch_loss += loss.item()
            loss.backward()
            optimizer.step()
            # NOTE(review): nesting was reconstructed from a whitespace-flattened
            # source; confirm that gradients are meant to be collected per step.
            grads += get_grad_dist(model)

        print(f"Loss: {epoch_loss / len(X_train)}")

    grads_abs = np.abs(np.asarray(grads, dtype=float))
    # Zeros cannot be placed on a logarithmic axis.
    grads_abs = grads_abs[grads_abs > 0]
    if grads_abs.size == 0:
        print("No non-zero gradients to plot.")
        return

    smallest, largest = grads_abs.min(), grads_abs.max()
    print(f"Min magnitude: {smallest}, Max: {largest}, Mean: {np.mean(grads_abs)}")
    if smallest == largest:
        # A single distinct magnitude gives zero-width bins (division by zero).
        print("All gradient magnitudes are identical; skipping the histogram.")
        return

    # geomspace pins the first and last edge to the exact extremes, whereas
    # logspace(log10(min), log10(max)) can round them past the data and make
    # np.histogram silently drop the smallest or largest sample.
    bin_edges = np.geomspace(smallest, largest, 50)
    counts, _ = np.histogram(grads_abs, bins=bin_edges)
    bin_centers = np.sqrt(bin_edges[:-1] * bin_edges[1:])  # geometric bin centres
    bin_widths = np.diff(bin_edges)
    probability_density = counts / (np.sum(counts) * bin_widths)

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.loglog(bin_centers, probability_density, linewidth=3)
    ax.set_xlabel('Gradient Magnitude', fontsize=12)
    ax.set_ylabel('Probability density', fontsize=12)
    ax.set_title('Distribution of Gradient Magnitudes', fontsize=14)
    ax.grid(True, which='both', alpha=0.3)
    plt.tight_layout()
    integral = np.sum(probability_density * bin_widths)
    print(f"Integral of PDF: {integral}")  # Should be close to 1
    plt.show()


if __name__ == "__main__":
    main()
--- /dev/null
def main() -> None:
    """Print the experiment banner."""
    banner = "Experiment"
    print(banner)


if __name__ == "__main__":
    main()
--- /dev/null
+from sklearn.datasets import load_wine
+from sklearn.model_selection import train_test_split
+import torch
+import argparse
+from src.models import ModelFactory
+from src.utils import get_grad_dist, plot_distribution
+
+
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the gradient-distribution experiment.

    Returns:
        Namespace with ``model`` (one of the names registered in
        ``ModelFactory``), ``epochs`` (int, default 5) and ``lr``
        (float, default 1e-3).
    """
    # The first positional parameter of ArgumentParser is ``prog``; the
    # sentence is a description, so it is passed by keyword.
    parser = argparse.ArgumentParser(description="How are gradients distributed?")
    parser.add_argument(
        "--model",
        type=str,
        choices=sorted(ModelFactory._factory),
        # Without a value args.model is None and the factory lookup fails
        # later with an opaque KeyError; let argparse report it instead.
        required=True,
        help="Name of the registered model to train.",
    )
    parser.add_argument("--epochs", type=int, default=5, help="Number of training epochs.")
    parser.add_argument("--lr", type=float, default=1e-3, help="Optimizer learning rate.")
    return parser.parse_args()
+
def main() -> None:
    """Train the selected model once per activation and plot its gradients.

    For each of ReLU, Tanh, Sigmoid and GELU a fresh model is built through
    ``ModelFactory``, trained with AdamW one sample per step for
    ``args.epochs`` epochs, and the gradients gathered during training are
    handed to ``plot_distribution``.
    """
    args = parse_args()
    data = load_wine()
    features = torch.as_tensor(data.data, dtype=torch.float32)
    targets = torch.as_tensor(data.target, dtype=torch.long)
    # The held-out split is not used by this experiment.
    X_train, _X_test, y_train, _y_test = train_test_split(
        features, targets, test_size=0.1, random_state=1337
    )
    n_features = features.shape[1]
    criterion = torch.nn.CrossEntropyLoss()
    activations = [torch.nn.ReLU(), torch.nn.Tanh(), torch.nn.Sigmoid(), torch.nn.GELU()]

    for act in activations:
        # Label the run so the four loss traces and plots can be told apart.
        print(f"Activation: {type(act).__name__}")
        model: torch.nn.Module = ModelFactory.get(
            args.model,
            n_features=n_features,
            n_classes=3,
            activation_function=act,
        )
        optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr)
        grads: list[float] = []
        for _ in range(args.epochs):
            model.train()
            epoch_loss = 0.0
            for X, y in zip(X_train, y_train):
                optimizer.zero_grad()
                loss = criterion(model(X), y)
                epoch_loss += loss.item()
                loss.backward()
                optimizer.step()
                # NOTE(review): nesting was reconstructed from a whitespace-flattened
                # source; confirm that gradients are meant to be collected per step.
                grads += get_grad_dist(model)

            print(f"Loss: {epoch_loss / len(X_train)}")
        plot_distribution(grads)


if __name__ == "__main__":
    main()
--- /dev/null
+import torch
+from typing import Callable
+from typing import Any
+
+
class ModelFactory:
    """Name-based registry of ``torch.nn.Module`` classes.

    An instance acts as a class decorator: ``@ModelFactory("name")`` stores
    the decorated class under ``name`` and returns it unchanged.
    ``ModelFactory.get`` later instantiates a registered class by name.
    """

    # Shared by every instance: registered name -> model class.
    _factory: dict[str, type[torch.nn.Module]] = {}

    def __init__(self, name: str) -> None:
        """Remember the name the decorated class will be registered under."""
        self._name = name

    def __call__(self, model: type[torch.nn.Module]) -> type[torch.nn.Module]:
        """Register ``model`` under this instance's name and return it unchanged."""
        self._factory[self._name] = model
        return model

    # ``cls`` is left unannotated: ``type[ModelFactory]`` is evaluated while the
    # class body is still executing, where the name ModelFactory is not bound
    # yet, which raises NameError on import unless annotations are deferred.
    @classmethod
    def get(cls, name: str, *args: Any, **kwargs: Any) -> torch.nn.Module:
        """Instantiate the model class registered under ``name``.

        Args:
            name: Key used when the class was registered.
            *args: Positional arguments forwarded to the model constructor.
            **kwargs: Keyword arguments forwarded to the model constructor.

        Returns:
            A new instance of the registered class.

        Raises:
            KeyError: If no class is registered under ``name``.
        """
        try:
            model_cls = cls._factory[name]
        except KeyError:
            available = ", ".join(sorted(cls._factory)) or "<none>"
            raise KeyError(
                f"Unknown model {name!r}; registered models: {available}"
            ) from None
        return model_cls(*args, **kwargs)
+
+
@ModelFactory("MLP")
class MLP(torch.nn.Module):
    """
    Basic MLP with an activation function in-between layers.

    The hidden layer is four times as wide as the input vector.
    """

    def __init__(
        self,
        n_features: int,
        n_classes: int,
        activation_function: "Callable[[torch.Tensor], torch.Tensor] | None" = None,
    ) -> None:
        """Create the two linear layers and store the activation.

        Args:
            n_features: Size of the input feature vector.
            n_classes: Number of output logits.
            activation_function: Callable applied between the two layers.
                ``None`` (the default) selects a new ``torch.nn.ReLU()``.
        """
        super().__init__()
        hidden_width = n_features * 4
        self.fc1 = torch.nn.Linear(n_features, hidden_width)
        self.fc2 = torch.nn.Linear(hidden_width, n_classes)
        # Built per instance: a module created in the signature is evaluated
        # once at definition time and would be shared by every MLP.
        self.act = torch.nn.ReLU() if activation_function is None else activation_function

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the raw (un-normalised) class logits for ``x``."""
        x = self.fc1(x)
        x = self.act(x)
        x = self.fc2(x)
        return x
--- /dev/null
+import torch
+import numpy as np
+import matplotlib.pyplot as plt
+
def get_grad_dist(model: torch.nn.Module) -> list[float]:
    """Collect every gradient value currently stored on ``model``.

    Parameters whose ``.grad`` is ``None`` (for example before the first
    backward pass) are skipped.

    Args:
        model: Module whose parameter gradients are read.

    Returns:
        A flat list of gradient values, concatenated in
        ``model.parameters()`` order. Empty when no parameter has a gradient.
    """
    grads: list[float] = []
    for param in model.parameters():
        if param.grad is None:
            continue
        # flatten() also copes with non-contiguous gradients, where view(-1) raises.
        grads.extend(param.grad.detach().flatten().tolist())
    return grads
+
+
def plot_distribution(gradients: list[float], geometric_mean: bool = False) -> None:
    """Plot the probability density of gradient magnitudes on log-log axes.

    Absolute values are taken, zeros are discarded, and the remaining
    magnitudes are binned on 50 log-spaced edges (49 bins) between the
    smallest and largest value. Counts are divided by the total count and
    the bin width so the curve integrates to one. Nothing is plotted when
    there are no non-zero values or when all magnitudes are identical.

    Args:
        gradients: Flat collection of gradient values; signs are ignored.
        geometric_mean: Place each point at the geometric mean of its bin
            edges instead of the arithmetic mean.
    """
    grads_abs = np.abs(np.asarray(gradients, dtype=float))
    # Zeros cannot be placed on a logarithmic axis.
    grads_abs = grads_abs[grads_abs > 0]
    if grads_abs.size == 0:
        print("No non-zero gradients to plot.")
        return

    smallest, largest = grads_abs.min(), grads_abs.max()
    print(f"Min magnitude: {smallest}, Max: {largest}, Mean: {np.mean(grads_abs)}")
    if smallest == largest:
        # A single distinct magnitude gives zero-width bins (division by zero).
        print("All gradient magnitudes are identical; skipping the histogram.")
        return

    # geomspace pins the first and last edge to the exact extremes, whereas
    # logspace(log10(min), log10(max)) can round them past the data and make
    # np.histogram silently drop the smallest or largest sample.
    bin_edges = np.geomspace(smallest, largest, 50)
    counts, _ = np.histogram(grads_abs, bins=bin_edges)
    lower, upper = bin_edges[:-1], bin_edges[1:]
    if geometric_mean:
        bin_centers = np.sqrt(lower * upper)
    else:
        # Linear mean
        bin_centers = (lower + upper) / 2
    bin_widths = np.diff(bin_edges)
    probability_density = counts / (np.sum(counts) * bin_widths)

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.loglog(bin_centers, probability_density, linewidth=3)
    ax.set_xlabel('Gradient Magnitude', fontsize=12)
    ax.set_ylabel('Probability density', fontsize=12)
    ax.set_title('Distribution of Gradient Magnitudes', fontsize=14)
    ax.grid(True, which='both', alpha=0.3)
    plt.tight_layout()
    plt.show()
+
+
+
+