Source code for torch_concepts.data.preprocessing.autoencoder

"""
Autoencoder preprocessing for dimensionality reduction.

This module provides autoencoder-based preprocessing to learn low-dimensional
representations of high-dimensional concept data.
"""
import torch.nn as nn
import torch
import torch.optim as optim
import logging
from torch.utils.data import DataLoader
from tqdm import tqdm

# Module-level logger named after this module; training progress messages
# are emitted through it.
logger = logging.getLogger(__name__)


class SimpleAutoencoder(nn.Module):
    """
    Simple feedforward autoencoder for dimensionality reduction.

    A standard autoencoder with encoder and decoder networks built from
    linear layers with ReLU / LeakyReLU activations. Useful for
    preprocessing high-dimensional concept spaces.

    The encoder starts with ``nn.Flatten``, so every non-batch dimension of
    the input is flattened before the first linear layer.

    Attributes:
        encoder (nn.Sequential): Encoder network.
        decoder (nn.Sequential): Decoder network.

    Args:
        input_shape: Number of (flattened) input features.
        latent_dim: Dimension of the latent space.

    Example:
        >>> import torch
        >>> from torch_concepts.data.preprocessing.autoencoder import SimpleAutoencoder
        >>>
        >>> # Create autoencoder
        >>> autoencoder = SimpleAutoencoder(input_shape=784, latent_dim=32)
        >>>
        >>> # Forward pass
        >>> x = torch.randn(4, 784)
        >>> encoded, decoded = autoencoder(x)
        >>> print(f"Encoded shape: {encoded.shape}")
        Encoded shape: torch.Size([4, 32])
        >>> print(f"Decoded shape: {decoded.shape}")
        Decoded shape: torch.Size([4, 784])
    """

    def __init__(self, input_shape, latent_dim):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Flatten(),
            nn.Linear(input_shape, latent_dim),
            nn.ReLU(),
            nn.Linear(latent_dim, latent_dim),
            nn.LeakyReLU(0.1),
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, latent_dim),
            # nn.ReLU takes no slope argument (its only parameter is
            # `inplace`), so it is constructed without arguments. Passing a
            # number here would silently switch on in-place mode.
            nn.ReLU(),
            nn.Linear(latent_dim, input_shape),
        )

    def forward(self, x):
        """
        Forward pass through the autoencoder.

        Args:
            x: Input tensor of shape (batch_size, input_shape).

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: (encoded, decoded) where
                - encoded has shape (batch_size, latent_dim)
                - decoded has shape (batch_size, input_shape)
        """
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded
class AutoencoderTrainer:
    """
    Trainer class for autoencoder models with early stopping.

    Provides a training loop, early stopping on the training reconstruction
    loss, and latent representation extraction for autoencoder models.

    Attributes:
        model (SimpleAutoencoder): The autoencoder model.
        criterion (nn.MSELoss): Reconstruction loss function.
        optimizer (optim.Adam): Optimizer for training.
        device (str): Device to train on ('cpu' or 'cuda').
        best_model_wts (dict or None): Copy of the model weights at the
            epoch with the lowest training loss; ``None`` before training.
        is_fitted (bool): Whether ``train`` has completed at least once.

    Args:
        input_shape: Number of input features.
        noise: Noise level mixed into latent representations (default: 0.0,
            i.e. no noise).
        latent_dim: Dimension of latent space (default: 32).
        lr: Learning rate (default: 0.0005).
        epochs: Maximum training epochs (default: 2000).
        batch_size: Batch size for training (default: 512).
        patience: Early stopping patience in epochs (default: 50).
        device: Device to use for training. When ``None`` (default), 'cuda'
            is used if available, otherwise 'cpu'.

    Example:
        >>> import torch
        >>> from torch_concepts.data.preprocessing.autoencoder import AutoencoderTrainer
        >>>
        >>> # Create synthetic data
        >>> data = torch.randn(1000, 100)
        >>>
        >>> # Create and train autoencoder
        >>> trainer = AutoencoderTrainer(
        ...     input_shape=100,
        ...     latent_dim=16,
        ...     epochs=100,
        ...     batch_size=64,
        ...     device='cpu'
        ... )
        >>>
        >>> # Train
        >>> trainer.train(data)
        >>>
        >>> # Extract latent representations
        >>> latent = trainer.extract_latent()
        >>> print(latent.shape)
        torch.Size([1000, 16])
    """

    def __init__(
        self,
        input_shape: int,
        noise: float = 0.,
        latent_dim: int = 32,
        lr: float = 0.0005,
        epochs: int = 2000,
        batch_size: int = 512,
        patience: int = 50,
        device=None,
    ):
        self.noise_level = noise
        self.latent_dim = latent_dim
        # Misspelled attribute kept as an alias so existing code that reads
        # `latend_dim` keeps working.
        self.latend_dim = latent_dim
        self.lr = lr
        self.epochs = epochs
        self.batch_size = batch_size
        self.patience = patience
        if device is None:
            self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        else:
            self.device = device

        self.model = SimpleAutoencoder(input_shape, latent_dim)
        self.model.to(self.device)

        self.criterion = nn.MSELoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

        # Populated by train(); extract_latent() checks them.
        self.best_model_wts = None
        self.data_loader = None
        self.is_fitted = False

    def _snapshot_weights(self):
        """Return an independent copy of the model's current state dict.

        ``state_dict()`` hands back tensors that share storage with the live
        parameters, so they must be cloned to survive later optimizer steps.
        """
        return {
            name: tensor.detach().clone()
            for name, tensor in self.model.state_dict().items()
        }

    def train(self, dataset):
        """
        Train the autoencoder on the given dataset.

        Implements a training loop with MSE reconstruction loss and early
        stopping based on the training loss: training stops once the epoch
        loss has not improved for ``patience`` consecutive epochs. A copy of
        the weights from the best epoch is kept in ``best_model_wts``.

        Args:
            dataset: PyTorch dataset or tensor to train on. Each batch is
                moved to the device and fed to the model directly, so items
                are expected to be plain feature tensors.

        Raises:
            ValueError: If the dataset yields no batches.
        """
        self.data_loader = DataLoader(dataset, batch_size=self.batch_size)
        n_batches = len(self.data_loader)
        if n_batches == 0:
            raise ValueError('Cannot train the autoencoder on an empty dataset.')

        best_loss = float('inf')
        patience_counter = 0
        # Defined up front so the final log line works even if epochs == 0.
        train_loss = float('nan')
        epochs_run = 0

        logger.info('Autoencoder training started...')

        for epoch in tqdm(range(self.epochs)):
            epochs_run = epoch + 1
            self.model.train()
            train_loss = 0.0
            for data in self.data_loader:
                data = data.to(self.device)
                self.optimizer.zero_grad()
                _, outputs = self.model(data)
                loss = self.criterion(outputs, data)
                loss.backward()
                self.optimizer.step()
                train_loss += loss.item()
            train_loss /= n_batches

            if epoch % 300 == 0:
                logger.info(f'Epoch {epoch+1}/{self.epochs}, Train Loss: {train_loss:.4f}')

            if train_loss < best_loss:
                best_loss = train_loss
                patience_counter = 0
                # Store a copy, not the live state dict, so later updates do
                # not overwrite the best weights.
                self.best_model_wts = self._snapshot_weights()
            else:
                patience_counter += 1

            if patience_counter >= self.patience:
                logger.info('Early stopping')
                break

        if self.best_model_wts is None:
            # No epoch registered an improvement (e.g. epochs == 0); fall
            # back to the current weights so extraction remains possible.
            logger.warning('No improving epoch recorded; keeping current weights.')
            self.best_model_wts = self._snapshot_weights()

        logger.info(f'Epoch {epochs_run}/{self.epochs}, Final Train Loss: {train_loss:.4f}')
        self.is_fitted = True

    def extract_latent(self):
        """
        Extract latent representations from the trained autoencoder.

        Loads the best model weights (lowest training reconstruction loss)
        and encodes the dataset that was passed to ``train``, in the same
        order. When ``noise_level > 0`` the encoding is blended with
        standard Gaussian noise as
        ``(1 - noise_level) * encoded + noise_level * noise``.

        Returns:
            torch.Tensor: Latent representations of shape (n_samples, latent_dim).

        Raises:
            RuntimeError: If called before ``train``.

        Example:
            >>> # After training
            >>> latent = trainer.extract_latent()
            >>> print(latent.shape)
            torch.Size([1000, 16])
        """
        if self.best_model_wts is None or getattr(self, 'data_loader', None) is None:
            raise RuntimeError(
                'AutoencoderTrainer.train() must be called before extract_latent().'
            )

        # Generate the latent representations
        self.model.load_state_dict(self.best_model_wts)
        self.model.eval()
        latent = []
        with torch.no_grad():
            for data in self.data_loader:
                data = data.to(self.device)
                encoded, _ = self.model(data)
                if self.noise_level > 0:
                    encoded = (
                        (1 - self.noise_level) * encoded
                        + self.noise_level * torch.randn_like(encoded)
                    )
                latent.append(encoded)

        return torch.cat(latent, dim=0)
def extract_embs_from_autoencoder(
    df,
    autoencoder_kwargs=None
):
    """
    Extract embeddings from a pandas DataFrame using an autoencoder.

    Convenience function that trains an autoencoder on tabular data and
    returns the learned latent representations.

    Args:
        df: Input pandas DataFrame. Its values are converted to a float32
            tensor, one row per sample.
        autoencoder_kwargs: Optional dictionary of keyword arguments for
            AutoencoderTrainer (e.g. 'latent_dim', 'epochs', 'noise',
            'device'). ``None`` (default) means no extra arguments, in which
            case the trainer's own defaults apply.

    Returns:
        torch.Tensor: Latent representations of shape (n_samples, latent_dim).

    Example:
        >>> import pandas as pd
        >>> import torch
        >>> from torch_concepts.data.preprocessing.autoencoder import extract_embs_from_autoencoder
        >>>
        >>> # Create sample DataFrame
        >>> df = pd.DataFrame(torch.randn(100, 50).numpy())
        >>>
        >>> # Extract embeddings
        >>> embeddings = extract_embs_from_autoencoder(
        ...     df,
        ...     autoencoder_kwargs={
        ...         'latent_dim': 10,
        ...         'epochs': 50,
        ...         'batch_size': 32,
        ...         'noise': 0.1,
        ...         'device': 'cpu'  # or 'cuda' if desired
        ...     }
        ... )
        >>> print(embeddings.shape)
        torch.Size([100, 10])
    """
    # A None default avoids sharing one mutable dict across calls.
    if autoencoder_kwargs is None:
        autoencoder_kwargs = {}

    # Convert DataFrame to tensor
    data = torch.tensor(df.values, dtype=torch.float32)

    # Train autoencoder
    trainer = AutoencoderTrainer(
        input_shape=data.shape[1],
        **autoencoder_kwargs
    )

    # Train and get transformed dataset
    trainer.train(data)
    return trainer.extract_latent()