"""Configuration and result dataclasses for the genetic feature selector.
This module holds the plain data containers used across ``evo_gafs``:
* :class:`GAConfig` — the full configuration of the genetic algorithm.
* :class:`EvolutionStats` — per-generation statistics collected during a run.
* :class:`SelectionResult` — the final outcome of a feature-selection run.
None of these classes depend on DEAP or scikit-learn, which keeps them cheap to
import and trivial to serialize.
"""
from __future__ import annotations
import json
import pickle
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any
import numpy as np
# Accepted values for ``GAConfig.mode``; checked by ``GAConfig.validate``.
VALID_MODES = ("single", "multiobjective")


@dataclass
class GAConfig:
    """Full configuration of the genetic algorithm.

    The configuration is validated on construction (``__post_init__`` calls
    :meth:`validate`). Attributes can still be reassigned afterwards; call
    :meth:`validate` again after mutating an instance.

    Parameters
    ----------
    population_size : int, default=50
        Number of individuals in the population. Larger populations explore the
        search space better at a higher computational cost. Typical: 30-100.
    n_generations : int, default=100
        Number of generations (iterations of the GA). Typical: 50-200.
    crossover_prob : float, default=0.8
        Probability of applying crossover between two individuals. Recommended
        range: ``[0.6, 0.9]``.
    mutation_prob : float, default=0.15
        Probability of applying the mutation operator to an individual.
        Recommended range: ``[0.05, 0.3]``.
    mutation_indpb : float or None, default=None
        Independent probability of flipping each bit when an individual is
        mutated. If ``None``, it is set to ``1 / n_features`` at fit time.
    tournament_size : int, default=3
        Tournament size for tournament selection (``mode='single'``). Larger
        values increase selective pressure. Typical: 2-7.
    mode : {'single', 'multiobjective'}, default='single'
        ``'single'`` uses a weighted scalar fitness; ``'multiobjective'`` uses
        NSGA-II and returns a Pareto front.
    alpha : float, default=0.8
        Weight of the performance metric in ``mode='single'``::

            fitness = alpha * cv_score + (1 - alpha) * compression_ratio

        ``alpha=1.0`` is a pure wrapper; lower values favour compression
        (useful for edge deployment).
    cv_folds : int, default=5
        Number of cross-validation folds used to evaluate fitness.
    min_features : int, default=1
        Minimum number of selected features. Individuals below this threshold
        are repaired/penalised.
    elite_size : int, default=2
        Number of best individuals carried over unchanged each generation
        (elitism, ``mode='single'`` only).
    random_seed : int or None, default=42
        Seed for reproducibility.
    n_jobs : int, default=1
        Parallelism passed to scikit-learn's cross-validation. Must not be
        ``0``.
    verbose : bool, default=True
        If ``True``, print a log line for some generations and a final summary.
    early_stopping_rounds : int or None, default=None
        If the best fitness does not improve for this many generations, stop.
        ``None`` disables early stopping (``mode='single'`` only). Must be
        ``>= 1`` when given.
    early_stopping_tol : float, default=1e-4
        Minimum improvement considered significant for early stopping. Must be
        non-negative.

    Raises
    ------
    ValueError
        If any value is outside its accepted range (see :meth:`validate`).
    """

    population_size: int = 50
    n_generations: int = 100
    crossover_prob: float = 0.8
    mutation_prob: float = 0.15
    mutation_indpb: float | None = None
    tournament_size: int = 3
    mode: str = "single"
    alpha: float = 0.8
    cv_folds: int = 5
    min_features: int = 1
    elite_size: int = 2
    random_seed: int | None = 42
    n_jobs: int = 1
    verbose: bool = True
    early_stopping_rounds: int | None = None
    early_stopping_tol: float = 1e-4

    def __post_init__(self) -> None:
        """Validate the configuration as soon as the dataclass is built."""
        self.validate()

    def validate(self) -> None:
        """Validate the configuration, raising :class:`ValueError` on error.

        Unlike ``assert`` statements, these checks are always enforced, even
        when Python runs with optimisations (``-O``).

        Raises
        ------
        ValueError
            On the first field found outside its accepted range.
        """
        if self.mode not in VALID_MODES:
            raise ValueError(f"mode must be one of {VALID_MODES}, got {self.mode!r}")
        # The ``not lo <= x <= hi`` form also rejects NaN, which fails every
        # comparison.
        if not 0.0 <= self.alpha <= 1.0:
            raise ValueError(f"alpha must be in [0, 1], got {self.alpha}")
        if not 0.0 < self.crossover_prob <= 1.0:
            raise ValueError(f"crossover_prob must be in (0, 1], got {self.crossover_prob}")
        if not 0.0 < self.mutation_prob <= 1.0:
            raise ValueError(f"mutation_prob must be in (0, 1], got {self.mutation_prob}")
        if self.mutation_indpb is not None and not 0.0 < self.mutation_indpb <= 1.0:
            raise ValueError(f"mutation_indpb must be in (0, 1], got {self.mutation_indpb}")
        if self.population_size < 2:
            raise ValueError(f"population_size must be >= 2, got {self.population_size}")
        if self.n_generations < 1:
            raise ValueError(f"n_generations must be >= 1, got {self.n_generations}")
        if self.tournament_size < 2:
            raise ValueError(f"tournament_size must be >= 2, got {self.tournament_size}")
        if self.cv_folds < 2:
            raise ValueError(f"cv_folds must be >= 2, got {self.cv_folds}")
        if self.min_features < 1:
            raise ValueError(f"min_features must be >= 1, got {self.min_features}")
        if self.elite_size < 0:
            raise ValueError(f"elite_size must be >= 0, got {self.elite_size}")
        # joblib (used by scikit-learn's cross-validation) rejects n_jobs == 0;
        # report it here instead of failing in the middle of a run.
        if self.n_jobs == 0:
            raise ValueError("n_jobs must be a non-zero integer (use -1 for all cores), got 0")
        if self.early_stopping_rounds is not None and self.early_stopping_rounds < 1:
            raise ValueError(
                f"early_stopping_rounds must be >= 1 or None, got {self.early_stopping_rounds}"
            )
        # Written as ``not x >= 0`` so that NaN is rejected as well.
        if not self.early_stopping_tol >= 0.0:
            raise ValueError(f"early_stopping_tol must be >= 0, got {self.early_stopping_tol}")

    def to_dict(self) -> dict[str, Any]:
        """Return the configuration as a plain dictionary keyed by field name."""
        return asdict(self)
@dataclass
class EvolutionStats:
    """Statistics for a single generation during evolution.

    Attributes
    ----------
    generation : int
        Generation number these statistics belong to.
    best_fitness : float
        Best fitness value recorded for the generation.
    mean_fitness : float
        Mean fitness value recorded for the generation.
    std_fitness : float
        Spread of the fitness values, rendered after ``±`` next to the mean
        (presumably the standard deviation — confirm with the producer).
    best_n_features : int
        Feature count rendered as ``BestFeats`` (presumably the number of
        features selected by the best individual — confirm with the producer).
    mean_n_features : float
        Mean feature count, rendered as ``MeanFeats``.
    elapsed_time : float
        Elapsed time, rendered with an ``s`` (seconds) suffix.
    """

    generation: int
    best_fitness: float
    mean_fitness: float
    std_fitness: float
    best_n_features: int
    mean_n_features: float
    elapsed_time: float

    def __repr__(self) -> str:
        """Return a compact one-line log entry for this generation.

        Defining ``__repr__`` in the class body keeps ``@dataclass`` from
        generating its default field-by-field representation.
        """
        return (
            f"Gen {self.generation:4d} | "
            f"BestFit={self.best_fitness:.4f} | "
            f"MeanFit={self.mean_fitness:.4f}±{self.std_fitness:.4f} | "
            f"BestFeats={self.best_n_features:3d} | "
            f"MeanFeats={self.mean_n_features:.1f} | "
            f"Time={self.elapsed_time:.2f}s"
        )
@dataclass
class SelectionResult:
    """Final outcome of a GA feature-selection run.

    Attributes
    ----------
    selected_mask : numpy.ndarray
        Boolean vector of length ``n_features``. ``True`` marks a selected
        feature.
    selected_indices : numpy.ndarray
        Indices of the selected features.
    selected_feature_names : list of str
        Names of the selected features.
    best_fitness : float
        Best fitness achieved.
    best_cv_score : float
        Best cross-validation score (raw metric, unweighted).
    n_selected : int
        Number of selected features.
    compression_ratio : float
        Fraction of features removed (``1 - n_selected / n_total``).
    history : list of EvolutionStats
        Per-generation statistics.
    pareto_front : list of dict or None
        Only in ``mode='multiobjective'``. Each entry holds ``mask``,
        ``cv_score``, ``compression`` and ``n_features``.
    config : GAConfig or None
        Configuration used for the run.
    total_time : float
        Total wall-clock time in seconds.
    n_evaluations : int
        Total number of fitness evaluations performed.
    """

    selected_mask: np.ndarray
    selected_indices: np.ndarray
    selected_feature_names: list[str]
    best_fitness: float
    best_cv_score: float
    n_selected: int
    compression_ratio: float
    history: list[EvolutionStats] = field(default_factory=list)
    pareto_front: list[dict] | None = None
    config: GAConfig | None = None
    total_time: float = 0.0
    n_evaluations: int = 0

    def summary(self) -> str:
        """Return a human-readable multi-line summary of the result.

        The summary lists the headline numbers followed by one line per
        selected feature (index and name, paired positionally).
        """
        rule = "=" * 60
        lines = [
            rule,
            " RESULT - GA Feature Selection",
            rule,
            f" Original features : {len(self.selected_mask)}",
            f" Selected features : {self.n_selected}",
            f" Compression ratio : {self.compression_ratio:.1%}",
            f" Best CV score : {self.best_cv_score:.4f}",
            f" Best fitness : {self.best_fitness:.4f}",
            f" Total evaluations : {self.n_evaluations}",
            f" Total time : {self.total_time:.2f}s",
            "-" * 60,
            " Selected features:",
        ]
        # int() lets the ``d`` format accept NumPy integer scalars as well.
        lines.extend(
            f" [{int(i):3d}] {name}"
            for i, name in zip(self.selected_indices, self.selected_feature_names)
        )
        lines.append(rule)
        return "\n".join(lines)

    def to_json(self) -> dict[str, Any]:
        """Return a JSON-serialisable dictionary (e.g. for logging/MLflow).

        NumPy scalars are converted to built-in ``int``/``float`` so the result
        can be passed straight to :func:`json.dumps`. ``selected_mask``,
        ``history`` and ``pareto_front`` are not included; use :meth:`save` to
        persist the full object.
        """
        return {
            "selected_indices": [int(i) for i in self.selected_indices],
            "selected_feature_names": list(self.selected_feature_names),
            "best_fitness": float(self.best_fitness),
            "best_cv_score": float(self.best_cv_score),
            "n_selected": int(self.n_selected),
            "compression_ratio": float(self.compression_ratio),
            "total_time": float(self.total_time),
            "n_evaluations": int(self.n_evaluations),
            "config": self.config.to_dict() if self.config is not None else None,
        }

    def save_json(self, path: str | Path) -> None:
        """Write the JSON-serialisable summary to ``path`` (UTF-8, indented)."""
        Path(path).write_text(json.dumps(self.to_json(), indent=2), encoding="utf-8")

    def save(self, path: str | Path) -> None:
        """Pickle the full result object to ``path``."""
        Path(path).write_bytes(pickle.dumps(self))

    @classmethod
    def load(cls, path: str | Path) -> SelectionResult:
        """Load a pickled :class:`SelectionResult` from ``path``.

        .. warning::
            Unpickling can execute arbitrary code. Only load files that were
            written by :meth:`save` and come from a trusted source.

        Raises
        ------
        TypeError
            If the file does not contain an instance of this class.
        """
        # SECURITY: pickle is not safe against malicious data; ``path`` must be
        # trusted. Use ``save_json`` when exchanging results across trust
        # boundaries.
        loaded = pickle.loads(Path(path).read_bytes())
        if not isinstance(loaded, cls):
            raise TypeError(
                f"{path} does not contain a {cls.__name__} (got {type(loaded).__name__})"
            )
        return loaded