Evaluator

High-level evaluation coordinator bound to a trainer.

This class owns an EncoderEvaluator and a DecoderEvaluator and exposes convenience methods that pass the bound trainer through to them. It can also hold a Visualizer instance for evaluation-related plots.

Subclasses can override evaluation or plotting methods to customize caching, metrics, or visualization behavior.

Source code in gradiend/evaluator/evaluator.py
def __init__(
    self,
    trainer: Any,
    encoder_evaluator: Optional[EncoderEvaluator] = None,
    decoder_evaluator: Optional[DecoderEvaluator] = None,
    visualizer_class: Optional[Type] = None,
):
    self._trainer = trainer
    self._encoder_evaluator = encoder_evaluator or EncoderEvaluator()
    self._decoder_evaluator = decoder_evaluator or DecoderEvaluator()
    self._visualizer = None
    self._visualizer_class = visualizer_class if visualizer_class is not None else _default_visualizer_class()
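
A minimal construction sketch (the import path mirrors the "Source code in gradiend/evaluator/evaluator.py" note above; the trainer instance and the availability of EncoderEvaluator/DecoderEvaluator in that module are assumptions):

from gradiend.evaluator.evaluator import Evaluator, EncoderEvaluator, DecoderEvaluator

# `trainer` is assumed to be an existing trainer instance from this project.
evaluator = Evaluator(trainer)  # default EncoderEvaluator, DecoderEvaluator, and visualizer class

# Components can also be injected explicitly:
evaluator = Evaluator(
    trainer,
    encoder_evaluator=EncoderEvaluator(),
    decoder_evaluator=DecoderEvaluator(),
    visualizer_class=None,  # None falls back to _default_visualizer_class()
)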

_decoder_evaluator instance-attribute

_decoder_evaluator = decoder_evaluator or DecoderEvaluator()

_encoder_evaluator instance-attribute

_encoder_evaluator = encoder_evaluator or EncoderEvaluator()

_trainer instance-attribute

_trainer = trainer

_visualizer instance-attribute

_visualizer = None

_visualizer_class instance-attribute

_visualizer_class = visualizer_class if visualizer_class is not None else _default_visualizer_class()

trainer property

trainer

_delegate_to_visualizer

_delegate_to_visualizer(method_name, **kwargs)
Source code in gradiend/evaluator/evaluator.py
def _delegate_to_visualizer(self, method_name: str, **kwargs: Any) -> Any:
    viz = self._get_visualizer()
    if viz is not None and hasattr(viz, method_name):
        return getattr(viz, method_name)(**kwargs)
    raise NotImplementedError(
        f"{method_name} requires a Visualizer; set visualizer_class on Evaluator or override this method."
    )
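
When no Visualizer is configured, the plotting helpers raise NotImplementedError; as the message above suggests, a subclass can override a plotting method instead. A hedged sketch (the matplotlib drawing and the shape of mean_by_class are illustrative assumptions, not part of this API):

import matplotlib.pyplot as plt

from gradiend.evaluator.evaluator import Evaluator

class CustomEvaluator(Evaluator):
    def plot_encoder_distributions(self, **kwargs):
        # Plot without a Visualizer: reuse the encoder metrics directly.
        result = self.evaluate_encoder(**kwargs)
        means = result["mean_by_class"]  # assumed: class name -> mean encoded value
        fig, ax = plt.subplots()
        ax.bar(list(means.keys()), list(means.values()))
        ax.set_ylabel("mean encoded value")
        return fig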

_get_visualizer

_get_visualizer()
Source code in gradiend/evaluator/evaluator.py
def _get_visualizer(self):
    if self._visualizer is not None:
        return self._visualizer
    if self._visualizer_class is None:
        return None
    self._visualizer = self._visualizer_class(self._trainer)
    return self._visualizer

evaluate

evaluate(*, kwargs_encoder=None, kwargs_decoder=None, **kwargs)

Run encoder and decoder evaluation and return a combined result.

Parameters:

Name Type Description Default
kwargs_encoder dict

Optional dict of keyword arguments forwarded to evaluate_encoder.

None
kwargs_decoder dict

Optional dict of keyword arguments forwarded to evaluate_decoder.

None
**kwargs Any

Extra kwargs applied to both encoder and decoder evaluations (e.g., shared eval data settings).

{}

Returns:

Type Description
Dict[str, Any]

Dict with:
  • encoder: Result dict from evaluate_encoder.
  • decoder: Result dict from evaluate_decoder.
Source code in gradiend/evaluator/evaluator.py
def evaluate(self, *, kwargs_encoder: dict = None, kwargs_decoder: dict = None, **kwargs: Any) -> Dict[str, Any]:
    """
    Run encoder and decoder evaluation and return a combined result.

    Args:
        kwargs_encoder: Optional dict of keyword arguments forwarded to
            evaluate_encoder.
        kwargs_decoder: Optional dict of keyword arguments forwarded to
            evaluate_decoder.
        **kwargs: Extra kwargs applied to both encoder and decoder
            evaluations (e.g., shared eval data settings).

    Returns:
        Dict with:
        - encoder: Result dict from evaluate_encoder.
        - decoder: Result dict from evaluate_decoder.
    """
    kwargs_encoder = kwargs_encoder or {}
    kwargs_decoder = kwargs_decoder or {}

    # Pass through any additional kwargs to both evaluators (e.g. for create_eval_data).
    kwargs_encoder.update(kwargs)
    kwargs_decoder.update(kwargs)

    enc = self.evaluate_encoder(**kwargs_encoder)
    dec = self.evaluate_decoder(**kwargs_decoder)

    return {"encoder": enc, "decoder": dec}
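
A usage sketch, assuming an evaluator built as above (keyword names come from the signatures documented on this page; use_cache is accepted by both evaluators, so it works as a shared argument):

results = evaluator.evaluate(
    kwargs_encoder={"split": "test"},   # forwarded only to evaluate_encoder
    kwargs_decoder={"plot": False},     # forwarded only to evaluate_decoder
    use_cache=True,                     # shared kwarg, applied to both evaluations
)
encoder_metrics = results["encoder"]
decoder_summary = results["decoder"]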

evaluate_decoder

evaluate_decoder(model_with_gradiend=None, feature_factors=None, lrs=None, use_cache=None, max_size_training_like=None, max_size_neutral=None, eval_batch_size=None, training_like_df=None, neutral_df=None, selector=None, summary_extractor=None, summary_metrics=None, target_class=None, increase_target_probabilities=True, plot=False, show=None)

Run decoder grid evaluation and return summary + grid for one direction (strengthen or weaken).

Only the dataset and feature-factor combinations required for the requested direction are computed. Use increase_target_probabilities=True (default) for strengthen, False for weaken.

Parameters:

Name Type Description Default
model_with_gradiend Any

Optional ModelWithGradiend (or path) to evaluate. If None, the trainer's model is used.

None
feature_factors Optional[list]

Optional list of feature factors to test. If None, derived from direction and target classes.

None
lrs Optional[list]

Optional list of learning rates to test. If None, defaults are used.

None
use_cache Optional[bool]

If True, cached decoder grid results are reused when available under the trainer's experiment_dir. If None, defaults come from trainer training args.

None
max_size_training_like Optional[int]

Maximum size for generated training-like eval data.

None
max_size_neutral Optional[int]

Maximum size for generated neutral eval data (and LMS text cap).

None
eval_batch_size Optional[int]

Common eval batch size used for LMS.

None
training_like_df Optional[Any]

Optional explicit training-like DataFrame.

None
neutral_df Optional[Any]

Optional explicit neutral DataFrame.

None
selector Optional[Any]

Optional SelectionPolicy for choosing best candidate per metric (e.g. LMSThresholdPolicy).

None
summary_extractor Optional[Any]

Optional callable(results) -> (candidates, ctx). Use to add derived metrics (e.g. bpi, fpi, mpi) to candidates; then pass summary_metrics.

None
summary_metrics Optional[Any]

Optional list of metric names to summarize (e.g. ["bpi", "fpi", "mpi"]).

None
target_class Optional[Any]

If set (str or list of str), evaluate only for this target class (or classes). Restricts feature factors and datasets for efficiency. When None, evaluates for all target classes.

None
increase_target_probabilities bool

If True (default), compute strengthen summaries only (keys e.g. "3SG"). If False, compute weaken summaries only (keys e.g. "3SG_weaken"). Only required combinations are evaluated.

True
plot bool

If True, after selection run any missing dataset evaluations for plotting, update cache, then plot.

False
show Optional[bool]

If True, display the plot; if False, only save. When None and plot=True, defaults to True.

None

Returns:

Type Description
Dict[str, Any]

Flat dict: for strengthen, keys like result['3SG']; for weaken, keys like result['3SG_weaken'].
Each entry has value, feature_factor, learning_rate, id, strengthen, lms, base_lms. Plus 'grid'.
When plot=True, also 'plot_paths' and 'plot_path'.

Source code in gradiend/evaluator/evaluator.py
def evaluate_decoder(
    self,
    model_with_gradiend: Any = None,
    feature_factors: Optional[list] = None,
    lrs: Optional[list] = None,
    use_cache: Optional[bool] = None,
    max_size_training_like: Optional[int] = None,
    max_size_neutral: Optional[int] = None,
    eval_batch_size: Optional[int] = None,
    training_like_df: Optional[Any] = None,
    neutral_df: Optional[Any] = None,
    selector: Optional[Any] = None,
    summary_extractor: Optional[Any] = None,
    summary_metrics: Optional[Any] = None,
    target_class: Optional[Any] = None,
    increase_target_probabilities: bool = True,
    plot: bool = False,
    show: Optional[bool] = None,
) -> Dict[str, Any]:
    """
    Run decoder grid evaluation and return summary + grid for one direction (strengthen or weaken).

    Only the dataset and feature-factor combinations required for the requested direction are computed.
    Use increase_target_probabilities=True (default) for strengthen, False for weaken.

    Args:
        model_with_gradiend: Optional ModelWithGradiend (or path) to evaluate.
            If None, the trainer's model is used.
        feature_factors: Optional list of feature factors to test. If None,
            derived from direction and target classes.
        lrs: Optional list of learning rates to test. If None, defaults are used.
        use_cache: If True, cached decoder grid results are reused when
            available under the trainer's experiment_dir. If None, defaults
            come from trainer training args.
        max_size_training_like: Maximum size for generated training-like eval data.
        max_size_neutral: Maximum size for generated neutral eval data (and LMS text cap).
        eval_batch_size: Common eval batch size used for LMS.
        training_like_df: Optional explicit training-like DataFrame.
        neutral_df: Optional explicit neutral DataFrame.
        selector: Optional SelectionPolicy for choosing best candidate per metric (e.g. LMSThresholdPolicy).
        summary_extractor: Optional callable(results) -> (candidates, ctx). Use to add derived metrics
            (e.g. bpi, fpi, mpi) to candidates; then pass summary_metrics.
        summary_metrics: Optional list of metric names to summarize (e.g. ["bpi", "fpi", "mpi"]).
        target_class: If set (str or list of str), evaluate only for this target class (or classes).
            Restricts feature factors and datasets for efficiency. When None, evaluates for all target classes.
        increase_target_probabilities: If True (default), compute strengthen summaries only (keys e.g. "3SG").
            If False, compute weaken summaries only (keys e.g. "3SG_weaken"). Only required combinations are evaluated.
        plot: If True, after selection run any missing dataset evaluations for plotting, update cache, then plot.
        show: If True, display the plot; if False, only save. When None and plot=True, defaults to True.

    Returns:
        Flat dict: for strengthen, keys like result['3SG']; for weaken, keys like result['3SG_weaken'].
        Each entry has value, feature_factor, learning_rate, id, strengthen, lms, base_lms. Plus 'grid'.
        When plot=True, also 'plot_paths' and 'plot_path'.
    """
    kwargs = dict(
        trainer=self._trainer,
        model_with_gradiend=model_with_gradiend,
        feature_factors=feature_factors,
        lrs=lrs,
        use_cache=use_cache,
        max_size_training_like=max_size_training_like,
        max_size_neutral=max_size_neutral,
        eval_batch_size=eval_batch_size,
        training_like_df=training_like_df,
        neutral_df=neutral_df,
        summary_metrics=summary_metrics,
        target_class=target_class,
        increase_target_probabilities=increase_target_probabilities,
        plot=plot,
        show=show if show is not None else plot,
    )
    if selector is not None:
        kwargs["selector"] = selector
    if summary_extractor is not None:
        kwargs["summary_extractor"] = summary_extractor
    return self._decoder_evaluator.evaluate_decoder(**kwargs)
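
A hedged usage sketch (the "3SG" class name comes from the key examples above; the learning rates are illustrative placeholders):

# Strengthen direction for a single target class; only the required
# feature-factor/dataset combinations are evaluated.
result = evaluator.evaluate_decoder(
    target_class="3SG",
    increase_target_probabilities=True,
    lrs=[1e-5, 1e-4],   # illustrative; defaults are used when None
    use_cache=True,
)
best = result["3SG"]    # value, feature_factor, learning_rate, id, strengthen, lms, base_lms
grid = result["grid"]

# Weaken direction: summary keys carry the "_weaken" suffix.
weaken = evaluator.evaluate_decoder(target_class="3SG", increase_target_probabilities=False)
weakened = weaken["3SG_weaken"]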

evaluate_encoder

evaluate_encoder(encoder_df=None, eval_data=None, use_cache=None, split=None, max_size=None, **kwargs)

Run encoder evaluation and return encoding/correlation metrics.

Parameters:

Name Type Description Default
encoder_df Optional[Union[Any, Dict[str, Any]]]

Optional DataFrame or dict with "encoder_df" key. If provided, skips encoding and computes metrics from this data. Use evaluate_encoder(return_df=True) to get such a dict.

None
eval_data Any

Optional pre-computed GradientTrainingDataset. If None and encoder_df is None, the trainer creates eval data via create_eval_data.

None
use_cache Optional[bool]

If True, reuse cached JSON result under experiment_dir when available. If None, defaults come from trainer training args.

None
split Optional[str]

Dataset split for eval data creation. Default: "test".

None
max_size Optional[int]

Maximum samples per variant for eval data creation.

None
**kwargs Any

Forwarded to create_eval_data when encoder_df and eval_data are None.

{}

Returns:

Type Description
Dict[str, Any]

Dict with keys: correlation, mean_by_class, mean_by_type, n_samples,
all_data, training_only, target_classes_only, boundaries; optionally
neutral_mean_by_type, mean_by_feature_class, label_value_to_class_name.

Source code in gradiend/evaluator/evaluator.py
def evaluate_encoder(
    self,
    encoder_df: Optional[Union[Any, Dict[str, Any]]] = None,
    eval_data: Any = None,
    use_cache: Optional[bool] = None,
    split: Optional[str] = None,
    max_size: Optional[int] = None,
    **kwargs: Any,
) -> Dict[str, Any]:
    """
    Run encoder evaluation and return encoding/correlation metrics.

    Args:
        encoder_df: Optional DataFrame or dict with "encoder_df" key. If provided,
            skips encoding and computes metrics from this data. Use
            evaluate_encoder(return_df=True) to get such a dict.
        eval_data: Optional pre-computed GradientTrainingDataset. If None and
            encoder_df is None, the trainer creates eval data via create_eval_data.
        use_cache: If True, reuse cached JSON result under experiment_dir when
            available. If None, defaults come from trainer training args.
        split: Dataset split for eval data creation. Default: "test".
        max_size: Maximum samples per variant for eval data creation.
        **kwargs: Forwarded to create_eval_data when encoder_df and eval_data are None.

    Returns:
        Dict with keys: correlation, mean_by_class, mean_by_type, n_samples,
        all_data, training_only, target_classes_only, boundaries; optionally
        neutral_mean_by_type, mean_by_feature_class, label_value_to_class_name.
    """
    resolved_df = encoder_df
    if isinstance(encoder_df, dict) and "encoder_df" in encoder_df:
        resolved_df = encoder_df["encoder_df"]
    return self._encoder_evaluator.evaluate_encoder(
        self._trainer,
        encoder_df=resolved_df,
        eval_data=eval_data,
        use_cache=use_cache,
        split=split,
        max_size=max_size,
        **kwargs,
    )
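
A usage sketch (return_df is taken from the encoder_df description above; the split and max_size values are illustrative):

# First pass: compute metrics and keep the underlying encoder DataFrame.
first = evaluator.evaluate_encoder(split="test", max_size=1000, return_df=True)

# Later passes can reuse the returned dict and skip re-encoding; a dict
# containing an "encoder_df" key is unwrapped automatically.
metrics = evaluator.evaluate_encoder(encoder_df=first, use_cache=False)
print(metrics["correlation"], metrics["n_samples"])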

plot_encoder_distributions

plot_encoder_distributions(**kwargs)

Plot encoder distributions (typically a violin plot).

Parameters:

Name Type Description Default
**kwargs Any

Forwarded to the Visualizer implementation.

{}

Returns:

Type Description
Any

Whatever the Visualizer returns (often a matplotlib/seaborn figure).

Raises:

Type Description
NotImplementedError

If no Visualizer is configured and this method is not overridden in a subclass.

Source code in gradiend/evaluator/evaluator.py
def plot_encoder_distributions(self, **kwargs: Any) -> Any:
    """
    Plot encoder distributions (typically a violin plot).

    Args:
        **kwargs: Forwarded to the Visualizer implementation.

    Returns:
        Whatever the Visualizer returns (often a matplotlib/seaborn figure).

    Raises:
        NotImplementedError: If no Visualizer is configured and this method
            is not overridden in a subclass.
    """
    return self._delegate_to_visualizer("plot_encoder_distributions", **kwargs)
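
A short sketch, assuming the Evaluator was constructed with a visualizer class (otherwise this raises NotImplementedError); MyVisualizer is a stand-in name, and the same pattern applies to the other plot_* helpers below:

evaluator = Evaluator(trainer, visualizer_class=MyVisualizer)
fig = evaluator.plot_encoder_distributions()  # all kwargs are forwarded to the Visualizer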

plot_encoder_scatter

plot_encoder_scatter(**kwargs)

Plot interactive encoder scatter (Plotly: jitter x, encoded y, colored by label).

Parameters:

Name Type Description Default
**kwargs Any

Forwarded to the Visualizer implementation (encoder_df, show, etc.).

{}

Returns:

Type Description
Any

Plotly Figure or None if Plotly is not installed.

Raises:

Type Description
NotImplementedError

If no Visualizer is configured.

Source code in gradiend/evaluator/evaluator.py
def plot_encoder_scatter(self, **kwargs: Any) -> Any:
    """
    Plot interactive encoder scatter (Plotly: jitter x, encoded y, colored by label).

    Args:
        **kwargs: Forwarded to the Visualizer implementation (encoder_df, show, etc.).

    Returns:
        Plotly Figure or None if Plotly is not installed.

    Raises:
        NotImplementedError: If no Visualizer is configured.
    """
    return self._delegate_to_visualizer("plot_encoder_scatter", **kwargs)

plot_probability_shifts

plot_probability_shifts(**kwargs)

Plot decoder probability shifts vs learning rate.

Parameters:

Name Type Description Default
**kwargs Any

Forwarded to the Visualizer implementation (decoder_results, class_ids, use_cache, etc.).

{}

Returns:

Type Description
Any

Path to saved plot file or empty string.

Raises:

Type Description
NotImplementedError

If no Visualizer is configured.

Source code in gradiend/evaluator/evaluator.py
def plot_probability_shifts(self, **kwargs: Any) -> Any:
    """
    Plot decoder probability shifts vs learning rate.

    Args:
        **kwargs: Forwarded to the Visualizer implementation (decoder_results, class_ids, use_cache, etc.).

    Returns:
        Path to saved plot file or empty string.

    Raises:
        NotImplementedError: If no Visualizer is configured.
    """
    return self._delegate_to_visualizer("plot_probability_shifts", **kwargs)

plot_training_convergence

plot_training_convergence(**kwargs)

Plot training convergence (means by class/feature_class and correlation).

Parameters:

Name Type Description Default
**kwargs Any

Forwarded to the Visualizer implementation.

{}

Returns:

Type Description
Any

Whatever the Visualizer returns (often a matplotlib/seaborn figure).

Raises:

Type Description
NotImplementedError

If no Visualizer is configured and this method is not overridden in a subclass.

Source code in gradiend/evaluator/evaluator.py
def plot_training_convergence(self, **kwargs: Any) -> Any:
    """
    Plot training convergence (means by class/feature_class and correlation).

    Args:
        **kwargs: Forwarded to the Visualizer implementation.

    Returns:
        Whatever the Visualizer returns (often a matplotlib/seaborn figure).

    Raises:
        NotImplementedError: If no Visualizer is configured and this method
            is not overridden in a subclass.
    """
    return self._delegate_to_visualizer("plot_training_convergence", **kwargs)