The laft.metrics module provides comprehensive binary classification metrics commonly used in anomaly detection and out-of-distribution detection tasks.
BinaryMetrics
TypedDict containing binary classification metric results.
class BinaryMetrics(TypedDict, total=False):
auroc: float
auprc: float
accuracy: float
f1: float
fpr95: float
Area Under the Receiver Operating Characteristic curve (0-1, higher is better).
Area Under the Precision-Recall Curve (0-1, higher is better).
Classification accuracy at optimal or specified threshold (0-1, higher is better).
F1 score at optimal or specified threshold (0-1, higher is better).
False Positive Rate at 95% True Positive Rate (0-1, lower is better). Common metric for anomaly detection.
binary_auroc
Computes Area Under the Receiver Operating Characteristic curve.
def binary_auroc(input: Tensor, target: Tensor) -> float
Predicted scores or probabilities, shape [N]. Higher values indicate positive class.
Ground truth binary labels, shape [N]. Values should be 0 or 1.
AUROC score between 0 and 1. Higher is better; 0.5 indicates random performance.
Usage
import torch
from laft.metrics import binary_auroc
scores = torch.tensor([0.9, 0.8, 0.3, 0.2])
labels = torch.tensor([1, 1, 0, 0])
auroc = binary_auroc(scores, labels)
print(f"AUROC: {auroc:.3f}") # Perfect score: 1.000
binary_auprc
Computes Area Under the Precision-Recall Curve.
def binary_auprc(input: Tensor, target: Tensor) -> float
Predicted scores, shape [N].
Ground truth binary labels, shape [N].
AUPRC score between 0 and 1. Higher is better. Particularly useful for imbalanced datasets.
Usage
from laft.metrics import binary_auprc
scores = torch.tensor([0.9, 0.8, 0.3, 0.2])
labels = torch.tensor([1, 1, 0, 0])
auprc = binary_auprc(scores, labels)
print(f"AUPRC: {auprc:.3f}")
binary_f1_score
Computes F1 score at a specified or optimal threshold.
def binary_f1_score(
input: Tensor,
target: Tensor,
*,
threshold: float | Literal["auto"] = "auto",
) -> float
Predicted scores, shape [N].
Ground truth binary labels, shape [N].
threshold
float | Literal['auto']
default: "auto"
Classification threshold:
"auto": Automatically determines optimal threshold that maximizes F1
float: Use specified threshold value
F1 score between 0 and 1. Harmonic mean of precision and recall.
Usage
from laft.metrics import binary_f1_score
scores = torch.tensor([0.9, 0.8, 0.6, 0.3, 0.2])
labels = torch.tensor([1, 1, 0, 0, 0])
# Auto-compute optimal threshold
f1_auto = binary_f1_score(scores, labels)
print(f"F1 (auto): {f1_auto:.3f}")
# Use specific threshold
f1_fixed = binary_f1_score(scores, labels, threshold=0.5)
print(f"F1 (0.5): {f1_fixed:.3f}")
binary_accuracy
Computes classification accuracy at a specified or optimal threshold.
def binary_accuracy(
input: Tensor,
target: Tensor,
*,
threshold: float | Literal["auto"] = "auto",
) -> float
Predicted scores, shape [N].
Ground truth binary labels, shape [N].
threshold
float | Literal['auto']
default: "auto"
Classification threshold (same behavior as binary_f1_score).
Accuracy between 0 and 1. Fraction of correct predictions.
Usage
from laft.metrics import binary_accuracy
scores = torch.tensor([0.9, 0.8, 0.3, 0.2])
labels = torch.tensor([1, 1, 0, 0])
accuracy = binary_accuracy(scores, labels, threshold=0.5)
print(f"Accuracy: {accuracy:.1%}")
binary_fpr95
Computes False Positive Rate at 95% True Positive Rate. Common metric for anomaly detection.
def binary_fpr95(input: Tensor, target: Tensor) -> float
Predicted anomaly scores, shape [N]. Higher values indicate anomalies.
Ground truth labels, shape [N]. 1 for anomaly, 0 for normal.
FPR@95 between 0 and 1. Lower is better. Measures false alarm rate when detecting 95% of anomalies.
Usage
from laft.metrics import binary_fpr95
# Anomaly detection scores (higher = more anomalous)
anomaly_scores = torch.tensor([0.9, 0.85, 0.7, 0.3, 0.2, 0.1])
labels = torch.tensor([1, 1, 1, 0, 0, 0])  # 1 = anomaly
fpr95 = binary_fpr95(anomaly_scores, labels)
print(f"FPR@95: {fpr95:.1%}") # Low is good
optimal_threshold
Computes the optimal classification threshold that maximizes F1 score.
def optimal_threshold(input: Tensor, target: Tensor) -> float
Predicted scores, shape [N].
Ground truth binary labels, shape [N].
Optimal threshold value that maximizes F1 score.
Usage
from laft.metrics import optimal_threshold
scores = torch.tensor([0.9, 0.8, 0.6, 0.3, 0.2])
labels = torch.tensor([1, 1, 0, 0, 0])
threshold = optimal_threshold(scores, labels)
print(f"Optimal threshold: {threshold:.3f}")
# Use this threshold for predictions
predictions = (scores >= threshold).float()
binary_metrics
Computes multiple binary classification metrics at once.
def binary_metrics(
input: Tensor | Sequence[Tensor],
target: Tensor,
*,
threshold: float | Literal["auto"] = "auto",
types: Sequence[str] = ("auroc", "auprc", "fpr95"),
) -> BinaryMetrics | tuple[BinaryMetrics, BinaryMetrics]
input
Tensor | Sequence[Tensor]
Predicted scores:
- Single tensor: Returns a single BinaryMetrics dict
- Sequence of tensors: Computes metrics for each tensor and returns their mean and standard deviation
Ground truth binary labels, shape [N].
threshold
float | Literal['auto']
default: "auto"
Threshold for accuracy and F1 metrics (if included in types).
types
Sequence[str]
default: ("auroc", "auprc", "fpr95")
Metrics to compute. Available: "auroc", "auprc", "accuracy", "f1", "fpr95".
metrics
BinaryMetrics | tuple[BinaryMetrics, BinaryMetrics]
- Single input: Returns a single BinaryMetrics dict
- Multiple inputs: Returns a (mean_metrics, std_metrics) tuple of BinaryMetrics dicts
Usage
Single evaluation:
from laft.metrics import binary_metrics
scores = torch.tensor([0.9, 0.8, 0.3, 0.2])
labels = torch.tensor([1, 1, 0, 0])
# Compute default metrics
metrics = binary_metrics(scores, labels)
print(f"AUROC: {metrics['auroc']:.3f}")
print(f"AUPRC: {metrics['auprc']:.3f}")
print(f"FPR95: {metrics['fpr95']:.3f}")
# Compute all metrics including F1 and accuracy
all_metrics = binary_metrics(
scores, labels,
types=("auroc", "auprc", "f1", "accuracy", "fpr95")
)
print(f"F1: {all_metrics['f1']:.3f}")
print(f"Accuracy: {all_metrics['accuracy']:.3f}")
Multiple runs with mean and std:
# Evaluate across multiple runs/folds
run_scores = [
torch.tensor([0.9, 0.8, 0.3, 0.2]),
torch.tensor([0.85, 0.75, 0.4, 0.15]),
torch.tensor([0.95, 0.9, 0.2, 0.1]),
]
labels = torch.tensor([1, 1, 0, 0])
mean_metrics, std_metrics = binary_metrics(run_scores, labels)
print(f"AUROC: {mean_metrics['auroc']:.3f} ± {std_metrics['auroc']:.3f}")
print(f"AUPRC: {mean_metrics['auprc']:.3f} ± {std_metrics['auprc']:.3f}")
mean_std
Computes mean and standard deviation of a sequence of values.
def mean_std(values: Sequence[float]) -> tuple[float, float]
List of numerical values.
Tuple of (mean, standard deviation) of the values.
Usage
from laft.metrics import mean_std
auroc_scores = [0.95, 0.93, 0.96, 0.94]
mean, std = mean_std(auroc_scores)
print(f"AUROC: {mean:.3f} ± {std:.3f}")
metric_mean_std
Computes mean and standard deviation across multiple metric dictionaries.
def metric_mean_std(
metrics: Sequence[BinaryMetrics]
) -> tuple[BinaryMetrics, BinaryMetrics]
List of metric dictionaries from multiple runs.
Dictionary with mean value for each metric.
Dictionary with standard deviation for each metric.
Usage
from laft.metrics import binary_metrics, metric_mean_std
import torch
# Run experiment multiple times
results = []
for seed in range(5):
torch.manual_seed(seed)
scores = torch.randn(100)
labels = (scores > 0).float()
results.append(binary_metrics(scores, labels))
# Aggregate results
mean_metrics, std_metrics = metric_mean_std(results)
print("Results across 5 runs:")
for key in mean_metrics:
print(f"{key}: {mean_metrics[key]:.3f} ± {std_metrics[key]:.3f}")
Complete Example
Evaluating an anomaly detection system:
import torch
from laft.metrics import binary_metrics
from laft import knn
# Simulate anomaly detection with k-NN
torch.manual_seed(42)
# Normal training data
train_features = torch.randn(1000, 512)
# Test data: normal + anomalies
test_normal = torch.randn(100, 512)
test_anomaly = torch.randn(50, 512) * 3 # Anomalies are further from normal
test_features = torch.cat([test_normal, test_anomaly])
test_labels = torch.cat([
torch.zeros(100), # Normal
torch.ones(50), # Anomaly
])
# Compute anomaly scores
scores = knn(train_features, test_features, n_neighbors=30)
# Evaluate
metrics = binary_metrics(
scores,
test_labels,
types=("auroc", "auprc", "f1", "fpr95")
)
print("Anomaly Detection Results:")
print(f" AUROC: {metrics['auroc']:.3f}")
print(f" AUPRC: {metrics['auprc']:.3f}")
print(f" F1: {metrics['f1']:.3f}")
print(f" FPR@95: {metrics['fpr95']:.3f}")