Source code for holovec.utils.operations

"""General utility operations for VSA systems.

This module provides general-purpose operations for hypervector manipulation
and analysis, including top-k selection, noise injection, and similarity
matrix computation.

Key Features:
    - Top-k selection from scored collections
    - Controlled noise injection for robustness testing
    - Pairwise similarity matrix computation
    - Support for various VSA operations

References:
    Kanerva (2009): Hyperdimensional Computing
    Plate (2003): Holographic Reduced Representations
"""

from __future__ import annotations

from typing import Dict, List, Tuple

import numpy as np

from ..backends.base import Array
from ..models.base import VSAModel


def select_top_k(
    items: Dict[str, float],
    k: int = 5,
) -> List[Tuple[str, float]]:
    """Return the ``k`` highest-scoring entries of ``items``.

    Entries are ranked by score in descending order. The sort is stable, so
    entries with equal scores keep their dictionary insertion order.

    Args:
        items: Mapping from label to score.
        k: How many entries to return (default: 5).

    Returns:
        A list of ``(label, score)`` tuples, highest score first.

    Raises:
        TypeError: If ``items`` is not a dict or ``k`` is not an int.
        ValueError: If ``items`` is empty, ``k < 1``, or ``k`` is larger
            than the number of entries in ``items``.

    Examples:
        >>> scores = {'a': 0.95, 'b': 0.87, 'c': 0.92, 'd': 0.75}
        >>> select_top_k(scores, k=3)
        [('a', 0.95), ('c', 0.92), ('b', 0.87)]
        >>> [label for label, _ in select_top_k(scores, k=2)]
        ['a', 'c']
    """
    # Reject wrong argument types first so later checks can rely on them.
    if not isinstance(items, dict):
        raise TypeError(f"items must be dict, got {type(items)}")
    if not isinstance(k, int):
        raise TypeError(f"k must be int, got {type(k)}")

    # Range checks: an empty mapping is reported before any problem with k.
    if len(items) == 0:
        raise ValueError("items must not be empty")
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k}")
    if k > len(items):
        raise ValueError(f"k ({k}) cannot exceed items size ({len(items)})")

    # Rank a private copy of the (label, score) pairs, best score first.
    ranked = list(items.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:k]
def add_noise(
    vector: Array,
    model: VSAModel,
    noise_level: float = 0.1,
    seed: int = None,
) -> Array:
    """Blend a hypervector with a freshly generated random hypervector.

    The random vector comes from ``model.random(seed=seed)`` and is mixed
    with ``vector`` through ``model.bundle``. Mixing is done by repeating
    the original vector in the bundle, so ``noise_level`` selects one of a
    few discrete regimes rather than a continuous weight:

    ==================  ==========================================
    ``noise_level``     result
    ==================  ==========================================
    ``0.0``             ``vector`` returned unchanged
    ``(0.0, 0.3)``      bundle of 3 copies of ``vector`` + 1 noise
    ``[0.3, 0.5)``      bundle of 2 copies of ``vector`` + 1 noise
    ``[0.5, 1.0)``      bundle of 1 copy of ``vector`` + 1 noise
    ``1.0``             the random vector alone
    ==================  ==========================================

    Args:
        vector: Original hypervector.
        model: VSA model used to draw the random vector and to bundle.
        noise_level: Requested amount of noise in ``[0.0, 1.0]``
            (default: 0.1).
        seed: Seed forwarded to ``model.random`` (default: None).

    Returns:
        The noisy hypervector (or ``vector`` itself when ``noise_level``
        is ``0.0``).

    Raises:
        TypeError: If ``model`` is not a ``VSAModel``, ``noise_level`` is
            not an int/float, or ``seed`` is neither an int nor None.
        ValueError: If ``noise_level`` lies outside ``[0.0, 1.0]``.

    Examples:
        >>> noisy = add_noise(original, model, noise_level=0.1)
        >>> sim = model.similarity(original, noisy)
        >>> print(f"Similarity after noise: {sim:.3f}")
        >>>
        >>> # Same seed gives the same noise vector
        >>> noisy1 = add_noise(original, model, noise_level=0.2, seed=42)
        >>> noisy2 = add_noise(original, model, noise_level=0.2, seed=42)

    References:
        Robustness testing in Kanerva (2009) and related work
    """
    # Argument type checks.
    if not isinstance(model, VSAModel):
        raise TypeError(f"model must be VSAModel, got {type(model)}")
    if not isinstance(noise_level, (int, float)):
        raise TypeError(f"noise_level must be numeric, got {type(noise_level)}")
    if not (seed is None or isinstance(seed, int)):
        raise TypeError(f"seed must be int or None, got {type(seed)}")

    # Chained comparison on purpose: it is False for NaN, so NaN is rejected.
    if not (0.0 <= noise_level <= 1.0):
        raise ValueError(f"noise_level must be in [0.0, 1.0], got {noise_level}")

    # The random vector is drawn before the shortcuts below, so the model's
    # generator is invoked on every call regardless of noise_level.
    random_vector = model.random(seed=seed)

    # Degenerate levels need no bundling at all.
    if noise_level == 0.0:
        return vector
    if noise_level == 1.0:
        return random_vector

    # Bundling cannot weight its inputs continuously, so the signal-to-noise
    # ratio is set by how many copies of the original enter the bundle.
    if noise_level >= 0.5:
        signal_copies = 1
    elif noise_level < 0.3:
        signal_copies = 3
    else:
        signal_copies = 2

    return model.bundle([vector] * signal_copies + [random_vector])
def similarity_matrix(
    vectors: List[Array],
    model: VSAModel,
    labels: List[str] = None,
) -> np.ndarray:
    """Build the matrix of similarities between every pair of vectors.

    Entry ``(i, j)`` of the result is
    ``float(model.similarity(vectors[i], vectors[j]))``. Every ordered pair
    is evaluated, including the diagonal, so ``model.similarity`` is called
    ``n * n`` times for ``n`` vectors.

    Args:
        vectors: List of hypervectors.
        model: VSA model supplying the ``similarity`` function.
        labels: Optional labels, one per vector. They are only validated
            (type and length); they do not affect the computed matrix.

    Returns:
        An ``n x n`` float numpy array of pairwise similarities.

    Raises:
        TypeError: If ``vectors`` is not a list, ``model`` is not a
            ``VSAModel``, or ``labels`` is neither a list nor None.
        ValueError: If ``vectors`` is empty or ``labels`` has a different
            length than ``vectors``.

    Examples:
        >>> vectors = [model.random(seed=i) for i in range(5)]
        >>> sim_matrix = similarity_matrix(vectors, model)
        >>> sim_matrix.shape
        (5, 5)
        >>>
        >>> # Most similar pair, ignoring self-similarity
        >>> labels = ['cat', 'dog', 'bird', 'fish', 'snake']
        >>> sim_matrix = similarity_matrix(vectors, model, labels)
        >>> np.fill_diagonal(sim_matrix, -np.inf)
        >>> i, j = np.unravel_index(np.argmax(sim_matrix), sim_matrix.shape)
        >>> print(f"Most similar: {labels[i]} - {labels[j]}")

    References:
        Standard analysis tool for VSA systems
    """
    # Argument type checks.
    if not isinstance(vectors, list):
        raise TypeError(f"vectors must be list, got {type(vectors)}")
    if not isinstance(model, VSAModel):
        raise TypeError(f"model must be VSAModel, got {type(model)}")
    if not (labels is None or isinstance(labels, list)):
        raise TypeError(f"labels must be list or None, got {type(labels)}")

    # Argument value checks.
    if not vectors:
        raise ValueError("vectors must not be empty")
    if labels is not None and len(labels) != len(vectors):
        raise ValueError(
            f"labels length ({len(labels)}) must match vectors length ({len(vectors)})"
        )

    # Row-major evaluation: row i holds the similarity of vectors[i]
    # against each vector in turn.
    rows = [
        [float(model.similarity(left, right)) for right in vectors]
        for left in vectors
    ]
    return np.array(rows, dtype=float)