Source code for holovec.models.bsdc

"""BSDC (Binary Sparse Distributed Codes) VSA model.

BSDC uses sparse binary vectors where only a small fraction of bits are set to 1.
This makes them memory-efficient and biologically plausible, while maintaining
the key properties of hyperdimensional computing.

Properties:
- Binding: XOR (self-inverse)
- Bundling: Majority voting with sparsity preservation
- Sparsity: Typically p = 1/√D (optimal for capacity)
- Memory efficient: Can use sparse data structures
- Biologically plausible: Similar to sparse neural codes
- Self-inverse binding

Key Advantages:
- Very memory efficient for high dimensions (D > 10000)
- Biological plausibility (cortical neurons ~1% active)
- Fast operations on sparse representations
- Good capacity with much lower memory footprint

Optimal Sparsity:
- p = 1/√D maximizes capacity (from information theory)
- For D=10,000: p ≈ 0.01 (1% of bits are 1)
- For D=100,000: p ≈ 0.003 (0.3% of bits are 1)

References:
- Kanerva (1988): "Sparse Distributed Memory" (foundational work)
- Rachkovskij & Kussul (2001): "Binding and normalization of binary sparse codes"
- Kleyko et al. (2023): HDC/VSA Survey (BSDC comparison)
"""

from __future__ import annotations

from typing import Optional, Sequence

import numpy as np

from ..backends import Backend
from ..backends.base import Array
from ..spaces import SparseSpace, VectorSpace
from .base import VSAModel


[docs] class BSDCModel(VSAModel): """BSDC (Binary Sparse Distributed Codes) model. Binding: XOR (element-wise, self-inverse) Unbinding: XOR (same as binding, self-inverse) Bundling: Majority voting with sparsity preservation Permutation: circular shift Uses SparseSpace with optimal sparsity p = 1/√D. """
[docs] def __init__( self, dimension: int = 10000, sparsity: Optional[float] = None, space: Optional[VectorSpace] = None, backend: Optional[Backend] = None, seed: Optional[int] = None ): """Initialize BSDC model. Args: dimension: Dimensionality of hypervectors (typically > 1000) sparsity: Fraction of 1s (default: 1/√D which is optimal) space: Vector space (defaults to SparseSpace with optimal sparsity) backend: Computational backend seed: Random seed for space """ if space is None: from ..backends import get_backend backend = backend if backend is not None else get_backend() space = SparseSpace(dimension, sparsity=sparsity, backend=backend, seed=seed) super().__init__(space, backend) # Store sparsity for easy access if isinstance(space, SparseSpace): self.sparsity = space.sparsity else: # Fallback if using non-sparse space import math self.sparsity = sparsity if sparsity is not None else 1.0 / math.sqrt(dimension)
@property def model_name(self) -> str: return "BSDC" @property def is_self_inverse(self) -> bool: return True # XOR is self-inverse @property def is_commutative(self) -> bool: return True # XOR is commutative @property def is_exact_inverse(self) -> bool: return True # XOR is exact inverse of itself
[docs] def bind(self, a: Array, b: Array) -> Array: """Bind using XOR. For sparse binary codes, XOR preserves sparsity on average. Expected sparsity of result: p(1-p) + (1-p)p = 2p(1-p) For optimal p = 1/√D, result sparsity ≈ 2/√D (slightly increased). Args: a: First hypervector b: Second hypervector Returns: Bound hypervector c = a XOR b """ result = self.backend.xor(a, b) # Optional: re-sparsify if needed to maintain sparsity # For now, let XOR naturally handle it return result
[docs] def unbind(self, a: Array, b: Array) -> Array: """Unbind using XOR (self-inverse). Since XOR is self-inverse: unbind(bind(a, b), b) = a Args: a: Bound hypervector (or first operand) b: Second operand Returns: Unbound hypervector (exact recovery) """ # XOR is self-inverse return self.bind(a, b)
[docs] def bundle(self, vectors: Sequence[Array], maintain_sparsity: bool = True) -> Array: """Bundle using majority voting. For sparse codes, bundling requires careful handling to maintain sparsity: 1. Sum all vectors element-wise 2. Apply threshold to get binary result 3. Optionally re-sparsify to maintain target sparsity Args: vectors: Sequence of hypervectors to bundle maintain_sparsity: If True, enforce target sparsity (default: True) Returns: Bundled hypervector Raises: ValueError: If vectors is empty """ if not vectors: raise ValueError("Cannot bundle empty sequence") vectors = list(vectors) # Sum all vectors (counts how many 1s at each position) sum_result = self.backend.sum(self.backend.stack(vectors, axis=0), axis=0) if maintain_sparsity: # Strategy: Take top-k positions with highest counts # where k ≈ sparsity * dimension sum_np = self.backend.to_numpy(sum_result) target_ones = int(self.sparsity * self.dimension) # Get indices of top-k values if target_ones > 0: # Use argpartition for efficiency (O(n) instead of O(n log n)) threshold_idx = max(0, len(sum_np) - target_ones) threshold = np.partition(sum_np, threshold_idx)[threshold_idx] # Set positions >= threshold to 1, rest to 0 result_np = (sum_np >= threshold).astype(np.int32) # If we have ties at the threshold, we might have slightly more # than target_ones. This is acceptable for maintaining sparsity. return self.backend.from_numpy(result_np) else: # No ones in result (edge case) return self.backend.zeros(self.dimension, dtype='int32') else: # Simple majority voting: threshold at N/2 threshold = len(vectors) / 2.0 result = self.backend.threshold(sum_result, threshold=threshold, above=1.0, below=0.0) return result.astype('int32')
[docs] def permute(self, vec: Array, k: int = 1) -> Array: """Permute using circular shift. Shifts vector elements by k positions. For sparse codes, this maintains sparsity perfectly. Args: vec: Hypervector to permute k: Number of positions to shift (default: 1) Returns: Permuted hypervector """ return self.backend.roll(vec, shift=k, axis=0)
[docs] def measure_sparsity(self, vec: Array) -> float: """Measure actual sparsity of a vector. Args: vec: Hypervector to measure Returns: Fraction of 1s in the vector """ vec_np = self.backend.to_numpy(vec) count_ones = np.sum(vec_np) return float(count_ones) / len(vec_np)
[docs] def rehash(self, vec: Array) -> Array: """Rehash vector to restore optimal sparsity. Useful after multiple operations that may have changed sparsity. Randomly selects positions to maintain target sparsity while preserving as much similarity as possible. Args: vec: Hypervector to rehash Returns: Rehashed hypervector with target sparsity """ vec_np = self.backend.to_numpy(vec) target_ones = int(self.sparsity * self.dimension) # Get current 1 positions current_ones = np.where(vec_np == 1)[0] current_count = len(current_ones) if current_count == target_ones: # Already at target sparsity return vec elif current_count > target_ones: # Too many 1s: randomly remove some keep_indices = np.random.choice( current_ones, size=target_ones, replace=False ) result = np.zeros_like(vec_np) result[keep_indices] = 1 else: # Too few 1s: randomly add some current_zeros = np.where(vec_np == 0)[0] add_count = target_ones - current_count add_indices = np.random.choice( current_zeros, size=add_count, replace=False ) result = vec_np.copy() result[add_indices] = 1 return self.backend.from_numpy(result.astype(np.int32))
[docs] def encode_sequence( self, items: Sequence[Array], use_ngrams: bool = False, n: int = 2 ) -> Array: """Encode sequence of items. Two strategies: 1. Position binding: item_i ⊗ ρⁱ(position) 2. N-grams: Bundle all n-grams in sequence Args: items: Sequence of hypervectors use_ngrams: If True, use n-gram encoding (default: False) n: N-gram size (default: 2 for bigrams) Returns: Sequence hypervector Raises: ValueError: If items is empty """ if not items: raise ValueError("Cannot encode empty sequence") items = list(items) if use_ngrams: # N-gram encoding if len(items) < n: # Sequence too short for n-grams, fall back to simple bundle return self.bundle(items) ngrams = [] for i in range(len(items) - n + 1): # Create n-gram by binding n consecutive items ngram = items[i] for j in range(1, n): ngram = self.bind(ngram, items[i + j]) ngrams.append(ngram) return self.bundle(ngrams) else: # Position binding encoding pos = self.random(seed=42) # Fixed position vector bound_items = [] for i, item in enumerate(items): permuted_pos = self.permute(pos, k=i) bound_items.append(self.bind(item, permuted_pos)) return self.bundle(bound_items)
def __repr__(self) -> str: return (f"BSDCModel(dimension={self.dimension}, " f"sparsity={self.sparsity:.4f}, " f"space={self.space.space_name}, " f"backend={self.backend.name})")
def optimal_sparsity(dimension: int) -> float: """Calculate optimal sparsity for given dimension. The optimal sparsity p = 1/√D maximizes the capacity of sparse distributed codes. Args: dimension: Dimensionality of hypervectors Returns: Optimal sparsity value Examples: >>> optimal_sparsity(10000) 0.01 >>> optimal_sparsity(100000) 0.00316... """ import math return 1.0 / math.sqrt(dimension) def expected_ones(dimension: int, sparsity: Optional[float] = None) -> int: """Calculate expected number of 1s for given dimension and sparsity. Args: dimension: Dimensionality of hypervectors sparsity: Sparsity level (default: optimal = 1/√D) Returns: Expected number of 1 bits Examples: >>> expected_ones(10000) 100 >>> expected_ones(10000, sparsity=0.05) 500 """ if sparsity is None: sparsity = optimal_sparsity(dimension) return int(dimension * sparsity) def compare_sparse_vs_dense( dimension: int = 10000, trials: int = 10 ) -> dict: """Compare BSDC (sparse) vs BSC (dense) performance. Compares memory usage, binding preservation, and capacity. Args: dimension: Dimensionality of hypervectors trials: Number of random trials Returns: Dictionary with comparison statistics """ from ..backends import get_backend from .bsc import BSCModel backend = get_backend('numpy') bsdc = BSDCModel(dimension=dimension, backend=backend, seed=42) bsc = BSCModel(dimension=dimension, backend=backend, seed=42) # Measure memory efficiency a_bsdc = bsdc.random(seed=1) a_bsc = bsc.random(seed=1) ones_bsdc = float(backend.sum(a_bsdc)) ones_bsc = float(backend.sum(a_bsc)) memory_ratio = ones_bsdc / ones_bsc # Measure binding quality bsdc_sims = [] bsc_sims = [] for i in range(trials): a_bsdc = bsdc.random(seed=i*2) b_bsdc = bsdc.random(seed=i*2+1) a_bsc = bsc.random(seed=i*2) b_bsc = bsc.random(seed=i*2+1) # Bind and unbind c_bsdc = bsdc.bind(a_bsdc, b_bsdc) recovered_bsdc = bsdc.unbind(c_bsdc, b_bsdc) c_bsc = bsc.bind(a_bsc, b_bsc) recovered_bsc = bsc.unbind(c_bsc, b_bsc) # Measure recovery sim_bsdc = bsdc.similarity(a_bsdc, recovered_bsdc) sim_bsc = bsc.similarity(a_bsc, recovered_bsc) bsdc_sims.append(sim_bsdc) bsc_sims.append(sim_bsc) return { 'memory_ratio': memory_ratio, # BSDC/BSC (should be ~0.01 for D=10000) 'bsdc_recovery_mean': float(np.mean(bsdc_sims)), 'bsc_recovery_mean': float(np.mean(bsc_sims)), 'bsdc_sparsity': bsdc.sparsity, 'bsdc_expected_ones': expected_ones(dimension), 'bsc_expected_ones': dimension // 2, # ~50% for dense binary }