Source code for holovec.models.bsdc

"""BSDC (Binary Sparse Distributed Codes) VSA model.

BSDC uses sparse binary vectors where only a small fraction of bits are set to 1.
This makes them memory-efficient and biologically plausible, while maintaining
the key properties of hyperdimensional computing.

Properties:
- Binding: XOR (self-inverse)
- Bundling: Majority voting with sparsity preservation
- Sparsity: Typically p = 1/√D (optimal for capacity)
- Memory efficient: Can use sparse data structures
- Biologically plausible: Similar to sparse neural codes
- Self-inverse binding

Key Advantages:
- Very memory efficient for high dimensions (D > 10000)
- Biological plausibility (cortical neurons ~1% active)
- Fast operations on sparse representations
- Good capacity with much lower memory footprint

Optimal Sparsity:
- p = 1/√D maximizes capacity (from information theory)
- For D=10,000: p ≈ 0.01 (1% of bits are 1)
- For D=100,000: p ≈ 0.003 (0.3% of bits are 1)

References:
- Kanerva (1988): "Sparse Distributed Memory" (foundational work)
- Rachkovskij & Kussul (2001): "Binding and normalization of binary sparse codes"
- Kleyko et al. (2023): HDC/VSA Survey (BSDC comparison)
"""

from __future__ import annotations

from typing import Optional, Sequence

import numpy as np

from ..backends import Backend
from ..backends.base import Array
from ..spaces import SparseSpace, VectorSpace
from .base import VSAModel



[docs]
class BSDCModel(VSAModel):
    """BSDC (Binary Sparse Distributed Codes) model.

    Binding: XOR (element-wise, self-inverse)
    Unbinding: XOR (same as binding, self-inverse)
    Bundling: Majority voting with sparsity preservation
    Permutation: circular shift

    Uses SparseSpace with optimal sparsity p = 1/√D.
    """


[docs]
    def __init__(
        self,
        dimension: int = 10000,
        sparsity: Optional[float] = None,
        space: Optional[VectorSpace] = None,
        backend: Optional[Backend] = None,
        seed: Optional[int] = None
    ):
        """Initialize BSDC model.

        Args:
            dimension: Dimensionality of hypervectors (typically > 1000)
            sparsity: Fraction of 1s (default: 1/√D which is optimal)
            space: Vector space (defaults to SparseSpace with optimal sparsity)
            backend: Computational backend
            seed: Random seed for space
        """
        if space is None:
            from ..backends import get_backend
            backend = backend if backend is not None else get_backend()
            space = SparseSpace(dimension, sparsity=sparsity, backend=backend, seed=seed)

        super().__init__(space, backend)

        # Store sparsity for easy access
        if isinstance(space, SparseSpace):
            self.sparsity = space.sparsity
        else:
            # Fallback if using non-sparse space
            import math
            self.sparsity = sparsity if sparsity is not None else 1.0 / math.sqrt(dimension)


    @property
    def model_name(self) -> str:
        return "BSDC"

    @property
    def is_self_inverse(self) -> bool:
        return True  # XOR is self-inverse

    @property
    def is_commutative(self) -> bool:
        return True  # XOR is commutative

    @property
    def is_exact_inverse(self) -> bool:
        return True  # XOR is exact inverse of itself


[docs]
    def bind(self, a: Array, b: Array) -> Array:
        """Bind using XOR.

        For sparse binary codes, XOR preserves sparsity on average.
        Expected sparsity of result: p(1-p) + (1-p)p = 2p(1-p)

        For optimal p = 1/√D, result sparsity ≈ 2/√D (slightly increased).

        Args:
            a: First hypervector
            b: Second hypervector

        Returns:
            Bound hypervector c = a XOR b
        """
        result = self.backend.xor(a, b)

        # Optional: re-sparsify if needed to maintain sparsity
        # For now, let XOR naturally handle it
        return result



[docs]
    def unbind(self, a: Array, b: Array) -> Array:
        """Unbind using XOR (self-inverse).

        Since XOR is self-inverse: unbind(bind(a, b), b) = a

        Args:
            a: Bound hypervector (or first operand)
            b: Second operand

        Returns:
            Unbound hypervector (exact recovery)
        """
        # XOR is self-inverse
        return self.bind(a, b)



[docs]
    def bundle(self, vectors: Sequence[Array], maintain_sparsity: bool = True) -> Array:
        """Bundle using majority voting.

        For sparse codes, bundling requires careful handling to maintain sparsity:
        1. Sum all vectors element-wise
        2. Apply threshold to get binary result
        3. Optionally re-sparsify to maintain target sparsity

        Args:
            vectors: Sequence of hypervectors to bundle
            maintain_sparsity: If True, enforce target sparsity (default: True)

        Returns:
            Bundled hypervector

        Raises:
            ValueError: If vectors is empty
        """
        if not vectors:
            raise ValueError("Cannot bundle empty sequence")

        vectors = list(vectors)

        # Sum all vectors (counts how many 1s at each position)
        sum_result = self.backend.sum(self.backend.stack(vectors, axis=0), axis=0)

        if maintain_sparsity:
            # Strategy: Take top-k positions with highest counts
            # where k ≈ sparsity * dimension
            sum_np = self.backend.to_numpy(sum_result)
            target_ones = int(self.sparsity * self.dimension)

            # Get indices of top-k values
            if target_ones > 0:
                # Use argpartition for efficiency (O(n) instead of O(n log n))
                threshold_idx = max(0, len(sum_np) - target_ones)
                threshold = np.partition(sum_np, threshold_idx)[threshold_idx]

                # Set positions >= threshold to 1, rest to 0
                result_np = (sum_np >= threshold).astype(np.int32)

                # If we have ties at the threshold, we might have slightly more
                # than target_ones. This is acceptable for maintaining sparsity.
                return self.backend.from_numpy(result_np)
            else:
                # No ones in result (edge case)
                return self.backend.zeros(self.dimension, dtype='int32')
        else:
            # Simple majority voting: threshold at N/2
            threshold = len(vectors) / 2.0
            result = self.backend.threshold(sum_result, threshold=threshold, above=1.0, below=0.0)
            return result.astype('int32')



[docs]
    def permute(self, vec: Array, k: int = 1) -> Array:
        """Permute using circular shift.

        Shifts vector elements by k positions. For sparse codes,
        this maintains sparsity perfectly.

        Args:
            vec: Hypervector to permute
            k: Number of positions to shift (default: 1)

        Returns:
            Permuted hypervector
        """
        return self.backend.roll(vec, shift=k, axis=0)



[docs]
    def measure_sparsity(self, vec: Array) -> float:
        """Measure actual sparsity of a vector.

        Args:
            vec: Hypervector to measure

        Returns:
            Fraction of 1s in the vector
        """
        vec_np = self.backend.to_numpy(vec)
        count_ones = np.sum(vec_np)
        return float(count_ones) / len(vec_np)



[docs]
    def rehash(self, vec: Array) -> Array:
        """Rehash vector to restore optimal sparsity.

        Useful after multiple operations that may have changed sparsity.
        Randomly selects positions to maintain target sparsity while
        preserving as much similarity as possible.

        Args:
            vec: Hypervector to rehash

        Returns:
            Rehashed hypervector with target sparsity
        """
        vec_np = self.backend.to_numpy(vec)
        target_ones = int(self.sparsity * self.dimension)

        # Get current 1 positions
        current_ones = np.where(vec_np == 1)[0]
        current_count = len(current_ones)

        if current_count == target_ones:
            # Already at target sparsity
            return vec
        elif current_count > target_ones:
            # Too many 1s: randomly remove some
            keep_indices = np.random.choice(
                current_ones, size=target_ones, replace=False
            )
            result = np.zeros_like(vec_np)
            result[keep_indices] = 1
        else:
            # Too few 1s: randomly add some
            current_zeros = np.where(vec_np == 0)[0]
            add_count = target_ones - current_count
            add_indices = np.random.choice(
                current_zeros, size=add_count, replace=False
            )
            result = vec_np.copy()
            result[add_indices] = 1

        return self.backend.from_numpy(result.astype(np.int32))



[docs]
    def encode_sequence(
        self,
        items: Sequence[Array],
        use_ngrams: bool = False,
        n: int = 2
    ) -> Array:
        """Encode sequence of items.

        Two strategies:
        1. Position binding: item_i ⊗ ρⁱ(position)
        2. N-grams: Bundle all n-grams in sequence

        Args:
            items: Sequence of hypervectors
            use_ngrams: If True, use n-gram encoding (default: False)
            n: N-gram size (default: 2 for bigrams)

        Returns:
            Sequence hypervector

        Raises:
            ValueError: If items is empty
        """
        if not items:
            raise ValueError("Cannot encode empty sequence")

        items = list(items)

        if use_ngrams:
            # N-gram encoding
            if len(items) < n:
                # Sequence too short for n-grams, fall back to simple bundle
                return self.bundle(items)

            ngrams = []
            for i in range(len(items) - n + 1):
                # Create n-gram by binding n consecutive items
                ngram = items[i]
                for j in range(1, n):
                    ngram = self.bind(ngram, items[i + j])
                ngrams.append(ngram)

            return self.bundle(ngrams)
        else:
            # Position binding encoding
            pos = self.random(seed=42)  # Fixed position vector
            bound_items = []

            for i, item in enumerate(items):
                permuted_pos = self.permute(pos, k=i)
                bound_items.append(self.bind(item, permuted_pos))

            return self.bundle(bound_items)


    def __repr__(self) -> str:
        return (f"BSDCModel(dimension={self.dimension}, "
                f"sparsity={self.sparsity:.4f}, "
                f"space={self.space.space_name}, "
                f"backend={self.backend.name})")



def optimal_sparsity(dimension: int) -> float:
    """Calculate optimal sparsity for given dimension.

    The optimal sparsity p = 1/√D maximizes the capacity of sparse
    distributed codes.

    Args:
        dimension: Dimensionality of hypervectors

    Returns:
        Optimal sparsity value

    Examples:
        >>> optimal_sparsity(10000)
        0.01
        >>> optimal_sparsity(100000)
        0.00316...
    """
    import math
    return 1.0 / math.sqrt(dimension)


def expected_ones(dimension: int, sparsity: Optional[float] = None) -> int:
    """Calculate expected number of 1s for given dimension and sparsity.

    Args:
        dimension: Dimensionality of hypervectors
        sparsity: Sparsity level (default: optimal = 1/√D)

    Returns:
        Expected number of 1 bits

    Examples:
        >>> expected_ones(10000)
        100
        >>> expected_ones(10000, sparsity=0.05)
        500
    """
    if sparsity is None:
        sparsity = optimal_sparsity(dimension)
    return int(dimension * sparsity)


def compare_sparse_vs_dense(
    dimension: int = 10000,
    trials: int = 10
) -> dict:
    """Compare BSDC (sparse) vs BSC (dense) performance.

    Compares memory usage, binding preservation, and capacity.

    Args:
        dimension: Dimensionality of hypervectors
        trials: Number of random trials

    Returns:
        Dictionary with comparison statistics
    """
    from ..backends import get_backend
    from .bsc import BSCModel

    backend = get_backend('numpy')

    bsdc = BSDCModel(dimension=dimension, backend=backend, seed=42)
    bsc = BSCModel(dimension=dimension, backend=backend, seed=42)

    # Measure memory efficiency
    a_bsdc = bsdc.random(seed=1)
    a_bsc = bsc.random(seed=1)

    ones_bsdc = float(backend.sum(a_bsdc))
    ones_bsc = float(backend.sum(a_bsc))

    memory_ratio = ones_bsdc / ones_bsc

    # Measure binding quality
    bsdc_sims = []
    bsc_sims = []

    for i in range(trials):
        a_bsdc = bsdc.random(seed=i*2)
        b_bsdc = bsdc.random(seed=i*2+1)

        a_bsc = bsc.random(seed=i*2)
        b_bsc = bsc.random(seed=i*2+1)

        # Bind and unbind
        c_bsdc = bsdc.bind(a_bsdc, b_bsdc)
        recovered_bsdc = bsdc.unbind(c_bsdc, b_bsdc)

        c_bsc = bsc.bind(a_bsc, b_bsc)
        recovered_bsc = bsc.unbind(c_bsc, b_bsc)

        # Measure recovery
        sim_bsdc = bsdc.similarity(a_bsdc, recovered_bsdc)
        sim_bsc = bsc.similarity(a_bsc, recovered_bsc)

        bsdc_sims.append(sim_bsdc)
        bsc_sims.append(sim_bsc)

    return {
        'memory_ratio': memory_ratio,  # BSDC/BSC (should be ~0.01 for D=10000)
        'bsdc_recovery_mean': float(np.mean(bsdc_sims)),
        'bsc_recovery_mean': float(np.mean(bsc_sims)),
        'bsdc_sparsity': bsdc.sparsity,
        'bsdc_expected_ones': expected_ones(dimension),
        'bsc_expected_ones': dimension // 2,  # ~50% for dense binary
    }