Distributed Representations and Capacity Analysis¶

Topics: Bundling capacity, dimension effects, information limits, cleanup Time: 15 minutes Prerequisites: 01_basic_operations.py, 02_models_comparison.py Related: 31_performance_benchmarks.py, 27_cleanup_strategies.py
This example explores the fundamental capacity limits of hyperdimensional computing - how many items can be bundled together while maintaining retrievability, and how dimension affects this capacity.
Key concepts: - Bundling capacity: Number of items in superposition - Dimension scaling: More dimensions → higher capacity - Similarity decay: How bundling reduces similarity - Cleanup requirements: When do we need cleanup? - Practical limits: Real-world capacity guidelines
Understanding capacity is crucial for designing reliable HDC systems.
 import numpy as np
 from holovec import VSA
 from holovec.utils.cleanup import BruteForceCleanup

 print("=" * 70)
 print("Distributed Representations and Capacity Analysis")
 print("=" * 70)
 print()

 # ============================================================================
 # Demo 1: Basic Bundling Capacity
 # ============================================================================
 print("=" * 70)
 print("Demo 1: Bundling Capacity - Similarity Decay")
 print("=" * 70)

 model = VSA.create('MAP', dim=10000, seed=42)

 print(f"\nModel: {model.model_name}")
 print(f"Dimension: {model.dimension}")
 print()

 # Test bundling different numbers of vectors
 bundle_sizes = [1, 2, 5, 10, 20, 50, 100, 200]

 print(f"{'Bundle Size':<15s} {'Avg Similarity':<15s} {'Min Similarity':<15s} {'Retrievable?':<12s}")
 print("-" * 65)

 for n in bundle_sizes:
     # Create n vectors
     vectors = [model.random(seed=i) for i in range(n)]

     # Bundle them
     bundled = model.bundle(vectors)

     # Test similarity to each original
     similarities = [float(model.similarity(bundled, v)) for v in vectors]
     avg_sim = np.mean(similarities)
     min_sim = np.min(similarities)

     # Heuristic: retrievable if min similarity > 0.3
     retrievable = "Yes" if min_sim > 0.3 else "Marginal" if min_sim > 0.15 else "No"

     print(f"{n:<15d} {avg_sim:13.3f}   {min_sim:13.3f}   {retrievable:<12s}")

 print("\nKey insight:")
 print("  - Similarity decreases as √(1/n) approximately")
 print("  - With dim=10000, can reliably bundle ~50-100 items")
 print("  - Beyond that, cleanup strategies needed")

 # ============================================================================
 # Demo 2: Dimension vs Capacity
 # ============================================================================
 print("\n" + "=" * 70)
 print("Demo 2: How Dimension Affects Capacity")
 print("=" * 70)

 dimensions = [1000, 5000, 10000, 20000]
 test_bundle_sizes = [10, 20, 50, 100]

 print(f"\n{'Dimension':<12s} ", end="")
 for n in test_bundle_sizes:
     print(f"N={n:<8d} ", end="")
 print()
 print("-" * 60)

 for dim in dimensions:
     model = VSA.create('MAP', dim=dim, seed=42)
     print(f"{dim:<12d} ", end="")

     for n in test_bundle_sizes:
         vectors = [model.random(seed=i) for i in range(n)]
         bundled = model.bundle(vectors)

         # Average similarity
         sims = [float(model.similarity(bundled, v)) for v in vectors]
         avg_sim = np.mean(sims)

         print(f"{avg_sim:9.3f} ", end="")

     print()

 print("\nObservations:")
 print("  - Higher dimension → higher similarity for same bundle size")
 print("  - Roughly: capacity ∝ dimension")
 print("  - dim=10000 supports ~100 items comfortably")
 print("  - dim=20000 supports ~200 items")

 # ============================================================================
 # Demo 3: Information Theory Perspective
 # ============================================================================
 print("\n" + "=" * 70)
 print("Demo 3: Information Capacity")
 print("=" * 70)

 model = VSA.create('MAP', dim=10000, seed=42)

 print(f"\nDimension: {model.dimension}")
 print(f"\nInformation capacity analysis:")
 print()

 # Each bit in MAP can be +1 or -1 (1 bit of info)
 total_bits = model.dimension

 print(f"Total bits available: {total_bits:,}")
 print()

 # Bundling n vectors averages them
 # Information per vector decreases with bundle size
 print(f"{'Bundle Size':<15s} {'Bits/Vector':<15s} {'Total Bits Used':<18s}")
 print("-" * 55)

 for n in [1, 10, 50, 100, 200]:
     bits_per_vector = total_bits / n
     total_used = total_bits  # Bundle still uses full dimension

     print(f"{n:<15d} {bits_per_vector:13.1f}   {total_used:16,d}")

 print("\nKey insight:")
 print("  - Fixed total capacity (dimension)")
 print("  - More vectors = less information per vector")
 print("  - Trade-off: quantity vs fidelity")

 # ============================================================================
 # Demo 4: Retrieval Accuracy vs Bundle Size
 # ============================================================================
 print("\n" + "=" * 70)
 print("Demo 4: Retrieval Accuracy Analysis")
 print("=" * 70)

 model = VSA.create('MAP', dim=10000, seed=42)

 # Create a codebook
 codebook_size = 100
 codebook = {f"item_{i}": model.random(seed=1000+i)
             for i in range(codebook_size)}

 cleanup = BruteForceCleanup()

 print(f"\nCodebook size: {codebook_size}")
 print(f"Testing retrieval accuracy for different bundle sizes:")
 print()

 bundle_sizes_test = [5, 10, 20, 50, 100]

 print(f"{'Bundle Size':<15s} {'Accuracy':<12s} {'Avg Rank':<12s}")
 print("-" * 45)

 for n in bundle_sizes_test:
     # Select random items to bundle
     np.random.seed(42)
     selected_keys = np.random.choice(list(codebook.keys()), size=n, replace=False)
     selected = [codebook[key] for key in selected_keys]

     # Bundle
     bundled = model.bundle(selected)

     # Try to retrieve each item
     correct = 0
     ranks = []

     for target_key in selected_keys:
         target = codebook[target_key]

         # Find most similar in codebook
         best_key = None
         best_sim = float('-inf')

         for key, vec in codebook.items():
             sim = float(model.similarity(bundled, vec))
             if sim > best_sim:
                 best_sim = sim
                 best_key = key

         if best_key == target_key:
             correct += 1

         # Calculate rank
         sims = [(key, float(model.similarity(bundled, vec)))
                 for key, vec in codebook.items()]
         sims.sort(key=lambda x: x[1], reverse=True)
         rank = next(i for i, (key, _) in enumerate(sims, 1) if key == target_key)
         ranks.append(rank)

     accuracy = correct / n
     avg_rank = np.mean(ranks)

     print(f"{n:<15d} {accuracy:10.2%}   {avg_rank:10.1f}")

 print("\nInsight:")
 print("  - Accuracy drops as bundle size increases")
 print("  - Even when top-1 fails, true item often in top-10")
 print("  - Cleanup strategies help improve accuracy")

 # ============================================================================
 # Demo 5: Binding vs Bundling Capacity
 # ============================================================================
 print("\n" + "=" * 70)
 print("Demo 5: Binding Chains - Different Capacity Behavior")
 print("=" * 70)

 model = VSA.create('FHRR', dim=10000, seed=42)

 print(f"\nModel: {model.model_name} (exact inverses)")
 print()

 # Test binding chains of different lengths
 chain_lengths = [1, 2, 3, 5, 10]

 print(f"{'Chain Length':<15s} {'Similarity':<15s} {'Recoverable?':<12s}")
 print("-" * 50)

 for length in chain_lengths:
     # Create binding chain: A * B * C * D * ...
     vectors = [model.random(seed=100+i) for i in range(length)]

     # Bind them all
     result = vectors[0]
     for v in vectors[1:]:
         result = model.bind(result, v)

     # Try to recover first vector by unbinding all others
     recovered = result
     for v in reversed(vectors[1:]):
         recovered = model.unbind(recovered, v)

     # Similarity to original
     sim = float(model.similarity(recovered, vectors[0]))
     recoverable = "Yes" if sim > 0.99 else "Degraded"

     print(f"{length:<15d} {sim:13.3f}   {recoverable:<12s}")

 print("\nKey difference:")
 print("  - Binding (with exact inverses): Perfect recovery regardless of length")
 print("  - Bundling: Similarity degrades with number of items")
 print("  - Use binding for structured data, bundling for sets")

 # ============================================================================
 # Demo 6: Practical Capacity Guidelines
 # ============================================================================
 print("\n" + "=" * 70)
 print("Demo 6: Practical Capacity Guidelines")
 print("=" * 70)

 print("\nRule of thumb for bundling capacity:")
 print()

 guideline_table = """
 Dimension    No Cleanup    With Cleanup    Use Case
 ──────────────────────────────────────────────────────────────
 1,000        ~10 items     ~20 items       Toy problems
 5,000        ~50 items     ~100 items      Small applications
 10,000       ~100 items    ~200 items      Standard applications
 20,000       ~200 items    ~500 items      Large-scale systems
 """

 print(guideline_table)

 print("\nFactors affecting capacity:")
 print("  1. Dimension: Higher = more capacity")
 print("  2. Required similarity: Lower threshold = more capacity")
 print("  3. Cleanup: Can double effective capacity")
 print("  4. Model: MAP vs FHRR may differ slightly")
 print("  5. Noise level: Clean data = higher capacity")

 # ============================================================================
 # Demo 7: Overloading Recovery Strategies
 # ============================================================================
 print("\n" + "=" * 70)
 print("Demo 7: Strategies for High-Capacity Scenarios")
 print("=" * 70)

 print("\nWhen you need to store more items than dimension allows:\n")

 print("Strategy 1: Multiple bundles (sharding)")
 print("  - Split items into groups")
 print("  - Bundle each group separately")
 print("  - Store multiple bundles")
 print("  Example: 1000 items → 10 bundles of 100 each")
 print()

 print("Strategy 2: Hierarchical encoding")
 print("  - Create categories")
 print("  - Bundle within categories")
 print("  - Bind category to bundle")
 print("  Example: colors→[red, blue] spatial→[top, bottom]")
 print()

 print("Strategy 3: Cleanup on retrieval")
 print("  - Allow lower bundling similarity")
 print("  - Use resonator cleanup to sharpen")
 print("  - See 27_cleanup_strategies.py")
 print()

 print("Strategy 4: Increase dimension")
 print("  - dim=20000 or dim=50000")
 print("  - Trade memory for capacity")
 print("  - See 31_performance_benchmarks.py for cost")

 # ============================================================================
 # Summary
 # ============================================================================
 print("\n" + "=" * 70)
 print("Summary: Capacity Recommendations")
 print("=" * 70)
 print()

 print("✓ Bundling Capacity Rules:")
 print("  - Conservative: N ≤ dimension / 100")
 print("  - Standard: N ≤ dimension / 50 (with cleanup)")
 print("  - Aggressive: N ≤ dimension / 20 (requires strong cleanup)")
 print()

 print("✓ When Approaching Limits:")
 print("  - Monitor similarity metrics")
 print("  - Test retrieval accuracy")
 print("  - Use cleanup strategies")
 print("  - Consider hierarchical encoding")
 print()

 print("✓ Dimension Selection:")
 print("  - Estimate max bundle size needed")
 print("  - Choose dim ≥ 50 × max_bundle_size")
 print("  - Leave safety margin (2x)")
 print("  Example: Need 100 items → dim ≥ 10,000")
 print()

 print("✓ Binding Capacity:")
 print("  - Much higher than bundling (with exact inverses)")
 print("  - Limited by numerical precision, not information")
 print("  - Chains of 10-20 bindings work well")
 print()

 print("Next steps:")
 print("  → 27_cleanup_strategies.py - Improve retrieval accuracy")
 print("  → 33_error_handling_robustness.py - Handle degraded signals")
 print("  → 31_performance_benchmarks.py - Dimension cost analysis")
 print()
 print("=" * 70)
Gallery generated by Sphinx-Gallery