Distributed Representations and Capacity Analysis

Topics: Bundling capacity, dimension effects, information limits, cleanup Time: 15 minutes Prerequisites: 01_basic_operations.py, 02_models_comparison.py Related: 31_performance_benchmarks.py, 27_cleanup_strategies.py

This example explores the fundamental capacity limits of hyperdimensional computing - how many items can be bundled together while maintaining retrievability, and how dimension affects this capacity.

Key concepts: - Bundling capacity: Number of items in superposition - Dimension scaling: More dimensions → higher capacity - Similarity decay: How bundling reduces similarity - Cleanup requirements: When do we need cleanup? - Practical limits: Real-world capacity guidelines

Understanding capacity is crucial for designing reliable HDC systems.

 24 import numpy as np
 25 from holovec import VSA
 26 from holovec.utils.cleanup import BruteForceCleanup
 27
 28 print("=" * 70)
 29 print("Distributed Representations and Capacity Analysis")
 30 print("=" * 70)
 31 print()
 32
 33 # ============================================================================
 34 # Demo 1: Basic Bundling Capacity
 35 # ============================================================================
 36 print("=" * 70)
 37 print("Demo 1: Bundling Capacity - Similarity Decay")
 38 print("=" * 70)
 39
 40 model = VSA.create('MAP', dim=10000, seed=42)
 41
 42 print(f"\nModel: {model.model_name}")
 43 print(f"Dimension: {model.dimension}")
 44 print()
 45
 46 # Test bundling different numbers of vectors
 47 bundle_sizes = [1, 2, 5, 10, 20, 50, 100, 200]
 48
 49 print(f"{'Bundle Size':<15s} {'Avg Similarity':<15s} {'Min Similarity':<15s} {'Retrievable?':<12s}")
 50 print("-" * 65)
 51
 52 for n in bundle_sizes:
 53     # Create n vectors
 54     vectors = [model.random(seed=i) for i in range(n)]
 55
 56     # Bundle them
 57     bundled = model.bundle(vectors)
 58
 59     # Test similarity to each original
 60     similarities = [float(model.similarity(bundled, v)) for v in vectors]
 61     avg_sim = np.mean(similarities)
 62     min_sim = np.min(similarities)
 63
 64     # Heuristic: retrievable if min similarity > 0.3
 65     retrievable = "Yes" if min_sim > 0.3 else "Marginal" if min_sim > 0.15 else "No"
 66
 67     print(f"{n:<15d} {avg_sim:13.3f}   {min_sim:13.3f}   {retrievable:<12s}")
 68
 69 print("\nKey insight:")
 70 print("  - Similarity decreases as √(1/n) approximately")
 71 print("  - With dim=10000, can reliably bundle ~50-100 items")
 72 print("  - Beyond that, cleanup strategies needed")
 73
 74 # ============================================================================
 75 # Demo 2: Dimension vs Capacity
 76 # ============================================================================
 77 print("\n" + "=" * 70)
 78 print("Demo 2: How Dimension Affects Capacity")
 79 print("=" * 70)
 80
 81 dimensions = [1000, 5000, 10000, 20000]
 82 test_bundle_sizes = [10, 20, 50, 100]
 83
 84 print(f"\n{'Dimension':<12s} ", end="")
 85 for n in test_bundle_sizes:
 86     print(f"N={n:<8d} ", end="")
 87 print()
 88 print("-" * 60)
 89
 90 for dim in dimensions:
 91     model = VSA.create('MAP', dim=dim, seed=42)
 92     print(f"{dim:<12d} ", end="")
 93
 94     for n in test_bundle_sizes:
 95         vectors = [model.random(seed=i) for i in range(n)]
 96         bundled = model.bundle(vectors)
 97
 98         # Average similarity
 99         sims = [float(model.similarity(bundled, v)) for v in vectors]
100         avg_sim = np.mean(sims)
101
102         print(f"{avg_sim:9.3f} ", end="")
103
104     print()
105
106 print("\nObservations:")
107 print("  - Higher dimension → higher similarity for same bundle size")
108 print("  - Roughly: capacity ∝ dimension")
109 print("  - dim=10000 supports ~100 items comfortably")
110 print("  - dim=20000 supports ~200 items")
111
112 # ============================================================================
113 # Demo 3: Information Theory Perspective
114 # ============================================================================
115 print("\n" + "=" * 70)
116 print("Demo 3: Information Capacity")
117 print("=" * 70)
118
119 model = VSA.create('MAP', dim=10000, seed=42)
120
121 print(f"\nDimension: {model.dimension}")
122 print(f"\nInformation capacity analysis:")
123 print()
124
125 # Each bit in MAP can be +1 or -1 (1 bit of info)
126 total_bits = model.dimension
127
128 print(f"Total bits available: {total_bits:,}")
129 print()
130
131 # Bundling n vectors averages them
132 # Information per vector decreases with bundle size
133 print(f"{'Bundle Size':<15s} {'Bits/Vector':<15s} {'Total Bits Used':<18s}")
134 print("-" * 55)
135
136 for n in [1, 10, 50, 100, 200]:
137     bits_per_vector = total_bits / n
138     total_used = total_bits  # Bundle still uses full dimension
139
140     print(f"{n:<15d} {bits_per_vector:13.1f}   {total_used:16,d}")
141
142 print("\nKey insight:")
143 print("  - Fixed total capacity (dimension)")
144 print("  - More vectors = less information per vector")
145 print("  - Trade-off: quantity vs fidelity")
146
147 # ============================================================================
148 # Demo 4: Retrieval Accuracy vs Bundle Size
149 # ============================================================================
150 print("\n" + "=" * 70)
151 print("Demo 4: Retrieval Accuracy Analysis")
152 print("=" * 70)
153
154 model = VSA.create('MAP', dim=10000, seed=42)
155
156 # Create a codebook
157 codebook_size = 100
158 codebook = {f"item_{i}": model.random(seed=1000+i)
159             for i in range(codebook_size)}
160
161 cleanup = BruteForceCleanup()
162
163 print(f"\nCodebook size: {codebook_size}")
164 print(f"Testing retrieval accuracy for different bundle sizes:")
165 print()
166
167 bundle_sizes_test = [5, 10, 20, 50, 100]
168
169 print(f"{'Bundle Size':<15s} {'Accuracy':<12s} {'Avg Rank':<12s}")
170 print("-" * 45)
171
172 for n in bundle_sizes_test:
173     # Select random items to bundle
174     np.random.seed(42)
175     selected_keys = np.random.choice(list(codebook.keys()), size=n, replace=False)
176     selected = [codebook[key] for key in selected_keys]
177
178     # Bundle
179     bundled = model.bundle(selected)
180
181     # Try to retrieve each item
182     correct = 0
183     ranks = []
184
185     for target_key in selected_keys:
186         target = codebook[target_key]
187
188         # Find most similar in codebook
189         best_key = None
190         best_sim = float('-inf')
191
192         for key, vec in codebook.items():
193             sim = float(model.similarity(bundled, vec))
194             if sim > best_sim:
195                 best_sim = sim
196                 best_key = key
197
198         if best_key == target_key:
199             correct += 1
200
201         # Calculate rank
202         sims = [(key, float(model.similarity(bundled, vec)))
203                 for key, vec in codebook.items()]
204         sims.sort(key=lambda x: x[1], reverse=True)
205         rank = next(i for i, (key, _) in enumerate(sims, 1) if key == target_key)
206         ranks.append(rank)
207
208     accuracy = correct / n
209     avg_rank = np.mean(ranks)
210
211     print(f"{n:<15d} {accuracy:10.2%}   {avg_rank:10.1f}")
212
213 print("\nInsight:")
214 print("  - Accuracy drops as bundle size increases")
215 print("  - Even when top-1 fails, true item often in top-10")
216 print("  - Cleanup strategies help improve accuracy")
217
218 # ============================================================================
219 # Demo 5: Binding vs Bundling Capacity
220 # ============================================================================
221 print("\n" + "=" * 70)
222 print("Demo 5: Binding Chains - Different Capacity Behavior")
223 print("=" * 70)
224
225 model = VSA.create('FHRR', dim=10000, seed=42)
226
227 print(f"\nModel: {model.model_name} (exact inverses)")
228 print()
229
230 # Test binding chains of different lengths
231 chain_lengths = [1, 2, 3, 5, 10]
232
233 print(f"{'Chain Length':<15s} {'Similarity':<15s} {'Recoverable?':<12s}")
234 print("-" * 50)
235
236 for length in chain_lengths:
237     # Create binding chain: A * B * C * D * ...
238     vectors = [model.random(seed=100+i) for i in range(length)]
239
240     # Bind them all
241     result = vectors[0]
242     for v in vectors[1:]:
243         result = model.bind(result, v)
244
245     # Try to recover first vector by unbinding all others
246     recovered = result
247     for v in reversed(vectors[1:]):
248         recovered = model.unbind(recovered, v)
249
250     # Similarity to original
251     sim = float(model.similarity(recovered, vectors[0]))
252     recoverable = "Yes" if sim > 0.99 else "Degraded"
253
254     print(f"{length:<15d} {sim:13.3f}   {recoverable:<12s}")
255
256 print("\nKey difference:")
257 print("  - Binding (with exact inverses): Perfect recovery regardless of length")
258 print("  - Bundling: Similarity degrades with number of items")
259 print("  - Use binding for structured data, bundling for sets")
260
261 # ============================================================================
262 # Demo 6: Practical Capacity Guidelines
263 # ============================================================================
264 print("\n" + "=" * 70)
265 print("Demo 6: Practical Capacity Guidelines")
266 print("=" * 70)
267
268 print("\nRule of thumb for bundling capacity:")
269 print()
270
271 guideline_table = """
272 Dimension    No Cleanup    With Cleanup    Use Case
273 ──────────────────────────────────────────────────────────────
274 1,000        ~10 items     ~20 items       Toy problems
275 5,000        ~50 items     ~100 items      Small applications
276 10,000       ~100 items    ~200 items      Standard applications
277 20,000       ~200 items    ~500 items      Large-scale systems
278 """
279
280 print(guideline_table)
281
282 print("\nFactors affecting capacity:")
283 print("  1. Dimension: Higher = more capacity")
284 print("  2. Required similarity: Lower threshold = more capacity")
285 print("  3. Cleanup: Can double effective capacity")
286 print("  4. Model: MAP vs FHRR may differ slightly")
287 print("  5. Noise level: Clean data = higher capacity")
288
289 # ============================================================================
290 # Demo 7: Overloading Recovery Strategies
291 # ============================================================================
292 print("\n" + "=" * 70)
293 print("Demo 7: Strategies for High-Capacity Scenarios")
294 print("=" * 70)
295
296 print("\nWhen you need to store more items than dimension allows:\n")
297
298 print("Strategy 1: Multiple bundles (sharding)")
299 print("  - Split items into groups")
300 print("  - Bundle each group separately")
301 print("  - Store multiple bundles")
302 print("  Example: 1000 items → 10 bundles of 100 each")
303 print()
304
305 print("Strategy 2: Hierarchical encoding")
306 print("  - Create categories")
307 print("  - Bundle within categories")
308 print("  - Bind category to bundle")
309 print("  Example: colors→[red, blue] spatial→[top, bottom]")
310 print()
311
312 print("Strategy 3: Cleanup on retrieval")
313 print("  - Allow lower bundling similarity")
314 print("  - Use resonator cleanup to sharpen")
315 print("  - See 27_cleanup_strategies.py")
316 print()
317
318 print("Strategy 4: Increase dimension")
319 print("  - dim=20000 or dim=50000")
320 print("  - Trade memory for capacity")
321 print("  - See 31_performance_benchmarks.py for cost")
322
323 # ============================================================================
324 # Summary
325 # ============================================================================
326 print("\n" + "=" * 70)
327 print("Summary: Capacity Recommendations")
328 print("=" * 70)
329 print()
330
331 print("✓ Bundling Capacity Rules:")
332 print("  - Conservative: N ≤ dimension / 100")
333 print("  - Standard: N ≤ dimension / 50 (with cleanup)")
334 print("  - Aggressive: N ≤ dimension / 20 (requires strong cleanup)")
335 print()
336
337 print("✓ When Approaching Limits:")
338 print("  - Monitor similarity metrics")
339 print("  - Test retrieval accuracy")
340 print("  - Use cleanup strategies")
341 print("  - Consider hierarchical encoding")
342 print()
343
344 print("✓ Dimension Selection:")
345 print("  - Estimate max bundle size needed")
346 print("  - Choose dim ≥ 50 × max_bundle_size")
347 print("  - Leave safety margin (2x)")
348 print("  Example: Need 100 items → dim ≥ 10,000")
349 print()
350
351 print("✓ Binding Capacity:")
352 print("  - Much higher than bundling (with exact inverses)")
353 print("  - Limited by numerical precision, not information")
354 print("  - Chains of 10-20 bindings work well")
355 print()
356
357 print("Next steps:")
358 print("  → 27_cleanup_strategies.py - Improve retrieval accuracy")
359 print("  → 33_error_handling_robustness.py - Handle degraded signals")
360 print("  → 31_performance_benchmarks.py - Dimension cost analysis")
361 print()
362 print("=" * 70)

Gallery generated by Sphinx-Gallery