"""
Multi-Factor Unbinding and Factorization Methods
================================================

Topics: Factorization, multi-factor unbinding, composite structures, iterative cleanup
Time: 15 minutes
Prerequisites: 27_cleanup_strategies.py, 24_app_working_memory.py
Related: 23_app_symbolic_reasoning.py

This example demonstrates advanced techniques for factorizing composite
hypervectors - decomposing bundled representations back into their constituent
factors. Mastering factorization is essential for information retrieval from
distributed hyperdimensional representations.

Key concepts:
- Bundle factorization: Decompose bundled items (A ⊕ B ⊕ C → A, B, C)
- Binding factorization: Decompose bound structures (A ⊗ B ⊗ C → A, B, C)
- Iterative unbinding: Sequential factor extraction
- Noise accumulation: How errors compound during factorization
- Practical strategies: When and how to factorize

Factorization enables querying and retrieving information from complex
composite representations built through binding and bundling operations.
"""

import numpy as np
from holovec import VSA
from holovec.utils.cleanup import BruteForceCleanup, ResonatorCleanup

# Script banner: rule, title, rule, then one blank line.
print("=" * 70, "Multi-Factor Unbinding and Factorization", "=" * 70, sep="\n")
print()

# Shared FHRR model (10000 dimensions, seeded) used by every demo below.
model = VSA.create('FHRR', dim=10000, seed=42)

# Shared cleanup strategy; the demos call its cleanup() and factorize() methods.
cleanup = BruteForceCleanup()

# ============================================================================
# Demo 1: Bundle Factorization - Recovering Bundled Items
# ============================================================================
# Bundles four codebook entries, asks the cleanup strategy for the six best
# matching codebook labels, and reports how many of the first four returned
# labels were actually part of the bundle.
print("=" * 70, "Demo 1: Bundle Factorization", "=" * 70, sep="\n")

print("\nScenario: Bundle of 4 items")

# Codebook: ten seeded random hypervectors keyed "item_0" .. "item_9".
items = {f"item_{i}": model.random(seed=100 + i) for i in range(10)}

# Single source of truth for which codebook entries go into the bundle; the
# same tuple is reused below to mark hits and to compute recall.
bundled_labels = ("item_0", "item_2", "item_5", "item_7")
bundled = [items[name] for name in bundled_labels]
bundle = model.bundle(bundled)

print("  Bundle: item_0 ⊕ item_2 ⊕ item_5 ⊕ item_7")

# Factorize to recover all items
print("\n" + "=" * 70, "Factorizing bundle:", "=" * 70, sep="\n")

# Ask for more factors (6) than were bundled (4) so that the listing also
# shows entries that were not part of the bundle.
labels, sims = cleanup.factorize(bundle, items, model, n_factors=6)

print("\nRecovered factors (top 6):")
for i, (label, sim) in enumerate(zip(labels, sims), start=1):
    in_bundle = "✓" if label in bundled_labels else "✗"
    print(f"  {i}. {label:10s}: {sim:.3f}  [{in_bundle}]")

# Recall@4: fraction of the four bundled items found among the first four
# returned labels.
correct_in_top4 = sum(candidate in bundled_labels for candidate in labels[:4])
recall = correct_in_top4 / 4.0

print(f"\nRecall@4: {recall:.2f} ({correct_in_top4}/4 factors recovered)")

print(
    "\nKey observation:",
    "  - Top factors are the original bundled items",
    "  - Similarity degrades but items still identifiable",
    "  - Ideal for 'what's in this bundle?' queries",
    sep="\n",
)

# ============================================================================
# Demo 2: Binding Chain Factorization - Sequential Unbinding
# ============================================================================
# Builds item_0 ⊗ item_1 ⊗ item_2 and peels the factors off again one unbind
# at a time, running cleanup after each step.
print("\n" + "=" * 70, "Demo 2: Binding Chain Factorization", "=" * 70, sep="\n")

print("\nScenario: Chain of bindings (A ⊗ B ⊗ C)")

# The three factors are taken straight from the shared codebook.
A, B, C = (items[key] for key in ("item_0", "item_1", "item_2"))

# Chain is built left to right: (A ⊗ B) first, then bound with C.
chain = model.bind(model.bind(A, B), C)

print("  Chain: item_0 ⊗ item_1 ⊗ item_2")

# Method 1: Sequential unbinding (if you know the order)
print(
    "\n" + "=" * 70,
    "Method 1: Sequential unbinding (knowing order)",
    "=" * 70,
    sep="\n",
)

# Step 1: removing C should leave approximately (A ⊗ B).
step1 = model.unbind(chain, C)
print("\n  Step 1: Unbind item_2")
# step1 is a bound pair rather than a bare item, so it is matched against a
# derived codebook in which every item has been bound with item_1 (B). The
# returned label therefore names the partner that was bound with item_1.
pair_codebook = {name: model.bind(vec, B) for name, vec in items.items()}
label1, sim1 = cleanup.cleanup(step1, pair_codebook, model)
print(f"    Result ≈ (item_0 ⊗ item_1), found: {label1} (sim={sim1:.3f})")

# Step 2: removing B from the intermediate result should leave approximately
# A, which can be matched against the plain item codebook.
step2 = model.unbind(step1, B)
print("\n  Step 2: Unbind item_1")
label2, sim2 = cleanup.cleanup(step2, items, model)
print(f"    Result ≈ item_0, found: {label2} (sim={sim2:.3f})")

print(
    "\nKey observation:",
    "  - Sequential unbinding requires knowing binding order",
    "  - Each unbind step recovers one factor",
    "  - Most reliable when order is known",
    sep="\n",
)

# ============================================================================
# Demo 3: Mixed Binding and Bundling - Structured Factorization
# ============================================================================
# Builds a two-slot role-filler record, then queries each slot by unbinding
# its role vector and cleaning the result up against the item codebook.
print(
    "\n" + "=" * 70,
    "Demo 3: Mixed Operations - Structured Factorization",
    "=" * 70,
    sep="\n",
)

print(
    "\nScenario: Role-filler structure with multiple bindings",
    "  Structure: (role_A ⊗ filler_1) ⊕ (role_B ⊗ filler_2)",
    sep="\n",
)

# Roles are fresh seeded random vectors; fillers are existing codebook items.
role_A = model.random(seed=200)
role_B = model.random(seed=201)
filler_1 = items["item_3"]
filler_2 = items["item_4"]

# Bind each role to its filler, then superpose the two bound pairs.
pair_A = model.bind(role_A, filler_1)
pair_B = model.bind(role_B, filler_2)
struct = model.bundle([pair_A, pair_B])

print(
    "\n  role_A ⊗ item_3",
    "  role_B ⊗ item_4",
    "  → bundled together",
    sep="\n",
)

# Query by role A: unbind the role, then look the result up in the codebook.
print("\n" + "=" * 70, "Query: What is bound to role_A?", "=" * 70, sep="\n")

result_A = model.unbind(struct, role_A)
label_A, sim_A = cleanup.cleanup(result_A, items, model)

print(f"\n  Unbind role_A: {label_A} (similarity={sim_A:.3f})")
print(f"  Expected: item_3")

# Query by role B: same procedure with the other role vector.
print("\n" + "=" * 70, "Query: What is bound to role_B?", "=" * 70, sep="\n")

result_B = model.unbind(struct, role_B)
label_B, sim_B = cleanup.cleanup(result_B, items, model)

print(f"\n  Unbind role_B: {label_B} (similarity={sim_B:.3f})")
print(f"  Expected: item_4")

print(
    "\nKey observation:",
    "  - Can query structure by role (dimension)",
    "  - Unbinding isolates specific role-filler pairs",
    "  - Essential pattern for structured retrieval",
    sep="\n",
)

# ============================================================================
# Demo 4: Noise Accumulation in Factorization
# ============================================================================
# For several bundle sizes, bundles the first `size` codebook items,
# factorizes the bundle back into `size` labels, and reports recall and the
# mean similarity of the returned factors.
print("\n" + "=" * 70)
print("Demo 4: Noise Accumulation During Factorization")
print("=" * 70)

print("\nTesting: Bundle size vs. factorization accuracy")

# Test different bundle sizes
sizes = [2, 4, 6, 8, 10]

# Recall is only informative when the codebook holds entries that are NOT in
# the bundle. The largest bundle uses every entry of `items`, so scoring
# against `items` alone would force recall to 1.00 at size 10 no matter how
# noisy the bundle is. Score against a copy of the codebook extended with
# distractor vectors instead; `items` itself is left unmodified.
# Seeds 500+ are not used for any other vector in this script.
n_distractors = 10
noise_codebook = {
    **items,
    **{
        f"distractor_{i}": model.random(seed=500 + i)
        for i in range(n_distractors)
    },
}

print(f"  Codebook: {len(items)} items + {n_distractors} distractors")

print(f"\n{'Size':>5s} | {'Recall@Top':>12s} | {'Avg Sim':>10s} | {'Correct':>10s}")
print("-" * 45)

for size in sizes:
    # Create bundle of 'size' items
    selected = [items[f"item_{i}"] for i in range(size)]
    test_bundle = model.bundle(selected)

    # Factorize against the distractor-augmented codebook
    labels_test, sims_test = cleanup.factorize(
        test_bundle, noise_codebook, model, n_factors=size
    )

    # Calculate metrics: a returned label counts as correct only if it names
    # one of the items that were actually bundled.
    expected = {f"item_{i}" for i in range(size)}
    correct_count = sum(1 for label in labels_test[:size] if label in expected)
    recall = correct_count / size
    avg_sim = np.mean(sims_test[:size])

    print(f"{size:>5d} | {recall:>12.2f} | {avg_sim:>10.3f} | {correct_count:>10d}/{size}")

print("\nKey observation:")
print("  - Accuracy decreases with more bundled items")
print("  - Similarities degrade due to interference")
print("  - Practical limit: ~5-7 factors for reliable recovery")
print("  - Mirrors human working memory capacity!")

# ============================================================================
# Demo 5: Practical Application - Query Decomposition
# ============================================================================
# Encodes a three-attribute product query as a bundle of attribute ⊗ value
# pairs, then (1) ranks the attribute vectors against the query and
# (2) recovers each attribute's value by unbinding the attribute and cleaning
# the result up against a codebook of candidate values.
print(
    "\n" + "=" * 70,
    "Demo 5: Practical Application - Complex Query",
    "=" * 70,
    sep="\n",
)

print(
    "\nScenario: Multi-attribute product search",
    "  Query: color=red AND category=laptop AND price=affordable",
    sep="\n",
)

# Attribute (role) vectors, seeds 300-302.
COLOR, CATEGORY, PRICE = (model.random(seed=s) for s in (300, 301, 302))

# Value (filler) vectors that appear in the query, seeds 400-402.
red, laptop, affordable = (model.random(seed=s) for s in (400, 401, 402))

# Query = superposition of the three attribute ⊗ value bindings.
attribute_value_pairs = [(COLOR, red), (CATEGORY, laptop), (PRICE, affordable)]
query = model.bundle(
    [model.bind(attribute, value) for attribute, value in attribute_value_pairs]
)

print("\n  Query HV created (color ⊗ red) ⊕ (category ⊗ laptop) ⊕ (price ⊗ affordable)")

# Decompose query to understand it
print("\n" + "=" * 70, "Decomposing query attributes:", "=" * 70, sep="\n")

# Codebook holding only the attribute vectors.
attributes = dict(COLOR=COLOR, CATEGORY=CATEGORY, PRICE=PRICE)

# Rank the three attribute vectors against the query.
attr_labels, attr_sims = cleanup.factorize(query, attributes, model, n_factors=3)

print("\nQuery contains these attributes:")
for label, sim in zip(attr_labels, attr_sims):
    print(f"  {label:10s}: {sim:.3f}")

# Extract values for each attribute
print("\n" + "=" * 70, "Extracting attribute values:", "=" * 70, sep="\n")

# Candidate values: the three used in the query plus one unused alternative
# for each (blue, phone, expensive; seeds 403-405).
values = dict(
    red=red,
    blue=model.random(seed=403),
    laptop=laptop,
    phone=model.random(seed=404),
    affordable=affordable,
    expensive=model.random(seed=405),
)

# Unbind every attribute from the query first ...
color_val = model.unbind(query, COLOR)
category_val = model.unbind(query, CATEGORY)
price_val = model.unbind(query, PRICE)

# ... then match each unbound result against the candidate values.
color_label, color_sim = cleanup.cleanup(color_val, values, model)
category_label, category_sim = cleanup.cleanup(category_val, values, model)
price_label, price_sim = cleanup.cleanup(price_val, values, model)

print(f"\n  COLOR value: {color_label} (similarity={color_sim:.3f})")
print(f"  CATEGORY value: {category_label} (similarity={category_sim:.3f})")
print(f"  PRICE value: {price_label} (similarity={price_sim:.3f})")

print(
    "\nKey observation:",
    "  - Can decompose complex queries into attributes + values",
    "  - Enables query understanding and refinement",
    "  - Practical for search engines and databases",
    sep="\n",
)

# ============================================================================
# Demo 6: Best Practices for Factorization
# ============================================================================
# Static guidance text only; no vectors are created or queried here.
# Every entry is emitted on its own line: "" yields a blank line, and an entry
# starting with "\n" is preceded by a blank line.
best_practice_lines = (
    "\n" + "=" * 70,
    "Demo 6: Factorization Best Practices",
    "=" * 70,
    "\n✓ DO:",
    "  - Factorize bundles with ≤ 7 items for best results",
    "  - Use cleanup strategies (BruteForce or Resonator)",
    "  - Provide comprehensive codebook for cleanup",
    "  - Check similarity scores to assess confidence",
    "  - Sequential unbinding when order is known",
    "",
    "✗ DON'T:",
    "  - Bundle > 10 items if you need to factorize later",
    "  - Expect perfect recovery (always approximate)",
    "  - Unbind without cleanup (results are noisy)",
    "  - Ignore similarity scores (they indicate confidence)",
    "  - Chain too many unbind operations (noise compounds)",
    "",
    "Strategies by use case:",
    "",
    "  Bundle factorization:",
    "    - Use: factorize() method",
    "    - Returns: top-k most similar items",
    "    - Best for: 'what's in this bundle?' queries",
    "",
    "  Binding chain factorization:",
    "    - Use: sequential unbind() + cleanup()",
    "    - Requires: knowing binding order",
    "    - Best for: structured data with known schema",
    "",
    "  Mixed operations:",
    "    - Use: unbind() by dimension + cleanup()",
    "    - Pattern: role-filler binding in bundles",
    "    - Best for: attribute-value structures",
    "",
)
print("\n".join(best_practice_lines))

# ============================================================================
# Summary
# ============================================================================
# Closing recap text only; nothing is computed here.
# Every entry is emitted on its own line; "" yields a blank separator line.
summary_lines = (
    "=" * 70,
    "Summary: Factorization Key Takeaways",
    "=" * 70,
    "",
    "✓ Bundle factorization: Decompose A ⊕ B ⊕ C → A, B, C",
    "✓ Binding chains: Sequential unbinding with known order",
    "✓ Mixed structures: Combine unbinding + cleanup",
    "✓ Noise accumulation: Accuracy degrades with complexity",
    "✓ Practical limit: ~5-7 factors for reliable recovery",
    "",
    "Core factorization pattern:",
    "  1. Create comprehensive codebook",
    "  2. Call cleanup.factorize(composite, codebook, model, n_factors=k)",
    "  3. Check similarity scores for confidence",
    "  4. Use top-k results as recovered factors",
    "",
    "When to factorize:",
    "  - Retrieving bundled items from working memory",
    "  - Decomposing composite queries",
    "  - Understanding structured representations",
    "  - Multi-attribute search and filtering",
    "",
    "Complexity considerations:",
    "  - Bundle of 3 items: Easy, high accuracy",
    "  - Bundle of 5-7 items: Moderate, good accuracy",
    "  - Bundle of 10+ items: Hard, degraded accuracy",
    "  - Deep binding chains: Noise compounds exponentially",
    "",
    "Next steps:",
    "  → Apply factorization in your domain",
    "  → Combine with 27_cleanup_strategies.py techniques",
    "  → Use in 24_app_working_memory.py patterns",
    "",
    "=" * 70,
)
print("\n".join(summary_lines))