Demonstration of the N-gram Encoder for local sequence pattern encoding.

This demo showcases the NGramEncoder, which captures local patterns in sequences using sliding windows (n-grams). This is particularly useful for:
- Text analysis (character/word n-grams)
- Pattern matching in sequences
- Similarity detection based on local context
- NLP applications
The encoder supports:
- Multiple n-gram sizes (unigrams, bigrams, trigrams, etc.)
- Overlapping and non-overlapping windows (stride parameter)
- Two modes: bundling (bag-of-ngrams) and chaining (ordered n-grams)
 from holovec import VSA
 from holovec.encoders import NGramEncoder


 def print_section(title):
     """Write a banner to stdout: a blank line, then *title* between two 70-column '=' rules."""
     rule = '=' * 70
     # A leading empty element yields the blank line that precedes the banner.
     print("\n".join(["", rule, f"{title}", rule]))


 def demo_basic_ngram_encoding():
     """Encode a four-word sequence with a bigram encoder and report the outcome.

     Prints the encoder, its configuration, the input, the expected bigrams,
     the shape of the encoded hypervector and the encoder's codebook size.
     """
     print_section("Demo 1: Basic N-gram Encoding")

     vsa = VSA.create('MAP', dim=5000, seed=42)
     bigram_encoder = NGramEncoder(vsa, n=2, stride=1, mode='bundling', seed=42)

     print(f"\nEncoder: {bigram_encoder}")
     print(
         "Configuration: "
         f"n={bigram_encoder.n}, "
         f"stride={bigram_encoder.stride}, "
         f"mode='{bigram_encoder.mode}'"
     )

     # The sequence to encode.
     words = ['the', 'quick', 'brown', 'fox']
     encoded = bigram_encoder.encode(words)

     print(f"\nInput sequence: {words}")
     print("Bigrams (n=2, stride=1):")
     expected_bigrams = (
         "  - ['the', 'quick']",
         "  - ['quick', 'brown']",
         "  - ['brown', 'fox']",
     )
     for bullet in expected_bigrams:
         print(bullet)
     print(f"\nEncoded hypervector shape: {encoded.shape}")
     print(f"Codebook size: {bigram_encoder.get_codebook_size()} unique symbols")


 def demo_different_n_values():
     """Encode one five-symbol sequence with n = 1..4 and report each result.

     For every size a fresh stride-1 bundling encoder is built on the same
     model; the number of windows and the hypervector shape are printed.
     """
     print_section("Demo 2: Different N-gram Sizes")

     vsa = VSA.create('MAP', dim=5000, seed=42)
     symbols = ['A', 'B', 'C', 'D', 'E']

     print(f"\nInput sequence: {symbols}\n")

     labelled_sizes = (
         (1, "Unigrams"),
         (2, "Bigrams"),
         (3, "Trigrams"),
         (4, "4-grams"),
     )
     for size, label in labelled_sizes:
         size_encoder = NGramEncoder(vsa, n=size, stride=1, mode='bundling', seed=42)
         encoded = size_encoder.encode(symbols)

         # With stride 1 there is one window per start index that still fits.
         window_count = len(symbols) - size + 1

         print(f"{label} (n={size}):")
         print(f"  Number of n-grams: {window_count}")
         print(f"  Hypervector shape: {encoded.shape}")


 def demo_stride_parameter():
     """Demonstrate stride (overlapping vs non-overlapping windows).

     Three (n, stride) configurations are run over the same six-symbol
     sequence. For each one an encoder is built and the sequence is encoded,
     then the windows and their count are printed. The listing and the count
     are derived from ``sequence`` rather than typed by hand, so the printed
     text cannot drift from the data.
     """
     print_section("Demo 3: Stride Parameter (Overlapping vs Non-overlapping)")

     model = VSA.create('MAP', dim=5000, seed=42)
     sequence = ['A', 'B', 'C', 'D', 'E', 'F']

     print(f"\nInput sequence: {sequence}\n")

     def sliding_windows(n, stride):
         """Return every complete length-``n`` window, starting every ``stride`` items."""
         # NOTE(review): this mirrors the windowing the demo text describes
         # (trailing incomplete windows are dropped) -- confirm that it matches
         # NGramEncoder's own behaviour.
         last_start = len(sequence) - n
         return [sequence[start:start + n]
                 for start in range(0, last_start + 1, stride)]

     def render(window):
         """Format one window compactly, e.g. ['A','B']."""
         return "[" + ",".join(repr(symbol) for symbol in window) + "]"

     # (heading, n, stride); a leading newline in a heading separates stanzas.
     configurations = [
         ("Overlapping bigrams (stride=1):", 2, 1),
         ("\nNon-overlapping bigrams (stride=2):", 2, 2),
         ("\nPartial overlap trigrams (n=3, stride=2):", 3, 2),
     ]

     for heading, n, stride in configurations:
         encoder = NGramEncoder(model, n=n, stride=stride, mode='bundling', seed=42)
         # The hypervector is not displayed; encoding is still performed so the
         # demo exercises the encoder with this configuration.
         encoder.encode(sequence)

         windows = sliding_windows(n, stride)
         print(heading)
         print(f"  N-grams: {', '.join(render(window) for window in windows)}")
         print(f"  Count: {len(windows)} n-grams")


 def demo_text_similarity():
     """Compare three sentences by the similarity of their word-bigram encodings.

     The first two sentences differ in their final word only; the third uses
     mostly different words. The similarity of the first sentence to each of
     the other two is printed.
     """
     print_section("Demo 4: Text Similarity with N-grams")

     vsa = VSA.create('MAP', dim=10000, seed=42)
     bigram_encoder = NGramEncoder(vsa, n=2, stride=1, mode='bundling', seed=42)

     # Sentences are encoded as word bigrams.
     reference = ['the', 'cat', 'sat', 'on', 'the', 'mat']
     near_copy = ['the', 'cat', 'sat', 'on', 'the', 'hat']  # last word differs
     unrelated = ['a', 'dog', 'ran', 'in', 'the', 'park']

     reference_hv, near_copy_hv, unrelated_hv = (
         bigram_encoder.encode(sentence)
         for sentence in (reference, near_copy, unrelated)
     )

     reference_text = ' '.join(reference)
     near_copy_text = ' '.join(near_copy)
     unrelated_text = ' '.join(unrelated)
     print(f"\nSentence 1: {reference_text}")
     print(f"Sentence 2: {near_copy_text} (differs by 1 word)")
     print(f"Sentence 3: {unrelated_text} (completely different)\n")

     close_score = float(vsa.similarity(reference_hv, near_copy_hv))
     far_score = float(vsa.similarity(reference_hv, unrelated_hv))

     print(f"Similarity (sent1 vs sent2): {close_score:.3f}")
     print(f"Similarity (sent1 vs sent3): {far_score:.3f}")

     takeaway = (
         "\nKey insight:",
         "  Sentences sharing more bigrams have higher similarity.",
         "  Bigrams shared by sent1 and sent2:",
         "    ['the','cat'], ['cat','sat'], ['sat','on'], ['on','the']",
     )
     for line in takeaway:
         print(line)


 def demo_bundling_vs_chaining():
     """Contrast the encoder's 'bundling' and 'chaining' modes on one sequence.

     One bigram encoder per mode is built over the same model. A description
     of each mode is printed together with that encoder's ``is_reversible``
     flag. A decode of the chaining result is attempted only when the
     chaining encoder reports that it is reversible.
     """
     print_section("Demo 5: Bundling vs Chaining Modes")

     vsa = VSA.create('MAP', dim=10000, seed=42)
     symbols = ['A', 'B', 'C']

     # Bundling: the n-grams are bundled without positional information.
     bag_encoder = NGramEncoder(vsa, n=2, stride=1, mode='bundling', seed=42)
     bag_encoder.encode(symbols)

     print("Bundling Mode (bag-of-ngrams):")
     print(f"  Sequence: {symbols}")
     print("  N-grams: ['A','B'], ['B','C']")
     print("  Encoding: bundle(encode(['A','B']), encode(['B','C']))")
     print("  Order-sensitive: No (n-grams bundled)")
     print(f"  Reversible: {bag_encoder.is_reversible}")

     # Chaining: each n-gram is tagged with its position.
     chain_encoder = NGramEncoder(vsa, n=2, stride=1, mode='chaining', seed=42)
     chained_hv = chain_encoder.encode(symbols)

     print("\nChaining Mode (ordered n-grams):")
     print(f"  Sequence: {symbols}")
     print("  N-grams: ['A','B'] at position 0, ['B','C'] at position 1")
     print("  Encoding: bundle(permute(encode(['A','B']), 0), permute(encode(['B','C']), 1))")
     print("  Order-sensitive: Yes (positions encoded)")
     print(f"  Reversible: {chain_encoder.is_reversible}")

     # Decoding is only attempted for an encoder that declares itself reversible.
     if not chain_encoder.is_reversible:
         return

     recovered = chain_encoder.decode(chained_hv, max_ngrams=3, threshold=0.2)
     print(f"\nDecoded n-grams (approximate): {recovered}")


 def demo_character_ngrams():
     """Compare three words by the similarity of their character-trigram encodings.

     'pattern', 'patter' and 'matter' are split into characters and encoded
     with a trigram encoder. The similarity of 'pattern' to each of the other
     two words is printed.
     """
     print_section("Demo 6: Character-Level N-grams")

     vsa = VSA.create('MAP', dim=10000, seed=42)
     trigram_encoder = NGramEncoder(vsa, n=3, stride=1, mode='bundling', seed=42)

     # Each word becomes a list of single characters before encoding.
     encoded = {
         word: trigram_encoder.encode(list(word))
         for word in ("pattern", "patter", "matter")
     }

     report = (
         "\nWord 1: 'pattern'",
         "  Trigrams: pat, att, tte, ter, ern",
         "\nWord 2: 'patter'",
         "  Trigrams: pat, att, tte, ter",
         "  (shares 4/5 with 'pattern')",
         "\nWord 3: 'matter'",
         "  Trigrams: mat, att, tte, ter",
         "  (shares 3/5 with 'pattern')",
     )
     for line in report:
         print(line)

     score_patter = float(vsa.similarity(encoded["pattern"], encoded["patter"]))
     score_matter = float(vsa.similarity(encoded["pattern"], encoded["matter"]))

     print(f"\nSimilarity 'pattern' vs 'patter': {score_patter:.3f}")
     print(f"Similarity 'pattern' vs 'matter': {score_matter:.3f}")

     print("\nKey insight:")
     print("  Character n-grams capture sub-word similarity")
     print("  Useful for misspelling detection and fuzzy matching")


 def demo_application_text_classification():
     """Demonstrate application: prototype-based text classification.

     Each training sentence is encoded as word bigrams. The encodings of each
     class are combined with ``model.bundle`` into one prototype per class.
     A test sentence is labelled "Positive" when its similarity to the
     positive prototype is strictly greater than its similarity to the
     negative one, and "Negative" otherwise (ties therefore go to "Negative").
     """
     print_section("Demo 7: Application - Text Classification")

     model = VSA.create('MAP', dim=10000, seed=42)
     encoder = NGramEncoder(model, n=2, stride=1, mode='bundling', seed=42)

     print("\nScenario: Classify sentences as positive or negative\n")

     # Training examples
     positive_examples = [
         ['i', 'love', 'this', 'product'],
         ['great', 'quality', 'and', 'service'],
         ['highly', 'recommend', 'this', 'item'],
     ]

     negative_examples = [
         ['terrible', 'quality', 'and', 'service'],
         ['do', 'not', 'recommend', 'this'],
         ['very', 'poor', 'experience'],
     ]

     # Create class prototypes by bundling the encoded examples of each class.
     positive_hvs = [encoder.encode(ex) for ex in positive_examples]
     negative_hvs = [encoder.encode(ex) for ex in negative_examples]

     positive_prototype = model.bundle(positive_hvs)
     negative_prototype = model.bundle(negative_hvs)

     # Report the real training-set sizes so the text cannot drift from the
     # lists above if examples are added or removed.
     print("Training Data:")
     print(f"  Positive examples: {len(positive_examples)}")
     print(f"  Negative examples: {len(negative_examples)}")

     # Test examples: (tokens, expected label)
     test_sentences = [
         (['i', 'recommend', 'this', 'product'], "Positive"),
         (['poor', 'quality', 'item'], "Negative"),
         (['great', 'experience'], "Positive"),
     ]

     print("\nTest Results:")
     for sentence, true_label in test_sentences:
         hv = encoder.encode(sentence)

         sim_positive = float(model.similarity(hv, positive_prototype))
         sim_negative = float(model.similarity(hv, negative_prototype))

         # Strict comparison: equal similarities are labelled "Negative".
         predicted = "Positive" if sim_positive > sim_negative else "Negative"
         verdict = '✓' if predicted == true_label else '✗'

         print(f"\n  Sentence: {' '.join(sentence)}")
         print(f"  Sim to positive: {sim_positive:.3f}")
         print(f"  Sim to negative: {sim_negative:.3f}")
         print(f"  Predicted: {predicted}, True: {true_label} {verdict}")


 def main():
     """Print the intro banner, run the seven demos in order, then print closing pointers."""
     rule = "=" * 70

     intro = (
         rule,
         "N-gram Encoder - Comprehensive Demonstration",
         rule,
         "\nThe NGramEncoder captures local patterns in sequences using",
         "sliding windows. This is essential for:",
         "  - Text analysis (NLP)",
         "  - Pattern recognition",
         "  - Sequence similarity",
         "  - Classification based on local features",
     )
     for line in intro:
         print(line)

     demos = (
         demo_basic_ngram_encoding,
         demo_different_n_values,
         demo_stride_parameter,
         demo_text_similarity,
         demo_bundling_vs_chaining,
         demo_character_ngrams,
         demo_application_text_classification,
     )
     for run_demo in demos:
         run_demo()

     outro = (
         "\n" + rule,
         "Demo Complete!",
         rule,
         "\nNext steps:",
         "  - See docs/theory/encoders.md for mathematical details",
         "  - Run tests: pytest tests/test_encoders_sequence.py -k NGram",
         "  - Try with different VSA models (FHRR, HRR, BSC)",
     )
     for line in outro:
         print(line)


 # Run the demos only when this file is executed as a script, not when imported.
 if __name__ == '__main__':
     main()
Gallery generated by Sphinx-Gallery