Demonstration of N-gram Encoder for local sequence pattern encoding.

This demo showcases the NGramEncoder, which captures local patterns in sequences using sliding windows (n-grams). This is particularly useful for:

  • Text analysis (character/word n-grams)

  • Pattern matching in sequences

  • Similarity detection based on local context

  • NLP applications

The encoder supports:

  • Multiple n-gram sizes (unigrams, bigrams, trigrams, etc.)

  • Overlapping and non-overlapping windows (stride parameter)

  • Two modes: bundling (bag-of-ngrams) and chaining (ordered n-grams)

 19 from holovec import VSA
 20 from holovec.encoders import NGramEncoder
 21
 22
 23 def print_section(title):
 24     """Print a section header."""
 25     print(f"\n{'=' * 70}")
 26     print(f"{title}")
 27     print('=' * 70)
 28
 29
 30 def demo_basic_ngram_encoding():
 31     """Demonstrate basic n-gram encoding."""
 32     print_section("Demo 1: Basic N-gram Encoding")
 33
 34     model = VSA.create('MAP', dim=5000, seed=42)
 35     encoder = NGramEncoder(model, n=2, stride=1, mode='bundling', seed=42)
 36
 37     print(f"\nEncoder: {encoder}")
 38     print(f"Configuration: n={encoder.n}, stride={encoder.stride}, mode='{encoder.mode}'")
 39
 40     # Encode a sequence
 41     sequence = ['the', 'quick', 'brown', 'fox']
 42     hv = encoder.encode(sequence)
 43
 44     print(f"\nInput sequence: {sequence}")
 45     print(f"Bigrams (n=2, stride=1):")
 46     print("  - ['the', 'quick']")
 47     print("  - ['quick', 'brown']")
 48     print("  - ['brown', 'fox']")
 49     print(f"\nEncoded hypervector shape: {hv.shape}")
 50     print(f"Codebook size: {encoder.get_codebook_size()} unique symbols")
 51
 52
 53 def demo_different_n_values():
 54     """Demonstrate different n-gram sizes."""
 55     print_section("Demo 2: Different N-gram Sizes")
 56
 57     model = VSA.create('MAP', dim=5000, seed=42)
 58     sequence = ['A', 'B', 'C', 'D', 'E']
 59
 60     print(f"\nInput sequence: {sequence}\n")
 61
 62     for n in [1, 2, 3, 4]:
 63         encoder = NGramEncoder(model, n=n, stride=1, mode='bundling', seed=42)
 64         hv = encoder.encode(sequence)
 65
 66         # Calculate number of n-grams
 67         num_ngrams = len(sequence) - n + 1
 68
 69         name = {1: "Unigrams", 2: "Bigrams", 3: "Trigrams", 4: "4-grams"}[n]
 70         print(f"{name} (n={n}):")
 71         print(f"  Number of n-grams: {num_ngrams}")
 72         print(f"  Hypervector shape: {hv.shape}")
 73
 74
 75 def demo_stride_parameter():
 76     """Demonstrate stride (overlapping vs non-overlapping)."""
 77     print_section("Demo 3: Stride Parameter (Overlapping vs Non-overlapping)")
 78
 79     model = VSA.create('MAP', dim=5000, seed=42)
 80     sequence = ['A', 'B', 'C', 'D', 'E', 'F']
 81
 82     print(f"\nInput sequence: {sequence}\n")
 83
 84     # Overlapping bigrams (stride=1)
 85     encoder1 = NGramEncoder(model, n=2, stride=1, mode='bundling', seed=42)
 86     hv1 = encoder1.encode(sequence)
 87
 88     print("Overlapping bigrams (stride=1):")
 89     print("  N-grams: ['A','B'], ['B','C'], ['C','D'], ['D','E'], ['E','F']")
 90     print(f"  Count: 5 n-grams")
 91
 92     # Non-overlapping bigrams (stride=2)
 93     encoder2 = NGramEncoder(model, n=2, stride=2, mode='bundling', seed=42)
 94     hv2 = encoder2.encode(sequence)
 95
 96     print("\nNon-overlapping bigrams (stride=2):")
 97     print("  N-grams: ['A','B'], ['C','D'], ['E','F']")
 98     print(f"  Count: 3 n-grams")
 99
100     # Partial overlap (stride=2 with trigrams)
101     encoder3 = NGramEncoder(model, n=3, stride=2, mode='bundling', seed=42)
102     hv3 = encoder3.encode(sequence)
103
104     print("\nPartial overlap trigrams (n=3, stride=2):")
105     print("  N-grams: ['A','B','C'], ['C','D','E']")
106     print(f"  Count: 2 n-grams")
107
108
def demo_text_similarity():
    """Demo 4: compare sentences encoded as word bigrams.

    Encodes three six-word sentences with one bigram bundling encoder and
    prints ``model.similarity`` of the first sentence against the other two.
    """
    print_section("Demo 4: Text Similarity with N-grams")

    vsa_model = VSA.create('MAP', dim=10000, seed=42)
    bigram_encoder = NGramEncoder(vsa_model, n=2, stride=1, mode='bundling', seed=42)

    reference = ['the', 'cat', 'sat', 'on', 'the', 'mat']
    near_match = ['the', 'cat', 'sat', 'on', 'the', 'hat']  # last word differs
    unrelated = ['a', 'dog', 'ran', 'in', 'the', 'park']

    # Encode all three sentences before printing anything.
    reference_hv = bigram_encoder.encode(reference)
    near_match_hv = bigram_encoder.encode(near_match)
    unrelated_hv = bigram_encoder.encode(unrelated)

    print(f"\nSentence 1: {' '.join(reference)}")
    print(f"Sentence 2: {' '.join(near_match)} (differs by 1 word)")
    print(f"Sentence 3: {' '.join(unrelated)} (completely different)\n")

    near_score = float(vsa_model.similarity(reference_hv, near_match_hv))
    far_score = float(vsa_model.similarity(reference_hv, unrelated_hv))

    print(f"Similarity (sent1 vs sent2): {near_score:.3f}")
    print(f"Similarity (sent1 vs sent3): {far_score:.3f}")

    takeaway = (
        "\nKey insight:",
        "  Sentences sharing more bigrams have higher similarity.",
        "  Bigrams shared by sent1 and sent2:",
        "    ['the','cat'], ['cat','sat'], ['sat','on'], ['on','the']",
    )
    for line in takeaway:
        print(line)
139
140
def demo_bundling_vs_chaining():
    """Demo 5: encode one sequence in 'bundling' mode and in 'chaining' mode.

    Prints a fixed description of each mode together with the encoder's
    ``is_reversible`` flag. If the chaining encoder reports itself as
    reversible, its encoding is decoded (max_ngrams=3, threshold=0.2) and
    the decoded result is printed.
    """
    print_section("Demo 5: Bundling vs Chaining Modes")

    vsa_model = VSA.create('MAP', dim=10000, seed=42)

    symbols = ['A', 'B', 'C']

    # --- bundling mode ---
    bundling_encoder = NGramEncoder(vsa_model, n=2, stride=1, mode='bundling', seed=42)
    # Encoded only to exercise the encoder; the vector itself is not used.
    bundling_encoder.encode(symbols)

    print("Bundling Mode (bag-of-ngrams):")
    print(f"  Sequence: {symbols}")
    print("  N-grams: ['A','B'], ['B','C']")
    print("  Encoding: bundle(encode(['A','B']), encode(['B','C']))")
    print("  Order-sensitive: No (n-grams bundled)")
    print(f"  Reversible: {bundling_encoder.is_reversible}")

    # --- chaining mode ---
    chaining_encoder = NGramEncoder(vsa_model, n=2, stride=1, mode='chaining', seed=42)
    chained_hv = chaining_encoder.encode(symbols)

    print("\nChaining Mode (ordered n-grams):")
    print(f"  Sequence: {symbols}")
    print("  N-grams: ['A','B'] at position 0, ['B','C'] at position 1")
    print("  Encoding: bundle(permute(encode(['A','B']), 0), permute(encode(['B','C']), 1))")
    print("  Order-sensitive: Yes (positions encoded)")
    print(f"  Reversible: {chaining_encoder.is_reversible}")

    # Decoding is attempted only when the encoder says it is supported.
    if not chaining_encoder.is_reversible:
        return
    recovered = chaining_encoder.decode(chained_hv, max_ngrams=3, threshold=0.2)
    print(f"\nDecoded n-grams (approximate): {recovered}")
175
176
def demo_character_ngrams():
    """Demo 6: compare words encoded as character trigrams.

    Splits 'pattern', 'patter' and 'matter' into characters, encodes each
    with an n=3 bundling encoder, and prints ``model.similarity`` of
    'pattern' against the other two alongside a fixed trigram listing.
    """
    print_section("Demo 6: Character-Level N-grams")

    vsa_model = VSA.create('MAP', dim=10000, seed=42)
    trigram_encoder = NGramEncoder(vsa_model, n=3, stride=1, mode='bundling', seed=42)

    # Each word becomes a list of single-character symbols.
    pattern_chars = [*"pattern"]
    patter_chars = [*"patter"]
    matter_chars = [*"matter"]

    pattern_hv = trigram_encoder.encode(pattern_chars)
    patter_hv = trigram_encoder.encode(patter_chars)
    matter_hv = trigram_encoder.encode(matter_chars)

    word_notes = (
        "\nWord 1: 'pattern'",
        "  Trigrams: pat, att, tte, ter, ern",
        "\nWord 2: 'patter'",
        "  Trigrams: pat, att, tte, ter",
        "  (shares 4/5 with 'pattern')",
        "\nWord 3: 'matter'",
        "  Trigrams: mat, att, tte, ter",
        "  (shares 3/5 with 'pattern')",
    )
    for note in word_notes:
        print(note)

    patter_score = float(vsa_model.similarity(pattern_hv, patter_hv))
    matter_score = float(vsa_model.similarity(pattern_hv, matter_hv))

    print(f"\nSimilarity 'pattern' vs 'patter': {patter_score:.3f}")
    print(f"Similarity 'pattern' vs 'matter': {matter_score:.3f}")

    print("\nKey insight:")
    print("  Character n-grams capture sub-word similarity")
    print("  Useful for misspelling detection and fuzzy matching")
213
214
def demo_application_text_classification():
    """Demo 7: prototype-based positive/negative sentence classification.

    Encodes three positive and three negative training sentences as word
    bigrams, bundles each group into a class prototype with
    ``model.bundle``, then labels each test sentence by whichever prototype
    it is more similar to. A tie is labelled "Negative".
    """
    print_section("Demo 7: Application - Text Classification")

    vsa_model = VSA.create('MAP', dim=10000, seed=42)
    bigram_encoder = NGramEncoder(vsa_model, n=2, stride=1, mode='bundling', seed=42)

    print("\nScenario: Classify sentences as positive or negative\n")

    positive_examples = [
        ['i', 'love', 'this', 'product'],
        ['great', 'quality', 'and', 'service'],
        ['highly', 'recommend', 'this', 'item'],
    ]
    negative_examples = [
        ['terrible', 'quality', 'and', 'service'],
        ['do', 'not', 'recommend', 'this'],
        ['very', 'poor', 'experience'],
    ]

    # Encode every positive example first, then every negative one.
    positive_hvs = list(map(bigram_encoder.encode, positive_examples))
    negative_hvs = list(map(bigram_encoder.encode, negative_examples))

    # One prototype per class, formed by bundling that class's encodings.
    positive_prototype = vsa_model.bundle(positive_hvs)
    negative_prototype = vsa_model.bundle(negative_hvs)

    print("Training Data:")
    print("  Positive examples: 3")
    print("  Negative examples: 3")

    # (tokens, expected label) pairs.
    test_cases = [
        (['i', 'recommend', 'this', 'product'], "Positive"),
        (['poor', 'quality', 'item'], "Negative"),
        (['great', 'experience'], "Positive"),
    ]

    print("\nTest Results:")
    for tokens, expected in test_cases:
        query_hv = bigram_encoder.encode(tokens)

        positive_score = float(vsa_model.similarity(query_hv, positive_prototype))
        negative_score = float(vsa_model.similarity(query_hv, negative_prototype))

        # Strictly greater wins "Positive"; equal scores fall to "Negative".
        if positive_score > negative_score:
            guess = "Positive"
        else:
            guess = "Negative"
        mark = '✓' if guess == expected else '✗'

        print(f"\n  Sentence: {' '.join(tokens)}")
        print(f"  Sim to positive: {positive_score:.3f}")
        print(f"  Sim to negative: {negative_score:.3f}")
        print(f"  Predicted: {guess}, True: {expected} {mark}")
269
270
def main():
    """Print the intro, run the seven demos in order, then print the outro."""
    rule = "=" * 70

    print(rule)
    print("N-gram Encoder - Comprehensive Demonstration")
    print(rule)

    intro_lines = (
        "\nThe NGramEncoder captures local patterns in sequences using",
        "sliding windows. This is essential for:",
        "  - Text analysis (NLP)",
        "  - Pattern recognition",
        "  - Sequence similarity",
        "  - Classification based on local features",
    )
    for line in intro_lines:
        print(line)

    # Demos run in their numbered order.
    demos = (
        demo_basic_ngram_encoding,
        demo_different_n_values,
        demo_stride_parameter,
        demo_text_similarity,
        demo_bundling_vs_chaining,
        demo_character_ngrams,
        demo_application_text_classification,
    )
    for run_demo in demos:
        run_demo()

    print("\n" + rule)
    print("Demo Complete!")
    print(rule)

    outro_lines = (
        "\nNext steps:",
        "  - See docs/theory/encoders.md for mathematical details",
        "  - Run tests: pytest tests/test_encoders_sequence.py -k NGram",
        "  - Try with different VSA models (FHRR, HRR, BSC)",
    )
    for line in outro_lines:
        print(line)
298
299
# Run the demos only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

Gallery generated by Sphinx-Gallery