Note
Go to the end to download the full example code.
Demonstration of the N-gram Encoder for local sequence pattern encoding.
This demo showcases the NGramEncoder, which captures local patterns in sequences using sliding windows (n-grams). This is particularly useful for:
Text analysis (character/word n-grams)
Pattern matching in sequences
Similarity detection based on local context
NLP applications
The encoder supports:
- Multiple n-gram sizes (unigrams, bigrams, trigrams, etc.)
- Overlapping and non-overlapping windows (stride parameter)
- Two modes: bundling (bag-of-ngrams) and chaining (ordered n-grams)
19 from holovec import VSA
20 from holovec.encoders import NGramEncoder
21
22
def print_section(title, width=70):
    """Print a section header framed by two horizontal rules.

    Writes a blank line, a rule of ``=`` characters, the title, and a
    closing rule of the same length to standard output.

    Args:
        title: Text shown between the two rules.
        width: Number of ``=`` characters in each rule. Defaults to 70,
            which reproduces the header this function has always printed.
    """
    rule = '=' * width
    # The leading newline separates this header from whatever was printed last.
    print(f"\n{rule}")
    print(f"{title}")
    print(rule)
28
29
def demo_basic_ngram_encoding():
    """Show the simplest use of the encoder: one sequence, bigrams, bundling.

    Creates a MAP model and a bigram encoder, prints the encoder and its
    configuration, encodes a four-word sequence, and reports the bigrams,
    the shape of the resulting hypervector, and the codebook size.
    """
    print_section("Demo 1: Basic N-gram Encoding")

    vsa = VSA.create('MAP', dim=5000, seed=42)
    bigram_encoder = NGramEncoder(vsa, n=2, stride=1, mode='bundling', seed=42)

    # Describe the encoder before it has encoded anything.
    print(f"\nEncoder: {bigram_encoder}")
    print(
        f"Configuration: n={bigram_encoder.n}, "
        f"stride={bigram_encoder.stride}, mode='{bigram_encoder.mode}'"
    )

    words = ['the', 'quick', 'brown', 'fox']
    encoded = bigram_encoder.encode(words)

    print(f"\nInput sequence: {words}")
    print("Bigrams (n=2, stride=1):")
    for pair in ("['the', 'quick']", "['quick', 'brown']", "['brown', 'fox']"):
        print(f" - {pair}")
    print(f"\nEncoded hypervector shape: {encoded.shape}")
    print(f"Codebook size: {bigram_encoder.get_codebook_size()} unique symbols")
51
52
def demo_different_n_values():
    """Encode one five-symbol sequence with n-gram sizes 1 through 4.

    For each size a fresh bundling encoder (stride 1) is built, the
    sequence is encoded, and the number of windows plus the hypervector
    shape are printed.
    """
    print_section("Demo 2: Different N-gram Sizes")

    vsa = VSA.create('MAP', dim=5000, seed=42)
    symbols = ['A', 'B', 'C', 'D', 'E']

    print(f"\nInput sequence: {symbols}\n")

    # Display label for each n-gram size, in the order they are demonstrated.
    labels = {1: "Unigrams", 2: "Bigrams", 3: "Trigrams", 4: "4-grams"}

    for size, label in labels.items():
        sized_encoder = NGramEncoder(vsa, n=size, stride=1, mode='bundling', seed=42)
        encoded = sized_encoder.encode(symbols)

        # With stride 1 a window starts at every index that leaves room for
        # `size` symbols.
        window_count = len(symbols) - size + 1

        print(f"{label} (n={size}):")
        print(f" Number of n-grams: {window_count}")
        print(f" Hypervector shape: {encoded.shape}")
73
74
def demo_stride_parameter():
    """Contrast overlapping, non-overlapping and partially overlapping windows.

    Runs three bundling encoders over the same six-symbol sequence, each
    with a different (n, stride) pair, and prints the n-grams and their
    count for each configuration.
    """
    print_section("Demo 3: Stride Parameter (Overlapping vs Non-overlapping)")

    vsa = VSA.create('MAP', dim=5000, seed=42)
    symbols = ['A', 'B', 'C', 'D', 'E', 'F']

    print(f"\nInput sequence: {symbols}\n")

    # One row per configuration: (n, stride, heading, n-gram listing, count).
    scenarios = (
        (
            2, 1,
            "Overlapping bigrams (stride=1):",
            " N-grams: ['A','B'], ['B','C'], ['C','D'], ['D','E'], ['E','F']",
            " Count: 5 n-grams",
        ),
        (
            2, 2,
            "\nNon-overlapping bigrams (stride=2):",
            " N-grams: ['A','B'], ['C','D'], ['E','F']",
            " Count: 3 n-grams",
        ),
        (
            3, 2,
            "\nPartial overlap trigrams (n=3, stride=2):",
            " N-grams: ['A','B','C'], ['C','D','E']",
            " Count: 2 n-grams",
        ),
    )

    for n, stride, heading, listing, count_line in scenarios:
        # The encoding is performed for each configuration; only the
        # description of its windows is printed.
        NGramEncoder(vsa, n=n, stride=stride, mode='bundling', seed=42).encode(symbols)

        print(heading)
        print(listing)
        print(count_line)
107
108
def demo_text_similarity():
    """Compare sentences by the similarity of their word-bigram encodings.

    Encodes a reference sentence, a near copy that differs in one word,
    and an unrelated sentence, then prints the similarity of the
    reference to each of the other two.
    """
    print_section("Demo 4: Text Similarity with N-grams")

    vsa = VSA.create('MAP', dim=10000, seed=42)
    bigram_encoder = NGramEncoder(vsa, n=2, stride=1, mode='bundling', seed=42)

    reference = ['the', 'cat', 'sat', 'on', 'the', 'mat']
    near_copy = ['the', 'cat', 'sat', 'on', 'the', 'hat']  # one word changed
    unrelated = ['a', 'dog', 'ran', 'in', 'the', 'park']   # different sentence

    # Encode all three sentences as word bigrams, in order.
    reference_hv, near_copy_hv, unrelated_hv = (
        bigram_encoder.encode(sentence)
        for sentence in (reference, near_copy, unrelated)
    )

    reference_text = ' '.join(reference)
    near_copy_text = ' '.join(near_copy)
    unrelated_text = ' '.join(unrelated)
    print(f"\nSentence 1: {reference_text}")
    print(f"Sentence 2: {near_copy_text} (differs by 1 word)")
    print(f"Sentence 3: {unrelated_text} (completely different)\n")

    near_score = float(vsa.similarity(reference_hv, near_copy_hv))
    far_score = float(vsa.similarity(reference_hv, unrelated_hv))

    print(f"Similarity (sent1 vs sent2): {near_score:.3f}")
    print(f"Similarity (sent1 vs sent3): {far_score:.3f}")

    for line in (
        "\nKey insight:",
        " Sentences sharing more bigrams have higher similarity.",
        " Bigrams shared by sent1 and sent2:",
        " ['the','cat'], ['cat','sat'], ['sat','on'], ['on','the']",
    ):
        print(line)
139
140
def demo_bundling_vs_chaining():
    """Contrast the encoder's 'bundling' and 'chaining' modes.

    Encodes the same three-symbol sequence with one bigram encoder per
    mode, prints a description of each mode together with the encoder's
    ``is_reversible`` flag, and, when the chaining encoder reports itself
    reversible, prints what it decodes from its own encoding.
    """
    print_section("Demo 5: Bundling vs Chaining Modes")

    vsa = VSA.create('MAP', dim=10000, seed=42)
    symbols = ['A', 'B', 'C']

    # Bundling mode: the n-grams are bundled without position information.
    bundler = NGramEncoder(vsa, n=2, stride=1, mode='bundling', seed=42)
    bundler.encode(symbols)

    print("Bundling Mode (bag-of-ngrams):")
    print(f" Sequence: {symbols}")
    print(" N-grams: ['A','B'], ['B','C']")
    print(" Encoding: bundle(encode(['A','B']), encode(['B','C']))")
    print(" Order-sensitive: No (n-grams bundled)")
    print(f" Reversible: {bundler.is_reversible}")

    # Chaining mode: each n-gram is tagged with its position.
    chainer = NGramEncoder(vsa, n=2, stride=1, mode='chaining', seed=42)
    chained_hv = chainer.encode(symbols)

    print("\nChaining Mode (ordered n-grams):")
    print(f" Sequence: {symbols}")
    print(" N-grams: ['A','B'] at position 0, ['B','C'] at position 1")
    print(" Encoding: bundle(permute(encode(['A','B']), 0), permute(encode(['B','C']), 1))")
    print(" Order-sensitive: Yes (positions encoded)")
    print(f" Reversible: {chainer.is_reversible}")

    # Decoding is only attempted when the encoder says it supports it.
    if not chainer.is_reversible:
        return

    recovered = chainer.decode(chained_hv, max_ngrams=3, threshold=0.2)
    print(f"\nDecoded n-grams (approximate): {recovered}")
175
176
def demo_character_ngrams():
    """Compare words by the similarity of their character-trigram encodings.

    Splits three words into characters, encodes each with a bundling
    trigram encoder, lists the trigrams of every word, and prints the
    similarity of 'pattern' to 'patter' and to 'matter'.
    """
    print_section("Demo 6: Character-Level N-grams")

    vsa = VSA.create('MAP', dim=10000, seed=42)
    trigram_encoder = NGramEncoder(vsa, n=3, stride=1, mode='bundling', seed=42)

    # list(word) turns a string into its sequence of single characters.
    pattern_hv, patter_hv, matter_hv = (
        trigram_encoder.encode(list(word))
        for word in ("pattern", "patter", "matter")
    )

    for line in (
        "\nWord 1: 'pattern'",
        " Trigrams: pat, att, tte, ter, ern",
        "\nWord 2: 'patter'",
        " Trigrams: pat, att, tte, ter",
        " (shares 4/5 with 'pattern')",
        "\nWord 3: 'matter'",
        " Trigrams: mat, att, tte, ter",
        " (shares 3/5 with 'pattern')",
    ):
        print(line)

    patter_score = float(vsa.similarity(pattern_hv, patter_hv))
    matter_score = float(vsa.similarity(pattern_hv, matter_hv))

    print(f"\nSimilarity 'pattern' vs 'patter': {patter_score:.3f}")
    print(f"Similarity 'pattern' vs 'matter': {matter_score:.3f}")

    print("\nKey insight:")
    print(" Character n-grams capture sub-word similarity")
    print(" Useful for misspelling detection and fuzzy matching")
213
214
def demo_application_text_classification():
    """Classify short sentences as positive or negative with class prototypes.

    Each training sentence is encoded as word bigrams; the encodings of a
    class are bundled into one prototype hypervector. A test sentence is
    assigned to the class whose prototype it is more similar to, with ties
    going to "Negative". For every test sentence the two similarities, the
    prediction, the expected label and a check/cross mark are printed.

    The training-set sizes in the report are taken from the example lists
    themselves, so the printed counts stay correct if examples are added
    or removed.
    """
    print_section("Demo 7: Application - Text Classification")

    model = VSA.create('MAP', dim=10000, seed=42)
    encoder = NGramEncoder(model, n=2, stride=1, mode='bundling', seed=42)

    print("\nScenario: Classify sentences as positive or negative\n")

    # Training examples
    positive_examples = [
        ['i', 'love', 'this', 'product'],
        ['great', 'quality', 'and', 'service'],
        ['highly', 'recommend', 'this', 'item'],
    ]

    negative_examples = [
        ['terrible', 'quality', 'and', 'service'],
        ['do', 'not', 'recommend', 'this'],
        ['very', 'poor', 'experience'],
    ]

    # Create class prototypes by bundling the encoded examples of each class.
    positive_hvs = [encoder.encode(ex) for ex in positive_examples]
    negative_hvs = [encoder.encode(ex) for ex in negative_examples]

    positive_prototype = model.bundle(positive_hvs)
    negative_prototype = model.bundle(negative_hvs)

    print("Training Data:")
    # Derive the counts from the data instead of hard-coding them.
    print(f" Positive examples: {len(positive_examples)}")
    print(f" Negative examples: {len(negative_examples)}")

    # Test examples paired with their expected label.
    test_sentences = [
        (['i', 'recommend', 'this', 'product'], "Positive"),
        (['poor', 'quality', 'item'], "Negative"),
        (['great', 'experience'], "Positive"),
    ]

    print("\nTest Results:")
    for sentence, true_label in test_sentences:
        hv = encoder.encode(sentence)

        sim_positive = float(model.similarity(hv, positive_prototype))
        sim_negative = float(model.similarity(hv, negative_prototype))

        # Strict comparison: equal similarities are reported as "Negative".
        predicted = "Positive" if sim_positive > sim_negative else "Negative"

        print(f"\n Sentence: {' '.join(sentence)}")
        print(f" Sim to positive: {sim_positive:.3f}")
        print(f" Sim to negative: {sim_negative:.3f}")
        print(f" Predicted: {predicted}, True: {true_label} "
              f"{'✓' if predicted == true_label else '✗'}")
269
270
def main():
    """Print the introduction, run every demo in order, then print next steps."""
    banner = "=" * 70

    for line in (
        banner,
        "N-gram Encoder - Comprehensive Demonstration",
        banner,
        "\nThe NGramEncoder captures local patterns in sequences using",
        "sliding windows. This is essential for:",
        " - Text analysis (NLP)",
        " - Pattern recognition",
        " - Sequence similarity",
        " - Classification based on local features",
    ):
        print(line)

    # The demos run in the order of their "Demo N" section numbers.
    demos = (
        demo_basic_ngram_encoding,
        demo_different_n_values,
        demo_stride_parameter,
        demo_text_similarity,
        demo_bundling_vs_chaining,
        demo_character_ngrams,
        demo_application_text_classification,
    )
    for run_demo in demos:
        run_demo()

    for line in (
        "\n" + banner,
        "Demo Complete!",
        banner,
        "\nNext steps:",
        " - See docs/theory/encoders.md for mathematical details",
        " - Run tests: pytest tests/test_encoders_sequence.py -k NGram",
        " - Try with different VSA models (FHRR, HRR, BSC)",
    ):
        print(line)
298
299
# Run the full demonstration only when executed as a script, not on import.
if __name__ == "__main__":
    main()