Document Classification with N-grams

Topics: Text classification, n-gram encoding, supervised learning, NLP
Time: 15 minutes
Prerequisites: 14_encoders_ngram.py, 26_retrieval_basics.py
Related: 23_app_symbolic_reasoning.py, 25_app_integration_patterns.py

This example demonstrates practical document classification using n-gram encoding and hyperdimensional computing (HDC). Learn how to build a text classifier that categorizes documents based on their content.

Key concepts:
- Document encoding: N-gram patterns for text representation
- Training: Build class prototypes from labeled examples
- Classification: Nearest-prototype matching
- Evaluation: Accuracy metrics and confusion analysis

Text classification with HDC is fast, interpretable, and works well with limited training data - ideal for practical NLP applications.

 24 from holovec import VSA
 25 from holovec.encoders import NGramEncoder
 26 from holovec.retrieval import ItemStore
 27
 28 print("=" * 70)
 29 print("Document Classification with N-grams")
 30 print("=" * 70)
 31 print()
 32
 33 # Create model and encoder
 34 model = VSA.create('MAP', dim=10000, seed=42)
 35 encoder = NGramEncoder(model, n=3, stride=1, mode='bundling', seed=42)
 36
 37 print(f"Model: {model.model_name}, dimension={model.dimension}")
 38 print(f"Encoder: NGramEncoder(n=3, mode='bundling')")
 39 print()
 40
 41 # ============================================================================
 42 # Dataset: News Article Classification
 43 # ============================================================================
 44 print("=" * 70)
 45 print("Dataset: News Articles (4 categories)")
 46 print("=" * 70)
 47
 48 # Training data: short news snippets
 49 training_data = [
 50     # Sports
 51     ("team wins championship game tonight", "sports"),
 52     ("player scores winning goal match", "sports"),
 53     ("coach announces new training strategy", "sports"),
 54     ("league announces playoff schedule games", "sports"),
 55
 56     # Technology
 57     ("new smartphone launched today features", "technology"),
 58     ("software update fixes security bug", "technology"),
 59     ("tech company releases ai chatbot", "technology"),
 60     ("startup raises million funding round", "technology"),
 61
 62     # Business
 63     ("stock market rises today investors", "business"),
 64     ("company reports quarterly earnings profit", "business"),
 65     ("merger deal announced billion dollars", "business"),
 66     ("economic growth forecast next quarter", "business"),
 67
 68     # Health
 69     ("new study shows health benefits", "health"),
 70     ("doctor recommends exercise diet plan", "health"),
 71     ("hospital opens new emergency wing", "health"),
 72     ("vaccine approved clinical trials results", "health"),
 73 ]
 74
 75 print(f"\nTraining examples: {len(training_data)} articles")
 76 print(f"Categories: sports, technology, business, health")
 77 print(f"Examples per category: {len(training_data) // 4}")
 78 print()
 79
 80 # ============================================================================
 81 # Training: Build Class Prototypes
 82 # ============================================================================
 83 print("=" * 70)
 84 print("Training: Building Class Prototypes")
 85 print("=" * 70)
 86
 87 # Group examples by category
 88 categories = {}
 89 for text, label in training_data:
 90     if label not in categories:
 91         categories[label] = []
 92     categories[label].append(text)
 93
 94 print("\nEncoding training examples...")
 95
 96 # Build prototype for each category
 97 class_prototypes = {}
 98 for label, texts in categories.items():
 99     # Encode all documents in this category
100     encoded_docs = [encoder.encode(text) for text in texts]
101
102     # Bundle to create class prototype
103     prototype = model.bundle(encoded_docs)
104     class_prototypes[label] = prototype
105
106     print(f"  {label:12s}: {len(texts)} examples → prototype")
107
108 print(f"\nClass prototypes created: {len(class_prototypes)}")
109
# ============================================================================
# Classification: Predict Category for New Documents
# ============================================================================
print("\n" + "=" * 70)
print("Classification: Testing on New Documents")
print("=" * 70)

# Each test document is stored next to its expected label so the two can
# never fall out of step (the previous parallel lists were joined by index).
test_cases = [
    ("basketball team defeats rivals final", "sports"),
    ("new laptop computer faster processor", "technology"),
    ("company profits increase stock price", "business"),
    ("patients recover hospital treatment", "health"),
]

# Kept as separate lists because later sections read test_documents.
test_documents = [doc for doc, _ in test_cases]
expected_labels = [expected for _, expected in test_cases]

print("\nClassifying test documents:")
print()

correct = 0
total = len(test_cases)

for i, (doc, expected) in enumerate(test_cases, 1):
    # Encode test document
    doc_hv = encoder.encode(doc)

    # Similarity of the document to every class prototype.
    similarities = {
        label: float(model.similarity(doc_hv, prototype))
        for label, prototype in class_prototypes.items()
    }

    # Nearest prototype wins; max() returns the first label on ties, which
    # matches a strict ">" scan over the prototypes in dict order.
    best_label = max(similarities, key=similarities.get)
    best_sim = similarities[best_label]

    is_correct = best_label == expected
    correct += int(is_correct)

    marker = "✓" if is_correct else "✗"
    print(f"{i}. \"{doc}\"")
    print(f"   Predicted: {best_label:12s} (similarity={best_sim:.3f}) {marker}")
    print(f"   Expected:  {expected:12s}")
    print()

# Guard against an empty test set instead of raising ZeroDivisionError.
accuracy = correct / total if total else 0.0
print(f"Accuracy: {correct}/{total} = {accuracy:.1%}")
159
# ============================================================================
# Analysis: Understanding Classification Decisions
# ============================================================================
print("\n" + "=" * 70)
print("Analysis: Classification Confidence")
print("=" * 70)

first_doc = test_documents[0]

print("\nDetailed similarity scores for first test doc:")
print(f"Document: '{first_doc}'")
print()

doc_hv = encoder.encode(first_doc)

# Table header: left-aligned category, right-aligned numeric columns.
print(f"{'Category':12s} | {'Similarity':>12s} | {'Confidence':>12s}")
print("-" * 45)

# One row per class, in alphabetical order of the class label.
for label in sorted(class_prototypes):
    sim = float(model.similarity(doc_hv, class_prototypes[label]))

    # Bucket the raw similarity into a coarse confidence label.
    if sim > 0.6:
        conf = "High"
    elif sim > 0.4:
        conf = "Medium"
    else:
        conf = "Low"

    print(f"{label:12s} | {sim:12.3f} | {conf:>12s}")

print("\nKey observation:")
print("  - Clear winner indicates confident classification")
print("  - Similar scores suggest ambiguous document")
print("  - Low scores across all classes suggest out-of-domain")
185
# ============================================================================
# Practical Considerations
# ============================================================================
# The section is pure console output, so it is kept as data: one entry per
# printed line, with "" standing for a blank line.
practical_lines = (
    "\n" + "=" * 70,
    "Practical Considerations",
    "=" * 70,
    "\n✓ Advantages of HDC Text Classification:",
    "  - Fast training: just bundle examples per class",
    "  - One-shot learning: works with few examples",
    "  - Interpretable: similarity scores show confidence",
    "  - Noise tolerant: robust to typos and variations",
    "  - No gradient descent: no hyperparameter tuning",
    "",
    "✗ Limitations:",
    "  - Approximate: not as accurate as deep learning (large data)",
    "  - Capacity: performance degrades with many classes",
    "  - Context: n-grams miss long-range dependencies",
    "  - Tuning: n-gram size affects performance",
    "",
    "When to use HDC text classification:",
    "  - Limited training data (< 100 examples per class)",
    "  - Fast deployment needed (no training time)",
    "  - Interpretability important (need similarity scores)",
    "  - Edge devices (low compute, memory constraints)",
    "  - Prototyping (quick baseline before deep learning)",
    "",
)

for practical_line in practical_lines:
    print(practical_line)
215
# ============================================================================
# Extension: Using ItemStore for Efficient Classification
# ============================================================================
print("=" * 70)
print("Extension: ItemStore for Multi-Class Classification")
print("=" * 70)

# Build ItemStore with class prototypes: one stored item per class label.
classifier = ItemStore(model)
for label, prototype in class_prototypes.items():
    classifier.add(label, prototype)

print(f"\nClassifier built with {len(class_prototypes)} classes")

# Classify with ItemStore
print("\nClassifying with ItemStore:")

test_doc = "scientist discovers new medical treatment"
test_hv = encoder.encode(test_doc)

# Ask for one result per stored class so the full ranking is shown even when
# the number of categories changes (this was a hard-coded k=4).
# The results are unpacked below as (label, similarity) pairs.
results = classifier.query(test_hv, k=len(class_prototypes))

print(f"\nDocument: '{test_doc}'")
print("\nTop predictions:")
for rank, (label, sim) in enumerate(results, 1):
    print(f"  {rank}. {label:12s}: {sim:.3f}")

print("\nKey observation:")
print("  - ItemStore enables efficient k-nearest class retrieval")
print("  - Can examine top-k predictions for confidence")
print("  - Scales well to many classes (1000+)")
248
# ============================================================================
# Summary
# ============================================================================
# Closing recap, kept as data: one entry per printed line, with "" standing
# for a blank line.
summary_lines = (
    "\n" + "=" * 70,
    "Summary: Text Classification with HDC",
    "=" * 70,
    "",
    "Complete workflow:",
    "  1. Setup: Create model + NGramEncoder",
    "  2. Training: Encode examples + bundle per class",
    "  3. Classification: Encode test doc + find nearest prototype",
    "  4. Evaluation: Check similarity scores for confidence",
    "",
    "Performance tips:",
    "  - N-gram size: 2-3 for words, 3-5 for characters",
    "  - More training examples → better prototypes",
    "  - Bundle diverse examples to capture class variation",
    "  - Use ItemStore for efficient multi-class problems",
    "",
    "Next steps:",
    "  → Try with your own text dataset",
    "  → Experiment with n-gram sizes (n=2, 3, 4)",
    "  → Combine with 25_app_integration_patterns.py for multimodal",
    "  → Scale to larger datasets with ItemStore",
    "",
    "=" * 70,
)

for summary_line in summary_lines:
    print(summary_line)

Gallery generated by Sphinx-Gallery