Topic Modeling
Topic modeling automatically discovers hidden themes in large text collections. Instead of reading every document, algorithms identify the underlying topics and which documents belong to each — essential for content organization, trend analysis, and understanding customer feedback.
LDA Topic Modeling Process
Why Topic Modeling Matters
A company receives 10,000 customer reviews monthly. Reading them all is impossible. Topic modeling reveals the main themes — quality issues, pricing concerns, feature requests — enabling data-driven decisions at scale.
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from sklearn.metrics import silhouette_score, calinski_harabasz_score
import warnings
warnings.filterwarnings('ignore')
Generate Sample Corpus
# Seed the global NumPy RNG so the noise words and the shuffle are reproducible.
np.random.seed(42)

# Five short keyword "documents" per theme; the dict key is the ground-truth label.
topics = {
    'technology': [
        'artificial intelligence machine learning deep learning neural network',
        'cloud computing infrastructure server deployment kubernetes',
        'software development programming python javascript framework',
        'data science analytics big data processing pipeline',
        'cybersecurity encryption privacy protection threat detection',
    ],
    'healthcare': [
        'patient treatment diagnosis clinical trial medical research',
        'drug development pharmaceutical biotech FDA approval',
        'hospital care nurse doctor surgery recovery',
        'mental health therapy counseling psychology wellness',
        'health insurance coverage policy premium deductible',
    ],
    'finance': [
        'stock market investment portfolio dividend return',
        'cryptocurrency bitcoin blockchain ethereum trading',
        'banking loan mortgage interest rate credit score',
        'tax planning accounting audit compliance regulation',
        'financial planning retirement savings investment strategy',
    ],
}

# Build (text, label) pairs, appending five randomly drawn filler words to each text.
filler_words = ['the', 'and', 'is', 'for', 'with']
labelled_docs = []
for label, seed_texts in topics.items():
    for seed_text in seed_texts:
        filler = np.random.choice(filler_words, 5).tolist()
        labelled_docs.append((' '.join(seed_text.split() + filler), label))

# Shuffle texts and labels with one shared permutation so they stay aligned.
order = np.random.permutation(len(labelled_docs))
documents = [labelled_docs[i][0] for i in order]
doc_topics = [labelled_docs[i][1] for i in order]

print(f"Corpus: {len(documents)} documents across {len(topics)} topics")
Text Vectorization
# Both vectorizers use the same pruning rules: drop terms found in more than
# 95% of documents or in fewer than 2, cap the vocabulary at 1000 terms, and
# remove English stop words.
shared_vectorizer_params = {
    'max_df': 0.95,
    'min_df': 2,
    'max_features': 1000,
    'stop_words': 'english',
}

# Count vectorization for LDA
count_vec = CountVectorizer(**shared_vectorizer_params)
count_matrix = count_vec.fit_transform(documents)

# TF-IDF for NMF
tfidf_vec = TfidfVectorizer(**shared_vectorizer_params)
tfidf_matrix = tfidf_vec.fit_transform(documents)

print(f"Count matrix: {count_matrix.shape}")
print(f"TF-IDF matrix: {tfidf_matrix.shape}")
Latent Dirichlet Allocation (LDA)
# Fit a 3-topic LDA model on the raw term counts.
lda_params = {
    'n_components': 3,
    'max_iter': 50,
    'learning_method': 'online',
    'learning_offset': 50.,
    'random_state': 42,
    'n_jobs': -1,
}
lda = LatentDirichletAllocation(**lda_params)

# fit_transform returns the per-document topic distribution.
lda_output = lda.fit_transform(count_matrix)

# Vocabulary terms, indexed like the columns of lda.components_.
feature_names = count_vec.get_feature_names_out()
def display_topics(model, feature_names, n_top_words=10):
    """Print and return the highest-weighted words of every topic.

    Args:
        model: Fitted decomposition model exposing ``components_``, one row
            of word weights per topic.
        feature_names: Sequence mapping a column index of ``components_``
            to its word.
        n_top_words: Maximum number of words to report per topic.

    Returns:
        Dict mapping ``"Topic <index>"`` to that topic's words, ordered from
        highest to lowest weight.
    """
    summary = {}
    for idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it before taking the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        label = f"Topic {idx}"
        summary[label] = [feature_names[pos] for pos in ranked]
        print(f"{label}: {', '.join(summary[label])}")
    return summary
print("LDA Topics:")
topic_words = display_topics(lda, feature_names)

# Document-topic distribution: one row per document, one column per topic.
doc_topic_shape = lda_output.shape
first_doc_mix = lda_output[0].round(3)
print(f"\nDocument-topic shape: {doc_topic_shape}")
print(f"Topic distribution for first doc: {first_doc_mix}")
Non-Negative Matrix Factorization (NMF)
# Factorize the TF-IDF matrix into 3 non-negative topics.
nmf_params = {
    'n_components': 3,
    'init': 'nndsvd',
    'max_iter': 500,
    'random_state': 42,
    'alpha_W': 0.1,
    'alpha_H': 0.1,
}
nmf = NMF(**nmf_params)
nmf_output = nmf.fit_transform(tfidf_matrix)

print("NMF Topics:")
display_topics(nmf, tfidf_vec.get_feature_names_out())

# NMF reconstruction error: mean squared difference between the dense TF-IDF
# matrix and the product of the two learned factors.
reconstruction = nmf_output @ nmf.components_
residual = tfidf_matrix.toarray() - reconstruction
error = np.square(residual).mean()
print(f"\nNMF reconstruction error: {error:.6f}")
Latent Semantic Analysis (LSA)
from sklearn.metrics.pairwise import cosine_similarity

# LSA via Truncated SVD on the TF-IDF matrix.
svd = TruncatedSVD(n_components=3, random_state=42)
lsa_output = svd.fit_transform(tfidf_matrix)

variance_ratios = svd.explained_variance_ratio_
print(f"LSA explained variance: {variance_ratios.sum():.3f}")
print(f"Per-component: {variance_ratios.round(3)}")

# Topic similarity: pairwise cosine similarity between the SVD component vectors.
topic_sim = cosine_similarity(svd.components_)
print("\nTopic similarity matrix:")
print(topic_sim.round(3))
BERTopic: Neural Topic Modeling
# BERTopic uses transformers for embeddings, then UMAP + HDBSCAN.
# Only the optional imports are guarded: an ImportError raised later, while
# the model is running, should surface instead of being reported as
# "package not installed".
try:
    from bertopic import BERTopic
    from sentence_transformers import SentenceTransformer
except ImportError:
    print("Install bertopic: pip install bertopic sentence-transformers")
    print("BERTopic uses neural embeddings for superior topic coherence")
else:
    # Use a smaller model for speed
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    topic_model = BERTopic(
        embedding_model=embedding_model,
        nr_topics=3,
        verbose=True,
    )

    # Use a dedicated name: `topics` is the ground-truth corpus dict defined
    # earlier in this file and must not be overwritten by the assignments.
    bertopic_topics, probs = topic_model.fit_transform(documents)
    bertopic_topics = list(bertopic_topics)

    # Label -1 marks outlier documents, so it is not counted as a topic.
    found_topics = set(bertopic_topics) - {-1}
    print(f"BERTopic found {len(found_topics)} topics")
    print(f"Outlier documents: {bertopic_topics.count(-1)}")

    # Get topic info
    topic_info = topic_model.get_topic_info()
    print(topic_info.head())
Dynamic Topic Modeling
Tracking how topics evolve over time.
class DynamicTopicModel:
    """Simple dynamic topic modeling via time-sliced LDA.

    A single vocabulary is learned from the whole corpus, then an independent
    LDA model is fitted on each consecutive time slice. Because every slice is
    vectorized with the same vocabulary, column ``i`` of every model's
    ``components_`` refers to the same word, which is what
    ``get_topic_evolution`` relies on when it decodes top words.
    """

    def __init__(self, n_topics=3):
        """Create an unfitted model.

        Args:
            n_topics: Number of LDA topics fitted in each time slice.
        """
        self.n_topics = n_topics
        self.vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
        self.time_slices = []

    def fit(self, documents, timestamps, n_slices=5):
        """Fit one LDA model per time slice.

        Args:
            documents: Raw text documents.
            timestamps: Sortable values paired positionally with
                ``documents``; unpaired trailing entries of the longer
                sequence are ignored.
            n_slices: Number of consecutive, equally sized time slices. The
                last slice also receives any remainder documents.

        Returns:
            The fitted model (``self``).

        Raises:
            ValueError: If ``n_slices`` is not positive or there are fewer
                documents than slices.
        """
        if n_slices < 1:
            raise ValueError(f"n_slices must be at least 1, got {n_slices}")

        # Sort by time
        sorted_docs = [d for _, d in sorted(zip(timestamps, documents))]
        if len(sorted_docs) < n_slices:
            raise ValueError(
                f"Need at least {n_slices} documents for {n_slices} slices, "
                f"got {len(sorted_docs)}"
            )

        # Learn the vocabulary once, on the full corpus. Re-fitting per slice
        # would give every slice different columns, and min_df/max_df pruning
        # on a tiny slice can leave no terms at all.
        self.vectorizer.fit(sorted_docs)

        # Split into time slices
        slice_size = len(sorted_docs) // n_slices
        self.time_slices = []
        for i in range(n_slices):
            start = i * slice_size
            end = start + slice_size if i < n_slices - 1 else len(sorted_docs)
            count_matrix = self.vectorizer.transform(sorted_docs[start:end])
            lda = LatentDirichletAllocation(
                n_components=self.n_topics, random_state=42, max_iter=30
            )
            lda.fit(count_matrix)
            self.time_slices.append(lda)
        return self

    def get_topic_evolution(self):
        """Get how topics change across time slices.

        Returns:
            DataFrame with one row per (time slice, topic) and the columns
            ``time_slice``, ``topic`` and ``top_words``; ``top_words`` is a
            comma-separated string of up to five words, highest weight first.
        """
        evolution = []
        feature_names = self.vectorizer.get_feature_names_out()
        for t, model in enumerate(self.time_slices):
            for k in range(self.n_topics):
                # argsort is ascending; reverse so the strongest word leads.
                top_indices = model.components_[k].argsort()[::-1][:5]
                top_words = [feature_names[i] for i in top_indices]
                evolution.append({
                    'time_slice': t,
                    'topic': k,
                    'top_words': ', '.join(top_words)
                })
        return pd.DataFrame(evolution)
# Create temporal documents: one timestamp per document, in corpus order.
# fit() pairs timestamps with documents positionally, so both sequences
# must have the same length.
timestamps = list(range(len(documents)))
dynamic_model = DynamicTopicModel(n_topics=3)
dynamic_model.fit(documents, timestamps)
evolution = dynamic_model.get_topic_evolution()
print("Topic Evolution:")
print(evolution.head(10))
Topic Model Evaluation
# Coherence score (simplified)
def compute_coherence(model, feature_names, texts, n_top_words=10):
    """Compute a simplified topic-coherence score from word co-occurrence.

    For each topic, every unordered pair of its top words is scored with
    ``log(c + 1)``, where ``c`` is the number of texts containing both words.
    A topic's coherence is the mean score over all of its top-word pairs, so
    pairs that never co-occur contribute 0 and still count toward the mean.
    The result is the mean coherence over all topics.

    Args:
        model: Fitted model exposing ``components_``, one row of word weights
            per topic.
        feature_names: Sequence mapping a column index of ``components_`` to
            its word.
        texts: Iterable of raw strings; each one is lower-cased and split on
            whitespace before counting.
        n_top_words: Number of highest-weighted words considered per topic.

    Returns:
        Mean coherence across topics as a float; 0.0 when the model has no
        topics. A topic with fewer than two top words scores 0.0.
    """
    from collections import Counter
    from itertools import combinations

    # Get top words per topic, highest weight first.
    topics_words = [
        [feature_names[i] for i in topic.argsort()[::-1][:n_top_words]]
        for topic in model.components_
    ]
    if not topics_words:
        return 0.0

    # Only pairs of top words are ever looked up, so every other token is
    # dropped before pairing instead of counting all pairs in every text.
    tracked_words = {word for words in topics_words for word in words}

    # Word co-occurrence in documents, keyed by the alphabetically sorted pair.
    pair_counts = Counter()
    for text in texts:
        present = sorted(tracked_words.intersection(text.lower().split()))
        pair_counts.update(combinations(present, 2))

    # Coherence per topic
    coherences = []
    for words in topics_words:
        pairs = list(combinations(words, 2))
        if not pairs:
            coherences.append(0.0)
            continue
        total = sum(
            np.log(pair_counts[tuple(sorted(pair))] + 1) for pair in pairs
        )
        coherences.append(total / len(pairs))
    return float(np.mean(coherences))
# Report how well the fitted LDA model explains the count matrix it was trained on.
print("Model Comparison:")
lda_perplexity = lda.perplexity(count_matrix)
print(f" LDA perplexity: {lda_perplexity:.1f}")
lda_log_likelihood = lda.score(count_matrix)
print(f" LDA log-likelihood: {lda_log_likelihood:.1f}")
Practical Tips
# Optimal number of topics
def find_optimal_topics(matrix, topic_range=range(2, 8)):
    """Fit one LDA model per candidate topic count and tabulate its fit.

    Args:
        matrix: Document-term matrix each candidate model is fitted and
            scored on.
        topic_range: Iterable of topic counts to try.

    Returns:
        DataFrame with one row per candidate and the columns ``n_topics``,
        ``perplexity`` and ``score``. The same table is printed without its
        index.
    """
    def _evaluate(n_topics):
        # Fit on the matrix, then score the model on that same matrix.
        candidate = LatentDirichletAllocation(
            n_components=n_topics, random_state=42, max_iter=30
        )
        candidate.fit(matrix)
        return {
            'n_topics': n_topics,
            'perplexity': candidate.perplexity(matrix),
            'score': candidate.score(matrix),
        }

    results_df = pd.DataFrame([_evaluate(n) for n in topic_range])
    print(results_df.to_string(index=False))
    return results_df
# results = find_optimal_topics(count_matrix)
Best Practices
- LDA for interpretability — probabilistic, well-understood
- NMF for speed — faster than LDA, often comparable quality
- BERTopic for quality — neural embeddings capture semantics
- Evaluate coherence — not just perplexity
- Preprocessing matters — lemmatization and stop words significantly affect results
- Domain-specific stopwords — add industry terms that don't carry meaning
Summary
Topic modeling discovers themes in text collections. LDA provides probabilistic topic assignments, NMF offers faster alternatives, and BERTopic uses neural embeddings for superior coherence. Choose based on corpus size, interpretability needs, and computational resources.