Topic Modeling

Topic modeling automatically discovers hidden themes in large text collections. Instead of reading every document, algorithms identify the underlying topics and which documents belong to each – essential for content organization, trend analysis, and understanding customer feedback.

LDA Topic Modeling Process

Why Topic Modeling Matters

A company receives 10,000 customer reviews monthly. Reading them all is impossible. Topic modeling reveals the main themes – quality issues, pricing concerns, feature requests – enabling data-driven decisions at scale.

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from sklearn.metrics import silhouette_score, calinski_harabasz_score
import warnings
warnings.filterwarnings('ignore')

Generate Sample Corpus

# Fix the global NumPy seed so the noise words and the shuffle are reproducible
np.random.seed(42)

# Seed phrases for the synthetic corpus, grouped by ground-truth theme
topics = {
    'technology': [
        'artificial intelligence machine learning deep learning neural network',
        'cloud computing infrastructure server deployment kubernetes',
        'software development programming python javascript framework',
        'data science analytics big data processing pipeline',
        'cybersecurity encryption privacy protection threat detection'
    ],
    'healthcare': [
        'patient treatment diagnosis clinical trial medical research',
        'drug development pharmaceutical biotech FDA approval',
        'hospital care nurse doctor surgery recovery',
        'mental health therapy counseling psychology wellness',
        'health insurance coverage policy premium deductible'
    ],
    'finance': [
        'stock market investment portfolio dividend return',
        'cryptocurrency bitcoin blockchain ethereum trading',
        'banking loan mortgage interest rate credit score',
        'tax planning accounting audit compliance regulation',
        'financial planning retirement savings investment strategy'
    ]
}

# Build one document per seed phrase: the phrase followed by five randomly
# drawn filler words, and remember which theme each document came from.
documents, doc_topics = [], []
for topic_name, topic_docs in topics.items():
    for doc in topic_docs:
        filler = np.random.choice(['the', 'and', 'is', 'for', 'with'], 5)
        documents.append(' '.join([*doc.split(), *filler.tolist()]))
        doc_topics.append(topic_name)

# Apply one random permutation to both lists so documents and labels stay aligned
indices = np.random.permutation(len(documents))
documents = [documents[pos] for pos in indices]
doc_topics = [doc_topics[pos] for pos in indices]

print(f"Corpus: {len(documents)} documents across {len(topics)} topics")

Text Vectorization

# Settings shared by both vectorizers so the two matrices cover the same
# vocabulary. min_df=1 keeps every term: in the small sample corpus almost
# every content word occurs in a single document, so min_df=2 would prune
# nearly the whole vocabulary and leave most documents as all-zero rows.
# Raise min_df again when working with a realistically sized corpus.
vectorizer_kwargs = dict(
    max_df=0.95,
    min_df=1,
    max_features=1000,
    stop_words='english',
)

# Count vectorization for LDA (LDA models raw term counts)
count_vec = CountVectorizer(**vectorizer_kwargs)
count_matrix = count_vec.fit_transform(documents)

# TF-IDF for NMF
tfidf_vec = TfidfVectorizer(**vectorizer_kwargs)
tfidf_matrix = tfidf_vec.fit_transform(documents)

print(f"Count matrix: {count_matrix.shape}")
print(f"TF-IDF matrix: {tfidf_matrix.shape}")

Latent Dirichlet Allocation (LDA)

# Three-topic LDA fitted on the raw count matrix. The fixed random_state
# keeps the run reproducible; n_jobs=-1 lets scikit-learn use all cores.
lda = LatentDirichletAllocation(
    n_components=3,
    learning_method='online',
    learning_offset=50.0,
    max_iter=50,
    n_jobs=-1,
    random_state=42,
)

# Fit the model and obtain the per-document topic weights in one call
lda_output = lda.fit_transform(count_matrix)

# Vocabulary terms of the count vectorizer, used to label topic components
feature_names = count_vec.get_feature_names_out()

def display_topics(model, feature_names, n_top_words=10):
    """Print and return the highest-weighted words of every topic.

    Args:
        model: Fitted decomposition model exposing ``components_``, one row
            of term weights per topic.
        feature_names: Sequence mapping a column index to its term.
        n_top_words: Number of words to list per topic.

    Returns:
        Dict mapping ``"Topic <index>"`` to that topic's words, ordered from
        highest to lowest weight.
    """
    summary = {}
    for number, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it before taking the leading entries
        ranked = weights.argsort()[::-1][:n_top_words]
        terms = [feature_names[pos] for pos in ranked]
        label = f"Topic {number}"
        summary[label] = terms
        print(f"{label}: {', '.join(terms)}")
    return summary

# Show the top words of each LDA topic and keep them for later inspection
print("LDA Topics:")
topic_words = display_topics(lda, feature_names)

# Document-topic distribution: one row per document, one column per topic
print(f"\nDocument-topic shape: {lda_output.shape}")
print(f"Topic distribution for first doc: {np.round(lda_output[0], 3)}")

Non-Negative Matrix Factorization (NMF)

# Three-component NMF on the TF-IDF matrix, with NNDSVD initialisation and
# regularisation applied to both factor matrices
nmf = NMF(
    n_components=3,
    init='nndsvd',
    alpha_W=0.1,
    alpha_H=0.1,
    max_iter=500,
    random_state=42,
)

# W (documents x topics); H is available afterwards as nmf.components_
nmf_output = nmf.fit_transform(tfidf_matrix)

print("NMF Topics:")
display_topics(nmf, tfidf_vec.get_feature_names_out())

# Mean squared difference between the TF-IDF matrix and its W @ H approximation
reconstruction = nmf_output @ nmf.components_
error = np.square(tfidf_matrix.toarray() - reconstruction).mean()
print(f"\nNMF reconstruction error: {error:.6f}")

Latent Semantic Analysis (LSA)

# LSA: project the TF-IDF matrix onto three components with truncated SVD
svd = TruncatedSVD(random_state=42, n_components=3)
lsa_output = svd.fit_transform(tfidf_matrix)

# Share of variance captured in total and by each component
print(f"LSA explained variance: {np.sum(svd.explained_variance_ratio_):.3f}")
print(f"Per-component: {np.round(svd.explained_variance_ratio_, 3)}")

# Pairwise cosine similarity between the SVD component vectors
from sklearn.metrics.pairwise import cosine_similarity
topic_sim = cosine_similarity(svd.components_)
print("\nTopic similarity matrix:")
print(np.round(topic_sim, 3))

BERTopic: Neural Topic Modeling

# BERTopic uses transformers for embeddings, then UMAP + HDBSCAN.
# Only the optional imports are guarded, so a missing package prints the
# install hint while genuine modelling errors are not hidden.
try:
    from bertopic import BERTopic
    from sentence_transformers import SentenceTransformer
except ImportError:
    print("Install bertopic: pip install bertopic sentence-transformers")
    print("BERTopic uses neural embeddings for superior topic coherence")
else:
    # Use a smaller model for speed
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

    topic_model = BERTopic(
        embedding_model=embedding_model,
        nr_topics=3,
        verbose=True
    )

    # Distinct names keep the module-level `topics` corpus dictionary intact
    bert_topics, bert_probs = topic_model.fit_transform(documents)

    print(f"BERTopic found {len(set(bert_topics))} topics")
    # Topic id -1 marks outliers; counting by iteration works for lists and arrays
    print(f"Outlier documents: {sum(1 for topic_id in bert_topics if topic_id == -1)}")

    # Get topic info
    topic_info = topic_model.get_topic_info()
    print(topic_info.head())

Dynamic Topic Modeling

Dynamic topic modeling tracks how topics evolve over time.

class DynamicTopicModel:
    """Simple dynamic topic modeling via time-sliced LDA.

    Documents are ordered by timestamp, cut into consecutive slices, and one
    LDA model is fitted per slice. The vocabulary is learned once from the
    whole corpus so that every slice model shares the same feature columns;
    this is what makes the per-slice topic words comparable.
    """

    def __init__(self, n_topics=3):
        """Create an unfitted model with ``n_topics`` topics per time slice."""
        self.n_topics = n_topics
        self.vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
        # One fitted LDA model per time slice; populated by fit()
        self.time_slices = []

    def fit(self, documents, timestamps, n_slices=5):
        """Fit one LDA model on each of ``n_slices`` consecutive time slices.

        Args:
            documents: Iterable of raw text documents.
            timestamps: Sortable values, one per document. Pairs are formed
                with ``zip``, so surplus entries on either side are ignored.
            n_slices: Number of time slices; the last slice absorbs any
                remainder documents.

        Returns:
            self, to allow call chaining.

        Raises:
            ValueError: If ``n_slices`` is smaller than 1 or exceeds the
                number of documents.
        """
        # Order by timestamp only; the stable sort keeps input order on ties
        paired = sorted(zip(timestamps, documents), key=lambda pair: pair[0])
        sorted_docs = [doc for _, doc in paired]

        if n_slices < 1:
            raise ValueError(f"n_slices must be at least 1, got {n_slices}")
        if len(sorted_docs) < n_slices:
            raise ValueError(
                f"need at least {n_slices} documents for {n_slices} slices, "
                f"got {len(sorted_docs)}"
            )

        # Learn the vocabulary once on the full corpus. Refitting per slice
        # would give every slice different columns (so topic indices could
        # not be mapped back to words) and would apply min_df/max_df to a
        # handful of documents, which can leave no terms at all.
        corpus_counts = self.vectorizer.fit_transform(sorted_docs)

        slice_size = len(sorted_docs) // n_slices
        self.time_slices = []

        for i in range(n_slices):
            start = i * slice_size
            end = start + slice_size if i < n_slices - 1 else len(sorted_docs)

            lda = LatentDirichletAllocation(
                n_components=self.n_topics, random_state=42, max_iter=30
            )
            lda.fit(corpus_counts[start:end])
            self.time_slices.append(lda)

        return self

    def get_topic_evolution(self, n_top_words=5):
        """Return the top words of every topic in every time slice.

        Args:
            n_top_words: Number of words to list per topic.

        Returns:
            DataFrame with columns ``time_slice``, ``topic`` and
            ``top_words`` (comma-separated, highest weight first).

        Raises:
            RuntimeError: If no slice model has been fitted yet.
        """
        if not getattr(self, 'time_slices', None):
            raise RuntimeError("DynamicTopicModel is not fitted; call fit() first")

        feature_names = self.vectorizer.get_feature_names_out()
        evolution = []

        for t, model in enumerate(self.time_slices):
            for k, weights in enumerate(model.components_):
                # Descending weight order, matching display_topics
                ranked = weights.argsort()[::-1][:n_top_words]
                top_words = [feature_names[i] for i in ranked]
                evolution.append({
                    'time_slice': t,
                    'topic': k,
                    'top_words': ', '.join(top_words)
                })

        return pd.DataFrame(evolution)

# Create temporal documents: exactly one timestamp per document, so the
# pairing inside fit() never depends on zip() silently dropping extras
timestamps = list(range(len(documents)))
dynamic_model = DynamicTopicModel(n_topics=3)
dynamic_model.fit(documents, timestamps)
evolution = dynamic_model.get_topic_evolution()
print("Topic Evolution:")
print(evolution.head(10))

Topic Model Evaluation

# Coherence score (simplified)
def compute_coherence(model, feature_names, texts, n_top_words=10):
    """Score topics by how often their top words share a document.

    For every topic, each pair of its top words that co-occurs in at least
    one text contributes ``log(co-occurrence count + 1)``; the topic score
    is the average over those pairs. Texts are lower-cased and split on
    whitespace.

    Args:
        model: Fitted model exposing ``components_`` (one weight row per topic).
        feature_names: Sequence mapping a column index to its term.
        texts: Iterable of raw document strings.
        n_top_words: Number of top words considered per topic.

    Returns:
        Mean of the per-topic scores.
    """
    from collections import Counter

    # Highest-weighted terms of each topic, strongest first
    ranked_terms = [
        [feature_names[pos] for pos in weights.argsort()[::-1][:n_top_words]]
        for weights in model.components_
    ]

    # Number of documents in which each ordered pair of distinct words appears
    pair_freq = Counter(
        (first, second)
        for text in texts
        for vocab in [set(text.lower().split())]
        for first in vocab
        for second in vocab
        if first != second
    )

    per_topic = []
    for terms in ranked_terms:
        total = 0
        hits = 0
        for pos, left in enumerate(terms):
            for right in terms[pos + 1:]:
                seen = pair_freq.get((left, right))
                if seen is None:
                    continue
                total += np.log(seen + 1)
                hits += 1
        # The small epsilon avoids division by zero when no pair co-occurs
        per_topic.append(total / (hits + 1e-10))

    return np.mean(per_topic)

# Evaluate models: perplexity and log-likelihood exist only for LDA, while
# the co-occurrence coherence helper applies to any model with components_,
# so it is the figure that actually compares LDA with NMF.
print("Model Comparison:")
print(f"  LDA perplexity: {lda.perplexity(count_matrix):.1f}")
print(f"  LDA log-likelihood: {lda.score(count_matrix):.1f}")
print(f"  LDA coherence: {compute_coherence(lda, feature_names, documents):.3f}")
print(f"  NMF coherence: {compute_coherence(nmf, tfidf_vec.get_feature_names_out(), documents):.3f}")

Practical Tips

# Optimal number of topics
def find_optimal_topics(matrix, topic_range=range(2, 8)):
    """Fit one LDA model per candidate topic count and tabulate its fit.

    Args:
        matrix: Document-term matrix used both to fit and to score each model.
        topic_range: Iterable of topic counts to try.

    Returns:
        DataFrame with one row per candidate and the columns ``n_topics``,
        ``perplexity`` and ``score``. The table is also printed.
    """
    def _evaluate(n_topics):
        # Same seed and iteration budget for every candidate
        candidate = LatentDirichletAllocation(
            n_components=n_topics, random_state=42, max_iter=30
        )
        candidate.fit(matrix)
        return {
            'n_topics': n_topics,
            'perplexity': candidate.perplexity(matrix),
            'score': candidate.score(matrix),
        }

    results_df = pd.DataFrame([_evaluate(n_topics) for n_topics in topic_range])
    print(results_df.to_string(index=False))
    return results_df

# results = find_optimal_topics(count_matrix)

Best Practices

LDA for interpretability – probabilistic, well-understood
NMF for speed – faster than LDA, often comparable quality
BERTopic for quality – neural embeddings capture semantics
Evaluate coherence – not just perplexity
Preprocessing matters – lemmatization and stop words significantly affect results
Domain-specific stopwords – add industry terms that don't carry meaning

Summary

Topic modeling discovers themes in text collections. LDA provides probabilistic topic assignments, NMF offers a faster alternative of often comparable quality, and BERTopic uses neural embeddings for superior coherence. Choose based on corpus size, interpretability needs, and computational resources.