Word2Vec
Word2Vec, introduced by Mikolov et al. in 2013, is a neural-network-based approach for learning word embeddings. It comes in two architectures: Continuous Bag of Words (CBOW) and Skip-gram.
CBOW vs Skip-gram
| Aspect | CBOW | Skip-gram |
|---|---|---|
| Input | Context words | Center word |
| Output | Center word | Context words |
| Speed | Faster | Slower |
| Rare words | Poor | Better |
| Semantic | Worse | Better |
| Syntactic | Slightly better | Slightly worse |
CBOW Implementation
import torch
import torch.nn as nn
class CBOW(nn.Module):
    """Continuous Bag of Words model.

    Predicts the center word from the average of its context-word embeddings.

    Args:
        vocab_size: Number of distinct tokens in the vocabulary.
        embedding_dim: Dimensionality of each word embedding.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, context):
        """Return unnormalized scores over the vocabulary for the center word.

        Args:
            context: Integer tensor of context-word indices, either batched
                ``(batch_size, context_size)`` or a single unbatched example
                ``(context_size,)``.

        Returns:
            Logits of shape ``(batch_size, vocab_size)`` for batched input,
            or ``(vocab_size,)`` for unbatched input.
        """
        embeds = self.embeddings(context)  # (..., context_size, embed_dim)
        # Average over the context axis, addressed from the end so that both
        # batched and unbatched inputs reduce over the same (context) axis.
        avg = embeds.mean(dim=-2)  # (..., embed_dim)
        return self.linear(avg)  # (..., vocab_size)
Skip-gram Implementation
class SkipGram(nn.Module):
    """Skip-gram model with separate center and context embedding tables.

    Args:
        vocab_size: Number of distinct tokens in the vocabulary.
        embedding_dim: Dimensionality of each word embedding.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # Embeddings used when a word acts as the center word.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Embeddings used when a word acts as a context (or negative) word.
        self.output_embeddings = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, center, context):
        """Return the dot-product score of each (center, context) pair.

        Args:
            center: Integer tensor of center-word indices, shape ``(batch,)``.
            context: Integer tensor of context-word indices, shape ``(batch,)``.

        Returns:
            Tensor of shape ``(batch,)`` holding one score per pair.
        """
        center_emb = self.embeddings(center)  # (batch, embed_dim)
        context_emb = self.output_embeddings(context)  # (batch, embed_dim)
        return torch.sum(center_emb * context_emb, dim=1)

    def negative_sampling_loss(self, center, context, negatives):
        """Return the mean negative-sampling loss over the batch.

        Args:
            center: Integer tensor of center-word indices, shape ``(batch,)``.
            context: Integer tensor of true context indices, shape ``(batch,)``.
            negatives: Integer tensor of sampled negative indices, shape
                ``(batch, num_negatives)``.

        Returns:
            Scalar tensor: the negated mean of
            ``logsigmoid(pos) + sum(logsigmoid(-neg))``.
        """
        center_emb = self.embeddings(center)  # (batch, embed_dim)
        context_emb = self.output_embeddings(context)  # (batch, embed_dim)
        neg_emb = self.output_embeddings(negatives)  # (batch, num_neg, embed_dim)

        pos_score = torch.sum(center_emb * context_emb, dim=1)  # (batch,)
        pos_loss = torch.nn.functional.logsigmoid(pos_score)

        # Squeeze ONLY the trailing singleton produced by bmm. A bare
        # squeeze() would also drop the batch or negatives axis when either
        # has size 1, which breaks the sum over dim=1 below.
        neg_score = torch.bmm(neg_emb, center_emb.unsqueeze(2)).squeeze(2)  # (batch, num_neg)
        neg_loss = torch.nn.functional.logsigmoid(-neg_score).sum(dim=1)  # (batch,)

        return -(pos_loss + neg_loss).mean()
Skip-gram with Negative Sampling
Negative Sampling
Instead of computing the full softmax over the entire vocabulary, negative sampling approximates it by sampling a small number of negative examples.
import numpy as np
from collections import Counter
class SkipGramTrainer:
    """Builds the vocabulary, training pairs and negative samples for Skip-gram.

    Args:
        corpus: Iterable of sentences, each an iterable of word tokens.
        embedding_dim: Embedding size; stored on the instance for consumers
            that build the model.
        window: Maximum distance between a center word and a context word.
        neg_samples: Number of negative samples drawn per training pair.
        noise_power: Exponent applied to raw word counts when building the
            noise distribution (0.75 by default).
    """

    def __init__(self, corpus, embedding_dim=100, window=5, neg_samples=5,
                 noise_power=0.75):
        # Keep embedding_dim instead of silently discarding it.
        self.embedding_dim = embedding_dim
        self.window = window
        self.neg_samples = neg_samples

        # Build vocabulary: indices follow first-occurrence order.
        word_counts = Counter(w for sent in corpus for w in sent)
        self.vocab = {w: i for i, w in enumerate(word_counts)}
        self.vocab_size = len(self.vocab)

        # Noise distribution for negative sampling (freq ** noise_power).
        # Counter preserves insertion order, so position i in `freqs`
        # corresponds to vocabulary index i.
        freqs = np.array(list(word_counts.values()), dtype=np.float64)
        freqs = freqs ** noise_power
        self.noise_dist = freqs / freqs.sum()

    def get_training_pairs(self, corpus):
        """Return every (center_idx, context_idx) pair within the window.

        Args:
            corpus: Iterable of sentences (sequences of tokens). Every token
                must be present in ``self.vocab``.

        Returns:
            List of ``(center_idx, context_idx)`` tuples.

        Raises:
            KeyError: If a token is not in the vocabulary.
        """
        pairs = []
        for sent in corpus:
            for i, center in enumerate(sent):
                center_idx = self.vocab[center]
                start = max(0, i - self.window)
                stop = min(len(sent), i + self.window + 1)
                for j in range(start, stop):
                    if j != i:  # a word is not its own context
                        pairs.append((center_idx, self.vocab[sent[j]]))
        return pairs

    def get_negative_samples(self, batch_size):
        """Draw negative word indices from the noise distribution.

        Args:
            batch_size: Number of rows of negatives to draw.

        Returns:
            Integer array of shape ``(batch_size, self.neg_samples)``.
        """
        return np.random.choice(
            self.vocab_size,
            size=(batch_size, self.neg_samples),
            p=self.noise_dist,
        )
Training Word2Vec with Gensim
from gensim.models import Word2Vec
# Toy corpus: each sentence is already tokenized into a list of words.
sentences = [
    ["the", "cat", "sat", "on", "the", "mat"],
    ["the", "dog", "sat", "on", "the", "log"],
    ["cats", "and", "dogs", "are", "friends"],
]

# Train both architectures with identical hyperparameters; only `sg` differs
# (0 = CBOW, 1 = Skip-gram). The generator is consumed left to right, so the
# CBOW model is trained first, then the Skip-gram model.
cbow_model, sg_model = (
    Word2Vec(
        sentences,
        vector_size=100,
        window=5,
        min_count=1,
        sg=architecture,
        epochs=100,
    )
    for architecture in (0, 1)
)

# Show the learned CBOW vector for "cat".
print(cbow_model.wv['cat'])
Hyperparameters
| Parameter | Typical Range | Effect |
|---|---|---|
| embedding_dim | 100-300 | Higher = more expressive |
| window | 5-10 | Larger = more context |
| min_count | 5-10 | Ignore rare words |
| epochs | 5-20 | More training iterations |
| negative_samples | 5-20 | More negatives = better gradient estimate, but slower training |