Word2Vec Deep Dive

Word2Vec

Word2Vec, introduced by Mikolov et al. in 2013, is a neural-network-based approach for learning word embeddings. It comes in two architectures: Continuous Bag of Words (CBOW) and Skip-gram.

CBOW vs Skip-gram

Aspect	CBOW	Skip-gram
Input	Context words	Center word
Output	Center word	Context words
Speed	Faster	Slower
Rare words	Poor	Better
Semantic	Worse	Better
Syntactic	Slightly better	Slightly worse

CBOW Implementation

import torch
import torch.nn as nn

class CBOW(nn.Module):
    """Continuous Bag of Words: score every vocabulary word from a context window.

    Args:
        vocab_size: Number of distinct tokens; sizes the embedding table and
            the output layer.
        embedding_dim: Width of each word vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_dim
        )
        self.linear = nn.Linear(
            in_features=embedding_dim, out_features=vocab_size
        )

    def forward(self, context):
        """Return unnormalized vocabulary scores for each context window.

        Args:
            context: Integer token indices of shape (batch_size, context_size).

        Returns:
            Tensor of shape (batch_size, vocab_size).
        """
        # (batch, context_size) -> (batch, context_size, embed_dim)
        context_vectors = self.embeddings(context)
        # Average over the window so each example becomes one vector:
        # (batch, embed_dim)
        pooled = torch.mean(context_vectors, dim=1)
        # Project the pooled vector onto the vocabulary: (batch, vocab_size)
        return self.linear(pooled)

Skip-gram Implementation

class SkipGram(nn.Module):
    """Skip-gram model with separate center-word and context-word tables.

    Args:
        vocab_size: Number of distinct tokens; sizes both embedding tables.
        embedding_dim: Width of each word vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # Input vectors (looked up for the center word).
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Output vectors (looked up for context words and negatives).
        self.output_embeddings = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, center, context):
        """Return the dot-product score of each (center, context) pair.

        Args:
            center: Integer token indices of shape (batch,).
            context: Integer token indices of shape (batch,).

        Returns:
            Tensor of shape (batch,) holding one score per pair.
        """
        center_emb = self.embeddings(center)           # (batch, embed_dim)
        context_emb = self.output_embeddings(context)  # (batch, embed_dim)
        score = torch.sum(center_emb * context_emb, dim=1)
        return score

    def negative_sampling_loss(self, center, context, negatives):
        """Return the mean negative-sampling loss over the batch.

        Args:
            center: Integer token indices of shape (batch,).
            context: Integer token indices of shape (batch,), the observed
                context word for each center word.
            negatives: Integer token indices of shape (batch, k), the sampled
                negative words for each center word.

        Returns:
            Scalar tensor: the negated sum of log-sigmoid terms for the
            positive pair and the k negatives, averaged over the batch.
        """
        center_emb = self.embeddings(center)            # (batch, embed_dim)
        context_emb = self.output_embeddings(context)   # (batch, embed_dim)
        neg_emb = self.output_embeddings(negatives)     # (batch, k, embed_dim)

        pos_score = torch.sum(center_emb * context_emb, dim=1)
        pos_loss = torch.nn.functional.logsigmoid(pos_score)

        # bmm yields (batch, k, 1). Squeeze ONLY that trailing dimension: a
        # bare squeeze() would also drop the batch or k dimension whenever it
        # has size 1, and the sum over dim=1 below would then fail.
        neg_score = torch.bmm(neg_emb, center_emb.unsqueeze(2)).squeeze(2)
        neg_loss = torch.nn.functional.logsigmoid(-neg_score).sum(dim=1)

        return -(pos_loss + neg_loss).mean()

Skip-gram with Negative Sampling

\log \sigma(v'_{w_O}{}^T v_{w_I}) + \sum_{i=1}^{k} \mathbb{E}_{w_i \sim P_n(w)}[\log \sigma(-v'_{w_i}{}^T v_{w_I})]

Negative Sampling

Instead of computing the full softmax over the entire vocabulary, negative sampling approximates it by sampling a small number of negative examples.

import numpy as np
from collections import Counter

class SkipGramTrainer:
    """Build the vocabulary, training pairs and negative samples for skip-gram.

    Args:
        corpus: Iterable of tokenized sentences, each a sequence of tokens.
        embedding_dim: Intended word-vector width; stored as
            ``self.embedding_dim`` so the caller can size the model from it.
        window: Maximum distance, in tokens, between a center word and a
            context word on either side.
        neg_samples: Number of negative words drawn per training example.
    """

    def __init__(self, corpus, embedding_dim=100, window=5, neg_samples=5):
        # Keep the requested width; it was previously accepted and dropped.
        self.embedding_dim = embedding_dim
        self.window = window
        self.neg_samples = neg_samples

        # Build vocabulary: index words in first-seen order.
        word_counts = Counter(w for sent in corpus for w in sent)
        self.vocab = {w: i for i, w in enumerate(word_counts)}
        self.vocab_size = len(self.vocab)

        # Noise distribution for negative sampling (freq^0.75). Counter keeps
        # insertion order, so position i here matches vocabulary index i.
        freqs = np.array(list(word_counts.values()), dtype=np.float64)
        freqs = freqs ** 0.75
        self.noise_dist = freqs / freqs.sum()

    def get_training_pairs(self, corpus):
        """Return every (center_idx, context_idx) pair within the window.

        Args:
            corpus: Iterable of tokenized sentences.

        Returns:
            List of ``(center_idx, context_idx)`` tuples of vocabulary indices.

        Raises:
            KeyError: If a token is not in the vocabulary built at
                construction time.
        """
        pairs = []
        for sent in corpus:
            for i, center in enumerate(sent):
                center_idx = self.vocab[center]
                # Clamp the window to the sentence boundaries.
                start = max(0, i - self.window)
                stop = min(len(sent), i + self.window + 1)
                for j in range(start, stop):
                    if i != j:  # a word is not its own context
                        context_idx = self.vocab[sent[j]]
                        pairs.append((center_idx, context_idx))
        return pairs

    def get_negative_samples(self, batch_size):
        """Draw negative word indices from the noise distribution.

        Args:
            batch_size: Number of training examples to draw negatives for.

        Returns:
            Integer array of shape ``(batch_size, neg_samples)`` holding
            vocabulary indices.
        """
        return np.random.choice(
            self.vocab_size, size=(batch_size, self.neg_samples),
            p=self.noise_dist
        )

Training Word2Vec with Gensim

from gensim.models import Word2Vec

# Toy corpus: one list of tokens per sentence
sentences = [
    ["the", "cat", "sat", "on", "the", "mat"],
    ["the", "dog", "sat", "on", "the", "log"],
    ["cats", "and", "dogs", "are", "friends"],
]

# Settings shared by both runs; only the architecture flag differs below.
# min_count=1 keeps every word, which this tiny corpus needs.
shared_params = dict(vector_size=100, window=5, min_count=1, epochs=100)

# CBOW: sg=0
cbow_model = Word2Vec(sentences, sg=0, **shared_params)

# Skip-gram: sg=1
sg_model = Word2Vec(sentences, sg=1, **shared_params)

# Learned vector for "cat" from the CBOW model
print(cbow_model.wv["cat"])

Hyperparameters

Parameter	Typical Range	Effect
embedding_dim	100-300	Higher = more expressive
window	5-10	Larger = more context
min_count	5-10	Ignore rare words
epochs	5-20	More training iterations
negative_samples	5-20	More negatives = better gradient estimate but slower training (2-5 is enough for large corpora)