🎉 75% of content is free forever — Unlock Premium from $10/mo →
CW
Search courses…
💼 Services ℹ️ About ✉️ Contact View Pricing Plans from $10

Word2Vec Deep Dive

Neural NLP > Word2Vec 🟢 Free Lesson

Advertisement

Word2Vec

Word2Vec, introduced by Mikolov et al. in 2013, is a neural network-based approach to learn word embeddings. It comes in two architectures: Continuous Bag of Words (CBOW) and Skip-gram.

CBOW vs Skip-gram

| Aspect | CBOW | Skip-gram |
|---|---|---|
| Input | Context words | Center word |
| Output | Center word | Context words |
| Speed | Faster | Slower |
| Rare words | Poor | Better |
| Semantic | Better | Slightly worse |
| Syntactic | Slightly worse | Better |

CBOW Implementation

import torch
import torch.nn as nn

class CBOW(nn.Module):
    """Continuous Bag of Words model: predict the center word from its context.

    The context word embeddings are averaged and projected to one logit per
    vocabulary entry.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create the embedding table and the output projection.

        Args:
            vocab_size: Number of rows in the embedding table and number of
                output logits.
            embedding_dim: Width of each embedding vector.
        """
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, context):
        """Return vocabulary logits for a batch of context windows.

        Args:
            context: Index tensor laid out as (batch_size, context_size).

        Returns:
            Tensor of shape (batch_size, vocab_size) holding unnormalized
            scores.
        """
        # Look up every context word, then collapse the context axis by
        # averaging so each example is represented by a single vector.
        pooled = self.embeddings(context).mean(dim=1)
        return self.linear(pooled)

Skip-gram Implementation

class SkipGram(nn.Module):
    """Skip-gram model with separate input (center) and output (context) tables."""

    def __init__(self, vocab_size, embedding_dim):
        """Create the center-word and context-word embedding tables.

        Args:
            vocab_size: Number of rows in each embedding table.
            embedding_dim: Width of each embedding vector.
        """
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.output_embeddings = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, center, context):
        """Return the dot-product score of each (center, context) pair.

        Args:
            center: Index tensor of center words, shape (batch,).
            context: Index tensor of context words, shape (batch,).

        Returns:
            Tensor of shape (batch,) with one score per pair.
        """
        center_emb = self.embeddings(center)           # (batch, embed_dim)
        context_emb = self.output_embeddings(context)  # (batch, embed_dim)
        score = torch.sum(center_emb * context_emb, dim=1)
        return score

    def negative_sampling_loss(self, center, context, negatives):
        """Return the mean negative-sampling loss over the batch.

        Args:
            center: Index tensor of center words, shape (batch,).
            context: Index tensor of true context words, shape (batch,).
            negatives: Index tensor of sampled noise words, shape (batch, k).

        Returns:
            Scalar tensor: the negated mean of
            ``logsigmoid(pos_score) + sum_k logsigmoid(-neg_score)``.
        """
        center_emb = self.embeddings(center)            # (batch, embed_dim)
        context_emb = self.output_embeddings(context)   # (batch, embed_dim)
        neg_emb = self.output_embeddings(negatives)     # (batch, k, embed_dim)

        pos_score = torch.sum(center_emb * context_emb, dim=1)
        pos_loss = torch.nn.functional.logsigmoid(pos_score)

        # bmm yields (batch, k, 1). Squeeze ONLY the trailing axis: a bare
        # squeeze() would also drop the batch or k axis when either has size 1,
        # which makes the sum over dim=1 below fail.
        neg_score = torch.bmm(neg_emb, center_emb.unsqueeze(2)).squeeze(2)
        neg_loss = torch.nn.functional.logsigmoid(-neg_score).sum(dim=1)

        return -(pos_loss + neg_loss).mean()

Skip-gram with Negative Sampling

$$\log \sigma\left({v'_{w_O}}^{\top} v_{w_I}\right) + \sum_{i=1}^{k} \mathbb{E}_{w_i \sim P_n(w)}\left[\log \sigma\left(-{v'_{w_i}}^{\top} v_{w_I}\right)\right]$$

Negative Sampling

Instead of computing the full softmax over the entire vocabulary, negative sampling approximates it by sampling a small number of negative examples.

import numpy as np
from collections import Counter

class SkipGramTrainer:
    """Build the vocabulary, training pairs and negative samples for skip-gram.

    Attributes are set from a tokenized corpus (an iterable of sentences, each
    a sequence of word tokens).
    """

    def __init__(self, corpus, embedding_dim=100, window=5, neg_samples=5):
        """Index the corpus vocabulary and build the noise distribution.

        Args:
            corpus: Iterable of sentences; each sentence is a sequence of
                hashable tokens.
            embedding_dim: Embedding width to train with. Stored on the
                instance so it is no longer silently discarded.
            window: Maximum distance between a center word and a context word.
            neg_samples: Number of negative samples drawn per training pair.
        """
        self.embedding_dim = embedding_dim
        self.window = window
        self.neg_samples = neg_samples

        # Build vocabulary: ids follow first-occurrence order in the corpus.
        words = [w for sent in corpus for w in sent]
        word_counts = Counter(words)
        self.vocab = {w: i for i, w in enumerate(word_counts)}
        self.vocab_size = len(self.vocab)

        # Noise distribution for negative sampling: counts raised to 0.75 and
        # normalized. Counter preserves insertion order, so entry i belongs to
        # the word with id i.
        freqs = np.array(list(word_counts.values()), dtype=np.float64)
        freqs = freqs ** 0.75
        self.noise_dist = freqs / freqs.sum()

    def get_training_pairs(self, corpus):
        """Return every (center_idx, context_idx) pair within the window.

        Args:
            corpus: Iterable of sentences (indexable sequences of tokens).
                Every token must already be in ``self.vocab``.

        Returns:
            List of ``(center_idx, context_idx)`` tuples, ordered by sentence,
            then center position, then context position.

        Raises:
            KeyError: If a token is not in the vocabulary.
        """
        pairs = []
        for sent in corpus:
            for i, center in enumerate(sent):
                center_idx = self.vocab[center]
                # Clamp the window to the sentence boundaries.
                lo = max(0, i - self.window)
                hi = min(len(sent), i + self.window + 1)
                for j in range(lo, hi):
                    if i != j:
                        pairs.append((center_idx, self.vocab[sent[j]]))
        return pairs

    def get_negative_samples(self, batch_size):
        """Draw negative word ids from the noise distribution.

        Args:
            batch_size: Number of rows to sample.

        Returns:
            Integer array of shape ``(batch_size, self.neg_samples)``.
        """
        return np.random.choice(
            self.vocab_size,
            size=(batch_size, self.neg_samples),
            p=self.noise_dist,
        )

Training Word2Vec with Gensim

from gensim.models import Word2Vec

# Toy corpus: each sentence is a list of string tokens.
sentences = [
    ["the", "cat", "sat", "on", "the", "mat"],
    ["the", "dog", "sat", "on", "the", "log"],
    ["cats", "and", "dogs", "are", "friends"]
]

# CBOW model (sg=0 selects CBOW, sg=1 selects Skip-gram).
cbow_model = Word2Vec(
    sentences=sentences, sg=0,
    vector_size=100, window=5, min_count=1, epochs=100,
)

# Skip-gram model trained with the same hyperparameters.
sg_model = Word2Vec(
    sentences=sentences, sg=1,
    vector_size=100, window=5, min_count=1, epochs=100,
)

# Show the learned CBOW vector for the token "cat".
print(cbow_model.wv["cat"])

Hyperparameters

| Parameter | Typical Range | Effect |
|---|---|---|
| embedding_dim | 100-300 | Higher = more expressive |
| window | 5-10 | Larger = more context |
| min_count | 5-10 | Ignore rare words |
| epochs | 5-20 | More training iterations |
| negative_samples | 5-20 | More negatives = better gradient |
⭐

Premium Content

Word2Vec Deep Dive

Unlock this lesson and 900+ advanced tutorials with a Premium plan.

🎯 End-to-end Projects
💼 Interview Prep
📜 Certificates
🤝 Community Access

Already a member? Log in

Need Expert NLP Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement