Machine Translation

Machine translation (MT) automatically translates text from one language to another. Modern neural machine translation (NMT) uses encoder-decoder Transformers with attention.

Seq2Seq Architecture

Definition: Seq2Seq Model

The encoder processes the source sentence, and the decoder generates the target sentence token by token.

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

# MarianMT for translation
# The tokenizer and the seq2seq model are loaded from the same checkpoint
# name; `translate` below reads both as module-level globals.
model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def translate(text):
    """Translate ``text`` with the module-level ``model`` and ``tokenizer``.

    The input is tokenized into PyTorch tensors, passed to
    ``model.generate``, and the first generated sequence is decoded back to
    a string with special tokens removed.
    """
    encoded = tokenizer(text, padding=True, return_tensors="pt")
    generated_ids = model.generate(**encoded)
    first_sequence = generated_ids[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Example call; the line below shows the expected kind of output.
print(translate("Hello, how are you?"))
# "Bonjour, comment allez-vous?"

BLEU Score

BLEU (Bilingual Evaluation Understudy) measures translation quality by comparing n-gram overlap with reference translations.

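Definition: BLEU Score

$$\mathrm{BLEU} = \mathrm{BP} \cdot \exp\left(\sum_{n=1}^{N} w_n \log p_n\right)$$

Definition: Modified N-gram Precision

$$p_n = \frac{\sum_{\text{n-gram} \in \text{candidate}} \mathrm{Count}_{\mathrm{clip}}(\text{n-gram})}{\sum_{\text{n-gram} \in \text{candidate}} \mathrm{Count}(\text{n-gram})}$$

Definition: Brevity Penalty

$$\mathrm{BP} = \begin{cases} 1 & \text{if } c > r \\ e^{1 - r/c} & \text{if } c \le r \end{cases}$$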

Where:

p_n: Modified n-gram precision
w_n: Weight for n-gram (typically 1/N)
c: Candidate translation length
r: Reference translation length

from nltk.translate.bleu_score import (
    corpus_bleu,
    SmoothingFunction,
    sentence_bleu
)

# Single sentence BLEU
# `reference` is a list of tokenized reference translations (one here);
# `candidate` is a single tokenized hypothesis.
reference = [['the', 'cat', 'is', 'on', 'the', 'mat']]
candidate = ['the', 'cat', 'sat', 'on', 'the', 'mat']

# Smoothed BLEU (handles zero counts)
# The candidate shares no 4-gram with the reference, so without smoothing the
# 4-gram precision would be zero and drag the whole score down to (near) zero.
smooth = SmoothingFunction().method1
score = sentence_bleu(reference, candidate, smoothing_function=smooth)
print(f"BLEU: {score:.4f}")  # ~0.2541 (precisions 5/6, 3/5, 1/4, smoothed 0.1/3; BP = 1)

# Corpus BLEU
# `references` holds, for each candidate, a list of tokenized references
# (one reference per candidate here); `candidates` holds the tokenized
# hypotheses in the same order.
references = [
    [['the', 'cat', 'is', 'on', 'the', 'mat']],
    [['there', 'is', 'a', 'cat', 'on', 'the', 'mat']]
]
candidates = [
    ['the', 'cat', 'is', 'on', 'the', 'mat'],
    ['there', 'is', 'a', 'cat', 'on', 'the', 'mat']
]

# Every candidate is identical to its reference, so this prints 1.0000.
corpus_score = corpus_bleu(references, candidates)
print(f"Corpus BLEU: {corpus_score:.4f}")

Translation Evaluation Metrics

Metric	Type	Measures	Range
BLEU	Precision	N-gram overlap	0-100 (sacreBLEU scale; NLTK reports 0-1)
METEOR	Balance	Precision + recall + alignment	0-1
TER	Error	Edit distance / reference length	0-∞
CHRF	Character	Character n-gram F1	0-100
COMET	Neural	Learned metric	-1 to 1
BERTScore	Semantic	Embedding similarity	0-1

import sacrebleu

# BLEU with sacrebleu
# Plain sentence strings are passed here, not pre-tokenized lists as in the
# NLTK examples above.
# NOTE: `references` and `candidate` rebind names used by those NLTK examples.
references = ["Le chat est sur le tapis."]
candidate = "Le chat est sur le tapis."

# One hypothesis, and one reference stream containing one reference for it.
bleu = sacrebleu.corpus_bleu([candidate], [references])
print(f"BLEU: {bleu.score:.2f}")
# NOTE(review): this prints str(bleu), which looks like the formatted score
# summary rather than the metric signature - confirm the "Signature" label.
print(f"Signature: {bleu}")

# chrF (character n-gram F-score)
chrf = sacrebleu.corpus_chrf([candidate], [references])
print(f"chrF: {chrf.score:.2f}")

# TER (Translation Edit Rate)
ter = sacrebleu.corpus_ter([candidate], [references])
print(f"TER: {ter.score:.2f}")

Beam Search for Translation

def translate_with_beam(source_text, model, tokenizer, num_beams=5):
    """Translate ``source_text`` using ``model.generate`` with beam search.

    Args:
        source_text: Text handed to ``tokenizer``.
        model: Object exposing ``generate(**kwargs)``.
        tokenizer: Callable that encodes text and exposes ``decode``.
        num_beams: Beam width forwarded to ``generate``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    # Fixed decoding settings; only the beam width is caller-controlled.
    generation_options = {
        "num_beams": num_beams,
        "max_length": 128,
        "length_penalty": 0.6,
        "early_stopping": True,
        "no_repeat_ngram_size": 3,
    }

    encoded = tokenizer(source_text, return_tensors="pt", padding=True)
    generated = model.generate(**encoded, **generation_options)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Compare beam sizes
# Translates the same sentence once per beam width, using the module-level
# `model` and `tokenizer` loaded earlier in this file.
for beams in [1, 3, 5, 10]:
    result = translate_with_beam("Hello world!", model, tokenizer, num_beams=beams)
    print(f"Beams={beams}: {result}")

Quality Estimation

Quality estimation (QE) predicts translation quality without references.

from transformers import AutoModelForSequenceClassification

class TranslationQE:
    """Score a (source, translation) pair with a sequence-classification model.

    The model and tokenizer are both loaded from ``model_name``.

    NOTE(review): the default checkpoint ``Unbabel/wmt22-comet-da`` appears to
    be published for the ``unbabel-comet`` package (and as a reference-based
    metric) rather than as a ``transformers`` sequence-classification model -
    confirm it loads here, or pass a compatible ``model_name``.
    """

    def __init__(self, model_name="Unbabel/wmt22-comet-da"):
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def score(self, source, translation, reference=None):
        """Return the sigmoid of the model's logit for the sentence pair.

        Args:
            source: Source-language sentence.
            translation: Candidate translation of ``source``.
            reference: Accepted for interface compatibility; not used.

        Returns:
            A Python float in (0, 1). ``.item()`` requires the model to emit
            exactly one logit for the pair.
        """
        # torch is not imported anywhere in the visible part of this file, so
        # it is imported here to keep this method from raising NameError.
        import torch

        inputs = self.tokenizer(
            source,
            translation,
            return_tensors="pt",
            truncation=True
        )
        # Inference only: skip gradient tracking.
        with torch.no_grad():
            outputs = self.model(**inputs)
            score = torch.sigmoid(outputs.logits).item()
        return score

# Build the scorer with its default checkpoint and score one
# English -> French pair; no reference translation is passed.
# NOTE: `score` rebinds the name used by the BLEU example earlier in this file.
qe = TranslationQE()
score = qe.score(
    "The weather is nice today.",
    "Le temps est beau aujourd'hui."
)
print(f"Quality Score: {score:.4f}")

BLEU Score Calculation

BLEU was originally designed for machine translation but has been adapted for text summarization, image captioning, and other generation tasks. It correlates well with human judgment at the corpus level but not for individual sentences.

Back-Translation

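Back-translation generates synthetic parallel data from monolingual corpora: monolingual text is machine-translated into the other language, and each machine-translated sentence is paired with the original to form a training example.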

def back_translation(
    monolingual_data,
    en_to_fr_model,
    fr_to_en_model,
    en_to_fr_tokenizer=None,
    fr_to_en_tokenizer=None,
):
    """Round-trip each text through French and pair it with the result.

    Args:
        monolingual_data: Iterable of source-language (English) texts.
        en_to_fr_model: Model exposing ``generate`` for English -> French.
        fr_to_en_model: Model exposing ``generate`` for French -> English.
        en_to_fr_tokenizer: Tokenizer for ``en_to_fr_model``. When omitted it
            is loaded from ``en_to_fr_model.config.name_or_path``.
        fr_to_en_tokenizer: Tokenizer for ``fr_to_en_model``. When omitted it
            is loaded from ``fr_to_en_model.config.name_or_path``.

    Returns:
        A list of ``(original_text, round_trip_text)`` tuples, one per input.

    NOTE(review): both elements of each pair are in the source language (the
    original and its round trip through French). If cross-lingual training
    pairs are wanted, pair ``fr_text`` with ``text`` instead - confirm intent.
    """

    def _resolve_tokenizer(mt_model, mt_tokenizer):
        # A tokenizer has to match the model it feeds, so when the caller
        # supplies none, load the one stored with the model's checkpoint.
        if mt_tokenizer is not None:
            return mt_tokenizer
        return AutoTokenizer.from_pretrained(mt_model.config.name_or_path)

    def _run(text, mt_model, mt_tokenizer):
        # Tokenize, generate, and decode the first generated sequence.
        inputs = mt_tokenizer(text, return_tensors="pt", padding=True)
        generated = mt_model.generate(**inputs)
        return mt_tokenizer.decode(generated[0], skip_special_tokens=True)

    # Resolve once, outside the loop, so tokenizers are not reloaded per text.
    en_to_fr_tokenizer = _resolve_tokenizer(en_to_fr_model, en_to_fr_tokenizer)
    fr_to_en_tokenizer = _resolve_tokenizer(fr_to_en_model, fr_to_en_tokenizer)

    synthetic_pairs = []
    for text in monolingual_data:
        # Translate to target language
        fr_text = _run(text, en_to_fr_model, en_to_fr_tokenizer)

        # Translate back to source
        en_text = _run(fr_text, fr_to_en_model, fr_to_en_tokenizer)

        # Use original as source, back-translated as target
        synthetic_pairs.append((text, en_text))

    return synthetic_pairs

Machine Translation