Named Entity Linking

Named Entity Linking (NEL) identifies entities in text and links them to corresponding entries in a knowledge base. It combines entity recognition, disambiguation, and knowledge base lookup.

Entity Linking Stages

Stage	Input	Output	Example
NER	Text	Entity spans	"Apple" → ORG
Candidate Generation	Mention	Candidate entities	"Apple" → [Apple Inc., Apple Records]
Disambiguation	Candidates	Best entity	"Apple" → Apple Inc. (Q312)
Linking	Entity	KB ID	Apple Inc. → Q312

Named Entity Recognition (NER)

from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

class NERExtractor:
    """Extract named-entity spans from text with a token-classification model.

    The tokenizer and model are loaded once in ``__init__``; the model's
    ``id2label`` table is used to turn predicted class ids into BIO tags
    (``B-XXX`` / ``I-XXX`` / anything else).
    """

    def __init__(self, model_name="dslim/bert-base-NER"):
        """Load the tokenizer and token-classification model for *model_name*."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForTokenClassification.from_pretrained(model_name)
        self.label_map = self.model.config.id2label

    def extract_entities(self, text):
        """Return the entities found in *text*.

        Args:
            text: The raw input string. It is truncated to the model's
                maximum length by the tokenizer.

        Returns:
            A list of dicts, one per entity, in order of appearance, with keys
            ``"text"`` (the exact substring of *text*), ``"type"`` (the tag
            without its ``B-``/``I-`` prefix), and ``"start"`` / ``"end"``
            (character offsets into *text*, end-exclusive).

        Note:
            Character offsets come from the tokenizer's offset mapping, which
            the Transformers documentation describes as available on fast
            tokenizers only.
        """
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            return_offsets_mapping=True,
        )
        # The offsets are for span bookkeeping only; they must not be
        # forwarded to the model together with the other encoded inputs.
        offsets = inputs.pop("offset_mapping")[0].tolist()

        with torch.no_grad():
            outputs = self.model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=-1)

        labels = [self.label_map[p.item()] for p in predictions[0]]
        return self._group_entities(text, labels, offsets)

    @staticmethod
    def _group_entities(text, labels, offsets):
        """Merge per-token BIO labels into entity dicts using character offsets.

        Args:
            text: The original input string the offsets refer to.
            labels: One BIO tag per token.
            offsets: One ``(start, end)`` character span per token, aligned
                with *labels*.

        Returns:
            The list of entity dicts described in ``extract_entities``.
        """
        entities = []
        current_entity = None

        for label, (start, end) in zip(labels, offsets):
            if start == end:
                # Zero-width span: the token covers no characters of the
                # input, so it cannot start, extend or end an entity.
                continue

            if label.startswith("B-"):
                if current_entity:
                    entities.append(current_entity)
                current_entity = {
                    "text": "",
                    "type": label[2:],
                    "start": start,
                    "end": end,
                }
            elif label.startswith("I-") and current_entity:
                # Extend the open entity up to the end of this token.
                current_entity["end"] = end
            else:
                if current_entity:
                    entities.append(current_entity)
                    current_entity = None

        if current_entity:
            entities.append(current_entity)

        # Slice the surface form from the input so that sub-word pieces and
        # original spacing are reproduced exactly.
        for entity in entities:
            entity["text"] = text[entity["start"]:entity["end"]]

        return entities

# Demo: run the extractor on one sentence and print each entity with its type.
ner = NERExtractor()
sample_sentence = "Apple was founded by Steve Jobs in California."
entities = ner.extract_entities(sample_sentence)
for e in entities:
    line = f"{e['text']} [{e['type']}]"
    print(line)
# Output shown for this example:
# Apple [ORG]
# Steve Jobs [PER]
# California [LOC]

Entity Disambiguation

from transformers import AutoModel, AutoTokenizer
import torch

class EntityDisambiguator:
    """Pick the knowledge-base candidate whose embedding best matches a mention.

    Candidates are registered with ``add_candidate``; ``disambiguate`` embeds
    a mention together with its context and returns the candidate with the
    highest cosine similarity.
    """

    def __init__(self):
        """Load the sentence-embedding model and create empty candidate stores."""
        model_name = "sentence-transformers/all-MiniLM-L6-v2"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.candidate_embeddings = {}  # entity_id -> 1-D embedding tensor
        self.candidate_metadata = {}  # entity_id -> {"name", "description"}

    @staticmethod
    def _mean_pool(hidden_states, attention_mask):
        """Average token vectors over the positions where the mask is non-zero.

        Args:
            hidden_states: Tensor indexed as (batch, token, feature).
            attention_mask: Tensor indexed as (batch, token); zero marks
                positions to ignore.

        Returns:
            The pooled tensor with the leading batch dimension squeezed out
            when it has size 1.
        """
        mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        summed = (hidden_states * mask).sum(dim=1)
        # Guard against division by zero for an all-zero mask.
        counts = mask.sum(dim=1).clamp(min=1e-9)
        return (summed / counts).squeeze(0)

    def _embed(self, input_text):
        """Encode *input_text* into a single embedding vector.

        Mean pooling over the attention mask is used instead of the first
        token's vector because that is the pooling the model card for
        ``all-MiniLM-L6-v2`` prescribes for plain Transformers usage.
        """
        inputs = self.tokenizer(input_text, return_tensors="pt", truncation=True)

        with torch.no_grad():
            outputs = self.model(**inputs)

        return self._mean_pool(outputs.last_hidden_state, inputs["attention_mask"])

    def _best_match(self, mention_embedding):
        """Return ``(entity_id, score)`` for the most similar stored candidate.

        Returns ``(None, -1)`` when no candidates have been added. The first
        candidate is always accepted, so a similarity of exactly -1 can still
        be selected.
        """
        best_entity = None
        best_score = -1

        for entity_id, emb in self.candidate_embeddings.items():
            score = torch.nn.functional.cosine_similarity(
                mention_embedding.unsqueeze(0),
                emb.unsqueeze(0)
            ).item()

            if best_entity is None or score > best_score:
                best_score = score
                best_entity = entity_id

        return best_entity, best_score

    def add_candidate(self, entity_id, name, description):
        """Add a candidate entity to the knowledge base.

        Args:
            entity_id: Key under which the candidate is stored; an existing
                entry with the same id is overwritten.
            name: Entity name.
            description: Short description; it is embedded together with
                *name*.
        """
        self.candidate_embeddings[entity_id] = self._embed(f"{name} {description}")
        self.candidate_metadata[entity_id] = {
            "name": name,
            "description": description
        }

    def disambiguate(self, mention, context):
        """Find the best matching entity for a mention.

        Args:
            mention: The surface form to resolve.
            context: Surrounding text; it is embedded together with *mention*.

        Returns:
            A dict with ``"entity_id"`` (``None`` if there are no candidates),
            ``"score"`` (cosine similarity, ``-1`` if there are no
            candidates) and ``"metadata"`` (the stored name/description, or
            ``{}``).
        """
        mention_embedding = self._embed(f"{mention}: {context}")
        best_entity, best_score = self._best_match(mention_embedding)

        return {
            "entity_id": best_entity,
            "score": best_score,
            "metadata": self.candidate_metadata.get(best_entity, {})
        }

disambiguator = EntityDisambiguator()

# Register the two competing senses of "Apple" as (id, name, description).
apple_candidates = [
    (
        "Q312", "Apple Inc.",
        "American multinational technology company that designs consumer electronics",
    ),
    (
        "Q16504", "Apple Records",
        "British record label founded by The Beatles",
    ),
]
for candidate in apple_candidates:
    disambiguator.add_candidate(*candidate)

# Resolve the mention using the sentence it appears in as context.
mention_text = "Apple"
mention_context = "Steve Jobs co-founded Apple in 1976"
result = disambiguator.disambiguate(mention_text, mention_context)
print(result)
# Output shown for this example (the score is illustrative):
# {'entity_id': 'Q312', 'score': 0.92, 'metadata': {'name': 'Apple Inc.', ...}}

Knowledge Base Integration

import requests

class KnowledgeBaseLinker:
    """Thin client for a Wikidata-style ``api.php`` endpoint.

    Supports candidate search by name and detail lookup by entity id, with an
    in-memory cache for detail lookups.
    """

    def __init__(self, kb_api="https://www.wikidata.org/w/api.php", timeout=10):
        """Store the endpoint and create an empty cache.

        Args:
            kb_api: URL of the API endpoint.
            timeout: Seconds to wait for each HTTP request before giving up.
                Without a timeout a stalled connection would block forever.
        """
        self.api_url = kb_api
        self.timeout = timeout
        self.cache = {}  # entity_id -> entity dict from wbgetentities

    def _get_json(self, params):
        """GET the endpoint with *params* and return the decoded JSON body.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        response = requests.get(self.api_url, params=params, timeout=self.timeout)
        # Surface HTTP failures here rather than as a JSON decoding error.
        response.raise_for_status()
        return response.json()

    def search_entity(self, name, language="en"):
        """Search Wikidata for entity candidates.

        Args:
            name: Text to search for. A blank or empty value returns ``[]``
                without contacting the API.
            language: Language code sent as the ``language`` parameter.

        Returns:
            Up to five dicts with keys ``"id"``, ``"label"``,
            ``"description"`` and ``"url"``; missing optional fields are
            empty strings.
        """
        if not name or not name.strip():
            return []

        params = {
            "action": "wbsearchentities",
            "search": name,
            "language": language,
            "format": "json",
            "limit": 5
        }
        data = self._get_json(params)

        return [{
            "id": item["id"],
            "label": item.get("label", ""),
            "description": item.get("description", ""),
            "url": item.get("concepturi", "")
        } for item in data.get("search", [])]

    def get_entity_details(self, entity_id):
        """Get detailed information about an entity.

        Args:
            entity_id: Identifier sent as the ``ids`` parameter.

        Returns:
            The entity dict from the response's ``"entities"`` mapping, or
            ``{}`` if the response does not contain it. Non-empty results
            are cached; empty ones are not, so a later call can retry.
        """
        if entity_id in self.cache:
            return self.cache[entity_id]

        params = {
            "action": "wbgetentities",
            "ids": entity_id,
            "format": "json",
            "props": "labels|descriptions|claims"
        }
        data = self._get_json(params)

        entity = data.get("entities", {}).get(entity_id, {})
        if entity:
            self.cache[entity_id] = entity
        return entity

kb = KnowledgeBaseLinker()

# Look up candidates for an ambiguous name and print one summary line each.
search_term = "Python"
candidates = kb.search_entity(search_term)
for c in candidates:
    summary = f"{c['id']}: {c['label']} - {c['description']}"
    print(summary)
# Sample output (illustrative; real IDs and ordering come from the live API):
# Q28865: Python - snake genus
# Q282201: Python - programming language
# Q189786: Python - Monty Python

Evaluation Metrics

Metric	Description	Range
Entity Detection F1	NER performance	0-100
Linking Accuracy	Correct entity links	0-100
End-to-End F1	Full pipeline performance	0-100
Micro F1	Per-mention evaluation	0-100
Macro F1	Per-type evaluation	0-100

Entity Linking Accuracy

Comparison of Approaches

Approach	Method	Pros	Cons
Pipeline	NER → Disambiguation	Modular	Error propagation
Joint	End-to-end model	Better accuracy	Complex
Mention-based	Independent linking	Parallelizable	No context sharing
Document-level	Document context	Better disambiguation	Slower

Entity Disambiguation Example

Entity linking performance is typically measured on benchmark datasets such as AIDA-CoNLL, MSNBC, and AQUAINT. State-of-the-art systems achieve over 90% accuracy on these standard benchmarks.

Named Entity Linking

Named Entity Linking

Entity Linking Stages

Named Entity Recognition (NER)

Entity Disambiguation

Knowledge Base Integration

Evaluation Metrics

Entity Linking Accuracy

Comparison of Approaches

Entity Disambiguation Example

Premium Content

Need Expert NLP Help?