Information Extraction › Entity Linking and Knowledge Base Integration 🟢 Free Lesson
Advertisement
Named Entity Linking
Named Entity Linking (NEL) identifies entities in text and links them to corresponding entries in a knowledge base. It combines entity recognition, disambiguation, and knowledge base lookup.
Entity Linking Stages
Stage
Input
Output
Example
NER
Text
Entity spans
"Apple" → ORG
Candidate Generation
Mention
Candidate entities
"Apple" → [Apple Inc., Apple Records]
Disambiguation
Candidates
Best entity
"Apple" → Apple Inc. (Q312)
Linking
Entity
KB ID
Apple Inc. → Q312
Named Entity Recognition (NER)
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
class NERExtractor:
    """Extract named-entity spans from text with a token-classification model.

    Each returned entity is a dict with the keys ``text`` (the exact
    substring of the input), ``type`` (label without its ``B-``/``I-``
    prefix), and ``start``/``end`` (character offsets into the input,
    end-exclusive).
    """

    def __init__(self, model_name="dslim/bert-base-NER"):
        """Load the tokenizer and model identified by *model_name*."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForTokenClassification.from_pretrained(model_name)
        self.label_map = self.model.config.id2label

    def extract_entities(self, text):
        """Return the entities found in *text* as a list of dicts.

        Character offsets are requested from the tokenizer so that entity
        text can be sliced from the original string instead of being
        rebuilt from sub-word tokens.
        NOTE: ``return_offsets_mapping`` is only supported by "fast"
        tokenizers.
        """
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            return_offsets_mapping=True,
        )
        # The offsets are bookkeeping for us, not a model input.
        offsets = inputs.pop("offset_mapping")[0].tolist()
        with torch.no_grad():
            outputs = self.model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=-1)[0].tolist()
        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        labels = [self.label_map[p] for p in predictions]
        return self._group_entities(text, tokens, labels, offsets)

    @staticmethod
    def _group_entities(text, tokens, labels, offsets):
        """Merge per-token BIO labels into entity dicts.

        Args:
            text: The original input string.
            tokens: Token strings, one per model position.
            labels: BIO labels (``"B-ORG"``, ``"I-ORG"``, ``"O"``...), one per token.
            offsets: ``(start, end)`` character offsets, one pair per token.

        Returns:
            A list of ``{"text", "type", "start", "end"}`` dicts in order
            of appearance.
        """
        spans = []  # each item is a mutable [type, start, end]
        open_span = None
        for token, label, (start, end) in zip(tokens, labels, offsets):
            if start == end:
                # Zero-width offsets mark tokens that cover no input text
                # (e.g. [CLS]/[SEP]); they never belong to an entity.
                open_span = None
                continue
            if open_span is not None and token.startswith("##"):
                # A word-piece continuation always stays with the word it
                # continues, whatever label the model gave the fragment.
                open_span[2] = end
                continue
            prefix, _, entity_type = label.partition("-")
            if prefix == "I" and open_span is not None and open_span[0] == entity_type:
                open_span[2] = end
                continue
            open_span = None
            if prefix in ("B", "I") and entity_type:
                # An I- tag with no matching open entity starts a new one
                # rather than being dropped or glued onto a different type.
                open_span = [entity_type, start, end]
                spans.append(open_span)
        return [
            {"text": text[start:end], "type": entity_type, "start": start, "end": end}
            for entity_type, start, end in spans
        ]
# Demo: tag one sample sentence and list each entity with its type.
ner = NERExtractor()
entities = ner.extract_entities(
    "Apple was founded by Steve Jobs in California."
)
for entity in entities:
    print(f"{entity['text']} [{entity['type']}]")
# Expected output:
# Apple [ORG]
# Steve Jobs [PER]
# California [LOC]
Entity Disambiguation
from transformers import AutoModel, AutoTokenizer
import torch
class EntityDisambiguator:
    """Rank knowledge-base candidates against a mention by embedding similarity.

    Candidates are registered with :meth:`add_candidate`; :meth:`disambiguate`
    then returns the candidate whose embedding has the highest cosine
    similarity to the embedded mention-in-context.
    """

    def __init__(self):
        """Load the sentence-embedding model and start with no candidates."""
        self.tokenizer = AutoTokenizer.from_pretrained(
            "sentence-transformers/all-MiniLM-L6-v2"
        )
        self.model = AutoModel.from_pretrained(
            "sentence-transformers/all-MiniLM-L6-v2"
        )
        self.candidate_embeddings = {}
        self.candidate_metadata = {}

    @staticmethod
    def _mean_pool(last_hidden_state, attention_mask):
        """Average token vectors over the positions the attention mask keeps.

        The sentence-transformers MiniLM checkpoints are meant to be used
        with masked mean pooling; the first-token vector alone is not a
        trained sentence representation for them.
        """
        mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
        summed = (last_hidden_state * mask).sum(dim=1)
        # Clamp so an all-zero mask cannot cause a division by zero.
        counts = mask.sum(dim=1).clamp(min=1e-9)
        return (summed / counts).squeeze(0)

    def _embed(self, text):
        """Encode *text* into a single 1-D embedding tensor."""
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True)
        with torch.no_grad():
            outputs = self.model(**inputs)
        return self._mean_pool(outputs.last_hidden_state, inputs["attention_mask"])

    def add_candidate(self, entity_id, name, description):
        """Add a candidate entity to the knowledge base"""
        self.candidate_embeddings[entity_id] = self._embed(f"{name} {description}")
        self.candidate_metadata[entity_id] = {
            "name": name,
            "description": description
        }

    def disambiguate(self, mention, context):
        """Find the best matching entity for a mention

        Returns a dict with ``entity_id``, ``score`` (cosine similarity) and
        ``metadata``. With no registered candidates the result is
        ``{"entity_id": None, "score": -1, "metadata": {}}``.
        """
        if not self.candidate_embeddings:
            # Nothing to rank: skip the model call entirely.
            return {"entity_id": None, "score": -1, "metadata": {}}

        # Encode mention with context
        mention_embedding = self._embed(f"{mention}: {context}").unsqueeze(0)

        scores = {
            entity_id: torch.nn.functional.cosine_similarity(
                mention_embedding, emb.unsqueeze(0)
            ).item()
            for entity_id, emb in self.candidate_embeddings.items()
        }
        # max() keeps the first candidate on ties, i.e. registration order.
        best_entity = max(scores, key=scores.get)
        return {
            "entity_id": best_entity,
            "score": scores[best_entity],
            "metadata": self.candidate_metadata.get(best_entity, {})
        }
# Demo: register two entities that share the surface form "Apple", then
# resolve a mention using its sentence as context.
disambiguator = EntityDisambiguator()
# Add candidates
disambiguator.add_candidate(
    entity_id="Q312",
    name="Apple Inc.",
    description="American multinational technology company that designs consumer electronics",
)
disambiguator.add_candidate(
    entity_id="Q16504",
    name="Apple Records",
    description="British record label founded by The Beatles",
)
# Disambiguate
result = disambiguator.disambiguate(
    mention="Apple",
    context="Steve Jobs co-founded Apple in 1976",
)
print(result)
# {'entity_id': 'Q312', 'score': 0.92, 'metadata': {'name': 'Apple Inc.', ...}}
Knowledge Base Integration
import requests
class KnowledgeBaseLinker:
    """Thin client for the Wikidata search and entity-lookup API actions.

    Entity details are memoised per instance in ``self.cache``, keyed by
    entity ID.
    """

    def __init__(self, kb_api="https://www.wikidata.org/w/api.php", timeout=10.0):
        """Store the API endpoint and per-request timeout (in seconds)."""
        self.api_url = kb_api
        self.timeout = timeout
        self.cache = {}

    def _get_json(self, params):
        """GET the API with *params* and return the decoded JSON body.

        Raises:
            requests.RequestException: on connection failure, timeout, or
                a non-2xx HTTP status.
        """
        # Without a timeout requests waits indefinitely on a stalled server.
        response = requests.get(self.api_url, params=params, timeout=self.timeout)
        response.raise_for_status()
        return response.json()

    def search_entity(self, name, language="en"):
        """Search Wikidata for entity candidates

        Returns a list of ``{"id", "label", "description", "url"}`` dicts
        (at most 5); an empty list when *name* is blank or nothing matches.
        """
        if not name or not name.strip():
            # A blank query cannot match anything; skip the network call.
            return []
        data = self._get_json({
            "action": "wbsearchentities",
            "search": name,
            "language": language,
            "format": "json",
            "limit": 5,
        })
        return [
            {
                "id": item["id"],
                "label": item.get("label", ""),
                "description": item.get("description", ""),
                "url": item.get("concepturi", ""),
            }
            for item in data.get("search", [])
        ]

    def get_entity_details(self, entity_id):
        """Get detailed information about an entity

        Returns the entity record as a dict, or ``{}`` when the response
        does not contain *entity_id*.
        """
        if entity_id in self.cache:
            return self.cache[entity_id]
        data = self._get_json({
            "action": "wbgetentities",
            "ids": entity_id,
            "format": "json",
            "props": "labels|descriptions|claims",
        })
        entity = data.get("entities", {}).get(entity_id, {})
        if entity:
            # Only cache real hits so a failed lookup can be retried later.
            self.cache[entity_id] = entity
        return entity
# Demo: look up an ambiguous name and list every candidate Wikidata returns.
kb = KnowledgeBaseLinker()
# Search for candidates
candidates = kb.search_entity("Python")
for candidate in candidates:
    print(f"{candidate['id']}: {candidate['label']} - {candidate['description']}")
# Q28865: Python - snake genus
# Q282201: Python - programming language
# Q189786: Python - Monty Python
Evaluation Metrics
Metric
Description
Range
Entity Detection F1
NER performance
0-100
Linking Accuracy
Correct entity links
0-100
End-to-End F1
Full pipeline performance
0-100
Micro F1
Per-mention evaluation
0-100
Macro F1
Per-type evaluation
0-100
Entity Linking Accuracy
Comparison of Approaches
Approach
Method
Pros
Cons
Pipeline
NER → Disambiguation
Modular
Error propagation
Joint
End-to-end model
Better accuracy
Complex
Mention-based
Independent linking
Parallelizable
No context sharing
Document-level
Document context
Better disambiguation
Slower
Entity Disambiguation Example
Entity linking performance is typically measured on benchmark datasets such as AIDA-CoNLL, MSNBC, and AQUAINT. State-of-the-art systems achieve >90% accuracy on standard benchmarks.
β
Premium Content
Named Entity Linking
Unlock this lesson and 900+ advanced tutorials with a Premium plan.