Text Preprocessing
Text preprocessing is the critical first step in any NLP pipeline. Raw text is messy and inconsistent—preprocessing transforms it into a clean, standardized format suitable for downstream tasks. Proper preprocessing can significantly improve model performance.
1. Lowercasing
Converting all text to lowercase is the simplest normalization step. It reduces vocabulary size and treats words like "The" and "the" as identical.
# Lowercase a mixed-case sentence so differently cased words compare equal.
text = "The Quick Brown Fox JUMPS Over the Lazy Dog"
lowercase_text = text.lower()
print(lowercase_text)
# Output: "the quick brown fox jumps over the lazy dog"
When NOT to lowercase: Skip this step for tasks such as named entity recognition or sentiment analysis, where capitalization carries meaning (e.g., "GOOD" vs. "good").
2. HTML and Markup Removal
Web text often contains HTML tags, entities, and markup that must be stripped.
import re
from html import unescape
def remove_html(text):
    """Strip HTML markup from ``text`` and return plain, single-spaced text.

    Tags are deleted first, then character references (such as ``&amp;``)
    are decoded, and finally every run of whitespace is collapsed to one
    space with leading and trailing whitespace trimmed.
    """
    without_tags = re.sub(r'<[^>]+>', '', text)
    decoded = unescape(without_tags)
    collapsed = re.sub(r'\s+', ' ', decoded)
    return collapsed.strip()
# The sample mixes tags with an escaped entity (&amp;) so that both the
# tag-stripping step and the entity-decoding step are exercised.
html_text = "<p>Natural <b>Language</b> Processing &amp; AI</p>"
print(remove_html(html_text))
# Output: "Natural Language Processing & AI"
3. Noise Removal
Remove special characters, numbers, URLs, emails, and other non-textual content.
def remove_noise(text, remove_numbers=False, remove_urls=True):
    """Strip URLs, e-mail addresses and non-alphanumeric symbols from ``text``.

    Args:
        text: Input string.
        remove_numbers: When true, digit runs are deleted as well.
        remove_urls: When true, ``http(s)://`` and ``www.`` links are deleted
            before symbols are stripped.

    Returns:
        The cleaned string, with whitespace collapsed to single spaces and
        trimmed at both ends.
    """
    if remove_urls:
        # Require the scheme separator so ordinary words that merely start
        # with "http" (e.g. "httpd") are not mistaken for links.
        text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # E-mail addresses are dropped whole, before '@' and '.' get stripped.
    text = re.sub(r'\S+@\S+', '', text)
    # \w is Unicode-aware for str patterns, so accented and non-Latin letters
    # survive; the underscore is removed explicitly because \w includes it.
    text = re.sub(r'[^\w\s]|_', '', text)
    if remove_numbers:
        text = re.sub(r'\d+', '', text)
    return re.sub(r'\s+', ' ', text).strip()
4. Stopword Removal
Stopwords are common words (the, is, at, which) that appear frequently but carry little semantic meaning.
from nltk.corpus import stopwords
import nltk
# Fetch NLTK's 'stopwords' resource, which stopwords.words() reads below.
nltk.download('stopwords')
def remove_stopwords(text, language='english'):
    """Drop stopwords from whitespace-separated ``text``.

    Args:
        text: Input string; tokens are obtained with ``str.split()``.
        language: Name of the NLTK stopword list to use.

    Returns:
        The remaining tokens joined by single spaces, in their original case.
    """
    stop_words = set(stopwords.words(language))
    # Compare case-insensitively but keep each surviving token as written,
    # so stopword removal does not silently lowercase the rest of the text.
    kept = [token for token in text.split() if token.lower() not in stop_words]
    return ' '.join(kept)
# Filter the English stopwords out of a sample sentence.
text = "The quick brown fox jumps over the lazy dog"
print(remove_stopwords(text))
# Output: "quick brown fox jumps lazy dog"
Common Stopword Lists
| Language | Example Stopwords |
|---|---|
| English | the, is, at, which, on, a, an, and, or, but |
| Spanish | el, la, los, las, un, una, de, del, en |
| German | der, die, das, ein, eine, und, oder, aber |
| French | le, la, les, un, une, de, du, des, et |
5. Punctuation Removal
import string
def remove_punctuation(text):
    """Return ``text`` with every character in ``string.punctuation`` removed."""
    unwanted = string.punctuation
    return ''.join(ch for ch in text if ch not in unwanted)
# Strip ASCII punctuation from a sample sentence; note the apostrophe in
# "How's" is removed too.
text = "Hello, world! How's it going? Fine, thanks."
print(remove_punctuation(text))
# Output: "Hello world Hows it going Fine thanks"
6. Number Handling
import re
def normalize_numbers(text, replacement="[NUM]"):
    """Replace each standalone number in ``text`` with ``replacement``.

    Integers, decimals and comma-grouped figures (``3``, ``2.5``,
    ``1,250.50``) are each replaced by a single token. Digits embedded in a
    word (``abc123``) are left untouched because of the word boundaries.

    Args:
        text: Input string.
        replacement: Literal token substituted for every number.

    Returns:
        The text with numbers replaced.
    """
    number = (
        r'\b\d{1,3}(?:,\d{3})+(?:\.\d+)?\b'  # comma-grouped: 1,250 / 12,345,678.90
        r'|\b\d+(?:\.\d+)?\b'                # plain: 3 / 2.5
    )
    # A callable keeps ``replacement`` literal; passing the string directly
    # would make re.sub interpret backslashes and group references in it.
    return re.sub(number, lambda _match: replacement, text)
# Both the integer and the decimal are replaced by the default token.
text = "I have 3 cats and 2.5 pounds of cat food"
print(normalize_numbers(text))
# Output: "I have [NUM] cats and [NUM] pounds of cat food"
7. Whitespace Normalization
def normalize_whitespace(text):
    """Collapse every run of whitespace in ``text`` to one space and trim the ends."""
    # \s also covers form feeds, vertical tabs and Unicode spaces such as the
    # non-breaking space (common after decoding &nbsp;), which a pattern
    # limited to tab/newline/CR plus ' ' would leave in place.
    return re.sub(r'\s+', ' ', text).strip()
Complete Preprocessing Pipeline
import re
from html import unescape
from nltk.corpus import stopwords
import string
class TextPreprocessor:
    """Configurable, callable text-cleaning pipeline.

    Steps run in a fixed order: HTML stripping, lowercasing, punctuation
    removal, stopword removal, then whitespace normalization. Each optional
    step is toggled by the constructor flag of the same name.
    """

    # Built once for the class instead of on every call.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, lowercase=True, remove_html=True,
                 remove_stopwords=False, remove_punctuation=True):
        """Store the step flags and load the stopword set if it is needed.

        Args:
            lowercase: Lowercase the text.
            remove_html: Delete tags and decode character references.
            remove_stopwords: Drop English stopwords.
            remove_punctuation: Delete ``string.punctuation`` characters.
        """
        self.lowercase = lowercase
        self.remove_html = remove_html
        self.remove_stopwords = remove_stopwords
        self.remove_punctuation = remove_punctuation
        # The stopword corpus is only read when the step is enabled.
        self.stop_words = set(stopwords.words('english')) if remove_stopwords else set()

    def __call__(self, text):
        """Apply the enabled steps to ``text`` and return the cleaned string."""
        if self.remove_html:
            text = re.sub(r'<[^>]+>', '', text)
            text = unescape(text)
        if self.lowercase:
            text = text.lower()
        if self.remove_punctuation:
            text = text.translate(self._PUNCTUATION_TABLE)
        if self.remove_stopwords:
            # Match case-insensitively so capitalised stopwords are still
            # dropped when lowercase=False; surviving tokens keep their case.
            text = ' '.join(
                token for token in text.split()
                if token.lower() not in self.stop_words
            )
        # Always finish with single-spaced, trimmed output.
        return re.sub(r'\s+', ' ', text).strip()
# Run the full pipeline (HTML stripping, lowercasing, punctuation and
# stopword removal) on a small HTML fragment.
preprocessor = TextPreprocessor(lowercase=True, remove_stopwords=True)
sample = "<p>The <b>Quick</b> Brown Fox! It jumped over the lazy dog.</p>"
print(preprocessor(sample))
# Expected output, assuming NLTK's English list contains "the", "it" and
# "over": "quick brown fox jumped lazy dog"
Preprocessing Decisions
| Decision | Option A | Option B | When to Use |
|---|---|---|---|
| Lowercasing | Always lowercase | Keep original case | Lowercase for topic modeling; keep for NER |
| Stopwords | Remove all | Keep all | Remove for BoW; keep for sentiment |
| Numbers | Remove all | Replace with token | Replace with a token for financial text, where the presence of a number matters; remove for general text |
| Punctuation | Remove all | Keep sentence boundaries | Remove for clustering; keep for parsing |