Python Regular Expressions — Pattern Matching Mastery
Regular expressions describe search patterns in text. They are essential for validation, extraction, and text transformation.
Learning Objectives
- Use the `re` module for pattern matching
- Master quantifiers, groups, and anchors
- Apply lookahead and lookbehind assertions
- Solve real-world text processing problems
- Build patterns for validation and extraction
re Module Basics
import re
# Sample text shared by the search examples below.
text = "The price is $42.50 and $19.99"

# Find all numbers: an integer part with an optional fractional part
numbers = re.findall(r'\d+\.?\d*', text)
print(numbers)  # ['42.50', '19.99']

# Match and extract: search() scans the string and returns the first hit or None
match = re.search(r'\$(\d+\.?\d*)', text)
if match:
    print(match.group(0))  # $42.50 (full match)
    print(match.group(1))  # 42.50 (first group)

# Match at start: match() only succeeds at position 0, so guard against None
result = re.match(r'\d+', "123 abc")
if result:
    print(result.group())  # '123'

# Find all matches with positions
for m in re.finditer(r'\d+', text):
    print(f"Found '{m.group()}' at position {m.start()}-{m.end()}")

# Split by pattern: any run of whitespace is one separator
parts = re.split(r'\s+', "Hello World \t Python")
print(parts)  # ['Hello', 'World', 'Python']

# Substitute every run of digits
result = re.sub(r'\d+', 'X', "abc 123 def 456")
print(result)  # 'abc X def X'
Function Comparison
| Function | Returns | Use Case |
|---|---|---|
| `re.match()` | Match object or None | Check start of string |
| `re.search()` | Match object or None | Find first occurrence |
| `re.findall()` | List of strings (or tuples of groups) | Find all occurrences |
| `re.finditer()` | Iterator of match objects | Find all with positions |
| `re.sub()` | New string | Replace matches |
| `re.split()` | List of strings | Split by pattern |
Pattern Syntax
# Quick reference. Each bare raw-string literal below is a pattern fragment
# shown for illustration only; evaluating it has no effect.

# Quantifiers (greedy by default; append ? to make them lazy)
r'a+'  # One or more 'a'
r'a*'  # Zero or more 'a'
r'a?'  # Zero or one 'a' (optional)
r'a{3}'  # Exactly 3 'a's
r'a{2,4}'  # 2 to 4 'a's
r'a{3,}'  # 3 or more 'a's

# Character classes (ASCII sets shown; on str patterns these also match
# Unicode digits/letters/whitespace unless re.ASCII is set)
r'\d'  # Digit [0-9]
r'\D'  # Not a digit
r'\w'  # Word character [a-zA-Z0-9_]
r'\W'  # Not a word character
r'\s'  # Whitespace
r'\S'  # Not whitespace

# Character sets
r'[aeiou]'  # Any lowercase vowel
r'[^aeiou]'  # Any character that is not a lowercase vowel (negation)
r'[a-z]'  # Lowercase letters
r'[A-Za-z]'  # All ASCII letters

# Anchors
r'^Hello'  # Starts with "Hello"
r'world$'  # Ends with "world"
r'\bword\b'  # Word boundary (whole word)

# Special characters
r'.'  # Any character except newline
r'\.'  # Literal dot
r'\\'  # Literal backslash
r'\n'  # Newline
r'\t'  # Tab

# Groups
r'(abc)'  # Capturing group
r'(?:abc)'  # Non-capturing group
r'(?P<name>abc)'  # Named group
r'(?=abc)'  # Positive lookahead
r'(?!abc)'  # Negative lookahead
r'(?<=abc)'  # Positive lookbehind
r'(?<!abc)'  # Negative lookbehind

# Flags
re.IGNORECASE  # Case-insensitive matching
re.MULTILINE  # ^ and $ match line boundaries
re.DOTALL  # . matches newline too
re.VERBOSE  # Allow whitespace and comments in pattern
Pattern Syntax Deep Dive
# Reference list: each bare raw-string literal is an illustrative pattern
# fragment and has no effect when evaluated.

# Character Classes
r'[abc]'  # Match a, b, or c
r'[^abc]'  # Match anything except a, b, or c
r'[a-z]'  # Match lowercase letters
r'[A-Z]'  # Match uppercase letters
r'[0-9]'  # Match digits
r'[a-zA-Z0-9]'  # Match alphanumeric

# Special Sequences
r'\d'  # Digit [0-9]
r'\D'  # Non-digit [^0-9]
r'\w'  # Word character [a-zA-Z0-9_]
r'\W'  # Non-word character
r'\s'  # Whitespace [ \t\n\r\f\v] (the set includes the space character)
r'\S'  # Non-whitespace
r'\b'  # Word boundary
r'\B'  # Non-word boundary
r'\A'  # Start of string (never a line start, even with re.MULTILINE)
r'\Z'  # End of string (unlike $, never before a trailing newline)
Quantifiers
# Greedy vs Lazy Quantifiers
import re
# A single run of 'a's is enough to contrast the two matching modes.
text = "aaa"

# Greedy '+' swallows the whole run in one match.
greedy = re.compile(r'a+').findall(text)
print(greedy)  # ['aaa']

# Lazy '+?' stops as early as it can, so each 'a' is its own match.
lazy = re.compile(r'a+?').findall(text)
print(lazy)  # ['a', 'a', 'a']

# The same contrast on markup.
html = "<div>Content</div><span>More</span>"

# Greedy '.*' runs to the LAST '>' and returns the whole string as one "tag".
greedy_tags = re.compile(r'<.*>').findall(html)
print(greedy_tags)  # ['<div>Content</div><span>More</span>']

# Lazy '.*?' stops at the FIRST '>' and yields each tag separately.
lazy_tags = re.compile(r'<.*?>').findall(html)
print(lazy_tags)  # ['<div>', '</div>', '<span>', '</span>']

# Bounded repetition, greedy form first and its lazy twin second.
r'a{3}'  # Exactly 3 'a's
r'a{3,}'  # 3 or more 'a's
r'a{3,5}'  # 3 to 5 'a's
r'a{3}?'  # Exactly 3 'a's (lazy)
r'a{3,}?'  # 3 or more 'a's (lazy)
r'a{3,5}?'  # 3 to 5 'a's (lazy)
Quantifier Examples
import re
# Zero or more: 'a*' allows any number of 'a's (including none) before the 'b'
text = "ab, aab, aaab, aaaab"
matches = re.findall(r'a*b', text)
print(matches)  # ['ab', 'aab', 'aaab', 'aaaab']

# One or more: 'b' contains no 'a', so it contributes nothing
text = "a, aa, aaa, b"
matches = re.findall(r'a+', text)
print(matches)  # ['a', 'aa', 'aaa']

# Optional: the 'u' may or may not be present
text = "color, colour"
matches = re.findall(r'colou?r', text)
print(matches)  # ['color', 'colour']

# Exactly n: the run of four also contains one 'aaa'
text = "a, aa, aaa, aaaa"
matches = re.findall(r'a{3}', text)
print(matches)  # ['aaa', 'aaa']

# Between n and m: the run of five yields 'aaaa'; the leftover 'a' is too short
text = "a, aa, aaa, aaaa, aaaaa"
matches = re.findall(r'a{2,4}', text)
print(matches)  # ['aa', 'aaa', 'aaaa', 'aaaa']
Groups and Named Groups
import re
# Capturing groups: each (...) is numbered left to right, starting at 1
match = re.search(r'(\d{4})-(\d{2})-(\d{2})', "Today is 2024-06-15")
if match:
    print(match.group(0))  # '2024-06-15' (full match)
    print(match.group(1))  # '2024'
    print(match.group(2))  # '06'
    print(match.group(3))  # '15'
    print(match.groups())  # ('2024', '06', '15')

# Named groups: (?P<name>...) can be read back by name
pattern = r'(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})'
match = re.search(pattern, "2024-06-15")
if match:
    print(match.group('year'))  # '2024'
    print(match.group('month'))  # '06'
    print(match.groupdict())  # {'year': '2024', 'month': '06', 'day': '15'}

# Non-capturing groups: the scheme is grouped but not returned by findall().
# The scheme is mandatory here; if it were optional, plain words such as
# 'Visit' and 'or' would match the host part as well.
pattern = r'(?:https?://)([\w.-]+)'
urls = re.findall(pattern, "Visit https://example.com or http://test.org")
print(urls)  # ['example.com', 'test.org']

# Backreferences: \1 must repeat exactly what group 1 matched
pattern = r'(\w+)\s+\1'  # Match repeated word
match = re.search(pattern, "the the quick brown fox")
if match:
    print(match.group())  # 'the the'
Group Types Comparison
import re
# The first three examples all search the same subject string.
subject = "Hello World"

# (...) -- every parenthesised part is captured, in order.
pattern = r'(\w+) (\w+)'
match = re.compile(pattern).search(subject)
print(match.groups())  # ('Hello', 'World')

# (?:...) -- groups without capturing, so only the second word is returned.
pattern = r'(?:Hello) (\w+)'
match = re.compile(pattern).search(subject)
print(match.groups())  # ('World',)

# (?P<name>...) -- captures are also available by name.
pattern = r'(?P<greeting>\w+) (?P<name>\w+)'
match = re.compile(pattern).search(subject)
print(match.groupdict())  # {'greeting': 'Hello', 'name': 'World'}

# Alternation scoped by a non-capturing group: the optional 's' applies to
# either animal.
pattern = r'(?:cat|dog)s?'
matches = re.compile(pattern).findall("I have cats and dogs")
print(matches)  # ['cats', 'dogs']
Lookahead and Lookbehind
import re
# Lookarounds assert context without consuming it.

# Positive lookahead: followed by X
r'\d+(?= dollars)'  # 42 in "42 dollars"
r'\d+(?= dollars)'  # No match in "42 euros"

# Negative lookahead: NOT followed by X.
# The \b stops the engine from backtracking to a shorter number: without it,
# "42 dollars" would still match "4" (which is followed by "2", not " dollars").
r'\d+\b(?! dollars)'  # 42 in "42 euros"
r'\d+\b(?! dollars)'  # No match in "42 dollars"

# Positive lookbehind: preceded by X
r'(?<=\$)\d+'  # 42 in "$42"

# Negative lookbehind: NOT preceded by X.
# Digits are excluded too; otherwise "$42" would match "2" (preceded by "4").
r'(?<![\$\d])\d+'  # 42 in "42 dollars" but nothing in "$42"

# Practical examples
# Extract text inside quotes
text = 'She said "hello" and "goodbye"'
matches = re.findall(r'"([^"]+)"', text)
print(matches)  # ['hello', 'goodbye']

# Find words not starting with a vowel: a negative LOOKAHEAD at the word start
# inspects the first letter (a lookbehind would inspect the preceding character)
words = re.findall(r'\b(?![aeiouAEIOU])\w+\b', "apple banana cherry")
print(words)  # ['banana', 'cherry']

# Extract prices (preceded by $)
text = "Item $10, Item $25, Item $30"
prices = re.findall(r'(?<=\$)\d+', text)
print(prices)  # ['10', '25', '30']
# Password validation with lookahead
def validate_password(password):
    """Return True if *password* meets the strength rules.

    Rules: at least 8 characters drawn from letters, digits and ``@$!%*?&``,
    with at least one lowercase letter, one uppercase letter, one digit and
    one of the special characters.
    """
    # Each lookahead checks one requirement without consuming input.
    # fullmatch() is used instead of match() + '$' because '$' also matches
    # just before a trailing newline, which would let "Pass123!\n" through.
    pattern = r'(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{8,}'
    return re.fullmatch(pattern, password) is not None


print(validate_password("Pass123!"))  # True
print(validate_password("weak"))  # False
Lookahead and Lookbehind Patterns
import re
# One row per lookaround flavour: (pattern, subject text).
demos = (
    # Positive lookahead: "foo" only when followed by "bar"
    # -> 'foo'
    (r'foo(?=bar)', "foobar foobaz"),
    # Negative lookahead: "foo" only when NOT followed by "bar"
    # -> 'foo' (from foobaz)
    (r'foo(?!bar)', "foobar foobaz"),
    # Positive lookbehind: "bar" only when preceded by "foo"
    # -> 'bar' (from foobar)
    (r'(?<=foo)bar', "foobar bazbar"),
    # Negative lookbehind: "bar" only when NOT preceded by "foo"
    # -> 'bar' (from bazbar)
    (r'(?<!foo)bar', "foobar bazbar"),
)

# Run every demo through search() and show the text that was matched.
for pattern, text in demos:
    match = re.search(pattern, text)
    print(match.group())
Flags
import re
# re.IGNORECASE (re.I): letters match regardless of case
pattern = r'hello'
text = "Hello World"
match = re.search(pattern, text, re.IGNORECASE)
print(match.group())  # 'Hello'

# re.MULTILINE (re.M): ^ matches at the start of every line
pattern = r'^Hello'
text = "Hello World\nHello Python"
matches = re.findall(pattern, text, re.MULTILINE)
print(matches)  # ['Hello', 'Hello']

# re.DOTALL (re.S): . also matches the newline inside the paragraph
pattern = r'<p>(.*?)</p>'
text = "<p>Line 1\nLine 2</p>"
match = re.search(pattern, text, re.DOTALL)
print(match.group(1))  # 'Line 1\nLine 2'

# re.VERBOSE (re.X): whitespace and # comments inside the pattern are ignored
pattern = r'''
    ^                   # Start of string
    (?P<year>\d{4})     # Year
    -                   # Separator
    (?P<month>\d{2})    # Month
    -                   # Separator
    (?P<day>\d{2})      # Day
    $                   # End of string
'''
match = re.search(pattern, "2024-06-15", re.VERBOSE)
if match:
    print(match.groupdict())  # {'year': '2024', 'month': '06', 'day': '15'}

# Combining flags with |
pattern = r'^hello'
text = "Hello World\nHello Python"
match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
print(match.group())  # 'Hello'
Flag Comparison Table
| Flag | Symbol | Description |
|---|---|---|
| `re.IGNORECASE` | `re.I` | Case-insensitive matching |
| `re.MULTILINE` | `re.M` | `^` and `$` match line boundaries |
| `re.DOTALL` | `re.S` | `.` matches newline |
| `re.VERBOSE` | `re.X` | Allow whitespace and comments in pattern |
| `re.ASCII` | `re.A` | ASCII-only matching for `\w`, `\d`, `\s`, `\b` |
Common Patterns
import re
# Email validation (simplified)
email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
print(bool(re.match(email_pattern, 'user@example.com')))  # True
print(bool(re.match(email_pattern, 'invalid@')))  # False

# Phone number (US).
# The optional country code is a NON-capturing group: with a capturing group,
# findall() would return only that group's text ('' here) instead of the number.
phone_pattern = r'(?:\+1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'
phones = re.findall(phone_pattern, "Call 555-123-4567 or (555) 987-6543")
print(phones)  # ['555-123-4567', '(555) 987-6543']

# URL extraction
url_pattern = r'https?://(?:www\.)?[\w.-]+\.[a-zA-Z]{2,}(?:/[\w./?%&=-]*)?'
urls = re.findall(url_pattern, "Visit https://example.com or http://test.org/path")
print(urls)  # ['https://example.com', 'http://test.org/path']

# Date formats (shape only; does not check that the date exists)
date_pattern = r'\d{4}[-/]\d{2}[-/]\d{2}'
dates = re.findall(date_pattern, "Events: 2024-06-15, 2024/12/25, 2025-01-01")
print(dates)  # ['2024-06-15', '2024/12/25', '2025-01-01']

# IPv4 address (shape only; does not check that each octet is <= 255)
ipv4_pattern = r'\b(?:\d{1,3}\.){3}\d{1,3}\b'
ips = re.findall(ipv4_pattern, "Server at 192.168.1.1 and 10.0.0.1")
print(ips)  # ['192.168.1.1', '10.0.0.1']

# Credit card number (basic): four groups of four digits
cc_pattern = r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b'

# HTML tag: \1 requires the closing tag to repeat the opening tag's name
tag_pattern = r'<(\w+)(?:\s+[^>]*)?>(.*?)</\1>'
html = "<b>bold</b> and <i>italic</i>"
tags = re.findall(tag_pattern, html)
print(tags)  # [('b', 'bold'), ('i', 'italic')]
Real-World Validation Patterns
import re
# Strong password validation
def validate_strong_password(password):
    """Return True if *password* meets the strength rules.

    Rules: at least 8 characters drawn from letters, digits and ``@$!%*?&``,
    with at least one lowercase letter, one uppercase letter, one digit and
    one of the special characters.
    """
    # fullmatch() rather than match() + '$': '$' also matches just before a
    # trailing newline, so the anchored form would accept "Pass123!\n".
    pattern = r'(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{8,}'
    return re.fullmatch(pattern, password) is not None
# Username validation
def validate_username(username):
    """Return True if *username* is 3-16 ASCII letters, digits or underscores."""
    # fullmatch() rather than match() + '$', which would also accept a
    # username followed by a trailing newline.
    return re.fullmatch(r'[a-zA-Z0-9_]{3,16}', username) is not None
# IP address validation
def validate_ipv4(ip):
    """Return True if *ip* is a dotted-quad IPv4 address.

    The address must be exactly four groups of 1-3 ASCII digits separated by
    dots, and every group must be in the range 0-255.
    """
    # [0-9] instead of \d: on str patterns \d also matches non-ASCII digits.
    # fullmatch() instead of match() + '$': '$' tolerates a trailing newline,
    # and int() would then silently strip it from the last octet.
    if not re.fullmatch(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', ip):
        return False
    # The regex bounds each octet to 0-999; enforce the real upper limit here.
    return all(int(octet) <= 255 for octet in ip.split('.'))
# Date validation (YYYY-MM-DD)
def validate_date(date_string):
    """Return True if *date_string* is a real calendar date written YYYY-MM-DD."""
    from datetime import date

    # Shape check first. fullmatch() rejects a trailing newline that
    # match() + '$' would accept; [0-9] keeps the digits ASCII-only.
    shape = re.fullmatch(
        r'([0-9]{4})-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])',
        date_string,
    )
    if shape is None:
        return False
    # The regex cannot know month lengths or leap years (it allows
    # 2024-02-30), so let the calendar make the final decision.
    year, month, day = (int(part) for part in shape.groups())
    try:
        date(year, month, day)
    except ValueError:
        return False
    return True
Substitution
import re
text = "Hello World, hello Python"

# Replace all occurrences
result = re.sub(r'hello', 'Hi', text, flags=re.IGNORECASE)
# "Hi World, Hi Python"


# Replace with function: re.sub() calls it once per match
def double_number(match):
    """Return the matched integer doubled, as a string."""
    return str(int(match.group()) * 2)


result = re.sub(r'\d+', double_number, "I have 3 cats and 5 dogs")
# "I have 6 cats and 10 dogs"

# Replace named groups: \g<name> refers to a named group in the replacement
pattern = r'(?P<first>\w+) (?P<last>\w+)'
result = re.sub(pattern, r'\g<last>, \g<first>', "John Smith")
# "Smith, John"

# Clean up whitespace: collapse runs, then trim the ends
text = " Hello World \t Python "
cleaned = re.sub(r'\s+', ' ', text).strip()
# "Hello World Python"

# Redact sensitive data: the phone number has a different digit grouping,
# so it is left untouched
text = "SSN: 123-45-6789, Phone: 555-123-4567"
redacted = re.sub(r'\d{3}-\d{2}-\d{4}', 'XXX-XX-XXXX', text)
# "SSN: XXX-XX-XXXX, Phone: 555-123-4567"


# Capitalize first letter of each sentence
def capitalize_sentence(match):
    """Return the whole match upper-cased.

    The match is the sentence's first letter plus any preceding ". ";
    upper-casing leaves the punctuation and whitespace unchanged.
    """
    return match.group(0).upper()


result = re.sub(r'(?:^|\.\s+)([a-z])', capitalize_sentence, "hello world. foo bar.")
# "Hello world. Foo bar."
Advanced Substitution
import re
# Replace with conditional logic
def smart_replace(match):
    """Wrap the matched integer: [n] if it is greater than 100, otherwise (n)."""
    value = int(match.group())
    return f"[{value}]" if value > 100 else f"({value})"


text = "Numbers: 50, 150, 200, 75"
result = re.sub(r'\d+', smart_replace, text)
print(result)  # "Numbers: (50), [150], [200], (75)"
# Replace multiple patterns
def replace_multiple(text, replacements):
    """Replace every key of *replacements* found in *text* with its value.

    All keys are matched literally in a single pass, so a replacement's
    output is never itself replaced again. Returns the new string; an empty
    mapping returns *text* unchanged.
    """
    # An empty mapping would build the empty pattern, which matches at every
    # position and makes the lookup below fail with KeyError('').
    if not replacements:
        return text
    # Longest keys first: alternation takes the first branch that matches, so
    # 'foo' listed before 'foobar' would otherwise shadow the longer key.
    keys = sorted(replacements, key=len, reverse=True)
    pattern = '|'.join(re.escape(key) for key in keys)

    def replacer(match):
        return replacements[match.group()]

    return re.sub(pattern, replacer, text)


replacements = {'foo': 'bar', 'hello': 'world', 'python': 'code'}
result = replace_multiple("foo hello python", replacements)
print(result)  # "bar world code"
# Clean up HTML
def clean_html(html):
    """Return *html* with tags removed and whitespace normalised.

    Anything between '<' and the next '>' is dropped, runs of whitespace are
    collapsed to one space, and the ends are trimmed. This is a quick text
    clean-up, not an HTML parser: entities are left as they are.
    """
    # Remove HTML tags
    without_tags = re.sub(r'<[^>]+>', '', html)
    # Clean up whitespace
    return re.sub(r'\s+', ' ', without_tags).strip()


html = "<p>Hello</p> <b>World</b> <i>Python</i>"
print(clean_html(html))  # "Hello World Python"
Real-World: Log Parser
import re
from collections import defaultdict
# One log line: "<YYYY-MM-DD HH:MM:SS> [<LEVEL>] <message>"
log_pattern = r'(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(?P<level>\w+)\] (?P<message>.+)'

log_data = """2024-06-15 10:30:15 [INFO] Server started on port 8080
2024-06-15 10:30:20 [WARNING] High memory usage detected
2024-06-15 10:31:05 [ERROR] Connection refused to database
2024-06-15 10:31:10 [INFO] Retrying connection
2024-06-15 10:31:15 [ERROR] Connection timeout
2024-06-15 10:32:00 [INFO] Server stopped"""

errors = []  # parsed records (dicts) of every ERROR line
level_counts = defaultdict(int)  # level name -> number of lines seen

# Compile once outside the loop; lines that do not fit the format are skipped.
log_re = re.compile(log_pattern)
for line in log_data.splitlines():
    match = log_re.match(line)
    if not match:
        continue
    data = match.groupdict()
    level_counts[data['level']] += 1
    if data['level'] == 'ERROR':
        errors.append(data)

print(f"Level counts: {dict(level_counts)}")
# {'INFO': 3, 'WARNING': 1, 'ERROR': 2}

print("\nErrors:")
for error in errors:
    print(f" [{error['timestamp']}] {error['message']}")
# [2024-06-15 10:31:05] Connection refused to database
# [2024-06-15 10:31:15] Connection timeout
Real-World: Text Extraction
import re
def extract_emails(text):
    """Return all email-like substrings of *text*, in order of appearance.

    Uses a simplified pattern (local part, '@', domain, a dot and a top-level
    label of two or more letters); it is not a full RFC 5322 validator.
    """
    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    return re.findall(pattern, text)
def extract_urls(text):
    """Return all http/https URLs found in *text*, in order of appearance.

    A URL is the scheme, an optional "www.", a host ending in a label of two
    or more letters, and an optional path or query made of ``[\\w./?%&=-]``.
    """
    pattern = r'https?://(?:www\.)?[\w.-]+\.[a-zA-Z]{2,}(?:/[\w./?%&=-]*)?'
    return re.findall(pattern, text)
def extract_phone_numbers(text):
    """Return all US-style phone numbers found in *text*, as written.

    Accepts an optional "+1" prefix, an optionally parenthesised area code,
    and '-', '.' or whitespace as separators.
    """
    # The country code is a NON-capturing group: with a capturing group
    # findall() returns only that group's text (an empty string for numbers
    # without "+1") instead of the whole number.
    pattern = r'(?:\+1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'
    return re.findall(pattern, text)
def extract_dates(text):
    """Return date-like substrings of *text* in several common formats.

    Results are grouped by format, in the order the formats are listed below,
    not by position in *text*. Only the shape is checked, not whether the
    date exists.
    """
    patterns = [
        r'\d{4}-\d{2}-\d{2}',  # YYYY-MM-DD
        r'\d{2}/\d{2}/\d{4}',  # MM/DD/YYYY
        r'\d{2}\.\d{2}\.\d{4}',  # DD.MM.YYYY
        r'\w+ \d{1,2}, \d{4}',  # Month DD, YYYY
    ]
    dates = []
    for pattern in patterns:
        dates.extend(re.findall(pattern, text))
    return dates
# Test extraction functions
# Sample document containing one or more items for every extractor.
text = """
Contact us at support@example.com or sales@company.org.
Visit https://example.com or http://test.org.
Call 555-123-4567 or (555) 987-6543.
Events: 2024-06-15, 12/25/2024, 01.01.2025.
"""

# Run each extractor over the sample and print its labelled result.
for label, extractor in (
    ("Emails:", extract_emails),
    ("URLs:", extract_urls),
    ("Phones:", extract_phone_numbers),
    ("Dates:", extract_dates),
):
    print(label, extractor(text))
Performance Tips
import re
import time
# Pre-compile patterns for repeated use
# Slower: the pattern string goes through re's internal cache lookup on every
# call (re caches compiled patterns, so this is a lookup, not a recompilation)
def find_emails_bad(texts):
    """Return the email matches of each text, passing the pattern as a string."""
    return [re.findall(r'[\w.+-]+@[\w-]+\.[\w.-]+', t) for t in texts]


# Good: compile once and reuse the pattern object
email_re = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')


def find_emails_good(texts):
    """Return the email matches of each text, using the precompiled pattern."""
    return [email_re.findall(t) for t in texts]


# Don't capture what you don't need
# Bad: the scheme is captured although only the host is wanted, so findall()
# returns (scheme, host) tuples
pattern_bad = r'(https?://)([\w.-]+)'
# Better: no group around the scheme at all (or (?:...) if it needs grouping)
pattern_good = r'https?://([\w.-]+)'

# Avoid catastrophic backtracking
# Bad: nested quantifiers
# pattern = r'(a+)+b'  # Can cause exponential backtracking
# Good: a single quantifier matches the same strings
# pattern = r'a+b'

# Benchmark
texts = ["user@example.com"] * 10000

start = time.perf_counter()
find_emails_bad(texts)
bad_time = time.perf_counter() - start

start = time.perf_counter()
find_emails_good(texts)
good_time = time.perf_counter() - start

print(f"Without compile: {bad_time:.4f}s")
print(f"With compile: {good_time:.4f}s")
Performance Optimization Tips
import re
import time
# Tip 1: Use raw strings
# Bad: requires double backslashes
pattern1 = '\\d+\\.\\d+'
# Good: raw string (the same pattern, easier to read)
pattern2 = r'\d+\.\d+'

# Tip 2: Use specific character classes
# Bad: matches more than needed
pattern3 = r'.*'
# Good: matches only what's needed
pattern4 = r'[a-zA-Z0-9]+'

# Tip 3: Use non-capturing groups when possible
# Bad: captures the alternative although it is never read back
pattern5 = r'(foo|bar)baz'
# Better: group without capturing. Keep the group: r'foo|barbaz' would mean
# "foo" OR "barbaz", which is a different pattern.
pattern6 = r'(?:foo|bar)baz'

# Tip 4: Avoid backtracking
# Bad: can cause exponential backtracking
pattern7 = r'(a+)+b'
# Good: a single quantifier matches the same strings
pattern8 = r'a+b'

# Tip 5: Use boundaries
# Bad: matches partial words
pattern9 = r'cat'
# Good: matches whole words
pattern10 = r'\bcat\b'


# Benchmark example
def benchmark_patterns(text, pattern_func, iterations=1000):
    """Return the seconds taken to call ``pattern_func(text)`` *iterations* times."""
    start = time.perf_counter()
    for _ in range(iterations):
        pattern_func(text)
    return time.perf_counter() - start


# Test patterns
text = "The price is $42.50 and $19.99" * 100
pattern_bad = r'\d+\.?\d*'
pattern_good = re.compile(r'\d+\.?\d*')
time_bad = benchmark_patterns(text, lambda t: re.findall(pattern_bad, t))
time_good = benchmark_patterns(text, pattern_good.findall)
print(f"Without compile: {time_bad:.4f}s")
print(f"With compile: {time_good:.4f}s")
Common Mistakes
import re
# Mistake 1: Not using raw strings
# pattern = '\d+'  # '\d' is an invalid string escape: Python warns about it,
#                  # and escapes such as '\b' silently become other characters
pattern = r'\d+'  # Raw string: the backslash reaches the regex engine as-is

# Mistake 2: Forgetting re.DOTALL
html = "<p>Line 1\nLine 2</p>"
match = re.search(r'<p>(.*?)</p>', html)  # Doesn't match: '.' stops at '\n'
match = re.search(r'<p>(.*?)</p>', html, re.DOTALL)  # Works

# Mistake 3: Greedy vs lazy
text = "<b>bold</b> and <b>italic</b>"
greedy = re.findall(r'<b>(.*)</b>', text)
# ['bold</b> and <b>italic'] -- greedy ran on to the last </b>
lazy = re.findall(r'<b>(.*?)</b>', text)
# ['bold', 'italic'] -- lazy stops at the first </b>

# Mistake 4: Not handling match failures
match = re.search(r'\d+', "no numbers here")
# match.group()  # AttributeError: 'NoneType' object has no attribute 'group'
if match:
    print(match.group())

# Mistake 5: Forgetting word boundaries
text = "I have a cat, category, and concatenate"
bad = re.findall(r'cat', text)  # ['cat', 'cat', 'cat']
good = re.findall(r'\bcat\b', text)  # ['cat']

# Mistake 6: Not escaping special characters
import re
filename = "file.txt"
# pattern = filename  # '.' matches any char!
pattern = re.escape(filename)  # 'file\\.txt'
Key Takeaways
- Use raw strings `r'...'` for regex patterns
- `\d` digits, `\w` word chars, `\s` whitespace
- Groups `()` capture matched text
- Lookahead/lookbehind for context-sensitive matching
- `re.sub()` for pattern-based replacement
- Pre-compile patterns for repeated use
- Use non-capturing groups `(?:...)` when you don't need the captured text
- Use `re.VERBOSE` for complex patterns with comments
- Be careful with greedy vs lazy quantifiers
- Always test regex patterns for performance (watch for catastrophic backtracking)