String and Text Processing
Text data is everywhere — user reviews, emails, logs, social media posts, medical records. Cleaning and transforming text is one of the most common tasks in data science. This lesson covers the tools you need.
String Fundamentals
Strings in Python are immutable sequences of Unicode characters. Once created, they cannot be changed — every operation produces a new string.
String length: `len(s)` counts characters.
Slicing: `s[i:j]` extracts characters from index `i` to `j-1`. Negative indices count from the end: `s[-k]` accesses the `k`-th character from the right.
Concatenation: `s + t` joins two strings. Repeating: `s * n` repeats `s` exactly `n` times.
Membership: `c in s` tests if character `c` appears in string `s`.
# Demo string: 19 characters, indices 0..18 ("Data" occupies indices 7..10).
s = "Hello, Data Science"
print(len(s))    # 19
print(type(s))   # <class 'str'>
print(s[0])      # 'H'
print(s[-1])     # 'e'
# The end index of a slice is exclusive, so 7:11 covers indices 7, 8, 9, 10.
print(s[7:11])   # 'Data'
print(s[::2])    # 'Hlo aaSine' (every 2nd char)
print(s[::-1])   # 'ecneicS ataD ,olleH' (reversed)
Essential String Methods
Case and Whitespace
# Sample string with one leading and one trailing space, so the effect of
# each method on the surrounding whitespace is visible in the output.
text = " Hello, World! "
# Case (each call returns a new string; the surrounding spaces are kept)
print(text.upper()) # " HELLO, WORLD! "
print(text.lower()) # " hello, world! "
print(text.title()) # " Hello, World! "
# capitalize() upper-cases only the first character and lower-cases the rest;
# here the first character is a space, so every letter ends up lower-case.
print(text.capitalize()) # " hello, world! "
print(text.swapcase()) # " hELLO, wORLD! "
# Whitespace
print(text.strip()) # "Hello, World!"
print(text.lstrip()) # "Hello, World! "
print(text.rstrip()) # " Hello, World!"
# The argument is a set of characters to trim from both ends, not a substring.
print(text.strip(" !")) # "Hello, World" (strip specific chars)
Searching and Testing
# Indices for reference: "user" is 0-3, "@" is 4, "example" is 5-11, "." is 12.
email = "user@example.com"
print(email.startswith("user")) # True
print(email.endswith(".com")) # True
print(email.find("@")) # 4
# rfind() searches from the right and returns the index of the last match.
print(email.rfind(".")) # 12
print(email.count("e")) # 3
print(email.replace("user", "admin")) # "admin@example.com"
# Validation methods
print("hello123".isalnum()) # True (alphanumeric)
print("hello".isalpha()) # True (letters only)
print("12345".isdigit()) # True (digits only)
print("hello".islower()) # True
print("HELLO".isupper()) # True
print(" ".isspace()) # True
Splitting and Joining
# Splitting
# split(",") returns every field as a string; "30" is not converted to int.
csv_line = "alice,bob,charlie,30"
parts = csv_line.split(",")
print(parts) # ['alice', 'bob', 'charlie', '30']
# Split with limit
# The second argument caps the number of splits, so at most 2 pieces result.
"one two three".split(" ", 1) # ['one', 'two three']
# Splitlines
multiline = "line1\nline2\nline3"
print(multiline.splitlines()) # ['line1', 'line2', 'line3']
# Joining
# The separator is the string join() is called on; the argument is the iterable.
words = ["Python", "is", "great"]
print(" ".join(words)) # "Python is great"
print(",".join(words)) # "Python,is,great"
print("\n".join(words)) # "Python\nis\ngreat"
String Formatting
f-Strings (Python 3.6+)
f-strings are the preferred way to format strings. They are fast, readable, and support expressions.
name = "Alice"
age = 30
balance = 1234.5678
# Basic interpolation
print(f"Hello, {name}!") # Hello, Alice!
print(f"Age: {age}") # Age: 30
# Expressions
print(f"Double: {age * 2}") # Double: 60
print(f"Uppercase: {name.upper()}") # Uppercase: ALICE
# Formatting numbers
print(f"Balance: ${balance:.2f}") # Balance: $1234.57
print(f"Balance: ${balance:,.2f}") # Balance: $1,234.57
print(f"Percentage: {0.856:.1%}") # Percentage: 85.6%
print(f"Zero-padded: {42:05d}") # 00042
# Alignment (field width 10; outputs are quoted so the padding is visible)
print(f"{'left':<10}|") # "left      |"
print(f"{'right':>10}|") # "     right|"
print(f"{'center':^10}|") # "  center  |"
# Debugging (the "=" specifier requires Python 3.8+)
x = 42
print(f"{x=}") # x=42
Other Formatting Methods
# format() method
# "{}" is filled positionally; "{name}" is filled from the keyword argument.
"Hello, {}!".format("World") # "Hello, World!"
"Hello, {name}!".format(name="World") # "Hello, World!"
# Old-style % formatting (avoid in new code)
"Hello, %s!" % "World" # "Hello, World!"
"Pi is %.2f" % 3.14159 # "Pi is 3.14"
Regular Expressions
The re module provides pattern matching for complex text operations.
Basic Patterns
import re
text = "Call me at 555-123-4567 or (555) 987-6543"

# Find all phone numbers: optional "(", 3 digits, optional ")", then a 3-digit
# and a 4-digit group, each optionally preceded by "-", "." or whitespace.
pattern = r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}"
phones = re.findall(pattern, text)
print(phones)  # ['555-123-4567', '(555) 987-6543']

# Check if pattern exists (search() scans the whole string, not just the start)
if re.search(r"\d{3}-\d{3}-\d{4}", text):
    print("Phone number found")

# Replace patterns: delete "-", ".", "(" and ")".
# Whitespace is not in the character class, so the space after "(555)" stays.
cleaned = re.sub(r"[-.()]", "", text)
print(cleaned)  # "Call me at 5551234567 or 555 9876543"
Common Regex Patterns
import re
# Email validation
# Use fullmatch(): with match(), "$" also matches just before a trailing
# newline, so "user@example.com\n" would wrongly be accepted as valid.
email_pattern = r"^[\w.+-]+@[\w-]+\.[\w.]+$"
print(bool(re.fullmatch(email_pattern, "user@example.com")))  # True
print(bool(re.fullmatch(email_pattern, "not-an-email")))  # False

# Date patterns (YYYY-MM-DD)
# This checks the digit layout only; it does not validate month/day ranges.
date_pattern = r"\d{4}-\d{2}-\d{2}"
dates = re.findall(date_pattern, "Born on 1990-05-15, died on 2020-12-01")
print(dates)  # ['1990-05-15', '2020-12-01']

# Extract named groups
# (?P<name>...) lets each part be retrieved by name instead of by position.
pattern = r"(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})"
match = re.search(pattern, "2024-01-25")
if match:  # search() returns None when nothing matches
    print(match.group("year"))  # 2024
    print(match.group("month"))  # 01
    print(match.group("day"))  # 25
Regex Quick Reference
Pattern matching: `a|b` matches either `a` or `b`. Quantifiers: `*` (zero or more), `+` (one or more), `?` (zero or one).
Character classes: `\d` matches digits, `\w` matches word characters, `\s` matches whitespace. Negations: `\D`, `\W`, `\S`.
Anchoring: `^` matches the start, `$` matches the end of the string.
. Any character (except newline)
\d Digit [0-9]
\w Word character [a-zA-Z0-9_]
\s Whitespace
\b Word boundary
^ Start of string
$ End of string
* 0 or more
+ 1 or more
? 0 or 1
{n} Exactly n times
{n,m} Between n and m times
[abc] Character set
[^abc] Negated set
(abc) Capture group
(?:abc) Non-capturing group
a|b Alternation (a or b)
String Processing Pipeline
Text Cleaning for Data Science
Common Cleaning Pipeline
import re
import unicodedata
def clean_text(text):
    """Basic text cleaning pipeline.

    Steps, in order: lowercase; remove HTML tags, URLs and e-mail addresses;
    remove every character that is not an ASCII letter or whitespace;
    collapse whitespace runs to a single space and trim both ends.

    Args:
        text: Raw input string (may contain HTML, URLs, e-mail addresses).

    Returns:
        The cleaned, lowercase, single-spaced string. It is empty when no
        letters survive the cleaning.
    """
    # Lowercase
    text = text.lower()
    # Remove HTML tags. A space is substituted (not "") so that words on
    # either side of a tag, e.g. "one<br>two", do not fuse into "onetwo";
    # any surplus spaces are collapsed in the final step.
    text = re.sub(r"<[^>]+>", " ", text)
    # Remove URLs (anything starting with "http" or "www." up to whitespace)
    text = re.sub(r"http\S+|www\.\S+", "", text)
    # Remove email addresses (the whole whitespace-delimited token around "@")
    text = re.sub(r"\S+@\S+", "", text)
    # Remove special characters and digits. Deleting rather than spacing
    # keeps contractions together ("it's" -> "its").
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text).strip()
    return text
# Test it
dirty = "<p>Hello! Visit https://example.com or email user@test.com. Price: $99.99</p>"
print(clean_text(dirty))
# "hello visit or email price"
# (the URL and the e-mail address are removed as whole tokens, and "$99.99"
# contains no letters, so nothing of them remains)
Unicode Normalization
import unicodedata
# Unicode can represent the same character multiple ways.
# Escapes are used so the example does not depend on the file's encoding.
s1 = "caf\u00e9"   # precomposed: single code point U+00E9 (e with acute accent)
s2 = "cafe\u0301"  # decomposed: "e" followed by combining acute accent U+0301
print(s1 == s2)  # False (different code point sequences: 4 vs 5 characters)

# Normalize to compare correctly: NFC composes "e" + U+0301 into U+00E9.
n1 = unicodedata.normalize("NFC", s1)
n2 = unicodedata.normalize("NFC", s2)
print(n1 == n2)  # True
Tokenization
# Simple whitespace tokenization
# split() with no argument splits on any run of whitespace.
text = "Data science is awesome"
tokens = text.split()
print(tokens) # ['Data', 'science', 'is', 'awesome']
# Word tokenization with regex
# \b\w+\b matches runs of word characters, so punctuation is left out.
import re
tokens = re.findall(r"\b\w+\b", text.lower())
print(tokens) # ['data', 'science', 'is', 'awesome']
# For production, use nltk or spacy
# import nltk
# tokens = nltk.word_tokenize(text)
Stopword Removal
# Basic stopword list
stopwords = {
"i", "me", "my", "we", "our", "you", "your", "he", "she", "it",
"they", "what", "which", "who", "when", "where", "how", "a", "an",
"the", "and", "or", "but", "in", "on", "at", "to", "for", "of",
"is", "am", "are", "was", "were", "be", "been", "being", "have",
"has", "had", "do", "does", "did", "will", "would", "could",
"should", "may", "might", "can", "this", "that", "these", "those",
"is", "it", "its", "not", "no", "nor",
}
def remove_stopwords(text):
tokens = text.lower().split()
return " ".join(t for t in tokens if t not in stopwords)
print(remove_stopwords("This is a very important data science lesson"))
# "very important data science lesson"
# ("this", "is" and "a" are removed; "very" is not in the stopword set, so it stays)
Practical Examples
Extracting Features from Text
import re
def extract_text_features(text):
    """Extract useful features from raw text.

    Args:
        text: Input string; may be empty.

    Returns:
        A dict with these keys:
        - "length": total number of characters.
        - "word_count": number of whitespace-separated tokens.
        - "char_count": number of non-whitespace characters.
        - "avg_word_length": mean token length (0.0 when there are no tokens).
        - "sentence_count": number of runs of ".", "!" or "?".
        - "uppercase_ratio": uppercase characters / total characters
          (0.0 for empty text).
        - "digit_count": number of digit characters.
        - "special_char_count": characters that are neither alphanumeric
          nor whitespace.
    """
    # Tokenize once instead of re-splitting for every feature.
    words = text.split()
    return {
        "length": len(text),
        "word_count": len(words),
        "char_count": len(re.sub(r"\s", "", text)),
        # max(..., 1) guards the division for empty / whitespace-only text.
        "avg_word_length": sum(len(w) for w in words) / max(len(words), 1),
        # Count each run of terminators once, so "!!!" or "..." ends one
        # sentence rather than three.
        "sentence_count": len(re.findall(r"[.!?]+", text)),
        "uppercase_ratio": sum(1 for c in text if c.isupper()) / max(len(text), 1),
        "digit_count": sum(1 for c in text if c.isdigit()),
        "special_char_count": sum(
            1 for c in text if not c.isalnum() and not c.isspace()
        ),
    }
# Demo: compute the features of one sample review and print them,
# one "name: value" pair per line.
review = "This product is AMAZING!!! I give it 5/5 stars. Best $50 I've ever spent!!!"
features = extract_text_features(review)
for feature_name, feature_value in features.items():
    print(f"{feature_name}: {feature_value}")
Batch Text Processing
import pandas as pd
# Four sample reviews to push through a vectorised (.str accessor) pipeline.
reviews = pd.Series(
    [
        "Great product! Love it.",
        "Terrible quality... broke after 1 day.",
        "It's okay, nothing special. 3/5.",
        "AMazing value for the price!!!",
    ]
)

# Clean: lowercase, delete everything except letters and whitespace, then
# trim the ends. Interior double spaces left by deleted digits are kept.
lowered = reviews.str.lower()
letters_only = lowered.str.replace(r"[^a-zA-Z\s]", "", regex=True)
cleaned = letters_only.str.strip()

# Token counts: split each cleaned review on whitespace, count the pieces.
tokens = cleaned.str.split()
word_counts = tokens.str.len()

# Same output as four separate print() calls, one item per line.
print("Cleaned text:", cleaned, "\nWord counts:", word_counts, sep="\n")
Key Takeaways
- Strings are immutable — every operation creates a new string.
- f-strings are the best way to format strings in modern Python.
- Regex is powerful, but start simple; use `re.findall()` and `re.sub()` most often.
- Text cleaning (lowercasing, removing noise, tokenizing) is essential before analysis.
- Always match your cleaning approach to your specific data and problem.