NumPy Essentials
NumPy is the foundation of the entire Python data science ecosystem. Every pandas DataFrame, scikit-learn model, and PyTorch tensor is built on top of NumPy arrays. Understanding NumPy is not optional — it is essential.
Why NumPy?
Python lists are slow for numerical work because each element is a separate object. NumPy stores data in contiguous blocks of memory and operates on entire arrays at once using optimized C code.
import numpy as np
import time
# Speed comparison: double one million integers with a Python loop vs. NumPy.
python_list = list(range(1_000_000))
numpy_array = np.arange(1_000_000)

# Python loop: each element is multiplied one at a time by the interpreter.
# time.perf_counter() is used instead of time.time() because it is the clock
# intended for measuring short durations; time.time() is wall-clock time and
# can be affected by system clock adjustments.
start = time.perf_counter()
result = [x * 2 for x in python_list]
python_time = time.perf_counter() - start

# NumPy vectorized: a single expression multiplies the whole array at once.
start = time.perf_counter()
result = numpy_array * 2
numpy_time = time.perf_counter() - start

# The timings in the comments are indicative only and vary by machine.
print(f"Python: {python_time:.4f}s")  # ~0.05s
print(f"NumPy: {numpy_time:.6f}s")  # ~0.0005s (100x faster)
Creating Arrays
import numpy as np
# From Python lists
a = np.array([1, 2, 3, 4, 5])
b = np.array([[1, 2, 3], [4, 5, 6]])
# Common constructors (the first argument is the shape)
zeros = np.zeros((3, 4)) # 3x4 array of zeros
ones = np.ones((2, 3)) # 2x3 array of ones
full = np.full((3, 3), 7.0) # 3x3 array filled with 7.0
eye = np.eye(4) # 4x4 identity matrix
empty = np.empty((2, 3)) # Uninitialized (fast): contents are arbitrary until assigned
# Sequences
arange = np.arange(0, 10, 2) # [0, 2, 4, 6, 8] (the stop value 10 is excluded)
linspace = np.linspace(0, 1, 5) # [0, 0.25, 0.5, 0.75, 1.0] (5 points, endpoint included)
# With an explicit dtype
c = np.array([1.5, 2.7, 3.1], dtype=int) # Floats are truncated, not rounded: [1 2 3]
# Inspecting array attributes
print(a.shape) # (5,)
print(b.shape) # (2, 3)
print(a.dtype) # int64 (default integer type; may be int32 on some platforms)
print(b.ndim) # 2
print(b.size) # 6
print(b.itemsize) # 8 bytes per element (for int64)
Array Indexing and Slicing
NumPy indexing is more powerful than Python list indexing.
a = np.array([10, 20, 30, 40, 50, 60])
# Basic indexing
print(a[0]) # 10
print(a[-1]) # 60 (negative indices count from the end)
# Slicing (same as lists)
print(a[1:4]) # [20 30 40]
print(a[::2]) # [10 30 50] (every second element)
print(a[::-1]) # [60 50 40 30 20 10] (reversed)
# 2D indexing: b[row, column]
b = np.array([[1, 2, 3],
[4, 5, 6],
[7, 8, 9]])
print(b[0, 0]) # 1
print(b[1, 2]) # 6
print(b[0]) # [1 2 3] (first row)
print(b[:, 0]) # [1 4 7] (first column)
print(b[0:2, 1:]) # [[2 3] [5 6]] (submatrix: rows 0-1, columns 1-2)
# Boolean indexing: keep the elements where the mask is True
data = np.array([15, 22, 8, 35, 12, 42])
mask = data > 20
print(mask) # [False True False True False True]
print(data[mask]) # [22 35 42]
# Fancy indexing (integer arrays): pick elements by position
indices = [0, 2, 4]
print(data[indices]) # [15 8 12]
Broadcasting Visual
Broadcasting
Broadcasting lets NumPy perform operations on arrays of different shapes without explicit loops.
# Scalar + array (broadcasts the scalar)
a = np.array([1, 2, 3, 4])
print(a + 5)  # [6 7 8 9]
print(a * 2)  # [2 4 6 8]

# Array + array (compatible shapes)
row = np.array([[1, 2, 3]])  # Shape: (1, 3)
col = np.array([[1], [2], [3]])  # Shape: (3, 1)
print(row + col)  # Result shape: (3, 3)
# [[2 3 4]
#  [3 4 5]
#  [4 5 6]]

# Broadcasting rules:
# 1. Arrays with different ndim: prepend 1s to smaller shape
# 2. Size 1 dimensions stretch to match the other array
# 3. Arrays with size > 1 in same dimension must match

# Practical example: normalize each column
data = np.array([[1, 10, 100],
                 [2, 20, 200],
                 [3, 30, 300]])
means = data.mean(axis=0)  # [2., 20., 200.]
# std() computes the population standard deviation by default (ddof=0), so the
# result is sqrt(2/3) times each column's spacing; the sample standard
# deviation (ddof=1) would be [1, 10, 100].
stds = data.std(axis=0)  # [0.816..., 8.165..., 81.65...]
# means and stds have shape (3,) and are broadcast across the rows of data.
normalized = (data - means) / stds
print(normalized)  # values shown rounded to 4 decimals
# [[-1.2247 -1.2247 -1.2247]
#  [ 0.      0.      0.    ]
#  [ 1.2247  1.2247  1.2247]]
Vectorization
Vectorization replaces loops with array operations. It is faster, cleaner, and more expressive.
# Loop approach (slow)
def normalize_loop(data):
    """Shift every value by 50 and scale by 25 using explicit Python loops.

    Args:
        data: An iterable of rows, where each row is an iterable of numbers.

    Returns:
        A list of lists containing ``(val - 50) / 25`` for each value, in the
        same row and column order as the input.
    """
    result = []
    for row in data:
        normalized = []
        for val in row:
            normalized.append((val - 50) / 25)
        result.append(normalized)
    return result
# Vectorized approach (fast)
def normalize_vectorized(data):
    """Shift every value by 50 and scale by 25 with one array expression.

    Args:
        data: A NumPy array (or any object supporting ``- 50`` and ``/ 25``
            element-wise). Plain lists and tuples are converted to arrays
            first so the same inputs accepted by the loop version work here.

    Returns:
        ``(data - 50) / 25`` computed element-wise, with the same shape as
        the input.
    """
    # Only built-in sequences are converted; other inputs are used as-is so
    # their own arithmetic (and return type) is preserved.
    if isinstance(data, (list, tuple)):
        data = np.asarray(data)
    return (data - 50) / 25
# Conditional operations
scores = np.array([85, 92, 78, 95, 60])

# Loop approach: test each score against the thresholds from highest to lowest.
grades = []
for s in scores:
    if s >= 90:
        grades.append("A")
    elif s >= 80:
        grades.append("B")
    elif s >= 70:
        grades.append("C")
    else:
        grades.append("F")

# Vectorized with np.where: each nested call handles the next lower threshold,
# mirroring the if/elif chain above.
grades = np.where(scores >= 90, "A",
                  np.where(scores >= 80, "B",
                           np.where(scores >= 70, "C", "F")))
print(grades)  # ['B' 'A' 'C' 'A' 'F']
Aggregation Functions
data = np.array([[1, 2, 3],
[4, 5, 6],
[7, 8, 9]])
# Global aggregations (over all nine elements)
print(np.sum(data)) # 45
print(np.mean(data)) # 5.0
print(np.std(data)) # 2.58...
print(np.min(data)) # 1
print(np.max(data)) # 9
print(np.median(data)) # 5.0
print(np.percentile(data, 75)) # 7.0
# Axis-wise aggregations
print(np.sum(data, axis=0)) # [12 15 18] (column sums)
print(np.sum(data, axis=1)) # [ 6 15 24] (row sums)
print(np.mean(data, axis=0)) # [4. 5. 6.]
print(np.mean(data, axis=1)) # [2. 5. 8.]
# Useful statistics
print(np.var(data)) # Variance: 6.666...
print(np.prod(data)) # Product of all elements: 362880
print(np.cumsum(data)) # Cumulative sum (flattened): [ 1 3 6 10 15 21 28 36 45]
print(np.argmin(data)) # Index of minimum in the flattened array: 0
print(np.argmax(data)) # Index of maximum in the flattened array: 8
Linear Algebra
a = np.array([[1, 2], [3, 4]])
b = np.array([[5, 6], [7, 8]])
# Matrix multiplication
print(a @ b) # Same as np.dot(a, b)
print(np.matmul(a, b))
# [[19 22]
# [43 50]]
# Element-wise multiplication
print(a * b)
# [[ 5 12]
# [21 32]]
# Transpose
print(a.T)
# [[1 3]
# [2 4]]
# Inverse
print(np.linalg.inv(a))
# [[-2. 1. ]
# [ 1.5 -0.5]]
# Determinant (computed in floating point, so the printed value may differ
# from -2 in the last digits)
print(np.linalg.det(a)) # -2.0
# Eigenvalues and eigenvectors
eigenvalues, eigenvectors = np.linalg.eig(a)
print(eigenvalues) # [-0.372, 5.372]
print(eigenvectors)
# Singular Value Decomposition
U, S, Vt = np.linalg.svd(a)
print(U.shape, S.shape, Vt.shape) # (2, 2) (2,) (2, 2)
# Solving linear equations: Ax = b, with A = coeffs and b = constants
# 2x + 3y = 8
# 3x + 4y = 11
coeffs = np.array([[2, 3], [3, 4]])
constants = np.array([8, 11])
solution = np.linalg.solve(coeffs, constants)
print(solution) # [1. 2.] (x = 1, y = 2)
Random Sampling
import numpy as np
# Create a seeded generator for reproducibility.
# np.random.default_rng is the interface NumPy recommends for new code. The
# legacy equivalent is np.random.seed(42) followed by np.random.* calls, which
# all share one hidden global state.
rng = np.random.default_rng(42)

# Random numbers
print(rng.random())  # Single float [0, 1)
print(rng.random(5))  # Array of 5 floats
print(rng.random((3, 3)))  # 3x3 array

# Integers (Generator.integers replaces the legacy np.random.randint)
print(rng.integers(0, 10, size=5))  # 5 ints from [0, 10)
print(rng.integers(0, 100, size=(3, 4)))  # 3x4 array

# Normal distribution
print(rng.normal(0, 1, size=5))  # Mean=0, Std=1
print(rng.normal(loc=100, scale=15, size=1000))  # Custom

# Other distributions
rng.uniform(0, 1, 1000)  # Uniform
rng.binomial(10, 0.5, 100)  # Binomial
rng.poisson(5, 100)  # Poisson
rng.exponential(1, 1000)  # Exponential

# Sampling from arrays
data = np.arange(100)
sample = rng.choice(data, size=10, replace=False)  # Without replacement
print(sample)

# Shuffle in place
arr = np.array([1, 2, 3, 4, 5])
rng.shuffle(arr)
print(arr)  # Shuffled

# Permutation (returns new array)
arr2 = rng.permutation([1, 2, 3, 4, 5])
Reshaping and Combining
a = np.arange(12)
# Reshape (the new shape must hold the same 12 elements)
b = a.reshape(3, 4)
c = a.reshape(3, 2, 2) # 3D
d = b.reshape(-1) # Flatten (-1 means infer)
e = a.reshape(4, -1) # 4 rows, auto-compute columns (result is 4x3)
print(b)
# [[ 0 1 2 3]
# [ 4 5 6 7]
# [ 8 9 10 11]]
# Transpose
print(b.T) # 4x3
print(b.swapaxes(0, 1)) # Same as .T for 2D
# Stacking
x = np.array([[1, 2], [3, 4]])
y = np.array([[5, 6], [7, 8]])
print(np.vstack((x, y))) # Vertical: 4x2
print(np.hstack((x, y))) # Horizontal: 2x4
print(np.stack((x, y))) # New axis: 2x2x2
# Splitting
z = np.arange(16).reshape(4, 4)
print(np.hsplit(z, 2)) # Split into 2 column groups (two 4x2 arrays)
print(np.vsplit(z, 2)) # Split into 2 row groups (two 2x4 arrays)
Where and Conditional Logic
data = np.array([15, -3, 22, 0, 8, -1, 30])
# np.where: conditional element selection
result = np.where(data > 0, data, 0) # Keep positive values; replace everything else with 0
print(result) # [15 0 22 0 8 0 30]
# np.select: multiple conditions (conditions[i] pairs with choices[i])
conditions = [data < 0, data == 0, data > 0]
choices = ["negative", "zero", "positive"]
result = np.select(conditions, choices, default="unknown")
print(result) # ['positive' 'negative' 'positive' 'zero' 'positive' 'negative' 'positive']
# np.clip: limit values to a range
clipped = np.clip(data, 0, 25)
print(clipped) # [15 0 22 0 8 0 25]
# np.unique: distinct values and how many times each occurs
values = np.array([1, 3, 2, 3, 1, 3, 2, 1])
print(np.unique(values, return_counts=True))
# (array([1, 2, 3]), array([3, 2, 3]))
Matrix Operations Reference
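Matrix Multiplication: $(AB)_{ij} = \sum_{k} A_{ik} B_{kj}$
Element-wise Multiplication (Hadamard Product): $(A \odot B)_{ij} = A_{ij} B_{ij}$
Dot Product: $\mathbf{a} \cdot \mathbf{b} = \sum_{i} a_i b_i$
Transpose: $(A^{T})_{ij} = A_{ji}$
Vector Norm: $\|\mathbf{x}\|_p = \left( \sum_{i} |x_i|^p \right)^{1/p}$. L2 norm: $\|\mathbf{x}\|_2 = \sqrt{\sum_{i} x_i^2}$, L1 norm: $\|\mathbf{x}\|_1 = \sum_{i} |x_i|$.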
Key Takeaways
- NumPy arrays are 10-100x faster than Python lists for numerical operations.
- Vectorization eliminates loops — think in array operations, not element-by-element.
- Broadcasting handles different shapes automatically, saving you from manual tiling.
- Master `axis=0` (column) vs `axis=1` (row) for aggregations.
- Use `np.where`, `np.select`, and boolean indexing for conditional logic.
- Set random seeds (`np.random.seed()`) for reproducible results.