Graph Data Science: Networks and Graph Analytics
Graph data science analyzes relationships between entities. This lesson covers network analysis, centrality, community detection, and graph neural networks.
Graph Fundamentals
NetworkX Basics
The adjacency matrix represents graph connectivity:
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Create graph
G = nx.Graph()
# Add nodes with attributes
G.add_node(1, name="Alice", age=25, role="Engineer")
G.add_node(2, name="Bob", age=30, role="Manager")
G.add_node(3, name="Charlie", age=35, role="Director")
# Add edges with weights
G.add_edge(1, 2, weight=0.8, relationship="colleague")
G.add_edge(2, 3, weight=0.6, relationship="reports_to")
G.add_edge(1, 3, weight=0.3, relationship="indirect")
# Basic properties
print(f"Nodes: {G.number_of_nodes()}")
print(f"Edges: {G.number_of_edges()}")
print(f"Is directed: {G.is_directed()}")
print(f"Is connected: {nx.is_connected(G)}")
# Visualize
pos = nx.spring_layout(G, seed=42)
nx.draw(G, pos, with_labels=True, node_color='lightblue',
node_size=500, font_size=10, font_weight='bold')
nx.draw_networkx_edge_labels(G, pos, edge_labels={(u,v): d['weight']
for u,v,d in G.edges(data=True)})
plt.title("Social Network Graph")
plt.show()
Centrality Measures
Degree centrality:
Betweenness centrality:
# Degree Centrality
degree_centrality = nx.degree_centrality(G)
print("Degree Centrality:", degree_centrality)
# Betweenness Centrality
betweenness = nx.betweenness_centrality(G, weight='weight')
print("Betweenness Centrality:", betweenness)
# Closeness Centrality
closeness = nx.closeness_centrality(G)
print("Closeness Centrality:", closeness)
# Eigenvector Centrality
eigenvector = nx.eigenvector_centrality(G, max_iter=1000, weight='weight')
print("Eigenvector Centrality:", eigenvector)
# PageRank
pagerank = nx.pagerank(G, alpha=0.85, weight='weight')
print("PageRank:", pagerank)
# Visualize centralities
fig, axes = plt.subplots(1, 4, figsize=(20, 5))
centrality_measures = {
'Degree': degree_centrality,
'Betweenness': betweenness,
'Closeness': closeness,
'Eigenvector': eigenvector
}
for ax, (name, centrality) in zip(axes, centrality_measures.items()):
node_colors = [centrality[node] for node in G.nodes()]
nx.draw(G, pos, ax=ax, node_color=node_colors,
node_size=500, cmap=plt.cm.viridis, with_labels=True)
ax.set_title(f'{name} Centrality')
plt.tight_layout()
plt.show()
Community Detection
from networkx.algorithms import community
# Louvain Community Detection
communities_louvain = community.louvain_communities(G, weight='weight', resolution=1)
print("Louvain Communities:", communities_louvain)
# Girvan-Newman (hierarchical)
communities_gn = community.girvan_newman(G)
top_level_communities = next(communities_gn)
print("Girvan-Newman Communities:", top_level_communities)
# Label Propagation
communities_lp = list(community.label_propagation_communities(G))
print("Label Propagation Communities:", communities_lp)
# Modularity score
modularity = community.modularity(G, communities_louvain)
print(f"Modularity: {modularity:.4f}")
# Visualize communities
def draw_communities(G, communities, title):
pos = nx.spring_layout(G, seed=42)
plt.figure(figsize=(10, 8))
# Color nodes by community
colors = plt.cm.Set3(np.linspace(0, 1, len(communities)))
node_colors = {}
for i, comm in enumerate(communities):
for node in comm:
node_colors[node] = colors[i]
nx.draw(G, pos, node_color=[node_colors[node] for node in G.nodes()],
node_size=500, with_labels=True, font_size=10)
plt.title(title)
plt.show()
draw_communities(G, communities_louvain, "Louvain Communities")
Graph Algorithms
# Shortest Path
shortest_path = nx.shortest_path(G, source=1, target=3, weight='weight')
shortest_path_length = nx.shortest_path_length(G, source=1, target=3, weight='weight')
print(f"Shortest path from 1 to 3: {shortest_path}")
print(f"Path length: {shortest_path_length:.4f}")
# All-pairs shortest paths
all_shortest = dict(nx.all_pairs_dijkstra_path_length(G))
print("All-pairs shortest paths:", all_shortest)
# Minimum Spanning Tree
mst = nx.minimum_spanning_tree(G, weight='weight')
print(f"Minimum Spanning Tree edges: {list(mst.edges(data=True))}")
# Clustering Coefficient
clustering = nx.clustering(G, weight='weight')
print("Clustering Coefficient:", clustering)
# Transitivity (global clustering)
transitivity = nx.transitivity(G)
print(f"Graph Transitivity: {transitivity:.4f}")
# Triangles
triangles = nx.triangles(G)
print("Triangles per node:", triangles)
Graph Neural Networks (GNNs)
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, SAGEConv, GATConv
from torch_geometric.data import Data
class GCN(torch.nn.Module):
"""Graph Convolutional Network"""
def __init__(self, num_features, num_classes, hidden_channels=16):
super().__init__()
self.conv1 = GCNConv(num_features, hidden_channels)
self.conv2 = GCNConv(hidden_channels, num_classes)
def forward(self, x, edge_index):
x = self.conv1(x, edge_index)
x = F.relu(x)
x = F.dropout(x, p=0.5, training=self.training)
x = self.conv2(x, edge_index)
return F.log_softmax(x, dim=1)
class GraphSAGE(torch.nn.Module):
"""GraphSAGE Network"""
def __init__(self, num_features, num_classes, hidden_channels=16):
super().__init__()
self.conv1 = SAGEConv(num_features, hidden_channels)
self.conv2 = SAGEConv(hidden_channels, num_classes)
def forward(self, x, edge_index):
x = self.conv1(x, edge_index)
x = F.relu(x)
x = self.conv2(x, edge_index)
return F.log_softmax(x, dim=1)
class GAT(torch.nn.Module):
"""Graph Attention Network"""
def __init__(self, num_features, num_classes, hidden_channels=16, heads=8):
super().__init__()
self.conv1 = GATConv(num_features, hidden_channels, heads=heads)
self.conv2 = GATConv(hidden_channels * heads, num_classes)
def forward(self, x, edge_index):
x = self.conv1(x, edge_index)
x = F.relu(x)
x = self.conv2(x, edge_index)
return F.log_softmax(x, dim=1)
# Prepare data
def prepare_graph_data(G, labels):
"""Convert NetworkX graph to PyTorch Geometric format"""
node_list = list(G.nodes())
node_to_idx = {node: idx for idx, node in enumerate(node_list)}
# Node features (degree, clustering coefficient, etc.)
features = []
for node in node_list:
feat = [
G.degree(node),
nx.clustering(G, node),
nx.pagerank(G)[node]
]
features.append(feat)
x = torch.tensor(features, dtype=torch.float)
# Edge index
edge_index = []
for u, v in G.edges():
edge_index.append([node_to_idx[u], node_to_idx[v]])
edge_index.append([node_to_idx[v], node_to_idx[u]]) # Undirected
edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
# Labels
y = torch.tensor([labels[node] for node in node_list], dtype=torch.long)
return Data(x=x, edge_index=edge_index, y=y)
Knowledge Graphs
import rdflib
from rdflib import URIRef, Literal, Namespace
# Create knowledge graph
g = rdflib.Graph()
# Define namespaces
EX = Namespace("http://example.org/")
RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
# Add triples (subject, predicate, object)
g.add((EX.Alice, RDF.type, EX.Person))
g.add((EX.Alice, EX.age, Literal(25)))
g.add((EX.Alice, EX.worksAt, EX.CompanyA))
g.add((EX.Bob, RDF.type, EX.Person))
g.add((EX.Bob, EX.age, Literal(30)))
g.add((EX.Bob, EX.manages, EX.Alice))
# Query with SPARQL
query = """
SELECT ?person ?age ?company
WHERE {
?person rdf:type ex:Person .
?person ex:age ?age .
?person ex:worksAt ?company .
FILTER (?age > 25)
}
"""
# Execute query
results = g.query(query, initNs={'ex': EX, 'rdf': RDF})
for row in results:
print(f"Person: {row.person}, Age: {row.age}, Company: {row.company}")
Graph Features for ML
def extract_graph_features(G):
"""Extract graph-level features for ML"""
features = {}
# Basic features
features['num_nodes'] = G.number_of_nodes()
features['num_edges'] = G.number_of_edges()
features['density'] = nx.density(G)
features['is_connected'] = nx.is_connected(G)
# Degree statistics
degrees = [d for n, d in G.degree()]
features['avg_degree'] = np.mean(degrees)
features['max_degree'] = max(degrees)
features['degree_std'] = np.std(degrees)
# Centrality statistics
betweenness = list(nx.betweenness_centrality(G).values())
features['avg_betweenness'] = np.mean(betweenness)
# Clustering
features['avg_clustering'] = nx.average_clustering(G)
features['transitivity'] = nx.transitivity(G)
# Connectivity
if nx.is_connected(G):
features['diameter'] = nx.diameter(G)
features['avg_shortest_path'] = nx.average_shortest_path_length(G)
else:
largest_cc = max(nx.connected_components(G), key=len)
subgraph = G.subgraph(largest_cc)
features['diameter'] = nx.diameter(subgraph)
features['avg_shortest_path'] = nx.average_shortest_path_length(subgraph)
return features
# Extract features for multiple graphs
graph_features = []
for G in graph_list:
features = extract_graph_features(G)
graph_features.append(features)
features_df = pd.DataFrame(graph_features)
Key Takeaways
- Graphs model relationships between entities
- Centrality measures identify important nodes
- Community detection finds groups in networks
- GNNs learn node/graph representations
- Graph features enable ML on network data