Graph Data Science: Networks and Graph Analytics

Graph data science analyzes relationships between entities. This lesson covers network analysis, centrality, community detection, and graph neural networks.

Graph Fundamentals

NetworkX Basics

The adjacency matrix $A$ represents graph connectivity:

A_{ij} = \begin{cases} 1 & \text{if } (i,j) \in E \\ 0 & \text{otherwise} \end{cases}

import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Create graph
G = nx.Graph()

# Add nodes with attributes
G.add_node(1, name="Alice", age=25, role="Engineer")
G.add_node(2, name="Bob", age=30, role="Manager")
G.add_node(3, name="Charlie", age=35, role="Director")

# Add edges with weights
G.add_edge(1, 2, weight=0.8, relationship="colleague")
G.add_edge(2, 3, weight=0.6, relationship="reports_to")
G.add_edge(1, 3, weight=0.3, relationship="indirect")

# Basic properties
print(f"Nodes: {G.number_of_nodes()}")
print(f"Edges: {G.number_of_edges()}")
print(f"Is directed: {G.is_directed()}")
print(f"Is connected: {nx.is_connected(G)}")

# Visualize
pos = nx.spring_layout(G, seed=42)
nx.draw(G, pos, with_labels=True, node_color='lightblue', 
        node_size=500, font_size=10, font_weight='bold')
nx.draw_networkx_edge_labels(G, pos, edge_labels={(u,v): d['weight'] 
                                                   for u,v,d in G.edges(data=True)})
plt.title("Social Network Graph")
plt.show()

Centrality Measures

Degree centrality:

C_D(v) = \frac{\text{deg}(v)}{n-1}

Betweenness centrality:

C_B(v) = \sum_{s \neq v \neq t} \frac{\sigma_{st}(v)}{\sigma_{st}}

# Degree Centrality
degree_centrality = nx.degree_centrality(G)
print("Degree Centrality:", degree_centrality)

# Betweenness Centrality
betweenness = nx.betweenness_centrality(G, weight='weight')
print("Betweenness Centrality:", betweenness)

# Closeness Centrality
closeness = nx.closeness_centrality(G)
print("Closeness Centrality:", closeness)

# Eigenvector Centrality
eigenvector = nx.eigenvector_centrality(G, max_iter=1000, weight='weight')
print("Eigenvector Centrality:", eigenvector)

# PageRank
pagerank = nx.pagerank(G, alpha=0.85, weight='weight')
print("PageRank:", pagerank)

# Visualize centralities
fig, axes = plt.subplots(1, 4, figsize=(20, 5))

centrality_measures = {
    'Degree': degree_centrality,
    'Betweenness': betweenness,
    'Closeness': closeness,
    'Eigenvector': eigenvector
}

for ax, (name, centrality) in zip(axes, centrality_measures.items()):
    node_colors = [centrality[node] for node in G.nodes()]
    nx.draw(G, pos, ax=ax, node_color=node_colors, 
            node_size=500, cmap=plt.cm.viridis, with_labels=True)
    ax.set_title(f'{name} Centrality')

plt.tight_layout()
plt.show()

Community Detection

from networkx.algorithms import community

# Louvain Community Detection
communities_louvain = community.louvain_communities(G, weight='weight', resolution=1)
print("Louvain Communities:", communities_louvain)

# Girvan-Newman (hierarchical)
communities_gn = community.girvan_newman(G)
top_level_communities = next(communities_gn)
print("Girvan-Newman Communities:", top_level_communities)

# Label Propagation
communities_lp = list(community.label_propagation_communities(G))
print("Label Propagation Communities:", communities_lp)

# Modularity score
modularity = community.modularity(G, communities_louvain)
print(f"Modularity: {modularity:.4f}")

# Visualize communities
def draw_communities(G, communities, title):
    pos = nx.spring_layout(G, seed=42)
    plt.figure(figsize=(10, 8))
    
    # Color nodes by community
    colors = plt.cm.Set3(np.linspace(0, 1, len(communities)))
    node_colors = {}
    
    for i, comm in enumerate(communities):
        for node in comm:
            node_colors[node] = colors[i]
    
    nx.draw(G, pos, node_color=[node_colors[node] for node in G.nodes()],
            node_size=500, with_labels=True, font_size=10)
    plt.title(title)
    plt.show()

draw_communities(G, communities_louvain, "Louvain Communities")

Graph Algorithms

# Shortest Path
shortest_path = nx.shortest_path(G, source=1, target=3, weight='weight')
shortest_path_length = nx.shortest_path_length(G, source=1, target=3, weight='weight')
print(f"Shortest path from 1 to 3: {shortest_path}")
print(f"Path length: {shortest_path_length:.4f}")

# All-pairs shortest paths
all_shortest = dict(nx.all_pairs_dijkstra_path_length(G))
print("All-pairs shortest paths:", all_shortest)

# Minimum Spanning Tree
mst = nx.minimum_spanning_tree(G, weight='weight')
print(f"Minimum Spanning Tree edges: {list(mst.edges(data=True))}")

# Clustering Coefficient
clustering = nx.clustering(G, weight='weight')
print("Clustering Coefficient:", clustering)

# Transitivity (global clustering)
transitivity = nx.transitivity(G)
print(f"Graph Transitivity: {transitivity:.4f}")

# Triangles
triangles = nx.triangles(G)
print("Triangles per node:", triangles)

Graph Neural Networks (GNNs)

import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, SAGEConv, GATConv
from torch_geometric.data import Data

class GCN(torch.nn.Module):
    """Graph Convolutional Network"""
    def __init__(self, num_features, num_classes, hidden_channels=16):
        super().__init__()
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)
    
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

class GraphSAGE(torch.nn.Module):
    """GraphSAGE Network"""
    def __init__(self, num_features, num_classes, hidden_channels=16):
        super().__init__()
        self.conv1 = SAGEConv(num_features, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, num_classes)
    
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

class GAT(torch.nn.Module):
    """Graph Attention Network"""
    def __init__(self, num_features, num_classes, hidden_channels=16, heads=8):
        super().__init__()
        self.conv1 = GATConv(num_features, hidden_channels, heads=heads)
        self.conv2 = GATConv(hidden_channels * heads, num_classes)
    
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

# Prepare data
def prepare_graph_data(G, labels):
    """Convert NetworkX graph to PyTorch Geometric format"""
    node_list = list(G.nodes())
    node_to_idx = {node: idx for idx, node in enumerate(node_list)}
    
    # Node features (degree, clustering coefficient, etc.)
    features = []
    for node in node_list:
        feat = [
            G.degree(node),
            nx.clustering(G, node),
            nx.pagerank(G)[node]
        ]
        features.append(feat)
    
    x = torch.tensor(features, dtype=torch.float)
    
    # Edge index
    edge_index = []
    for u, v in G.edges():
        edge_index.append([node_to_idx[u], node_to_idx[v]])
        edge_index.append([node_to_idx[v], node_to_idx[u]])  # Undirected
    
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    
    # Labels
    y = torch.tensor([labels[node] for node in node_list], dtype=torch.long)
    
    return Data(x=x, edge_index=edge_index, y=y)

Knowledge Graphs

import rdflib
from rdflib import URIRef, Literal, Namespace

# Create knowledge graph
g = rdflib.Graph()

# Define namespaces
EX = Namespace("http://example.org/")
RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")

# Add triples (subject, predicate, object)
g.add((EX.Alice, RDF.type, EX.Person))
g.add((EX.Alice, EX.age, Literal(25)))
g.add((EX.Alice, EX.worksAt, EX.CompanyA))
g.add((EX.Bob, RDF.type, EX.Person))
g.add((EX.Bob, EX.age, Literal(30)))
g.add((EX.Bob, EX.manages, EX.Alice))

# Query with SPARQL
query = """
SELECT ?person ?age ?company
WHERE {
    ?person rdf:type ex:Person .
    ?person ex:age ?age .
    ?person ex:worksAt ?company .
    FILTER (?age > 25)
}
"""

# Execute query
results = g.query(query, initNs={'ex': EX, 'rdf': RDF})
for row in results:
    print(f"Person: {row.person}, Age: {row.age}, Company: {row.company}")

Graph Features for ML

def extract_graph_features(G):
    """Extract graph-level features for ML"""
    features = {}
    
    # Basic features
    features['num_nodes'] = G.number_of_nodes()
    features['num_edges'] = G.number_of_edges()
    features['density'] = nx.density(G)
    features['is_connected'] = nx.is_connected(G)
    
    # Degree statistics
    degrees = [d for n, d in G.degree()]
    features['avg_degree'] = np.mean(degrees)
    features['max_degree'] = max(degrees)
    features['degree_std'] = np.std(degrees)
    
    # Centrality statistics
    betweenness = list(nx.betweenness_centrality(G).values())
    features['avg_betweenness'] = np.mean(betweenness)
    
    # Clustering
    features['avg_clustering'] = nx.average_clustering(G)
    features['transitivity'] = nx.transitivity(G)
    
    # Connectivity
    if nx.is_connected(G):
        features['diameter'] = nx.diameter(G)
        features['avg_shortest_path'] = nx.average_shortest_path_length(G)
    else:
        largest_cc = max(nx.connected_components(G), key=len)
        subgraph = G.subgraph(largest_cc)
        features['diameter'] = nx.diameter(subgraph)
        features['avg_shortest_path'] = nx.average_shortest_path_length(subgraph)
    
    return features

# Extract features for multiple graphs
graph_features = []
for G in graph_list:
    features = extract_graph_features(G)
    graph_features.append(features)

features_df = pd.DataFrame(graph_features)

Key Takeaways

Graphs model relationships between entities
Centrality measures identify important nodes
Community detection finds groups in networks
GNNs learn node/graph representations
Graph features enable ML on network data

Graph Data Science: Networks and Graph Analytics

Graph Data Science: Networks and Graph Analytics

Graph Fundamentals

NetworkX Basics

Centrality Measures

Community Detection

Graph Algorithms

Graph Neural Networks (GNNs)

Knowledge Graphs

Graph Features for ML

Key Takeaways

Premium Content

Need Expert Data Science Help?