The Interview Question
βΉοΈ
Question: You're analyzing user behavior on a social platform:
- Data: User connections, interactions, and geographic locations
- Requirements: Identify influential users, detect communities, analyze spatial patterns
Walk through your advanced analytics approach:
- How do you model and analyze the user network?
- How do you identify key influencers and communities?
- How do you incorporate geospatial analysis?
- How do you scale these analyses to millions of users?
Detailed Answer
1. Graph Analytics Framework
Graph analytics examines relationships between entities, uncovering patterns in network data.
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
from typing import Dict, List, Tuple
import warnings
warnings.filterwarnings('ignore')
class GraphAnalytics:
"""Graph analytics framework"""
def __init__(self, edges=None, directed=False):
if directed:
self.G = nx.DiGraph()
else:
self.G = nx.Graph()
if edges is not None:
self.G.add_edges_from(edges)
def add_weighted_edges(self, edges_with_weights):
"""Add weighted edges to graph"""
for u, v, weight in edges_with_weights:
self.G.add_edge(u, v, weight=weight)
def calculate_centrality_metrics(self) -> Dict:
"""Calculate various centrality metrics"""
metrics = {}
# Degree centrality
metrics['degree'] = nx.degree_centrality(self.G)
# Betweenness centrality
metrics['betweenness'] = nx.betweenness_centrality(self.G)
# Closeness centrality
metrics['closeness'] = nx.closeness_centrality(self.G)
# Eigenvector centrality (for undirected graphs)
try:
metrics['eigenvector'] = nx.eigenvector_centrality(self.G, max_iter=1000)
except nx.PowerIterationFailedConvergence:
metrics['eigenvector'] = {node: 0 for node in self.G.nodes()}
# PageRank (for directed graphs)
if self.G.is_directed():
metrics['pagerank'] = nx.pagerank(self.G)
# Create summary DataFrame
centrality_df = pd.DataFrame(metrics)
centrality_df['node'] = centrality_df.index
centrality_df = centrality_df.sort_values('degree', ascending=False)
return centrality_df
def detect_communities(self, method='louvain'):
"""Detect communities in the graph"""
if method == 'louvain':
from community import community_louvain
partition = community_louvain.best_partition(self.G)
elif method == 'label_propagation':
communities = nx.community.label_propagation_communities(self.G)
partition = {}
for idx, community in enumerate(communities):
for node in community:
partition[node] = idx
elif method == 'girvan_newman':
from networkx.algorithms.community import girvan_newman
comp = girvan_newman(self.G)
top_communities = next(comp)
partition = {}
for idx, community in enumerate(top_communities):
for node in community:
partition[node] = idx
# Calculate community statistics
community_sizes = defaultdict(int)
for node, community in partition.items():
community_sizes[community] += 1
# Calculate modularity
from networkx.algorithms.community import modularity
communities = []
for comm_id in set(partition.values()):
community_nodes = [n for n, c in partition.items() if c == comm_id]
communities.append(set(community_nodes))
modularity_score = modularity(self.G, communities)
results = {
'partition': partition,
'community_sizes': dict(community_sizes),
'n_communities': len(set(partition.values())),
'modularity': modularity_score
}
return results
def find_shortest_paths(self, source, target=None):
"""Find shortest paths between nodes"""
if target is None:
# All shortest paths from source
lengths = nx.single_source_shortest_path_length(self.G, source)
paths = nx.single_source_shortest_path(self.G, source)
else:
# Shortest path between source and target
try:
length = nx.shortest_path_length(self.G, source, target)
path = nx.shortest_path(self.G, source, target)
lengths = {target: length}
paths = {target: path}
except nx.NetworkXNoPath:
lengths = {}
paths = {}
return lengths, paths
def calculate_network_statistics(self) -> Dict:
"""Calculate network-level statistics"""
stats = {
'n_nodes': self.G.number_of_nodes(),
'n_edges': self.G.number_of_edges(),
'density': nx.density(self.G),
'is_connected': nx.is_connected(self.G) if not self.G.is_directed() else nx.is_weakly_connected(self.G),
'average_clustering': nx.average_clustering(self.G),
'transitivity': nx.transitivity(self.G),
}
if not self.G.is_directed():
stats['average_shortest_path_length'] = nx.average_shortest_path_length(self.G) if nx.is_connected(self.G) else None
stats['diameter'] = nx.diameter(self.G) if nx.is_connected(self.G) else None
# Degree distribution
degrees = [d for n, d in self.G.degree()]
stats['average_degree'] = np.mean(degrees)
stats['degree_std'] = np.std(degrees)
return stats
def visualize_network(self, node_colors=None, node_sizes=None,
figsize=(12, 10), title="Network Visualization"):
"""Visualize the network"""
fig, ax = plt.subplots(figsize=figsize)
# Layout
pos = nx.spring_layout(self.G, k=2/np.sqrt(self.G.number_of_nodes()), iterations=50)
# Default node properties
if node_colors is None:
node_colors = 'lightblue'
if node_sizes is None:
node_sizes = [50 for _ in self.G.nodes()]
# Draw
nx.draw_networkx_edges(self.G, pos, alpha=0.3, ax=ax)
nx.draw_networkx_nodes(self.G, pos, node_color=node_colors,
node_size=node_sizes, alpha=0.8, ax=ax)
# Labels for small graphs
if self.G.number_of_nodes() < 100:
nx.draw_networkx_labels(self.G, pos, font_size=8, ax=ax)
ax.set_title(title, fontsize=14, fontweight='bold')
ax.axis('off')
plt.tight_layout()
plt.savefig('network_visualization.png', dpi=150, bbox_inches='tight')
plt.show()
# Example usage
# Create sample social network
edges = [
('Alice', 'Bob'), ('Alice', 'Charlie'), ('Bob', 'David'),
('Charlie', 'Eve'), ('David', 'Frank'), ('Eve', 'Grace'),
('Frank', 'Alice'), ('Grace', 'Bob')
]
graph = GraphAnalytics(edges)
stats = graph.calculate_network_statistics()
centrality = graph.calculate_centrality_metrics()
communities = graph.detect_communities()
graph.visualize_network()
2. Geospatial Analysis
import geopandas as gpd
from shapely.geometry import Point, Polygon
from scipy.spatial import KDTree
import folium
from folium.plugins import HeatMap, MarkerCluster
class GeospatialAnalyzer:
"""Geospatial analysis framework"""
def __init__(self, data, lat_col='latitude', lon_col='longitude'):
self.data = data.copy()
self.lat_col = lat_col
self.lon_col = lon_col
self.geometry = [Point(lon, lat) for lon, lat in zip(data[lon_col], data[lat_col])]
self.gdf = gpd.GeoDataFrame(data, geometry=self.geometry, crs='EPSG:4326')
def calculate_spatial_statistics(self):
"""Calculate spatial statistics"""
coords = np.column_stack([self.data[self.lon_col], self.data[self.lat_col]])
# Nearest neighbor analysis
tree = KDTree(coords)
distances, indices = tree.query(coords, k=2)
nn_distances = distances[:, 1] # Distance to nearest neighbor
# Ripley's K (simplified)
# For now, just calculate basic statistics
stats = {
'centroid': {
'lat': self.data[self.lat_col].mean(),
'lon': self.data[self.lon_col].mean()
},
'spread': {
'lat_std': self.data[self.lat_col].std(),
'lon_std': self.data[self.lon_col].std()
},
'nearest_neighbor': {
'mean_distance': np.mean(nn_distances),
'std_distance': np.std(nn_distances),
'min_distance': np.min(nn_distances),
'max_distance': np.max(nn_distances)
}
}
return stats
def create_hexagonal_grid(self, hex_size=0.01):
"""Create hexagonal grid for spatial aggregation"""
# Get bounds
min_lon, max_lon = self.data[self.lon_col].min(), self.data[self.lon_col].max()
min_lat, max_lat = self.data[self.lat_col].min(), self.data[self.lat_col].max()
# Create hexagonal grid
hexagons = []
for lon in np.arange(min_lon, max_lon, hex_size):
for lat in np.arange(min_lat, max_lat, hex_size * 0.866): # Hex height
hex_center = Point(lon, lat)
hexagons.append({
'center': hex_center,
'geometry': self._create_hexagon(lon, lat, hex_size)
})
hex_gdf = gpd.GeoDataFrame(hexagons, crs='EPSG:4326')
return hex_gdf
def _create_hexagon(self, center_lon, center_lat, size):
"""Create hexagon polygon"""
from shapely.geometry import Polygon
angles = np.linspace(0, 2 * np.pi, 7)[:-1] # 6 vertices
vertices = []
for angle in angles:
dx = size * np.cos(angle)
dy = size * np.sin(angle) * 0.866 # Adjust for hex aspect ratio
vertices.append((center_lon + dx, center_lat + dy))
return Polygon(vertices)
def spatial_clustering(self, method='dbscan', eps=0.01, min_samples=5):
"""Spatial clustering"""
coords = np.column_stack([self.data[self.lat_col], self.data[self.lon_col]])
if method == 'dbscan':
from sklearn.cluster import DBSCAN
clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(coords)
labels = clustering.labels_
elif method == 'kmeans':
from sklearn.cluster import KMeans
n_clusters = int(np.sqrt(len(coords)))
clustering = KMeans(n_clusters=n_clusters, random_state=42).fit(coords)
labels = clustering.labels_
self.data['cluster'] = labels
# Calculate cluster statistics
cluster_stats = self.data.groupby('cluster').agg({
self.lat_col: ['mean', 'std'],
self.lon_col: ['mean', 'std'],
'cluster': 'count'
}).round(4)
return labels, cluster_stats
def calculate_spatial_autocorrelation(self, variable):
"""Calculate Moran's I for spatial autocorrelation"""
from libpysal.weights import KNN
from esda.moran import Moran
# Create weight matrix
coords = np.column_stack([self.data[self.lon_col], self.data[self.lat_col]])
w = KNN.from_array(coords, k=8)
w.transform = 'r'
# Calculate Moran's I
moran = Moran(self.data[variable], w)
results = {
'moran_i': moran.I,
'p_value': moran.p_sim,
'expected_i': moran.EI,
'interpretation': 'Positive spatial autocorrelation' if moran.I > 0 else 'Negative spatial autocorrelation'
}
return results
def create_interactive_map(self, output_file='map.html'):
"""Create interactive map using Folium"""
# Center map on data centroid
center_lat = self.data[self.lat_col].mean()
center_lon = self.data[self.lon_col].mean()
m = folium.Map(location=[center_lat, center_lon], zoom_start=10)
# Add heatmap
heat_data = [[lat, lon] for lat, lon in zip(self.data[self.lat_col], self.data[self.lon_col])]
HeatMap(heat_data, radius=15).add_to(m)
# Add marker cluster
marker_cluster = MarkerCluster().add_to(m)
for idx, row in self.data.head(1000).iterrows(): # Limit for performance
folium.Marker(
location=[row[self.lat_col], row[self.lon_col]],
popup=f"Point {idx}"
).add_to(marker_cluster)
# Save map
m.save(output_file)
print(f"Interactive map saved to {output_file}")
return m
def visualize_spatial_analysis(self, variable=None, figsize=(12, 10)):
"""Visualize spatial analysis results"""
fig, axes = plt.subplots(2, 2, figsize=figsize)
# 1. Data points
axes[0, 0].scatter(self.data[self.lon_col], self.data[self.lat_col],
alpha=0.5, s=10)
axes[0, 0].set_xlabel('Longitude')
axes[0, 0].set_ylabel('Latitude')
axes[0, 0].set_title('Spatial Distribution')
# 2. Density heatmap
from scipy.stats import gaussian_kde
xy = np.vstack([self.data[self.lon_col], self.data[self.lat_col]])
z = gaussian_kde(xy)(xy)
axes[0, 1].scatter(self.data[self.lon_col], self.data[self.lat_col],
c=z, cmap='hot', alpha=0.5, s=10)
axes[0, 1].set_xlabel('Longitude')
axes[0, 1].set_ylabel('Latitude')
axes[0, 1].set_title('Density Heatmap')
# 3. Nearest neighbor distribution
coords = np.column_stack([self.data[self.lon_col], self.data[self.lat_col]])
tree = KDTree(coords)
distances, _ = tree.query(coords, k=2)
axes[1, 0].hist(distances[:, 1], bins=30, edgecolor='black', alpha=0.7)
axes[1, 0].set_xlabel('Distance to Nearest Neighbor')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].set_title('Nearest Neighbor Distance Distribution')
# 4. Spatial autocorrelation (if variable provided)
if variable and variable in self.data.columns:
from pysal.explore import esda
from pysal.lib import weights
w = weights.KNN.from_array(coords, k=8)
moran = esda.Moran(self.data[variable], w)
axes[1, 1].scatter(moran.Is, moran.p_sim, alpha=0.5)
axes[1, 1].axhline(y=0.05, color='r', linestyle='--', label='p=0.05')
axes[1, 1].set_xlabel('Moran\'s I')
axes[1, 1].set_ylabel('p-value')
axes[1, 1].set_title('Spatial Autocorrelation')
axes[1, 1].legend()
plt.tight_layout()
plt.savefig('spatial_analysis.png', dpi=150, bbox_inches='tight')
plt.show()
# Example usage
# geo_analyzer = GeospatialAnalyzer(data, lat_col='lat', lon_col='lon')
# spatial_stats = geo_analyzer.calculate_spatial_statistics()
# clusters, cluster_stats = geo_analyzer.spatial_clustering(method='dbscan')
# geo_analyzer.create_interactive_map()
# geo_analyzer.visualize_spatial_analysis()
3. Network Analysis for Recommendations
class NetworkRecommendationEngine:
"""Recommendation engine using network analysis"""
def __init__(self, user_item_interactions, user_user_edges=None):
self.interactions = user_item_interactions
self.user_graph = None
self.item_graph = None
if user_user_edges:
self.user_graph = nx.Graph()
self.user_graph.add_edges_from(user_user_edges)
def build_bipartite_graph(self):
"""Build user-item bipartite graph"""
B = nx.Graph()
# Add nodes
users = self.interactions['user_id'].unique()
items = self.interactions['item_id'].unique()
B.add_nodes_from(users, bipartite=0)
B.add_nodes_from(items, bipartite=1)
# Add edges
for _, row in self.interactions.iterrows():
B.add_edge(row['user_id'], row['item_id'], weight=row.get('rating', 1))
return B
def friend_of_friend_recommendations(self, user_id, n_recommendations=10):
"""Recommend based on friends of friends"""
if self.user_graph is None:
print("User graph not provided")
return []
# Get friends
friends = set(self.user_graph.neighbors(user_id))
# Get friends of friends
fof = set()
for friend in friends:
fof.update(self.user_graph.neighbors(friend))
# Remove user and existing friends
fof = fof - friends - {user_id}
# Score based on number of mutual friends
scores = {}
for potential_friend in fof:
mutual_friends = len(friends.intersection(set(self.user_graph.neighbors(potential_friend))))
scores[potential_friend] = mutual_friends
# Sort and return top N
recommendations = sorted(scores.items(), key=lambda x: x[1], reverse=True)
return recommendations[:n_recommendations]
def pagerank_recommendations(self, user_id, n_recommendations=10):
"""Recommend using Personalized PageRank"""
B = self.build_bipartite_graph()
# Calculate Personalized PageRank
personalization = {node: 0 for node in B.nodes()}
personalization[user_id] = 1
pagerank = nx.pagerank(B, personalization=personalization, alpha=0.85)
# Get item scores (exclude user nodes)
item_scores = {node: score for node, score in pagerank.items()
if node not in self.interactions['user_id'].unique()}
# Remove already interacted items
user_items = set(self.interactions[self.interactions['user_id'] == user_id]['item_id'])
item_scores = {item: score for item, score in item_scores.items()
if item not in user_items}
# Sort and return top N
recommendations = sorted(item_scores.items(), key=lambda x: x[1], reverse=True)
return recommendations[:n_recommendations]
def community_based_recommendations(self, user_id, n_recommendations=10):
"""Recommend based on community membership"""
# Detect communities
communities = nx.community.louvain_communities(self.user_graph)
# Find user's community
user_community = None
for idx, community in enumerate(communities):
if user_id in community:
user_community = idx
break
if user_community is None:
return []
# Get community members
community_members = communities[user_community]
# Get items liked by community members
community_items = self.interactions[
self.interactions['user_id'].isin(community_members)
].groupby('item_id')['rating'].agg(['mean', 'count'])
# Filter by popularity and rating
recommended_items = community_items[
(community_items['count'] >= 5) & # At least 5 ratings
(community_items['mean'] >= 4.0) # High rating
].nlargest(n_recommendations, 'mean')
return recommended_items.index.tolist()
def knowledge_graph_recommendations(self, user_id, item_attributes,
n_recommendations=10):
"""Recommend using knowledge graph embeddings"""
from node2vec import Node2Vec
# Build knowledge graph
G = nx.Graph()
# Add user-item edges
user_items = self.interactions[self.interactions['user_id'] == user_id]['item_id']
for item in user_items:
G.add_edge(user_id, item, type='interacts')
# Add item-attribute edges
for _, row in item_attributes.iterrows():
for attr in ['category', 'genre', 'director']:
if attr in row:
G.add_edge(row['item_id'], f"{attr}_{row[attr]}", type='has_attribute')
# Train Node2Vec
node2vec = Node2Vec(G, dimensions=64, walk_length=30, num_walks=200)
model = node2vec.fit(window=10, min_count=1)
# Get recommendations
recommendations = model.wv.most_similar(user_id, topn=n_recommendations + len(user_items))
# Filter out already interacted items
recommendations = [(item, score) for item, score in recommendations
if item not in user_items.values]
return recommendations[:n_recommendations]
# Example usage
# rec_engine = NetworkRecommendationEngine(interactions, user_user_edges)
# fof_recs = rec_engine.friend_of_friend_recommendations(user_id=123)
# pagerank_recs = rec_engine.pagerank_recommendations(user_id=123)
# community_recs = rec_engine.community_based_recommendations(user_id=123)
4. Real-World Application: Ride-Sharing Analytics
class RideSharingAnalytics:
"""Advanced analytics for ride-sharing platform"""
def __init__(self, ride_data, driver_data, rider_data):
self.rides = ride_data
self.drivers = driver_data
self.riders = rider_data
self.graph_analytics = None
self.geo_analyzer = None
def analyze_ride_network(self):
"""Analyze the ride-sharing network"""
# Create rider-driver bipartite graph
edges = [(row['rider_id'], row['driver_id'])
for _, row in self.rides.iterrows()]
self.graph_analytics = GraphAnalytics(edges, directed=True)
# Calculate metrics
network_stats = self.graph_analytics.calculate_network_statistics()
centrality = self.graph_analytics.calculate_centrality_metrics()
# Find most connected drivers (hubs)
top_drivers = centrality.nlargest(10, 'degree')
return {
'network_stats': network_stats,
'top_drivers': top_drivers,
'centrality': centrality
}
def analyze_spatial_patterns(self):
"""Analyze spatial patterns in rides"""
# Pickup locations
pickup_data = self.rides[['pickup_lat', 'pickup_lon']].copy()
pickup_data.columns = ['latitude', 'longitude']
self.geo_analyzer = GeospatialAnalyzer(pickup_data)
# Spatial statistics
spatial_stats = self.geo_analyzer.calculate_spatial_statistics()
# Spatial clustering to find demand hotspots
clusters, cluster_stats = self.geo_analyzer.spatial_clustering(
method='dbscan', eps=0.01, min_samples=10
)
# Create demand heatmap
self.geo_analyzer.create_interactive_map('demand_heatmap.html')
return {
'spatial_stats': spatial_stats,
'demand_clusters': cluster_stats,
'n_hotspots': len(cluster_stats)
}
def driver_optimization(self):
"""Optimize driver positioning based on demand"""
# Create demand grid
grid_size = 0.005 # ~500m
demand_grid = self._create_demand_grid(grid_size)
# Calculate driver coverage
driver_locations = self.drivers[['lat', 'lon']].values
# Find underserved areas
underserved = self._find_underserved_areas(demand_grid, driver_locations)
# Recommend driver repositioning
recommendations = self._recommend_repositioning(underserved, demand_grid)
return {
'demand_grid': demand_grid,
'underserved_areas': underserved,
'repositioning_recommendations': recommendations
}
def _create_demand_grid(self, grid_size):
"""Create demand grid from ride data"""
# Create grid
min_lon, max_lon = self.rides['pickup_lon'].min(), self.rides['pickup_lon'].max()
min_lat, max_lat = self.rides['pickup_lat'].min(), self.rides['pickup_lat'].max()
grid = {}
for lon in np.arange(min_lon, max_lon, grid_size):
for lat in np.arange(min_lat, max_lat, grid_size):
# Count rides in this cell
count = len(self.rides[
(self.rides['pickup_lon'] >= lon) &
(self.rides['pickup_lon'] < lon + grid_size) &
(self.rides['pickup_lat'] >= lat) &
(self.rides['pickup_lat'] < lat + grid_size)
])
grid[(lon, lat)] = {
'lon': lon + grid_size/2,
'lat': lat + grid_size/2,
'demand': count
}
return grid
def _find_underserved_areas(self, demand_grid, driver_locations, threshold=0.5):
"""Find areas with high demand but low driver coverage"""
underserved = []
for (lon, lat), cell_info in demand_grid.items():
if cell_info['demand'] > 0:
# Calculate distance to nearest driver
distances = np.sqrt((driver_locations[:, 0] - lat)**2 +
(driver_locations[:, 1] - lon)**2)
nearest_distance = np.min(distances)
# Check if underserved
if nearest_distance > threshold: # > 500m
underserved.append({
'lon': cell_info['lon'],
'lat': cell_info['lat'],
'demand': cell_info['demand'],
'nearest_driver_distance': nearest_distance
})
return pd.DataFrame(underserved)
def _recommend_repositioning(self, underserved, demand_grid):
"""Recommend driver repositioning"""
recommendations = []
for _, area in underserved.iterrows():
# Find nearby drivers
nearby_drivers = self._find_nearby_drivers(
area['lat'], area['lon'],
radius=2.0 # 2km
)
if len(nearby_drivers) > 0:
# Recommend repositioning
recommendations.append({
'target_lat': area['lat'],
'target_lon': area['lon'],
'expected_demand': area['demand'],
'drivers_to_reposition': min(3, len(nearby_drivers)),
'driver_ids': nearby_drivers['driver_id'].tolist()[:3]
})
return pd.DataFrame(recommendations)
def _find_nearby_drivers(self, lat, lon, radius):
"""Find drivers within radius"""
distances = np.sqrt((self.drivers['lat'] - lat)**2 +
(self.drivers['lon'] - lon)**2) * 111 # Rough km conversion
nearby = self.drivers[distances <= radius].copy()
nearby['distance'] = distances[distances <= radius]
return nearby.sort_values('distance')
# Example usage
# ride_analytics = RideSharingAnalytics(rides, drivers, riders)
# network_analysis = ride_analytics.analyze_ride_network()
# spatial_analysis = ride_analytics.analyze_spatial_patterns()
# optimization = ride_analytics.driver_optimization()
π‘
Pro Tip: Graph algorithms can be computationally expensive for large networks. Consider using approximations or sampling for networks with millions of nodes.
5. Common Follow-Up Questions
Follow-up 1: How do you scale graph algorithms to large networks?
def scale_graph_algorithms(graph, method='sampling'):
"""Scale graph algorithms for large networks"""
if method == 'sampling':
# Node sampling
sample_size = min(10000, graph.number_of_nodes())
sampled_nodes = np.random.choice(list(graph.nodes()), size=sample_size, replace=False)
subgraph = graph.subgraph(sampled_nodes)
return subgraph
elif method == 'distributed':
# Use distributed graph processing frameworks
# Example: GraphX, Pregel, or DGL
frameworks = {
'graphx': 'Apache Spark GraphX',
'dgl': 'Deep Graph Library',
'neo4j': 'Neo4j for graph databases'
}
return frameworks
elif method == 'approximation':
# Use approximate algorithms
# Example: Approximate PageRank, Local Clustering
from networkx.algorithms.approximation import pagerank_approx
approx_pr = pagerank_approx.pagerank(graph, alpha=0.85)
return approx_pr
# Example
# large_graph = nx.erdos_renyi_graph(100000, 0.001)
# sampled_graph = scale_graph_algorithms(large_graph, method='sampling')
Follow-up 2: How do you handle dynamic/temporal networks?
class TemporalNetworkAnalyzer:
"""Analyze temporal (time-evolving) networks"""
def __init__(self):
self.snapshots = []
def create_temporal_snapshots(self, edges_with_timestamps, time_window):
"""Create network snapshots over time"""
df = pd.DataFrame(edges_with_timestamps, columns=['u', 'v', 'timestamp'])
# Create time windows
df['time_window'] = df['timestamp'] // time_window
# Create snapshot for each time window
for window in df['time_window'].unique():
window_edges = df[df['time_window'] == window][['u', 'v']].values
G = nx.Graph()
G.add_edges_from(window_edges)
self.snapshots.append({
'time': window,
'graph': G,
'stats': {
'n_nodes': G.number_of_nodes(),
'n_edges': G.number_of_edges(),
'density': nx.density(G)
}
})
return self.snapshots
def analyze_temporal_evolution(self):
"""Analyze how network evolves over time"""
evolution = []
for snapshot in self.snapshots:
evolution.append({
'time': snapshot['time'],
**snapshot['stats']
})
evolution_df = pd.DataFrame(evolution)
# Plot evolution
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
axes[0].plot(evolution_df['time'], evolution_df['n_nodes'], marker='o')
axes[0].set_xlabel('Time Window')
axes[0].set_ylabel('Number of Nodes')
axes[0].set_title('Network Growth')
axes[1].plot(evolution_df['time'], evolution_df['n_edges'], marker='o', color='orange')
axes[1].set_xlabel('Time Window')
axes[1].set_ylabel('Number of Edges')
axes[1].set_title('Edge Growth')
axes[2].plot(evolution_df['time'], evolution_df['density'], marker='o', color='green')
axes[2].set_xlabel('Time Window')
axes[2].set_ylabel('Density')
axes[2].set_title('Network Density Evolution')
plt.tight_layout()
plt.savefig('temporal_network_evolution.png', dpi=150, bbox_inches='tight')
plt.show()
return evolution_df
# Example usage
# temporal_analyzer = TemporalNetworkAnalyzer()
# snapshots = temporal_analyzer.create_temporal_snapshots(edges_with_times, time_window=3600)
# evolution = temporal_analyzer.analyze_temporal_evolution()
Company-Specific Tips
βΉοΈ
Google Tips:
- Google heavily tests on graph algorithms
- Know how to implement PageRank and its variants
- Understand community detection algorithms
- Be comfortable with large-scale graph processing
Uber Tips:
- Uber values geospatial analytics
- Know how to optimize driver positioning
- Understand spatial clustering for demand prediction
- Be familiar with real-time location analytics
Quiz Section
Related Topics
- Graph Databases β Storing and querying graphs
- Network Science β Theoretical foundations
- Geospatial Databases β Spatial data storage
- Real-time Analytics β Temporal network analysis