Advanced Analytics: Graph Analytics, Geospatial, Network Analysis

The Interview Question

ℹ️

Question: You're analyzing user behavior on a social platform:

Data: User connections, interactions, and geographic locations
Requirements: Identify influential users, detect communities, analyze spatial patterns

Walk through your advanced analytics approach:

How do you model and analyze the user network?
How do you identify key influencers and communities?
How do you incorporate geospatial analysis?
How do you scale these analyses to millions of users?

Detailed Answer

1. Graph Analytics Framework

Graph analytics examines relationships between entities, uncovering patterns in network data.

import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
from typing import Dict, List, Tuple
import warnings
warnings.filterwarnings('ignore')

class GraphAnalytics:
    """Graph analytics framework"""
    
    def __init__(self, edges=None, directed=False):
        if directed:
            self.G = nx.DiGraph()
        else:
            self.G = nx.Graph()
        
        if edges is not None:
            self.G.add_edges_from(edges)
    
    def add_weighted_edges(self, edges_with_weights):
        """Add weighted edges to graph"""
        for u, v, weight in edges_with_weights:
            self.G.add_edge(u, v, weight=weight)
    
    def calculate_centrality_metrics(self) -> Dict:
        """Calculate various centrality metrics"""
        
        metrics = {}
        
        # Degree centrality
        metrics['degree'] = nx.degree_centrality(self.G)
        
        # Betweenness centrality
        metrics['betweenness'] = nx.betweenness_centrality(self.G)
        
        # Closeness centrality
        metrics['closeness'] = nx.closeness_centrality(self.G)
        
        # Eigenvector centrality (for undirected graphs)
        try:
            metrics['eigenvector'] = nx.eigenvector_centrality(self.G, max_iter=1000)
        except nx.PowerIterationFailedConvergence:
            metrics['eigenvector'] = {node: 0 for node in self.G.nodes()}
        
        # PageRank (for directed graphs)
        if self.G.is_directed():
            metrics['pagerank'] = nx.pagerank(self.G)
        
        # Create summary DataFrame
        centrality_df = pd.DataFrame(metrics)
        centrality_df['node'] = centrality_df.index
        centrality_df = centrality_df.sort_values('degree', ascending=False)
        
        return centrality_df
    
    def detect_communities(self, method='louvain'):
        """Detect communities in the graph"""
        
        if method == 'louvain':
            from community import community_louvain
            partition = community_louvain.best_partition(self.G)
            
        elif method == 'label_propagation':
            communities = nx.community.label_propagation_communities(self.G)
            partition = {}
            for idx, community in enumerate(communities):
                for node in community:
                    partition[node] = idx
        
        elif method == 'girvan_newman':
            from networkx.algorithms.community import girvan_newman
            comp = girvan_newman(self.G)
            top_communities = next(comp)
            partition = {}
            for idx, community in enumerate(top_communities):
                for node in community:
                    partition[node] = idx
        
        # Calculate community statistics
        community_sizes = defaultdict(int)
        for node, community in partition.items():
            community_sizes[community] += 1
        
        # Calculate modularity
        from networkx.algorithms.community import modularity
        communities = []
        for comm_id in set(partition.values()):
            community_nodes = [n for n, c in partition.items() if c == comm_id]
            communities.append(set(community_nodes))
        
        modularity_score = modularity(self.G, communities)
        
        results = {
            'partition': partition,
            'community_sizes': dict(community_sizes),
            'n_communities': len(set(partition.values())),
            'modularity': modularity_score
        }
        
        return results
    
    def find_shortest_paths(self, source, target=None):
        """Find shortest paths between nodes"""
        
        if target is None:
            # All shortest paths from source
            lengths = nx.single_source_shortest_path_length(self.G, source)
            paths = nx.single_source_shortest_path(self.G, source)
        else:
            # Shortest path between source and target
            try:
                length = nx.shortest_path_length(self.G, source, target)
                path = nx.shortest_path(self.G, source, target)
                lengths = {target: length}
                paths = {target: path}
            except nx.NetworkXNoPath:
                lengths = {}
                paths = {}
        
        return lengths, paths
    
    def calculate_network_statistics(self) -> Dict:
        """Calculate network-level statistics"""
        
        stats = {
            'n_nodes': self.G.number_of_nodes(),
            'n_edges': self.G.number_of_edges(),
            'density': nx.density(self.G),
            'is_connected': nx.is_connected(self.G) if not self.G.is_directed() else nx.is_weakly_connected(self.G),
            'average_clustering': nx.average_clustering(self.G),
            'transitivity': nx.transitivity(self.G),
        }
        
        if not self.G.is_directed():
            stats['average_shortest_path_length'] = nx.average_shortest_path_length(self.G) if nx.is_connected(self.G) else None
            stats['diameter'] = nx.diameter(self.G) if nx.is_connected(self.G) else None
        
        # Degree distribution
        degrees = [d for n, d in self.G.degree()]
        stats['average_degree'] = np.mean(degrees)
        stats['degree_std'] = np.std(degrees)
        
        return stats
    
    def visualize_network(self, node_colors=None, node_sizes=None, 
                         figsize=(12, 10), title="Network Visualization"):
        """Visualize the network"""
        
        fig, ax = plt.subplots(figsize=figsize)
        
        # Layout
        pos = nx.spring_layout(self.G, k=2/np.sqrt(self.G.number_of_nodes()), iterations=50)
        
        # Default node properties
        if node_colors is None:
            node_colors = 'lightblue'
        if node_sizes is None:
            node_sizes = [50 for _ in self.G.nodes()]
        
        # Draw
        nx.draw_networkx_edges(self.G, pos, alpha=0.3, ax=ax)
        nx.draw_networkx_nodes(self.G, pos, node_color=node_colors, 
                              node_size=node_sizes, alpha=0.8, ax=ax)
        
        # Labels for small graphs
        if self.G.number_of_nodes() < 100:
            nx.draw_networkx_labels(self.G, pos, font_size=8, ax=ax)
        
        ax.set_title(title, fontsize=14, fontweight='bold')
        ax.axis('off')
        
        plt.tight_layout()
        plt.savefig('network_visualization.png', dpi=150, bbox_inches='tight')
        plt.show()

# Example usage
# Create sample social network
edges = [
    ('Alice', 'Bob'), ('Alice', 'Charlie'), ('Bob', 'David'),
    ('Charlie', 'Eve'), ('David', 'Frank'), ('Eve', 'Grace'),
    ('Frank', 'Alice'), ('Grace', 'Bob')
]

graph = GraphAnalytics(edges)
stats = graph.calculate_network_statistics()
centrality = graph.calculate_centrality_metrics()
communities = graph.detect_communities()
graph.visualize_network()

2. Geospatial Analysis

import geopandas as gpd
from shapely.geometry import Point, Polygon
from scipy.spatial import KDTree
import folium
from folium.plugins import HeatMap, MarkerCluster

class GeospatialAnalyzer:
    """Geospatial analysis framework"""
    
    def __init__(self, data, lat_col='latitude', lon_col='longitude'):
        self.data = data.copy()
        self.lat_col = lat_col
        self.lon_col = lon_col
        self.geometry = [Point(lon, lat) for lon, lat in zip(data[lon_col], data[lat_col])]
        self.gdf = gpd.GeoDataFrame(data, geometry=self.geometry, crs='EPSG:4326')
    
    def calculate_spatial_statistics(self):
        """Calculate spatial statistics"""
        
        coords = np.column_stack([self.data[self.lon_col], self.data[self.lat_col]])
        
        # Nearest neighbor analysis
        tree = KDTree(coords)
        distances, indices = tree.query(coords, k=2)
        nn_distances = distances[:, 1]  # Distance to nearest neighbor
        
        # Ripley's K (simplified)
        # For now, just calculate basic statistics
        
        stats = {
            'centroid': {
                'lat': self.data[self.lat_col].mean(),
                'lon': self.data[self.lon_col].mean()
            },
            'spread': {
                'lat_std': self.data[self.lat_col].std(),
                'lon_std': self.data[self.lon_col].std()
            },
            'nearest_neighbor': {
                'mean_distance': np.mean(nn_distances),
                'std_distance': np.std(nn_distances),
                'min_distance': np.min(nn_distances),
                'max_distance': np.max(nn_distances)
            }
        }
        
        return stats
    
    def create_hexagonal_grid(self, hex_size=0.01):
        """Create hexagonal grid for spatial aggregation"""
        
        # Get bounds
        min_lon, max_lon = self.data[self.lon_col].min(), self.data[self.lon_col].max()
        min_lat, max_lat = self.data[self.lat_col].min(), self.data[self.lat_col].max()
        
        # Create hexagonal grid
        hexagons = []
        for lon in np.arange(min_lon, max_lon, hex_size):
            for lat in np.arange(min_lat, max_lat, hex_size * 0.866):  # Hex height
                hex_center = Point(lon, lat)
                hexagons.append({
                    'center': hex_center,
                    'geometry': self._create_hexagon(lon, lat, hex_size)
                })
        
        hex_gdf = gpd.GeoDataFrame(hexagons, crs='EPSG:4326')
        
        return hex_gdf
    
    def _create_hexagon(self, center_lon, center_lat, size):
        """Create hexagon polygon"""
        from shapely.geometry import Polygon
        
        angles = np.linspace(0, 2 * np.pi, 7)[:-1]  # 6 vertices
        vertices = []
        
        for angle in angles:
            dx = size * np.cos(angle)
            dy = size * np.sin(angle) * 0.866  # Adjust for hex aspect ratio
            vertices.append((center_lon + dx, center_lat + dy))
        
        return Polygon(vertices)
    
    def spatial_clustering(self, method='dbscan', eps=0.01, min_samples=5):
        """Spatial clustering"""
        
        coords = np.column_stack([self.data[self.lat_col], self.data[self.lon_col]])
        
        if method == 'dbscan':
            from sklearn.cluster import DBSCAN
            clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(coords)
            labels = clustering.labels_
        
        elif method == 'kmeans':
            from sklearn.cluster import KMeans
            n_clusters = int(np.sqrt(len(coords)))
            clustering = KMeans(n_clusters=n_clusters, random_state=42).fit(coords)
            labels = clustering.labels_
        
        self.data['cluster'] = labels
        
        # Calculate cluster statistics
        cluster_stats = self.data.groupby('cluster').agg({
            self.lat_col: ['mean', 'std'],
            self.lon_col: ['mean', 'std'],
            'cluster': 'count'
        }).round(4)
        
        return labels, cluster_stats
    
    def calculate_spatial_autocorrelation(self, variable):
        """Calculate Moran's I for spatial autocorrelation"""
        
        from libpysal.weights import KNN
        from esda.moran import Moran
        
        # Create weight matrix
        coords = np.column_stack([self.data[self.lon_col], self.data[self.lat_col]])
        w = KNN.from_array(coords, k=8)
        w.transform = 'r'
        
        # Calculate Moran's I
        moran = Moran(self.data[variable], w)
        
        results = {
            'moran_i': moran.I,
            'p_value': moran.p_sim,
            'expected_i': moran.EI,
            'interpretation': 'Positive spatial autocorrelation' if moran.I > 0 else 'Negative spatial autocorrelation'
        }
        
        return results
    
    def create_interactive_map(self, output_file='map.html'):
        """Create interactive map using Folium"""
        
        # Center map on data centroid
        center_lat = self.data[self.lat_col].mean()
        center_lon = self.data[self.lon_col].mean()
        
        m = folium.Map(location=[center_lat, center_lon], zoom_start=10)
        
        # Add heatmap
        heat_data = [[lat, lon] for lat, lon in zip(self.data[self.lat_col], self.data[self.lon_col])]
        HeatMap(heat_data, radius=15).add_to(m)
        
        # Add marker cluster
        marker_cluster = MarkerCluster().add_to(m)
        
        for idx, row in self.data.head(1000).iterrows():  # Limit for performance
            folium.Marker(
                location=[row[self.lat_col], row[self.lon_col]],
                popup=f"Point {idx}"
            ).add_to(marker_cluster)
        
        # Save map
        m.save(output_file)
        print(f"Interactive map saved to {output_file}")
        
        return m
    
    def visualize_spatial_analysis(self, variable=None, figsize=(12, 10)):
        """Visualize spatial analysis results"""
        
        fig, axes = plt.subplots(2, 2, figsize=figsize)
        
        # 1. Data points
        axes[0, 0].scatter(self.data[self.lon_col], self.data[self.lat_col], 
                          alpha=0.5, s=10)
        axes[0, 0].set_xlabel('Longitude')
        axes[0, 0].set_ylabel('Latitude')
        axes[0, 0].set_title('Spatial Distribution')
        
        # 2. Density heatmap
        from scipy.stats import gaussian_kde
        
        xy = np.vstack([self.data[self.lon_col], self.data[self.lat_col]])
        z = gaussian_kde(xy)(xy)
        
        axes[0, 1].scatter(self.data[self.lon_col], self.data[self.lat_col], 
                          c=z, cmap='hot', alpha=0.5, s=10)
        axes[0, 1].set_xlabel('Longitude')
        axes[0, 1].set_ylabel('Latitude')
        axes[0, 1].set_title('Density Heatmap')
        
        # 3. Nearest neighbor distribution
        coords = np.column_stack([self.data[self.lon_col], self.data[self.lat_col]])
        tree = KDTree(coords)
        distances, _ = tree.query(coords, k=2)
        
        axes[1, 0].hist(distances[:, 1], bins=30, edgecolor='black', alpha=0.7)
        axes[1, 0].set_xlabel('Distance to Nearest Neighbor')
        axes[1, 0].set_ylabel('Frequency')
        axes[1, 0].set_title('Nearest Neighbor Distance Distribution')
        
        # 4. Spatial autocorrelation (if variable provided)
        if variable and variable in self.data.columns:
            from pysal.explore import esda
            from pysal.lib import weights
            
            w = weights.KNN.from_array(coords, k=8)
            moran = esda.Moran(self.data[variable], w)
            
            axes[1, 1].scatter(moran.Is, moran.p_sim, alpha=0.5)
            axes[1, 1].axhline(y=0.05, color='r', linestyle='--', label='p=0.05')
            axes[1, 1].set_xlabel('Moran\'s I')
            axes[1, 1].set_ylabel('p-value')
            axes[1, 1].set_title('Spatial Autocorrelation')
            axes[1, 1].legend()
        
        plt.tight_layout()
        plt.savefig('spatial_analysis.png', dpi=150, bbox_inches='tight')
        plt.show()

# Example usage
# geo_analyzer = GeospatialAnalyzer(data, lat_col='lat', lon_col='lon')
# spatial_stats = geo_analyzer.calculate_spatial_statistics()
# clusters, cluster_stats = geo_analyzer.spatial_clustering(method='dbscan')
# geo_analyzer.create_interactive_map()
# geo_analyzer.visualize_spatial_analysis()

3. Network Analysis for Recommendations

class NetworkRecommendationEngine:
    """Recommendation engine using network analysis"""
    
    def __init__(self, user_item_interactions, user_user_edges=None):
        self.interactions = user_item_interactions
        self.user_graph = None
        self.item_graph = None
        
        if user_user_edges:
            self.user_graph = nx.Graph()
            self.user_graph.add_edges_from(user_user_edges)
    
    def build_bipartite_graph(self):
        """Build user-item bipartite graph"""
        
        B = nx.Graph()
        
        # Add nodes
        users = self.interactions['user_id'].unique()
        items = self.interactions['item_id'].unique()
        
        B.add_nodes_from(users, bipartite=0)
        B.add_nodes_from(items, bipartite=1)
        
        # Add edges
        for _, row in self.interactions.iterrows():
            B.add_edge(row['user_id'], row['item_id'], weight=row.get('rating', 1))
        
        return B
    
    def friend_of_friend_recommendations(self, user_id, n_recommendations=10):
        """Recommend based on friends of friends"""
        
        if self.user_graph is None:
            print("User graph not provided")
            return []
        
        # Get friends
        friends = set(self.user_graph.neighbors(user_id))
        
        # Get friends of friends
        fof = set()
        for friend in friends:
            fof.update(self.user_graph.neighbors(friend))
        
        # Remove user and existing friends
        fof = fof - friends - {user_id}
        
        # Score based on number of mutual friends
        scores = {}
        for potential_friend in fof:
            mutual_friends = len(friends.intersection(set(self.user_graph.neighbors(potential_friend))))
            scores[potential_friend] = mutual_friends
        
        # Sort and return top N
        recommendations = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        
        return recommendations[:n_recommendations]
    
    def pagerank_recommendations(self, user_id, n_recommendations=10):
        """Recommend using Personalized PageRank"""
        
        B = self.build_bipartite_graph()
        
        # Calculate Personalized PageRank
        personalization = {node: 0 for node in B.nodes()}
        personalization[user_id] = 1
        
        pagerank = nx.pagerank(B, personalization=personalization, alpha=0.85)
        
        # Get item scores (exclude user nodes)
        item_scores = {node: score for node, score in pagerank.items() 
                      if node not in self.interactions['user_id'].unique()}
        
        # Remove already interacted items
        user_items = set(self.interactions[self.interactions['user_id'] == user_id]['item_id'])
        item_scores = {item: score for item, score in item_scores.items() 
                      if item not in user_items}
        
        # Sort and return top N
        recommendations = sorted(item_scores.items(), key=lambda x: x[1], reverse=True)
        
        return recommendations[:n_recommendations]
    
    def community_based_recommendations(self, user_id, n_recommendations=10):
        """Recommend based on community membership"""
        
        # Detect communities
        communities = nx.community.louvain_communities(self.user_graph)
        
        # Find user's community
        user_community = None
        for idx, community in enumerate(communities):
            if user_id in community:
                user_community = idx
                break
        
        if user_community is None:
            return []
        
        # Get community members
        community_members = communities[user_community]
        
        # Get items liked by community members
        community_items = self.interactions[
            self.interactions['user_id'].isin(community_members)
        ].groupby('item_id')['rating'].agg(['mean', 'count'])
        
        # Filter by popularity and rating
        recommended_items = community_items[
            (community_items['count'] >= 5) &  # At least 5 ratings
            (community_items['mean'] >= 4.0)   # High rating
        ].nlargest(n_recommendations, 'mean')
        
        return recommended_items.index.tolist()
    
    def knowledge_graph_recommendations(self, user_id, item_attributes, 
                                       n_recommendations=10):
        """Recommend using knowledge graph embeddings"""
        
        from node2vec import Node2Vec
        
        # Build knowledge graph
        G = nx.Graph()
        
        # Add user-item edges
        user_items = self.interactions[self.interactions['user_id'] == user_id]['item_id']
        for item in user_items:
            G.add_edge(user_id, item, type='interacts')
        
        # Add item-attribute edges
        for _, row in item_attributes.iterrows():
            for attr in ['category', 'genre', 'director']:
                if attr in row:
                    G.add_edge(row['item_id'], f"{attr}_{row[attr]}", type='has_attribute')
        
        # Train Node2Vec
        node2vec = Node2Vec(G, dimensions=64, walk_length=30, num_walks=200)
        model = node2vec.fit(window=10, min_count=1)
        
        # Get recommendations
        recommendations = model.wv.most_similar(user_id, topn=n_recommendations + len(user_items))
        
        # Filter out already interacted items
        recommendations = [(item, score) for item, score in recommendations 
                          if item not in user_items.values]
        
        return recommendations[:n_recommendations]

# Example usage
# rec_engine = NetworkRecommendationEngine(interactions, user_user_edges)
# fof_recs = rec_engine.friend_of_friend_recommendations(user_id=123)
# pagerank_recs = rec_engine.pagerank_recommendations(user_id=123)
# community_recs = rec_engine.community_based_recommendations(user_id=123)

4. Real-World Application: Ride-Sharing Analytics

class RideSharingAnalytics:
    """Advanced analytics for ride-sharing platform"""
    
    def __init__(self, ride_data, driver_data, rider_data):
        self.rides = ride_data
        self.drivers = driver_data
        self.riders = rider_data
        self.graph_analytics = None
        self.geo_analyzer = None
    
    def analyze_ride_network(self):
        """Analyze the ride-sharing network"""
        
        # Create rider-driver bipartite graph
        edges = [(row['rider_id'], row['driver_id']) 
                for _, row in self.rides.iterrows()]
        
        self.graph_analytics = GraphAnalytics(edges, directed=True)
        
        # Calculate metrics
        network_stats = self.graph_analytics.calculate_network_statistics()
        centrality = self.graph_analytics.calculate_centrality_metrics()
        
        # Find most connected drivers (hubs)
        top_drivers = centrality.nlargest(10, 'degree')
        
        return {
            'network_stats': network_stats,
            'top_drivers': top_drivers,
            'centrality': centrality
        }
    
    def analyze_spatial_patterns(self):
        """Analyze spatial patterns in rides"""
        
        # Pickup locations
        pickup_data = self.rides[['pickup_lat', 'pickup_lon']].copy()
        pickup_data.columns = ['latitude', 'longitude']
        
        self.geo_analyzer = GeospatialAnalyzer(pickup_data)
        
        # Spatial statistics
        spatial_stats = self.geo_analyzer.calculate_spatial_statistics()
        
        # Spatial clustering to find demand hotspots
        clusters, cluster_stats = self.geo_analyzer.spatial_clustering(
            method='dbscan', eps=0.01, min_samples=10
        )
        
        # Create demand heatmap
        self.geo_analyzer.create_interactive_map('demand_heatmap.html')
        
        return {
            'spatial_stats': spatial_stats,
            'demand_clusters': cluster_stats,
            'n_hotspots': len(cluster_stats)
        }
    
    def driver_optimization(self):
        """Optimize driver positioning based on demand"""
        
        # Create demand grid
        grid_size = 0.005  # ~500m
        demand_grid = self._create_demand_grid(grid_size)
        
        # Calculate driver coverage
        driver_locations = self.drivers[['lat', 'lon']].values
        
        # Find underserved areas
        underserved = self._find_underserved_areas(demand_grid, driver_locations)
        
        # Recommend driver repositioning
        recommendations = self._recommend_repositioning(underserved, demand_grid)
        
        return {
            'demand_grid': demand_grid,
            'underserved_areas': underserved,
            'repositioning_recommendations': recommendations
        }
    
    def _create_demand_grid(self, grid_size):
        """Create demand grid from ride data"""
        
        # Create grid
        min_lon, max_lon = self.rides['pickup_lon'].min(), self.rides['pickup_lon'].max()
        min_lat, max_lat = self.rides['pickup_lat'].min(), self.rides['pickup_lat'].max()
        
        grid = {}
        for lon in np.arange(min_lon, max_lon, grid_size):
            for lat in np.arange(min_lat, max_lat, grid_size):
                # Count rides in this cell
                count = len(self.rides[
                    (self.rides['pickup_lon'] >= lon) &
                    (self.rides['pickup_lon'] < lon + grid_size) &
                    (self.rides['pickup_lat'] >= lat) &
                    (self.rides['pickup_lat'] < lat + grid_size)
                ])
                
                grid[(lon, lat)] = {
                    'lon': lon + grid_size/2,
                    'lat': lat + grid_size/2,
                    'demand': count
                }
        
        return grid
    
    def _find_underserved_areas(self, demand_grid, driver_locations, threshold=0.5):
        """Find areas with high demand but low driver coverage"""
        
        underserved = []
        
        for (lon, lat), cell_info in demand_grid.items():
            if cell_info['demand'] > 0:
                # Calculate distance to nearest driver
                distances = np.sqrt((driver_locations[:, 0] - lat)**2 + 
                                  (driver_locations[:, 1] - lon)**2)
                nearest_distance = np.min(distances)
                
                # Check if underserved
                if nearest_distance > threshold:  # > 500m
                    underserved.append({
                        'lon': cell_info['lon'],
                        'lat': cell_info['lat'],
                        'demand': cell_info['demand'],
                        'nearest_driver_distance': nearest_distance
                    })
        
        return pd.DataFrame(underserved)
    
    def _recommend_repositioning(self, underserved, demand_grid):
        """Recommend driver repositioning"""
        
        recommendations = []
        
        for _, area in underserved.iterrows():
            # Find nearby drivers
            nearby_drivers = self._find_nearby_drivers(
                area['lat'], area['lon'], 
                radius=2.0  # 2km
            )
            
            if len(nearby_drivers) > 0:
                # Recommend repositioning
                recommendations.append({
                    'target_lat': area['lat'],
                    'target_lon': area['lon'],
                    'expected_demand': area['demand'],
                    'drivers_to_reposition': min(3, len(nearby_drivers)),
                    'driver_ids': nearby_drivers['driver_id'].tolist()[:3]
                })
        
        return pd.DataFrame(recommendations)
    
    def _find_nearby_drivers(self, lat, lon, radius):
        """Find drivers within radius"""
        
        distances = np.sqrt((self.drivers['lat'] - lat)**2 + 
                          (self.drivers['lon'] - lon)**2) * 111  # Rough km conversion
        
        nearby = self.drivers[distances <= radius].copy()
        nearby['distance'] = distances[distances <= radius]
        
        return nearby.sort_values('distance')

# Example usage
# ride_analytics = RideSharingAnalytics(rides, drivers, riders)
# network_analysis = ride_analytics.analyze_ride_network()
# spatial_analysis = ride_analytics.analyze_spatial_patterns()
# optimization = ride_analytics.driver_optimization()

💡

Pro Tip: Graph algorithms can be computationally expensive for large networks. Consider using approximations or sampling for networks with millions of nodes.

5. Common Follow-Up Questions

Follow-up 1: How do you scale graph algorithms to large networks?

def scale_graph_algorithms(graph, method='sampling'):
    """Scale graph algorithms for large networks"""
    
    if method == 'sampling':
        # Node sampling
        sample_size = min(10000, graph.number_of_nodes())
        sampled_nodes = np.random.choice(list(graph.nodes()), size=sample_size, replace=False)
        subgraph = graph.subgraph(sampled_nodes)
        
        return subgraph
    
    elif method == 'distributed':
        # Use distributed graph processing frameworks
        # Example: GraphX, Pregel, or DGL
        frameworks = {
            'graphx': 'Apache Spark GraphX',
            'dgl': 'Deep Graph Library',
            'neo4j': 'Neo4j for graph databases'
        }
        return frameworks
    
    elif method == 'approximation':
        # Use approximate algorithms
        # Example: Approximate PageRank, Local Clustering
        from networkx.algorithms.approximation import pagerank_approx
        
        approx_pr = pagerank_approx.pagerank(graph, alpha=0.85)
        
        return approx_pr

# Example
# large_graph = nx.erdos_renyi_graph(100000, 0.001)
# sampled_graph = scale_graph_algorithms(large_graph, method='sampling')

Follow-up 2: How do you handle dynamic/temporal networks?

class TemporalNetworkAnalyzer:
    """Analyze temporal (time-evolving) networks"""
    
    def __init__(self):
        self.snapshots = []
    
    def create_temporal_snapshots(self, edges_with_timestamps, time_window):
        """Create network snapshots over time"""
        
        df = pd.DataFrame(edges_with_timestamps, columns=['u', 'v', 'timestamp'])
        
        # Create time windows
        df['time_window'] = df['timestamp'] // time_window
        
        # Create snapshot for each time window
        for window in df['time_window'].unique():
            window_edges = df[df['time_window'] == window][['u', 'v']].values
            
            G = nx.Graph()
            G.add_edges_from(window_edges)
            
            self.snapshots.append({
                'time': window,
                'graph': G,
                'stats': {
                    'n_nodes': G.number_of_nodes(),
                    'n_edges': G.number_of_edges(),
                    'density': nx.density(G)
                }
            })
        
        return self.snapshots
    
    def analyze_temporal_evolution(self):
        """Analyze how network evolves over time"""
        
        evolution = []
        
        for snapshot in self.snapshots:
            evolution.append({
                'time': snapshot['time'],
                **snapshot['stats']
            })
        
        evolution_df = pd.DataFrame(evolution)
        
        # Plot evolution
        fig, axes = plt.subplots(1, 3, figsize=(15, 5))
        
        axes[0].plot(evolution_df['time'], evolution_df['n_nodes'], marker='o')
        axes[0].set_xlabel('Time Window')
        axes[0].set_ylabel('Number of Nodes')
        axes[0].set_title('Network Growth')
        
        axes[1].plot(evolution_df['time'], evolution_df['n_edges'], marker='o', color='orange')
        axes[1].set_xlabel('Time Window')
        axes[1].set_ylabel('Number of Edges')
        axes[1].set_title('Edge Growth')
        
        axes[2].plot(evolution_df['time'], evolution_df['density'], marker='o', color='green')
        axes[2].set_xlabel('Time Window')
        axes[2].set_ylabel('Density')
        axes[2].set_title('Network Density Evolution')
        
        plt.tight_layout()
        plt.savefig('temporal_network_evolution.png', dpi=150, bbox_inches='tight')
        plt.show()
        
        return evolution_df

# Example usage
# temporal_analyzer = TemporalNetworkAnalyzer()
# snapshots = temporal_analyzer.create_temporal_snapshots(edges_with_times, time_window=3600)
# evolution = temporal_analyzer.analyze_temporal_evolution()

Company-Specific Tips

ℹ️

Google Tips:

Google heavily tests on graph algorithms
Know how to implement PageRank and its variants
Understand community detection algorithms
Be comfortable with large-scale graph processing

Uber Tips:

Uber values geospatial analytics
Know how to optimize driver positioning
Understand spatial clustering for demand prediction
Be familiar with real-time location analytics

Advanced Analytics: Graph Analytics, Geospatial, Network Analysis

Advanced Analytics: Graph Analytics, Geospatial, Network Analysis

The Interview Question

Detailed Answer

1. Graph Analytics Framework

2. Geospatial Analysis

3. Network Analysis for Recommendations

4. Real-World Application: Ride-Sharing Analytics

5. Common Follow-Up Questions

Company-Specific Tips

Quiz Section

Related Topics