Optimizing Python Code for Generating a Large Grid Map with Ocean Nodes Only

36 Views Asked by At

I'm working on a project to generate a global grid map with nodes representing ocean areas only, which will be used for sea route pathing on a global scale. The nodes should be connected with edges to their adjacent nodes, forming a walkable path where each node represents a point on the ocean.

Here are the parameters defining the grid:

LAT_MIN, LAT_MAX, LON_MIN, LON_MAX = -90, 90, -180, 180
LAT_STEP, LON_STEP = 0.3, 0.3

The grid map will be quite large, given the small step size and the global scale. I have a geopackage file ne_10m_land.gpkg that includes land polygons, and my goal is to exclude land nodes from the grid, leaving only the ocean nodes.

To inspect the contents of the geopackage file, I've printed the first few entries using geopandas:

import geopandas as gpd

ocean = gpd.read_file("routing/data/geopackages/ne_10m_land.gpkg")
print(ocean.head(5))

This results in the following output:

geometry
0  MULTIPOLYGON (((-0.00029 -71.49903, 0.01555 -7...
1  MULTIPOLYGON (((166.13697 -50.86435, 166.20525...
2  MULTIPOLYGON (((-78.78897 -33.60906, -78.78038...
3  MULTIPOLYGON (((163.98512 -20.04762, 163.98609...
4  MULTIPOLYGON (((134.70737 -6.58904, 134.72006 ...

The geometry column contains MULTIPOLYGON shapes representing land areas.

Questions:

  1. How can I optimize the land intersection checks to improve performance given the MULTIPOLYGON data structure?
  2. Are there any data structures or algorithms that can help reduce memory usage while still allowing me to query nodes and their neighbors efficiently?
  3. Would preprocessing the geopackage data into a more efficient format for this task be beneficial, and if so, what format and approach should I consider?

For full context, here is my code:

import os
import logging
import concurrent.futures
import traceback
import geopandas as gpd
from shapely.geometry import Point, Polygon, MultiPolygon
from concurrent.futures import ProcessPoolExecutor
from math import radians, cos, sin, asin, sqrt
from itertools import product
import numpy as np
import pickle

class Node:
    def __init__(self, latitude, longitude, node_id, walkable=True):
        self.latitude = latitude
        self.longitude = longitude
        self.id = node_id
        self.walkable = walkable
        self.neighbors = {}  # Dictionary to hold neighbors and their distances

    def add_neighbor(self, neighbor_node, distance):
        self.neighbors[neighbor_node.id] = distance

    def calculate_distance(self, other):
        # Haversine formula to calculate the great circle distance between two points on the earth
        lon1, lat1, lon2, lat2 = map(np.radians, [self.longitude, self.latitude, other.longitude, other.latitude])
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
        c = 2 * np.arcsin(np.sqrt(a))
        r = 6371  # Radius of Earth in kilometers
        return c * r

class GridMap:
    script_dir = os.path.dirname(os.path.abspath(__file__))
    land = gpd.read_file(os.path.join(script_dir, "data/geopackages/ne_10m_land.gpkg"))
    land_sindex = land.sindex

    def __init__(self, lat_min, lat_max, lon_min, lon_max, lat_step, lon_step):
        self.nodes = {}
        self.create_grid(lat_min, lat_max, lon_min, lon_max, lat_step, lon_step)

    def create_grid(self, lat_min, lat_max, lon_min, lon_max, lat_step, lon_step):
        node_id = 0
        total_nodes = ((lat_max - lat_min) // lat_step) * ((lon_max - lon_min) // lon_step)
        nodes_created = 0
        # Create nodes at each step in the latitude and longitude range
        for lat in np.arange(lat_min, lat_max, lat_step):
            for lon in np.arange(lon_min, lon_max, lon_step):
                point = Point(lon, lat)  # Create a point with longitude, latitude
                possible_matches_index = list(GridMap.land_sindex.intersection(point.bounds))
                possible_matches = GridMap.land.iloc[possible_matches_index]
                precise_matches = possible_matches[possible_matches.intersects(point)]
                walkable = precise_matches.empty  # A node is walkable if there are no precise land matches

                node = Node(lat, lon, node_id=(lat, lon), walkable=walkable)
                self.nodes[node.id] = node
                node_id += 1

                nodes_created += 1
                progress = (nodes_created / total_nodes) * 100
                if nodes_created % (total_nodes // 100) == 0:  # Log every 1% progress
                    logging.info(f"Grid creation progress: {progress:.2f}%")

        self.add_all_neighbors(lat_step, lon_step)

    def create_node(self, lat, lon, node_id):
        try:
            point = Point(lon, lat)  # Create a point with longitude, latitude
            possible_matches_index = list(GridMap.land_sindex.intersection(point.bounds))
            possible_matches = GridMap.land.iloc[possible_matches_index]
            precise_matches = possible_matches[possible_matches.intersects(point)]
            walkable = precise_matches.empty  # A node is walkable if there are no precise land matches

            node = Node(lat, lon, node_id=(lat, lon), walkable=walkable)
            return node
        except Exception as e:
            logging.error("Failed to create node at (%s, %s): %s" % (lat, lon, e))
            raise

    def add_all_neighbors(self, lat_step, lon_step):
        total_neighbors_checks = len(self.nodes) * 8  # Each node can have up to 8 neighbors
        checks_done = 0
        # Connect each node to its potential eight neighbors
        for node_id, node in self.nodes.items():
            lat, lon = node_id
            for dlat in [-lat_step, 0, lat_step]:
                for dlon in [-lon_step, 0, lon_step]:
                    # Skip the node itself
                    if dlat == 0 and dlon == 0:
                        continue
                    neighbor_id = (lat + dlat, lon + dlon)
                    if neighbor_id in self.nodes:  # Check if the neighbor exists
                        neighbor_node = self.nodes[neighbor_id]
                        distance = node.calculate_distance(neighbor_node)
                        node.add_neighbor(neighbor_node, distance)
                        checks_done += 1  # Increment the checks counter here

            progress = (checks_done / total_neighbors_checks) * 100
            if checks_done % (total_neighbors_checks // 100) == 0:  # Log every 1% progress
                logging.info(f"Neighbor addition progress: {progress:.2f}%")

    def create_grid_parallel(self, lat_range, lon_range, lat_step, lon_step):
        args = ((lat, lon, (lat, lon)) for lat, lon in product(lat_range, lon_range))

        with concurrent.futures.ProcessPoolExecutor() as executor:
            # Submit tasks to the executor
            future_to_latlon = {executor.submit(self.create_node, arg): arg for arg in args}

            # As tasks complete, process the results
            for future in concurrent.futures.as_completed(future_to_latlon):
                latlon = future_to_latlon[future]
                try:
                    node = future.result()
                    self.nodes[node.id] = node
                except Exception as exc:
                    logging.error('%r generated an exception: %s' % (latlon, exc))
                    traceback.print_exc()

    def save_to_file(self, filename):
        with open(filename, 'wb') as f:
            pickle.dump(self, f)

    @staticmethod
    def load_from_file(filename):
        with open(filename, 'rb') as f:
            return pickle.load(f)

# Usage with the specified variables
LAT_MIN, LAT_MAX, LON_MIN, LON_MAX = -90, 90, -180, 180
LAT_STEP, LON_STEP = 0.3, 0.3

# Usage with the specified variables
lat_range = np.arange(LAT_MIN, LAT_MAX, LAT_STEP)
lon_range = np.arange(LON_MIN, LON_MAX, LON_STEP)

# Check if the grid has already been created and saved to a file
grid_filename = 'grid_map.pkl'
if os.path.exists(grid_filename):
    # Load the grid from the file
    grid_map = GridMap.load_from_file(grid_filename)
else:
    # Create the grid and save it to a file
    grid_map = GridMap(LAT_MIN, LAT_MAX, LON_MIN, LON_MAX, LAT_STEP, LON_STEP)
    grid_map.create_grid_parallel(lat_range, lon_range, LAT_STEP, LON_STEP)
    grid_map.save_to_file(grid_filename)

I want to conclude with the final question being: is this possible? I know of the computational resources and time this is going to take, so if someone assures me that it's not worth my while and to limit my grid to way smaller bounds, then I will result in any recommendations you might provide me with.

0

There are 0 best solutions below