Why is the total execution time so much greater than the profiled cumulated time in Python?

41 Views Asked by At

I'm running a Python script to find flight routes using a dataset with around 20k flights containing ~4000 flights each day. When profiling the code with cProfile, I print that the cumulative time recorded is much less than the query execution time. The discrepancy I'm seeing is:

Total function calls in cProfile: 0.468 seconds
Query executed time: 4.25759482383728 seconds

Here's the profilers output:

16499769 function calls in 0.468 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
  4167579    0.155    0.000    0.155    0.000 {method 'get' of 'dict' objects}
  4059336    0.101    0.000    0.101    0.000 {method 'append' of 'collections.deque' objects}
  4059365    0.099    0.000    0.099    0.000 {method 'popleft' of 'collections.deque' objects}
  3972499    0.090    0.000    0.090    0.000 {built-in method builtins.len}
   135618    0.019    0.000    0.019    0.000 /Users/<user>/PycharmProjects/route-parser/pure-python-9-8.py:109(<setcomp>)
    95182    0.003    0.000    0.003    0.000 {method 'append' of 'list' objects}
     1589    0.000    0.000    0.001    0.000 {method 'sort' of 'list' objects}
     8316    0.000    0.000    0.000    0.000 /Users/<user>/PycharmProjects/route-parser/pure-python-9-8.py:91(<lambda>)
      125    0.000    0.000    0.000    0.000 /Users/<user>/PycharmProjects/route-parser/pure-python-9-8.py:85(<lambda>)
      127    0.000    0.000    0.000    0.000 {method 'values' of 'dict' objects}
       30    0.000    0.000    0.000    0.000 /Users/<user>/PycharmProjects/route-parser/pure-python-9-8.py:95(<genexpr>)
        2    0.000    0.000    0.000    0.000 {built-in method time.time}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}

I understand that cProfile is not measuring wall time, but how can I bring these numbers closer together?

This is the program, eventually I'll not be reading from CSV and instead be reading from a DB.

import cProfile
import csv
import pstats
import time
from collections import defaultdict
from collections import deque
from datetime import timedelta
from io import StringIO

from dateutil.parser import parse

flights = []

with open('lambda.csv', 'r') as f:
    reader = csv.reader(f)
    next(reader)  # Skip the header
    for row in reader:
        flight_id, origin, destination, start_datetime, end_datetime, capacity, equipment_type, tail_number = row

        start_datetime = parse(start_datetime)
        end_datetime = parse(end_datetime)

        flights.append({
            'flight_id': flight_id,
            'origin': origin,
            'destination': destination,
            'start_datetime': start_datetime,
            'end_datetime': end_datetime,
            'capacity': int(capacity),
            'equipment_type': equipment_type,
            'tail_number': tail_number
        })


# Initialize a connectivity map to store direct connections between airports
connectivity_map = defaultdict(set)

# Build the connectivity map from the flights data
for flight in flights:
    connectivity_map[flight['origin']].add(flight['destination'])


def find_routes(origin, destination, start_datetime=None, end_datetime=None):
    """
    Find possible flight routes from origin to destination within a specified time window.

    Parameters:
    - origin (str): The starting airport code.
    - destination (str): The destination airport code.
    - start_datetime (str): The start date and time in ISO format. If provided, searches 48 hours ahead from this time.
    - end_datetime (str): The end date and time in ISO format. If provided, searches 48 hours before this time.

    Returns:
    None (prints the valid paths, number of paths, and query execution time).
    """

    if start_datetime is not None:
        start_timestamp = int(parse(start_datetime).timestamp())
        end_timestamp = int(start_timestamp + timedelta(hours=48).total_seconds())
    elif end_datetime is not None:
        end_timestamp = int(parse(end_datetime).timestamp())
        start_timestamp = int(end_timestamp - timedelta(hours=48).total_seconds())
    else:
        raise ValueError("Either start_datetime or end_datetime must be provided")

    for flight in flights:
        flight['start_timestamp'] = flight['start_datetime'].timestamp()
        flight['end_timestamp'] = flight['end_datetime'].timestamp()

    # Filter out the flights which don't fit within the specified time window
    flights_filtered = [
        flight for flight in flights if
        start_timestamp <= flight['start_timestamp'] <= end_timestamp and
        end_timestamp >= flight['end_timestamp']
    ]

    pr = cProfile.Profile()
    s = StringIO()

    pr.enable()
    start_time = time.time()

    # Build a mapping from origin to destination with lists of flights that follow that route
    next_flights_map = defaultdict(lambda: defaultdict(list))
    for flight in flights_filtered:
        next_flights_map[flight['origin']][flight['destination']].append(flight)
    # Sort the flights for each route based on their start timestamp
    for origin_data in next_flights_map.values():
        for destination_data in origin_data.values():
            destination_data.sort(key=lambda x: x['start_timestamp'])

    valid_paths = []
    # Initialize the exploration queue with the first set of flights from the origin
    to_explore = deque([flight] for dest_flights in next_flights_map[origin].values() for flight in dest_flights)

    while to_explore:
        path = to_explore.popleft()
        last_flight = path[-1]

        if last_flight['destination'] == destination:
            valid_paths.append(path)
            continue

        if len(path) >= 4:
            continue

        # Keep track of airports already visited in the current path to avoid loops
        visited_airports = {flight['origin'] for flight in path}
        # Find potential next destinations that haven't been visited yet
        next_possible_destinations = connectivity_map[last_flight['destination']] - visited_airports

        # For each potential next destination, explore the flights going there
        for dest in next_possible_destinations:
            potential_flights = next_flights_map[last_flight['destination']].get(dest, [])
            for next_flight in potential_flights:
                # Ensure the next flight starts after the last one ends
                if next_flight['start_timestamp'] > last_flight['end_timestamp']:
                    new_path = path + [next_flight]
                    to_explore.append(new_path)

    end_time = time.time()

    pr.disable()

    sortby = 'cumulative'
    ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
    ps.print_stats()

    print(s.getvalue())

    print(len(valid_paths))
    print(f"Query executed in: {end_time - start_time} seconds")


# Test the function with a sample input
find_routes('MIA', 'ATL', start_datetime='2023-09-15T10:00:00Z')

My question is: Why is there such a significant discrepancy between the cProfile cumulative time and the execution time of the query? Am I missing something or not accounting for some overheads?

Also any improvements to the algorithm will be appreciated :D

0

There are 0 best solutions below