I need to implement a library that provides the Merge Sort and Quick Sort algorithms for generic data, implementing the following function prototypes:

void merge_sort(void *base, size_t nitems, size_t size, int (*compar)(const void*, const void*));
void quick_sort(void *base, size_t nitems, size_t size, int (*compar)(const void*, const void*));

Where:

  • base is a pointer to the first element of the array to be sorted;
  • nitems is the number of elements in the array to be sorted;
  • size is the size in bytes of each element of the array;
  • compar is the criterion by which to order the data (given two pointers to elements of the array, returns a number greater than, equal to, or less than zero if the first argument is respectively greater than, equal to, or less than the second).

For testing, I am going to use a file named records.csv that contains 20 million records to be sorted. Each record is described on a single line and contains the following fields:

  • id: (integer type) a unique identifier of the record;
  • field1: (string type) contains words extracted from the Divine Comedy, and you can assume that values do not contain spaces or commas;
  • field2: (integer type);
  • field3: (floating-point type).

The format is a standard CSV: fields are separated by commas, and records are separated by \n. Using the algorithms implemented previously, I need to implement the following function to sort records contained in the file records.csv in non-decreasing order according to the values contained in the three "field" columns:

void sort_records(FILE *infile, FILE *outfile, size_t field, size_t algo);

The algorithm for merge-sort and quick-sort is this:

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

int counter = 0;

// Merge function for Merge Sort
static void merge(void *base, size_t left_size, size_t right_size, size_t size, int (*compar)(const void *, const void *)) {
    size_t i, j, k;

    // Calculate total size needed for temporary array
    size_t total_size = (left_size + right_size) * size;
    
    // Allocate memory for temporary array with the new size
    void *temp = malloc(total_size);
    
    if (temp == NULL) {
        // Handle memory allocation failure
        fprintf(stderr, "Error: Unable to allocate memory for temporary array\n");
        exit(EXIT_FAILURE);
    }

    i = j = k = 0;

    while (i < left_size && j < right_size) {
        if (compar((char *)base + i * size, (char *)base + (left_size + j) * size) <= 0) {
            memcpy((char *)temp + k * size, (char *)base + i * size, size);
            i++;
        } else {
            memcpy((char *)temp + k * size, (char *)base + (left_size + j) * size, size);
            j++;
        }
        k++;
    }

    while (i < left_size) {
        memcpy((char *)temp + k * size, (char *)base + i * size, size);
        i++;
        k++;
    }

    while (j < right_size) {
        memcpy((char *)temp + k * size, (char *)base + (left_size + j) * size, size);
        j++;
        k++;
    }

    memcpy(base, temp, (left_size + right_size) * size);
    free(temp);
    printf(". %d", counter);
    counter++;
}

// Merge Sort Algorithm
void merge_sort(void *base, size_t nitems, size_t size, int (*compar)(const void *, const void *)) {
    if (nitems <= 1) {
        return;
    }

    size_t middle = nitems / 2;
    merge_sort(base, middle, size, compar);
    merge_sort((char *)base + middle * size, nitems - middle, size, compar);
    merge(base, middle, nitems - middle, size, compar);
}

// Partition function for Quick Sort
static size_t partition(void *base, size_t nitems, size_t size, int (*compar)(const void *, const void *)) {
    void *pivot = malloc(size);
    memcpy(pivot, base, size);

    size_t i = 1;
    size_t j = nitems - 1;

    while (1) {
        while (i < nitems && compar((char *)base + i * size, pivot) <= 0) {
            i++;
        }

        while (j > 0 && compar((char *)base + j * size, pivot) > 0) {
            j--;
        }

        if (i < j) {
            // Swap elements
            void *temp = malloc(size);
            memcpy(temp, (char *)base + i * size, size);
            memcpy((char *)base + i * size, (char *)base + j * size, size);
            memcpy((char *)base + j * size, temp, size);
            free(temp);
        } else {
            // Place pivot at correct position
            memcpy(base, pivot, size);
            free(pivot);
            return j;
        }
    }
}

// Quick Sort Algorithm
static void quick_sort_recursive(void *base, size_t nitems, size_t size, int (*compar)(const void *, const void *)) {
    if (nitems <= 1) {
        return;
    }

    size_t pivot_index = partition(base, nitems, size, compar);
    if (pivot_index > 0) {
        quick_sort_recursive(base, pivot_index, size, compar);
    }
    if (pivot_index < nitems - 1) {
        quick_sort_recursive((char *)base + (pivot_index + 1) * size, nitems - pivot_index - 1, size, compar);
    }
}

void quick_sort(void *base, size_t nitems, size_t size, int (*compar)(const void *, const void *)) {
    quick_sort_recursive(base, nitems, size, compar);
}

And the following code is for loading the records and using the algorithms:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include "Esercizio1.h"

#define INITIAL_CAPACITY 2

/* Structure of each record in the .csv file */
struct record {
  int id;
  char *field1;
  int field2;
  float field3;
};

/* Takes the read buffer, the element number in the array (i) and the array pointer.
    Separates the buffer into tokens and loads the record in the array */
void load_record(char buffer[1024], size_t i, struct record **array) {
    struct record *record;
    char *token;
    record = malloc(sizeof(struct record));
    if(record == NULL) {
        fprintf(stderr,"load_record: unable to allocate memory for the read record\n");
        exit(1);
    }
    token = strtok(buffer, ",");
    record->id = atoi(token);
    token = strtok(NULL, ",");
    record->field1 = malloc((strlen(token) + 1) * sizeof(char));
    strcpy(record->field1, token);
    token = strtok(NULL, ",");
    record->field2 = atoi(token);
    token = strtok(NULL, "\n");
    record->field3 = atof(token);
    array[i] = record;
}

/* Takes 2 struct record pointers and comapres their respective field1 (string).
  It returns:
  1 if the first string precedes the second one (in alphabetical order)
  0 otherwise */
int compar_field1(const void *r1_p, const void *r2_p) {
  struct record *rec1_p, *rec2_p;
  if(r1_p == NULL) {
    fprintf(stderr, "compar_field1: first parameter is a NULL pointer.\n");
    exit(1);
  }
  if(r2_p == NULL) {
    fprintf(stderr, "compar_field1: second parameter is a NULL pointer.\n");
    exit(1);
  }
  rec1_p = (struct record*)r1_p;
  rec2_p = (struct record*)r2_p;
  return strcmp(rec1_p->field1, rec2_p->field1) < 0;
}

/* Takes 2 struct record pointers and comapres their respective field2 (int).
  It returns:
  1 if the first int is smaller than the second one
  0 otherwise */
int compar_field2(const void *r1_p, const void *r2_p) {
  struct record *rec1_p, *rec2_p;
  if(r1_p == NULL) {
    fprintf(stderr, "compar_field2: first parameter is a NULL pointer.\n");
    exit(1);
  }
  if(r2_p == NULL) {
    fprintf(stderr, "compar_field2: second parameter is a NULL pointer.\n");
    exit(1);
  }
  rec1_p = (struct record*)r1_p;
  rec2_p = (struct record*)r2_p;
  return rec1_p->field2 < rec2_p->field2;
}

/* Takes 2 struct record pointers and comapres their respective field3 (float).
  It returns:
  1 if the first float is smaller than the second one
  0 otherwise */
int compar_field3(const void *r1_p, const void *r2_p) {
  struct record *rec1_p, *rec2_p;
  if(r1_p == NULL) {
    fprintf(stderr, "compar_field3: first parameter is a NULL pointer.\n");
    exit(1);
  }
  if(r2_p == NULL) {
    fprintf(stderr, "compar_field3: second parameter is a NULL pointer.\n");
    exit(1);
  }
  rec1_p = (struct record*)r1_p;
  rec2_p = (struct record*)r2_p;
  return rec1_p->field3 < rec2_p->field3;
}

/* Takes the output file, the ordered array and the number of elements in the array,
  prints each record in the .csv output file */
void print_file(FILE *outfile, struct record **array, size_t n_elements) {
  size_t i;
  for(i = 0; i < n_elements; i++) {
    fprintf(outfile, "%d,%s,%d,%f\n", array[i]->id, array[i]->field1, array[i]->field2, array[i]->field3);
  }
}

/* Takes the input file, the output file, the field and the chosen sorting algorithm.
  It does (in order):
    - Creates an empty array with an inistial capacity of 2 elements
    - Loads all records inside the array (reallocates double the memory when needed)
    - Reallocates memory at the end when all records have been read (it usually saves memory)
    - Calls the chosen sorting library
    - Prints the output file */
void sort_records(FILE *infile, FILE *outfile, size_t field, size_t algo) {
  struct record **array;
  struct timespec start, end;
  double time_taken;
  char buffer[1024];
  size_t capacity = 2;
  size_t n_elements = 0;
  
  array = (struct record**) malloc(INITIAL_CAPACITY * sizeof(struct record*));
  if(array == NULL) {
    fprintf(stderr, "sort_records: unable to allocate memory for records\n");
    exit(1);
  }
  clock_gettime(CLOCK_REALTIME, &start);
  while(fgets(buffer, 1024, infile) != NULL) {
    if(n_elements == capacity) {
      array = (struct record**) realloc(array, sizeof(struct record*) * 2 * capacity);
      if(array == NULL) {
        fprintf(stderr,"sort_records: unable to reallocate memory for records\n");
        exit(1);
      }
      capacity = capacity * 2;
    }
    load_record(buffer, n_elements, array);
    n_elements++;
  }
  array = (struct record**) realloc(array, sizeof(struct record*) * n_elements);
  if(array == NULL) {
    fprintf(stderr,"sort_records: unable to reallocate memory for records at the end\n");
    exit(1);
  }
  clock_gettime(CLOCK_REALTIME, &end);
  time_taken = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1000000000.0;
  printf("1) CSV file loaded in RAM!\n");
  printf("    Number of records loaded: %ld\n", n_elements);
  printf("    Time elapsed: %f seconds\n", time_taken);

  clock_gettime(CLOCK_REALTIME, &start);
  if(algo == 1) {
      merge_sort(array, n_elements, sizeof(struct record), compar_field1);
  } else {
      quick_sort(array, n_elements, sizeof(struct record), compar_field1);
  }

  clock_gettime(CLOCK_REALTIME, &end);
  time_taken = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1000000000.0;
  printf("2) Records sorted!\n");
  printf("    Time elapsed: %f seconds\n", time_taken);
  
  clock_gettime(CLOCK_REALTIME, &start);
  print_file(outfile, array, n_elements);
  clock_gettime(CLOCK_REALTIME, &end);
  time_taken = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1000000000.0;
  printf("3) File printed!\n");
  printf("    Time elapsed: %f seconds\n", time_taken);
}

/* Takes from command line the 2 file paths + field and algo parameters.
  It then performs some checks on the inputs given and, if they all pass,
  sort_records() is called. */
int main(int argc, char const *argv[]) {
  FILE *infile, *outfile;
  size_t field, algo;
  if(argc == 5) {
    infile = fopen(argv[1], "r");
    if(infile == NULL) {
      fprintf(stderr, "ERROR! Input file does not exist.\n");
      fprintf(stderr, "Terminating...\n");
      exit(1);
    }
    outfile = fopen(argv[2], "w");
    if(outfile == NULL) {
      fprintf(stderr, "ERROR! Output file can't be created or is already in use.\n");
      fprintf(stderr, "Terminating...\n");
      exit(1);
    }
    field = atol(argv[3]);
    if(field < 1 || field > 3) {
      fprintf(stderr, "ERROR! Field parameter can only be equal to 1, 2 or 3.\n");
      fprintf(stderr, "Terminating...\n");
      exit(1);
    }
    algo = atol(argv[4]);
    if(algo < 1 || algo > 2) {
      fprintf(stderr, "ERROR! Algo parameter can only be equal to 1 or 2.\n");
      fprintf(stderr, "Terminating...\n");
      exit(1);
    }
    sort_records(infile, outfile, field, algo);
  } else {
    fprintf(stderr, "ERROR! Wrong amount of arguments.\n");
    fprintf(stderr, "To launch this program correctly, type ./main_ex1 /input_file /output_file field algo\n");
    fprintf(stderr, "Terminating...\n");
    exit(1);
  }
  printf("SUCCESS! Terminating...\n");
  return 0;
}

The algorithm looks good but at some point it gives me segmentation fault:

zsh: segmentation fault  ./main_ex1 ../records.csv ../out.csv 1 1
1

There are 1 best solutions below

3
rcgldr On

The problem is mostly likely due to stack overflow in quicksort. To fix the stack overflow problem, recurse on the smaller part and loop on the larger part. The worst case time complexity remains O(n^2), in which case the sort may not finish in a reasonable amount of time.

partition(): needed some fixes. quicksort(): recurse on smaller part, loop on larger part. I only looked at the quicksort related code, I don't know if there are any issues with the rest of the code.

// Partition function for Quick Sort
static size_t partition(void *base, size_t nitems, size_t size, int (*compar)(const void *, const void *)) {
    void *temp = malloc(size);                              // simplify
    void *pivot = malloc(size);
    memcpy(pivot, (char *)base+((nitems-1)/2)*size, size);  // middle value
    size_t i = 0;                                           // fix
    size_t j = nitems-1;                                    // fix
    while (1) {
        while (compar((char *)base + i * size, pivot) == 1) // fix
            i++;
        while (compar(pivot, (char *)base + j * size) == 1) // fix
            j--;
        if (i >= j)                                         // simplify
            break;
        // Swap elements
        memcpy(temp, (char *)base + i * size, size);
        memcpy((char *)base + i * size, (char *)base + j * size, size);
        memcpy((char *)base + j * size, temp, size);
        i++;                                                // fix
        j--;                                                // fix
    }
    free(pivot);                                            // simplify
    free(temp);
    return j;
}

// Quick Sort Algorithm
static void quick_sort_recursive(void *base, size_t nitems, size_t size, int(*compar)(const void *, const void *)) {
    // recurse on smaller part, loop on larger part
    while (nitems > 1){
        size_t pivot_index = partition(base, nitems, size, compar);
        if(pivot_index - 0 < nitems - pivot_index){
            quick_sort_recursive(base, pivot_index+1, size, compar);
            base = (char *)base + (pivot_index + 1) * size;
            nitems = nitems - pivot_index - 1;
        } else {
            quick_sort_recursive((char *)base + (pivot_index + 1) * size, nitems - pivot_index - 1, size, compar);
            nitems = pivot_index+1;
        }
    }
}

void quick_sort(void *base, size_t nitems, size_t size, int (*compar)(const void *, const void *)) {
    quick_sort_recursive(base, nitems, size, compar);
}

The following code uses classic Hoare Partition scheme, using indexes lo and hi instead of manipulating base and nitems, with the partition code included in quicksort_recursive():

// Quick Sort Algorithm
static void quick_sort_recursive(void *base, size_t lo, size_t hi, size_t size, int (*compar)(const void *, const void *))
{
    void *temp = malloc(size);
    void *pivot = malloc(size);
    size_t i;
    size_t j;
    while (lo < hi){
        // partition using middle value
        memcpy(pivot, (char *)base+(lo+(hi-lo)/2)*size, size);
        i = lo - 1;
        j = hi + 1;
        while (1){
            while (compar((char *)base + ++i * size, pivot) == 1);
            while (compar(pivot, (char *)base + --j * size) == 1);
            if (i >= j)
                break;
            memcpy(temp, (char *)base + i * size, size);
            memcpy((char *)base + i * size, (char *)base + j * size, size);
            memcpy((char *)base + j * size, temp, size);
        }
        // recurse on smaller part, loop on larger part
        if(j - lo < hi - j){
            quick_sort_recursive(base, lo, j, size, compar);
            lo = j+1;
        } else {
            quick_sort_recursive(base, j+1, hi, size, compar);
            hi = j;
        }
    }
    free(pivot);
    free(temp);
}

void quick_sort(void *base, size_t nitems, size_t size, int (*compar)(const void *, const void *))
{
    quick_sort_recursive(base, 0, nitems-1, size, compar);
}