Efficient way to collect all unique substrings of a string

152 Views Asked by At

I need to identify all substrings in a string with a minimum size and repeats. The caveat is that I don't want substrings returned that are themselves substrings of other returned substrings. In other words the set of substrings needs to be a disjoint set. The function below works but is very inefficient. 97.9% of the time is spent re-calculating the suffix array and LCP. I settled on this because after I remove the last substring from the string and re-calculate the SA and LCP, I can guarantee that no substrings of the last substring would be added. Is there a more efficient way to do this that would require calculating the SA and LCP once?

from typing import Dict, List, NamedTuple
import numpy as np
import pandas as pd
from pydivsufsort import divsufsort, kasai

class Substring(NamedTuple):
    length: int
    count: int

def find_unique_repeat_substrings(s: bytes, min_length: int = 20, min_repeats: int = 10) -> Dict[str, Substring]:

    string_dict = dict()
    K = len(s)

    while K>=min_length:
        sa = divsufsort(s)
        lcp = kasai(s, sa)
        K_loc = np.argmax(lcp)
        K=np.max(lcp)
        #calculate number of repeats
        loc = K_loc+1
        while lcp[loc]==K:
            loc += 1
        cnt = loc-K_loc+1
        longest_string = s[sa[K_loc]:sa[K_loc]+K]
        
        #add substring to dict
        if cnt >= min_repeats and K>=min_length:
            string_dict[longest_string.decode()] = Substring(length=K, count=cnt)
        
        #remove substring
        s = s.replace(longest_string, b"")  # Replacing with bytes
    return(string_dict)


s = "this string is repeated three times in this sentence. string string.".encode()
string_dict = find_unique_repeat_substrings(s,min_length = 4, min_repeats=2)
string_dict

{' string ': Substring(length=8, count=2), 'this': Substring(length=4, count=2)}

1

There are 1 best solutions below

0
Eric On

After the answer to @stef I thought I had a better direction. I'm pretty sure this can be improved on but it's much faster than the original and probably O(n log n).

import numpy as np
from pydivsufsort import divsufsort, kasai
from intervaltree import Interval, IntervalTree

def update_ranges(ranges_tree: IntervalTree, new_range: tuple):
    """
    This function updates the ranges in the given interval tree.
    It removes any intervals from the tree that are fully contained within the new interval.
    If the new interval is not fully covered by any existing intervals in the tree, it adds the new interval.

    :param ranges_tree: The interval tree to update.
    :param new_range: The new interval.
    :return: The updated interval tree.
    """
    new_interval = Interval(new_range[0], new_range[1])
    
    overlapping_intervals = ranges_tree[new_interval.begin:new_interval.end]  # Find overlapping intervals.
    
    # Remove intervals from the IntervalTree that are fully contained within new_interval.
    for interval in overlapping_intervals:
        if interval.begin >= new_interval.begin and interval.end <= new_interval.end:
            ranges_tree.remove(interval)

    # If new_interval is not fully covered by any existing intervals, add it to the IntervalTree.
    if not any(interval.begin <= new_interval.begin and interval.end >= new_interval.end for interval in overlapping_intervals):
        ranges_tree.add(new_interval)

    return ranges_tree  # Return updated IntervalTree.


def find_unique_repeat_substrings(s: bytes, min_length: int = 4, min_repeats: int = 2):
    """
    This function finds unique repeated substrings of a specified minimum length and number of repetitions in the given string.

    :param s: The input string.
    :param min_length: The minimum length of substrings to consider.
    :param min_repeats: The minimum number of repetitions of substrings.
    :return: A list of unique repeated substrings that meet the conditions.
    """
    if len(s) <= 1:
        return []
        
    sa = divsufsort(s)
    lcp = kasai(s, sa)

    zip_array = np.column_stack((sa, np.roll(lcp, 1, axis=0)))  # Roll lcp 1 position and stack with SA.

    active_string = False
    cnt = 1
    string_ranges = IntervalTree()
    max_current_str_len = 0

    for row in zip_array:
        # Evaluating active string; if lcp value is descending, exit active substring.
        if active_string and row[1] < max_current_str_len:
            if cnt >= min_repeats:  # If substring repeated enough times, update string ranges.
                string_ranges = update_ranges(string_ranges, (start_str, start_str + str_len ))
            active_string = False
            cnt = 1
            max_current_str_len = 0
        # Evaluating active substring; if lcp value is non-descending, increment repeat count.
        elif active_string and row[1] >= max_current_str_len:
            cnt += 1
            max_current_str_len = max(max_current_str_len,row[1])
        # New substring found of at least size min_length; start tracking it.
        elif row[1] >= min_length and not active_string:
            start_str = row[0]
            str_len = row[1]
            active_string = True
            cnt += 1
            max_current_str_len = row[1]
    
    # If a new substring was being tracked at the end of the SA/LCP, update string ranges.
    if cnt >= min_repeats:
        string_ranges = update_ranges(string_ranges, (start_str, start_str + str_len ))

    # Convert ranges to strings
    # Sorted is used to maintain a consistent order of the strings based on their starting position
    strings = sorted([s[x.begin:x.end] for x in string_ranges])
    
    return strings  # Return a list of unique repeated substrings that meet the conditions.

import unittest

class TestFindUniqueRepeatSubstrings(unittest.TestCase):
    def test_find_unique_repeat_substrings(self):
        test_cases = [
            {
                's': "this string is repeated three times in this sentence. string string .".encode(),
                'min_length': 4,
                'min_repeats': 2,
                'expected_result': [b' string ', b'this s'],
            },
            {
                's': "s st str stri strin string".encode(),
                'min_length': 4,
                'min_repeats': 2,
                'expected_result': [b' str', b'stri', b'trin'],
            },
            {
                's': "banana".encode(),
                'min_length': 2,
                'min_repeats': 2,
                'expected_result': [b'ana'],
            },
            {
                's': "".encode(),
                'min_length': 3,
                'min_repeats': 2,
                'expected_result': [],
            },
            {
                's': "a".encode(),
                'min_length': 1,
                'min_repeats': 1,
                'expected_result': [],
            },
                        {
                's': "aa".encode(),
                'min_length': 1,
                'min_repeats': 2,
                'expected_result': [b'a'],
            },
        ]

        for case in test_cases:
            s = case['s']
            min_length = case['min_length']
            min_repeats = case['min_repeats']
            expected_result = case['expected_result']

            try:
                result = find_unique_repeat_substrings(s, min_length, min_repeats)
                self.assertEqual(result, expected_result)
            except AssertionError as e:
                error_msg = f"AssertionError in test case:\n" \
                            f"s: {s}\n" \
                            f"min_length: {min_length}\n" \
                            f"min_repeats: {min_repeats}\n" \
                            f"Expected: {expected_result}\n" \
                            f"Result: {result}\n"
                raise AssertionError(error_msg) from e

suite = unittest.TestLoader().loadTestsFromTestCase(TestFindUniqueRepeatSubstrings)
runner = unittest.TextTestRunner()
runner.run(suite)

.
----------------------------------------------------------------------
Ran 1 test in 0.007s

OK