I want to write a function int** text_match(const char* input, const char* pattern); that returns all match offsets(start and end respectively)
So far I have implemented this, by iterating over the input with an input offset, finding out if there is a match within the input + its offset, obtaining the offset and setting the input offset to the end offset of the match.
This approach has some very significant disadvantages
- some patterns exists that can cause an infinite amount of matches (^|\n|$)
- even once these infinite execution issues are fixed with (^|\n) will always find a match at the beginning of the string-fragment, thus viewing every index as a match
I experimented with pcre2_get_ovector_count() a bit, however so far whenever I provide it with the pcre2_match_dataafter having run pcre2_match() it only returns 1 and not the actual amount of matches
How could I find out all match offsets with pcre2 directly? Or at least avoid the issues I have described above?
my implementation so far (will be licensed under the MIT)
int** text_match(const char* input, const char* regex){
if(!input || !regex) return (int**)calloc(1, sizeof(int*));
int errorcode;
PCRE2_SIZE erroroffset;
pcre2_code *compiled_regex = pcre2_compile((PCRE2_SPTR)regex, PCRE2_ZERO_TERMINATED, PCRE2_EXTENDED|PCRE2_UTF, &errorcode, &erroroffset, NULL);
if(!compiled_regex) return (int**)calloc(1, sizeof(int*));
pcre2_match_data *match_data = pcre2_match_data_create_from_pattern(compiled_regex, NULL);
if(!match_data){
pcre2_code_free(compiled_regex);
return (int**)calloc(1, sizeof(int*));
}
int** output = (int**)calloc(1, sizeof(int*));
if(!output){
pcre2_code_free(compiled_regex);
pcre2_match_data_free(match_data);
return (int**)calloc(1, sizeof(int*));
}
PCRE2_SIZE start_offset = 0;
int matches_amount;
size_t length_input = strlen(input);
for(matches_amount = 0; ; matches_amount++){
if(start_offset >= length_input) break;
if(pcre2_match(compiled_regex, (PCRE2_SPTR)input, PCRE2_ZERO_TERMINATED, start_offset, 0, match_data, NULL) < 0) break;
PCRE2_SIZE *offsets = pcre2_get_ovector_pointer(match_data);
if(offsets == NULL) break;
output[matches_amount] = (int*)malloc(2 * sizeof(int));
if(output[matches_amount] == NULL) break;
output[matches_amount][0] = offsets[0];
output[matches_amount][1] = offsets[1];
int** new_output = (int**)realloc(output, sizeof(int*) * (matches_amount + 2));
if(new_output == NULL) break;
output = new_output;
if(start_offset == offsets[1]){
start_offset++;
}else{
start_offset = offsets[1];
}
}
output[matches_amount] = NULL;
pcre2_code_free(compiled_regex);
pcre2_match_data_free(match_data);
return output;
}
Edit:
As pointed out by @ikegami the approach was right, however I did too little edge-case detection. This edit just serves one purpose. To provide the corrected function so that if someone finds this post at a later time and needs a function for this here you have it(will be licensed under the MIT btw so do with it whatever you want)
int** text_match(const char* input, const char* regex){
if(!input || !regex) return (int**)calloc(1, sizeof(int*));
int errorcode;
PCRE2_SIZE erroroffset;
pcre2_code *compiled_regex = pcre2_compile((PCRE2_SPTR)regex, PCRE2_ZERO_TERMINATED, PCRE2_EXTENDED|PCRE2_UTF, &errorcode, &erroroffset, NULL);
if(!compiled_regex) return (int**)calloc(1, sizeof(int*));
pcre2_match_data *match_data = pcre2_match_data_create_from_pattern(compiled_regex, NULL);
if(!match_data){
pcre2_code_free(compiled_regex);
return NULL;
}
int** output = (int**)calloc(1, sizeof(int*));
if(!output){
pcre2_code_free(compiled_regex);
pcre2_match_data_free(match_data);
return NULL;
}
PCRE2_SIZE start_offset = 0;
int matches_amount;
size_t length_input = strlen(input);
uint32_t options = 0;
for(matches_amount = 0; ; matches_amount++){
if(pcre2_match(compiled_regex, (PCRE2_SPTR)input, PCRE2_ZERO_TERMINATED, start_offset, options, match_data, NULL) < 0) break;
PCRE2_SIZE *offsets = pcre2_get_ovector_pointer(match_data);
if(offsets == NULL) break;
output[matches_amount] = (int*)malloc(2 * sizeof(int));
if(!output[matches_amount]){
for(int i = 0; output[i]; i++) free(output[i]);
free(output);
pcre2_code_free_8(compiled_regex);
pcre2_match_data_free_8(match_data);
return NULL;
}
output[matches_amount][0] = offsets[0];
output[matches_amount][1] = offsets[1];
int** new_output = (int**)realloc(output, sizeof(int*) * (matches_amount + 2));
if(!new_output){
for(int i = 0; output[i]; i++) free(output[i]);
free(output);
pcre2_code_free_8(compiled_regex);
pcre2_match_data_free_8(match_data);
return NULL;
}
output = new_output;
if(offsets[0] == offsets[1]){
if(offsets[0] == length_input) break;
options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
}else{
size_t startchar = pcre2_get_startchar(match_data);
if(start_offset > startchar) continue;
start_offset = startchar + 1;
for(; start_offset < length_input; start_offset++){
if((input[start_offset] & 0xc0) != 0x80) break;
}
}
}
output[matches_amount] = NULL;
pcre2_code_free(compiled_regex);
pcre2_match_data_free(match_data);
return output;
}
The
pcre2demoman page provides a demo. It includes the following:So you are indeed expected to manipulate the start position (and the options as well), although there are some issues with your current implementation.