/*
 * call-seq:
 *   sarray.longest_nonmatch(target, from_index, min_match) -> [non_match_length, match_start, match_length]
 *
 * Mostly the inverse of longest_match, except that it first tries to find a
 * non-matching region, then a matching region.  The target and from_index are
 * the same as in longest_match.  The min_match argument is the smallest matching
 * region that you'll accept as significant enough to end the non-matching search.
 * Giving min_match=0 will stop at the first matching region.
 *
 * It works by first searching the suffix array for a non-matching region.  When it
 * hits a character that is in the source (according to the suffix array) it tries
 * to find a matching region.  If it can find a matching region that is longer than
 * min_match then it stops and returns, otherwise it adds this match to the length
 * of the non-matching region and continues.
 *
 * The return value is an Array of [non_match_length, match_start, match_length].
 * match_length is 0 when no acceptable match was found before the end of the
 * target.  nil is returned when from_index is greater than the target length.
 *
 * Raises the suffix array error class (cSAError) when the receiver has not been
 * initialized or its source string is empty.
 */
static VALUE SuffixArray_longest_nonmatch(VALUE self, VALUE target, VALUE from_index, VALUE min_match)
{
    SuffixArray *sa = NULL;
    Data_Get_Struct(self, SuffixArray, sa);

    if(sa == NULL || sa->suffix_index == NULL || RSTRING(sa->source)->len == 0) {
        rb_raise(cSAError, ERR_NOT_INITIALIZED);
    }

    // convert the numeric arguments.  min_match goes through NUM2INT and is then
    // stored in a size_t, so a negative min_match becomes a huge threshold and
    // no match will ever be considered long enough.
    size_t from = NUM2UINT(from_index);
    size_t min = NUM2INT(min_match);

    // get better pointers for the source (should already be in String form)
    unsigned char *source_ptr = (unsigned char *)RSTRING(sa->source)->ptr;
    size_t source_len = RSTRING(sa->source)->len;

    // coerce the target to a String; StringValue updates `target` in place,
    // so the result does not need to be kept in a separate variable
    StringValue(target);

    unsigned char *target_ptr = (unsigned char *)RSTRING(target)->ptr;
    size_t target_len = RSTRING(target)->len;

    // check the input for validity, returning nil like in array operations
    if(from > target_len) {
        return Qnil;
    }

    // scan walks the target from the from_index position up to its end
    unsigned char *scan = target_ptr + from;
    unsigned char *end = target_ptr + target_len;
    size_t match_len = 0;
    size_t match_start = 0;

    while(scan < end) {
        if(*scan != source_ptr[sa->suffix_index[sa->starts[*scan]]]) {
            // the suffix selected through starts[] for this byte does not begin
            // with it, so the byte is counted as non-matching
            scan++;
            continue;
        }

        // match_len is the in/out parameter: on input it is the number of target
        // bytes that remain, afterwards it is read back as the match length
        match_len = (size_t)(end - scan);
        match_start = find_longest_match(source_ptr, source_len, scan, &match_len,
                sa->starts, sa->ends, sa->suffix_index);

        if(match_len == 0) {
            // match not found, which really shouldn't happen
            break;
        }

        if(match_len > min) {
            // the match is long enough to be significant, drop out
            break;
        }

        // the match is too short to matter: fold it into the non-matching region
        // and reset both values to signal that no match has been found yet
        scan += match_len;
        match_len = match_start = 0;
    }

    size_t nonmatch_len = (size_t)(scan - (target_ptr + from));

    // NOTE(review): INT2FIX does not range-check its argument; if lengths could
    // ever exceed the Fixnum range a checked conversion would be needed -- confirm.
    VALUE result = rb_ary_new();
    rb_ary_push(result, INT2FIX(nonmatch_len));
    rb_ary_push(result, INT2FIX(match_start));
    rb_ary_push(result, INT2FIX(match_len));

    return result;
}