Index: c_src/KinoSearch/Search/PhraseScorer.c =================================================================== --- c_src/KinoSearch/Search/PhraseScorer.c (revision 2526) +++ c_src/KinoSearch/Search/PhraseScorer.c (working copy) @@ -232,90 +232,96 @@ } } +static inline size_t +winnow_anchors(u32_t *anchors_start, const u32_t *const anchors_end, + const u32_t *candidates, const u32_t *const candidates_end, + size_t offset) +{ + u32_t *anchors = anchors_start; + u32_t *anchors_found = anchors_start; + u32_t target_anchor; + u32_t target_candidate; + + /* This function is a loop that finds terms that can continue a phrase. + * It overwrites the anchors in place, and returns the number remaining. + * The basic algorithm is to alternately increment the candidates' pointer + * until it is at or beyond its target position, and then increment the + * anchors' pointer until it is at or beyond its target. The non-standard + * form is to avoid unnecessary comparisons. I have not tested this + * loop for speed, but glancing at the object code produced (objdump -S) + * it appears to be significantly faster than the nested loop alternative. + * But given the vagaries of modern processors, it merits actual testing.*/ + + SPIN_CANDIDATES: + target_candidate = *anchors + offset; + while (*candidates < target_candidate) { + if (++candidates == candidates_end) goto DONE; + } + if (*candidates == target_candidate) goto MATCH; + goto SPIN_ANCHORS; + + SPIN_ANCHORS: + target_anchor = *candidates - offset; + while (*anchors < target_anchor) { + if (++anchors == anchors_end) goto DONE; + }; + if (*anchors == target_anchor) goto MATCH; + goto SPIN_CANDIDATES; + + MATCH: + *anchors_found++ = *anchors; + if (++anchors == anchors_end) goto DONE; + goto SPIN_CANDIDATES; + + DONE: + /* return number of anchors remaining */ + return anchors_found - anchors_start; +} + float PhraseScorer_calc_phrase_freq(PhraseScorer *self) { PostingList **const plists = self->plists; - ByteBuf *const anchor_set = self->anchor_set; - u32_t phrase_offset = self->phrase_offsets[0]; - u32_t i; - u32_t *anchors, *anchors_start, *anchors_end; - ScorePosting *first_posting; + u32_t *phrase_offsets = self->phrase_offsets; + u32_t i = 0; - /* create an anchor set */ - first_posting = (ScorePosting*)PList_Get_Posting(plists[0]); - BB_Copy_Str(anchor_set, (char*)first_posting->prox, - first_posting->freq * sizeof(u32_t)); - anchors_start = (u32_t*)anchor_set->ptr; + /* create a overwriteable anchor set from the first posting */ + ScorePosting *posting = (ScorePosting*)PList_Get_Posting(plists[0]); + size_t anchors_remaining = posting->freq; + ByteBuf *const anchor_set = self->anchor_set; + /* declare these in advance to satisfy gcc -pedantic */ + u32_t *anchors_start, *anchors_end, *anchors; + + BB_GROW(self->anchor_set, anchors_remaining * sizeof(u32_t)); + anchors_start = (u32_t *)anchor_set->ptr; + anchors_end = anchors_start + anchors_remaining; anchors = anchors_start; - anchors_end = (u32_t*)BBEND(anchor_set); while(anchors < anchors_end) { - *anchors++ -= phrase_offset; + /* copy anchor positions skipping those less than initial offset */ + if (phrase_offsets[0] > posting->prox[i]) anchors_end--; + /* subtract phrase_offsets[0] to account for leading virtual terms */ + else *anchors++ = posting->prox[i] - phrase_offsets[0]; + i++; } /* match the positions of other terms against the anchor set */ - for (i = 1; i < self->num_elements; i++) { - ScorePosting *posting = (ScorePosting*)PList_Get_Posting(plists[i]); - u32_t *candidates = posting->prox; - u32_t *candidates_end = posting->prox + posting->freq; + anchors_remaining = anchors_end - anchors_start; + for (i = 1; i < self->num_elements && anchors_remaining; i++) { + /* prepare the non-overwritten list of potential next terms */ + ScorePosting *posting = (ScorePosting *)PList_Get_Posting(plists[i]); + u32_t *candidates_start = posting->prox; + u32_t *candidates_end = candidates_start + posting->freq; - u32_t *new_anchors = anchors_start; - anchors = anchors_start; - anchors_end = (u32_t*)BBEND(anchor_set); - - phrase_offset = self->phrase_offsets[i]; - - while (anchors < anchors_end) { - u32_t target; - - /* Discard positions that occur too early in the field to match as - * a part of the phrase. For example, if the field begins with - * "The ants go marching one by one", that initial "The" cannot - * match as the second term in a phrase search for - * "fight the power". - */ - target = phrase_offset; - while (candidates < candidates_end && *candidates < target) { - candidates++; - } - if (candidates == candidates_end) - break; - - /* Discard partial matches which seemed promising earlier but - * which fail on this go-round. - */ - target = *candidates - phrase_offset; - while (anchors < anchors_end && *anchors < target) { - anchors++; - } - if (anchors == anchors_end) - break; - - /* Blast past any positions for the current term which are too low - * for the partial phrase matched in earlier iters. - */ - target = *anchors + phrase_offset; - while (candidates < candidates_end && *candidates < target) { - candidates++; - } - if (candidates == candidates_end) - break; - - /* Does the current position fall into the slot? */ - if (*candidates == target) { - /* The anchor has made it through another elimination round. */ - *new_anchors = *anchors; - new_anchors++; - } - anchors++; - } - - /* winnow down the size of the anchor set */ - anchor_set->len = (char*)new_anchors - (char*)anchors_start; + /* reduce anchor_set in place to those that match the next term */ + anchors_remaining = winnow_anchors(anchors_start, anchors_end, + candidates_start, candidates_end, + phrase_offsets[i]); + /* adjust end for number of anchors that remain */ + anchors_end = anchors_start + anchors_remaining; } /* the number of anchors left is the phrase freq */ - return (float) anchor_set->len / sizeof(u32_t); + return anchors_remaining; } u32_t Index: perl/t/502-phrasequery.t =================================================================== --- perl/t/502-phrasequery.t (revision 2526) +++ perl/t/502-phrasequery.t (working copy) @@ -2,7 +2,7 @@ use warnings; use lib 'buildlib'; -use Test::More tests => 4; +use Test::More tests => 8; use KinoSearch::Search::PhraseQuery; use KinoSearch::Index::Term; @@ -46,3 +46,32 @@ $hits = $searcher->search( query => $phrase_query ); is( $hits->total_hits, 1, 'avoid underflow when subtracting offset' ); +# "a b c" +$phrase_query = KinoSearch::Search::PhraseQuery->new( slop => 0 ); +$phrase_query->add_term(KinoSearch::Index::Term->new( 'content', 'a' ), 0 ); +$phrase_query->add_term(KinoSearch::Index::Term->new( 'content', 'b' ), 1 ); +$phrase_query->add_term(KinoSearch::Index::Term->new( 'content', 'c' ), 2 ); +$hits = $searcher->search( query => $phrase_query ); +is( $hits->total_hits, 3, 'offset starting from zero' ); + +# "* * a b c" +$phrase_query = KinoSearch::Search::PhraseQuery->new( slop => 0 ); +$phrase_query->add_term(KinoSearch::Index::Term->new( 'content', 'a' ), 2 ); +$phrase_query->add_term(KinoSearch::Index::Term->new( 'content', 'b' ), 3 ); +$phrase_query->add_term(KinoSearch::Index::Term->new( 'content', 'c' ), 4 ); +$hits = $searcher->search( query => $phrase_query ); +is( $hits->total_hits, 2, 'offset starting from two' ); + +# "* * * c d" +$phrase_query = KinoSearch::Search::PhraseQuery->new( slop => 0 ); +$phrase_query->add_term(KinoSearch::Index::Term->new( 'content', 'c' ), 3 ); +$phrase_query->add_term(KinoSearch::Index::Term->new( 'content', 'd' ), 4 ); +$hits = $searcher->search( query => $phrase_query ); +is( $hits->total_hits, 2, 'offset starting from three' ); + +# "a * c" +$phrase_query = KinoSearch::Search::PhraseQuery->new( slop => 0 ); +$phrase_query->add_term(KinoSearch::Index::Term->new( 'content', 'a' ), 0 ); +$phrase_query->add_term(KinoSearch::Index::Term->new( 'content', 'c' ), 2 ); +$hits = $searcher->search( query => $phrase_query ); +is( $hits->total_hits, 3, 'offsets with gap in middle' );