[KinoSearch] KSx::Positions::PhraseScorer

Nathan Kurz nate at verse.com
Fri Sep 7 20:49:01 PDT 2007



Here's the in-progress PhraseScorer I've been working on.  At one
point, I had it compiling, but I've been rearranging classes a lot
since then so I offer this for discussion of the framework only.

This is pulled out from an incomplete KSx::Positions directory I'm
working on, which (unfortunately) aims to replace everything in
KinoSearch::Search.  But I'd like to be able to run it in parallel for
a bit, so some of the names are screwy.

There are some parts of this you will probably dislike on sight.  Some
of these can be changed easily, others may be tricky.  But realize
it's only a proof of concept, not a drop in replacement for anything.

1.  It's in C99.  I think this allows for greater clarity in places,
but this can be changed once easily.  I understand C99 is problematic
for Windows?

2. I like ASSERT and DEBUG macros.   These are defined away unless
compiled -DDEBUG, and can disappear from the final if you wish.
DEBUG() acts as a printf() that is controlled per function/module by
$ENV{DEBUG} at runtime.

3. My int sizes and types are a mess.  I started one way and then went
another.  I don't know which one should expand for 64 bit and which
ones shouldn't.

4. I've done stuff like define REFCOUNT_INC(x) to return x.  I think
this makes things a little clearer, but it's just sugar.

5. I've yet to figure out how to deal with TF-IDF stuff and the
Similarity object.  Contrary to your assertion that ScoreProx is the
worst class name, I might vote for Similarity as the least intuitive.

6. I haven't tried handling Match->length properly yet.  I think it's
possible, but
I'm not sure how to do it efficiently.

7. It doesn't actually work yet.  It's just the direction I'm
currently headed.  I have a separate recursive version that handles
deeper nesting, but it's in even more of a shambles.  Despite the
implementations, I think the algorithm works in both cases, though.

---------------------------------------------------------------------------------------------
#define KINO_USE_SHORT_NAMES
#include "KinoSearch/Util/ToolSet.h"

#define KINO_WANT_PPHRASESCORER_VTABLE
#include "KSx/Positions/PPhraseScorer.r"
#include "KSx/Positions/Tally.r"
#include "KSx/Positions/Match.r"

PPhraseScorer *
PPhraseScorer_new(int field_num, ArrayScorer *subscorer)
{
    ASSERT(arrayscorer && OBJ_IS_A(arrayscorer, ARRAYSCORER));
    ASSERT(field_num != FIELD_INVALID);

    if (! subscorer_is_legal(field_num, subscorer)) return NULL;

    CREATE(self, PPhraseScorer, PPHRASESCORER);

    self->subscorer = REFCOUNT_INC(subscorer);
    self->current_doc = 0;
    self->field_num = field_num;

    self->match = Match_new(10); // grows on demand
    // FUTURE: loop to calculate length properly
    self->match->length = size;

    int size = subscorer->array->size;
    self->position_ptrs = CALLOCATE(size, int);
    self->position_ends = CALLOCATE(size, int);
    self->phrase_offsets = CALLOCATE(size, int);

    DEBUG("created %p size %d", self, size);
    return self;
}

void
PPhraseScorer_destroy(PPhraseScorer* self)
{
    DEBUG("destroying %p", self);
    REFCOUNT_DEC(self->subscorer);
    REFCOUNT_DEC(self->match);

    free(self->position_ptrs);
    free(self->position_ends);
    free(self->phrase_offsets);

    free(self);
}

int
PPhraseScorer_match(PPhraseScorer *self, int target_doc)
{
    ASSERT(target_doc > self->current_doc);

    while (1) {
	// advance to the next document that satisfies scorer
	target_doc = Scorer_Match(self->subscorer, target_doc);

	// stop if there are no more matching documents
	if (target_doc == DOC_NOT_FOUND) break;
	DEBUG("subscorer matched %d", target_doc);

	// use this doc if the positions match the offsets
	if (PPhraseScorer_Find_Phrases(self)) break;
	
	// otherwise try the next doc
	target_doc++;
    }

    DEBUG("phrase matched %d", target_doc);
    self->current_doc = target_doc;
    return target_doc;
}


int
PPhraseScorer_find_phrases(PPhraseScorer *self)
{
    unsigned first_index = phrase_prepare(self);

    // start with zero positions matched
    Match_clear(self->match);

    // FUTURE: inline the calls to Match_clear and Match_push

    int target_pos = 0;
    while (1) {
	// find the first occurence of phrase at or after target_position
	int found_position = phrase_position(self, target_position,
					     first_index);
	if (found_position == POSITION_NOT_FOUND) break;
	DEBUG("found phrase at %d", found_position);

	// add the found_position to our list of matches and continue
	Match_push(self->match, found_position);
	// FUTURE: is it safe to increment by phrase length here?
	target_position = found_position + self->match->length;
    }

    DEBUG("found %d phrases", self->match->count);
    return self->match->count;
}

// FUTURE: decide how this Tally should really work
Tally *
PPhraseScorer_tally(PPhraseScorer *self)
{
    VArray *array = self->subscorer->array;
    MatchScorer **scorers = (MatchScorer **)array->elems;
    size_t num_scorers = array->size;
    ASSERT(num_scorers);

    float product = 1.0;
    for (int i = 0; i < num_scorers; i++) {
	Tally *subtally = Scorer_Tally(scorers[i]);
	ASSERT(subtally->score > 0);
	product *= subtally->score;
    }

    self->tally->score = score_product;

    return self->tally;
}


/* The general approach here is to start with the subscorer with the fewest
   position occurences, and then see if a sequence of occurences can be found
   at the appropriate phrase_offsets.  If the sequence is broken, we start
   again with the next legal occurence of the shortest subscorer.  Slightly
   better performance could sometimes be achieved by presorting the subscorers
   in order of increasing numbers of occurences, but for most documents the
   overhead of the sorting will be greater than the performance boost.

   I think this is a reasonable algorithm, and it does a reasonable job of
   avoiding redundant effort, but I have not attempted to optimize it.
   Judicious use of the C99 'restrict' keyword might considerably improve
   the compilers optimization here.  Rearranging the branches might yield
   considerable improvement as well.  And it is entirely possible that
   there is some completely different but better way to do this.
*/

static inline unsigned
phrase_prepare(PPhraseScorer *self) {
    VArray *array = self->subscorer->array;
    MatchScorer **scorers = (MatchScorer **)array->elems;
    size_t num_scorers = array->size;

    size_t shortest_index = 0;
    size_t shortest_count = scorers[0]->match->count;

    for (int i = 0; i < num_scorers; i++) {
	Match *match = scorers[i]->match;
	self->position_ptrs[i] = match->positions;
	self->position_ends[i] = match->positions + match->count;
	self->phrase_offsets[i] = i;
	
	// FUTURE: deal with lengths properly for phrase_offsets
	
	if (match->count < shortest_count) {
	    shortest_count = match->count;
	    shortest_index = i;
	}
    }

    return shortest_index;
}

static inline int
phrase_find(PPhraseScorer *self, int target_position, unsigned first_index)
{
    unsigned num_scorers = self->subscorer->array->size;
    ASSERT(target_position);
    ASSERT(num_scorers);
    ASSERT(first_index < num_scorers);
    DEBUG("self %p, target %d, index %u", self, target_position, first_index);

    // FUTURE: could optimize by presorting scorers by increasing occurences?

    // loop [first_index] to [num_scorers-1] then [0] back to [first_index]
    unsigned this_index = first_index; // normally start with the shortest list
    while (1) {	
	// convenience variables for what is essentially an inverted object
	int* position_ptr = self->position_ptrs[this_index];
	int* position_end = self->position_ends[this_index];
	int phrase_offset = self->phrase_offset[this_index];
	
	// search for an occurence offset from the target position
	int offset_position = target_position + phrase_offset;
	while (*position_ptr < offset_position) {
	    if (++position_ptr > position_end) {
		DEBUG("not found for offset %d, index %u, target %d",
		      phrase_offset, this_index, target_position);
		return POSITION_NOT_FOUND;
	    }
	}

	// update underlying with the current position
	self->position_ptrs[this_index] = position_ptr;
	ASSERT(position_ptr <= position_end);
	ASSERT(*position_ptr >= target_position + phrase_offset);

	if (*position_ptr > offset_position) {
	    target_position = *position_ptr - phrase_offset;
	    if (this_index == first_index) {
		DEBUG("raising target to %d", target_position);
		this_index++;
	    }
	    else {
		DEBUG("restarting with target %d", target_position);
		this_index = first_index;
		continue;
	    }
	}
	else {
	    if (this_index == first_index) {
		DEBUG("found target position %d", target_position);
		return target_position;
	    }
	    else {
		DEBUG("target %d ok index %u", target_position, this_index);
		this_index++;
	    }
	}

	// scorer index wraps back to 0 at num_scorers
	if (this_index == num_scorers) this_index = 0;
	ASSERT(this_index < num_scorers);
    };

    // return from within loop
    UNREACHABLE_RETURN(int);
}


static bool
subscorer_is_legal(int phrase_field_num, ArrayScorer *arrayscorer) {
    // FUTURE: ArrayScorers other than AndScorers are possible, but for now...
    if (! OBJ_IS_A(arrayscorer, PANDSCORER)) {
	return ERROR("subscorer is of type '%s' and is not a ArrayScorer",
		     arrayscorer->_->class_name);
    }
	
    /* NOTE: the checks below are considered runtime errors.  While they
       should not occur, the process need not be aborted to handle them */
    VArray* array = arrayscorer->array;
    if (array->count == 0) return ERROR("subscorer has no elements");

    MatchScorer** elements = (MatchScorer **)array->elems;

    for (int i = 0; i < array->count; i++) {
	MatchScorer* scorer = elements[i];
	if (! OBJ_IS_A(scorer, MATCHSCORER)) {
	    return ERROR("subscorer[%d] of type '%s' is not a MatchScorer",
			 i, scorer->_->class_name);
	}

	if (subscorer->field_num != phrase_field_num) {
	    return ERROR("mismatched field subscorer[%d]: got %d, wanted %d",
			 i, scorer->field_num, phrase_field_num);
	}
    }

    return true;
}

_______________________________________________
KinoSearch mailing list
KinoSearch at rectangular.com
http://www.rectangular.com/mailman/listinfo/kinosearch




More information about the kinosearch mailing list