Index: perl/lib/KinoSearch/Search/Query.pm =================================================================== --- perl/lib/KinoSearch/Search/Query.pm (revision 2982) +++ perl/lib/KinoSearch/Search/Query.pm (working copy) @@ -54,22 +54,6 @@ sub extract_terms { shift->abstract_death } -=begin comment - -my @highlight_spans = $query->highlight_spans( - searcher => $searcher, - field => $field, - doc_vec => $doc_vector -); - -Return a list of HighlightSpan objects, indicating where in the $field the -text that matches the query occurs. - -=end comment -=cut - -sub highlight_spans { } # empty list by default - 1; __END__ Index: perl/lib/KinoSearch/Search/BooleanQuery.pm =================================================================== --- perl/lib/KinoSearch/Search/BooleanQuery.pm (revision 2982) +++ perl/lib/KinoSearch/Search/BooleanQuery.pm (working copy) @@ -35,15 +35,6 @@ return @terms; } -sub highlight_spans { - my $self = shift; - my @spans; - for my $clause ( @{ $self->get_clauses->to_perl } ) { - push @spans, $clause->get_query->highlight_spans( @_ ); - } - return @spans; -} - sub make_weight { my ( $self, $searcher ) = @_; return KinoSearch::Search::BooleanWeight->new( @@ -128,6 +119,15 @@ return $scorer; } +sub highlight_spans { + my $self = shift; + my @spans; + for my $sub_weight ( @{ $sub_weights{$$self} } ) { + push @spans, $sub_weight->highlight_spans( @_ ); + } + return @spans; +} + 1; __END__ Index: perl/lib/KinoSearch/Search/PhraseQuery.pm =================================================================== --- perl/lib/KinoSearch/Search/PhraseQuery.pm (revision 2982) +++ perl/lib/KinoSearch/Search/PhraseQuery.pm (working copy) @@ -47,83 +47,6 @@ sub extract_terms { shift->get_terms->to_perl } -sub highlight_spans { - my ( $self, %args ) = @_; - my $doc_vector = $args{doc_vec}; - my $field_name = $args{field}; - my $searcher = $args{searcher}; - - my $terms = $self->get_terms->to_perl; - - return if !@$terms or $terms->[0]->get_field ne $field_name; 
- my @term_texts = map { $_->get_text } @$terms; - - require KinoSearch::Highlight::HighlightSpan; - - my $posit_vec = KinoSearch::Util::BitVector->new; - my @term_vectors - = map { $doc_vector->term_vector( $field_name, $_ ) } - @term_texts; - - # make sure all terms are present - return unless scalar @term_vectors == scalar @term_texts; - - my $i = 0; - for my $tv (@term_vectors) { - # one term missing, ergo no phrase - return unless defined $tv; - if ( $i == 0 ) { - $posit_vec->set( @{ $tv->get_positions } ); - } - else { - # filter positions using logical "and" - my $other_posit_vec = KinoSearch::Util::BitVector->new; - $other_posit_vec->set( - grep { $_ >= 0 } - map { $_ - $i } @{ $tv->get_positions } - ); - $posit_vec->AND($other_posit_vec); - } - $i++; - } - - # add only those starts/ends that belong to a valid position - my $tv_start_positions = $term_vectors[0]->get_positions; - my $tv_starts = $term_vectors[0]->get_start_offsets; - my $tv_end_positions = $term_vectors[-1]->get_positions; - my $tv_ends = $term_vectors[-1]->get_end_offsets; - $i = 0; - my $j = 0; - my $last_token_index = $#term_vectors; - my @posits; - $posit_vec = $posit_vec->to_arrayref; - my $weight; - if(@$posit_vec) { - $weight = $self->make_weight($searcher)->get_value; - } - for my $valid_position ( @{ $posit_vec } ) { - - while ( $i <= $#$tv_start_positions ) { - last if ( $tv_start_positions->[$i] >= $valid_position ); - $i++; - } - $valid_position += $last_token_index; - while ( $j <= $#$tv_end_positions ) { - last if ( $tv_end_positions->[$j] >= $valid_position ); - $j++; - } - push @posits, KinoSearch::Highlight::HighlightSpan->new( - start_offset => $tv_starts->[$i], - end_offset => $tv_ends->[$j], - weight => $weight, - ); - $i++; - $j++; - } - - return @posits; -} - sub to_string { my ( $self, $proposed_field ) = @_; my $string @@ -220,6 +143,80 @@ ); } +sub highlight_spans { + my ( $self, %args ) = @_; + my $doc_vector = $args{doc_vec}; + my $field_name = $args{field}; + my 
$searcher = $args{searcher}; + my $parent = $self->get_parent; + my $terms = $parent->get_terms->to_perl; + + return if !@$terms or $terms->[0]->get_field ne $field_name; + my @term_texts = map { $_->get_text } @$terms; + + require KinoSearch::Highlight::HighlightSpan; + + my $posit_vec = KinoSearch::Util::BitVector->new; + my @term_vectors + = map { $doc_vector->term_vector( $field_name, $_ ) } + @term_texts; + + # make sure all terms are present + return unless scalar @term_vectors == scalar @term_texts; + + my $i = 0; + for my $tv (@term_vectors) { + # one term missing, ergo no phrase + return unless defined $tv; + if ( $i == 0 ) { + $posit_vec->set( @{ $tv->get_positions } ); + } + else { + # filter positions using logical "and" + my $other_posit_vec = KinoSearch::Util::BitVector->new; + $other_posit_vec->set( + grep { $_ >= 0 } + map { $_ - $i } @{ $tv->get_positions } + ); + $posit_vec->AND($other_posit_vec); + } + $i++; + } + + # add only those starts/ends that belong to a valid position + my $tv_start_positions = $term_vectors[0]->get_positions; + my $tv_starts = $term_vectors[0]->get_start_offsets; + my $tv_end_positions = $term_vectors[-1]->get_positions; + my $tv_ends = $term_vectors[-1]->get_end_offsets; + $i = 0; + my $j = 0; + my $last_token_index = $#term_vectors; + my @posits; + $posit_vec = $posit_vec->to_arrayref; + my $weight_val = $normalized_impact{$$self}; + for my $valid_position ( @{ $posit_vec } ) { + + while ( $i <= $#$tv_start_positions ) { + last if ( $tv_start_positions->[$i] >= $valid_position ); + $i++; + } + $valid_position += $last_token_index; + while ( $j <= $#$tv_end_positions ) { + last if ( $tv_end_positions->[$j] >= $valid_position ); + $j++; + } + push @posits, KinoSearch::Highlight::HighlightSpan->new( + start_offset => $tv_starts->[$i], + end_offset => $tv_ends->[$j], + weight => $weight_val, + ); + $i++; + $j++; + } + + return @posits; +} + 1; __END__ Index: perl/lib/KinoSearch/Search/TermQuery.pm 
=================================================================== --- perl/lib/KinoSearch/Search/TermQuery.pm (revision 2982) +++ perl/lib/KinoSearch/Search/TermQuery.pm (working copy) @@ -22,41 +22,6 @@ sub extract_terms { ( shift->get_term ) } -sub highlight_spans { - my ( $self, %args ) = @_; - my $doc_vector = $args{doc_vec}; - my $field_name = $args{field}; - my $searcher = $args{searcher}; - - my $term = $self->get_term; - - return if $term->get_field ne $field_name; - my $term_text = $term->get_text; - - require KinoSearch::Highlight::HighlightSpan; - - # add all starts and ends - my $term_vector - = $doc_vector->term_vector( $field_name, $term_text ); - return unless defined $term_vector; - my $starts = $term_vector->get_start_offsets; - my $ends = $term_vector->get_end_offsets; - my @posits; - my $weight; - if (@$starts) { - $weight = $self->make_weight( $searcher )->get_value; - } - while (@$starts) { - push @posits, KinoSearch::Highlight::HighlightSpan->new( - start_offset => shift @$starts, - end_offset => shift @$ends, - weight => $weight, - ); - } - - return @posits; -} - sub to_string { my ( $self, $proposed_field ) = @_; my $field = $self->get_term->get_field; @@ -143,6 +108,38 @@ ); } +sub highlight_spans { + my ( $self, %args ) = @_; + my $doc_vector = $args{doc_vec}; + my $field_name = $args{field}; + my $searcher = $args{searcher}; + + my $term = $self->get_parent->get_term; + + return if $term->get_field ne $field_name; + my $term_text = $term->get_text; + + require KinoSearch::Highlight::HighlightSpan; + + # add all starts and ends + my $term_vector + = $doc_vector->term_vector( $field_name, $term_text ); + return unless defined $term_vector; + my $starts = $term_vector->get_start_offsets; + my $ends = $term_vector->get_end_offsets; + my @posits; + my $weight_val = $normalized_impact{$$self}; + while (@$starts) { + push @posits, KinoSearch::Highlight::HighlightSpan->new( + start_offset => shift @$starts, + end_offset => shift @$ends, + 
weight => $weight_val, + ); + } + + return @posits; +} + 1; __END__ Index: perl/lib/KinoSearch/Search/Weight.pm =================================================================== --- perl/lib/KinoSearch/Search/Weight.pm (revision 2982) +++ perl/lib/KinoSearch/Search/Weight.pm (working copy) @@ -72,6 +72,8 @@ sub explain { shift->todo_death } +sub highlight_spans { } # empty list by default + 1; __END__ @@ -122,10 +124,8 @@ __POD__ -=begin devdocs +=head1 NAME -=head1 PRIVATE CLASS - KinoSearch::Search::Weight - Searcher-dependent transformation of a Query. =head1 SYNOPSIS @@ -144,6 +144,20 @@ Query's "weight" ought to be a single number: a coefficient... and indeed, eventually a Weight object gets turned into a $weight_value. +=head1 METHODS + +=head2 highlight_spans + + my @highlight_spans = $weight->highlight_spans( + searcher => $searcher, + field => $field, + doc_vec => $doc_vector + ); + +Return a list of HighlightSpan objects, indicating where in the C<$field> +the text that matches the query occurs. The base class's method returns an +empty list. + =head1 COPYRIGHT Copyright 2005-2008 Marvin Humphrey @@ -152,5 +166,4 @@ See L version 0.20. -=end devdocs =cut Index: perl/lib/KinoSearch/Highlight/Highlighter.pm =================================================================== --- perl/lib/KinoSearch/Highlight/Highlighter.pm (revision 2982) +++ perl/lib/KinoSearch/Highlight/Highlighter.pm (working copy) @@ -18,12 +18,15 @@ encoder => \our %encoder, formatter => \our %formatter, token_re => \our %token_re, + weight => \our %weight, ); use KinoSearch::Highlight::HeatMap; use KinoSearch::Highlight::SimpleHTMLFormatter; use KinoSearch::Highlight::SimpleHTMLEncoder; +BEGIN { __PACKAGE__->ready_get(qw( weight )) } + sub new { my $either = shift; my $args = @_ == 1 ? 
shift : {@_}; @@ -31,40 +34,60 @@ $token_re{$$self} = qr/\b\w+(?:'\w+)?\b/; # set and check the mandatory args - $searcher{$$self} = $args->{searcher}; - $query{$$self} = $args->{query}; + my $searcher = $searcher{$$self} = $args->{searcher}; + my $query = $query{$$self} = $args->{query}; $field{$$self} = $args->{field}; confess 'searcher is mandatory' unless $searcher{$$self}; confess 'query is mandatory' unless defined $query{$$self}; confess 'field is mandatory' unless defined $field{$$self}; # turn a query string into an object - if ( !a_isa_b( $query{$$self}, 'KinoSearch::Search::Query' ) + if ( !a_isa_b( $query, 'KinoSearch::Search::Query' ) ) { - $query{$$self} - = $searcher{$$self}->prepare_simple_search( $query{$$self} ); + $query{$$self} = $query + = $searcher->prepare_simple_search( $query ); } + # make a weight from the query + $weight{$$self} = $query->make_weight( $searcher ); + # read the rest of the args $encoder{$$self} = $args->{encoder}; $formatter{$$self} = $args->{formatter}; $excerpt_length{$$self} = $args->{excerpt_length}; - # set default values if they are not defined - $encoder{$$self} = KinoSearch::Highlight::SimpleHTMLEncoder->new - unless defined $encoder{$$self}; - $formatter{$$self} - = KinoSearch::Highlight::SimpleHTMLFormatter->new( - pre_tag => '', - post_tag => '', - ) - unless defined $formatter{$$self}; $excerpt_length{$$self} = 200 unless defined $excerpt_length{$$self}; return $self; } +sub set_encoder { + my ($self, $encoder) = @_; + confess("Not a KinoSearch::Highlight::Encoder") + unless a_isa_b( $encoder, 'KinoSearch::Highlight::Encoder' ); + $encoder{$$self} = $encoder; +} + +sub get_encoder { + $encoder{${+shift}} ||= KinoSearch::Highlight::SimpleHTMLEncoder->new; +} + +sub set_formatter { + my ($self, $formatter) = @_; + confess("Not a KinoSearch::Highlight::Formatter") + unless a_isa_b( $formatter, 'KinoSearch::Highlight::Formatter' ); + $formatter{$$self} = $formatter; +} + +sub get_formatter { + $formatter{${+shift}} 
+ ||= KinoSearch::Highlight::SimpleHTMLFormatter->new( + pre_tag => '', + post_tag => '', + ); +} + sub create_excerpt { my ( $self, $hitdoc ) = @_; my $excerpt_field = $field{$$self}; @@ -82,7 +105,7 @@ return '' unless $text_length; # determine the rough boundaries of the excerpt - my @posits = $query{$$self}->highlight_spans( + my @posits = $weight{$$self}->highlight_spans( searcher => $searcher{$$self}, field => $excerpt_field, doc_vec => $searcher{$$self}->fetch_doc_vec( @@ -179,8 +202,8 @@ } # insert highlight tags - my $formatter = $formatter{$$self}; - my $encoder = $encoder{$$self}; + my $formatter = $self->get_formatter; + my $encoder = $self->get_encoder; my $output_text = ''; my ( $start, $end, $last_start, $last_end ) = ( undef, undef, 0, 0 ); while (@relative_starts) { @@ -263,8 +286,6 @@ query => $query, # required field => 'content', # required excerpt_length => 150, # default: 200 - formatter => $formatter, # default: a SimpleHTMLFormatter - encoder => $encoder, # default: a SimpleHTMLEncoder ); Constructor. Takes hash-style parameters: @@ -289,24 +312,31 @@ B - the maximum length of the excerpt, in characters. -=item * +=back -B - an object which isa L. Used -to perform the actual highlighting. +=head2 get_formatter, set_formatter -=item * +This is an accessor method. The formatter is an object which isa +L<KinoSearch::Highlight::Formatter>. Used to perform the actual +highlighting. By default, this is a SimpleHTMLFormatter that puts +C<< >> tags around the text fed through it. -B - an object which isa L. All +=head2 get_encoder, set_encoder + +This, too, is an accessor method. The encoder is an object which isa +L<KinoSearch::Highlight::Encoder>. All excerpt text gets passed through the encoder, including highlighted terms. By default, this is a SimpleHTMLEncoder, which encodes HTML entities. -=back - =head2 create_excerpt This method takes a hit (a HitDoc object) as its sole argument and returns an excerpt as a string. +=head2 get_weight + +This returns a weight object associated with the query. 
+ =head1 COPYRIGHT Copyright 2005-2008 Marvin Humphrey Index: perl/lib/KinoSearch/Highlight/HeatMap.pm =================================================================== --- perl/lib/KinoSearch/Highlight/HeatMap.pm (revision 2982) +++ perl/lib/KinoSearch/Highlight/HeatMap.pm (working copy) @@ -24,7 +24,7 @@ $window{$$self} = $args->{window}; confess 'spans is mandatory' if ! $spans; - $window{$$self} = 167 if !defined $window{$$self}; + $window{$$self} = 133 if !defined $window{$$self}; my $window = $window{$$self};