[KinoSearch] Feature request: highlight without excerpt

Edward Betts edwardbetts at gmail.com
Fri Jun 15 08:25:06 PDT 2007


On 08/06/07, Marvin Humphrey <marvin at rectangular.com> wrote:
> It should still go under $hit->{excerpts}.

Here is a first attempt; I still need to write docs and tests. I went with
"excerpt_length => undef", but this could easily be switched to 0, or
changed to support both.

Index: lib/KinoSearch/Highlight/Highlighter.pm
===================================================================
--- lib/KinoSearch/Highlight/Highlighter.pm	(revision 2474)
+++ lib/KinoSearch/Highlight/Highlighter.pm	(working copy)
@@ -57,7 +57,8 @@
     }

     # scoring window is 1.66 * excerpt_length, with the loc in the middle
-    $spec{limit} = int( $spec{excerpt_length} / 3 );
+    $spec{limit} = int( $spec{excerpt_length} / 3 )
+        if defined $spec{excerpt_length};

     # use field name as key unless specified
     $spec{name} = $spec{field} unless defined $spec{name};
@@ -71,13 +72,45 @@
     # create an excerpt for each spec
     my %excerpts;
     for my $spec ( @{ $self->{specs} } ) {
-        $excerpts{ $spec->{name} }
-            = $self->_gen_excerpt( $doc, $doc_vector, $spec );
+        if (defined $spec->{excerpt_length}) {
+            $excerpts{ $spec->{name} }
+                = $self->_gen_excerpt( $doc, $doc_vector, $spec );
+        } else {
+            $excerpts{ $spec->{name} }
+                = $self->_gen_excerpt_no_length( $doc, $doc_vector, $spec );
+        }
     }

     return \%excerpts;
 }

+sub _gen_excerpt_no_length {
+    my ( $self, $doc, $doc_vector, $spec ) = @_;
+    my $excerpt_field  = $spec->{field};
+
+    my $text = $doc->{$excerpt_field};
+    return unless defined $text;
+    return '' unless length $text;
+
+    my $formatter   = $spec->{formatter};
+    my $encoder     = $spec->{encoder};
+
+    my $output_text = '';
+    my $posits = $self->_starts_and_ends( $doc_vector, $excerpt_field );
+    my $last_end = 0;
+    foreach (@$posits) {
+        my ($start, $end) = @$_;
+        $output_text .= $encoder->encode(
+            substr( $text, $last_end, $start - $last_end ) );
+        $output_text .= $formatter->highlight(
+            $encoder->encode( substr( $text, $start, $end - $start ) ) );
+        $last_end = $end;
+    }
+    $output_text .= $encoder->encode( substr( $text, $last_end ) );
+
+    return $output_text;
+}
+
 sub _gen_excerpt {
     my ( $self, $doc, $doc_vector, $spec ) = @_;
     my $excerpt_field  = $spec->{field};
@@ -186,7 +219,7 @@
     my $formatter   = $spec->{formatter};
     my $encoder     = $spec->{encoder};
     my $output_text = '';
-    my ( $start, $end, $last_start, $last_end ) = ( undef, undef, 0, 0 );
+    my ( $start, $end, $last_end ) = ( undef, undef, 0 );
     while (@relative_starts) {
         $end   = shift @relative_ends;
         $start = shift @relative_starts;


-- 
Edward Betts



More information about the kinosearch mailing list