[KinoSearch] Feature request: highlight without excerpt
Edward Betts
edwardbetts at gmail.com
Fri Jun 15 08:25:06 PDT 2007
On 08/06/07, Marvin Humphrey <marvin at rectangular.com> wrote:
> It should still go under $hit->{excerpts}.
This is a first attempt; I still need to write docs and tests. I went with
"excerpt_length => undef", but this could easily be switched to 0, or
both could be supported.
Index: lib/KinoSearch/Highlight/Highlighter.pm
===================================================================
--- lib/KinoSearch/Highlight/Highlighter.pm (revision 2474)
+++ lib/KinoSearch/Highlight/Highlighter.pm (working copy)
@@ -57,7 +57,8 @@
}
# scoring window is 1.66 * excerpt_length, with the loc in the middle
- $spec{limit} = int( $spec{excerpt_length} / 3 );
+ $spec{limit} = int( $spec{excerpt_length} / 3 )
+ if defined $spec{excerpt_length};
# use field name as key unless specified
$spec{name} = $spec{field} unless defined $spec{name};
@@ -71,13 +72,45 @@
# create an excerpt for each spec
my %excerpts;
for my $spec ( @{ $self->{specs} } ) {
- $excerpts{ $spec->{name} }
- = $self->_gen_excerpt( $doc, $doc_vector, $spec );
+ if (defined $spec->{excerpt_length}) {
+ $excerpts{ $spec->{name} }
+ = $self->_gen_excerpt( $doc, $doc_vector, $spec );
+ } else {
+ $excerpts{ $spec->{name} }
+ = $self->_gen_excerpt_no_length( $doc, $doc_vector, $spec );
+ }
}
return \%excerpts;
}
+sub _gen_excerpt_no_length {
+ my ( $self, $doc, $doc_vector, $spec ) = @_;
+ my $excerpt_field = $spec->{field};
+
+ my $text = $doc->{$excerpt_field};
+ return unless defined $text;
+ return '' unless length $text;
+
+ my $formatter = $spec->{formatter};
+ my $encoder = $spec->{encoder};
+
+ my $output_text = '';
+ my $posits = $self->_starts_and_ends( $doc_vector, $excerpt_field );
+ my $last_end = 0;
+ foreach (@$posits) {
+ my ($start, $end) = @$_;
+ $output_text .= $encoder->encode(
+ substr( $text, $last_end, $start - $last_end ) );
+ $output_text .= $formatter->highlight(
+ $encoder->encode( substr( $text, $start, $end - $start ) ) );
+ $last_end = $end;
+ }
+ $output_text .= $encoder->encode( substr( $text, $last_end ) );
+
+ return $output_text;
+}
+
sub _gen_excerpt {
my ( $self, $doc, $doc_vector, $spec ) = @_;
my $excerpt_field = $spec->{field};
@@ -186,7 +219,7 @@
my $formatter = $spec->{formatter};
my $encoder = $spec->{encoder};
my $output_text = '';
- my ( $start, $end, $last_start, $last_end ) = ( undef, undef, 0, 0 );
+ my ( $start, $end, $last_end ) = ( undef, undef, 0 );
while (@relative_starts) {
$end = shift @relative_ends;
$start = shift @relative_starts;
--
Edward Betts
More information about the kinosearch
mailing list