File Coverage

blib/lib/Lingua/EN/Segmenter/Evaluator.pm
Criterion Covered Total %
statement 11 63 17.4
branch 0 24 0.0
condition 0 44 0.0
subroutine 4 9 44.4
pod 0 6 0.0
total 15 146 10.2


line stmt bran cond sub pod time code
1             package Lingua::EN::Segmenter::Evaluator;
2              
3             =head1 NAME
4              
5             Lingua::EN::Segmenter::Evaluator - Evaluate a segmenting method
6              
7             =head1 SYNOPSIS
8            
9             my $tiling_segmenter = Lingua::EN::Segmenter::TextTiling->new();
10              
11             foreach (@ARGV) {
12             my $input = read_file($_);
13              
14             print "\nFile name: $_\n";
15              
16             printf "Results from TextTiling algorithm:
17             Strict scoring: %2d%% recall, %2d%% precision
18             Relaxed scoring: %2d%% recall, %2d%% precision
19             V. relaxed scoring: %2d%% recall, %2d%% precision
20             ", calc_stats(evaluate_segmenter($tiling_segmenter,20,$input));
21             }
22              
23             =head1 DESCRIPTION
24              
25             See synopsis.
26              
27             Also check out segmenter.pl in the eg directory.
28              
29             =head1 BUGS
30              
31             This module only works correctly when the segmenter has a MIN_SEGMENT_SIZE >= 2.
32              
33             =head1 AUTHORS
34              
35             David James
36              
37             =head1 SEE ALSO
38              
39             L, L,
40             L
41              
42             =cut
43              
44              
45             $VERSION = 0.10;
46             @EXPORT_OK = qw(evaluate_segmenter calc_stats);
47 1     1   2262 use strict;
  1         2  
  1         37  
48 1     1   6 use base 'Class::Exporter';
  1         3  
  1         84  
49 1     1   6 use Math::HashSum qw(hashsum);
  1         2  
  1         1011  
50              
51             # Create a new Evaluator object
52             sub new {
53 1     1 0 60 my $self = shift;
54 1         7 bless {
55             @_
56             }, $self
57             }
58              
59              
60             # Evaluate the segmenter on a particular input
61             sub evaluate_segmenter {
62 0     0 0   my ($self, $segmenter, $input, $num_segments) = @_;
63            
64 0           $self->{taken} = {};
65            
66 0           my $num_paragraphs = @{$segmenter->{splitter}->paragraph_breaks($input)};
  0            
67              
68 0           my $break = $self->{break} = $segmenter->{splitter}->segment_breaks($input);
69 0   0       $num_segments ||= scalar keys %{$break};
  0            
70 0           my $assigned = $self->{assigned} = $segmenter->segment($num_segments, $input);
71            
72 0   0       my @description = map { {
  0            
73             para=>$_,
74             true=>exists $break->{$_},
75             label=>$assigned->{$_},
76             strict=>exists $break->{$_} && exists $assigned->{$_},
77             relaxed=>$self->relaxed_weight($_),
78             very_relaxed=>$self->very_relaxed_weight($_),
79             } } (0..$num_paragraphs-1);
80            
81 0           return @description;
82             }
83              
84             # Get the weight of a particular index based on a relaxed scheme
85             # NOTE: Assumes that MIN_SEGMENT_SIZE >= 2
86             sub relaxed_weight {
87 0     0 0   my ($self, $i) = @_;
88 0           my $assigned = $self->{assigned}{$i};
89 0           my $break = $self->{break}{$i};
90 0 0 0       if ($assigned and $break) {
91 0           $self->take(1,"break",$i);
92 0           $self->take(1,"assigned",$i);
93 0           return 1;
94             }
95 0 0         if (defined $assigned) {
    0          
96 0 0 0       if ($assigned =~ /L/ and $self->take(1,"break",$i-1) or
    0 0        
      0        
      0        
97             $assigned =~ /R/ and $self->take(1,"break",$i+1)) {
98 0           return 0.8;
99             } elsif ($self->take(1,"break",$i-1) or $self->take(1,"break",$i+1)) {
100 0           return 0.4;
101             }
102             } elsif (exists $self->{break}{$i}) {
103 0 0 0       if ($self->take(1,"assigned",$i-1,"R") or
    0 0        
104             $self->take(1,"assigned",$i+1,"L")) {
105 0           return 0.8;
106             } elsif ($self->take(1,"assigned",$i-1) or
107             $self->take(1,"assigned",$i+1)) {
108            
109 0           return 0.4;
110             }
111             }
112 0           return 0;
113             }
114              
115             # Get the weight of a particular index based on a very relaxed scheme
116             # NOTE: Assumes that MIN_SEGMENT_SIZE >= 2
117             sub very_relaxed_weight {
118 0     0 0   my ($self, $i) = @_;
119 0           my $assigned = $self->{assigned}{$i};
120 0           my $break = $self->{break}{$i};
121            
122 0 0 0       if ($assigned or $break) {
123 0           foreach (-2..2) {
124 0   0       $assigned ||= $self->take(2,"assigned",$i+$_);
125 0   0       $break ||= $self->take(2,"break",$i+$_);
126             }
127             }
128 0   0       return ($assigned and $break);
129             }
130              
131             # Mark a particular index as used if it's not already used
132             sub take {
133 0     0 0   my ($self,$count,$which,$i,$req) = @_;
134 0 0 0       if (!$self->{taken}{$count}{$which}{$i} and $self->{$which}{$i}) {
135 0 0 0       if (!$req or $self->{$which}{$i} =~ /$req/) {
136 0           $self->{taken}{$count}{$which}{$i}++;
137 0           return 1;
138            
139             }
140             }
141 0           return;
142             }
143              
144             # Calculate precision and recall for strict, relaxed, very_relaxed
145             sub calc_stats {
146 0     0 0   my $self = shift;
147 0           my %sum = hashsum map { %$_ } @_;
  0            
148              
149             # Ensure "R" and "L" count as categories
150 0           $sum{label} = grep { $_->{label} } @_;
  0            
151              
152             # Ensure relaxed counts don't double-count
153 0           $sum{relaxed} -= ($sum{relaxed} - $sum{strict})/2;
154 0           $sum{very_relaxed} -= ($sum{very_relaxed} - $sum{strict})/2;
155            
156             # Sanity checks
157 0 0         if ($sum{true} == 0) {
    0          
158 0           die "No segment_breaks found. Please label the true segments in the original text so that we can evaluate the performance of the Segmenting algorithm";
159             } elsif ($sum{label} == 0) {
160 0           die "No segments labelled by Segmenting algorithm";
161             }
162              
163             # Return results
164 0           return map { 100*$sum{$_}/$sum{true}, 100*$sum{$_} / $sum{label} }
  0            
165             qw(strict relaxed very_relaxed);
166             }
167              
168             1;
169