File Coverage

blib/lib/Lingua/EN/Segmenter/TextTiling.pm

Criterion	Covered	Total	%
statement	12	12	100.0
branch			n/a
condition			n/a
subroutine	4	4	100.0
pod			n/a
total	16	16	100.0

line	stmt	sub	time	code
1				package Lingua::EN::Segmenter::TextTiling;
2
3				=head1 NAME
4
5				Lingua::EN::Segmenter::TextTiling - Segment text using the TextTiling method
6
7				=head1 SYNOPSIS
8
9				use Lingua::EN::Segmenter::TextTiling qw(segments);
10				use lib '.';
11
12				my $text = <
13				Lingua::EN::Segmenter is a useful module that allows text to be split up
14				into words, paragraphs, segments, and tiles.
15
16				Paragraphs are by default indicated by blank lines. Known segment breaks are
17				indicated by a line with only the word "segment_break" in it.
18
19				The module detects paragraphs that are unrelated to each other by comparing
20				the number of words per-paragraph that are related. The algorithm is designed
21				to work only on long segments.
22
23				SOUTH OF BAGHDAD, Iraq (CNN) -- Seven U.S. troops freed Sunday after being
24				held by Iraqi forces arrived by helicopter at a base south of Baghdad and were
25				transferred to a C-130 transport plane headed for Kuwait, CNN's Bob Franken
26				reported from the scene.
27
28				EOT
29
30				my $num_segment_breaks = 1;
31				my @segments = segments($num_segment_breaks,$text);
32				print $segments[0]; # Prints the first three paragraphs of the above text
33				print "\n----------SEGMENT_BREAK----------\n";
34				print $segments[1]; # Prints the last paragraph of the above text
35
36				# This module can also be used in an object-oriented fashion
37				my $splitter = new Lingua::EN::Splitter;
38				@words = $splitter->words($text);
39
40				=head1 DESCRIPTION
41
42				See synopsis.
43
44				=head1 EXTENDING
45
46				This module is designed to be easily extendable. Feel free to extend from this
47				module when designing alternate methods for text segmentation.
48
49				=head1 AUTHORS
50
51				David James
52
53				=head1 SEE ALSO
54
55				L, L,
56				L
57
58				=head1 LICENSE
59
60				Copyright (c) 2002 David James
61				All rights reserved.
62				This program is free software; you can redistribute it and/or
63				modify it under the same terms as Perl itself.
64
65				=cut
66
67				$VERSION = 0.10;
68				@EXPORT_OK = qw(
69				segment
70				segments
71				set_tiles_per_block
72				set_number_of_smoothing_rounds
73				set_tokens_per_tile
74				set_paragraph_regexp
75				set_non_word_regexp
76				set_locale
77				set_stop_words
78				);
79
80	1	1	3109	use Math::Vector::SortIndexes qw(sort_indexes_descending);
	1		212
	1		81
81	1	1	849	use Math::VecStat qw(average min sum);
	1		1336
	1		86
82	1	1	886	use Math::HashSum qw(hashsum);
	1		178
	1		61
83
84	1	1	7	use base 'Lingua::EN::Segmenter';
	1		1
	1		1476
85				use strict;
86
87
88				# Create a new instance of the object
89				sub new {
90				my $class = shift;
91				$class->SUPER::new(
92				TILES_PER_BLOCK=>7,
93				NUMBER_OF_SMOOTHING_ROUNDS=>2,
94				@_
95				);
96				}
97
98				# Segment a piece of text
99				sub segment {
100				my ($self, $num_segments, $input) = @_;
101
102				# Get smoothed depth scores
103				my $scores = $self->smoothed_depth_scores($input);
104
105				# Get paragraph breaks
106				my $breaks = $self->{splitter}->paragraph_breaks($input);
107
108				# Get predicted segment breaks
109				return $self->tile2segment($num_segments, $breaks, $scores);
110				}
111
112				sub set_tiles_per_block {
113				my $self = shift;
114				$self->{TILES_PER_BLOCK} = shift;
115				}
116
117				sub set_number_of_smoothing_rounds {
118				my $self = shift;
119				$self->{NUMBER_OF_SMOOTHING_ROUNDS} = shift;
120				}
121
122
123				######################################################################
124				# PRIVATE METHODS
125				######################################################################
126
127				# Accept as input the scores of the tiles. Output segment scores.
128				sub tile2segment {
129				my ($self, $num_segments, $breaks, $scores) = @_;
130
131				my @indexes = sort_indexes_descending @$scores;
132
133				my @too_close = (
134				-$self->{MIN_SEGMENT_SIZE}..-1, 1..$self->{MIN_SEGMENT_SIZE}
135				);
136				my @direction = qw(L R);
137				my (%segments, %verbose, $cut_off_depth);
138
139				# Calculate the most likely segment breaks
140				GAP: foreach my $i (@indexes) {
141				my $tile_no = $i + $self->{TILES_PER_BLOCK};
142				my $closest_break = (min( map { abs($_ - $tile_no) } @$breaks ))[1];
143				$segments{$closest_break+$_} and next GAP for (@too_close);
144				$segments{$closest_break} .=
145				$direction[$breaks->[$closest_break] > $tile_no];
146				$verbose{$tile_no} = [ $tile_no, $scores->[$i], $closest_break ];
147				$cut_off_depth = $scores->[$i];
148				last if keys %segments == $num_segments;
149				}
150
151				# Verbose output
152				if ($self->{VERBOSE}) {
153				printf "Cut-off depth = %6.4f\n\n", $cut_off_depth;
154				print " Gap Depth Para\n";
155				foreach (sort { $a <=> $b } keys %verbose) {
156				printf "%4d %6.3f %4d\n", @{$verbose{$_}}
157				}
158				print "\n";
159				}
160				return \%segments;
161				}
162
163				# Calculate depth scores based on a list of gap scores
164				sub depth {
165				no warnings;
166
167				my $self = shift;
168				my @score = @{$_[0]};
169				my @depth;
170				for my $i (1..$#score) {
171				$depth[$i] = $score[$i-1] + $score[$i+1] - 2*$score[$i];
172				}
173				$depth[0] = $score[1] - $score[0];
174				$depth[$#score] = $score[-2] - $score[-1];
175				return \@depth;
176				}
177
178				# Given some depth scores, smooth them.
179				sub smooth {
180				my $self = shift;
181				my @depth = @{$_[0]};
182				unshift @depth, $depth[0];
183				push @depth, $depth[-1];
184				for (1..$self->{NUMBER_OF_SMOOTHING_ROUNDS}) {
185				foreach my $j (1..$#depth-1) {
186				$depth[$j] = average $depth[$j-1], $depth[$j], $depth[$j+1];
187				}
188				}
189				return [ @depth[1..$#depth-1] ];
190				}
191
192				# Take text as input and output a list of smoothed depth scores
193				sub smoothed_depth_scores {
194				my ($self, $input) = @_;
195				my $words = $self->{splitter}->words($input);
196				my $tiles = $self->{splitter}->tile($words);
197				my $depth_scores = $self->depth($self->gap_scores($tiles));
198				return $self->smooth($depth_scores);
199				}
200
201				# Scores for the gap between two tiles
202				sub gap_scores {
203				my $self = shift;
204				my @tiles = @{$_[0]};
205				my $TILES_PER_BLOCK = $self->{TILES_PER_BLOCK};
206				my (@score, $i);
207				for $i ($TILES_PER_BLOCK .. @tiles-$TILES_PER_BLOCK) {
208				my $L = $i-$TILES_PER_BLOCK;
209				my $R = $i+$TILES_PER_BLOCK-1;
210				my %l = hashsum map { %$_ } @tiles[$L..$i-1];
211				my %r = hashsum map { %$_ } @tiles[$i..$R];
212				my %all = map { %$_ } @tiles[$L..$R];
213				my $numerator = sum map { $l{$_}*$r{$_} } keys %all;
214				my $denom1 = sum map { $l{$_}*$l{$_} } keys %all;
215				my $denom2 = sum map { $r{$_}*$r{$_} } keys %all;
216				push @score, $numerator/sqrt($denom1*$denom2);
217				}
218				return \@score;
219				}
220
221
222				1;
223
224
225
226
227