File Coverage

blib/lib/Lingua/EN/Splitter.pm

Criterion	Covered	Total	%
statement	7	9	77.7
branch			n/a
condition			n/a
subroutine	3	3	100.0
pod			n/a
total	10	12	83.3

line	stmt	sub	time	code
1				package Lingua::EN::Splitter;
2
3				=head1 NAME
4
5				Lingua::EN::Splitter - Split text into words, paragraphs, segments, and tiles
6
7				=head1 SYNOPSIS
8
9				use Lingua::EN::Splitter qw(words paragraphs paragraph_breaks
10				segment_breaks tiles set_tokens_per_tile);
11
12				my $text = <
13				Lingua::EN::Splitter is a useful module that allows text to be split up
14				into words, paragraphs, segments, and tiles.
15
16				Paragraphs are by default indicated by blank lines. Known segment breaks are
17				indicated by a line with only the word "segment_break" in it.
18
19				segment_break
20
21				This module does not make any attempt to guess segment boundaries. For that,
22				see L.
23
24				EOT
25
26				# Set the number of tokens per tile to 20 (the default)
27				set_tokens_per_tile(20);
28
29				my @words = words $text;
30				my @paragraphs = paragraphs $text;
31				my @paragraph_breaks = paragraph_breaks $text;
32				my @segment_breaks = segment_breaks $text;
33				my @tiles = tile words $text;
34
35				print "@words[0..3,5]"; # Prints "lingua en segmenter is useful"
36				print "@words[43..46,53]"; # Prints "this module does not guess"
37				print $paragraphs[2]; # Prints the third paragraph of the above text
38				print $paragraph_breaks[2]; # Prints which tile the 3rd paragraph starts on
39				print $segment_breaks[1]; # Prints which tile the 2nd segment starts on
40				print $tiles[1]; # Prints @words[20..39] filtered for stopwords
41				# and stemmed
42
43				# This module can also be used in an object-oriented fashion
44				my $splitter = new Lingua::EN::Splitter;
45				@words = $splitter->words $text;
46
47
48				=head1 DESCRIPTION
49
50				See synopsis.
51
52				This module can be used in an object-oriented fashion or the routines can be
53				exported.
54
55				=head1 AUTHORS
56
57				David James
58
59				=head1 SEE ALSO
60
61				L, L,
62				L
63
64				=cut
65
66				$VERSION = 0.10;
67				@EXPORT_OK = qw(
68				words
69				paragraphs
70				breaks
71				paragraph_breaks
72				segment_breaks
73
74				set_tokens_per_tile
75				set_paragraph_regexp
76				set_non_word_regexp
77				set_locale
78				set_stop_words
79				);
80
81	1	1	6	use Math::HashSum qw(hashsum);
	1		2
	1		60
82	1	1	6	use base 'Class::Exporter';
	1		1
	1		70
83	1	1	516	use Lingua::Stem;
	0
	0
84				use Lingua::EN::StopWords qw(%StopWords);
85				use strict;
86				use Carp qw(croak);
87				no warnings;
88
89				# Create a new instance of this object
90				sub new {
91				my $class = shift;
92				my $stemmer = Lingua::Stem->new;
93				$stemmer->stem_caching({ -level=>2 });
94				bless {
95				PARAGRAPH_BREAK=>qr/\n\s(segment_break)?\s\n/,
96				NON_WORD_CHARACTER=>qr/\W/,
97				TOKENS_PER_TILE=>20,
98				STEMMER=>$stemmer,
99				STOP_WORDS=>\%StopWords,
100				@_
101				}, $class;
102				}
103
104				# Split text into words
105				sub words {
106				my $self = shift;
107				my $input = lc shift;
108				$input =~ s/$self->{PARAGRAPH_BREAK}/ /g;
109				return [ split /$self->{NON_WORD_CHARACTER}+/, $input ];
110				}
111
112				# Split text into paragraphs
113				sub paragraphs {
114				my ($self, $input) = @_;
115				return [
116				grep { /\S/ and !/^segment_break$/i }
117				split /$self->{PARAGRAPH_BREAK}/i, $input
118				];
119				}
120
121				# Return a list of paragraph and segment breaks
122				sub breaks {
123				my $self = shift;
124				my $input = lc shift;
125
126				# Eliminate empty paragraphs at the very end
127				$input =~ s/$self->{PARAGRAPH_BREAK}\s*\Z//;
128
129				# Convert paragraph breaks to tokens
130				$input =~ s/$self->{PARAGRAPH_BREAK}/ PNO$1 /g;
131
132				my @words = split /(?:$self->{NON_WORD_CHARACTER})+/, $input;
133				my (@breaks,%segment_breaks,$num_words);
134
135				foreach (@words) {
136				if (/^PNO(segment_break)?$/) {
137				my $segment_break = $1;
138				$segment_break and $segment_breaks{scalar @breaks}++;
139				push @breaks, $num_words / $self->{TOKENS_PER_TILE};
140				} else {
141				$num_words++;
142				}
143				}
144				return (\@breaks,\%segment_breaks);
145				}
146
147				# Return a list of paragraph breaks
148				sub paragraph_breaks {
149				my $self = shift;
150				return ($self->breaks(@_))[0];
151				}
152
153				# Return a list of real segment breaks
154				sub segment_breaks {
155				my $self = shift;
156				return ($self->breaks(@_))[1];
157				}
158
159				# Convert a list of words into tiles
160				sub tile {
161				my $self = shift;
162				my $words = ref $_[0] ? shift : \@_;
163				my @tiles;
164
165				while (@$words) {
166				push @tiles, {
167				hashsum map { @{$self->{STEMMER}->stem($_)}, 1 }
168				grep { !exists $self->{STOP_WORDS}->{$_} }
169				splice @$words, 0, $self->{TOKENS_PER_TILE}
170				};
171				}
172				return \@tiles;
173				}
174
175				#########################################################
176				# Mutator methods
177				#########################################################
178
179				sub set_tokens_per_tile {
180				my $self = shift;
181				$self->{TOKENS_PER_TILE} = shift;
182				}
183
184				sub set_paragraph_regexp {
185				my $self = shift;
186				$self->{PARAGRAPH_BREAK} = shift;
187				}
188
189				sub set_non_word_regexp {
190				my $self = shift;
191				$self->{NON_WORD_CHARACTER} = shift;
192				}
193
194				sub set_locale {
195				my $self = shift;
196				$self->{STEMMER}->set_locale(shift);
197				}
198
199				sub set_stop_words {
200				my $self = shift;
201				$self->{STOP_WORDS} = shift;
202				}
203
204
205				1;