File Coverage

blib/lib/Text/Sequence.pm

Criterion	Covered	Total	%
statement	73	79	92.4
branch	22	26	84.6
condition			n/a
subroutine	10	12	83.3
pod	6	6	100.0
total	111	123	90.2

line	stmt	bran	sub	pod	time	code
1						package Text::Sequence;
2
3	1		1		529	use strict;
	1				1
	1				22
4	1		1		3	use Carp;
	1				1
	1				49
5	1		1		3	use vars qw($VERSION);
	1				2
	1				842
6
7						$VERSION = '0.27';
8
9						=pod
10
11						=head1 NAME
12
13						Text::Sequence - spot one-dimensional sequences in patterns of text
14
15						=head1 SYNOPSIS
16
17						use Text::Sequence;
18
19						my @list = get_files_in_dir();
20						my ($sequences, $singletons) = Text::Sequence::find($somedir);
21
22
23						my $sequence = $sequences->[0];
24						print $sequence->template();
25
26						my $num = 0;
27						foreach my ($element) ($sequence->members()) {
28						++$num;
29						print "$num) $filename\n";
30						}
31
32						=head1 DESCRIPTION
33
34						A sequence could be a list of files like
35
36						00001.jpg
37						00002.jpg
38						00003.jpg
39						...
40						05000.jpg
41
42
43						or
44
45						raw.0001.txt
46						...
47						raw.0093.txt
48
49						or
50
51						foo3a.html
52						foo3b.html
53						foo3c.html
54
55						or even
56
57						1.mp3
58						100.mp3
59
60						in which case their templates would be
61
62						%.5d.tif
63
64						raw.%.4d.txt
65
66						foo3%s.html
67
68						%d.mp3
69
70						respectively.
71
72						This library will attempt to
73
74						=over 4
75
76						=item find all sequences in a given list
77
78						=item tell you which elements are missing from a sequence
79
80						=item be able to cope with non padded numbers in sequences
81
82						=back
83
84						It does B spot multi-dimensional sequences, e.g. C.
85
86						=head1 METHODS
87
88
89						=head2 find( @elements )
90
91						my ($sequences, $singletons) = Text::Sequence::find($somedir);
92
93						A static method to find all the sequences in a list of elements.
94						Both are returned as arrayrefs.
95
96						=cut
97
98						sub find {
99	7		7	1	2855	my @elements = @_;
100	7				12	my %candidates = _find_candidates(@elements);
101	7				14	my @seqs = _find_sequences(\%candidates);
102
103						# Find singletons by process of elimination, going through
104						# all sequence members.
105	7				7	my %singletons = map { $_ => 1 } @elements;
	77				104
106	7				14	foreach my $seq (@seqs) {
107	18				22	my @members = $seq->members;
108	18				32	delete $singletons{$seq->template($_)} foreach @members;
109						}
110
111	7				52	return (\@seqs, [ keys %singletons ]);
112						}
113
114
115						sub _find_candidates {
116	7		7		7	my %candidates;
117
118	7				10	foreach my $element (@_) {
119	77	50			154	next unless $element =~ /\d/; # nothing without numbers
120
121	77				1218	while ($element =~ /\G.*?(?:(\d+)\|(?
122	102				73	my $cand = $element;
123
124	102	100			134	if (defined $1) {
		50
125						# Numerical sequence
126	83				171	my $num = substr($cand, $-[1], $+[1] - $-[1], '%d');
127
128						# There could be multiple lengths of the number we just
129						# changed to a %d, need to analyse the length frequencies
130						# in conjunction with the padding to see if differing
131						# lengths are still part of the same sequence (e.g.
132						# to distinguish foo.%3d.bar from foo.%02d.bar).
133	83				87	my $length = length($num);
134						# Note that a single zero is not counted as padded.
135	83	100			112	my $pad = ($num =~ /^0\d/) ? 'p' : '';
136						# Note how we "de-pad" the members here.
137	83				49	push @{ $candidates{$cand}{formats}{$pad . $length} }, $num + 0;
	83				193
138	83				290	$candidates{$cand}{count}++;
139						}
140						elsif (defined $2) {
141	19				44	my $letter = substr($cand, $-[2], $+[2] - $-[2], '%s');
142	19				21	push @{ $candidates{$cand}{formats}{letter} }, $letter;
	19				38
143	19				84	$candidates{$cand}{count}++;
144						}
145						else {
146	0				0	die "BUG! Missing number or letter at pos ", pos($element),
147						" of '$element', match was '$&'";
148						}
149						}
150						}
151	7				37	return %candidates;
152						}
153
154						sub _find_sequences {
155	7		7		20	my ($candidates) = @_;
156
157	7				6	my @seqs;
158
159	7				15	foreach my $cand (keys %$candidates) {
160						# it's not a sequence if there's only 1
161	35	100			61	next if $candidates->{$cand}{count} == 1;
162
163	15				15	my $formats = $candidates->{$cand}{formats};
164
165	15	100			22	if (my $letters = $formats->{letter}) {
166	5				13	push @seqs, Text::Sequence->new($cand, @$letters);
167	5				9	next;
168						}
169
170						# That was the easy bit, numbers are much harder.
171
172						# First look for padded numbers. Padding is quite a
173						# deliberate action, so our best effort assumption is that if
174						# there is a number padded to length n, any other (non-padded)
175						# numbers of length n must belong to the same sequence. It's
176						# not quite optimal, but we'd need some serious AI to separate
177						# things like (1, 4, 64, 256, 07 .. 13) into
178						#
179						# [ map 4**$_, 0 .. 3 ] and [ 07 .. 13 ]
180						#
181						# The following code will separate it into
182						#
183						# [ 07 .. 13, 64 ] and [ 1, 4, 256 ]
184						#
185	10				45	foreach my $padded (grep /^p/, keys %$formats) {
186	9				24	(my $length = $padded) =~ s/^p//;
187						my @members = (
188	9				13	@{ $formats->{$padded} },
189	9	100			8	@{ $formats->{$length} \|\| [] },
	9				37
190						);
191	9				19	delete @$formats{$padded, $length};
192	9				33	(my $pcand = $cand) =~ s/%d/%.${length}d/;
193	9				24	push @seqs, Text::Sequence->new($pcand, @members);
194						}
195						# Now the remaining elements (if any) all get swept into the
196						# %d non-padded bucket.
197	10				24	my @members = ( map @{ $formats->{$_} }, sort keys %$formats );
	8				13
198	10	100			23	push @seqs, Text::Sequence->new($cand, @members) if @members;
199						}
200
201	7				16	return @seqs;
202						}
203
204
205
206						=head2 new( $template, @member_nums )
207
208						Creates a new sequence object.
209
210						=cut
211
212						sub new {
213	18		18	1	15	my $class = shift;
214	18	50			28	my $template = shift or die "You must pass a template\n";
215
216	18				24	my $self = bless {
217						template => $template,
218						re => _to_re($template),
219						members => [ @_ ],
220						}, $class;
221
222	18				39	return $self;
223						}
224
225
226						sub _to_re {
227	18		18		14	my $re = shift;
228
229	18	100			62	if ($re =~ m!%\.(\d+)d!) {
		100
		50
230	9				10	my $m = $1;
231	9				50	$re =~ s!$&!(\\d{$m})!;
232						} elsif ($re =~ m!%d!) {
233	4				17	$re =~ s!$&!(\\d+)!;
234						} elsif ($re =~ m!%s!) {
235	5				27	$re =~ s!$&!(.+=?)!;
236						}
237
238	18				66	return $re;
239
240						}
241
242						=head2 template( $number_or_letter )
243
244						Tell you the template of the sequence, in C-like formats.
245
246						If you pass in a number or letter then it will substitute it in to
247						return an actual sequence element.
248
249						=cut
250
251						sub template {
252	118		118	1	141	my $self = shift;
253
254	118	100			118	if (@_) {
255	82				228	return sprintf($self->{template}, $_[0]);
256						} else {
257	36				64	return $self->{template};
258						}
259						}
260
261						=head2 members()
262
263						Returns a list describing the members of the sequence. Each item in
264						the list is a letter or (non-padded) number which can be substituted
265						into the template to obtain the original element
266
267						For members of the same width, order is preserved from the original
268						call to C.
269
270						=cut
271
272						sub members {
273	30		30	1	11852	my $self = shift;
274	30				24	return @{ $self->{members} };
	30				84
275						}
276
277						=head2 in( $string)
278
279						Tells you whether a particular string is in this sequence.
280
281						=cut
282
283						sub in {
284	0		0	1		my $self = shift;
285	0					my $test = shift;
286
287	0					my $re = $self->{re};
288
289	0					return $test =~ m!$re!;
290
291						}
292
293						=head2 re
294
295						Returns the regular expression used to determine whether something
296						is in the sequence or not.
297
298						=cut
299
300						sub re {
301	0		0	1		return $_[0]->{re};
302						}
303
304
305						=head1 AUTHOR
306
307						Simon Wistow
308						Adam Spiers
309
310						=head1 COPYRIGHT
311
312						Copyright (c) 2004 - Simon Wistow
313
314						=head1 BUGS
315
316						Can't insist on sequences being contiguous (yet).
317
318						=head1 SEE ALSO
319
320						=cut
321
322						1;