File Coverage

blib/lib/DateTime/Format/Natural/Extract.pm

Criterion	Covered	Total	%
statement	160	160	100.0
branch	54	56	96.4
condition	48	51	94.1
subroutine	13	13	100.0
pod			n/a
total	275	280	98.2

line	stmt	bran	cond	sub	time	code
1						package DateTime::Format::Natural::Extract;
2
3	26			26	233	use strict;
	26				99
	26				809
4	26			26	160	use warnings;
	26				60
	26				808
5	26				14520	use base qw(
6						DateTime::Format::Natural::Duration::Checks
7						DateTime::Format::Natural::Formatted
8	26			26	158	);
	26				64
9	26			26	218	use boolean qw(true false);
	26				76
	26				143
10
11	26			26	1640	use constant DATE_TYPE => 0x01;
	26				69
	26				1462
12	26			26	165	use constant GRAMMAR_TYPE => 0x02;
	26				71
	26				1353
13	26			26	178	use constant DURATION_TYPE => 0x04;
	26				97
	26				1475
14
15	26			26	213	use DateTime::Format::Natural::Utils qw(trim);
	26				120
	26				60222
16
17						our $VERSION = '0.13';
18
19						my %grammar_durations = map { $_ => true } qw(for_count_unit);
20
21						my $get_range = sub
22						{
23						my ($aref, $index) = @_;
24						return [ grep defined, @$aref[$index, $index + 1] ];
25						};
26
27						my $extract_duration = sub
28						{
29						my ($skip, $indexes, $index) = @_;
30
31						return false unless defined $indexes->[$index] && defined $indexes->[$index + 1];
32						my ($left_index, $right_index) = ($indexes->[$index][1], $indexes->[$index + 1][0]);
33
34						return ($skip->{$left_index} \|\| $skip->{$right_index}) ? false : true;
35						};
36
37						sub _extract_expressions
38						{
39	209			209	473	my $self = shift;
40	209				558	my ($extract_string) = @_;
41
42	209				622	$extract_string =~ s/^[,;.]//;
43	209				709	$extract_string =~ s/[,;.]$//;
44
45	209				1013	while ($extract_string =~ /([,;.])/g) {
46	18				67	my $mark = $1;
47	18				191	my %patterns = (
48						',' => qr/(?!\d{4})/,
49						';' => qr/(?=\w)/,
50						'.' => qr/(?=\w)/,
51						);
52	18				54	my $pattern = $patterns{$mark};
53	18				338	$extract_string =~ s/\Q$mark\E \s+? $pattern/ [token] /x; # pretend punctuation marks are tokens
54						}
55
56	209				1599	my $timespan_sep = $self->{data}->__timespan('literal');
57
58	209				2003	1 while $extract_string =~ s/^$timespan_sep\s+//i;
59	209				1526	1 while $extract_string =~ s/\s+$timespan_sep$//i;
60
61	209				1217	$self->_rewrite(\$extract_string);
62
63	209				2019	my @tokens = split /\s+/, $extract_string;
64	209				576	my %entries = %{$self->{data}->__grammar('')};
	209				1268
65
66	209				1088	my (@expressions, %skip);
67
68	209	100			1470	if ($extract_string =~ /\s+ $timespan_sep \s+/ix) {
69	97				680	$self->_extract_duration($extract_string, \@tokens, \@expressions, \%skip);
70						}
71
72	209				2583	my (%expand, %lengths);
73	209				2040	foreach my $keyword (keys %entries) {
74	14003				26285	$expand{$keyword} = $self->_expand_for($keyword);
75	14003				41784	$lengths{$keyword} = @{$entries{$keyword}->[0]};
	14003				27494
76						}
77
78	209				924	my $seen_expression;
79	209				415	do {
80	439				2915	$seen_expression = false;
81	439				1464	my $date_index;
82	439				2027	for (my $i = 0; $i < @tokens; $i++) {
83	1844	100			16416	next if $skip{$i};
84	1085	100			2826	if ($self->_check_for_date($tokens[$i], $i, \$date_index)) {
85	16				216	last;
86						}
87						}
88						GRAMMAR:
89	439				8937	foreach my $keyword (sort { $lengths{$b} <=> $lengths{$a} } grep { $lengths{$_} <= @tokens } keys %entries) {
	98371				137588
	29413				54827
90	19338				380549	my @grammar = @{$entries{$keyword}};
	19338				95731
91	19338				32450	my $types_entry = shift @grammar;
92	19338				53804	my @grammars = [ [ @grammar ], false ];
93	19338	100	100		83236	if ($expand{$keyword} && @$types_entry + 1 <= @tokens) {
94	4431				48310	@grammar = $self->_expand($keyword, $types_entry, \@grammar);
95	4431				19090	unshift @grammars, [ [ @grammar ], true ];
96						}
97	19338				132463	foreach my $grammar (@grammars) {
98	23766				67048	my $expanded = $grammar->[1];
99	23766				42567	my $length = $lengths{$keyword};
100	23766	100			50486	$length++ if $expanded;
101	23766				161979	foreach my $entry (@{$grammar->[0]}) {
	23766				51492
102	150471	100			1032607	my ($types, $expression) = $expanded ? @$entry : ($types_entry, $entry);
103	150471				1115610	my $definition = $expression->[0];
104	150471				279784	my $matched = false;
105	150471				403738	my $pos = 0;
106	150471				227682	my @indexes;
107						my $date_index;
108	150471				328388	for (my $i = 0; $i < @tokens; $i++) {
109	842658	100			5805733	next if $skip{$i};
110	492285	100			907164	last unless defined $types->[$pos];
111	492193	100			1019170	if ($self->_check_for_date($tokens[$i], $i, \$date_index)) {
112	9905				115897	next;
113						}
114	482288	100	66		8188431	if ($types->[$pos] eq 'SCALAR' && defined $definition->{$pos} && $tokens[$i] =~ /^$definition->{$pos}$/i
		100	100
		100	100
			66
			100
115						or $types->[$pos] eq 'REGEXP' && $tokens[$i] =~ $definition->{$pos}
116						&& (@indexes ? ($i - $indexes[-1] == 1) : true)
117						) {
118	7879				91485	$matched = true;
119	7879				25665	push @indexes, $i;
120	7879				21517	$pos++;
121						}
122						elsif ($matched) {
123	5239				37162	last;
124						}
125						}
126	150471	100	100		1164630	if ($matched
		100	100
127						&& @indexes == $length
128						&& (defined $date_index ? ($indexes[0] - $date_index == 1) : true)
129						) {
130	223	100			6170	my $expression = join ' ', (defined $date_index ? $tokens[$date_index] : (), @tokens[@indexes]);
131	223	100			841	my $start_index = defined $date_index ? $indexes[0] - 1 : $indexes[0];
132	223	100			781	my $type = $grammar_durations{$keyword} ? DURATION_TYPE : GRAMMAR_TYPE;
133	223				1365	push @expressions, [ [ $start_index, $indexes[-1] ], $expression, { flags => $type } ];
134	223	100			1440	$skip{$_} = true foreach (defined $date_index ? $date_index : (), @indexes);
135	223				1428	$seen_expression = true;
136	223				2117	last GRAMMAR;
137						}
138						}
139						}
140						}
141	439	100	100		7950	if (defined $date_index && !$seen_expression) {
142	7				166	push @expressions, [ [ ($date_index) x 2 ], $tokens[$date_index], { flags => DATE_TYPE } ];
143	7				27	$skip{$date_index} = true;
144	7				39	$seen_expression = true;
145						}
146						} while ($seen_expression);
147
148	209				2780	return $self->_finalize_expressions(\@expressions, \@tokens);
149						}
150
151						sub _extract_duration
152						{
153	97			97	277	my $self = shift;
154	97				336	my ($extract_string, $tokens, $expressions, $skip) = @_;
155
156	97				668	my $timespan_sep = $self->{data}->__timespan('literal');
157
158	97				1636	my @strings = grep /\S/, map trim($_), split /\b $timespan_sep \b/ix, $extract_string;
159	97	50			555	if (@strings) {
160	97				254	my $index = 0;
161	97				292	my @indexes;
162	97				284	foreach my $string (@strings) {
163	208				793	my @string_tokens = split /\s+/, $string;
164	208				637	push @indexes, [ $index, $index + $#string_tokens ];
165	208				435	$index += $#string_tokens + 1;
166	208		100		1985	$index++ while defined $tokens->[$index] && $tokens->[$index] =~ /^$timespan_sep$/i;
167						}
168						DURATION: {
169	97				319	for (my $i = 0; $i <= $#strings - 1; $i++) {
	160				851
170	179	100			893	next unless $extract_duration->($skip, \@indexes, $i);
171	111				1753	my $save_expression = false;
172	111				577	my @chunks;
173	111				363	foreach my $extract (qw(_first_to_last_extract _from_count_to_count_extract)) {
174	206	100			2168	if ($self->$extract($get_range->(\@strings, $i), $get_range->(\@indexes, $i), $tokens, \@chunks)) {
175	63				1378	$save_expression = true;
176	63				253	last;
177						}
178						}
179	111	100			1742	if ($save_expression) {
180	63				702	my $timespan_sep_index = $chunks[0]->[0][1] + 1;
181	63				353	my $expression = join ' ', ($chunks[0]->[1], $tokens->[$timespan_sep_index], $chunks[1]->[1]);
182	63				234	my @indexes = ($chunks[0]->[0][0], $chunks[1]->[0][1]);
183	63				355	push @$expressions, [ [ @indexes ], $expression, { flags => DURATION_TYPE } ];
184	63				331	$skip->{$_} = true foreach ($indexes[0] .. $indexes[1]);
185	63				990	redo DURATION;
186						}
187						}
188						}
189						}
190						}
191
192						sub _finalize_expressions
193						{
194	209			209	519	my $self = shift;
195	209				661	my ($expressions, $tokens) = @_;
196
197	209				1672	my $timespan_sep = $self->{data}->__timespan('literal');
198	209				596	my (@duration_indexes, @final_expressions);
199
200	209				721	my $seen_duration = false;
201
202	209				1474	my @expressions = sort { $a->[0][0] <=> $b->[0][0] } @$expressions;
	97				615
203
204	209				1060	for (my $i = 0; $i < @expressions; $i++) {
205	293				749	my $expression = $expressions[$i];
206
207	293				911	my $prev = $expression->[0][0] - 1;
208	293				678	my $next = $expression->[0][1] + 1;
209
210	293	100	100		2006	if ($expression->[2]->{flags} & DATE_TYPE
		50
211						\|\| $expression->[2]->{flags} & GRAMMAR_TYPE
212						) {
213	224	100	100		768	if (!$seen_duration
		100	100
			100
			66
			100
			100
214						&& defined $tokens->[$next]
215						&& $tokens->[$next] =~ /^$timespan_sep$/i
216						&& defined $expressions[$i + 1]
217						&& ($expressions[$i + 1]->[2]->{flags} & DATE_TYPE
218						\|\| $expressions[$i + 1]->[2]->{flags} & GRAMMAR_TYPE)
219						&& $expressions[$i + 1]->[0][0] - $next == 1
220						) {
221	31				1313	push @duration_indexes, ($expression->[0][0] .. $expression->[0][1]);
222	31				118	$seen_duration = true;
223						}
224						elsif ($seen_duration) {
225	31				838	push @duration_indexes, ($prev, $expression->[0][0] .. $expression->[0][1]);
226	31				197	push @final_expressions, join ' ', @$tokens[@duration_indexes];
227	31				83	@duration_indexes = ();
228	31				100	$seen_duration = false;
229						}
230						else {
231	162				4690	push @final_expressions, $expression->[1];
232						}
233						}
234						elsif ($expression->[2]->{flags} & DURATION_TYPE) {
235	69				327	push @final_expressions, $expression->[1];
236						}
237						}
238
239	209			262	1514	my $exclude = sub { $_[0] =~ /^\d{1,2}$/ };
	262				7554
240
241	209				836	return grep !$exclude->($_), @final_expressions;
242						}
243
244						sub _check_for_date
245						{
246	493278			493278	734295	my $self = shift;
247	493278				904447	my ($token, $index, $date_index) = @_;
248
249	493278				2081633	my ($formatted) = $token =~ $self->{data}->__regexes('format');
250	493278				1337774	my %count = $self->_count_separators($formatted);
251	493278	100			1068317	if ($self->_check_formatted('ymd', \%count)) {
252	9921				18619	$$date_index = $index;
253	9921				23090	return true;
254						}
255						else {
256	483357				1134651	return false;
257						}
258						}
259
260						1;
261						__END__
262
263						=head1 NAME
264
265						DateTime::Format::Natural::Extract - Extract parsable expressions from strings
266
267						=head1 SYNOPSIS
268
269						Please see the DateTime::Format::Natural documentation.
270
271						=head1 DESCRIPTION
272
273						C<DateTime::Format::Natural::Extract> extracts expressions from strings to be
274						processed by the parse methods.
275
276						=head1 SEE ALSO
277
278						L<DateTime::Format::Natural>
279
280						=head1 AUTHOR
281
282						Steven Schubiger <schubiger@cpan.org>
283
284						=head1 LICENSE
285
286						This program is free software; you may redistribute it and/or
287						modify it under the same terms as Perl itself.
288
289						See L<http://dev.perl.org/licenses/>
290
291						=cut