File Coverage

blib/lib/Date/Extract.pm
Criterion Covered Total %
statement 104 104 100.0
branch 24 26 92.3
condition 42 57 73.6
subroutine 22 22 100.0
pod 2 2 100.0
total 194 211 91.9


line stmt bran cond sub pod time code
1             package Date::Extract;
2 7     7   614747 use strict;
  7         13  
  7         231  
3 7     7   30 use warnings;
  7         7  
  7         176  
4 7     7   4323 use DateTime::Format::Natural;
  7         3507166  
  7         856  
5 7     7   83 use List::Util 'reduce';
  7         14  
  7         589  
6 7     7   42 use parent 'Class::Data::Inheritable';
  7         15  
  7         66  
7              
8             our $VERSION = '0.06';
9              
10             __PACKAGE__->mk_classdata($_) for qw/scalar_downgrade handlers regex/;
11              
12             sub _croak {
13 3     3   26 require Carp;
14 3         520 Carp::croak @_;
15             }
16              
17             sub new {
18 41     41 1 1456714 my $class = shift;
19 41         257 my %args = (
20             format => 'DateTime',
21             returns => 'first',
22             prefers => 'nearest',
23             time_zone => 'floating',
24             @_,
25             );
26              
27 41 100 100     230 if ($args{format} ne 'DateTime'
      100        
28             && $args{format} ne 'verbatim'
29             && $args{format} ne 'epoch') {
30 1         3 _croak "Invalid `format` passed to constructor: expected `DateTime', `verbatim', `epoch'.";
31             }
32              
33 40 100 100     243 if ($args{returns} ne 'first'
      100        
      100        
      100        
      100        
34             && $args{returns} ne 'last'
35             && $args{returns} ne 'earliest'
36             && $args{returns} ne 'latest'
37             && $args{returns} ne 'all'
38             && $args{returns} ne 'all_cron') {
39 1         5 _croak "Invalid `returns` passed to constructor: expected `first', `last', `earliest', `latest', `all', or `all_cron'.";
40             }
41              
42 39 100 66     159 if ($args{prefers} ne 'nearest'
      100        
43             && $args{prefers} ne 'past'
44             && $args{prefers} ne 'future') {
45 1         3 _croak "Invalid `prefers` passed to constructor: expected `nearest', `past', or `future'.";
46             }
47              
48 38   33     225 my $self = bless \%args, ref($class) || $class;
49              
50 38         101 return $self;
51             }
52              
53             # This method will combine the arguments of parser->new and extract. Modify the
54             # "to" hash directly.
55              
56             sub _combine_args {
57 57     57   113 shift;
58              
59 57         112 my $from = shift;
60 57         108 my $to = shift;
61              
62 57   33     496 $to->{format} ||= $from->{format};
63 57   33     379 $to->{prefers} ||= $from->{prefers};
64 57   66     287 $to->{returns} ||= $from->{returns};
65 57   33     380 $to->{time_zone} ||= $from->{time_zone};
66             }
67              
68             sub extract {
69 57     57 1 113677 my $self = shift;
70 57         138 my $text = shift;
71 57         230 my %args = @_;
72              
73             # using extract as a class method
74 57 100       305 $self = $self->new
75             if !ref($self);
76              
77             # combine the arguments of parser->new and this
78 57         275 $self->_combine_args($self, \%args);
79              
80             # when in scalar context, downgrade
81             $args{returns} = $self->_downgrade($args{returns})
82 57 100       393 unless wantarray;
83              
84             # do the work
85 57         301 my @ret = $self->_extract($text, %args);
86              
87             # munge the output to match the desired return type
88 57         370 return $self->_handle($args{returns}, @ret);
89             }
90              
91             # build the giant regex used for parsing. it has to be a single regex, so that
92             # the order of matches is correct.
93             sub _build_regex {
94 5     5   56 my $self = shift;
95              
96 5         14 my $relative = '(?:today|tomorrow|yesterday)';
97              
98 5         8 my $long_weekday = '(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)';
99 5         17 my $short_weekday = '(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)';
100 5         21 my $weekday = "(?:$long_weekday|$short_weekday)";
101              
102 5         15 my $relative_weekday = "(?:(?:next|previous|last)\\s*$weekday)";
103              
104 5         10 my $long_month = '(?:January|February|March|April|May|June|July|August|September|October|November|December)';
105 5         9 my $short_month = '(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)';
106 5         18 my $month = "(?:$long_month|$short_month)";
107              
108             # 1 - 31
109 5         36 my $cardinal_monthday = "(?:[1-9]|[12][0-9]|3[01])";
110 5         17 my $monthday = "(?:$cardinal_monthday(?:st|nd|rd|th)?)";
111              
112 5         26 my $day_month = "(?:$monthday\\s*$month)";
113 5         18 my $month_day = "(?:$month\\s*$monthday)";
114 5         33 my $day_month_year = "(?:(?:$day_month|$month_day)\\s*,?\\s*\\d\\d\\d\\d)";
115              
116 5         10 my $yyyymmdd = "(?:\\d\\d\\d\\d[-/]\\d\\d[-/]\\d\\d)";
117 5         8 my $ddmmyy = "(?:\\d\\d[-/]\\d\\d[-/]\\d\\d)";
118 5         9 my $ddmmyyyy = "(?:\\d\\d[-/]\\d\\d[-/]\\d\\d\\d\\d)";
119              
120 5         13 my $other = $self->_build_more_regex;
121 5 50       19 $other = "|$other"
122             if $other;
123              
124 5         2694 my $regex = qr{
125             \b(
126             $relative # today
127             | $relative_weekday # last Friday
128             | $weekday # Monday
129             | $day_month_year # November 13th, 1986
130             | $day_month # November 13th
131             | $month_day # 13 Nov
132             | $yyyymmdd # 1986/11/13
133             | $ddmmyy # 11-13-86
134             | $ddmmyyyy # 11-13-1986
135             $other # anything from the subclass
136             )\b
137             }ix;
138              
139 5         39 $self->regex($regex);
140             }
141              
142             # this is to be used in subclasses for adding more stuff to the regex
143             # for example, to add support for $foo_bar and $baz_quux, return
144             # "$foo_bar|$baz_quux"
145 5     5   16 sub _build_more_regex { '' }
146              
147             # build the list->scalar downgrade types
148             sub _build_scalar_downgrade {
149 5     5   95 my $self = shift;
150              
151 5         39 $self->scalar_downgrade({
152             all => 'first',
153             all_cron => 'earliest',
154             });
155             }
156              
157             # build the handlers that munge the list of dates to the desired order
158             sub _build_handlers {
159 5     5   80 my $self = shift;
160              
161             $self->handlers({
162             all_cron => sub {
163 1     1   5 sort { DateTime->compare_ignore_floating($a, $b) } @_
  2         47  
164             },
165 1     1   6 all => sub { @_ },
166              
167 2 100   2   32 earliest => sub { reduce { $a < $b ? $a : $b } @_ },
  4         192  
168 1 100   1   14 latest => sub { reduce { $a > $b ? $a : $b } @_ },
  2         113  
169 51     51   599 first => sub { $_[0] },
170 1     1   20 last => sub { $_[-1] },
171 5         110 });
172             }
173              
174             # actually perform the scalar downgrade
175             sub _downgrade {
176 51     51   106 my $self = shift;
177 51         90 my $returns = shift;
178              
179 51   66     282 my $downgrades = $self->scalar_downgrade || $self->_build_scalar_downgrade;
180 51   66     898 return $downgrades->{$returns} || $returns;
181             }
182              
183             sub _handle {
184 57     57   124 my $self = shift;
185 57         121 my $returns = shift;
186              
187 57   66     406 my $handlers = $self->handlers || $self->_build_handlers;
188 57         1082 my $handler = $handlers->{$returns};
189 57 50       345 return defined $handler ? $handler->(@_) : @_
190             }
191              
192             sub _extract {
193 57     57   113 my $self = shift;
194 57         90 my $text = shift;
195 57         202 my %args = @_;
196              
197 57   66     237 my $regex = $self->regex || $self->_build_regex;
198 57         1917 my @gleaned = $text =~ /$regex/g;
199              
200 57 100       307 return @gleaned if $self->{format} eq 'verbatim';
201              
202 56         100 my %dtfn_args;
203             $dtfn_args{prefer_future} = 1
204 56 100 66     436 if $args{prefers} && $args{prefers} eq 'future';
205 56         137 $dtfn_args{time_zone} = $args{time_zone};
206              
207 56         434 my $parser = DateTime::Format::Natural->new(%dtfn_args);
208 56         225642 my @ret;
209 56         197 for (@gleaned) {
210 78         1248 my $dt = $parser->parse_datetime($_);
211             push @ret, $dt->set_time_zone($args{time_zone})
212 78 100       538613 if $parser->success;
213             }
214              
215 56 100       3161 if ($self->{format} eq 'epoch') {
216 1         3 return map { $_->epoch } @ret;
  3         22  
217             }
218 55         3832 return @ret;
219             }
220              
221             1;
222              
223             __END__
224              
225             =head1 NAME
226              
227             Date::Extract - extract probable dates from strings
228              
229             =head1 SYNOPSIS
230              
231             my $parser = Date::Extract->new();
232             my $dt = $parser->extract($arbitrary_text)
233             or die "No date found.";
234             return $dt->ymd;
235              
236             =head1 MOTIVATION
237              
238             There are already a few modules for getting a date out of a string.
239             L<DateTime::Format::Natural> should be your first choice. There's also
240             L<Time::ParseDate> which fits many formats. Finally, you can coerce
241             L<Date::Manip> to do your bidding.
242              
243             But I needed something that will take an arbitrary block of text, search it for
244             something that looks like a date string, and extract it. This module fills this
245             niche. By design it will produce few false positives. This means it will not
246             catch nearly everything that looks like a date string. So if you have the string
247             "do homework for class 2019" it won't return a L<DateTime> object with the year
248             set to 2019. This is what your users would probably expect.
249              
250             =head1 METHODS
251              
252             =head2 new PARAMHASH => C<Date::Extract>
253              
254             =head3 arguments
255              
256             =over 4
257              
258             =item format
259              
260             Choose what format the extracted date(s) will be. The default is "DateTime",
261             which will return L<DateTime> object(s). Other option include "verbatim" (return
262             the original text), or "epoch" (return Unix timestamp).
263              
264             =item time_zone
265              
266             Only relevant when C<format> is set to "DateTime".
267              
268             Forces a particular time zone to be set (this actually matters, as "tomorrow"
269             on Monday at 11 PM means something different than "tomorrow" on Tuesday at 1
270             AM).
271              
272             By default it will use the "floating" time zone. See the documentation for
273             L<DateTime>.
274              
275             This controls both the input time zone and output time zone.
276              
277             =item prefers
278              
279             This argument decides what happens when an ambiguous date appears in the
280             input. For example, "Friday" may refer to any number of Fridays. The valid
281             options for this argument are:
282              
283             =over 4
284              
285             =item nearest
286              
287             Prefer the nearest date. This is the default.
288              
289             =item future
290              
291             Prefer the closest future date.
292              
293             =item past
294              
295             Prefer the closest past date. B<NOT YET SUPPORTED>.
296              
297             =back
298              
299             =item returns
300              
301             If the text has multiple possible dates, then this argument determines which
302             date will be returned. By default it's 'first'.
303              
304             =over 4
305              
306             =item first
307              
308             Returns the first date found in the string.
309              
310             =item last
311              
312             Returns the final date found in the string.
313              
314             =item earliest
315              
316             Returns the date found in the string that chronologically precedes any other
317             date in the string.
318              
319             =item latest
320              
321             Returns the date found in the string that chronologically follows any other
322             date in the string.
323              
324             =item all
325              
326             Returns all dates found in the string, in the order they were found in the
327             string.
328              
329             =item all_cron
330              
331             Returns all dates found in the string, in chronological order.
332              
333             =back
334              
335             =back
336              
337             =head2 extract text, ARGS => dates
338              
339             Takes an arbitrary amount of text and extracts one or more dates from it. The
340             return value will be zero or more dates, which by default are L<DateTime>
341             objects (but can be customized with the C<format> argument). If called in scalar
342             context, only one will be returned, even if the C<returns> argument specifies
343             multiple possible return values.
344              
345             See the documentation of C<new> for the configuration of this method. Any
346             arguments passed into this method will trump those from the constructor.
347              
348             You may reuse a parser for multiple calls to C<extract>.
349              
350             You do not need to have an instantiated C<Date::Extract> object to call this
351             method. Just C<< Date::Extract->extract($foo) >> will work.
352              
353             =head1 FORMATS HANDLED
354              
355             =over 4
356              
357             =item * today; tomorrow; yesterday
358              
359             =item * last Friday; next Monday; previous Sat
360              
361             =item * Monday; Mon
362              
363             =item * November 13th, 1986; Nov 13, 1986
364              
365             =item * 13 November 1986; 13 Nov 1986
366              
367             =item * November 13th; Nov 13
368              
369             =item * 13 Nov; 13th November
370              
371             =item * 1986/11/13; 1986-11-13
372              
373             =item * 11-13-86; 11/13/1986
374              
375             =back
376              
377             =head1 CAVEATS
378              
379             This module is I<intentionally> very simple. Surprises are I<not> welcome
380             here.
381              
382             =head1 SEE ALSO
383              
384             L<DateTime::Format::Natural>, L<Time::ParseDate>, L<Date::Manip>
385              
386             =head1 AUTHOR
387              
388             Shawn M Moore, C<< <sartak at bestpractical dot com> >>
389              
390             =head1 ACKNOWLEDGEMENTS
391              
392             Thanks to Steven Schubiger for writing the fine L<DateTime::Format::Natural>.
393             We still use it, but it doesn't quite fill all the particular needs we have.
394              
395             =head1 COPYRIGHT & LICENSE
396              
397             Copyright 2007-2009 Best Practical Solutions.
398              
399             This program is free software; you can redistribute it and/or modify it
400             under the same terms as Perl itself.
401              
402             =cut
403