File Coverage

blib/lib/Date/Extract.pm
Criterion Covered Total %
statement 104 104 100.0
branch 24 26 92.3
condition 42 57 73.6
subroutine 22 22 100.0
pod 2 2 100.0
total 194 211 91.9


line stmt bran cond sub pod time code
1             package Date::Extract; # git description: 0.06-9-g11a9993
2             # ABSTRACT: Extract probable dates from strings
3              
4 6     6   655951 use strict;
  6         73  
  6         181  
5 6     6   32 use warnings;
  6         10  
  6         158  
6 6     6   3496 use DateTime::Format::Natural;
  6         3650785  
  6         386  
7 6     6   79 use List::Util 'reduce';
  6         14  
  6         381  
8 6     6   41 use parent 'Class::Data::Inheritable';
  6         14  
  6         57  
9              
10             our $VERSION = '0.07';
11              
12             __PACKAGE__->mk_classdata($_) for qw/scalar_downgrade handlers regex/;
13              
14             sub _croak {
15 3     3   18 require Carp;
16 3         461 Carp::croak @_;
17             }
18              
19             sub new {
20 41     41 1 44907 my $class = shift;
21 41         246 my %args = (
22             format => 'DateTime',
23             returns => 'first',
24             prefers => 'nearest',
25             time_zone => 'floating',
26             @_,
27             );
28              
29 41 100 100     231 if ($args{format} ne 'DateTime'
      100        
30             && $args{format} ne 'verbatim'
31             && $args{format} ne 'epoch') {
32 1         6 _croak "Invalid `format` passed to constructor: expected `DateTime', `verbatim', `epoch'.";
33             }
34              
35 40 100 100     173 if ($args{returns} ne 'first'
      100        
      100        
      100        
      100        
36             && $args{returns} ne 'last'
37             && $args{returns} ne 'earliest'
38             && $args{returns} ne 'latest'
39             && $args{returns} ne 'all'
40             && $args{returns} ne 'all_cron') {
41 1         6 _croak "Invalid `returns` passed to constructor: expected `first', `last', `earliest', `latest', `all', or `all_cron'.";
42             }
43              
44 39 100 66     142 if ($args{prefers} ne 'nearest'
      100        
45             && $args{prefers} ne 'past'
46             && $args{prefers} ne 'future') {
47 1         6 _croak "Invalid `prefers` passed to constructor: expected `nearest', `past', or `future'.";
48             }
49              
50 38   33     186 my $self = bless \%args, ref($class) || $class;
51              
52 38         125 return $self;
53             }
54              
55             # This method will combine the arguments of parser->new and extract. Modify the
56             # "to" hash directly.
57              
58             sub _combine_args {
59 57     57   97 shift;
60              
61 57         102 my $from = shift;
62 57         100 my $to = shift;
63              
64 57   33     321 $to->{format} ||= $from->{format};
65 57   33     238 $to->{prefers} ||= $from->{prefers};
66 57   66     196 $to->{returns} ||= $from->{returns};
67 57   33     195 $to->{time_zone} ||= $from->{time_zone};
68             }
69              
70             sub extract {
71 57     57 1 91076 my $self = shift;
72 57         121 my $text = shift;
73 57         129 my %args = @_;
74              
75             # using extract as a class method
76 57 100       196 $self = $self->new
77             if !ref($self);
78              
79             # combine the arguments of parser->new and this
80 57         218 $self->_combine_args($self, \%args);
81              
82             # when in scalar context, downgrade
83             $args{returns} = $self->_downgrade($args{returns})
84 57 100       239 unless wantarray;
85              
86             # do the work
87 57         209 my @ret = $self->_extract($text, %args);
88              
89             # munge the output to match the desired return type
90 57         1641 return $self->_handle($args{returns}, @ret);
91             }
92              
93             # build the giant regex used for parsing. it has to be a single regex, so that
94             # the order of matches is correct.
95             sub _build_regex {
96 5     5   78 my $self = shift;
97              
98 5         12 my $relative = '(?:today|tomorrow|yesterday)';
99              
100 5         11 my $long_weekday = '(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)';
101 5         10 my $short_weekday = '(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)';
102 5         36 my $weekday = "(?:$long_weekday|$short_weekday)";
103              
104 5         20 my $relative_weekday = "(?:(?:next|previous|last)\\s*$weekday)";
105              
106 5         12 my $long_month = '(?:January|February|March|April|May|June|July|August|September|October|November|December)';
107 5         12 my $short_month = '(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)';
108 5         20 my $month = "(?:$long_month|$short_month)";
109              
110             # 1 - 31
111 5         11 my $cardinal_monthday = "(?:[1-9]|[12][0-9]|3[01])";
112 5         12 my $monthday = "(?:$cardinal_monthday(?:st|nd|rd|th)?)";
113              
114 5         18 my $day_month = "(?:$monthday\\s*$month)";
115 5         22 my $month_day = "(?:$month\\s*$monthday)";
116 5         36 my $day_month_year = "(?:(?:$day_month|$month_day)\\s*,?\\s*\\d\\d\\d\\d)";
117              
118 5         10 my $yyyymmdd = "(?:\\d\\d\\d\\d[-/]\\d\\d[-/]\\d\\d)";
119 5         13 my $ddmmyy = "(?:\\d\\d[-/]\\d\\d[-/]\\d\\d)";
120 5         8 my $ddmmyyyy = "(?:\\d\\d[-/]\\d\\d[-/]\\d\\d\\d\\d)";
121              
122 5         25 my $other = $self->_build_more_regex;
123 5 50       18 $other = "|$other"
124             if $other;
125              
126 5         2398 my $regex = qr{
127             \b(
128             $relative # today
129             | $relative_weekday # last Friday
130             | $weekday # Monday
131             | $day_month_year # November 13th, 1986
132             | $day_month # November 13th
133             | $month_day # 13 Nov
134             | $yyyymmdd # 1986/11/13
135             | $ddmmyy # 11-13-86
136             | $ddmmyyyy # 11-13-1986
137             $other # anything from the subclass
138             )\b
139             }ix;
140              
141 5         44 $self->regex($regex);
142             }
143              
144             # this is to be used in subclasses for adding more stuff to the regex
145             # for example, to add support for $foo_bar and $baz_quux, return
146             # "$foo_bar|$baz_quux"
147 5     5   16 sub _build_more_regex { '' }
148              
149             # build the list->scalar downgrade types
150             sub _build_scalar_downgrade {
151 5     5   80 my $self = shift;
152              
153 5         28 $self->scalar_downgrade({
154             all => 'first',
155             all_cron => 'earliest',
156             });
157             }
158              
159             # build the handlers that munge the list of dates to the desired order
160             sub _build_handlers {
161 5     5   104 my $self = shift;
162              
163             $self->handlers({
164             all_cron => sub {
165 1     1   7 sort { DateTime->compare_ignore_floating($a, $b) } @_
  2         79  
166             },
167 1     1   5 all => sub { @_ },
168              
169 2 100   2   26 earliest => sub { reduce { $a < $b ? $a : $b } @_ },
  4         200  
170 1 100   1   8 latest => sub { reduce { $a > $b ? $a : $b } @_ },
  2         86  
171 51     51   245 first => sub { $_[0] },
172 1     1   10 last => sub { $_[-1] },
173 5         103 });
174             }
175              
176             # actually perform the scalar downgrade
177             sub _downgrade {
178 51     51   79 my $self = shift;
179 51         88 my $returns = shift;
180              
181 51   66     174 my $downgrades = $self->scalar_downgrade || $self->_build_scalar_downgrade;
182 51   66     593 return $downgrades->{$returns} || $returns;
183             }
184              
185             sub _handle {
186 57     57   112 my $self = shift;
187 57         106 my $returns = shift;
188              
189 57   66     198 my $handlers = $self->handlers || $self->_build_handlers;
190 57         574 my $handler = $handlers->{$returns};
191 57 50       223 return defined $handler ? $handler->(@_) : @_
192             }
193              
194             sub _extract {
195 57     57   89 my $self = shift;
196 57         90 my $text = shift;
197 57         208 my %args = @_;
198              
199 57   66     162 my $regex = $self->regex || $self->_build_regex;
200 57         1443 my @gleaned = $text =~ /$regex/g;
201              
202 57 100       194 return @gleaned if $self->{format} eq 'verbatim';
203              
204 56         86 my %dtfn_args;
205             $dtfn_args{prefer_future} = 1
206 56 100 66     245 if $args{prefers} && $args{prefers} eq 'future';
207 56         112 $dtfn_args{time_zone} = $args{time_zone};
208              
209 56         264 my $parser = DateTime::Format::Natural->new(%dtfn_args);
210 56         240549 my @ret;
211 56         148 for (@gleaned) {
212 78         1086 my $dt = $parser->parse_datetime($_);
213             push @ret, $dt->set_time_zone($args{time_zone})
214 78 100       474564 if $parser->success;
215             }
216              
217 56 100       2299 if ($self->{format} eq 'epoch') {
218 1         4 return map { $_->epoch } @ret;
  3         24  
219             }
220 55         1270 return @ret;
221             }
222              
223             1;
224              
225             __END__
226              
227             =pod
228              
229             =encoding UTF-8
230              
231             =head1 NAME
232              
233             Date::Extract - Extract probable dates from strings
234              
235             =head1 VERSION
236              
237             version 0.07
238              
239             =head1 SYNOPSIS
240              
241             my $parser = Date::Extract->new();
242             my $dt = $parser->extract($arbitrary_text)
243             or die "No date found.";
244             return $dt->ymd;
245              
246             =head1 MOTIVATION
247              
248             There are already a few modules for getting a date out of a string.
249             L<DateTime::Format::Natural> should be your first choice. There's also
250             L<Time::ParseDate> which fits many formats. Finally, you can coerce
251             L<Date::Manip> to do your bidding.
252              
253             But I needed something that will take an arbitrary block of text, search it for
254             something that looks like a date string, and extract it. This module fills this
255             niche. By design it will produce few false positives. This means it will not
256             catch nearly everything that looks like a date string. So if you have the string
257             "do homework for class 2019" it won't return a L<DateTime> object with the year
258             set to 2019. This is what your users would probably expect.
259              
260             =head1 METHODS
261              
262             =head2 C<new(PARAMHASH)> => C<Date::Extract>
263              
264             =head3 arguments
265              
266             =over 4
267              
268             =item format
269              
270             Choose what format the extracted date(s) will be. The default is "DateTime",
271             which will return L<DateTime> object(s). Other option include "verbatim" (return
272             the original text), or "epoch" (return Unix timestamp).
273              
274             =item time_zone
275              
276             Only relevant when C<format> is set to "DateTime".
277              
278             Forces a particular time zone to be set (this actually matters, as "tomorrow"
279             on Monday at 11 PM means something different than "tomorrow" on Tuesday at 1
280             AM).
281              
282             By default it will use the "floating" time zone. See the documentation for
283             L<DateTime>.
284              
285             This controls both the input time zone and output time zone.
286              
287             =item prefers
288              
289             This argument decides what happens when an ambiguous date appears in the
290             input. For example, "Friday" may refer to any number of Fridays. The valid
291             options for this argument are:
292              
293             =over 4
294              
295             =item nearest
296              
297             Prefer the nearest date. This is the default.
298              
299             =item future
300              
301             Prefer the closest future date.
302              
303             =item past
304              
305             Prefer the closest past date. B<NOT YET SUPPORTED>.
306              
307             =back
308              
309             =item returns
310              
311             If the text has multiple possible dates, then this argument determines which
312             date will be returned. By default it's 'first'.
313              
314             =over 4
315              
316             =item first
317              
318             Returns the first date found in the string.
319              
320             =item last
321              
322             Returns the final date found in the string.
323              
324             =item earliest
325              
326             Returns the date found in the string that chronologically precedes any other
327             date in the string.
328              
329             =item latest
330              
331             Returns the date found in the string that chronologically follows any other
332             date in the string.
333              
334             =item all
335              
336             Returns all dates found in the string, in the order they were found in the
337             string.
338              
339             =item all_cron
340              
341             Returns all dates found in the string, in chronological order.
342              
343             =back
344              
345             =back
346              
347             =head2 C<extract(text, ARGS) => dates
348              
349             Takes an arbitrary amount of text and extracts one or more dates from it. The
350             return value will be zero or more dates, which by default are L<DateTime>
351             objects (but can be customized with the C<format> argument). If called in scalar
352             context, only one will be returned, even if the C<returns> argument specifies
353             multiple possible return values.
354              
355             See the documentation of C<new> for the configuration of this method. Any
356             arguments passed into this method will trump those from the constructor.
357              
358             You may reuse a parser for multiple calls to C<extract>.
359              
360             You do not need to have an instantiated C<Date::Extract> object to call this
361             method. Just C<< Date::Extract->extract($foo) >> will work.
362              
363             =head1 FORMATS HANDLED
364              
365             =over 4
366              
367             =item * C<today>; C<tomorrow>; C<yesterday>
368              
369             =item * C<last Friday>; C<next Monday>; C<previous Sat>
370              
371             =item * C<Monday>; C<Mon>
372              
373             =item * C<November 13th, 1986>; C<Nov 13, 1986>
374              
375             =item * C<13 November 1986>; C<13 Nov 1986>
376              
377             =item * C<November 13th>; C<Nov 13>
378              
379             =item * C<13 Nov>; C<13th November>
380              
381             =item * C<1986/11/13>; C<1986-11-13>
382              
383             =item * C<11-13-86>; C<11/13/1986>
384              
385             =back
386              
387             =head1 CAVEATS
388              
389             This module is I<intentionally> very simple. Surprises are I<not> welcome
390             here.
391              
392             =head1 SEE ALSO
393              
394             L<DateTime::Format::Natural>, L<Time::ParseDate>, L<Date::Manip>
395              
396             =head1 ACKNOWLEDGEMENTS
397              
398             =for :stopwords Schubiger
399              
400             Thanks to Steven Schubiger for writing the fine L<DateTime::Format::Natural>.
401             We still use it, but it doesn't quite fill all the particular needs we have.
402              
403             =head1 SUPPORT
404              
405             Bugs may be submitted through L<the RT bug tracker|https://rt.cpan.org/Public/Dist/Display.html?Name=Date-Extract>
406             (or L<bug-Date-Extract@rt.cpan.org|mailto:bug-Date-Extract@rt.cpan.org>).
407              
408             I am also usually active on irc, as 'ether' at C<irc.perl.org> and C<irc.libera.chat>.
409              
410             =head1 AUTHOR
411              
412             Shawn M Moore, <sartak@gmail.com>
413              
414             =head1 CONTRIBUTORS
415              
416             =for stopwords Karen Etheridge Steven Haryanto Jim Brandt Alex Vandiver Jesse Vincent Thomas Sibley
417              
418             =over 4
419              
420             =item *
421              
422             Karen Etheridge <ether@cpan.org>
423              
424             =item *
425              
426             Steven Haryanto <stevenharyanto@gmail.com>
427              
428             =item *
429              
430             Jim Brandt <jbrandt@bestpractical.com>
431              
432             =item *
433              
434             Alex Vandiver <alex@chmrr.net>
435              
436             =item *
437              
438             Jesse Vincent <jesse@bestpractical.com>
439              
440             =item *
441              
442             Thomas Sibley <trs@bestpractical.com>
443              
444             =back
445              
446             =head1 COPYRIGHT AND LICENCE
447              
448             This software is copyright (c) 2007 by Best Practical Solutions.
449              
450             This is free software; you can redistribute it and/or modify it under
451             the same terms as the Perl 5 programming language system itself.
452              
453             =cut