File Coverage

blib/lib/Date/Extract/PERLANCAR.pm
Criterion Covered Total %
statement 109 114 95.6
branch 22 28 78.5
condition 30 60 50.0
subroutine 22 23 95.6
pod 2 2 100.0
total 185 227 81.5


line stmt bran cond sub pod time code
1             package Date::Extract::PERLANCAR;
2              
3             our $DATE = '2017-06-13'; # DATE
4             our $VERSION = '0.001'; # VERSION
5              
6 5     5   259183 use 5.010001;
  5         20  
7 5     5   24 use strict;
  5         12  
  5         88  
8 5     5   21 use warnings;
  5         10  
  5         128  
9              
10 5     5   2299 use DateTime::Format::Natural;
  5         2307536  
  5         343  
11 5     5   50 use List::Util 'reduce';
  5         14  
  5         273  
12 5     5   31 use parent 'Class::Data::Inheritable';
  5         13  
  5         39  
13              
14             __PACKAGE__->mk_classdata($_) for qw/scalar_downgrade handlers regex/;
15              
16             sub _croak {
17 0     0   0 require Carp;
18 0         0 Carp::croak @_;
19             }
20              
21             sub new {
22 26     26 1 24082 my $class = shift;
23 26         133 my %args = (
24             format => 'DateTime',
25             returns => 'first',
26             prefers => 'nearest',
27             time_zone => 'floating',
28             @_,
29             );
30              
31 26 50 100     134 if ($args{format} ne 'DateTime'
      100        
      66        
32             && $args{format} ne 'verbatim'
33             && $args{format} ne 'epoch'
34             && $args{format} ne 'combined') {
35 0         0 _croak "Invalid `format` passed to constructor: expected `DateTime', `verbatim', `epoch', `combined'.";
36             }
37              
38 26 0 33     117 if ($args{returns} ne 'first'
      33        
      0        
      0        
      0        
39             && $args{returns} ne 'last'
40             && $args{returns} ne 'earliest'
41             && $args{returns} ne 'latest'
42             && $args{returns} ne 'all'
43             && $args{returns} ne 'all_cron') {
44 0         0 _croak "Invalid `returns` passed to constructor: expected `first', `last', `earliest', `latest', `all', or `all_cron'.";
45             }
46              
47 26 50 66     109 if ($args{prefers} ne 'nearest'
      66        
48             && $args{prefers} ne 'past'
49             && $args{prefers} ne 'future') {
50 0         0 _croak "Invalid `prefers` passed to constructor: expected `nearest', `past', or `future'.";
51             }
52              
53 26   33     139 my $self = bless \%args, ref($class) || $class;
54              
55 26         66 return $self;
56             }
57              
58             # This method will combine the arguments of parser->new and extract. Modify the
59             # "to" hash directly.
60              
61             sub _combine_args {
62 60     60   111 shift;
63              
64 60         114 my $from = shift;
65 60         110 my $to = shift;
66              
67 60   33     439 $to->{format} ||= $from->{format};
68 60   33     330 $to->{prefers} ||= $from->{prefers};
69 60   66     283 $to->{returns} ||= $from->{returns};
70 60   33     266 $to->{time_zone} ||= $from->{time_zone};
71             }
72              
73             sub extract {
74 60     60 1 65825 my $self = shift;
75 60         132 my $text = shift;
76 60         162 my %args = @_;
77              
78             # using extract as a class method
79 60 100       220 $self = $self->new
80             if !ref($self);
81              
82             # combine the arguments of parser->new and this
83 60         241 $self->_combine_args($self, \%args);
84              
85             # when in scalar context, downgrade
86             $args{returns} = $self->_downgrade($args{returns})
87 60 100       247 unless wantarray;
88              
89             # do the work
90 60         236 my @ret = $self->_extract($text, %args);
91              
92             # munge the output to match the desired return type
93 60         1693 return $self->_handle($args{returns}, @ret);
94             }
95              
96             # build the giant regex used for parsing. it has to be a single regex, so that
97             # the order of matches is correct.
98             sub _build_regex {
99 5     5   94 my $self = shift;
100              
101 5         30 my $relative = '(?:today|tomorrow|yesterday)';
102              
103 5         12 my $long_weekday = '(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)';
104 5         13 my $short_weekday = '(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)';
105 5         22 my $weekday = "(?:$long_weekday|$short_weekday)";
106              
107 5         17 my $relative_weekday = "(?:(?:next|previous|last)\\s*$weekday)";
108              
109 5         13 my $long_month = '(?:January|February|March|April|May|June|July|August|September|October|November|December)';
110 5         11 my $short_month = '(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)';
111 5         45 my $month = "(?:$long_month|$short_month)";
112              
113             # 1 - 31
114 5         14 my $cardinal_monthday = "(?:[1-9]|[12][0-9]|3[01])";
115 5         19 my $monthday = "(?:$cardinal_monthday(?:st|nd|rd|th)?)";
116              
117 5         19 my $day_month = "(?:$monthday\\s*$month)";
118 5         20 my $month_day = "(?:$month\\s*$monthday)";
119 5         24 my $day_month_year = "(?:(?:$day_month|$month_day)\\s*,?\\s*\\d\\d\\d\\d)";
120              
121 5         16 my $yyyymmdd_hhmmss = "(?:\\d\\d\\d\\d[-/]\\d\\d[-/]\\d\\d)[T ](?:\\d\\d):(?:\\d\\d):(?:\\d\\d)";
122 5         14 my $yyyymmdd = "(?:\\d\\d\\d\\d[-/]\\d\\d[-/]\\d\\d)";
123 5         13 my $ddmmyy = "(?:\\d\\d[-/]\\d\\d[-/]\\d\\d)";
124 5         12 my $ddmmyyyy = "(?:\\d\\d[-/]\\d\\d[-/]\\d\\d\\d\\d)";
125              
126 5         19 my $other = $self->_build_more_regex;
127 5 50       21 $other = "|$other"
128             if $other;
129              
130 5         2197 my $regex = qr{
131             \b(
132             $relative # today
133             | $relative_weekday # last Friday
134             | $weekday # Monday
135             | $day_month_year # November 13th, 1986
136             | $day_month # November 13th
137             | $month_day # 13 Nov
138             | $yyyymmdd_hhmmss # 1986-11-13T00:12:34 or 1986-11-13 00:12:34
139             | $yyyymmdd # 1986/11/13
140             | $ddmmyy # 11-13-86
141             | $ddmmyyyy # 11-13-1986
142             $other # anything from the subclass
143             )\b
144             }ix;
145              
146 5         41 $self->regex($regex);
147             }
148              
149             # this is to be used in subclasses for adding more stuff to the regex
150             # for example, to add support for $foo_bar and $baz_quux, return
151             # "$foo_bar|$baz_quux"
152 5     5   15 sub _build_more_regex { '' }
153              
154             # build the list->scalar downgrade types
155             sub _build_scalar_downgrade {
156 5     5   79 my $self = shift;
157              
158 5         31 $self->scalar_downgrade({
159             all => 'first',
160             all_cron => 'earliest',
161             });
162             }
163              
164             # build the handlers that munge the list of dates to the desired order
165             sub _build_handlers {
166 5     5   81 my $self = shift;
167              
168             $self->handlers({
169             all_cron => sub {
170 1     1   6 sort { DateTime->compare_ignore_floating($a, $b) } @_
  2         61  
171             },
172 1     1   7 all => sub { @_ },
173              
174 2 100   2   29 earliest => sub { reduce { $a < $b ? $a : $b } @_ },
  4         184  
175 1 100   1   10 latest => sub { reduce { $a > $b ? $a : $b } @_ },
  2         98  
176 54     54   288 first => sub { $_[0] },
177 1     1   10 last => sub { $_[-1] },
178 5         92 });
179             }
180              
181             # actually perform the scalar downgrade
182             sub _downgrade {
183 54     54   107 my $self = shift;
184 54         101 my $returns = shift;
185              
186 54   66     197 my $downgrades = $self->scalar_downgrade || $self->_build_scalar_downgrade;
187 54   66     687 return $downgrades->{$returns} || $returns;
188             }
189              
190             sub _handle {
191 60     60   145 my $self = shift;
192 60         113 my $returns = shift;
193              
194 60   66     279 my $handlers = $self->handlers || $self->_build_handlers;
195 60         674 my $handler = $handlers->{$returns};
196 60 50       223 return defined $handler ? $handler->(@_) : @_
197             }
198              
199             sub _extract {
200 60     60   128 my $self = shift;
201 60         124 my $text = shift;
202 60         182 my %args = @_;
203              
204 60         135 my $fmt = $self->{format};
205              
206 60   66     231 my $regex = $self->regex || $self->_build_regex;
207 60         561 my @combined;
208 60         669 while ($text =~ /$regex/g) {
209 86         932 push @combined, {
210             pos => $-[0],
211             verbatim => $1,
212             };
213             }
214              
215 60 100       182 return (map {$_->{verbatim}} @combined) if $fmt eq 'verbatim';
  3         10  
216              
217 59         111 my %dtfn_args;
218             $dtfn_args{prefer_future} = 1
219 59 100 66     322 if $args{prefers} && $args{prefers} eq 'future';
220 59         133 $dtfn_args{time_zone} = $args{time_zone};
221              
222 59         298 my $parser = DateTime::Format::Natural->new(%dtfn_args);
223 59         159426 for (@combined) {
224 83         308 my $dt = $parser->parse_datetime($_->{verbatim});
225 83 100       442527 if ($parser->success) {
226 80         2429 $dt->set_time_zone($args{time_zone});
227 80         1011 $_->{DateTime} = $dt;
228             }
229             }
230              
231 59 100       331 if ($fmt eq 'epoch') {
    100          
232 1         3 return map { $_->{DateTime}->epoch } @combined;
  3         23  
233             } elsif ($fmt eq 'combined') {
234 1         11 return @combined;
235             } else {
236 57         139 return map {$_->{DateTime}} @combined;
  77         1532  
237             }
238             }
239              
240             1;
241             # ABSTRACT: Extract probable dates from strings
242              
243             __END__
244              
245             =pod
246              
247             =encoding UTF-8
248              
249             =head1 NAME
250              
251             Date::Extract::PERLANCAR - Extract probable dates from strings
252              
253             =head1 VERSION
254              
255             version 0.001
256              
257             =head1 SYNOPSIS
258              
259             my $parser = Date::Extract::PERLANCAR->new();
260             my $dt = $parser->extract($arbitrary_text)
261             or die "No date found.";
262             return $dt->ymd;
263              
264             =head1 DESCRIPTION
265              
266             This is a temporary fork of L<Date::Extract> (last updated at 0.06) to add
267             features that I need. The features will eventually be merged into Date::Extract.
268             Currently it adds:
269              
270             =over
271              
272             =item * Add 'combined' format
273              
274             =item * Recognize yyyy-mm-ddThh:mm:ss in addition to yyyy-mm-dd
275              
276             =back
277              
278             =head1 MOTIVATION
279              
280             There are already a few modules for getting a date out of a string.
281             L<DateTime::Format::Natural> should be your first choice. There's also
282             L<Time::ParseDate> which fits many formats. Finally, you can coerce
283             L<Date::Manip> to do your bidding.
284              
285             But I needed something that will take an arbitrary block of text, search it for
286             something that looks like a date string, and extract it. This module fills this
287             niche. By design it will produce few false positives. This means it will not
288             catch nearly everything that looks like a date string. So if you have the string
289             "do homework for class 2019" it won't return a L<DateTime> object with the year
290             set to 2019. This is what your users would probably expect.
291              
292             =head1 METHODS
293              
294             =head2 new PARAMHASH => C<Date::Extract::PERLANCAR>
295              
296             =head3 arguments
297              
298             =over 4
299              
300             =item format
301              
302             Choose what format the extracted date(s) will be. The default is "DateTime",
303             which will return L<DateTime> object(s). Other option include "verbatim" (return
304             the original text), "epoch" (return Unix timestamp), or "combined" (return
305             hashref containing these keys "verbatim", "DateTime", "pos" [position of date
306             string in the text]).
307              
308             =item time_zone
309              
310             Only relevant when C<format> is set to "DateTime".
311              
312             Forces a particular time zone to be set (this actually matters, as "tomorrow"
313             on Monday at 11 PM means something different than "tomorrow" on Tuesday at 1
314             AM).
315              
316             By default it will use the "floating" time zone. See the documentation for
317             L<DateTime>.
318              
319             This controls both the input time zone and output time zone.
320              
321             =item prefers
322              
323             This argument decides what happens when an ambiguous date appears in the
324             input. For example, "Friday" may refer to any number of Fridays. The valid
325             options for this argument are:
326              
327             =over 4
328              
329             =item nearest
330              
331             Prefer the nearest date. This is the default.
332              
333             =item future
334              
335             Prefer the closest future date.
336              
337             =item past
338              
339             Prefer the closest past date. B<NOT YET SUPPORTED>.
340              
341             =back
342              
343             =item returns
344              
345             If the text has multiple possible dates, then this argument determines which
346             date will be returned. By default it's 'first'.
347              
348             =over 4
349              
350             =item first
351              
352             Returns the first date found in the string.
353              
354             =item last
355              
356             Returns the final date found in the string.
357              
358             =item earliest
359              
360             Returns the date found in the string that chronologically precedes any other
361             date in the string.
362              
363             =item latest
364              
365             Returns the date found in the string that chronologically follows any other
366             date in the string.
367              
368             =item all
369              
370             Returns all dates found in the string, in the order they were found in the
371             string.
372              
373             =item all_cron
374              
375             Returns all dates found in the string, in chronological order.
376              
377             =back
378              
379             =back
380              
381             =head2 extract text, ARGS => dates
382              
383             Takes an arbitrary amount of text and extracts one or more dates from it. The
384             return value will be zero or more dates, which by default are L<DateTime>
385             objects (but can be customized with the C<format> argument). If called in scalar
386             context, only one will be returned, even if the C<returns> argument specifies
387             multiple possible return values.
388              
389             See the documentation of C<new> for the configuration of this method. Any
390             arguments passed into this method will trump those from the constructor.
391              
392             You may reuse a parser for multiple calls to C<extract>.
393              
394             You do not need to have an instantiated C<Date::Extract::PERLANCAR> object to
395             call this method. Just C<< Date::Extract::PERLANCAR->extract($foo) >> will work.
396              
397             =head1 FORMATS HANDLED
398              
399             =over 4
400              
401             =item * today; tomorrow; yesterday
402              
403             =item * last Friday; next Monday; previous Sat
404              
405             =item * Monday; Mon
406              
407             =item * November 13th, 1986; Nov 13, 1986
408              
409             =item * 13 November 1986; 13 Nov 1986
410              
411             =item * November 13th; Nov 13
412              
413             =item * 13 Nov; 13th November
414              
415             =item * 1986/11/13; 1986-11-13
416              
417             =item * 11-13-86; 11/13/1986
418              
419             =back
420              
421             =head1 CAVEATS
422              
423             This module is I<intentionally> very simple. Surprises are I<not> welcome
424             here.
425              
426             =head1 SEE ALSO
427              
428             L<DateTime::Format::Natural>, L<Time::ParseDate>, L<Date::Manip>
429              
430             =head1 ORIGINAL AUTHOR
431              
432             Shawn M Moore, C<< <sartak at bestpractical dot com> >>
433              
434             =head1 ACKNOWLEDGEMENTS
435              
436             Thanks to Steven Schubiger for writing the fine L<DateTime::Format::Natural>.
437             We still use it, but it doesn't quite fill all the particular needs we have.
438              
439             =head1 ORIGINAL COPYRIGHT & LICENSE
440              
441             Copyright 2007-2009 Best Practical Solutions.
442              
443             This program is free software; you can redistribute it and/or modify it
444             under the same terms as Perl itself.
445              
446             =head1 AUTHOR
447              
448             perlancar <perlancar@cpan.org>
449              
450             =head1 COPYRIGHT AND LICENSE
451              
452             This software is copyright (c) 2017, 2014, 2009 by perlancar@cpan.org.
453              
454             This is free software; you can redistribute it and/or modify it under
455             the same terms as the Perl 5 programming language system itself.
456              
457             =cut