File Coverage

blib/lib/Date/Parse/Modern.pm
Criterion Covered Total %
statement 119 135 88.1
branch 45 56 80.3
condition 49 76 64.4
subroutine 8 8 100.0
pod 1 2 50.0
total 222 277 80.1


line stmt bran cond sub pod time code
1             #!/usr/bin/env perl
2              
3             package Date::Parse::Modern;
4              
5 2     2   131376 use strict;
  2         16  
  2         59  
6 2     2   11 use warnings;
  2         3  
  2         45  
7 2     2   26 use v5.10;
  2         6  
8              
9 2     2   11 use Carp;
  2         3  
  2         185  
10 2     2   951 use Time::Local 1.26;
  2         4883  
  2         116  
11 2     2   16 use Exporter 'import';
  2         4  
  2         3761  
12             our @EXPORT = ('strtotime');
13              
14             ###############################################################################
15              
16             # https://pause.perl.org/pause/query?ACTION=pause_operating_model#3_5_factors_considering_in_the_indexing_phase
17             our $VERSION = 0.4;
18              
19             # https://timezonedb.com/download
20             my $TZ_OFFSET = {
21             'ACDT' => 10, 'ACST' => 9, 'ACT' => -5, 'ACWST' => 8, 'ADT' => -3, 'AEDT' => 11, 'AEST' => 10, 'AFT' => 4,
22             'AKDT' => -8, 'AKST' => -9, 'ALMT' => 6, 'AMST' => 5, 'AMT' => 4, 'ANAST' => 12, 'ANAT' => 12, 'AQTT' => 5,
23             'ART' => -3, 'AST' => -4, 'AWDT' => 9, 'AWST' => 8, 'AZOST' => 0, 'AZOT' => -1, 'AZST' => 5, 'AZT' => 4,
24             'AoE' => -12, 'BNT' => 8, 'BOT' => -4, 'BRST' => -2, 'BRT' => -3, 'BST' => 1, 'BTT' => 6, 'CAST' => 8,
25             'CAT' => 2, 'CCT' => 6, 'CDT' => -5, 'CEST' => 2, 'CET' => 1, 'CHADT' => 13, 'CHAST' => 12, 'CHOST' => 9,
26             'CHOT' => 8, 'CHUT' => 10, 'CIDST' => -4, 'CIST' => -5, 'CKT' => -10, 'CLST' => -3, 'CLT' => -4, 'COT' => -5,
27             'CST' => -6, 'CVT' => -1, 'CXT' => 7, 'ChST' => 10, 'DAVT' => 7, 'DDUT' => 10, 'EASST' => -5, 'EAST' => -6,
28             'EAT' => 3, 'ECT' => -5, 'EDT' => -4, 'EEST' => 3, 'EET' => 2, 'EGST' => 0, 'EGT' => -1, 'EST' => -5,
29             'FET' => 3, 'FJST' => 13, 'FJT' => 12, 'FKST' => -3, 'FKT' => -4, 'FNT' => -2, 'GALT' => -6, 'GAMT' => -9,
30             'GET' => 4, 'GFT' => -3, 'GILT' => 12, 'GMT' => 0, 'GST' => -2, 'GYT' => -4, 'HDT' => -9, 'HKT' => 8,
31             'HOVST' => 8, 'HOVT' => 7, 'HST' => -10, 'ICT' => 7, 'IDT' => 3, 'IOT' => 6, 'IRDT' => 4, 'IRKST' => 9,
32             'IRKT' => 8, 'IRST' => 3, 'IST' => 2, 'JST' => 9, 'KGT' => 6, 'KOST' => 11, 'KRAST' => 8, 'KRAT' => 7,
33             'KST' => 9, 'KUYT' => 4, 'LHDT' => 11, 'LHST' => 10, 'LINT' => 14, 'MAGST' => 12, 'MAGT' => 11, 'MART' => -9,
34             'MAWT' => 5, 'MDT' => -6, 'MHT' => 12, 'MMT' => 6, 'MSD' => 4, 'MSK' => 3, 'MST' => -7, 'MUT' => 4,
35             'MVT' => 5, 'MYT' => 8, 'NCT' => 11, 'NDT' => -2, 'NFDT' => 12, 'NFT' => 11, 'NOVST' => 7, 'NOVT' => 7,
36             'NPT' => 5, 'NRT' => 12, 'NST' => -3, 'NUT' => -11, 'NZDT' => 13, 'NZST' => 12, 'OMSST' => 7, 'OMST' => 6,
37             'ORAT' => 5, 'PDT' => -7, 'PET' => -5, 'PETST' => 12, 'PETT' => 12, 'PGT' => 10, 'PHOT' => 13, 'PHT' => 8,
38             'PKT' => 5, 'PMDT' => -2, 'PMST' => -3, 'PONT' => 11, 'PST' => -8, 'PWT' => 9, 'PYST' => -3, 'PYT' => 8,
39             'QYZT' => 6, 'RET' => 4, 'ROTT' => -3, 'SAKT' => 11, 'SAMT' => 4, 'SAST' => 2, 'SBT' => 11, 'SCT' => 4,
40             'SGT' => 8, 'SRET' => 11, 'SRT' => -3, 'SST' => -11, 'SYOT' => 3, 'TAHT' => -10, 'TFT' => 5, 'TJT' => 5,
41             'TKT' => 13, 'TLT' => 9, 'TMT' => 5, 'TOST' => 14, 'TOT' => 13, 'TRT' => 3, 'TVT' => 12, 'ULAST' => 9,
42             'ULAT' => 8, 'UYST' => -2, 'UYT' => -3, 'UZT' => 5, 'VET' => -4, 'VLAST' => 11, 'VLAT' => 10, 'VOST' => 6,
43             'VUT' => 11, 'WAKT' => 12, 'WARST' => -3, 'WAST' => 2, 'WAT' => 1, 'WEST' => 1, 'WET' => 0, 'WFT' => 12,
44             'WGST' => -2, 'WGT' => -3, 'WIB' => 7, 'WIT' => 9, 'WITA' => 8, 'WST' => 1, 'YAKST' => 10, 'YAKT' => 9,
45             'YAPT' => 10, 'YEKST' => 6, 'YEKT' => 5, 'Z' => 0,
46             };
47              
48             # Separator between dates pieces: '-' or '/' or '\'
49             my $sep = qr/[\/\\-]/;
50              
51             # Force a local timezone offset (used for unit tests)
52             our $LOCAL_TZ_OFFSET = undef;
53              
54             # Use caching for repeated lookups for the same TZ offset
55             our $USE_TZ_CACHE = 1;
56              
57             # These are undocumented package variables. They could be changed to support
58             # alternate languages but there are caveats. These are cached and changing
59             # them after strtotime() is called won't affect anything. No one has requested
60             # alternate languages, so I'm leaving this undocumented for now.
61             our $MONTH_MAP = {
62             'jan' => 1, 'feb' => 2, 'mar' => 3, 'apr' => 4 , 'may' => 5 , 'jun' => 6 ,
63             'jul' => 7, 'aug' => 8, 'sep' => 9, 'oct' => 10, 'nov' => 11, 'dec' => 12,
64             };
65              
66             # See above
67             our $MONTH_REGEXP = qr/
68             Jan|January|Feb|February|Mar|March|Apr|April|May|Jun|June|
69             Jul|July|Aug|August|Sep|September|Oct|October|Nov|November|Dec|December
70             /ix;
71              
72             ###############################################################################
73             ###############################################################################
74             ###############################################################################
75              
76             =head1 NAME
77              
78             C - Provide string to unixtime conversions
79              
80             =head1 DESCRIPTION
81              
82             C provides a single function C which takes a datetime string
83             and returns a unixtime. Care was given to support the most modern style strings that you would
84             commonly find in log files or on the internet.
85              
86             =head1 USAGE
87              
88             use Date::Parse::Modern;
89              
90             C exports the C function automatically.
91              
92             =head1 FUNCTIONS
93              
94             =head2 strtotime($string)
95              
96             my $unixtime = strtotime('1979-02-24'); # 288691200
97              
98             Simply feed C a string with some type of date or time in it, and it will return an
99             integer unixtime. If the string is unparseable, or a weird error occurs, it will return C.
100              
101             All the "magic" in C is done using regular expressions that look for common datetime
102             formats. Common formats like YYYY-MM-DD and HH:II:SS are easily detected and converted to the
103             appropriate formats. This allows the date or time to be found anywhere in the string, in (almost) any
104             order. In all cases, the day of the week is ignored in the input string.
105              
106             B Strings without a year are assumed to be in the current year. Example: C
107              
108             B Strings with only a date are assumed to occur at the midnight. Example: C<2023-01-15>
109              
110             B Strings with only time are assumed to be the current day. Example: C<10:15am>
111              
112             B In strings with numeric B textual time zone offsets, the numeric is used. Example:
113             C<14 Nov 1994 11:34:32 -0500 (EST)>
114              
115             =head1 Bugs/Features
116              
117             Please submit bugs and feature requests on Github:
118              
119             https://github.com/scottchiefbaker/perl-Date-Parse-Modern
120              
121             =head1 AUTHORS
122              
123             Scott Baker - https://www.perturb.org/
124              
125             =cut
126              
127             ###############################################################################
128             ###############################################################################
129             ###############################################################################
130              
131             # The logic here is that we use regular expressions to pull out various patterns
132             # YYYY/MM/DD, H:I:S, DD MonthWord YYYY
133             sub strtotime {
134 38     38 1 188 my ($str, $debug) = @_;
135              
136 38 100       101 if (!defined($str)) {
137 1         4 return undef;
138             }
139              
140 37         64 my ($year, $month, $day) = (0, 0, 0);
141 37         65 my ($hour, $min , $sec, $ms) = (0, 0, 0, 0);
142              
143             ###########################################################################
144             ###########################################################################
145              
146 37         95 state $rule_1 = qr/
147             \b
148             ((\d{4})$sep(\d{2})$sep(\d{2}) # YYYY-MM-DD
149             |
150             (\d{2})$sep(\d{2})$sep(\d{4})) # DD-MM-YYYY
151             /x;
152              
153             # First we look to see if we have anything that mathches YYYY-MM-DD (numerically)
154 37 100       301 if ($str =~ $rule_1) {
155             # YYYY-MM-DD: 1999-12-24
156 14 100 66     52 if ($2 || $3) {
157 13         25 $year = $2;
158 13         20 $month = $3;
159 13         22 $day = $4;
160             }
161              
162             # DD-MM-YYYY: 12-24-1999
163 14 100 66     61 if ($5 || $6) {
164 1         2 $day = $5;
165 1         2 $month = $6;
166 1         3 $year = $7;
167              
168             # It might be American format (MM-DD-YYYY) so we do a quick flip/flop
169 1 50       4 if ($month > 12) {
170 1         4 ($day, $month) = ($month, $day);
171             }
172             }
173             }
174              
175             # The year may be on the end of the string: Sat May 8 21:24:31 2021
176 37 100       79 if (!$year) {
177 23         72 ($year) = $str =~ m/\s(\d{4})\b/;
178             }
179              
180             ###########################################################################
181              
182 37         192 state $rule_2 = qr/
183             (\d{1,2})? # Maybe some digits before month
184             \s*
185             ($MONTH_REGEXP) # A textual month
186             \s+
187             (\d{1,4}) # Digits
188             [\s\$] # Whitespace OR end of line
189             ((\d{4}) )? # If there are digits ater the space it's 'Jan 13 2000'
190             /x;
191              
192             # Next we look for alpha months followed by a digit if we didn't find a numeric month above
193             # This will find: "April 13" and also "13 April 1995"
194 37 100 100     269 if (!$month && $str =~ $rule_2) {
195              
196             # Get the numerical number for this month
197 13         47 my $month_name = lc(substr($2,0,3));
198 13         25 $month = $MONTH_MAP->{$month_name};
199              
200             # 17 March 94
201 13 100       31 if ($1) {
202 4         9 $day = int($1);
203 4         8 $year = int($3);
204             # April 13 or April 13 94
205             } else {
206 9         23 $day = int($3);
207              
208             # *IF* there is a $5 it's a year
209 9   50     30 $year ||= int($5 || 0);
      66        
210             }
211             }
212              
213             ###########################################################################
214              
215             # Alternate date string like like: 21/dec/93 or dec/21/93 much less common
216 37 100 100     249 if (!$month && $str =~ /(.*)($MONTH_REGEXP)(.*)/) {
217 5         14 my $before = $1;
218 5         12 my $after = $3;
219              
220             # Lookup the numeric month based on the string name
221 5   50     20 $month = $MONTH_MAP->{lc($2)} || 0;
222              
223             # Month starts string: dec/21/93 or feb/14/1999
224 5 100 33     21 if ($before eq "") {
    50          
225 2         28 $after =~ m/(\d{2})$sep(\d{2,4})/;
226              
227 2         5 $day = $1;
228 2         5 $year = $2;
229              
230             # Month in the middle: 21/dec/93
231             } elsif ($before && $after) {
232 3         9 $before =~ m/(\d+)\D/; # Just the digits
233 3   50     9 $day = $1 || 0;
234              
235 3         9 $after =~ m/\D(\d{2,4})(.)/; # Get the digits AFTER the separator
236              
237             # If it's not a time (has a colon) it's the year
238 3 100       10 if ($2 ne ":") {
239 1         3 $year = $1;
240             }
241             }
242             }
243              
244             ###########################################################################
245              
246 37         53 state $rule_3 = qr/
247             (\b|T) # Anchor point
248             (\d{1,2}): # Hours
249             (\d{1,2}):? # Minutes
250             (\d{2}(Z|\.\d+)?)? # Seconds (optional)
251             \ ?(am|pm|AM|PM)? # AMPM (optional)
252             /x;
253              
254             # Now we look for times: 10:14, 10:14:17, 08:15pm
255 37 100       240 if ($str =~ $rule_3) {
256 27         71 $hour = int($2);
257 27         39 $min = int($3);
258 27   100     83 $sec = $4 || 0; # Not int() cuz it might be float for milliseconds
259              
260 27         53 $sec =~ s/Z$//;
261              
262             # The string of AM or PM
263 27   100     85 my $ampm = lc($6 || "");
264              
265             # PM means add 12 hours
266 27 100       55 if ($ampm eq "pm") {
267 1         2 $hour += 12;
268             }
269              
270             # 12:15am = 00:15 / 12:15pm = 12:15 so we have to compensate
271 27 50 66     60 if ($ampm && ($hour == 24 || $hour == 12)) {
      100        
272 2         5 $hour -= 12;
273             }
274             }
275              
276             # Just some basic sanity checking
277 37   100     107 my $has_time = ($hour || $min || $sec);
278 37   66     100 my $has_date = ($year || $month || $day);
279              
280 37 100 100     110 if (!$has_time && !$has_date) {
281             # One final check if NOTHING else has matched, we lookup a weird format: 20020722T100000Z
282 4 100       13 if ($str =~ m/(\d{4})(\d{2})(\d{2})T(\d\d)(\d\d)(\d\d)Z/) {
283 1         4 $year = $1;
284 1         3 $month = $2;
285 1         2 $day = $3;
286              
287 1         4 $hour = $4;
288 1         2 $min = $5;
289 1         2 $sec = $6;
290             } else {
291 3         19 return undef;
292             }
293             }
294              
295             ###########################################################################
296             ###########################################################################
297              
298             # Sanity check some basic boundaries
299             # I don't think we need this any more since we eval() and timegm_modern() will barf and return undef
300             #if ($month > 12 || $day > 31 || $hour > 23 || $min > 60 || $sec > 61) {
301             # return undef;
302             #}
303              
304 34   66     92 $month ||= (localtime())[4] + 1; # If there is no month, we assume the current month
305 34   66     74 $day ||= (localtime())[3]; # If there is no day, we assume the current day
306             # If we STILL don't have a year it may be a time only string so we assume it's the current year
307 34   66     184 $year ||= (localtime())[5] + 1900;
308              
309             # Convert any two digit years to four digits
310 34 100       97 if ($year < 100) {
311 5         7 $year += 1900;
312             }
313              
314             # Time::Local doesn't support fractional seconds, so we make an int version
315             # and then add the ms after the timegm_modern() conversion
316 34         62 $ms = $sec - int($sec);
317 34         42 $sec = int($sec);
318              
319             # If we have all the requisite pieces we build a unixtime
320 34         48 my $ret;
321 34 50 0     47 my $err = $@ || 'Error' unless eval {
322 34         102 $ret = Time::Local::timegm_modern($sec, $min, $hour, $day, $month - 1, $year);
323              
324 34         1616 return 1;
325             };
326              
327 34 50 33     72 if ($err && $err =~ /Undefined subroutine/) {
328 0         0 print STDERR $err;
329 0         0 return undef;
330             };
331              
332 34         42 $ret += $ms;
333              
334             # If we find a timezone offset we take that in to account now
335             # Either: +1000 or -0700
336             # or
337             # 11:53 PST (One to four chars after a time)
338 34         52 my $tz_offset_seconds = 0;
339 34         46 my $tz_str = '';
340 34         43 state $tz_rule = qr/
341             (
342             (\s|:\d\d) # Start AFTER a space, or time (:12)
343             ([+-])(\d{1,2})(\d{2}) # +1000 or -700 (three or four digits)
344             |
345             \d{2}\ # Only match chars if they're AFTER a time
346             ([A-Z]{1,4})\b # Capitalized TZ at end of string
347             |
348             \d{2}(Z)$ # Just a simple Z at the end
349             )
350             /x;
351              
352 34 100 100     433 if ($ret && $str =~ $tz_rule) {
    100          
353 15         28 my $str_offset = 0;
354              
355             # String timezone: 11:53 PST
356 15 100 100     55 if ($6 || $7) {
357             # Whichever form matches, the TZ is that one
358 10   50     27 my $tz_code = $6 || $7 || '';
359              
360             # Lookup the timezone offset in the table
361 10   100     39 $str_offset = $TZ_OFFSET->{$tz_code} || 0;
362             # Timezone offsets are in hours, so we convert to seconds
363 10         19 $str_offset *= 3600;
364              
365 10         16 $tz_str = $tz_code;
366             # Numeric format: +1000 or -0700
367             } else {
368             # Break the input string into parts so we can do math
369             # +1000 = 10 hours, -0700 = 7 hours, +0430 = 4.5 hours
370 5         18 $str_offset = ($4 + ($5 / 60)) * 3600;
371              
372 5 100       16 if ($3 eq "-") {
373 4         5 $str_offset *= -1;
374             }
375              
376 5         17 $tz_str = "$2$3$4";
377             }
378              
379 15         23 $tz_offset_seconds = $str_offset;
380             # No timezone info found so we assume the local timezone
381             } elsif ($ret) {
382 17         38 my $local_offset = get_local_offset($ret);
383              
384 17         23 $tz_offset_seconds = $local_offset;
385 17         31 $tz_str = 'UNSPECIFIED';
386             }
387              
388             # Subtract the timezone offset from the unixtime
389 34         56 $ret -= $tz_offset_seconds;
390              
391 34 50       68 if ($debug) {
392 0         0 my $color = "\e[38;5;45m";
393 0         0 my $reset = "\e[0m";
394 0         0 my $header = sprintf("%*s = YYYY-MM-DD HH:II:SS (timezone offset)", length($str) + 2, "Input string");
395 0   0     0 my $output = sprintf("'%s' = %02d-%02d-%02d %02d:%02d:%02d (%s = %d seconds)", $str, $year || -1, $month || -1, $day || -1, $hour, $min, $sec, $tz_str, $tz_offset_seconds);
      0        
      0        
396              
397 0         0 print STDERR $color . $header . $reset . "\n";
398 0         0 print STDERR $output . "\n";
399             }
400              
401              
402 34         168 return $ret;
403             }
404              
405             # Return the timezone offset for the local machine
406             sub get_local_offset {
407 17     17 0 28 my $unixtime = $_[0];
408              
409             # Since timezones only change on the half-hour (at most), we
410             # round down the nearest half hour "bucket" and then cache
411             # that result. We probably could get away with a full hour
412             # here but we don't gain much performance/memory by doing that
413 17         24 my $bucket_size = 1800;
414 17         32 my $cache_key = $unixtime - ($unixtime % $bucket_size);
415              
416             # If we have a forced LOCAL_TZ_OFFSET we use that (unit tests)
417 17 50       34 if (defined($LOCAL_TZ_OFFSET)) {
418 17         38 return $LOCAL_TZ_OFFSET;
419             }
420              
421             # Simple memoizing (improves repeated performance a LOT)
422             # Note: this is even faster than `use Memoize`
423 0           state $x = {};
424 0 0 0       if ($USE_TZ_CACHE && $x->{$cache_key}) {
425 0           return $x->{$cache_key};
426             }
427              
428             # Get a time obj for this local timezone and UTC for the Unixtime
429             # Then compare the two to get the local TZ offset
430 0           my @t = localtime($unixtime);
431 0           my $ret = (Time::Local::timegm(@t) - Time::Local::timelocal(@t));
432              
433             # Cache the result
434 0 0         if ($USE_TZ_CACHE) {
435 0           $x->{$cache_key} = $ret;
436             }
437              
438 0           return $ret;
439             }
440              
441             1;
442              
443             __END__