File Coverage

blib/lib/Date/Parse/Modern.pm
Criterion Covered Total %
statement 135 153 88.2
branch 67 76 88.1
condition 47 68 69.1
subroutine 9 9 100.0
pod 1 3 33.3
total 259 309 83.8


line stmt bran cond sub pod time code
1             #!/usr/bin/env perl
2              
3             package Date::Parse::Modern;
4              
5 2     2   229381 use strict;
  2         5  
  2         82  
6 2     2   10 use warnings;
  2         11  
  2         124  
7 2     2   30 use v5.10;
  2         7  
8              
9 2     2   12 use Carp;
  2         4  
  2         189  
10 2     2   1171 use Time::Local 1.26;
  2         6413  
  2         159  
11 2     2   14 use Exporter 'import';
  2         4  
  2         6622  
12             our @EXPORT = ('strtotime');
13              
14             ###############################################################################
15              
16             # https://pause.perl.org/pause/query?ACTION=pause_operating_model#3_5_factors_considering_in_the_indexing_phase
17             our $VERSION = '1.0';
18              
19             # https://timezonedb.com/download
20             my $TZ_OFFSET = {
21             'ACDT' => 10, 'ACST' => 9, 'ACT' => -5, 'ACWST' => 8, 'ADT' => -3, 'AEDT' => 11, 'AEST' => 10, 'AFT' => 4,
22             'AKDT' => -8, 'AKST' => -9, 'ALMT' => 6, 'AMST' => 5, 'AMT' => 4, 'ANAST' => 12, 'ANAT' => 12, 'AQTT' => 5,
23             'ART' => -3, 'AST' => -4, 'AWDT' => 9, 'AWST' => 8, 'AZOST' => 0, 'AZOT' => -1, 'AZST' => 5, 'AZT' => 4,
24             'AoE' => -12, 'BNT' => 8, 'BOT' => -4, 'BRST' => -2, 'BRT' => -3, 'BST' => 1, 'BTT' => 6, 'CAST' => 8,
25             'CAT' => 2, 'CCT' => 6, 'CDT' => -5, 'CEST' => 2, 'CET' => 1, 'CHADT' => 13, 'CHAST' => 12, 'CHOST' => 9,
26             'CHOT' => 8, 'CHUT' => 10, 'CIDST' => -4, 'CIST' => -5, 'CKT' => -10, 'CLST' => -3, 'CLT' => -4, 'COT' => -5,
27             'CST' => -6, 'CVT' => -1, 'CXT' => 7, 'ChST' => 10, 'DAVT' => 7, 'DDUT' => 10, 'EASST' => -5, 'EAST' => -6,
28             'EAT' => 3, 'ECT' => -5, 'EDT' => -4, 'EEST' => 3, 'EET' => 2, 'EGST' => 0, 'EGT' => -1, 'EST' => -5,
29             'FET' => 3, 'FJST' => 13, 'FJT' => 12, 'FKST' => -3, 'FKT' => -4, 'FNT' => -2, 'GALT' => -6, 'GAMT' => -9,
30             'GET' => 4, 'GFT' => -3, 'GILT' => 12, 'GMT' => 0, 'GST' => -2, 'GYT' => -4, 'HDT' => -9, 'HKT' => 8,
31             'HOVST' => 8, 'HOVT' => 7, 'HST' => -10, 'ICT' => 7, 'IDT' => 3, 'IOT' => 6, 'IRDT' => 4, 'IRKST' => 9,
32             'IRKT' => 8, 'IRST' => 3, 'IST' => 2, 'JST' => 9, 'KGT' => 6, 'KOST' => 11, 'KRAST' => 8, 'KRAT' => 7,
33             'KST' => 9, 'KUYT' => 4, 'LHDT' => 11, 'LHST' => 10, 'LINT' => 14, 'MAGST' => 12, 'MAGT' => 11, 'MART' => -9,
34             'MAWT' => 5, 'MDT' => -6, 'MHT' => 12, 'MMT' => 6, 'MSD' => 4, 'MSK' => 3, 'MST' => -7, 'MUT' => 4,
35             'MVT' => 5, 'MYT' => 8, 'NCT' => 11, 'NDT' => -2, 'NFDT' => 12, 'NFT' => 11, 'NOVST' => 7, 'NOVT' => 7,
36             'NPT' => 5, 'NRT' => 12, 'NST' => -3, 'NUT' => -11, 'NZDT' => 13, 'NZST' => 12, 'OMSST' => 7, 'OMST' => 6,
37             'ORAT' => 5, 'PDT' => -7, 'PET' => -5, 'PETST' => 12, 'PETT' => 12, 'PGT' => 10, 'PHOT' => 13, 'PHT' => 8,
38             'PKT' => 5, 'PMDT' => -2, 'PMST' => -3, 'PONT' => 11, 'PST' => -8, 'PWT' => 9, 'PYST' => -3, 'PYT' => 8,
39             'QYZT' => 6, 'RET' => 4, 'ROTT' => -3, 'SAKT' => 11, 'SAMT' => 4, 'SAST' => 2, 'SBT' => 11, 'SCT' => 4,
40             'SGT' => 8, 'SRET' => 11, 'SRT' => -3, 'SST' => -11, 'SYOT' => 3, 'TAHT' => -10, 'TFT' => 5, 'TJT' => 5,
41             'TKT' => 13, 'TLT' => 9, 'TMT' => 5, 'TOST' => 14, 'TOT' => 13, 'TRT' => 3, 'TVT' => 12, 'ULAST' => 9,
42             'ULAT' => 8, 'UYST' => -2, 'UYT' => -3, 'UZT' => 5, 'VET' => -4, 'VLAST' => 11, 'VLAT' => 10, 'VOST' => 6,
43             'VUT' => 11, 'WAKT' => 12, 'WARST' => -3, 'WAST' => 2, 'WAT' => 1, 'WEST' => 1, 'WET' => 0, 'WFT' => 12,
44             'WGST' => -2, 'WGT' => -3, 'WIB' => 7, 'WIT' => 9, 'WITA' => 8, 'WST' => 1, 'YAKST' => 10, 'YAKT' => 9,
45             'YAPT' => 10, 'YEKST' => 6, 'YEKT' => 5, 'Z' => 0,
46             };
47              
48             # Separator between dates pieces: '-' or '/' or '\'
49             our $sep = qr/[\/\\-]/;
50              
51             # Force a local timezone offset (used for unit tests)
52             our $LOCAL_TZ_OFFSET = undef;
53              
54             # Use caching for repeated lookups for the same TZ offset
55             our $USE_TZ_CACHE = 1;
56              
57             # These are undocumented package variables. They could be changed to support
58             # alternate languages but there are caveats. These are cached and changing
59             # them after strtotime() is called won't affect anything. No one has requested
60             # alternate languages, so I'm leaving this undocumented for now.
61             our $MONTH_MAP = {
62             'jan' => 1, 'feb' => 2, 'mar' => 3, 'apr' => 4 , 'may' => 5 , 'jun' => 6 ,
63             'jul' => 7, 'aug' => 8, 'sep' => 9, 'oct' => 10, 'nov' => 11, 'dec' => 12,
64             };
65              
66             # See above
67             our $MONTH_REGEXP = qr/
68             Jan|January|Feb|February|Mar|March|Apr|April|May|Jun|June|
69             Jul|July|Aug|August|Sep|September|Oct|October|Nov|November|Dec|December
70             /ix;
71              
72             ###############################################################################
73             ###############################################################################
74             ###############################################################################
75              
76             =head1 NAME
77              
78             Date::Parse::Modern - Provide string to unixtime conversions
79              
80             =head1 DESCRIPTION
81              
82             C provides a single function C which takes a
83             datetime string and returns an integer unixtime. Care was given to support
84             modern datetime strings that one would commonly find in log files or on the
85             internet.
86              
87             =head1 USAGE
88              
89             use Date::Parse::Modern;
90              
91             C exports the C function automatically.
92              
93             =head1 FUNCTIONS
94              
95             =head2 strtotime($string)
96              
97             my $unixtime = strtotime('1979-02-24'); # 288691200
98              
99             Simply feed C a string with some type of date or time in it,
100             and it will return an integer unixtime. If the string is unparseable, or a
101             weird error occurs, it will return C.
102              
103             All the "magic" in C is done using regular expressions that look
104             for common datetime formats. Common formats like C and C
105             are easily detected and converted to the appropriate formats. This allows the
106             date or time to be found anywhere in the string, in (almost) any order. If you
107             limit your string to only the date/time portion the parsing will be much
108             quicker. Shorter input equals faster parsing.
109              
110             B Strings without a year are assumed to be in the current year.
111             Example: C
112              
113             B Strings with only a date are assumed to occur at midnight.
114             Example: C<2023-01-15>
115              
116             B Strings with only time are assumed to be the current day.
117             Example: C<10:15am>
118              
119             B In strings with numeric B textual time zone offsets, the numeric
120             is used. Example: C<14 Nov 1994 11:34:32 -0500 (EST)>
121              
122             B If a string contains neither a textual nor numeric timezone, the local
123             timezone is assumed. Example: C<11/23/2025 08:00 PM>
124              
125             B In all cases, the day of the week is ignored in the input string.
126             Example: C
127              
128             =head1 Will you support XYZ format?
129              
130             Everyone has their B date/time format, and we'd like to support as
131             many as possible. We have tried to support as much of
132             L as possible, but we
133             cannot support everything. Every new format we support runs the risk of slowing
134             down things for existing formats. You can submit a feature request on Github
135             for new formats but we may reject them if adding support would slow down others.
136              
137             =head1 Bugs/Features
138              
139             Please submit bugs and feature requests on Github:
140              
141             https://github.com/scottchiefbaker/perl-Date-Parse-Modern
142              
143             =head1 AUTHORS
144              
145             Scott Baker - https://www.perturb.org/
146              
147             =cut
148              
149             ###############################################################################
150             ###############################################################################
151             ###############################################################################
152              
153             # The logic here is that we use regular expressions to pull out various patterns
154             # YYYY/MM/DD, H:I:S, DD MonthWord YYYY
155             sub strtotime {
156 56     56 1 234712 my ($str, $debug) = @_;
157              
158 56 100       204 if (!defined($str)) {
159 1         5 return undef;
160             }
161              
162 55         151 my ($year, $month, $day) = (0, 0, 0);
163 55         120 my ($hour, $min , $sec, $ms) = (0, 0, 0, 0);
164              
165             ###########################################################################
166             ###########################################################################
167              
168 55         204 state $rule_1 = qr/
169             \b
170             ((\d{4})$sep(\d{1,2})$sep(\d{1,2}) # YYYY-MM-DD
171             |
172             (\d{1,2})$sep(\d{1,2})$sep(\d{4})) # DD-MM-YYYY
173             /x;
174              
175             # First we look to see if we have anything that mathches YYYY-MM-DD (numerically)
176 55 100       776 if ($str =~ $rule_1) {
177             # YYYY-MM-DD: 1999-12-24
178 25 100 66     133 if ($2 || $3) {
179 21         46 $year = $2;
180 21         52 $month = $3;
181 21         48 $day = $4;
182             }
183              
184             # DD-MM-YYYY: 12-24-1999
185 25 100 66     156 if ($5 || $6) {
186 4         10 $day = $5;
187 4         9 $month = $6;
188 4         27 $year = $7;
189              
190             # It might be American format (MM-DD-YYYY) so we do a quick flip/flop
191 4 100       15 if ($month > 12) {
192 2         8 ($day, $month) = ($month, $day);
193             }
194             }
195             }
196              
197             ###########################################################################
198              
199 55         395 state $rule_2 = qr/
200             (\d{1,2})? # Maybe some digits before month
201             \s*
202             ($MONTH_REGEXP) # A textual month
203             \s+
204             (\d{1,4}) # Digits
205             [\s\$] # Whitespace OR end of line
206             ((\d{2}|\d{4})[ \$])? # If there are two or four digits ater it's a year
207             /x;
208              
209             # Next we look for alpha months followed by a digit if we didn't find a numeric month above
210             # This will find: "April 13" and also "13 April 1995"
211 55 100 100     527 if (!$month && $str =~ $rule_2) {
212              
213             # Get the numerical number for this month
214 18         87 my $month_name = lc(substr($2,0,3));
215 18         53 $month = $MONTH_MAP->{$month_name};
216              
217             # 17 March 94
218 18 100       52 if ($1) {
219 4         12 $day = int($1);
220 4         13 $year = int($3);
221             # April 13 or April 13 94
222             } else {
223 14         47 $day = int($3);
224              
225             # *IF* we still don't have a year
226 14 50       36 if (!$year) {
227 14   100     57 my $part = $5 || 0;
228 14         31 $year = int($part)
229             }
230             }
231             }
232              
233             ###########################################################################
234              
235             # Alternate date string like like: 21/dec/93 or dec/21/93 much less common
236 55 100 100     480 if (!$month && $str =~ /(.*)($MONTH_REGEXP)(.*)/) {
237 7         27 my $before = $1;
238 7         16 my $after = $3;
239              
240             # Lookup the numeric month based on the string name
241 7   50     37 $month = $MONTH_MAP->{lc($2)} || 0;
242              
243             # Month starts string: dec/21/93 or feb/14/1999
244 7 100 33     32 if ($before eq "") {
    50          
245 4 100       82 if ($after =~ m/(\d{2})$sep(\d{2,4})/) {
246 2         7 $day = $1;
247 2         5 $year = $2;
248             }
249             # Month in the middle: 21/dec/93
250             } elsif ($before && $after) {
251 3         13 $before =~ m/(\d+)\D/; # Just the digits
252 3   50     12 $day = $1 || 0;
253              
254 3         11 $after =~ m/\D(\d{2,4})(.)/; # Get the digits AFTER the separator
255              
256             # If it's not a time (has a colon) it's the year
257 3 100       13 if ($2 ne ":") {
258 1         2 $year = $1;
259             }
260             }
261             }
262              
263             # The year may be on the end of the string: Sat May 8 21:24:31 2021
264             # We use a negative look-behind to make sure it's not a +0900 or -0100 timezone
265 55 100       145 if (!$year) {
266 17         105 ($year) = $str =~ m/\b(?
267             }
268              
269             # Match 1st, 2nd, 3rd, 29th
270 55 100 100     168 if (!$day && $str =~ m/\b(\d{1,2})(st|nd|rd|th)/) {
271 2         6 $day = $1;
272             }
273              
274             ###########################################################################
275              
276 55         80 state $rule_3 = qr/
277             (\b|T) # Anchor point
278             (\d{1,2}): # Hours
279             (\d{1,2}):? # Minutes
280             (\d{2}(Z|\.\d+)?)? # Seconds (optional)
281             \ ?(am|pm|AM|PM)? # AMPM (optional)
282             /x;
283              
284             # Now we look for times: 10:14, 10:14:17, 08:15pm
285 55 100       527 if ($str =~ $rule_3) {
286 37         111 $hour = int($2);
287 37         69 $min = int($3);
288 37   100     136 $sec = $4 || 0; # Not int() cuz it might be float for milliseconds
289 37         89 $sec =~ s/Z$//; # Remove and Z at the end
290              
291             # The string of AM or PM
292 37   100     201 my $ampm = lc($6 || "");
293              
294             # PM means add 12 hours
295 37 100       87 if ($ampm eq "pm") {
296 3         7 $hour += 12;
297             }
298              
299             # 12:15am = 00:15 / 12:15pm = 12:15 so we have to compensate
300 37 100 100     134 if ($ampm && ($hour == 24 || $hour == 12)) {
      100        
301 3         7 $hour -= 12;
302             }
303             }
304              
305             # Just some basic sanity checking
306 55   100     294 my $has_time = ($hour || $min || $sec);
307 55   66     160 my $has_date = ($year || $month || $day);
308              
309 55 100 100     150 if (!$has_time && !$has_date) {
310             # One final check if NOTHING else has matched, we lookup a weird format: 20020722T100000Z
311 4 100       22 if ($str =~ m/(\d{4})(\d{2})(\d{2})T(\d\d)(\d\d)(\d\d)Z/) {
312 1         5 $year = $1;
313 1         3 $month = $2;
314 1         2 $day = $3;
315              
316 1         3 $hour = $4;
317 1         2 $min = $5;
318 1         3 $sec = $6;
319             } else {
320 3         24 return undef;
321             }
322             }
323              
324             ###########################################################################
325             ###########################################################################
326              
327             # If there is no month, we assume the current month
328 52 100       108 if (!$month) {
329 1         28 $month = (localtime())[4] + 1;
330             }
331              
332             # If there is no day, we assume the current day
333 52 100       123 if (!$day) {
334 1         10 $day = (localtime())[3];
335             }
336              
337             # If we STILL don't have a year it may be a time only string so we assume it's the current year
338 52 100       103 if (!$year) {
339 4         104 $year = (localtime())[5] + 1900;
340             }
341              
342             # Convert any two digit years to four digits
343 52 100       194 if ($year < 100) {
344 8         14 $year += 1900;
345             }
346              
347             # Time::Local doesn't support fractional seconds, so we make an int version
348             # and then add the ms after the timegm_modern() conversion
349 52         97 $ms = $sec - int($sec);
350 52         78 $sec = int($sec);
351              
352             # If we have all the requisite pieces we build a unixtime
353 52         71 my $ret;
354 52         100 my $ok = eval {
355 52         218 $ret = Time::Local::timegm_modern($sec, $min, $hour, $day, $month - 1, $year);
356              
357 52         2608 return 1;
358             };
359             # This has to be *immediately* after the eval or something else might
360             # tromp on the error message
361 52         99 my $err = $@;
362              
363 52 50 33     135 if ($err && $err =~ /Undefined subroutine/) {
364 0         0 print STDERR $err;
365 0         0 return undef;
366             };
367              
368 52 100       137 if ($ms) {
369 1         4 $ret += $ms;
370             }
371              
372             # If we find a timezone offset we take that in to account now
373             # Either: +1000 or -0700
374             # or
375             # 11:53 PST (One to four chars after a time)
376 52         93 my $tz_offset_seconds = undef;
377 52         96 my $tz_str = '';
378 52         87 state $tz_rule = qr/
379             (
380             (\s|:\d\d) # Start AFTER a space, or time (:12)
381             ([+-])(\d{1,2})(\d{2}) # +1000 or -700 (three or four digits)
382             |
383             \d{2}\ # Only match chars if they're AFTER a time
384             ([A-Z]{1,4})\b # Capitalized TZ at end of string
385             |
386             \d{2}(Z)$ # Just a simple Z at the end
387             )
388             /x;
389              
390             # If we have a string with a timezone piece
391 52 100 66     1149 if (defined($ret) && $str =~ $tz_rule) {
392 22         44 my $str_offset = undef;
393              
394             # String timezone: 11:53 PST
395 22 100 100     131 if ($6 || $7) {
396             # Whichever form matches, the TZ is that one
397 16   50     57 $tz_str = $6 || $7 || '';
398              
399             # In rare cases we get timezone after AM/PM: "12:12 PM CST"
400 16 100       49 if (in_array($tz_str, ("AM", "PM", "am", "pm"))) {
401 2 100       48 if ($str =~ /$tz_str ([A-Z]{1,4})\b/) {
402 1         5 $tz_str = $1;
403             }
404             }
405              
406             # AM/PM sometimes gets flagged as the TZ so we skip it if it happens
407 16 100       66 if (uc($tz_str) !~ /^(AM|PM)$/) {
408             # Lookup the timezone offset in the table
409 15   100     67 $str_offset = $TZ_OFFSET->{$tz_str} || 0;
410             # Timezone offsets are in hours, so we convert to seconds
411 15         29 $str_offset *= 3600;
412             }
413             # Numeric format: +1000 or -0700
414             } else {
415             # Break the input string into parts so we can do math
416             # +1000 = 10 hours, -0700 = 7 hours, +0430 = 4.5 hours
417 6         30 $str_offset = ($4 + ($5 / 60)) * 3600;
418              
419 6 100       23 if ($3 eq "-") {
420 5         11 $str_offset *= -1;
421             }
422              
423 6         23 $tz_str = "$3$4$5";
424             }
425              
426 22 100       45 if (defined($str_offset)) {
427 21         55 $tz_offset_seconds = $str_offset;
428             }
429             }
430              
431             # No timezone info found so we assume the local timezone
432 52 100 66     212 if (defined($ret) && !defined($tz_offset_seconds)) {
433 31         80 my $local_offset = get_local_offset($ret);
434              
435 31         53 $tz_offset_seconds = $local_offset;
436 31         61 $tz_str = 'UNSPECIFIED';
437             }
438              
439             # Subtract the timezone offset from the unixtime
440 52         106 $ret -= $tz_offset_seconds;
441              
442 52 50       113 if ($debug) {
443 0         0 my $color = "\e[38;5;45m";
444 0         0 my $reset = "\e[0m";
445 0         0 my $header = sprintf("%*s = YYYY-MM-DD HH:II:SS (timezone offset)", length($str) + 2, "Input string");
446 0   0     0 my $output = sprintf("%12s = %02d-%02d-%02d %02d:%02d:%02d (%s = %d seconds)", "'$str'", $year || -1, $month || -1, $day || -1, $hour, $min, $sec, $tz_str, $tz_offset_seconds);
      0        
      0        
447              
448 0         0 print STDERR $color . $header . $reset . "\n";
449 0         0 print STDERR $output . "\n";
450             }
451              
452              
453 52         339 return $ret;
454             }
455              
456             # Return the timezone offset for the local machine
457             sub get_local_offset {
458 31     31 0 60 my $unixtime = $_[0];
459              
460             # If we have a forced LOCAL_TZ_OFFSET we use that (unit tests)
461 31 50       68 if (defined($LOCAL_TZ_OFFSET)) {
462 31         87 return $LOCAL_TZ_OFFSET;
463             }
464              
465             # Since timezones only change on the half-hour (at most), we
466             # round down the nearest half hour "bucket" and then cache
467             # that result. We probably could get away with a full hour
468             # here but we don't gain much performance/memory by doing that
469 0         0 my $bucket_size = 1800;
470 0         0 my $cache_key = $unixtime - ($unixtime % $bucket_size);
471              
472             # Simple memoizing (improves repeated performance a LOT)
473             # Note: this is even faster than `use Memoize`
474 0         0 state $x = {};
475 0 0 0     0 if ($USE_TZ_CACHE && $x->{$cache_key}) {
476 0         0 return $x->{$cache_key};
477             }
478              
479             # Get a time obj for this local timezone and UTC for the Unixtime
480             # Then compare the two to get the local TZ offset
481 0         0 my @t = localtime($unixtime);
482 0         0 my $ret = (Time::Local::timegm(@t) - Time::Local::timelocal(@t));
483              
484             # Cache the result
485 0 0       0 if ($USE_TZ_CACHE) {
486 0         0 $x->{$cache_key} = $ret;
487             }
488              
489 0         0 return $ret;
490             }
491              
492             sub in_array {
493 16     16 0 60 my ($needle, @haystack) = @_;
494              
495 16         33 my $ret = grep { $_ eq $needle; } @haystack;
  64         175  
496              
497 16         49 return $ret;
498             }
499              
500             1;
501              
502             __END__