File Coverage

blib/lib/Date/Parse/Modern.pm
Criterion Covered Total %
statement 123 141 87.2
branch 53 64 82.8
condition 42 62 67.7
subroutine 8 8 100.0
pod 1 2 50.0
total 227 277 81.9


line stmt bran cond sub pod time code
1             #!/usr/bin/env perl
2              
3             package Date::Parse::Modern;
4              
5 2     2   141506 use strict;
  2         14  
  2         60  
6 2     2   10 use warnings;
  2         5  
  2         50  
7 2     2   26 use v5.10;
  2         7  
8              
9 2     2   10 use Carp;
  2         4  
  2         202  
10 2     2   991 use Time::Local 1.26;
  2         4984  
  2         117  
11 2     2   15 use Exporter 'import';
  2         4  
  2         3814  
12             our @EXPORT = ('strtotime');
13              
14             ###############################################################################
15              
16             # https://pause.perl.org/pause/query?ACTION=pause_operating_model#3_5_factors_considering_in_the_indexing_phase
17             our $VERSION = 0.5;
18              
19             # https://timezonedb.com/download
20             my $TZ_OFFSET = {
21             'ACDT' => 10, 'ACST' => 9, 'ACT' => -5, 'ACWST' => 8, 'ADT' => -3, 'AEDT' => 11, 'AEST' => 10, 'AFT' => 4,
22             'AKDT' => -8, 'AKST' => -9, 'ALMT' => 6, 'AMST' => 5, 'AMT' => 4, 'ANAST' => 12, 'ANAT' => 12, 'AQTT' => 5,
23             'ART' => -3, 'AST' => -4, 'AWDT' => 9, 'AWST' => 8, 'AZOST' => 0, 'AZOT' => -1, 'AZST' => 5, 'AZT' => 4,
24             'AoE' => -12, 'BNT' => 8, 'BOT' => -4, 'BRST' => -2, 'BRT' => -3, 'BST' => 1, 'BTT' => 6, 'CAST' => 8,
25             'CAT' => 2, 'CCT' => 6, 'CDT' => -5, 'CEST' => 2, 'CET' => 1, 'CHADT' => 13, 'CHAST' => 12, 'CHOST' => 9,
26             'CHOT' => 8, 'CHUT' => 10, 'CIDST' => -4, 'CIST' => -5, 'CKT' => -10, 'CLST' => -3, 'CLT' => -4, 'COT' => -5,
27             'CST' => -6, 'CVT' => -1, 'CXT' => 7, 'ChST' => 10, 'DAVT' => 7, 'DDUT' => 10, 'EASST' => -5, 'EAST' => -6,
28             'EAT' => 3, 'ECT' => -5, 'EDT' => -4, 'EEST' => 3, 'EET' => 2, 'EGST' => 0, 'EGT' => -1, 'EST' => -5,
29             'FET' => 3, 'FJST' => 13, 'FJT' => 12, 'FKST' => -3, 'FKT' => -4, 'FNT' => -2, 'GALT' => -6, 'GAMT' => -9,
30             'GET' => 4, 'GFT' => -3, 'GILT' => 12, 'GMT' => 0, 'GST' => -2, 'GYT' => -4, 'HDT' => -9, 'HKT' => 8,
31             'HOVST' => 8, 'HOVT' => 7, 'HST' => -10, 'ICT' => 7, 'IDT' => 3, 'IOT' => 6, 'IRDT' => 4, 'IRKST' => 9,
32             'IRKT' => 8, 'IRST' => 3, 'IST' => 2, 'JST' => 9, 'KGT' => 6, 'KOST' => 11, 'KRAST' => 8, 'KRAT' => 7,
33             'KST' => 9, 'KUYT' => 4, 'LHDT' => 11, 'LHST' => 10, 'LINT' => 14, 'MAGST' => 12, 'MAGT' => 11, 'MART' => -9,
34             'MAWT' => 5, 'MDT' => -6, 'MHT' => 12, 'MMT' => 6, 'MSD' => 4, 'MSK' => 3, 'MST' => -7, 'MUT' => 4,
35             'MVT' => 5, 'MYT' => 8, 'NCT' => 11, 'NDT' => -2, 'NFDT' => 12, 'NFT' => 11, 'NOVST' => 7, 'NOVT' => 7,
36             'NPT' => 5, 'NRT' => 12, 'NST' => -3, 'NUT' => -11, 'NZDT' => 13, 'NZST' => 12, 'OMSST' => 7, 'OMST' => 6,
37             'ORAT' => 5, 'PDT' => -7, 'PET' => -5, 'PETST' => 12, 'PETT' => 12, 'PGT' => 10, 'PHOT' => 13, 'PHT' => 8,
38             'PKT' => 5, 'PMDT' => -2, 'PMST' => -3, 'PONT' => 11, 'PST' => -8, 'PWT' => 9, 'PYST' => -3, 'PYT' => 8,
39             'QYZT' => 6, 'RET' => 4, 'ROTT' => -3, 'SAKT' => 11, 'SAMT' => 4, 'SAST' => 2, 'SBT' => 11, 'SCT' => 4,
40             'SGT' => 8, 'SRET' => 11, 'SRT' => -3, 'SST' => -11, 'SYOT' => 3, 'TAHT' => -10, 'TFT' => 5, 'TJT' => 5,
41             'TKT' => 13, 'TLT' => 9, 'TMT' => 5, 'TOST' => 14, 'TOT' => 13, 'TRT' => 3, 'TVT' => 12, 'ULAST' => 9,
42             'ULAT' => 8, 'UYST' => -2, 'UYT' => -3, 'UZT' => 5, 'VET' => -4, 'VLAST' => 11, 'VLAT' => 10, 'VOST' => 6,
43             'VUT' => 11, 'WAKT' => 12, 'WARST' => -3, 'WAST' => 2, 'WAT' => 1, 'WEST' => 1, 'WET' => 0, 'WFT' => 12,
44             'WGST' => -2, 'WGT' => -3, 'WIB' => 7, 'WIT' => 9, 'WITA' => 8, 'WST' => 1, 'YAKST' => 10, 'YAKT' => 9,
45             'YAPT' => 10, 'YEKST' => 6, 'YEKT' => 5, 'Z' => 0,
46             };
47              
48             # Separator between dates pieces: '-' or '/' or '\'
49             my $sep = qr/[\/\\-]/;
50              
51             # Force a local timezone offset (used for unit tests)
52             our $LOCAL_TZ_OFFSET = undef;
53              
54             # Use caching for repeated lookups for the same TZ offset
55             our $USE_TZ_CACHE = 1;
56              
57             # These are undocumented package variables. They could be changed to support
58             # alternate languages but there are caveats. These are cached and changing
59             # them after strtotime() is called won't affect anything. No one has requested
60             # alternate languages, so I'm leaving this undocumented for now.
61             our $MONTH_MAP = {
62             'jan' => 1, 'feb' => 2, 'mar' => 3, 'apr' => 4 , 'may' => 5 , 'jun' => 6 ,
63             'jul' => 7, 'aug' => 8, 'sep' => 9, 'oct' => 10, 'nov' => 11, 'dec' => 12,
64             };
65              
66             # See above
67             our $MONTH_REGEXP = qr/
68             Jan|January|Feb|February|Mar|March|Apr|April|May|Jun|June|
69             Jul|July|Aug|August|Sep|September|Oct|October|Nov|November|Dec|December
70             /ix;
71              
72             ###############################################################################
73             ###############################################################################
74             ###############################################################################
75              
76             =head1 NAME
77              
78             C - Provide string to unixtime conversions
79              
80             =head1 DESCRIPTION
81              
82             C provides a single function C which takes a datetime string
83             and returns a unixtime. Care was given to support the most modern style strings that you would
84             commonly find in log files or on the internet.
85              
86             =head1 USAGE
87              
88             use Date::Parse::Modern;
89              
90             C exports the C function automatically.
91              
92             =head1 FUNCTIONS
93              
94             =head2 strtotime($string)
95              
96             my $unixtime = strtotime('1979-02-24'); # 288691200
97              
98             Simply feed C a string with some type of date or time in it, and it will return an
99             integer unixtime. If the string is unparseable, or a weird error occurs, it will return C.
100              
101             All the "magic" in C is done using regular expressions that look for common datetime
102             formats. Common formats like YYYY-MM-DD and HH:II:SS are easily detected and converted to the
103             appropriate formats. This allows the date or time to be found anywhere in the string, in (almost) any
104             order. In all cases, the day of the week is ignored in the input string.
105              
106             B Strings without a year are assumed to be in the current year. Example: C
107              
108             B Strings with only a date are assumed to occur at midnight. Example: C<2023-01-15>
109              
110             B Strings with only time are assumed to be the current day. Example: C<10:15am>
111              
112             B In strings with numeric B textual time zone offsets, the numeric is used. Example:
113             C<14 Nov 1994 11:34:32 -0500 (EST)>
114              
115             =head1 Will you support XYZ format?
116              
117             Everyone has their B date/time format, and we'd like to support as many
118             as possible. We have tried to support as much of
119             L as possible, but we
120             cannot support everything. Every new format we support runs the risk of slowing
121             down things for existing formats. You can submit a feature request on Github
122             for new formats but we may reject them if adding support would slow down others.
123              
124             =head1 Bugs/Features
125              
126             Please submit bugs and feature requests on Github:
127              
128             https://github.com/scottchiefbaker/perl-Date-Parse-Modern
129              
130             =head1 AUTHORS
131              
132             Scott Baker - https://www.perturb.org/
133              
134             =cut
135              
136             ###############################################################################
137             ###############################################################################
138             ###############################################################################
139              
140             # The logic here is that we use regular expressions to pull out various patterns
141             # YYYY/MM/DD, H:I:S, DD MonthWord YYYY
142             sub strtotime {
143 43     43 1 201 my ($str, $debug) = @_;
144              
145 43 100       105 if (!defined($str)) {
146 1         4 return undef;
147             }
148              
149 42         86 my ($year, $month, $day) = (0, 0, 0);
150 42         74 my ($hour, $min , $sec, $ms) = (0, 0, 0, 0);
151              
152             ###########################################################################
153             ###########################################################################
154              
155 42         104 state $rule_1 = qr/
156             \b
157             ((\d{4})$sep(\d{2})$sep(\d{2}) # YYYY-MM-DD
158             |
159             (\d{2})$sep(\d{2})$sep(\d{4})) # DD-MM-YYYY
160             /x;
161              
162             # First we look to see if we have anything that mathches YYYY-MM-DD (numerically)
163 42 100       341 if ($str =~ $rule_1) {
164             # YYYY-MM-DD: 1999-12-24
165 15 100 66     61 if ($2 || $3) {
166 14         39 $year = $2;
167 14         20 $month = $3;
168 14         30 $day = $4;
169             }
170              
171             # DD-MM-YYYY: 12-24-1999
172 15 100 66     66 if ($5 || $6) {
173 1         2 $day = $5;
174 1         3 $month = $6;
175 1         3 $year = $7;
176              
177             # It might be American format (MM-DD-YYYY) so we do a quick flip/flop
178 1 50       4 if ($month > 12) {
179 1         3 ($day, $month) = ($month, $day);
180             }
181             }
182             }
183              
184             # The year may be on the end of the string: Sat May 8 21:24:31 2021
185 42 100       99 if (!$year) {
186 27         84 ($year) = $str =~ m/\s(\d{4})\b/;
187             }
188              
189             ###########################################################################
190              
191 42         215 state $rule_2 = qr/
192             (\d{1,2})? # Maybe some digits before month
193             \s*
194             ($MONTH_REGEXP) # A textual month
195             \s+
196             (\d{1,4}) # Digits
197             [\s\$] # Whitespace OR end of line
198             ((\d{2}|\d{4})[ \$])? # If there are two or four digits ater it's a year
199             /x;
200              
201             # Next we look for alpha months followed by a digit if we didn't find a numeric month above
202             # This will find: "April 13" and also "13 April 1995"
203 42 100 100     325 if (!$month && $str =~ $rule_2) {
204              
205             # Get the numerical number for this month
206 17         58 my $month_name = lc(substr($2,0,3));
207 17         38 $month = $MONTH_MAP->{$month_name};
208              
209             # 17 March 94
210 17 100       47 if ($1) {
211 4         11 $day = int($1);
212 4         70 $year = int($3);
213             # April 13 or April 13 94
214             } else {
215 13         33 $day = int($3);
216              
217             # *IF* we still don't have a year
218 13 100       30 if (!$year) {
219 5   100     17 my $part = $5 || 0;
220 5         10 $year = int($part)
221             }
222             }
223             }
224              
225             ###########################################################################
226              
227             # Alternate date string like like: 21/dec/93 or dec/21/93 much less common
228 42 100 100     265 if (!$month && $str =~ /(.*)($MONTH_REGEXP)(.*)/) {
229 5         15 my $before = $1;
230 5         9 my $after = $3;
231              
232             # Lookup the numeric month based on the string name
233 5   50     23 $month = $MONTH_MAP->{lc($2)} || 0;
234              
235             # Month starts string: dec/21/93 or feb/14/1999
236 5 100 33     22 if ($before eq "") {
    50          
237 2 50       36 if ($after =~ m/(\d{2})$sep(\d{2,4})/) {
238 2         6 $day = $1;
239 2         4 $year = $2;
240             }
241             # Month in the middle: 21/dec/93
242             } elsif ($before && $after) {
243 3         12 $before =~ m/(\d+)\D/; # Just the digits
244 3   50     11 $day = $1 || 0;
245              
246 3         8 $after =~ m/\D(\d{2,4})(.)/; # Get the digits AFTER the separator
247              
248             # If it's not a time (has a colon) it's the year
249 3 100       13 if ($2 ne ":") {
250 1         2 $year = $1;
251             }
252             }
253             }
254              
255             ###########################################################################
256              
257 42         68 state $rule_3 = qr/
258             (\b|T) # Anchor point
259             (\d{1,2}): # Hours
260             (\d{1,2}):? # Minutes
261             (\d{2}(Z|\.\d+)?)? # Seconds (optional)
262             \ ?(am|pm|AM|PM)? # AMPM (optional)
263             /x;
264              
265             # Now we look for times: 10:14, 10:14:17, 08:15pm
266 42 100       294 if ($str =~ $rule_3) {
267 32         83 $hour = int($2);
268 32         55 $min = int($3);
269 32   100     105 $sec = $4 || 0; # Not int() cuz it might be float for milliseconds
270 32         69 $sec =~ s/Z$//; # Remove and Z at the end
271              
272             # The string of AM or PM
273 32   100     104 my $ampm = lc($6 || "");
274              
275             # PM means add 12 hours
276 32 100       105 if ($ampm eq "pm") {
277 1         3 $hour += 12;
278             }
279              
280             # 12:15am = 00:15 / 12:15pm = 12:15 so we have to compensate
281 32 50 66     82 if ($ampm && ($hour == 24 || $hour == 12)) {
      100        
282 2         3 $hour -= 12;
283             }
284             }
285              
286             # Just some basic sanity checking
287 42   100     127 my $has_time = ($hour || $min || $sec);
288 42   66     106 my $has_date = ($year || $month || $day);
289              
290 42 100 100     92 if (!$has_time && !$has_date) {
291             # One final check if NOTHING else has matched, we lookup a weird format: 20020722T100000Z
292 4 100       14 if ($str =~ m/(\d{4})(\d{2})(\d{2})T(\d\d)(\d\d)(\d\d)Z/) {
293 1         3 $year = $1;
294 1         2 $month = $2;
295 1         3 $day = $3;
296              
297 1         3 $hour = $4;
298 1         2 $min = $5;
299 1         1 $sec = $6;
300             } else {
301 3         17 return undef;
302             }
303             }
304              
305             ###########################################################################
306             ###########################################################################
307              
308             # If there is no month, we assume the current month
309 39 100       69 if (!$month) {
310 1         25 $month = (localtime())[4] + 1;
311             }
312              
313             # If there is no day, we assume the current day
314 39 100       77 if (!$day) {
315 1         14 $day = (localtime())[3];
316             }
317              
318             # If we STILL don't have a year it may be a time only string so we assume it's the current year
319 39 100       70 if (!$year) {
320 5         128 $year = (localtime())[5] + 1900;
321             }
322              
323             # Convert any two digit years to four digits
324 39 100       101 if ($year < 100) {
325 8         10 $year += 1900;
326             }
327              
328             # Time::Local doesn't support fractional seconds, so we make an int version
329             # and then add the ms after the timegm_modern() conversion
330 39         69 $ms = $sec - int($sec);
331 39         51 $sec = int($sec);
332              
333             # If we have all the requisite pieces we build a unixtime
334 39         48 my $ret;
335 39         56 my $ok = eval {
336 39         134 $ret = Time::Local::timegm_modern($sec, $min, $hour, $day, $month - 1, $year);
337              
338 39         1521 return 1;
339             };
340             # This has to be *immediately* after the eval or something else might
341             # tromp on the error message
342 39         78 my $err = $@;
343              
344 39 50 33     85 if ($err && $err =~ /Undefined subroutine/) {
345 0         0 print STDERR $err;
346 0         0 return undef;
347             };
348              
349 39         51 $ret += $ms;
350              
351             # If we find a timezone offset we take that in to account now
352             # Either: +1000 or -0700
353             # or
354             # 11:53 PST (One to four chars after a time)
355 39         54 my $tz_offset_seconds = 0;
356 39         55 my $tz_str = '';
357 39         50 state $tz_rule = qr/
358             (
359             (\s|:\d\d) # Start AFTER a space, or time (:12)
360             ([+-])(\d{1,2})(\d{2}) # +1000 or -700 (three or four digits)
361             |
362             \d{2}\ # Only match chars if they're AFTER a time
363             ([A-Z]{1,4})\b # Capitalized TZ at end of string
364             |
365             \d{2}(Z)$ # Just a simple Z at the end
366             )
367             /x;
368              
369             # If we have a string with a timezone piece
370 39 100 100     502 if ($ret && $str =~ $tz_rule) {
    100          
371 17         30 my $str_offset = 0;
372              
373             # String timezone: 11:53 PST
374 17 100 100     67 if ($6 || $7) {
375             # Whichever form matches, the TZ is that one
376 11   50     32 my $tz_code = $6 || $7 || '';
377              
378             # Lookup the timezone offset in the table
379 11   100     41 $str_offset = $TZ_OFFSET->{$tz_code} || 0;
380             # Timezone offsets are in hours, so we convert to seconds
381 11         17 $str_offset *= 3600;
382              
383 11         18 $tz_str = $tz_code;
384             # Numeric format: +1000 or -0700
385             } else {
386             # Break the input string into parts so we can do math
387             # +1000 = 10 hours, -0700 = 7 hours, +0430 = 4.5 hours
388 6         20 $str_offset = ($4 + ($5 / 60)) * 3600;
389              
390 6 100       19 if ($3 eq "-") {
391 5         7 $str_offset *= -1;
392             }
393              
394 6         19 $tz_str = "$3$4$5";
395             }
396              
397 17         25 $tz_offset_seconds = $str_offset;
398             # No timezone info found so we assume the local timezone
399             } elsif ($ret) {
400 20         47 my $local_offset = get_local_offset($ret);
401              
402 20         33 $tz_offset_seconds = $local_offset;
403 20         33 $tz_str = 'UNSPECIFIED';
404             }
405              
406             # Subtract the timezone offset from the unixtime
407 39         66 $ret -= $tz_offset_seconds;
408              
409 39 50       72 if ($debug) {
410 0         0 my $color = "\e[38;5;45m";
411 0         0 my $reset = "\e[0m";
412 0         0 my $header = sprintf("%*s = YYYY-MM-DD HH:II:SS (timezone offset)", length($str) + 2, "Input string");
413 0   0     0 my $output = sprintf("'%s' = %02d-%02d-%02d %02d:%02d:%02d (%s = %d seconds)", $str, $year || -1, $month || -1, $day || -1, $hour, $min, $sec, $tz_str, $tz_offset_seconds);
      0        
      0        
414              
415 0         0 print STDERR $color . $header . $reset . "\n";
416 0         0 print STDERR $output . "\n";
417             }
418              
419              
420 39         184 return $ret;
421             }
422              
423             # Return the timezone offset for the local machine
424             sub get_local_offset {
425 20     20 0 29 my $unixtime = $_[0];
426              
427             # If we have a forced LOCAL_TZ_OFFSET we use that (unit tests)
428 20 50       42 if (defined($LOCAL_TZ_OFFSET)) {
429 20         88 return $LOCAL_TZ_OFFSET;
430             }
431              
432             # Since timezones only change on the half-hour (at most), we
433             # round down the nearest half hour "bucket" and then cache
434             # that result. We probably could get away with a full hour
435             # here but we don't gain much performance/memory by doing that
436 0           my $bucket_size = 1800;
437 0           my $cache_key = $unixtime - ($unixtime % $bucket_size);
438              
439             # Simple memoizing (improves repeated performance a LOT)
440             # Note: this is even faster than `use Memoize`
441 0           state $x = {};
442 0 0 0       if ($USE_TZ_CACHE && $x->{$cache_key}) {
443 0           return $x->{$cache_key};
444             }
445              
446             # Get a time obj for this local timezone and UTC for the Unixtime
447             # Then compare the two to get the local TZ offset
448 0           my @t = localtime($unixtime);
449 0           my $ret = (Time::Local::timegm(@t) - Time::Local::timelocal(@t));
450              
451             # Cache the result
452 0 0         if ($USE_TZ_CACHE) {
453 0           $x->{$cache_key} = $ret;
454             }
455              
456 0           return $ret;
457             }
458              
459             1;
460              
461             __END__