File Coverage

blib/lib/Date/Parse/Modern.pm

Criterion	Covered	Total	%
statement	123	141	87.2
branch	53	64	82.8
condition	42	62	67.7
subroutine	8	8	100.0
pod	1	2	50.0
total	227	277	81.9

line	stmt	bran	cond	sub	pod	time	code
1							#!/usr/bin/env perl
2
3							package Date::Parse::Modern;
4
5	2			2		141506	use strict;
	2					14
	2					60
6	2			2		10	use warnings;
	2					5
	2					50
7	2			2		26	use v5.10;
	2					7
8
9	2			2		10	use Carp;
	2					4
	2					202
10	2			2		991	use Time::Local 1.26;
	2					4984
	2					117
11	2			2		15	use Exporter 'import';
	2					4
	2					3814
12							our @EXPORT = ('strtotime');
13
14							###############################################################################
15
16							# https://pause.perl.org/pause/query?ACTION=pause_operating_model#3_5_factors_considering_in_the_indexing_phase
17							our $VERSION = 0.5;
18
19							# https://timezonedb.com/download
20							my $TZ_OFFSET = {
21							'ACDT' => 10, 'ACST' => 9, 'ACT' => -5, 'ACWST' => 8, 'ADT' => -3, 'AEDT' => 11, 'AEST' => 10, 'AFT' => 4,
22							'AKDT' => -8, 'AKST' => -9, 'ALMT' => 6, 'AMST' => 5, 'AMT' => 4, 'ANAST' => 12, 'ANAT' => 12, 'AQTT' => 5,
23							'ART' => -3, 'AST' => -4, 'AWDT' => 9, 'AWST' => 8, 'AZOST' => 0, 'AZOT' => -1, 'AZST' => 5, 'AZT' => 4,
24							'AoE' => -12, 'BNT' => 8, 'BOT' => -4, 'BRST' => -2, 'BRT' => -3, 'BST' => 1, 'BTT' => 6, 'CAST' => 8,
25							'CAT' => 2, 'CCT' => 6, 'CDT' => -5, 'CEST' => 2, 'CET' => 1, 'CHADT' => 13, 'CHAST' => 12, 'CHOST' => 9,
26							'CHOT' => 8, 'CHUT' => 10, 'CIDST' => -4, 'CIST' => -5, 'CKT' => -10, 'CLST' => -3, 'CLT' => -4, 'COT' => -5,
27							'CST' => -6, 'CVT' => -1, 'CXT' => 7, 'ChST' => 10, 'DAVT' => 7, 'DDUT' => 10, 'EASST' => -5, 'EAST' => -6,
28							'EAT' => 3, 'ECT' => -5, 'EDT' => -4, 'EEST' => 3, 'EET' => 2, 'EGST' => 0, 'EGT' => -1, 'EST' => -5,
29							'FET' => 3, 'FJST' => 13, 'FJT' => 12, 'FKST' => -3, 'FKT' => -4, 'FNT' => -2, 'GALT' => -6, 'GAMT' => -9,
30							'GET' => 4, 'GFT' => -3, 'GILT' => 12, 'GMT' => 0, 'GST' => -2, 'GYT' => -4, 'HDT' => -9, 'HKT' => 8,
31							'HOVST' => 8, 'HOVT' => 7, 'HST' => -10, 'ICT' => 7, 'IDT' => 3, 'IOT' => 6, 'IRDT' => 4, 'IRKST' => 9,
32							'IRKT' => 8, 'IRST' => 3, 'IST' => 2, 'JST' => 9, 'KGT' => 6, 'KOST' => 11, 'KRAST' => 8, 'KRAT' => 7,
33							'KST' => 9, 'KUYT' => 4, 'LHDT' => 11, 'LHST' => 10, 'LINT' => 14, 'MAGST' => 12, 'MAGT' => 11, 'MART' => -9,
34							'MAWT' => 5, 'MDT' => -6, 'MHT' => 12, 'MMT' => 6, 'MSD' => 4, 'MSK' => 3, 'MST' => -7, 'MUT' => 4,
35							'MVT' => 5, 'MYT' => 8, 'NCT' => 11, 'NDT' => -2, 'NFDT' => 12, 'NFT' => 11, 'NOVST' => 7, 'NOVT' => 7,
36							'NPT' => 5, 'NRT' => 12, 'NST' => -3, 'NUT' => -11, 'NZDT' => 13, 'NZST' => 12, 'OMSST' => 7, 'OMST' => 6,
37							'ORAT' => 5, 'PDT' => -7, 'PET' => -5, 'PETST' => 12, 'PETT' => 12, 'PGT' => 10, 'PHOT' => 13, 'PHT' => 8,
38							'PKT' => 5, 'PMDT' => -2, 'PMST' => -3, 'PONT' => 11, 'PST' => -8, 'PWT' => 9, 'PYST' => -3, 'PYT' => 8,
39							'QYZT' => 6, 'RET' => 4, 'ROTT' => -3, 'SAKT' => 11, 'SAMT' => 4, 'SAST' => 2, 'SBT' => 11, 'SCT' => 4,
40							'SGT' => 8, 'SRET' => 11, 'SRT' => -3, 'SST' => -11, 'SYOT' => 3, 'TAHT' => -10, 'TFT' => 5, 'TJT' => 5,
41							'TKT' => 13, 'TLT' => 9, 'TMT' => 5, 'TOST' => 14, 'TOT' => 13, 'TRT' => 3, 'TVT' => 12, 'ULAST' => 9,
42							'ULAT' => 8, 'UYST' => -2, 'UYT' => -3, 'UZT' => 5, 'VET' => -4, 'VLAST' => 11, 'VLAT' => 10, 'VOST' => 6,
43							'VUT' => 11, 'WAKT' => 12, 'WARST' => -3, 'WAST' => 2, 'WAT' => 1, 'WEST' => 1, 'WET' => 0, 'WFT' => 12,
44							'WGST' => -2, 'WGT' => -3, 'WIB' => 7, 'WIT' => 9, 'WITA' => 8, 'WST' => 1, 'YAKST' => 10, 'YAKT' => 9,
45							'YAPT' => 10, 'YEKST' => 6, 'YEKT' => 5, 'Z' => 0,
46							};
47
48							# Separator between dates pieces: '-' or '/' or '\'
49							my $sep = qr/[\/\\-]/;
50
51							# Force a local timezone offset (used for unit tests)
52							our $LOCAL_TZ_OFFSET = undef;
53
54							# Use caching for repeated lookups for the same TZ offset
55							our $USE_TZ_CACHE = 1;
56
57							# These are undocumented package variables. They could be changed to support
58							# alternate languages but there are caveats. These are cached and changing
59							# them after strtotime() is called won't affect anything. No one has requested
60							# alternate languages, so I'm leaving this undocumented for now.
61							our $MONTH_MAP = {
62							'jan' => 1, 'feb' => 2, 'mar' => 3, 'apr' => 4 , 'may' => 5 , 'jun' => 6 ,
63							'jul' => 7, 'aug' => 8, 'sep' => 9, 'oct' => 10, 'nov' => 11, 'dec' => 12,
64							};
65
66							# See above
67							our $MONTH_REGEXP = qr/
68							Jan\|January\|Feb\|February\|Mar\|March\|Apr\|April\|May\|Jun\|June\|
69							Jul\|July\|Aug\|August\|Sep\|September\|Oct\|October\|Nov\|November\|Dec\|December
70							/ix;
71
72							###############################################################################
73							###############################################################################
74							###############################################################################
75
76							=head1 NAME
77
78							C - Provide string to unixtime conversions
79
80							=head1 DESCRIPTION
81
82							C provides a single function C which takes a datetime string
83							and returns a unixtime. Care was given to support the most modern style strings that you would
84							commonly find in log files or on the internet.
85
86							=head1 USAGE
87
88							use Date::Parse::Modern;
89
90							C exports the C function automatically.
91
92							=head1 FUNCTIONS
93
94							=head2 strtotime($string)
95
96							my $unixtime = strtotime('1979-02-24'); # 288691200
97
98							Simply feed C a string with some type of date or time in it, and it will return an
99							integer unixtime. If the string is unparseable, or a weird error occurs, it will return C.
100
101							All the "magic" in C is done using regular expressions that look for common datetime
102							formats. Common formats like YYYY-MM-DD and HH:II:SS are easily detected and converted to the
103							appropriate formats. This allows the date or time to be found anywhere in the string, in (almost) any
104							order. In all cases, the day of the week is ignored in the input string.
105
106							B Strings without a year are assumed to be in the current year. Example: C
107
108							B Strings with only a date are assumed to occur at midnight. Example: C<2023-01-15>
109
110							B Strings with only time are assumed to be the current day. Example: C<10:15am>
111
112							B In strings with numeric B textual time zone offsets, the numeric is used. Example:
113							C<14 Nov 1994 11:34:32 -0500 (EST)>
114
115							=head1 Will you support XYZ format?
116
117							Everyone has their B date/time format, and we'd like to support as many
118							as possible. We have tried to support as much of
119							L as possible, but we
120							cannot support everything. Every new format we support runs the risk of slowing
121							down things for existing formats. You can submit a feature request on Github
122							for new formats but we may reject them if adding support would slow down others.
123
124							=head1 Bugs/Features
125
126							Please submit bugs and feature requests on Github:
127
128							https://github.com/scottchiefbaker/perl-Date-Parse-Modern
129
130							=head1 AUTHORS
131
132							Scott Baker - https://www.perturb.org/
133
134							=cut
135
136							###############################################################################
137							###############################################################################
138							###############################################################################
139
140							# The logic here is that we use regular expressions to pull out various patterns
141							# YYYY/MM/DD, H:I:S, DD MonthWord YYYY
142							sub strtotime {
143	43			43	1	201	my ($str, $debug) = @_;
144
145	43	100				105	if (!defined($str)) {
146	1					4	return undef;
147							}
148
149	42					86	my ($year, $month, $day) = (0, 0, 0);
150	42					74	my ($hour, $min , $sec, $ms) = (0, 0, 0, 0);
151
152							###########################################################################
153							###########################################################################
154
155	42					104	state $rule_1 = qr/
156							\b
157							((\d{4})$sep(\d{2})$sep(\d{2}) # YYYY-MM-DD
158							\|
159							(\d{2})$sep(\d{2})$sep(\d{4})) # DD-MM-YYYY
160							/x;
161
162							# First we look to see if we have anything that mathches YYYY-MM-DD (numerically)
163	42	100				341	if ($str =~ $rule_1) {
164							# YYYY-MM-DD: 1999-12-24
165	15	100	66			61	if ($2 \|\| $3) {
166	14					39	$year = $2;
167	14					20	$month = $3;
168	14					30	$day = $4;
169							}
170
171							# DD-MM-YYYY: 12-24-1999
172	15	100	66			66	if ($5 \|\| $6) {
173	1					2	$day = $5;
174	1					3	$month = $6;
175	1					3	$year = $7;
176
177							# It might be American format (MM-DD-YYYY) so we do a quick flip/flop
178	1	50				4	if ($month > 12) {
179	1					3	($day, $month) = ($month, $day);
180							}
181							}
182							}
183
184							# The year may be on the end of the string: Sat May 8 21:24:31 2021
185	42	100				99	if (!$year) {
186	27					84	($year) = $str =~ m/\s(\d{4})\b/;
187							}
188
189							###########################################################################
190
191	42					215	state $rule_2 = qr/
192							(\d{1,2})? # Maybe some digits before month
193							\s*
194							($MONTH_REGEXP) # A textual month
195							\s+
196							(\d{1,4}) # Digits
197							[\s\$] # Whitespace OR end of line
198							((\d{2}\|\d{4})[ \$])? # If there are two or four digits ater it's a year
199							/x;
200
201							# Next we look for alpha months followed by a digit if we didn't find a numeric month above
202							# This will find: "April 13" and also "13 April 1995"
203	42	100	100			325	if (!$month && $str =~ $rule_2) {
204
205							# Get the numerical number for this month
206	17					58	my $month_name = lc(substr($2,0,3));
207	17					38	$month = $MONTH_MAP->{$month_name};
208
209							# 17 March 94
210	17	100				47	if ($1) {
211	4					11	$day = int($1);
212	4					70	$year = int($3);
213							# April 13 or April 13 94
214							} else {
215	13					33	$day = int($3);
216
217							# IF we still don't have a year
218	13	100				30	if (!$year) {
219	5		100			17	my $part = $5 \|\| 0;
220	5					10	$year = int($part)
221							}
222							}
223							}
224
225							###########################################################################
226
227							# Alternate date string like like: 21/dec/93 or dec/21/93 much less common
228	42	100	100			265	if (!$month && $str =~ /(.)($MONTH_REGEXP)(.)/) {
229	5					15	my $before = $1;
230	5					9	my $after = $3;
231
232							# Lookup the numeric month based on the string name
233	5		50			23	$month = $MONTH_MAP->{lc($2)} \|\| 0;
234
235							# Month starts string: dec/21/93 or feb/14/1999
236	5	100	33			22	if ($before eq "") {
		50
237	2	50				36	if ($after =~ m/(\d{2})$sep(\d{2,4})/) {
238	2					6	$day = $1;
239	2					4	$year = $2;
240							}
241							# Month in the middle: 21/dec/93
242							} elsif ($before && $after) {
243	3					12	$before =~ m/(\d+)\D/; # Just the digits
244	3		50			11	$day = $1 \|\| 0;
245
246	3					8	$after =~ m/\D(\d{2,4})(.)/; # Get the digits AFTER the separator
247
248							# If it's not a time (has a colon) it's the year
249	3	100				13	if ($2 ne ":") {
250	1					2	$year = $1;
251							}
252							}
253							}
254
255							###########################################################################
256
257	42					68	state $rule_3 = qr/
258							(\b\|T) # Anchor point
259							(\d{1,2}): # Hours
260							(\d{1,2}):? # Minutes
261							(\d{2}(Z\|\.\d+)?)? # Seconds (optional)
262							\ ?(am\|pm\|AM\|PM)? # AMPM (optional)
263							/x;
264
265							# Now we look for times: 10:14, 10:14:17, 08:15pm
266	42	100				294	if ($str =~ $rule_3) {
267	32					83	$hour = int($2);
268	32					55	$min = int($3);
269	32		100			105	$sec = $4 \|\| 0; # Not int() cuz it might be float for milliseconds
270	32					69	$sec =~ s/Z$//; # Remove and Z at the end
271
272							# The string of AM or PM
273	32		100			104	my $ampm = lc($6 \|\| "");
274
275							# PM means add 12 hours
276	32	100				105	if ($ampm eq "pm") {
277	1					3	$hour += 12;
278							}
279
280							# 12:15am = 00:15 / 12:15pm = 12:15 so we have to compensate
281	32	50	66			82	if ($ampm && ($hour == 24 \|\| $hour == 12)) {
			100
282	2					3	$hour -= 12;
283							}
284							}
285
286							# Just some basic sanity checking
287	42		100			127	my $has_time = ($hour \|\| $min \|\| $sec);
288	42		66			106	my $has_date = ($year \|\| $month \|\| $day);
289
290	42	100	100			92	if (!$has_time && !$has_date) {
291							# One final check if NOTHING else has matched, we lookup a weird format: 20020722T100000Z
292	4	100				14	if ($str =~ m/(\d{4})(\d{2})(\d{2})T(\d\d)(\d\d)(\d\d)Z/) {
293	1					3	$year = $1;
294	1					2	$month = $2;
295	1					3	$day = $3;
296
297	1					3	$hour = $4;
298	1					2	$min = $5;
299	1					1	$sec = $6;
300							} else {
301	3					17	return undef;
302							}
303							}
304
305							###########################################################################
306							###########################################################################
307
308							# If there is no month, we assume the current month
309	39	100				69	if (!$month) {
310	1					25	$month = (localtime())[4] + 1;
311							}
312
313							# If there is no day, we assume the current day
314	39	100				77	if (!$day) {
315	1					14	$day = (localtime())[3];
316							}
317
318							# If we STILL don't have a year it may be a time only string so we assume it's the current year
319	39	100				70	if (!$year) {
320	5					128	$year = (localtime())[5] + 1900;
321							}
322
323							# Convert any two digit years to four digits
324	39	100				101	if ($year < 100) {
325	8					10	$year += 1900;
326							}
327
328							# Time::Local doesn't support fractional seconds, so we make an int version
329							# and then add the ms after the timegm_modern() conversion
330	39					69	$ms = $sec - int($sec);
331	39					51	$sec = int($sec);
332
333							# If we have all the requisite pieces we build a unixtime
334	39					48	my $ret;
335	39					56	my $ok = eval {
336	39					134	$ret = Time::Local::timegm_modern($sec, $min, $hour, $day, $month - 1, $year);
337
338	39					1521	return 1;
339							};
340							# This has to be immediately after the eval or something else might
341							# tromp on the error message
342	39					78	my $err = $@;
343
344	39	50	33			85	if ($err && $err =~ /Undefined subroutine/) {
345	0					0	print STDERR $err;
346	0					0	return undef;
347							};
348
349	39					51	$ret += $ms;
350
351							# If we find a timezone offset we take that in to account now
352							# Either: +1000 or -0700
353							# or
354							# 11:53 PST (One to four chars after a time)
355	39					54	my $tz_offset_seconds = 0;
356	39					55	my $tz_str = '';
357	39					50	state $tz_rule = qr/
358							(
359							(\s\|:\d\d) # Start AFTER a space, or time (:12)
360							([+-])(\d{1,2})(\d{2}) # +1000 or -700 (three or four digits)
361							\|
362							\d{2}\ # Only match chars if they're AFTER a time
363							([A-Z]{1,4})\b # Capitalized TZ at end of string
364							\|
365							\d{2}(Z)$ # Just a simple Z at the end
366							)
367							/x;
368
369							# If we have a string with a timezone piece
370	39	100	100			502	if ($ret && $str =~ $tz_rule) {
		100
371	17					30	my $str_offset = 0;
372
373							# String timezone: 11:53 PST
374	17	100	100			67	if ($6 \|\| $7) {
375							# Whichever form matches, the TZ is that one
376	11		50			32	my $tz_code = $6 \|\| $7 \|\| '';
377
378							# Lookup the timezone offset in the table
379	11		100			41	$str_offset = $TZ_OFFSET->{$tz_code} \|\| 0;
380							# Timezone offsets are in hours, so we convert to seconds
381	11					17	$str_offset *= 3600;
382
383	11					18	$tz_str = $tz_code;
384							# Numeric format: +1000 or -0700
385							} else {
386							# Break the input string into parts so we can do math
387							# +1000 = 10 hours, -0700 = 7 hours, +0430 = 4.5 hours
388	6					20	$str_offset = ($4 + ($5 / 60)) * 3600;
389
390	6	100				19	if ($3 eq "-") {
391	5					7	$str_offset *= -1;
392							}
393
394	6					19	$tz_str = "$3$4$5";
395							}
396
397	17					25	$tz_offset_seconds = $str_offset;
398							# No timezone info found so we assume the local timezone
399							} elsif ($ret) {
400	20					47	my $local_offset = get_local_offset($ret);
401
402	20					33	$tz_offset_seconds = $local_offset;
403	20					33	$tz_str = 'UNSPECIFIED';
404							}
405
406							# Subtract the timezone offset from the unixtime
407	39					66	$ret -= $tz_offset_seconds;
408
409	39	50				72	if ($debug) {
410	0					0	my $color = "\e[38;5;45m";
411	0					0	my $reset = "\e[0m";
412	0					0	my $header = sprintf("%*s = YYYY-MM-DD HH:II:SS (timezone offset)", length($str) + 2, "Input string");
413	0		0			0	my $output = sprintf("'%s' = %02d-%02d-%02d %02d:%02d:%02d (%s = %d seconds)", $str, $year \|\| -1, $month \|\| -1, $day \|\| -1, $hour, $min, $sec, $tz_str, $tz_offset_seconds);
			0
			0
414
415	0					0	print STDERR $color . $header . $reset . "\n";
416	0					0	print STDERR $output . "\n";
417							}
418
419
420	39					184	return $ret;
421							}
422
423							# Return the timezone offset for the local machine
424							sub get_local_offset {
425	20			20	0	29	my $unixtime = $_[0];
426
427							# If we have a forced LOCAL_TZ_OFFSET we use that (unit tests)
428	20	50				42	if (defined($LOCAL_TZ_OFFSET)) {
429	20					88	return $LOCAL_TZ_OFFSET;
430							}
431
432							# Since timezones only change on the half-hour (at most), we
433							# round down the nearest half hour "bucket" and then cache
434							# that result. We probably could get away with a full hour
435							# here but we don't gain much performance/memory by doing that
436	0						my $bucket_size = 1800;
437	0						my $cache_key = $unixtime - ($unixtime % $bucket_size);
438
439							# Simple memoizing (improves repeated performance a LOT)
440							# Note: this is even faster than `use Memoize`
441	0						state $x = {};
442	0	0	0				if ($USE_TZ_CACHE && $x->{$cache_key}) {
443	0						return $x->{$cache_key};
444							}
445
446							# Get a time obj for this local timezone and UTC for the Unixtime
447							# Then compare the two to get the local TZ offset
448	0						my @t = localtime($unixtime);
449	0						my $ret = (Time::Local::timegm(@t) - Time::Local::timelocal(@t));
450
451							# Cache the result
452	0	0					if ($USE_TZ_CACHE) {
453	0						$x->{$cache_key} = $ret;
454							}
455
456	0						return $ret;
457							}
458
459							1;
460
461							__END__