File Coverage

blib/lib/Date/Parse/Modern.pm

Criterion	Covered	Total	%
statement	119	135	88.1
branch	45	56	80.3
condition	49	76	64.4
subroutine	8	8	100.0
pod	1	2	50.0
total	222	277	80.1

line	stmt	bran	cond	sub	pod	time	code
1							#!/usr/bin/env perl
2
3							package Date::Parse::Modern;
4
5	2			2		131376	use strict;
	2					16
	2					59
6	2			2		11	use warnings;
	2					3
	2					45
7	2			2		26	use v5.10;
	2					6
8
9	2			2		11	use Carp;
	2					3
	2					185
10	2			2		951	use Time::Local 1.26;
	2					4883
	2					116
11	2			2		16	use Exporter 'import';
	2					4
	2					3761
12							our @EXPORT = ('strtotime');
13
14							###############################################################################
15
16							# https://pause.perl.org/pause/query?ACTION=pause_operating_model#3_5_factors_considering_in_the_indexing_phase
17							our $VERSION = 0.4;
18
19							# https://timezonedb.com/download
20							my $TZ_OFFSET = {
21							'ACDT' => 10, 'ACST' => 9, 'ACT' => -5, 'ACWST' => 8, 'ADT' => -3, 'AEDT' => 11, 'AEST' => 10, 'AFT' => 4,
22							'AKDT' => -8, 'AKST' => -9, 'ALMT' => 6, 'AMST' => 5, 'AMT' => 4, 'ANAST' => 12, 'ANAT' => 12, 'AQTT' => 5,
23							'ART' => -3, 'AST' => -4, 'AWDT' => 9, 'AWST' => 8, 'AZOST' => 0, 'AZOT' => -1, 'AZST' => 5, 'AZT' => 4,
24							'AoE' => -12, 'BNT' => 8, 'BOT' => -4, 'BRST' => -2, 'BRT' => -3, 'BST' => 1, 'BTT' => 6, 'CAST' => 8,
25							'CAT' => 2, 'CCT' => 6, 'CDT' => -5, 'CEST' => 2, 'CET' => 1, 'CHADT' => 13, 'CHAST' => 12, 'CHOST' => 9,
26							'CHOT' => 8, 'CHUT' => 10, 'CIDST' => -4, 'CIST' => -5, 'CKT' => -10, 'CLST' => -3, 'CLT' => -4, 'COT' => -5,
27							'CST' => -6, 'CVT' => -1, 'CXT' => 7, 'ChST' => 10, 'DAVT' => 7, 'DDUT' => 10, 'EASST' => -5, 'EAST' => -6,
28							'EAT' => 3, 'ECT' => -5, 'EDT' => -4, 'EEST' => 3, 'EET' => 2, 'EGST' => 0, 'EGT' => -1, 'EST' => -5,
29							'FET' => 3, 'FJST' => 13, 'FJT' => 12, 'FKST' => -3, 'FKT' => -4, 'FNT' => -2, 'GALT' => -6, 'GAMT' => -9,
30							'GET' => 4, 'GFT' => -3, 'GILT' => 12, 'GMT' => 0, 'GST' => -2, 'GYT' => -4, 'HDT' => -9, 'HKT' => 8,
31							'HOVST' => 8, 'HOVT' => 7, 'HST' => -10, 'ICT' => 7, 'IDT' => 3, 'IOT' => 6, 'IRDT' => 4, 'IRKST' => 9,
32							'IRKT' => 8, 'IRST' => 3, 'IST' => 2, 'JST' => 9, 'KGT' => 6, 'KOST' => 11, 'KRAST' => 8, 'KRAT' => 7,
33							'KST' => 9, 'KUYT' => 4, 'LHDT' => 11, 'LHST' => 10, 'LINT' => 14, 'MAGST' => 12, 'MAGT' => 11, 'MART' => -9,
34							'MAWT' => 5, 'MDT' => -6, 'MHT' => 12, 'MMT' => 6, 'MSD' => 4, 'MSK' => 3, 'MST' => -7, 'MUT' => 4,
35							'MVT' => 5, 'MYT' => 8, 'NCT' => 11, 'NDT' => -2, 'NFDT' => 12, 'NFT' => 11, 'NOVST' => 7, 'NOVT' => 7,
36							'NPT' => 5, 'NRT' => 12, 'NST' => -3, 'NUT' => -11, 'NZDT' => 13, 'NZST' => 12, 'OMSST' => 7, 'OMST' => 6,
37							'ORAT' => 5, 'PDT' => -7, 'PET' => -5, 'PETST' => 12, 'PETT' => 12, 'PGT' => 10, 'PHOT' => 13, 'PHT' => 8,
38							'PKT' => 5, 'PMDT' => -2, 'PMST' => -3, 'PONT' => 11, 'PST' => -8, 'PWT' => 9, 'PYST' => -3, 'PYT' => 8,
39							'QYZT' => 6, 'RET' => 4, 'ROTT' => -3, 'SAKT' => 11, 'SAMT' => 4, 'SAST' => 2, 'SBT' => 11, 'SCT' => 4,
40							'SGT' => 8, 'SRET' => 11, 'SRT' => -3, 'SST' => -11, 'SYOT' => 3, 'TAHT' => -10, 'TFT' => 5, 'TJT' => 5,
41							'TKT' => 13, 'TLT' => 9, 'TMT' => 5, 'TOST' => 14, 'TOT' => 13, 'TRT' => 3, 'TVT' => 12, 'ULAST' => 9,
42							'ULAT' => 8, 'UYST' => -2, 'UYT' => -3, 'UZT' => 5, 'VET' => -4, 'VLAST' => 11, 'VLAT' => 10, 'VOST' => 6,
43							'VUT' => 11, 'WAKT' => 12, 'WARST' => -3, 'WAST' => 2, 'WAT' => 1, 'WEST' => 1, 'WET' => 0, 'WFT' => 12,
44							'WGST' => -2, 'WGT' => -3, 'WIB' => 7, 'WIT' => 9, 'WITA' => 8, 'WST' => 1, 'YAKST' => 10, 'YAKT' => 9,
45							'YAPT' => 10, 'YEKST' => 6, 'YEKT' => 5, 'Z' => 0,
46							};
47
48							# Separator between dates pieces: '-' or '/' or '\'
49							my $sep = qr/[\/\\-]/;
50
51							# Force a local timezone offset (used for unit tests)
52							our $LOCAL_TZ_OFFSET = undef;
53
54							# Use caching for repeated lookups for the same TZ offset
55							our $USE_TZ_CACHE = 1;
56
57							# These are undocumented package variables. They could be changed to support
58							# alternate languages but there are caveats. These are cached and changing
59							# them after strtotime() is called won't affect anything. No one has requested
60							# alternate languages, so I'm leaving this undocumented for now.
61							our $MONTH_MAP = {
62							'jan' => 1, 'feb' => 2, 'mar' => 3, 'apr' => 4 , 'may' => 5 , 'jun' => 6 ,
63							'jul' => 7, 'aug' => 8, 'sep' => 9, 'oct' => 10, 'nov' => 11, 'dec' => 12,
64							};
65
66							# See above
67							our $MONTH_REGEXP = qr/
68							Jan\|January\|Feb\|February\|Mar\|March\|Apr\|April\|May\|Jun\|June\|
69							Jul\|July\|Aug\|August\|Sep\|September\|Oct\|October\|Nov\|November\|Dec\|December
70							/ix;
71
72							###############################################################################
73							###############################################################################
74							###############################################################################
75
76							=head1 NAME
77
78							C - Provide string to unixtime conversions
79
80							=head1 DESCRIPTION
81
82							C provides a single function C which takes a datetime string
83							and returns a unixtime. Care was given to support the most modern style strings that you would
84							commonly find in log files or on the internet.
85
86							=head1 USAGE
87
88							use Date::Parse::Modern;
89
90							C exports the C function automatically.
91
92							=head1 FUNCTIONS
93
94							=head2 strtotime($string)
95
96							my $unixtime = strtotime('1979-02-24'); # 288691200
97
98							Simply feed C a string with some type of date or time in it, and it will return an
99							integer unixtime. If the string is unparseable, or a weird error occurs, it will return C.
100
101							All the "magic" in C is done using regular expressions that look for common datetime
102							formats. Common formats like YYYY-MM-DD and HH:II:SS are easily detected and converted to the
103							appropriate formats. This allows the date or time to be found anywhere in the string, in (almost) any
104							order. In all cases, the day of the week is ignored in the input string.
105
106							B Strings without a year are assumed to be in the current year. Example: C
107
108							B Strings with only a date are assumed to occur at the midnight. Example: C<2023-01-15>
109
110							B Strings with only time are assumed to be the current day. Example: C<10:15am>
111
112							B In strings with numeric B textual time zone offsets, the numeric is used. Example:
113							C<14 Nov 1994 11:34:32 -0500 (EST)>
114
115							=head1 Bugs/Features
116
117							Please submit bugs and feature requests on Github:
118
119							https://github.com/scottchiefbaker/perl-Date-Parse-Modern
120
121							=head1 AUTHORS
122
123							Scott Baker - https://www.perturb.org/
124
125							=cut
126
127							###############################################################################
128							###############################################################################
129							###############################################################################
130
131							# The logic here is that we use regular expressions to pull out various patterns
132							# YYYY/MM/DD, H:I:S, DD MonthWord YYYY
133							sub strtotime {
134	38			38	1	188	my ($str, $debug) = @_;
135
136	38	100				101	if (!defined($str)) {
137	1					4	return undef;
138							}
139
140	37					64	my ($year, $month, $day) = (0, 0, 0);
141	37					65	my ($hour, $min , $sec, $ms) = (0, 0, 0, 0);
142
143							###########################################################################
144							###########################################################################
145
146	37					95	state $rule_1 = qr/
147							\b
148							((\d{4})$sep(\d{2})$sep(\d{2}) # YYYY-MM-DD
149							\|
150							(\d{2})$sep(\d{2})$sep(\d{4})) # DD-MM-YYYY
151							/x;
152
153							# First we look to see if we have anything that mathches YYYY-MM-DD (numerically)
154	37	100				301	if ($str =~ $rule_1) {
155							# YYYY-MM-DD: 1999-12-24
156	14	100	66			52	if ($2 \|\| $3) {
157	13					25	$year = $2;
158	13					20	$month = $3;
159	13					22	$day = $4;
160							}
161
162							# DD-MM-YYYY: 12-24-1999
163	14	100	66			61	if ($5 \|\| $6) {
164	1					2	$day = $5;
165	1					2	$month = $6;
166	1					3	$year = $7;
167
168							# It might be American format (MM-DD-YYYY) so we do a quick flip/flop
169	1	50				4	if ($month > 12) {
170	1					4	($day, $month) = ($month, $day);
171							}
172							}
173							}
174
175							# The year may be on the end of the string: Sat May 8 21:24:31 2021
176	37	100				79	if (!$year) {
177	23					72	($year) = $str =~ m/\s(\d{4})\b/;
178							}
179
180							###########################################################################
181
182	37					192	state $rule_2 = qr/
183							(\d{1,2})? # Maybe some digits before month
184							\s*
185							($MONTH_REGEXP) # A textual month
186							\s+
187							(\d{1,4}) # Digits
188							[\s\$] # Whitespace OR end of line
189							((\d{4}) )? # If there are digits ater the space it's 'Jan 13 2000'
190							/x;
191
192							# Next we look for alpha months followed by a digit if we didn't find a numeric month above
193							# This will find: "April 13" and also "13 April 1995"
194	37	100	100			269	if (!$month && $str =~ $rule_2) {
195
196							# Get the numerical number for this month
197	13					47	my $month_name = lc(substr($2,0,3));
198	13					25	$month = $MONTH_MAP->{$month_name};
199
200							# 17 March 94
201	13	100				31	if ($1) {
202	4					9	$day = int($1);
203	4					8	$year = int($3);
204							# April 13 or April 13 94
205							} else {
206	9					23	$day = int($3);
207
208							# IF there is a $5 it's a year
209	9		50			30	$year \|\|= int($5 \|\| 0);
			66
210							}
211							}
212
213							###########################################################################
214
215							# Alternate date string like like: 21/dec/93 or dec/21/93 much less common
216	37	100	100			249	if (!$month && $str =~ /(.)($MONTH_REGEXP)(.)/) {
217	5					14	my $before = $1;
218	5					12	my $after = $3;
219
220							# Lookup the numeric month based on the string name
221	5		50			20	$month = $MONTH_MAP->{lc($2)} \|\| 0;
222
223							# Month starts string: dec/21/93 or feb/14/1999
224	5	100	33			21	if ($before eq "") {
		50
225	2					28	$after =~ m/(\d{2})$sep(\d{2,4})/;
226
227	2					5	$day = $1;
228	2					5	$year = $2;
229
230							# Month in the middle: 21/dec/93
231							} elsif ($before && $after) {
232	3					9	$before =~ m/(\d+)\D/; # Just the digits
233	3		50			9	$day = $1 \|\| 0;
234
235	3					9	$after =~ m/\D(\d{2,4})(.)/; # Get the digits AFTER the separator
236
237							# If it's not a time (has a colon) it's the year
238	3	100				10	if ($2 ne ":") {
239	1					3	$year = $1;
240							}
241							}
242							}
243
244							###########################################################################
245
246	37					53	state $rule_3 = qr/
247							(\b\|T) # Anchor point
248							(\d{1,2}): # Hours
249							(\d{1,2}):? # Minutes
250							(\d{2}(Z\|\.\d+)?)? # Seconds (optional)
251							\ ?(am\|pm\|AM\|PM)? # AMPM (optional)
252							/x;
253
254							# Now we look for times: 10:14, 10:14:17, 08:15pm
255	37	100				240	if ($str =~ $rule_3) {
256	27					71	$hour = int($2);
257	27					39	$min = int($3);
258	27		100			83	$sec = $4 \|\| 0; # Not int() cuz it might be float for milliseconds
259
260	27					53	$sec =~ s/Z$//;
261
262							# The string of AM or PM
263	27		100			85	my $ampm = lc($6 \|\| "");
264
265							# PM means add 12 hours
266	27	100				55	if ($ampm eq "pm") {
267	1					2	$hour += 12;
268							}
269
270							# 12:15am = 00:15 / 12:15pm = 12:15 so we have to compensate
271	27	50	66			60	if ($ampm && ($hour == 24 \|\| $hour == 12)) {
			100
272	2					5	$hour -= 12;
273							}
274							}
275
276							# Just some basic sanity checking
277	37		100			107	my $has_time = ($hour \|\| $min \|\| $sec);
278	37		66			100	my $has_date = ($year \|\| $month \|\| $day);
279
280	37	100	100			110	if (!$has_time && !$has_date) {
281							# One final check if NOTHING else has matched, we lookup a weird format: 20020722T100000Z
282	4	100				13	if ($str =~ m/(\d{4})(\d{2})(\d{2})T(\d\d)(\d\d)(\d\d)Z/) {
283	1					4	$year = $1;
284	1					3	$month = $2;
285	1					2	$day = $3;
286
287	1					4	$hour = $4;
288	1					2	$min = $5;
289	1					2	$sec = $6;
290							} else {
291	3					19	return undef;
292							}
293							}
294
295							###########################################################################
296							###########################################################################
297
298							# Sanity check some basic boundaries
299							# I don't think we need this any more since we eval() and timegm_modern() will barf and return undef
300							#if ($month > 12 \|\| $day > 31 \|\| $hour > 23 \|\| $min > 60 \|\| $sec > 61) {
301							# return undef;
302							#}
303
304	34		66			92	$month \|\|= (localtime())[4] + 1; # If there is no month, we assume the current month
305	34		66			74	$day \|\|= (localtime())[3]; # If there is no day, we assume the current day
306							# If we STILL don't have a year it may be a time only string so we assume it's the current year
307	34		66			184	$year \|\|= (localtime())[5] + 1900;
308
309							# Convert any two digit years to four digits
310	34	100				97	if ($year < 100) {
311	5					7	$year += 1900;
312							}
313
314							# Time::Local doesn't support fractional seconds, so we make an int version
315							# and then add the ms after the timegm_modern() conversion
316	34					62	$ms = $sec - int($sec);
317	34					42	$sec = int($sec);
318
319							# If we have all the requisite pieces we build a unixtime
320	34					48	my $ret;
321	34	50	0			47	my $err = $@ \|\| 'Error' unless eval {
322	34					102	$ret = Time::Local::timegm_modern($sec, $min, $hour, $day, $month - 1, $year);
323
324	34					1616	return 1;
325							};
326
327	34	50	33			72	if ($err && $err =~ /Undefined subroutine/) {
328	0					0	print STDERR $err;
329	0					0	return undef;
330							};
331
332	34					42	$ret += $ms;
333
334							# If we find a timezone offset we take that in to account now
335							# Either: +1000 or -0700
336							# or
337							# 11:53 PST (One to four chars after a time)
338	34					52	my $tz_offset_seconds = 0;
339	34					46	my $tz_str = '';
340	34					43	state $tz_rule = qr/
341							(
342							(\s\|:\d\d) # Start AFTER a space, or time (:12)
343							([+-])(\d{1,2})(\d{2}) # +1000 or -700 (three or four digits)
344							\|
345							\d{2}\ # Only match chars if they're AFTER a time
346							([A-Z]{1,4})\b # Capitalized TZ at end of string
347							\|
348							\d{2}(Z)$ # Just a simple Z at the end
349							)
350							/x;
351
352	34	100	100			433	if ($ret && $str =~ $tz_rule) {
		100
353	15					28	my $str_offset = 0;
354
355							# String timezone: 11:53 PST
356	15	100	100			55	if ($6 \|\| $7) {
357							# Whichever form matches, the TZ is that one
358	10		50			27	my $tz_code = $6 \|\| $7 \|\| '';
359
360							# Lookup the timezone offset in the table
361	10		100			39	$str_offset = $TZ_OFFSET->{$tz_code} \|\| 0;
362							# Timezone offsets are in hours, so we convert to seconds
363	10					19	$str_offset *= 3600;
364
365	10					16	$tz_str = $tz_code;
366							# Numeric format: +1000 or -0700
367							} else {
368							# Break the input string into parts so we can do math
369							# +1000 = 10 hours, -0700 = 7 hours, +0430 = 4.5 hours
370	5					18	$str_offset = ($4 + ($5 / 60)) * 3600;
371
372	5	100				16	if ($3 eq "-") {
373	4					5	$str_offset *= -1;
374							}
375
376	5					17	$tz_str = "$2$3$4";
377							}
378
379	15					23	$tz_offset_seconds = $str_offset;
380							# No timezone info found so we assume the local timezone
381							} elsif ($ret) {
382	17					38	my $local_offset = get_local_offset($ret);
383
384	17					23	$tz_offset_seconds = $local_offset;
385	17					31	$tz_str = 'UNSPECIFIED';
386							}
387
388							# Subtract the timezone offset from the unixtime
389	34					56	$ret -= $tz_offset_seconds;
390
391	34	50				68	if ($debug) {
392	0					0	my $color = "\e[38;5;45m";
393	0					0	my $reset = "\e[0m";
394	0					0	my $header = sprintf("%*s = YYYY-MM-DD HH:II:SS (timezone offset)", length($str) + 2, "Input string");
395	0		0			0	my $output = sprintf("'%s' = %02d-%02d-%02d %02d:%02d:%02d (%s = %d seconds)", $str, $year \|\| -1, $month \|\| -1, $day \|\| -1, $hour, $min, $sec, $tz_str, $tz_offset_seconds);
			0
			0
396
397	0					0	print STDERR $color . $header . $reset . "\n";
398	0					0	print STDERR $output . "\n";
399							}
400
401
402	34					168	return $ret;
403							}
404
405							# Return the timezone offset for the local machine
406							sub get_local_offset {
407	17			17	0	28	my $unixtime = $_[0];
408
409							# Since timezones only change on the half-hour (at most), we
410							# round down the nearest half hour "bucket" and then cache
411							# that result. We probably could get away with a full hour
412							# here but we don't gain much performance/memory by doing that
413	17					24	my $bucket_size = 1800;
414	17					32	my $cache_key = $unixtime - ($unixtime % $bucket_size);
415
416							# If we have a forced LOCAL_TZ_OFFSET we use that (unit tests)
417	17	50				34	if (defined($LOCAL_TZ_OFFSET)) {
418	17					38	return $LOCAL_TZ_OFFSET;
419							}
420
421							# Simple memoizing (improves repeated performance a LOT)
422							# Note: this is even faster than `use Memoize`
423	0						state $x = {};
424	0	0	0				if ($USE_TZ_CACHE && $x->{$cache_key}) {
425	0						return $x->{$cache_key};
426							}
427
428							# Get a time obj for this local timezone and UTC for the Unixtime
429							# Then compare the two to get the local TZ offset
430	0						my @t = localtime($unixtime);
431	0						my $ret = (Time::Local::timegm(@t) - Time::Local::timelocal(@t));
432
433							# Cache the result
434	0	0					if ($USE_TZ_CACHE) {
435	0						$x->{$cache_key} = $ret;
436							}
437
438	0						return $ret;
439							}
440
441							1;
442
443							__END__