File Coverage

blib/lib/SQL/Type/Guess.pm

Criterion	Covered	Total	%
statement	59	60	98.3
branch	19	22	86.3
condition	16	19	84.2
subroutine	9	9	100.0
pod	6	6	100.0
total	109	116	93.9

line	stmt	bran	cond	sub	pod	time	code
1							package SQL::Type::Guess;
2	4			4		251541	use strict;
	4					32
	4					106
3	4			4		18	use warnings;
	4					5
	4					4133
4							our $VERSION = '0.05';
5
6							=head1 NAME
7
8							SQL::Type::Guess - guess an appropriate column type for a set of data
9
10							=head1 SYNOPSIS
11
12							my @data=(
13							{ seen => 1, when => '20140401', greeting => 'Hello', value => '1.05' },
14							{ seen => 0, when => '20140402', greeting => 'World', value => '99.05' },
15							{ seen => 0, when => '20140402', greeting => 'World', value => '9.005' },
16							);
17
18							my $g= SQL::Type::Guess->new();
19							$g->guess( @data );
20
21							print $g->as_sql( table => 'test' );
22							# create table test (
23							# "seen" decimal(1,0),
24							# "greeting" varchar(5),
25							# "value" decimal(5,3),
26							# "when" date
27							# )
28
29							=cut
30
31							=head1 METHODS
32
33							=head2 C<< SQL:::Type::Guess->new( %OPTIONS ) >>
34
35							my $g= SQL::Type::Guess->new();
36
37							Creates a new C<SQL::Type::Guess> instance. The following options are
38							supported:
39
40							=over 4
41
42							=item B<column_type>
43
44							Hashref of already known column types.
45
46							=item B<column_map>
47
48							Hashref mapping the combinations SQL type names
49							to the resulting type name.
50
51							=back
52
53							=cut
54
55							sub new {
56	3			3	1	387	my( $class, %options )= @_;
57
58	3		50			25	$options{ column_type } \|\|= {};
59							$options{ column_map } \|\|= {
60	3		50			62	";date" => 'date',
61							";datetime" => 'datetime',
62							";decimal" => 'decimal(%2$d,%3$d)',
63							";varchar" => 'varchar(%1$d)',
64							"date;" => 'date',
65							"datetime;" => 'datetime',
66							"datetime;datetime" => 'datetime',
67							"decimal;" => 'decimal(%2$d,%3$d)',
68							"varchar;" => 'varchar(%1$d)',
69							"varchar;date" => 'varchar(%1$d)',
70							"varchar;datetime" => 'varchar(%1$d)',
71							"varchar;decimal" => 'varchar(%1$d)',
72							"varchar;varchar" => 'varchar(%1$d)',
73							"date;decimal" => 'decimal(%2$d,%3$d)',
74							"date;varchar" => 'varchar(%1$d)',
75							"date;date" => 'date',
76							"datetime;varchar" => 'varchar(%1$d)',
77							"decimal;date" => 'decimal(%2$d,%3$d)',
78							"decimal;varchar" => 'varchar(%1$d)',
79							"decimal;decimal" => 'decimal(%2$d,%3$d)',
80							";" => '',
81							};
82
83	3					9	bless \%options => $class;
84							}
85
86							=head2 C<< $g->column_type >>
87
88							$g->guess({ foo => 1, bar => 'Hello' },{ foo => 1000, bar => 'World' });
89							print $g->column_type->{ 'foo' } # decimal(4,0)
90
91							Returns a hashref containing the SQL types to store all
92							values in the columns seen so far.
93
94							=cut
95
96	3			3	1	9	sub column_type { $_[0]->{column_type} };
97
98							=head2 C<< $g->column_map >>
99
100							Returns the hashref used for the type transitions. The current
101							transitions used for generalizing data are:
102
103							date -> decimal -> varchar
104
105							This is not entirely safe, as C<2014-01-01> can't be safely
106							loaded into an C<decimal> column, but assuming your data is representative
107							of the data to be stored that shouldn't be much of an issue.
108
109							=cut
110
111	28			28	1	47	sub column_map { $_[0]->{column_map} };
112
113							=head2 C<< $g->guess_data_type $OLD_TYPE, @VALUES >>
114
115							$type= $g->guess_data_type( $type, 1,2,3,undef,'Hello','World', );
116
117							Returns the data type that encompasses the already established data type in C<$type>
118							and the new values as passed in via C<@values>.
119
120							If there is no preexisting data type, C<$type> can be C<undef> or the empty string.
121
122							=cut
123
124							our @recognizers = (
125							sub { if( ! defined $_[0] or $_[0] eq '' ) { return {} }}, # empty value, nothing to change
126							sub { if( $_[0] =~ /^((?:19\|20)\d\d)-?(0\d\|1[012])-?([012]\d\|3[01])$/ ) { return { this_value_type => 'date', 'pre' => 8 } }}, # date
127							sub { if( $_[0] =~ m!^\s[01]\d/[0123]\d/(?:19\|20)\d\d\s[012]\d:[012345]\d:[0123456]\d(\.\d)?$! ) { return { this_value_type => 'datetime', } }}, # US-datetime
128							sub { if( $_[0] =~ m!^\s[0123]\d\.[01]\d\.(?:19\|20)\d\d\s[012]\d:[012345]\d:[0123456]\d(\.\d)?$! ) { return { this_value_type => 'datetime', } }}, # EU-datetime
129							sub { if( $_[0] =~ m!^\s(?:19\|20)\d\d-[01]\d-[0123]\d[\sT][012]\d:[012345]\d:[0123456]\d(\.\d)?Z?$! ) { return { this_value_type => 'datetime', } }}, # ISO-datetime
130							sub { if( $_[0] =~ m!^\s*[01]\d/[0123]\d/(?:19\|20)\d\d$! ) { return { this_value_type => 'date', } }}, # US-date
131							sub { if( $_[0] =~ m!^\s*[0123]\d\.[01]\d\.(?:19\|20)\d\d$! ) { return { this_value_type => 'date', } }}, # EU-date
132							sub { if( $_[0] =~ /^\s[+-]?(\d+)\s$/ ) { return { this_value_type => 'decimal', 'pre' => length($1), post => 0 } }}, # integer
133							sub { if( $_[0] =~ /^\s[+-]?(\d+)\.(\d+)\s$/ ) { return { this_value_type => 'decimal', 'pre' => length($1), post => length($2) } }}, # integer
134							sub { return { this_value_type => 'varchar', length => length $_[0] }}, # catch-all
135							);
136
137							sub guess_data_type {
138	28			28	1	1256	my( $self, $type, @values )= @_;
139
140	28					48	my $column_map= $self->column_map;
141	28					45	for my $value (@values) {
142	46					55	my $old_type = $type;
143
144	46					50	my ( $descriptor );
145	46					58	for (@recognizers) {
146	327	100				417	last if $descriptor = $_->($value);
147							};
148	46		100			93	$descriptor->{ this_value_type } \|\|= '';
149	46		100			92	$descriptor->{ pre } \|\|= 0;
150	46		100			127	$descriptor->{ post } \|\|= 0;
151	46		100			142	$descriptor->{ length } \|\|= length( $value ) \|\| 0;
			100
152
153	46	100				92	if( $type ) {
154	30	100				158	if( $type =~ s/\s*$(\d+)$// ) {
		100
155	6	100				31	if( $1 > $descriptor->{ 'length' } ) {
156	2					7	$descriptor->{ 'length' } = $1;
157							};
158
159							} elsif( $type =~ s/\s*$(\d+),(\d+)$// ) {
160	22					48	my( $new_prec, $new_post )= ($1,$2);
161	22					33	my $new_pre= $new_prec - $new_post;
162	22	100				41	$descriptor->{ pre } = $new_pre > $descriptor->{ pre } ? $new_pre : $descriptor->{ pre } ;
163	22	50				53	$descriptor->{ post } = $2 > $descriptor->{ post } ? $2 : $descriptor->{ post } ;
164							};
165							} else {
166	16					24	$type= '';
167							};
168
169	46					68	my $this_value_type = $descriptor->{ this_value_type };
170	46	100				72	if( $type ne $this_value_type ) {
171	20	50				51	if( not exists $column_map->{ "$type;$this_value_type" }) {
172	0					0	die "Unknown transition '$type' => '$this_value_type'";
173							};
174							};
175
176							{
177	4			4		31	no warnings;
	4					8
	4					1565
	46					50
178	46					206	$type = sprintf $column_map->{ "$type;$this_value_type" }, $descriptor->{ 'length' }, $descriptor->{ pre } + $descriptor->{ post }, $descriptor->{ post };
179							};
180							};
181	28					81	$type
182							};
183
184							=head2 C<< $g->guess( @RECORDS ) >>
185
186							my @data= (
187							{ rownum => 1, name => 'John Smith', street => 'Nowhere Road', birthday => '1996-01-01' },
188							{ rownum => 2, name => 'John Doe', street => 'Anywhere Plaza', birthday => '1904-01-01' },
189							{ rownum => 3, name => 'John Bull', street => 'Everywhere Street', birthday => '2001-09-01' },
190							);
191							$g->guess( @data );
192
193							Modifies the data types for the keys in the given hash.
194
195							=cut
196
197							sub guess {
198	2			2	1	22	my( $self, @records )= @_;
199	2					7	my $column_type= $self->column_type;
200	2					7	for my $row (@records) {
201	5					13	for my $col (keys %$row) {
202	16					42	my( $new_type )= $self->guess_data_type($column_type->{$col}, $row->{ $col });
203	16	100	100			56	if( $new_type ne ($column_type->{ $col } \|\| '')) {
204							#print sprintf "%s: %s => %s ('%s')\n",
205							# $col, ($column_type{ $col } \|\| 'unknown'), ($new_type \|\| 'unknown'), $info->{$col};
206	8					24	$column_type->{ $col }= $new_type;
207							};
208							}
209							}
210							}
211
212							=head2 C<< $g->as_sql %OPTIONS >>
213
214							print $g->as_sql();
215
216							Returns an SQL string that describes the data seen so far.
217
218							Options:
219
220							=over 4
221
222							=item B<user>
223
224							Supply a username for the table
225
226							=item B<columns>
227
228							This allows you to specify the columns and their order. The default
229							is alphabetical order of the columns.
230
231							=back
232
233							=cut
234
235							sub as_sql {
236	1			1	1	16	my( $self, %options )= @_;
237	1					3	my $table= $options{ table };
238							my $user= defined $options{ user }
239	1	50				4	? "$options{ user }."
240							: ''
241							;
242	1					5	my $column_type= $self->column_type;
243	1		50			5	$options{ columns }\|\|= [ sort keys %{ $column_type } ];
	1					10
244	1					2	my $columns= join ",\n", map { qq{ "$_" $column_type->{ $_ }} } @{ $options{ columns }};
	4					12
	1					3
245	1					5	my($sql)= <<SQL;
246							create table $user$table (
247							$columns
248							)
249							SQL
250	1					8	return $sql;
251							}
252
253							1;
254
255							=head1 BUG TRACKER
256
257							Please report bugs in this module via the RT CPAN bug queue at
258							L<https://rt.cpan.org/Public/Dist/Display.html?Name=SQL-Type-Guess>
259							or via mail to L<sql-type-guess-Bugs@rt.cpan.org>.
260
261							=head1 AUTHOR
262
263							Max Maischein C<corion@cpan.org>
264
265							=head1 COPYRIGHT (c)
266
267							Copyright 2014-2018 by Max Maischein C<corion@cpan.org>.
268
269							=head1 LICENSE
270
271							This module is released under the same terms as Perl itself.
272
273							=cut