File Coverage

blib/lib/Data/Mining/AssociationRules.pm
Criterion Covered Total %
statement 132 133 99.2
branch 38 56 67.8
condition 6 12 50.0
subroutine 10 11 90.9
pod 4 6 66.6
total 190 218 87.1


line stmt bran cond sub pod time code
1             package Data::Mining::AssociationRules;
2            
3 1     1   29186 use strict;
  1         2  
  1         39  
4 1     1   6 use warnings;
  1         2  
  1         34  
5            
6             BEGIN {
7 1     1   6 use Exporter ();
  1         5  
  1         21  
8 1     1   5 use vars qw ($AUTHOR $VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  1         1  
  1         154  
9 1     1   2 $AUTHOR = 'Dan Frankowski <dfrankow@winternet.com>';
10 1         5 @EXPORT = @EXPORT_OK = qw(generate_frequent_sets
11             generate_rules
12             read_frequent_sets
13             read_transaction_file
14             set_debug);
15            
16 1         3 %EXPORT_TAGS = ();
17 1         19 @ISA = qw(Exporter);
18 1         1900 $VERSION = 0.1;
19             }
20            
21             my $debug = 0;
22            
23             =head1 NAME
24            
25             Data::Mining::AssociationRules - Mine association rules and frequent
26             sets from data.
27            
28             =head1 SYNOPSIS
29            
30             use Data::Mining::AssociationRules;
31            
32             my %transaction_map;
33             my $transaction_file = "foo.txt";
34            
35             read_transaction_file(\%transaction_map, $transaction_file);
36            
37             generate_frequent_sets(\%transaction_map, $output_file_prefix,
38             $support_threshold, $max_n);
39            
40             generate_rules($output_file_prefix, $support_threshold,
41             $confidence_threshold, $max_n);
42            
43             read_frequent_sets($set_map_ref, $file_prefix, $support_threshold, $max_n);
44            
45             set_debug(1);
46            
47             perl arm.pl -transaction-file foo.txt -support 2 -confidence-threshold 0.01 -max-set-size 6
48            
49             See also FUNCTIONS, DESCRIPTION, and EXAMPLES below.
50            
51             =head1 INSTALLATION
52            
53             The typical:
54            
55             =over
56            
57             =item 0 perl Makefile.PL
58            
59             =item 0 make test
60            
61             =item 0 make install
62            
63             =back
64            
65             =head1 FUNCTIONS
66            
67             =cut
68            
69             =pod
70            
71             =head2 read_transaction_file($transaction_map_ref, $transaction_file)
72            
73             Read in a transaction map from a file which has lines of two
74             whitespace-separated columns:
75            
76             =over
77            
78             transaction-id item-id
79            
80             =back
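                For example (a sketch, using a few of the lines from EXAMPLES
                below), after reading

                    234 Orange
                    234 Banana
                    141 Orange

                the map referenced by $transaction_map_ref looks roughly like

                    ( Orange => { 234 => 1, 141 => 1 },
                      Banana => { 234 => 1 } )

                that is, item, then transaction id, then a count of how often
                that pair appeared.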
81            
82             =cut
83            
84             sub read_transaction_file {
85 1     1 1 37 my $transaction_map_ref = shift;
86 1         3 my $transaction_file = shift;
87            
88 1 50       42 open(BFILE, $transaction_file) or die "Couldn't open $transaction_file: $!\n";
89 1         36 while (<BFILE>) {
90 10         27 my @data = split;
91 10 50       26 die "Expected 2 columns, found ", int(@data), "\n" if int(@data) != 2;
92 10         17 my ($tid, $item) = @data;
93 10         50 $$transaction_map_ref{$item}{$tid}++;
94             }
95 1         27 close(BFILE);
96             }
97            
98             =pod
99            
100             =head2 generate_frequent_sets ($transaction_map_ref, $file_prefix, $support_threshold, $max_n)
101            
102             Given
103            
104             =over
105            
106             =item 0 a map of transactions
107            
108             =item 0 a file prefix
109            
110             =item 0 a support threshold
111            
112             =item 0 a maximum frequent set size to look for (optional)
113            
114             =back
115            
116             generate the frequent sets in some files, one file per size of the set.
117             That is, all 1-sets are in a file, all 2-sets in another, etc.
118            
119             The files are lines of the form:
120            
121             =over
122            
123             support-count item-set
124            
125             =back
126            
127             where
128            
129             =over
130            
131             =item 0 support-count is the number of transactions in which the item-set appears
132            
133             =item 0 item-set is one or more space-separated items
134            
135             =back
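                For example (a sketch; the prefix 'fruit' is only an
                illustration), a call like

                    generate_frequent_sets(\%transaction_map, 'fruit', 1, 3);

                writes one file per set size, named as in the internal
                nset_filename():

                    fruit-support-1-1set.txt
                    fruit-support-1-2set.txt
                    fruit-support-1-3set.txt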
136            
137             =cut
138            
139             sub generate_frequent_sets {
140 3     3 1 1886 my $transaction_map_ref = shift;
141 3         5 my $file_prefix = shift;
142 3         6 my $support_threshold = shift;
143 3         5 my $max_n = shift;
144            
145             # Generate 1-sets
146 3         4 my $n = 1;
147 3         9 my $out_nset = nset_filename($n, $file_prefix, $support_threshold);
148 3 50       245 open(OUT, ">$out_nset") or die "Couldn't open $out_nset for writing: $!\n";
149 3         6 while (my ($item, $item_map) = each %{$transaction_map_ref}) {
  21         58  
150 18         23 my $support = int(keys(%$item_map));
151 18 100       39 if ($support >= $support_threshold) {
152 14         40 print OUT "$support $item\n";
153             }
154             }
155 3         3 my $num_nsets = int(keys(%{$transaction_map_ref}));
  3         7  
156 3 50       7 print STDERR "$num_nsets $n-sets\n" if $debug;
157 3         141 close(OUT);
158            
159             # Generate n-sets
160 3         6 my $done = 0;
161 3         13 while ($num_nsets > 0) {
162 7         8 $n++;
163 7         8 $num_nsets = 0;
164            
165 7 50 33     22 last if defined($max_n) && ($n > $max_n);
166            
167             # Go through (n-1)-sets, pruning as you go
168 7         20 my $prior_nset = nset_filename($n-1, $file_prefix, $support_threshold);
169 7 50       211 open(PRIOR, $prior_nset) or die "Couldn't open $prior_nset: $!\n";
170 7         17 $out_nset = nset_filename($n, $file_prefix, $support_threshold);
171 7 50       399 open(OUT, ">$out_nset") or die "Couldn't open $out_nset: $!\n";
172 7         73 while (<PRIOR>) {
173 22         69 my ($count, @set) = split;
174            
175             # Create userset, which contains the intersection of $transaction{@set}
176 22         30 my %userset = % {$$transaction_map_ref{$set[0]}};
  22         75  
177 22         59 foreach my $item ( @set[1 .. $#set] ) {
178 10         31 while (my ($user, $dummy) = each %userset) {
179 14 100       63 if (!exists($$transaction_map_ref{$item}{$user})) {
180 4         16 delete($userset{$user});
181             }
182             }
183             }
184            
185             # For each 1-set, intersect further, and spit out if > support_threshold
186 22         29 while (my ($item, $user_set) = each %{$transaction_map_ref}) {
  154         486  
187             # Only spit sets of non-decreasing elements
188             # This keeps out duplicates
189 132         130 my $dup_set = 0;
190 132         169 foreach my $set_item ( @set ) {
191 168 100       349 if ($set_item ge $item) {
192 86         87 $dup_set = 1;
193 86         95 last;
194             }
195             }
196            
197 132 100       294 if (!$dup_set) {
198 46         134 my %newset = %userset;
199 46         122 while (my ($user, $dummy) = each %newset) {
200 70 100       175 if (!exists($$user_set{$user})) {
201 61         199 delete($newset{$user});
202             }
203             }
204             #print "newset is now " . map_str(\%newset) . "\n";
205 46         57 my $num_users = int(keys(%newset));
206             #print "item $item set @set numusers is $num_users\n";
207 46 100       128 if ($num_users >= $support_threshold) {
208 8         58 print OUT "$num_users @set $item\n";
209 8         19 $num_nsets++;
210             }
211             }
212             }
213             }
214 7         68 close(PRIOR);
215 7         280 close(OUT);
216 7 50 66     32 print STDERR "$num_nsets $n-sets\n" if ($num_nsets > 0) && $debug;
217 7 100       199 unlink($out_nset) if 0 == $num_nsets;
218             }
219             }
220            
221             =pod
222            
223             =head2 read_frequent_sets($set_map_ref, $file_prefix, $support_threshold, $max_n)
224            
225             Given
226            
227             =over
228            
229             =item 0 a set map
230            
231             =item 0 a file prefix
232            
233             =item 0 support threshold
234            
235             =item 0 max frequent set size (optional)
236            
237             =back
238            
239             read all the frequent sets into a single map, which has as its key the
240             frequent set (joined by single spaces) and as its value the support.
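                For example (a sketch, using the fruit data from EXAMPLES
                below and the illustrative prefix 'fruit'), after

                    read_frequent_sets(\%set_map, 'fruit', 1, 3);

                %set_map contains entries such as

                    'Orange'             => 3,
                    'Banana Orange Pear' => 1,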
241            
242             =cut
243            
244             sub read_frequent_sets {
245 9     9 1 41 my $set_map_ref = shift;
246 9         11 my $file_prefix = shift;
247 9         11 my $support_threshold = shift;
248 9         11 my $max_n = shift;
249            
250 9 50       164 opendir(DIR, '.') || die "can't opendir '.': $!";
251 9 100       195 my @files = grep { /^$file_prefix/ && -f "./$_" } readdir(DIR);
  120         1144  
252 9         95 closedir DIR;
253            
254 9         16 foreach my $file (@files) {
255             # print STDERR "Read file $file ..\n";
256 51 100       364 if ( $file =~ /${file_prefix}\-support\-(\d+)\-(\d+)set/ ) {
257 26         49 my $support = $1;
258 26         31 my $n = $2;
259 26 100 33     97 next if ($support != $support_threshold)
      66        
260             || (defined($max_n) && ($n > $max_n));
261            
262 21 50       557 open(SETS, $file) or die "Couldn't open $file: $!\n";
263 21         204 while (<SETS>) {
264 66         183 my ($count, @set) = split;
265 66         350 $$set_map_ref{join(' ', @set)} = $count;
266             }
267 21         199 close(SETS);
268             }
269             }
270             }
271            
272             # =pod
273            
274             # =head2 nset_filename($n, $file_prefix, $support_threshold)
275            
276             # Given
277            
278             # =over
279            
280             # =item 0 set size
281            
282             # =item 0 a file prefix
283            
284             # =item 0 a support threshold
285            
286             # =back
287            
288             # return the name of the file that contains the specified frequent sets.
289            
290             # =cut
291             sub nset_filename {
292 17     17 0 22 my $n = shift;
293 17         20 my $file_prefix = shift;
294 17         20 my $support_threshold = shift;
295            
296 17         65 return $file_prefix . "-support-" . $support_threshold . "-" . $n . "set.txt";
297             }
298            
299             =pod
300            
301             =head2 generate_rules($file_prefix, $support_threshold, $confidence_threshold, $max_n)
302            
303             Given
304            
305             =over
306            
307             =item 0 a file prefix
308            
309             =item 0 a support threshold (optional)
310            
311             =item 0 a confidence threshold (optional)
312            
313             =item 0 maximum frequent set size to look for (optional)
314            
315             =back
316            
317             create a file with all association rules in it. The output file is of
318             the form:
319            
320             support-count confidence left-hand-set-size right-hand-set-size frequent-set-size left-hand-set => right-hand-set
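                A minimal sketch of reading such a file back in (the filename
                is illustrative; the parsing relies on the single ' => '
                separator shown above):

                    open(my $fh, '<', 'fruit-support-1-conf-0-rules.txt') or die $!;
                    while (<$fh>) {
                        chomp;
                        my ($lhs_part, $rhs) = split / => /, $_, 2;
                        my ($support, $conf, $lhs_n, $rhs_n, $set_n, @lhs)
                            = split ' ', $lhs_part;
                        print "@lhs => $rhs (support $support, confidence $conf)\n";
                    }
                    close($fh);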
321            
322             =cut
323            
324             sub generate_rules {
325 3     3 1 1078 my $file_prefix = shift;
326 3         6 my $support_threshold = shift;
327 3         6 my $confidence_threshold = shift;
328 3         4 my $max_n = shift;
329            
330 3 50       10 $support_threshold = 1 if !defined($support_threshold);
331 3 50       9 $confidence_threshold = 0 if !defined($confidence_threshold);
332            
333 3         4 my $num_rules = 0;
334            
335             # Read in frequent set supports
336 3         5 my %frequent_set;
337 3         7 read_frequent_sets(\%frequent_set, $file_prefix, $support_threshold, $max_n);
338            
339 3 50       11 die "Found no frequent sets from file prefix $file_prefix support $support_threshold " if (0 == int(keys(%frequent_set)));
340            
341             # Go through the sets computing stats
342 3         510 my $rulefile = $file_prefix . '-support-' . $support_threshold . '-conf-' .
343             $confidence_threshold . '-rules.txt';
344 3 50       221 open(RULES, ">$rulefile") or die "Couldn't open $rulefile: $!\n";
345 3         14 while (my ($set, $count) = each %frequent_set) {
346             # Traverse all subsets (save full and empty)
347 22         29 my $support = $frequent_set{$set};
348 22 50       36 die "Couldn't find frequent set '$set'" if !defined($support);
349 22         42 my @set = split('\s+', $set);
350            
351 22         84 for my $lhs_selector (1..(1<<int(@set))-2) {
352 24         76 my @lhs_set = @set[grep $lhs_selector&1<<$_, 0..$#set];
353 24         39 my $all_ones = (1<<int(@set))-1;
354 24         28 my $rhs_selector = $all_ones ^ $lhs_selector;
355 24         65 my @rhs_set = @set[grep $rhs_selector&1<<$_, 0..$#set];
356             # print "lhs_selector $lhs_selector 1<<int(\@set) ", (1<<int(@set)), "\n";
357            
358             # print "lhs_set @lhs_set ";
359             # print "rhs_set @rhs_set\n";
360            
361 24         56 my $lhs_set = join(' ', @lhs_set);
362 24         29 my $rhs_set = join(' ', @rhs_set);
363            
364             # Spit out rule
365 24         39 my $lhs_support = $frequent_set{$lhs_set};
366             #my $rhs_support = $frequent_set{$rhs_set};
367 24 50       42 die "Couldn't find frequent set '$lhs_set'" if !defined($lhs_support);
368             #die "Couldn't find frequent set '$rhs_set'" if !defined($rhs_support);
369            
370             # For rule A => B, support = T(AB), conf = T(AB) / T(A)
371 24         31 my $conf = $support / $lhs_support;
372            
373 24 50       45 if ($conf >= $confidence_threshold) {
374 24         21 $num_rules++;
375 24         225 print RULES "$support ", sprintf("%.3f ", $conf),
376             int(@lhs_set), ' ', int(@rhs_set), ' ', int(@set), ' ',
377             "$lhs_set => $rhs_set\n";
378             }
379             }
380             }
381 3         91 close(RULES);
382 3 50       18 print STDERR "$num_rules rules\n" if $debug;
383             }
384            
385             sub set_debug {
386 0     0 0   $debug = $_[0];
387             }
388            
389             1;
390            
391             =head1 DESCRIPTION
392            
393             This module contains some functions to do association rule mining from
394             text files. This sounds obscure, but it really just measures beautifully
395             simple things by counting.
396            
397             =head2 FREQUENT SETS
398            
399             Frequent sets answer the question, "Which events occur together more
400             than N times?"
401            
402             =head3 The detail
403            
404             The 'transaction file' contains items in transactions. A set of items
405             has 'support' s if all the items occur together in at least s
406             transactions. (In many papers, support is a number between 0 and 1
407             representing the fraction of total transactions. I found the absolute
408             number itself more interesting, so I use that instead. Sorry for the
409             confusion.) For an itemset "A B C", the support is sometimes notated
410             "T(A B C)" (the number of 'T'ransactions).
411            
412             A set of items is called a 'frequent set' if it has support at least
413             the given support threshold. Generating frequent sets produces all
414             frequent sets, and some information about each set (e.g., its
415             support).
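                For example, in the fruit data under EXAMPLES below, Orange
                appears in transactions 234, 141, and 375, while Orange and
                Pear appear together only in transaction 234:

                    T(Orange)      = 3
                    T(Orange Pear) = 1

                so at a support threshold of 2, {Orange} is a frequent set but
                {Orange, Pear} is not.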
416            
417             =head2 RULES
418            
419             Association rules answer the (related) question, "When these events
420             occur, how often do those events also occur?"
421            
422             =head3 The detail
423            
424             A rule has a left-hand set of items and a right-hand set
425             of items. A rule "LHS => RHS" with support s and 'confidence' c means
426             that the underlying frequent set (LHS + RHS) occurred together in at
427             least s transactions, and of all the transactions LHS occurred in,
428             RHS also occurred in at least the fraction c (a number from 0 to 1).
429            
430             Generating rules produces all rules with support at least the given
431             support threshold, and confidence at least the given confidence
432             threshold. The confidence is sometimes notated "conf(LHS => RHS) =
433             T(LHS + RHS) / T(LHS)". There is also related data with each rule
434             (e.g., the size of its LHS and RHS, the support, the confidence,
435             etc.).
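                For example, with the fruit data from EXAMPLES below:

                    conf(Orange => Pear)   = T(Orange Pear) / T(Orange)   = 1 / 3 = 0.333
                    conf(Banana => Orange) = T(Banana Orange) / T(Banana) = 1 / 1 = 1.000

                which match the 0.333 and 1.000 confidences in the generated
                rules shown there.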
436            
437             =head3 FREQUENT SETS AND ASSOCIATION RULES GENERALLY USEFUL
438            
439             Although association rule mining is often described in commercial
440             terms like "market baskets" or "transactions" (collections of events)
441             and "items" (events), one can imagine events that make this sort of
442             counting useful across many domains. Events could be
443            
444             =over
445            
446             =item 0 stock market went down at time t
447            
448             =item 0 patient had symptom X
449            
450             =item 0 flower petal length was > 5mm
451            
452             =back
453            
454             For this reason, I believe counting frequent sets and looking at
455             association rules to be a fundamental tool of any data miner, someone
456             who is looking for patterns in pre-existing data, whether commercial
457             or not.
458            
459             =head1 EXAMPLES
460            
461             Given the following input file:
462            
463             234 Orange
464             463 Strawberry
465             53 Apple
466             234 Banana
467             412 Peach
468             467 Pear
469             234 Pear
470             147 Pear
471             141 Orange
472             375 Orange
473            
474             Generating frequent sets at support threshold 1 (a.k.a. 'at support 1')
475             produces three files:
476            
477             The 1-sets:
478            
479             1 Strawberry
480             1 Banana
481             1 Apple
482             3 Orange
483             1 Peach
484             3 Pear
485            
486             The 2-sets:
487            
488             1 Banana Orange
489             1 Banana Pear
490             1 Orange Pear
491            
492             The 3-sets:
493            
494             1 Banana Orange Pear
495            
496             Generating the rules at support 1 produces the following:
497            
498             1 0.333 1 1 2 Orange => Pear
499             1 0.333 1 1 2 Pear => Orange
500             1 1.000 1 2 3 Banana => Orange Pear
501             1 0.333 1 2 3 Orange => Banana Pear
502             1 1.000 2 1 3 Banana Orange => Pear
503             1 0.333 1 2 3 Pear => Banana Orange
504             1 1.000 2 1 3 Banana Pear => Orange
505             1 1.000 2 1 3 Orange Pear => Banana
506             1 1.000 1 1 2 Banana => Orange
507             1 0.333 1 1 2 Orange => Banana
508             1 1.000 1 1 2 Banana => Pear
509             1 0.333 1 1 2 Pear => Banana
510            
511             Generating frequent sets at support 2 produces one file:
512            
513             3 Orange
514             3 Pear
515            
516             Generating rules at support 2 produces nothing.
517            
518             Generating rules at support 1 and confidence 0.5 produces:
519            
520             1 1.000 1 2 3 Banana => Orange Pear
521             1 1.000 2 1 3 Banana Orange => Pear
522             1 1.000 2 1 3 Banana Pear => Orange
523             1 1.000 2 1 3 Orange Pear => Banana
524             1 1.000 1 1 2 Banana => Orange
525             1 1.000 1 1 2 Banana => Pear
526            
527             Note that all the lower-confidence rules are gone.
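                A minimal driver that could produce output like the above (a
                sketch; the file name 'fruit.txt' and prefix 'fruit' are only
                illustrations, and error handling is omitted):

                    use strict;
                    use warnings;
                    use Data::Mining::AssociationRules;

                    my %transaction_map;
                    read_transaction_file(\%transaction_map, 'fruit.txt');

                    # one file of frequent n-sets per n,
                    # e.g. fruit-support-1-1set.txt
                    generate_frequent_sets(\%transaction_map, 'fruit', 1, 6);

                    # one rules file, e.g. fruit-support-1-conf-0.5-rules.txt
                    generate_rules('fruit', 1, 0.5, 6);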
528            
529             =head1 ALGORITHM
530            
531             =head2 Generating frequent sets
532            
533             Generating frequent sets is straight-up Apriori. See for example:
534            
535             http://www.almaden.ibm.com/software/quest/Publications/papers/vldb94_rj.pdf
536            
537             I have not optimized it. It depends on having all the transactions in
538             memory; even so, it might still scale decently (to millions of
539             transactions).
540            
541             =head2 Generating rules
542            
543             Generating rules is a very vanilla implementation. It requires
544             reading all the frequent sets into memory, which does not scale at
545             all. That said, since computers have lots of memory these days, you
546             might still be able to get away with millions of frequent sets (which
547             is far fewer than the number of possible itemsets).
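                For the curious, generate_rules() enumerates every non-empty,
                proper subset of a frequent set as a left-hand side by treating
                the bits of an integer as a selector. A standalone sketch of
                that trick, using the 3-item set from EXAMPLES:

                    my @set = qw(Banana Orange Pear);
                    my $all_ones = (1 << int(@set)) - 1;         # binary 111
                    for my $lhs_selector (1 .. $all_ones - 1) {  # skip empty and full set
                        my @lhs = @set[grep { $lhs_selector & (1 << $_) } 0 .. $#set];
                        my @rhs = @set[grep { ($all_ones ^ $lhs_selector) & (1 << $_) } 0 .. $#set];
                        print "@lhs => @rhs\n";   # e.g. "Banana => Orange Pear"
                    }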
548            
549             =head1 BUGS
550            
551             There is an existing tool (written in C) to mine frequent sets that I
552             kept running across:
553            
554             http://fuzzy.cs.uni-magdeburg.de/~borgelt/software.html#assoc
555            
556             I should check it out to see if it is easy or desirable to be
557             file-level compatible with it.
558            
559             One could imagine wrapping it in Perl, but the Perl-C/C++ barrier is
560             where I have encountered all my troubles in the past, so I wouldn't
561             personally pursue that.
562            
563             =head1 VERSION
564            
565             This document describes Data::Mining::AssociationRules version 0.1.
566            
567             =head1 AUTHOR
568            
569             Dan Frankowski
570             dfrankow@winternet.com
571             http://www.winternet.com/~dfrankow
572            
573             Hey, if you download this module, drop me an email! That's the fun
574             part of this whole open source thing.
575            
576             =head1 LICENSE
577            
578             This program is free software; you can redistribute it and/or modify
579             it under the same terms as Perl itself.
580            
581             The full text of the license can be found in the LICENSE file included
582             in the distribution and available in the CPAN listing for
583             Data::Mining::AssociationRules (see www.cpan.org or search.cpan.org).
584            
585             =head1 DISCLAIMER
586            
587             To the maximum extent permitted by applicable law, the author of this
588             module disclaims all warranties, either express or implied, including
589             but not limited to implied warranties of merchantability and fitness
590             for a particular purpose, with regard to the software and the
591             accompanying documentation.
592            
593             =cut