File Coverage

blib/lib/Data/Mining/AssociationRules.pm
Criterion Covered Total %
statement 132 133 99.2
branch 38 56 67.8
condition 6 12 50.0
subroutine 10 11 90.9
pod 4 6 66.6
total 190 218 87.1


line stmt bran cond sub pod time code
1             package Data::Mining::AssociationRules;
2            
3 1     1   29186 use strict;
  1         2  
  1         39  
4 1     1   6 use warnings;
  1         2  
  1         34  
5            
6             BEGIN {
7 1     1   6 use Exporter ();
  1         5  
  1         21  
8 1     1   5 use vars qw ($AUTHOR $VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  1         1  
  1         154  
9 1     1   2 $AUTHOR = 'Dan Frankowski <dfrankow@winternet.com>';
10 1         5 @EXPORT = @EXPORT_OK = qw(generate_frequent_sets
11             generate_rules
12             read_frequent_sets
13             read_transaction_file
14             set_debug);
15            
16 1         3 %EXPORT_TAGS = ();
17 1         19 @ISA = qw(Exporter);
18 1         1900 $VERSION = 0.1;
19             }
20            
21             my $debug = 0;
22            
23             =head1 NAME
24            
25             Data::Mining::AssociationRules - Mine association rules and frequent
26             sets from data.
27            
28             =head1 SYNOPSIS
29            
30             use Data::Mining::AssociationRules;
31            
32             my %transaction_map;
33             my $transaction_file = "foo.txt";
34            
35             read_transaction_file(\%transaction_map, $transaction_file);
36            
37             generate_frequent_sets(\%transaction_map, $output_file_prefix,
38             $support_threshold, $max_n);
39            
40             generate_rules($output_file_prefix, $support_threshold,
41             $confidence_threshold, $max_n);
42            
43             read_frequent_sets($set_map_ref, $file_prefix, $support_threshold, $max_n);
44            
45             set_debug(1);
46            
47             perl arm.pl -transaction-file foo.txt -support 2 -confidence-threshold 0.01 -max-set-size 6
48            
49             See also FUNCTIONS, DESCRIPTION, and EXAMPLES below.
50            
51             =head1 INSTALLATION
52            
53             The typical:
54            
55             =over
56            
57             =item 0 perl Makefile.PL
58            
59             =item 0 make test
60            
61             =item 0 make install
62            
63             =back
64            
65             =head1 FUNCTIONS
66            
67             =cut
68            
69             =pod
70            
71             =head2 read_transaction_file($transaction_map_ref, $transaction_file)
72            
73             Read in a transaction map from a file which has lines of two
74             whitespace-separated columns:
75            
76             =over
77            
78             transaction-id item-id
79            
80             =back
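                For example (a sketch, using a few of the lines from EXAMPLES
                below), after reading

                    234 Orange
                    234 Banana
                    141 Orange

                the map referenced by $transaction_map_ref looks roughly like

                    ( Orange => { 234 => 1, 141 => 1 },
                      Banana => { 234 => 1 } )

                that is, item, then transaction id, then a count of how often
                that pair appeared.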
81            
82             =cut
83            
84             sub read_transaction_file {
85 1     1 1 37 my $transaction_map_ref = shift;
86 1         3 my $transaction_file = shift;
87            
88 1 50       42 open(BFILE, $transaction_file) or die "Couldn't open $transaction_file: $!\n";
89 1         36 while (<BFILE>) {
90 10         27 my @data = split;
91 10 50       26 die "Expected 2 columns, found ", int(@data), "\n" if int(@data) != 2;
92 10         17 my ($tid, $item) = @data;
93 10         50 $$transaction_map_ref{$item}{$tid}++;
94             }
95 1         27 close(BFILE);
96             }
97            
98             =pod
99            
100             =head2 generate_frequent_sets ($transaction_map_ref, $file_prefix, $support_threshold, $max_n)
101            
102             Given
103            
104             =over
105            
106             =item 0 a map of transactions
107            
108             =item 0 a file prefix
109            
110             =item 0 a support threshold
111            
112             =item 0 a maximum frequent set size to look for (optional)
113            
114             =back
115            
116             generate the frequent sets in some files, one file per size of the set.
117             That is, all 1-sets are in a file, all 2-sets in another, etc.
118            
119             The files are lines of the form:
120            
121             =over
122            
123             support-count item-set
124            
125             =back
126            
127             where
128            
129             =over
130            
131             =item 0 support-count is the number of transactions in which the item-set appears
132            
133             =item 0 item-set is one or more space-separated items
134            
135             =back
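                For example (a sketch; the prefix 'fruit' is only an
                illustration), a call like

                    generate_frequent_sets(\%transaction_map, 'fruit', 1, 3);

                writes one file per set size, named as in the internal
                nset_filename():

                    fruit-support-1-1set.txt
                    fruit-support-1-2set.txt
                    fruit-support-1-3set.txt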
136            
137             =cut
138            
139             sub generate_frequent_sets {
140 3     3 1 1886 my $transaction_map_ref = shift;
141 3         5 my $file_prefix = shift;
142 3         6 my $support_threshold = shift;
143 3         5 my $max_n = shift;
144            
145             # Generate 1-sets
146 3         4 my $n = 1;
147 3         9 my $out_nset = nset_filename($n, $file_prefix, $support_threshold);
148 3 50       245 open(OUT, ">$out_nset") or die "Couldn't open $out_nset for writing: $!\n";
149 3         6 while (my ($item, $item_map) = each %{$transaction_map_ref}) {
  21         58  
150 18         23 my $support = int(keys(%$item_map));
151 18 100       39 if ($support >= $support_threshold) {
152 14         40 print OUT "$support $item\n";
153             }
154             }
155 3         3 my $num_nsets = int(keys(%{$transaction_map_ref}));
  3         7  
156 3 50       7 print STDERR "$num_nsets $n-sets\n" if $debug;
157 3         141 close(OUT);
158            
159             # Generate n-sets
160 3         6 my $done = 0;
161 3         13 while ($num_nsets > 0) {
162 7         8 $n++;
163 7         8 $num_nsets = 0;
164            
165 7 50 33     22 last if defined($max_n) && ($n > $max_n);
166            
167             # Go through (n-1)-sets, pruning as you go
168 7         20 my $prior_nset = nset_filename($n-1, $file_prefix, $support_threshold);
169 7 50       211 open(PRIOR, $prior_nset) or die "Couldn't open $prior_nset: $!\n";
170 7         17 $out_nset = nset_filename($n, $file_prefix, $support_threshold);
171 7 50       399 open(OUT, ">$out_nset") or die "Couldn't open $out_nset: $!\n";
172 7         73 while (<PRIOR>) {
173 22         69 my ($count, @set) = split;
174            
175             # Create userset, which contains the intersection of $transaction{@set}
176 22         30 my %userset = % {$$transaction_map_ref{$set[0]}};
  22         75  
177 22         59 foreach my $item ( @set[1 .. $#set] ) {
178 10         31 while (my ($user, $dummy) = each %userset) {
179 14 100       63 if (!exists($$transaction_map_ref{$item}{$user})) {
180 4         16 delete($userset{$user});
181             }
182             }
183             }
184            
185             # For each 1-set, intersect further, and spit out if > support_threshold
186 22         29 while (my ($item, $user_set) = each %{$transaction_map_ref}) {
  154         486  
187             # Only spit sets of non-decreasing elements
188             # This keeps out duplicates
189 132         130 my $dup_set = 0;
190 132         169 foreach my $set_item ( @set ) {
191 168 100       349 if ($set_item ge $item) {
192 86         87 $dup_set = 1;
193 86         95 last;
194             }
195             }
196            
197 132 100       294 if (!$dup_set) {
198 46         134 my %newset = %userset;
199 46         122 while (my ($user, $dummy) = each %newset) {
200 70 100       175 if (!exists($$user_set{$user})) {
201 61         199 delete($newset{$user});
202             }
203             }
204             #print "newset is now " . map_str(\%newset) . "\n";
205 46         57 my $num_users = int(keys(%newset));
206             #print "item $item set @set numusers is $num_users\n";
207 46 100       128 if ($num_users >= $support_threshold) {
208 8         58 print OUT "$num_users @set $item\n";
209 8         19 $num_nsets++;
210             }
211             }
212             }
213             }
214 7         68 close(PRIOR);
215 7         280 close(OUT);
216 7 50 66     32 print STDERR "$num_nsets $n-sets\n" if ($num_nsets > 0) && $debug;
217 7 100       199 unlink($out_nset) if 0 == $num_nsets;
218             }
219             }
220            
221             =pod
222            
223             =head2 read_frequent_sets($set_map_ref, $file_prefix, $support_threshold, $max_n)
224            
225             Given
226            
227             =over
228            
229             =item 0 a set map
230            
231             =item 0 a file prefix
232            
233             =item 0 support threshold
234            
235             =item 0 max frequent set size (optional)
236            
237             =back
238            
239             read all the frequent sets into a single map, which has as its key the
240             frequent set (joined by single spaces) and as its value the support.
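                For example (a sketch, using the fruit data from EXAMPLES
                below and the illustrative prefix 'fruit'), after

                    read_frequent_sets(\%set_map, 'fruit', 1, 3);

                %set_map contains entries such as

                    'Orange'             => 3,
                    'Banana Orange Pear' => 1,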
241            
242             =cut
243            
244             sub read_frequent_sets {
245 9     9 1 41 my $set_map_ref = shift;
246 9         11 my $file_prefix = shift;
247 9         11 my $support_threshold = shift;
248 9         11 my $max_n = shift;
249            
250 9 50       164 opendir(DIR, '.') || die "can't opendir '.': $!";
251 9 100       195 my @files = grep { /^$file_prefix/ && -f "./$_" } readdir(DIR);
  120         1144  
252 9         95 closedir DIR;
253            
254 9         16 foreach my $file (@files) {
255             # print STDERR "Read file $file ..\n";
256 51 100       364 if ( $file =~ /${file_prefix}\-support\-(\d+)\-(\d+)set/ ) {
257 26         49 my $support = $1;
258 26         31 my $n = $2;
259 26 100 33     97 next if ($support != $support_threshold)
      66        
260             || (defined($max_n) && ($n > $max_n));
261            
262 21 50       557 open(SETS, $file) or die "Couldn't open $file: $!\n";
263 21         204 while (<SETS>) {
264 66         183 my ($count, @set) = split;
265 66         350 $$set_map_ref{join(' ', @set)} = $count;
266             }
267 21         199 close(SETS);
268             }
269             }
270             }
271            
272             # =pod
273            
274             # =head2 nset_filename($n, $file_prefix, $support_threshold)
275            
276             # Given
277            
278             # =over
279            
280             # =item 0 set size
281            
282             # =item 0 a file prefix
283            
284             # =item 0 a support threshold
285            
286             # =back
287            
288             # return the name of the file that contains the specified frequent sets.
289            
290             # =cut
291             sub nset_filename {
292 17     17 0 22 my $n = shift;
293 17         20 my $file_prefix = shift;
294 17         20 my $support_threshold = shift;
295            
296 17         65 return $file_prefix . "-support-" . $support_threshold . "-" . $n . "set.txt";
297             }
298            
299             =pod
300            
301             =head2 generate_rules($file_prefix, $support_threshold, $confidence_threshold, $max_n)
302            
303             Given
304            
305             =over
306            
307             =item 0 a file prefix
308            
309             =item 0 a support threshold (optional)
310            
311             =item 0 a confidence threshold (optional)
312            
313             =item 0 maximum frequent set size to look for (optional)
314            
315             =back
316            
317             create a file with all association rules in it. The output file is of
318             the form:
319            
320             support-count confidence left-hand-set-size right-hand-set-size frequent-set-size left-hand-set => right-hand-set
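                A minimal sketch of reading such a file back in (the filename
                is illustrative; the parsing relies on the single ' => '
                separator shown above):

                    open(my $fh, '<', 'fruit-support-1-conf-0-rules.txt') or die $!;
                    while (<$fh>) {
                        chomp;
                        my ($lhs_part, $rhs) = split / => /, $_, 2;
                        my ($support, $conf, $lhs_n, $rhs_n, $set_n, @lhs)
                            = split ' ', $lhs_part;
                        print "@lhs => $rhs (support $support, confidence $conf)\n";
                    }
                    close($fh);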
321            
322             =cut
323            
324             sub generate_rules {
325 3     3 1 1078 my $file_prefix = shift;
326 3         6 my $support_threshold = shift;
327 3         6 my $confidence_threshold = shift;
328 3         4 my $max_n = shift;
329            
330 3 50       10 $support_threshold = 1 if !defined($support_threshold);
331 3 50       9 $confidence_threshold = 0 if !defined($confidence_threshold);
332            
333 3         4 my $num_rules = 0;
334            
335             # Read in frequent set supports
336 3         5 my %frequent_set;
337 3         7 read_frequent_sets(\%frequent_set, $file_prefix, $support_threshold, $max_n);
338            
339 3 50       11 die "Found no frequent sets from file prefix $file_prefix support $support_threshold " if (0 == int(keys(%frequent_set)));
340            
341             # Go through the sets computing stats
342 3         510 my $rulefile = $file_prefix . '-support-' . $support_threshold . '-conf-' .
343             $confidence_threshold . '-rules.txt';
344 3 50       221 open(RULES, ">$rulefile") or die "Couldn't open $rulefile: $!\n";
345 3         14 while (my ($set, $count) = each %frequent_set) {
346             # Traverse all subsets (save full and empty)
347 22         29 my $support = $frequent_set{$set};
348 22 50       36 die "Couldn't find frequent set '$set'" if !defined($support);
349 22         42 my @set = split('\s+', $set);
350            
351 22         84 for my $lhs_selector (1..(1<<int(@set))-2) {
352 24         76 my @lhs_set = @set[grep $lhs_selector&1<<$_, 0..$#set];
353 24         39 my $all_ones = (1<<int(@set))-1;
354 24         28 my $rhs_selector = $all_ones ^ $lhs_selector;
355 24         65 my @rhs_set = @set[grep $rhs_selector&1<<$_, 0..$#set];
356             # print "lhs_selector $lhs_selector 1<<int(\@set) ", (1<<int(@set)), "\n";
357            
358             # print "lhs_set @lhs_set ";
359             # print "rhs_set @rhs_set\n";
360            
361 24         56 my $lhs_set = join(' ', @lhs_set);
362 24         29 my $rhs_set = join(' ', @rhs_set);
363            
364             # Spit out rule
365 24         39 my $lhs_support = $frequent_set{$lhs_set};
366             #my $rhs_support = $frequent_set{$rhs_set};
367 24 50       42 die "Couldn't find frequent set '$lhs_set'" if !defined($lhs_support);
368             #die "Couldn't find frequent set '$rhs_set'" if !defined($rhs_support);
369            
370             # For rule A => B, support = T(AB), conf = T(AB) / T(A)
371 24         31 my $conf = $support / $lhs_support;
372            
373 24 50       45 if ($conf >= $confidence_threshold) {
374 24         21 $num_rules++;
375 24         225 print RULES "$support ", sprintf("%.3f ", $conf),
376             int(@lhs_set), ' ', int(@rhs_set), ' ', int(@set), ' ',
377             "$lhs_set => $rhs_set\n";
378             }
379             }
380             }
381 3         91 close(RULES);
382 3 50       18 print STDERR "$num_rules rules\n" if $debug;
383             }
384            
385             sub set_debug {
386 0     0 0   $debug = $_[0];
387             }
388            
389             1;
390            
391             =head1 DESCRIPTION
392            
393             This module contains some functions to do association rule mining from
394             text files. This sounds obscure, but it really just measures beautifully
395             simple things by counting.
396            
397             =head2 FREQUENT SETS
398            
399             Frequent sets answer the question, "Which events occur together more
400             than N times?"
401            
402             =head3 The detail
403            
404             The 'transaction file' contains items in transactions. A set of items
405             has 'support' s if all the items occur together in at least s
406             transactions. (In many papers, support is a number between 0 and 1
407             representing the fraction of total transactions. I found the absolute
408             number itself more interesting, so I use that instead. Sorry for the
409             confusion.) For an itemset "A B C", the support is sometimes notated
410             "T(A B C)" (the number of 'T'ransactions).
411            
412             A set of items is called a 'frequent set' if it has support at least
413             the given support threshold. Generating frequent sets produces all
414             frequent sets, and some information about each set (e.g., its
415             support).
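                For example, in the fruit data under EXAMPLES below, Orange
                appears in transactions 234, 141, and 375, while Orange and
                Pear appear together only in transaction 234:

                    T(Orange)      = 3
                    T(Orange Pear) = 1

                so at a support threshold of 2, {Orange} is a frequent set but
                {Orange, Pear} is not.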
416            
417             =head2 RULES
418            
419             Association rules answer the (related) question, "When these events
420             occur, how often do those events also occur?"
421            
422             =head3 The detail
423            
424             A rule has a left-hand set of items and a right-hand set
425             of items. A rule "LHS => RHS" with support s and 'confidence' c means
426             that the underlying frequent set (LHS + RHS) occurred together in at
427             least s transactions, and of all the transactions LHS occurred in,
428             RHS also occurred in at least the fraction c (a number from 0 to 1).
429            
430             Generating rules produces all rules with support at least the given
431             support threshold, and confidence at least the given confidence
432             threshold. The confidence is sometimes notated "conf(LHS => RHS) =
433             T(LHS + RHS) / T(LHS)". There is also related data with each rule
434             (e.g., the size of its LHS and RHS, the support, the confidence,
435             etc.).
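                For example, with the fruit data from EXAMPLES below:

                    conf(Orange => Pear)   = T(Orange Pear) / T(Orange)   = 1 / 3 = 0.333
                    conf(Banana => Orange) = T(Banana Orange) / T(Banana) = 1 / 1 = 1.000

                which match the 0.333 and 1.000 confidences in the generated
                rules shown there.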
436            
437             =head3 FREQUENT SETS AND ASSOCIATION RULES GENERALLY USEFUL
438            
439             Although association rule mining is often described in commercial
440             terms like "market baskets" or "transactions" (collections of events)
441             and "items" (events), one can imagine events that make this sort of
442             counting useful across many domains. Events could be
443            
444             =over
445            
446             =item 0 stock market went down at time t
447            
448             =item 0 patient had symptom X
449            
450             =item 0 flower petal length was > 5mm
451            
452             =back
453            
454             For this reason, I believe counting frequent sets and looking at
455             association rules to be a fundamental tool of any data miner, someone
456             who is looking for patterns in pre-existing data, whether commercial
457             or not.
458            
459             =head1 EXAMPLES
460            
461             Given the following input file:
462            
463             234 Orange
464             463 Strawberry
465             53 Apple
466             234 Banana
467             412 Peach
468             467 Pear
469             234 Pear
470             147 Pear
471             141 Orange
472             375 Orange
473            
474             Generating frequent sets at support threshold 1 (a.k.a. 'at support 1')
475             produces three files:
476            
477             The 1-sets:
478            
479             1 Strawberry
480             1 Banana
481             1 Apple
482             3 Orange
483             1 Peach
484             3 Pear
485            
486             The 2-sets:
487            
488             1 Banana Orange
489             1 Banana Pear
490             1 Orange Pear
491            
492             The 3-sets:
493            
494             1 Banana Orange Pear
495            
496             Generating the rules at support 1 produces the following:
497            
498             1 0.333 1 1 2 Orange => Pear
499             1 0.333 1 1 2 Pear => Orange
500             1 1.000 1 2 3 Banana => Orange Pear
501             1 0.333 1 2 3 Orange => Banana Pear
502             1 1.000 2 1 3 Banana Orange => Pear
503             1 0.333 1 2 3 Pear => Banana Orange
504             1 1.000 2 1 3 Banana Pear => Orange
505             1 1.000 2 1 3 Orange Pear => Banana
506             1 1.000 1 1 2 Banana => Orange
507             1 0.333 1 1 2 Orange => Banana
508             1 1.000 1 1 2 Banana => Pear
509             1 0.333 1 1 2 Pear => Banana
510            
511             Generating frequent sets at support 2 produces one file:
512            
513             3 Orange
514             3 Pear
515            
516             Generating rules at support 2 produces nothing.
517            
518             Generating rules at support 1 and confidence 0.5 produces:
519            
520             1 1.000 1 2 3 Banana => Orange Pear
521             1 1.000 2 1 3 Banana Orange => Pear
522             1 1.000 2 1 3 Banana Pear => Orange
523             1 1.000 2 1 3 Orange Pear => Banana
524             1 1.000 1 1 2 Banana => Orange
525             1 1.000 1 1 2 Banana => Pear
526            
527             Note that all the lower-confidence rules are gone.
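                A minimal driver that could produce output like the above (a
                sketch; the file name 'fruit.txt' and prefix 'fruit' are only
                illustrations, and error handling is omitted):

                    use strict;
                    use warnings;
                    use Data::Mining::AssociationRules;

                    my %transaction_map;
                    read_transaction_file(\%transaction_map, 'fruit.txt');

                    # one file of frequent n-sets per n,
                    # e.g. fruit-support-1-1set.txt
                    generate_frequent_sets(\%transaction_map, 'fruit', 1, 6);

                    # one rules file, e.g. fruit-support-1-conf-0.5-rules.txt
                    generate_rules('fruit', 1, 0.5, 6);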
528            
529             =head1 ALGORITHM
530            
531             =head2 Generating frequent sets
532            
533             Generating frequent sets is straight-up Apriori. See for example:
534            
535             http://www.almaden.ibm.com/software/quest/Publications/papers/vldb94_rj.pdf
536            
537             I have not optimized it. It depends on having all the transactions in
538             memory; even so, it might still scale decently (to millions of
539             transactions).
540            
541             =head2 Generating rules
542            
543             Generating rules is a very vanilla implementation. It requires
544             reading all the frequent sets into memory, which does not scale at
545             all. That said, since computers have lots of memory these days, you
546             might still be able to get away with millions of frequent sets (which
547             is far fewer than the number of possible itemsets).
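                For the curious, generate_rules() enumerates every non-empty,
                proper subset of a frequent set as a left-hand side by treating
                the bits of an integer as a selector. A standalone sketch of
                that trick, using the 3-item set from EXAMPLES:

                    my @set = qw(Banana Orange Pear);
                    my $all_ones = (1 << int(@set)) - 1;         # binary 111
                    for my $lhs_selector (1 .. $all_ones - 1) {  # skip empty and full set
                        my @lhs = @set[grep { $lhs_selector & (1 << $_) } 0 .. $#set];
                        my @rhs = @set[grep { ($all_ones ^ $lhs_selector) & (1 << $_) } 0 .. $#set];
                        print "@lhs => @rhs\n";   # e.g. "Banana => Orange Pear"
                    }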
548            
549             =head1 BUGS
550            
551             There is an existing tool (written in C) to mine frequent sets that I
552             kept running across:
553            
554             http://fuzzy.cs.uni-magdeburg.de/~borgelt/software.html#assoc
555            
556             I should check it out to see if it is easy or desirable to be
557             file-level compatible with it.
558            
559             One could imagine wrapping it in Perl, but the Perl-C/C++ barrier is
560             where I have encountered all my troubles in the past, so I wouldn't
561             personally pursue that.
562            
563             =head1 VERSION
564            
565             This document describes Data::Mining::AssociationRules version 0.1.
566            
567             =head1 AUTHOR
568            
569             Dan Frankowski
570             dfrankow@winternet.com
571             http://www.winternet.com/~dfrankow
572            
573             Hey, if you download this module, drop me an email! That's the fun
574             part of this whole open source thing.
575            
576             =head1 LICENSE
577            
578             This program is free software; you can redistribute it and/or modify
579             it under the same terms as Perl itself.
580            
581             The full text of the license can be found in the LICENSE file included
582             in the distribution and available in the CPAN listing for
583             Data::Mining::AssociationRules (see www.cpan.org or search.cpan.org).
584            
585             =head1 DISCLAIMER
586            
587             To the maximum extent permitted by applicable law, the author of this
588             module disclaims all warranties, either express or implied, including
589             but not limited to implied warranties of merchantability and fitness
590             for a particular purpose, with regard to the software and the
591             accompanying documentation.
592            
593             =cut