File Coverage

blib/lib/Data/Mining/Apriori.pm
Criterion Covered Total %
statement 19 287 6.6
branch 0 130 0.0
condition 1 45 2.2
subroutine 6 16 37.5
pod 7 11 63.6
total 33 489 6.7


line stmt bran cond sub pod time code
1             package Data::Mining::Apriori;
2            
3 1     1   87819 use 5.010001;
  1         4  
4 1     1   5 use strict;
  1         2  
  1         22  
5 1     1   5 use warnings;
  1         2  
  1         57  
6 1     1   641 use Algorithm::Combinatorics qw(subsets variations);
  1         4551  
  1         94  
7 1     1   867 no if $] >= 5.017011, warnings => 'experimental::smartmatch';
  1         16  
  1         7  
8            
9             our $VERSION = 0.17;
10            
11             my$self;
12            
13             $" = ', ';
14             $| = 1;
15             $SIG{'INT'} = \&stop;
16            
17             sub new{
18 1     1 1 4475 my $type = shift;
19 1   33     8 my $class = ref($type)||$type;
20 1         18 $self = {
21             totalTransactions => 0,
22             metrics => {
23             minSupport => 0.01,
24             minConfidence => 0.10,
25             minLift => undef,
26             minLeverage => undef,
27             minConviction => undef,
28             minCoverage => undef,
29             minCorrelation => undef,
30             minCosine => undef,
31             minLaplace => undef,
32             minJaccard => undef
33             },
34             precision => 3,
35             output => undef,
36             pathOutputFiles => "",
37             messages => undef,
38             keyItemsDescription => undef,
39             keyItemsTransactions => undef,
40             limitRules => undef,
41             limitSubsets => undef,
42             numberSubsets => 0,
43             frequentItemset => [],
44             associationRules => undef,
45             implications => undef,
46             largeItemsetLength => 2,
47             rule => 0
48             };
49 1         4 bless($self,$class);
50 1         3 return $self;
51             }
52            
53             sub validate_data{
54             (defined $self->{keyItemsDescription})
55 0 0   0 0   or die('Error: $apriori->{keyItemsDescription} is not defined!');
56             (defined $self->{keyItemsTransactions})
57 0 0         or die('Error: $apriori->{keyItemsTransactions} is not defined!');
58             (defined $self->{metrics}{minSupport})
59 0 0         or die('Error: $apriori->{metrics}{minSupport} is not defined!');
60             (defined $self->{metrics}{minConfidence})
61 0 0         or die('Error: $apriori->{metrics}{minConfidence} is not defined!');
62             }
63            
64             sub insert_key_items_transaction{
65 0 0   0 1   (scalar(@_)==2)
66             or die('Error: $apriori->insert_key_items_transaction(\@items) missing parameter key items to array reference!');
67 0 0         (ref($_[1]) eq "ARRAY")
68             or die('Error: $apriori->insert_key_items_transaction(\@items) parameter key items is not an array reference!');
69 0           my@items=sort(@{$_[1]});
  0            
70 0           my @itemsets=subsets(\@items);
71 0           foreach my$itemset(@itemsets){
72 0           $self->{keyItemsTransactions}{"@{$itemset}"}++;
  0            
73             }
74 0           $self->{totalTransactions}++;
75             }
76            
77             sub input_data_file{
78 0 0   0 1   (scalar(@_)==3)
79             or die('Error: $apriori->input_data_file("datafile.txt",",") missing parameters path to data file and/or item separator!');
80 0           my$file=$_[1];
81 0           my$separator=$_[2];
82 0 0         (-e $file)
83             or die("Error: the file \"$file\" does not exists!");
84 0 0         (-r $file)
85             or die("Error: the file \"$file\" is not readable!");
86 0 0         (-T $file)
87             or die("Error: the file \"$file\" is not a text file!");
88 0 0         open(FILE,"<$file")
89             or die("Error: $!");
90 0           while(my$line=){
91 0           $line=~s/\r\n|\n//;
92 0           my@items=split($separator,$line);
93 0 0         if(scalar(@items)>=2){
94 0           $self->insert_key_items_transaction(\@items);
95             }
96             }
97 0           close(FILE);
98             }
99            
100             sub quantity_possible_rules{
101 0     0 1   $self->validate_data;
102 0           return ((3**scalar(keys(%{$self->{keyItemsDescription}})))-(2**(scalar(keys(%{$self->{keyItemsDescription}}))+1))+1);
  0            
  0            
103             }
104            
105             sub generate_rules{
106 0     0 1   $self->validate_data;
107 0 0         if($self->{messages}){
108 0           print "\n${\scalar(keys(%{$self->{keyItemsDescription}}))} items, ${\$self->quantity_possible_rules} possible rules";
  0            
  0            
  0            
109             }
110 0           my @largeItemsetLengthOne = grep{(($self->{keyItemsTransactions}{$_}/$self->{totalTransactions})*100)>=$self->{metrics}{minSupport}}keys(%{$self->{keyItemsDescription}});
  0            
  0            
111 0           $self->association_rules(\@largeItemsetLengthOne);
112             }
113            
114             sub association_rules{
115 0     0 1   my @largeItemset = @{$_[1]};
  0            
116 0           my @variations = variations(\@largeItemset,$self->{largeItemsetLength});
117 0           my @frequentItemset;
118 0 0         if($self->{messages}){
119 0           print "\nLarge itemset of length $self->{largeItemsetLength}, ${\scalar(@largeItemset)} items ";
  0            
120 0           print "\nProcessing ...";
121             }
122             VARIATIONS:
123 0           foreach my$variation(@variations){
124 0           my@candidateItemset=@{$variation};
  0            
125 0           my@antecedent;
126             my@consequent;
127 0           for(my$antecedentLength=0;$antecedentLength<$#candidateItemset;$antecedentLength++){
128 0           push@antecedent,$candidateItemset[$antecedentLength];
129 0           @consequent=();
130 0           for(my$consequentLength=($antecedentLength+1);$consequentLength<=$#candidateItemset;$consequentLength++){
131 0           push@consequent,$candidateItemset[$consequentLength];
132             }
133 0           @antecedent=sort@antecedent;
134 0           @consequent=sort@consequent;
135 0 0         next if("@consequent"~~@{$self->{implications}{"@antecedent"}});
  0            
136 0 0 0       last VARIATIONS if(defined $self->{limitSubsets} && $self->{numberSubsets} == $self->{limitSubsets});
137 0           $self->{numberSubsets}++;
138 0           push @{$self->{implications}{"@antecedent"}},"@consequent";
  0            
139 0           my@implication;
140 0           push@implication,@antecedent,@consequent;
141 0           @implication=sort(@implication);
142 0 0         next unless $self->{keyItemsTransactions}{"@antecedent"};
143 0           my$supportAntecedent=($self->{keyItemsTransactions}{"@antecedent"}/$self->{totalTransactions});
144 0 0         next unless $self->{keyItemsTransactions}{"@implication"};
145 0           my$supportConsequent=($self->{keyItemsTransactions}{"@implication"}/$self->{totalTransactions});
146 0           my $support = $supportConsequent;
147 0 0         next if($support < $self->{metrics}{minSupport});
148 0           my $confidence = ($supportConsequent/$supportAntecedent);
149 0 0 0       next if(defined $self->{metrics}{minConfidence} && $confidence < $self->{metrics}{minConfidence});
150 0           my $lift = ($support/($supportAntecedent*$supportConsequent));
151 0 0 0       next if(defined $self->{metrics}{minLift} && $lift < $self->{metrics}{minLift});
152 0           my $leverage = ($support-($supportAntecedent*$supportConsequent));
153 0 0 0       next if(defined $self->{metrics}{minLeverage} && $leverage < $self->{metrics}{minLeverage});
154 0 0         my $conviction = ((1-$supportConsequent)==0)?"NaN":((1-$confidence)==0)?"NaN":((1-$supportConsequent)/(1-$confidence));
    0          
155 0 0 0       next if(defined $self->{metrics}{minConviction} && $conviction < $self->{metrics}{minConviction});
156 0           my $coverage = $supportAntecedent;
157 0 0 0       next if(defined $self->{metrics}{minCoverage} && $coverage < $self->{metrics}{minCoverage});
158 0           my $correlation = (($support-($supportAntecedent*$supportConsequent))/sqrt($supportAntecedent*(1-$supportAntecedent)*$supportConsequent*(1-$supportConsequent)));
159 0 0 0       next if(defined $self->{metrics}{minCorrelation} && $correlation < $self->{metrics}{minCorrelation});
160 0           my $cosine = ($support/sqrt($supportAntecedent*$supportConsequent));
161 0 0 0       next if(defined $self->{metrics}{minCosine} && $cosine < $self->{metrics}{minCosine});
162 0           my $laplace = (($support+1)/($supportAntecedent+2));
163 0 0 0       next if(defined $self->{metrics}{minLaplace} && $laplace < $self->{metrics}{minLaplace});
164 0           my $jaccard = ($support/($supportAntecedent+$supportConsequent-$support));
165 0 0 0       next if(defined $self->{metrics}{minJaccard} && $jaccard < $self->{metrics}{minJaccard});
166 0           $self->{rule}++;
167 0           $support = sprintf("%.$self->{precision}f", $support);
168 0           $confidence = sprintf("%.$self->{precision}f", $confidence);
169 0           $lift = sprintf("%.$self->{precision}f", $lift);
170 0           $leverage = sprintf("%.$self->{precision}f", $leverage);
171 0 0         $conviction = sprintf("%.$self->{precision}f", $conviction)if($conviction ne "NaN");
172 0           $coverage = sprintf("%.$self->{precision}f", $coverage);
173 0           $correlation = sprintf("%.$self->{precision}f", $correlation);
174 0           $cosine = sprintf("%.$self->{precision}f", $cosine);
175 0           $laplace = sprintf("%.$self->{precision}f", $laplace);
176 0           $jaccard = sprintf("%.$self->{precision}f", $jaccard);
177             $self->{associationRules}{$self->{rule}} = {
178 0           implication => "{ @antecedent } => { @consequent }",
179             support => $support,
180             confidence => $confidence,
181             lift => $lift,
182             leverage => $leverage,
183             conviction => $conviction,
184             coverage => $coverage,
185             correlation => $correlation,
186             cosine => $cosine,
187             laplace => $laplace,
188             jaccard => $jaccard,
189             items => [@antecedent, @consequent]
190             };
191 0           my@items=grep{!($_~~@frequentItemset)}@implication;
  0            
192 0           push@frequentItemset,@items;
193 0 0 0       last VARIATIONS if(defined $self->{limitRules} && $self->{rule} == $self->{limitRules});
194             }
195             }
196 0 0         if($self->{messages}){
197 0           print "\nFrequent itemset: { @frequentItemset }, ${\scalar(@frequentItemset)} items ";
  0            
198             }
199 0 0         if(defined $self->{associationRules}){
200 0           @{$self->{frequentItemset}}=@frequentItemset;
  0            
201 0           $self->output;
202             }
203             return if((defined $self->{limitRules} && $self->{rule} == $self->{limitRules})
204 0 0 0       ||(defined $self->{limitSubsets} && $self->{numberSubsets} == $self->{limitSubsets}));
      0        
      0        
205 0 0         if(scalar(@frequentItemset)>=($self->{largeItemsetLength}+1)){
206 0           $self->{largeItemsetLength}++;
207 0           $self->{associationRules} = undef;
208 0           $self->association_rules(\@frequentItemset);
209             }
210             }
211            
212             sub stop{
213 0 0   0 0   if($self->{messages}){
214 0           print "\nStopping ...";
215 0 0         $self->output if $self->{associationRules};
216 0           print "\nExit? (Y/N): ";
217 0           my $answer = ;
218 0           chomp($answer);
219 0 0         if($answer =~ /^y$/i){
220 0           exit;
221             }
222             else{
223 0           print "Processing ...";
224             }
225             }
226             else{
227 0 0         $self->output if $self->{associationRules};
228 0           exit;
229             }
230             }
231            
232             sub output{
233 0 0   0 1   if($self->{output}){
234 0 0         if($self->{output}==1){
    0          
235 0           $self->file;
236             }
237             elsif($self->{output}==2){
238 0           $self->excel;
239             }
240             }
241             }
242            
243             sub file{
244 0 0   0 0   if($self->{messages}){
245 0           print "\nExporting to file $self->{pathOutputFiles}output_large_itemset_length_$self->{largeItemsetLength}.txt ...";
246             }
247 0 0         open(FILE,">$self->{pathOutputFiles}output_large_itemset_length_$self->{largeItemsetLength}.txt")
248             or die("\nError: $self->{pathOutputFiles}output_large_itemset_length_$self->{largeItemsetLength}.txt $!");
249 0           print FILE "Rules\tSupport\tConfidence";
250 0           my@headings=('Lift', 'Leverage', 'Conviction', 'Coverage', 'Correlation', 'Cosine', 'Laplace', 'Jaccard');
251 0           my@metrics;
252 0           foreach my$metric(@headings){
253 0 0         push@metrics,$metric if defined $self->{metrics}{"min$metric"};
254             }
255 0           foreach my$metric(@metrics){
256 0           print FILE "\t$metric";
257             }
258 0           print FILE "\n";
259 0           foreach my$rule(sort{$a<=>$b}keys(%{$self->{associationRules}})){
  0            
  0            
260 0           $self->{associationRules}{$rule}{support}=~s/\./,/;
261 0           $self->{associationRules}{$rule}{confidence}=~s/\./,/;
262 0           foreach my$metric(@metrics){
263 0           $self->{associationRules}{$rule}{lc$metric}=~s/\./,/;
264             }
265 0           print FILE "R$rule\t$self->{associationRules}{$rule}{support}\t$self->{associationRules}{$rule}{confidence}";
266 0           foreach my$metric(@metrics){
267 0           print FILE "\t$self->{associationRules}{$rule}{${\lc$metric}}";
  0            
268             }
269 0           print FILE "\n";
270             }
271 0           print FILE "\n";
272 0           foreach my$rule(sort{$a<=>$b}keys(%{$self->{associationRules}})){
  0            
  0            
273 0           print FILE "Rule R$rule: $self->{associationRules}{$rule}{implication}\n";
274 0           print FILE "Support: $self->{associationRules}{$rule}{support}\n";
275 0           print FILE "Confidence: $self->{associationRules}{$rule}{confidence}\n";
276 0           foreach my$metric(@metrics){
277 0           print FILE "$metric: $self->{associationRules}{$rule}{${\lc$metric}}\n";
  0            
278             }
279 0           print FILE "Items:\n";
280 0           foreach my$item(@{$self->{associationRules}{$rule}{items}}){
  0            
281 0           print FILE "$item $self->{keyItemsDescription}{$item}\n";
282             }
283 0           print FILE "\n";
284             }
285 0           print FILE "Frequent itemset: { @{$self->{frequentItemset}} }\n";
  0            
286 0           print FILE "Items:\n";
287 0           foreach my$item(@{$self->{frequentItemset}}){
  0            
288 0           print FILE "$item $self->{keyItemsDescription}{$item}\n";
289             }
290 0           close(FILE);
291             }
292            
293             sub excel{
294 0     0 0   require Excel::Writer::XLSX;
295 0 0         if($self->{messages}){
296 0           print "\nExporting to excel $self->{pathOutputFiles}output_large_itemset_length_$self->{largeItemsetLength}.xlsx ...";
297             }
298 0 0         my $workbook = Excel::Writer::XLSX->new("$self->{pathOutputFiles}output_large_itemset_length_$self->{largeItemsetLength}.xlsx")
299             or die("\nError: $self->{pathOutputFiles}output_large_itemset_length_$self->{largeItemsetLength}.xlsx $!");
300 0           my $worksheet = $workbook->add_worksheet();
301 0           my $bold = $workbook->add_format(bold => 1);
302 0           my $headings = ['Rules', 'Support', 'Confidence'];
303 0           my@metrics=('Lift', 'Leverage', 'Conviction', 'Coverage', 'Correlation', 'Cosine', 'Laplace', 'Jaccard');
304 0           foreach my$metric(@metrics){
305 0 0         push@{$headings},$metric if defined $self->{metrics}{"min$metric"};
  0            
306             }
307 0           @metrics=@{$headings}[3..$#{$headings}];
  0            
  0            
308 0           my(@rules,@support,@confidence,@lift,@leverage,@conviction,@coverage,@correlation,@cosine,@laplace,@jaccard);
309 0           foreach my$rule(sort{$a<=>$b}keys(%{$self->{associationRules}})){
  0            
  0            
310 0           push @rules,"R$rule";
311 0           push @support,$self->{associationRules}{$rule}{support};
312 0           push @confidence,$self->{associationRules}{$rule}{confidence};
313 0 0         push @lift,$self->{associationRules}{$rule}{lift} if defined $self->{metrics}{minLift};
314 0 0         push @leverage,$self->{associationRules}{$rule}{leverage} if defined $self->{metrics}{minLeverage};
315 0 0         push @conviction,$self->{associationRules}{$rule}{conviction} if defined $self->{metrics}{minConviction};
316 0 0         push @coverage,$self->{associationRules}{$rule}{coverage} if defined $self->{metrics}{minCoverage};
317 0 0         push @correlation,$self->{associationRules}{$rule}{correlation} if defined $self->{metrics}{minCorrelation};
318 0 0         push @cosine,$self->{associationRules}{$rule}{cosine} if defined $self->{metrics}{minCosine};
319 0 0         push @laplace,$self->{associationRules}{$rule}{laplace} if defined $self->{metrics}{minLaplace};
320 0 0         push @jaccard,$self->{associationRules}{$rule}{jaccard} if defined $self->{metrics}{minJaccard};
321             }
322 0           my$line=(scalar(@rules)+1);
323 0           my@data=(\@rules,\@support,\@confidence);
324 0 0         push @data,\@lift if defined $self->{metrics}{minLift};
325 0 0         push @data,\@leverage if defined $self->{metrics}{minLeverage};
326 0 0         push @data,\@conviction if defined $self->{metrics}{minConviction};
327 0 0         push @data,\@coverage if defined $self->{metrics}{minCoverage};
328 0 0         push @data,\@correlation if defined $self->{metrics}{minCorrelation};
329 0 0         push @data,\@cosine if defined $self->{metrics}{minCosine};
330 0 0         push @data,\@laplace if defined $self->{metrics}{minLaplace};
331 0 0         push @data,\@jaccard if defined $self->{metrics}{minJaccard};
332 0           $worksheet->write('A1', $headings, $bold);
333 0           $worksheet->write('A2', \@data);
334 0           my$chart=$workbook->add_chart(type =>'column', embedded=>1);
335 0           my@columns=('B'..'M');
336 0           my$i=0;
337 0           $chart->add_series(
338             name => 'Support',
339             categories => '=Sheet1!$A$2:$A$'.$line,
340             values => '=Sheet1!$'.$columns[$i].'$2:$'.$columns[$i].'$'.$line,
341             );
342 0           $i++;
343 0           $chart->add_series(
344             name => 'Confidence',
345             categories => '=Sheet1!$A$2:$A$'.$line,
346             values => '=Sheet1!$'.$columns[$i].'$2:$'.$columns[$i].'$'.$line,
347             );
348 0           foreach my$metric(@metrics){
349 0           $i++;
350 0           $chart->add_series(
351             name => $metric,
352             categories => '=Sheet1!$A$2:$A$'.$line,
353             values => '=Sheet1!$'.$columns[$i].'$2:$'.$columns[$i].'$'.$line,
354             );
355             }
356 0           $worksheet->insert_chart($columns[($i+2)].'2', $chart);
357 0           $line+=2;
358 0           my $urlFormat = $workbook->add_format(
359             color => 'blue',
360             );
361 0           my$url=2;
362 0           foreach my$rule(sort{$a<=>$b}keys(%{$self->{associationRules}})){
  0            
  0            
363 0           $worksheet->write_url("A$line","internal:Sheet1!A$url",$urlFormat,"<");
364 0           $line++;
365 0           $worksheet->write_url("A$url","internal:Sheet1!A$line",$urlFormat,"R$rule");
366 0           $worksheet->write("A$line","Rule R$rule: $self->{associationRules}{$rule}{implication}");
367 0           $line++;
368 0           $worksheet->write("A$line","Support: $self->{associationRules}{$rule}{support}");
369 0           $line++;
370 0           $worksheet->write("A$line","Confidence: $self->{associationRules}{$rule}{confidence}");
371 0           $line++;
372 0           foreach my$metric(@metrics){
373 0           $worksheet->write("A$line","$metric: $self->{associationRules}{$rule}{${\lc$metric}}");
  0            
374 0           $line++;
375             }
376 0           $worksheet->write("A$line","Items:");
377 0           $line++;
378 0           foreach my$item(@{$self->{associationRules}{$rule}{items}}){
  0            
379 0           $worksheet->write("A$line","$item $self->{keyItemsDescription}{$item}");
380 0           $line++;
381             }
382 0           $line++;
383 0           $url++;
384             }
385 0           $worksheet->write("A$line","Frequent itemset: { @{$self->{frequentItemset}} }");
  0            
386 0           $line++;
387 0           $worksheet->write("A$line","Items:");
388 0           $line++;
389 0           foreach my$item(@{$self->{frequentItemset}}){
  0            
390 0           $worksheet->write("A$line","$item $self->{keyItemsDescription}{$item}");
391 0           $line++;
392             }
393 0           $workbook->close;
394             }
395            
396             return 1;
397             __END__