File Coverage

blib/lib/HTML/WordTagRatio/WeightedRatio.pm
Criterion Covered Total %
statement 15 33 45.4
branch 0 2 0.0
condition 0 3 0.0
subroutine 5 7 71.4
pod 2 2 100.0
total 22 47 46.8


line stmt bran cond sub pod time code
1             =head1 NAME
2            
3             HTML::WordTagRatio::WeightedRatio - Perl module for determining the ratio of words to tags in a range of tokens in an HTML document.
4            
5             =head1 SYNOPSIS
6            
7             use HTML::WordTagRatio::WeightedRatio;
8             use HTML::Content::HTMLTokenizer;
9             use HTML::Content::ContentExtractor;
10            
11             my $tokenizer = new HTML::Content::HTMLTokenizer('TAG','WORD');
12            
13             open(HTML,"index.html");
14             my $doc = join("",<HTML>);
15             close(HTML);
16            
17             my ($word_count_arr_ref,$tag_count_arr_ref,$token_type_arr_ref,$token_hash_ref) = $tokenizer->Tokenize($doc);
18            
19             my $ratio = new HTML::WordTagRatio::WeightedRatio();
20            
21             my $value = $ratio->RangeValue(0, $#$word_count_arr_ref,
22             $word_count_arr_ref, $tag_count_arr_ref);
23            
24             =head1 DESCRIPTION
25            
26             HTML::WordTagRatio::WeightedRatio computes a ratio of Words to Tags for a given range. In pseudocode, the ratio is
27            
28             Words^2/(Words + Tags)
29            
30             =head2 Methods
31            
32             =over 4
33            
34             =item * my $ratio = new HTML::WordTagRatio::WeightedRatio()
35            
36             Initializes HTML::WordTagRatio::WeightedRatio
37            
38             =item * my $value = $ratio->RangeValue($start, $end, \@WordCount, \@TagCount)
39            
40             $value is computed as follows:
41            
42             ($WordCount[$end] - $WordCount[$start])**2/(($WordCount[$end] - $WordCount[$start]) + ($TagCount[$end] - $TagCount[$start]))
43            
44             This is the number of words in the range squared, divided by the total number of tokens (words plus tags) in the range. $WordCount[$i] is the number of word tokens before or at the ith token in the input HTML document. $TagCount[$i] is the number of tag tokens before or at the ith token in the input HTML document.
45            
46             =back
47            
48             =head1 AUTHOR
49            
50             Jean Tavernier (jj.tavernier@gmail.com)
51            
52             =head1 COPYRIGHT
53            
54             Copyright 2005 Jean Tavernier. All rights reserved.
55            
56             This library is free software; you can redistribute it and/or
57             modify it under the same terms as Perl itself.
58            
59             =head1 SEE ALSO
60            
61             ContentExtractorDriver.pl (1), HTML::Content::HTMLTokenizer (3), HTML::Content::ContentExtractor (3), HTML::WordTagRatio::Ratio (3), HTML::WordTagRatio::SmoothedRatio (3), HTML::WordTagRatio::RelativeRatio (3), HTML::WordTagRatio::ExponentialRatio (3), HTML::WordTagRatio::NormalizedRatio (3).
62            
63             =cut
64            
65             package HTML::WordTagRatio::WeightedRatio;
66 1     1   960 use strict;
  1         1  
  1         22  
67 1     1   3 use warnings;
  1         1  
  1         17  
68 1     1   3 use Carp;
  1         0  
  1         59  
69 1     1   6 use HTML::WordTagRatio::Ratio;
  1         2  
  1         29  
70 1     1   3 use vars qw(@ISA);
  1         1  
  1         158  
71             @ISA = qw(HTML::WordTagRatio::Ratio);
72            
73             # new - constructs WeightedRatio object
74             # - preconditions: none
75             # - postconditions: WeightedRatio is constructed
76             sub new
77             {
78 0     0 1   my $invocant = shift;
79 0   0       my $class = ref($invocant) || $invocant;
80 0           my($self) = new HTML::WordTagRatio::Ratio();
81            
82 0           return(bless($self, $class));
83             }
84             # RangeValue - returns value of a range of tokens
85             # - preconditions: 1st arg is an integer >= 0 and < length of @{3rd argument}
86             # 2nd arg is an integer > 1st arg and < length of @{3rd argument}
87             # 3rd arg is an array ref which points to an array of monotonically
88             # increasing integers, indicating the number of words found
89             # in the HTML document before or at the i_th token (i being an
90             # index into the array)
91             # 4th arg is an array ref which points to an array of monotonically
92             # increasing integers, indicating the number of tags found
93             # in the HTML document before or at the i_th token (i being an
94             # index into the array)
95             # - postconditions: floating point value returned indicating the value of the range, or -1 if the 2nd arg is not greater than the 1st arg
96             sub RangeValue
97             {
98 0     0 1   my $self = shift;
99 0           my $i = shift;
100 0           my $j = shift;
101 0           my $tN = shift;
102 0           my $tT = shift;
103 0           my @N = @{$tN};
  0            
104 0           my @T = @{$tT};
  0            
105            
106 0 0         if ($j <= $i)
107             {
108 0           return -1;
109             }
110 0           my $NinRange = $N[$j] - $N[$i];
111 0           my $TinRange = $T[$j] - $T[$i];
112 0           return $NinRange**2/($NinRange + $TinRange);
113             }
114             1;