File Coverage

blib/lib/HTML/WordTagRatio/Ratio.pm
Criterion Covered Total %
statement 12 31 38.7
branch 0 2 0.0
condition 0 3 0.0
subroutine 4 6 66.6
pod 2 2 100.0
total 18 44 40.9


line stmt bran cond sub pod time code
1             =head1 NAME
2            
3             HTML::WordTagRatio::Ratio - Default module for determining the ratio of words to tags in a range of tokens in an HTML document.
4            
5             =head1 SYNOPSIS
6            
7             use HTML::WordTagRatio::Ratio;
8             use HTML::Content::HTMLTokenizer;
9             use HTML::Content::ContentExtractor;
10            
11             my $tokenizer = new HTML::Content::HTMLTokenizer('TAG','WORD');
12            
13             open(HTML,"index.html");
14             my $doc = join("",<HTML>);
15             close(HTML);
16            
17             my ($word_count_arr_ref,$tag_count_arr_ref,$token_type_arr_ref,$token_hash_ref) = $tokenizer->Tokenize($doc);
18            
19             my $ratio = new HTML::WordTagRatio::Ratio();
20            
21             my $value = $ratio->RangeValue(0, $#$word_count_arr_ref,
22             $word_count_arr_ref, $tag_count_arr_ref);
23            
24             =head1 DESCRIPTION
25            
26             HTML::WordTagRatio::Ratio and derived classes compute a ratio of Words to Tags for a given range. Ratio is the base class and merely returns the number of word tokens in the range.
27            
28             =head2 Methods
29            
30             =over 4
31            
32             =item * my $ratio = new HTML::WordTagRatio::Ratio()
33            
34             Initializes HTML::WordTagRatio::Ratio
35            
36             =item * my $value = $ratio->RangeValue($start, $end, \@WordCount, \@TagCount)
37            
38             Returns the number of word tokens in ($start,$end]. $WordCount[$i] is the number of word tokens before or at the ith token in the input HTML document. $TagCount[$i] is the number of tag tokens before or at the ith token in the input HTML document.
39            
40             =back
41            
42             =head1 AUTHOR
43            
44             Jean Tavernier (jj.tavernier@gmail.com)
45            
46             =head1 COPYRIGHT
47            
48             Copyright 2005 Jean Tavernier. All rights reserved.
49            
50             This library is free software; you can redistribute it and/or
51             modify it under the same terms as Perl itself.
52            
53             =head1 SEE ALSO
54            
55             ContentExtractorDriver.pl (1), HTML::Content::ContentExtractor (3), HTML::Content::HTMLTokenizer (3), HTML::WordTagRatio::WeightedRatio (3), HTML::WordTagRatio::SmoothedRatio (3), HTML::WordTagRatio::RelativeRatio (3), HTML::WordTagRatio::ExponentialRatio (3), HTML::WordTagRatio::NormalizedRatio (3).
56            
57             =cut
58             package HTML::WordTagRatio::Ratio;
59 1     1   3 use strict;
  1         1  
  1         20  
60 1     1   2 use warnings;
  1         1  
  1         17  
61 1     1   3 use Carp;
  1         1  
  1         37  
62            
63 1     1   450 use fields qw(Exists);
  1         1128  
  1         3  
64            
65             # new - constructs Ratio object
66             # - preconditions: none
67             # - postconditions: Ratio is constructed
68             sub new
69             {
70 0     0 1   my $invocant = shift;
71 0   0       my $class = ref($invocant) || $invocant;
72            
73 0           my $self = fields::new($invocant);
74 0           $self->{Exists} = 1;
75 0           return bless($self, $class);
76             }
77             # RangeValue - returns value of a range of tokens
78             # - preconditions: 1st arg is an integer >= 0 and < length of @{3rd argument}
79             # 2nd arg is an integer > 1st arg and < length of @{3rd argument}
80             # 3rd arg is an array ref which points to an array of monotonically
81             # increasing integers, indicating the number of words found
82             # in the HTML document before or at the i_th token (i being an
83             # index into the array)
84             # 4th arg is an array ref which points to an array of monotonically
85             # increasing integers, indicating the number of tags found
86             # in the HTML document before or at the i_th token (i being an
87             # index into the array)
88             # - postconditions: floating point value returned indicating the value of the range, or -1 if the 2nd arg is not greater than the 1st arg
89             sub RangeValue
90             {
91 0     0 1   my $self = shift;
92 0           my $i = shift;
93 0           my $j = shift;
94 0           my $tN = shift;
95 0           my $tT = shift;
96 0           my @N = @{$tN};
  0            
97 0           my @T = @{$tT};
  0            
98            
99 0 0         if ($j <= $i)
100             {
101 0           return -1;
102             }
103 0           my $NinRange = $N[$j] - $N[$i];
104 0           my $TinRange = $T[$j] - $T[$i];
105 0           return $NinRange;
106             }
107             1;