File Coverage

blib/lib/HTML/Content/HTMLTokenizer.pm
Criterion Covered Total %
statement 20 51 39.2
branch 2 6 33.3
condition 1 3 33.3
subroutine 5 8 62.5
pod 2 4 50.0
total 30 72 41.6


line stmt bran cond sub pod time code
1             =head1 NAME
2            
3             HTML::Content::HTMLTokenizer - Perl module to tokenize HTML documents.
4            
5             =head1 SYNOPSIS
6            
7             use HTML::Content::HTMLTokenizer;
8            
9             my $tokenizer = new HTML::Content::HTMLTokenizer('TAG','WORD');
10            
11             open(HTML,"index.html");
12             my $doc = join("",<HTML>);
13             close(HTML);
14            
15             my ($word_count_arr_ref,$tag_count_arr_ref,$token_type_arr_ref,$token_hash_ref) = $tokenizer->Tokenize($doc);
16            
17             =head1 DESCRIPTION
18            
19             HTML::Content::HTMLTokenizer has one main method, Tokenize, which tokenizes an HTML document into a sequence of 'TAG' and 'WORD' tokens.
20            
21             =head2 Methods
22            
23             =over 4
24            
25             =item * my $tokenizer = new HTML::Content::HTMLTokenizer($tagMarker,$wordMarker)
26            
27             Initializes HTML::Content::HTMLTokenizer.
28            
29             $tagMarker - String that will represent tags in the token sequence returned from Tokenize.
30            
31             $wordMarker - String that will represent words in the token sequence returned from Tokenize.
32            
33             =item * my (\@WordCount,\@TagCount,\@Sequence,\%Tokens) = $tokenizer->Tokenize($htmldocument);
34            
35             $WordCount[$i] is the number of word tokens before or at the ith token in the input HTML document.
36            
37             $TagCount[$i] is the number of tag tokens before or at the ith token in the input HTML document.
38            
39             $Sequence[$i] is the type of token at the ith spot in the input HTML document. Either $tagMarker or $wordMarker.
40            
41             $Tokens{$i} is the word at the ith spot in the input HTML document. This is defined only if there is a word at the ith spot in the document.
42            
43             =back
44            
45             =head1 AUTHOR
46            
47             Jean Tavernier (jj.tavernier@gmail.com)
48            
49             =head1 COPYRIGHT
50            
51             Copyright 2005 Jean Tavernier. All rights reserved.
52            
53             This library is free software; you can redistribute it and/or
54             modify it under the same terms as Perl itself.
55            
56             =head1 SEE ALSO
57            
58             ContentExtractorDriver.pl (1), HTML::Content::ContentExtractor (3), HTML::WordTagRatio::Ratio (3), HTML::WordTagRatio::WeightedRatio (3), HTML::WordTagRatio::SmoothedRatio (3), HTML::WordTagRatio::RelativeRatio (3), HTML::WordTagRatio::ExponentialRatio (3), HTML::WordTagRatio::NormalizedRatio (3).
59            
60             =cut
61            
62            
63             package HTML::Content::HTMLTokenizer;
64            
65 1     1   3 use strict;
  1         1  
  1         21  
66 1     1   3 use warnings;
  1         2  
  1         16  
67 1     1   2 use Carp;
  1         1  
  1         40  
68            
69 1     1   3 use fields qw(TAGMARKER WORDMARKER);
  1         1  
  1         3  
70            
71             # new - constructs HTMLTokenizer object
72             # - preconditions: 1st arg points to string to indicate tag
73             # 2nd arg points to string to indicate word
74             # - postconditions: HTMLTokenizer is constructed
75             sub new
76             {
77 1     1 1 203 my $invocant = shift;
78 1   33     14 my $class = ref($invocant) || $invocant;
79 1 50       3 my $tagMarker = shift or croak "HTMLTokenizer: TagMarker missing \n\tex: HTMLTokenizer tok = new HTMLTokenizer('TAG','WORD');\n";
80 1 50       1 my $wordMarker = shift or croak "HTMLTokenizer: WordMarker missing \n\tex: HTMLTokenizer tok = new HTMLTokenizer('TAG','WORD');\n";
81            
82 1         4 my $self = fields::new($invocant);
83 1         2515 $self->{TAGMARKER} = $tagMarker;
84 1         2 $self->{WORDMARKER} = $wordMarker;
85            
86 1         82 return bless($self, $class);
87             }
88             sub GetTagMarker
89             {
90 0     0 0   my $self = shift;
91 0           return $self->{TAGMARKER};
92             }
93             sub GetWordMarker
94             {
95 0     0 0   my $self = shift;
96 0           return $self->{WORDMARKER};
97             }
98             sub Tokenize
99             {
100 0     0 1   my $self = shift;
101 0           my $doc = shift;
102 0           my @N = ();
103 0           my @T = ();
104 0           my %tokens = ();
105            
106             #Remove carriage returns and newlines
107 0           $doc =~ s/[\n\r]+/ /g;
108            
109             #Eliminate comments
110 0           $doc =~ s/(<!--.*?-->)/ <> /gis;
111            
112             #Eliminate scripts
113 0           $doc =~ s/(<script.*?>.+?<\/script>)/ <> /gis;
114            
115             #Eliminate style blocks
116 0           $doc =~ s/(<style.*?>.+?<\/style>)/ <> /gis;
117            
118             #Eliminate tag words
119 0           $doc =~ s/(<.+?>)/ <> /gs;
120            
121             #Remove HTML spaces
122 0           $doc =~ s/\Q&nbsp;\E/ /g;
123            
124 0           $doc =~ s/\Q&quot;\E/\"/g;
125 0           $doc =~ s/\Q&mdash;\E/-/g;
126            
127             #Remove HTML directives
128 0           $doc =~ s/\Q&\E.*?\Q;\E/ /g;
129            
130 0           my @seq = split(/\s+/,$doc);
131            
132 0           my $tagcnt = 0;
133 0           my $wordcnt = 0;
134 0           for(my $i = 0; $i <= $#seq; $i++)
135             {
136 0 0         if ($seq[$i] eq '<>')
137             {
138 0           $seq[$i] = $self->{TAGMARKER};
139 0           $tagcnt++;
140             }
141             else
142             {
143 0           $tokens{$i} = $seq[$i];
144 0           $seq[$i] = $self->{WORDMARKER};
145 0           $wordcnt++;
146             }
147 0           push(@N,$wordcnt);
148 0           push(@T,$tagcnt);
149             }
150 0           return (\@N,\@T,\@seq,\%tokens);
151             }
152             1;