File Coverage

blib/lib/HTML/Diff.pm
Criterion Covered Total %
statement 87 94 92.5
branch 23 24 95.8
condition 10 12 83.3
subroutine 12 13 92.3
pod 0 6 0.0
total 132 149 88.5


line stmt bran cond sub pod time code
1             package HTML::Diff;
2              
3             our $VERSION = '0.59';
4              
5 1     1   58994 use 5.006;
  1         3  
6 1     1   5 use strict;
  1         2  
  1         23  
7 1     1   5 use warnings;
  1         6  
  1         33  
8              
9 1     1   5 use Exporter;
  1         2  
  1         110  
10             our @ISA = qw(Exporter);
11             our @EXPORT = qw(line_diff word_diff html_word_diff);
12              
13             # This list of tags is taken from the XHTML spec and includes
14             # all those for which no closing tag is expected. In addition
15             # the pattern below matches any tag which ends with a slash /
16              
17             our @UNBALANCED_TAGS = qw(br hr p li base basefont meta link
18             col colgroup frame input isindex area
19             embed img bgsound marquee);
20              
21 1     1   1111 use Algorithm::Diff 'sdiff';
  1         6588  
  1         1580  
22              
23             sub member {
24 122     122 0 434 my ($item, @list) = @_;
25              
26 122         177 return scalar(grep {$_ eq $item} @list);
  2196         3968  
27             }
28              
29             sub html_word_diff {
30 10     10 0 2765 my ($left, $right) = @_;
31              
32             # Split the two texts into words and tags.
33 10         194 my (@leftchks) = $left =~ m/(<[^>]*>\s*|[^<]+)/gm;
34 10         180 my (@rightchks) = $right =~ m/(<[^>]*>\s*|[^<]+)/gm;
35            
36 10 100       24 @leftchks = map { $_ =~ /^<[^>]*>$/ ? $_ : ($_ =~ m/(\S+\s*)/gm) }
  149         886  
37             @leftchks;
38 10 100       67 @rightchks = map { $_ =~ /^<[^>]*>$/ ? $_ : ($_ =~ m/(\S+\s*)/gm) }
  132         764  
39             @rightchks;
40              
41             # Remove blanks; maybe the above regexes could handle this?
42 10         62 @leftchks = grep { $_ ne '' } @leftchks;
  588         1048  
43 10         38 @rightchks = grep { $_ ne '' } @rightchks;
  531         909  
44              
45             # Now we process each segment by turning it into a pair. The first element
46             # is the text as we want it to read in the result. The second element is
47             # the value we will to use in comparisons. It contains an identifier
48             # for each of the balanced tags that it lies within.
49              
50             # This subroutine holds state in the tagstack variable
51 10         38 my $tagstack = [];
52             my $smear_tags = sub {
53 1119 100   1119   2173 if ($_ =~ /^<.*>/) {
54 160 100       339 if ($_ =~ m|^
55 38         99 my ($tag) = m|^]*)|;
56 38         57 $tag = lc $tag;
57             # print STDERR "Found closer of $tag with " . (scalar @$tagstack) . " stack items\n";
58             # If we found the closer for the tag on top
59             # of the stack, pop it off.
60 38 100 66     213 if ((scalar @$tagstack) > 0 && $$tagstack[-1] eq $tag) {
61 36         56 my $stacktag = pop @$tagstack;
62             }
63 38         115 return [$_, $tag];
64             } else {
65 122         332 my ($tag) = m|^<\s*([^\s>]*)|;
66 122         179 $tag = lc $tag;
67             # print STDERR "Found opener of $tag with " . (scalar @$tagstack) . " stack items\n";
68 122 100 66     246 if (member($tag, @UNBALANCED_TAGS) || $tag =~ m#/\s*>$#)
69             { # (tags without correspond closer tags)
70 86         397 return [$_, $tag];
71             } else {
72 36         70 push @$tagstack, $tag;
73             }
74 36         83 return [$_, $_];
75             }
76             } else {
77 959         2240 my $result = [$_, (join "!!!", (@$tagstack, $_)) ];
78 959         1697 return $result;
79             }
80 10         49 };
81              
82             # Now do the "smear tags" operation across each of the chunk-lists
83 10         17 $tagstack = [];
84 10         19 @leftchks = map { &$smear_tags } @leftchks;
  588         851  
85             # TBD: better modularity would preclude having to reset the stack
86 10         49 $tagstack = [];
87 10         20 @rightchks = map { &$smear_tags } @rightchks;
  531         774  
88              
89             # print STDERR Data::Dumper::Dumper(\@leftchks);
90             # print STDERR Data::Dumper::Dumper(\@rightchks);
91            
92             # Now do the diff, using the "comparison" half of the pair to
93             # compare two chuncks.
94             my $chunks = sdiff(\@leftchks, \@rightchks,
95 10     1141   80 sub { $_ = elem_cmprsn(shift); $_ =~ s/\s+$/ /g; $_ });
  1141         19874  
  1141         2950  
  1141         2609  
96              
97             # print STDERR Data::Dumper::Dumper($chunks);
98            
99             # Finally, process the output of sdiff by concatenating
100             # consecutive chunks that were "unchanged."
101 10         5727 my $lastsignal = '';
102 10         15 my $lbuf = "";
103 10         14 my $rbuf = "";
104 10         12 my @result;
105             my $ch;
106 10         18 foreach $ch (@$chunks) {
107 594         1005 my ($signal, $left, $right) = @$ch;
108 594 100 100     1551 if ($signal ne $lastsignal && $lastsignal ne '') {
109 55 100 100     172 if ($signal ne 'u' && $lastsignal ne 'u') {
110 7         9 $signal = 'c';
111             } else {
112 48         106 push @result, [$lastsignal, $lbuf, $rbuf];
113 48         77 $lbuf = "";
114 48         62 $rbuf = "";
115             }
116             }
117             # if ($signal eq 'u' && $lastsignal ne 'u') {
118             # push @result, [$lastsignal, $lbuf, $rbuf]
119             # unless $lastsignal eq '';
120             # $lbuf = "";
121             # $rbuf = "";
122             # } elsif ($signal ne 'u' && $lastsignal eq 'u') {
123             # push @result, [$lastsignal, $lbuf, $rbuf];
124             # $lbuf = "";
125             # $rbuf = "";
126             # }
127 594         927 my $lelem = elem_mkp($left);
128 594         1033 my $relem = elem_mkp($right);
129 594 100       1060 $lbuf .= (defined $lelem ? $lelem : '');
130 594 100       1055 $rbuf .= (defined $relem ? $relem : '');
131 594         1019 $lastsignal = $signal;
132             }
133 10         26 push @result, [$lastsignal, $lbuf, $rbuf];
134 10         480 return \@result;
135             }
136              
137             # these are like "accessors" for the two halves of the diff-chunk pairs
138             sub elem_mkp {
139 1188     1188 0 1309 my ($e) = @_;
140 1188 100       2190 return undef unless ref $e eq 'ARRAY';
141 1119         1481 my ($mkp, $cmp) = @$e;
142 1119         1812 return $mkp;
143             }
144              
145             sub elem_cmprsn {
146 1141     1141 0 1383 my ($e) = @_;
147 1141 50       2263 return undef unless ref $e eq 'ARRAY';
148 1141         1665 my ($mkp, $cmp) = @$e;
149 1141         2092 return $cmp;
150             }
151              
152             # Finally a couple of non-HTML diff routines
153              
154             sub line_diff {
155 1     1 0 319 my ($left, $right) = @_;
156 1         19 my (@leftchks) = $left =~ m/(.*\n?)/gm;
157 1         14 my (@rightchks) = $right =~ m/(.*\n?)/gm;
158 1         4 my $result = sdiff(\@leftchks, \@rightchks);
159             # my @result = map { [ $_->[1], $_->[2] ] } @$result;
160 1         268 return $result;
161             }
162              
163             sub word_diff {
164 0     0 0   my ($left, $right) = @_;
165 0           my (@leftchks) = $left =~ m/([^\s]*\s?)/gm;
166 0           my (@rightchks) = $right =~ m/([^\s]*\s?)/gm;
167              
168 0           my $result = sdiff(\@leftchks, \@rightchks);
169 0           my @result = (map { [ $_->[1], $_->[2] ] } @$result);
  0            
170 0           return $result;
171             }
172              
173             1;
174              
175             =head1 NAME
176              
177             HTML::Diff - compare two HTML strings and return a list of differences
178              
179             =head1 SYNOPSIS
180              
181             use HTML::Diff;
182              
183             $result = html_word_diff($left_text, $right_text);
184              
185             =head1 DESCRIPTION
186              
187             This module compares two strings of HTML and returns a list of a
188             chunks which indicate the diff between the two input strings, where
189             changes in formatting are considered changes.
190              
191             HTML::Diff does not strictly parse the HTML. Instead, it uses regular
192             expressions to make a decent effort at understanding the given HTML.
193             As a result, there are many valid HTML documents for which it will not
194             produce the correct answer. But there may be some invalid HTML
195             documents for which it gives you the answer you're looking for. Your
196             mileage may vary; test it on lots of inputs from your domain before
197             relying on it.
198              
199             Returns a reference to a list of triples [, , ].
200             Each triple represents a check of the input texts. The flag tells you
201             whether it represents a deletion, insertion, a modification, or an
202             unchanged chunk.
203              
204             Every character of each input text is accounted for by some triple in
205             the output. Specifically, Concatenating all the members from
206             the return value should produce C<$left_text>, and likewise the
207             members concatenate together to produce C<$right_text>.
208              
209             The is either C<'u'>, C<'+'>, C<'-'>, or C<'c'>, indicating
210             whether the two chunks are the same, the $right_text contained this
211             chunk and the left chunk didn't, or vice versa, or the two chunks are
212             simply different. This follows the usage of Algorithm::Diff.
213              
214             The difference is computed on a word-by-word basis, "breaking" on
215             visible words in the HTML text. If a tag only is changed, it will not
216             be returned as an independent chunk but will be shown as a change to
217             one of the neighboring words. For balanced tags, such as , it
218             is intended that a change to the tag will be treated as a change to
219             all words in between.
220              
221             =head1 SEE ALSO
222              
223             L provides the diff algorithm used in this module.
224              
225             L can generate a diff between two XML files, and also
226             patch an XML file, given a diff.
227              
228             =head1 REPOSITORY
229              
230             L
231              
232             =head1 AUTHOR
233              
234             Whipped up by Ezra elias kilty Cooper, Eezra@ezrakilty.netE.
235              
236             Patch contributed by Adam Easjo@koldfront.dkE.
237              
238             =head1 COPYRIGHT AND LICENSE
239              
240             Copyright 2003-2014 by Ezra elias kilty Cooper, Eezra@ezrakilty.netE
241              
242             This program is free software; you can redistribute it and/or
243             modify it under the same terms as Perl itself.
244              
245             =cut