File Coverage

blib/lib/Perl/Strip.pm
Criterion Covered Total %
statement 9 90 10.0
branch 0 50 0.0
condition 0 57 0.0
subroutine 3 6 50.0
pod 2 2 100.0
total 14 205 6.8


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             Perl::Strip - reduce file size by stripping whitespace, comments, pod etc.
4              
5             =head1 SYNOPSIS
6              
7             use Perl::Strip;
8              
9             =head1 DESCRIPTION
10              
11             This module transforms perl sources into a more compact format. It does
12             this by removing most whitespace, comments, pod, and by some other means.
13              
14             The resulting code looks obfuscated, but perl (and the deparser) don't
15             have any problems with that. Depending on the source file you can expect
16             about 30-60% "compression".
17              
18             The main target for this module is low-diskspace environments, such as
19             L, boot floppy/CDs/flash environments and so on.
20              
21             See also the commandline utility L.
22              
23             =head1 METHODS
24              
25             The C class is a subclsass of L, and as such
26             inherits all of it's methods, even the ones not documented here.
27              
28             =over 4
29              
30             =cut
31              
32             package Perl::Strip;
33              
34             our $VERSION = '1.2';
35             our $CACHE_VERSION = 3;
36              
37 1     1   1148 use common::sense;
  1         14  
  1         6  
38              
39 1     1   722 use PPI;
  1         123821  
  1         46  
40              
41 1     1   6 use base PPI::Transform::;
  1         2  
  1         506  
42              
43             =item my $transform = new Perl::Strip key => value...
44              
45             Creates a new Perl::Strip transform object. It supports the following
46             parameters:
47              
48             =over 4
49            
50             =item optimise_size => $bool
51              
52             By default, this module optimises I, not raw size. This
53             switch changes that (and makes it slower).
54              
55             =item keep_nl => $bool
56              
57             By default, whitespace will either be stripped or replaced by a space. If
58             this option is enabled, then newlines will not be removed. This has the
59             advantage of keeping line number information intact (e.g. for backtraces),
60             but of course doesn't compress as well.
61              
62             =item cache => $path
63              
64             Since this module can take a very long time (minutes for the larger files
65             in the perl distribution), it can utilise a cache directory. The directory
66             will be created if it doesn't exist, and can be deleted at any time.
67              
68             =back
69              
70             =cut
71              
72             # PPI::Transform compatible
73             sub document {
74 0     0 1   my ($self, $doc) = @_;
75              
76 0           $self->{optimise_size} = 1; # more research is needed
77              
78             # special stripping for unicore/ files
79 0 0         if (eval { $doc->child (1)->content =~ /^# .* (build by mktables|machine-generated .*mktables) / }) {
  0            
80              
81 0           for my $heredoc (@{ $doc->find (PPI::Token::HereDoc::) }) {
  0            
82 0           my $src = join "", $heredoc->heredoc;
83              
84             # special stripping for unicore swashes and properties
85             # much more could be done by going binary
86 0           for ($src) {
87             s/^(?:0*([0-9a-fA-F]+))?\t(?:0*([0-9a-fA-F]+))?\t(?:0*([0-9a-fA-F]+))?/$1\t$2\t$3/gm
88 0 0         if $self->{optimise_size};
89              
90             # s{
91             # ^([0-9a-fA-F]+)\t([0-9a-fA-F]*)\t
92             # }{
93             # # ww - smaller filesize, UU - compress better
94             # pack "C0UU",
95             # hex $1,
96             # length $2 ? (hex $2) - (hex $1) : 0
97             # }gemx;
98              
99 0           s/#.*\n/\n/mg;
100 0           s/\s+\n/\n/mg;
101             }
102              
103             # PPI seems to be mostly undocumented
104 0           $heredoc->{_heredoc} = [split /$/, $src];
105             }
106             }
107              
108 0           $doc->prune (PPI::Token::Comment::);
109 0           $doc->prune (PPI::Token::Pod::);
110              
111             # prune END stuff
112 0           for (my $last = $doc->last_element; $last; ) {
113 0           my $prev = $last->previous_token;
114              
115 0 0         if ($last->isa (PPI::Token::Whitespace::)) {
    0          
    0          
116 0           $last->delete;
117             } elsif ($last->isa (PPI::Statement::End::)) {
118 0           $last->delete;
119 0           last;
120             } elsif ($last->isa (PPI::Token::Pod::)) {
121 0           $last->delete;
122             } else {
123 0           last;
124             }
125              
126 0           $last = $prev;
127             }
128              
129             # prune some but not all insignificant whitespace
130 0           for my $ws (@{ $doc->find (PPI::Token::Whitespace::) }) {
  0            
131 0           my $prev = $ws->previous_token;
132 0           my $next = $ws->next_token;
133              
134 0 0 0       if (!$prev || !$next) {
135 0           $ws->delete;
136             } else {
137 0 0 0       if ($next->isa (PPI::Token::Whitespace::)) {
    0 0        
      0        
      0        
      0        
      0        
      0        
      0        
138 0           $ws->delete;
139             } elsif (
140             $next->isa (PPI::Token::Operator::) && $next->{content} =~ /^(?:,|=|!|!=|==|=>)$/ # no ., because of digits. == float
141             or $prev->isa (PPI::Token::Operator::) && $prev->{content} =~ /^(?:,|=|\.|!|!=|==|=>)$/
142             or $prev->isa (PPI::Token::Structure::)
143             or ($self->{optimise_size} &&
144             ($prev->isa (PPI::Token::Word::)
145             && (PPI::Token::Symbol:: eq ref $next
146             || $next->isa (PPI::Structure::Block::)
147             || $next->isa (PPI::Structure::List::)
148             || $next->isa (PPI::Structure::Condition::)))
149             )
150             ) {
151             # perl has some idiotic warnings about nonexisting operators
152 0 0 0       if ($prev->isa (PPI::Token::Operator::) && $prev->{content} eq "="
      0        
      0        
153             && $next->isa (PPI::Token::Operator::) && $next->{content} =~ /[+\-]/
154             ) {
155             # avoid "Reverse %s operator" diagnostic
156             } else {
157 0           $ws->delete;
158             }
159             } else {
160 0           $ws->{content} = ' ';
161             }
162             }
163             }
164              
165             # prune whitespace around blocks, also ";" at end of blocks
166 0 0         if ($self->{optimise_size}) {
167             # these usually decrease size, but decrease compressability more
168 0           for my $struct (PPI::Structure::Block::, PPI::Structure::Condition::, PPI::Structure::List::) {
169 0           for my $node (@{ $doc->find ($struct) }) {
  0            
170 0           my $n1 = $node->first_token;
171             # my $n2 = $n1->previous_token;
172 0           my $n3 = $n1->next_token;
173 0 0         $n1->delete if $n1->isa (PPI::Token::Whitespace::);
174             # $n2->delete if $n2 && $n2->isa (PPI::Token::Whitespace::); # unsafe! AE::timer $MAX_SIGNAL_LATENCY -($NOW - int$NOW)
175 0 0 0       $n3->delete if $n3 && $n3->isa (PPI::Token::Whitespace::);
176 0           my $n1 = $node->last_token;
177 0           my $n2 = $n1->next_token;
178 0           my $n3 = $n1->previous_token;
179 0 0         $n1->delete if $n1->isa (PPI::Token::Whitespace::);
180 0 0 0       $n2->delete if $n2 && $n2->isa (PPI::Token::Whitespace::);
181 0 0 0       $n3->{content} = "" # delete seems to trigger a bug inside PPI
      0        
182             if $n3 && ($n3->isa (PPI::Token::Whitespace::)
183             || ($n3->isa (PPI::Token::Structure::) && $n3->content eq ";"));
184             }
185             }
186             }
187              
188             # foreach => for
189 0           for my $node (@{ $doc->find (PPI::Statement::Compound::) }) {
  0            
190 0 0         if (my $n = $node->first_token) {
191 0 0 0       $n->{content} = "for" if $n->{content} eq "foreach" && $n->isa (PPI::Token::Word::);
192             }
193             }
194              
195             # reformat qw() lists which often have lots of whitespace
196 0           for my $node (@{ $doc->find (PPI::Token::QuoteLike::Words::) }) {
  0            
197 0 0         if ($node->{content} =~ /^qw(.)(.*)(.)$/s) {
198 0           my ($a, $qw, $b) = ($1, $2, $3);
199 0           $qw =~ s/^\s+//;
200 0           $qw =~ s/\s+$//;
201 0           $qw =~ s/\s+/ /g;
202 0           $node->{content} = "qw$a$qw$b";
203             }
204             }
205              
206             # prune return at end of sub-blocks
207             #TODO:
208             # PPI::Document
209             # PPI::Statement::Sub
210             # PPI::Token::Word 'sub'
211             # PPI::Token::Whitespace ' '
212             # PPI::Token::Word 'f'
213             # PPI::Structure::Block { ... }
214             # PPI::Statement
215             # PPI::Token::Word 'sub'
216             # PPI::Structure::Block { ... }
217             # PPI::Statement::Break
218             # PPI::Token::Word 'return'
219             # PPI::Token::Whitespace ' '
220             # PPI::Token::Number '5'
221             # PPI::Token::Structure ';'
222             # PPI::Statement::Compound
223             # PPI::Structure::Block { ... }
224             # PPI::Statement::Break
225             # PPI::Token::Word 'return'
226             # PPI::Token::Whitespace ' '
227             # PPI::Token::Number '8'
228             # PPI::Statement::Break
229             # PPI::Token::Word 'return'
230             # PPI::Token::Whitespace ' '
231             # PPI::Token::Number '7'
232              
233             1
234 0           }
235              
236             =item $perl = $transform->strip ($perl)
237              
238             Strips the perl source in C<$perl> and returns the stripped source.
239              
240             =cut
241              
242             sub strip {
243 0     0 1   my ($self, $src) = @_;
244              
245             my $filter = sub {
246 0 0   0     my $ppi = new PPI::Document \$src
247             or return;
248              
249 0 0         $self->document ($ppi)
250             or return;
251              
252 0           $src = $ppi->serialize;
253 0           };
254              
255 0 0 0       if (exists $self->{cache} && (2048 <= length $src)) {
256 0           my $file = "$self->{cache}/" . Digest::MD5::md5_hex "$CACHE_VERSION \n" . (!!$self->{optimise_size}) . "\n\x00$src";
257              
258 0 0         if (open my $fh, "<:perlio", $file) {
259             # zero size means unchanged
260 0 0         if (-s $fh) {
261 0           local $/;
262 0           $src = <$fh>
263             }
264             } else {
265 0           my $oldsrc = $src;
266              
267 0           $filter->();
268              
269 0           mkdir $self->{cache};
270              
271 0 0         if (open my $fh, ">:perlio", "$file~") {
272             # write a zero-byte file if source is unchanged
273 0 0 0       if ($oldsrc eq $src or (syswrite $fh, $src) == length $src) {
274 0           close $fh;
275 0           rename "$file~", $file;
276             }
277             }
278             }
279             } else {
280 0           $filter->();
281             }
282              
283 0           $src
284             }
285              
286             =back
287              
288             =head1 SEE ALSO
289              
290             L, L.
291              
292             =head1 AUTHOR
293              
294             Marc Lehmann
295             http://home.schmorp.de/
296              
297             =cut
298              
299             1;
300