File Coverage

blib/lib/Goo/TextUtilities.pm

Criterion	Covered	Total	%
statement	9	83	10.8
branch	0	10	0.0
condition			n/a
subroutine	3	17	17.6
pod	14	14	100.0
total	26	124	20.9

line	stmt	bran	sub	pod	time	code
1						#!/usr/bin/perl
2
3						package Goo::TextUtilities;
4
5						###############################################################################
6						# trexy.com - miscellaneous utilities for handling text
7						#
8						# Copyright Nigel Hamilton 2002
9						# All Rights Reserved
10						#
11						# Author: Nigel Hamilton
12						# Filename: Goo::TextUtilities.pm
13						# Description: Miscellaneous utilities for handling text
14						#
15						# Date Change
16						# -----------------------------------------------------------------------------
17						# 07/05/2001 Version 1
18						# 17/03/2003 Expanded to handle HTML, Javascript etc.
19						# 15/05/2003 ' this character was not being stripped from
20						# the HTML sent to the browser needed to strip it out
21						# 03/08/2005 Added getMatchingLineNumber
22						#
23						###############################################################################
24
25	1		1		6	use strict;
	1				2
	1				34
26
27	1		1		916	use URI;
	1				5824
	1				31
28	1		1		627	use Goo::Logger;
	1				3
	1				1243
29
30						###############################################################################
31						#
32						# get_hostname - return a hostname from the url
33						#
34						###############################################################################
35
36						sub get_hostname {
37
38	0		0	1		my ($url) = @_;
39
40	0					my $hostname;
41
42						# prepend http:// to the URL if it is missing - but why would it be missing?
43						# watch out for perltidy!
44	0	0				unless ($url =~ /^http:\/\//i) { $url = "http://" . $url; }
	0
45
46	0					eval {
47
48						# catch unwanted exception thrown
49						# this function will die if the protocol is not included (http://)
50						# if the protocol is partially included it won't die but will return null
51						# this was failing during a redirect from dogpile - the redirect worked on
52						# FireFox - but failed on IE???!!! - we suspected an encoding problem
53	0					my $uri = URI->new($url);
54	0					$hostname = $uri->host();
55						};
56
57	0	0				if ($@) {
58	0					Goo::Logger::write("Write tried to resolve this URL: $url ." . $@, "/tmp/uri.bug.log");
59	0					die("URI bug: $url " . $@);
60						}
61
62	0					return $hostname;
63						}
64
65						###############################################################################
66						#
67						# strip_hreftags - strip all href tags
68						#
69						###############################################################################
70
71						sub strip_hreftags {
72
73	0		0	1		my ($string) = @_;
74
75	0					$string =~ s!<a\s+
76						( "[^"]*" \|
77						'[^']*' \|
78						[^'">]
79						)*
80						>.*?</a>!!gsix;
81
82	0					return $string;
83
84						}
85
86						###############################################################################
87						#
88						# uppercase_first_letters - turn the first letters of each word into uppercase
89						#
90						###############################################################################
91
92						sub uppercase_first_letters {
93
94	0		0	1		my ($string) = @_;
95
96						# substitute at word boundaries
97						# store the word in $1
98						# set the whole thing to lowercase and the first letter to uppercase
99	0					$string =~ s/\b([\w\']+)/\L\u$1/g;
100
101	0					return $string;
102						}
103
104						###############################################################################
105						#
106						# escape_url - escape a url string
107						#
108						###############################################################################
109
110						sub escape_url {
111
112	0		0	1		my ($string) = @_;
113
114						# substitute any spaces for
115	0					$string =~ s/ /\+/g;
116
117	0					return $string;
118						}
119
120						###############################################################################
121						#
122						# strip_funky_html - strip any html that is too funky for a normal tag strip
123						#
124						###############################################################################
125
126						sub strip_funky_html {
127
128	0		0	1		my ($string) = @_;
129
130	0					$string =~ s!<script[^>]>.?</script>! !sig; # strip Javascript
131	0					$string =~ s!<style[^>]>.?</style>! !sig; # strip stylesheets
132	0					$string =~ s\|<!--.*?-->\| \|sig; # strip HTML comments
133
134	0					$string = strip_html($string); # strip all other tags
135
136						# strip any html entities like   - this could be better
137	0					$string =~ s/&[a-zA-Z]{1,4};/ /sig;
138
139						# strip any numeric entities
140	0					$string =~ s/&[0-9]{1,4};/ /g;
141
142						# strip any numeric entities
143	0					$string =~ s/&\#[0-9]{1,4};/ /g;
144
145						# strip any parentheses ()
146	0					$string =~ s/$\W*$/ /g;
147
148						# strip any literal carriage returns
149	0					$string =~ s/\\[rn]/ /g;
150
151	0					$string = compress_whitespace($string);
152
153	0					return $string;
154						}
155
156						###############################################################################
157						#
158						# strip_html - strip the html from a string
159						#
160						###############################################################################
161
162						sub strip_html {
163
164	0		0	1		my ($string) = @_;
165
166						# strip HTML entities
167	0					$string =~ s/\&lt\;/</ig;
168	0					$string =~ s/\&gt\;/>/ig;
169
170						# strip tags
171	0					$string =~ s/<[^>]*>//g;
172
173	0					return $string;
174						}
175
176						###############################################################################
177						#
178						# trim_whitespace - strip whitespace from the front and back of a string
179						#
180						###############################################################################
181
182						sub trim_whitespace {
183
184	0		0	1		my ($string) = @_;
185
186	0					$string =~ s/^\s+//g; # strip leading whitespace
187	0					$string =~ s/\s+$//g; # string trailing whitespace
188
189	0					return $string;
190						}
191
192						###############################################################################
193						#
194						# compress_whitespace - compress excess whitespace from many to 1 space
195						#
196						###############################################################################
197
198						sub compress_whitespace {
199
200	0		0	1		my ($string) = @_;
201
202	0					$string =~ s/\s+/ /g; # compress whitespace
203
204	0					return $string;
205						}
206
207						###############################################################################
208						#
209						# right_pad - pad a string on the righthand side up to a maximum
210						#
211						###############################################################################
212
213						sub right_pad {
214
215	0		0	1		my ($string, $padding, $maxsize) = @_;
216
217						# truncate the string if longer than maxsize
218	0					$string = substr($string, 0, $maxsize);
219
220						# add some padding on the right
221	0					return $string . $padding x ($maxsize - length($string));
222
223						}
224
225						###############################################################################
226						#
227						# strip_last_word - strip the last word off the end of a string
228						#
229						###############################################################################
230
231						sub strip_last_word {
232
233	0		0	1		my ($string) = @_;
234
235						# go to the end of the string and snip off the first bit of
236						# non-whitespace
237	0					$string =~ s/\S+$//;
238
239	0					return $string;
240
241						}
242
243						###############################################################################
244						#
245						# left_pad - pad a string on the lefthand side up to a maximum
246						#
247						###############################################################################
248
249						sub left_pad {
250
251	0		0	1		my ($string, $padding, $maxsize) = @_;
252
253						# truncate the string if longer than maxsize
254	0					$string = substr($string, 0, $maxsize);
255
256						# add some padding on the left
257	0					return ($maxsize - length($string)) x $padding . $string;
258
259						}
260
261						###############################################################################
262						#
263						# truncate_string - reduce the size of the string and remove the last word
264						#
265						###############################################################################
266
267						sub truncate_string {
268
269	0		0	1		my ($string, $size, $dots) = @_;
270
271						# print $string;
272	0	0				if (length($string) > $size) {
273
274						#print "--------> in here <----- $size";
275	0					$string = substr($string, 0, $size);
276
277						# print $string;
278						#print $string;
279						# lop off the last word - removes partial words
280	0					$string = strip_last_word($string);
281
282						# add dots if we want them
283	0	0				if ($dots) { $string .= $dots; }
	0
284						}
285
286	0					return $string;
287
288						}
289
290						###############################################################################
291						#
292						# escape_javascript - escape double quotes etc.
293						#
294						###############################################################################
295
296						sub escape_javascript {
297
298	0		0	1		my ($string) = @_;
299
300						# escape any double quotes, so the Javascript parses OK
301	0					$string =~ s/"/\\"/g;
302
303						# strip line feeds
304	0					$string =~ s/[\n\r]+//g;
305
306						# strip excess whitespace around = signs
307	0					$string =~ s/\s+=\s+/=/g;
308
309						# strip excess whitespace
310	0					$string =~ s/\s+/ /g;
311
312	0					return $string;
313
314						}
315
316						###############################################################################
317						#
318						# get_matching_line_number - return the linenumber that matches the regex
319						#
320						###############################################################################
321
322						sub get_matching_line_number {
323
324	0		0	1		my ($regex, $string) = @_;
325
326	0					my @lines = split(/\n/, $string);
327
328	0					my $linecount = 0;
329
330	0					foreach my $line (@lines) {
331
332	0					$linecount++;
333
334	0	0				if ($line =~ /$regex/) {
335
336						# add 5 to get into the body of the method
337	0					return $linecount;
338						}
339
340						}
341
342	0					return $linecount;
343						}
344
345						1;
346
347
348						__END__
349
350						=head1 NAME
351
352						Goo::TextUtilities - Miscellaneous utilities for handling text
353
354						=head1 SYNOPSIS
355
356						use Goo::TextUtilities;
357
358						=head1 DESCRIPTION
359
360
361
362						=head1 METHODS
363
364						=over
365
366						=item get_hostname
367
368						return a hostname from the url
369
370						=item strip_hreftags
371
372						strip all href tags in a string
373
374						=item uppercase_first_letters
375
376						turn the first letters of each word into uppercase
377
378						=item escape_url
379
380						escape a url string
381
382						=item strip_funky_html
383
384						strip any HTML that is too funky for a normal tag strip
385
386						=item strip_html
387
388						strip the HTML from a string
389
390						=item trim_whitespace
391
392						strip whitespace from the front and back of a string
393
394						=item compress_whitespace
395
396						compress excess whitespace from many spaces to one space
397
398						=item right_pad
399
400						pad a string on the righthand side up to a maximum number of characters
401
402						=item strip_last_word
403
404						strip the last word off the end of a string
405
406						=item left_pad
407
408						pad a string on the lefthand side up to a maximum
409
410						=item truncate_string
411
412						reduce the size of the string and remove the last word
413
414						=item escape_javascript
415
416						escape double quotes etc.
417
418						=item get_matching_line_number
419
420						return the linenumber that matches the regex
421
422						=back
423
424						=head1 AUTHOR
425
426						Nigel Hamilton <nigel@trexy.com>
427
428						=head1 SEE ALSO
429