File Coverage

blib/lib/Lingua/EN/Squeeze.pm

Criterion	Covered	Total	%
statement	126	129	97.6
branch	32	60	53.3
condition	0	72	0.0
subroutine	14	14	100.0
pod	2	2	100.0
total	174	277	62.8

line	stmt	bran	cond	sub	pod	time	code
1							# Squeez.pm - Perl package to shorten text to minimum syllables
2							# $Id: Squeeze.pm,v 1.8 2005-12-05 09:02:49 jaalto Exp $
3							#
4							# This file is maintaned by using Emacs (The Editor) and add-on
5							# packages. See http://tiny-tools.sourceforge.net/
6							#
7							# tinytab.el -- indent mode
8							# tinyperl -- Perl helper mode (pod docs, stubs etc)
9							#
10							# To generate HTML
11							#
12							# $ perl -e 'use Pod::Html qw(pod2html); pod2html shift @ARGV' FILE.pm
13
14							package Lingua::EN::Squeeze;
15							$Lingua::EN::Squeeze::VERSION = '2020.02';
16	1			1		5230	use Scalar::Util qw/ reftype /;
	1					2
	1					127
17
18	1			1		19	use 5.006;
	1					4
19	1			1		5	use strict;
	1					2
	1					33
20	1			1		5	use warnings;
	1					2
	1					109
21
22							my $LIB = "Lingua::EN::Squeeze"; # For debug printing
23
24
25							# ***********************************************************************
26							#
27							# POD HEADER
28							#
29							# ***********************************************************************
30
31							=pod
32
33							=head1 NAME
34
35							Lingua::EN::Squeeze - Shorten text to minimum syllables using hash table lookup and vowel deletion
36
37							=head1 SYNOPSIS
38
39							use Lingua::EN::Squeeze; # import only function
40							use Lingua::EN::Squeeze qw( :ALL ); # import all functions and variables
41							use English; # to use readable variable names
42
43							while (<>) {
44							print "Original: $_\n";
45							print "Squeezed: ", SqueezeText(lc $_), "\n";
46							}
47
48							# Or you can use object oriented interface
49
50							$squeeze = Lingua::EN::Squeeze->new();
51
52							while (<>) {
53							print "Original: $_\n";
54							print "Squeezed: ", $squeeze->SqueezeText(lc $_);
55							}
56
57							=head1 DESCRIPTION
58
59							This module squeezes English text to the most compact format possible,
60							so that it is barely readable.
61							Be sure to convert all text to lowercase before using the
62							SqueezeText() for maximum compression,
63							because optimizations have been
64							designed mostly for lower case letters.
65
66							B:
67							Each line is processed multiple times, so prepare for slow conversion time
68
69							You can use this module e.g. to preprocess text before it is sent to
70							electronic media that has some maximum text size limit. For example pagers
71							have an arbitrary text size limit, typically around 200 characters, which
72							you want to fill as much as possible. Alternatively you may have GSM
73							cellular phone which is capable of receiving Short Messages (SMS), whose
74							message size limit is 160 characters. For demonstration of this module's
75							SqueezeText() function, this paragraph's conversion result is presented
76							below. See yourself if it's readable (Yes, it takes some time to get used
77							to). The compression ratio is typically 30-40%
78
79							u _n use thi mod e.g. to prprce txt bfre i_s snt to
80							elrnic mda has som max txt siz lim. f_xmple pag
81							hv abitry txt siz lim, tpcly 200 chr, W/ u wnt
82							to fll as mch as psbleAlternatvly u may hv GSM cllar P8
83							w_s cpble of rcivng Short msg (SMS), WS/ msg siz
84							lim is 160 chr. 4 demonstrton of thi mods SquezText
85							fnc , dsc txt of thi prgra has ben cnvd_ blow
86							See uself if i_s redble (Yes, it tak som T to get usdto
87							compr rat is tpcly 30-40
88
89							And if $SQZ_OPTIMIZE_LEVEL is set to non-zero
90
91							u_nUseThiModE.g.ToPrprceTxtBfreI_sSntTo
92							elrnicMdaHasSomMaxTxtSizLim.F_xmplePag
93							hvAbitryTxtSizLim,Tpcly200Chr,W/UWnt
94							toFllAsMchAsPsbleAlternatvlyUMayHvGSMCllarP8
95							w_sCpbleOfRcivngShortMsg(SMS),WS/MsgSiz
96							limIs160Chr.4DemonstrtonOfThiModsSquezText
97							fnc,DscTxtOfThiPrgraHasBenCnvd_Blow
98							SeeUselfIfI_sRedble(Yes,ItTakSomTToGetUsdto
99							comprRatIsTpcly30-40
100
101							The comparision of these two show
102
103							Original text : 627 characters
104							Level 0 : 433 characters reduction 31 %
105							Level 1 : 345 characters reduction 45 % (+14% improvement)
106
107							There are few grammar rules which are used to shorten some English
108							tokens considerably:
109
110							Word that has _ is usually a verb
111
112							Word that has / is usually a substantive, noun,
113							pronomine or other non-verb
114
115							Read following substituting tokens in order to understand the basics of
116							converted text. Hopefully, the text is not pure Geek code (tm) to you after
117							some practice. In Geek code (Like G++L--J) you would need an external
118							parser to understand it. Here some common sense and time is needed to adapt
119							oneself to the compressed format. I
120							would be better off peeking the source code>
121
122							automatically => 'acly_'
123
124							for => 4
125							for him => 4h
126							for her => 4h
127							for them => 4t
128							for those => 4t
129
130							can => _n
131							does => _s
132
133							it is => i_s
134							that is => t_s
135							which is => w_s
136							that are => t_r
137							which are => w_r
138
139							less => -/
140							more => +/
141							most => ++
142
143							however => h/ver
144							think => thk_
145
146							useful => usful
147
148							you => u
149							your => u/
150							you'd => u/d
151							you'll => u/l
152							they => t/
153							their => t/r
154
155							will => /w
156							would => /d
157							with => w/
158							without => w/o
159							which => W/
160							whose => WS/
161
162							Time is expressed with big letters
163
164							time => T
165							minute => MIN
166							second => SEC
167							hour => HH
168							day => DD
169							month => MM
170							year => YY
171
172							Other big letter acronyms, think 8 to represent the speaker and the
173							microphone.
174
175							phone => P8
176
177							=head1 EXAMPLES
178
179							To add new words e.g. to word conversion hash table, you'd define a custom
180							set and merge them to existing ones. Do similarly to
181							C<%SQZ_WXLATE_MULTI_HASH> and C<$SQZ_ZAP_REGEXP> and then start using the
182							conversion function.
183
184							use English;
185							use Squeeze qw( :ALL );
186
187							my %myExtraWordHash =
188							(
189							new-word1 => 'conversion1'
190							, new-word2 => 'conversion2'
191							, new-word3 => 'conversion3'
192							, new-word4 => 'conversion4'
193							);
194
195							# First take the existing tables and merge them with the above
196							# translation table
197
198							my %mySustomWordHash =
199							(
200							%SQZ_WXLATE_HASH
201							, %SQZ_WXLATE_EXTRA_HASH
202							, %myExtraWordHash
203							);
204
205							my $myXlat = 0; # state flag
206
207							while (<>)
208							{
209							if ( $condition )
210							{
211							SqueezeHashSet \%mySustomWordHash; # Use MY conversions
212							$myXlat = 1;
213							}
214
215							if ( $myXlat and $condition )
216							{
217							SqueezeHashSet "reset"; # Back to default table
218							$myXlat = 0;
219							}
220
221							print SqueezeText $ARG;
222							}
223
224							Similarly you can redefine the multi word translation table by supplying
225							another hash reference in call to SqueezeHashSet(). To kill more text
226							immediately in addition to default, just concatenate regexps to variable
227							I<$SQZ_ZAP_REGEXP>
228
229							=head1 KNOWN BUGS
230
231							There may be lot of false conversions and if you think that some word
232							squeezing went too far, please 1) turn on the debug 2) send you example
233							text 3) debug log log to the maintainer. To see how the conversion goes
234							e.g. for word I:
235
236							use English;
237							use Lingua::EN:Squeeze;
238
239							# Activate debug when case-insensitive word "Messages" is found from
240							# the line.
241
242							SqueezeDebug( 1, '(?i)Messages' );
243
244							$ARG = "This line has some Messages in it";
245							print SqueezeText $ARG;
246
247							=head1 EXPORTABLE VARIABLES
248
249							The defaults may not apply to all types of text, so you may wish to extend
250							the hash tables and I<$SQZ_ZAP_REGEXP> to cope with your typical text.
251
252							=head2 $SQZ_ZAP_REGEXP
253
254							Text to kill immediately, like "Hm, Hi, Hello..." You can only set this
255							once, because this regexp is compiled immediately when C is
256							called for the first time.
257
258							=head2 $SQZ_OPTIMIZE_LEVEL
259
260							This controls how optimized the text will be. Currently there is only level
261							0 (default) and level 1. Level 1 removes all spaces. That usually improves
262							compression by average of 10%, but the text is more harder to read. If
263							space is real tight, use this extended compression optimization.
264
265							=head2 %SQZ_WXLATE_MULTI_HASH
266
267							I conversion hash table: "for you" => "4u" ...
268
269							=head2 %SQZ_WXLATE_HASH
270
271							I conversion hash table: word => conversion. This table is applied
272							after C<%SQZ_WXLATE_MULTI_HASH> has been used.
273
274							=head2 %SQZ_WXLATE_EXTRA_HASH
275
276							Aggressive I conversions like: without => w/o are applied last.
277
278							=cut
279
280
281							# **********************************************************************
282							#
283							# MODULE INTERFACE
284							#
285							# ***********************************************************************
286
287							# Somehow doesn't work in Perl 5.004 ?
288							# use autouse 'Carp' => qw( croak carp cluck confess );
289
290	1			1		7	use Carp;
	1					1
	1					87
291	1			1		520	use SelfLoader;
	1					8645
	1					51
292	1			1		7	use English;
	1					2
	1					6
293
294							BEGIN
295							{
296							# ......................................................... &use ...
297
298							use vars qw
299	1					182	(
300							@ISA
301							@EXPORT
302							@EXPORT_OK
303							%EXPORT_TAGS
304
305							$FILE_ID
306
307							$debug
308							$debugRegexp
309
310							$SQZ_ZAP_REGEXP
311							$SQZ_OPTIMIZE_LEVEL
312
313							%SQZ_WXLATE_HASH
314							%SQZ_WXLATE_EXTRA_HASH
315							%SQZ_WXLATE_MULTI_HASH
316	1			1		370	);
	1					2
317
318	1			1		9	$FILE_ID =
319							q$Id: Squeeze.pm,v 1.8 2005-12-05 09:02:49 jaalto Exp $;
320
321							# Here woudl be the real version number, which you use like this:
322							#
323							# use Squeeze 1.34;
324							#
325							# Derive version number, the index is 1 if matches
326							# Clearcase @@ in file_id string. index is 2 if this was
327							# RCS identifier.
328
329	1	50				7	my $ver = (split ' ', $FILE_ID)[$FILE_ID =~ /@@/ ? 1 : 2];
330
331							# Commented out. Better to use the date based version number,
332							# because it is more informative
333							#
334							# $VERSION = sprintf "%d.%02d", $ver =~ /(\d+)\.(\d+)/;
335
336							# ...................................................... &export ...
337
338	1			1		8	use Exporter ();
	1					2
	1					75
339
340	1					17	@ISA = qw(Exporter);
341
342	1					4	@EXPORT = qw
343							(
344							&SqueezeText
345							&SqueezeControl
346							&SqueezeDebug
347							);
348
349	1					2	@EXPORT_OK = qw
350							(
351							&SqueezeHashSet
352
353							$SQZ_ZAP_REGEXP
354							$SQZ_OPTIMIZE_LEVEL
355
356							%SQZ_WXLATE_HASH
357							%SQZ_WXLATE_EXTRA_HASH
358							%SQZ_WXLATE_MULTI_HASH
359							);
360
361	1					478	%EXPORT_TAGS =
362							(
363							ALL => [ @EXPORT_OK, @EXPORT ]
364							);
365							}
366
367							# ******************************************************* &globals *
368							#
369							# GLOBALS
370							#
371							# **********************************************************************
372
373							$debug = 0;
374							$debugRegexp = '(?i)DummyYummy';
375
376							$SQZ_ZAP_REGEXP =
377							'\b(a\|an\|the\|shall\|hi\|hello\|cheers\|that)\b'
378							. '\|Thanks (in advance)?\|thank you\|well'
379							. '\|N\.B\.\|\beg.\|\btia\b'
380							. '\|\bHi,?\b\|\bHm+,?\b'
381							. '\|!'
382							. '\|wrote:\|writes:'
383
384							# Finnish greetings
385
386							. '\|\b(Terve\|Moi\|Hei\|Huomenta)\b'
387
388							;
389
390							$SQZ_OPTIMIZE_LEVEL = 0;
391
392							# ............................................................ &word ...
393							# A special mnemonic is signified by postfixing it with either
394							# of these characters:
395							#
396							# / prononym, noun
397							# _ verb
398
399							%SQZ_WXLATE_HASH =
400							(
401							above => 'abve'
402							, address => 'addr'
403							, adjust => 'adj'
404							, adjusted => 'ajusd'
405							, adjustable => 'ajutbl'
406							, arbitrary => 'abitry'
407							, argument => 'arg'
408
409							, background => 'bg'
410							, below => 'blow'
411
412							, change => 'chg'
413							, character => 'chr'
414							, control => 'ctl'
415							, command => 'cmd'
416							, compact => 'cpact'
417							, convert => 'cnv_'
418							, converted => 'cnvd_'
419							, conversion => 'cnv'
420							, cooperation => 'c-o'
421							, correct => 'corr'
422							, correlate => 'corrl'
423							, create => 'creat'
424
425							, database => 'db'
426							, day => 'DD'
427							, date => 'DD'
428							, definition => 'defn'
429							, description => 'desc'
430							, different => 'dif'
431							, differently => 'difly'
432							, directory => 'dir'
433							, documentation => 'doc'
434							, document => 'doc/'
435
436							, 'each' => 'ech'
437							, electronic => 'elrnic'
438							, electric => 'elric'
439							, enable => 'enbl'
440							, english => 'eng'
441							, environment => 'env'
442							, everytime => 'when'
443							, example => 'xmple'
444							, expire => 'xpre'
445							, expect => 'exp'
446							, extend => 'extd'
447
448							, field => 'fld'
449							, following => 'fwng'
450							, 'for' => '4'
451							, 'format' => 'fmt'
452							, forward => 'fwd'
453							, function => 'func'
454
455							, gateway => 'gtw'
456							, generated => 'gntd'
457
458							, have => 'hv'
459							, herself => 'hself'
460							, himself => 'hself'
461							, hour => 'HH'
462
463							, identifier => 'id'
464							, information => 'inf'
465							, inform => 'ifrm'
466							, increase => 'inc'
467							, installed => 'ins'
468
469							, level => 'lev'
470							, limit => 'lim'
471							, limiting => 'limg'
472							, located => 'loctd'
473							, lowercase => 'lc'
474
475							, managed => 'mged'
476							, megabyte => 'meg'
477							, maximum => 'max'
478							, member => 'mbr'
479							, message => 'msg'
480							, minute => 'MIN'
481							, minimum => 'min'
482							, module => 'mod'
483							, month => 'MM'
484
485							, 'name' => 'nam'
486							, 'number' => 'nbr'
487
488							, okay => 'ok'
489							, 'other' => 'otr'
490							, 'others' => 'otr'
491
492							, 'package' => 'pkg'
493							, page => 'pg'
494							, parameter => 'param'
495							, password => 'pwd'
496							, pointer => 'ptr'
497							, public => 'pub'
498							, private => 'priv'
499							, problem => 'prb'
500							, process => 'proc'
501							, project => 'prj'
502
503							, recipient => 'rcpt' # this is SMTP acronym
504							, released => 'relsd'
505							, reserve => 'rsv'
506							, register => 'reg'
507							, resource => 'rc'
508							, return => 'ret'
509							, returned => 'ret'
510							, 'require' => 'rq'
511
512							, subject => 'sbj'
513							, soconds => 'SEC'
514							, service => 'srv'
515							, squeeze => 'sqz'
516							, something => 'stng'
517							, sometimes => 'stims'
518							, status => 'stat'
519							, still => 'stil'
520							, straightforward => 'sfwd'
521							, submit => 'sbmit'
522							, submitting => 'sbmtng'
523							, symbol => 'sym'
524							, 'system' => 'sytm'
525
526							, 'time' => 'T'
527							, translate => 'tras'
528
529							, understand => 'untnd'
530							, uppercase => 'uc'
531							, usually => 'usual'
532
533							, year => 'YY'
534							, you => 'u'
535							, your => 'u/'
536							, yourself => 'uself'
537
538							, 'version' => 'ver'
539
540							, warning => 'warng'
541							, with => 'w/'
542							, work => 'wrk'
543
544							);
545
546							%SQZ_WXLATE_EXTRA_HASH =
547							(
548							anything => 'atng'
549							, automatically => 'acly_'
550
551							, can => '_n'
552
553							, does => '_s'
554							, dont => '_nt'
555							, "don't" => '_nt'
556							, 'exists' => 'ex_'
557
558							, everything => 'etng/'
559
560							, however => 'h/ver'
561
562							, increment => 'inc/'
563							, interesting => 'inrsg'
564							, interrupt => 'irup'
565
566							# not spelled like 'less', because plural substitution seens
567							# this first 'less' -> 'les'
568
569							, 'les' => '-/'
570
571							, 'more' => '+/'
572							, most => '++'
573
574							, phone => 'P8'
575							, please => 'pls_'
576							, person => 'per/'
577
578							, should => 's/d'
579							, they => 't/'
580							, their => 't/r'
581							, think => 'thk_'
582							, 'which' => 'W/'
583							, without => 'w/o'
584							, whose => 'WS/'
585							, will => '/w'
586							, would => '/d'
587
588							, "you'd" => 'u/d'
589							, "you'll" => 'u/l'
590
591							);
592
593							# ........................................................... &multi ...
594
595							%SQZ_WXLATE_MULTI_HASH =
596							(
597							'for me' => '4m'
598							, 'for you' => '4u'
599							, 'for him' => '4h'
600							, 'for her' => '4h'
601							, 'for them' => '4t'
602							, 'for those' => '4t'
603
604							, 'for example' => 'f_xmple'
605
606							, 'with or without' => 'w/o'
607
608							, 'it is' => 'i_s'
609							, "it's" => 'i_s'
610
611							, 'that is' => 't_s'
612							, "that's" => 't_s'
613							, "that don't" => 't_nt'
614
615							, 'which is' => 'w_s'
616							, "which's" => 'w_s'
617							, "which don't" => 'w_nt'
618
619							, 'that are' => 't_r'
620							, "that're" => 't_r'
621							, "that are not" => 't_rt'
622
623							, 'which are' => 'w_r'
624							, 'which are not' => 'w_rt'
625							, "which aren't" => 'w_rt'
626
627							, "has not" => 'hs_t'
628							, "have not" => 'hv_t'
629
630							, "that has" => 't_hs'
631							, "that has not" => 't_hst'
632							, "that hasn't" => 't_hst'
633
634							, 'which has' => 'w_hs'
635							, 'which has not' => 'w_hst'
636							, "which hasn't" => 'w_hst'
637
638							, "that have" => 't_hv'
639							, "that have not" => 't_hvt'
640							, "that haven't" => 't_hvt'
641
642							, 'which have' => 'w_hv'
643							, "which have not" => 'w_hvt'
644							, "which haven't" => 'w_hvt'
645
646							, "that had" => 't_hd'
647							, "that had not" => 't_hdt'
648							, "that hadn't" => 't_hdt'
649
650							, 'which had' => 'w_hd'
651							, 'which had not' => 'w_hdt'
652							, "which hadn't" => 'w_hdt'
653
654							, 'used to' => 'usdto'
655							, 'due to' => 'd_to'
656
657							, 'United Kingdom' => 'UK'
658							, 'United States' => 'US'
659							);
660
661							# ******************************************************* &private *
662							#
663							# PRIVATE VARIABLES
664							#
665							# **********************************************************************
666
667							# We must declare package globals sot hat SelfLoader sees them after
668							# __DATA__
669
670							use vars qw
671	1					115	(
672							%SQZ_WXLATE_MULTI_HASH_MEDIUM
673							%SQZ_WXLATE_MULTI_HASH_MAX
674							%SQZ_WXLATE_HASH_MEDIUM
675							%SQZ_WXLATE_HASH_MAX
676	1			1		8	);
	1					2
677
678							%SQZ_WXLATE_MULTI_HASH_MEDIUM = %SQZ_WXLATE_MULTI_HASH;
679							%SQZ_WXLATE_MULTI_HASH_MAX = %SQZ_WXLATE_MULTI_HASH;
680
681							%SQZ_WXLATE_HASH_MEDIUM = %SQZ_WXLATE_HASH;
682							%SQZ_WXLATE_HASH_MAX = ( %SQZ_WXLATE_HASH, %SQZ_WXLATE_EXTRA_HASH);
683
684							# The Active translate tables
685							#
686							# User isn't suppose to touch this, but in case you need to know
687							# exactly what traslations are going and what table is in use, then peeek
688							# these.
689							#
690							# $Lingua::EN::Squeeze::wordXlate{above}
691
692							use vars qw
693	1					1809	(
694							%wordXlate
695							%multiWordXlate
696							$STATE
697	1			1		14	);
	1					2
698
699							%wordXlate = %SQZ_WXLATE_HASH_MAX;
700							%multiWordXlate = %SQZ_WXLATE_MULTI_HASH;
701							$STATE = "max"; # Squeeze level
702
703							# **********************************************************************
704							#
705							# I N T E R F A C E
706							#
707							# *********************************************************************
708
709							=pod
710
711							=head1 INTERFACE FUNCTIONS
712
713							=cut
714
715
716							# **********************************************************************
717							#
718							# PUBLIC FUNCTION
719							#
720							# *********************************************************************
721
722							=pod
723
724							=head2 SqueezeObjectArg($)
725
726							=over
727
728							=item Description
729
730							Return subroutine argument in both function and object cases.
731							This is a wrapper utility to make package work as a function
732							library as well as OO class.
733
734							=item @list
735
736							List of arguments. Usually the first one is object if class
737							interface is used.
738
739							=item Return values
740
741							Return arguments without the first object parameter.
742
743							=back
744
745							=cut
746
747							sub SqueezeObjectArg (@)
748							{
749	12			12	1	4159	my @list = @ARG;
750	12					21	my $ref = ref( $list[0] );
751
752							# This test may not be the bets, but we suppose this is
753							# class if we find text like 'Linguag::EN::Squeeze'.
754							#
755							# FIXME: What about derived classes (although unlikely)
756
757	12	100				34	if ( $ref =~ /::[a-z]+::/i )
758							{
759	1					2	shift @list; # Remove arg
760							}
761
762	12					31	@list;
763							}
764
765							# **********************************************************************
766							#
767							# PUBLIC FUNCTION
768							#
769							# *********************************************************************
770
771							=pod
772
773							=head2 SqueezeText($)
774
775							=over
776
777							=item Description
778
779							Squeeze text by using vowel substitutions and deletions and hash tables
780							that guide text substitutions. The line is parsed multiple times and
781							this will take some time.
782
783							=item arg1: $text
784
785							String. Line of Text.
786
787							=item Return values
788
789							String, squeezed text.
790
791							=back
792
793							=cut
794
795							sub SqueezeText ($)
796							{
797							# If you wonder how these substitutions were selected ...
798							# Just by feeding text after text to this function and
799							# seeing how it could be compressed even more
800							#
801							# => Trial and error. The order of these substitutions
802							# => is highly significant.
803
804							# ....................................................... &start ...
805
806	6			6	1	1298	my $id = "$LIB.SqueezeText";
807
808	6					15	local($ARG) = SqueezeObjectArg(@ARG);
809
810	6	100				20	return $ARG if $STATE eq 'noconv'; # immediate return, no conversion
811
812	5					7	my $vow = '[aeiouy]'; # vowel
813	5					9	my $nvow = '[^aeiouy\s_/\']'; # non-vowel
814
815	5					7	my $orig = $ARG; # for debug
816	5					8	my $tab = ""; # tab
817
818							# ........................................................ &kill ...
819
820	5	50				20	if ( /^\s*[^\s]{30,}/ ) # Any continuous block. UU line ?
821							{
822	0					0	return "";
823							}
824
825	5	50				13	if ( /^[A-Z][^\s]+: / ) # Email headers "From:"
826							{
827	0					0	return "";
828							}
829
830	5					11	s/^\s+//; # delete leading spaces
831	5					39	s/[ \t]+$//; # delete trailing spaces
832	5					88	s/[ \t]+/ /g; # collapse multiple spaces inside text
833
834							# ........................................................ words ...
835
836							# Kill URLs
837
838	5					13	s{\b\S+://\S+\b}{URL}ig;
839
840							# Delete markup +this+ emphasised strong `text'
841
842	5					43	s/\b[_+`'](\S+)[_+`']\b/$1/ig;
843
844							# DON'T REMOVE. This comment fixes Emacs font-lock problem: s///
845							# From above statement.
846
847	5	50	0			33	$debug and warn $tab,"[markup]\t[$ARG]" if $orig =~ /$debugRegexp/;
848
849							# Delete 3rd person voice
850							# expires => expire
851							#
852							# But do not touch 'was'
853
854	5					90	s/\b($vow\S+$vow)s\b/$1/ogi;
855
856							# says => say
857
858	5					126	s/\b($nvow+\S$vow+)s\b/$1/ogi;
859
860							# vowel .. nvowel + 2
861							# interests => interest
862
863	5					91	s/\b($vow\S+$nvow)s\b/$1/ogi;
864	5	50	0			24	$debug and warn $tab,"[3voice]\t[$ARG]" if $orig =~ /$debugRegexp/;
865
866							# Delete plurals: non-vowel .. non-vowel + s
867							# problems => problem
868
869	5					126	s/\b($nvow\S+$nvow)s\b/$1/ogi;
870	5	50	0			22	$debug and warn $tab,"[plural]\t[$ARG]" if $orig =~ /$debugRegexp/;
871
872							# Delete plurals: non-vowel .. vowel + s
873							# messages => message
874
875	5					124	s/\b($nvow\S+$vow)s\b/$1/ogi;
876	5	50	0			23	$debug and warn $tab,"[plural2]\t[$ARG]" if $orig =~ /$debugRegexp/;
877
878							# zap
879
880	5					646	s/$SQZ_ZAP_REGEXP//oig;
881	5	50	0			27	$debug and warn $tab,"[zap]\t\t[$ARG]" if $orig =~ /$debugRegexp/;
882
883							# ................................................... &translate ...
884
885	5					10	my ($from, $to);
886
887	5					40	for $from ( keys %multiWordXlate )
888							{
889	230					423	$to = $multiWordXlate{ $from };
890	230					2468	s/\b$from\b/$to/ig;
891							}
892
893	5	50	0			31	$debug and warn $tab,"[xlate-multi]\t[$ARG]" if $orig =~ /$debugRegexp/;
894
895	5					89	for $from ( keys %wordXlate )
896							{
897	576					1000	$to = $wordXlate{ $from };
898	576					5935	s/\b$from\b/$to/ig;
899							}
900
901	5	50	0			45	$debug and warn $tab,"[xlate-word]\t[$ARG]" if $orig =~ /$debugRegexp/;
902
903							# ...................................................... &suffix ...
904
905							# From Imperfect to active voice
906							# converted => convert
907
908	5					123	s/\b($nvow\S\S+)ed\b/$1/igo;
909
910							# shorten words with -cally suffix => cly
911
912	5					15	s/cally\b/cly/g;
913
914							# shorten comparision: bigger
915							# We can't deduce quicker --> quick, becasue further on
916							# the word would be converted quick --> qck. Not good.
917
918	5					100	s/\b($nvow+\S+e)r\b/$1/ogi;
919	5	50	0			22	$debug and warn $tab,"[comparis]\t[$ARG]" if $orig =~ /$debugRegexp/;
920
921							# leaning --> leang
922
923	5					26	s/ing\b/ng/ig;
924	5					16	s/io\b/o/ig;
925
926							# uniqe --> uniq
927
928	5	50	0			21	$debug and warn $tab,"[-io]\t\t[$ARG]" if $orig =~ /$debugRegexp/;
929
930							# Watch out "due to"
931
932	5					91	s/(\S\S)ue(ness?)?\b/$1/ig;
933
934							# authenticate -> authentic
935							# Watch out 'state' !
936
937	5					111	s/(\S\S\S)ate\b/$1/ig;
938
939	5	50	0			21	$debug and warn $tab,"[-ate]\t\t[$ARG]" if $orig =~ /$debugRegexp/;
940
941							# .................................................. &heuristics ...
942
943	5	50	0			20	$debug and warn $tab,"[0]\t\t[$ARG]" if $orig =~ /$debugRegexp/;
944
945							# Vocal only at the beginning and end ==> drop last
946							# info => inf
947							#
948							# Don't touch away
949
950	5					88	s/\b($vow+$nvow$nvow)$vow+\b/$1/ogi;
951	5	50	0			20	$debug and warn $tab,"[vowel-last]\t[$ARG]" if $orig =~ /$debugRegexp/;
952
953							# only one vowel in string
954							# help ==> hlp
955							# stat BUT can't deduce to stt
956
957	5					137	s/\b($nvow)$vow($nvow$nvow)\b/$1$2/ogi;
958	5	50	0			25	$debug and warn $tab,"[vowel-one]\t[$ARG]" if $orig =~ /$debugRegexp/;
959
960							# asked --> skd
961
962	5					101	s/\b($vow+$nvow$nvow)$vow($nvow)\b/$1$2/ogi;
963	5	50	0			21	$debug and warn $tab,"[vowel-two]\t[$ARG]" if $orig =~ /$debugRegexp/;
964
965							# Delete two vowels; through --> thrgh
966							# Don't touch words ending to -ly: diffrently, difly
967
968	5					111	s/\b($nvow+)$vow$vow($nvow$nvow+)(?!y)\b/$1$2/ogi;
969	5	50	0			20	$debug and warn $tab,"[vowel-many]\t[$ARG]" if $orig =~ /$debugRegexp/;
970
971							# type => typ
972
973	5					153	s/\b($nvow+$vow$nvow+(?!y))$vow\b/$1/ogi;
974	5	50	0			25	$debug and warn $tab,"[vowel-end]\t[$ARG]" if $orig =~ /$debugRegexp/;
975
976							# many vowels, remove first two
977							# detected => dtcted
978							# service => srvce
979
980	5					258	s/\b(\S+)$vow+($nvow+)$vow+(\S$vow\S)\b/$1$2$3/ogi;
981	5	50	0			20	$debug and warn $tab,"[vowel-more]\t[$ARG]" if $orig =~ /$debugRegexp/;
982
983							# Two consequent vowels
984							# obtain => obtan
985
986	5					131	s/\b(\S*$vow$nvow+$vow)$vow(\S+)\b/$1$2/ogi;
987	5	50	0			29	$debug and warn $tab,"[vowel-22more]\t[$ARG]" if $orig =~ /$debugRegexp/;
988
989							# Two non-vowels at the end
990							# contact => contac
991
992							# s/($nvow)$nvow\b/$1/ogi;
993							# $debug and warn $tab,"[non-vowel-2end][$ARG]" if $orig =~ /$debugRegexp/;
994
995							# Two same vowels
996							# took => tok
997							# keep => kep
998
999	5					159	s/\b(\S+)($vow)\2(\S+)\b/$1$2$3/ogi;
1000	5	50	0			23	$debug and warn $tab,"[vowel-2same]\t[$ARG]" if $orig =~ /$debugRegexp/;
1001
1002							# .................................................... &suffixes ...
1003
1004							# frequency => freq
1005							# acceptance => accept
1006							# distance => dist
1007
1008	5					32	s/u?[ae]nc[ye]\b//ig;
1009
1010							# management => manag
1011							# establishement => establish
1012
1013	5					13	s/ement\b/nt/ig;
1014
1015							# allocation => allocan
1016
1017	5					55	s/[a-z]ion\b/n/ig;
1018
1019							# hesitate --> hesit
1020
1021	5					12	s/tate\b/t/ig;
1022
1023							# ................................................. &multi-chars ...
1024
1025	5					11	s/ph\b//g; # paragraph --> paragra
1026	5					7	s/ph/f/g; # photograph --> fotogra
1027
1028	5	50	0			20	$debug and warn $tab,"[multi]\t[$ARG]" if $orig =~ /$debugRegexp/;
1029
1030							# .................................................. simple rules ...
1031
1032	5					11	s/([0-9])(st\|nd\|th)/$1/ig; # get rid of 1st 2nd ...
1033
1034							# Shorted full month names
1035
1036	5					67	s/\b(jan\|feb\|mar\|apr\|may\|jun\|jul\|aug\|sep\|oct\|nov\|dec)[a-z]+\b/$1/ig;
1037
1038							# "This is the end. And new sentence."
1039							# We can leave out the period.
1040
1041	5					18	s/\.\s+([A-Z])/$1/g;
1042
1043							# Any line starting that does not start with aphanumeric can be
1044							# deleted. Like
1045							#
1046							# Well, this is
1047							#
1048							# is previously shortened to ", this is" and the leading is now
1049							# shortened to
1050							#
1051							# this is
1052
1053	5					10	s/^\s[.,;:]\s//;
1054	5					153	s/\s*\W+$/\n/; # ending similarly.
1055
1056	5	50	0			26	$debug and warn $tab,"[shorthand]\t[$ARG]" if $orig =~ /$debugRegexp/;
1057
1058							# we don't need these,
1059
1060	5					13	s/[!#\$'\"*\|\\^]//g; # dummy "' to restore Emacs font-lock
1061
1062							# carefully => carefuly
1063							# Don't touch 'all'
1064
1065	5					37	s/([flkpsy])\1\B/$1/ig; # Any double char to one char
1066
1067	5	50	0			21	$debug and warn $tab,"[double]\t[$ARG]" if $orig =~ /$debugRegexp/;
1068
1069							# Any double chars at the END of work
1070
1071	5					78	s/\b(\S$vow\S)([^0-9])\2\b/$1$2/i;
1072
1073	5	50	0			34	$debug and warn $tab,"[double-end]\t[$ARG]" if $orig =~ /$debugRegexp/;
1074
1075							# short => shor
1076
1077	5					14	s/\rt\b/r/ig; # Any double char to one char
1078
1079	5	50	0			18	$debug and warn $tab,"[rt]\t[$ARG]" if $orig =~ /$debugRegexp/;
1080
1081							# .................................................... &optimize ...
1082
1083	5	50				11	if ( $SQZ_OPTIMIZE_LEVEL )
1084							{
1085	0					0	s/\s+(.)/\U$1/g; # kill empty spaces
1086							}
1087
1088	5					26	$ARG;
1089							}
1090
1091							# This section is automatically updated by Emacs function
1092							# tinyperl.el::tiperl-selfstubber-stubs. Do not touch the BEGIN END tokens.
1093							# See http://tiny-tools.sourceforge.net/
1094
1095							# BEGIN: Devel::SelfStubber
1096
1097							sub Lingua::EN::Squeeze::SqueezeHashSet ($;$);
1098							sub Lingua::EN::Squeeze::SqueezeControl (;$) ;
1099							sub Lingua::EN::Squeeze::SqueezeDebug (;$$);
1100
1101							# END: Devel::SelfStubber
1102
1103							1;
1104							__DATA__