File Coverage

blib/lib/Lingua/EN/Squeeze.pm

Criterion	Covered	Total	%
statement	126	129	97.6
branch	32	60	53.3
condition	0	72	0.0
subroutine	14	14	100.0
pod	2	2	100.0
total	174	277	62.8

line	stmt	bran	cond	sub	pod	time	code
1							# Squeez.pm - Perl package to shorten text to minimum syllables
2							# $Id: Squeeze.pm,v 1.8 2005-12-05 09:02:49 jaalto Exp $
3							#
4							# This file is maintaned by using Emacs (The Editor) and add-on
5							# packages. See http://tiny-tools.sourceforge.net/
6							#
7							# tinytab.el -- indent mode
8							# tinyperl -- Perl helper mode (pod docs, stubs etc)
9							#
10							# To generate HTML
11							#
12							# $ perl -e 'use Pod::Html qw(pod2html); pod2html shift @ARGV' FILE.pm
13
14							package Lingua::EN::Squeeze;
15
16	1			1		64721	use Scalar::Util qw/ reftype /;
	1					2
	1					104
17
18	1			1		26	use 5.006;
	1					3
19	1			1		5	use strict;
	1					6
	1					22
20	1			1		5	use warnings;
	1					1
	1					105
21
22							my $LIB = "Lingua::EN::Squeeze"; # For debug printing
23
24							our $VERSION = '2015.01';
25
26							# ***********************************************************************
27							#
28							# POD HEADER
29							#
30							# ***********************************************************************
31
32							=pod
33
34							=head1 NAME
35
36							Lingua::EN::Squeeze - Shorten text to minimum syllables using hash table lookup and vowel deletion
37
38							=head1 SYNOPSIS
39
40							use Lingua::EN::Squeeze; # import only function
41							use Lingua::EN::Squeeze qw( :ALL ); # import all functions and variables
42							use English; # to use readable variable names
43
44							while (<>) {
45							print "Original: $_\n";
46							print "Squeezed: ", SqueezeText(lc $_), "\n";
47							}
48
49							# Or you can use object oriented interface
50
51							$squeeze = Lingua::EN::Squeeze->new();
52
53							while (<>) {
54							print "Original: $_\n";
55							print "Squeezed: ", $squeeze->SqueezeText(lc $_);
56							}
57
58							=head1 VERSION
59
60							This document describes version 2016.01
61
62							=head1 DESCRIPTION
63
64							This module squeezes English text to the most compact format possible,
65							so that it is barely readable.
66							Be sure to convert all text to lowercase before using the
67							SqueezeText() for maximum compression,
68							because optimizations have been
69							designed mostly for lower case letters.
70
71							B:
72							Each line is processed multiple times, so prepare for slow conversion time
73
74							You can use this module e.g. to preprocess text before it is sent to
75							electronic media that has some maximum text size limit. For example pagers
76							have an arbitrary text size limit, typically around 200 characters, which
77							you want to fill as much as possible. Alternatively you may have GSM
78							cellular phone which is capable of receiving Short Messages (SMS), whose
79							message size limit is 160 characters. For demonstration of this module's
80							SqueezeText() function, this paragraph's conversion result is presented
81							below. See yourself if it's readable (Yes, it takes some time to get used
82							to). The compression ratio is typically 30-40%
83
84							u _n use thi mod e.g. to prprce txt bfre i_s snt to
85							elrnic mda has som max txt siz lim. f_xmple pag
86							hv abitry txt siz lim, tpcly 200 chr, W/ u wnt
87							to fll as mch as psbleAlternatvly u may hv GSM cllar P8
88							w_s cpble of rcivng Short msg (SMS), WS/ msg siz
89							lim is 160 chr. 4 demonstrton of thi mods SquezText
90							fnc , dsc txt of thi prgra has ben cnvd_ blow
91							See uself if i_s redble (Yes, it tak som T to get usdto
92							compr rat is tpcly 30-40
93
94							And if $SQZ_OPTIMIZE_LEVEL is set to non-zero
95
96							u_nUseThiModE.g.ToPrprceTxtBfreI_sSntTo
97							elrnicMdaHasSomMaxTxtSizLim.F_xmplePag
98							hvAbitryTxtSizLim,Tpcly200Chr,W/UWnt
99							toFllAsMchAsPsbleAlternatvlyUMayHvGSMCllarP8
100							w_sCpbleOfRcivngShortMsg(SMS),WS/MsgSiz
101							limIs160Chr.4DemonstrtonOfThiModsSquezText
102							fnc,DscTxtOfThiPrgraHasBenCnvd_Blow
103							SeeUselfIfI_sRedble(Yes,ItTakSomTToGetUsdto
104							comprRatIsTpcly30-40
105
106							The comparision of these two show
107
108							Original text : 627 characters
109							Level 0 : 433 characters reduction 31 %
110							Level 1 : 345 characters reduction 45 % (+14% improvement)
111
112							There are few grammar rules which are used to shorten some English
113							tokens considerably:
114
115							Word that has _ is usually a verb
116
117							Word that has / is usually a substantive, noun,
118							pronomine or other non-verb
119
120							Read following substituting tokens in order to understand the basics of
121							converted text. Hopefully, the text is not pure Geek code (tm) to you after
122							some practice. In Geek code (Like G++L--J) you would need an external
123							parser to understand it. Here some common sense and time is needed to adapt
124							oneself to the compressed format. I
125							would be better off peeking the source code>
126
127							automatically => 'acly_'
128
129							for => 4
130							for him => 4h
131							for her => 4h
132							for them => 4t
133							for those => 4t
134
135							can => _n
136							does => _s
137
138							it is => i_s
139							that is => t_s
140							which is => w_s
141							that are => t_r
142							which are => w_r
143
144							less => -/
145							more => +/
146							most => ++
147
148							however => h/ver
149							think => thk_
150
151							useful => usful
152
153							you => u
154							your => u/
155							you'd => u/d
156							you'll => u/l
157							they => t/
158							their => t/r
159
160							will => /w
161							would => /d
162							with => w/
163							without => w/o
164							which => W/
165							whose => WS/
166
167							Time is expressed with big letters
168
169							time => T
170							minute => MIN
171							second => SEC
172							hour => HH
173							day => DD
174							month => MM
175							year => YY
176
177							Other big letter acronyms, think 8 to represent the speaker and the
178							microphone.
179
180							phone => P8
181
182							=head1 EXAMPLES
183
184							To add new words e.g. to word conversion hash table, you'd define a custom
185							set and merge them to existing ones. Do similarly to
186							C<%SQZ_WXLATE_MULTI_HASH> and C<$SQZ_ZAP_REGEXP> and then start using the
187							conversion function.
188
189							use English;
190							use Squeeze qw( :ALL );
191
192							my %myExtraWordHash =
193							(
194							new-word1 => 'conversion1'
195							, new-word2 => 'conversion2'
196							, new-word3 => 'conversion3'
197							, new-word4 => 'conversion4'
198							);
199
200							# First take the existing tables and merge them with the above
201							# translation table
202
203							my %mySustomWordHash =
204							(
205							%SQZ_WXLATE_HASH
206							, %SQZ_WXLATE_EXTRA_HASH
207							, %myExtraWordHash
208							);
209
210							my $myXlat = 0; # state flag
211
212							while (<>)
213							{
214							if ( $condition )
215							{
216							SqueezeHashSet \%mySustomWordHash; # Use MY conversions
217							$myXlat = 1;
218							}
219
220							if ( $myXlat and $condition )
221							{
222							SqueezeHashSet "reset"; # Back to default table
223							$myXlat = 0;
224							}
225
226							print SqueezeText $ARG;
227							}
228
229							Similarly you can redefine the multi word translation table by supplying
230							another hash reference in call to SqueezeHashSet(). To kill more text
231							immediately in addition to default, just concatenate regexps to variable
232							I<$SQZ_ZAP_REGEXP>
233
234							=head1 KNOWN BUGS
235
236							There may be lot of false conversions and if you think that some word
237							squeezing went too far, please 1) turn on the debug 2) send you example
238							text 3) debug log log to the maintainer. To see how the conversion goes
239							e.g. for word I:
240
241							use English;
242							use Lingua::EN:Squeeze;
243
244							# Activate debug when case-insensitive word "Messages" is found from
245							# the line.
246
247							SqueezeDebug( 1, '(?i)Messages' );
248
249							$ARG = "This line has some Messages in it";
250							print SqueezeText $ARG;
251
252							=head1 EXPORTABLE VARIABLES
253
254							The defaults may not apply to all types of text, so you may wish to extend
255							the hash tables and I<$SQZ_ZAP_REGEXP> to cope with your typical text.
256
257							=head2 $SQZ_ZAP_REGEXP
258
259							Text to kill immediately, like "Hm, Hi, Hello..." You can only set this
260							once, because this regexp is compiled immediately when C is
261							called for the first time.
262
263							=head2 $SQZ_OPTIMIZE_LEVEL
264
265							This controls how optimized the text will be. Currently there is only level
266							0 (default) and level 1. Level 1 removes all spaces. That usually improves
267							compression by average of 10%, but the text is more harder to read. If
268							space is real tight, use this extended compression optimization.
269
270							=head2 %SQZ_WXLATE_MULTI_HASH
271
272							I conversion hash table: "for you" => "4u" ...
273
274							=head2 %SQZ_WXLATE_HASH
275
276							I conversion hash table: word => conversion. This table is applied
277							after C<%SQZ_WXLATE_MULTI_HASH> has been used.
278
279							=head2 %SQZ_WXLATE_EXTRA_HASH
280
281							Aggressive I conversions like: without => w/o are applied last.
282
283							=cut
284
285
286							# **********************************************************************
287							#
288							# MODULE INTERFACE
289							#
290							# ***********************************************************************
291
292							# Somehow doesn't work in Perl 5.004 ?
293							# use autouse 'Carp' => qw( croak carp cluck confess );
294
295	1			1		5	use Carp;
	1					1
	1					65
296	1			1		24011	use SelfLoader;
	1					14725
	1					69
297	1			1		8	use English;
	1					2
	1					8
298
299							BEGIN
300							{
301							# ......................................................... &use ...
302
303							use vars qw
304	1					180	(
305							@ISA
306							@EXPORT
307							@EXPORT_OK
308							%EXPORT_TAGS
309
310							$FILE_ID
311
312							$debug
313							$debugRegexp
314
315							$SQZ_ZAP_REGEXP
316							$SQZ_OPTIMIZE_LEVEL
317
318							%SQZ_WXLATE_HASH
319							%SQZ_WXLATE_EXTRA_HASH
320							%SQZ_WXLATE_MULTI_HASH
321	1			1		547	);
	1					2
322
323	1			1		3	$FILE_ID =
324							q$Id: Squeeze.pm,v 1.8 2005-12-05 09:02:49 jaalto Exp $;
325
326							# Here woudl be the real version number, which you use like this:
327							#
328							# use Squeeze 1.34;
329							#
330							# Derive version number, the index is 1 if matches
331							# Clearcase @@ in file_id string. index is 2 if this was
332							# RCS identifier.
333
334	1	50				8	my $ver = (split ' ', $FILE_ID)[$FILE_ID =~ /@@/ ? 1 : 2];
335
336							# Commented out. Better to use the date based version number,
337							# because it is more informative
338							#
339							# $VERSION = sprintf "%d.%02d", $ver =~ /(\d+)\.(\d+)/;
340
341							# ...................................................... &export ...
342
343	1			1		6	use Exporter ();
	1					2
	1					71
344
345	1					9	@ISA = qw(Exporter);
346
347	1					3	@EXPORT = qw
348							(
349							&SqueezeText
350							&SqueezeControl
351							&SqueezeDebug
352							);
353
354	1					3	@EXPORT_OK = qw
355							(
356							&SqueezeHashSet
357
358							$SQZ_ZAP_REGEXP
359							$SQZ_OPTIMIZE_LEVEL
360
361							%SQZ_WXLATE_HASH
362							%SQZ_WXLATE_EXTRA_HASH
363							%SQZ_WXLATE_MULTI_HASH
364							);
365
366	1					617	%EXPORT_TAGS =
367							(
368							ALL => [ @EXPORT_OK, @EXPORT ]
369							);
370							}
371
372							# ******************************************************* &globals *
373							#
374							# GLOBALS
375							#
376							# **********************************************************************
377
378							$debug = 0;
379							$debugRegexp = '(?i)DummyYummy';
380
381							$SQZ_ZAP_REGEXP =
382							'\b(a\|an\|the\|shall\|hi\|hello\|cheers\|that)\b'
383							. '\|Thanks (in advance)?\|thank you\|well'
384							. '\|N\.B\.\|\beg.\|\btia\b'
385							. '\|\bHi,?\b\|\bHm+,?\b'
386							. '\|!'
387							. '\|wrote:\|writes:'
388
389							# Finnish greetings
390
391							. '\|\b(Terve\|Moi\|Hei\|Huomenta)\b'
392
393							;
394
395							$SQZ_OPTIMIZE_LEVEL = 0;
396
397							# ............................................................ &word ...
398							# A special mnemonic is signified by postfixing it with either
399							# of these characters:
400							#
401							# / prononym, noun
402							# _ verb
403
404							%SQZ_WXLATE_HASH =
405							(
406							above => 'abve'
407							, address => 'addr'
408							, adjust => 'adj'
409							, adjusted => 'ajusd'
410							, adjustable => 'ajutbl'
411							, arbitrary => 'abitry'
412							, argument => 'arg'
413
414							, background => 'bg'
415							, below => 'blow'
416
417							, change => 'chg'
418							, character => 'chr'
419							, control => 'ctl'
420							, command => 'cmd'
421							, compact => 'cpact'
422							, convert => 'cnv_'
423							, converted => 'cnvd_'
424							, conversion => 'cnv'
425							, cooperation => 'c-o'
426							, correct => 'corr'
427							, correlate => 'corrl'
428							, create => 'creat'
429
430							, database => 'db'
431							, day => 'DD'
432							, date => 'DD'
433							, definition => 'defn'
434							, description => 'desc'
435							, different => 'dif'
436							, differently => 'difly'
437							, directory => 'dir'
438							, documentation => 'doc'
439							, document => 'doc/'
440
441							, 'each' => 'ech'
442							, electronic => 'elrnic'
443							, electric => 'elric'
444							, enable => 'enbl'
445							, english => 'eng'
446							, environment => 'env'
447							, everytime => 'when'
448							, example => 'xmple'
449							, expire => 'xpre'
450							, expect => 'exp'
451							, extend => 'extd'
452
453							, field => 'fld'
454							, following => 'fwng'
455							, 'for' => '4'
456							, 'format' => 'fmt'
457							, forward => 'fwd'
458							, function => 'func'
459
460							, gateway => 'gtw'
461							, generated => 'gntd'
462
463							, have => 'hv'
464							, herself => 'hself'
465							, himself => 'hself'
466							, hour => 'HH'
467
468							, identifier => 'id'
469							, information => 'inf'
470							, inform => 'ifrm'
471							, increase => 'inc'
472							, installed => 'ins'
473
474							, level => 'lev'
475							, limit => 'lim'
476							, limiting => 'limg'
477							, located => 'loctd'
478							, lowercase => 'lc'
479
480							, managed => 'mged'
481							, megabyte => 'meg'
482							, maximum => 'max'
483							, member => 'mbr'
484							, message => 'msg'
485							, minute => 'MIN'
486							, minimum => 'min'
487							, module => 'mod'
488							, month => 'MM'
489
490							, 'name' => 'nam'
491							, 'number' => 'nbr'
492
493							, okay => 'ok'
494							, 'other' => 'otr'
495							, 'others' => 'otr'
496
497							, 'package' => 'pkg'
498							, page => 'pg'
499							, parameter => 'param'
500							, password => 'pwd'
501							, pointer => 'ptr'
502							, public => 'pub'
503							, private => 'priv'
504							, problem => 'prb'
505							, process => 'proc'
506							, project => 'prj'
507
508							, recipient => 'rcpt' # this is SMTP acronym
509							, released => 'relsd'
510							, reserve => 'rsv'
511							, register => 'reg'
512							, resource => 'rc'
513							, return => 'ret'
514							, returned => 'ret'
515							, 'require' => 'rq'
516
517							, subject => 'sbj'
518							, soconds => 'SEC'
519							, service => 'srv'
520							, squeeze => 'sqz'
521							, something => 'stng'
522							, sometimes => 'stims'
523							, status => 'stat'
524							, still => 'stil'
525							, straightforward => 'sfwd'
526							, submit => 'sbmit'
527							, submitting => 'sbmtng'
528							, symbol => 'sym'
529							, 'system' => 'sytm'
530
531							, 'time' => 'T'
532							, translate => 'tras'
533
534							, understand => 'untnd'
535							, uppercase => 'uc'
536							, usually => 'usual'
537
538							, year => 'YY'
539							, you => 'u'
540							, your => 'u/'
541							, yourself => 'uself'
542
543							, 'version' => 'ver'
544
545							, warning => 'warng'
546							, with => 'w/'
547							, work => 'wrk'
548
549							);
550
551							%SQZ_WXLATE_EXTRA_HASH =
552							(
553							anything => 'atng'
554							, automatically => 'acly_'
555
556							, can => '_n'
557
558							, does => '_s'
559							, dont => '_nt'
560							, "don't" => '_nt'
561							, 'exists' => 'ex_'
562
563							, everything => 'etng/'
564
565							, however => 'h/ver'
566
567							, increment => 'inc/'
568							, interesting => 'inrsg'
569							, interrupt => 'irup'
570
571							# not spelled like 'less', because plural substitution seens
572							# this first 'less' -> 'les'
573
574							, 'les' => '-/'
575
576							, 'more' => '+/'
577							, most => '++'
578
579							, phone => 'P8'
580							, please => 'pls_'
581							, person => 'per/'
582
583							, should => 's/d'
584							, they => 't/'
585							, their => 't/r'
586							, think => 'thk_'
587							, 'which' => 'W/'
588							, without => 'w/o'
589							, whose => 'WS/'
590							, will => '/w'
591							, would => '/d'
592
593							, "you'd" => 'u/d'
594							, "you'll" => 'u/l'
595
596							);
597
598							# ........................................................... &multi ...
599
600							%SQZ_WXLATE_MULTI_HASH =
601							(
602							'for me' => '4m'
603							, 'for you' => '4u'
604							, 'for him' => '4h'
605							, 'for her' => '4h'
606							, 'for them' => '4t'
607							, 'for those' => '4t'
608
609							, 'for example' => 'f_xmple'
610
611							, 'with or without' => 'w/o'
612
613							, 'it is' => 'i_s'
614							, "it's" => 'i_s'
615
616							, 'that is' => 't_s'
617							, "that's" => 't_s'
618							, "that don't" => 't_nt'
619
620							, 'which is' => 'w_s'
621							, "which's" => 'w_s'
622							, "which don't" => 'w_nt'
623
624							, 'that are' => 't_r'
625							, "that're" => 't_r'
626							, "that are not" => 't_rt'
627
628							, 'which are' => 'w_r'
629							, 'which are not' => 'w_rt'
630							, "which aren't" => 'w_rt'
631
632							, "has not" => 'hs_t'
633							, "have not" => 'hv_t'
634
635							, "that has" => 't_hs'
636							, "that has not" => 't_hst'
637							, "that hasn't" => 't_hst'
638
639							, 'which has' => 'w_hs'
640							, 'which has not' => 'w_hst'
641							, "which hasn't" => 'w_hst'
642
643							, "that have" => 't_hv'
644							, "that have not" => 't_hvt'
645							, "that haven't" => 't_hvt'
646
647							, 'which have' => 'w_hv'
648							, "which have not" => 'w_hvt'
649							, "which haven't" => 'w_hvt'
650
651							, "that had" => 't_hd'
652							, "that had not" => 't_hdt'
653							, "that hadn't" => 't_hdt'
654
655							, 'which had' => 'w_hd'
656							, 'which had not' => 'w_hdt'
657							, "which hadn't" => 'w_hdt'
658
659							, 'used to' => 'usdto'
660							, 'due to' => 'd_to'
661
662							, 'United Kingdom' => 'UK'
663							, 'United States' => 'US'
664							);
665
666							# ******************************************************* &private *
667							#
668							# PRIVATE VARIABLES
669							#
670							# **********************************************************************
671
672							# We must declare package globals sot hat SelfLoader sees them after
673							# __DATA__
674
675							use vars qw
676	1					117	(
677							%SQZ_WXLATE_MULTI_HASH_MEDIUM
678							%SQZ_WXLATE_MULTI_HASH_MAX
679							%SQZ_WXLATE_HASH_MEDIUM
680							%SQZ_WXLATE_HASH_MAX
681	1			1		6	);
	1					2
682
683							%SQZ_WXLATE_MULTI_HASH_MEDIUM = %SQZ_WXLATE_MULTI_HASH;
684							%SQZ_WXLATE_MULTI_HASH_MAX = %SQZ_WXLATE_MULTI_HASH;
685
686							%SQZ_WXLATE_HASH_MEDIUM = %SQZ_WXLATE_HASH;
687							%SQZ_WXLATE_HASH_MAX = ( %SQZ_WXLATE_HASH, %SQZ_WXLATE_EXTRA_HASH);
688
689							# The Active translate tables
690							#
691							# User isn't suppose to touch this, but in case you need to know
692							# exactly what traslations are going and what table is in use, then peeek
693							# these.
694							#
695							# $Lingua::EN::Squeeze::wordXlate{above}
696
697							use vars qw
698	1					2217	(
699							%wordXlate
700							%multiWordXlate
701							$STATE
702	1			1		5	);
	1					1
703
704							%wordXlate = %SQZ_WXLATE_HASH_MAX;
705							%multiWordXlate = %SQZ_WXLATE_MULTI_HASH;
706							$STATE = "max"; # Squeeze level
707
708							# **********************************************************************
709							#
710							# I N T E R F A C E
711							#
712							# *********************************************************************
713
714							=pod
715
716							=head1 INTERFACE FUNCTIONS
717
718							=cut
719
720
721							# **********************************************************************
722							#
723							# PUBLIC FUNCTION
724							#
725							# *********************************************************************
726
727							=pod
728
729							=head2 SqueezeObjectArg($)
730
731							=over
732
733							=item Description
734
735							Return subroutine argument in both function and object cases.
736							This is a wrapper utility to make package work as a function
737							library as well as OO class.
738
739							=item @list
740
741							List of arguments. Usually the first one is object if class
742							interface is used.
743
744							=item Return values
745
746							Return arguments without the first object parameter.
747
748							=back
749
750							=cut
751
752							sub SqueezeObjectArg (@)
753							{
754	12			12	1	3967	my @list = @ARG;
755	12					20	my $ref = ref( $list[0] );
756
757							# This test may not be the bets, but we suppose this is
758							# class if we find text like 'Linguag::EN::Squeeze'.
759							#
760							# FIXME: What about derived classes (although unlikely)
761
762	12	100				31	if ( $ref =~ /::[a-z]+::/i )
763							{
764	1					2	shift @list; # Remove arg
765							}
766
767	12					31	@list;
768							}
769
770							# **********************************************************************
771							#
772							# PUBLIC FUNCTION
773							#
774							# *********************************************************************
775
776							=pod
777
778							=head2 SqueezeText($)
779
780							=over
781
782							=item Description
783
784							Squeeze text by using vowel substitutions and deletions and hash tables
785							that guide text substitutions. The line is parsed multiple times and
786							this will take some time.
787
788							=item arg1: $text
789
790							String. Line of Text.
791
792							=item Return values
793
794							String, squeezed text.
795
796							=back
797
798							=cut
799
800							sub SqueezeText ($)
801							{
802							# If you wonder how these substitutions were selected ...
803							# Just by feeding text after text to this function and
804							# seeing how it could be compressed even more
805							#
806							# => Trial and error. The order of these substitutions
807							# => is highly significant.
808
809							# ....................................................... &start ...
810
811	6			6	1	1207	my $id = "$LIB.SqueezeText";
812
813	6					15	local($ARG) = SqueezeObjectArg(@ARG);
814
815	6	100				18	return $ARG if $STATE eq 'noconv'; # immediate return, no conversion
816
817	5					6	my $vow = '[aeiouy]'; # vowel
818	5					8	my $nvow = '[^aeiouy\s_/\']'; # non-vowel
819
820	5					6	my $orig = $ARG; # for debug
821	5					6	my $tab = ""; # tab
822
823							# ........................................................ &kill ...
824
825	5	50				18	if ( /^\s*[^\s]{30,}/ ) # Any continuous block. UU line ?
826							{
827	0					0	return "";
828							}
829
830	5	50				17	if ( /^[A-Z][^\s]+: / ) # Email headers "From:"
831							{
832	0					0	return "";
833							}
834
835	5					10	s/^\s+//; # delete leading spaces
836	5					59	s/[ \t]+$//; # delete trailing spaces
837	5					101	s/[ \t]+/ /g; # collapse multiple spaces inside text
838
839							# ........................................................ words ...
840
841							# Kill URLs
842
843	5					12	s{\b\S+://\S+\b}{URL}ig;
844
845							# Delete markup +this+ emphasised strong `text'
846
847	5					55	s/\b[_+`'](\S+)[_+`']\b/$1/ig;
848
849							# DON'T REMOVE. This comment fixes Emacs font-lock problem: s///
850							# From above statement.
851
852	5	50	0			34	$debug and warn $tab,"[markup]\t[$ARG]" if $orig =~ /$debugRegexp/;
853
854							# Delete 3rd person voice
855							# expires => expire
856							#
857							# But do not touch 'was'
858
859	5					89	s/\b($vow\S+$vow)s\b/$1/ogi;
860
861							# says => say
862
863	5					144	s/\b($nvow+\S$vow+)s\b/$1/ogi;
864
865							# vowel .. nvowel + 2
866							# interests => interest
867
868	5					93	s/\b($vow\S+$nvow)s\b/$1/ogi;
869	5	50	0			33	$debug and warn $tab,"[3voice]\t[$ARG]" if $orig =~ /$debugRegexp/;
870
871							# Delete plurals: non-vowel .. non-vowel + s
872							# problems => problem
873
874	5					152	s/\b($nvow\S+$nvow)s\b/$1/ogi;
875	5	50	0			25	$debug and warn $tab,"[plural]\t[$ARG]" if $orig =~ /$debugRegexp/;
876
877							# Delete plurals: non-vowel .. vowel + s
878							# messages => message
879
880	5					146	s/\b($nvow\S+$vow)s\b/$1/ogi;
881	5	50	0			29	$debug and warn $tab,"[plural2]\t[$ARG]" if $orig =~ /$debugRegexp/;
882
883							# zap
884
885	5					761	s/$SQZ_ZAP_REGEXP//oig;
886	5	50	0			31	$debug and warn $tab,"[zap]\t\t[$ARG]" if $orig =~ /$debugRegexp/;
887
888							# ................................................... &translate ...
889
890	5					7	my ($from, $to);
891
892	5					38	for $from ( keys %multiWordXlate )
893							{
894	230					335	$to = $multiWordXlate{ $from };
895	230					3316	s/\b$from\b/$to/ig;
896							}
897
898	5	50	0			37	$debug and warn $tab,"[xlate-multi]\t[$ARG]" if $orig =~ /$debugRegexp/;
899
900	5					98	for $from ( keys %wordXlate )
901							{
902	576					775	$to = $wordXlate{ $from };
903	576					7964	s/\b$from\b/$to/ig;
904							}
905
906	5	50	0			52	$debug and warn $tab,"[xlate-word]\t[$ARG]" if $orig =~ /$debugRegexp/;
907
908							# ...................................................... &suffix ...
909
910							# From Imperfect to active voice
911							# converted => convert
912
913	5					121	s/\b($nvow\S\S+)ed\b/$1/igo;
914
915							# shorten words with -cally suffix => cly
916
917	5					11	s/cally\b/cly/g;
918
919							# shorten comparision: bigger
920							# We can't deduce quicker --> quick, becasue further on
921							# the word would be converted quick --> qck. Not good.
922
923	5					116	s/\b($nvow+\S+e)r\b/$1/ogi;
924	5	50	0			27	$debug and warn $tab,"[comparis]\t[$ARG]" if $orig =~ /$debugRegexp/;
925
926							# leaning --> leang
927
928	5					28	s/ing\b/ng/ig;
929	5					16	s/io\b/o/ig;
930
931							# uniqe --> uniq
932
933	5	50	0			23	$debug and warn $tab,"[-io]\t\t[$ARG]" if $orig =~ /$debugRegexp/;
934
935							# Watch out "due to"
936
937	5					112	s/(\S\S)ue(ness?)?\b/$1/ig;
938
939							# authenticate -> authentic
940							# Watch out 'state' !
941
942	5					129	s/(\S\S\S)ate\b/$1/ig;
943
944	5	50	0			26	$debug and warn $tab,"[-ate]\t\t[$ARG]" if $orig =~ /$debugRegexp/;
945
946							# .................................................. &heuristics ...
947
948	5	50	0			22	$debug and warn $tab,"[0]\t\t[$ARG]" if $orig =~ /$debugRegexp/;
949
950							# Vocal only at the beginning and end ==> drop last
951							# info => inf
952							#
953							# Don't touch away
954
955	5					111	s/\b($vow+$nvow$nvow)$vow+\b/$1/ogi;
956	5	50	0			23	$debug and warn $tab,"[vowel-last]\t[$ARG]" if $orig =~ /$debugRegexp/;
957
958							# only one vowel in string
959							# help ==> hlp
960							# stat BUT can't deduce to stt
961
962	5					135	s/\b($nvow)$vow($nvow$nvow)\b/$1$2/ogi;
963	5	50	0			26	$debug and warn $tab,"[vowel-one]\t[$ARG]" if $orig =~ /$debugRegexp/;
964
965							# asked --> skd
966
967	5					118	s/\b($vow+$nvow$nvow)$vow($nvow)\b/$1$2/ogi;
968	5	50	0			22	$debug and warn $tab,"[vowel-two]\t[$ARG]" if $orig =~ /$debugRegexp/;
969
970							# Delete two vowels; through --> thrgh
971							# Don't touch words ending to -ly: diffrently, difly
972
973	5					132	s/\b($nvow+)$vow$vow($nvow$nvow+)(?!y)\b/$1$2/ogi;
974	5	50	0			22	$debug and warn $tab,"[vowel-many]\t[$ARG]" if $orig =~ /$debugRegexp/;
975
976							# type => typ
977
978	5					153	s/\b($nvow+$vow$nvow+(?!y))$vow\b/$1/ogi;
979	5	50	0			27	$debug and warn $tab,"[vowel-end]\t[$ARG]" if $orig =~ /$debugRegexp/;
980
981							# many vowels, remove first two
982							# detected => dtcted
983							# service => srvce
984
985	5					300	s/\b(\S+)$vow+($nvow+)$vow+(\S$vow\S)\b/$1$2$3/ogi;
986	5	50	0			25	$debug and warn $tab,"[vowel-more]\t[$ARG]" if $orig =~ /$debugRegexp/;
987
988							# Two consequent vowels
989							# obtain => obtan
990
991	5					164	s/\b(\S*$vow$nvow+$vow)$vow(\S+)\b/$1$2/ogi;
992	5	50	0			26	$debug and warn $tab,"[vowel-22more]\t[$ARG]" if $orig =~ /$debugRegexp/;
993
994							# Two non-vowels at the end
995							# contact => contac
996
997							# s/($nvow)$nvow\b/$1/ogi;
998							# $debug and warn $tab,"[non-vowel-2end][$ARG]" if $orig =~ /$debugRegexp/;
999
1000							# Two same vowels
1001							# took => tok
1002							# keep => kep
1003
1004	5					139	s/\b(\S+)($vow)\2(\S+)\b/$1$2$3/ogi;
1005	5	50	0			26	$debug and warn $tab,"[vowel-2same]\t[$ARG]" if $orig =~ /$debugRegexp/;
1006
1007							# .................................................... &suffixes ...
1008
1009							# frequency => freq
1010							# acceptance => accept
1011							# distance => dist
1012
1013	5					35	s/u?[ae]nc[ye]\b//ig;
1014
1015							# management => manag
1016							# establishement => establish
1017
1018	5					12	s/ement\b/nt/ig;
1019
1020							# allocation => allocan
1021
1022	5					77	s/[a-z]ion\b/n/ig;
1023
1024							# hesitate --> hesit
1025
1026	5					14	s/tate\b/t/ig;
1027
1028							# ................................................. &multi-chars ...
1029
1030	5					9	s/ph\b//g; # paragraph --> paragra
1031	5					8	s/ph/f/g; # photograph --> fotogra
1032
1033	5	50	0			25	$debug and warn $tab,"[multi]\t[$ARG]" if $orig =~ /$debugRegexp/;
1034
1035							# .................................................. simple rules ...
1036
1037	5					12	s/([0-9])(st\|nd\|th)/$1/ig; # get rid of 1st 2nd ...
1038
1039							# Shorted full month names
1040
1041	5					64	s/\b(jan\|feb\|mar\|apr\|may\|jun\|jul\|aug\|sep\|oct\|nov\|dec)[a-z]+\b/$1/ig;
1042
1043							# "This is the end. And new sentence."
1044							# We can leave out the period.
1045
1046	5					17	s/\.\s+([A-Z])/$1/g;
1047
1048							# Any line starting that does not start with aphanumeric can be
1049							# deleted. Like
1050							#
1051							# Well, this is
1052							#
1053							# is previously shortened to ", this is" and the leading is now
1054							# shortened to
1055							#
1056							# this is
1057
1058	5					11	s/^\s[.,;:]\s//;
1059	5					170	s/\s*\W+$/\n/; # ending similarly.
1060
1061	5	50	0			26	$debug and warn $tab,"[shorthand]\t[$ARG]" if $orig =~ /$debugRegexp/;
1062
1063							# we don't need these,
1064
1065	5					11	s/[!#\$'\"*\|\\^]//g; # dummy "' to restore Emacs font-lock
1066
1067							# carefully => carefuly
1068							# Don't touch 'all'
1069
1070	5					54	s/([flkpsy])\1\B/$1/ig; # Any double char to one char
1071
1072	5	50	0			27	$debug and warn $tab,"[double]\t[$ARG]" if $orig =~ /$debugRegexp/;
1073
1074							# Any double chars at the END of work
1075
1076	5					70	s/\b(\S$vow\S)([^0-9])\2\b/$1$2/i;
1077
1078	5	50	0			26	$debug and warn $tab,"[double-end]\t[$ARG]" if $orig =~ /$debugRegexp/;
1079
1080							# short => shor
1081
1082	5					11	s/\rt\b/r/ig; # Any double char to one char
1083
1084	5	50	0			28	$debug and warn $tab,"[rt]\t[$ARG]" if $orig =~ /$debugRegexp/;
1085
1086							# .................................................... &optimize ...
1087
1088	5	50				10	if ( $SQZ_OPTIMIZE_LEVEL )
1089							{
1090	0					0	s/\s+(.)/\U$1/g; # kill empty spaces
1091							}
1092
1093	5					23	$ARG;
1094							}
1095
1096							# This section is automatically updated by Emacs function
1097							# tinyperl.el::tiperl-selfstubber-stubs. Do not touch the BEGIN END tokens.
1098							# See http://tiny-tools.sourceforge.net/
1099
1100							# BEGIN: Devel::SelfStubber
1101
1102							sub Lingua::EN::Squeeze::SqueezeHashSet ($;$);
1103							sub Lingua::EN::Squeeze::SqueezeControl (;$) ;
1104							sub Lingua::EN::Squeeze::SqueezeDebug (;$$);
1105
1106							# END: Devel::SelfStubber
1107
1108							1;
1109							__DATA__