File Coverage

blib/lib/Lingua/EN/Titlecase.pm

Criterion	Covered	Total	%
statement	115	121	95.0
branch	53	62	85.4
condition	7	7	100.0
subroutine	20	21	95.2
pod	9	9	100.0
total	204	220	92.7

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::EN::Titlecase;
2	5			5		80200	use strict;
	5					12
	5					176
3	5			5		26	use warnings;
	5					13
	5					144
4	5			5		24	no warnings "uninitialized";
	5					13
	5					212
5							require 5.006; # for POSIX classes
6
7	5			5		3434	use parent "Class::Accessor::Fast";
	5					1331
	5					71
8
9							__PACKAGE__->mk_accessors (qw{
10							uc_threshold
11							mixed_threshold
12							mixed_rx
13							wordish_rx
14							allow_mixed
15							word_punctuation
16							});
17
18	46	100		46		244	use overload '""' => sub { $_[0]->original ? $_[0]->title : ref $_[0] },
19	5			5		32215	fallback => 1;
	5					3659
	5					66
20
21							# DEVELOPER NOTES
22							# story card it out including a TT2 plugin
23							# HOW will entities and utf8 be handled?
24							# should be raw; OO and functional both?
25							# lc, default is prepositions, articles, conjunctions, can point to a
26							# file or to a hash ref (like a tied file, should have recipe)
27							# canonical /different word/, like OSS or eBay?
28							# Hyphen-Behavior
29							# confidence
30							# titlecase, tc
31							# rules
32							# allow user to set order of applying look-up rules, lc > uc, e.g.
33
34							# NEED TO ALLOW FOR fixing or leaving things like pH, PERL, tied hash dictionary?
35
36							# new with 1 arg uses it as string
37							# with more than 1 tries constructors
38
39							# There are quite a few apostrophe edge cases right now and no
40							# utf8/entity handling
41
42	5			5		451	use List::Util "first";
	5					10
	5					635
43	5			5		37	use Carp;
	5					11
	5					9600
44							our $VERSION = "0.15";
45
46							our %LC = map { $_ => 1 }
47							qw( the a an and or but aboard about above across after against
48							along amid among around as at before behind below beneath
49							beside besides between beyond but by for from in inside into
50							like minus near of off on onto opposite outside over past per
51							plus regarding since than through to toward towards under
52							underneath unlike until up upon versus via with within without
53							v vs
54							);
55
56							my %Attr = (
57							word_punctuation => 1,
58							original => 1,
59							title => 1,
60							uc_threshold => 1,
61							mixed_threshold => 1,
62							);
63
64							sub new : method {
65	26			26	1	12504	my $self = +shift->SUPER::new();
66
67	26	100				458	if ( @_ == 1 )
68							{
69	23					274	$self->{_original} = $_[0];
70							}
71							else
72							{
73	3					7	my %args = @_; # might be empty
74	3					12	for my $key ( keys %args )
75							{
76	0	0				0	croak "Construction parameter \"$key\" not allowed"
77							unless $Attr{$key};
78	0					0	$self->$key($args{$key});
79							}
80							}
81	26					70	return $self->_init();
82							}
83
84							sub _init : method {
85	74			74		101	my $self = shift;
86	74					377	$self->{_titlecase} = "";
87	74					113	$self->{_real_length} = 0;
88	74					149	$self->{_mixedcase} = [];
89	74					129	$self->{_wc} = [];
90	74					169	$self->{_token_queue} = [];
91	74					216	$self->{_uppercase} = [];
92	74	100				249	$self->word_punctuation(qr/[[:punct:]]/) unless $self->word_punctuation;
93	74					1041	my $wp = $self->word_punctuation;
94	74	100				379	$self->wordish_rx(qr/
95							[[:alpha:]]
96							(?: (?<=[[:alpha:]]) $wp (?=[[:alpha:]]) \| [[:alpha:]] )*
97							[[:alpha:]]*
98							/x) unless $self->wordish_rx;
99	74	100				909	$self->mixed_rx(
100							qr/(?<=[[:lower:]])[[:upper:]]
101							\|
102							(?<=\A)[[:upper:]](?=[[:upper:]]+[[:lower:]])
103							\|
104							(?<=\A)[[:upper:]](?=[[:lower:]]+[[:upper:]])
105							\|
106							(?<=[[:lower:]]$wp)[[:upper:]]
107							\|
108							\G(?
109							/x) unless $self->mixed_rx;
110
111	74					1545	$self->allow_mixed(undef);
112	74	100				453	$self->mixed_threshold(0.25) unless $self->mixed_threshold;
113	74	100				820	$self->uc_threshold(0.90) unless $self->uc_threshold;
114	74					549	return $self;
115							}
116
117							sub mixedcase : method {
118	88			88	1	114	my $self = shift;
119	88	50				485	$self->_parse unless $self->{_mixedcase};
120	88					91	return @{$self->{_mixedcase}};
	88					498
121							}
122
123							sub uppercase : method {
124	48			48	1	69	my $self = shift;
125	48	50				115	$self->_parse unless $self->{_uppercase};
126	48					46	return @{$self->{_uppercase}};
	48					270
127							}
128
129							sub whitespace : method {
130	0			0	1	0	my $self = shift;
131	0	0				0	$self->_parse unless $self->{_whitespace};
132	0					0	return @{$self->{_whitespace}};
	0					0
133							}
134
135							sub wc : method {
136	40			40	1	73	my $self = shift;
137	40	50				124	$self->_parse unless $self->{_wc};
138	40					48	return @{$self->{_wc}};
	40					254
139							}
140
141							sub title : method {
142	115			115	1	9619	my $self = shift;
143	115	100				321	$self->original(+shift) if @_;
144	115					225	$self->_parse();
145	115					206	return $self->titlecase();
146							}
147
148							sub original : method {
149	159			159	1	239	my $self = shift;
150	159	100				330	if ( my $new = shift )
151							{
152	25	50				88	$self->{_parsed} = 0 if $self->{_original} ne $new;
153	25					41	$self->{_original} = $new;
154							}
155	159					513	return $self->{_original};
156							}
157
158							sub _parse : method {
159	115			115		129	my $self = shift;
160	115	100				316	return if $self->{_parsed};
161	48					124	$self->_init();
162	48					103	my $string = $self->original();
163	48					470	$self->{_uppercase} = [ $string =~ /[[:upper:]]/g ];
164							# TOKEN ARRAYS
165							# 0 - type: word\|null
166							# 1 - content
167							# 2 - mixed array
168							# 3 - uc array
169							# 4 - first word token in queue -- "boolean" -- set in titlecase()
170
171	48					172	my $wp = $self->word_punctuation;
172	48					247	my $mixed_rx = $self->mixed_rx;
173
174	48					243	while ( my $token = $self->lexer->($string) )
175							{
176	587					4435	my @mixed = $token->[1] =~ /$mixed_rx/g;
177	587	100				1467	$token->[2] = @mixed ? \@mixed : undef;
178	587	100				1276	push @{$self->{_mixedcase}}, @mixed if @mixed;
	18					95
179	587					854	push @{$self->{_token_queue}}, $token;
	587					1127
180	587	100				1209	push @{$self->{_wc}}, $token->[1] if $token->[0];
	250					4907
181	587	100				3327	$self->{_real_length} += length($token->[1]) if $token->[0];
182							}
183	48		100			81	my $uc_ratio = eval { $self->uppercase / $self->{_real_length} } \|\| 0;
184	48		100			61	my $mixed_ratio = eval { $self->mixedcase / $self->{_real_length} } \|\| 0;
185	48	100				150	if ( $uc_ratio > $self->uc_threshold ) # too much uppercase to be real
		100
186							{
187	4					28	$_->[1] = lc($_->[1]) for @{ $self->{_token_queue} };
	4					40
188							# carp "Original exceeds uppercase threshold (" .
189							# $self->uc_threshold .
190							# ") lower casing for pre-processing";
191							}
192							elsif ( $mixed_ratio > $self->mixed_threshold ) # too mixed to be real
193							{
194	2					23	$_->[1] = lc($_->[1]) for @{ $self->{_token_queue} };
	2					28
195							# carp "Original exceeds mixedcase threshold, lower casing for pre-processing";
196							}
197							else
198							{
199	42					495	$self->allow_mixed(1);
200							}
201	48					274	$self->{_parsed} = 1;
202							}
203
204							sub lexer : method {
205	355			355	1	386	my $self = shift;
206	355	50				545	$self->{_lexer} = shift if $@;
207	355	100				1014	return $self->{_lexer} if $self->{_lexer};
208
209	17					32	my $wp = $self->word_punctuation;
210	17					75	my $wordish = $self->wordish_rx;
211
212							$self->{_lexer} = sub {
213	355	100		355		2538	$_[0] =~ s/\A($wordish)// and return [ "word", "$1" ];
214	193	100				999	$_[0] =~ s/\A(.)//s and return [ undef, "$1" ];
215	34					109	return ();
216	17					127	};
217							}
218
219							sub titlecase : method {
220	155			155	1	187	my $self = shift;
221							# it's up to _parse to clear it
222	155	100				1242	return $self->{_titlecase} if $self->{_titlecase};
223
224							# first word token
225	48			67		179	my $fwt = first { $_->[0] } @{$self->{_token_queue} };
	67					118
	48					278
226	48					250	$fwt->[4] = 1;
227
228	48					61	for my $t ( @{ $self->{_token_queue} } )
	48					110
229							{
230	587	100				921	if ( $t->[0] )
231							{
232	250	100	100			1187	if ( $t->[2] and $self->allow_mixed )
		100
		100
233							{
234	2					19	$self->{_titlecase} .= $t->[1];
235							}
236							elsif ( $t->[4] ) # the initial word token
237							{
238	47					150	$self->{_titlecase} .= ucfirst $t->[1];
239							}
240							elsif ( $LC{lc($t->[1])} ) # lc/uc checks here
241							{
242	70					195	$self->{_titlecase} .= lc $t->[1];
243							}
244							else
245							{
246	131					361	$self->{_titlecase} .= ucfirst $t->[1];
247							}
248							}
249							else # not a word token
250							{
251	337					835	$self->{_titlecase} .= $t->[1];
252							}
253							}
254	48					694	return $self->{_titlecase};
255							}
256
257							1;
258
259							__END__