File Coverage

blib/lib/Lingua/EN/Titlecase.pm
Criterion Covered Total %
statement 115 121 95.0
branch 53 62 85.4
condition 7 7 100.0
subroutine 20 21 95.2
pod 9 9 100.0
total 204 220 92.7


line stmt bran cond sub pod time code
1             package Lingua::EN::Titlecase;
2 5     5   80200 use strict;
  5         12  
  5         176  
3 5     5   26 use warnings;
  5         13  
  5         144  
4 5     5   24 no warnings "uninitialized";
  5         13  
  5         212  
5             require 5.006; # for POSIX classes
6              
7 5     5   3434 use parent "Class::Accessor::Fast";
  5         1331  
  5         71  
8              
9             __PACKAGE__->mk_accessors (qw{
10             uc_threshold
11             mixed_threshold
12             mixed_rx
13             wordish_rx
14             allow_mixed
15             word_punctuation
16             });
17              
18 46 100   46   244 use overload '""' => sub { $_[0]->original ? $_[0]->title : ref $_[0] },
19 5     5   32215 fallback => 1;
  5         3659  
  5         66  
20              
21             # DEVELOPER NOTES
22             # story card it out including a TT2 plugin
23             # HOW will entities and utf8 be handled?
24             # should be raw; OO and functional both?
25             # lc, default is prepositions, articles, conjunctions, can point to a
26             # file or to a hash ref (like a tied file, should have recipe)
27             # canonical /different word/, like OSS or eBay?
28             # Hyphen-Behavior
29             # confidence
30             # titlecase, tc
31             # rules
32             # allow user to set order of applying look-up rules, lc > uc, e.g.
33              
34             # NEED TO ALLOW FOR fixing or leaving things like pH, PERL, tied hash dictionary?
35              
36             # new with 1 arg uses it as string
37             # with more than 1 tries constructors
38              
39             # There are quite a few apostrophe edge cases right now and no
40             # utf8/entity handling
41              
42 5     5   451 use List::Util "first";
  5         10  
  5         635  
43 5     5   37 use Carp;
  5         11  
  5         9600  
44             our $VERSION = "0.15";
45              
46             our %LC = map { $_ => 1 }
47             qw( the a an and or but aboard about above across after against
48             along amid among around as at before behind below beneath
49             beside besides between beyond but by for from in inside into
50             like minus near of off on onto opposite outside over past per
51             plus regarding since than through to toward towards under
52             underneath unlike until up upon versus via with within without
53             v vs
54             );
55              
56             my %Attr = (
57             word_punctuation => 1,
58             original => 1,
59             title => 1,
60             uc_threshold => 1,
61             mixed_threshold => 1,
62             );
63              
64             sub new : method {
65 26     26 1 12504 my $self = +shift->SUPER::new();
66              
67 26 100       458 if ( @_ == 1 )
68             {
69 23         274 $self->{_original} = $_[0];
70             }
71             else
72             {
73 3         7 my %args = @_; # might be empty
74 3         12 for my $key ( keys %args )
75             {
76 0 0       0 croak "Construction parameter \"$key\" not allowed"
77             unless $Attr{$key};
78 0         0 $self->$key($args{$key});
79             }
80             }
81 26         70 return $self->_init();
82             }
83              
84             sub _init : method {
85 74     74   101 my $self = shift;
86 74         377 $self->{_titlecase} = "";
87 74         113 $self->{_real_length} = 0;
88 74         149 $self->{_mixedcase} = [];
89 74         129 $self->{_wc} = [];
90 74         169 $self->{_token_queue} = [];
91 74         216 $self->{_uppercase} = [];
92 74 100       249 $self->word_punctuation(qr/[[:punct:]]/) unless $self->word_punctuation;
93 74         1041 my $wp = $self->word_punctuation;
94 74 100       379 $self->wordish_rx(qr/
95             [[:alpha:]]
96             (?: (?<=[[:alpha:]]) $wp (?=[[:alpha:]]) | [[:alpha:]] )*
97             [[:alpha:]]*
98             /x) unless $self->wordish_rx;
99 74 100       909 $self->mixed_rx(
100             qr/(?<=[[:lower:]])[[:upper:]]
101             |
102             (?<=\A)[[:upper:]](?=[[:upper:]]+[[:lower:]])
103             |
104             (?<=\A)[[:upper:]](?=[[:lower:]]+[[:upper:]])
105             |
106             (?<=[[:lower:]]$wp)[[:upper:]]
107             |
108             \G(?
109             /x) unless $self->mixed_rx;
110              
111 74         1545 $self->allow_mixed(undef);
112 74 100       453 $self->mixed_threshold(0.25) unless $self->mixed_threshold;
113 74 100       820 $self->uc_threshold(0.90) unless $self->uc_threshold;
114 74         549 return $self;
115             }
116              
117             sub mixedcase : method {
118 88     88 1 114 my $self = shift;
119 88 50       485 $self->_parse unless $self->{_mixedcase};
120 88         91 return @{$self->{_mixedcase}};
  88         498  
121             }
122              
123             sub uppercase : method {
124 48     48 1 69 my $self = shift;
125 48 50       115 $self->_parse unless $self->{_uppercase};
126 48         46 return @{$self->{_uppercase}};
  48         270  
127             }
128              
129             sub whitespace : method {
130 0     0 1 0 my $self = shift;
131 0 0       0 $self->_parse unless $self->{_whitespace};
132 0         0 return @{$self->{_whitespace}};
  0         0  
133             }
134              
135             sub wc : method {
136 40     40 1 73 my $self = shift;
137 40 50       124 $self->_parse unless $self->{_wc};
138 40         48 return @{$self->{_wc}};
  40         254  
139             }
140              
141             sub title : method {
142 115     115 1 9619 my $self = shift;
143 115 100       321 $self->original(+shift) if @_;
144 115         225 $self->_parse();
145 115         206 return $self->titlecase();
146             }
147              
148             sub original : method {
149 159     159 1 239 my $self = shift;
150 159 100       330 if ( my $new = shift )
151             {
152 25 50       88 $self->{_parsed} = 0 if $self->{_original} ne $new;
153 25         41 $self->{_original} = $new;
154             }
155 159         513 return $self->{_original};
156             }
157              
158             sub _parse : method {
159 115     115   129 my $self = shift;
160 115 100       316 return if $self->{_parsed};
161 48         124 $self->_init();
162 48         103 my $string = $self->original();
163 48         470 $self->{_uppercase} = [ $string =~ /[[:upper:]]/g ];
164             # TOKEN ARRAYS
165             # 0 - type: word|null
166             # 1 - content
167             # 2 - mixed array
168             # 3 - uc array
169             # 4 - first word token in queue -- "boolean" -- set in titlecase()
170              
171 48         172 my $wp = $self->word_punctuation;
172 48         247 my $mixed_rx = $self->mixed_rx;
173              
174 48         243 while ( my $token = $self->lexer->($string) )
175             {
176 587         4435 my @mixed = $token->[1] =~ /$mixed_rx/g;
177 587 100       1467 $token->[2] = @mixed ? \@mixed : undef;
178 587 100       1276 push @{$self->{_mixedcase}}, @mixed if @mixed;
  18         95  
179 587         854 push @{$self->{_token_queue}}, $token;
  587         1127  
180 587 100       1209 push @{$self->{_wc}}, $token->[1] if $token->[0];
  250         4907  
181 587 100       3327 $self->{_real_length} += length($token->[1]) if $token->[0];
182             }
183 48   100     81 my $uc_ratio = eval { $self->uppercase / $self->{_real_length} } || 0;
184 48   100     61 my $mixed_ratio = eval { $self->mixedcase / $self->{_real_length} } || 0;
185 48 100       150 if ( $uc_ratio > $self->uc_threshold ) # too much uppercase to be real
    100          
186             {
187 4         28 $_->[1] = lc($_->[1]) for @{ $self->{_token_queue} };
  4         40  
188             # carp "Original exceeds uppercase threshold (" .
189             # $self->uc_threshold .
190             # ") lower casing for pre-processing";
191             }
192             elsif ( $mixed_ratio > $self->mixed_threshold ) # too mixed to be real
193             {
194 2         23 $_->[1] = lc($_->[1]) for @{ $self->{_token_queue} };
  2         28  
195             # carp "Original exceeds mixedcase threshold, lower casing for pre-processing";
196             }
197             else
198             {
199 42         495 $self->allow_mixed(1);
200             }
201 48         274 $self->{_parsed} = 1;
202             }
203              
204             sub lexer : method {
205 355     355 1 386 my $self = shift;
206 355 50       545 $self->{_lexer} = shift if $@;
207 355 100       1014 return $self->{_lexer} if $self->{_lexer};
208              
209 17         32 my $wp = $self->word_punctuation;
210 17         75 my $wordish = $self->wordish_rx;
211              
212             $self->{_lexer} = sub {
213 355 100   355   2538 $_[0] =~ s/\A($wordish)// and return [ "word", "$1" ];
214 193 100       999 $_[0] =~ s/\A(.)//s and return [ undef, "$1" ];
215 34         109 return ();
216 17         127 };
217             }
218              
219             sub titlecase : method {
220 155     155 1 187 my $self = shift;
221             # it's up to _parse to clear it
222 155 100       1242 return $self->{_titlecase} if $self->{_titlecase};
223              
224             # first word token
225 48     67   179 my $fwt = first { $_->[0] } @{$self->{_token_queue} };
  67         118  
  48         278  
226 48         250 $fwt->[4] = 1;
227              
228 48         61 for my $t ( @{ $self->{_token_queue} } )
  48         110  
229             {
230 587 100       921 if ( $t->[0] )
231             {
232 250 100 100     1187 if ( $t->[2] and $self->allow_mixed )
    100          
    100          
233             {
234 2         19 $self->{_titlecase} .= $t->[1];
235             }
236             elsif ( $t->[4] ) # the initial word token
237             {
238 47         150 $self->{_titlecase} .= ucfirst $t->[1];
239             }
240             elsif ( $LC{lc($t->[1])} ) # lc/uc checks here
241             {
242 70         195 $self->{_titlecase} .= lc $t->[1];
243             }
244             else
245             {
246 131         361 $self->{_titlecase} .= ucfirst $t->[1];
247             }
248             }
249             else # not a word token
250             {
251 337         835 $self->{_titlecase} .= $t->[1];
252             }
253             }
254 48         694 return $self->{_titlecase};
255             }
256              
257             1;
258              
259             __END__