File Coverage

blib/lib/Lingua/EN/TitleParse.pm
Criterion Covered Total %
statement 60 60 100.0
branch 17 18 94.4
condition 8 9 88.8
subroutine 9 9 100.0
pod 2 4 50.0
total 96 100 96.0


line stmt bran cond sub pod time code
1             package Lingua::EN::TitleParse;
2              
3 1     1   999 use 5.006000;
  1         4  
  1         39  
4 1     1   6 use strict;
  1         2  
  1         35  
5 1     1   16 use warnings;
  1         2  
  1         2859  
6              
7             our $VERSION = '0.01';
8              
9             sub new {
10 4     4 0 5941 my ($class, %params) = @_;
11 4         6 my $self = {};
12 4         15 bless $self, $class;
13 4 100       17 $self->{titles} = $params{titles} ? $self->_load( $params{titles} ) : $self->_default_titles;
14 4 100       21 $self->{clean} = $params{clean} ? 1 : 0;
15 4         12 return $self;
16             }
17              
18             # parse uses a hash-table of "normalised" titles to very efficiently identify
19             # titles, regardless of the number of titles required to look up.
20             # Normalised text for our purposes consists of lower-case \w characters,
21             # all other characters and spaces being \W.
22             # Using this technique we find titles regardless of case or other
23             # punctuation used. e.g. MR, Mr., mr, and Mr can all be found.
24             #
25             # Once we have identified the normalised title we then capture the real
26             # title, with the punctuation and case as in the original string, by
27             # counting forward the correct number of normalised characters and
28             # capturing non-normalised characters along the way.
29              
30             sub parse {
31              
32 16     16 1 9184 my ($self, $name) = @_;
33              
34 16 50       48 return () unless defined $name;
35              
36 16         27 my ($title, $remaining_name) = ('', $name);
37 16 100       44 my $titles = ref $self ? $self->{titles} : $self->_default_titles;
38              
39             # Try to find a normalised title using a hash lookup.
40             # Split the name by spaces/non-word characters, then match in
41             # reverse order against a list of normalised titles.
42             # Take the largest matching title.
43 16         20 my $normalised_title;
44 16         86 my @name_chunks = split(/\W+/, lc $name);
45 16         619 while (pop @name_chunks) {
46 20         42 my $possible_title = join(" ", @name_chunks);
47 20 100       53 if (exists $titles->{$possible_title}) {
48 15         16 $normalised_title = $possible_title;
49 15         21 last;
50             }
51             }
52              
53 16 100       26 if ($normalised_title) {
54             # Find the normalised title in the real string
55             # by counting the number of normalised characters
56             # (ignore any spaces in the count)
57 15         20 my $unspaced_title = $normalised_title;
58 15         34 $unspaced_title =~ s/\s//g;
59 15         17 my $character_count = length $unspaced_title;
60 15         91 my @characters = split (//, $name);
61 15         24 my @title_chars;
62 15   66     74 while ($character_count > 0 && scalar @characters > 0) {
63 89         160 my $character = shift @characters;
64 89         139 push (@title_chars, $character);
65             # only count down when we have a normalised character
66 89 100       468 $character_count-- if $character =~ /^\w$/;
67             }
68             # Now add any trailing un-normalised characters to the title too
69             # e.g. for "Mr." we want the "." in "Mr." too,
70 15         48 while ($characters[0] =~ /^\W$/) {
71 20         62 push (@title_chars, shift @characters);
72             }
73              
74 15         35 $title = join("", @title_chars);
75 15         30 $remaining_name = join("", @characters);
76              
77             # clean up any spaces at the point of separation
78 15         54 $title =~ s/\s+$//;
79 15         54 $remaining_name =~ s/^\s+//;
80             }
81              
82             # Return a cleaned title if that option was set
83 16 100 100     103 $title = $titles->{$normalised_title} if $normalised_title && ref $self && $self->{clean};
      100        
84              
85 16         73 return ($title, $remaining_name);
86             }
87              
88             # This method must match how parse()
89             # handles its input string.
90              
91             sub normalise {
92 3     3 0 4 my ($self, $title) = @_;
93              
94             # remove leading/trailing whitespace
95 3         7 $title =~ s/^\s+//;
96 3         8 $title =~ s/\s+$//;
97             # remove punctuation & consolidate spaces
98 3         9 $title =~ s/\W+/ /;
99             # lower-case
100 3         6 $title = lc($title);
101              
102 3         7 return $title;
103             }
104              
105             sub titles {
106 2     2 1 2087 my $self = shift;
107 2 100       11 my $titles = ref $self ? $self->{titles} : $self->_default_titles;
108 2         141 return sort values %$titles;
109             }
110              
111             sub _load {
112 1     1   1 my ($self, $titles) = @_;
113 1         3 my $normalised_titles = {};
114 1         3 foreach my $title (@$titles) {
115 3         11 my $normalised_title = $self->normalise($title);
116             # Store the title in our hashref pointing at the original title
117 3         8 $normalised_titles->{$normalised_title} = $title;
118             }
119 1         3 return $normalised_titles;
120             }
121              
122             sub _default_titles {
123             return {
124             # Basic titles
125 5     5   373 'mr' => 'Mr',
126             'ms' => 'Ms',
127             'mrs' => 'Mrs',
128             'miss' => 'Miss',
129             'mx' => 'Mx',
130             'dr' => 'Dr',
131             # Combined titles
132             'mr and mrs' => 'Mr and Mrs',
133             'mr mrs' => 'Mr & Mrs',
134             # Extended titles
135             'sir' => 'Sir',
136             'dame' => 'Dame',
137             'messrs' => 'Messrs',
138             'madame' => 'Madame',
139             'madam' => 'Madam',
140             'mme' => 'Mme',
141             'mister' => 'Mister',
142             'master' => 'Master',
143             'mast' => 'Mast',
144             'msgr' => 'Msgr',
145             'mgr' => 'Mgr',
146             'count' => 'Count',
147             'countess' => 'Countess',
148             'duke' => 'Duke',
149             'duchess' => 'Duchess',
150             'lord' => 'Lord',
151             'lady' => 'Lady',
152             'marquis' => 'Marquis',
153             'marquess' => 'Marquess',
154             # Medical
155             'doctor' => 'Doctor',
156             'sister' => 'Sister',
157             'matron' => 'Matron',
158             'nurse' => 'Nurse',
159             # Legal
160             'judge' => 'Judge',
161             'justice' => 'Justice',
162             'attorney' => 'Attorney',
163             'solicitor' => 'Solicitor',
164             'barrister' => 'Barrister',
165             'qc' => 'QC',
166             'kc' => 'KC',
167             # Police
168             'det' => 'Det',
169             'detective' => 'Detective',
170             'insp' => 'Insp',
171             'inspector' => 'Inspector',
172             # Military
173             'brig' => 'Brig',
174             'brigadier' => 'Brigadier',
175             'captain' => 'Captain',
176             'capt' => 'Capt',
177             'colonel' => 'Colonel',
178             'col' => 'Col',
179             'commander in chief' => 'Commander in Chief',
180             'commander' => 'Commander',
181             'commodore' => 'Commodore',
182             'cdr' => 'Cdr',
183             'field marshall' => 'Field Marshall',
184             'fl off' => 'Fl Off',
185             'flight officer' => 'Flight Officer',
186             'flt lt' => 'Flt Lt',
187             'flight lieutenant' => 'Flight Lieutenant',
188             'general of the army' => 'General of the Army',
189             'general' => 'General',
190             'gen' => 'Gen',
191             'pte' => 'Pte',
192             'private' => 'Private',
193             'sgt' => 'Sgt',
194             'sargent' => 'Sargent',
195             'air commander' => 'Air Commander',
196             'air commodore' => 'Air Commodore',
197             'air marshall' => 'Air Marshall',
198             'lieutenant colonel' => 'Lieutenant Colonel',
199             'lt col' => 'Lt Col',
200             'lt gen' => 'Lt Gen',
201             'lt cdr' => 'Lt Cdr',
202             'lieutenant' => 'Lieutenant',
203             'lt' => 'Lt',
204             'leut' => 'Leut',
205             'lieut' => 'Lieut',
206             'major general' => 'Major General',
207             'maj gen' => 'Maj Gen',
208             'major' => 'Major',
209             'maj' => 'Maj',
210             'pilot officer' => 'Pilot Officer',
211             # Religious
212             'rabbi' => 'Rabbi',
213             'bishop' => 'Bishop',
214             'brother' => 'Brother',
215             'chaplain' => 'Chaplain',
216             'father' => 'Father',
217             'pastor' => 'Pastor',
218             'mother superior' => 'Mother Superior',
219             'mother' => 'Mother',
220             'most reverend' => 'Most Reverend',
221             'most reverand' => 'Most Reverand',
222             'very reverend' => 'Very Reverend',
223             'very reverand' => 'Very Reverand',
224             'reverend' => 'Reverend',
225             'reverand' => 'Reverand',
226             'mt revd' => 'Mt Revd',
227             'v revd' => 'V Revd',
228             'revd' => 'Revd',
229             # Academic
230             'professor' => 'Professor',
231             'prof' => 'Prof',
232             'associate professor' => 'Associate Professor',
233             'assoc prof' => 'Assoc Prof',
234             # Other
235             'alderman' => 'Alderman',
236             'ald' => 'Ald',
237             # These might be followed by another title
238             # in which case we will fail to pick that up.
239             'his excellency' => 'His Excellency',
240             'his honour' => 'His Honour',
241             'his honor' => 'His Honor',
242             'her excellency' => 'Her Excellency',
243             'her honour' => 'Her Honour',
244             'her honor' => 'Her Honor',
245             'the right honourable' => 'The Right Honourable',
246             'the right honorable' => 'The Right Honorable',
247             'the honourable' => 'The Honourable',
248             'the honorable' => 'The Honorable',
249             'right honourable' => 'Right Honourable',
250             'right honorable' => 'Right Honorable',
251             'rt hon' => 'Rt Hon',
252             'rt hon' => 'Rt Hon',
253             'the hon' => 'The Hon',
254             'the hon' => 'The Hon',
255             };
256             }
257              
258             1;
259             __END__