File Coverage

blib/lib/Lingua/EN/NameParse.pm

Criterion	Covered	Total	%
statement	177	262	67.5
branch	58	106	54.7
condition	6	24	25.0
subroutine	21	23	91.3
pod	10	10	100.0
total	272	425	64.0

line	stmt	bran	cond	sub	pod	time	code
1							=head1 NAME
2
3							Lingua::EN::NameParse - extract the components of a person or couples full name, from free form text
4
5							=head1 SYNOPSIS
6
7							use Lingua::EN::NameParse qw(clean case_surname);
8
9							# optional configuration arguments
10							my %args =
11							(
12							auto_clean => 1,
13							lc_prefix => 1,
14							initials => 3,
15							allow_reversed => 1,
16							joint_names => 0,
17							extended_titles => 0
18							);
19
20							my $name = Lingua::EN::NameParse->new(%args);
21
22							$error = $name->parse("Estate Of Lt Col AB Van Der Heiden (Hold Mail)");
23							unless ( $error )
24							{
25							print($name->report);
26
27							Case all : Estate Of Lt Col AB Van Der Heiden (Hold Mail)
28							Case all reversed : Van Der Heiden, Lt Col AB
29							Salutation : Dear Friend
30							Type : Mr_A_Smith
31							Parsing Error : 0
32							Error description : :
33							Parsing Warning : 1
34							Warning description : ;non_matching text found : (Hold Mail)
35
36							COMPONENTS
37							initials_1 : AB
38							non_matching : (Hold Mail)
39							precursor : Estate Of
40							surname_1 : Van Der Heiden
41							title_1 : Lt Col
42
43							%name_comps = $name->components;
44							$surname = $name_comps{surname_1};
45
46							$correct_casing = $name->case_all;
47
48							$correct_casing = $name->case_all_reversed ;
49
50							$salutation = $name->salutation(salutation => 'Dear',sal_default => 'Friend'));
51
52							$good_name = clean("Bad Na9me "); # "Bad Name"
53
54							%my_properties = $name->properties;
55							$number_surnames = $my_properties{number}; # 1
56							}
57
58
59							$lc_prefix = 0;
60							$correct_case = case_surname("DE SILVA-MACNAY",$lc_prefix); # A stand alone function, returns: De Silva-MacNay
61
62							$error = $name->parse("MR AS & D.E. DE LA MARE");
63							%my_properties = $name->properties;
64							$number_surnames = $my_properties{number}; # 2
65
66
67							=head1 DESCRIPTION
68
69
70							This module takes as input one person's name or a couples name in
71							free format text such as,
72
73							Mr AB & M/s CD MacNay-Smith
74							MR J.L. D'ANGELO
75							Estate Of The Late Lieutenant Colonel AB Van Der Heiden
76
77							and attempts to parse it. If successful, the name is broken
78							down into components and useful functions can be performed such as :
79
80							converting upper or lower case values to name case (Mr AB MacNay )
81							creating a personalised greeting or salutation (Dear Mr MacNay )
82							extracting the names individual components (Mr,AB,MacNay )
83							determining the type of format the name is in (Mr_A_Smith )
84
85
86							If the name(s) cannot be parsed you have the option of cleaning the name(s)
87							of bad characters, or extracting any portion that was parsed and the
88							portion that failed.
89
90							This module can be used for analysing and improving the quality of
91							lists of names.
92
93
94							=head1 DEFINITIONS
95
96							The following terms are used by NameParse to define the components
97							that can make up a name.
98
99							Precursor - Estate of (The Late), Right Honourable ...
100							Title - Mr, Mrs, Ms., Sir, Dr, Major, Reverend ...
101							Conjunction - word to separate two names, such as "And" or &
102							Initials - 1-3 letters, each with an optional space and/or dot
103							Surname - De Silva, Van Der Heiden, MacNay-Smith, O'Reilly ...
104							Suffix - Snr., Jnr, III, V ...
105
106							Refer to the component grammar defined within the code for a complete
107							list of combinations.
108
109							'Name casing' refers to the correct use of upper and lower case letters
110							in peoples names, such as Mr AB McNay.
111
112							To describe the formats supported by NameParse, a short hand representation
113							of the name is used. The following formats are currently supported :
114
115							Mr_John_Smith_&_Ms_Mary_Jones
116							Mr_A_Smith_&_Ms_B_Jones
117							Mr_&Ms_A_&_B_Smith
118							Mr_A_&_Ms_B_Smith
119							Mr_&_Ms_A_Smith
120							Mr_A_&_B_Smith
121							John_Smith_&_Mary_Jones
122							John_&_Mary_Smith
123							A_Smith_&_B_Jones
124
125							Mr_John_Adam_Smith
126							Mr_John_A_Smith
127							Mr_John_Smith
128							Mr_A_Smith
129							John_Adam_Smith
130							John_A_Smith
131							J_Adam_Smith
132							John_Smith
133							A_Smith
134							John
135
136							Precursors and suffixes may be applied to single names that have a surname
137
138
139							=head1 METHODS
140
141							=head2 new
142
143							The C method creates an instance of a name object and sets up
144							the grammar used to parse names. This must be called before any of the
145							following methods are invoked. Note that the object only needs to be
146							created ONCE, and should be reused with new input data. Calling C
147							repeatedly will significantly slow your program down.
148
149							Various setup options may be defined in a hash that is passed as an optional
150							argument to the C method. Note that all the arguments are optional. You
151							need to define the combination of arguments that are appropriate for your
152							usage.
153
154							my %args =
155							(
156							auto_clean => 1,
157							lc_prefix => 1,
158							initials => 3,
159							allow_reversed => 1
160							);
161
162
163							my $name = Lingua::EN::NameParse->new(%args);
164
165
166							=over 4
167
168							=item auto_clean
169
170							When this option is set to a positive value, any call to the C method
171							that fails will attempt to 'clean' the name and then reparse it. See the
172							C method for details. This is useful for dirty data with embedded
173							unprintable or non alphabetic characters.
174
175							=item lc_prefix
176
177							When this option is set to a positive value, it will force the C
178							and C methods to lower case the first letter of each word that
179							occurs in the prefix portion of a surname. For example, Mr AB de Silva,
180							or Ms AS von der Heiden.
181
182							=item initials
183
184							Allows the user to control the number of letters that can occur in the initials.
185							Valid settings are 1,2 or 3. If no value is supplied a default of 2 is used.
186
187							=item allow_reversed
188
189							When this option is set to a positive value, names in reverse order will be
190							processed. The only valid format is the surname followed by a comma and the
191							rest of the name, which can be in any of the combinations allowed by non
192							reversed names. Some examples are:
193
194							Smith, Mr AB
195							Jones, Jim
196							De Silva, Professor A.B.
197
198							The program changes the order of the name back to the non reversed format, and
199							then performs the normal parsing. Note that if the name can be parsed, the fact
200							that it's order was originally reversed, is not recorded as a property of the
201							name object.
202
203							=item joint_names
204
205							When this option is set to a positive value, joint names are accounted for:
206
207							Mr_A_Smith_&Ms_B_Jones
208							Mr_&Ms_A_&B_Smith
209							Mr_A_&Ms_B_Smith
210							Mr_&Ms_A_Smith
211							Mr_A_&B_Smith
212
213							Note that if this option is not specified, than by default joint names are
214							ignored. Disabling joint names speeds up the processing a lot.
215
216							=item extended_titles
217
218							When this option is set to a positive value, all combinations of titles,
219							such as Colonel, Mother Superior are used. If this value is not set, only
220							the following titles are accounted for:
221
222							Mr
223							Ms
224							M/s
225							Mrs
226							Miss
227							Dr
228							Sir
229							Dame
230
231
232							Note that if this option is not specified, than by default extended titles
233							are ignored. Disabling extended titles speeds up the parsing.
234
235							=back
236
237							=head2 parse
238
239							$error = $name->parse("MR AC DE SILVA");
240
241							The C method takes a single parameter of a text string containing a
242							name. It attempts to parse the name and break it down into the components
243
244							Returns an error flag. If the name was parsed successfully, it's value is 0,
245							otherwise a 1. This step is a prerequisite for the following methods.
246
247
248							=head2 case_all
249
250							$correct_casing = $name->case_all;
251
252							The C method converts the first letter of each component to
253							capitals and the remainder to lower case, with the following exceptions-
254
255							initials remain capitalised
256							surname spelling such as MacNay-Smith, O'Brien and Van Der Heiden are preserved
257							- see C for user defined exceptions
258
259							A complete definition of the capitalising rules can be found by studying
260							the case_surname function.
261
262							The method returns the entire cased name as text.
263
264							=head2 case_all_reversed
265
266							$correct_casing = $name->case_all_reversed;
267
268							The C method applies the same type of casing as
269							C. However, the name is returned as surname followed by a comma
270							and the rest of the name, which can be any of the combinations allowed
271							for a name, except the title. Some examples are: "Smith, John", "De Silva, A.B."
272							This is useful for sorting names alphabetically by surname.
273
274							The method returns the entire reverse order cased name as text.
275
276
277							=head2 components
278
279							%my_name = $name->components;
280							$cased_surname = $my_name{surname_1};
281
282
283							The C method does the same thing as the C method,
284							but returns the name cased components in a hash. The following keys are used
285							for each component:
286
287							precursor
288							title_1
289							title_2
290							given_name_1
291							given_name_2
292							initials_1
293							initials_2
294							middle_name
295							conjunction_1
296							conjunction_2
297							surname_1
298							surname_2
299							suffix
300
301							If a component has no matching data for a given name, it will not appear in the hash
302
303							If the name could not be parsed, this method returns null. If you assign the return
304							value to a hash, you should check the error status returned by the C method first.
305							Ohterwise, you will get an odd number of values assigned to the hash.
306
307
308							=head2 case_surname
309
310							$correct_casing = case_surname("DE SILVA-MACNAY" [,$lc_prefix]);
311
312							C is a stand alone function that does not require a name
313							object. The input is a text string. An optional input argument controls the
314							casing rules for prefix portions of a surname, as described above in the
315							C section.
316
317							The output is a string converted to the correct casing for surnames.
318							See C for user defined exceptions
319
320							This function is useful when you know you are only dealing with names that
321							do not have initials like "Mr John Jones". It is much faster than the case_all
322							method, but does not understand context, and cannot detect errors on strings
323							that are not personal names.
324
325
326							=head2 surname_prefs.txt
327
328							Some surnames can have more than one form of valid capitalisation, such as
329							MacQuarie or Macquarie. Where the user wants to specify one form as the default,
330							a text file called surname_prefs.txt should be created and placed in the same
331							location as the NameParse module. The text file should contain one surname per
332							line, in the capitalised form you want, such as
333
334							Macquarie
335							MacHado
336
337							NameParse will still operate if the file does not exist
338
339							=head2 salutation
340
341							$salutation = $name->salutation(salutation => 'Dear',sal_default => 'Friend',sal_type => 'given_name'));
342
343							The C method converts a name into a personal greeting,
344							such as "Dear Mr & Mrs O'Brien" or "Dear Sue and John"
345
346							Optional parameters may be specided in a hash as follows:
347
348
349							salutation:
350
351							The greeting word such as 'Dear' or 'Greetings'. If not spefied than 'Dear' is used
352
353							sal_default:
354
355							The default word used when a personalised salution cannot be generated. If not
356							specified, than 'Friend' is used.
357
358							sal_type:
359
360							Can be either 'given_name' such as 'Dear Sue' or 'title_plus_name' such as 'Dear Ms Smith'
361							If not specified, than 'given_name' is used.
362
363							If an error is detected during parsing, such as with the name "AB Smith & Associates",
364							then the value of sal_default is used instead of a given name, or a title and surname.
365							If the input string contains a conjunction, an 's' is added to the value of sal_default.
366
367							If the name contains a precursor, a default salutation is produced.
368
369							=head2 clean
370
371							$good_name = clean("Bad Na9me");
372
373							C is a stand alone function that does not require a name object.
374							The input is a text string and the output is the string with:
375
376							all repeating spaces removed
377							all characters not in the set (A-Z a-z - ' , . &) removed
378
379
380							=head2 properties
381
382							The C method returns all the properties of the name,
383							non_matching, number and type, as a hash.
384
385							=over 4
386
387							=item type
388
389							The type of format a name is in, as one of the following strings:
390
391							Mr_A_Smith_&Ms_B_Jones
392							Mr_&Ms_A_&B_Smith
393							Mr_A_&Ms_B_Smith
394							Mr_&Ms_A_Smith
395							Mr_A_&B_Smith
396							Mr_John_Adam_Smith
397							Mr_John_A_Smith
398							Mr_John_Smith
399							Mr_A_Smith
400							John_Adam_Smith
401							John_A_Smith
402							J_Adam_Smith
403							John_Smith
404							A_Smith
405							John
406							unknown
407
408
409							=item non_matching
410
411							Returns any unmatched section that was found.
412
413							=back
414
415							=head2 report
416
417							Create a formatted text report to standard output listing
418							- the input string,
419							- the name and value of each defined component
420							- any non matching component
421
422
423							=head1 LIMITATIONS
424
425							The huge number of character combinations that can form a valid names makes
426							it is impossible to correctly identify them all. Firstly, there are many
427							ambiguities, which have no right answer.
428
429							Macbeth or MacBeth, are both valid spellings
430							Is ED WOOD E.D. Wood or Edward Wood
431							Is 'Mr Rapid Print' a name or a company
432							Does John Bradfield Smith have a middle name of Bradfield, or a surname of Bradfield-Smith?
433
434							One approach is to have large lookup files of names and words, statistical rules
435							and fuzzy logic to attempt to derive context. This approach gives high levels of
436							accuracy but uses a lot of your computers time and resources.
437
438							NameParse takes the approach of using a limited set of rules, based on the
439							formats that are commonly used by business to represent peoples names. This
440							gives us fairly high accuracy, with acceptable speed and program size.
441
442							NameParse will accept names from many countries, like Van Der Heiden,
443							De La Mare and Le Fontain. Having said that, it is still biased toward English,
444							because the precursors, titles and conjunctions are based on English usage.
445
446							Names with two or more words, but no separating hyphen are not recognized.
447							This is a real quandary as Indian, Chinese and other names can have several
448							components. If these are allowed for, any component after the surname
449							will also be picked up. For example in "Mr AB Jones Trading As Jones Pty Ltd"
450							will return a surname of "Jones Trading".
451
452							Because of the large combination of possible names defined in the grammar, the
453							program is not very fast, except for the more limited C subroutine.
454							See the "Future Directions" section for possible speed ups.
455
456							As the parser has a very limited understanding of context, the "John_Adam_Smith"
457							name type is most likely to cause problems, as it contains no known tokens
458							like a title. A string such as "National Australia Bank" would be accepted
459							as a valid name, first name National etc. Supplying a list of common pronouns
460							as exceptions could solve this problem.
461
462
463							=head1 REFERENCES
464
465							"The Wordsworth Dictionary of Abbreviations & Acronyms" (1997)
466
467							Australian Standard AS4212-1994 "Geographic Information Systems -
468							Data Dictionary for transfer of street addressing information"
469
470
471							=head1 FUTURE DIRECTIONS
472
473							Define grammar for other languages. Hopefully, all that would be needed is
474							to specify a new module with its own grammar, and inherit all the existing
475							methods. I don't have the knowledge of the naming conventions for non-english
476							languages.
477
478							=head1 REPOSITORY
479
480							L
481
482
483							=head1 SEE ALSO
484
485							L, L, L,
486							L, L
487
488
489							=head1 BUGS
490
491							Names with accented characters (acute, circumfelx etc) will not be parsed
492							correctly. A work around is to replace the character class [a-z] with \w
493							in the appropriate rules in the grammar tree, but this could lower the accuracy
494							of names based purely on ASCII text.
495
496							=head1 CREDITS
497
498							Thanks to all the people who provided ideas and suggestions, including -
499
500							Damian Conway, author of Parse::RecDescent
501							Mark Summerfield author of Lingua::EN::NameCase,
502							Ron Savage, Alastair Adam Huffman, Douglas Wilson
503							Peter Schendzielorz
504
505							=head1 AUTHOR
506
507							NameParse was written by Kim Ryan
508
509							=head1 COPYRIGHT AND LICENSE
510
511							Copyright (c) 2018 Kim Ryan. All rights reserved.
512
513							This library is free software; you can redistribute it and/or modify
514							it under the same terms as Perl itself.
515
516							=cut
517							#-------------------------------------------------------------------------------
518
519							package Lingua::EN::NameParse;
520
521	2			2		123911	use strict;
	2					13
	2					56
522	2			2		10	use warnings;
	2					5
	2					51
523
524	2			2		932	use Lingua::EN::NameParse::Grammar;
	2					11
	2					59
525	2			2		2296	use Parse::RecDescent;
	2					85127
	2					17
526
527	2			2		101	use Exporter;
	2					5
	2					83
528	2			2		13	use vars qw (@ISA @EXPORT_OK);
	2					4
	2					2718
529
530							our $VERSION = '1.38';
531							@ISA = qw(Exporter);
532							@EXPORT_OK = qw(clean case_surname);
533
534							#-------------------------------------------------------------------------------
535							# Create a new instance of a name parsing object. This step is time consuming
536							# and should normally only be called once in your program.
537
538							sub new
539							{
540	2			2	1	94	my $class = shift;
541	2					8	my %args = @_;
542
543	2					5	my $name = {};
544	2					6	bless($name,$class);
545
546							# Default to 2 initials per name. Can be overwritten if user defines
547							# 'initials' as a key in the hash supplied to new method.
548	2					14	$name->{initials} = 2;
549
550	2					4	my $current_key;
551	2					9	foreach my $current_key (keys %args)
552							{
553	8					17	$name->{$current_key} = $args{$current_key};
554							}
555
556	2					12	my $grammar = Lingua::EN::NameParse::Grammar::_create($name);
557	2					19	$name->{parse} = new Parse::RecDescent($grammar);
558
559	2					580356	return ($name);
560							}
561							#-------------------------------------------------------------------------------
562							# Attempt to parse a string and retrieve it's components and properties
563							# Requires a name object to have been created with the 'new' method'
564							# Returns: an error code, 0 for success, 1 for failure
565
566							sub parse
567							{
568	28			28	1	7270	my $name = shift;
569	28					61	my ($input_string) = @_;
570
571	28					62	chomp($input_string);
572
573							# If reverse ordered names are allowed, swap the surname component, before
574							# the comma, with the rest of the name. Rejoin the name, replacing comma
575							# with a space.
576
577	28	100	100			158	if ( $name->{allow_reversed} and $input_string =~ /,/ )
578							{
579	1					5	my ($first,$second) = split(/,/,$input_string);
580	1					5	$input_string = join(' ',$second,$first);
581							}
582
583	28					157	$name->{comps} = ();
584	28					68	$name->{properties} = ();
585	28					73	$name->{properties}{type} = 'unknown';
586	28					60	$name->{error} = 0;
587	28					56	$name->{error_desc} = '';
588	28					41	$name->{warning} = 0;
589	28					59	$name->{warning_desc} = '';
590
591	28					51	$name->{original_input} = $input_string;
592	28					52	$name->{input_string} = $input_string;
593
594	28					74	$name = _pre_parse($name);
595	28	50				75	unless ( $name->{error} )
596							{
597	28	100				71	if ( $name->{auto_clean} )
598							{
599	9					21	$name->{input_string} = clean($name->{input_string});
600							}
601	28					67	$name = _assemble($name);
602	28					74	_validate($name);
603							}
604
605	28					103	return($name->{error});
606							}
607							#-------------------------------------------------------------------------------
608							# Clean the input string. Can be called as a stand alone function.
609
610							sub clean
611							{
612	10			10	1	336	my ($input_string) = @_;
613
614							# remove illegal characters
615	10					32	$input_string =~ s/[^A-Za-z\-\'\.&\/ ]//go;
616
617							# remove repeating spaces
618	10					26	$input_string =~ s/ +/ /go ;
619
620							# remove any remaining leading or trailing space
621	10					17	$input_string =~ s/^ //;
622
623	10					24	return($input_string);
624							}
625
626							#-------------------------------------------------------------------------------
627							# Given a name object, apply correct capitalisation to each component of a person's name.
628							# Return all cased components in a hash.
629							# Else return no value.
630
631
632							sub components
633							{
634	35			35	1	88	my $name = shift;
635
636	35	50				109	if ( $name->{properties}{type} eq 'unknown' )
637							{
638	0					0	return;
639							}
640							else
641							{
642	35					51	my %orig_components = %{ $name->{comps} };
	35					285
643
644	35					92	my ($current_key,%cased_components);
645	35					163	foreach $current_key ( keys %orig_components )
646							{
647	490					636	my $cased_value;
648	490	100				1372	if ( $current_key =~ /initials/ ) # initials_1, possibly initials_2
		100
		50
649							{
650	70					133	$cased_value = uc($orig_components{$current_key});
651							}
652							elsif ( $current_key =~ /surname\|suffix/ )
653							{
654	105					309	$cased_value = case_surname($orig_components{$current_key},$name->{lc_prefix});
655							}
656							elsif ( $current_key eq 'type')
657							{
658	0					0	$cased_value = $orig_components{$current_key};
659							}
660							else
661							{
662	315					537	$cased_value = _case_word($orig_components{$current_key});
663							}
664
665	490					1003	$cased_components{$current_key} = $cased_value;
666							}
667	35					439	return(%cased_components);
668							}
669							}
670
671							#-------------------------------------------------------------------------------
672							# Hash of of lists, indicating the order that name components are assembled in.
673							# Each list element is itself the name of the key value in a name object.
674							# Used by the case_all and case_all_reversed methods.
675							# These hashes are created here globally, as quite a large overhead is
676							# imposed if the are created locally, each time the method is invoked
677
678							my %component_order=
679							(
680							'Mr_John_Smith_&_Ms_Mary_Jones' => ['title_1','given_name_1','surname_1','conjunction_1','title_2','given_name_2','surname_2'],
681							'Mr_A_Smith_&_Ms_B_Jones' => ['title_1','initials_1','surname_1','conjunction_1','title_2','initials_2','surname_2'],
682							'Mr_&_Ms_A_&_B_Smith' => ['title_1','conjunction_1','title_2','initials_1','conjunction_2','initials_2','surname_1'],
683							'Mr_A_&_Ms_B_Smith' => ['title_1','initials_1','conjunction_1','title_2','initials_2','surname_1'],
684							'Mr_&_Ms_A_Smith' => ['title_1','conjunction_1','title_2','initials_1','surname_1'],
685							'Mr_A_&_B_Smith' => ['title_1','initials_1','conjunction_1','initials_2','surname_1'],
686							'John_Smith_&_Mary_Jones' => ['given_name_1','surname_1','conjunction_1','given_name_2','surname_2'],
687							'John_&_Mary_Smith' => ['given_name_1','conjunction_1','given_name_2','surname_1'],
688							'A_Smith_&_B_Jones' => ['initials_1','surname_1','conjunction_1','initials_2','surname_2'],
689
690							'Mr_John_Adam_Smith' => ['precursor','title_1','given_name_1','middle_name','surname_1','suffix'],
691							'Mr_John_A_Smith' => ['precursor','title_1','given_name_1','initials_1','surname_1','suffix'],
692							'Mr_John_Smith' => ['precursor','title_1','given_name_1','surname_1','suffix'],
693							'Mr_A_Smith' => ['precursor','title_1','initials_1','surname_1','suffix'],
694							'John_Adam_Smith' => ['precursor','given_name_1','middle_name','surname_1','suffix'],
695							'John_A_Smith' => ['precursor','given_name_1','initials_1','surname_1','suffix'],
696							'J_Adam_Smith' => ['precursor','initials_1','middle_name','surname_1','suffix'],
697							'John_Smith' => ['precursor','given_name_1','surname_1','suffix'],
698							'A_Smith' => ['precursor','initials_1','surname_1','suffix'],
699							'John' => ['given_name_1']
700							);
701
702
703							# only include names with a single surname
704							my %reverse_component_order=
705							(
706							'Mr_&_Ms_A_&_B_Smith' => ['surname_1','title_1','conjunction_1','title_2','initials_1','conjunction_1','initials_2'],
707							'Mr_A_&_Ms_B_Smith' => ['surname_1','title_1','initials_1','conjunction_1','title_2','initials_2'],
708							'Mr_&_Ms_A_Smith' => ['surname_1','title_1','title_1','conjunction_1','title_2','initials_1'],
709							'Mr_A_&_B_Smith' => ['surname_1','title_1','initials_1','conjunction_1','initials_2'],
710							'John_&_Mary_Smith' => ['surname_1','given_name_1','conjunction_1','given_name_2'],
711
712							'Mr_John_Adam_Smith' => ['surname_1','title_1','given_name_1','middle_name','suffix'],
713							'Mr_John_A_Smith' => ['surname_1','title_1','given_name_1','initials_1','suffix'],
714							'Mr_John_Smith' => ['surname_1','title_1','given_name_1','suffix'],
715							'Mr_A_Smith' => ['surname_1','title_1','initials_1','suffix'],
716							'John_Adam_Smith' => ['surname_1','given_name_1','middle_name','suffix'],
717							'John_A_Smith' => ['surname_1','given_name_1','initials_1','suffix'],
718							'J_Adam_Smith' => ['surname_1','initials_1','middle_name','suffix'],
719							'John_Smith' => ['surname_1','given_name_1','suffix'],
720							'A_Smith' => ['surname_1','initials_1','suffix'],
721							'John' => ['given_name_1']
722							);
723
724							#-------------------------------------------------------------------------------
725							# Apply correct capitalisation to a person's entire name
726							# If the name type is unknown, return undef
727							# Else, return a string of all cased components in correct order
728
729							sub case_all
730							{
731	2			2	1	21	my $name = shift;
732
733	2					3	my @cased_name;
734
735	2	50				7	if ( $name->{properties}{type} eq 'unknown' )
736							{
737	0					0	return undef;
738							}
739
740	2	50				8	unless ( $component_order{$name->{properties}{type}} )
741							{
742							# component order missing in array defined above
743	0					0	warn "Component order not defined for: $name->{properties}{type}";
744	0					0	return;
745							}
746
747	2					6	my %component_vals = $name->components;
748	2					7	my @order = @{ $component_order{$name->{properties}{type}} };
	2					8
749
750	2					5	foreach my $component_key ( @order )
751							{
752							# As some components such as precursors are optional, they will appear
753							# in the order array but may or may not have have a value, so only
754							# process defined values
755	10	100				24	if ( $component_vals{$component_key} )
756							{
757	6					12	push(@cased_name,$component_vals{$component_key});
758							}
759							}
760	2	100				11	if ( $name->{comps}{non_matching} )
761							{
762							# Despite errors, try to name case non-matching section. As the format
763							# of this section is unknown, surname case will provide the best
764							# approximation, but still fail on initials of more than 1 letter
765	1					5	push(@cased_name,case_surname($name->{comps}{non_matching},$name->{lc_prefix}));
766							}
767
768	2					23	return(join(' ',@cased_name));
769							}
770
771							#-------------------------------------------------------------------------------
772							=head1 case_all_reversed
773
774							Apply correct capitalisation to a person's entire name and reverse the order
775							so that surname is first, followed by the other components, such as: Smith, Mr John A
776							Useful for creating a list of names that can be sorted by surname.
777
778							If name type is unknown , returns null
779
780							If the name type has a joint name, such as 'Mr_A_Smith_Ms_B_Jones', return null,
781							as it is ambiguous which surname to place at the start of the string
782
783							Else, returns a string of all cased components in correct reversed order
784
785							=cut
786
787							sub case_all_reversed
788							{
789	0			0	1	0	my $name = shift;
790
791	0					0	my @cased_name_reversed;
792
793	0	0				0	unless ( $name->{properties}{type} eq 'unknown' )
794							{
795	0	0				0	unless ( $reverse_component_order{$name->{properties}{type} } )
796							{
797							# this type of name should not be reversed, such as two surnames
798	0					0	return;
799							}
800	0					0	my %component_vals = $name->components;
801	0					0	my @reverse_order = @{ $reverse_component_order{$name->{properties}{type} } };
	0					0
802
803	0					0	foreach my $component_key ( @reverse_order )
804							{
805							# As some components such as precursors are optional, they will appear
806							# in the order array but may or may not have have a value, so only
807							# process defined values
808
809	0					0	my $component_value = $component_vals{$component_key};
810	0	0				0	if ( $component_value )
811							{
812	0	0				0	if ($component_key eq 'surname_1')
813							{
814	0					0	$component_value .= ',';
815							}
816	0					0	push(@cased_name_reversed,$component_value);
817							}
818							}
819							}
820	0					0	return(join(' ',@cased_name_reversed));
821							}
822							#-------------------------------------------------------------------------------
823							# The user may specify their own preferred spelling for surnames.
824							# These should be placed in a text file called surname_prefs.txt
825							# in the same location as the module itself.
826
827							BEGIN
828							{
829							# Obtain the full path to NameParse module, defined in the %INC hash.
830	2			2		17	my $prefs_file_location = $INC{"Lingua/EN/NameParse.pm"};
831							# Now substitute the name of the preferences file
832	2					20	$prefs_file_location =~ s/NameParse\.pm$/surname_prefs.txt/;
833
834	2	50				4257	if ( open(PREFERENCES_FH,"<$prefs_file_location") )
835							{
836	0					0	my @surnames = ;
837	0					0	foreach my $name ( @surnames )
838							{
839	0					0	chomp($name);
840							# Build hash, lower case name is key for case insensitive
841							# comparison, while value holds the actual capitalisation
842	0					0	$Lingua::EN::surname_preferences{lc($name)} = $name;
843							}
844	0					0	close(PREFERENCES_FH);
845							}
846							}
847							#-------------------------------------------------------------------------------
848							# Apply correct capitalisation to a person's surname. Can be called as a
849							# stand alone function.
850
851							sub case_surname
852							{
853	108			108	1	701	my ($surname,$lc_prefix) = @_;
854
855	108	100				236	unless ($surname)
856							{
857	65					143	return('');
858							}
859
860							# If the user has specified a preferred capitalisation for this
861							# surname in the surname_prefs.txt, it should be returned now.
862	43	50				128	if ($Lingua::EN::surname_preferences{lc($surname)} )
863							{
864	0					0	return($Lingua::EN::surname_preferences{lc($surname)});
865							}
866
867							# Lowercase everything
868	43					82	$surname = lc($surname);
869
870							# Now uppercase first letter of every word. By checking on word boundaries,
871							# we will account for apostrophes (D'Angelo) and hyphenated names
872	43					272	$surname =~ s/\b(\w)/\u$1/g;
873
874							# Name case Macs and Mcs
875							# Exclude names with 1-2 letters after prefix like Mack, Macky, Mace
876							# Exclude names ending in a,c,i,o,z or j, typically Polish or Italian
877
878	43	100				194	if ( $surname =~ /\bMac[a-z]{2,}[^a\|c\|i\|o\|z\|j]\b/i )
		50
879							{
880	4					23	$surname =~ s/\b(Mac)([a-z]+)/$1\u$2/ig;
881
882							# Now correct for "Mac" exceptions
883	4					10	$surname =~ s/MacHin/Machin/;
884	4					12	$surname =~ s/MacHlin/Machlin/;
885	4					7	$surname =~ s/MacHar/Machar/;
886	4					8	$surname =~ s/MacKle/Mackle/;
887	4					5	$surname =~ s/MacKlin/Macklin/;
888	4					8	$surname =~ s/MacKie/Mackie/;
889
890							# Portuguese
891	4					6	$surname =~ s/MacHado/Machado/;
892
893							# Lithuanian
894	4					5	$surname =~ s/MacEvicius/Macevicius/;
895	4					7	$surname =~ s/MacIulis/Maciulis/;
896	4					7	$surname =~ s/MacIas/Macias/;
897							}
898							elsif ( $surname =~ /\bMc/i )
899							{
900	0					0	$surname =~ s/\b(Mc)([a-z]+)/$1\u$2/ig;
901							}
902							# Exceptions (only 'Mac' name ending in 'o' ?)
903	43					79	$surname =~ s/Macmurdo/MacMurdo/;
904
905
906	43	100				89	if ( $lc_prefix )
907							{
908							# Lowercase first letter of every word in prefix. The trailing space
909							# prevents the surname from being altered. Note that spellings like
910							# d'Angelo are not accounted for.
911	1					8	$surname =~ s/\b(\w+ )/\l$1/g;
912							}
913
914							# Correct for possessives such as "John's" or "Australia's". Although this
915							# should not occur in a person's name, they are valid for proper names.
916							# As this subroutine may be used to capitalise words other than names,
917							# we may need to account for this case. Note that the 's' must be at the
918							# end of the string
919	43					84	$surname =~ s/(\w+)'S(\s+)/$1's$2/;
920	43					68	$surname =~ s/(\w+)'S$/$1's/;
921
922							# Correct for roman numerals, excluding single letter cases I,V and X,
923							# which will work with the above code
924	43					115	$surname =~ s/\b(I{2,3})\b/\U$1/i; # 2nd, 3rd
925	43					88	$surname =~ s/\b(IV)\b/\U$1/i; # 4th
926	43					77	$surname =~ s/\b(VI{1,3})\b/\U$1/i; # 6th, 7th, 8th
927	43					78	$surname =~ s/\b(IX)\b/\U$1/i; # 9th
928	43					100	$surname =~ s/\b(XI{1,3})\b/\U$1/i; # 11th, 12th, 13th
929
930	43					113	return($surname);
931							}
932							#-------------------------------------------------------------------------------
933							# Create a personalised greeting from one or two person's names
934							# Returns the salutation as a string, such as "Dear Mr Smith", or "Dear Sue"
935
936							sub salutation
937							{
938	3			3	1	7	my $name = shift;
939	3					8	my %args = @_;
940
941	3					6	my $salutation = 'Dear';
942	3					5	my $sal_default = 'Friend';
943	3					5	my $sal_type = 'title_plus_surname';
944
945							# Check to see if we should override defualts with any user specified preferences
946	3	100				9	if ( %args )
947							{
948	2					8	foreach my $current_key (keys %args)
949							{
950	2	50				6	$current_key eq 'salutation' and $salutation = $args{$current_key};
951	2	50				6	$current_key eq 'sal_default' and $sal_default = $args{$current_key};
952	2	50				8	$current_key eq 'sal_type' and $sal_type = $args{$current_key};
953							}
954							}
955
956
957	3					6	my @greeting;
958	3					7	push(@greeting,$salutation);
959
960							# Personalised salutations cannot be created for Estates or people
961							# without some title
962	3	50	33			17	if
			33
963							(
964							$name->{error} or
965							( $name->{comps}{precursor} and $name->{comps}{precursor} =~ /ESTATE/)
966							)
967							{
968							# Despite an error, the presence of a conjunction probably
969							# means we are dealing with 2 or more people.
970							# For example Mr AB Smith & John Jones
971	0	0				0	if ( $name->{input_string} =~ / (AND\|&) / )
972							{
973	0					0	$sal_default .= 's';
974							}
975	0					0	push(@greeting,$sal_default);
976							}
977							else
978							{
979	3					8	my %component_vals = $name->components;
980
981	3	100				17	if ( $sal_type eq 'given_name')
		50
982							{
983	1	50				4	if ( $component_vals{'given_name_1'} )
984							{
985	1					3	push(@greeting,$component_vals{'given_name_1'});
986	1	50				5	if ( $component_vals{'given_name_2'} )
987							{
988	0					0	push(@greeting,$component_vals{'conjunction_1'});
989	0					0	push(@greeting,$component_vals{'given_name_2'});
990							}
991							}
992							else
993							{
994							# No given name such as 'A_Smith','J_Adam_Smith','Mr_A_Smith'
995							# Must use default
996	0					0	push(@greeting,$sal_default);
997							}
998							}
999							elsif ( $sal_type eq 'title_plus_surname' )
1000							{
1001	2	50				6	if ( $name->{properties}{number} == 1 )
		0
1002							{
1003	2	100				7	if ( $component_vals{'title_1'} )
1004							{
1005	1					3	push(@greeting,$component_vals{'title_1'});
1006	1					4	push(@greeting,$component_vals{'surname_1'});
1007							}
1008							else
1009							{
1010							# No title such as 'A_Smith','J_Adam_Smith', so must use default
1011	1					4	push(@greeting,$sal_default);
1012							}
1013							}
1014							elsif ( $name->{properties}{number} == 2 )
1015							{
1016							# a joint name
1017
1018	0					0	my $type = $name->{properties}{type};
1019	0	0	0			0	if ( $type eq 'Mr_&Ms_A_Smith' or $type eq 'Mr_A_&Ms_B_Smith' or $type eq 'Mr_&Ms_A_&B_Smith' )
		0	0
			0
1020							{
1021							# common surname
1022	0					0	push(@greeting,$component_vals{'title_1'});
1023	0					0	push(@greeting,$component_vals{'conjunction_1'});
1024	0					0	push(@greeting,$component_vals{'title_2'});
1025	0					0	push(@greeting,$component_vals{'surname_1'});
1026
1027							}
1028							elsif ( $type eq 'Mr_A_Smith_&Ms_B_Jones' or $type eq 'Mr_John_Smith_&Ms_Mary_Jones' )
1029							{
1030	0					0	push(@greeting,$component_vals{'title_1'});
1031	0					0	push(@greeting,$component_vals{'surname_1'});
1032	0					0	push(@greeting,$component_vals{'conjunction_1'});
1033	0					0	push(@greeting,$component_vals{'title_2'});
1034	0					0	push(@greeting,$component_vals{'surname_2'});
1035							}
1036							else
1037							{
1038							# No title such as A_Smith_&B_Jones', 'John_Smith_&Mary_Jones'
1039							# Must use default
1040	0					0	push(@greeting,$sal_default);
1041							}
1042							}
1043							}
1044							else
1045							{
1046	0					0	warn "Invalid sal_type : ", $sal_type;
1047	0					0	push(@greeting,$sal_default);
1048							}
1049							}
1050	3					24	return(join(' ',@greeting));
1051							}
1052							#-------------------------------------------------------------------------------
1053							# Return all name properties as a hash
1054
1055							sub properties
1056							{
1057	21			21	1	102	my $name = shift;
1058	21					35	return(%{ $name->{properties} });
	21					107
1059							}
1060
1061							#-------------------------------------------------------------------------------
1062							# Create a text report to standard output listing
1063							# - the input string,
1064							# - the name of each defined component, if it exists
1065							# - any non matching component
1066
1067							sub report
1068							{
1069	0			0	1	0	my $name = shift;
1070
1071	0					0	my %props = $name->properties;
1072
1073	0					0	my $fmt = "%-20.20s : %s\n";
1074
1075	0					0	printf($fmt,"Original Input",$name->{original_input});
1076	0					0	printf($fmt,"Cleaned Input",$name->{input_string});
1077	0					0	printf($fmt,"Case all",$name->case_all);
1078	0	0				0	if ($name->case_all_reversed)
1079							{
1080	0					0	printf($fmt,"Case all reversed",$name->case_all_reversed);
1081							}
1082							else
1083							{
1084	0					0	printf($fmt,"Case all reversed",'not applicable');
1085							}
1086
1087
1088	0					0	printf($fmt,"Salutation",$name->salutation(salutation => 'Dear',sal_default => 'Friend', sal_type => 'title_plus_surname'));
1089	0					0	printf($fmt,"Type", $props{type});
1090	0					0	printf($fmt,"Number", $props{number});
1091	0					0	printf($fmt,"Parsing Error", $name->{error});
1092	0					0	printf($fmt,"Error description : ", $name->{error_desc});
1093	0					0	printf($fmt,"Parsing Warning", $name->{warning});
1094	0					0	printf($fmt,"Warning description", $name->{warning_desc});
1095
1096
1097	0	0				0	unless ($props{type} eq 'unknown')
1098							{
1099	0					0	my %comps = $name->components;
1100	0	0				0	if ( %comps )
1101							{
1102	0					0	print("\nCOMPONENTS\n");
1103	0					0	foreach my $value ( sort keys %comps)
1104							{
1105	0	0	0			0	if ($value and $comps{$value})
1106							{
1107	0					0	printf($fmt,$value,$comps{$value});
1108							}
1109							}
1110							}
1111							}
1112							}
1113							#-------------------------------------------------------------------------------
1114
1115							# PRIVATE METHODS
1116
1117							#-------------------------------------------------------------------------------
1118
1119							sub _pre_parse
1120							{
1121	28			28		46	my $name = shift;
1122
1123							# strip all full stops
1124	28					94	$name->{input_string} =~ s/\.//g;
1125
1126							# Fold all text to upper case, as these are used in all regular expressions withun thr grammar tree
1127	28					75	$name->{input_string} = uc($name->{input_string});
1128
1129							# Check that common reserved word (as found in company names) do not appear
1130	28	50				284	if ( $name->{input_string} =~
1131							/\BPTY LTD$\|\BLTD$\|\BPLC$\|ASSOCIATION\|DEPARTMENT\|NATIONAL\|SOCIETY/ )
1132							{
1133	0					0	$name->{error} = 1;
1134	0					0	$name->{comps}{non_matching} = $name->{input_string};
1135	0					0	$name->{error_desc} = 'Reserved words found in name';
1136							}
1137
1138							# For the case of a single name such as 'Voltaire' we need to add a trailing space
1139							# to the input string. This is because the grammar tree expects a terminator (the space)
1140							# optionally followed by other productions or non matching text
1141	28					71	$name->{input_string} .= ' ';
1142	28	50				143	if ( $name->{input_string} =~ /^[A-Z]{2,}(\-)?[A-Z]{0,}$/ )
1143							{
1144	0					0	$name->{input_string} .= ' ';
1145							}
1146	28					99	return($name);
1147
1148							}
1149							#-------------------------------------------------------------------------------
1150							# Initialise all components to empty string. Assemble hashes of components
1151							# and properties as part of the name object
1152							#
1153							sub _assemble
1154							{
1155	28			28		45	my $name = shift;
1156
1157							# Use Parse::RecDescent to do the parsing. 'full_name' is a label for the complete grammar tree
1158							# defined in Lingua::EN::NameParse::Grammar
1159	28					228	my $parsed_name = $name->{parse}->full_name($name->{input_string});
1160
1161							# Place components into a separate hash, so they can be easily returned
1162							# for the user to inspect and modify.
1163
1164	28					300778	my @all_comps = qw(precursor title_1 given_name_1 initials_1 middle_name surname_1 conjunction_1
1165							title_2 given_name_2 initials_2 surname_2 conjunction_2 suffix non_matching);
1166
1167	28					77	foreach my $comp (@all_comps)
1168							{
1169							# set all components to empty string, as any of them could be accessed, even if they don't exist
1170	392					730	$name->{comps}{$comp} = '';
1171	392	100				772	if (defined($parsed_name->{$comp}))
1172							{
1173							# Copy over existing components.
1174	135					266	$name->{comps}{$comp} = _trim_space($parsed_name->{$comp});
1175							}
1176							}
1177
1178	28					71	$name->{properties}{number} = 0;
1179	28					52	$name->{properties}{number} = $parsed_name->{number};
1180	28					55	$name->{properties}{type} = $parsed_name->{type};
1181
1182	28					105	return($name);
1183							}
1184							#-------------------------------------------------------------------------------
1185							# For correct matching, the grammar of each component must include the trailing space that separates it
1186							# from any following word. This should now be removed from the components, and will be restored by the
1187							# case_all and salutation methods, if called.
1188
1189							sub _trim_space
1190							{
1191	135			135		239	my ($string) = @_;
1192	135	100				258	if ($string)
1193							{
1194	109					343	$string =~ s/ $//;
1195							}
1196	135					343	return($string);
1197							}
1198							#-------------------------------------------------------------------------------
1199							# Check if any name components have illegal characters, or do not have the
1200							# correct syntax for a valid name.
1201
1202
1203							sub _validate
1204							{
1205	28			28		57	my $name = shift;
1206	28					85	my %comps = $name->components;
1207
1208	28	100				154	if ( $comps{non_matching} )
		50
1209							{
1210	2					6	$name->{warning} = 1;
1211	2					7	$name->{warning_desc} .= ";non_matching text found : $comps{non_matching}";
1212							}
1213							elsif ( $name->{input_string} =~ /[^A-Za-z\-\'\.,&\/ ]/ )
1214							{
1215							# illegal characters found
1216	0					0	$name->{error} = 1;
1217	0					0	$name->{error_desc} = 'illegal characters found';
1218							}
1219
1220
1221	28	50				71	if ( not _valid_name($comps{given_name_1}) )
		50
		50
		50
1222							{
1223	0					0	$name->{warning} = 1;
1224	0					0	$name->{warning_desc} .= ";no vowel sound in given_name_1 : $comps{given_name_1}";
1225							}
1226							elsif ( not _valid_name($comps{middle_name}) )
1227							{
1228	0					0	$name->{warning} = 1;
1229	0					0	$name->{warning_desc} .= ";no vowel sound in middle_name : $comps{middle_name}";
1230							}
1231
1232							elsif ( not _valid_name($comps{surname_1}) )
1233							{
1234	0					0	$name->{warning} = 1;
1235	0					0	$name->{warning_desc} .= ";no vowel sound in surname_1 : $comps{surname_1}";
1236
1237							}
1238							elsif ( not _valid_name($comps{surname_2}) )
1239							{
1240	0					0	$name->{warning} = 1;
1241	0					0	$name->{warning_desc} .= ";no vowel sound in surname_2 : $comps{surname_2}";
1242							}
1243							}
1244							#-------------------------------------------------------------------------------
1245							# If the name has an assigned value, check that it contains a vowel sound,
1246							# or matches the exceptions to this rule.
1247							# Returns 1 if name is valid, otherwise 0
1248
1249							sub _valid_name
1250							{
1251	112			112		200	my ($name) = @_;
1252	112	100	33			406	if ( not $name )
		50
1253							{
1254	65					224	return(1);
1255							}
1256							# Names should have a vowel sound,
1257							# valid exceptions are Ng, Tsz,Md, Cng,Hng,Chng etc
1258							elsif ( $name and $name =~ /[AEIOUYJ]\|^(NG\|TSZ\|MD\|(C?H\|[PTS])NG)$/i )
1259							{
1260	47					175	return(1);
1261							}
1262							else
1263							{
1264	0					0	return(0);
1265							}
1266							}
1267							#-------------------------------------------------------------------------------
1268							# Upper case first letter, lower case the rest, for all words in string
1269							sub _case_word
1270							{
1271	315			315		554	my ($word) = @_;
1272
1273	315	100				560	if ($word)
1274							{
1275	67					364	$word =~ s/(\w+)/\u\L$1/g;
1276							}
1277
1278	315					630	return($word);
1279							}
1280							#-------------------------------------------------------------------------------
1281							return(1);