File Coverage

blib/lib/Lingua/EN/NameParse.pm
Criterion Covered Total %
statement 177 260 68.0
branch 58 104 55.7
condition 6 24 25.0
subroutine 21 23 91.3
pod 10 10 100.0
total 272 421 64.6


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             Lingua::EN::NameParse - extract the components of a person or couples full name, presented as a text string
4              
5             =head1 SYNOPSIS
6              
7             use Lingua::EN::NameParse qw(clean case_surname);
8              
9             # optional configuration arguments
10             my %args =
11             (
12             auto_clean => 1,
13             lc_prefix => 1,
14             initials => 3,
15             allow_reversed => 1,
16             joint_names => 0,
17             extended_titles => 0
18             );
19              
20             my $name = Lingua::EN::NameParse->new(%args);
21              
22             $error = $name->parse("Estate Of Lt Col AB Van Der Heiden (Hold Mail)");
23             unless ( $error )
24             {
25             print($name->report);
26            
27             Case all : Estate Of Lt Col AB Van Der Heiden (Hold Mail)
28             Case all reversed : Van Der Heiden, Lt Col AB
29             Salutation : Dear Friend
30             Type : Mr_A_Smith
31             Parsing Error : 0
32             Error description : :
33             Parsing Warning : 1
34             Warning description : ;non_matching text found : (Hold Mail)
35            
36             COMPONENTS
37             initials_1 : AB
38             non_matching : (Hold Mail)
39             precursor : Estate Of
40             surname_1 : Van Der Heiden
41             title_1 : Lt Col
42            
43             %name_comps = $name->components;
44             $surname = $name_comps{surname_1};
45              
46             $correct_casing = $name->case_all;
47              
48             $correct_casing = $name->case_all_reversed ;
49              
50             $salutation = $name->salutation(salutation => 'Dear',sal_default => 'Friend'));
51            
52             $good_name = clean("Bad Na9me "); # "Bad Name"
53            
54             %my_properties = $name->properties;
55             $number_surnames = $my_properties{number}; # 1
56             }
57            
58              
59             $lc_prefix = 0;
60             $correct_case = case_surname("DE SILVA-MACNAY",$lc_prefix); # A stand alone function, returns: De Silva-MacNay
61            
62             $error = $name->parse("MR AS & D.E. DE LA MARE");
63             %my_properties = $name->properties;
64             $number_surnames = $my_properties{number}; # 2
65            
66              
67             =head1 DESCRIPTION
68              
69              
70             This module takes as input one person's name or a couples names in
71             free format text such as,
72              
73             Mr AB & M/s CD MacNay-Smith
74             MR J.L. D'ANGELO
75             Estate Of The Late Lieutenant Colonel AB Van Der Heiden
76              
77             and attempts to parse it. If successful, the name is broken
78             down into components and useful functions can be performed such as :
79              
80             converting upper or lower case values to name case (Mr AB MacNay )
81             creating a personalised greeting or salutation (Dear Mr MacNay )
82             extracting the names individual components (Mr,AB,MacNay )
83             determining the type of format the name is in (Mr_A_Smith )
84              
85              
86             If the name(s) cannot be parsed you have the option of cleaning the name(s)
87             of bad characters, or extracting any portion that was parsed and the
88             portion that failed.
89              
90             This module can be used for analysing and improving the quality of
91             lists of names.
92              
93              
94             =head1 DEFINITIONS
95              
96             The following terms are used by NameParse to define the components
97             that can make up a name.
98              
99             Precursor - Estate of (The Late), Right Honourable ...
100             Title - Mr, Mrs, Ms., Sir, Dr, Major, Reverend ...
101             Conjunction - word to separate two names, such as "And" or &
102             Initials - 1-3 letters, each with an optional space and/or dot
103             Surname - De Silva, Van Der Heiden, MacNay-Smith, O'Reilly ...
104             Suffix - Snr., Jnr, III, V ...
105              
106             Refer to the component grammar defined within the code for a complete
107             list of combinations.
108              
109             'Name casing' refers to the correct use of upper and lower case letters
110             in peoples names, such as Mr AB McNay.
111              
112             To describe the formats supported by NameParse, a short hand representation
113             of the name is used. The following formats are currently supported :
114              
115             Mr_John_Smith_&_Ms_Mary_Jones
116             Mr_A_Smith_&_Ms_B_Jones
117             Mr_&Ms_A_&_B_Smith
118             Mr_A_&_Ms_B_Smith
119             Mr_&_Ms_A_Smith
120             Mr_A_&_B_Smith
121             John_Smith_&_Mary_Jones
122             John_&_Mary_Smith
123             A_Smith_&_B_Jones
124              
125             Mr_John_Adam_Smith
126             Mr_John_A_Smith
127             Mr_John_Smith
128             Mr_A_Smith
129             John_Adam_Smith
130             John_A_Smith
131             J_Adam_Smith
132             John_Smith
133             A_Smith
134             John
135              
136             Precursors and suffixes may be applied to single names that have a surname
137              
138              
139             =head1 METHODS
140              
141             =head2 new
142              
143             The C method creates an instance of a name object and sets up
144             the grammar used to parse names. This must be called before any of the
145             following methods are invoked. Note that the object only needs to be
146             created ONCE, and should be reused with new input data. Calling C
147             repeatedly will significantly slow your program down.
148              
149             Various setup options may be defined in a hash that is passed as an optional
150             argument to the C method. Note that all the arguments are optional. You
151             need to define the combination of arguments that are appropriate for your
152             usage.
153              
154             my %args =
155             (
156             auto_clean => 1,
157             lc_prefix => 1,
158             initials => 3,
159             allow_reversed => 1
160             );
161              
162              
163             my $name = Lingua::EN::NameParse->new(%args);
164              
165              
166             =over 4
167              
168             =item auto_clean
169              
170             When this option is set to a positive value, any call to the C method
171             that fails will attempt to 'clean' the name and then reparse it. See the
172             C method for details. This is useful for dirty data with embedded
173             unprintable or non alphabetic characters.
174              
175             =item lc_prefix
176              
177             When this option is set to a positive value, it will force the C
178             and C methods to lower case the first letter of each word that
179             occurs in the prefix portion of a surname. For example, Mr AB de Silva,
180             or Ms AS von der Heiden.
181              
182             =item initials
183              
184             Allows the user to control the number of letters that can occur in the initials.
185             Valid settings are 1,2 or 3. If no value is supplied a default of 2 is used.
186              
187             =item allow_reversed
188              
189             When this option is set to a positive value, names in reverse order will be
190             processed. The only valid format is the surname followed by a comma and the
191             rest of the name, which can be in any of the combinations allowed by non
192             reversed names. Some examples are:
193              
194             Smith, Mr AB
195             Jones, Jim
196             De Silva, Professor A.B.
197              
198             The program changes the order of the name back to the non reversed format, and
199             then performs the normal parsing. Note that if the name can be parsed, the fact
200             that it's order was originally reversed, is not recorded as a property of the
201             name object.
202              
203             =item joint_names
204              
205             When this option is set to a positive value, joint names are accounted for:
206              
207             Mr_A_Smith_&Ms_B_Jones
208             Mr_&Ms_A_&B_Smith
209             Mr_A_&Ms_B_Smith
210             Mr_&Ms_A_Smith
211             Mr_A_&B_Smith
212              
213             Note that if this option is not specified, than by default joint names are
214             ignored. Disabling joint names speeds up the processing a lot.
215              
216             =item extended_titles
217              
218             When this option is set to a positive value, all combinations of titles,
219             such as Colonel, Mother Superior are used. If this value is not set, only
220             the following titles are accounted for:
221              
222             Mr
223             Ms
224             M/s
225             Mrs
226             Miss
227             Dr
228             Sir
229             Dame
230              
231              
232             Note that if this option is not specified, than by default extended titles
233             are ignored. Disabling extended titles speeds up the parsing.
234              
235             =back
236              
237             =head2 parse
238              
239             $error = $name->parse("MR AC DE SILVA");
240              
241             The C method takes a single parameter of a text string containing a
242             name. It attempts to parse the name and break it down into the components
243              
244             Returns an error flag. If the name was parsed successfully, it's value is 0,
245             otherwise a 1. This step is a prerequisite for the following methods.
246              
247              
248             =head2 case_all
249              
250             $correct_casing = $name->case_all;
251              
252             The C method converts the first letter of each component to
253             capitals and the remainder to lower case, with the following exceptions-
254              
255             initials remain capitalised
256             surname spelling such as MacNay-Smith, O'Brien and Van Der Heiden are preserved
257             - see C for user defined exceptions
258              
259             A complete definition of the capitalising rules can be found by studying
260             the case_surname function.
261              
262             The method returns the entire cased name as text.
263              
264             =head2 case_all_reversed
265              
266             $correct_casing = $name->case_all_reversed;
267              
268             The C method applies the same type of casing as
269             C. However, the name is returned as surname followed by a comma
270             and the rest of the name, which can be any of the combinations allowed
271             for a name, except the title. Some examples are: "Smith, John", "De Silva, A.B."
272             This is useful for sorting names alphabetically by surname.
273              
274             The method returns the entire reverse order cased name as text.
275              
276              
277             =head2 components
278              
279             %my_name = $name->components;
280             $cased_surname = $my_name{surname_1};
281              
282              
283             The C method does the same thing as the C method,
284             but returns the name cased components in a hash. The following keys are used
285             for each component:
286              
287             precursor
288             title_1
289             title_2
290             given_name_1
291             given_name_2
292             initials_1
293             initials_2
294             middle_name
295             conjunction_1
296             conjunction_2
297             surname_1
298             surname_2
299             suffix
300              
301             If a component has no matching data for a given name, it will not appear in the hash
302              
303             If the name could not be parsed, this method returns null. If you assign the return
304             value to a hash, you should check the error status returned by the C method first.
305             Ohterwise, you will get an odd number of values assigned to the hash.
306              
307              
308             =head2 case_surname
309              
310             $correct_casing = case_surname("DE SILVA-MACNAY" [,$lc_prefix]);
311              
312             C is a stand alone function that does not require a name
313             object. The input is a text string. An optional input argument controls the
314             casing rules for prefix portions of a surname, as described above in the
315             C section.
316              
317             The output is a string converted to the correct casing for surnames.
318             See C for user defined exceptions
319              
320             This function is useful when you know you are only dealing with names that
321             do not have initials like "Mr John Jones". It is much faster than the case_all
322             method, but does not understand context, and cannot detect errors on strings
323             that are not personal names.
324              
325              
326             =head2 surname_prefs.txt
327              
328             Some surnames can have more than one form of valid capitalisation, such as
329             MacQuarie or Macquarie. Where the user wants to specify one form as the default,
330             a text file called surname_prefs.txt should be created and placed in the same
331             location as the NameParse module. The text file should contain one surname per
332             line, in the capitalised form you want, such as
333              
334             Macquarie
335             MacHado
336              
337             NameParse will still operate if the file does not exist
338              
339             =head2 salutation
340              
341             $salutation = $name->salutation(salutation => 'Dear',sal_default => 'Friend',sal_type => 'given_name'));
342              
343             The C method converts a name into a personal greeting,
344             such as "Dear Mr & Mrs O'Brien" or "Dear Sue and John"
345              
346             Optional parameters may be specided in a hash as follows:
347              
348              
349             salutation:
350              
351             The greeting word such as 'Dear' or 'Greetings'. If not spefied than 'Dear' is used
352              
353             sal_default:
354              
355             The default word used when a personalised salution cannot be generated. If not
356             specified, than 'Friend' is used.
357              
358             sal_type:
359              
360             Can be either 'given_name' such as 'Dear Sue' or 'title_plus_name' such as 'Dear Ms Smith'
361             If not specified, than 'given_name' is used.
362              
363             If an error is detected during parsing, such as with the name "AB Smith & Associates",
364             then the value of sal_default is used instead of a given name, or a title and surname.
365             If the input string contains a conjunction, an 's' is added to the value of sal_default.
366              
367             If the name contains a precursor, a default salutation is produced.
368              
369             =head2 clean
370              
371             $good_name = clean("Bad Na9me");
372              
373             C is a stand alone function that does not require a name object.
374             The input is a text string and the output is the string with:
375              
376             all repeating spaces removed
377             all characters not in the set (A-Z a-z - ' , . &) removed
378              
379              
380             =head2 properties
381              
382             The C method returns all the properties of the name,
383             non_matching, number and type, as a hash.
384              
385             =over 4
386              
387             =item type
388              
389             The type of format a name is in, as one of the following strings:
390              
391             Mr_A_Smith_&Ms_B_Jones
392             Mr_&Ms_A_&B_Smith
393             Mr_A_&Ms_B_Smith
394             Mr_&Ms_A_Smith
395             Mr_A_&B_Smith
396             Mr_John_Adam_Smith
397             Mr_John_A_Smith
398             Mr_John_Smith
399             Mr_A_Smith
400             John_Adam_Smith
401             John_A_Smith
402             J_Adam_Smith
403             John_Smith
404             A_Smith
405             John
406             unknown
407              
408              
409             =item non_matching
410              
411             Returns any unmatched section that was found.
412              
413             =back
414              
415             =head2 report
416              
417             Create a formatted text report to standard output listing
418             - the input string,
419             - the name and value of each defined component
420             - any non matching component
421              
422              
423             =head1 LIMITATIONS
424              
425             The huge number of character combinations that can form a valid names makes
426             it is impossible to correctly identify them all. Firstly, there are many
427             ambiguities, which have no right answer.
428              
429             Macbeth or MacBeth, are both valid spellings
430             Is ED WOOD E.D. Wood or Edward Wood
431             Is 'Mr Rapid Print' a name or a company
432             Does John Bradfield Smith have a middle name of Bradfield, or a surname of Bradfield-Smith?
433              
434             One approach is to have large lookup files of names and words, statistical rules
435             and fuzzy logic to attempt to derive context. This approach gives high levels of
436             accuracy but uses a lot of your computers time and resources.
437              
438             NameParse takes the approach of using a limited set of rules, based on the
439             formats that are commonly used by business to represent peoples names. This
440             gives us fairly high accuracy, with acceptable speed and program size.
441              
442             NameParse will accept names from many countries, like Van Der Heiden,
443             De La Mare and Le Fontain. Having said that, it is still biased toward English,
444             because the precursors, titles and conjunctions are based on English usage.
445              
446             Names with two or more words, but no separating hyphen are not recognized.
447             This is a real quandary as Indian, Chinese and other names can have several
448             components. If these are allowed for, any component after the surname
449             will also be picked up. For example in "Mr AB Jones Trading As Jones Pty Ltd"
450             will return a surname of "Jones Trading".
451              
452             Because of the large combination of possible names defined in the grammar, the
453             program is not very fast, except for the more limited C subroutine.
454             See the "Future Directions" section for possible speed ups.
455              
456             As the parser has a very limited understanding of context, the "John_Adam_Smith"
457             name type is most likely to cause problems, as it contains no known tokens
458             like a title. A string such as "National Australia Bank" would be accepted
459             as a valid name, first name National etc. Supplying a list of common pronouns
460             as exceptions could solve this problem.
461              
462              
463             =head1 REFERENCES
464              
465             "The Wordsworth Dictionary of Abbreviations & Acronyms" (1997)
466              
467             Australian Standard AS4212-1994 "Geographic Information Systems -
468             Data Dictionary for transfer of street addressing information"
469              
470              
471             =head1 FUTURE DIRECTIONS
472              
473             Define grammar for other languages. Hopefully, all that would be needed is
474             to specify a new module with its own grammar, and inherit all the existing
475             methods. I don't have the knowledge of the naming conventions for non-english
476             languages.
477              
478             =head1 REPOSITORY
479              
480             L
481              
482              
483             =head1 SEE ALSO
484              
485             L, L, L,
486             L, L
487              
488              
489             =head1 BUGS
490              
491             Names with accented characters (acute, circumfelx etc) will not be parsed
492             correctly. A work around is to replace the character class [a-z] with \w
493             in the appropriate rules in the grammar tree, but this could lower the accuracy
494             of names based purely on ASCII text.
495              
496             =head1 CREDITS
497              
498             Thanks to all the people who provided ideas and suggestions, including -
499              
500             Damian Conway, author of Parse::RecDescent
501             Mark Summerfield author of Lingua::EN::NameCase,
502             Ron Savage, Alastair Adam Huffman, Douglas Wilson
503             Peter Schendzielorz
504              
505             =head1 AUTHOR
506              
507             NameParse was written by Kim Ryan
508              
509             =head1 COPYRIGHT AND LICENSE
510              
511             Copyright (c) 2016 Kim Ryan. All rights reserved.
512              
513             This library is free software; you can redistribute it and/or modify
514             it under the same terms as Perl itself.
515              
516             =cut
517             #-------------------------------------------------------------------------------
518              
519             package Lingua::EN::NameParse;
520              
521 2     2   19427 use strict;
  2         3  
  2         47  
522 2     2   6 use warnings;
  2         2  
  2         39  
523              
524 2     2   711 use Lingua::EN::NameParse::Grammar;
  2         9  
  2         42  
525 2     2   2012 use Parse::RecDescent;
  2         59747  
  2         14  
526              
527 2     2   76 use Exporter;
  2         2  
  2         65  
528 2     2   7 use vars qw (@ISA @EXPORT_OK);
  2         2  
  2         2459  
529              
530             our $VERSION = '1.36';
531             @ISA = qw(Exporter);
532             @EXPORT_OK = qw(clean case_surname);
533              
534             #-------------------------------------------------------------------------------
535             # Create a new instance of a name parsing object. This step is time consuming
536             # and should normally only be called once in your program.
537              
538             sub new
539             {
540 2     2 1 15 my $class = shift;
541 2         5 my %args = @_;
542              
543 2         3 my $name = {};
544 2         3 bless($name,$class);
545              
546             # Default to 2 initials per name. Can be overwritten if user defines
547             # 'initials' as a key in the hash supplied to new method.
548 2         12 $name->{initials} = 2;
549              
550 2         14 my $current_key;
551 2         7 foreach my $current_key (keys %args)
552             {
553 8         11 $name->{$current_key} = $args{$current_key};
554             }
555              
556 2         7 my $grammar = Lingua::EN::NameParse::Grammar::_create($name);
557 2         15 $name->{parse} = new Parse::RecDescent($grammar);
558              
559 2         348263 return ($name);
560             }
561             #-------------------------------------------------------------------------------
562             # Attempt to parse a string and retrieve it's components and properties
563             # Requires a name object to have been created with the 'new' method'
564             # Returns: an error code, 0 for success, 1 for failure
565              
566             sub parse
567             {
568 28     28 1 4271 my $name = shift;
569 28         34 my ($input_string) = @_;
570              
571 28         36 chomp($input_string);
572              
573             # If reverse ordered names are allowed, swap the surname component, before
574             # the comma, with the rest of the name. Rejoin the name, replacing comma
575             # with a space.
576              
577 28 100 100     107 if ( $name->{allow_reversed} and $input_string =~ /,/ )
578             {
579 1         4 my ($first,$second) = split(/,/,$input_string);
580 1         4 $input_string = join(' ',$second,$first);
581             }
582              
583 28         38 $name->{comps} = ();
584 28         92 $name->{properties} = ();
585 28         63 $name->{properties}{type} = 'unknown';
586 28         32 $name->{error} = 0;
587 28         33 $name->{error_desc} = '';
588 28         78 $name->{warning} = 0;
589 28         32 $name->{warning_desc} = '';
590              
591 28         28 $name->{original_input} = $input_string;
592 28         30 $name->{input_string} = $input_string;
593              
594 28         49 $name = _pre_parse($name);
595 28 50       56 unless ( $name->{error} )
596             {
597 28 100       43 if ( $name->{auto_clean} )
598             {
599 9         17 $name->{input_string} = clean($name->{input_string});
600             }
601 28         41 $name = _assemble($name);
602 28         44 _validate($name);
603             }
604              
605 28         60 return($name->{error});
606             }
607             #-------------------------------------------------------------------------------
608             # Clean the input string. Can be called as a stand alone function.
609              
610             sub clean
611             {
612 10     10 1 528 my ($input_string) = @_;
613              
614             # remove illegal characters
615 10         19 $input_string =~ s/[^A-Za-z\-\'\.&\/ ]//go;
616              
617             # remove repeating spaces
618 10         16 $input_string =~ s/ +/ /go ;
619              
620             # remove any remaining leading or trailing space
621 10         14 $input_string =~ s/^ //;
622              
623 10         18 return($input_string);
624             }
625              
626             #-------------------------------------------------------------------------------
627             # Given a name object, apply correct capitalisation to each component of a person's name.
628             # Return all cased components in a hash.
629             # Else return no value.
630              
631              
632             sub components
633             {
634 35     35 1 32 my $name = shift;
635              
636 35 50       76 if ( $name->{properties}{type} eq 'unknown' )
637             {
638 0         0 return;
639             }
640             else
641             {
642 35         29 my %orig_components = %{ $name->{comps} };
  35         190  
643              
644 35         48 my ($current_key,%cased_components);
645 35         78 foreach $current_key ( keys %orig_components )
646             {
647 490         329 my $cased_value;
648 490 100       930 if ( $current_key =~ /initials/ ) # initials_1, possibly initials_2
    100          
    50          
649             {
650 70         80 $cased_value = uc($orig_components{$current_key});
651             }
652             elsif ( $current_key =~ /surname|suffix/ )
653             {
654 105         191 $cased_value = case_surname($orig_components{$current_key},$name->{lc_prefix});
655             }
656             elsif ( $current_key eq 'type')
657             {
658 0         0 $cased_value = $orig_components{$current_key};
659             }
660             else
661             {
662 315         316 $cased_value = _case_word($orig_components{$current_key});
663             }
664              
665 490         559 $cased_components{$current_key} = $cased_value;
666             }
667 35         333 return(%cased_components);
668             }
669             }
670              
671             #-------------------------------------------------------------------------------
672             # Hash of of lists, indicating the order that name components are assembled in.
673             # Each list element is itself the name of the key value in a name object.
674             # Used by the case_all and case_all_reversed methods.
675             # These hashes are created here globally, as quite a large overhead is
676             # imposed if the are created locally, each time the method is invoked
677              
678             my %component_order=
679             (
680             'Mr_John_Smith_&_Ms_Mary_Jones' => ['title_1','given_name_1','surname_1','conjunction_1','title_2','given_name_2','surname_2'],
681             'Mr_A_Smith_&_Ms_B_Jones' => ['title_1','initials_1','surname_1','conjunction_1','title_2','initials_2','surname_2'],
682             'Mr_&_Ms_A_&_B_Smith' => ['title_1','conjunction_1','title_2','initials_1','conjunction_2','initials_2','surname_1'],
683             'Mr_A_&_Ms_B_Smith' => ['title_1','initials_1','conjunction_1','title_2','initials_2','surname_1'],
684             'Mr_&_Ms_A_Smith' => ['title_1','conjunction_1','title_2','initials_1','surname_1'],
685             'Mr_A_&_B_Smith' => ['title_1','initials_1','conjunction_1','initials_2','surname_1'],
686             'John_Smith_&Mary_Jones' => ['given_name_1','surname_1','conjunction_1','given_name_2','surname_2'],
687             'John_&_Mary_Smith' => ['given_name_1','conjunction_1','given_name_2','surname_1'],
688             'A_Smith_&_B_Jones' => ['initials_1','surname_1','conjunction_1','initials_2','surname_2'],
689              
690             'Mr_John_Adam_Smith' => ['precursor','title_1','given_name_1','middle_name','surname_1','suffix'],
691             'Mr_John_A_Smith' => ['precursor','title_1','given_name_1','initials_1','surname_1','suffix'],
692             'Mr_John_Smith' => ['precursor','title_1','given_name_1','surname_1','suffix'],
693             'Mr_A_Smith' => ['precursor','title_1','initials_1','surname_1','suffix'],
694             'John_Adam_Smith' => ['precursor','given_name_1','middle_name','surname_1','suffix'],
695             'John_A_Smith' => ['precursor','given_name_1','initials_1','surname_1','suffix'],
696             'J_Adam_Smith' => ['precursor','initials_1','middle_name','surname_1','suffix'],
697             'John_Smith' => ['precursor','given_name_1','surname_1','suffix'],
698             'A_Smith' => ['precursor','initials_1','surname_1','suffix'],
699             'John' => ['given_name_1']
700             );
701              
702              
703             # only include names with a single surname
704             my %reverse_component_order=
705             (
706             'Mr_&_Ms_A_&_B_Smith' => ['surname_1','title_1','conjunction_1','title_2','initials_1','conjunction_1','initials_2'],
707             'Mr_A_&_Ms_B_Smith' => ['surname_1','title_1','initials_1','conjunction_1','title_2','initials_2'],
708             'Mr_&_Ms_A_Smith' => ['surname_1','title_1','title_1','conjunction_1','title_2','initials_1'],
709             'Mr_A_&_B_Smith' => ['surname_1','title_1','initials_1','conjunction_1','initials_2'],
710             'John_&_Mary_Smith' => ['surname_1','given_name_1','conjunction_1','given_name_2'],
711              
712             'Mr_John_Adam_Smith' => ['surname_1','title_1','given_name_1','middle_name','suffix'],
713             'Mr_John_A_Smith' => ['surname_1','title_1','given_name_1','initials_1','suffix'],
714             'Mr_John_Smith' => ['surname_1','title_1','given_name_1','suffix'],
715             'Mr_A_Smith' => ['surname_1','title_1','initials_1','suffix'],
716             'John_Adam_Smith' => ['surname_1','given_name_1','middle_name','suffix'],
717             'John_A_Smith' => ['surname_1','given_name_1','initials_1','suffix'],
718             'J_Adam_Smith' => ['surname_1','initials_1','middle_name','suffix'],
719             'John_Smith' => ['surname_1','given_name_1','suffix'],
720             'A_Smith' => ['surname_1','initials_1','suffix']
721             );
722              
723             #-------------------------------------------------------------------------------
724             # Apply correct capitalisation to a person's entire name
725             # If the name type is unknown, return undef
726             # Else, return a string of all cased components in correct order
727              
728             sub case_all
729             {
730 2     2 1 6 my $name = shift;
731              
732 2         4 my @cased_name;
733              
734 2 50       5 if ( $name->{properties}{type} eq 'unknown' )
735             {
736 0         0 return undef;
737             }
738              
739 2 50       9 unless ( $component_order{$name->{properties}{type}} )
740             {
741             # component order missing in array defined above
742 0         0 warn "Component order not defined for: $name->{properties}{type}";
743 0         0 return;
744             }
745              
746 2         4 my %component_vals = $name->components;
747 2         3 my @order = @{ $component_order{$name->{properties}{type}} };
  2         8  
748              
749 2         5 foreach my $component_key ( @order )
750             {
751             # As some components such as precursors are optional, they will appear
752             # in the order array but may or may not have have a value, so only
753             # process defined values
754 10 100       18 if ( $component_vals{$component_key} )
755             {
756 6         8 push(@cased_name,$component_vals{$component_key});
757             }
758             }
759 2 100       5 if ( $name->{comps}{non_matching} )
760             {
761             # Despite errors, try to name case non-matching section. As the format
762             # of this section is unknown, surname case will provide the best
763             # approximation, but still fail on initials of more than 1 letter
764 1         4 push(@cased_name,case_surname($name->{comps}{non_matching},$name->{lc_prefix}));
765             }
766              
767 2         17 return(join(' ',@cased_name));
768             }
769              
770             #-------------------------------------------------------------------------------
771             =head1 case_all_reversed
772              
773             Apply correct capitalisation to a person's entire name and reverse the order
774             so that surname is first, followed by the other components, such as: Smith, Mr John A
775             Useful for creating a list of names that can be sorted by surname.
776              
777             If name type is unknown , returns null
778              
779             If the name type has a joint name, such as 'Mr_A_Smith_Ms_B_Jones', return null,
780             as it is ambiguous which surname to place at the start of the string
781              
782             Else, returns a string of all cased components in correct reversed order
783              
784             =cut
785              
786             sub case_all_reversed
787             {
788 0     0 1 0 my $name = shift;
789              
790 0         0 my @cased_name_reversed;
791              
792 0 0       0 unless ( $name->{properties}{type} eq 'unknown' )
793             {
794 0 0       0 unless ( $reverse_component_order{$name->{properties}{type} } )
795             {
796             # this type of name should not be reversed, such as two surnames
797 0         0 return;
798             }
799 0         0 my %component_vals = $name->components;
800 0         0 my @reverse_order = @{ $reverse_component_order{$name->{properties}{type} } };
  0         0  
801              
802 0         0 foreach my $component_key ( @reverse_order )
803             {
804             # As some components such as precursors are optional, they will appear
805             # in the order array but may or may not have have a value, so only
806             # process defined values
807              
808 0         0 my $component_value = $component_vals{$component_key};
809 0 0       0 if ( $component_value )
810             {
811 0 0       0 if ($component_key eq 'surname_1')
812             {
813 0         0 $component_value .= ',';
814             }
815 0         0 push(@cased_name_reversed,$component_value);
816             }
817             }
818             }
819 0         0 return(join(' ',@cased_name_reversed));
820             }
821             #-------------------------------------------------------------------------------
822             # The user may specify their own preferred spelling for surnames.
823             # These should be placed in a text file called surname_prefs.txt
824             # in the same location as the module itself.
825              
826             BEGIN
827             {
828             # Obtain the full path to NameParse module, defined in the %INC hash.
829 2     2   5 my $prefs_file_location = $INC{"Lingua/EN/NameParse.pm"};
830             # Now substitute the name of the preferences file
831 2         17 $prefs_file_location =~ s/NameParse\.pm$/surname_prefs.txt/;
832              
833 2 50       2989 if ( open(PREFERENCES_FH,"<$prefs_file_location") )
834             {
835 0         0 my @surnames = ;
836 0         0 foreach my $name ( @surnames )
837             {
838 0         0 chomp($name);
839             # Build hash, lower case name is key for case insensitive
840             # comparison, while value holds the actual capitalisation
841 0         0 $Lingua::EN::surname_preferences{lc($name)} = $name;
842             }
843 0         0 close(PREFERENCES_FH);
844             }
845             }
846             #-------------------------------------------------------------------------------
847             # Apply correct capitalisation to a person's surname. Can be called as a
848             # stand alone function.
849              
850             sub case_surname
851             {
852 108     108 1 366 my ($surname,$lc_prefix) = @_;
853              
854 108 100       159 unless ($surname)
855             {
856 65         94 return('');
857             }
858              
859             # If the user has specified a preferred capitalisation for this
860             # surname in the surname_prefs.txt, it should be returned now.
861 43 50       91 if ($Lingua::EN::surname_preferences{lc($surname)} )
862             {
863 0         0 return($Lingua::EN::surname_preferences{lc($surname)});
864             }
865              
866             # Lowercase everything
867 43         40 $surname = lc($surname);
868              
869             # Now uppercase first letter of every word. By checking on word boundaries,
870             # we will account for apostrophes (D'Angelo) and hyphenated names
871 43         209 $surname =~ s/\b(\w)/\u$1/g;
872              
873             # Name case Macs and Mcs
874             # Exclude names with 1-2 letters after prefix like Mack, Macky, Mace
875             # Exclude names ending in a,c,i,o,z or j, typically Polish or Italian
876              
877 43 100       134 if ( $surname =~ /\bMac[a-z]{2,}[^a|c|i|o|z|j]\b/i )
    50          
878             {
879 4         15 $surname =~ s/\b(Mac)([a-z]+)/$1\u$2/ig;
880              
881             # Now correct for "Mac" exceptions
882 4         7 $surname =~ s/MacHin/Machin/;
883 4         9 $surname =~ s/MacHlin/Machlin/;
884 4         3 $surname =~ s/MacHar/Machar/;
885 4         7 $surname =~ s/MacKle/Mackle/;
886 4         4 $surname =~ s/MacKlin/Macklin/;
887 4         3 $surname =~ s/MacKie/Mackie/;
888              
889             # Portuguese
890 4         4 $surname =~ s/MacHado/Machado/;
891              
892             # Lithuanian
893 4         4 $surname =~ s/MacEvicius/Macevicius/;
894 4         3 $surname =~ s/MacIulis/Maciulis/;
895 4         4 $surname =~ s/MacIas/Macias/;
896             }
897             elsif ( $surname =~ /\bMc/i )
898             {
899 0         0 $surname =~ s/\b(Mc)([a-z]+)/$1\u$2/ig;
900             }
901             # Exceptions (only 'Mac' name ending in 'o' ?)
902 43         38 $surname =~ s/Macmurdo/MacMurdo/;
903              
904              
905 43 100       61 if ( $lc_prefix )
906             {
907             # Lowercase first letter of every word in prefix. The trailing space
908             # prevents the surname from being altered. Note that spellings like
909             # d'Angelo are not accounted for.
910 1         10 $surname =~ s/\b(\w+ )/\l$1/g;
911             }
912              
913             # Correct for possessives such as "John's" or "Australia's". Although this
914             # should not occur in a person's name, they are valid for proper names.
915             # As this subroutine may be used to capitalise words other than names,
916             # we may need to account for this case. Note that the 's' must be at the
917             # end of the string
918 43         63 $surname =~ s/(\w+)'S(\s+)/$1's$2/;
919 43         39 $surname =~ s/(\w+)'S$/$1's/;
920              
921             # Correct for roman numerals, excluding single letter cases I,V and X,
922             # which will work with the above code
923 43         75 $surname =~ s/\b(I{2,3})\b/\U$1/i; # 2nd, 3rd
924 43         46 $surname =~ s/\b(IV)\b/\U$1/i; # 4th
925 43         42 $surname =~ s/\b(VI{1,3})\b/\U$1/i; # 6th, 7th, 8th
926 43         46 $surname =~ s/\b(IX)\b/\U$1/i; # 9th
927 43         44 $surname =~ s/\b(XI{1,3})\b/\U$1/i; # 11th, 12th, 13th
928              
929 43         70 return($surname);
930             }
931             #-------------------------------------------------------------------------------
932             # Create a personalised greeting from one or two person's names
933             # Returns the salutation as a string, such as "Dear Mr Smith", or "Dear Sue"
934              
935             sub salutation
936             {
937 3     3 1 4 my $name = shift;
938 3         4 my %args = @_;
939              
940 3         5 my $salutation = 'Dear';
941 3         26 my $sal_default = 'Friend';
942 3         4 my $sal_type = 'title_plus_surname';
943              
944             # Check to see if we should override defualts with any user specified preferences
945 3 100       6 if ( %args )
946             {
947 2         4 foreach my $current_key (keys %args)
948             {
949 2 50       5 $current_key eq 'salutation' and $salutation = $args{$current_key};
950 2 50       5 $current_key eq 'sal_default' and $sal_default = $args{$current_key};
951 2 50       7 $current_key eq 'sal_type' and $sal_type = $args{$current_key};
952             }
953             }
954              
955              
956 3         4 my @greeting;
957 3         4 push(@greeting,$salutation);
958              
959             # Personalised salutations cannot be created for Estates or people
960             # without some title
961 3 50 33     13 if
      33        
962             (
963             $name->{error} or
964             ( $name->{comps}{precursor} and $name->{comps}{precursor} =~ /ESTATE/)
965             )
966             {
967             # Despite an error, the presence of a conjunction probably
968             # means we are dealing with 2 or more people.
969             # For example Mr AB Smith & John Jones
970 0 0       0 if ( $name->{input_string} =~ / (AND|&) / )
971             {
972 0         0 $sal_default .= 's';
973             }
974 0         0 push(@greeting,$sal_default);
975             }
976             else
977             {
978 3         7 my %component_vals = $name->components;
979              
980 3 100       14 if ( $sal_type eq 'given_name')
    50          
981             {
982 1 50       4 if ( $component_vals{'given_name_1'} )
983             {
984 1         2 push(@greeting,$component_vals{'given_name_1'});
985 1 50       5 if ( $component_vals{'given_name_2'} )
986             {
987 0         0 push(@greeting,$component_vals{'conjunction_1'});
988 0         0 push(@greeting,$component_vals{'given_name_2'});
989             }
990             }
991             else
992             {
993             # No given name such as 'A_Smith','J_Adam_Smith','Mr_A_Smith'
994             # Must use default
995 0         0 push(@greeting,$sal_default);
996             }
997             }
998             elsif ( $sal_type eq 'title_plus_surname' )
999             {
1000 2 50       6 if ( $name->{properties}{number} == 1 )
    0          
1001             {
1002 2 100       5 if ( $component_vals{'title_1'} )
1003             {
1004 1         2 push(@greeting,$component_vals{'title_1'});
1005 1         3 push(@greeting,$component_vals{'surname_1'});
1006             }
1007             else
1008             {
1009             # No title such as 'A_Smith','J_Adam_Smith', so must use default
1010 1         3 push(@greeting,$sal_default);
1011             }
1012             }
1013             elsif ( $name->{properties}{number} == 2 )
1014             {
1015             # a joint name
1016              
1017 0         0 my $type = $name->{properties}{type};
1018 0 0 0     0 if ( $type eq 'Mr_&Ms_A_Smith' or $type eq 'Mr_A_&Ms_B_Smith' or $type eq 'Mr_&Ms_A_&B_Smith' )
    0 0        
      0        
1019             {
1020             # common surname
1021 0         0 push(@greeting,$component_vals{'title_1'});
1022 0         0 push(@greeting,$component_vals{'conjunction_1'});
1023 0         0 push(@greeting,$component_vals{'title_2'});
1024 0         0 push(@greeting,$component_vals{'surname_1'});
1025              
1026             }
1027             elsif ( $type eq 'Mr_A_Smith_&Ms_B_Jones' or $type eq 'Mr_John_Smith_&Ms_Mary_Jones' )
1028             {
1029 0         0 push(@greeting,$component_vals{'title_1'});
1030 0         0 push(@greeting,$component_vals{'surname_1'});
1031 0         0 push(@greeting,$component_vals{'conjunction_1'});
1032 0         0 push(@greeting,$component_vals{'title_2'});
1033 0         0 push(@greeting,$component_vals{'surname_2'});
1034             }
1035             else
1036             {
1037             # No title such as A_Smith_&B_Jones', 'John_Smith_&Mary_Jones'
1038             # Must use default
1039 0         0 push(@greeting,$sal_default);
1040             }
1041             }
1042             }
1043             else
1044             {
1045 0         0 warn "Invalid sal_type : ", $sal_type;
1046 0         0 push(@greeting,$sal_default);
1047             }
1048             }
1049 3         19 return(join(' ',@greeting));
1050             }
1051             #-------------------------------------------------------------------------------
1052             # Return all name properties as a hash
1053              
1054             sub properties
1055             {
1056 21     21 1 69 my $name = shift;
1057 21         15 return(%{ $name->{properties} });
  21         89  
1058             }
1059              
1060             #-------------------------------------------------------------------------------
1061             # Create a text report to standard output listing
1062             # - the input string,
1063             # - the name of each defined component, if it exists
1064             # - any non matching component
1065              
1066             sub report
1067             {
1068 0     0 1 0 my $name = shift;
1069            
1070 0         0 my %props = $name->properties;
1071            
1072 0         0 my $fmt = "%-20.20s : %s\n";
1073              
1074 0         0 printf($fmt,"Original Input",$name->{original_input});
1075 0         0 printf($fmt,"Cleaned Input",$name->{input_string});
1076 0         0 printf($fmt,"Case all",$name->case_all);
1077 0         0 printf($fmt,"Case all reversed",$name->case_all_reversed);
1078 0         0 printf($fmt,"Salutation",$name->salutation(salutation => 'Dear',sal_default => 'Friend', sal_type => 'title_plus_surname'));
1079 0         0 printf($fmt,"Type", $props{type});
1080 0         0 printf($fmt,"Number", $props{number});
1081 0         0 printf($fmt,"Parsing Error", $name->{error});
1082 0         0 printf($fmt,"Error description : ", $name->{error_desc});
1083 0         0 printf($fmt,"Parsing Warning", $name->{warning});
1084 0         0 printf($fmt,"Warning description", $name->{warning_desc});
1085            
1086            
1087 0 0       0 unless ($props{type} eq 'unknown')
1088             {
1089 0         0 my %comps = $name->components;
1090 0 0       0 if ( %comps )
1091             {
1092 0         0 print("\nCOMPONENTS\n");
1093 0         0 foreach my $value ( sort keys %comps)
1094             {
1095 0 0 0     0 if ($value and $comps{$value})
1096             {
1097 0         0 printf($fmt,$value,$comps{$value});
1098             }
1099             }
1100             }
1101             }
1102             }
1103             #-------------------------------------------------------------------------------
1104              
1105             # PRIVATE METHODS
1106              
1107             #-------------------------------------------------------------------------------
1108              
1109             sub _pre_parse
1110             {
1111 28     28   28 my $name = shift;
1112            
1113             # strip all full stops
1114 28         50 $name->{input_string} =~ s/\.//g;
1115            
1116             # Fold all text to upper case, as these are used in all regular expressions withun thr grammar tree
1117 28         39 $name->{input_string} = uc($name->{input_string});
1118            
1119             # Check that common reserved word (as found in company names) do not appear
1120 28 50       197 if ( $name->{input_string} =~
1121             /\BPTY LTD$|\BLTD$|\BPLC$|ASSOCIATION|DEPARTMENT|NATIONAL|SOCIETY/ )
1122             {
1123 0         0 $name->{error} = 1;
1124 0         0 $name->{comps}{non_matching} = $name->{input_string};
1125 0         0 $name->{error_desc} = 'Reserved words found in name';
1126             }
1127              
1128             # For the case of a single name such as 'Voltaire' we need to add a trailing space
1129             # to the input string. This is because the grammar tree expects a terminator (the space)
1130             # optionally followed by other productions or non matching text
1131 28         33 $name->{input_string} .= ' ';
1132 28 50       148 if ( $name->{input_string} =~ /^[A-Z]{2,}(\-)?[A-Z]{0,}$/ )
1133             {
1134 0         0 $name->{input_string} .= ' ';
1135             }
1136 28         40 return($name);
1137              
1138             }
1139             #-------------------------------------------------------------------------------
1140             # Initialise all components to empty string. Assemble hashes of components
1141             # and properties as part of the name object
1142             #
1143             sub _assemble
1144             {
1145 28     28   22 my $name = shift;
1146              
1147             # Use Parse::RecDescent to do the parsing. 'full_name' is a label for the complete grammar tree
1148             # defined in Lingua::EN::NameParse::Grammar
1149 28         163 my $parsed_name = $name->{parse}->full_name($name->{input_string});
1150            
1151             # Place components into a separate hash, so they can be easily returned
1152             # for the user to inspect and modify.
1153            
1154 28         171857 my @all_comps = qw(precursor title_1 given_name_1 initials_1 middle_name surname_1 conjunction_1
1155             title_2 given_name_2 initials_2 surname_2 conjunction_2 suffix non_matching);
1156            
1157 28         48 foreach my $comp (@all_comps)
1158             {
1159             # set all components to empty string, as any of them could be accessed, even if they don't exist
1160 392         380 $name->{comps}{$comp} = '';
1161 392 100       516 if (defined($parsed_name->{$comp}))
1162             {
1163             # Copy over existing components.
1164 135         156 $name->{comps}{$comp} = _trim_space($parsed_name->{$comp});
1165             }
1166             }
1167              
1168 28         41 $name->{properties}{number} = 0;
1169 28         31 $name->{properties}{number} = $parsed_name->{number};
1170 28         35 $name->{properties}{type} = $parsed_name->{type};
1171              
1172 28         73 return($name);
1173             }
1174             #-------------------------------------------------------------------------------
1175             # For correct matching, the grammar of each component must include the trailing space that separates it
1176             # from any following word. This should now be removed from the components, and will be restored by the
1177             # case_all and salutation methods, if called.
1178              
1179             sub _trim_space
1180             {
1181 135     135   112 my ($string) = @_;
1182 135 100       157 if ($string)
1183             {
1184 109         213 $string =~ s/ $//;
1185             }
1186 135         191 return($string);
1187             }
1188             #-------------------------------------------------------------------------------
1189             # Check if any name components have illegal characters, or do not have the
1190             # correct syntax for a valid name.
1191              
1192              
1193             sub _validate
1194             {
1195 28     28   26 my $name = shift;
1196 28         67 my %comps = $name->components;
1197              
1198 28 100       112 if ( $comps{non_matching} )
    50          
1199             {
1200 2         3 $name->{warning} = 1;
1201 2         7 $name->{warning_desc} .= ";non_matching text found : $comps{non_matching}";
1202             }
1203             elsif ( $name->{input_string} =~ /[^A-Za-z\-\'\.,&\/ ]/ )
1204             {
1205             # illegal characters found
1206 0         0 $name->{error} = 1;
1207 0         0 $name->{error_desc} = 'illegal characters found';
1208             }
1209            
1210              
1211 28 50       44 if ( not _valid_name($comps{given_name_1}) )
    50          
    50          
    50          
1212             {
1213 0         0 $name->{warning} = 1;
1214 0         0 $name->{warning_desc} .= ";no vowel sound in given_name_1 : $comps{given_name_1}";
1215             }
1216             elsif ( not _valid_name($comps{middle_name}) )
1217             {
1218 0         0 $name->{warning} = 1;
1219 0         0 $name->{warning_desc} .= ";no vowel sound in middle_name : $comps{middle_name}";
1220             }
1221              
1222             elsif ( not _valid_name($comps{surname_1}) )
1223             {
1224 0         0 $name->{warning} = 1;
1225 0         0 $name->{warning_desc} .= ";no vowel sound in surname_1 : $comps{surname_1}";
1226              
1227             }
1228             elsif ( not _valid_name($comps{surname_2}) )
1229             {
1230 0         0 $name->{warning} = 1;
1231 0         0 $name->{warning_desc} .= ";no vowel sound in surname_2 : $comps{surname_2}";
1232             }
1233             }
1234             #-------------------------------------------------------------------------------
1235             # If the name has an assigned value, check that it contains a vowel sound,
1236             # or matches the exceptions to this rule.
1237             # Returns 1 if name is valid, otherwise 0
1238              
1239             sub _valid_name
1240             {
1241 112     112   86 my ($name) = @_;
1242 112 100 33     303 if ( not $name )
    50          
1243             {
1244 65         150 return(1);
1245             }
1246             # Names should have a vowel sound,
1247             # valid exceptions are Ng, Tsz,Md, Cng,Hng,Chng etc
1248             elsif ( $name and $name =~ /[AEIOUYJ]|^(NG|TSZ|MD|(C?H|[PTS])NG)$/i )
1249             {
1250 47         106 return(1);
1251             }
1252             else
1253             {
1254 0         0 return(0);
1255             }
1256             }
1257             #-------------------------------------------------------------------------------
1258             # Upper case first letter, lower case the rest, for all words in string
1259             sub _case_word
1260             {
1261 315     315   229 my ($word) = @_;
1262              
1263 315 100       349 if ($word)
1264             {
1265 67         262 $word =~ s/(\w+)/\u\L$1/g;
1266             }
1267            
1268 315         309 return($word);
1269             }
1270             #-------------------------------------------------------------------------------
1271             return(1);