File Coverage

blib/lib/Lingua/EN/NameParse.pm
Criterion Covered Total %
statement 177 262 67.5
branch 58 106 54.7
condition 6 24 25.0
subroutine 21 23 91.3
pod 10 10 100.0
total 272 425 64.0


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             Lingua::EN::NameParse - extract the components of a person or couples full name, from free form text
4              
5             =head1 SYNOPSIS
6              
7             use Lingua::EN::NameParse qw(clean case_surname);
8              
9             # optional configuration arguments
10             my %args =
11             (
12             auto_clean => 1,
13             lc_prefix => 1,
14             initials => 3,
15             allow_reversed => 1,
16             joint_names => 0,
17             extended_titles => 0
18             );
19              
20             my $name = Lingua::EN::NameParse->new(%args);
21              
22             $error = $name->parse("Estate Of Lt Col AB Van Der Heiden (Hold Mail)");
23             unless ( $error )
24             {
25             print($name->report);
26            
27             Case all : Estate Of Lt Col AB Van Der Heiden (Hold Mail)
28             Case all reversed : Van Der Heiden, Lt Col AB
29             Salutation : Dear Friend
30             Type : Mr_A_Smith
31             Parsing Error : 0
32             Error description : :
33             Parsing Warning : 1
34             Warning description : ;non_matching text found : (Hold Mail)
35            
36             COMPONENTS
37             initials_1 : AB
38             non_matching : (Hold Mail)
39             precursor : Estate Of
40             surname_1 : Van Der Heiden
41             title_1 : Lt Col
42            
43             %name_comps = $name->components;
44             $surname = $name_comps{surname_1};
45              
46             $correct_casing = $name->case_all;
47              
48             $correct_casing = $name->case_all_reversed ;
49              
50             $salutation = $name->salutation(salutation => 'Dear',sal_default => 'Friend'));
51            
52             $good_name = clean("Bad Na9me "); # "Bad Name"
53            
54             %my_properties = $name->properties;
55             $number_surnames = $my_properties{number}; # 1
56             }
57            
58              
59             $lc_prefix = 0;
60             $correct_case = case_surname("DE SILVA-MACNAY",$lc_prefix); # A stand alone function, returns: De Silva-MacNay
61            
62             $error = $name->parse("MR AS & D.E. DE LA MARE");
63             %my_properties = $name->properties;
64             $number_surnames = $my_properties{number}; # 2
65            
66              
67             =head1 DESCRIPTION
68              
69              
70             This module takes as input one person's name or a couples name in
71             free format text such as,
72              
73             Mr AB & M/s CD MacNay-Smith
74             MR J.L. D'ANGELO
75             Estate Of The Late Lieutenant Colonel AB Van Der Heiden
76              
77             and attempts to parse it. If successful, the name is broken
78             down into components and useful functions can be performed such as :
79              
80             converting upper or lower case values to name case (Mr AB MacNay )
81             creating a personalised greeting or salutation (Dear Mr MacNay )
82             extracting the names individual components (Mr,AB,MacNay )
83             determining the type of format the name is in (Mr_A_Smith )
84              
85              
86             If the name(s) cannot be parsed you have the option of cleaning the name(s)
87             of bad characters, or extracting any portion that was parsed and the
88             portion that failed.
89              
90             This module can be used for analysing and improving the quality of
91             lists of names.
92              
93              
94             =head1 DEFINITIONS
95              
96             The following terms are used by NameParse to define the components
97             that can make up a name.
98              
99             Precursor - Estate of (The Late), Right Honourable ...
100             Title - Mr, Mrs, Ms., Sir, Dr, Major, Reverend ...
101             Conjunction - word to separate two names, such as "And" or &
102             Initials - 1-3 letters, each with an optional space and/or dot
103             Surname - De Silva, Van Der Heiden, MacNay-Smith, O'Reilly ...
104             Suffix - Snr., Jnr, III, V ...
105              
106             Refer to the component grammar defined within the code for a complete
107             list of combinations.
108              
109             'Name casing' refers to the correct use of upper and lower case letters
110             in peoples names, such as Mr AB McNay.
111              
112             To describe the formats supported by NameParse, a short hand representation
113             of the name is used. The following formats are currently supported :
114              
115             Mr_John_Smith_&_Ms_Mary_Jones
116             Mr_A_Smith_&_Ms_B_Jones
117             Mr_&Ms_A_&_B_Smith
118             Mr_A_&_Ms_B_Smith
119             Mr_&_Ms_A_Smith
120             Mr_A_&_B_Smith
121             John_Smith_&_Mary_Jones
122             John_&_Mary_Smith
123             A_Smith_&_B_Jones
124              
125             Mr_John_Adam_Smith
126             Mr_John_A_Smith
127             Mr_John_Smith
128             Mr_A_Smith
129             John_Adam_Smith
130             John_A_Smith
131             J_Adam_Smith
132             John_Smith
133             A_Smith
134             John
135              
136             Precursors and suffixes may be applied to single names that have a surname
137              
138              
139             =head1 METHODS
140              
141             =head2 new
142              
143             The C method creates an instance of a name object and sets up
144             the grammar used to parse names. This must be called before any of the
145             following methods are invoked. Note that the object only needs to be
146             created ONCE, and should be reused with new input data. Calling C
147             repeatedly will significantly slow your program down.
148              
149             Various setup options may be defined in a hash that is passed as an optional
150             argument to the C method. Note that all the arguments are optional. You
151             need to define the combination of arguments that are appropriate for your
152             usage.
153              
154             my %args =
155             (
156             auto_clean => 1,
157             lc_prefix => 1,
158             initials => 3,
159             allow_reversed => 1
160             );
161              
162              
163             my $name = Lingua::EN::NameParse->new(%args);
164              
165              
166             =over 4
167              
168             =item auto_clean
169              
170             When this option is set to a positive value, any call to the C method
171             that fails will attempt to 'clean' the name and then reparse it. See the
172             C method for details. This is useful for dirty data with embedded
173             unprintable or non alphabetic characters.
174              
175             =item lc_prefix
176              
177             When this option is set to a positive value, it will force the C
178             and C methods to lower case the first letter of each word that
179             occurs in the prefix portion of a surname. For example, Mr AB de Silva,
180             or Ms AS von der Heiden.
181              
182             =item initials
183              
184             Allows the user to control the number of letters that can occur in the initials.
185             Valid settings are 1,2 or 3. If no value is supplied a default of 2 is used.
186              
187             =item allow_reversed
188              
189             When this option is set to a positive value, names in reverse order will be
190             processed. The only valid format is the surname followed by a comma and the
191             rest of the name, which can be in any of the combinations allowed by non
192             reversed names. Some examples are:
193              
194             Smith, Mr AB
195             Jones, Jim
196             De Silva, Professor A.B.
197              
198             The program changes the order of the name back to the non reversed format, and
199             then performs the normal parsing. Note that if the name can be parsed, the fact
200             that it's order was originally reversed, is not recorded as a property of the
201             name object.
202              
203             =item joint_names
204              
205             When this option is set to a positive value, joint names are accounted for:
206              
207             Mr_A_Smith_&Ms_B_Jones
208             Mr_&Ms_A_&B_Smith
209             Mr_A_&Ms_B_Smith
210             Mr_&Ms_A_Smith
211             Mr_A_&B_Smith
212              
213             Note that if this option is not specified, than by default joint names are
214             ignored. Disabling joint names speeds up the processing a lot.
215              
216             =item extended_titles
217              
218             When this option is set to a positive value, all combinations of titles,
219             such as Colonel, Mother Superior are used. If this value is not set, only
220             the following titles are accounted for:
221              
222             Mr
223             Ms
224             M/s
225             Mrs
226             Miss
227             Dr
228             Sir
229             Dame
230              
231              
232             Note that if this option is not specified, than by default extended titles
233             are ignored. Disabling extended titles speeds up the parsing.
234              
235             =back
236              
237             =head2 parse
238              
239             $error = $name->parse("MR AC DE SILVA");
240              
241             The C method takes a single parameter of a text string containing a
242             name. It attempts to parse the name and break it down into the components
243              
244             Returns an error flag. If the name was parsed successfully, it's value is 0,
245             otherwise a 1. This step is a prerequisite for the following methods.
246              
247              
248             =head2 case_all
249              
250             $correct_casing = $name->case_all;
251              
252             The C method converts the first letter of each component to
253             capitals and the remainder to lower case, with the following exceptions-
254              
255             initials remain capitalised
256             surname spelling such as MacNay-Smith, O'Brien and Van Der Heiden are preserved
257             - see C for user defined exceptions
258              
259             A complete definition of the capitalising rules can be found by studying
260             the case_surname function.
261              
262             The method returns the entire cased name as text.
263              
264             =head2 case_all_reversed
265              
266             $correct_casing = $name->case_all_reversed;
267              
268             The C method applies the same type of casing as
269             C. However, the name is returned as surname followed by a comma
270             and the rest of the name, which can be any of the combinations allowed
271             for a name, except the title. Some examples are: "Smith, John", "De Silva, A.B."
272             This is useful for sorting names alphabetically by surname.
273              
274             The method returns the entire reverse order cased name as text.
275              
276              
277             =head2 components
278              
279             %my_name = $name->components;
280             $cased_surname = $my_name{surname_1};
281              
282              
283             The C method does the same thing as the C method,
284             but returns the name cased components in a hash. The following keys are used
285             for each component:
286              
287             precursor
288             title_1
289             title_2
290             given_name_1
291             given_name_2
292             initials_1
293             initials_2
294             middle_name
295             conjunction_1
296             conjunction_2
297             surname_1
298             surname_2
299             suffix
300              
301             If a component has no matching data for a given name, it will not appear in the hash
302              
303             If the name could not be parsed, this method returns null. If you assign the return
304             value to a hash, you should check the error status returned by the C method first.
305             Ohterwise, you will get an odd number of values assigned to the hash.
306              
307              
308             =head2 case_surname
309              
310             $correct_casing = case_surname("DE SILVA-MACNAY" [,$lc_prefix]);
311              
312             C is a stand alone function that does not require a name
313             object. The input is a text string. An optional input argument controls the
314             casing rules for prefix portions of a surname, as described above in the
315             C section.
316              
317             The output is a string converted to the correct casing for surnames.
318             See C for user defined exceptions
319              
320             This function is useful when you know you are only dealing with names that
321             do not have initials like "Mr John Jones". It is much faster than the case_all
322             method, but does not understand context, and cannot detect errors on strings
323             that are not personal names.
324              
325              
326             =head2 surname_prefs.txt
327              
328             Some surnames can have more than one form of valid capitalisation, such as
329             MacQuarie or Macquarie. Where the user wants to specify one form as the default,
330             a text file called surname_prefs.txt should be created and placed in the same
331             location as the NameParse module. The text file should contain one surname per
332             line, in the capitalised form you want, such as
333              
334             Macquarie
335             MacHado
336              
337             NameParse will still operate if the file does not exist
338              
339             =head2 salutation
340              
341             $salutation = $name->salutation(salutation => 'Dear',sal_default => 'Friend',sal_type => 'given_name'));
342              
343             The C method converts a name into a personal greeting,
344             such as "Dear Mr & Mrs O'Brien" or "Dear Sue and John"
345              
346             Optional parameters may be specided in a hash as follows:
347              
348              
349             salutation:
350              
351             The greeting word such as 'Dear' or 'Greetings'. If not spefied than 'Dear' is used
352              
353             sal_default:
354              
355             The default word used when a personalised salution cannot be generated. If not
356             specified, than 'Friend' is used.
357              
358             sal_type:
359              
360             Can be either 'given_name' such as 'Dear Sue' or 'title_plus_name' such as 'Dear Ms Smith'
361             If not specified, than 'given_name' is used.
362              
363             If an error is detected during parsing, such as with the name "AB Smith & Associates",
364             then the value of sal_default is used instead of a given name, or a title and surname.
365             If the input string contains a conjunction, an 's' is added to the value of sal_default.
366              
367             If the name contains a precursor, a default salutation is produced.
368              
369             =head2 clean
370              
371             $good_name = clean("Bad Na9me");
372              
373             C is a stand alone function that does not require a name object.
374             The input is a text string and the output is the string with:
375              
376             all repeating spaces removed
377             all characters not in the set (A-Z a-z - ' , . &) removed
378              
379              
380             =head2 properties
381              
382             The C method returns all the properties of the name,
383             non_matching, number and type, as a hash.
384              
385             =over 4
386              
387             =item type
388              
389             The type of format a name is in, as one of the following strings:
390              
391             Mr_A_Smith_&Ms_B_Jones
392             Mr_&Ms_A_&B_Smith
393             Mr_A_&Ms_B_Smith
394             Mr_&Ms_A_Smith
395             Mr_A_&B_Smith
396             Mr_John_Adam_Smith
397             Mr_John_A_Smith
398             Mr_John_Smith
399             Mr_A_Smith
400             John_Adam_Smith
401             John_A_Smith
402             J_Adam_Smith
403             John_Smith
404             A_Smith
405             John
406             unknown
407              
408              
409             =item non_matching
410              
411             Returns any unmatched section that was found.
412              
413             =back
414              
415             =head2 report
416              
417             Create a formatted text report to standard output listing
418             - the input string,
419             - the name and value of each defined component
420             - any non matching component
421              
422              
423             =head1 LIMITATIONS
424              
425             The huge number of character combinations that can form a valid names makes
426             it is impossible to correctly identify them all. Firstly, there are many
427             ambiguities, which have no right answer.
428              
429             Macbeth or MacBeth, are both valid spellings
430             Is ED WOOD E.D. Wood or Edward Wood
431             Is 'Mr Rapid Print' a name or a company
432             Does John Bradfield Smith have a middle name of Bradfield, or a surname of Bradfield-Smith?
433              
434             One approach is to have large lookup files of names and words, statistical rules
435             and fuzzy logic to attempt to derive context. This approach gives high levels of
436             accuracy but uses a lot of your computers time and resources.
437              
438             NameParse takes the approach of using a limited set of rules, based on the
439             formats that are commonly used by business to represent peoples names. This
440             gives us fairly high accuracy, with acceptable speed and program size.
441              
442             NameParse will accept names from many countries, like Van Der Heiden,
443             De La Mare and Le Fontain. Having said that, it is still biased toward English,
444             because the precursors, titles and conjunctions are based on English usage.
445              
446             Names with two or more words, but no separating hyphen are not recognized.
447             This is a real quandary as Indian, Chinese and other names can have several
448             components. If these are allowed for, any component after the surname
449             will also be picked up. For example in "Mr AB Jones Trading As Jones Pty Ltd"
450             will return a surname of "Jones Trading".
451              
452             Because of the large combination of possible names defined in the grammar, the
453             program is not very fast, except for the more limited C subroutine.
454             See the "Future Directions" section for possible speed ups.
455              
456             As the parser has a very limited understanding of context, the "John_Adam_Smith"
457             name type is most likely to cause problems, as it contains no known tokens
458             like a title. A string such as "National Australia Bank" would be accepted
459             as a valid name, first name National etc. Supplying a list of common pronouns
460             as exceptions could solve this problem.
461              
462              
463             =head1 REFERENCES
464              
465             "The Wordsworth Dictionary of Abbreviations & Acronyms" (1997)
466              
467             Australian Standard AS4212-1994 "Geographic Information Systems -
468             Data Dictionary for transfer of street addressing information"
469              
470              
471             =head1 FUTURE DIRECTIONS
472              
473             Define grammar for other languages. Hopefully, all that would be needed is
474             to specify a new module with its own grammar, and inherit all the existing
475             methods. I don't have the knowledge of the naming conventions for non-english
476             languages.
477              
478             =head1 REPOSITORY
479              
480             L
481              
482              
483             =head1 SEE ALSO
484              
485             L, L, L,
486             L, L
487              
488              
489             =head1 BUGS
490              
491             Names with accented characters (acute, circumfelx etc) will not be parsed
492             correctly. A work around is to replace the character class [a-z] with \w
493             in the appropriate rules in the grammar tree, but this could lower the accuracy
494             of names based purely on ASCII text.
495              
496             =head1 CREDITS
497              
498             Thanks to all the people who provided ideas and suggestions, including -
499              
500             Damian Conway, author of Parse::RecDescent
501             Mark Summerfield author of Lingua::EN::NameCase,
502             Ron Savage, Alastair Adam Huffman, Douglas Wilson
503             Peter Schendzielorz
504              
505             =head1 AUTHOR
506              
507             NameParse was written by Kim Ryan
508              
509             =head1 COPYRIGHT AND LICENSE
510              
511             Copyright (c) 2018 Kim Ryan. All rights reserved.
512              
513             This library is free software; you can redistribute it and/or modify
514             it under the same terms as Perl itself.
515              
516             =cut
517             #-------------------------------------------------------------------------------
518              
519             package Lingua::EN::NameParse;
520              
521 2     2   123911 use strict;
  2         13  
  2         56  
522 2     2   10 use warnings;
  2         5  
  2         51  
523              
524 2     2   932 use Lingua::EN::NameParse::Grammar;
  2         11  
  2         59  
525 2     2   2296 use Parse::RecDescent;
  2         85127  
  2         17  
526              
527 2     2   101 use Exporter;
  2         5  
  2         83  
528 2     2   13 use vars qw (@ISA @EXPORT_OK);
  2         4  
  2         2718  
529              
530             our $VERSION = '1.38';
531             @ISA = qw(Exporter);
532             @EXPORT_OK = qw(clean case_surname);
533              
534             #-------------------------------------------------------------------------------
535             # Create a new instance of a name parsing object. This step is time consuming
536             # and should normally only be called once in your program.
537              
538             sub new
539             {
540 2     2 1 94 my $class = shift;
541 2         8 my %args = @_;
542              
543 2         5 my $name = {};
544 2         6 bless($name,$class);
545              
546             # Default to 2 initials per name. Can be overwritten if user defines
547             # 'initials' as a key in the hash supplied to new method.
548 2         14 $name->{initials} = 2;
549              
550 2         4 my $current_key;
551 2         9 foreach my $current_key (keys %args)
552             {
553 8         17 $name->{$current_key} = $args{$current_key};
554             }
555              
556 2         12 my $grammar = Lingua::EN::NameParse::Grammar::_create($name);
557 2         19 $name->{parse} = new Parse::RecDescent($grammar);
558              
559 2         580356 return ($name);
560             }
561             #-------------------------------------------------------------------------------
562             # Attempt to parse a string and retrieve it's components and properties
563             # Requires a name object to have been created with the 'new' method'
564             # Returns: an error code, 0 for success, 1 for failure
565              
566             sub parse
567             {
568 28     28 1 7270 my $name = shift;
569 28         61 my ($input_string) = @_;
570              
571 28         62 chomp($input_string);
572              
573             # If reverse ordered names are allowed, swap the surname component, before
574             # the comma, with the rest of the name. Rejoin the name, replacing comma
575             # with a space.
576              
577 28 100 100     158 if ( $name->{allow_reversed} and $input_string =~ /,/ )
578             {
579 1         5 my ($first,$second) = split(/,/,$input_string);
580 1         5 $input_string = join(' ',$second,$first);
581             }
582              
583 28         157 $name->{comps} = ();
584 28         68 $name->{properties} = ();
585 28         73 $name->{properties}{type} = 'unknown';
586 28         60 $name->{error} = 0;
587 28         56 $name->{error_desc} = '';
588 28         41 $name->{warning} = 0;
589 28         59 $name->{warning_desc} = '';
590              
591 28         51 $name->{original_input} = $input_string;
592 28         52 $name->{input_string} = $input_string;
593              
594 28         74 $name = _pre_parse($name);
595 28 50       75 unless ( $name->{error} )
596             {
597 28 100       71 if ( $name->{auto_clean} )
598             {
599 9         21 $name->{input_string} = clean($name->{input_string});
600             }
601 28         67 $name = _assemble($name);
602 28         74 _validate($name);
603             }
604              
605 28         103 return($name->{error});
606             }
607             #-------------------------------------------------------------------------------
608             # Clean the input string. Can be called as a stand alone function.
609              
610             sub clean
611             {
612 10     10 1 336 my ($input_string) = @_;
613              
614             # remove illegal characters
615 10         32 $input_string =~ s/[^A-Za-z\-\'\.&\/ ]//go;
616              
617             # remove repeating spaces
618 10         26 $input_string =~ s/ +/ /go ;
619              
620             # remove any remaining leading or trailing space
621 10         17 $input_string =~ s/^ //;
622              
623 10         24 return($input_string);
624             }
625              
626             #-------------------------------------------------------------------------------
627             # Given a name object, apply correct capitalisation to each component of a person's name.
628             # Return all cased components in a hash.
629             # Else return no value.
630              
631              
632             sub components
633             {
634 35     35 1 88 my $name = shift;
635              
636 35 50       109 if ( $name->{properties}{type} eq 'unknown' )
637             {
638 0         0 return;
639             }
640             else
641             {
642 35         51 my %orig_components = %{ $name->{comps} };
  35         285  
643              
644 35         92 my ($current_key,%cased_components);
645 35         163 foreach $current_key ( keys %orig_components )
646             {
647 490         636 my $cased_value;
648 490 100       1372 if ( $current_key =~ /initials/ ) # initials_1, possibly initials_2
    100          
    50          
649             {
650 70         133 $cased_value = uc($orig_components{$current_key});
651             }
652             elsif ( $current_key =~ /surname|suffix/ )
653             {
654 105         309 $cased_value = case_surname($orig_components{$current_key},$name->{lc_prefix});
655             }
656             elsif ( $current_key eq 'type')
657             {
658 0         0 $cased_value = $orig_components{$current_key};
659             }
660             else
661             {
662 315         537 $cased_value = _case_word($orig_components{$current_key});
663             }
664              
665 490         1003 $cased_components{$current_key} = $cased_value;
666             }
667 35         439 return(%cased_components);
668             }
669             }
670              
671             #-------------------------------------------------------------------------------
672             # Hash of of lists, indicating the order that name components are assembled in.
673             # Each list element is itself the name of the key value in a name object.
674             # Used by the case_all and case_all_reversed methods.
675             # These hashes are created here globally, as quite a large overhead is
676             # imposed if the are created locally, each time the method is invoked
677              
678             my %component_order=
679             (
680             'Mr_John_Smith_&_Ms_Mary_Jones' => ['title_1','given_name_1','surname_1','conjunction_1','title_2','given_name_2','surname_2'],
681             'Mr_A_Smith_&_Ms_B_Jones' => ['title_1','initials_1','surname_1','conjunction_1','title_2','initials_2','surname_2'],
682             'Mr_&_Ms_A_&_B_Smith' => ['title_1','conjunction_1','title_2','initials_1','conjunction_2','initials_2','surname_1'],
683             'Mr_A_&_Ms_B_Smith' => ['title_1','initials_1','conjunction_1','title_2','initials_2','surname_1'],
684             'Mr_&_Ms_A_Smith' => ['title_1','conjunction_1','title_2','initials_1','surname_1'],
685             'Mr_A_&_B_Smith' => ['title_1','initials_1','conjunction_1','initials_2','surname_1'],
686             'John_Smith_&_Mary_Jones' => ['given_name_1','surname_1','conjunction_1','given_name_2','surname_2'],
687             'John_&_Mary_Smith' => ['given_name_1','conjunction_1','given_name_2','surname_1'],
688             'A_Smith_&_B_Jones' => ['initials_1','surname_1','conjunction_1','initials_2','surname_2'],
689              
690             'Mr_John_Adam_Smith' => ['precursor','title_1','given_name_1','middle_name','surname_1','suffix'],
691             'Mr_John_A_Smith' => ['precursor','title_1','given_name_1','initials_1','surname_1','suffix'],
692             'Mr_John_Smith' => ['precursor','title_1','given_name_1','surname_1','suffix'],
693             'Mr_A_Smith' => ['precursor','title_1','initials_1','surname_1','suffix'],
694             'John_Adam_Smith' => ['precursor','given_name_1','middle_name','surname_1','suffix'],
695             'John_A_Smith' => ['precursor','given_name_1','initials_1','surname_1','suffix'],
696             'J_Adam_Smith' => ['precursor','initials_1','middle_name','surname_1','suffix'],
697             'John_Smith' => ['precursor','given_name_1','surname_1','suffix'],
698             'A_Smith' => ['precursor','initials_1','surname_1','suffix'],
699             'John' => ['given_name_1']
700             );
701              
702              
703             # only include names with a single surname
704             my %reverse_component_order=
705             (
706             'Mr_&_Ms_A_&_B_Smith' => ['surname_1','title_1','conjunction_1','title_2','initials_1','conjunction_1','initials_2'],
707             'Mr_A_&_Ms_B_Smith' => ['surname_1','title_1','initials_1','conjunction_1','title_2','initials_2'],
708             'Mr_&_Ms_A_Smith' => ['surname_1','title_1','title_1','conjunction_1','title_2','initials_1'],
709             'Mr_A_&_B_Smith' => ['surname_1','title_1','initials_1','conjunction_1','initials_2'],
710             'John_&_Mary_Smith' => ['surname_1','given_name_1','conjunction_1','given_name_2'],
711              
712             'Mr_John_Adam_Smith' => ['surname_1','title_1','given_name_1','middle_name','suffix'],
713             'Mr_John_A_Smith' => ['surname_1','title_1','given_name_1','initials_1','suffix'],
714             'Mr_John_Smith' => ['surname_1','title_1','given_name_1','suffix'],
715             'Mr_A_Smith' => ['surname_1','title_1','initials_1','suffix'],
716             'John_Adam_Smith' => ['surname_1','given_name_1','middle_name','suffix'],
717             'John_A_Smith' => ['surname_1','given_name_1','initials_1','suffix'],
718             'J_Adam_Smith' => ['surname_1','initials_1','middle_name','suffix'],
719             'John_Smith' => ['surname_1','given_name_1','suffix'],
720             'A_Smith' => ['surname_1','initials_1','suffix'],
721             'John' => ['given_name_1']
722             );
723              
724             #-------------------------------------------------------------------------------
725             # Apply correct capitalisation to a person's entire name
726             # If the name type is unknown, return undef
727             # Else, return a string of all cased components in correct order
728              
729             sub case_all
730             {
731 2     2 1 21 my $name = shift;
732              
733 2         3 my @cased_name;
734              
735 2 50       7 if ( $name->{properties}{type} eq 'unknown' )
736             {
737 0         0 return undef;
738             }
739              
740 2 50       8 unless ( $component_order{$name->{properties}{type}} )
741             {
742             # component order missing in array defined above
743 0         0 warn "Component order not defined for: $name->{properties}{type}";
744 0         0 return;
745             }
746              
747 2         6 my %component_vals = $name->components;
748 2         7 my @order = @{ $component_order{$name->{properties}{type}} };
  2         8  
749              
750 2         5 foreach my $component_key ( @order )
751             {
752             # As some components such as precursors are optional, they will appear
753             # in the order array but may or may not have have a value, so only
754             # process defined values
755 10 100       24 if ( $component_vals{$component_key} )
756             {
757 6         12 push(@cased_name,$component_vals{$component_key});
758             }
759             }
760 2 100       11 if ( $name->{comps}{non_matching} )
761             {
762             # Despite errors, try to name case non-matching section. As the format
763             # of this section is unknown, surname case will provide the best
764             # approximation, but still fail on initials of more than 1 letter
765 1         5 push(@cased_name,case_surname($name->{comps}{non_matching},$name->{lc_prefix}));
766             }
767              
768 2         23 return(join(' ',@cased_name));
769             }
770              
771             #-------------------------------------------------------------------------------
772             =head1 case_all_reversed
773              
774             Apply correct capitalisation to a person's entire name and reverse the order
775             so that surname is first, followed by the other components, such as: Smith, Mr John A
776             Useful for creating a list of names that can be sorted by surname.
777              
778             If name type is unknown , returns null
779              
780             If the name type has a joint name, such as 'Mr_A_Smith_Ms_B_Jones', return null,
781             as it is ambiguous which surname to place at the start of the string
782              
783             Else, returns a string of all cased components in correct reversed order
784              
785             =cut
786              
787             sub case_all_reversed
788             {
789 0     0 1 0 my $name = shift;
790              
791 0         0 my @cased_name_reversed;
792              
793 0 0       0 unless ( $name->{properties}{type} eq 'unknown' )
794             {
795 0 0       0 unless ( $reverse_component_order{$name->{properties}{type} } )
796             {
797             # this type of name should not be reversed, such as two surnames
798 0         0 return;
799             }
800 0         0 my %component_vals = $name->components;
801 0         0 my @reverse_order = @{ $reverse_component_order{$name->{properties}{type} } };
  0         0  
802              
803 0         0 foreach my $component_key ( @reverse_order )
804             {
805             # As some components such as precursors are optional, they will appear
806             # in the order array but may or may not have have a value, so only
807             # process defined values
808              
809 0         0 my $component_value = $component_vals{$component_key};
810 0 0       0 if ( $component_value )
811             {
812 0 0       0 if ($component_key eq 'surname_1')
813             {
814 0         0 $component_value .= ',';
815             }
816 0         0 push(@cased_name_reversed,$component_value);
817             }
818             }
819             }
820 0         0 return(join(' ',@cased_name_reversed));
821             }
822             #-------------------------------------------------------------------------------
823             # The user may specify their own preferred spelling for surnames.
824             # These should be placed in a text file called surname_prefs.txt
825             # in the same location as the module itself.
826              
827             BEGIN
828             {
829             # Obtain the full path to NameParse module, defined in the %INC hash.
830 2     2   17 my $prefs_file_location = $INC{"Lingua/EN/NameParse.pm"};
831             # Now substitute the name of the preferences file
832 2         20 $prefs_file_location =~ s/NameParse\.pm$/surname_prefs.txt/;
833              
834 2 50       4257 if ( open(PREFERENCES_FH,"<$prefs_file_location") )
835             {
836 0         0 my @surnames = ;
837 0         0 foreach my $name ( @surnames )
838             {
839 0         0 chomp($name);
840             # Build hash, lower case name is key for case insensitive
841             # comparison, while value holds the actual capitalisation
842 0         0 $Lingua::EN::surname_preferences{lc($name)} = $name;
843             }
844 0         0 close(PREFERENCES_FH);
845             }
846             }
847             #-------------------------------------------------------------------------------
848             # Apply correct capitalisation to a person's surname. Can be called as a
849             # stand alone function.
850              
851             sub case_surname
852             {
853 108     108 1 701 my ($surname,$lc_prefix) = @_;
854              
855 108 100       236 unless ($surname)
856             {
857 65         143 return('');
858             }
859              
860             # If the user has specified a preferred capitalisation for this
861             # surname in the surname_prefs.txt, it should be returned now.
862 43 50       128 if ($Lingua::EN::surname_preferences{lc($surname)} )
863             {
864 0         0 return($Lingua::EN::surname_preferences{lc($surname)});
865             }
866              
867             # Lowercase everything
868 43         82 $surname = lc($surname);
869              
870             # Now uppercase first letter of every word. By checking on word boundaries,
871             # we will account for apostrophes (D'Angelo) and hyphenated names
872 43         272 $surname =~ s/\b(\w)/\u$1/g;
873              
874             # Name case Macs and Mcs
875             # Exclude names with 1-2 letters after prefix like Mack, Macky, Mace
876             # Exclude names ending in a,c,i,o,z or j, typically Polish or Italian
877              
878 43 100       194 if ( $surname =~ /\bMac[a-z]{2,}[^a|c|i|o|z|j]\b/i )
    50          
879             {
880 4         23 $surname =~ s/\b(Mac)([a-z]+)/$1\u$2/ig;
881              
882             # Now correct for "Mac" exceptions
883 4         10 $surname =~ s/MacHin/Machin/;
884 4         12 $surname =~ s/MacHlin/Machlin/;
885 4         7 $surname =~ s/MacHar/Machar/;
886 4         8 $surname =~ s/MacKle/Mackle/;
887 4         5 $surname =~ s/MacKlin/Macklin/;
888 4         8 $surname =~ s/MacKie/Mackie/;
889              
890             # Portuguese
891 4         6 $surname =~ s/MacHado/Machado/;
892              
893             # Lithuanian
894 4         5 $surname =~ s/MacEvicius/Macevicius/;
895 4         7 $surname =~ s/MacIulis/Maciulis/;
896 4         7 $surname =~ s/MacIas/Macias/;
897             }
898             elsif ( $surname =~ /\bMc/i )
899             {
900 0         0 $surname =~ s/\b(Mc)([a-z]+)/$1\u$2/ig;
901             }
902             # Exceptions (only 'Mac' name ending in 'o' ?)
903 43         79 $surname =~ s/Macmurdo/MacMurdo/;
904              
905              
906 43 100       89 if ( $lc_prefix )
907             {
908             # Lowercase first letter of every word in prefix. The trailing space
909             # prevents the surname from being altered. Note that spellings like
910             # d'Angelo are not accounted for.
911 1         8 $surname =~ s/\b(\w+ )/\l$1/g;
912             }
913              
914             # Correct for possessives such as "John's" or "Australia's". Although this
915             # should not occur in a person's name, they are valid for proper names.
916             # As this subroutine may be used to capitalise words other than names,
917             # we may need to account for this case. Note that the 's' must be at the
918             # end of the string
919 43         84 $surname =~ s/(\w+)'S(\s+)/$1's$2/;
920 43         68 $surname =~ s/(\w+)'S$/$1's/;
921              
922             # Correct for roman numerals, excluding single letter cases I,V and X,
923             # which will work with the above code
924 43         115 $surname =~ s/\b(I{2,3})\b/\U$1/i; # 2nd, 3rd
925 43         88 $surname =~ s/\b(IV)\b/\U$1/i; # 4th
926 43         77 $surname =~ s/\b(VI{1,3})\b/\U$1/i; # 6th, 7th, 8th
927 43         78 $surname =~ s/\b(IX)\b/\U$1/i; # 9th
928 43         100 $surname =~ s/\b(XI{1,3})\b/\U$1/i; # 11th, 12th, 13th
929              
930 43         113 return($surname);
931             }
932             #-------------------------------------------------------------------------------
933             # Create a personalised greeting from one or two person's names
934             # Returns the salutation as a string, such as "Dear Mr Smith", or "Dear Sue"
935              
936             sub salutation
937             {
938 3     3 1 7 my $name = shift;
939 3         8 my %args = @_;
940              
941 3         6 my $salutation = 'Dear';
942 3         5 my $sal_default = 'Friend';
943 3         5 my $sal_type = 'title_plus_surname';
944              
945             # Check to see if we should override defualts with any user specified preferences
946 3 100       9 if ( %args )
947             {
948 2         8 foreach my $current_key (keys %args)
949             {
950 2 50       6 $current_key eq 'salutation' and $salutation = $args{$current_key};
951 2 50       6 $current_key eq 'sal_default' and $sal_default = $args{$current_key};
952 2 50       8 $current_key eq 'sal_type' and $sal_type = $args{$current_key};
953             }
954             }
955              
956              
957 3         6 my @greeting;
958 3         7 push(@greeting,$salutation);
959              
960             # Personalised salutations cannot be created for Estates or people
961             # without some title
962 3 50 33     17 if
      33        
963             (
964             $name->{error} or
965             ( $name->{comps}{precursor} and $name->{comps}{precursor} =~ /ESTATE/)
966             )
967             {
968             # Despite an error, the presence of a conjunction probably
969             # means we are dealing with 2 or more people.
970             # For example Mr AB Smith & John Jones
971 0 0       0 if ( $name->{input_string} =~ / (AND|&) / )
972             {
973 0         0 $sal_default .= 's';
974             }
975 0         0 push(@greeting,$sal_default);
976             }
977             else
978             {
979 3         8 my %component_vals = $name->components;
980              
981 3 100       17 if ( $sal_type eq 'given_name')
    50          
982             {
983 1 50       4 if ( $component_vals{'given_name_1'} )
984             {
985 1         3 push(@greeting,$component_vals{'given_name_1'});
986 1 50       5 if ( $component_vals{'given_name_2'} )
987             {
988 0         0 push(@greeting,$component_vals{'conjunction_1'});
989 0         0 push(@greeting,$component_vals{'given_name_2'});
990             }
991             }
992             else
993             {
994             # No given name such as 'A_Smith','J_Adam_Smith','Mr_A_Smith'
995             # Must use default
996 0         0 push(@greeting,$sal_default);
997             }
998             }
999             elsif ( $sal_type eq 'title_plus_surname' )
1000             {
1001 2 50       6 if ( $name->{properties}{number} == 1 )
    0          
1002             {
1003 2 100       7 if ( $component_vals{'title_1'} )
1004             {
1005 1         3 push(@greeting,$component_vals{'title_1'});
1006 1         4 push(@greeting,$component_vals{'surname_1'});
1007             }
1008             else
1009             {
1010             # No title such as 'A_Smith','J_Adam_Smith', so must use default
1011 1         4 push(@greeting,$sal_default);
1012             }
1013             }
1014             elsif ( $name->{properties}{number} == 2 )
1015             {
1016             # a joint name
1017              
1018 0         0 my $type = $name->{properties}{type};
1019 0 0 0     0 if ( $type eq 'Mr_&Ms_A_Smith' or $type eq 'Mr_A_&Ms_B_Smith' or $type eq 'Mr_&Ms_A_&B_Smith' )
    0 0        
      0        
1020             {
1021             # common surname
1022 0         0 push(@greeting,$component_vals{'title_1'});
1023 0         0 push(@greeting,$component_vals{'conjunction_1'});
1024 0         0 push(@greeting,$component_vals{'title_2'});
1025 0         0 push(@greeting,$component_vals{'surname_1'});
1026              
1027             }
1028             elsif ( $type eq 'Mr_A_Smith_&Ms_B_Jones' or $type eq 'Mr_John_Smith_&Ms_Mary_Jones' )
1029             {
1030 0         0 push(@greeting,$component_vals{'title_1'});
1031 0         0 push(@greeting,$component_vals{'surname_1'});
1032 0         0 push(@greeting,$component_vals{'conjunction_1'});
1033 0         0 push(@greeting,$component_vals{'title_2'});
1034 0         0 push(@greeting,$component_vals{'surname_2'});
1035             }
1036             else
1037             {
1038             # No title such as A_Smith_&B_Jones', 'John_Smith_&Mary_Jones'
1039             # Must use default
1040 0         0 push(@greeting,$sal_default);
1041             }
1042             }
1043             }
1044             else
1045             {
1046 0         0 warn "Invalid sal_type : ", $sal_type;
1047 0         0 push(@greeting,$sal_default);
1048             }
1049             }
1050 3         24 return(join(' ',@greeting));
1051             }
1052             #-------------------------------------------------------------------------------
1053             # Return all name properties as a hash
1054              
1055             sub properties
1056             {
1057 21     21 1 102 my $name = shift;
1058 21         35 return(%{ $name->{properties} });
  21         107  
1059             }
1060              
1061             #-------------------------------------------------------------------------------
1062             # Create a text report to standard output listing
1063             # - the input string,
1064             # - the name of each defined component, if it exists
1065             # - any non matching component
1066              
1067             sub report
1068             {
1069 0     0 1 0 my $name = shift;
1070            
1071 0         0 my %props = $name->properties;
1072            
1073 0         0 my $fmt = "%-20.20s : %s\n";
1074              
1075 0         0 printf($fmt,"Original Input",$name->{original_input});
1076 0         0 printf($fmt,"Cleaned Input",$name->{input_string});
1077 0         0 printf($fmt,"Case all",$name->case_all);
1078 0 0       0 if ($name->case_all_reversed)
1079             {
1080 0         0 printf($fmt,"Case all reversed",$name->case_all_reversed);
1081             }
1082             else
1083             {
1084 0         0 printf($fmt,"Case all reversed",'not applicable');
1085             }
1086            
1087            
1088 0         0 printf($fmt,"Salutation",$name->salutation(salutation => 'Dear',sal_default => 'Friend', sal_type => 'title_plus_surname'));
1089 0         0 printf($fmt,"Type", $props{type});
1090 0         0 printf($fmt,"Number", $props{number});
1091 0         0 printf($fmt,"Parsing Error", $name->{error});
1092 0         0 printf($fmt,"Error description : ", $name->{error_desc});
1093 0         0 printf($fmt,"Parsing Warning", $name->{warning});
1094 0         0 printf($fmt,"Warning description", $name->{warning_desc});
1095            
1096            
1097 0 0       0 unless ($props{type} eq 'unknown')
1098             {
1099 0         0 my %comps = $name->components;
1100 0 0       0 if ( %comps )
1101             {
1102 0         0 print("\nCOMPONENTS\n");
1103 0         0 foreach my $value ( sort keys %comps)
1104             {
1105 0 0 0     0 if ($value and $comps{$value})
1106             {
1107 0         0 printf($fmt,$value,$comps{$value});
1108             }
1109             }
1110             }
1111             }
1112             }
1113             #-------------------------------------------------------------------------------
1114              
1115             # PRIVATE METHODS
1116              
1117             #-------------------------------------------------------------------------------
1118              
1119             sub _pre_parse
1120             {
1121 28     28   46 my $name = shift;
1122            
1123             # strip all full stops
1124 28         94 $name->{input_string} =~ s/\.//g;
1125            
1126             # Fold all text to upper case, as these are used in all regular expressions withun thr grammar tree
1127 28         75 $name->{input_string} = uc($name->{input_string});
1128            
1129             # Check that common reserved word (as found in company names) do not appear
1130 28 50       284 if ( $name->{input_string} =~
1131             /\BPTY LTD$|\BLTD$|\BPLC$|ASSOCIATION|DEPARTMENT|NATIONAL|SOCIETY/ )
1132             {
1133 0         0 $name->{error} = 1;
1134 0         0 $name->{comps}{non_matching} = $name->{input_string};
1135 0         0 $name->{error_desc} = 'Reserved words found in name';
1136             }
1137              
1138             # For the case of a single name such as 'Voltaire' we need to add a trailing space
1139             # to the input string. This is because the grammar tree expects a terminator (the space)
1140             # optionally followed by other productions or non matching text
1141 28         71 $name->{input_string} .= ' ';
1142 28 50       143 if ( $name->{input_string} =~ /^[A-Z]{2,}(\-)?[A-Z]{0,}$/ )
1143             {
1144 0         0 $name->{input_string} .= ' ';
1145             }
1146 28         99 return($name);
1147              
1148             }
1149             #-------------------------------------------------------------------------------
1150             # Initialise all components to empty string. Assemble hashes of components
1151             # and properties as part of the name object
1152             #
1153             sub _assemble
1154             {
1155 28     28   45 my $name = shift;
1156              
1157             # Use Parse::RecDescent to do the parsing. 'full_name' is a label for the complete grammar tree
1158             # defined in Lingua::EN::NameParse::Grammar
1159 28         228 my $parsed_name = $name->{parse}->full_name($name->{input_string});
1160            
1161             # Place components into a separate hash, so they can be easily returned
1162             # for the user to inspect and modify.
1163            
1164 28         300778 my @all_comps = qw(precursor title_1 given_name_1 initials_1 middle_name surname_1 conjunction_1
1165             title_2 given_name_2 initials_2 surname_2 conjunction_2 suffix non_matching);
1166            
1167 28         77 foreach my $comp (@all_comps)
1168             {
1169             # set all components to empty string, as any of them could be accessed, even if they don't exist
1170 392         730 $name->{comps}{$comp} = '';
1171 392 100       772 if (defined($parsed_name->{$comp}))
1172             {
1173             # Copy over existing components.
1174 135         266 $name->{comps}{$comp} = _trim_space($parsed_name->{$comp});
1175             }
1176             }
1177              
1178 28         71 $name->{properties}{number} = 0;
1179 28         52 $name->{properties}{number} = $parsed_name->{number};
1180 28         55 $name->{properties}{type} = $parsed_name->{type};
1181              
1182 28         105 return($name);
1183             }
1184             #-------------------------------------------------------------------------------
1185             # For correct matching, the grammar of each component must include the trailing space that separates it
1186             # from any following word. This should now be removed from the components, and will be restored by the
1187             # case_all and salutation methods, if called.
1188              
1189             sub _trim_space
1190             {
1191 135     135   239 my ($string) = @_;
1192 135 100       258 if ($string)
1193             {
1194 109         343 $string =~ s/ $//;
1195             }
1196 135         343 return($string);
1197             }
1198             #-------------------------------------------------------------------------------
1199             # Check if any name components have illegal characters, or do not have the
1200             # correct syntax for a valid name.
1201              
1202              
1203             sub _validate
1204             {
1205 28     28   57 my $name = shift;
1206 28         85 my %comps = $name->components;
1207              
1208 28 100       154 if ( $comps{non_matching} )
    50          
1209             {
1210 2         6 $name->{warning} = 1;
1211 2         7 $name->{warning_desc} .= ";non_matching text found : $comps{non_matching}";
1212             }
1213             elsif ( $name->{input_string} =~ /[^A-Za-z\-\'\.,&\/ ]/ )
1214             {
1215             # illegal characters found
1216 0         0 $name->{error} = 1;
1217 0         0 $name->{error_desc} = 'illegal characters found';
1218             }
1219            
1220              
1221 28 50       71 if ( not _valid_name($comps{given_name_1}) )
    50          
    50          
    50          
1222             {
1223 0         0 $name->{warning} = 1;
1224 0         0 $name->{warning_desc} .= ";no vowel sound in given_name_1 : $comps{given_name_1}";
1225             }
1226             elsif ( not _valid_name($comps{middle_name}) )
1227             {
1228 0         0 $name->{warning} = 1;
1229 0         0 $name->{warning_desc} .= ";no vowel sound in middle_name : $comps{middle_name}";
1230             }
1231              
1232             elsif ( not _valid_name($comps{surname_1}) )
1233             {
1234 0         0 $name->{warning} = 1;
1235 0         0 $name->{warning_desc} .= ";no vowel sound in surname_1 : $comps{surname_1}";
1236              
1237             }
1238             elsif ( not _valid_name($comps{surname_2}) )
1239             {
1240 0         0 $name->{warning} = 1;
1241 0         0 $name->{warning_desc} .= ";no vowel sound in surname_2 : $comps{surname_2}";
1242             }
1243             }
1244             #-------------------------------------------------------------------------------
1245             # If the name has an assigned value, check that it contains a vowel sound,
1246             # or matches the exceptions to this rule.
1247             # Returns 1 if name is valid, otherwise 0
1248              
1249             sub _valid_name
1250             {
1251 112     112   200 my ($name) = @_;
1252 112 100 33     406 if ( not $name )
    50          
1253             {
1254 65         224 return(1);
1255             }
1256             # Names should have a vowel sound,
1257             # valid exceptions are Ng, Tsz,Md, Cng,Hng,Chng etc
1258             elsif ( $name and $name =~ /[AEIOUYJ]|^(NG|TSZ|MD|(C?H|[PTS])NG)$/i )
1259             {
1260 47         175 return(1);
1261             }
1262             else
1263             {
1264 0         0 return(0);
1265             }
1266             }
1267             #-------------------------------------------------------------------------------
1268             # Upper case first letter, lower case the rest, for all words in string
1269             sub _case_word
1270             {
1271 315     315   554 my ($word) = @_;
1272              
1273 315 100       560 if ($word)
1274             {
1275 67         364 $word =~ s/(\w+)/\u\L$1/g;
1276             }
1277            
1278 315         630 return($word);
1279             }
1280             #-------------------------------------------------------------------------------
1281             return(1);