File Coverage

blib/lib/Lingua/EN/AddressParse.pm
Criterion Covered Total %
statement 255 322 79.1
branch 88 122 72.1
condition 22 36 61.1
subroutine 20 23 86.9
pod 6 6 100.0
total 391 509 76.8


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             Lingua::EN::AddressParse - extract components of a street address, presented as a text string
4              
5             =head1 SYNOPSIS
6              
7             use Lingua::EN::AddressParse;
8              
9             my %args =
10             (
11             country => 'US',
12             auto_clean => 1,
13             force_case => 1,
14             abbreviate_subcountry => 0,
15             abbreviated_subcountry_only => 1,
16             force_post_code => 0
17             );
18              
19             my $address = Lingua::EN::AddressParse->new(%args);
20             $error = $address->parse("40 1/2 N OLD MASSACHUSETTS AVE APT 3B Washington Valley Washington 98100: HOLD MAIL");
21            
22             print $address->report;
23            
24             Country address format 'US'
25             Address type 'suburban'
26             Non matching part 'HOLD MAIL '
27             Error '1'
28             Error descriptions 'non matching section : HOLD MAIL '
29             Warning '1'
30             Warning description ''
31             Case all '40 1/2 N Old Massachusetts Ave Apt 3B Washington Valley WA 98100'
32             COMPONENTS ''
33             base_street_name 'Old Massachusetts'
34             post_code '98100'
35             property_identifier '40 1/2'
36             street_direction_prefix 'N'
37             street_name 'N Old Massachusetts'
38             street_type 'Ave'
39             sub_property_identifier '3B'
40             sub_property_type 'Apt'
41             subcountry 'WA'
42             suburb 'Washington Valley'
43              
44             %address_components = $address->components;
45             print $address_components{sub_property_type}; # APT
46             print $address_components{sub_property_identifier}; # 3B
47             print $address_components{property_identifier}; # 40 1/2
48            
49             %address_properties = $address->properties;
50             print $address_properties{type}; # suburban
51             print $address_properties{non_matching}; # : HOLD MAIL
52              
53             $correct_casing = $address->case_all;
54              
55              
56             =head1 DESCRIPTION
57              
58             This module takes as input a suburban, rural or postal address in free format
59             text such as,
60              
61             3080 28TH AVE N ST PETERSBURG, FL 33713-3810
62             12 1st Avenue N Suite # 2 Somewhere CA 12345 USA
63             C/O JOHN, KENNETH JR POA 744 WIND RIVER DR SYLVANIA, OH 43560-4317
64              
65             9 Church Street, Abertillery, Mid Glamorgan NP13 1DA
66             27 Bury Street, Abingdon, Oxfordshire OX14 3QT
67              
68             2A LOW ST KEW NSW 2123
69             12/3-5 AUBREY ST MOUNT VICTORIA VICTORIA 3133
70             "OLD REGRET" WENTWORTH FALLS NSW 2782 AUSTRALIA
71             GPO Box K318, HAYMARKET, NSW 2000
72              
73            
74             and attempts to parse it. If successful, the address is broken
75             down into it's components and useful functions can be performed such as :
76              
77             converting upper or lower case values to title case (2A Low St Kew NSW 2123)
78             extracting the addresses individual components (2A,Low,St,KEW,NSW,2123)
79             determining the type of format the address is in ('suburban')
80              
81              
82             If the address cannot be parsed you have the option of cleaning the address
83             of bad characters, or extracting any portion that was parsed and the portion
84             that failed.
85              
86             This module can be used for analysing and improving the quality of
87             lists of residential and postal addresses.
88              
89             =head1 DEFINITIONS
90              
91             The following terms are used by AddressParse to define the components that
92             can make up an address.
93              
94             Pre cursor : C/O MR A Smith...
95             Sub property identifier : Level 1A Unit 2, Apartment B, Lot 12, Suite # 12 ...
96             Property Identifier : 12/66A, 24-34, 2A, 23B/12C, 12/42-44
97             Property name : "Old Regret"
98             Post Box : GP0 Box K123, LPO 2345, RMS 23 ...
99             Road Box : RMB 24A, RMS 234 ...
100             Street Direction: North, SE, Sth. etc
101             Street name : O'Hare, New South Head, The Causeway
102             Street type : Road, Rd., St, Lane, Highway, Crescent, Circuit ...
103             Suburb : Dee Why, St. John's Wood ...
104             Sub country : NSW, New South Wales, ACT, NY, New Jersey AZ ...
105             Post (zip) code : 2062, 34532-1234, SG12A 9ET
106             Country : Australia, UK, US or Canada
107              
108              
109              
110             The main address formats currently supported are as follows. (a ? means the component is optional):
111              
112             'suburban' : sub_property(?) property_identifier(?) street street_type suburb subcountry post_code(?)country(?)
113              
114             OR for the USA
115             'suburban' : property_identifier(?) street street_type sub_property(?) suburb subcountry post_code(?) country(?)
116              
117             'rural' : property_name suburb subcountry post_code(?) country(?)
118             'post_box' : post_box suburb subcountry post_code(?) country(?)
119             'road_box' : road_box street street_type suburb subcountry post_code(?) country(?)
120             'road_box' : road_box suburb subcountry post_code(?) country(?)
121              
122             All formats may contain a precursor
123              
124             Refer to the component grammar defined in the Lingua::EN::AddressParse::Grammar
125             module for a complete list of combinations.
126              
127              
128             =head1 METHODS
129              
130             =head2 new
131              
132             The C method creates an instance of an address object and sets up
133             the grammar used to parse addresses. This must be called before any of the
134             following methods are invoked. Note that the object only needs to be
135             created once, and can be reused with new input data.
136              
137             Various setup options may be defined in a hash that is passed as an
138             optional argument to the C method.
139              
140             my %args =
141             (
142             country => 'Australia',
143             auto_clean => 1,
144             force_case => 1,
145             abbreviate_subcountry => 1,
146             abbreviated_subcountry_only => 1,
147             force_post_code => 1
148             );
149              
150             my $address = Lingua::EN::AddressParse->new(%args);
151              
152             =over 4
153              
154             =item country
155              
156             The country argument must be specified. It determines the possible list of
157             valid sub countries (states, counties etc, defined in the Locale::SubCountry
158             module) and post code formats. Either the full name or abbreviation may be
159             specified. The currently supported country names and codes are:
160              
161             AU or Australia
162             CA or Canada
163             GB or United Kingdom
164             US or United States
165              
166             All forms of upper/lower case are acceptable in the country's spelling. If a
167             country name is supplied that the module doesn't recognise, it will die.
168              
169             =item force_case (optional)
170              
171             This option only applies to the C method, see below.
172              
173             =item auto_clean (optional)
174              
175             When this option is set to a positive value, the input string is
176             'cleaned' to try and normalise bad patterns. The type of cleaning
177             includes
178              
179             remove non alphanumeric characters
180             remove full stops
181             remove redundant white space
182             add missing space separators
183             expand abbreviations to more common forms
184             remove bracketed annotations
185             fix badly formed sub property identifiers
186              
187             =item abbreviate_subcountry (optional)
188              
189             When this option is set to a positive value, the sub country is forced to it's
190             abbreviated form, so "New South Wales" becomes "NSW". If the sub country is
191             already abbreviated then it's value is not altered.
192              
193             =item abbreviated_subcountry_only (optional)
194              
195             When this option is set to a positive value, only the abbreviated form
196             of sub country is allowed, such as "NSW" and not "New South Wales". This
197             will make parsing quicker and ensure that addresses comply with postal
198             standards that normally permit only abbreviated sub countries.
199              
200             It also avoids matching a sub_country name too early, as in the case of 'Port Washington New Jersey'
201             Normally, 'Washington would be consumed as the sub country, but by first converting
202             the address to 'Port Washington NJ' we avoid this problem
203              
204              
205             =item force_post_code (optional)
206              
207             When this option is set to a positive value, the address must contain
208             a post code. If it does not then an error flag is raised. If this option
209             is set to 0 than a post code is optional.
210              
211             By default for this option is true.
212              
213             =back
214              
215             =head2 parse
216              
217             $error = $address->parse("12/3-5 AUBREY ST VERMONT VIC 3133");
218              
219             The C method takes a single parameter of a text string containing a
220             address. It attempts to parse the address and break it down into the components
221             described below. If the address is parsed successfully, a 0 is returned,
222             otherwise a 1.
223              
224             Note that you can successfully parse all the components of an address and still
225             have an error returned. This occurs when you have non matching data following
226             a valid address. To check if the data is unusable, you also need to use the
227             C method to check the address type is 'unknown'
228              
229             This method is a prerequisite for all the following methods.
230              
231             =head2 components
232              
233             %address = $address->components($upper_case_all);
234             $suburb = $address{suburb};
235            
236             If the optional argument $upper_case_all is set to a postive value, all components
237             are converted to upper case.
238            
239              
240             The C method returns all the address components in a hash. The
241             following keys are used for each component:
242              
243              
244             pre_cursor - such as 'C/O Mr A Smith'
245             po_box_type - such as 'Private Boxes'
246             post_box
247             road_box
248             sub_property_type
249             sub_property_identifier
250             property_identifier
251             property_name
252             level - such as 12th Floor
253             building - such as Tower A
254             street_direction_prefix (such as East, NW, North etc)
255             base_street_name (the name with direction removed, such as "Main" in "East Main St")
256             street_name (the full street name such as "East Main")
257             street_type
258             street_direction_suffix (US only, abbreviated only such as N, SE etc)
259             suburb
260             subcountry
261             post_code
262             country
263              
264             If a component has no matching data for a given address, it's values will be
265             set to the empty string.
266              
267             Each component is converted to title case, meaning the first letter of each
268             component is set to capitals and the remainder to lower case.
269              
270             Proper name capitalisations such as MacNay and O'Brien are observed
271              
272             The following components are not converted to title case:
273              
274             post_box
275             road_box
276             subcountry
277             post_code
278             country
279             street_direction_suffix
280            
281             If your input data is all upper case and you want to retian that format for parsed
282             data, you will need to apply the 'uc' function to each component.
283              
284             =head2 case_all
285              
286             $correct_casing = $address->case_all;
287              
288             The C method does the same thing as the C method except
289             the entire address is returned as a title cased text string.
290              
291             If the force_case option was set in the C method above, address case the
292             entire input string, including any unmatched sections after a recognisable address
293             that failed parsing. This option is useful when you know you have invalid data,
294             but you still want to title case what you have.
295              
296             =head2 properties
297              
298             The C method returns several properties of the address as a hash.
299             The following keys are used for each property -
300              
301             type - either suburban ,rural,post_box,road_box,unknown
302             non_matching - any trailing string not part the address
303              
304              
305             Additional properties can be accessed with the following
306              
307             $address->{original_input}
308             $address->{input_string} - string after auto_clean option has been applied
309             $address->{country_code} - abbreviated Country address format (as defined in the C method)
310             $address->{error} - error flag, 0 = good, 1 = error
311             $address->{error_desc} - text to describe the type of parsing error
312             $address->{warning} - warning flag, 0 = good, 1 = warning
313             $address->{warning_desc} - text to to describe the type of parsing warning(s)
314            
315             Warnings mean that the address has parsed but there may still be errors within it's components
316              
317              
318             =head2 report
319              
320             Create a formatted text report
321              
322             the input string
323             the cleaned input string
324             the country type
325             the address type
326             any non matching part of input string
327             if any parsing errors occurred
328             error description
329             if any parsing warning occurred
330             warning description
331              
332             the name and value of each defined component
333              
334              
335             Returns a string containing a multi line formatted text report
336              
337             =head1 DEPENDANCIES
338              
339             L, L, L
340              
341             =head1 BUGS
342              
343             =head1 LIMITATIONS
344              
345             Streets such as 'The Esplanade' will return a street of 'The Esplanade' and a
346             street type of null string.
347              
348             For US addresses, an ambiguity arises between a street directional suffix and
349             a suburb directional prefix, such as '12 Main St S Springfield CA 92345'. Is it South
350             Main St, or South Springfield? The parser assumes that 'S' belongs to the street
351             description.
352              
353             The huge number of character combinations that can form a valid address makes
354             it is impossible to correctly identify them all.
355              
356             Valid addresses must contain:
357              
358             property address, suburb, subcountry (aka state) in that order.
359              
360             This format is widely accepted in Australia and the US.
361              
362             UK addresses will often include suburb, town, city and county, formats that
363             are very difficult to parse.
364              
365             Property names must be enclosed in single or double quotes like "Old Regret"
366              
367             Because of the large combination of possible addresses defined in the grammar,
368             the program is not very fast.
369              
370              
371             =head1 REFERENCES
372              
373             "The Wordsworth Dictionary of Abbreviations & Acronyms" (1997)
374              
375             Australian Standard AS4212-1994 "Geographic Information Systems -
376             Data Dictionary for transfer of street addressing information"
377              
378             ISO 3166-2:1998, Codes for the representation of names of countries
379             and their subdivisions. Also released as AS/NZS 2632.2:1999
380              
381              
382             =head1 SEE ALSO
383              
384             AddressParse is designed to identify properties, which have a unique physical
385             location. L will also parse addresses for the USA, and can handle
386             locations defined by street intersections, such as: "Hollywood & Vine, Los Angeles, CA"
387             "Mission Street at Valencia Street, San Francisco, CA"
388              
389              
390             L
391             L
392             L
393             L
394              
395             See L
396             for a list of different addressing formats from around the world. And also
397             L
398              
399             =head1 REPOSITORY
400              
401             L
402              
403             =head1 TO DO
404              
405             Define grammar for other languages. Hopefully, all that would be needed is
406             to specify a new module with its own grammar, and inherit all the existing
407             methods. I don't have the knowledge of the naming conventions for non-english
408             languages.
409              
410             =head1 AUTHOR
411              
412             AddressParse was written by Kim Ryan
413              
414             =head1 COPYRIGHT AND LICENSE
415              
416             Copyright (c) 2015 Kim Ryan. All rights reserved.
417              
418             This library is free software; you can redistribute it and/or modify
419             it under the same terms as Perl itself.
420              
421              
422             =cut
423              
424             #------------------------------------------------------------------------------
425              
426             package Lingua::EN::AddressParse;
427              
428 2     2   89600 use strict;
  2         2  
  2         45  
429 2     2   6 use Carp;
  2         2  
  2         115  
430 2     2   7 use warnings;
  2         5  
  2         40  
431 2     2   829 use Lingua::EN::AddressParse::Grammar;
  2         3  
  2         74  
432 2     2   1413 use Lingua::EN::NameParse;
  2         67956  
  2         99  
433 2     2   12 use Parse::RecDescent;
  2         3  
  2         9  
434              
435             our $VERSION = '1.25';
436              
437             #------------------------------------------------------------------------------
438             # Create a new instance of an address parsing object. This step is time
439             # consuming and should normally only be called once in your program.
440              
441             sub new
442             {
443 4     4 1 1364 my $class = shift;
444 4         9 my %args = @_;
445              
446              
447 4 50 33     53 unless (defined $args{country} and $args{country} =~
448             /^(AU|Australia|GB|United Kingdom|US|United States|CA|Canada)$/ )
449             {
450 0         0 croak "Cannot start parser. You must specify a value for the country in the options hash.\nValid options are AUS,GB,US or CA.\n";
451             }
452              
453              
454 4         6 my $address = {};
455 4         9 bless($address,$class);
456              
457             # option defaults
458 4         11 $address->{'force_post_code'} = 1;
459              
460             # Add error checking for invalid keys?
461 4         13 foreach my $curr_key (keys %args)
462             {
463 8         13 $address->{$curr_key} = $args{$curr_key};
464             }
465              
466             # create the grammar tree (this is country dependent)
467 4         20 my $grammar = Lingua::EN::AddressParse::Grammar::_create($address);
468              
469 4         26 $address->{parse} = Parse::RecDescent->new($grammar);
470              
471 4         2614337 return ($address);
472             }
473             #------------------------------------------------------------------------------
474             sub parse
475             {
476 15     15 1 7686 my $address = shift;
477 15         25 my ($input_string) = @_;
478              
479             # Save original data so we can check effect of auto cleaning
480 15         63 $address->{original_input} = $input_string;
481              
482             # Convert to all upper case. This will allow for faster regexp matching in
483             # the grammar tree
484 15         40 $address->{input_string} = uc($input_string);
485              
486 15         34 chomp($address->{input_string});
487              
488 15         18 my $pre_cursor;
489 15         32 ($pre_cursor,$address->{input_string}) = _extract_precursor($address->{input_string});
490            
491             # Replace commas (which can be used to chunk sections of addresses) with spaces
492 15         34 $address->{input_string} =~ s/,/ /g;
493            
494 15 100       45 if ( $address->{auto_clean} )
495             {
496 12         24 $address->{input_string} = _clean($address);
497             }
498              
499 15         15 my $po_box_type;
500 15         34 ($po_box_type,$address->{input_string}) = _extract_po_box_type($address->{input_string});
501            
502 15         16 my $level;
503 15         33 ($level,$address->{input_string}) = _extract_level($address->{input_string});
504            
505 15         15 my $building;
506 15         26 ($building,$address->{input_string}) = _extract_building($address->{input_string});
507            
508              
509            
510             # Normalise sub property ID, 4/22-24 => UNIT 4 22-24, 4 12 => Unit 4 12
511 15 100 100     94 if ($address->{country_code} ne 'US' and $address->{input_string} =~ /^(\d{1,4}[A-Z]{0,2})[\/| ](\d+[ \w-].*)$/ )
512             {
513 2         10 $address->{input_string} = "UNIT $1 $2";
514             }
515              
516             # We need to add a trailing space to the input string. This is because the grammar
517             # tree expects a terminator (the space) fro every production, optionally followed
518             # by other productions or any final non matching text.
519             # This space will be removed in the _assemble function
520 15         21 $address->{input_string} .= ' ';
521              
522 15         33 $address = _assemble($address,$pre_cursor,$po_box_type,$level,$building);
523 15         32 _validate($address);
524              
525              
526 15         48 return($address,$address->{error});
527             }
528              
529             #------------------------------------------------------------------------------
530             # Apply correct capitalisation to each component of an address
531              
532             sub components
533             {
534 17     17 1 80 my $address = shift;
535 17         16 my ($uc_all) = @_;
536              
537 17         18 my %orig_components = %{ $address->{components} };
  17         132  
538              
539 17         26 my (%cased_components);
540 17         49 foreach my $curr_key ( keys %orig_components )
541             {
542 323         191 my $cased_value;
543 323         232 my $curr_value = $orig_components{$curr_key};
544            
545 323 50       359 if ($uc_all)
546             {
547 0         0 $cased_components{$curr_key} = uc($curr_value);
548 0         0 next;
549             }
550            
551              
552 323 100       607 if ( $curr_key =~ /^(base_street_name|street_name|street_type|suburb|property_name|sub_property|pre_cursor|po_box_type|level|building)/ )
    100          
553             {
554              
555 187 100 100     405 if ( $curr_key eq 'street_name' and$curr_value =~ /^US HIGHWAY (.*)/ )
    100          
556             {
557 1         5 $cased_value = "US Highway $1";
558             }
559             elsif ( $curr_key eq 'sub_property_identifier' )
560             {
561             # UNIT, APT ... 12D etc
562              
563 17         28 my @words = split(/ /,$curr_value);
564 17         16 my @cased_words;
565             my $cased_string;
566 17         23 foreach my $word (@words)
567             {
568 5         3 my $cased_word;
569 5 50 66     31 if ( $word =~ /^\d{1,3}(ST|ND|RD|TH)$/)
    50          
570             {
571             # ordinal component, as in 3rd Floor
572 0         0 $cased_word = lc($word);
573             }
574             elsif ( length($word) > 1 and $word !~ /\d/ )
575             {
576             # only need to title case words such as UNIT
577 0         0 $cased_word = Lingua::EN::NameParse::case_surname($word);
578             }
579             else
580             {
581 5         6 $cased_word = $word;
582             }
583 5         7 push(@cased_words,$cased_word);
584              
585             }
586 17         34 $cased_value = join(' ',@cased_words);
587             }
588             else
589             {
590 169 100       164 if ($curr_value)
591             {
592             # Surnames can be used for street's or suburbs so this method
593             # will give correct capitalisation for most cases
594 71         126 $cased_value = Lingua::EN::NameParse::case_surname($curr_value);
595             }
596             else
597             {
598 98         82 $cased_value = '';
599             }
600             }
601             }
602             # retain street_direction,sub country and countries capitalisation, usually uppercase
603             elsif ($curr_key =~ /street_direction/)
604             {
605 34 100 66     135 if (length($curr_value) == 1 or length($curr_value) == 2)
    50          
606             {
607             # N, SE etc is capitalised
608 1         2 $cased_value =$curr_value;
609             }
610             elsif (length($curr_value) > 2)
611             {
612 0         0 $cased_value = Lingua::EN::NameParse::case_surname($curr_value);
613             }
614             else
615             {
616 33         34 $cased_value = '';
617             }
618             }
619             # retain sub country and countries capitalisation, as usually uppercase
620             else
621             {
622 102         103 $cased_value = uc($curr_value);
623             }
624 323         1518 $cased_components{$curr_key} = $cased_value;
625             }
626 17         324 return(%cased_components);
627             }
628             #------------------------------------------------------------------------------
629             # Apply correct capitalisation to an entire address
630              
631             sub case_all
632             {
633 0     0 1 0 my $address = shift;
634              
635 0         0 my @cased_address;
636              
637 0 0       0 unless ( $address->{properties}{type} eq 'unknown' )
638             {
639              
640             # Hash of of lists, indicating the order that address components are assembled in.
641             # Each list element is itself the name of the key value in an address object.
642              
643 0         0 my %component_order=
644             (
645             'rural' => [ qw/pre_cursor property_name suburb subcountry post_code country/],
646             'post_box'=> [ qw/pre_cursor post_box suburb po_box_type subcountry post_code country/ ],
647             'road_box'=> [ qw/pre_cursor road_box street_name street_type suburb subcountry post_code country/ ]
648              
649             );
650 0 0       0 if ( $address->{country} eq 'US' )
651             {
652 0         0 $component_order{'suburban'} = [ qw/pre_cursor property_identifier street_name street_type street_direction_suffix building level sub_property_type sub_property_identifier suburb subcountry post_code country/];
653             }
654             else
655             {
656 0         0 $component_order{'suburban'} = [ qw/pre_cursor building level sub_property_type sub_property_identifier property_identifier street_name street_type suburb subcountry post_code country/ ];
657             }
658              
659 0         0 my %component_vals = $address->components;
660 0         0 my @order = @{ $component_order{$address->{properties}{type} } };
  0         0  
661              
662 0         0 foreach my $component ( @order )
663             {
664             # As some components such as property name are optional, they will appear
665             # in the order array but may or may not have have a value, so check
666             # for undefined values
667 0 0       0 if ( $component_vals{$component} )
668             {
669 0         0 push(@cased_address,$component_vals{$component});
670             }
671             }
672             }
673              
674 0 0 0     0 if ( $address->{error} and $address->{force_case} )
675             {
676             # Despite errors, try to name case non-matching section. As the format
677             # of this section is unknown, surname case will provide the best
678             # approximation
679 0         0 push(@cased_address,&Lingua::EN::NameParse::case_surname($address->{properties}{non_matching}));
680             }
681              
682 0         0 return(join(' ',@cased_address));
683             }
684             #------------------------------------------------------------------------------
685             sub properties
686             {
687 2     2 1 8 my $address = shift;
688 2         2 return(%{ $address->{properties} });
  2         10  
689             }
690              
691             #------------------------------------------------------------------------------
692             # Create a text report to standard output listing
693             # - the input string,
694             # - the name of each defined component
695             # - any non matching component
696              
697             sub report
698             {
699 0     0 1 0 my $address = shift;
700              
701 0         0 my $report = '';
702              
703 0         0 _fmt_report_line(\$report,"Original Input",$address->{original_input});
704 0         0 _fmt_report_line(\$report,"Cleaned Input",$address->{input_string});
705 0         0 _fmt_report_line(\$report,"Country address format",$address->{country_code});
706              
707 0         0 my %props = $address->properties;
708 0 0       0 if ( $props{type} )
709             {
710 0         0 _fmt_report_line(\$report,"Address type",$props{type});
711             }
712              
713              
714 0         0 _fmt_report_line(\$report,"Non matching part",$props{non_matching});
715 0         0 _fmt_report_line(\$report,"Error",$address->{error});
716 0         0 _fmt_report_line(\$report,"Error descriptions",$address->{error_desc});
717 0         0 _fmt_report_line(\$report,"Warning",$address->{error});
718 0         0 _fmt_report_line(\$report,"Warning description",$address->{warning_desc});
719 0         0 _fmt_report_line(\$report,"Case all",$address->case_all);
720              
721              
722 0         0 _fmt_report_line(\$report,"COMPONENTS",'');
723 0         0 my %comps = $address->components;
724 0         0 foreach my $comp ( sort keys %comps)
725             {
726 0 0       0 if (defined($comps{$comp}) )
727             {
728 0         0 _fmt_report_line(\$report,$comp,$comps{$comp});
729             }
730             }
731              
732 0         0 return($report);
733             }
734              
735              
736             #------------------------------------------------------------------------------
737              
738             # PRIVATE METHODS
739              
740             #------------------------------------------------------------------------------
741              
742             sub _assemble
743             {
744              
745 15     15   17 my $address = shift;
746 15         19 my ($pre_cursor,$po_box_type,$level,$building) = @_;
747              
748             # Parse the address according to the rules defined in the AddressParse::Grammar module,
749             # $::RD_TRACE = 1; # for debugging RecDescent output
750             # Use Parse::RecDescent to do the parsing. 'full_address' is a label for the complete grammar tree
751 15         127 my $parsed_address = $address->{parse}->full_address($address->{input_string});
752              
753             # Place components into a separate hash, so they can be easily returned to the user to inspect and modify
754 15         169049 $address->{components} = ();
755              
756 15 100       90 if ($pre_cursor)
757             {
758 1         4 $address->{components}{'pre_cursor'} = $pre_cursor;
759             }
760             else
761             {
762 14         35 $address->{components}{'pre_cursor'} = '';
763             }
764            
765 15 100       31 if ($level)
766             {
767 1         2 $address->{components}{'level'} = $level;
768             }
769             else
770             {
771 14         24 $address->{components}{'level'} = '';
772             }
773            
774 15 100       41 if ($building)
775             {
776 1         3 $address->{components}{'building'} = $building;
777             }
778             else
779             {
780 14         29 $address->{components}{'building'} = '';
781             }
782            
783              
784 15 100       28 if ($po_box_type)
785             {
786 1         2 $address->{components}{'po_box_type'} = $po_box_type;
787             }
788             else
789             {
790 14         22 $address->{components}{'po_box_type'} = '';
791             }
792            
793              
794 15         28 $address->{components}{post_box} = '';
795 15 100       39 if ( $parsed_address->{post_box} )
796             {
797 1         2 $address->{components}{post_box} = $parsed_address->{post_box};
798             }
799              
800 15         23 $address->{components}{road_box} = '';
801 15 50       33 if ( $parsed_address->{road_box} )
802             {
803 0         0 $address->{components}{road_box} = $parsed_address->{road_box};
804             }
805              
806 15         25 $address->{components}{property_name} = '';
807 15 100       34 if ( $parsed_address->{property_name} )
808             {
809 1         2 $address->{components}{property_name} = $parsed_address->{property_name};
810             }
811              
812 15         38 $address->{components}{sub_property_identifier} = '';
813 15         26 $address->{components}{sub_property_type} = '';
814            
815 15 100       35 if ( $parsed_address->{sub_property} )
816             {
817 4 50       49 if ($parsed_address->{sub_property} =~ /^(#|[A-Z]{1,}) (.*)$/ )
    0          
818             {
819             # Such as Unit 24, # 4A etc
820 4         15 $address->{components}{sub_property_type} = $1;
821 4         10 $address->{components}{sub_property_identifier} = $2;
822             }
823             elsif ($parsed_address->{sub_property} =~ /^(\d\w\w) (.*)$/ )
824             {
825             # Such as 1st Floor
826 0         0 $address->{components}{sub_property_type} = $2;
827 0         0 $address->{components}{sub_property_identifier} = $1;
828             }
829             }
830              
831 15         23 $address->{components}{property_identifier} = '';
832 15 100       37 if ( $parsed_address->{property_identifier} )
833             {
834 13         27 $address->{components}{property_identifier} = $parsed_address->{property_identifier};
835             }
836              
837 15         21 $address->{components}{base_street_name} = '';
838 15         25 $address->{components}{street_direction_prefix} = '';
839 15         38 my ($street_direction,$base_street_name) = _get_street_direction($parsed_address->{street_name});
840 15 50       26 if ($street_direction )
841             {
842 0         0 $address->{components}{street_direction_prefix} = $street_direction;
843 0         0 $address->{components}{base_street_name} = $base_street_name;
844             }
845             else
846             {
847 15         24 $address->{components}{base_street_name} = $parsed_address->{street_name};
848             }
849              
850 15         22 $address->{components}{street_name} = '';
851 15         22 $address->{components}{street_type} = '';
852 15 100       28 if ( $parsed_address->{street_name} )
853             {
854             # Streets such as 'The Corso' will parse as street_name = 'The' and street_type = 'Corso', so seperate out
855 13 50       29 if ( $parsed_address->{street_name} eq 'THE ' )
856             {
857 0         0 $address->{components}{street_name} = 'THE ' . $parsed_address->{street_type};
858             }
859             else
860             {
861 13         21 $address->{components}{street_name} = $parsed_address->{street_name};
862 13         18 $address->{components}{street_type} = $parsed_address->{street_type};
863             }
864             }
865              
866              
867 15         25 $address->{components}{street_direction_suffix} = '';
868 15 100       34 if ( $parsed_address->{street_direction_suffix} )
869             {
870 1         5 $address->{components}{street_direction_suffix} = $parsed_address->{street_direction_suffix};
871             }
872              
873              
874 15         40 $address->{components}{suburb} = '';
875 15 50       37 if ( $parsed_address->{suburb} )
876             {
877 15         24 $address->{components}{suburb} = $parsed_address->{suburb};
878             }
879              
880 15         23 $address->{components}{subcountry} = '';
881 15 50       33 if ( $parsed_address->{subcountry} )
882             {
883 15         17 my $sub_country = $parsed_address->{subcountry};
884              
885             # Force sub country to abbreviated form, South Australia becomes SA, Michigan become MI etc
886 15 100       33 if ($address->{abbreviate_subcountry})
887             {
888 7         45 my $country = Locale::SubCountry->new($address->{country});
889 7         106 my $code = $country->code($sub_country);
890 7 100       497 if ( $code ne 'unknown' )
891             {
892 1         3 $address->{components}{subcountry} = $code;
893             }
894             # sub country already abbreviated
895             else
896             {
897 6         20 $address->{components}{subcountry} = $sub_country;
898             }
899             }
900             else
901             {
902 8         15 $address->{components}{subcountry} = $sub_country;
903             }
904             }
905              
906 15         20 $address->{components}{post_code} = '';
907 15 50       34 if ( $parsed_address->{post_code} )
908             {
909 15         24 $address->{components}{post_code} = $parsed_address->{post_code};
910             }
911              
912 15         19 $address->{components}{country} = '';
913 15 100       31 if ( $parsed_address->{country} )
914             {
915 1         2 $address->{components}{country} = $parsed_address->{country};
916             }
917              
918 15         30 $address->{properties} = ();
919              
920 15         31 $address->{properties}{non_matching} = '';
921 15 100       28 if ( $parsed_address->{non_matching} )
922             {
923 1         2 $address->{properties}{non_matching} = $parsed_address->{non_matching};
924             }
925 15         24 $address->{properties}{type} = $parsed_address->{type};
926            
927 15         32 _trim_trailing_space($address);
928              
929 15         59 return($address);
930             }
931              
932             #-------------------------------------------------------------------------------
933             #
934             sub _get_street_direction
935             {
936 15     15   23 my ($street_name) = @_;
937              
938 15         19 my $street_direction;
939             my $base_street_name;
940              
941 15 100       32 unless ($street_name)
942             {
943 2         4 return;
944             }
945              
946 13         65 my @words = split(/\s/,$street_name);
947 13 100       34 if (@words > 1)
948             {
949 4 50       15 if ( $words[0] =~ /^(N|NE|NW|E|S|SE|SW|W|NORTH|EAST|SOUTH|WEST|NTH|STH)$/ )
950             {
951 0         0 $street_direction = $1;
952 0         0 shift(@words);
953 0         0 $base_street_name = join(' ',@words);
954             }
955             }
956 13         29 return($street_direction,$base_street_name);
957              
958             }
959              
960             #------------------------------------------------------------------------------
961             # Check for several different types of syntax errors
962              
963             sub _validate
964             {
965 15     15   19 my $address = shift;
966 15         24 $address->{error} = 0;
967 15         21 $address->{error_desc} = '';
968 15         21 $address->{warning} = 0;
969 15         20 $address->{warning_desc} = '';
970              
971 15 100       35 if ( $address->{properties}{non_matching} )
972             {
973 1         2 $address->{error} = 1;
974 1         4 $address->{error_desc} = 'non matching section : ' . $address->{properties}{non_matching};
975             }
976             else
977             {
978 14 50       27 if ( $address->{properties}{type} eq 'unknown' )
979             {
980 0         0 $address->{error} = 1;
981 0         0 $address->{error_desc} .= 'unknown address format';
982             }
983             else
984             {
985 14 50 33     82 if ($address->{force_post_code} and not $address->{components}{post_code})
986             {
987 0         0 $address->{error} = 1;
988 0         0 $address->{error_desc} .= ':no post code';
989             }
990              
991             # illegal characters found, note a '#' can appear as an abbreviation for number in USA addresses
992 14 50       50 if ( $address->{input_string} =~ /[^"A-Z0-9'\-\.,&#\/ ]/ )
993             {
994             # Note, if auto_clean is on, illegal characters will have been removed
995             # for second parsing and no error flag or message reported
996 0         0 $address->{error} = 1;
997 0         0 $address->{error_desc} .= ':illegal chars';
998             }
999 14 100       36 if ( $address->{properties}{type} eq 'suburban' )
1000             {
1001 12         12 my $street = $address->{components}{street_name};
1002 12 100       34 if ($street !~ /\d/ )
1003             {
1004             # Not an ordinal or single letter street type
1005 11 100       24 if ( _check_vowel($address->{components}{base_street_name}) )
1006             {
1007             # street name must have a vowel sound,
1008 1         2 $address->{warning} = 1;
1009 1         3 $address->{warning_desc} .= ";no vowel sound in street word : $address->{components}{base_street_name}";
1010             }
1011             }
1012             }
1013              
1014 14 50       25 if ( _check_vowel($address->{components}{suburb}) )
1015             {
1016 0         0 $address->{warning} = 1;
1017 0         0 $address->{warning_desc} .= ";no vowel sound in suburb word : $address->{components}{suburb}";
1018             }
1019             }
1020             }
1021             }
1022             #-------------------------------------------------------------------------------
1023             # Purge the input string of illegal or redundant characters.
1024             # Correct malformed patterns
1025              
1026             sub _clean
1027             {
1028 12     12   16 my $address = shift;
1029              
1030 12         21 my ($input) = $address->{input_string};
1031              
1032             # Remove annotations enclosed in brackets, such as 1 Smith St (Cnr Brown St)
1033 12         15 $input =~ s|\(.*\)||;
1034            
1035             # Normalise half house numbers, sucvh as 12.5 to 12 1/2. This is needed now before full stops are stripped out
1036 12         17 $input =~ s|^(\d{1,4})\.5 |$1 1/2 |;
1037              
1038             # strip full stops, remove illegal characters
1039             # & can be part of property name
1040             # hash (#) may denote number for USA address
1041             # quotes can occur as property name delimiters
1042              
1043 12         24 $input =~ s|[^A-Za-z0-9&#/'" -]||go;
1044              
1045             # remove repeating, leading and trailing spaces
1046 12         18 $input =~ s| +| |go ;
1047 12         13 $input =~ s|^ ||;
1048 12         18 $input =~ s| $||;
1049              
1050              
1051             # Expand abbreviations that are too short
1052              
1053 12         18 $input =~ s/LAKE ST (GEORGE|CLAIR)/LAKE SAINT $1/; # otherwise St gets consumed to early as 'Street'
1054 12         17 $input =~ s| CSEWY | CAUSEWAY |;
1055              
1056             # street types
1057              
1058              
1059 12         13 $input =~ s|^FCTR?Y |FACTORY |;
1060 12         50 $input =~ s|^FACT?R?Y? |FACTORY |;
1061              
1062 12         15 $input =~ s|LVL |LEVEL |; # sub property identifiers
1063 12         14 $input =~ s|^UN? |UNIT |;
1064 12         11 $input =~ s|^U(\d+)|UNIT $1|;
1065              
1066             # Fix badly formed number dividers such as home unit format of 14/ 12 becomes 14/12, 2- 7A becomes 2-7A
1067 12         11 $input =~ s|/ |/|;
1068 12         12 $input =~ s| /|/|;
1069 12         12 $input =~ s|- |-|;
1070 12         14 $input =~ s| -|-|;
1071              
1072             # Remove redundant spaces in property identifiers, 21 B Smith St becomes 21B Smith St
1073              
1074 12 50       40 if ( $input !~ /^\d+ [A-Z] (ST|AVE)/ )
1075             {
1076             # Don't remove space before single letter streets such as 21 B Street
1077 12 50       31 if ( $address->{country_code} eq 'US' )
1078             {
1079             # Note cannot use N,E,S,W as they can be street direction prefix, as in 1 E MAIN STREET
1080             # Assume that the direction prefix is the more likely case
1081 0         0 $input =~ s|^(\d+) ([A-DF-MO-RT-VX-Z] )|$1$2|;
1082             }
1083             else
1084             {
1085 12         31 $input =~ s|^(\d+) ([A-Z] )|$1$2|;
1086             }
1087             }
1088              
1089             # Add or remove spaces in sub property identifiers
1090 12 50       24 if ( $address->{country_code} eq 'US' )
1091             {
1092             # Fix US sub property identifiers that appear after street name and type
1093             # add space between # and the number so #2 becomes '# 2'
1094 0         0 $input =~ s| #(\d)| # $1|;
1095 0         0 $input =~ s| #([A-Z])| # $1|;
1096 0         0 $input =~ s| (APT)(\d)| $1 $2|i;
1097              
1098             # remove redundnant space so # 34 B becomes # 34B
1099 0         0 $input =~ s| # (\d+) (\w) | # $1$2 |;
1100              
1101             # remove redundnant '#'
1102 0         0 $input =~ s| APT #| APT |;
1103             }
1104             else
1105             {
1106             # Add a space to separate sub property type from number, UNIT2 becomes UNIT 2
1107 12         20 $input =~ s/^(UNIT|LOT|APT|SHOP)(\d)/$1 $2/;
1108             }
1109              
1110             # Remove redundant slash or dash
1111             # Unit 1B/22, becomes Unit 1B 22, Flat 2-12 becomes Flat 2 12
1112             # TO DO, add |# at start
1113 12         21 $input =~ s/^([A-Z]{2,}) (\d+[A-Z]?)[\/-]/$1 $2 /;
1114             # Unit J1/ 39 becomes Unit J1 39
1115 12         18 $input =~ s/^([A-Z]{2,}) ([A-Z]\d{0,3})[\/-]/$1 $2 /;
1116              
1117              
1118             # remove dash that is not from a sequence, such as D-5 or 22-A
1119 12         15 $input =~ s|([A-Z])-(\d)|$1$2|;
1120 12         16 $input =~ s|(\d)-([A-Z])|$1$2|;
1121            
1122              
1123 12         18 return($input);
1124             }
1125             #-------------------------------------------------------------------------------
1126             # Remove any "care of" type of precursor from the main address
1127             # such as: C/O BRAKEFIELD BETTY S PO BOX 214 GULF HAMMOCK, FL 32639-0214
1128             # It will later be saved as an attribute in the address object
1129              
1130             sub _extract_precursor
1131             {
1132 15     15   19 my ($input) = @_;
1133 15         19 my ($pre_cursor,$address_start,$address_end);
1134              
1135 15 100       73 if ($input =~ m{^(C/O.*?|ATTN.*?) (\d+|PO BOX)( .*)})
1136             {
1137 1         5 $pre_cursor = $1;
1138 1         2 $address_start = $2;
1139 1         2 $address_end = $3;
1140 1         5 return($pre_cursor, $address_start . $address_end);
1141             }
1142             else
1143             {
1144 14         37 return('',$input)
1145             }
1146             }
1147             #-------------------------------------------------------------------------------
1148             # Remove any level or floor info such as:
1149             # 12 Smith St Floor 2
1150             # Level 22 Suite 3 12 Main St
1151             # It will later be saved as an attribute in the address object
1152              
1153             sub _extract_level
1154             {
1155 15     15   14 my ($input) = @_;
1156 15         13 my ($level);
1157            
1158 15 100 66     131 if
      66        
1159             (
1160             # Level info could be at start of string so first space is optional
1161             $input =~ / ?(\d{1,2}(ST|ND|RD|TH) (FLOOR|FLR|FL) )/ or
1162             $input =~ / ?(LEVEL (\d{1,2}|[GM])[\/ -])/ or
1163             $input =~ / ?((FLOOR|FLR|FL) \d{1,2}[\/ -])/
1164             )
1165             {
1166 1         2 $level = $1;
1167 1         2 $level =~ s|/||;
1168 1         2 $level =~ s|-||;
1169 1         17 $input =~ s/$level//;
1170             }
1171            
1172 15         40 return($level,$input);
1173             }
1174             #-------------------------------------------------------------------------------
1175             # Remove any building info such as:
1176             # Building 2 Level 12 123 Smith St
1177             # 12 Main St Tower A Level 2
1178             # It will later be saved as an attribute in the address object
1179              
1180             sub _extract_building
1181             {
1182 15     15   17 my ($input) = @_;
1183 15         17 my ($building);
1184            
1185 15         46 my $bld = qr{BLOCK|BLDG?|BUILDING|TOWER};
1186            
1187 15 100 66     486 if
      66        
1188             (
1189             $input =~ / ?(($bld) ([A-Z]{1,2}|\d+) )/ or # BLD 12 or AA
1190             $input =~ / ?(($bld) \d{1,3}[A-Z] )/ or # BLD 32C
1191             $input =~ / ?(($bld) [A-Z]\d{1,3} )/ # BLD C12
1192             )
1193             {
1194 1         2 $building = $1;
1195 1         1 $building =~ s|/||;
1196 1         2 $building =~ s|-||;
1197 1         7 $input =~ s/$building//;
1198             }
1199            
1200 15         49 return($building,$input);
1201             }
1202              
1203             #-------------------------------------------------------------------------------
1204             # Remove any description that follows the suburb after the main address
1205             # such as: PO BOX 1305 BIBRA LAKE PRIVATE BOXES WA 6965"
1206             # It will be saved as an address attribute
1207              
1208             sub _extract_po_box_type
1209             {
1210 15     15   21 my ($input) = @_;
1211 15         19 my ($po_box_type,$address_start,$address_end);
1212              
1213 15 100       42 if ($input =~ /^(.*?) (PRIVATE BOXES)( .*)$/ )
1214             {
1215 1         3 $address_start = $1;
1216 1         2 $po_box_type = $2;
1217 1         2 $address_end = $3;
1218 1         4 return($po_box_type, $address_start . $address_end);
1219             }
1220             else
1221             {
1222 14         31 return('',$input)
1223             }
1224             }
1225             #------------------------------------------------------------------------------
1226             # For correct matching, the grammar of each component must include the
1227             # trailing space that separates it from any following word. This should
1228             # now be removed from each component
1229              
1230             sub _trim_trailing_space
1231             {
1232 15     15   17 my ($address) = @_;
1233            
1234 15         17 foreach my $key (keys %{ $address->{components} } )
  15         79  
1235             {
1236 285 100       386 if ($address->{components}{$key} )
1237             {
1238 110         211 $address->{components}{$key} =~ s/ $//g;
1239             }
1240             }
1241             }
1242             #------------------------------------------------------------------------------
1243              
1244             sub _fmt_report_line
1245             {
1246 0     0   0 my ($report_ref,$label,$value) = @_;
1247             # To DO $$ ??
1248 0         0 $$report_ref .= sprintf("%-23.23s '%s'\n",$label,$value);
1249             }
1250             #------------------------------------------------------------------------------
1251              
1252             sub _check_vowel
1253             {
1254 25     25   34 my ($str) = @_;
1255              
1256 25         49 my @words = split(/ /,$str);
1257 25         38 foreach my $word (@words)
1258             {
1259             # Saint, Mount, Junior, Senior (as in Martin Luther KIng Snr)
1260 34 100 66     160 if ( length($word) > 1 and $word !~ /[AEIOUY]|ST|MT|JN?R|SN?R/ )
1261             {
1262 1         4 return(1);
1263             }
1264             }
1265 24         65 return(0);
1266             }
1267             #------------------------------------------------------------------------------
1268              
1269             return(1);