File Coverage

blib/lib/Lingua/EN/AddressParse.pm
Criterion Covered Total %
statement 258 326 79.1
branch 88 122 72.1
condition 21 36 58.3
subroutine 20 23 86.9
pod 6 6 100.0
total 393 513 76.6


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             Lingua::EN::AddressParse - extract components of a street address from free format text
4              
5             =head1 SYNOPSIS
6              
7             use Lingua::EN::AddressParse;
8              
9             my %args =
10             (
11             country => 'US',
12             auto_clean => 1,
13             force_case => 1,
14             abbreviate_subcountry => 0,
15             abbreviated_subcountry_only => 0,
16             force_post_code => 0
17             );
18              
19             my $address = Lingua::EN::AddressParse->new(%args);
20             $error = $address->parse("40 1/2 N OLD MASSACHUSETTS AVE APT 3B Washington Valley Washington 98100: HOLD MAIL");
21            
22             print $address->report;
23            
24             Country address format 'US'
25             Address type 'suburban'
26             Non matching part 'HOLD MAIL '
27             Error '1'
28             Error descriptions 'non matching section : HOLD MAIL '
29             Warning '1'
30             Warning description ''
31             Case all '40 1/2 N Old Massachusetts Ave Apt 3B Washington Valley WA 98100'
32             COMPONENTS ''
33             base_street_name 'Old Massachusetts'
34             post_code '98100'
35             property_identifier '40 1/2'
36             street_direction_prefix 'N'
37             street_name 'N Old Massachusetts'
38             street_type 'Ave'
39             sub_property_identifier '3B'
40             sub_property_type 'Apt'
41             subcountry 'WASHINGTON'
42             suburb 'Washington Valley'
43              
44             %address_components = $address->components;
45             print $address_components{sub_property_type}; # APT
46             print $address_components{sub_property_identifier}; # 3B
47             print $address_components{property_identifier}; # 40 1/2
48            
49             %address_properties = $address->properties;
50             print $address_properties{type}; # suburban
51             print $address_properties{non_matching}; # : HOLD MAIL
52              
53             $correct_casing = $address->case_all;
54              
55              
56             =head1 DESCRIPTION
57              
58             This module takes as input a suburban, rural or postal address in free format
59             text such as,
60              
61             3080 28TH AVE N ST PETERSBURG, FL 33713-3810
62             12 1st Avenue N Suite # 2 Somewhere CA 12345 USA
63             C/O JOHN, KENNETH JR POA 744 WIND RIVER DR SYLVANIA, OH 43560-4317
64              
65             9 Church Street, Abertillery, Mid Glamorgan NP13 1DA
66             27 Bury Street, Abingdon, Oxfordshire OX14 3QT
67              
68             2A O'CONNELL ST KEW NSW 2123
69             12/3-5 AUBREY ST MOUNT VICTORIA VICTORIA 3133
70             "OLD REGRET" WENTWORTH FALLS NSW 2782 AUSTRALIA
71             GPO Box K318, HAYMARKET, NSW 2000
72              
73            
74             and attempts to parse it. If successful, the address is broken
75             down into it's components and useful functions can be performed such as :
76              
77             converting upper or lower case values to title case (2A O'Connell St Kew NSW 2123)
78             extracting the addresses individual components (2A,O'Connell,St,KEW,NSW,2123)
79             determining the type of format the address is in ('suburban')
80              
81              
82             If the address cannot be parsed you have the option of cleaning the address
83             of bad characters, or extracting any portion that was parsed and the portion
84             that failed.
85              
86             This module can be used for analysing and improving the quality of
87             lists of residential and postal addresses.
88              
89             By using a large combination of regular expressiosn with look ahead analysis, patterns
90             can be parsed that confuse many other parsers. Examples are
91              
92             Street names with several street types: Lane Cove Road
93             Suburbs which include street types: Smith Road St Marys
94             Suburbs that include state names: Fort Washington Washington
95              
96             =head1 DEFINITIONS
97              
98             The following terms are used by AddressParse to define the components that
99             can make up an address.
100              
101             Pre cursor : C/O MR A Smith...
102             Sub property identifier : Level 1A Unit 2, Apartment B, Lot 12, Suite # 12 ...
103             Property Identifier : 12/66A, 24-34, 2A, 23B/12C, 12/42-44, 2.5
104             Property name : "Old Regret"
105             Post Box : GP0 Box K123, LPO 2345, RMS 23 ...
106             Road Box : RMB 24A, RMS 234 ...
107             Street Direction: North, SE, Sth. etc
108             Street name : O'Hare, New South Head, The Causeway, Broadway
109             Street type : Road, Rd., St, Lane, Highway, Crescent, Circuit ...
110             Suburb : Dee Why, St. John's Wood ...
111             Sub country : NSW, New South Wales, ACT, NY, New Jersey AZ ...
112             Post (zip) code : 2062, 34532-1234, SG12A 9ET
113             Country : Australia, UK, US or Canada
114              
115              
116              
117             The main address formats currently supported are as follows. (a ? means the component is optional):
118              
119             'suburban' : sub_property(?) property_identifier(?) street street_type suburb subcountry post_code(?)country(?)
120              
121             OR for the USA
122             'suburban' : property_identifier(?) street street_type sub_property(?) suburb subcountry post_code(?) country(?)
123              
124             'rural' : property_name suburb subcountry post_code(?) country(?)
125             'post_box' : post_box suburb subcountry post_code(?) country(?)
126             'road_box' : road_box street street_type suburb subcountry post_code(?) country(?)
127             'road_box' : road_box suburb subcountry post_code(?) country(?)
128            
129             Note that suburb and subcountry are not optional. The accuracy of the parser is
130             improved by providing as much context as possible. Proding a suburb can ehlp to
131             identify street names that would itherwise be ambigious.
132              
133             For the case where you only have a street address, dummy (but still valid) values can be used
134             for suburb (such as 'Somewhere') and sub country (such as 'NY'). These dummy values will
135             be parsed but can be ignored.
136              
137             All formats may contain a precursor
138              
139             Refer to the component grammar defined in the Lingua::EN::AddressParse::Grammar
140             module for a complete list of combinations.
141              
142              
143             =head1 METHODS
144              
145             =head2 new
146              
147             The C method creates an instance of an address object and sets up
148             the grammar used to parse addresses. This must be called before any of the
149             following methods are invoked. Note that the object only needs to be
150             created once, and can be reused with new input data.
151              
152             Various setup options may be defined in a hash that is passed as an
153             optional argument to the C method.
154              
155             my %args =
156             (
157             country => 'US',
158             auto_clean => 1,
159             force_case => 1,
160             abbreviate_subcountry => 1,
161             abbreviated_subcountry_only => 1,
162             force_post_code => 1
163             );
164              
165             my $address = Lingua::EN::AddressParse->new(%args);
166              
167             =over 4
168              
169             =item country
170              
171             The country argument must be specified. It determines the possible list of
172             valid sub countries (states, counties etc, defined in the Locale::SubCountry
173             module) and post code formats. Either the full name or abbreviation may be
174             specified. The currently supported country names and codes are:
175              
176             AU or Australia
177             CA or Canada
178             GB or United Kingdom
179             US or United States
180              
181             All forms of upper/lower case are acceptable in the country's spelling. If a
182             country name is supplied that the module doesn't recognise, it will die.
183              
184             =item force_case (optional)
185              
186             This option only applies to the C method, see below.
187              
188             =item auto_clean (optional)
189              
190             When this option is set to a positive value, the input string is
191             'cleaned' to try and normalise bad patterns. The type of cleaning
192             includes
193              
194             remove non alphanumeric characters
195             remove full stops
196             remove redundant white space
197             add missing space separators
198             expand abbreviations to more common forms
199             remove bracketed annotations
200             fix badly formed sub property identifiers
201              
202             =item abbreviate_subcountry (optional)
203              
204             When this option is set to a positive value, the sub country is forced to it's
205             abbreviated form, so "New South Wales" becomes "NSW". If the sub country is
206             already abbreviated then it's value is not altered.
207              
208             =item abbreviated_subcountry_only (optional)
209              
210             When this option is set to a positive value, only the abbreviated form
211             of sub country is allowed, such as "NSW" and not "New South Wales". This
212             will make parsing quicker and ensure that addresses comply with postal
213             standards that normally permit only abbreviated sub countries.
214              
215             It also avoids matching a sub_country name too early, as in the case of 'Port Washington New Jersey'
216             Normally, 'Washington would be consumed as the sub country, but by first converting
217             the address to 'Port Washington NJ' we avoid this problem
218              
219              
220             =item force_post_code (optional)
221              
222             When this option is set to a positive value, the address must contain
223             a post code. If it does not then an error flag is raised. If this option
224             is set to 0 than a post code is optional.
225              
226             By default for this option is true.
227              
228             =back
229              
230             =head2 parse
231              
232             $error = $address->parse("12/3-5 AUBREY ST VERMONT VIC 3133");
233              
234             The C method takes a single parameter of a text string containing a
235             address. It attempts to parse the address and break it down into the components
236             described below. If the address is parsed successfully, a 0 is returned,
237             otherwise a 1.
238              
239             Note that you can successfully parse all the components of an address and still
240             have an error returned. This occurs when you have non matching data following
241             a valid address. To check if the data is unusable, you also need to use the
242             C method to check the address type is 'unknown'
243              
244             This method is a prerequisite for all the following methods.
245              
246             =head2 components
247              
248             %address = $address->components($upper_case_all);
249             $suburb = $address{suburb};
250            
251             If the optional argument $upper_case_all is set to a postive value, all components
252             are converted to upper case.
253            
254              
255             The C method returns all the address components in a hash. The
256             following keys are used for each component:
257              
258              
259             pre_cursor - such as 'C/O Mr A Smith'
260             po_box_type - such as 'Private Boxes'
261             post_box
262             road_box
263             sub_property_type
264             sub_property_identifier
265             property_identifier
266             property_name
267             level - such as 12th Floor
268             building - such as Tower A
269             street_direction_prefix (such as East, NW, North etc)
270             base_street_name (the name with direction removed, such as "Main" in "East Main St")
271             street_name (the full street name such as "East Main")
272             street_type
273             street_direction_suffix (US only, abbreviated only such as N, SE etc)
274             suburb
275             subcountry
276             post_code
277             country
278              
279             If a component has no matching data for a given address, it's values will be
280             set to the empty string.
281              
282             Each component is converted to title case, meaning the first letter of each
283             component is set to capitals and the remainder to lower case.
284              
285             Proper name capitalisations such as MacNay and O'Brien are observed
286              
287             The following components are not converted to title case:
288              
289             post_box
290             road_box
291             subcountry
292             post_code
293             country
294             street_direction_suffix
295            
296             If your input data is all upper case and you want to retian that format for parsed
297             data, you will need to apply the 'uc' function to each component.
298              
299             =head2 case_all
300              
301             $correct_casing = $address->case_all;
302              
303             The C method does the same thing as the C method except
304             the entire address is returned as a title cased text string.
305              
306             If the force_case option was set in the C method above, address case the
307             entire input string, including any unmatched sections after a recognisable address
308             that failed parsing. This option is useful when you know you have invalid data,
309             but you still want to title case what you have.
310              
311             =head2 properties
312              
313             The C method returns several properties of the address as a hash.
314             The following keys are used for each property -
315              
316             type - either suburban ,rural,post_box,road_box,unknown
317             non_matching - any trailing string not part the address
318              
319              
320             Additional properties can be accessed with the following
321              
322             $address->{original_input}
323             $address->{input_string} - string after auto_clean option has been applied
324             $address->{country_code} - abbreviated Country address format (as defined in the C method)
325             $address->{error} - error flag, 0 = good, 1 = error
326             $address->{error_desc} - text to describe the type of parsing error
327             $address->{warning} - warning flag, 0 = good, 1 = warning
328             $address->{warning_desc} - text to to describe the type of parsing warning(s)
329            
330             Warnings mean that the address has parsed but there may still be errors within it's components
331              
332              
333             =head2 report
334              
335             Create a formatted text report
336              
337             the input string
338             the cleaned input string
339             the country type
340             the address type
341             any non matching part of input string
342             if any parsing errors occurred
343             error description
344             if any parsing warning occurred
345             warning description
346              
347             the name and value of each defined component
348              
349              
350             Returns a string containing a multi line formatted text report
351              
352             =head1 DEPENDENCIES
353              
354             L, L, L
355              
356             =head1 BUGS
357              
358             =head1 LIMITATIONS
359              
360             Streets such as 'The Esplanade' will return a street of 'The Esplanade' and a
361             street type of null string.
362              
363             The abbreviation 'St' can be interpreted as either street or Saint. This leads to
364             ambiguities such as '12 East St Thomas Lane'. This could be 'East Street', suburb of
365             'Thomas Lane' or 'East St Thomas Lane'. And the first pattern is the more common,
366             that is what will match.
367              
368             For US addresses, an ambiguity arises between a street directional suffix and
369             a suburb directional prefix, such as '12 Main St S Springfield CA 92345'. Is it South
370             Main St, or South Springfield? The parser assumes that 'S' belongs to the street
371             description.
372              
373             The huge number of character combinations that can form a valid address makes
374             it is impossible to correctly identify them all.
375              
376             Valid addresses must contain:
377              
378             property address, suburb, subcountry (aka state) in that order.
379              
380             This format is widely accepted in Australia and the US.
381              
382             UK addresses will often include suburb, town, city and county, formats that
383             are very difficult to parse.
384              
385             Property names must be enclosed in single or double quotes like "Old Regret"
386              
387             Because of the large combination of possible addresses defined in the grammar,
388             the program is not very fast.
389              
390              
391             =head1 REFERENCES
392              
393             "The Wordsworth Dictionary of Abbreviations & Acronyms" (1997)
394              
395             Australian Standard AS4212-1994 "Geographic Information Systems -
396             Data Dictionary for transfer of street addressing information"
397              
398             ISO 3166-2:1998, Codes for the representation of names of countries
399             and their subdivisions. Also released as AS/NZS 2632.2:1999
400              
401              
402             =head1 SEE ALSO
403              
404             AddressParse is designed to identify properties, which have a unique physical
405             location. L will also parse addresses for the USA, and can handle
406             locations defined by street intersections, such as: "Hollywood & Vine, Los Angeles, CA"
407             "Mission Street at Valencia Street, San Francisco, CA"
408              
409              
410             L
411             L
412             L
413             L
414              
415             See L
416             for a list of different addressing formats from around the world. And also
417             L
418              
419             =head1 REPOSITORY
420              
421             L
422              
423             =head1 TO DO
424              
425             Define grammar for other languages. Hopefully, all that would be needed is
426             to specify a new module with its own grammar, and inherit all the existing
427             methods. I don't have the knowledge of the naming conventions for non-english
428             languages.
429              
430             =head1 AUTHOR
431              
432             AddressParse was written by Kim Ryan
433              
434             =head1 COPYRIGHT AND LICENSE
435              
436             Copyright (c) 2015 Kim Ryan. All rights reserved.
437              
438             This library is free software; you can redistribute it and/or modify
439             it under the same terms as Perl itself.
440              
441              
442             =cut
443              
444             #------------------------------------------------------------------------------
445              
446             package Lingua::EN::AddressParse;
447              
448 1     1   573 use strict;
  1         1  
  1         23  
449 1     1   3 use Carp;
  1         1  
  1         57  
450 1     1   3 use warnings;
  1         3  
  1         18  
451 1     1   404 use Lingua::EN::AddressParse::Grammar;
  1         2  
  1         37  
452 1     1   748 use Lingua::EN::NameParse;
  1         34374  
  1         52  
453 1     1   7 use Parse::RecDescent;
  1         1  
  1         4  
454              
455             our $VERSION = '1.26';
456              
457             #------------------------------------------------------------------------------
458             # Create a new instance of an address parsing object. This step is time
459             # consuming and should normally only be called once in your program.
460              
461             sub new
462             {
463 4     4 1 940 my $class = shift;
464 4         11 my %args = @_;
465              
466              
467 4 50 33     51 unless (defined $args{country} and $args{country} =~
468             /^(AU|Australia|GB|United Kingdom|US|United States|CA|Canada)$/ )
469             {
470 0         0 croak "Cannot start parser. You must specify a value for the country in the options hash.\nValid options are AUS,GB,US or CA.\n";
471             }
472              
473              
474 4         5 my $address = {};
475 4         10 bless($address,$class);
476              
477             # option defaults
478 4         13 $address->{'force_post_code'} = 1;
479              
480             # Add error checking for invalid keys?
481 4         13 foreach my $curr_key (keys %args)
482             {
483 8         14 $address->{$curr_key} = $args{$curr_key};
484             }
485              
486             # create the grammar tree (this is country dependent)
487 4         19 my $grammar = Lingua::EN::AddressParse::Grammar::_create($address);
488              
489 4         24 $address->{parse} = Parse::RecDescent->new($grammar);
490              
491 4         2508788 return ($address);
492             }
493             #------------------------------------------------------------------------------
494             sub parse
495             {
496 15     15 1 7725 my $address = shift;
497 15         41 my ($input_string) = @_;
498              
499             # Save original data so we can check effect of auto cleaning
500 15         44 $address->{original_input} = $input_string;
501              
502             # Convert to all upper case. This will allow for faster regexp matching in
503             # the grammar tree
504 15         57 $address->{input_string} = uc($input_string);
505              
506 15         44 chomp($address->{input_string});
507              
508 15         25 my $pre_cursor;
509 15         61 ($pre_cursor,$address->{input_string}) = _extract_precursor($address->{input_string});
510            
511             # Replace commas (which can be used to chunk sections of addresses) with spaces
512 15         63 $address->{input_string} =~ s/,/ /g;
513            
514 15 100       47 if ( $address->{auto_clean} )
515             {
516 12         34 $address->{input_string} = _clean($address);
517             }
518              
519 15         25 my $po_box_type;
520 15         53 ($po_box_type,$address->{input_string}) = _extract_po_box_type($address->{input_string});
521            
522 15         27 my $level;
523 15         49 ($level,$address->{input_string}) = _extract_level($address->{input_string});
524            
525 15         20 my $building;
526 15         54 ($building,$address->{input_string}) = _extract_building($address->{input_string});
527            
528              
529            
530             # Normalise sub property ID, 4/22-24 => UNIT 4 22-24, 4 12 => Unit 4 12
531 15 100 100     142 if ($address->{country_code} ne 'US' and $address->{input_string} =~ /^(\d{1,4}[A-Z]{0,2})[\/| ](\d+[ \w-].*)$/ )
532             {
533 2         12 $address->{input_string} = "UNIT $1 $2";
534             }
535              
536             # We need to add a trailing space to the input string. This is because the grammar
537             # tree expects a terminator (the space) fro every production, optionally followed
538             # by other productions or any final non matching text.
539             # This space will be removed in the _assemble function
540 15         34 $address->{input_string} .= ' ';
541              
542 15         49 $address = _assemble($address,$pre_cursor,$po_box_type,$level,$building);
543 15         62 _validate($address);
544              
545              
546 15         64 return($address,$address->{error});
547             }
548              
549             #------------------------------------------------------------------------------
550             # Apply correct capitalisation to each component of an address
551              
552             sub components
553             {
554 17     17 1 105 my $address = shift;
555 17         20 my ($uc_all) = @_;
556              
557 17         23 my %orig_components = %{ $address->{components} };
  17         163  
558              
559 17         36 my (%cased_components);
560 17         55 foreach my $curr_key ( keys %orig_components )
561             {
562 323         186 my $cased_value;
563 323         267 my $curr_value = $orig_components{$curr_key};
564            
565 323 50       386 if ($uc_all)
566             {
567 0         0 $cased_components{$curr_key} = uc($curr_value);
568 0         0 next;
569             }
570            
571              
572 323 100       658 if ( $curr_key =~ /^(base_street_name|street_name|street_type|suburb|property_name|sub_property|pre_cursor|po_box_type|level|building)/ )
    100          
573             {
574              
575 187 100 100     444 if ( $curr_key eq 'street_name' and$curr_value =~ /^US HIGHWAY (.*)/ )
    100          
576             {
577 1         4 $cased_value = "US Highway $1";
578             }
579             elsif ( $curr_key eq 'sub_property_identifier' )
580             {
581             # UNIT, APT ... 12D etc
582              
583 17         39 my @words = split(/ /,$curr_value);
584 17         18 my @cased_words;
585             my $cased_string;
586 17         32 foreach my $word (@words)
587             {
588 5         6 my $cased_word;
589 5 50 66     36 if ( $word =~ /^\d{1,3}(ST|ND|RD|TH)$/)
    50          
590             {
591             # ordinal component, as in 3rd Floor
592 0         0 $cased_word = lc($word);
593             }
594             elsif ( length($word) > 1 and $word !~ /\d/ )
595             {
596             # only need to title case words such as UNIT
597 0         0 $cased_word = Lingua::EN::NameParse::case_surname($word);
598             }
599             else
600             {
601 5         9 $cased_word = $word;
602             }
603 5         8 push(@cased_words,$cased_word);
604              
605             }
606 17         41 $cased_value = join(' ',@cased_words);
607             }
608             else
609             {
610 169 100       162 if ($curr_value)
611             {
612             # Surnames can be used for street's or suburbs so this method
613             # will give correct capitalisation for most cases
614 71         133 $cased_value = Lingua::EN::NameParse::case_surname($curr_value);
615             }
616             else
617             {
618 98         93 $cased_value = '';
619             }
620             }
621             }
622             # retain street_direction,sub country and countries capitalisation, usually uppercase
623             elsif ($curr_key =~ /street_direction/)
624             {
625 34 100 66     147 if (length($curr_value) == 1 or length($curr_value) == 2)
    50          
626             {
627             # N, SE etc is capitalised
628 1         3 $cased_value =$curr_value;
629             }
630             elsif (length($curr_value) > 2)
631             {
632 0         0 $cased_value = Lingua::EN::NameParse::case_surname($curr_value);
633             }
634             else
635             {
636 33         35 $cased_value = '';
637             }
638             }
639             # retain sub country and countries capitalisation, as usually uppercase
640             else
641             {
642 102         107 $cased_value = uc($curr_value);
643             }
644 323         1720 $cased_components{$curr_key} = $cased_value;
645             }
646 17         402 return(%cased_components);
647             }
648             #------------------------------------------------------------------------------
649             # Apply correct capitalisation to an entire address
650              
651             sub case_all
652             {
653 0     0 1 0 my $address = shift;
654              
655 0         0 my @cased_address;
656              
657 0 0       0 unless ( $address->{properties}{type} eq 'unknown' )
658             {
659              
660             # Hash of of lists, indicating the order that address components are assembled in.
661             # Each list element is itself the name of the key value in an address object.
662              
663 0         0 my %component_order=
664             (
665             'rural' => [ qw/pre_cursor property_name suburb subcountry post_code country/],
666             'post_box'=> [ qw/pre_cursor post_box suburb po_box_type subcountry post_code country/ ],
667             'road_box'=> [ qw/pre_cursor road_box street_name street_type suburb subcountry post_code country/ ]
668              
669             );
670 0 0       0 if ( $address->{country} eq 'US' )
671             {
672 0         0 $component_order{'suburban'} = [ qw/pre_cursor property_identifier street_name street_type street_direction_suffix building level sub_property_type sub_property_identifier suburb subcountry post_code country/];
673             }
674             else
675             {
676 0         0 $component_order{'suburban'} = [ qw/pre_cursor building level sub_property_type sub_property_identifier property_identifier street_name street_type suburb subcountry post_code country/ ];
677             }
678              
679 0         0 my %component_vals = $address->components;
680 0         0 my @order = @{ $component_order{$address->{properties}{type} } };
  0         0  
681              
682 0         0 foreach my $component ( @order )
683             {
684             # As some components such as property name are optional, they will appear
685             # in the order array but may or may not have have a value, so check
686             # for undefined values
687 0 0       0 if ( $component_vals{$component} )
688             {
689 0         0 push(@cased_address,$component_vals{$component});
690             }
691             }
692             }
693              
694 0 0 0     0 if ( $address->{error} and $address->{force_case} )
695             {
696             # Despite errors, try to name case non-matching section. As the format
697             # of this section is unknown, surname case will provide the best
698             # approximation
699 0         0 push(@cased_address,&Lingua::EN::NameParse::case_surname($address->{properties}{non_matching}));
700             }
701              
702 0         0 return(join(' ',@cased_address));
703             }
704             #------------------------------------------------------------------------------
705             sub properties
706             {
707 2     2 1 11 my $address = shift;
708 2         2 return(%{ $address->{properties} });
  2         11  
709             }
710              
711             #------------------------------------------------------------------------------
712             # Create a text report to standard output listing
713             # - the input string,
714             # - the name of each defined component
715             # - any non matching component
716              
717             sub report
718             {
719 0     0 1 0 my $address = shift;
720              
721 0         0 my $report = '';
722              
723 0         0 _fmt_report_line(\$report,"Original Input",$address->{original_input});
724 0         0 _fmt_report_line(\$report,"Cleaned Input",$address->{input_string});
725 0         0 _fmt_report_line(\$report,"Country address format",$address->{country_code});
726              
727 0         0 my %props = $address->properties;
728 0 0       0 if ( $props{type} )
729             {
730 0         0 _fmt_report_line(\$report,"Address type",$props{type});
731             }
732              
733              
734 0         0 _fmt_report_line(\$report,"Non matching part",$props{non_matching});
735 0         0 _fmt_report_line(\$report,"Error",$address->{error});
736 0         0 _fmt_report_line(\$report,"Error descriptions",$address->{error_desc});
737 0         0 _fmt_report_line(\$report,"Warning",$address->{error});
738 0         0 _fmt_report_line(\$report,"Warning description",$address->{warning_desc});
739 0         0 _fmt_report_line(\$report,"Case all",$address->case_all);
740              
741              
742 0         0 _fmt_report_line(\$report,"COMPONENTS",'');
743 0         0 my %comps = $address->components;
744 0         0 foreach my $comp ( sort keys %comps)
745             {
746 0 0       0 if (defined($comps{$comp}) )
747             {
748 0         0 _fmt_report_line(\$report,$comp,$comps{$comp});
749             }
750             }
751              
752 0         0 return($report);
753             }
754              
755              
756             #------------------------------------------------------------------------------
757              
758             # PRIVATE METHODS
759              
760             #------------------------------------------------------------------------------
761              
762             sub _assemble
763             {
764              
765 15     15   25 my $address = shift;
766 15         29 my ($pre_cursor,$po_box_type,$level,$building) = @_;
767              
768             # Parse the address according to the rules defined in the AddressParse::Grammar module,
769             # $::RD_TRACE = 1; # for debugging RecDescent output
770             # Use Parse::RecDescent to do the parsing. 'full_address' is a label for the complete grammar tree
771 15         147 my $parsed_address = $address->{parse}->full_address($address->{input_string});
772              
773             # Place components into a separate hash, so they can be easily returned to the user to inspect and modify
774 15         191025 $address->{components} = ();
775              
776 15 100       146 if ($pre_cursor)
777             {
778 1         4 $address->{components}{'pre_cursor'} = $pre_cursor;
779             }
780             else
781             {
782 14         46 $address->{components}{'pre_cursor'} = '';
783             }
784            
785 15 100       51 if ($level)
786             {
787 1         4 $address->{components}{'level'} = $level;
788             }
789             else
790             {
791 14         33 $address->{components}{'level'} = '';
792             }
793            
794 15 100       40 if ($building)
795             {
796 1         3 $address->{components}{'building'} = $building;
797             }
798             else
799             {
800 14         28 $address->{components}{'building'} = '';
801             }
802            
803              
804 15 100       42 if ($po_box_type)
805             {
806 1         4 $address->{components}{'po_box_type'} = $po_box_type;
807             }
808             else
809             {
810 14         36 $address->{components}{'po_box_type'} = '';
811             }
812            
813              
814 15         42 $address->{components}{post_box} = '';
815 15 100       65 if ( $parsed_address->{post_box} )
816             {
817 1         3 $address->{components}{post_box} = $parsed_address->{post_box};
818             }
819              
820 15         30 $address->{components}{road_box} = '';
821 15 50       57 if ( $parsed_address->{road_box} )
822             {
823 0         0 $address->{components}{road_box} = $parsed_address->{road_box};
824             }
825              
826 15         42 $address->{components}{property_name} = '';
827 15 100       44 if ( $parsed_address->{property_name} )
828             {
829 1         4 $address->{components}{property_name} = $parsed_address->{property_name};
830             }
831              
832 15         48 $address->{components}{sub_property_identifier} = '';
833 15         63 $address->{components}{sub_property_type} = '';
834            
835 15 100       46 if ( $parsed_address->{sub_property} )
836             {
837 4 50       31 if ($parsed_address->{sub_property} =~ /^(#|[A-Z]{1,}) (.*)$/ )
    0          
838             {
839             # Such as Unit 24, # 4A etc
840 4         12 $address->{components}{sub_property_type} = $1;
841 4         13 $address->{components}{sub_property_identifier} = $2;
842             }
843             elsif ($parsed_address->{sub_property} =~ /^(\d\w\w) (.*)$/ )
844             {
845             # Such as 1st Floor
846 0         0 $address->{components}{sub_property_type} = $2;
847 0         0 $address->{components}{sub_property_identifier} = $1;
848             }
849             }
850              
851 15         35 $address->{components}{property_identifier} = '';
852 15 100       50 if ( $parsed_address->{property_identifier} )
853             {
854 13         34 $address->{components}{property_identifier} = $parsed_address->{property_identifier};
855             }
856              
857 15         46 $address->{components}{base_street_name} = '';
858 15         26 $address->{components}{street_direction_prefix} = '';
859 15         61 my ($street_direction,$base_street_name) = _get_street_direction($parsed_address->{street_name});
860 15 50       42 if ($street_direction )
861             {
862 0         0 $address->{components}{street_direction_prefix} = $street_direction;
863 0         0 $address->{components}{base_street_name} = $base_street_name;
864             }
865             else
866             {
867 15         36 $address->{components}{base_street_name} = $parsed_address->{street_name};
868             }
869              
870 15         34 $address->{components}{street_name} = '';
871 15         27 $address->{components}{street_type} = '';
872 15 100       45 if ( $parsed_address->{street_name} )
873             {
874             # Streets such as 'The Corso' will parse as street_name = 'The' and street_type = 'Corso', so seperate out
875 13 50       40 if ( $parsed_address->{street_name} eq 'THE ' )
876             {
877 0         0 $address->{components}{street_name} = 'THE ' . $parsed_address->{street_type};
878             }
879             else
880             {
881 13         32 $address->{components}{street_name} = $parsed_address->{street_name};
882 13         25 $address->{components}{street_type} = $parsed_address->{street_type};
883             }
884             }
885              
886              
887 15         41 $address->{components}{street_direction_suffix} = '';
888 15 100       43 if ( $parsed_address->{street_direction_suffix} )
889             {
890 1         2 $address->{components}{street_direction_suffix} = $parsed_address->{street_direction_suffix};
891             }
892              
893              
894 15         45 $address->{components}{suburb} = '';
895 15 50       36 if ( $parsed_address->{suburb} )
896             {
897 15         45 $address->{components}{suburb} = $parsed_address->{suburb};
898             }
899              
900 15         28 $address->{components}{subcountry} = '';
901 15 50       58 if ( $parsed_address->{subcountry} )
902             {
903 15         24 my $sub_country = $parsed_address->{subcountry};
904              
905             # Force sub country to abbreviated form, South Australia becomes SA, Michigan become MI etc
906 15 100       39 if ($address->{abbreviate_subcountry})
907             {
908 7         51 my $country = Locale::SubCountry->new($address->{country});
909 7         172 my $code = $country->code($sub_country);
910 7 100       619 if ( $code ne 'unknown' )
911             {
912 1         4 $address->{components}{subcountry} = $code;
913             }
914             # sub country already abbreviated
915             else
916             {
917 6         20 $address->{components}{subcountry} = $sub_country;
918             }
919             }
920             else
921             {
922 8         24 $address->{components}{subcountry} = $sub_country;
923             }
924             }
925              
926 15         27 $address->{components}{post_code} = '';
927 15 50       48 if ( $parsed_address->{post_code} )
928             {
929 15         29 $address->{components}{post_code} = $parsed_address->{post_code};
930             }
931              
932 15         32 $address->{components}{country} = '';
933 15 100       44 if ( $parsed_address->{country} )
934             {
935 1         2 $address->{components}{country} = $parsed_address->{country};
936             }
937              
938 15         40 $address->{properties} = ();
939              
940 15         43 $address->{properties}{non_matching} = '';
941 15 100       43 if ( $parsed_address->{non_matching} )
942             {
943 1         2 $address->{properties}{non_matching} = $parsed_address->{non_matching};
944             }
945 15         36 $address->{properties}{type} = $parsed_address->{type};
946            
947 15         48 _trim_trailing_space($address);
948              
949 15         64 return($address);
950             }
951              
952             #-------------------------------------------------------------------------------
953             #
954             sub _get_street_direction
955             {
956 15     15   30 my ($street_name) = @_;
957              
958 15         19 my $street_direction;
959             my $base_street_name;
960              
961 15 100       50 unless ($street_name)
962             {
963 2         4 return;
964             }
965              
966 13         85 my @words = split(/\s/,$street_name);
967 13 100       39 if (@words > 1)
968             {
969 4 50       18 if ( $words[0] =~ /^(N|NE|NW|E|S|SE|SW|W|NORTH|EAST|SOUTH|WEST|NTH|STH)$/ )
970             {
971 0         0 $street_direction = $1;
972 0         0 shift(@words);
973 0         0 $base_street_name = join(' ',@words);
974             }
975             }
976 13         39 return($street_direction,$base_street_name);
977              
978             }
979              
980             #------------------------------------------------------------------------------
981             # Check for several different types of syntax errors
982              
983             sub _validate
984             {
985 15     15   28 my $address = shift;
986 15         35 $address->{error} = 0;
987 15         30 $address->{error_desc} = '';
988 15         25 $address->{warning} = 0;
989 15         37 $address->{warning_desc} = '';
990              
991 15 100       40 if ( $address->{properties}{non_matching} )
992             {
993 1         2 $address->{error} = 1;
994 1         4 $address->{error_desc} = 'non matching section : ' . $address->{properties}{non_matching};
995             }
996             else
997             {
998 14 50       40 if ( $address->{properties}{type} eq 'unknown' )
999             {
1000 0         0 $address->{error} = 1;
1001 0         0 $address->{error_desc} .= 'unknown address format';
1002             }
1003             else
1004             {
1005 14 50 33     107 if ($address->{force_post_code} and not $address->{components}{post_code})
1006             {
1007 0         0 $address->{error} = 1;
1008 0         0 $address->{error_desc} .= ':no post code';
1009             }
1010              
1011             # illegal characters found, note a '#' can appear as an abbreviation for number in USA addresses
1012 14 50       66 if ( $address->{input_string} =~ /[^"A-Z0-9'\-\.,&#\/ ]/ )
1013             {
1014             # Note, if auto_clean is on, illegal characters will have been removed
1015             # for second parsing and no error flag or message reported
1016 0         0 $address->{error} = 1;
1017 0         0 $address->{error_desc} .= ':illegal chars';
1018             }
1019 14 100       37 if ( $address->{properties}{type} eq 'suburban' )
1020             {
1021 12         26 my $street = $address->{components}{street_name};
1022 12 100       42 if ($street !~ /\d/ )
1023             {
1024             # Not an ordinal or single letter street type
1025 11 100       31 if ( _check_vowel($address->{components}{base_street_name}) )
1026             {
1027             # street name must have a vowel sound,
1028 1         2 $address->{warning} = 1;
1029 1         6 $address->{warning_desc} .= ";no vowel sound in street word : $address->{components}{base_street_name}";
1030             }
1031             }
1032             }
1033              
1034 14 50       40 if ( _check_vowel($address->{components}{suburb}) )
1035             {
1036 0         0 $address->{warning} = 1;
1037 0         0 $address->{warning_desc} .= ";no vowel sound in suburb word : $address->{components}{suburb}";
1038             }
1039             }
1040             }
1041             }
1042             #-------------------------------------------------------------------------------
1043             # Purge the input string of illegal or redundant characters.
1044             # Correct malformed patterns
1045              
1046             sub _clean
1047             {
1048 12     12   13 my $address = shift;
1049              
1050 12         29 my ($input) = $address->{input_string};
1051              
1052             # Remove annotations enclosed in brackets, such as 1 Smith St (Cnr Brown St)
1053 12         25 $input =~ s|\(.*\)||;
1054            
1055             # Normalise half house numbers, such as 12.5 to 12 1/2. This is needed now before full stops are stripped out
1056 12         26 $input =~ s|^(\d{1,4})\.5 |$1 1/2 |;
1057              
1058             # strip full stops, remove illegal characters
1059             # & can be part of property name
1060             # hash (#) may denote number for USA address
1061             # quotes can occur as property name delimiters
1062              
1063 12         32 $input =~ s|[^A-Za-z0-9&#/'" -]||go;
1064              
1065             # remove repeating, leading and trailing spaces
1066 12         25 $input =~ s| +| |go ;
1067 12         28 $input =~ s|^ ||;
1068 12         20 $input =~ s| $||;
1069              
1070              
1071             # Expand abbreviations that are too short
1072              
1073 12         26 $input =~ s/LAKE ST (GEORGE|CLAIR)/LAKE SAINT $1/; # otherwise St gets consumed to early as 'Street'
1074 12         28 $input =~ s| CSEWY | CAUSEWAY |;
1075              
1076             # Standardise abbreviations
1077              
1078 12         23 $input =~ s|STR |ST |;
1079 12         24 $input =~ s|TERR |TERRACE |;
1080            
1081 12         14 $input =~ s|^FCTR?Y |FACTORY |;
1082 12         19 $input =~ s|^FACT?R?Y? |FACTORY |;
1083              
1084 12         27 $input =~ s|LVL |LEVEL |; # sub property identifiers
1085 12         34 $input =~ s|^UN? |UNIT |;
1086 12         25 $input =~ s|^U(\d+)|UNIT $1|;
1087              
1088             # Fix badly formed number dividers such as home unit format of 14/ 12 becomes 14/12, 2- 7A becomes 2-7A
1089 12         17 $input =~ s|/ |/|;
1090 12         23 $input =~ s| /|/|;
1091 12         22 $input =~ s|- |-|;
1092 12         22 $input =~ s| -|-|;
1093              
1094             # Remove redundant spaces in property identifiers, 21 B Smith St becomes 21B Smith St
1095              
1096 12 50       61 if ( $input !~ /^\d+ [A-Z] (ST|AVE)/ )
1097             {
1098             # Don't remove space before single letter streets such as 21 B Street
1099 12 50       46 if ( $address->{country_code} eq 'US' )
1100             {
1101             # Note cannot use N,E,S,W as they can be street direction prefix, as in 1 E MAIN STREET
1102             # Assume that the direction prefix is the more likely case
1103 0         0 $input =~ s|^(\d+) ([A-DF-MO-RT-VX-Z] )|$1$2|;
1104             }
1105             else
1106             {
1107 12         42 $input =~ s|^(\d+) ([A-Z] )|$1$2|;
1108             }
1109             }
1110              
1111             # Normalise sub property identifiers
1112 12 50       30 if ( $address->{country_code} eq 'US' )
1113             {
1114             # Fix US sub property identifiers that appear after street name and type
1115             # add space between # or 'Apt' and the number so #2 becomes '# 2'
1116 0         0 $input =~ s| #(\d)| # $1|;
1117 0         0 $input =~ s| #([A-Z])| # $1|;
1118 0         0 $input =~ s| (APT)(\d)| $1 $2|i;
1119            
1120 0         0 $input =~ s| (APT)-(\w)| $1 $2|i;
1121              
1122             # remove redundant space so # 34 B becomes # 34B
1123 0         0 $input =~ s| # (\d+) (\w) | # $1$2 |;
1124              
1125             # remove redundant '#'
1126 0         0 $input =~ s/ (APT|SUITE|UNIT) #/ $1 /;
1127             # still to test
1128            
1129             }
1130             else
1131             {
1132             # Add a space to separate sub property type from number, UNIT2 becomes UNIT 2
1133 12         33 $input =~ s/^(UNIT|LOT|APT|SHOP)(\d)/$1 $2/;
1134             }
1135              
1136             # Remove redundant slash or dash
1137             # Unit 1B/22, becomes Unit 1B 22, Flat 2-12 becomes Flat 2 12
1138 12         29 $input =~ s/^([A-Z]{2,}) (\d+[A-Z]?)[\/-]/$1 $2 /;
1139             # Unit J1/ 39 becomes Unit J1 39
1140 12         25 $input =~ s/^([A-Z]{2,}) ([A-Z]\d{0,3})[\/-]/$1 $2 /;
1141              
1142              
1143             # remove dash that does not from a sequence, such as D-5 or 22-A
1144 12         21 $input =~ s|([A-Z])-(\d)|$1$2|;
1145 12         21 $input =~ s|(\d)-([A-Z])|$1$2|;
1146            
1147              
1148 12         38 return($input);
1149             }
1150             #-------------------------------------------------------------------------------
1151             # Remove any "care of" type of precursor from the main address
1152             # such as: C/O BRAKEFIELD BETTY S PO BOX 214 GULF HAMMOCK, FL 32639-0214
1153             # It will later be saved as an attribute in the address object
1154              
1155             sub _extract_precursor
1156             {
1157 15     15   25 my ($input) = @_;
1158 15         25 my ($pre_cursor,$address_start,$address_end);
1159              
1160 15 100       92 if ($input =~ m{^(C/O.*?|ATTN.*?) (\d+|PO BOX)( .*)})
1161             {
1162 1         5 $pre_cursor = $1;
1163 1         3 $address_start = $2;
1164 1         3 $address_end = $3;
1165 1         6 return($pre_cursor, $address_start . $address_end);
1166             }
1167             else
1168             {
1169 14         61 return('',$input)
1170             }
1171             }
1172             #-------------------------------------------------------------------------------
1173             # Remove any level or floor info such as:
1174             # 12 Smith St Floor 2
1175             # Level 22 Suite 3 12 Main St
1176             # It will later be saved as an attribute in the address object
1177              
1178             sub _extract_level
1179             {
1180 15     15   21 my ($input) = @_;
1181 15         19 my ($level);
1182            
1183 15 100 33     238 if
      66        
      66        
1184             (
1185             # Level info could be at start of string so first space is optional
1186             $input =~ / ?((FIRST|SECONND|THIRD|FOURTH|FIFTH|SIXTH) (FLOOR|FLR|FL) )/ or
1187             $input =~ / ?(\d{1,2}(ST|ND|RD|TH) (FLOOR|FLR|FL) )/ or
1188             $input =~ / ?(LEVEL (\d{1,2}|[GM])[\/ -])/ or
1189             $input =~ / ?((FLOOR|FLR|FL) \d{1,2}[\/ -])/
1190             )
1191             {
1192 1         5 $level = $1;
1193 1         4 $level =~ s|/||;
1194 1         3 $level =~ s|-||;
1195 1         17 $input =~ s/$level//;
1196             }
1197            
1198 15         47 return($level,$input);
1199             }
1200             #-------------------------------------------------------------------------------
1201             # Remove any building info such as:
1202             # Building 2 Level 12 123 Smith St
1203             # 12 Main St Tower A Level 2
1204             # It will later be saved as an attribute in the address object
1205              
1206             sub _extract_building
1207             {
1208 15     15   18 my ($input) = @_;
1209 15         18 my ($building);
1210            
1211 15         64 my $bld = qr{BLOCK|BLDG?|BUILDING|TOWER};
1212 15         39 my $id = qr{[A-Z]|A[A-Z]|\d+|\d{1,3}[A-Z]|[A-Z]\d{1,3}}; # AA or 12 or 32C or C12
1213            
1214 15 100 66     315 if
1215             (
1216             $input =~ / ($bld $id) / or $input =~ /^($bld $id) /
1217             )
1218             {
1219 1         3 $building = $1;
1220 1         3 $building =~ s|/||;
1221 1         2 $building =~ s|-||;
1222 1         12 $input =~ s/$building//;
1223             }
1224             # TO DO, North, East etc Building?
1225            
1226 15         62 return($building,$input);
1227             }
1228              
1229             #-------------------------------------------------------------------------------
1230             # Remove any description that follows the suburb after the main address
1231             # such as: PO BOX 1305 BIBRA LAKE PRIVATE BOXES WA 6965"
1232             # It will be saved as an address attribute
1233              
1234             sub _extract_po_box_type
1235             {
1236 15     15   23 my ($input) = @_;
1237 15         34 my ($po_box_type,$address_start,$address_end);
1238              
1239 15 100       59 if ($input =~ /^(.*?) (PRIVATE BOXES)( .*)$/ )
1240             {
1241 1         4 $address_start = $1;
1242 1         3 $po_box_type = $2;
1243 1         2 $address_end = $3;
1244 1         5 return($po_box_type, $address_start . $address_end);
1245             }
1246             else
1247             {
1248 14         45 return('',$input)
1249             }
1250             }
1251             #------------------------------------------------------------------------------
1252             # For correct matching, the grammar of each component must include the
1253             # trailing space that separates it from any following word. This should
1254             # now be removed from each component
1255              
1256             sub _trim_trailing_space
1257             {
1258 15     15   28 my ($address) = @_;
1259            
1260 15         25 foreach my $key (keys %{ $address->{components} } )
  15         93  
1261             {
1262 285 100       379 if ($address->{components}{$key} )
1263             {
1264 110         222 $address->{components}{$key} =~ s/ $//g;
1265             }
1266             }
1267             }
1268             #------------------------------------------------------------------------------
1269              
1270             sub _fmt_report_line
1271             {
1272 0     0   0 my ($report_ref,$label,$value) = @_;
1273             # To DO $$ ??
1274 0         0 $$report_ref .= sprintf("%-23.23s '%s'\n",$label,$value);
1275             }
1276             #------------------------------------------------------------------------------
1277              
1278             sub _check_vowel
1279             {
1280 25     25   27 my ($str) = @_;
1281              
1282 25         58 my @words = split(/ /,$str);
1283 25         34 foreach my $word (@words)
1284             {
1285             # Saint, Mount, Junior, Senior (as in Dr Martin Luther King Snr)
1286 34 100 66     192 if ( length($word) > 1 and $word !~ /[AEIOUY]|ST|MT|DR|JN?R|SN?R/ )
1287             {
1288 1         4 return(1);
1289             }
1290             }
1291 24         69 return(0);
1292             }
1293             #------------------------------------------------------------------------------
1294              
1295             return(1);