File Coverage

blib/lib/Lingua/EN/AddressParse.pm
Criterion Covered Total %
statement 262 331 79.1
branch 90 124 72.5
condition 21 36 58.3
subroutine 20 23 86.9
pod 6 6 100.0
total 399 520 76.7


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             Lingua::EN::AddressParse - extract components of a street address from free format text
4              
5             =head1 SYNOPSIS
6              
7             use Lingua::EN::AddressParse;
8              
9             my %args =
10             (
11             country => 'US',
12             auto_clean => 1,
13             force_case => 1,
14             abbreviate_subcountry => 0,
15             abbreviated_subcountry_only => 0,
16             force_post_code => 0
17             );
18              
19             my $address = Lingua::EN::AddressParse->new(%args);
20             $error = $address->parse("40 1/2 N OLD MASSACHUSETTS AVE APT 3B Washington Valley Washington 98100: HOLD MAIL");
21            
22             print $address->report;
23            
24             Country address format 'US'
25             Address type 'suburban'
26             Non matching part 'HOLD MAIL '
27             Error '1'
28             Error descriptions 'non matching section : HOLD MAIL '
29             Warning '1'
30             Warning description ''
31             Case all '40 1/2 N Old Massachusetts Ave Apt 3B Washington Valley WA 98100'
32             COMPONENTS ''
33             base_street_name 'Old Massachusetts'
34             post_code '98100'
35             property_identifier '40 1/2'
36             street_direction_prefix 'N'
37             street_name 'N Old Massachusetts'
38             street_type 'Ave'
39             sub_property_identifier '3B'
40             sub_property_type 'Apt'
41             subcountry 'WASHINGTON'
42             suburb 'Washington Valley'
43              
44             %address_components = $address->components;
45             print $address_components{sub_property_type}; # APT
46             print $address_components{sub_property_identifier}; # 3B
47             print $address_components{property_identifier}; # 40 1/2
48            
49             %address_properties = $address->properties;
50             print $address_properties{type}; # suburban
51             print $address_properties{non_matching}; # : HOLD MAIL
52              
53             $correct_casing = $address->case_all;
54              
55              
56             =head1 DESCRIPTION
57              
58             This module takes as input a suburban, rural or postal address in free format
59             text such as,
60              
61             3080 28TH AVE N ST PETERSBURG, FL 33713-3810
62             12 1st Avenue N Suite # 2 Somewhere CA 12345 USA
63             C/O JOHN, KENNETH JR POA 744 WIND RIVER DR SYLVANIA, OH 43560-4317
64              
65             9 Church Street, Abertillery, Mid Glamorgan NP13 1DA
66             27 Bury Street, Abingdon, Oxfordshire OX14 3QT
67              
68             2A O'CONNELL ST KEW NSW 2123
69             12/3-5 AUBREY ST MOUNT VICTORIA VICTORIA 3133
70             "OLD REGRET" WENTWORTH FALLS NSW 2782 AUSTRALIA
71             GPO Box K318, HAYMARKET, NSW 2000
72              
73            
74             and attempts to parse it. If successful, the address is broken
75             down into it's components and useful functions can be performed such as :
76              
77             converting upper or lower case values to title case (2A O'Connell St Kew NSW 2123)
78             extracting the addresses individual components (2A,O'Connell,St,KEW,NSW,2123)
79             determining the type of format the address is in ('suburban')
80              
81              
82             If the address cannot be parsed you have the option of cleaning the address
83             of bad characters, or extracting any portion that was parsed and the portion
84             that failed.
85              
86             This module can be used for analysing and improving the quality of
87             lists of residential and postal addresses.
88              
89             By using a large combination of regular expressiosn with look ahead analysis, patterns
90             can be parsed that confuse many other parsers. Examples are
91              
92             Street names with several street types: Lane Cove Road
93             Suburbs which include street types: Smith Road St Marys
94             Suburbs that include state names: Fort Washington Washington
95              
96             =head1 DEFINITIONS
97              
98             The following terms are used by AddressParse to define the components that
99             can make up an address.
100              
101             Pre cursor : C/O MR A Smith...
102             Sub property identifier : Level 1A Unit 2, Apartment B, Lot 12, Suite # 12 ...
103             Property Identifier : 12/66A, 24-34, 2A, 23B/12C, 12/42-44, 2.5
104             Property name : "Old Regret"
105             Post Box : GP0 Box K123, LPO 2345, RMS 23 ...
106             Road Box : RMB 24A, RMS 234 ...
107             Street Direction: North, SE, Sth. etc
108             Street name : O'Hare, New South Head, The Causeway, Broadway
109             Street type : Road, Rd., St, Lane, Highway, Crescent, Circuit ...
110             Suburb : Dee Why, St. John's Wood ...
111             Sub country : NSW, New South Wales, ACT, NY, New Jersey AZ ...
112             Post (zip) code : 2062, 34532-1234, SG12A 9ET
113             Country : Australia, UK, US or Canada
114              
115              
116              
117             The main address formats currently supported are as follows. (a ? means the component is optional):
118              
119             'suburban' : sub_property(?) property_identifier(?) street street_type suburb subcountry post_code(?)country(?)
120              
121             OR for the USA
122             'suburban' : property_identifier(?) street street_type sub_property(?) suburb subcountry post_code(?) country(?)
123              
124             'rural' : property_name suburb subcountry post_code(?) country(?)
125             'post_box' : post_box suburb subcountry post_code(?) country(?)
126             'road_box' : road_box street street_type suburb subcountry post_code(?) country(?)
127             'road_box' : road_box suburb subcountry post_code(?) country(?)
128            
129             Note that suburb and subcountry are not optional. The accuracy of the parser is
130             improved by providing as much context as possible. Proding a suburb can ehlp to
131             identify street names that would itherwise be ambigious.
132              
133             For the case where you only have a street address, dummy (but still valid) values can be used
134             for suburb (such as 'Somewhere') and sub country (such as 'NY'). These dummy values will
135             be parsed but can be ignored.
136              
137             All formats may contain a precursor
138              
139             Refer to the component grammar defined in the Lingua::EN::AddressParse::Grammar
140             module for a complete list of combinations.
141              
142              
143             =head1 METHODS
144              
145             =head2 new
146              
147             The C method creates an instance of an address object and sets up
148             the grammar used to parse addresses. This must be called before any of the
149             following methods are invoked. Note that the object only needs to be
150             created once, and can be reused with new input data.
151              
152             Various setup options may be defined in a hash that is passed as an
153             optional argument to the C method.
154              
155             my %args =
156             (
157             country => 'US',
158             auto_clean => 1,
159             force_case => 1,
160             abbreviate_subcountry => 1,
161             abbreviated_subcountry_only => 1,
162             force_post_code => 1
163             );
164              
165             my $address = Lingua::EN::AddressParse->new(%args);
166              
167             =over 4
168              
169             =item country
170              
171             The country argument must be specified. It determines the possible list of
172             valid sub countries (states, counties etc, defined in the Locale::SubCountry
173             module) and post code formats. Either the full name or abbreviation may be
174             specified. The currently supported country names and codes are:
175              
176             AU or Australia
177             CA or Canada
178             GB or United Kingdom
179             US or United States
180              
181             All forms of upper/lower case are acceptable in the country's spelling. If a
182             country name is supplied that the module doesn't recognise, it will die.
183              
184             =item force_case (optional)
185              
186             This option only applies to the C method, see below.
187              
188             =item auto_clean (optional)
189              
190             When this option is set to a positive value, the input string is
191             'cleaned' to try and normalise bad patterns. The type of cleaning
192             includes
193              
194             remove non alphanumeric characters
195             remove full stops
196             remove redundant white space
197             add missing space separators
198             expand abbreviations to more common forms
199             remove bracketed annotations
200             fix badly formed sub property identifiers
201              
202             =item abbreviate_subcountry (optional)
203              
204             When this option is set to a positive value, the sub country is forced to it's
205             abbreviated form, so "New South Wales" becomes "NSW". If the sub country is
206             already abbreviated then it's value is not altered.
207              
208             =item abbreviated_subcountry_only (optional)
209              
210             When this option is set to a positive value, only the abbreviated form
211             of sub country is allowed, such as "NSW" and not "New South Wales". This
212             will make parsing quicker and ensure that addresses comply with postal
213             standards that normally permit only abbreviated sub countries.
214              
215             It also avoids matching a sub_country name too early, as in the case of 'Port Washington New Jersey'
216             Normally, 'Washington would be consumed as the sub country, but by first converting
217             the address to 'Port Washington NJ' we avoid this problem
218              
219              
220             =item force_post_code (optional)
221              
222             When this option is set to a positive value, the address must contain
223             a post code. If it does not then an error flag is raised. If this option
224             is set to 0 than a post code is optional.
225              
226             By default for this option is true.
227              
228             =back
229              
230             =head2 parse
231              
232             $error = $address->parse("12/3-5 AUBREY ST VERMONT VIC 3133");
233              
234             The C method takes a single parameter of a text string containing a
235             address. It attempts to parse the address and break it down into the components
236             described below. If the address is parsed successfully, a 0 is returned,
237             otherwise a 1.
238              
239             Note that you can successfully parse all the components of an address and still
240             have an error returned. This occurs when you have non matching data following
241             a valid address. To check if the data is unusable, you also need to use the
242             C method to check the address type is 'unknown'
243              
244             This method is a prerequisite for all the following methods.
245              
246             =head2 components
247              
248             %address = $address->components($upper_case_all);
249             $suburb = $address{suburb};
250            
251             If the optional argument $upper_case_all is set to a postive value, all components
252             are converted to upper case.
253            
254              
255             The C method returns all the address components in a hash. The
256             following keys are used for each component:
257              
258              
259             pre_cursor - such as 'C/O Mr A Smith'
260             po_box_type - such as 'Private Boxes'
261             post_box
262             road_box
263             sub_property_type
264             sub_property_identifier
265             property_identifier
266             property_name
267             level - such as 12th Floor
268             building - such as Tower A
269             street_direction_prefix (such as East, NW, North etc)
270             base_street_name (the name with direction removed, such as "Main" in "East Main St")
271             street_name (the full street name such as "East Main")
272             street_type
273             street_direction_suffix (US only, abbreviated only such as N, SE etc)
274             suburb
275             subcountry
276             post_code
277             country
278              
279             If a component has no matching data for a given address, it's values will be
280             set to the empty string.
281              
282             Each component is converted to title case, meaning the first letter of each
283             component is set to capitals and the remainder to lower case.
284              
285             Proper name capitalisations such as MacNay and O'Brien are observed
286              
287             The following components are not converted to title case:
288              
289             post_box
290             road_box
291             subcountry
292             post_code
293             country
294             street_direction_suffix
295            
296             If your input data is all upper case and you want to retian that format for parsed
297             data, you will need to apply the 'uc' function to each component.
298              
299             =head2 case_all
300              
301             $correct_casing = $address->case_all;
302              
303             The C method does the same thing as the C method except
304             the entire address is returned as a title cased text string.
305              
306             If the force_case option was set in the C method above, address case the
307             entire input string, including any unmatched sections after a recognisable address
308             that failed parsing. This option is useful when you know you have invalid data,
309             but you still want to title case what you have.
310              
311             =head2 properties
312              
313             The C method returns several properties of the address as a hash.
314             The following keys are used for each property -
315              
316             type - either suburban ,rural,post_box,road_box,unknown
317             non_matching - any trailing string not part the address
318              
319              
320             Additional properties can be accessed with the following
321              
322             $address->{original_input}
323             $address->{input_string} - string after auto_clean option has been applied
324             $address->{country_code} - abbreviated Country address format (as defined in the C method)
325             $address->{error} - error flag, 0 = good, 1 = error
326             $address->{error_desc} - text to describe the type of parsing error
327             $address->{warning} - warning flag, 0 = good, 1 = warning
328             $address->{warning_desc} - text to to describe the type of parsing warning(s)
329            
330             Warnings mean that the address has parsed but there may still be errors within it's components
331              
332              
333             =head2 report
334              
335             Create a formatted text report
336              
337             the input string
338             the cleaned input string
339             the country type
340             the address type
341             any non matching part of input string
342             if any parsing errors occurred
343             error description
344             if any parsing warning occurred
345             warning description
346              
347             the name and value of each defined component
348              
349              
350             Returns a string containing a multi line formatted text report
351              
352             =head1 DEPENDENCIES
353              
354             L, L, L
355              
356             =head1 BUGS
357              
358             =head1 LIMITATIONS
359              
360             Streets such as 'The Esplanade' will return a street of 'The Esplanade' and a
361             street type of null string.
362              
363             The abbreviation 'St' can be interpreted as either street or Saint. This leads to
364             ambiguities such as '12 East St Thomas Lane'. This could be 'East Street', suburb of
365             'Thomas Lane' or 'East St Thomas Lane'. And the first pattern is the more common,
366             that is what will match.
367              
368             For US addresses, an ambiguity arises between a street directional suffix and
369             a suburb directional prefix, such as '12 Main St S Springfield CA 92345'. Is it South
370             Main St, or South Springfield? The parser assumes that 'S' belongs to the street
371             description.
372              
373             The huge number of character combinations that can form a valid address makes
374             it is impossible to correctly identify them all.
375              
376             Valid addresses must contain:
377              
378             property address, suburb, subcountry (aka state) in that order.
379              
380             This format is widely accepted in Australia and the US.
381              
382             UK addresses will often include suburb, town, city and county, formats that
383             are very difficult to parse.
384              
385             Property names must be enclosed in single or double quotes like "Old Regret"
386              
387             Because of the large combination of possible addresses defined in the grammar,
388             the program is not very fast.
389              
390              
391             =head1 REFERENCES
392              
393             "The Wordsworth Dictionary of Abbreviations & Acronyms" (1997)
394              
395             Australian Standard AS4212-1994 "Geographic Information Systems -
396             Data Dictionary for transfer of street addressing information"
397              
398             ISO 3166-2:1998, Codes for the representation of names of countries
399             and their subdivisions. Also released as AS/NZS 2632.2:1999
400              
401              
402             =head1 SEE ALSO
403              
404             AddressParse is designed to identify properties, which have a unique physical
405             location. L will also parse addresses for the USA, and can handle
406             locations defined by street intersections, such as: "Hollywood & Vine, Los Angeles, CA"
407             "Mission Street at Valencia Street, San Francisco, CA"
408              
409              
410             L
411             L
412             L
413             L
414              
415             See L
416             for a list of different addressing formats from around the world. And also
417             L
418              
419             =head1 REPOSITORY
420              
421             L
422              
423             =head1 TO DO
424              
425             Define grammar for other languages. Hopefully, all that would be needed is
426             to specify a new module with its own grammar, and inherit all the existing
427             methods. I don't have the knowledge of the naming conventions for non-english
428             languages.
429              
430             =head1 AUTHOR
431              
432             AddressParse was written by Kim Ryan
433              
434             =head1 COPYRIGHT AND LICENSE
435              
436             Copyright (c) 2018 Kim Ryan. All rights reserved.
437              
438             This library is free software; you can redistribute it and/or modify
439             it under the same terms as Perl itself.
440              
441              
442             =cut
443              
444             #------------------------------------------------------------------------------
445              
446             package Lingua::EN::AddressParse;
447              
448 1     1   491 use strict;
  1         2  
  1         21  
449 1     1   4 use Carp;
  1         1  
  1         70  
450 1     1   5 use warnings;
  1         2  
  1         18  
451 1     1   417 use Lingua::EN::AddressParse::Grammar;
  1         2  
  1         28  
452 1     1   570 use Lingua::EN::NameParse;
  1         37066  
  1         64  
453 1     1   12 use Parse::RecDescent;
  1         3  
  1         6  
454              
455             our $VERSION = '1.27';
456              
457             #------------------------------------------------------------------------------
458             # Create a new instance of an address parsing object. This step is time
459             # consuming and should normally only be called once in your program.
460              
461             sub new
462             {
463 4     4 1 1045 my $class = shift;
464 4         12 my %args = @_;
465              
466              
467 4 50 33     40 unless (defined $args{country} and $args{country} =~ /^(AU|Australia|GB|United Kingdom|US|United States|CA|Canada)$/ )
468             {
469 0         0 croak "Cannot start parser. You must specify a value for the country in the options hash.\nValid options are AU,GB,US or CA.\n";
470             }
471            
472 4         11 my $address = {};
473 4         8 bless($address,$class);
474              
475             # option defaults
476 4         14 $address->{'force_post_code'} = 1;
477              
478             # Add error checking for invalid keys?
479 4         11 foreach my $curr_key (keys %args)
480             {
481 8         18 $address->{$curr_key} = $args{$curr_key};
482             }
483              
484             # create the grammar tree (this is country dependent)
485 4         18 my $grammar = Lingua::EN::AddressParse::Grammar::_create($address);
486              
487 4         26 $address->{parse} = Parse::RecDescent->new($grammar);
488              
489 4         3011169 return ($address);
490             }
491             #------------------------------------------------------------------------------
492             sub parse
493             {
494 15     15 1 6906 my $address = shift;
495 15         39 my ($input_string) = @_;
496              
497             # Save original data so we can check effect of auto cleaning
498 15         38 $address->{original_input} = $input_string;
499              
500             # Convert to all upper case. This will allow for faster regexp matching in
501             # the grammar tree
502 15         41 $address->{input_string} = uc($input_string);
503              
504 15         40 chomp($address->{input_string});
505              
506 15         32 my $pre_cursor;
507 15         68 ($pre_cursor,$address->{input_string}) = _extract_precursor($address->{input_string});
508            
509             # Replace commas (which can be used to chunk sections of addresses) with spaces
510 15         47 $address->{input_string} =~ s/,/ /g;
511            
512 15 100       69 if ( $address->{auto_clean} )
513             {
514 12         33 $address->{input_string} = _clean($address);
515             }
516              
517 15         22 my $po_box_type;
518 15         55 ($po_box_type,$address->{input_string}) = _extract_po_box_type($address->{input_string});
519            
520 15         30 my $level;
521 15         47 ($level,$address->{input_string}) = _extract_level($address->{input_string});
522            
523 15         29 my $building;
524 15         47 ($building,$address->{input_string}) = _extract_building($address->{input_string});
525            
526              
527            
528             # Normalise sub property ID, 4/22-24 => UNIT 4 22-24, 4 12 => Unit 4 12
529 15 100 100     97 if ($address->{country_code} ne 'US' and $address->{input_string} =~ /^(\d{1,4}[A-Z]{0,2})[\/| ](\d+[ \w-].*)$/ )
530             {
531 2         11 $address->{input_string} = "UNIT $1 $2";
532             }
533              
534             # We need to add a trailing space to the input string. This is because the grammar
535             # tree expects a terminator (the space) fro every production, optionally followed
536             # by other productions or any final non matching text.
537             # This space will be removed in the _assemble function
538 15         35 $address->{input_string} .= ' ';
539              
540 15         45 $address = _assemble($address,$pre_cursor,$po_box_type,$level,$building);
541 15         46 _validate($address);
542              
543              
544 15         50 return($address,$address->{error});
545             }
546              
547             #------------------------------------------------------------------------------
548             # Apply correct capitalisation to each component of an address
549              
550             sub components
551             {
552 17     17 1 84 my $address = shift;
553 17         29 my ($uc_all) = @_;
554              
555 17         25 my %orig_components = %{ $address->{components} };
  17         158  
556              
557 17         44 my (%cased_components);
558 17         54 foreach my $curr_key ( keys %orig_components )
559             {
560 323         386 my ($cased_value, $curr_value);
561            
562 323 100       452 if ( $orig_components{$curr_key})
563             {
564 129         170 $curr_value = $orig_components{$curr_key};
565             }
566             else
567             {
568 194         259 $curr_value = '';
569             }
570            
571            
572 323 50       474 if ($uc_all)
573             {
574 0         0 $cased_components{$curr_key} = uc($curr_value);
575 0         0 next;
576             }
577            
578              
579 323 100       800 if ( $curr_key =~ /^(base_street_name|street_name|street_type|suburb|property_name|sub_property|pre_cursor|po_box_type|level|building)/ )
    100          
580             {
581              
582 187 100 100     445 if ( $curr_key eq 'street_name' and$curr_value =~ /^US HIGHWAY (.*)/ )
    100          
583             {
584 1         3 $cased_value = "US Highway $1";
585             }
586             elsif ( $curr_key eq 'sub_property_identifier' )
587             {
588             # UNIT, APT ... 12D etc
589              
590 17         40 my @words = split(/ /,$curr_value);
591 17         23 my @cased_words;
592             my $cased_string;
593 17         33 foreach my $word (@words)
594             {
595 5         7 my $cased_word;
596 5 50 66     39 if ( $word =~ /^\d{1,3}(ST|ND|RD|TH)$/)
    50          
597             {
598             # ordinal component, as in 3rd Floor
599 0         0 $cased_word = lc($word);
600             }
601             elsif ( length($word) > 1 and $word !~ /\d/ )
602             {
603             # only need to title case words such as UNIT
604 0         0 $cased_word = Lingua::EN::NameParse::case_surname($word);
605             }
606             else
607             {
608 5         11 $cased_word = $word;
609             }
610 5         12 push(@cased_words,$cased_word);
611              
612             }
613 17         35 $cased_value = join(' ',@cased_words);
614             }
615             else
616             {
617 169 100       241 if ($curr_value)
618             {
619             # Surnames can be used for street's or suburbs so this method
620             # will give correct capitalisation for most cases
621 71         147 $cased_value = Lingua::EN::NameParse::case_surname($curr_value);
622             }
623             else
624             {
625 98         127 $cased_value = '';
626             }
627             }
628             }
629             # retain street_direction,sub country and countries capitalisation, usually uppercase
630             elsif ($curr_key =~ /street_direction/)
631             {
632 34 100 66     154 if (length($curr_value) == 1 or length($curr_value) == 2)
    50          
633             {
634             # N, SE etc is capitalised
635 1         4 $cased_value =$curr_value;
636             }
637             elsif (length($curr_value) > 2)
638             {
639 0         0 $cased_value = Lingua::EN::NameParse::case_surname($curr_value);
640             }
641             else
642             {
643 33         47 $cased_value = '';
644             }
645             }
646             # retain sub country and countries capitalisation, as usually uppercase
647             else
648             {
649 102         143 $cased_value = uc($curr_value);
650             }
651 323         2348 $cased_components{$curr_key} = $cased_value;
652             }
653 17         299 return(%cased_components);
654             }
655             #------------------------------------------------------------------------------
656             # Apply correct capitalisation to an entire address
657              
658             sub case_all
659             {
660 0     0 1 0 my $address = shift;
661              
662 0         0 my @cased_address;
663              
664 0 0       0 unless ( $address->{properties}{type} eq 'unknown' )
665             {
666              
667             # Hash of of lists, indicating the order that address components are assembled in.
668             # Each list element is itself the name of the key value in an address object.
669              
670 0         0 my %component_order=
671             (
672             'rural' => [ qw/pre_cursor property_name suburb subcountry post_code country/],
673             'post_box'=> [ qw/pre_cursor post_box suburb po_box_type subcountry post_code country/ ],
674             'road_box'=> [ qw/pre_cursor road_box street_name street_type suburb subcountry post_code country/ ]
675              
676             );
677 0 0       0 if ( $address->{country} eq 'US' )
678             {
679 0         0 $component_order{'suburban'} = [ qw/pre_cursor property_identifier street_name street_type street_direction_suffix building level sub_property_type sub_property_identifier suburb subcountry post_code country/];
680             }
681             else
682             {
683 0         0 $component_order{'suburban'} = [ qw/pre_cursor building level sub_property_type sub_property_identifier property_identifier street_name street_type suburb subcountry post_code country/ ];
684             }
685              
686 0         0 my %component_vals = $address->components;
687 0         0 my @order = @{ $component_order{$address->{properties}{type} } };
  0         0  
688              
689 0         0 foreach my $component ( @order )
690             {
691             # As some components such as property name are optional, they will appear
692             # in the order array but may or may not have have a value, so check
693             # for undefined values
694 0 0       0 if ( $component_vals{$component} )
695             {
696 0         0 push(@cased_address,$component_vals{$component});
697             }
698             }
699             }
700              
701 0 0 0     0 if ( $address->{error} and $address->{force_case} )
702             {
703             # Despite errors, try to name case non-matching section. As the format
704             # of this section is unknown, surname case will provide the best
705             # approximation
706 0         0 push(@cased_address,&Lingua::EN::NameParse::case_surname($address->{properties}{non_matching}));
707             }
708              
709 0         0 return(join(' ',@cased_address));
710             }
711             #------------------------------------------------------------------------------
712             sub properties
713             {
714 2     2 1 11 my $address = shift;
715 2         2 return(%{ $address->{properties} });
  2         9  
716             }
717              
718             #------------------------------------------------------------------------------
719             # Create a text report to standard output listing
720             # - the input string,
721             # - the name of each defined component
722             # - any non matching component
723              
724             sub report
725             {
726 0     0 1 0 my $address = shift;
727              
728 0         0 my $report = '';
729              
730 0         0 _fmt_report_line(\$report,"Original Input",$address->{original_input});
731 0         0 _fmt_report_line(\$report,"Cleaned Input",$address->{input_string});
732 0         0 _fmt_report_line(\$report,"Country address format",$address->{country_code});
733              
734 0         0 my %props = $address->properties;
735 0 0       0 if ( $props{type} )
736             {
737 0         0 _fmt_report_line(\$report,"Address type",$props{type});
738             }
739              
740              
741 0         0 _fmt_report_line(\$report,"Non matching part",$props{non_matching});
742 0         0 _fmt_report_line(\$report,"Error",$address->{error});
743 0         0 _fmt_report_line(\$report,"Error descriptions",$address->{error_desc});
744 0         0 _fmt_report_line(\$report,"Warning",$address->{error});
745 0         0 _fmt_report_line(\$report,"Warning description",$address->{warning_desc});
746 0         0 _fmt_report_line(\$report,"Case all",$address->case_all);
747              
748              
749 0         0 _fmt_report_line(\$report,"COMPONENTS",'');
750 0         0 my %comps = $address->components;
751 0         0 foreach my $comp ( sort keys %comps)
752             {
753 0 0       0 if (defined($comps{$comp}) )
754             {
755 0         0 _fmt_report_line(\$report,$comp,$comps{$comp});
756             }
757             }
758              
759 0         0 return($report);
760             }
761              
762              
763             #------------------------------------------------------------------------------
764              
765             # PRIVATE METHODS
766              
767             #------------------------------------------------------------------------------
768              
769             sub _assemble
770             {
771              
772 15     15   30 my $address = shift;
773 15         34 my ($pre_cursor,$po_box_type,$level,$building) = @_;
774              
775             # Parse the address according to the rules defined in the AddressParse::Grammar module,
776             # $::RD_TRACE = 1; # for debugging RecDescent output
777             # Use Parse::RecDescent to do the parsing. 'full_address' is a label for the complete grammar tree
778 15         118 my $parsed_address = $address->{parse}->full_address($address->{input_string});
779              
780             # Place components into a separate hash, so they can be easily returned to the user to inspect and modify
781 15         228790 $address->{components} = ();
782              
783 15 100       40 if ($pre_cursor)
784             {
785 1         3 $address->{components}{'pre_cursor'} = $pre_cursor;
786             }
787             else
788             {
789 14         49 $address->{components}{'pre_cursor'} = '';
790             }
791            
792 15 100       39 if ($level)
793             {
794 1         3 $address->{components}{'level'} = $level;
795             }
796             else
797             {
798 14         30 $address->{components}{'level'} = '';
799             }
800            
801 15 100       29 if ($building)
802             {
803 1         2 $address->{components}{'building'} = $building;
804             }
805             else
806             {
807 14         28 $address->{components}{'building'} = '';
808             }
809            
810              
811 15 100       28 if ($po_box_type)
812             {
813 1         3 $address->{components}{'po_box_type'} = $po_box_type;
814             }
815             else
816             {
817 14         33 $address->{components}{'po_box_type'} = '';
818             }
819            
820              
821 15         27 $address->{components}{post_box} = '';
822 15 100       41 if ( $parsed_address->{post_box} )
823             {
824 1         3 $address->{components}{post_box} = $parsed_address->{post_box};
825             }
826              
827 15         31 $address->{components}{road_box} = '';
828 15 50       43 if ( $parsed_address->{road_box} )
829             {
830 0         0 $address->{components}{road_box} = $parsed_address->{road_box};
831             }
832              
833 15         33 $address->{components}{property_name} = '';
834 15 100       34 if ( $parsed_address->{property_name} )
835             {
836 1         4 $address->{components}{property_name} = $parsed_address->{property_name};
837             }
838              
839 15         44 $address->{components}{sub_property_identifier} = '';
840 15         28 $address->{components}{sub_property_type} = '';
841            
842 15 100       38 if ( $parsed_address->{sub_property} )
843             {
844 4 50       26 if ($parsed_address->{sub_property} =~ /^(#|[A-Z]{1,}) (.*)$/ )
    0          
845             {
846             # Such as Unit 24, # 4A etc
847 4         14 $address->{components}{sub_property_type} = $1;
848 4         11 $address->{components}{sub_property_identifier} = $2;
849             }
850             elsif ($parsed_address->{sub_property} =~ /^(\d\w\w) (.*)$/ )
851             {
852             # Such as 1st Floor
853 0         0 $address->{components}{sub_property_type} = $2;
854 0         0 $address->{components}{sub_property_identifier} = $1;
855             }
856             }
857              
858 15         32 $address->{components}{property_identifier} = '';
859 15 100       39 if ( $parsed_address->{property_identifier} )
860             {
861 13         28 $address->{components}{property_identifier} = $parsed_address->{property_identifier};
862             }
863            
864 15         56 ($address->{components}{street_direction_prefix},$address->{components}{base_street_name}) = _get_street_direction($parsed_address->{street_name});
865              
866 15         35 $address->{components}{street_name} = '';
867 15         28 $address->{components}{street_type} = '';
868 15 100       48 if ( $parsed_address->{street_name} )
869             {
870             # Streets such as 'The Corso' will parse as street_name = 'The' and street_type = 'Corso', so seperate out
871 13 50       38 if ( $parsed_address->{street_name} eq 'THE ' )
872             {
873 0         0 $address->{components}{street_name} = 'THE ' . $parsed_address->{street_type};
874             }
875             else
876             {
877 13         24 $address->{components}{street_name} = $parsed_address->{street_name};
878 13         29 $address->{components}{street_type} = $parsed_address->{street_type};
879             }
880             }
881              
882 15         32 $address->{components}{street_direction_suffix} = '';
883 15 100       35 if ( $parsed_address->{street_direction_suffix} )
884             {
885 1         6 $address->{components}{street_direction_suffix} = $parsed_address->{street_direction_suffix};
886             }
887              
888              
889 15         41 $address->{components}{suburb} = '';
890 15 50       32 if ( $parsed_address->{suburb} )
891             {
892 15         33 $address->{components}{suburb} = $parsed_address->{suburb};
893             }
894              
895 15         29 $address->{components}{subcountry} = '';
896 15 50       35 if ( $parsed_address->{subcountry} )
897             {
898 15         31 my $sub_country = $parsed_address->{subcountry};
899              
900             # Force sub country to abbreviated form, South Australia becomes SA, Michigan become MI etc
901 15 100       29 if ($address->{abbreviate_subcountry})
902             {
903 7         32 my $country = Locale::SubCountry->new($address->{country});
904 7         148 my $code = $country->code($sub_country);
905 7 100       578 if ( $code ne 'unknown' )
906             {
907 1         3 $address->{components}{subcountry} = $code;
908             }
909             # sub country already abbreviated
910             else
911             {
912 6         20 $address->{components}{subcountry} = $sub_country;
913             }
914             }
915             else
916             {
917 8         18 $address->{components}{subcountry} = $sub_country;
918             }
919             }
920              
921 15         32 $address->{components}{post_code} = '';
922 15 50       36 if ( $parsed_address->{post_code} )
923             {
924 15         33 $address->{components}{post_code} = $parsed_address->{post_code};
925             }
926              
927 15         28 $address->{components}{country} = '';
928 15 100       62 if ( $parsed_address->{country} )
929             {
930 1         2 $address->{components}{country} = $parsed_address->{country};
931             }
932              
933 15         62 $address->{properties} = ();
934              
935 15         33 $address->{properties}{non_matching} = '';
936 15 100       33 if ( $parsed_address->{non_matching} )
937             {
938 1         3 $address->{properties}{non_matching} = $parsed_address->{non_matching};
939             }
940 15         32 $address->{properties}{type} = $parsed_address->{type};
941            
942 15         41 _trim_trailing_space($address);
943              
944 15         60 return($address);
945             }
946              
947             #-------------------------------------------------------------------------------
948             #
949             sub _get_street_direction
950             {
951 15     15   29 my ($street_name) = @_;
952              
953 15         28 my $street_direction;
954             my $base_street_name;
955              
956 15 100       33 unless ($street_name)
957             {
958 2         5 return;
959             }
960              
961 13         66 my @words = split(/\s/,$street_name);
962 13 100       37 if (@words > 1)
963             {
964 4 50       15 if ( $words[0] =~ /^(N|NE|NW|E|S|SE|SW|W|NORTH|EAST|SOUTH|WEST|NTH|STH)$/ )
965             {
966 0         0 $street_direction = $1;
967 0         0 shift(@words);
968 0         0 $base_street_name = join(' ',@words);
969             }
970             else
971             {
972 4         7 $street_direction = '';
973 4         7 $base_street_name = $street_name;
974            
975             }
976             }
977             else
978             {
979 9         17 $base_street_name = $street_name;
980             }
981 13         52 return($street_direction,$base_street_name);
982              
983             }
984              
985             #------------------------------------------------------------------------------
986             # Check for several different types of syntax errors
987              
988             sub _validate
989             {
990 15     15   25 my $address = shift;
991 15         32 $address->{error} = 0;
992 15         28 $address->{error_desc} = '';
993 15         29 $address->{warning} = 0;
994 15         25 $address->{warning_desc} = '';
995              
996 15 100       39 if ( $address->{properties}{non_matching} )
997             {
998 1         2 $address->{error} = 1;
999 1         4 $address->{error_desc} = 'non matching section : ' . $address->{properties}{non_matching};
1000             }
1001             else
1002             {
1003 14 50       35 if ( $address->{properties}{type} eq 'unknown' )
1004             {
1005 0         0 $address->{error} = 1;
1006 0         0 $address->{error_desc} .= 'unknown address format';
1007             }
1008             else
1009             {
1010 14 50 33     117 if ($address->{force_post_code} and not $address->{components}{post_code})
1011             {
1012 0         0 $address->{error} = 1;
1013 0         0 $address->{error_desc} .= ':no post code';
1014             }
1015              
1016             # illegal characters found, note a '#' can appear as an abbreviation for number in USA addresses
1017 14 50       51 if ( $address->{input_string} =~ /[^"A-Z0-9'\-\.,&#\/ ]/ )
1018             {
1019             # Note, if auto_clean is on, illegal characters will have been removed
1020             # for second parsing and no error flag or message reported
1021 0         0 $address->{error} = 1;
1022 0         0 $address->{error_desc} .= ':illegal chars';
1023             }
1024 14 100       40 if ( $address->{properties}{type} eq 'suburban' )
1025             {
1026 12         28 my $street = $address->{components}{street_name};
1027 12 100       37 if ($street !~ /\d/ )
1028             {
1029             # Not an ordinal or single letter street type
1030 11 100       32 if ( _check_vowel($address->{components}{base_street_name}) )
1031             {
1032             # street name must have a vowel sound,
1033 1         3 $address->{warning} = 1;
1034 1         5 $address->{warning_desc} .= ";no vowel sound in street word : $address->{components}{base_street_name}";
1035             }
1036             }
1037             }
1038              
1039 14 50       32 if ( _check_vowel($address->{components}{suburb}) )
1040             {
1041 0         0 $address->{warning} = 1;
1042 0         0 $address->{warning_desc} .= ";no vowel sound in suburb word : $address->{components}{suburb}";
1043             }
1044             }
1045             }
1046             }
1047             #-------------------------------------------------------------------------------
1048             # Purge the input string of illegal or redundant characters.
1049             # Correct malformed patterns
1050              
1051             sub _clean
1052             {
1053 12     12   20 my $address = shift;
1054              
1055 12         26 my ($input) = $address->{input_string};
1056              
1057             # Remove annotations enclosed in brackets, such as 1 Smith St (Cnr Brown St)
1058 12         21 $input =~ s|\(.*\)||;
1059            
1060             # Normalise half house numbers, such as 12.5 to 12 1/2. This is needed now before full stops are stripped out
1061 12         24 $input =~ s|^(\d{1,4})\.5 |$1 1/2 |;
1062              
1063             # strip full stops, remove illegal characters
1064             # & can be part of property name
1065             # hash (#) may denote number for USA address
1066             # quotes can occur as property name delimiters
1067              
1068 12         35 $input =~ s|[^A-Za-z0-9&#/'" -]||go;
1069              
1070             # remove repeating, leading and trailing spaces
1071 12         20 $input =~ s| +| |go ;
1072 12         21 $input =~ s|^ ||;
1073 12         25 $input =~ s| $||;
1074              
1075              
1076             # Expand abbreviations that are too short
1077              
1078 12         20 $input =~ s/LAKE ST (GEORGE|CLAIR)/LAKE SAINT $1/; # otherwise St gets consumed to early as 'Street'
1079 12         20 $input =~ s| CSEWY | CAUSEWAY |;
1080              
1081             # Standardise abbreviations
1082              
1083 12         31 $input =~ s|STR |ST |;
1084 12         21 $input =~ s|TERR |TERRACE |;
1085            
1086 12         20 $input =~ s|^FCTR?Y |FACTORY |;
1087 12         19 $input =~ s|^FACT?R?Y? |FACTORY |;
1088              
1089 12         25 $input =~ s|LVL |LEVEL |; # sub property identifiers
1090 12         25 $input =~ s|^UN? |UNIT |;
1091 12         17 $input =~ s|^U(\d+)|UNIT $1|;
1092              
1093             # Fix badly formed number dividers such as home unit format of 14/ 12 becomes 14/12, 2- 7A becomes 2-7A
1094 12         21 $input =~ s|/ |/|;
1095 12         17 $input =~ s| /|/|;
1096 12         21 $input =~ s|- |-|;
1097 12         16 $input =~ s| -|-|;
1098              
1099             # Remove redundant spaces in property identifiers, 21 B Smith St becomes 21B Smith St
1100              
1101 12 50       44 if ( $input !~ /^\d+ [A-Z] (ST|AVE)/ )
1102             {
1103             # Don't remove space before single letter streets such as 21 B Street
1104 12 50       37 if ( $address->{country_code} eq 'US' )
1105             {
1106             # Note cannot use N,E,S,W as they can be street direction prefix, as in 1 E MAIN STREET
1107             # Assume that the direction prefix is the more likely case
1108 0         0 $input =~ s|^(\d+) ([A-DF-MO-RT-VX-Z] )|$1$2|;
1109             }
1110             else
1111             {
1112 12         30 $input =~ s|^(\d+) ([A-Z] )|$1$2|;
1113             }
1114             }
1115              
1116             # Normalise sub property identifiers
1117 12 50       30 if ( $address->{country_code} eq 'US' )
1118             {
1119             # Fix US sub property identifiers that appear after street name and type
1120             # add space between # or 'Apt' and the number so #2 becomes '# 2'
1121 0         0 $input =~ s| #(\d)| # $1|;
1122 0         0 $input =~ s| #([A-Z])| # $1|;
1123 0         0 $input =~ s| (APT)(\d)| $1 $2|i;
1124            
1125 0         0 $input =~ s| (APT)-(\w)| $1 $2|i;
1126              
1127             # remove redundant space so # 34 B becomes # 34B
1128 0         0 $input =~ s| # (\d+) (\w) | # $1$2 |;
1129              
1130             # remove redundant '#'
1131 0         0 $input =~ s/ (APT|SUITE|UNIT) #/ $1 /;
1132             # still to test
1133             }
1134             else
1135             {
1136             # Add a space to separate sub property type from number, UNIT2 becomes UNIT 2
1137 12         27 $input =~ s/^(UNIT|LOT|APT|SHOP)(\d)/$1 $2/;
1138             }
1139              
1140             # Remove redundant slash or dash
1141             # Unit 1B/22, becomes Unit 1B 22, Flat 2-12 becomes Flat 2 12
1142 12         25 $input =~ s/^([A-Z]{2,}) (\d+[A-Z]?)[\/-]/$1 $2 /;
1143             # Unit J1/ 39 becomes Unit J1 39
1144 12         25 $input =~ s/^([A-Z]{2,}) ([A-Z]\d{0,3})[\/-]/$1 $2 /;
1145              
1146              
1147             # remove dash that does not from a sequence, such as D-5 or 22-A
1148 12         24 $input =~ s|([A-Z])-(\d)|$1$2|;
1149 12         22 $input =~ s|(\d)-([A-Z])|$1$2|;
1150            
1151              
1152 12         30 return($input);
1153             }
1154             #-------------------------------------------------------------------------------
1155             # Remove any "care of" type of precursor from the main address
1156             # such as: C/O BRAKEFIELD BETTY S PO BOX 214 GULF HAMMOCK, FL 32639-0214
1157             # It will later be saved as an attribute in the address object
1158              
1159             sub _extract_precursor
1160             {
1161 15     15   35 my ($input) = @_;
1162 15         28 my ($pre_cursor,$address_start,$address_end);
1163              
1164 15 100       80 if ($input =~ m{^(C/O.*?|ATTN.*?) (\d+|PO BOX)( .*)})
1165             {
1166 1         4 $pre_cursor = $1;
1167 1         3 $address_start = $2;
1168 1         4 $address_end = $3;
1169 1         14 return($pre_cursor, $address_start . $address_end);
1170             }
1171             else
1172             {
1173 14         51 return('',$input)
1174             }
1175             }
1176             #-------------------------------------------------------------------------------
1177             # Remove any level or floor info such as:
1178             # 12 Smith St Floor 2
1179             # Level 22 Suite 3 12 Main St
1180             # It will later be saved as an attribute in the address object
1181              
1182             sub _extract_level
1183             {
1184 15     15   31 my ($input) = @_;
1185 15         24 my ($level);
1186            
1187 15 100 33     167 if
      66        
      66        
1188             (
1189             # Level info could be at start of string so first space is optional
1190             # TO DO, add lower?, mezz, BASEMENT etc?
1191             $input =~ / ?((GROUND|FIRST|SECOND|THIRD|FOURTH|FIFTH|SIXTH) (FLOOR|FLR|FL) )/ or
1192             $input =~ / ?(\d{1,2}(ST|ND|RD|TH) (FLOOR|FLR|FL) )/ or
1193             $input =~ / ?(LEVEL (\d{1,2}|[GM])[\/ -])/ or
1194             $input =~ / ?((FLOOR|FLR|FL) \d{1,2}[\/ -])/
1195             )
1196             {
1197 1         4 $level = $1;
1198 1         3 $level =~ s|/||;
1199 1         2 $level =~ s|-||;
1200 1         12 $input =~ s/$level//;
1201             }
1202            
1203 15         49 return($level,$input);
1204             }
1205             #-------------------------------------------------------------------------------
1206             # Remove any building info such as:
1207             # Building 2 Level 12 123 Smith St
1208             # 12 Main St Tower A Level 2
1209             # It will later be saved as an attribute in the address object
1210              
1211             sub _extract_building
1212             {
1213 15     15   35 my ($input) = @_;
1214 15         21 my ($building);
1215            
1216 15         58 my $building_label = qr{BLOCK|BLDG?|BUILDING|TOWER};
1217 15         60 my $id = qr{[A-Z]|A[A-Z]|\d+|\d{1,3}[A-Z]|[A-Z]\d{1,3}}; # AA or 12 or 32C or C12
1218            
1219 15 100 66     288 if
1220             (
1221             $input =~ /^($building_label $id) / or $input =~ / ($building_label $id) /
1222             )
1223             {
1224 1         3 $building = $1;
1225 1         4 $building =~ s|/||;
1226 1         2 $building =~ s|-||;
1227 1         7 $input =~ s/$building//;
1228             }
1229             # TO DO, North, East etc Building?
1230            
1231            
1232 15         79 my $name = qr{[A-Z]+}; #
1233 15         25 my $house;
1234            
1235             # Allow for x house at start of text only. Not for example: 12 Gate House Road, suburb
1236 15 50       98 if ( $input =~ /^(($name )?$name HOUSE) / )
1237             {
1238 0         0 $house = $1;
1239 0         0 $input =~ s/$house//;
1240 0         0 $building .= $house;
1241             }
1242            
1243 15         81 return($building,$input);
1244             }
1245              
1246             #-------------------------------------------------------------------------------
1247             # Remove any description that follows the suburb after the main address
1248             # such as: PO BOX 1305 BIBRA LAKE PRIVATE BOXES WA 6965"
1249             # It will be saved as an address attribute
1250              
1251             sub _extract_po_box_type
1252             {
1253 15     15   36 my ($input) = @_;
1254 15         29 my ($po_box_type,$address_start,$address_end);
1255              
1256 15 100       52 if ($input =~ /^(.*?) (PRIVATE BOXES)( .*)$/ )
1257             {
1258 1         3 $address_start = $1;
1259 1         4 $po_box_type = $2;
1260 1         2 $address_end = $3;
1261 1         5 return($po_box_type, $address_start . $address_end);
1262             }
1263             else
1264             {
1265 14         52 return('',$input)
1266             }
1267             }
1268             #------------------------------------------------------------------------------
1269             # For correct matching, the grammar of each component must include the
1270             # trailing space that separates it from any following word. This should
1271             # now be removed from each component
1272              
1273             sub _trim_trailing_space
1274             {
1275 15     15   28 my ($address) = @_;
1276            
1277 15         24 foreach my $key (keys %{ $address->{components} } )
  15         77  
1278             {
1279 285 100       480 if ($address->{components}{$key} )
1280             {
1281 110         279 $address->{components}{$key} =~ s/ $//g;
1282             }
1283             }
1284             }
1285             #------------------------------------------------------------------------------
1286              
1287             sub _fmt_report_line
1288             {
1289 0     0   0 my ($report_ref,$label,$value) = @_;
1290 0         0 $$report_ref .= sprintf("%-23.23s '%s'\n",$label,$value);
1291             }
1292             #------------------------------------------------------------------------------
1293              
1294             sub _check_vowel
1295             {
1296 25     25   51 my ($str) = @_;
1297              
1298 25         65 my @words = split(/ /,$str);
1299 25         44 foreach my $word (@words)
1300             {
1301             # Saint, Mount, Junior, Senior (as in Dr Martin Luther King Snr)
1302 34 100 66     186 if ( length($word) > 1 and $word !~ /[AEIOUY]|ST|MT|DR|JN?R|SN?R/ )
1303             {
1304 1         4 return(1);
1305             }
1306             }
1307 24         73 return(0);
1308             }
1309             #------------------------------------------------------------------------------
1310              
1311             return(1);