File Coverage

blib/lib/Lingua/EN/AddressParse.pm

Criterion	Covered	Total	%
statement	258	326	79.1
branch	88	122	72.1
condition	21	36	58.3
subroutine	20	23	86.9
pod	6	6	100.0
total	393	513	76.6

line	stmt	bran	cond	sub	pod	time	code
1							=head1 NAME
2
3							Lingua::EN::AddressParse - extract components of a street address from free format text
4
5							=head1 SYNOPSIS
6
7							use Lingua::EN::AddressParse;
8
9							my %args =
10							(
11							country => 'US',
12							auto_clean => 1,
13							force_case => 1,
14							abbreviate_subcountry => 0,
15							abbreviated_subcountry_only => 0,
16							force_post_code => 0
17							);
18
19							my $address = Lingua::EN::AddressParse->new(%args);
20							$error = $address->parse("40 1/2 N OLD MASSACHUSETTS AVE APT 3B Washington Valley Washington 98100: HOLD MAIL");
21
22							print $address->report;
23
24							Country address format 'US'
25							Address type 'suburban'
26							Non matching part 'HOLD MAIL '
27							Error '1'
28							Error descriptions 'non matching section : HOLD MAIL '
29							Warning '1'
30							Warning description ''
31							Case all '40 1/2 N Old Massachusetts Ave Apt 3B Washington Valley WA 98100'
32							COMPONENTS ''
33							base_street_name 'Old Massachusetts'
34							post_code '98100'
35							property_identifier '40 1/2'
36							street_direction_prefix 'N'
37							street_name 'N Old Massachusetts'
38							street_type 'Ave'
39							sub_property_identifier '3B'
40							sub_property_type 'Apt'
41							subcountry 'WASHINGTON'
42							suburb 'Washington Valley'
43
44							%address_components = $address->components;
45							print $address_components{sub_property_type}; # APT
46							print $address_components{sub_property_identifier}; # 3B
47							print $address_components{property_identifier}; # 40 1/2
48
49							%address_properties = $address->properties;
50							print $address_properties{type}; # suburban
51							print $address_properties{non_matching}; # : HOLD MAIL
52
53							$correct_casing = $address->case_all;
54
55
56							=head1 DESCRIPTION
57
58							This module takes as input a suburban, rural or postal address in free format
59							text such as,
60
61							3080 28TH AVE N ST PETERSBURG, FL 33713-3810
62							12 1st Avenue N Suite # 2 Somewhere CA 12345 USA
63							C/O JOHN, KENNETH JR POA 744 WIND RIVER DR SYLVANIA, OH 43560-4317
64
65							9 Church Street, Abertillery, Mid Glamorgan NP13 1DA
66							27 Bury Street, Abingdon, Oxfordshire OX14 3QT
67
68							2A O'CONNELL ST KEW NSW 2123
69							12/3-5 AUBREY ST MOUNT VICTORIA VICTORIA 3133
70							"OLD REGRET" WENTWORTH FALLS NSW 2782 AUSTRALIA
71							GPO Box K318, HAYMARKET, NSW 2000
72
73
74							and attempts to parse it. If successful, the address is broken
75							down into it's components and useful functions can be performed such as :
76
77							converting upper or lower case values to title case (2A O'Connell St Kew NSW 2123)
78							extracting the addresses individual components (2A,O'Connell,St,KEW,NSW,2123)
79							determining the type of format the address is in ('suburban')
80
81
82							If the address cannot be parsed you have the option of cleaning the address
83							of bad characters, or extracting any portion that was parsed and the portion
84							that failed.
85
86							This module can be used for analysing and improving the quality of
87							lists of residential and postal addresses.
88
89							By using a large combination of regular expressiosn with look ahead analysis, patterns
90							can be parsed that confuse many other parsers. Examples are
91
92							Street names with several street types: Lane Cove Road
93							Suburbs which include street types: Smith Road St Marys
94							Suburbs that include state names: Fort Washington Washington
95
96							=head1 DEFINITIONS
97
98							The following terms are used by AddressParse to define the components that
99							can make up an address.
100
101							Pre cursor : C/O MR A Smith...
102							Sub property identifier : Level 1A Unit 2, Apartment B, Lot 12, Suite # 12 ...
103							Property Identifier : 12/66A, 24-34, 2A, 23B/12C, 12/42-44, 2.5
104							Property name : "Old Regret"
105							Post Box : GP0 Box K123, LPO 2345, RMS 23 ...
106							Road Box : RMB 24A, RMS 234 ...
107							Street Direction: North, SE, Sth. etc
108							Street name : O'Hare, New South Head, The Causeway, Broadway
109							Street type : Road, Rd., St, Lane, Highway, Crescent, Circuit ...
110							Suburb : Dee Why, St. John's Wood ...
111							Sub country : NSW, New South Wales, ACT, NY, New Jersey AZ ...
112							Post (zip) code : 2062, 34532-1234, SG12A 9ET
113							Country : Australia, UK, US or Canada
114
115
116
117							The main address formats currently supported are as follows. (a ? means the component is optional):
118
119							'suburban' : sub_property(?) property_identifier(?) street street_type suburb subcountry post_code(?)country(?)
120
121							OR for the USA
122							'suburban' : property_identifier(?) street street_type sub_property(?) suburb subcountry post_code(?) country(?)
123
124							'rural' : property_name suburb subcountry post_code(?) country(?)
125							'post_box' : post_box suburb subcountry post_code(?) country(?)
126							'road_box' : road_box street street_type suburb subcountry post_code(?) country(?)
127							'road_box' : road_box suburb subcountry post_code(?) country(?)
128
129							Note that suburb and subcountry are not optional. The accuracy of the parser is
130							improved by providing as much context as possible. Proding a suburb can ehlp to
131							identify street names that would itherwise be ambigious.
132
133							For the case where you only have a street address, dummy (but still valid) values can be used
134							for suburb (such as 'Somewhere') and sub country (such as 'NY'). These dummy values will
135							be parsed but can be ignored.
136
137							All formats may contain a precursor
138
139							Refer to the component grammar defined in the Lingua::EN::AddressParse::Grammar
140							module for a complete list of combinations.
141
142
143							=head1 METHODS
144
145							=head2 new
146
147							The C method creates an instance of an address object and sets up
148							the grammar used to parse addresses. This must be called before any of the
149							following methods are invoked. Note that the object only needs to be
150							created once, and can be reused with new input data.
151
152							Various setup options may be defined in a hash that is passed as an
153							optional argument to the C method.
154
155							my %args =
156							(
157							country => 'US',
158							auto_clean => 1,
159							force_case => 1,
160							abbreviate_subcountry => 1,
161							abbreviated_subcountry_only => 1,
162							force_post_code => 1
163							);
164
165							my $address = Lingua::EN::AddressParse->new(%args);
166
167							=over 4
168
169							=item country
170
171							The country argument must be specified. It determines the possible list of
172							valid sub countries (states, counties etc, defined in the Locale::SubCountry
173							module) and post code formats. Either the full name or abbreviation may be
174							specified. The currently supported country names and codes are:
175
176							AU or Australia
177							CA or Canada
178							GB or United Kingdom
179							US or United States
180
181							All forms of upper/lower case are acceptable in the country's spelling. If a
182							country name is supplied that the module doesn't recognise, it will die.
183
184							=item force_case (optional)
185
186							This option only applies to the C method, see below.
187
188							=item auto_clean (optional)
189
190							When this option is set to a positive value, the input string is
191							'cleaned' to try and normalise bad patterns. The type of cleaning
192							includes
193
194							remove non alphanumeric characters
195							remove full stops
196							remove redundant white space
197							add missing space separators
198							expand abbreviations to more common forms
199							remove bracketed annotations
200							fix badly formed sub property identifiers
201
202							=item abbreviate_subcountry (optional)
203
204							When this option is set to a positive value, the sub country is forced to it's
205							abbreviated form, so "New South Wales" becomes "NSW". If the sub country is
206							already abbreviated then it's value is not altered.
207
208							=item abbreviated_subcountry_only (optional)
209
210							When this option is set to a positive value, only the abbreviated form
211							of sub country is allowed, such as "NSW" and not "New South Wales". This
212							will make parsing quicker and ensure that addresses comply with postal
213							standards that normally permit only abbreviated sub countries.
214
215							It also avoids matching a sub_country name too early, as in the case of 'Port Washington New Jersey'
216							Normally, 'Washington would be consumed as the sub country, but by first converting
217							the address to 'Port Washington NJ' we avoid this problem
218
219
220							=item force_post_code (optional)
221
222							When this option is set to a positive value, the address must contain
223							a post code. If it does not then an error flag is raised. If this option
224							is set to 0 than a post code is optional.
225
226							By default for this option is true.
227
228							=back
229
230							=head2 parse
231
232							$error = $address->parse("12/3-5 AUBREY ST VERMONT VIC 3133");
233
234							The C method takes a single parameter of a text string containing a
235							address. It attempts to parse the address and break it down into the components
236							described below. If the address is parsed successfully, a 0 is returned,
237							otherwise a 1.
238
239							Note that you can successfully parse all the components of an address and still
240							have an error returned. This occurs when you have non matching data following
241							a valid address. To check if the data is unusable, you also need to use the
242							C method to check the address type is 'unknown'
243
244							This method is a prerequisite for all the following methods.
245
246							=head2 components
247
248							%address = $address->components($upper_case_all);
249							$suburb = $address{suburb};
250
251							If the optional argument $upper_case_all is set to a postive value, all components
252							are converted to upper case.
253
254
255							The C method returns all the address components in a hash. The
256							following keys are used for each component:
257
258
259							pre_cursor - such as 'C/O Mr A Smith'
260							po_box_type - such as 'Private Boxes'
261							post_box
262							road_box
263							sub_property_type
264							sub_property_identifier
265							property_identifier
266							property_name
267							level - such as 12th Floor
268							building - such as Tower A
269							street_direction_prefix (such as East, NW, North etc)
270							base_street_name (the name with direction removed, such as "Main" in "East Main St")
271							street_name (the full street name such as "East Main")
272							street_type
273							street_direction_suffix (US only, abbreviated only such as N, SE etc)
274							suburb
275							subcountry
276							post_code
277							country
278
279							If a component has no matching data for a given address, it's values will be
280							set to the empty string.
281
282							Each component is converted to title case, meaning the first letter of each
283							component is set to capitals and the remainder to lower case.
284
285							Proper name capitalisations such as MacNay and O'Brien are observed
286
287							The following components are not converted to title case:
288
289							post_box
290							road_box
291							subcountry
292							post_code
293							country
294							street_direction_suffix
295
296							If your input data is all upper case and you want to retian that format for parsed
297							data, you will need to apply the 'uc' function to each component.
298
299							=head2 case_all
300
301							$correct_casing = $address->case_all;
302
303							The C method does the same thing as the C method except
304							the entire address is returned as a title cased text string.
305
306							If the force_case option was set in the C method above, address case the
307							entire input string, including any unmatched sections after a recognisable address
308							that failed parsing. This option is useful when you know you have invalid data,
309							but you still want to title case what you have.
310
311							=head2 properties
312
313							The C method returns several properties of the address as a hash.
314							The following keys are used for each property -
315
316							type - either suburban ,rural,post_box,road_box,unknown
317							non_matching - any trailing string not part the address
318
319
320							Additional properties can be accessed with the following
321
322							$address->{original_input}
323							$address->{input_string} - string after auto_clean option has been applied
324							$address->{country_code} - abbreviated Country address format (as defined in the C method)
325							$address->{error} - error flag, 0 = good, 1 = error
326							$address->{error_desc} - text to describe the type of parsing error
327							$address->{warning} - warning flag, 0 = good, 1 = warning
328							$address->{warning_desc} - text to to describe the type of parsing warning(s)
329
330							Warnings mean that the address has parsed but there may still be errors within it's components
331
332
333							=head2 report
334
335							Create a formatted text report
336
337							the input string
338							the cleaned input string
339							the country type
340							the address type
341							any non matching part of input string
342							if any parsing errors occurred
343							error description
344							if any parsing warning occurred
345							warning description
346
347							the name and value of each defined component
348
349
350							Returns a string containing a multi line formatted text report
351
352							=head1 DEPENDENCIES
353
354							L, L, L
355
356							=head1 BUGS
357
358							=head1 LIMITATIONS
359
360							Streets such as 'The Esplanade' will return a street of 'The Esplanade' and a
361							street type of null string.
362
363							The abbreviation 'St' can be interpreted as either street or Saint. This leads to
364							ambiguities such as '12 East St Thomas Lane'. This could be 'East Street', suburb of
365							'Thomas Lane' or 'East St Thomas Lane'. And the first pattern is the more common,
366							that is what will match.
367
368							For US addresses, an ambiguity arises between a street directional suffix and
369							a suburb directional prefix, such as '12 Main St S Springfield CA 92345'. Is it South
370							Main St, or South Springfield? The parser assumes that 'S' belongs to the street
371							description.
372
373							The huge number of character combinations that can form a valid address makes
374							it is impossible to correctly identify them all.
375
376							Valid addresses must contain:
377
378							property address, suburb, subcountry (aka state) in that order.
379
380							This format is widely accepted in Australia and the US.
381
382							UK addresses will often include suburb, town, city and county, formats that
383							are very difficult to parse.
384
385							Property names must be enclosed in single or double quotes like "Old Regret"
386
387							Because of the large combination of possible addresses defined in the grammar,
388							the program is not very fast.
389
390
391							=head1 REFERENCES
392
393							"The Wordsworth Dictionary of Abbreviations & Acronyms" (1997)
394
395							Australian Standard AS4212-1994 "Geographic Information Systems -
396							Data Dictionary for transfer of street addressing information"
397
398							ISO 3166-2:1998, Codes for the representation of names of countries
399							and their subdivisions. Also released as AS/NZS 2632.2:1999
400
401
402							=head1 SEE ALSO
403
404							AddressParse is designed to identify properties, which have a unique physical
405							location. L will also parse addresses for the USA, and can handle
406							locations defined by street intersections, such as: "Hollywood & Vine, Los Angeles, CA"
407							"Mission Street at Valencia Street, San Francisco, CA"
408
409
410							L
411							L
412							L
413							L
414
415							See L
416							for a list of different addressing formats from around the world. And also
417							L
418
419							=head1 REPOSITORY
420
421							L
422
423							=head1 TO DO
424
425							Define grammar for other languages. Hopefully, all that would be needed is
426							to specify a new module with its own grammar, and inherit all the existing
427							methods. I don't have the knowledge of the naming conventions for non-english
428							languages.
429
430							=head1 AUTHOR
431
432							AddressParse was written by Kim Ryan
433
434							=head1 COPYRIGHT AND LICENSE
435
436							Copyright (c) 2015 Kim Ryan. All rights reserved.
437
438							This library is free software; you can redistribute it and/or modify
439							it under the same terms as Perl itself.
440
441
442							=cut
443
444							#------------------------------------------------------------------------------
445
446							package Lingua::EN::AddressParse;
447
448	1			1		573	use strict;
	1					1
	1					23
449	1			1		3	use Carp;
	1					1
	1					57
450	1			1		3	use warnings;
	1					3
	1					18
451	1			1		404	use Lingua::EN::AddressParse::Grammar;
	1					2
	1					37
452	1			1		748	use Lingua::EN::NameParse;
	1					34374
	1					52
453	1			1		7	use Parse::RecDescent;
	1					1
	1					4
454
455							our $VERSION = '1.26';
456
457							#------------------------------------------------------------------------------
458							# Create a new instance of an address parsing object. This step is time
459							# consuming and should normally only be called once in your program.
460
461							sub new
462							{
463	4			4	1	940	my $class = shift;
464	4					11	my %args = @_;
465
466
467	4	50	33			51	unless (defined $args{country} and $args{country} =~
468							/^(AU\|Australia\|GB\|United Kingdom\|US\|United States\|CA\|Canada)$/ )
469							{
470	0					0	croak "Cannot start parser. You must specify a value for the country in the options hash.\nValid options are AUS,GB,US or CA.\n";
471							}
472
473
474	4					5	my $address = {};
475	4					10	bless($address,$class);
476
477							# option defaults
478	4					13	$address->{'force_post_code'} = 1;
479
480							# Add error checking for invalid keys?
481	4					13	foreach my $curr_key (keys %args)
482							{
483	8					14	$address->{$curr_key} = $args{$curr_key};
484							}
485
486							# create the grammar tree (this is country dependent)
487	4					19	my $grammar = Lingua::EN::AddressParse::Grammar::_create($address);
488
489	4					24	$address->{parse} = Parse::RecDescent->new($grammar);
490
491	4					2508788	return ($address);
492							}
493							#------------------------------------------------------------------------------
494							sub parse
495							{
496	15			15	1	7725	my $address = shift;
497	15					41	my ($input_string) = @_;
498
499							# Save original data so we can check effect of auto cleaning
500	15					44	$address->{original_input} = $input_string;
501
502							# Convert to all upper case. This will allow for faster regexp matching in
503							# the grammar tree
504	15					57	$address->{input_string} = uc($input_string);
505
506	15					44	chomp($address->{input_string});
507
508	15					25	my $pre_cursor;
509	15					61	($pre_cursor,$address->{input_string}) = _extract_precursor($address->{input_string});
510
511							# Replace commas (which can be used to chunk sections of addresses) with spaces
512	15					63	$address->{input_string} =~ s/,/ /g;
513
514	15	100				47	if ( $address->{auto_clean} )
515							{
516	12					34	$address->{input_string} = _clean($address);
517							}
518
519	15					25	my $po_box_type;
520	15					53	($po_box_type,$address->{input_string}) = _extract_po_box_type($address->{input_string});
521
522	15					27	my $level;
523	15					49	($level,$address->{input_string}) = _extract_level($address->{input_string});
524
525	15					20	my $building;
526	15					54	($building,$address->{input_string}) = _extract_building($address->{input_string});
527
528
529
530							# Normalise sub property ID, 4/22-24 => UNIT 4 22-24, 4 12 => Unit 4 12
531	15	100	100			142	if ($address->{country_code} ne 'US' and $address->{input_string} =~ /^(\d{1,4}[A-Z]{0,2})[\/\| ](\d+[ \w-].*)$/ )
532							{
533	2					12	$address->{input_string} = "UNIT $1 $2";
534							}
535
536							# We need to add a trailing space to the input string. This is because the grammar
537							# tree expects a terminator (the space) fro every production, optionally followed
538							# by other productions or any final non matching text.
539							# This space will be removed in the _assemble function
540	15					34	$address->{input_string} .= ' ';
541
542	15					49	$address = _assemble($address,$pre_cursor,$po_box_type,$level,$building);
543	15					62	_validate($address);
544
545
546	15					64	return($address,$address->{error});
547							}
548
549							#------------------------------------------------------------------------------
550							# Apply correct capitalisation to each component of an address
551
552							sub components
553							{
554	17			17	1	105	my $address = shift;
555	17					20	my ($uc_all) = @_;
556
557	17					23	my %orig_components = %{ $address->{components} };
	17					163
558
559	17					36	my (%cased_components);
560	17					55	foreach my $curr_key ( keys %orig_components )
561							{
562	323					186	my $cased_value;
563	323					267	my $curr_value = $orig_components{$curr_key};
564
565	323	50				386	if ($uc_all)
566							{
567	0					0	$cased_components{$curr_key} = uc($curr_value);
568	0					0	next;
569							}
570
571
572	323	100				658	if ( $curr_key =~ /^(base_street_name\|street_name\|street_type\|suburb\|property_name\|sub_property\|pre_cursor\|po_box_type\|level\|building)/ )
		100
573							{
574
575	187	100	100			444	if ( $curr_key eq 'street_name' and$curr_value =~ /^US HIGHWAY (.*)/ )
		100
576							{
577	1					4	$cased_value = "US Highway $1";
578							}
579							elsif ( $curr_key eq 'sub_property_identifier' )
580							{
581							# UNIT, APT ... 12D etc
582
583	17					39	my @words = split(/ /,$curr_value);
584	17					18	my @cased_words;
585							my $cased_string;
586	17					32	foreach my $word (@words)
587							{
588	5					6	my $cased_word;
589	5	50	66			36	if ( $word =~ /^\d{1,3}(ST\|ND\|RD\|TH)$/)
		50
590							{
591							# ordinal component, as in 3rd Floor
592	0					0	$cased_word = lc($word);
593							}
594							elsif ( length($word) > 1 and $word !~ /\d/ )
595							{
596							# only need to title case words such as UNIT
597	0					0	$cased_word = Lingua::EN::NameParse::case_surname($word);
598							}
599							else
600							{
601	5					9	$cased_word = $word;
602							}
603	5					8	push(@cased_words,$cased_word);
604
605							}
606	17					41	$cased_value = join(' ',@cased_words);
607							}
608							else
609							{
610	169	100				162	if ($curr_value)
611							{
612							# Surnames can be used for street's or suburbs so this method
613							# will give correct capitalisation for most cases
614	71					133	$cased_value = Lingua::EN::NameParse::case_surname($curr_value);
615							}
616							else
617							{
618	98					93	$cased_value = '';
619							}
620							}
621							}
622							# retain street_direction,sub country and countries capitalisation, usually uppercase
623							elsif ($curr_key =~ /street_direction/)
624							{
625	34	100	66			147	if (length($curr_value) == 1 or length($curr_value) == 2)
		50
626							{
627							# N, SE etc is capitalised
628	1					3	$cased_value =$curr_value;
629							}
630							elsif (length($curr_value) > 2)
631							{
632	0					0	$cased_value = Lingua::EN::NameParse::case_surname($curr_value);
633							}
634							else
635							{
636	33					35	$cased_value = '';
637							}
638							}
639							# retain sub country and countries capitalisation, as usually uppercase
640							else
641							{
642	102					107	$cased_value = uc($curr_value);
643							}
644	323					1720	$cased_components{$curr_key} = $cased_value;
645							}
646	17					402	return(%cased_components);
647							}
648							#------------------------------------------------------------------------------
649							# Apply correct capitalisation to an entire address
650
651							sub case_all
652							{
653	0			0	1	0	my $address = shift;
654
655	0					0	my @cased_address;
656
657	0	0				0	unless ( $address->{properties}{type} eq 'unknown' )
658							{
659
660							# Hash of of lists, indicating the order that address components are assembled in.
661							# Each list element is itself the name of the key value in an address object.
662
663	0					0	my %component_order=
664							(
665							'rural' => [ qw/pre_cursor property_name suburb subcountry post_code country/],
666							'post_box'=> [ qw/pre_cursor post_box suburb po_box_type subcountry post_code country/ ],
667							'road_box'=> [ qw/pre_cursor road_box street_name street_type suburb subcountry post_code country/ ]
668
669							);
670	0	0				0	if ( $address->{country} eq 'US' )
671							{
672	0					0	$component_order{'suburban'} = [ qw/pre_cursor property_identifier street_name street_type street_direction_suffix building level sub_property_type sub_property_identifier suburb subcountry post_code country/];
673							}
674							else
675							{
676	0					0	$component_order{'suburban'} = [ qw/pre_cursor building level sub_property_type sub_property_identifier property_identifier street_name street_type suburb subcountry post_code country/ ];
677							}
678
679	0					0	my %component_vals = $address->components;
680	0					0	my @order = @{ $component_order{$address->{properties}{type} } };
	0					0
681
682	0					0	foreach my $component ( @order )
683							{
684							# As some components such as property name are optional, they will appear
685							# in the order array but may or may not have have a value, so check
686							# for undefined values
687	0	0				0	if ( $component_vals{$component} )
688							{
689	0					0	push(@cased_address,$component_vals{$component});
690							}
691							}
692							}
693
694	0	0	0			0	if ( $address->{error} and $address->{force_case} )
695							{
696							# Despite errors, try to name case non-matching section. As the format
697							# of this section is unknown, surname case will provide the best
698							# approximation
699	0					0	push(@cased_address,&Lingua::EN::NameParse::case_surname($address->{properties}{non_matching}));
700							}
701
702	0					0	return(join(' ',@cased_address));
703							}
704							#------------------------------------------------------------------------------
705							sub properties
706							{
707	2			2	1	11	my $address = shift;
708	2					2	return(%{ $address->{properties} });
	2					11
709							}
710
711							#------------------------------------------------------------------------------
712							# Create a text report to standard output listing
713							# - the input string,
714							# - the name of each defined component
715							# - any non matching component
716
717							sub report
718							{
719	0			0	1	0	my $address = shift;
720
721	0					0	my $report = '';
722
723	0					0	_fmt_report_line(\$report,"Original Input",$address->{original_input});
724	0					0	_fmt_report_line(\$report,"Cleaned Input",$address->{input_string});
725	0					0	_fmt_report_line(\$report,"Country address format",$address->{country_code});
726
727	0					0	my %props = $address->properties;
728	0	0				0	if ( $props{type} )
729							{
730	0					0	_fmt_report_line(\$report,"Address type",$props{type});
731							}
732
733
734	0					0	_fmt_report_line(\$report,"Non matching part",$props{non_matching});
735	0					0	_fmt_report_line(\$report,"Error",$address->{error});
736	0					0	_fmt_report_line(\$report,"Error descriptions",$address->{error_desc});
737	0					0	_fmt_report_line(\$report,"Warning",$address->{error});
738	0					0	_fmt_report_line(\$report,"Warning description",$address->{warning_desc});
739	0					0	_fmt_report_line(\$report,"Case all",$address->case_all);
740
741
742	0					0	_fmt_report_line(\$report,"COMPONENTS",'');
743	0					0	my %comps = $address->components;
744	0					0	foreach my $comp ( sort keys %comps)
745							{
746	0	0				0	if (defined($comps{$comp}) )
747							{
748	0					0	_fmt_report_line(\$report,$comp,$comps{$comp});
749							}
750							}
751
752	0					0	return($report);
753							}
754
755
756							#------------------------------------------------------------------------------
757
758							# PRIVATE METHODS
759
760							#------------------------------------------------------------------------------
761
762							sub _assemble
763							{
764
765	15			15		25	my $address = shift;
766	15					29	my ($pre_cursor,$po_box_type,$level,$building) = @_;
767
768							# Parse the address according to the rules defined in the AddressParse::Grammar module,
769							# $::RD_TRACE = 1; # for debugging RecDescent output
770							# Use Parse::RecDescent to do the parsing. 'full_address' is a label for the complete grammar tree
771	15					147	my $parsed_address = $address->{parse}->full_address($address->{input_string});
772
773							# Place components into a separate hash, so they can be easily returned to the user to inspect and modify
774	15					191025	$address->{components} = ();
775
776	15	100				146	if ($pre_cursor)
777							{
778	1					4	$address->{components}{'pre_cursor'} = $pre_cursor;
779							}
780							else
781							{
782	14					46	$address->{components}{'pre_cursor'} = '';
783							}
784
785	15	100				51	if ($level)
786							{
787	1					4	$address->{components}{'level'} = $level;
788							}
789							else
790							{
791	14					33	$address->{components}{'level'} = '';
792							}
793
794	15	100				40	if ($building)
795							{
796	1					3	$address->{components}{'building'} = $building;
797							}
798							else
799							{
800	14					28	$address->{components}{'building'} = '';
801							}
802
803
804	15	100				42	if ($po_box_type)
805							{
806	1					4	$address->{components}{'po_box_type'} = $po_box_type;
807							}
808							else
809							{
810	14					36	$address->{components}{'po_box_type'} = '';
811							}
812
813
814	15					42	$address->{components}{post_box} = '';
815	15	100				65	if ( $parsed_address->{post_box} )
816							{
817	1					3	$address->{components}{post_box} = $parsed_address->{post_box};
818							}
819
820	15					30	$address->{components}{road_box} = '';
821	15	50				57	if ( $parsed_address->{road_box} )
822							{
823	0					0	$address->{components}{road_box} = $parsed_address->{road_box};
824							}
825
826	15					42	$address->{components}{property_name} = '';
827	15	100				44	if ( $parsed_address->{property_name} )
828							{
829	1					4	$address->{components}{property_name} = $parsed_address->{property_name};
830							}
831
832	15					48	$address->{components}{sub_property_identifier} = '';
833	15					63	$address->{components}{sub_property_type} = '';
834
835	15	100				46	if ( $parsed_address->{sub_property} )
836							{
837	4	50				31	if ($parsed_address->{sub_property} =~ /^(#\|[A-Z]{1,}) (.*)$/ )
		0
838							{
839							# Such as Unit 24, # 4A etc
840	4					12	$address->{components}{sub_property_type} = $1;
841	4					13	$address->{components}{sub_property_identifier} = $2;
842							}
843							elsif ($parsed_address->{sub_property} =~ /^(\d\w\w) (.*)$/ )
844							{
845							# Such as 1st Floor
846	0					0	$address->{components}{sub_property_type} = $2;
847	0					0	$address->{components}{sub_property_identifier} = $1;
848							}
849							}
850
851	15					35	$address->{components}{property_identifier} = '';
852	15	100				50	if ( $parsed_address->{property_identifier} )
853							{
854	13					34	$address->{components}{property_identifier} = $parsed_address->{property_identifier};
855							}
856
857	15					46	$address->{components}{base_street_name} = '';
858	15					26	$address->{components}{street_direction_prefix} = '';
859	15					61	my ($street_direction,$base_street_name) = _get_street_direction($parsed_address->{street_name});
860	15	50				42	if ($street_direction )
861							{
862	0					0	$address->{components}{street_direction_prefix} = $street_direction;
863	0					0	$address->{components}{base_street_name} = $base_street_name;
864							}
865							else
866							{
867	15					36	$address->{components}{base_street_name} = $parsed_address->{street_name};
868							}
869
870	15					34	$address->{components}{street_name} = '';
871	15					27	$address->{components}{street_type} = '';
872	15	100				45	if ( $parsed_address->{street_name} )
873							{
874							# Streets such as 'The Corso' will parse as street_name = 'The' and street_type = 'Corso', so seperate out
875	13	50				40	if ( $parsed_address->{street_name} eq 'THE ' )
876							{
877	0					0	$address->{components}{street_name} = 'THE ' . $parsed_address->{street_type};
878							}
879							else
880							{
881	13					32	$address->{components}{street_name} = $parsed_address->{street_name};
882	13					25	$address->{components}{street_type} = $parsed_address->{street_type};
883							}
884							}
885
886
887	15					41	$address->{components}{street_direction_suffix} = '';
888	15	100				43	if ( $parsed_address->{street_direction_suffix} )
889							{
890	1					2	$address->{components}{street_direction_suffix} = $parsed_address->{street_direction_suffix};
891							}
892
893
894	15					45	$address->{components}{suburb} = '';
895	15	50				36	if ( $parsed_address->{suburb} )
896							{
897	15					45	$address->{components}{suburb} = $parsed_address->{suburb};
898							}
899
900	15					28	$address->{components}{subcountry} = '';
901	15	50				58	if ( $parsed_address->{subcountry} )
902							{
903	15					24	my $sub_country = $parsed_address->{subcountry};
904
905							# Force sub country to abbreviated form, South Australia becomes SA, Michigan become MI etc
906	15	100				39	if ($address->{abbreviate_subcountry})
907							{
908	7					51	my $country = Locale::SubCountry->new($address->{country});
909	7					172	my $code = $country->code($sub_country);
910	7	100				619	if ( $code ne 'unknown' )
911							{
912	1					4	$address->{components}{subcountry} = $code;
913							}
914							# sub country already abbreviated
915							else
916							{
917	6					20	$address->{components}{subcountry} = $sub_country;
918							}
919							}
920							else
921							{
922	8					24	$address->{components}{subcountry} = $sub_country;
923							}
924							}
925
926	15					27	$address->{components}{post_code} = '';
927	15	50				48	if ( $parsed_address->{post_code} )
928							{
929	15					29	$address->{components}{post_code} = $parsed_address->{post_code};
930							}
931
932	15					32	$address->{components}{country} = '';
933	15	100				44	if ( $parsed_address->{country} )
934							{
935	1					2	$address->{components}{country} = $parsed_address->{country};
936							}
937
938	15					40	$address->{properties} = ();
939
940	15					43	$address->{properties}{non_matching} = '';
941	15	100				43	if ( $parsed_address->{non_matching} )
942							{
943	1					2	$address->{properties}{non_matching} = $parsed_address->{non_matching};
944							}
945	15					36	$address->{properties}{type} = $parsed_address->{type};
946
947	15					48	_trim_trailing_space($address);
948
949	15					64	return($address);
950							}
951
952							#-------------------------------------------------------------------------------
953							#
954							sub _get_street_direction
955							{
956	15			15		30	my ($street_name) = @_;
957
958	15					19	my $street_direction;
959							my $base_street_name;
960
961	15	100				50	unless ($street_name)
962							{
963	2					4	return;
964							}
965
966	13					85	my @words = split(/\s/,$street_name);
967	13	100				39	if (@words > 1)
968							{
969	4	50				18	if ( $words[0] =~ /^(N\|NE\|NW\|E\|S\|SE\|SW\|W\|NORTH\|EAST\|SOUTH\|WEST\|NTH\|STH)$/ )
970							{
971	0					0	$street_direction = $1;
972	0					0	shift(@words);
973	0					0	$base_street_name = join(' ',@words);
974							}
975							}
976	13					39	return($street_direction,$base_street_name);
977
978							}
979
980							#------------------------------------------------------------------------------
981							# Check for several different types of syntax errors
982
983							sub _validate
984							{
985	15			15		28	my $address = shift;
986	15					35	$address->{error} = 0;
987	15					30	$address->{error_desc} = '';
988	15					25	$address->{warning} = 0;
989	15					37	$address->{warning_desc} = '';
990
991	15	100				40	if ( $address->{properties}{non_matching} )
992							{
993	1					2	$address->{error} = 1;
994	1					4	$address->{error_desc} = 'non matching section : ' . $address->{properties}{non_matching};
995							}
996							else
997							{
998	14	50				40	if ( $address->{properties}{type} eq 'unknown' )
999							{
1000	0					0	$address->{error} = 1;
1001	0					0	$address->{error_desc} .= 'unknown address format';
1002							}
1003							else
1004							{
1005	14	50	33			107	if ($address->{force_post_code} and not $address->{components}{post_code})
1006							{
1007	0					0	$address->{error} = 1;
1008	0					0	$address->{error_desc} .= ':no post code';
1009							}
1010
1011							# illegal characters found, note a '#' can appear as an abbreviation for number in USA addresses
1012	14	50				66	if ( $address->{input_string} =~ /[^"A-Z0-9'\-\.,&#\/ ]/ )
1013							{
1014							# Note, if auto_clean is on, illegal characters will have been removed
1015							# for second parsing and no error flag or message reported
1016	0					0	$address->{error} = 1;
1017	0					0	$address->{error_desc} .= ':illegal chars';
1018							}
1019	14	100				37	if ( $address->{properties}{type} eq 'suburban' )
1020							{
1021	12					26	my $street = $address->{components}{street_name};
1022	12	100				42	if ($street !~ /\d/ )
1023							{
1024							# Not an ordinal or single letter street type
1025	11	100				31	if ( _check_vowel($address->{components}{base_street_name}) )
1026							{
1027							# street name must have a vowel sound,
1028	1					2	$address->{warning} = 1;
1029	1					6	$address->{warning_desc} .= ";no vowel sound in street word : $address->{components}{base_street_name}";
1030							}
1031							}
1032							}
1033
1034	14	50				40	if ( _check_vowel($address->{components}{suburb}) )
1035							{
1036	0					0	$address->{warning} = 1;
1037	0					0	$address->{warning_desc} .= ";no vowel sound in suburb word : $address->{components}{suburb}";
1038							}
1039							}
1040							}
1041							}
1042							#-------------------------------------------------------------------------------
1043							# Purge the input string of illegal or redundant characters.
1044							# Correct malformed patterns
1045
1046							sub _clean
1047							{
1048	12			12		13	my $address = shift;
1049
1050	12					29	my ($input) = $address->{input_string};
1051
1052							# Remove annotations enclosed in brackets, such as 1 Smith St (Cnr Brown St)
1053	12					25	$input =~ s\|$.*$\|\|;
1054
1055							# Normalise half house numbers, such as 12.5 to 12 1/2. This is needed now before full stops are stripped out
1056	12					26	$input =~ s\|^(\d{1,4})\.5 \|$1 1/2 \|;
1057
1058							# strip full stops, remove illegal characters
1059							# & can be part of property name
1060							# hash (#) may denote number for USA address
1061							# quotes can occur as property name delimiters
1062
1063	12					32	$input =~ s\|[^A-Za-z0-9&#/'" -]\|\|go;
1064
1065							# remove repeating, leading and trailing spaces
1066	12					25	$input =~ s\| +\| \|go ;
1067	12					28	$input =~ s\|^ \|\|;
1068	12					20	$input =~ s\| $\|\|;
1069
1070
1071							# Expand abbreviations that are too short
1072
1073	12					26	$input =~ s/LAKE ST (GEORGE\|CLAIR)/LAKE SAINT $1/; # otherwise St gets consumed to early as 'Street'
1074	12					28	$input =~ s\| CSEWY \| CAUSEWAY \|;
1075
1076							# Standardise abbreviations
1077
1078	12					23	$input =~ s\|STR \|ST \|;
1079	12					24	$input =~ s\|TERR \|TERRACE \|;
1080
1081	12					14	$input =~ s\|^FCTR?Y \|FACTORY \|;
1082	12					19	$input =~ s\|^FACT?R?Y? \|FACTORY \|;
1083
1084	12					27	$input =~ s\|LVL \|LEVEL \|; # sub property identifiers
1085	12					34	$input =~ s\|^UN? \|UNIT \|;
1086	12					25	$input =~ s\|^U(\d+)\|UNIT $1\|;
1087
1088							# Fix badly formed number dividers such as home unit format of 14/ 12 becomes 14/12, 2- 7A becomes 2-7A
1089	12					17	$input =~ s\|/ \|/\|;
1090	12					23	$input =~ s\| /\|/\|;
1091	12					22	$input =~ s\|- \|-\|;
1092	12					22	$input =~ s\| -\|-\|;
1093
1094							# Remove redundant spaces in property identifiers, 21 B Smith St becomes 21B Smith St
1095
1096	12	50				61	if ( $input !~ /^\d+ [A-Z] (ST\|AVE)/ )
1097							{
1098							# Don't remove space before single letter streets such as 21 B Street
1099	12	50				46	if ( $address->{country_code} eq 'US' )
1100							{
1101							# Note cannot use N,E,S,W as they can be street direction prefix, as in 1 E MAIN STREET
1102							# Assume that the direction prefix is the more likely case
1103	0					0	$input =~ s\|^(\d+) ([A-DF-MO-RT-VX-Z] )\|$1$2\|;
1104							}
1105							else
1106							{
1107	12					42	$input =~ s\|^(\d+) ([A-Z] )\|$1$2\|;
1108							}
1109							}
1110
1111							# Normalise sub property identifiers
1112	12	50				30	if ( $address->{country_code} eq 'US' )
1113							{
1114							# Fix US sub property identifiers that appear after street name and type
1115							# add space between # or 'Apt' and the number so #2 becomes '# 2'
1116	0					0	$input =~ s\| #(\d)\| # $1\|;
1117	0					0	$input =~ s\| #([A-Z])\| # $1\|;
1118	0					0	$input =~ s\| (APT)(\d)\| $1 $2\|i;
1119
1120	0					0	$input =~ s\| (APT)-(\w)\| $1 $2\|i;
1121
1122							# remove redundant space so # 34 B becomes # 34B
1123	0					0	$input =~ s\| # (\d+) (\w) \| # $1$2 \|;
1124
1125							# remove redundant '#'
1126	0					0	$input =~ s/ (APT\|SUITE\|UNIT) #/ $1 /;
1127							# still to test
1128
1129							}
1130							else
1131							{
1132							# Add a space to separate sub property type from number, UNIT2 becomes UNIT 2
1133	12					33	$input =~ s/^(UNIT\|LOT\|APT\|SHOP)(\d)/$1 $2/;
1134							}
1135
1136							# Remove redundant slash or dash
1137							# Unit 1B/22, becomes Unit 1B 22, Flat 2-12 becomes Flat 2 12
1138	12					29	$input =~ s/^([A-Z]{2,}) (\d+[A-Z]?)[\/-]/$1 $2 /;
1139							# Unit J1/ 39 becomes Unit J1 39
1140	12					25	$input =~ s/^([A-Z]{2,}) ([A-Z]\d{0,3})[\/-]/$1 $2 /;
1141
1142
1143							# remove dash that does not from a sequence, such as D-5 or 22-A
1144	12					21	$input =~ s\|([A-Z])-(\d)\|$1$2\|;
1145	12					21	$input =~ s\|(\d)-([A-Z])\|$1$2\|;
1146
1147
1148	12					38	return($input);
1149							}
1150							#-------------------------------------------------------------------------------
1151							# Remove any "care of" type of precursor from the main address
1152							# such as: C/O BRAKEFIELD BETTY S PO BOX 214 GULF HAMMOCK, FL 32639-0214
1153							# It will later be saved as an attribute in the address object
1154
1155							sub _extract_precursor
1156							{
1157	15			15		25	my ($input) = @_;
1158	15					25	my ($pre_cursor,$address_start,$address_end);
1159
1160	15	100				92	if ($input =~ m{^(C/O.?\|ATTN.?) (\d+\|PO BOX)( .*)})
1161							{
1162	1					5	$pre_cursor = $1;
1163	1					3	$address_start = $2;
1164	1					3	$address_end = $3;
1165	1					6	return($pre_cursor, $address_start . $address_end);
1166							}
1167							else
1168							{
1169	14					61	return('',$input)
1170							}
1171							}
1172							#-------------------------------------------------------------------------------
1173							# Remove any level or floor info such as:
1174							# 12 Smith St Floor 2
1175							# Level 22 Suite 3 12 Main St
1176							# It will later be saved as an attribute in the address object
1177
1178							sub _extract_level
1179							{
1180	15			15		21	my ($input) = @_;
1181	15					19	my ($level);
1182
1183	15	100	33			238	if
			66
			66
1184							(
1185							# Level info could be at start of string so first space is optional
1186							$input =~ / ?((FIRST\|SECONND\|THIRD\|FOURTH\|FIFTH\|SIXTH) (FLOOR\|FLR\|FL) )/ or
1187							$input =~ / ?(\d{1,2}(ST\|ND\|RD\|TH) (FLOOR\|FLR\|FL) )/ or
1188							$input =~ / ?(LEVEL (\d{1,2}\|[GM])[\/ -])/ or
1189							$input =~ / ?((FLOOR\|FLR\|FL) \d{1,2}[\/ -])/
1190							)
1191							{
1192	1					5	$level = $1;
1193	1					4	$level =~ s\|/\|\|;
1194	1					3	$level =~ s\|-\|\|;
1195	1					17	$input =~ s/$level//;
1196							}
1197
1198	15					47	return($level,$input);
1199							}
1200							#-------------------------------------------------------------------------------
1201							# Remove any building info such as:
1202							# Building 2 Level 12 123 Smith St
1203							# 12 Main St Tower A Level 2
1204							# It will later be saved as an attribute in the address object
1205
1206							sub _extract_building
1207							{
1208	15			15		18	my ($input) = @_;
1209	15					18	my ($building);
1210
1211	15					64	my $bld = qr{BLOCK\|BLDG?\|BUILDING\|TOWER};
1212	15					39	my $id = qr{[A-Z]\|A[A-Z]\|\d+\|\d{1,3}[A-Z]\|[A-Z]\d{1,3}}; # AA or 12 or 32C or C12
1213
1214	15	100	66			315	if
1215							(
1216							$input =~ / ($bld $id) / or $input =~ /^($bld $id) /
1217							)
1218							{
1219	1					3	$building = $1;
1220	1					3	$building =~ s\|/\|\|;
1221	1					2	$building =~ s\|-\|\|;
1222	1					12	$input =~ s/$building//;
1223							}
1224							# TO DO, North, East etc Building?
1225
1226	15					62	return($building,$input);
1227							}
1228
1229							#-------------------------------------------------------------------------------
1230							# Remove any description that follows the suburb after the main address
1231							# such as: PO BOX 1305 BIBRA LAKE PRIVATE BOXES WA 6965"
1232							# It will be saved as an address attribute
1233
1234							sub _extract_po_box_type
1235							{
1236	15			15		23	my ($input) = @_;
1237	15					34	my ($po_box_type,$address_start,$address_end);
1238
1239	15	100				59	if ($input =~ /^(.?) (PRIVATE BOXES)( .)$/ )
1240							{
1241	1					4	$address_start = $1;
1242	1					3	$po_box_type = $2;
1243	1					2	$address_end = $3;
1244	1					5	return($po_box_type, $address_start . $address_end);
1245							}
1246							else
1247							{
1248	14					45	return('',$input)
1249							}
1250							}
1251							#------------------------------------------------------------------------------
1252							# For correct matching, the grammar of each component must include the
1253							# trailing space that separates it from any following word. This should
1254							# now be removed from each component
1255
1256							sub _trim_trailing_space
1257							{
1258	15			15		28	my ($address) = @_;
1259
1260	15					25	foreach my $key (keys %{ $address->{components} } )
	15					93
1261							{
1262	285	100				379	if ($address->{components}{$key} )
1263							{
1264	110					222	$address->{components}{$key} =~ s/ $//g;
1265							}
1266							}
1267							}
1268							#------------------------------------------------------------------------------
1269
1270							sub _fmt_report_line
1271							{
1272	0			0		0	my ($report_ref,$label,$value) = @_;
1273							# To DO $$ ??
1274	0					0	$$report_ref .= sprintf("%-23.23s '%s'\n",$label,$value);
1275							}
1276							#------------------------------------------------------------------------------
1277
1278							sub _check_vowel
1279							{
1280	25			25		27	my ($str) = @_;
1281
1282	25					58	my @words = split(/ /,$str);
1283	25					34	foreach my $word (@words)
1284							{
1285							# Saint, Mount, Junior, Senior (as in Dr Martin Luther King Snr)
1286	34	100	66			192	if ( length($word) > 1 and $word !~ /[AEIOUY]\|ST\|MT\|DR\|JN?R\|SN?R/ )
1287							{
1288	1					4	return(1);
1289							}
1290							}
1291	24					69	return(0);
1292							}
1293							#------------------------------------------------------------------------------
1294
1295							return(1);