File Coverage

blib/lib/Lingua/EN/AddressParse.pm

Criterion	Covered	Total	%
statement	262	331	79.1
branch	90	124	72.5
condition	21	36	58.3
subroutine	20	23	86.9
pod	6	6	100.0
total	399	520	76.7

line	stmt	bran	cond	sub	pod	time	code
1							=head1 NAME
2
3							Lingua::EN::AddressParse - extract components of a street address from free format text
4
5							=head1 SYNOPSIS
6
7							use Lingua::EN::AddressParse;
8
9							my %args =
10							(
11							country => 'US',
12							auto_clean => 1,
13							force_case => 1,
14							abbreviate_subcountry => 0,
15							abbreviated_subcountry_only => 0,
16							force_post_code => 0
17							);
18
19							my $address = Lingua::EN::AddressParse->new(%args);
20							$error = $address->parse("40 1/2 N OLD MASSACHUSETTS AVE APT 3B Washington Valley Washington 98100: HOLD MAIL");
21
22							print $address->report;
23
24							Country address format 'US'
25							Address type 'suburban'
26							Non matching part 'HOLD MAIL '
27							Error '1'
28							Error descriptions 'non matching section : HOLD MAIL '
29							Warning '1'
30							Warning description ''
31							Case all '40 1/2 N Old Massachusetts Ave Apt 3B Washington Valley WA 98100'
32							COMPONENTS ''
33							base_street_name 'Old Massachusetts'
34							post_code '98100'
35							property_identifier '40 1/2'
36							street_direction_prefix 'N'
37							street_name 'N Old Massachusetts'
38							street_type 'Ave'
39							sub_property_identifier '3B'
40							sub_property_type 'Apt'
41							subcountry 'WASHINGTON'
42							suburb 'Washington Valley'
43
44							%address_components = $address->components;
45							print $address_components{sub_property_type}; # APT
46							print $address_components{sub_property_identifier}; # 3B
47							print $address_components{property_identifier}; # 40 1/2
48
49							%address_properties = $address->properties;
50							print $address_properties{type}; # suburban
51							print $address_properties{non_matching}; # : HOLD MAIL
52
53							$correct_casing = $address->case_all;
54
55
56							=head1 DESCRIPTION
57
58							This module takes as input a suburban, rural or postal address in free format
59							text such as,
60
61							3080 28TH AVE N ST PETERSBURG, FL 33713-3810
62							12 1st Avenue N Suite # 2 Somewhere CA 12345 USA
63							C/O JOHN, KENNETH JR POA 744 WIND RIVER DR SYLVANIA, OH 43560-4317
64
65							9 Church Street, Abertillery, Mid Glamorgan NP13 1DA
66							27 Bury Street, Abingdon, Oxfordshire OX14 3QT
67
68							2A O'CONNELL ST KEW NSW 2123
69							12/3-5 AUBREY ST MOUNT VICTORIA VICTORIA 3133
70							"OLD REGRET" WENTWORTH FALLS NSW 2782 AUSTRALIA
71							GPO Box K318, HAYMARKET, NSW 2000
72
73
74							and attempts to parse it. If successful, the address is broken
75							down into it's components and useful functions can be performed such as :
76
77							converting upper or lower case values to title case (2A O'Connell St Kew NSW 2123)
78							extracting the addresses individual components (2A,O'Connell,St,KEW,NSW,2123)
79							determining the type of format the address is in ('suburban')
80
81
82							If the address cannot be parsed you have the option of cleaning the address
83							of bad characters, or extracting any portion that was parsed and the portion
84							that failed.
85
86							This module can be used for analysing and improving the quality of
87							lists of residential and postal addresses.
88
89							By using a large combination of regular expressiosn with look ahead analysis, patterns
90							can be parsed that confuse many other parsers. Examples are
91
92							Street names with several street types: Lane Cove Road
93							Suburbs which include street types: Smith Road St Marys
94							Suburbs that include state names: Fort Washington Washington
95
96							=head1 DEFINITIONS
97
98							The following terms are used by AddressParse to define the components that
99							can make up an address.
100
101							Pre cursor : C/O MR A Smith...
102							Sub property identifier : Level 1A Unit 2, Apartment B, Lot 12, Suite # 12 ...
103							Property Identifier : 12/66A, 24-34, 2A, 23B/12C, 12/42-44, 2.5
104							Property name : "Old Regret"
105							Post Box : GP0 Box K123, LPO 2345, RMS 23 ...
106							Road Box : RMB 24A, RMS 234 ...
107							Street Direction: North, SE, Sth. etc
108							Street name : O'Hare, New South Head, The Causeway, Broadway
109							Street type : Road, Rd., St, Lane, Highway, Crescent, Circuit ...
110							Suburb : Dee Why, St. John's Wood ...
111							Sub country : NSW, New South Wales, ACT, NY, New Jersey AZ ...
112							Post (zip) code : 2062, 34532-1234, SG12A 9ET
113							Country : Australia, UK, US or Canada
114
115
116
117							The main address formats currently supported are as follows. (a ? means the component is optional):
118
119							'suburban' : sub_property(?) property_identifier(?) street street_type suburb subcountry post_code(?)country(?)
120
121							OR for the USA
122							'suburban' : property_identifier(?) street street_type sub_property(?) suburb subcountry post_code(?) country(?)
123
124							'rural' : property_name suburb subcountry post_code(?) country(?)
125							'post_box' : post_box suburb subcountry post_code(?) country(?)
126							'road_box' : road_box street street_type suburb subcountry post_code(?) country(?)
127							'road_box' : road_box suburb subcountry post_code(?) country(?)
128
129							Note that suburb and subcountry are not optional. The accuracy of the parser is
130							improved by providing as much context as possible. Proding a suburb can ehlp to
131							identify street names that would itherwise be ambigious.
132
133							For the case where you only have a street address, dummy (but still valid) values can be used
134							for suburb (such as 'Somewhere') and sub country (such as 'NY'). These dummy values will
135							be parsed but can be ignored.
136
137							All formats may contain a precursor
138
139							Refer to the component grammar defined in the Lingua::EN::AddressParse::Grammar
140							module for a complete list of combinations.
141
142
143							=head1 METHODS
144
145							=head2 new
146
147							The C method creates an instance of an address object and sets up
148							the grammar used to parse addresses. This must be called before any of the
149							following methods are invoked. Note that the object only needs to be
150							created once, and can be reused with new input data.
151
152							Various setup options may be defined in a hash that is passed as an
153							optional argument to the C method.
154
155							my %args =
156							(
157							country => 'US',
158							auto_clean => 1,
159							force_case => 1,
160							abbreviate_subcountry => 1,
161							abbreviated_subcountry_only => 1,
162							force_post_code => 1
163							);
164
165							my $address = Lingua::EN::AddressParse->new(%args);
166
167							=over 4
168
169							=item country
170
171							The country argument must be specified. It determines the possible list of
172							valid sub countries (states, counties etc, defined in the Locale::SubCountry
173							module) and post code formats. Either the full name or abbreviation may be
174							specified. The currently supported country names and codes are:
175
176							AU or Australia
177							CA or Canada
178							GB or United Kingdom
179							US or United States
180
181							All forms of upper/lower case are acceptable in the country's spelling. If a
182							country name is supplied that the module doesn't recognise, it will die.
183
184							=item force_case (optional)
185
186							This option only applies to the C method, see below.
187
188							=item auto_clean (optional)
189
190							When this option is set to a positive value, the input string is
191							'cleaned' to try and normalise bad patterns. The type of cleaning
192							includes
193
194							remove non alphanumeric characters
195							remove full stops
196							remove redundant white space
197							add missing space separators
198							expand abbreviations to more common forms
199							remove bracketed annotations
200							fix badly formed sub property identifiers
201
202							=item abbreviate_subcountry (optional)
203
204							When this option is set to a positive value, the sub country is forced to it's
205							abbreviated form, so "New South Wales" becomes "NSW". If the sub country is
206							already abbreviated then it's value is not altered.
207
208							=item abbreviated_subcountry_only (optional)
209
210							When this option is set to a positive value, only the abbreviated form
211							of sub country is allowed, such as "NSW" and not "New South Wales". This
212							will make parsing quicker and ensure that addresses comply with postal
213							standards that normally permit only abbreviated sub countries.
214
215							It also avoids matching a sub_country name too early, as in the case of 'Port Washington New Jersey'
216							Normally, 'Washington would be consumed as the sub country, but by first converting
217							the address to 'Port Washington NJ' we avoid this problem
218
219
220							=item force_post_code (optional)
221
222							When this option is set to a positive value, the address must contain
223							a post code. If it does not then an error flag is raised. If this option
224							is set to 0 than a post code is optional.
225
226							By default for this option is true.
227
228							=back
229
230							=head2 parse
231
232							$error = $address->parse("12/3-5 AUBREY ST VERMONT VIC 3133");
233
234							The C method takes a single parameter of a text string containing a
235							address. It attempts to parse the address and break it down into the components
236							described below. If the address is parsed successfully, a 0 is returned,
237							otherwise a 1.
238
239							Note that you can successfully parse all the components of an address and still
240							have an error returned. This occurs when you have non matching data following
241							a valid address. To check if the data is unusable, you also need to use the
242							C method to check the address type is 'unknown'
243
244							This method is a prerequisite for all the following methods.
245
246							=head2 components
247
248							%address = $address->components($upper_case_all);
249							$suburb = $address{suburb};
250
251							If the optional argument $upper_case_all is set to a postive value, all components
252							are converted to upper case.
253
254
255							The C method returns all the address components in a hash. The
256							following keys are used for each component:
257
258
259							pre_cursor - such as 'C/O Mr A Smith'
260							po_box_type - such as 'Private Boxes'
261							post_box
262							road_box
263							sub_property_type
264							sub_property_identifier
265							property_identifier
266							property_name
267							level - such as 12th Floor
268							building - such as Tower A
269							street_direction_prefix (such as East, NW, North etc)
270							base_street_name (the name with direction removed, such as "Main" in "East Main St")
271							street_name (the full street name such as "East Main")
272							street_type
273							street_direction_suffix (US only, abbreviated only such as N, SE etc)
274							suburb
275							subcountry
276							post_code
277							country
278
279							If a component has no matching data for a given address, it's values will be
280							set to the empty string.
281
282							Each component is converted to title case, meaning the first letter of each
283							component is set to capitals and the remainder to lower case.
284
285							Proper name capitalisations such as MacNay and O'Brien are observed
286
287							The following components are not converted to title case:
288
289							post_box
290							road_box
291							subcountry
292							post_code
293							country
294							street_direction_suffix
295
296							If your input data is all upper case and you want to retian that format for parsed
297							data, you will need to apply the 'uc' function to each component.
298
299							=head2 case_all
300
301							$correct_casing = $address->case_all;
302
303							The C method does the same thing as the C method except
304							the entire address is returned as a title cased text string.
305
306							If the force_case option was set in the C method above, address case the
307							entire input string, including any unmatched sections after a recognisable address
308							that failed parsing. This option is useful when you know you have invalid data,
309							but you still want to title case what you have.
310
311							=head2 properties
312
313							The C method returns several properties of the address as a hash.
314							The following keys are used for each property -
315
316							type - either suburban ,rural,post_box,road_box,unknown
317							non_matching - any trailing string not part the address
318
319
320							Additional properties can be accessed with the following
321
322							$address->{original_input}
323							$address->{input_string} - string after auto_clean option has been applied
324							$address->{country_code} - abbreviated Country address format (as defined in the C method)
325							$address->{error} - error flag, 0 = good, 1 = error
326							$address->{error_desc} - text to describe the type of parsing error
327							$address->{warning} - warning flag, 0 = good, 1 = warning
328							$address->{warning_desc} - text to to describe the type of parsing warning(s)
329
330							Warnings mean that the address has parsed but there may still be errors within it's components
331
332
333							=head2 report
334
335							Create a formatted text report
336
337							the input string
338							the cleaned input string
339							the country type
340							the address type
341							any non matching part of input string
342							if any parsing errors occurred
343							error description
344							if any parsing warning occurred
345							warning description
346
347							the name and value of each defined component
348
349
350							Returns a string containing a multi line formatted text report
351
352							=head1 DEPENDENCIES
353
354							L, L, L
355
356							=head1 BUGS
357
358							=head1 LIMITATIONS
359
360							Streets such as 'The Esplanade' will return a street of 'The Esplanade' and a
361							street type of null string.
362
363							The abbreviation 'St' can be interpreted as either street or Saint. This leads to
364							ambiguities such as '12 East St Thomas Lane'. This could be 'East Street', suburb of
365							'Thomas Lane' or 'East St Thomas Lane'. And the first pattern is the more common,
366							that is what will match.
367
368							For US addresses, an ambiguity arises between a street directional suffix and
369							a suburb directional prefix, such as '12 Main St S Springfield CA 92345'. Is it South
370							Main St, or South Springfield? The parser assumes that 'S' belongs to the street
371							description.
372
373							The huge number of character combinations that can form a valid address makes
374							it is impossible to correctly identify them all.
375
376							Valid addresses must contain:
377
378							property address, suburb, subcountry (aka state) in that order.
379
380							This format is widely accepted in Australia and the US.
381
382							UK addresses will often include suburb, town, city and county, formats that
383							are very difficult to parse.
384
385							Property names must be enclosed in single or double quotes like "Old Regret"
386
387							Because of the large combination of possible addresses defined in the grammar,
388							the program is not very fast.
389
390
391							=head1 REFERENCES
392
393							"The Wordsworth Dictionary of Abbreviations & Acronyms" (1997)
394
395							Australian Standard AS4212-1994 "Geographic Information Systems -
396							Data Dictionary for transfer of street addressing information"
397
398							ISO 3166-2:1998, Codes for the representation of names of countries
399							and their subdivisions. Also released as AS/NZS 2632.2:1999
400
401
402							=head1 SEE ALSO
403
404							AddressParse is designed to identify properties, which have a unique physical
405							location. L will also parse addresses for the USA, and can handle
406							locations defined by street intersections, such as: "Hollywood & Vine, Los Angeles, CA"
407							"Mission Street at Valencia Street, San Francisco, CA"
408
409
410							L
411							L
412							L
413							L
414
415							See L
416							for a list of different addressing formats from around the world. And also
417							L
418
419							=head1 REPOSITORY
420
421							L
422
423							=head1 TO DO
424
425							Define grammar for other languages. Hopefully, all that would be needed is
426							to specify a new module with its own grammar, and inherit all the existing
427							methods. I don't have the knowledge of the naming conventions for non-english
428							languages.
429
430							=head1 AUTHOR
431
432							AddressParse was written by Kim Ryan
433
434							=head1 COPYRIGHT AND LICENSE
435
436							Copyright (c) 2018 Kim Ryan. All rights reserved.
437
438							This library is free software; you can redistribute it and/or modify
439							it under the same terms as Perl itself.
440
441
442							=cut
443
444							#------------------------------------------------------------------------------
445
446							package Lingua::EN::AddressParse;
447
448	1			1		491	use strict;
	1					2
	1					21
449	1			1		4	use Carp;
	1					1
	1					70
450	1			1		5	use warnings;
	1					2
	1					18
451	1			1		417	use Lingua::EN::AddressParse::Grammar;
	1					2
	1					28
452	1			1		570	use Lingua::EN::NameParse;
	1					37066
	1					64
453	1			1		12	use Parse::RecDescent;
	1					3
	1					6
454
455							our $VERSION = '1.27';
456
457							#------------------------------------------------------------------------------
458							# Create a new instance of an address parsing object. This step is time
459							# consuming and should normally only be called once in your program.
460
461							sub new
462							{
463	4			4	1	1045	my $class = shift;
464	4					12	my %args = @_;
465
466
467	4	50	33			40	unless (defined $args{country} and $args{country} =~ /^(AU\|Australia\|GB\|United Kingdom\|US\|United States\|CA\|Canada)$/ )
468							{
469	0					0	croak "Cannot start parser. You must specify a value for the country in the options hash.\nValid options are AU,GB,US or CA.\n";
470							}
471
472	4					11	my $address = {};
473	4					8	bless($address,$class);
474
475							# option defaults
476	4					14	$address->{'force_post_code'} = 1;
477
478							# Add error checking for invalid keys?
479	4					11	foreach my $curr_key (keys %args)
480							{
481	8					18	$address->{$curr_key} = $args{$curr_key};
482							}
483
484							# create the grammar tree (this is country dependent)
485	4					18	my $grammar = Lingua::EN::AddressParse::Grammar::_create($address);
486
487	4					26	$address->{parse} = Parse::RecDescent->new($grammar);
488
489	4					3011169	return ($address);
490							}
491							#------------------------------------------------------------------------------
492							sub parse
493							{
494	15			15	1	6906	my $address = shift;
495	15					39	my ($input_string) = @_;
496
497							# Save original data so we can check effect of auto cleaning
498	15					38	$address->{original_input} = $input_string;
499
500							# Convert to all upper case. This will allow for faster regexp matching in
501							# the grammar tree
502	15					41	$address->{input_string} = uc($input_string);
503
504	15					40	chomp($address->{input_string});
505
506	15					32	my $pre_cursor;
507	15					68	($pre_cursor,$address->{input_string}) = _extract_precursor($address->{input_string});
508
509							# Replace commas (which can be used to chunk sections of addresses) with spaces
510	15					47	$address->{input_string} =~ s/,/ /g;
511
512	15	100				69	if ( $address->{auto_clean} )
513							{
514	12					33	$address->{input_string} = _clean($address);
515							}
516
517	15					22	my $po_box_type;
518	15					55	($po_box_type,$address->{input_string}) = _extract_po_box_type($address->{input_string});
519
520	15					30	my $level;
521	15					47	($level,$address->{input_string}) = _extract_level($address->{input_string});
522
523	15					29	my $building;
524	15					47	($building,$address->{input_string}) = _extract_building($address->{input_string});
525
526
527
528							# Normalise sub property ID, 4/22-24 => UNIT 4 22-24, 4 12 => Unit 4 12
529	15	100	100			97	if ($address->{country_code} ne 'US' and $address->{input_string} =~ /^(\d{1,4}[A-Z]{0,2})[\/\| ](\d+[ \w-].*)$/ )
530							{
531	2					11	$address->{input_string} = "UNIT $1 $2";
532							}
533
534							# We need to add a trailing space to the input string. This is because the grammar
535							# tree expects a terminator (the space) fro every production, optionally followed
536							# by other productions or any final non matching text.
537							# This space will be removed in the _assemble function
538	15					35	$address->{input_string} .= ' ';
539
540	15					45	$address = _assemble($address,$pre_cursor,$po_box_type,$level,$building);
541	15					46	_validate($address);
542
543
544	15					50	return($address,$address->{error});
545							}
546
547							#------------------------------------------------------------------------------
548							# Apply correct capitalisation to each component of an address
549
550							sub components
551							{
552	17			17	1	84	my $address = shift;
553	17					29	my ($uc_all) = @_;
554
555	17					25	my %orig_components = %{ $address->{components} };
	17					158
556
557	17					44	my (%cased_components);
558	17					54	foreach my $curr_key ( keys %orig_components )
559							{
560	323					386	my ($cased_value, $curr_value);
561
562	323	100				452	if ( $orig_components{$curr_key})
563							{
564	129					170	$curr_value = $orig_components{$curr_key};
565							}
566							else
567							{
568	194					259	$curr_value = '';
569							}
570
571
572	323	50				474	if ($uc_all)
573							{
574	0					0	$cased_components{$curr_key} = uc($curr_value);
575	0					0	next;
576							}
577
578
579	323	100				800	if ( $curr_key =~ /^(base_street_name\|street_name\|street_type\|suburb\|property_name\|sub_property\|pre_cursor\|po_box_type\|level\|building)/ )
		100
580							{
581
582	187	100	100			445	if ( $curr_key eq 'street_name' and$curr_value =~ /^US HIGHWAY (.*)/ )
		100
583							{
584	1					3	$cased_value = "US Highway $1";
585							}
586							elsif ( $curr_key eq 'sub_property_identifier' )
587							{
588							# UNIT, APT ... 12D etc
589
590	17					40	my @words = split(/ /,$curr_value);
591	17					23	my @cased_words;
592							my $cased_string;
593	17					33	foreach my $word (@words)
594							{
595	5					7	my $cased_word;
596	5	50	66			39	if ( $word =~ /^\d{1,3}(ST\|ND\|RD\|TH)$/)
		50
597							{
598							# ordinal component, as in 3rd Floor
599	0					0	$cased_word = lc($word);
600							}
601							elsif ( length($word) > 1 and $word !~ /\d/ )
602							{
603							# only need to title case words such as UNIT
604	0					0	$cased_word = Lingua::EN::NameParse::case_surname($word);
605							}
606							else
607							{
608	5					11	$cased_word = $word;
609							}
610	5					12	push(@cased_words,$cased_word);
611
612							}
613	17					35	$cased_value = join(' ',@cased_words);
614							}
615							else
616							{
617	169	100				241	if ($curr_value)
618							{
619							# Surnames can be used for street's or suburbs so this method
620							# will give correct capitalisation for most cases
621	71					147	$cased_value = Lingua::EN::NameParse::case_surname($curr_value);
622							}
623							else
624							{
625	98					127	$cased_value = '';
626							}
627							}
628							}
629							# retain street_direction,sub country and countries capitalisation, usually uppercase
630							elsif ($curr_key =~ /street_direction/)
631							{
632	34	100	66			154	if (length($curr_value) == 1 or length($curr_value) == 2)
		50
633							{
634							# N, SE etc is capitalised
635	1					4	$cased_value =$curr_value;
636							}
637							elsif (length($curr_value) > 2)
638							{
639	0					0	$cased_value = Lingua::EN::NameParse::case_surname($curr_value);
640							}
641							else
642							{
643	33					47	$cased_value = '';
644							}
645							}
646							# retain sub country and countries capitalisation, as usually uppercase
647							else
648							{
649	102					143	$cased_value = uc($curr_value);
650							}
651	323					2348	$cased_components{$curr_key} = $cased_value;
652							}
653	17					299	return(%cased_components);
654							}
655							#------------------------------------------------------------------------------
656							# Apply correct capitalisation to an entire address
657
658							sub case_all
659							{
660	0			0	1	0	my $address = shift;
661
662	0					0	my @cased_address;
663
664	0	0				0	unless ( $address->{properties}{type} eq 'unknown' )
665							{
666
667							# Hash of of lists, indicating the order that address components are assembled in.
668							# Each list element is itself the name of the key value in an address object.
669
670	0					0	my %component_order=
671							(
672							'rural' => [ qw/pre_cursor property_name suburb subcountry post_code country/],
673							'post_box'=> [ qw/pre_cursor post_box suburb po_box_type subcountry post_code country/ ],
674							'road_box'=> [ qw/pre_cursor road_box street_name street_type suburb subcountry post_code country/ ]
675
676							);
677	0	0				0	if ( $address->{country} eq 'US' )
678							{
679	0					0	$component_order{'suburban'} = [ qw/pre_cursor property_identifier street_name street_type street_direction_suffix building level sub_property_type sub_property_identifier suburb subcountry post_code country/];
680							}
681							else
682							{
683	0					0	$component_order{'suburban'} = [ qw/pre_cursor building level sub_property_type sub_property_identifier property_identifier street_name street_type suburb subcountry post_code country/ ];
684							}
685
686	0					0	my %component_vals = $address->components;
687	0					0	my @order = @{ $component_order{$address->{properties}{type} } };
	0					0
688
689	0					0	foreach my $component ( @order )
690							{
691							# As some components such as property name are optional, they will appear
692							# in the order array but may or may not have have a value, so check
693							# for undefined values
694	0	0				0	if ( $component_vals{$component} )
695							{
696	0					0	push(@cased_address,$component_vals{$component});
697							}
698							}
699							}
700
701	0	0	0			0	if ( $address->{error} and $address->{force_case} )
702							{
703							# Despite errors, try to name case non-matching section. As the format
704							# of this section is unknown, surname case will provide the best
705							# approximation
706	0					0	push(@cased_address,&Lingua::EN::NameParse::case_surname($address->{properties}{non_matching}));
707							}
708
709	0					0	return(join(' ',@cased_address));
710							}
711							#------------------------------------------------------------------------------
712							sub properties
713							{
714	2			2	1	11	my $address = shift;
715	2					2	return(%{ $address->{properties} });
	2					9
716							}
717
718							#------------------------------------------------------------------------------
719							# Create a text report to standard output listing
720							# - the input string,
721							# - the name of each defined component
722							# - any non matching component
723
724							sub report
725							{
726	0			0	1	0	my $address = shift;
727
728	0					0	my $report = '';
729
730	0					0	_fmt_report_line(\$report,"Original Input",$address->{original_input});
731	0					0	_fmt_report_line(\$report,"Cleaned Input",$address->{input_string});
732	0					0	_fmt_report_line(\$report,"Country address format",$address->{country_code});
733
734	0					0	my %props = $address->properties;
735	0	0				0	if ( $props{type} )
736							{
737	0					0	_fmt_report_line(\$report,"Address type",$props{type});
738							}
739
740
741	0					0	_fmt_report_line(\$report,"Non matching part",$props{non_matching});
742	0					0	_fmt_report_line(\$report,"Error",$address->{error});
743	0					0	_fmt_report_line(\$report,"Error descriptions",$address->{error_desc});
744	0					0	_fmt_report_line(\$report,"Warning",$address->{error});
745	0					0	_fmt_report_line(\$report,"Warning description",$address->{warning_desc});
746	0					0	_fmt_report_line(\$report,"Case all",$address->case_all);
747
748
749	0					0	_fmt_report_line(\$report,"COMPONENTS",'');
750	0					0	my %comps = $address->components;
751	0					0	foreach my $comp ( sort keys %comps)
752							{
753	0	0				0	if (defined($comps{$comp}) )
754							{
755	0					0	_fmt_report_line(\$report,$comp,$comps{$comp});
756							}
757							}
758
759	0					0	return($report);
760							}
761
762
763							#------------------------------------------------------------------------------
764
765							# PRIVATE METHODS
766
767							#------------------------------------------------------------------------------
768
769							sub _assemble
770							{
771
772	15			15		30	my $address = shift;
773	15					34	my ($pre_cursor,$po_box_type,$level,$building) = @_;
774
775							# Parse the address according to the rules defined in the AddressParse::Grammar module,
776							# $::RD_TRACE = 1; # for debugging RecDescent output
777							# Use Parse::RecDescent to do the parsing. 'full_address' is a label for the complete grammar tree
778	15					118	my $parsed_address = $address->{parse}->full_address($address->{input_string});
779
780							# Place components into a separate hash, so they can be easily returned to the user to inspect and modify
781	15					228790	$address->{components} = ();
782
783	15	100				40	if ($pre_cursor)
784							{
785	1					3	$address->{components}{'pre_cursor'} = $pre_cursor;
786							}
787							else
788							{
789	14					49	$address->{components}{'pre_cursor'} = '';
790							}
791
792	15	100				39	if ($level)
793							{
794	1					3	$address->{components}{'level'} = $level;
795							}
796							else
797							{
798	14					30	$address->{components}{'level'} = '';
799							}
800
801	15	100				29	if ($building)
802							{
803	1					2	$address->{components}{'building'} = $building;
804							}
805							else
806							{
807	14					28	$address->{components}{'building'} = '';
808							}
809
810
811	15	100				28	if ($po_box_type)
812							{
813	1					3	$address->{components}{'po_box_type'} = $po_box_type;
814							}
815							else
816							{
817	14					33	$address->{components}{'po_box_type'} = '';
818							}
819
820
821	15					27	$address->{components}{post_box} = '';
822	15	100				41	if ( $parsed_address->{post_box} )
823							{
824	1					3	$address->{components}{post_box} = $parsed_address->{post_box};
825							}
826
827	15					31	$address->{components}{road_box} = '';
828	15	50				43	if ( $parsed_address->{road_box} )
829							{
830	0					0	$address->{components}{road_box} = $parsed_address->{road_box};
831							}
832
833	15					33	$address->{components}{property_name} = '';
834	15	100				34	if ( $parsed_address->{property_name} )
835							{
836	1					4	$address->{components}{property_name} = $parsed_address->{property_name};
837							}
838
839	15					44	$address->{components}{sub_property_identifier} = '';
840	15					28	$address->{components}{sub_property_type} = '';
841
842	15	100				38	if ( $parsed_address->{sub_property} )
843							{
844	4	50				26	if ($parsed_address->{sub_property} =~ /^(#\|[A-Z]{1,}) (.*)$/ )
		0
845							{
846							# Such as Unit 24, # 4A etc
847	4					14	$address->{components}{sub_property_type} = $1;
848	4					11	$address->{components}{sub_property_identifier} = $2;
849							}
850							elsif ($parsed_address->{sub_property} =~ /^(\d\w\w) (.*)$/ )
851							{
852							# Such as 1st Floor
853	0					0	$address->{components}{sub_property_type} = $2;
854	0					0	$address->{components}{sub_property_identifier} = $1;
855							}
856							}
857
858	15					32	$address->{components}{property_identifier} = '';
859	15	100				39	if ( $parsed_address->{property_identifier} )
860							{
861	13					28	$address->{components}{property_identifier} = $parsed_address->{property_identifier};
862							}
863
864	15					56	($address->{components}{street_direction_prefix},$address->{components}{base_street_name}) = _get_street_direction($parsed_address->{street_name});
865
866	15					35	$address->{components}{street_name} = '';
867	15					28	$address->{components}{street_type} = '';
868	15	100				48	if ( $parsed_address->{street_name} )
869							{
870							# Streets such as 'The Corso' will parse as street_name = 'The' and street_type = 'Corso', so seperate out
871	13	50				38	if ( $parsed_address->{street_name} eq 'THE ' )
872							{
873	0					0	$address->{components}{street_name} = 'THE ' . $parsed_address->{street_type};
874							}
875							else
876							{
877	13					24	$address->{components}{street_name} = $parsed_address->{street_name};
878	13					29	$address->{components}{street_type} = $parsed_address->{street_type};
879							}
880							}
881
882	15					32	$address->{components}{street_direction_suffix} = '';
883	15	100				35	if ( $parsed_address->{street_direction_suffix} )
884							{
885	1					6	$address->{components}{street_direction_suffix} = $parsed_address->{street_direction_suffix};
886							}
887
888
889	15					41	$address->{components}{suburb} = '';
890	15	50				32	if ( $parsed_address->{suburb} )
891							{
892	15					33	$address->{components}{suburb} = $parsed_address->{suburb};
893							}
894
895	15					29	$address->{components}{subcountry} = '';
896	15	50				35	if ( $parsed_address->{subcountry} )
897							{
898	15					31	my $sub_country = $parsed_address->{subcountry};
899
900							# Force sub country to abbreviated form, South Australia becomes SA, Michigan become MI etc
901	15	100				29	if ($address->{abbreviate_subcountry})
902							{
903	7					32	my $country = Locale::SubCountry->new($address->{country});
904	7					148	my $code = $country->code($sub_country);
905	7	100				578	if ( $code ne 'unknown' )
906							{
907	1					3	$address->{components}{subcountry} = $code;
908							}
909							# sub country already abbreviated
910							else
911							{
912	6					20	$address->{components}{subcountry} = $sub_country;
913							}
914							}
915							else
916							{
917	8					18	$address->{components}{subcountry} = $sub_country;
918							}
919							}
920
921	15					32	$address->{components}{post_code} = '';
922	15	50				36	if ( $parsed_address->{post_code} )
923							{
924	15					33	$address->{components}{post_code} = $parsed_address->{post_code};
925							}
926
927	15					28	$address->{components}{country} = '';
928	15	100				62	if ( $parsed_address->{country} )
929							{
930	1					2	$address->{components}{country} = $parsed_address->{country};
931							}
932
933	15					62	$address->{properties} = ();
934
935	15					33	$address->{properties}{non_matching} = '';
936	15	100				33	if ( $parsed_address->{non_matching} )
937							{
938	1					3	$address->{properties}{non_matching} = $parsed_address->{non_matching};
939							}
940	15					32	$address->{properties}{type} = $parsed_address->{type};
941
942	15					41	_trim_trailing_space($address);
943
944	15					60	return($address);
945							}
946
947							#-------------------------------------------------------------------------------
948							#
949							sub _get_street_direction
950							{
951	15			15		29	my ($street_name) = @_;
952
953	15					28	my $street_direction;
954							my $base_street_name;
955
956	15	100				33	unless ($street_name)
957							{
958	2					5	return;
959							}
960
961	13					66	my @words = split(/\s/,$street_name);
962	13	100				37	if (@words > 1)
963							{
964	4	50				15	if ( $words[0] =~ /^(N\|NE\|NW\|E\|S\|SE\|SW\|W\|NORTH\|EAST\|SOUTH\|WEST\|NTH\|STH)$/ )
965							{
966	0					0	$street_direction = $1;
967	0					0	shift(@words);
968	0					0	$base_street_name = join(' ',@words);
969							}
970							else
971							{
972	4					7	$street_direction = '';
973	4					7	$base_street_name = $street_name;
974
975							}
976							}
977							else
978							{
979	9					17	$base_street_name = $street_name;
980							}
981	13					52	return($street_direction,$base_street_name);
982
983							}
984
985							#------------------------------------------------------------------------------
986							# Check for several different types of syntax errors
987
988							sub _validate
989							{
990	15			15		25	my $address = shift;
991	15					32	$address->{error} = 0;
992	15					28	$address->{error_desc} = '';
993	15					29	$address->{warning} = 0;
994	15					25	$address->{warning_desc} = '';
995
996	15	100				39	if ( $address->{properties}{non_matching} )
997							{
998	1					2	$address->{error} = 1;
999	1					4	$address->{error_desc} = 'non matching section : ' . $address->{properties}{non_matching};
1000							}
1001							else
1002							{
1003	14	50				35	if ( $address->{properties}{type} eq 'unknown' )
1004							{
1005	0					0	$address->{error} = 1;
1006	0					0	$address->{error_desc} .= 'unknown address format';
1007							}
1008							else
1009							{
1010	14	50	33			117	if ($address->{force_post_code} and not $address->{components}{post_code})
1011							{
1012	0					0	$address->{error} = 1;
1013	0					0	$address->{error_desc} .= ':no post code';
1014							}
1015
1016							# illegal characters found, note a '#' can appear as an abbreviation for number in USA addresses
1017	14	50				51	if ( $address->{input_string} =~ /[^"A-Z0-9'\-\.,&#\/ ]/ )
1018							{
1019							# Note, if auto_clean is on, illegal characters will have been removed
1020							# for second parsing and no error flag or message reported
1021	0					0	$address->{error} = 1;
1022	0					0	$address->{error_desc} .= ':illegal chars';
1023							}
1024	14	100				40	if ( $address->{properties}{type} eq 'suburban' )
1025							{
1026	12					28	my $street = $address->{components}{street_name};
1027	12	100				37	if ($street !~ /\d/ )
1028							{
1029							# Not an ordinal or single letter street type
1030	11	100				32	if ( _check_vowel($address->{components}{base_street_name}) )
1031							{
1032							# street name must have a vowel sound,
1033	1					3	$address->{warning} = 1;
1034	1					5	$address->{warning_desc} .= ";no vowel sound in street word : $address->{components}{base_street_name}";
1035							}
1036							}
1037							}
1038
1039	14	50				32	if ( _check_vowel($address->{components}{suburb}) )
1040							{
1041	0					0	$address->{warning} = 1;
1042	0					0	$address->{warning_desc} .= ";no vowel sound in suburb word : $address->{components}{suburb}";
1043							}
1044							}
1045							}
1046							}
1047							#-------------------------------------------------------------------------------
1048							# Purge the input string of illegal or redundant characters.
1049							# Correct malformed patterns
1050
1051							sub _clean
1052							{
1053	12			12		20	my $address = shift;
1054
1055	12					26	my ($input) = $address->{input_string};
1056
1057							# Remove annotations enclosed in brackets, such as 1 Smith St (Cnr Brown St)
1058	12					21	$input =~ s\|$.*$\|\|;
1059
1060							# Normalise half house numbers, such as 12.5 to 12 1/2. This is needed now before full stops are stripped out
1061	12					24	$input =~ s\|^(\d{1,4})\.5 \|$1 1/2 \|;
1062
1063							# strip full stops, remove illegal characters
1064							# & can be part of property name
1065							# hash (#) may denote number for USA address
1066							# quotes can occur as property name delimiters
1067
1068	12					35	$input =~ s\|[^A-Za-z0-9&#/'" -]\|\|go;
1069
1070							# remove repeating, leading and trailing spaces
1071	12					20	$input =~ s\| +\| \|go ;
1072	12					21	$input =~ s\|^ \|\|;
1073	12					25	$input =~ s\| $\|\|;
1074
1075
1076							# Expand abbreviations that are too short
1077
1078	12					20	$input =~ s/LAKE ST (GEORGE\|CLAIR)/LAKE SAINT $1/; # otherwise St gets consumed to early as 'Street'
1079	12					20	$input =~ s\| CSEWY \| CAUSEWAY \|;
1080
1081							# Standardise abbreviations
1082
1083	12					31	$input =~ s\|STR \|ST \|;
1084	12					21	$input =~ s\|TERR \|TERRACE \|;
1085
1086	12					20	$input =~ s\|^FCTR?Y \|FACTORY \|;
1087	12					19	$input =~ s\|^FACT?R?Y? \|FACTORY \|;
1088
1089	12					25	$input =~ s\|LVL \|LEVEL \|; # sub property identifiers
1090	12					25	$input =~ s\|^UN? \|UNIT \|;
1091	12					17	$input =~ s\|^U(\d+)\|UNIT $1\|;
1092
1093							# Fix badly formed number dividers such as home unit format of 14/ 12 becomes 14/12, 2- 7A becomes 2-7A
1094	12					21	$input =~ s\|/ \|/\|;
1095	12					17	$input =~ s\| /\|/\|;
1096	12					21	$input =~ s\|- \|-\|;
1097	12					16	$input =~ s\| -\|-\|;
1098
1099							# Remove redundant spaces in property identifiers, 21 B Smith St becomes 21B Smith St
1100
1101	12	50				44	if ( $input !~ /^\d+ [A-Z] (ST\|AVE)/ )
1102							{
1103							# Don't remove space before single letter streets such as 21 B Street
1104	12	50				37	if ( $address->{country_code} eq 'US' )
1105							{
1106							# Note cannot use N,E,S,W as they can be street direction prefix, as in 1 E MAIN STREET
1107							# Assume that the direction prefix is the more likely case
1108	0					0	$input =~ s\|^(\d+) ([A-DF-MO-RT-VX-Z] )\|$1$2\|;
1109							}
1110							else
1111							{
1112	12					30	$input =~ s\|^(\d+) ([A-Z] )\|$1$2\|;
1113							}
1114							}
1115
1116							# Normalise sub property identifiers
1117	12	50				30	if ( $address->{country_code} eq 'US' )
1118							{
1119							# Fix US sub property identifiers that appear after street name and type
1120							# add space between # or 'Apt' and the number so #2 becomes '# 2'
1121	0					0	$input =~ s\| #(\d)\| # $1\|;
1122	0					0	$input =~ s\| #([A-Z])\| # $1\|;
1123	0					0	$input =~ s\| (APT)(\d)\| $1 $2\|i;
1124
1125	0					0	$input =~ s\| (APT)-(\w)\| $1 $2\|i;
1126
1127							# remove redundant space so # 34 B becomes # 34B
1128	0					0	$input =~ s\| # (\d+) (\w) \| # $1$2 \|;
1129
1130							# remove redundant '#'
1131	0					0	$input =~ s/ (APT\|SUITE\|UNIT) #/ $1 /;
1132							# still to test
1133							}
1134							else
1135							{
1136							# Add a space to separate sub property type from number, UNIT2 becomes UNIT 2
1137	12					27	$input =~ s/^(UNIT\|LOT\|APT\|SHOP)(\d)/$1 $2/;
1138							}
1139
1140							# Remove redundant slash or dash
1141							# Unit 1B/22, becomes Unit 1B 22, Flat 2-12 becomes Flat 2 12
1142	12					25	$input =~ s/^([A-Z]{2,}) (\d+[A-Z]?)[\/-]/$1 $2 /;
1143							# Unit J1/ 39 becomes Unit J1 39
1144	12					25	$input =~ s/^([A-Z]{2,}) ([A-Z]\d{0,3})[\/-]/$1 $2 /;
1145
1146
1147							# remove dash that does not from a sequence, such as D-5 or 22-A
1148	12					24	$input =~ s\|([A-Z])-(\d)\|$1$2\|;
1149	12					22	$input =~ s\|(\d)-([A-Z])\|$1$2\|;
1150
1151
1152	12					30	return($input);
1153							}
1154							#-------------------------------------------------------------------------------
1155							# Remove any "care of" type of precursor from the main address
1156							# such as: C/O BRAKEFIELD BETTY S PO BOX 214 GULF HAMMOCK, FL 32639-0214
1157							# It will later be saved as an attribute in the address object
1158
1159							sub _extract_precursor
1160							{
1161	15			15		35	my ($input) = @_;
1162	15					28	my ($pre_cursor,$address_start,$address_end);
1163
1164	15	100				80	if ($input =~ m{^(C/O.?\|ATTN.?) (\d+\|PO BOX)( .*)})
1165							{
1166	1					4	$pre_cursor = $1;
1167	1					3	$address_start = $2;
1168	1					4	$address_end = $3;
1169	1					14	return($pre_cursor, $address_start . $address_end);
1170							}
1171							else
1172							{
1173	14					51	return('',$input)
1174							}
1175							}
1176							#-------------------------------------------------------------------------------
1177							# Remove any level or floor info such as:
1178							# 12 Smith St Floor 2
1179							# Level 22 Suite 3 12 Main St
1180							# It will later be saved as an attribute in the address object
1181
1182							sub _extract_level
1183							{
1184	15			15		31	my ($input) = @_;
1185	15					24	my ($level);
1186
1187	15	100	33			167	if
			66
			66
1188							(
1189							# Level info could be at start of string so first space is optional
1190							# TO DO, add lower?, mezz, BASEMENT etc?
1191							$input =~ / ?((GROUND\|FIRST\|SECOND\|THIRD\|FOURTH\|FIFTH\|SIXTH) (FLOOR\|FLR\|FL) )/ or
1192							$input =~ / ?(\d{1,2}(ST\|ND\|RD\|TH) (FLOOR\|FLR\|FL) )/ or
1193							$input =~ / ?(LEVEL (\d{1,2}\|[GM])[\/ -])/ or
1194							$input =~ / ?((FLOOR\|FLR\|FL) \d{1,2}[\/ -])/
1195							)
1196							{
1197	1					4	$level = $1;
1198	1					3	$level =~ s\|/\|\|;
1199	1					2	$level =~ s\|-\|\|;
1200	1					12	$input =~ s/$level//;
1201							}
1202
1203	15					49	return($level,$input);
1204							}
1205							#-------------------------------------------------------------------------------
1206							# Remove any building info such as:
1207							# Building 2 Level 12 123 Smith St
1208							# 12 Main St Tower A Level 2
1209							# It will later be saved as an attribute in the address object
1210
1211							sub _extract_building
1212							{
1213	15			15		35	my ($input) = @_;
1214	15					21	my ($building);
1215
1216	15					58	my $building_label = qr{BLOCK\|BLDG?\|BUILDING\|TOWER};
1217	15					60	my $id = qr{[A-Z]\|A[A-Z]\|\d+\|\d{1,3}[A-Z]\|[A-Z]\d{1,3}}; # AA or 12 or 32C or C12
1218
1219	15	100	66			288	if
1220							(
1221							$input =~ /^($building_label $id) / or $input =~ / ($building_label $id) /
1222							)
1223							{
1224	1					3	$building = $1;
1225	1					4	$building =~ s\|/\|\|;
1226	1					2	$building =~ s\|-\|\|;
1227	1					7	$input =~ s/$building//;
1228							}
1229							# TO DO, North, East etc Building?
1230
1231
1232	15					79	my $name = qr{[A-Z]+}; #
1233	15					25	my $house;
1234
1235							# Allow for x house at start of text only. Not for example: 12 Gate House Road, suburb
1236	15	50				98	if ( $input =~ /^(($name )?$name HOUSE) / )
1237							{
1238	0					0	$house = $1;
1239	0					0	$input =~ s/$house//;
1240	0					0	$building .= $house;
1241							}
1242
1243	15					81	return($building,$input);
1244							}
1245
1246							#-------------------------------------------------------------------------------
1247							# Remove any description that follows the suburb after the main address
1248							# such as: PO BOX 1305 BIBRA LAKE PRIVATE BOXES WA 6965"
1249							# It will be saved as an address attribute
1250
1251							sub _extract_po_box_type
1252							{
1253	15			15		36	my ($input) = @_;
1254	15					29	my ($po_box_type,$address_start,$address_end);
1255
1256	15	100				52	if ($input =~ /^(.?) (PRIVATE BOXES)( .)$/ )
1257							{
1258	1					3	$address_start = $1;
1259	1					4	$po_box_type = $2;
1260	1					2	$address_end = $3;
1261	1					5	return($po_box_type, $address_start . $address_end);
1262							}
1263							else
1264							{
1265	14					52	return('',$input)
1266							}
1267							}
1268							#------------------------------------------------------------------------------
1269							# For correct matching, the grammar of each component must include the
1270							# trailing space that separates it from any following word. This should
1271							# now be removed from each component
1272
1273							sub _trim_trailing_space
1274							{
1275	15			15		28	my ($address) = @_;
1276
1277	15					24	foreach my $key (keys %{ $address->{components} } )
	15					77
1278							{
1279	285	100				480	if ($address->{components}{$key} )
1280							{
1281	110					279	$address->{components}{$key} =~ s/ $//g;
1282							}
1283							}
1284							}
1285							#------------------------------------------------------------------------------
1286
1287							sub _fmt_report_line
1288							{
1289	0			0		0	my ($report_ref,$label,$value) = @_;
1290	0					0	$$report_ref .= sprintf("%-23.23s '%s'\n",$label,$value);
1291							}
1292							#------------------------------------------------------------------------------
1293
1294							sub _check_vowel
1295							{
1296	25			25		51	my ($str) = @_;
1297
1298	25					65	my @words = split(/ /,$str);
1299	25					44	foreach my $word (@words)
1300							{
1301							# Saint, Mount, Junior, Senior (as in Dr Martin Luther King Snr)
1302	34	100	66			186	if ( length($word) > 1 and $word !~ /[AEIOUY]\|ST\|MT\|DR\|JN?R\|SN?R/ )
1303							{
1304	1					4	return(1);
1305							}
1306							}
1307	24					73	return(0);
1308							}
1309							#------------------------------------------------------------------------------
1310
1311							return(1);