File Coverage

blib/lib/Lingua/EN/AddressParse.pm

Criterion	Covered	Total	%
statement	255	322	79.1
branch	88	122	72.1
condition	22	36	61.1
subroutine	20	23	86.9
pod	6	6	100.0
total	391	509	76.8

line	stmt	bran	cond	sub	pod	time	code
1							=head1 NAME
2
3							Lingua::EN::AddressParse - extract components of a street address, presented as a text string
4
5							=head1 SYNOPSIS
6
7							use Lingua::EN::AddressParse;
8
9							my %args =
10							(
11							country => 'US',
12							auto_clean => 1,
13							force_case => 1,
14							abbreviate_subcountry => 0,
15							abbreviated_subcountry_only => 1,
16							force_post_code => 0
17							);
18
19							my $address = Lingua::EN::AddressParse->new(%args);
20							$error = $address->parse("40 1/2 N OLD MASSACHUSETTS AVE APT 3B Washington Valley Washington 98100: HOLD MAIL");
21
22							print $address->report;
23
24							Country address format 'US'
25							Address type 'suburban'
26							Non matching part 'HOLD MAIL '
27							Error '1'
28							Error descriptions 'non matching section : HOLD MAIL '
29							Warning '1'
30							Warning description ''
31							Case all '40 1/2 N Old Massachusetts Ave Apt 3B Washington Valley WA 98100'
32							COMPONENTS ''
33							base_street_name 'Old Massachusetts'
34							post_code '98100'
35							property_identifier '40 1/2'
36							street_direction_prefix 'N'
37							street_name 'N Old Massachusetts'
38							street_type 'Ave'
39							sub_property_identifier '3B'
40							sub_property_type 'Apt'
41							subcountry 'WA'
42							suburb 'Washington Valley'
43
44							%address_components = $address->components;
45							print $address_components{sub_property_type}; # APT
46							print $address_components{sub_property_identifier}; # 3B
47							print $address_components{property_identifier}; # 40 1/2
48
49							%address_properties = $address->properties;
50							print $address_properties{type}; # suburban
51							print $address_properties{non_matching}; # : HOLD MAIL
52
53							$correct_casing = $address->case_all;
54
55
56							=head1 DESCRIPTION
57
58							This module takes as input a suburban, rural or postal address in free format
59							text such as,
60
61							3080 28TH AVE N ST PETERSBURG, FL 33713-3810
62							12 1st Avenue N Suite # 2 Somewhere CA 12345 USA
63							C/O JOHN, KENNETH JR POA 744 WIND RIVER DR SYLVANIA, OH 43560-4317
64
65							9 Church Street, Abertillery, Mid Glamorgan NP13 1DA
66							27 Bury Street, Abingdon, Oxfordshire OX14 3QT
67
68							2A LOW ST KEW NSW 2123
69							12/3-5 AUBREY ST MOUNT VICTORIA VICTORIA 3133
70							"OLD REGRET" WENTWORTH FALLS NSW 2782 AUSTRALIA
71							GPO Box K318, HAYMARKET, NSW 2000
72
73
74							and attempts to parse it. If successful, the address is broken
75							down into it's components and useful functions can be performed such as :
76
77							converting upper or lower case values to title case (2A Low St Kew NSW 2123)
78							extracting the addresses individual components (2A,Low,St,KEW,NSW,2123)
79							determining the type of format the address is in ('suburban')
80
81
82							If the address cannot be parsed you have the option of cleaning the address
83							of bad characters, or extracting any portion that was parsed and the portion
84							that failed.
85
86							This module can be used for analysing and improving the quality of
87							lists of residential and postal addresses.
88
89							=head1 DEFINITIONS
90
91							The following terms are used by AddressParse to define the components that
92							can make up an address.
93
94							Pre cursor : C/O MR A Smith...
95							Sub property identifier : Level 1A Unit 2, Apartment B, Lot 12, Suite # 12 ...
96							Property Identifier : 12/66A, 24-34, 2A, 23B/12C, 12/42-44
97							Property name : "Old Regret"
98							Post Box : GP0 Box K123, LPO 2345, RMS 23 ...
99							Road Box : RMB 24A, RMS 234 ...
100							Street Direction: North, SE, Sth. etc
101							Street name : O'Hare, New South Head, The Causeway
102							Street type : Road, Rd., St, Lane, Highway, Crescent, Circuit ...
103							Suburb : Dee Why, St. John's Wood ...
104							Sub country : NSW, New South Wales, ACT, NY, New Jersey AZ ...
105							Post (zip) code : 2062, 34532-1234, SG12A 9ET
106							Country : Australia, UK, US or Canada
107
108
109
110							The main address formats currently supported are as follows. (a ? means the component is optional):
111
112							'suburban' : sub_property(?) property_identifier(?) street street_type suburb subcountry post_code(?)country(?)
113
114							OR for the USA
115							'suburban' : property_identifier(?) street street_type sub_property(?) suburb subcountry post_code(?) country(?)
116
117							'rural' : property_name suburb subcountry post_code(?) country(?)
118							'post_box' : post_box suburb subcountry post_code(?) country(?)
119							'road_box' : road_box street street_type suburb subcountry post_code(?) country(?)
120							'road_box' : road_box suburb subcountry post_code(?) country(?)
121
122							All formats may contain a precursor
123
124							Refer to the component grammar defined in the Lingua::EN::AddressParse::Grammar
125							module for a complete list of combinations.
126
127
128							=head1 METHODS
129
130							=head2 new
131
132							The C method creates an instance of an address object and sets up
133							the grammar used to parse addresses. This must be called before any of the
134							following methods are invoked. Note that the object only needs to be
135							created once, and can be reused with new input data.
136
137							Various setup options may be defined in a hash that is passed as an
138							optional argument to the C method.
139
140							my %args =
141							(
142							country => 'Australia',
143							auto_clean => 1,
144							force_case => 1,
145							abbreviate_subcountry => 1,
146							abbreviated_subcountry_only => 1,
147							force_post_code => 1
148							);
149
150							my $address = Lingua::EN::AddressParse->new(%args);
151
152							=over 4
153
154							=item country
155
156							The country argument must be specified. It determines the possible list of
157							valid sub countries (states, counties etc, defined in the Locale::SubCountry
158							module) and post code formats. Either the full name or abbreviation may be
159							specified. The currently supported country names and codes are:
160
161							AU or Australia
162							CA or Canada
163							GB or United Kingdom
164							US or United States
165
166							All forms of upper/lower case are acceptable in the country's spelling. If a
167							country name is supplied that the module doesn't recognise, it will die.
168
169							=item force_case (optional)
170
171							This option only applies to the C method, see below.
172
173							=item auto_clean (optional)
174
175							When this option is set to a positive value, the input string is
176							'cleaned' to try and normalise bad patterns. The type of cleaning
177							includes
178
179							remove non alphanumeric characters
180							remove full stops
181							remove redundant white space
182							add missing space separators
183							expand abbreviations to more common forms
184							remove bracketed annotations
185							fix badly formed sub property identifiers
186
187							=item abbreviate_subcountry (optional)
188
189							When this option is set to a positive value, the sub country is forced to it's
190							abbreviated form, so "New South Wales" becomes "NSW". If the sub country is
191							already abbreviated then it's value is not altered.
192
193							=item abbreviated_subcountry_only (optional)
194
195							When this option is set to a positive value, only the abbreviated form
196							of sub country is allowed, such as "NSW" and not "New South Wales". This
197							will make parsing quicker and ensure that addresses comply with postal
198							standards that normally permit only abbreviated sub countries.
199
200							It also avoids matching a sub_country name too early, as in the case of 'Port Washington New Jersey'
201							Normally, 'Washington would be consumed as the sub country, but by first converting
202							the address to 'Port Washington NJ' we avoid this problem
203
204
205							=item force_post_code (optional)
206
207							When this option is set to a positive value, the address must contain
208							a post code. If it does not then an error flag is raised. If this option
209							is set to 0 than a post code is optional.
210
211							By default for this option is true.
212
213							=back
214
215							=head2 parse
216
217							$error = $address->parse("12/3-5 AUBREY ST VERMONT VIC 3133");
218
219							The C method takes a single parameter of a text string containing a
220							address. It attempts to parse the address and break it down into the components
221							described below. If the address is parsed successfully, a 0 is returned,
222							otherwise a 1.
223
224							Note that you can successfully parse all the components of an address and still
225							have an error returned. This occurs when you have non matching data following
226							a valid address. To check if the data is unusable, you also need to use the
227							C method to check the address type is 'unknown'
228
229							This method is a prerequisite for all the following methods.
230
231							=head2 components
232
233							%address = $address->components($upper_case_all);
234							$suburb = $address{suburb};
235
236							If the optional argument $upper_case_all is set to a postive value, all components
237							are converted to upper case.
238
239
240							The C method returns all the address components in a hash. The
241							following keys are used for each component:
242
243
244							pre_cursor - such as 'C/O Mr A Smith'
245							po_box_type - such as 'Private Boxes'
246							post_box
247							road_box
248							sub_property_type
249							sub_property_identifier
250							property_identifier
251							property_name
252							level - such as 12th Floor
253							building - such as Tower A
254							street_direction_prefix (such as East, NW, North etc)
255							base_street_name (the name with direction removed, such as "Main" in "East Main St")
256							street_name (the full street name such as "East Main")
257							street_type
258							street_direction_suffix (US only, abbreviated only such as N, SE etc)
259							suburb
260							subcountry
261							post_code
262							country
263
264							If a component has no matching data for a given address, it's values will be
265							set to the empty string.
266
267							Each component is converted to title case, meaning the first letter of each
268							component is set to capitals and the remainder to lower case.
269
270							Proper name capitalisations such as MacNay and O'Brien are observed
271
272							The following components are not converted to title case:
273
274							post_box
275							road_box
276							subcountry
277							post_code
278							country
279							street_direction_suffix
280
281							If your input data is all upper case and you want to retian that format for parsed
282							data, you will need to apply the 'uc' function to each component.
283
284							=head2 case_all
285
286							$correct_casing = $address->case_all;
287
288							The C method does the same thing as the C method except
289							the entire address is returned as a title cased text string.
290
291							If the force_case option was set in the C method above, address case the
292							entire input string, including any unmatched sections after a recognisable address
293							that failed parsing. This option is useful when you know you have invalid data,
294							but you still want to title case what you have.
295
296							=head2 properties
297
298							The C method returns several properties of the address as a hash.
299							The following keys are used for each property -
300
301							type - either suburban ,rural,post_box,road_box,unknown
302							non_matching - any trailing string not part the address
303
304
305							Additional properties can be accessed with the following
306
307							$address->{original_input}
308							$address->{input_string} - string after auto_clean option has been applied
309							$address->{country_code} - abbreviated Country address format (as defined in the C method)
310							$address->{error} - error flag, 0 = good, 1 = error
311							$address->{error_desc} - text to describe the type of parsing error
312							$address->{warning} - warning flag, 0 = good, 1 = warning
313							$address->{warning_desc} - text to to describe the type of parsing warning(s)
314
315							Warnings mean that the address has parsed but there may still be errors within it's components
316
317
318							=head2 report
319
320							Create a formatted text report
321
322							the input string
323							the cleaned input string
324							the country type
325							the address type
326							any non matching part of input string
327							if any parsing errors occurred
328							error description
329							if any parsing warning occurred
330							warning description
331
332							the name and value of each defined component
333
334
335							Returns a string containing a multi line formatted text report
336
337							=head1 DEPENDANCIES
338
339							L, L, L
340
341							=head1 BUGS
342
343							=head1 LIMITATIONS
344
345							Streets such as 'The Esplanade' will return a street of 'The Esplanade' and a
346							street type of null string.
347
348							For US addresses, an ambiguity arises between a street directional suffix and
349							a suburb directional prefix, such as '12 Main St S Springfield CA 92345'. Is it South
350							Main St, or South Springfield? The parser assumes that 'S' belongs to the street
351							description.
352
353							The huge number of character combinations that can form a valid address makes
354							it is impossible to correctly identify them all.
355
356							Valid addresses must contain:
357
358							property address, suburb, subcountry (aka state) in that order.
359
360							This format is widely accepted in Australia and the US.
361
362							UK addresses will often include suburb, town, city and county, formats that
363							are very difficult to parse.
364
365							Property names must be enclosed in single or double quotes like "Old Regret"
366
367							Because of the large combination of possible addresses defined in the grammar,
368							the program is not very fast.
369
370
371							=head1 REFERENCES
372
373							"The Wordsworth Dictionary of Abbreviations & Acronyms" (1997)
374
375							Australian Standard AS4212-1994 "Geographic Information Systems -
376							Data Dictionary for transfer of street addressing information"
377
378							ISO 3166-2:1998, Codes for the representation of names of countries
379							and their subdivisions. Also released as AS/NZS 2632.2:1999
380
381
382							=head1 SEE ALSO
383
384							AddressParse is designed to identify properties, which have a unique physical
385							location. L will also parse addresses for the USA, and can handle
386							locations defined by street intersections, such as: "Hollywood & Vine, Los Angeles, CA"
387							"Mission Street at Valencia Street, San Francisco, CA"
388
389
390							L
391							L
392							L
393							L
394
395							See L
396							for a list of different addressing formats from around the world. And also
397							L
398
399							=head1 REPOSITORY
400
401							L
402
403							=head1 TO DO
404
405							Define grammar for other languages. Hopefully, all that would be needed is
406							to specify a new module with its own grammar, and inherit all the existing
407							methods. I don't have the knowledge of the naming conventions for non-english
408							languages.
409
410							=head1 AUTHOR
411
412							AddressParse was written by Kim Ryan
413
414							=head1 COPYRIGHT AND LICENSE
415
416							Copyright (c) 2015 Kim Ryan. All rights reserved.
417
418							This library is free software; you can redistribute it and/or modify
419							it under the same terms as Perl itself.
420
421
422							=cut
423
424							#------------------------------------------------------------------------------
425
426							package Lingua::EN::AddressParse;
427
428	2			2		89600	use strict;
	2					2
	2					45
429	2			2		6	use Carp;
	2					2
	2					115
430	2			2		7	use warnings;
	2					5
	2					40
431	2			2		829	use Lingua::EN::AddressParse::Grammar;
	2					3
	2					74
432	2			2		1413	use Lingua::EN::NameParse;
	2					67956
	2					99
433	2			2		12	use Parse::RecDescent;
	2					3
	2					9
434
435							our $VERSION = '1.25';
436
437							#------------------------------------------------------------------------------
438							# Create a new instance of an address parsing object. This step is time
439							# consuming and should normally only be called once in your program.
440
441							sub new
442							{
443	4			4	1	1364	my $class = shift;
444	4					9	my %args = @_;
445
446
447	4	50	33			53	unless (defined $args{country} and $args{country} =~
448							/^(AU\|Australia\|GB\|United Kingdom\|US\|United States\|CA\|Canada)$/ )
449							{
450	0					0	croak "Cannot start parser. You must specify a value for the country in the options hash.\nValid options are AUS,GB,US or CA.\n";
451							}
452
453
454	4					6	my $address = {};
455	4					9	bless($address,$class);
456
457							# option defaults
458	4					11	$address->{'force_post_code'} = 1;
459
460							# Add error checking for invalid keys?
461	4					13	foreach my $curr_key (keys %args)
462							{
463	8					13	$address->{$curr_key} = $args{$curr_key};
464							}
465
466							# create the grammar tree (this is country dependent)
467	4					20	my $grammar = Lingua::EN::AddressParse::Grammar::_create($address);
468
469	4					26	$address->{parse} = Parse::RecDescent->new($grammar);
470
471	4					2614337	return ($address);
472							}
473							#------------------------------------------------------------------------------
474							sub parse
475							{
476	15			15	1	7686	my $address = shift;
477	15					25	my ($input_string) = @_;
478
479							# Save original data so we can check effect of auto cleaning
480	15					63	$address->{original_input} = $input_string;
481
482							# Convert to all upper case. This will allow for faster regexp matching in
483							# the grammar tree
484	15					40	$address->{input_string} = uc($input_string);
485
486	15					34	chomp($address->{input_string});
487
488	15					18	my $pre_cursor;
489	15					32	($pre_cursor,$address->{input_string}) = _extract_precursor($address->{input_string});
490
491							# Replace commas (which can be used to chunk sections of addresses) with spaces
492	15					34	$address->{input_string} =~ s/,/ /g;
493
494	15	100				45	if ( $address->{auto_clean} )
495							{
496	12					24	$address->{input_string} = _clean($address);
497							}
498
499	15					15	my $po_box_type;
500	15					34	($po_box_type,$address->{input_string}) = _extract_po_box_type($address->{input_string});
501
502	15					16	my $level;
503	15					33	($level,$address->{input_string}) = _extract_level($address->{input_string});
504
505	15					15	my $building;
506	15					26	($building,$address->{input_string}) = _extract_building($address->{input_string});
507
508
509
510							# Normalise sub property ID, 4/22-24 => UNIT 4 22-24, 4 12 => Unit 4 12
511	15	100	100			94	if ($address->{country_code} ne 'US' and $address->{input_string} =~ /^(\d{1,4}[A-Z]{0,2})[\/\| ](\d+[ \w-].*)$/ )
512							{
513	2					10	$address->{input_string} = "UNIT $1 $2";
514							}
515
516							# We need to add a trailing space to the input string. This is because the grammar
517							# tree expects a terminator (the space) fro every production, optionally followed
518							# by other productions or any final non matching text.
519							# This space will be removed in the _assemble function
520	15					21	$address->{input_string} .= ' ';
521
522	15					33	$address = _assemble($address,$pre_cursor,$po_box_type,$level,$building);
523	15					32	_validate($address);
524
525
526	15					48	return($address,$address->{error});
527							}
528
529							#------------------------------------------------------------------------------
530							# Apply correct capitalisation to each component of an address
531
532							sub components
533							{
534	17			17	1	80	my $address = shift;
535	17					16	my ($uc_all) = @_;
536
537	17					18	my %orig_components = %{ $address->{components} };
	17					132
538
539	17					26	my (%cased_components);
540	17					49	foreach my $curr_key ( keys %orig_components )
541							{
542	323					191	my $cased_value;
543	323					232	my $curr_value = $orig_components{$curr_key};
544
545	323	50				359	if ($uc_all)
546							{
547	0					0	$cased_components{$curr_key} = uc($curr_value);
548	0					0	next;
549							}
550
551
552	323	100				607	if ( $curr_key =~ /^(base_street_name\|street_name\|street_type\|suburb\|property_name\|sub_property\|pre_cursor\|po_box_type\|level\|building)/ )
		100
553							{
554
555	187	100	100			405	if ( $curr_key eq 'street_name' and$curr_value =~ /^US HIGHWAY (.*)/ )
		100
556							{
557	1					5	$cased_value = "US Highway $1";
558							}
559							elsif ( $curr_key eq 'sub_property_identifier' )
560							{
561							# UNIT, APT ... 12D etc
562
563	17					28	my @words = split(/ /,$curr_value);
564	17					16	my @cased_words;
565							my $cased_string;
566	17					23	foreach my $word (@words)
567							{
568	5					3	my $cased_word;
569	5	50	66			31	if ( $word =~ /^\d{1,3}(ST\|ND\|RD\|TH)$/)
		50
570							{
571							# ordinal component, as in 3rd Floor
572	0					0	$cased_word = lc($word);
573							}
574							elsif ( length($word) > 1 and $word !~ /\d/ )
575							{
576							# only need to title case words such as UNIT
577	0					0	$cased_word = Lingua::EN::NameParse::case_surname($word);
578							}
579							else
580							{
581	5					6	$cased_word = $word;
582							}
583	5					7	push(@cased_words,$cased_word);
584
585							}
586	17					34	$cased_value = join(' ',@cased_words);
587							}
588							else
589							{
590	169	100				164	if ($curr_value)
591							{
592							# Surnames can be used for street's or suburbs so this method
593							# will give correct capitalisation for most cases
594	71					126	$cased_value = Lingua::EN::NameParse::case_surname($curr_value);
595							}
596							else
597							{
598	98					82	$cased_value = '';
599							}
600							}
601							}
602							# retain street_direction,sub country and countries capitalisation, usually uppercase
603							elsif ($curr_key =~ /street_direction/)
604							{
605	34	100	66			135	if (length($curr_value) == 1 or length($curr_value) == 2)
		50
606							{
607							# N, SE etc is capitalised
608	1					2	$cased_value =$curr_value;
609							}
610							elsif (length($curr_value) > 2)
611							{
612	0					0	$cased_value = Lingua::EN::NameParse::case_surname($curr_value);
613							}
614							else
615							{
616	33					34	$cased_value = '';
617							}
618							}
619							# retain sub country and countries capitalisation, as usually uppercase
620							else
621							{
622	102					103	$cased_value = uc($curr_value);
623							}
624	323					1518	$cased_components{$curr_key} = $cased_value;
625							}
626	17					324	return(%cased_components);
627							}
628							#------------------------------------------------------------------------------
629							# Apply correct capitalisation to an entire address
630
631							sub case_all
632							{
633	0			0	1	0	my $address = shift;
634
635	0					0	my @cased_address;
636
637	0	0				0	unless ( $address->{properties}{type} eq 'unknown' )
638							{
639
640							# Hash of of lists, indicating the order that address components are assembled in.
641							# Each list element is itself the name of the key value in an address object.
642
643	0					0	my %component_order=
644							(
645							'rural' => [ qw/pre_cursor property_name suburb subcountry post_code country/],
646							'post_box'=> [ qw/pre_cursor post_box suburb po_box_type subcountry post_code country/ ],
647							'road_box'=> [ qw/pre_cursor road_box street_name street_type suburb subcountry post_code country/ ]
648
649							);
650	0	0				0	if ( $address->{country} eq 'US' )
651							{
652	0					0	$component_order{'suburban'} = [ qw/pre_cursor property_identifier street_name street_type street_direction_suffix building level sub_property_type sub_property_identifier suburb subcountry post_code country/];
653							}
654							else
655							{
656	0					0	$component_order{'suburban'} = [ qw/pre_cursor building level sub_property_type sub_property_identifier property_identifier street_name street_type suburb subcountry post_code country/ ];
657							}
658
659	0					0	my %component_vals = $address->components;
660	0					0	my @order = @{ $component_order{$address->{properties}{type} } };
	0					0
661
662	0					0	foreach my $component ( @order )
663							{
664							# As some components such as property name are optional, they will appear
665							# in the order array but may or may not have have a value, so check
666							# for undefined values
667	0	0				0	if ( $component_vals{$component} )
668							{
669	0					0	push(@cased_address,$component_vals{$component});
670							}
671							}
672							}
673
674	0	0	0			0	if ( $address->{error} and $address->{force_case} )
675							{
676							# Despite errors, try to name case non-matching section. As the format
677							# of this section is unknown, surname case will provide the best
678							# approximation
679	0					0	push(@cased_address,&Lingua::EN::NameParse::case_surname($address->{properties}{non_matching}));
680							}
681
682	0					0	return(join(' ',@cased_address));
683							}
684							#------------------------------------------------------------------------------
685							sub properties
686							{
687	2			2	1	8	my $address = shift;
688	2					2	return(%{ $address->{properties} });
	2					10
689							}
690
691							#------------------------------------------------------------------------------
692							# Create a text report to standard output listing
693							# - the input string,
694							# - the name of each defined component
695							# - any non matching component
696
697							sub report
698							{
699	0			0	1	0	my $address = shift;
700
701	0					0	my $report = '';
702
703	0					0	_fmt_report_line(\$report,"Original Input",$address->{original_input});
704	0					0	_fmt_report_line(\$report,"Cleaned Input",$address->{input_string});
705	0					0	_fmt_report_line(\$report,"Country address format",$address->{country_code});
706
707	0					0	my %props = $address->properties;
708	0	0				0	if ( $props{type} )
709							{
710	0					0	_fmt_report_line(\$report,"Address type",$props{type});
711							}
712
713
714	0					0	_fmt_report_line(\$report,"Non matching part",$props{non_matching});
715	0					0	_fmt_report_line(\$report,"Error",$address->{error});
716	0					0	_fmt_report_line(\$report,"Error descriptions",$address->{error_desc});
717	0					0	_fmt_report_line(\$report,"Warning",$address->{error});
718	0					0	_fmt_report_line(\$report,"Warning description",$address->{warning_desc});
719	0					0	_fmt_report_line(\$report,"Case all",$address->case_all);
720
721
722	0					0	_fmt_report_line(\$report,"COMPONENTS",'');
723	0					0	my %comps = $address->components;
724	0					0	foreach my $comp ( sort keys %comps)
725							{
726	0	0				0	if (defined($comps{$comp}) )
727							{
728	0					0	_fmt_report_line(\$report,$comp,$comps{$comp});
729							}
730							}
731
732	0					0	return($report);
733							}
734
735
736							#------------------------------------------------------------------------------
737
738							# PRIVATE METHODS
739
740							#------------------------------------------------------------------------------
741
742							sub _assemble
743							{
744
745	15			15		17	my $address = shift;
746	15					19	my ($pre_cursor,$po_box_type,$level,$building) = @_;
747
748							# Parse the address according to the rules defined in the AddressParse::Grammar module,
749							# $::RD_TRACE = 1; # for debugging RecDescent output
750							# Use Parse::RecDescent to do the parsing. 'full_address' is a label for the complete grammar tree
751	15					127	my $parsed_address = $address->{parse}->full_address($address->{input_string});
752
753							# Place components into a separate hash, so they can be easily returned to the user to inspect and modify
754	15					169049	$address->{components} = ();
755
756	15	100				90	if ($pre_cursor)
757							{
758	1					4	$address->{components}{'pre_cursor'} = $pre_cursor;
759							}
760							else
761							{
762	14					35	$address->{components}{'pre_cursor'} = '';
763							}
764
765	15	100				31	if ($level)
766							{
767	1					2	$address->{components}{'level'} = $level;
768							}
769							else
770							{
771	14					24	$address->{components}{'level'} = '';
772							}
773
774	15	100				41	if ($building)
775							{
776	1					3	$address->{components}{'building'} = $building;
777							}
778							else
779							{
780	14					29	$address->{components}{'building'} = '';
781							}
782
783
784	15	100				28	if ($po_box_type)
785							{
786	1					2	$address->{components}{'po_box_type'} = $po_box_type;
787							}
788							else
789							{
790	14					22	$address->{components}{'po_box_type'} = '';
791							}
792
793
794	15					28	$address->{components}{post_box} = '';
795	15	100				39	if ( $parsed_address->{post_box} )
796							{
797	1					2	$address->{components}{post_box} = $parsed_address->{post_box};
798							}
799
800	15					23	$address->{components}{road_box} = '';
801	15	50				33	if ( $parsed_address->{road_box} )
802							{
803	0					0	$address->{components}{road_box} = $parsed_address->{road_box};
804							}
805
806	15					25	$address->{components}{property_name} = '';
807	15	100				34	if ( $parsed_address->{property_name} )
808							{
809	1					2	$address->{components}{property_name} = $parsed_address->{property_name};
810							}
811
812	15					38	$address->{components}{sub_property_identifier} = '';
813	15					26	$address->{components}{sub_property_type} = '';
814
815	15	100				35	if ( $parsed_address->{sub_property} )
816							{
817	4	50				49	if ($parsed_address->{sub_property} =~ /^(#\|[A-Z]{1,}) (.*)$/ )
		0
818							{
819							# Such as Unit 24, # 4A etc
820	4					15	$address->{components}{sub_property_type} = $1;
821	4					10	$address->{components}{sub_property_identifier} = $2;
822							}
823							elsif ($parsed_address->{sub_property} =~ /^(\d\w\w) (.*)$/ )
824							{
825							# Such as 1st Floor
826	0					0	$address->{components}{sub_property_type} = $2;
827	0					0	$address->{components}{sub_property_identifier} = $1;
828							}
829							}
830
831	15					23	$address->{components}{property_identifier} = '';
832	15	100				37	if ( $parsed_address->{property_identifier} )
833							{
834	13					27	$address->{components}{property_identifier} = $parsed_address->{property_identifier};
835							}
836
837	15					21	$address->{components}{base_street_name} = '';
838	15					25	$address->{components}{street_direction_prefix} = '';
839	15					38	my ($street_direction,$base_street_name) = _get_street_direction($parsed_address->{street_name});
840	15	50				26	if ($street_direction )
841							{
842	0					0	$address->{components}{street_direction_prefix} = $street_direction;
843	0					0	$address->{components}{base_street_name} = $base_street_name;
844							}
845							else
846							{
847	15					24	$address->{components}{base_street_name} = $parsed_address->{street_name};
848							}
849
850	15					22	$address->{components}{street_name} = '';
851	15					22	$address->{components}{street_type} = '';
852	15	100				28	if ( $parsed_address->{street_name} )
853							{
854							# Streets such as 'The Corso' will parse as street_name = 'The' and street_type = 'Corso', so seperate out
855	13	50				29	if ( $parsed_address->{street_name} eq 'THE ' )
856							{
857	0					0	$address->{components}{street_name} = 'THE ' . $parsed_address->{street_type};
858							}
859							else
860							{
861	13					21	$address->{components}{street_name} = $parsed_address->{street_name};
862	13					18	$address->{components}{street_type} = $parsed_address->{street_type};
863							}
864							}
865
866
867	15					25	$address->{components}{street_direction_suffix} = '';
868	15	100				34	if ( $parsed_address->{street_direction_suffix} )
869							{
870	1					5	$address->{components}{street_direction_suffix} = $parsed_address->{street_direction_suffix};
871							}
872
873
874	15					40	$address->{components}{suburb} = '';
875	15	50				37	if ( $parsed_address->{suburb} )
876							{
877	15					24	$address->{components}{suburb} = $parsed_address->{suburb};
878							}
879
880	15					23	$address->{components}{subcountry} = '';
881	15	50				33	if ( $parsed_address->{subcountry} )
882							{
883	15					17	my $sub_country = $parsed_address->{subcountry};
884
885							# Force sub country to abbreviated form, South Australia becomes SA, Michigan become MI etc
886	15	100				33	if ($address->{abbreviate_subcountry})
887							{
888	7					45	my $country = Locale::SubCountry->new($address->{country});
889	7					106	my $code = $country->code($sub_country);
890	7	100				497	if ( $code ne 'unknown' )
891							{
892	1					3	$address->{components}{subcountry} = $code;
893							}
894							# sub country already abbreviated
895							else
896							{
897	6					20	$address->{components}{subcountry} = $sub_country;
898							}
899							}
900							else
901							{
902	8					15	$address->{components}{subcountry} = $sub_country;
903							}
904							}
905
906	15					20	$address->{components}{post_code} = '';
907	15	50				34	if ( $parsed_address->{post_code} )
908							{
909	15					24	$address->{components}{post_code} = $parsed_address->{post_code};
910							}
911
912	15					19	$address->{components}{country} = '';
913	15	100				31	if ( $parsed_address->{country} )
914							{
915	1					2	$address->{components}{country} = $parsed_address->{country};
916							}
917
918	15					30	$address->{properties} = ();
919
920	15					31	$address->{properties}{non_matching} = '';
921	15	100				28	if ( $parsed_address->{non_matching} )
922							{
923	1					2	$address->{properties}{non_matching} = $parsed_address->{non_matching};
924							}
925	15					24	$address->{properties}{type} = $parsed_address->{type};
926
927	15					32	_trim_trailing_space($address);
928
929	15					59	return($address);
930							}
931
932							#-------------------------------------------------------------------------------
933							#
934							sub _get_street_direction
935							{
936	15			15		23	my ($street_name) = @_;
937
938	15					19	my $street_direction;
939							my $base_street_name;
940
941	15	100				32	unless ($street_name)
942							{
943	2					4	return;
944							}
945
946	13					65	my @words = split(/\s/,$street_name);
947	13	100				34	if (@words > 1)
948							{
949	4	50				15	if ( $words[0] =~ /^(N\|NE\|NW\|E\|S\|SE\|SW\|W\|NORTH\|EAST\|SOUTH\|WEST\|NTH\|STH)$/ )
950							{
951	0					0	$street_direction = $1;
952	0					0	shift(@words);
953	0					0	$base_street_name = join(' ',@words);
954							}
955							}
956	13					29	return($street_direction,$base_street_name);
957
958							}
959
960							#------------------------------------------------------------------------------
961							# Check for several different types of syntax errors
962
963							sub _validate
964							{
965	15			15		19	my $address = shift;
966	15					24	$address->{error} = 0;
967	15					21	$address->{error_desc} = '';
968	15					21	$address->{warning} = 0;
969	15					20	$address->{warning_desc} = '';
970
971	15	100				35	if ( $address->{properties}{non_matching} )
972							{
973	1					2	$address->{error} = 1;
974	1					4	$address->{error_desc} = 'non matching section : ' . $address->{properties}{non_matching};
975							}
976							else
977							{
978	14	50				27	if ( $address->{properties}{type} eq 'unknown' )
979							{
980	0					0	$address->{error} = 1;
981	0					0	$address->{error_desc} .= 'unknown address format';
982							}
983							else
984							{
985	14	50	33			82	if ($address->{force_post_code} and not $address->{components}{post_code})
986							{
987	0					0	$address->{error} = 1;
988	0					0	$address->{error_desc} .= ':no post code';
989							}
990
991							# illegal characters found, note a '#' can appear as an abbreviation for number in USA addresses
992	14	50				50	if ( $address->{input_string} =~ /[^"A-Z0-9'\-\.,&#\/ ]/ )
993							{
994							# Note, if auto_clean is on, illegal characters will have been removed
995							# for second parsing and no error flag or message reported
996	0					0	$address->{error} = 1;
997	0					0	$address->{error_desc} .= ':illegal chars';
998							}
999	14	100				36	if ( $address->{properties}{type} eq 'suburban' )
1000							{
1001	12					12	my $street = $address->{components}{street_name};
1002	12	100				34	if ($street !~ /\d/ )
1003							{
1004							# Not an ordinal or single letter street type
1005	11	100				24	if ( _check_vowel($address->{components}{base_street_name}) )
1006							{
1007							# street name must have a vowel sound,
1008	1					2	$address->{warning} = 1;
1009	1					3	$address->{warning_desc} .= ";no vowel sound in street word : $address->{components}{base_street_name}";
1010							}
1011							}
1012							}
1013
1014	14	50				25	if ( _check_vowel($address->{components}{suburb}) )
1015							{
1016	0					0	$address->{warning} = 1;
1017	0					0	$address->{warning_desc} .= ";no vowel sound in suburb word : $address->{components}{suburb}";
1018							}
1019							}
1020							}
1021							}
1022							#-------------------------------------------------------------------------------
1023							# Purge the input string of illegal or redundant characters.
1024							# Correct malformed patterns
1025
1026							sub _clean
1027							{
1028	12			12		16	my $address = shift;
1029
1030	12					21	my ($input) = $address->{input_string};
1031
1032							# Remove annotations enclosed in brackets, such as 1 Smith St (Cnr Brown St)
1033	12					15	$input =~ s\|$.*$\|\|;
1034
1035							# Normalise half house numbers, sucvh as 12.5 to 12 1/2. This is needed now before full stops are stripped out
1036	12					17	$input =~ s\|^(\d{1,4})\.5 \|$1 1/2 \|;
1037
1038							# strip full stops, remove illegal characters
1039							# & can be part of property name
1040							# hash (#) may denote number for USA address
1041							# quotes can occur as property name delimiters
1042
1043	12					24	$input =~ s\|[^A-Za-z0-9&#/'" -]\|\|go;
1044
1045							# remove repeating, leading and trailing spaces
1046	12					18	$input =~ s\| +\| \|go ;
1047	12					13	$input =~ s\|^ \|\|;
1048	12					18	$input =~ s\| $\|\|;
1049
1050
1051							# Expand abbreviations that are too short
1052
1053	12					18	$input =~ s/LAKE ST (GEORGE\|CLAIR)/LAKE SAINT $1/; # otherwise St gets consumed to early as 'Street'
1054	12					17	$input =~ s\| CSEWY \| CAUSEWAY \|;
1055
1056							# street types
1057
1058
1059	12					13	$input =~ s\|^FCTR?Y \|FACTORY \|;
1060	12					50	$input =~ s\|^FACT?R?Y? \|FACTORY \|;
1061
1062	12					15	$input =~ s\|LVL \|LEVEL \|; # sub property identifiers
1063	12					14	$input =~ s\|^UN? \|UNIT \|;
1064	12					11	$input =~ s\|^U(\d+)\|UNIT $1\|;
1065
1066							# Fix badly formed number dividers such as home unit format of 14/ 12 becomes 14/12, 2- 7A becomes 2-7A
1067	12					11	$input =~ s\|/ \|/\|;
1068	12					12	$input =~ s\| /\|/\|;
1069	12					12	$input =~ s\|- \|-\|;
1070	12					14	$input =~ s\| -\|-\|;
1071
1072							# Remove redundant spaces in property identifiers, 21 B Smith St becomes 21B Smith St
1073
1074	12	50				40	if ( $input !~ /^\d+ [A-Z] (ST\|AVE)/ )
1075							{
1076							# Don't remove space before single letter streets such as 21 B Street
1077	12	50				31	if ( $address->{country_code} eq 'US' )
1078							{
1079							# Note cannot use N,E,S,W as they can be street direction prefix, as in 1 E MAIN STREET
1080							# Assume that the direction prefix is the more likely case
1081	0					0	$input =~ s\|^(\d+) ([A-DF-MO-RT-VX-Z] )\|$1$2\|;
1082							}
1083							else
1084							{
1085	12					31	$input =~ s\|^(\d+) ([A-Z] )\|$1$2\|;
1086							}
1087							}
1088
1089							# Add or remove spaces in sub property identifiers
1090	12	50				24	if ( $address->{country_code} eq 'US' )
1091							{
1092							# Fix US sub property identifiers that appear after street name and type
1093							# add space between # and the number so #2 becomes '# 2'
1094	0					0	$input =~ s\| #(\d)\| # $1\|;
1095	0					0	$input =~ s\| #([A-Z])\| # $1\|;
1096	0					0	$input =~ s\| (APT)(\d)\| $1 $2\|i;
1097
1098							# remove redundnant space so # 34 B becomes # 34B
1099	0					0	$input =~ s\| # (\d+) (\w) \| # $1$2 \|;
1100
1101							# remove redundnant '#'
1102	0					0	$input =~ s\| APT #\| APT \|;
1103							}
1104							else
1105							{
1106							# Add a space to separate sub property type from number, UNIT2 becomes UNIT 2
1107	12					20	$input =~ s/^(UNIT\|LOT\|APT\|SHOP)(\d)/$1 $2/;
1108							}
1109
1110							# Remove redundant slash or dash
1111							# Unit 1B/22, becomes Unit 1B 22, Flat 2-12 becomes Flat 2 12
1112							# TO DO, add \|# at start
1113	12					21	$input =~ s/^([A-Z]{2,}) (\d+[A-Z]?)[\/-]/$1 $2 /;
1114							# Unit J1/ 39 becomes Unit J1 39
1115	12					18	$input =~ s/^([A-Z]{2,}) ([A-Z]\d{0,3})[\/-]/$1 $2 /;
1116
1117
1118							# remove dash that is not from a sequence, such as D-5 or 22-A
1119	12					15	$input =~ s\|([A-Z])-(\d)\|$1$2\|;
1120	12					16	$input =~ s\|(\d)-([A-Z])\|$1$2\|;
1121
1122
1123	12					18	return($input);
1124							}
1125							#-------------------------------------------------------------------------------
1126							# Remove any "care of" type of precursor from the main address
1127							# such as: C/O BRAKEFIELD BETTY S PO BOX 214 GULF HAMMOCK, FL 32639-0214
1128							# It will later be saved as an attribute in the address object
1129
1130							sub _extract_precursor
1131							{
1132	15			15		19	my ($input) = @_;
1133	15					19	my ($pre_cursor,$address_start,$address_end);
1134
1135	15	100				73	if ($input =~ m{^(C/O.?\|ATTN.?) (\d+\|PO BOX)( .*)})
1136							{
1137	1					5	$pre_cursor = $1;
1138	1					2	$address_start = $2;
1139	1					2	$address_end = $3;
1140	1					5	return($pre_cursor, $address_start . $address_end);
1141							}
1142							else
1143							{
1144	14					37	return('',$input)
1145							}
1146							}
1147							#-------------------------------------------------------------------------------
1148							# Remove any level or floor info such as:
1149							# 12 Smith St Floor 2
1150							# Level 22 Suite 3 12 Main St
1151							# It will later be saved as an attribute in the address object
1152
1153							sub _extract_level
1154							{
1155	15			15		14	my ($input) = @_;
1156	15					13	my ($level);
1157
1158	15	100	66			131	if
			66
1159							(
1160							# Level info could be at start of string so first space is optional
1161							$input =~ / ?(\d{1,2}(ST\|ND\|RD\|TH) (FLOOR\|FLR\|FL) )/ or
1162							$input =~ / ?(LEVEL (\d{1,2}\|[GM])[\/ -])/ or
1163							$input =~ / ?((FLOOR\|FLR\|FL) \d{1,2}[\/ -])/
1164							)
1165							{
1166	1					2	$level = $1;
1167	1					2	$level =~ s\|/\|\|;
1168	1					2	$level =~ s\|-\|\|;
1169	1					17	$input =~ s/$level//;
1170							}
1171
1172	15					40	return($level,$input);
1173							}
1174							#-------------------------------------------------------------------------------
1175							# Remove any building info such as:
1176							# Building 2 Level 12 123 Smith St
1177							# 12 Main St Tower A Level 2
1178							# It will later be saved as an attribute in the address object
1179
1180							sub _extract_building
1181							{
1182	15			15		17	my ($input) = @_;
1183	15					17	my ($building);
1184
1185	15					46	my $bld = qr{BLOCK\|BLDG?\|BUILDING\|TOWER};
1186
1187	15	100	66			486	if
			66
1188							(
1189							$input =~ / ?(($bld) ([A-Z]{1,2}\|\d+) )/ or # BLD 12 or AA
1190							$input =~ / ?(($bld) \d{1,3}[A-Z] )/ or # BLD 32C
1191							$input =~ / ?(($bld) [A-Z]\d{1,3} )/ # BLD C12
1192							)
1193							{
1194	1					2	$building = $1;
1195	1					1	$building =~ s\|/\|\|;
1196	1					2	$building =~ s\|-\|\|;
1197	1					7	$input =~ s/$building//;
1198							}
1199
1200	15					49	return($building,$input);
1201							}
1202
1203							#-------------------------------------------------------------------------------
1204							# Remove any description that follows the suburb after the main address
1205							# such as: PO BOX 1305 BIBRA LAKE PRIVATE BOXES WA 6965"
1206							# It will be saved as an address attribute
1207
1208							sub _extract_po_box_type
1209							{
1210	15			15		21	my ($input) = @_;
1211	15					19	my ($po_box_type,$address_start,$address_end);
1212
1213	15	100				42	if ($input =~ /^(.?) (PRIVATE BOXES)( .)$/ )
1214							{
1215	1					3	$address_start = $1;
1216	1					2	$po_box_type = $2;
1217	1					2	$address_end = $3;
1218	1					4	return($po_box_type, $address_start . $address_end);
1219							}
1220							else
1221							{
1222	14					31	return('',$input)
1223							}
1224							}
1225							#------------------------------------------------------------------------------
1226							# For correct matching, the grammar of each component must include the
1227							# trailing space that separates it from any following word. This should
1228							# now be removed from each component
1229
1230							sub _trim_trailing_space
1231							{
1232	15			15		17	my ($address) = @_;
1233
1234	15					17	foreach my $key (keys %{ $address->{components} } )
	15					79
1235							{
1236	285	100				386	if ($address->{components}{$key} )
1237							{
1238	110					211	$address->{components}{$key} =~ s/ $//g;
1239							}
1240							}
1241							}
1242							#------------------------------------------------------------------------------
1243
1244							sub _fmt_report_line
1245							{
1246	0			0		0	my ($report_ref,$label,$value) = @_;
1247							# To DO $$ ??
1248	0					0	$$report_ref .= sprintf("%-23.23s '%s'\n",$label,$value);
1249							}
1250							#------------------------------------------------------------------------------
1251
1252							sub _check_vowel
1253							{
1254	25			25		34	my ($str) = @_;
1255
1256	25					49	my @words = split(/ /,$str);
1257	25					38	foreach my $word (@words)
1258							{
1259							# Saint, Mount, Junior, Senior (as in Martin Luther KIng Snr)
1260	34	100	66			160	if ( length($word) > 1 and $word !~ /[AEIOUY]\|ST\|MT\|JN?R\|SN?R/ )
1261							{
1262	1					4	return(1);
1263							}
1264							}
1265	24					65	return(0);
1266							}
1267							#------------------------------------------------------------------------------
1268
1269							return(1);