| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
=head1 NAME |
|
2
|
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
Lingua::EN::AddressParse - extract components of a street address, presented as a text string |
|
4
|
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
use Lingua::EN::AddressParse; |
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
my %args = |
|
10
|
|
|
|
|
|
|
( |
|
11
|
|
|
|
|
|
|
country => 'US', |
|
12
|
|
|
|
|
|
|
auto_clean => 1, |
|
13
|
|
|
|
|
|
|
force_case => 1, |
|
14
|
|
|
|
|
|
|
abbreviate_subcountry => 0, |
|
15
|
|
|
|
|
|
|
abbreviated_subcountry_only => 1, |
|
16
|
|
|
|
|
|
|
force_post_code => 0 |
|
17
|
|
|
|
|
|
|
); |
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
my $address = Lingua::EN::AddressParse->new(%args); |
|
20
|
|
|
|
|
|
|
$error = $address->parse("40 1/2 N OLD MASSACHUSETTS AVE APT 3B Washington Valley Washington 98100: HOLD MAIL"); |
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
print $address->report; |
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
Country address format 'US' |
|
25
|
|
|
|
|
|
|
Address type 'suburban' |
|
26
|
|
|
|
|
|
|
Non matching part 'HOLD MAIL ' |
|
27
|
|
|
|
|
|
|
Error '1' |
|
28
|
|
|
|
|
|
|
Error descriptions 'non matching section : HOLD MAIL ' |
|
29
|
|
|
|
|
|
|
Warning '1' |
|
30
|
|
|
|
|
|
|
Warning description '' |
|
31
|
|
|
|
|
|
|
Case all '40 1/2 N Old Massachusetts Ave Apt 3B Washington Valley WA 98100' |
|
32
|
|
|
|
|
|
|
COMPONENTS '' |
|
33
|
|
|
|
|
|
|
base_street_name 'Old Massachusetts' |
|
34
|
|
|
|
|
|
|
post_code '98100' |
|
35
|
|
|
|
|
|
|
property_identifier '40 1/2' |
|
36
|
|
|
|
|
|
|
street_direction_prefix 'N' |
|
37
|
|
|
|
|
|
|
street_name 'N Old Massachusetts' |
|
38
|
|
|
|
|
|
|
street_type 'Ave' |
|
39
|
|
|
|
|
|
|
sub_property_identifier '3B' |
|
40
|
|
|
|
|
|
|
sub_property_type 'Apt' |
|
41
|
|
|
|
|
|
|
subcountry 'WA' |
|
42
|
|
|
|
|
|
|
suburb 'Washington Valley' |
|
43
|
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
%address_components = $address->components; |
|
45
|
|
|
|
|
|
|
print $address_components{sub_property_type}; # Apt |
|
46
|
|
|
|
|
|
|
print $address_components{sub_property_identifier}; # 3B |
|
47
|
|
|
|
|
|
|
print $address_components{property_identifier}; # 40 1/2 |
|
48
|
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
%address_properties = $address->properties; |
|
50
|
|
|
|
|
|
|
print $address_properties{type}; # suburban |
|
51
|
|
|
|
|
|
|
print $address_properties{non_matching}; # : HOLD MAIL |
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
$correct_casing = $address->case_all; |
|
54
|
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
57
|
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
This module takes as input a suburban, rural or postal address in free format |
|
59
|
|
|
|
|
|
|
text such as, |
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
3080 28TH AVE N ST PETERSBURG, FL 33713-3810 |
|
62
|
|
|
|
|
|
|
12 1st Avenue N Suite # 2 Somewhere CA 12345 USA |
|
63
|
|
|
|
|
|
|
C/O JOHN, KENNETH JR POA 744 WIND RIVER DR SYLVANIA, OH 43560-4317 |
|
64
|
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
9 Church Street, Abertillery, Mid Glamorgan NP13 1DA |
|
66
|
|
|
|
|
|
|
27 Bury Street, Abingdon, Oxfordshire OX14 3QT |
|
67
|
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
2A LOW ST KEW NSW 2123 |
|
69
|
|
|
|
|
|
|
12/3-5 AUBREY ST MOUNT VICTORIA VICTORIA 3133 |
|
70
|
|
|
|
|
|
|
"OLD REGRET" WENTWORTH FALLS NSW 2782 AUSTRALIA |
|
71
|
|
|
|
|
|
|
GPO Box K318, HAYMARKET, NSW 2000 |
|
72
|
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
and attempts to parse it. If successful, the address is broken |
|
75
|
|
|
|
|
|
|
down into its components and useful functions can be performed such as: |
|
76
|
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
converting upper or lower case values to title case (2A Low St Kew NSW 2123) |
|
78
|
|
|
|
|
|
|
extracting the address's individual components (2A,Low,St,KEW,NSW,2123) |
|
79
|
|
|
|
|
|
|
determining the type of format the address is in ('suburban') |
|
80
|
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
If the address cannot be parsed you have the option of cleaning the address |
|
83
|
|
|
|
|
|
|
of bad characters, or extracting any portion that was parsed and the portion |
|
84
|
|
|
|
|
|
|
that failed. |
|
85
|
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
This module can be used for analysing and improving the quality of |
|
87
|
|
|
|
|
|
|
lists of residential and postal addresses. |
|
88
|
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
=head1 DEFINITIONS |
|
90
|
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
The following terms are used by AddressParse to define the components that |
|
92
|
|
|
|
|
|
|
can make up an address. |
|
93
|
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
Pre cursor : C/O MR A Smith... |
|
95
|
|
|
|
|
|
|
Sub property identifier : Level 1A Unit 2, Apartment B, Lot 12, Suite # 12 ... |
|
96
|
|
|
|
|
|
|
Property Identifier : 12/66A, 24-34, 2A, 23B/12C, 12/42-44 |
|
97
|
|
|
|
|
|
|
Property name : "Old Regret" |
|
98
|
|
|
|
|
|
|
Post Box : GPO Box K123, LPO 2345, RMS 23 ... |
|
99
|
|
|
|
|
|
|
Road Box : RMB 24A, RMS 234 ... |
|
100
|
|
|
|
|
|
|
Street Direction: North, SE, Sth. etc |
|
101
|
|
|
|
|
|
|
Street name : O'Hare, New South Head, The Causeway |
|
102
|
|
|
|
|
|
|
Street type : Road, Rd., St, Lane, Highway, Crescent, Circuit ... |
|
103
|
|
|
|
|
|
|
Suburb : Dee Why, St. John's Wood ... |
|
104
|
|
|
|
|
|
|
Sub country : NSW, New South Wales, ACT, NY, New Jersey AZ ... |
|
105
|
|
|
|
|
|
|
Post (zip) code : 2062, 34532-1234, SG12A 9ET |
|
106
|
|
|
|
|
|
|
Country : Australia, UK, US or Canada |
|
107
|
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
The main address formats currently supported are as follows. (a ? means the component is optional): |
|
111
|
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
'suburban' : sub_property(?) property_identifier(?) street street_type suburb subcountry post_code(?) country(?) |
|
113
|
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
OR for the USA |
|
115
|
|
|
|
|
|
|
'suburban' : property_identifier(?) street street_type sub_property(?) suburb subcountry post_code(?) country(?) |
|
116
|
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
'rural' : property_name suburb subcountry post_code(?) country(?) |
|
118
|
|
|
|
|
|
|
'post_box' : post_box suburb subcountry post_code(?) country(?) |
|
119
|
|
|
|
|
|
|
'road_box' : road_box street street_type suburb subcountry post_code(?) country(?) |
|
120
|
|
|
|
|
|
|
'road_box' : road_box suburb subcountry post_code(?) country(?) |
|
121
|
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
All formats may contain a precursor |
|
123
|
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
Refer to the component grammar defined in the Lingua::EN::AddressParse::Grammar |
|
125
|
|
|
|
|
|
|
module for a complete list of combinations. |
|
126
|
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
=head1 METHODS |
|
129
|
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
=head2 new |
|
131
|
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
The C<new> method creates an instance of an address object and sets up |
|
133
|
|
|
|
|
|
|
the grammar used to parse addresses. This must be called before any of the |
|
134
|
|
|
|
|
|
|
following methods are invoked. Note that the object only needs to be |
|
135
|
|
|
|
|
|
|
created once, and can be reused with new input data. |
|
136
|
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
Various setup options may be defined in a hash that is passed as an |
|
138
|
|
|
|
|
|
|
optional argument to the C<new> method. |
|
139
|
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
my %args = |
|
141
|
|
|
|
|
|
|
( |
|
142
|
|
|
|
|
|
|
country => 'Australia', |
|
143
|
|
|
|
|
|
|
auto_clean => 1, |
|
144
|
|
|
|
|
|
|
force_case => 1, |
|
145
|
|
|
|
|
|
|
abbreviate_subcountry => 1, |
|
146
|
|
|
|
|
|
|
abbreviated_subcountry_only => 1, |
|
147
|
|
|
|
|
|
|
force_post_code => 1 |
|
148
|
|
|
|
|
|
|
); |
|
149
|
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
my $address = Lingua::EN::AddressParse->new(%args); |
|
151
|
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
=over 4 |
|
153
|
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
=item country |
|
155
|
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
The country argument must be specified. It determines the possible list of |
|
157
|
|
|
|
|
|
|
valid sub countries (states, counties etc, defined in the Locale::SubCountry |
|
158
|
|
|
|
|
|
|
module) and post code formats. Either the full name or abbreviation may be |
|
159
|
|
|
|
|
|
|
specified. The currently supported country names and codes are: |
|
160
|
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
AU or Australia |
|
162
|
|
|
|
|
|
|
CA or Canada |
|
163
|
|
|
|
|
|
|
GB or United Kingdom |
|
164
|
|
|
|
|
|
|
US or United States |
|
165
|
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
All forms of upper/lower case are acceptable in the country's spelling. If a |
|
167
|
|
|
|
|
|
|
country name is supplied that the module doesn't recognise, it will die. |
|
168
|
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
=item force_case (optional) |
|
170
|
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
This option only applies to the C<case_all> method, see below. |
|
172
|
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
=item auto_clean (optional) |
|
174
|
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
When this option is set to a positive value, the input string is |
|
176
|
|
|
|
|
|
|
'cleaned' to try and normalise bad patterns. The type of cleaning |
|
177
|
|
|
|
|
|
|
includes |
|
178
|
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
remove non alphanumeric characters |
|
180
|
|
|
|
|
|
|
remove full stops |
|
181
|
|
|
|
|
|
|
remove redundant white space |
|
182
|
|
|
|
|
|
|
add missing space separators |
|
183
|
|
|
|
|
|
|
expand abbreviations to more common forms |
|
184
|
|
|
|
|
|
|
remove bracketed annotations |
|
185
|
|
|
|
|
|
|
fix badly formed sub property identifiers |
|
186
|
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
=item abbreviate_subcountry (optional) |
|
188
|
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
When this option is set to a positive value, the sub country is forced to its |
|
190
|
|
|
|
|
|
|
abbreviated form, so "New South Wales" becomes "NSW". If the sub country is |
|
191
|
|
|
|
|
|
|
already abbreviated then its value is not altered. |
|
192
|
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
=item abbreviated_subcountry_only (optional) |
|
194
|
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
When this option is set to a positive value, only the abbreviated form |
|
196
|
|
|
|
|
|
|
of sub country is allowed, such as "NSW" and not "New South Wales". This |
|
197
|
|
|
|
|
|
|
will make parsing quicker and ensure that addresses comply with postal |
|
198
|
|
|
|
|
|
|
standards that normally permit only abbreviated sub countries. |
|
199
|
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
It also avoids matching a sub_country name too early, as in the case of 'Port Washington New Jersey' |
|
201
|
|
|
|
|
|
|
Normally, 'Washington' would be consumed as the sub country, but by first converting |
|
202
|
|
|
|
|
|
|
the address to 'Port Washington NJ' we avoid this problem |
|
203
|
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
=item force_post_code (optional) |
|
206
|
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
When this option is set to a positive value, the address must contain |
|
208
|
|
|
|
|
|
|
a post code. If it does not then an error flag is raised. If this option |
|
209
|
|
|
|
|
|
|
is set to 0 then a post code is optional. |
|
210
|
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
The default for this option is true. |
|
212
|
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
=back |
|
214
|
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
=head2 parse |
|
216
|
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
$error = $address->parse("12/3-5 AUBREY ST VERMONT VIC 3133"); |
|
218
|
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
The C<parse> method takes a single parameter of a text string containing an |
|
220
|
|
|
|
|
|
|
address. It attempts to parse the address and break it down into the components |
|
221
|
|
|
|
|
|
|
described below. If the address is parsed successfully, a 0 is returned, |
|
222
|
|
|
|
|
|
|
otherwise a 1. |
|
223
|
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
Note that you can successfully parse all the components of an address and still |
|
225
|
|
|
|
|
|
|
have an error returned. This occurs when you have non matching data following |
|
226
|
|
|
|
|
|
|
a valid address. To check if the data is unusable, you also need to use the |
|
227
|
|
|
|
|
|
|
C<properties> method to check whether the address type is 'unknown' |
|
228
|
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
This method is a prerequisite for all the following methods. |
|
230
|
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
=head2 components |
|
232
|
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
%address = $address->components($upper_case_all); |
|
234
|
|
|
|
|
|
|
$suburb = $address{suburb}; |
|
235
|
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
If the optional argument $upper_case_all is set to a positive value, all components |
|
237
|
|
|
|
|
|
|
are converted to upper case. |
|
238
|
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
The C<components> method returns all the address components in a hash. The |
|
241
|
|
|
|
|
|
|
following keys are used for each component: |
|
242
|
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
pre_cursor - such as 'C/O Mr A Smith' |
|
245
|
|
|
|
|
|
|
po_box_type - such as 'Private Boxes' |
|
246
|
|
|
|
|
|
|
post_box |
|
247
|
|
|
|
|
|
|
road_box |
|
248
|
|
|
|
|
|
|
sub_property_type |
|
249
|
|
|
|
|
|
|
sub_property_identifier |
|
250
|
|
|
|
|
|
|
property_identifier |
|
251
|
|
|
|
|
|
|
property_name |
|
252
|
|
|
|
|
|
|
level - such as 12th Floor |
|
253
|
|
|
|
|
|
|
building - such as Tower A |
|
254
|
|
|
|
|
|
|
street_direction_prefix (such as East, NW, North etc) |
|
255
|
|
|
|
|
|
|
base_street_name (the name with direction removed, such as "Main" in "East Main St") |
|
256
|
|
|
|
|
|
|
street_name (the full street name such as "East Main") |
|
257
|
|
|
|
|
|
|
street_type |
|
258
|
|
|
|
|
|
|
street_direction_suffix (US only, abbreviated only such as N, SE etc) |
|
259
|
|
|
|
|
|
|
suburb |
|
260
|
|
|
|
|
|
|
subcountry |
|
261
|
|
|
|
|
|
|
post_code |
|
262
|
|
|
|
|
|
|
country |
|
263
|
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
If a component has no matching data for a given address, its value will be |
|
265
|
|
|
|
|
|
|
set to the empty string. |
|
266
|
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
Each component is converted to title case, meaning the first letter of each |
|
268
|
|
|
|
|
|
|
component is set to capitals and the remainder to lower case. |
|
269
|
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
Proper name capitalisations such as MacNay and O'Brien are observed |
|
271
|
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
The following components are not converted to title case: |
|
273
|
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
post_box |
|
275
|
|
|
|
|
|
|
road_box |
|
276
|
|
|
|
|
|
|
subcountry |
|
277
|
|
|
|
|
|
|
post_code |
|
278
|
|
|
|
|
|
|
country |
|
279
|
|
|
|
|
|
|
street_direction_suffix |
|
280
|
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
If your input data is all upper case and you want to retain that format for parsed |
|
282
|
|
|
|
|
|
|
data, you will need to apply the 'uc' function to each component. |
|
283
|
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
=head2 case_all |
|
285
|
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
$correct_casing = $address->case_all; |
|
287
|
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
The C<case_all> method does the same thing as the C<components> method except |
|
289
|
|
|
|
|
|
|
the entire address is returned as a title cased text string. |
|
290
|
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
If the force_case option was set in the C<new> method above, address case the |
|
292
|
|
|
|
|
|
|
entire input string, including any unmatched sections after a recognisable address |
|
293
|
|
|
|
|
|
|
that failed parsing. This option is useful when you know you have invalid data, |
|
294
|
|
|
|
|
|
|
but you still want to title case what you have. |
|
295
|
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
=head2 properties |
|
297
|
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
The C<properties> method returns several properties of the address as a hash. |
|
299
|
|
|
|
|
|
|
The following keys are used for each property - |
|
300
|
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
type - either suburban, rural, post_box, road_box or unknown |
|
302
|
|
|
|
|
|
|
non_matching - any trailing string that is not part of the address |
|
303
|
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
Additional properties can be accessed with the following |
|
306
|
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
$address->{original_input} |
|
308
|
|
|
|
|
|
|
$address->{input_string} - string after auto_clean option has been applied |
|
309
|
|
|
|
|
|
|
$address->{country_code} - abbreviated Country address format (as defined in the C<new> method) |
|
310
|
|
|
|
|
|
|
$address->{error} - error flag, 0 = good, 1 = error |
|
311
|
|
|
|
|
|
|
$address->{error_desc} - text to describe the type of parsing error |
|
312
|
|
|
|
|
|
|
$address->{warning} - warning flag, 0 = good, 1 = warning |
|
313
|
|
|
|
|
|
|
$address->{warning_desc} - text to describe the type of parsing warning(s) |
|
314
|
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
Warnings mean that the address has parsed but there may still be errors within its components |
|
316
|
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
=head2 report |
|
319
|
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
Create a formatted text report |
|
321
|
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
the input string |
|
323
|
|
|
|
|
|
|
the cleaned input string |
|
324
|
|
|
|
|
|
|
the country type |
|
325
|
|
|
|
|
|
|
the address type |
|
326
|
|
|
|
|
|
|
any non matching part of input string |
|
327
|
|
|
|
|
|
|
if any parsing errors occurred |
|
328
|
|
|
|
|
|
|
error description |
|
329
|
|
|
|
|
|
|
if any parsing warning occurred |
|
330
|
|
|
|
|
|
|
warning description |
|
331
|
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
the name and value of each defined component |
|
333
|
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
Returns a string containing a multi line formatted text report |
|
336
|
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
=head1 DEPENDENCIES |
|
338
|
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
L<Lingua::EN::NameParse>, L<Locale::SubCountry>, L<Parse::RecDescent> |
|
340
|
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
=head1 BUGS |
|
342
|
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
=head1 LIMITATIONS |
|
344
|
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
Streets such as 'The Esplanade' will return a street of 'The Esplanade' and a |
|
346
|
|
|
|
|
|
|
street type of null string. |
|
347
|
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
For US addresses, an ambiguity arises between a street directional suffix and |
|
349
|
|
|
|
|
|
|
a suburb directional prefix, such as '12 Main St S Springfield CA 92345'. Is it South |
|
350
|
|
|
|
|
|
|
Main St, or South Springfield? The parser assumes that 'S' belongs to the street |
|
351
|
|
|
|
|
|
|
description. |
|
352
|
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
The huge number of character combinations that can form a valid address makes |
|
354
|
|
|
|
|
|
|
it impossible to correctly identify them all. |
|
355
|
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
Valid addresses must contain: |
|
357
|
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
property address, suburb, subcountry (aka state) in that order. |
|
359
|
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
This format is widely accepted in Australia and the US. |
|
361
|
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
UK addresses will often include suburb, town, city and county, formats that |
|
363
|
|
|
|
|
|
|
are very difficult to parse. |
|
364
|
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
Property names must be enclosed in single or double quotes like "Old Regret" |
|
366
|
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
Because of the large combination of possible addresses defined in the grammar, |
|
368
|
|
|
|
|
|
|
the program is not very fast. |
|
369
|
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
=head1 REFERENCES |
|
372
|
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
"The Wordsworth Dictionary of Abbreviations & Acronyms" (1997) |
|
374
|
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
Australian Standard AS4212-1994 "Geographic Information Systems - |
|
376
|
|
|
|
|
|
|
Data Dictionary for transfer of street addressing information" |
|
377
|
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
ISO 3166-2:1998, Codes for the representation of names of countries |
|
379
|
|
|
|
|
|
|
and their subdivisions. Also released as AS/NZS 2632.2:1999 |
|
380
|
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
=head1 SEE ALSO |
|
383
|
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
AddressParse is designed to identify properties, which have a unique physical |
|
385
|
|
|
|
|
|
|
location. L<Geo::StreetAddress::US> will also parse addresses for the USA, and can handle |
|
386
|
|
|
|
|
|
|
locations defined by street intersections, such as: "Hollywood & Vine, Los Angeles, CA" |
|
387
|
|
|
|
|
|
|
"Mission Street at Valencia Street, San Francisco, CA" |
|
388
|
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
L |
|
391
|
|
|
|
|
|
|
L |
|
392
|
|
|
|
|
|
|
L |
|
393
|
|
|
|
|
|
|
L |
|
394
|
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
See L |
|
396
|
|
|
|
|
|
|
for a list of different addressing formats from around the world. And also |
|
397
|
|
|
|
|
|
|
L |
|
398
|
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
=head1 REPOSITORY |
|
400
|
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
L |
|
402
|
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
=head1 TO DO |
|
404
|
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
Define grammar for other languages. Hopefully, all that would be needed is |
|
406
|
|
|
|
|
|
|
to specify a new module with its own grammar, and inherit all the existing |
|
407
|
|
|
|
|
|
|
methods. I don't have the knowledge of the naming conventions for non-English |
|
408
|
|
|
|
|
|
|
languages. |
|
409
|
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
=head1 AUTHOR |
|
411
|
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
AddressParse was written by Kim Ryan |
|
413
|
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
|
415
|
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
Copyright (c) 2015 Kim Ryan. All rights reserved. |
|
417
|
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
|
419
|
|
|
|
|
|
|
it under the same terms as Perl itself. |
|
420
|
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
=cut |
|
423
|
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
|
425
|
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
package Lingua::EN::AddressParse; |
|
427
|
|
|
|
|
|
|
|
|
428
|
2
|
|
|
2
|
|
89600
|
use strict; |
|
|
2
|
|
|
|
|
2
|
|
|
|
2
|
|
|
|
|
45
|
|
|
429
|
2
|
|
|
2
|
|
6
|
use Carp; |
|
|
2
|
|
|
|
|
2
|
|
|
|
2
|
|
|
|
|
115
|
|
|
430
|
2
|
|
|
2
|
|
7
|
use warnings; |
|
|
2
|
|
|
|
|
5
|
|
|
|
2
|
|
|
|
|
40
|
|
|
431
|
2
|
|
|
2
|
|
829
|
use Lingua::EN::AddressParse::Grammar; |
|
|
2
|
|
|
|
|
3
|
|
|
|
2
|
|
|
|
|
74
|
|
|
432
|
2
|
|
|
2
|
|
1413
|
use Lingua::EN::NameParse; |
|
|
2
|
|
|
|
|
67956
|
|
|
|
2
|
|
|
|
|
99
|
|
|
433
|
2
|
|
|
2
|
|
12
|
use Parse::RecDescent; |
|
|
2
|
|
|
|
|
3
|
|
|
|
2
|
|
|
|
|
9
|
|
|
434
|
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
our $VERSION = '1.25'; |
|
436
|
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
|
438
|
|
|
|
|
|
|
# Create a new instance of an address parsing object. This step is time |
|
439
|
|
|
|
|
|
|
# consuming and should normally only be called once in your program. |
|
440
|
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
sub new
{
    # Constructor for an address parsing object.
    #
    # Arguments : the class name, followed by a list of option => value pairs.
    #             The 'country' option is mandatory; every supplied option is
    #             copied onto the object under its own key.
    # Returns   : a blessed hash reference holding the options and, under the
    #             key 'parse', the Parse::RecDescent parser built from the
    #             grammar.
    # Dies      : croaks when 'country' is missing or is not one of the
    #             spellings accepted by the check below.

    my $class = shift;
    my %args = @_;

    # NOTE(review): this match is case sensitive and only accepts the exact
    # spellings listed in the pattern.
    unless (defined $args{country} and $args{country} =~
        /^(AU|Australia|GB|United Kingdom|US|United States|CA|Canada)$/ )
    {
        # The abbreviation accepted for Australia is 'AU' (the message used
        # to advertise 'AUS', which the check above rejects).
        croak "Cannot start parser. You must specify a value for the country in the options hash.\nValid options are AU,GB,US or CA.\n";
    }

    my $address = {};
    bless($address,$class);

    # option defaults, may be overridden by the caller's options below
    $address->{'force_post_code'} = 1;

    # Add error checking for invalid keys?
    foreach my $curr_key (keys %args)
    {
        $address->{$curr_key} = $args{$curr_key};
    }

    # create the grammar tree (this is country dependent); the grammar
    # builder is handed the object itself so it can see the options
    my $grammar = Lingua::EN::AddressParse::Grammar::_create($address);

    $address->{parse} = Parse::RecDescent->new($grammar);

    return ($address);
}
|
473
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
|
474
|
|
|
|
|
|
|
sub parse
{
    # Parse one free format address string and record the outcome on the
    # object.
    #
    # Arguments : the address object, then the text of the address.
    # Returns   : the list (object, error flag). When called in scalar
    #             context the comma expression yields its last element, so
    #             the caller receives just the error flag.
    # Side effects : sets 'original_input' and 'input_string' on the object;
    #             the object is then replaced by whatever _assemble returns
    #             and handed to _validate.

    my ($address, $input_string) = @_;

    # Keep the untouched text so the effect of auto cleaning can be checked
    $address->{original_input} = $input_string;

    # Work in upper case throughout. This will allow for faster regexp
    # matching in the grammar tree
    $address->{input_string} = uc($input_string);
    chomp( $address->{input_string} );

    # Peel any precursor off the string before further processing
    ( my $pre_cursor, $address->{input_string} ) =
        _extract_precursor( $address->{input_string} );

    # Commas (which can be used to chunk sections of addresses) become spaces
    $address->{input_string} =~ s/,/ /g;

    $address->{input_string} = _clean($address) if $address->{auto_clean};

    # Each extractor returns the piece it recognised plus the remaining text
    ( my $po_box_type, $address->{input_string} ) =
        _extract_po_box_type( $address->{input_string} );

    ( my $level, $address->{input_string} ) =
        _extract_level( $address->{input_string} );

    ( my $building, $address->{input_string} ) =
        _extract_building( $address->{input_string} );

    # Normalise sub property ID, 4/22-24 => UNIT 4 22-24, 4 12 => Unit 4 12
    # (not applied to the US format)
    if (    $address->{country_code} ne 'US'
        and $address->{input_string} =~ /^(\d{1,4}[A-Z]{0,2})[\/| ](\d+[ \w-].*)$/ )
    {
        $address->{input_string} = join( ' ', 'UNIT', $1, $2 );
    }

    # A trailing space is appended because the grammar tree expects a
    # terminator (the space) for every production, optionally followed by
    # other productions or any final non matching text.
    # This space will be removed in the _assemble function
    $address->{input_string} .= ' ';

    $address = _assemble( $address, $pre_cursor, $po_box_type, $level, $building );
    _validate($address);

    return ( $address, $address->{error} );
}
|
528
|
|
|
|
|
|
|
|
|
529
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
|
530
|
|
|
|
|
|
|
# Apply correct capitalisation to each component of an address |
|
531
|
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
sub components
{
    # Return a hash of the parsed address components with capitalisation
    # applied.
    #
    # Arguments : the address object, then an optional flag. When the flag is
    #             true every component value is simply upper cased.
    # Returns   : a hash (as a list) keyed by component name.
    #
    # Without the flag, casing depends on the component name:
    #   * names starting with one of the title case prefixes below are passed
    #     through Lingua::EN::NameParse::case_surname, with special handling
    #     for 'US HIGHWAY ...' street names and for sub property identifiers
    #   * street direction components keep 1 or 2 letter values unchanged
    #   * everything else is upper cased

    my ( $address, $uc_all ) = @_;

    # Work from a copy of the parsed components
    my %parsed = %{ $address->{components} };
    my %result;

    COMPONENT:
    foreach my $name ( keys %parsed )
    {
        my $raw = $parsed{$name};

        if ($uc_all)
        {
            $result{$name} = uc($raw);
            next COMPONENT;
        }

        my $cased;

        if ( $name =~ /^(base_street_name|street_name|street_type|suburb|property_name|sub_property|pre_cursor|po_box_type|level|building)/ )
        {
            if ( $name eq 'street_name' and $raw =~ /^US HIGHWAY (.*)/ )
            {
                $cased = "US Highway $1";
            }
            elsif ( $name eq 'sub_property_identifier' )
            {
                # UNIT, APT ... 12D etc, handled one word at a time
                my @pieces;
                foreach my $word ( split( / /, $raw ) )
                {
                    my $piece =
                          $word =~ /^\d{1,3}(ST|ND|RD|TH)$/
                        ? lc($word)    # ordinal component, as in 3rd Floor
                        : ( length($word) > 1 and $word !~ /\d/ )
                        ? Lingua::EN::NameParse::case_surname($word)    # words such as UNIT
                        : $word;
                    push( @pieces, $piece );
                }
                $cased = join( ' ', @pieces );
            }
            else
            {
                # Surnames can be used for streets or suburbs, so surname
                # casing gives correct capitalisation for most cases
                $cased = $raw ? Lingua::EN::NameParse::case_surname($raw) : '';
            }
        }
        elsif ( $name =~ /street_direction/ )
        {
            my $size = length($raw);

            # N, SE etc stay as they are, longer directions are title cased
            $cased =
                  ( $size == 1 or $size == 2 ) ? $raw
                : $size > 2 ? Lingua::EN::NameParse::case_surname($raw)
                :             '';
        }
        else
        {
            # remaining components, such as sub country and country, are
            # usually uppercase
            $cased = uc($raw);
        }

        $result{$name} = $cased;
    }

    return (%result);
}
|
628
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
|
629
|
|
|
|
|
|
|
# Build the complete address as a single string, with its components placed
# in the order that is conventional for the address type.
#
# Called as a method on an address object. Nothing is emitted for the
# components when the address type is 'unknown'. Component values come from
# $address->components, and components with an empty or undefined value are
# left out. When the object has an error recorded and force_case is set, the
# non matching section is appended after being passed through
# Lingua::EN::NameParse::case_surname.
#
# Returns the selected parts joined by single spaces ('' when there are none).

sub case_all
{
    my ($address) = @_;

    my @address_parts;
    my $type = $address->{properties}{type};

    if ( $type ne 'unknown' )
    {
        # US format lists the building, level and sub property after the
        # street, every other format lists them before the property identifier
        my @suburban_order = $address->{country} eq 'US'
            ? qw/pre_cursor property_identifier street_name street_type street_direction_suffix building level sub_property_type sub_property_identifier suburb subcountry post_code country/
            : qw/pre_cursor building level sub_property_type sub_property_identifier property_identifier street_name street_type suburb subcountry post_code country/;

        # Hash of lists, indicating the order that address components are assembled in.
        # Each list element is itself the name of a key in the components hash.
        my %order_for =
        (
            rural    => [ qw/pre_cursor property_name suburb subcountry post_code country/ ],
            post_box => [ qw/pre_cursor post_box suburb po_box_type subcountry post_code country/ ],
            road_box => [ qw/pre_cursor road_box street_name street_type suburb subcountry post_code country/ ],
            suburban => \@suburban_order,
        );

        my %value_of = $address->components;
        my @order = @{ $order_for{$type} };

        # Optional components such as property name appear in the order list
        # but may have no value, so only keep the ones that are set
        push(@address_parts, grep { $_ } map { $value_of{$_} } @order);
    }

    if ( $address->{error} and $address->{force_case} )
    {
        # Despite errors, try to name case the non matching section. As the
        # format of this section is unknown, surname case will provide the
        # best approximation
        my $unmatched = $address->{properties}{non_matching};
        push(@address_parts, Lingua::EN::NameParse::case_surname($unmatched));
    }

    return(join(' ',@address_parts));
}
|
684
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
|
685
|
|
|
|
|
|
|
# Return the address object's properties (such as 'type' and 'non_matching')
# as a flat list of key/value pairs, suitable for assigning to a hash.

sub properties
{
    my ($address) = @_;

    my $properties_ref = $address->{properties};
    return(%{ $properties_ref });
}
|
690
|
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
|
692
|
|
|
|
|
|
|
# Create a text report describing the address object. The report is returned
# as a string (nothing is printed here) and lists
#  - the original and the cleaned input string, and the country address format
#  - the address type, when one is set
#  - any non matching part
#  - the error and warning flags with their descriptions
#  - the assembled address as produced by case_all
#  - the name and value of each defined component, sorted by name

sub report
{
    my $address = shift;

    my $report = '';

    _fmt_report_line(\$report,"Original Input",$address->{original_input});
    _fmt_report_line(\$report,"Cleaned Input",$address->{input_string});
    _fmt_report_line(\$report,"Country address format",$address->{country_code});

    my %props = $address->properties;
    if ( $props{type} )
    {
        _fmt_report_line(\$report,"Address type",$props{type});
    }

    _fmt_report_line(\$report,"Non matching part",$props{non_matching});
    _fmt_report_line(\$report,"Error",$address->{error});
    _fmt_report_line(\$report,"Error descriptions",$address->{error_desc});
    # Show the warning flag itself; this line used to repeat the error flag
    _fmt_report_line(\$report,"Warning",$address->{warning});
    _fmt_report_line(\$report,"Warning description",$address->{warning_desc});
    _fmt_report_line(\$report,"Case all",$address->case_all);

    _fmt_report_line(\$report,"COMPONENTS",'');
    my %comps = $address->components;
    foreach my $comp ( sort keys %comps)
    {
        # Components without a defined value are left out of the report
        if (defined($comps{$comp}) )
        {
            _fmt_report_line(\$report,$comp,$comps{$comp});
        }
    }

    return($report);
}
|
734
|
|
|
|
|
|
|
|
|
735
|
|
|
|
|
|
|
|
|
736
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
|
737
|
|
|
|
|
|
|
|
|
738
|
|
|
|
|
|
|
# PRIVATE METHODS |
|
739
|
|
|
|
|
|
|
|
|
740
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
|
741
|
|
|
|
|
|
|
|
|
742
|
|
|
|
|
|
|
# Parse the object's input string with the grammar and load the result into
# the address object's 'components' and 'properties' hashes.
#
# Arguments after the address object are ($pre_cursor, $po_box_type, $level,
# $building), supplied by the caller (presumably the values taken out of the
# input by the _extract_* routines - confirm against the caller).
#
# Every component key is created on each call, holding '' when there is no
# value. The one exception is base_street_name, which takes the parsed street
# name as is (possibly undefined) when no direction prefix is found.
#
# Returns the address object.

sub _assemble
{
    my ($address,$pre_cursor,$po_box_type,$level,$building) = @_;

    # Parse the address according to the rules defined in the AddressParse::Grammar module,
    # $::RD_TRACE = 1; # for debugging RecDescent output
    # Use Parse::RecDescent to do the parsing. 'full_address' is a label for the complete grammar tree
    my $parsed = $address->{parse}->full_address($address->{input_string});

    # Place components into a separate hash, so they can be easily returned to the user to inspect and modify
    my $components = $address->{components} = {};

    # Values handed in by the caller
    $components->{pre_cursor}  = $pre_cursor  || '';
    $components->{level}       = $level       || '';
    $components->{building}    = $building    || '';
    $components->{po_box_type} = $po_box_type || '';

    # Copy the named fields straight from the parse result, using '' for
    # any field that has no value
    my $copy_fields = sub
    {
        foreach my $field (@_)
        {
            $components->{$field} = $parsed->{$field} || '';
        }
    };

    $copy_fields->(qw/post_box road_box property_name/);

    # Split the sub property into its type and identifier
    $components->{sub_property_identifier} = '';
    $components->{sub_property_type} = '';
    if ( my $sub_property = $parsed->{sub_property} )
    {
        if ( my ($type,$identifier) = $sub_property =~ /^(#|[A-Z]{1,}) (.*)$/ )
        {
            # Such as Unit 24, # 4A etc
            $components->{sub_property_type} = $type;
            $components->{sub_property_identifier} = $identifier;
        }
        elsif ( my ($ordinal,$kind) = $sub_property =~ /^(\d\w\w) (.*)$/ )
        {
            # Such as 1st Floor
            $components->{sub_property_type} = $kind;
            $components->{sub_property_identifier} = $ordinal;
        }
    }

    $copy_fields->('property_identifier');

    # Separate any leading direction (N, SOUTH etc) from the street name
    $components->{street_direction_prefix} = '';
    my ($direction,$base_name) = _get_street_direction($parsed->{street_name});
    if ( $direction )
    {
        $components->{street_direction_prefix} = $direction;
        $components->{base_street_name} = $base_name;
    }
    else
    {
        $components->{base_street_name} = $parsed->{street_name};
    }

    $components->{street_name} = '';
    $components->{street_type} = '';
    if ( my $street_name = $parsed->{street_name} )
    {
        if ( $street_name eq 'THE ' )
        {
            # Streets such as 'The Corso' will parse as street_name = 'The' and
            # street_type = 'Corso', so put them together as the street name
            $components->{street_name} = 'THE ' . $parsed->{street_type};
        }
        else
        {
            $components->{street_name} = $street_name;
            $components->{street_type} = $parsed->{street_type};
        }
    }

    $copy_fields->(qw/street_direction_suffix suburb/);

    $components->{subcountry} = '';
    if ( my $sub_country = $parsed->{subcountry} )
    {
        my $subcountry_value = $sub_country;

        # Force sub country to abbreviated form, South Australia becomes SA, Michigan become MI etc
        if ( $address->{abbreviate_subcountry} )
        {
            my $country = Locale::SubCountry->new($address->{country});
            my $code = $country->code($sub_country);

            # A code of 'unknown' is treated as the sub country already
            # being abbreviated, so the parsed value is kept
            $subcountry_value = $code if $code ne 'unknown';
        }
        $components->{subcountry} = $subcountry_value;
    }

    $copy_fields->(qw/post_code country/);

    $address->{properties} =
    {
        non_matching => $parsed->{non_matching} || '',
        type         => $parsed->{type},
    };

    _trim_trailing_space($address);

    return($address);
}
|
931
|
|
|
|
|
|
|
|
|
932
|
|
|
|
|
|
|
#------------------------------------------------------------------------------- |
|
933
|
|
|
|
|
|
|
# Split a leading direction word from a street name.
#
# Takes the street name as parsed (words separated by white space). If the
# name has more than one word and the first is one of N, NE, NW, E, S, SE, SW,
# W, NORTH, EAST, SOUTH, WEST, NTH or STH, returns that direction and the
# remaining words joined by single spaces.
#
# Returns (undef,undef) when there is no direction prefix, and an empty list
# when the street name is empty or otherwise false.

sub _get_street_direction
{
    my ($street_name) = @_;

    return if not $street_name;

    my ($first_word,@other_words) = split(/\s/,$street_name);

    # A lone word is always the street name, even when it looks like a direction
    if ( @other_words and $first_word =~ /^(N|NE|NW|E|S|SE|SW|W|NORTH|EAST|SOUTH|WEST|NTH|STH)$/ )
    {
        my $direction = $1;
        return($direction,join(' ',@other_words));
    }

    return(undef,undef);
}
|
959
|
|
|
|
|
|
|
|
|
960
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
|
961
|
|
|
|
|
|
|
# Check the assembled address for several different types of syntax errors.
#
# Resets and then sets these fields of the address object:
#   error, error_desc     - a non matching section, an unknown address format,
#                           a missing post code (only when force_post_code is
#                           set) or illegal characters in the input string
#   warning, warning_desc - a street or suburb word with no vowel sound
#
# A non matching section is reported on its own, and an unknown address
# format stops any further checks.

sub _validate
{
    my ($address) = @_;

    @{$address}{qw/error error_desc warning warning_desc/} = (0,'',0,'');

    if ( $address->{properties}{non_matching} )
    {
        $address->{error} = 1;
        $address->{error_desc} = 'non matching section : ' . $address->{properties}{non_matching};
    }
    elsif ( $address->{properties}{type} eq 'unknown' )
    {
        $address->{error} = 1;
        $address->{error_desc} .= 'unknown address format';
    }
    else
    {
        if ( $address->{force_post_code} and not $address->{components}{post_code} )
        {
            $address->{error} = 1;
            $address->{error_desc} .= ':no post code';
        }

        # illegal characters found, note a '#' can appear as an abbreviation for number in USA addresses
        if ( $address->{input_string} =~ /[^"A-Z0-9'\-\.,\/ ]/ )
        {
            # Note, if auto_clean is on, illegal characters will have been removed
            # for second parsing and no error flag or message reported
            $address->{error} = 1;
            $address->{error_desc} .= ':illegal chars';
        }

        # A street name containing a digit is an ordinal or single letter
        # street type, so the vowel check only applies to the other names
        if
        (
            $address->{properties}{type} eq 'suburban' and
            $address->{components}{street_name} !~ /\d/ and
            _check_vowel($address->{components}{base_street_name})
        )
        {
            # street name must have a vowel sound,
            $address->{warning} = 1;
            $address->{warning_desc} .= ';no vowel sound in street word : ' . $address->{components}{base_street_name};
        }

        if ( _check_vowel($address->{components}{suburb}) )
        {
            $address->{warning} = 1;
            $address->{warning_desc} .= ';no vowel sound in suburb word : ' . $address->{components}{suburb};
        }
    }
}
|
1022
|
|
|
|
|
|
|
#------------------------------------------------------------------------------- |
|
1023
|
|
|
|
|
|
|
# Purge the input string of illegal or redundant characters and correct
# malformed patterns.
#
# Reads input_string and country_code from the address object and returns the
# cleaned string; the object itself is not modified. The substitutions are
# order dependent, each one works on the result of those before it.

sub _clean
{
    my ($address) = @_;

    my $is_us   = $address->{country_code} eq 'US';
    my $cleaned = $address->{input_string};

    # Topicalise the working copy so each substitution below applies to it
    for ($cleaned)
    {
        # Remove annotations enclosed in brackets, such as 1 Smith St (Cnr Brown St)
        s|\(.*\)||;

        # Normalise half house numbers, such as 12.5 to 12 1/2. This is needed now before full stops are stripped out
        s|^(\d{1,4})\.5 |$1 1/2 |;

        # Remove every character that is not a letter, digit, slash, quote,
        # space or dash, which strips full stops and commas
        # NOTE(review): earlier notes said '&' can be part of a property name
        # and '#' may denote number in a USA address, but both are removed
        # here, so the '#' rules in the US branch below look unreachable
        # unless this class is widened - confirm the intent
        s|[^A-Za-z0-9/'" -]||g;

        # remove repeating, leading and trailing spaces
        s| +| |g;
        s|^ ||;
        s| $||;

        # Expand abbreviations that are too short
        s/LAKE ST (GEORGE|CLAIR)/LAKE SAINT $1/; # otherwise St gets consumed too early as 'Street'
        s| CSEWY | CAUSEWAY |;

        s|^FCTR?Y |FACTORY |;
        s|^FACT?R?Y? |FACTORY |;

        # sub property identifiers
        s|LVL |LEVEL |;
        s|^UN? |UNIT |;
        s|^U(\d+)|UNIT $1|;

        # Fix badly formed number dividers such as home unit format of 14/ 12 becomes 14/12, 2- 7A becomes 2-7A
        s|/ |/|;
        s| /|/|;
        s|- |-|;
        s| -|-|;

        # Remove redundant spaces in property identifiers, 21 B Smith St becomes 21B Smith St
        # Don't remove space before single letter streets such as 21 B Street
        unless ( /^\d+ [A-Z] (ST|AVE)/ )
        {
            if ( $is_us )
            {
                # Note cannot use N,E,S,W as they can be street direction prefix, as in 1 E MAIN STREET
                # Assume that the direction prefix is the more likely case
                s|^(\d+) ([A-DF-MO-RT-VX-Z] )|$1$2|;
            }
            else
            {
                s|^(\d+) ([A-Z] )|$1$2|;
            }
        }

        # Add or remove spaces in sub property identifiers
        if ( $is_us )
        {
            # Fix US sub property identifiers that appear after street name and type
            # add space between # and the number so #2 becomes '# 2'
            s| #(\d)| # $1|;
            s| #([A-Z])| # $1|;
            s| (APT)(\d)| $1 $2|i;

            # remove redundant space so # 34 B becomes # 34B
            s| # (\d+) (\w) | # $1$2 |;

            # remove redundant '#'
            s| APT #| APT |;
        }
        else
        {
            # Add a space to separate sub property type from number, UNIT2 becomes UNIT 2
            s/^(UNIT|LOT|APT|SHOP)(\d)/$1 $2/;
        }

        # Remove redundant slash or dash
        # Unit 1B/22, becomes Unit 1B 22, Flat 2-12 becomes Flat 2 12
        # TO DO, add |# at start
        s/^([A-Z]{2,}) (\d+[A-Z]?)[\/-]/$1 $2 /;
        # Unit J1/ 39 becomes Unit J1 39
        s/^([A-Z]{2,}) ([A-Z]\d{0,3})[\/-]/$1 $2 /;

        # remove dash that is not from a sequence, such as D-5 or 22-A
        s|([A-Z])-(\d)|$1$2|;
        s|(\d)-([A-Z])|$1$2|;
    }

    return($cleaned);
}
|
1125
|
|
|
|
|
|
|
#------------------------------------------------------------------------------- |
|
1126
|
|
|
|
|
|
|
# Remove any "care of" type of precursor from the main address
# such as: C/O BRAKEFIELD BETTY S PO BOX 214 GULF HAMMOCK, FL 32639-0214
#
# A precursor starts the string with C/O or ATTN and runs up to the first
# word that is either a number or PO BOX. Returns the precursor and the rest
# of the address, or ('',$input) when the input has no precursor.

sub _extract_precursor
{
    my ($input) = @_;

    my ($pre_cursor,$address_start,$address_end) =
        $input =~ m{^(C/O.*?|ATTN.*?) (\d+|PO BOX)( .*)};

    # The first capture is always defined when the pattern matched
    return('',$input) unless defined($pre_cursor);

    return($pre_cursor, $address_start . $address_end);
}
|
1147
|
|
|
|
|
|
|
#------------------------------------------------------------------------------- |
|
1148
|
|
|
|
|
|
|
# Remove any level or floor info such as:
# 12 Smith St Floor 2
# Level 22 Suite 3 12 Main St
#
# Returns the level text and the input with that text removed. When no level
# is found the level is undefined and the input is returned unchanged.
# A slash or dash that ended the level is dropped from the level text, and
# only the text that remains is removed from the input.

sub _extract_level
{
    my ($input) = @_;

    # Tried in this order, the first pattern to match supplies the level.
    # Level info could be at start of string so first space is optional
    my @level_patterns =
    (
        qr/ ?(\d{1,2}(ST|ND|RD|TH) (FLOOR|FLR|FL) )/,
        qr/ ?(LEVEL (\d{1,2}|[GM])[\/ -])/,
        qr/ ?((FLOOR|FLR|FL) \d{1,2}[\/ -])/,
    );

    my $level;
    foreach my $pattern (@level_patterns)
    {
        ($level) = $input =~ $pattern;
        last if defined($level);
    }

    if ( defined($level) )
    {
        $level =~ s|/||;
        $level =~ s|-||;
        $input =~ s/$level//;
    }

    return($level,$input);
}
|
1174
|
|
|
|
|
|
|
#------------------------------------------------------------------------------- |
|
1175
|
|
|
|
|
|
|
# Remove any building info such as:
# Building 2 Level 12 123 Smith St
# 12 Main St Tower A Level 2
#
# A building is one of BLOCK, BLD, BLDG, BUILDING or TOWER followed by an
# identifier and a space. Returns the building text and the input with that
# text removed. When no building is found the building is undefined and the
# input is returned unchanged.

sub _extract_building
{
    my ($input) = @_;

    my $bld = qr{BLOCK|BLDG?|BUILDING|TOWER};

    # Tried in this order, the first pattern to match supplies the building
    my @building_patterns =
    (
        qr/ ?(($bld) ([A-Z]{1,2}|\d+) )/,   # BLD 12 or AA
        qr/ ?(($bld) \d{1,3}[A-Z] )/,       # BLD 32C
        qr/ ?(($bld) [A-Z]\d{1,3} )/,       # BLD C12
    );

    my $building;
    foreach my $pattern (@building_patterns)
    {
        ($building) = $input =~ $pattern;
        last if defined($building);
    }

    if ( defined($building) )
    {
        $building =~ s|/||;
        $building =~ s|-||;
        $input =~ s/$building//;
    }

    return($building,$input);
}
|
1202
|
|
|
|
|
|
|
|
|
1203
|
|
|
|
|
|
|
#------------------------------------------------------------------------------- |
|
1204
|
|
|
|
|
|
|
# Remove any description that follows the suburb after the main address
# such as: PO BOX 1305 BIBRA LAKE PRIVATE BOXES WA 6965
#
# Only the description PRIVATE BOXES is recognised. Returns the description
# and the address without it, or ('',$input) when the description is absent.

sub _extract_po_box_type
{
    my ($input) = @_;

    if ( my ($address_start,$po_box_type,$address_end) = $input =~ /^(.*?) (PRIVATE BOXES)( .*)$/ )
    {
        return($po_box_type, $address_start . $address_end);
    }

    return('',$input);
}
|
1225
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
|
1226
|
|
|
|
|
|
|
# For correct matching, the grammar of each component must include the
# trailing space that separates it from any following word. That space is
# removed here from every component of the address object that has a value.

sub _trim_trailing_space
{
    my ($address) = @_;

    # values() aliases the hash entries, so each component is edited in place
    foreach my $component ( values %{ $address->{components} } )
    {
        next unless $component;
        $component =~ s/ $//g;
    }
}
|
1242
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
|
1243
|
|
|
|
|
|
|
|
|
1244
|
|
|
|
|
|
|
# Append one line to a report, in the form: label 'value'
#
# $report_ref is a reference to the report string, which is extended in place.
# The label is left justified in a column of exactly 23 characters, longer
# labels are cut off at that width.

sub _fmt_report_line
{
    my ($report_ref,$label,$value) = @_;

    my $line = sprintf("%-23.23s '%s'\n",$label,$value);
    ${$report_ref} .= $line;
}
|
1250
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
|
1251
|
|
|
|
|
|
|
|
|
1252
|
|
|
|
|
|
|
# Check that every word of a name could be pronounced.
#
# $str is split into words on single spaces. A word longer than one character
# fails when it contains neither a vowel (Y included) nor one of the
# abbreviations ST, MT, JR, JNR, SR or SNR, that is Saint, Mount, Junior and
# Senior (as in Martin Luther King Snr). Only upper case letters are
# recognised.
#
# Returns 1 if at least one word fails, otherwise 0.

sub _check_vowel
{
    my ($str) = @_;

    my $unpronounceable = grep
    {
        length($_) > 1 and !/[AEIOUY]|ST|MT|JN?R|SN?R/
    }
    split(/ /,$str);

    return($unpronounceable ? 1 : 0);
}
|
1267
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
|
1268
|
|
|
|
|
|
|
|
|
1269
|
|
|
|
|
|
|
return(1); |