File Coverage

blib/lib/Geo/StreetAddress/US.pm
Criterion Covered Total %
statement 97 98 98.9
branch 23 28 82.1
condition 17 25 68.0
subroutine 12 12 100.0
pod 6 6 100.0
total 155 169 91.7


line stmt bran cond sub pod time code
1             package Geo::StreetAddress::US;
2              
3 1     1   985 use 5.008_001;
  1         3  
  1         37  
4 1     1   6 use strict;
  1         1  
  1         33  
5 1     1   15 use warnings;
  1         2  
  1         58  
6              
7             our $VERSION = '1.04';
8              
9 1     1   6 use base 'Class::Data::Inheritable';
  1         2  
  1         791  
10              
11             =head1 NAME
12              
13             Geo::StreetAddress::US - Perl extension for parsing US street addresses
14              
15             =head1 SYNOPSIS
16              
17             use Geo::StreetAddress::US;
18              
19             $hashref = Geo::StreetAddress::US->parse_location(
20             "1005 Gravenstein Hwy N, Sebastopol CA 95472" );
21              
22             $hashref = Geo::StreetAddress::US->parse_location(
23             "Hollywood & Vine, Los Angeles, CA" );
24              
25             $hashref = Geo::StreetAddress::US->parse_address(
26             "1600 Pennsylvania Ave, Washington, DC" );
27              
28             $hashref = Geo::StreetAddress::US->parse_address(
29             "1600 Pennsylvania Ave, Washington, DC" );
30              
31             $hashref = Geo::StreetAddress::US->parse_informal_address(
32             "Lot 3 Pennsylvania Ave" );
33              
34             $hashref = Geo::StreetAddress::US->parse_intersection(
35             "Mission Street at Valencia Street, San Francisco, CA" );
36              
37             $hashref = Geo::StreetAddress::US->normalize_address( \%spec );
38             # the parse_* methods call this automatically...
39              
40             =head1 DESCRIPTION
41              
42             Geo::StreetAddress::US is a regex-based street address and street intersection
43             parser for the United States. Its basic goal is to be as forgiving as possible
44             when parsing user-provided address strings. Geo::StreetAddress::US knows about
45             directional prefixes and suffixes, fractional building numbers, building units,
46             grid-based addresses (such as those used in parts of Utah), 5 and 9 digit ZIP
47             codes, and all of the official USPS abbreviations for street types, state
48             names and secondary unit designators.
49              
50             =head1 RETURN VALUES
51              
52             Most Geo::StreetAddress::US methods return a reference to a hash containing
53             address or intersection information. This
54             "address specifier" hash may contain any of the following fields for a
55             given address. If a given field is not present in the address, the
56             corresponding key will be set to C<undef> in the hash.
57              
58             Future versions of this module may add extra fields.
59              
60             =head1 ADDRESS SPECIFIER
61              
62             =head2 number
63              
64             House or street number.
65              
66             =head2 prefix
67              
68             Directional prefix for the street, such as N, NE, E, etc. A given prefix
69             should be one to two characters long.
70              
71             =head2 street
72              
73             Name of the street, without directional or type qualifiers.
74              
75             =head2 type
76              
77             Abbreviated street type, e.g. Rd, St, Ave, etc. See the USPS official
78             type abbreviations (USPS Publication 28, Appendix C1) or L<\%Street_Type>
79             for a list of abbreviations used.
80              
81             =head2 suffix
82              
83             Directional suffix for the street, as above.
84              
85             =head2 city
86              
87             Name of the city, town, or other locale that the address is situated in.
88              
89             =head2 state
90              
91             The state which the address is situated in, given as its two-letter
92             postal abbreviation. See L<\%State_Code> for a list of abbreviations used.
93              
94             =head2 zip
95              
96             Five digit ZIP postal code for the address, including leading zero, if needed.
97              
98             =head2 sec_unit_type
99              
100             If the address includes a Secondary Unit Designator, such as a room, suite or
101             apartment, the C<sec_unit_type> field will indicate the type of unit.
102              
103             =head2 sec_unit_num
104              
105             If the address includes a Secondary Unit Designator, such as a room, suite or apartment,
106             the C<sec_unit_num> field will indicate the number of the unit (which may not be numeric).
107              
108             =head1 INTERSECTION SPECIFIER
109              
110             =head2 prefix1, prefix2
111              
112             Directional prefixes for the streets in question.
113              
114             =head2 street1, street2
115              
116             Names of the streets in question.
117              
118             =head2 type1, type2
119              
120             Street types for the streets in question.
121              
122             =head2 suffix1, suffix2
123              
124             Directional suffixes for the streets in question.
125              
126             =head2 city
127              
128             City or locale containing the intersection, as above.
129              
130             =head2 state
131              
132             State abbreviation, as above.
133              
134             =head2 zip
135              
136             Five digit ZIP code, as above.
137              
138             =cut
139              
140             =head1 GLOBAL VARIABLES
141              
142             Geo::StreetAddress::US contains a number of global variables which it
143             uses to recognize different bits of US street addresses. Although you
144             will probably not need them, they are documented here for completeness's
145             sake.
146              
147             =cut
148              
149             =head2 %Directional
150              
151             Maps directional names (north, northeast, etc.) to abbreviations (N, NE, etc.).
152              
153             =head2 %Direction_Code
154              
155             Maps directional abbreviations to directional names.
156              
157             =cut
158              
159             our %Directional = (
160             north => "N",
161             northeast => "NE",
162             east => "E",
163             southeast => "SE",
164             south => "S",
165             southwest => "SW",
166             west => "W",
167             northwest => "NW",
168             );
169              
170             our %Direction_Code; # setup in init();
171              
172             =head2 %Street_Type
173              
174             Maps lowercased USPS standard street types to their canonical postal
175             abbreviations as found in TIGER/Line. See eg/get_street_abbrev.pl in
176             the distribution for how this map was generated.
177              
178             =cut
179              
180             our %Street_Type = (
181             allee => "aly",
182             alley => "aly",
183             ally => "aly",
184             anex => "anx",
185             annex => "anx",
186             annx => "anx",
187             arcade => "arc",
188             av => "ave",
189             aven => "ave",
190             avenu => "ave",
191             avenue => "ave",
192             avn => "ave",
193             avnue => "ave",
194             bayoo => "byu",
195             bayou => "byu",
196             beach => "bch",
197             bend => "bnd",
198             bluf => "blf",
199             bluff => "blf",
200             bluffs => "blfs",
201             bot => "btm",
202             bottm => "btm",
203             bottom => "btm",
204             boul => "blvd",
205             boulevard => "blvd",
206             boulv => "blvd",
207             branch => "br",
208             brdge => "brg",
209             bridge => "brg",
210             brnch => "br",
211             brook => "brk",
212             brooks => "brks",
213             burg => "bg",
214             burgs => "bgs",
215             bypa => "byp",
216             bypas => "byp",
217             bypass => "byp",
218             byps => "byp",
219             camp => "cp",
220             canyn => "cyn",
221             canyon => "cyn",
222             cape => "cpe",
223             causeway => "cswy",
224             causway => "cswy",
225             cen => "ctr",
226             cent => "ctr",
227             center => "ctr",
228             centers => "ctrs",
229             centr => "ctr",
230             centre => "ctr",
231             circ => "cir",
232             circl => "cir",
233             circle => "cir",
234             circles => "cirs",
235             ck => "crk",
236             cliff => "clf",
237             cliffs => "clfs",
238             club => "clb",
239             cmp => "cp",
240             cnter => "ctr",
241             cntr => "ctr",
242             cnyn => "cyn",
243             common => "cmn",
244             corner => "cor",
245             corners => "cors",
246             course => "crse",
247             court => "ct",
248             courts => "cts",
249             cove => "cv",
250             coves => "cvs",
251             cr => "crk",
252             crcl => "cir",
253             crcle => "cir",
254             crecent => "cres",
255             creek => "crk",
256             crescent => "cres",
257             cresent => "cres",
258             crest => "crst",
259             crossing => "xing",
260             crossroad => "xrd",
261             crscnt => "cres",
262             crsent => "cres",
263             crsnt => "cres",
264             crssing => "xing",
265             crssng => "xing",
266             crt => "ct",
267             curve => "curv",
268             dale => "dl",
269             dam => "dm",
270             div => "dv",
271             divide => "dv",
272             driv => "dr",
273             drive => "dr",
274             drives => "drs",
275             drv => "dr",
276             dvd => "dv",
277             estate => "est",
278             estates => "ests",
279             exp => "expy",
280             expr => "expy",
281             express => "expy",
282             expressway => "expy",
283             expw => "expy",
284             extension => "ext",
285             extensions => "exts",
286             extn => "ext",
287             extnsn => "ext",
288             falls => "fls",
289             ferry => "fry",
290             field => "fld",
291             fields => "flds",
292             flat => "flt",
293             flats => "flts",
294             ford => "frd",
295             fords => "frds",
296             forest => "frst",
297             forests => "frst",
298             forg => "frg",
299             forge => "frg",
300             forges => "frgs",
301             fork => "frk",
302             forks => "frks",
303             fort => "ft",
304             freeway => "fwy",
305             freewy => "fwy",
306             frry => "fry",
307             frt => "ft",
308             frway => "fwy",
309             frwy => "fwy",
310             garden => "gdn",
311             gardens => "gdns",
312             gardn => "gdn",
313             gateway => "gtwy",
314             gatewy => "gtwy",
315             gatway => "gtwy",
316             glen => "gln",
317             glens => "glns",
318             grden => "gdn",
319             grdn => "gdn",
320             grdns => "gdns",
321             green => "grn",
322             greens => "grns",
323             grov => "grv",
324             grove => "grv",
325             groves => "grvs",
326             gtway => "gtwy",
327             harb => "hbr",
328             harbor => "hbr",
329             harbors => "hbrs",
330             harbr => "hbr",
331             haven => "hvn",
332             havn => "hvn",
333             height => "hts",
334             heights => "hts",
335             hgts => "hts",
336             highway => "hwy",
337             highwy => "hwy",
338             hill => "hl",
339             hills => "hls",
340             hiway => "hwy",
341             hiwy => "hwy",
342             hllw => "holw",
343             hollow => "holw",
344             hollows => "holw",
345             holws => "holw",
346             hrbor => "hbr",
347             ht => "hts",
348             hway => "hwy",
349             inlet => "inlt",
350             island => "is",
351             islands => "iss",
352             isles => "isle",
353             islnd => "is",
354             islnds => "iss",
355             jction => "jct",
356             jctn => "jct",
357             jctns => "jcts",
358             junction => "jct",
359             junctions => "jcts",
360             junctn => "jct",
361             juncton => "jct",
362             key => "ky",
363             keys => "kys",
364             knol => "knl",
365             knoll => "knl",
366             knolls => "knls",
367             la => "ln",
368             lake => "lk",
369             lakes => "lks",
370             landing => "lndg",
371             lane => "ln",
372             lanes => "ln",
373             ldge => "ldg",
374             light => "lgt",
375             lights => "lgts",
376             lndng => "lndg",
377             loaf => "lf",
378             lock => "lck",
379             locks => "lcks",
380             lodg => "ldg",
381             lodge => "ldg",
382             loops => "loop",
383             manor => "mnr",
384             manors => "mnrs",
385             meadow => "mdw",
386             meadows => "mdws",
387             medows => "mdws",
388             mill => "ml",
389             mills => "mls",
390             mission => "msn",
391             missn => "msn",
392             mnt => "mt",
393             mntain => "mtn",
394             mntn => "mtn",
395             mntns => "mtns",
396             motorway => "mtwy",
397             mount => "mt",
398             mountain => "mtn",
399             mountains => "mtns",
400             mountin => "mtn",
401             mssn => "msn",
402             mtin => "mtn",
403             neck => "nck",
404             orchard => "orch",
405             orchrd => "orch",
406             overpass => "opas",
407             ovl => "oval",
408             parks => "park",
409             parkway => "pkwy",
410             parkways => "pkwy",
411             parkwy => "pkwy",
412             passage => "psge",
413             paths => "path",
414             pikes => "pike",
415             pine => "pne",
416             pines => "pnes",
417             pk => "park",
418             pkway => "pkwy",
419             pkwys => "pkwy",
420             pky => "pkwy",
421             place => "pl",
422             plain => "pln",
423             plaines => "plns",
424             plains => "plns",
425             plaza => "plz",
426             plza => "plz",
427             point => "pt",
428             points => "pts",
429             port => "prt",
430             ports => "prts",
431             prairie => "pr",
432             prarie => "pr",
433             prk => "park",
434             prr => "pr",
435             rad => "radl",
436             radial => "radl",
437             radiel => "radl",
438             ranch => "rnch",
439             ranches => "rnch",
440             rapid => "rpd",
441             rapids => "rpds",
442             rdge => "rdg",
443             rest => "rst",
444             ridge => "rdg",
445             ridges => "rdgs",
446             river => "riv",
447             rivr => "riv",
448             rnchs => "rnch",
449             road => "rd",
450             roads => "rds",
451             route => "rte",
452             rvr => "riv",
453             shoal => "shl",
454             shoals => "shls",
455             shoar => "shr",
456             shoars => "shrs",
457             shore => "shr",
458             shores => "shrs",
459             skyway => "skwy",
460             spng => "spg",
461             spngs => "spgs",
462             spring => "spg",
463             springs => "spgs",
464             sprng => "spg",
465             sprngs => "spgs",
466             spurs => "spur",
467             sqr => "sq",
468             sqre => "sq",
469             sqrs => "sqs",
470             squ => "sq",
471             square => "sq",
472             squares => "sqs",
473             station => "sta",
474             statn => "sta",
475             stn => "sta",
476             str => "st",
477             strav => "stra",
478             strave => "stra",
479             straven => "stra",
480             stravenue => "stra",
481             stravn => "stra",
482             stream => "strm",
483             street => "st",
484             streets => "sts",
485             streme => "strm",
486             strt => "st",
487             strvn => "stra",
488             strvnue => "stra",
489             sumit => "smt",
490             sumitt => "smt",
491             summit => "smt",
492             terr => "ter",
493             terrace => "ter",
494             throughway => "trwy",
495             tpk => "tpke",
496             tr => "trl",
497             trace => "trce",
498             traces => "trce",
499             track => "trak",
500             tracks => "trak",
501             trafficway => "trfy",
502             trail => "trl",
503             trails => "trl",
504             trk => "trak",
505             trks => "trak",
506             trls => "trl",
507             trnpk => "tpke",
508             trpk => "tpke",
509             tunel => "tunl",
510             tunls => "tunl",
511             tunnel => "tunl",
512             tunnels => "tunl",
513             tunnl => "tunl",
514             turnpike => "tpke",
515             turnpk => "tpke",
516             underpass => "upas",
517             union => "un",
518             unions => "uns",
519             valley => "vly",
520             valleys => "vlys",
521             vally => "vly",
522             vdct => "via",
523             viadct => "via",
524             viaduct => "via",
525             view => "vw",
526             views => "vws",
527             vill => "vlg",
528             villag => "vlg",
529             village => "vlg",
530             villages => "vlgs",
531             ville => "vl",
532             villg => "vlg",
533             villiage => "vlg",
534             vist => "vis",
535             vista => "vis",
536             vlly => "vly",
537             vst => "vis",
538             vsta => "vis",
539             walks => "walk",
540             well => "wl",
541             wells => "wls",
542             wy => "way",
543             );
544              
545             our %_Street_Type_List; # set up in init() later;
546             our %_Street_Type_Match; # set up in init() later;
547              
548             =head2 %State_Code
549              
550             Maps lowercased US state and territory names to their canonical two-letter
551             postal abbreviations. See eg/get_state_abbrev.pl in the distribution
552             for how this map was generated.
553              
554             =cut
555              
556             our %State_Code = (
557             "alabama" => "AL",
558             "alaska" => "AK",
559             "american samoa" => "AS",
560             "arizona" => "AZ",
561             "arkansas" => "AR",
562             "california" => "CA",
563             "colorado" => "CO",
564             "connecticut" => "CT",
565             "delaware" => "DE",
566             "district of columbia" => "DC",
567             "federated states of micronesia" => "FM",
568             "florida" => "FL",
569             "georgia" => "GA",
570             "guam" => "GU",
571             "hawaii" => "HI",
572             "idaho" => "ID",
573             "illinois" => "IL",
574             "indiana" => "IN",
575             "iowa" => "IA",
576             "kansas" => "KS",
577             "kentucky" => "KY",
578             "louisiana" => "LA",
579             "maine" => "ME",
580             "marshall islands" => "MH",
581             "maryland" => "MD",
582             "massachusetts" => "MA",
583             "michigan" => "MI",
584             "minnesota" => "MN",
585             "mississippi" => "MS",
586             "missouri" => "MO",
587             "montana" => "MT",
588             "nebraska" => "NE",
589             "nevada" => "NV",
590             "new hampshire" => "NH",
591             "new jersey" => "NJ",
592             "new mexico" => "NM",
593             "new york" => "NY",
594             "north carolina" => "NC",
595             "north dakota" => "ND",
596             "northern mariana islands" => "MP",
597             "ohio" => "OH",
598             "oklahoma" => "OK",
599             "oregon" => "OR",
600             "palau" => "PW",
601             "pennsylvania" => "PA",
602             "puerto rico" => "PR",
603             "rhode island" => "RI",
604             "south carolina" => "SC",
605             "south dakota" => "SD",
606             "tennessee" => "TN",
607             "texas" => "TX",
608             "utah" => "UT",
609             "vermont" => "VT",
610             "virgin islands" => "VI",
611             "virginia" => "VA",
612             "washington" => "WA",
613             "west virginia" => "WV",
614             "wisconsin" => "WI",
615             "wyoming" => "WY",
616             );
617              
618             =head2 %State_FIPS
619              
620             Maps two-digit FIPS-55 US state and territory codes (including the
621             leading zero!) as found in TIGER/Line to the state's canonical two-letter
622             postal abbreviation. See eg/get_state_fips.pl in the distribution for
623             how this map was generated. Yes, I know the FIPS data also has the state
624             names. Oops.
625              
626             =cut
627              
628             our %State_FIPS = (
629             "01" => "AL",
630             "02" => "AK",
631             "04" => "AZ",
632             "05" => "AR",
633             "06" => "CA",
634             "08" => "CO",
635             "09" => "CT",
636             "10" => "DE",
637             "11" => "DC",
638             "12" => "FL",
639             "13" => "GA",
640             "15" => "HI",
641             "16" => "ID",
642             "17" => "IL",
643             "18" => "IN",
644             "19" => "IA",
645             "20" => "KS",
646             "21" => "KY",
647             "22" => "LA",
648             "23" => "ME",
649             "24" => "MD",
650             "25" => "MA",
651             "26" => "MI",
652             "27" => "MN",
653             "28" => "MS",
654             "29" => "MO",
655             "30" => "MT",
656             "31" => "NE",
657             "32" => "NV",
658             "33" => "NH",
659             "34" => "NJ",
660             "35" => "NM",
661             "36" => "NY",
662             "37" => "NC",
663             "38" => "ND",
664             "39" => "OH",
665             "40" => "OK",
666             "41" => "OR",
667             "42" => "PA",
668             "44" => "RI",
669             "45" => "SC",
670             "46" => "SD",
671             "47" => "TN",
672             "48" => "TX",
673             "49" => "UT",
674             "50" => "VT",
675             "51" => "VA",
676             "53" => "WA",
677             "54" => "WV",
678             "55" => "WI",
679             "56" => "WY",
680             "72" => "PR",
681             "78" => "VI",
682             );
683              
684             our %FIPS_State; # setup in init() later;
685              
686             =head2 %Addr_Match
687              
688             A hash of compiled regular expressions corresponding to different
689             types of address or address portions. Defined regexen include
690             type, number, fraction, state, direct(ion), dircode, zip, corner,
691             street, place, address, and intersection.
692              
693             Direct use of these patterns is not recommended because they may change in
694             subtle ways between releases.
695              
696             =cut
697              
698             our %Addr_Match; # setup in init()
699              
700             init();
701              
702             our %Normalize_Map = (
703             prefix => \%Directional,
704             prefix1 => \%Directional,
705             prefix2 => \%Directional,
706             suffix => \%Directional,
707             suffix1 => \%Directional,
708             suffix2 => \%Directional,
709             type => \%Street_Type,
710             type1 => \%Street_Type,
711             type2 => \%Street_Type,
712             state => \%State_Code,
713             );
714              
715              
716             =head1 CLASS ACCESSORS
717              
718             =head2 avoid_redundant_street_type
719              
720             If true then L</normalize_address> will set the C<type> field to undef
721             if the C<street> field contains a word that corresponds to the C<type> in L<\%Street_Type>.
722              
723             For example, given "4321 Country Road 7", C<street> will be "Country Road 7"
724             and C<type> will be "Rd". With avoid_redundant_street_type set true, C<type>
725             will be undef because C<street> matches /\b (rd|road) \b/ix;
726              
727             Also applies to C<type1> for C<street1> and C<type2> for C<street2>
728             fields for intersections.
729              
730             The default is false, for backwards compatibility.
731              
732             =cut
733              
734 1     1   1358 BEGIN { __PACKAGE__->mk_classdata('avoid_redundant_street_type' => 0) }
735              
736             =head1 CLASS METHODS
737              
738             =head2 init
739              
740             # Add another street type mapping:
741             $Geo::StreetAddress::US::Street_Type{'cur'}='curv';
742             # Re-initialize to pick up the change
743             Geo::StreetAddress::US::init();
744              
745             Runs the setup on globals. This is run automatically when the module is loaded,
746             but if you subsequently change the globals, you should run it again.
747              
748             =cut
749              
750             sub init {
751              
752 1     1 1 6 %Direction_Code = reverse %Directional;
753              
754 1         23 %FIPS_State = reverse %State_FIPS;
755              
756 1         35 %_Street_Type_List = map { $_ => 1 } %Street_Type;
  724         1087  
757              
758             # build hash { 'rd' => qr/\b (?: rd|road ) \b/xi, ... }
759 1         66 %_Street_Type_Match = map { $_ => $_ } values %Street_Type;
  362         467  
760 1         26 while ( my ($type_alt, $type_abbrv) = each %Street_Type ) {
761 362         1001 $_Street_Type_Match{$type_abbrv} .= "|\Q$type_alt";
762             }
763 188         246 %_Street_Type_Match = map {
764 1         16 my $alts = $_Street_Type_Match{$_};
765 188         3281 $_ => qr/\b (?: $alts ) \b/xi;
766             } keys %_Street_Type_Match;
767              
768 1     1   232 use re 'eval';
  1         2  
  1         3083  
769              
770 118         138 %Addr_Match = (
771             type => join("|", keys %_Street_Type_List),
772             fraction => qr{\d+\/\d+},
773             state => '\b(?:'.join("|",
774             # escape spaces in state names (e.g., "new york" --> "new\\ york")
775             # so they still match in the x environment below
776 8         10 map { ( quotemeta $_) } keys %State_Code, values %State_Code
777             ).')\b',
778             direct => join("|",
779             # map direction names to direction codes
780             keys %Directional,
781             # also map the dotted version of the code to the code itself
782             map {
783 8         31 my $c = $_; $c =~ s/(\w)/$1./g; ( quotemeta $c, $_ )
  8         27  
  16         19  
784 1         138 } sort { length $b <=> length $a } values %Directional
785             ),
786             dircode => join("|", keys %Direction_Code),
787             zip => qr/\d{5}(?:-?\d{4})?/, # XXX add \b?
788             corner => qr/(?:\band\b|\bat\b|&|\@)/i,
789             );
790              
791             # we don't include letters in the number regex because we want to
792             # treat "42S" as "42 S" (42 South). For example,
793             # Utah and Wisconsin have a more elaborate system of block numbering
794             # http://en.wikipedia.org/wiki/House_number#Block_numbers
795             $Addr_Match{number} = qr/(\d+-?\d*)(?=\D) (?{ $_{number} = $^N })/ix,
796              
797             # note that expressions like [^,]+ may scan more than you expect
798             $Addr_Match{street} = qr/
799             (?:
800             # special case for addresses like 100 South Street
801 39         365 (?:($Addr_Match{direct})\W+ (?{ $_{street} = $^N })
802 1         9 ($Addr_Match{type})\b (?{ $_{type} = $^N }))
803             #(?{ $_{_street}.=1 })
804             |
805 35         136 (?:($Addr_Match{direct})\W+ (?{ $_{prefix} = $^N }))?
806             (?:
807 95         302 ([^,]*\d) (?{ $_{street} = $^N })
808 4   0     7 (?:[^\w,]*($Addr_Match{direct})\b (?{ $_{suffix} = $^N; $_{type}||='' }))
  4         31  
809             #(?{ $_{_street}.=3 })
810             |
811 1880         5277 ([^,]+) (?{ $_{street} = $^N })
812 76         570 (?:[^\w,]+($Addr_Match{type})\b (?{ $_{type} = $^N }))
813 4         28 (?:[^\w,]+($Addr_Match{direct})\b (?{ $_{suffix} = $^N }))?
814             #(?{ $_{_street}.=2 })
815             |
816 1006   50     1334 ([^,]+?) (?{ $_{street} = $^N; $_{type}||='' })
  1006         3550  
817 28         285 (?:[^\w,]+($Addr_Match{type})\b (?{ $_{type} = $^N }))?
818 0         0 (?:[^\w,]+($Addr_Match{direct})\b (?{ $_{suffix} = $^N }))?
819             #(?{ $_{_street}.=4 })
820             )
821             )
822 1         3475 /ix;
823              
824              
825             # http://pe.usps.com/text/pub28/pub28c2_003.htm
826             # TODO add support for those that don't require a number
827             # TODO map to standard names/abbreviations
828             $Addr_Match{sec_unit_type_numbered} = qr/
829             (su?i?te
830             |p\W*[om]\W*b(?:ox)?
831             |(?:ap|dep)(?:ar)?t(?:me?nt)?
832             |ro*m
833             |flo*r?
834             |uni?t
835             |bu?i?ldi?n?g
836             |ha?nga?r
837             |lo?t
838             |pier
839             |slip
840             |spa?ce?
841             |stop
842             |tra?i?le?r
843             |box)(?![a-z]) (?{ $_{sec_unit_type} = $^N })
844 1         33 /ix;
845              
846             $Addr_Match{sec_unit_type_unnumbered} = qr/
847             (ba?se?me?n?t
848             |fro?nt
849             |lo?bby
850             |lowe?r
851             |off?i?ce?
852             |pe?n?t?ho?u?s?e?
853             |rear
854             |side
855             |uppe?r
856             )\b (?{ $_{sec_unit_type} = $^N })
857 1         5 /ix;
858              
859             $Addr_Match{sec_unit} = qr/
860             (:?
861             (?: (?:$Addr_Match{sec_unit_type_numbered} \W*)
862 2         14 | (\#)\W* (?{ $_{sec_unit_type} = $^N })
863             )
864 7         61 ( [\w-]+) (?{ $_{sec_unit_num} = $^N })
865             )
866             |
867             $Addr_Match{sec_unit_type_unnumbered}
868 1         246 /ix;
869              
870             $Addr_Match{city_and_state} = qr/
871             (?:
872 174         877 ([^\d,]+?)\W+ (?{ $_{city} = $^N })
873 37         296 ($Addr_Match{state}) (?{ $_{state} = $^N })
874             )
875 1         372 /ix;
876              
877             $Addr_Match{place} = qr/
878             (?:$Addr_Match{city_and_state}\W*)?
879 23         194 (?:($Addr_Match{zip}) (?{ $_{zip} = $^N }))?
880 1         459 /ix;
881              
882             # the \x23 below is an alias for '#' to avoid a bug in perl 5.18.1
883             # https://rt.cpan.org/Ticket/Display.html?id=91420
884 1         7763 $Addr_Match{address} = qr/
885             ^
886             [^\w\x23]* # skip non-word chars except # (eg unit)
887             ( $Addr_Match{number} )\W*
888             (?:$Addr_Match{fraction}\W*)?
889             $Addr_Match{street}\W+
890             (?:$Addr_Match{sec_unit}\W+)?
891             $Addr_Match{place}
892             \W* # require only non-word chars at end
893             $ # right up to end of string
894             /ix;
895              
896 1         19 my $sep = qr/(?:\W+|\Z)/;
897              
898 1         5786 $Addr_Match{informal_address} = qr/
899             ^
900             \s* # skip leading whitespace
901             (?:$Addr_Match{sec_unit} $sep)?
902             (?:$Addr_Match{number})?\W*
903             (?:$Addr_Match{fraction}\W*)?
904             $Addr_Match{street} $sep
905             (?:$Addr_Match{sec_unit} $sep)?
906             (?:$Addr_Match{place})?
907             # don't require match to reach end of string
908             /ix;
909              
910             $Addr_Match{intersection} = qr/^\W*
911             $Addr_Match{street}\W*?
912              
913             \s+$Addr_Match{corner}\s+
914              
915 8   100     113 (?{ exists $_{$_} and $_{$_.1} = delete $_{$_} for (qw{prefix street type suffix})})
916             $Addr_Match{street}\W+
917 8   100     93 (?{ exists $_{$_} and $_{$_.2} = delete $_{$_} for (qw{prefix street type suffix})})
918              
919             $Addr_Match{place}
920 1         11658 \W*$/ix;
921             }
922              
923             =head2 parse_location
924              
925             $spec = Geo::StreetAddress::US->parse_location( $string )
926              
927             Parses any address or intersection string and returns the appropriate
928             specifier. If $string matches the $Addr_Match{corner} pattern then
929             parse_intersection() is used. Else parse_address() is called and if that
930             returns false then parse_informal_address() is called.
931              
932             =cut
933              
934             sub parse_location {
935 53     53 1 31958 my ($class, $addr) = @_;
936              
937 53 100       583 if ($addr =~ /$Addr_Match{corner}/ios) {
938 8         23 return $class->parse_intersection($addr);
939             }
940 45   66     108 return $class->parse_address($addr)
941             || $class->parse_informal_address($addr);
942             }
943              
944              
945             =head2 parse_address
946              
947             $spec = Geo::StreetAddress::US->parse_address( $address_string )
948              
949             Parses a street address into an address specifier using the $Addr_Match{address}
950             pattern. Returns undef if the address cannot be parsed as a complete formal
951             address.
952              
953             You may want to use parse_location() instead.
954              
955             =cut
956              
957             sub parse_address {
958 45     45 1 55 my ($class, $addr) = @_;
959 45         64 local %_;
960              
961 45 100       256 $addr =~ /$Addr_Match{address}/ios
962             or return undef;
963              
964 38         262 return $class->normalize_address({ %_ });
965             }
966              
967              
968             =head2 parse_informal_address
969              
970             $spec = Geo::StreetAddress::US->parse_informal_address( $address_string )
971              
972             Acts like parse_address() except that it handles a wider range of address
973             formats because it uses the $Addr_Match{informal_address} pattern. That means a
974             unit can come first, a street number is optional, and the city and state aren't
975             needed. This means that informal addresses like "#42 123 Main St" can be parsed.
976              
977             Returns undef if the address cannot be parsed.
978              
979             You may want to use parse_location() instead.
980              
981             =cut
982              
983             sub parse_informal_address {
984 7     7 1 12 my ($class, $addr) = @_;
985 7         11 local %_;
986              
987 7 50       58 $addr =~ /$Addr_Match{informal_address}/ios
988             or return undef;
989              
990 7         44 return $class->normalize_address({ %_ });
991             }
992              
993              
994             =head2 parse_intersection
995              
996             $spec = Geo::StreetAddress::US->parse_intersection( $intersection_string )
997              
998             Parses an intersection string into an intersection specifier, returning
999             undef if the address cannot be parsed. You probably want to use
1000             parse_location() instead.
1001              
1002             =cut
1003              
1004             sub parse_intersection {
1005 8     8 1 11 my ($class, $addr) = @_;
1006 8         9 local %_;
1007              
1008 8 50       62 $addr =~ /$Addr_Match{intersection}/ios
1009             or return undef;
1010              
1011 8         41 my %part = %_;
1012             # if we've a type2 and type1 is either missing or the same,
1013             # and the type seems plural,
1014             # and is still valid if the trailing 's' is removed, then remove it.
1015             # So "X & Y Streets" becomes "X Street" and "Y Street".
1016 8 100 66     52 if ($part{type2} && (!$part{type1} or $part{type1} eq $part{type2})) {
      66        
1017 5         6 my $type = $part{type2};
1018 5 100 66     1992 if ($type =~ s/s\W*$//ios and $type =~ /^$Addr_Match{type}$/ios) {
1019 3         8 $part{type1} = $part{type2} = $type;
1020             }
1021             }
1022              
1023 8         112 return $class->normalize_address(\%part);
1024             }
1025              
1026              
1027             =head2 normalize_address
1028              
1029             $spec = Geo::StreetAddress::US->normalize_address( $spec )
1030              
1031             Takes an address or intersection specifier, and normalizes its components,
1032             stripping out all leading and trailing whitespace and punctuation, and
1033             substituting official abbreviations for prefix, suffix, type, and state values.
1034             Also, city names that are prefixed with a directional abbreviation (e.g. N, NE,
1035             etc.) have the abbreviation expanded. The original specifier ref is returned.
1036              
1037             Typically, you won't need to use this method, as the C<parse_*()> methods
1038             call it for you.
1039              
1040             N.B., C<normalize_address()> crops 9-digit ZIP codes to 5 digits. This is for
1041             the benefit of Geo::Coder::US and may not be what you want. E-mail me if this
1042             is a problem and I'll see what I can do to fix it.
1043              
1044             =cut
1045              
1046             sub normalize_address {
1047 53     53 1 93 my ($class, $part) = @_;
1048              
1049             #m/^_/ and delete $part->{$_} for keys %$part; # for debug
1050              
1051             # strip off some punctuation
1052 53   50     984 defined($_) && s/^\s+|\s+$|[^\w\s\-\#\&]//gos for values %$part;
1053              
1054 53         164 while (my ($key, $map) = each %Normalize_Map) {
1055 530 100 100     2310 $part->{$key} = $map->{lc $part->{$key}}
1056             if defined $part->{$key}
1057             and exists $map->{lc $part->{$key}};
1058             }
1059              
1060             $part->{$_} = ucfirst lc $part->{$_}
1061 53         272 for grep(exists $part->{$_}, qw( type type1 type2 ));
1062              
1063 53 100       180 if ($class->avoid_redundant_street_type) {
1064 2         23 for my $suffix ('', '1', '2') {
1065 6 100       28 next unless my $street = $part->{"street$suffix"};
1066 2 50       15 next unless my $type = $part->{"type$suffix"};
1067 2 50       13 my $type_regex = $_Street_Type_Match{lc $type}
1068             or die "panic: no _Street_Type_Match for $type";
1069 2 50       27 $part->{"type$suffix"} = undef
1070             if $street =~ $type_regex;
1071             }
1072             }
1073              
1074             # attempt to expand directional prefixes on place names
1075 53 100       519 $part->{city} =~ s/^($Addr_Match{dircode})\s+(?=\S)
1076             /\u$Direction_Code{uc $1} /iosx
1077             if $part->{city};
1078              
1079             # strip ZIP+4 (which may be missing a hyphen)
1080 53 100       179 $part->{zip} =~ s/^(.{5}).*/$1/os if $part->{zip};
1081              
1082 53         308 return $part;
1083             }
1084              
1085              
1086             1;
1087             __END__