File Coverage

blib/lib/Geo/Coder/US.pm
Criterion Covered Total %
statement 7 9 77.7
branch n/a
condition n/a
subroutine 3 3 100.0
pod n/a
total 10 12 83.3


line stmt bran cond sub pod time code
1             package Geo::Coder::US;
2              
3             =head1 NAME
4              
5             Geo::Coder::US - Geocode (estimate latitude and longitude for) any US address
6              
7             =head1 SYNOPSIS
8              
9             use Geo::Coder::US;
10              
11             Geo::Coder::US->set_db( "geocoder.db" );
12              
13             my @matches = Geo::Coder::US->geocode(
14             "1600 Pennsylvania Ave., Washington, DC" );
15              
16             my @matches = Geo::Coder::US->geocode(
17             "42nd & Broadway New York NY" )
18              
19             my ($ora) = Geo::Coder::US->geocode(
20             "1005 Gravenstein Hwy N, 95472" );
21              
22             print "O'Reilly is located at $ora->{lat} degrees north, "
23             "$ora->{long} degrees east.\n";
24              
25              
26             =head1 DESCRIPTION
27              
28             Geo::Coder::US provides a complete facility for geocoding US addresses, that
29             is, estimating the latitude and longitude of any street address or intersection
30             in the United States, using the TIGER/Line data set from the US Census Bureau.
31             Geo::Coder::US uses Geo::TigerLine to parse this data, and DB_File to store a
32             highly compressed distillation of it, and Geo::StreetAddress::US to parse
33             addresses into normalized components suitable for looking up in its
34             database.
35              
36             You can find a live demo of this code at L. The
37             demo.cgi script is included in eg/ directory distributed with this module,
38             along with a whole bunch of other goodies. See L
39             for how to build your own Geo::Coder::US database.
40              
41             Consider using a web service to access this geocoder over the Internet,
42             rather than going to all the trouble of building a database yourself.
43             See eg/soap-client.pl, eg/xmlrpc-client.pl, and eg/rest-client.pl for
44             different examples of working clients for the rpc.geocoder.us geocoder
45             web service.
46              
47             =head1 METHODS
48              
49             In general, the only methods you are likely to need to call on
50             Geo::Coder::US are set_db() and geocode(). The following documentation
51             is included for completeness's sake, and for the benefit of developers
52             interested in using bits of the module's internals.
53              
54             Note: Calling conventions for address and intersection specifiers are
55             discussed in the following section on CALLING CONVENTIONS.
56              
57             =over 4
58              
59             =cut
60              
61 3     3   2211 use 5.6.1;
  3         13  
  3         148  
62 3     3   12118 use Geo::StreetAddress::US;
  3         276588  
  3         284  
63 3     3   2051 use DB_File;
  0            
  0            
64             use strict;
65             use warnings;
66              
67             our $VERSION = '1.00';
68              
69             use constant SNAP_DISTANCE => 0.00015;
70             # distance to snap intersection points, in degrees
71             # 0.00005 = ~7 meters
72             # 0.0001 = ~14 meters
73              
74             our $Parser = 'Geo::StreetAddress::US';
75              
76             our ( %DB, $DBO );
77              
78             sub db { \%DB }
79              
80             sub db_file { $DBO }
81              
82             sub set_db {
83             my ($class, $file, $writable) = @_;
84             return $DBO if $DBO and not $writable;
85             my $mode = $writable ? O_CREAT|O_RDWR : O_RDONLY;
86             $DB_BTREE->{compare} = sub { lc $_[0] cmp lc $_[1] };
87             $DBO = tie %DB, "DB_File", $file, $mode, 0666, $DB_BTREE;
88             return \%DB;
89             }
90              
91             =item Geo::Coder::US->geocode( $string )
92              
93             Given a string containing a street address or intersection, return a
94             list of specifiers including latitude and longitude for all matching
95             entities in the database. To keep from churning over the entire database,
96             the given address string must contain either a city and state, or a ZIP
97             code (or both), or geocode() will return undef.
98              
99             geocode() will attempt to normalize directional prefixes and suffixes,
100             street types, and state abbreviations, as well as substitute TIGER/Line's
101             idea of the "primary street name", if an alternate street name was
102             provided instead.
103              
104             If geocode() can parse the address, but not find a match in the database,
105             it will return a hashref containing the parsed and normalized address
106             or intersection, but without the "lat" and "long" keys specifying the
107             location. If geocode() cannot even parse the address, it will return
108             undef. B for the existence of "lat" and "long" keys
109             in the hashes returned from geocode() B attempting to use the
110             values! This serves to distinguish between addresses that cannot be
111             found versus addresses that are completely unparseable.
112              
113             geocode() attempts to be as forgiving as possible when geocoding an
114             address. If you say "Mission Ave" and all it knows about is "Mission St",
115             then "Mission St" is what you'll get back. If you leave off directional
116             identifiers, geocode() will return address geocoded in all the variants
117             it can find, i.e. both "N Main St" I "S Main St".
118              
119             Don't be surprised if geocoding an intersection returns more than one
120             lat/long pair for a single intersection. If one of the streets curves
121             greatly or doglegs even slightly, this will be the likely outcome.
122              
123             geocode() is probably the method you want to use. See more in the
124             following section on the structure of the returned address and
125             intersection specifiers.
126              
127             =cut
128              
129             sub geocode {
130             my ($class, $addr) = @_;
131             my @results;
132            
133             my $part = $Parser->parse_location($addr);
134              
135             return unless $part
136             and ($part->{zip} or ($part->{city} and $part->{state}));
137              
138             if ( exists $part->{street1} ) {
139             @results = $class->lookup_intersection($part);
140             } else {
141             @results = $class->lookup_ranges($part);
142             }
143              
144             return @results ? @results : $part;
145             }
146              
147             =item Geo::Coder::US->geocode_address( $string )
148              
149             Works exactly like geocode(), but only parses addresses.
150              
151             =cut
152              
153             sub geocode_address {
154             my ($class, $addr) = @_;
155             my @results;
156            
157             my $part = $Parser->parse_address($addr);
158              
159             return unless $part
160             and ($part->{zip} or ($part->{city} and $part->{state}));
161              
162             @results = $class->lookup_ranges($part);
163             return @results ? @results : $part;
164             }
165              
166             =item Geo::Coder::US->geocode_intersection( $string )
167              
168             Works exactly like geocode(), but only parses intersections.
169              
170             =cut
171              
172             sub geocode_intersection {
173             my ($class, $addr) = @_;
174             my @results;
175            
176             my $part = $Parser->parse_intersection($addr);
177              
178             return unless $part and $part->{street1} and $part->{street2}
179             and ($part->{zip} or ($part->{city} and $part->{state}));
180              
181             @results = $class->lookup_intersection($part);
182             return @results ? @results : $part;
183             }
184              
185             =item Geo::Coder::US->filter_ranges( $spec, @candidates )
186              
187             Filters a list of address specifiers (presumably from the database)
188             against a query specifier, filtering by prefix, type, suffix, or primary
189             name if possible. Returns a list of matching specifiers. filter_ranges()
190             will ignore a filtering step if it would result in no specifiers being
191             returned. You probably won't need to use this.
192              
193             =cut
194              
195             sub filter_ranges {
196             my ($class, $args, @addrs) = @_;
197             my @filter;
198              
199             for my $field (qw( prefix type suffix city zip )) {
200             next unless $args->{$field};
201             @filter = grep { lc $_->{$field} eq lc $args->{$field} } @addrs;
202             @addrs = @filter if @filter;
203             }
204              
205             return @addrs;
206             }
207              
208             =item Geo::Coder::US->find_ranges( $address_spec )
209              
210             Given a normalized address specifier, return all the address ranges
211             in the database that appear to cover that address. find_ranges()
212             ignores prefix, suffix, and type fields in the specifier for search
213             purposes, and then filters against them ex post facto. The intention
214             for find_ranges() to find the closest match possible in preference to
215             returning nothing. You probably want to use lookup_ranges() instead,
216             which will call find_ranges() for you.
217              
218             =cut
219              
220             sub find_zips_by_city {
221             my ($class, $args) = @_;
222             my $city = "$args->{city}, $args->{state}";
223             return unless exists $DB{$city};
224              
225             my @zips = unpack "w*", $DB{$city};
226              
227             # city, state might point to the FIPS code of the
228             # place that encompasses it. in which case, get the place
229             # name for *that* FIPS code and try again.
230             if (@zips == 1 and $zips[0] > 99999) {
231             my $fips = sprintf "%07d", $zips[0];
232             $city = "$DB{$fips}, $args->{state}";
233             return unless exists $DB{$city};
234             @zips = unpack "w*", $DB{$city};
235             }
236              
237             # finally, format the ZIP codes
238             return map { sprintf "%05d", $_ } @zips;
239             }
240              
241             sub find_streets_by_zip {
242             my ($class, $args, @zips) = @_;
243             my @streets;
244              
245             for my $zip ( @zips ) {
246             my $path = "/$zip/$args->{street}/";
247             my ($key, $value);
248             $DBO->seq( $key = $path, $value, R_CURSOR );
249             while ( $key and $value and $key =~ /^$path/i ) {
250             if ($value =~ /^\//o) {
251             push @streets, map { "/$zip$_" } split( ",", $value );
252             } else {
253             push @streets, $key;
254             }
255             $DBO->seq( $key, $value, R_NEXT );
256             }
257             }
258              
259             return @streets;
260             }
261              
262             sub find_streets {
263             my ($class, $args) = @_;
264             my (@streets);
265              
266             # try first with the zip code if we have one
267             if ( $args->{zip} ) {
268             @streets = $class->find_streets_by_zip( $args, $args->{zip} );
269             }
270              
271             # no luck with the zip code? try again
272             if ( not @streets and $args->{city} and $args->{state} ) {
273             my @zips = $class->find_zips_by_city( $args );
274             @streets = $class->find_streets_by_zip( $args, @zips );
275             }
276              
277             return @streets;
278             }
279              
280             sub add_city_and_state {
281             my ($class, @results) = @_;
282              
283             for my $item (@results) {
284             my $fips = sprintf "%07d", $item->{fips};
285             my $state = substr($fips, 0, 2);
286              
287             # if the FIPS code points to a county subdivision (i.e. not
288             # in the database) find the nearest inhabited place by ZIP
289             # code instead.
290             #
291             $fips = sprintf "%07d", unpack( "w", $DB{$item->{zip}} )
292             unless $DB{$fips};
293              
294             $item->{city} = $DB{$fips};
295             $item->{state} = $Geo::StreetAddress::US::State_FIPS{$state};
296             }
297             }
298              
299             sub find_ranges {
300             my ($class, $args) = @_;
301             my @streets = $class->find_streets($args);
302             my $number = $args->{number};
303             my @results;
304              
305             $number =~ s/\D//gos; # remove non-numerics, e.g. dashes
306              
307             for my $street (@streets) {
308             my ($fips, @data) = unpack "w*", $DB{$street};
309             my (@from, @to, @range, @best, $matched);
310             while (@data) {
311             @from = splice( @data, 0, 2 ) if $data[0] > 1_000_000;
312             while (@data and $data[0] < 1_000_000) {
313             shift @data if not $data[0]; # skip street-side zero marker
314             @range = splice( @data, 0, 2 );
315             if ($number % 2 == $range[0] % 2 and
316             (($number >= $range[0] and $number <= $range[1]) or
317             ($number <= $range[0] and $number >= $range[1]))) {
318             $matched++;
319             shift @data while @data and $data[0] < 1_000_000;
320             }
321             else {
322             next if $best[0] and
323             abs($best[0] - $number) < abs($range[0] - $number);
324             @best = ($range[0], @from);
325             }
326             }
327             last unless @data;
328             @to = splice( @data, 0, 2 );
329             last if $matched;
330             @best = ($range[1], @to)
331             if $best[0] and
332             abs($best[0] - $number) > abs($range[0] - $number);
333             @from = @to;
334             }
335             if (@best and not $matched) {
336             @range = @best[0,0];
337             @from = @to = @best[1,2];
338             }
339             if ($matched or @best) {
340             my %found = ( fips => $fips );
341             @found{qw{ zip street type prefix suffix }}
342             = split "/", substr($street, 1), 5;
343             @found{qw{ toadd fradd }} = @range;
344             @found{qw{ frlat frlong tolat tolong }}
345             = map( $_ / 1_000_000, @from, @to );
346             $found{$_} *= -1 for qw/frlong tolong/;
347             push @results, \%found;
348             }
349             }
350              
351             $class->add_city_and_state( @results );
352             return $class->filter_ranges( $args, @results );
353             }
354              
355             =item Geo::Coder::US->lookup_ranges( $address_spec, @ranges )
356              
357             Given an address specifier and (optionally) some address ranges from the
358             database, interpolate the street address into the street segment referred
359             to by the address range, and return a latitude and longitude for the
360             given address within each of the given ranges. If @ranges is not given,
361             lookup_ranges() calls find_ranges() with the given address specifier,
362             and uses those returned. You probably want to just use geocode() instead,
363             which also parses an address string and determines whether it's a proper
364             address or an intersection automatically.
365              
366             =cut
367              
368             sub lookup_ranges {
369             my ($class, $args, @addrs) = @_;
370             my %results;
371              
372             @addrs = $class->find_ranges($args) unless @addrs;
373              
374             for my $range (@addrs) {
375             my %target = %$args;
376             if ($range->{fradd} == $range->{toadd}) {
377             @target{qw{ lat long number }} = @$range{qw{ frlat frlong fradd }};
378             } else {
379             my $pct = ($args->{number} - $range->{toadd}) /
380             ($range->{fradd} - $range->{toadd});
381              
382             $target{lat} = sprintf "%.6f",
383             $range->{frlat} + ($range->{tolat} - $range->{frlat} ) * $pct;
384              
385             $target{long} = sprintf "%.6f",
386             $range->{frlong} + ($range->{tolong} - $range->{frlong}) * $pct;
387              
388             $target{number} = $args->{number};
389             }
390             $target{$_} = $range->{$_}
391             for (qw( prefix street type suffix city state zip ));
392              
393             $results{"$target{lat}:$target{long}"} = \%target;
394             }
395              
396             my @filter = grep { $_->{number} eq $args->{number} } values %results;
397             return @filter ? @filter : values %results;
398             }
399              
400             =item Geo::Coder::US->find_segments( $intersection_spec )
401              
402             Given a normalized intersection specifier, find all of the street segments
403             in the database matching the two given streets in the given locale or
404             ZIP code. find_segments() ignores prefix, suffix, and type fields in
405             the specifier for search purposes, and then filters against them ex
406             post facto. The intention for find_segments() to find the closest match
407             possible in preference to returning nothing. You probably want to use
408             lookup_intersection() instead, which will call find_segments() for you.
409              
410             =cut
411              
412             sub find_segments {
413             my ($class, $args) = @_;
414             my @streets = $class->find_streets($args);
415             my @segments;
416            
417             for my $street (@streets) {
418             my ($fips, @data) = unpack "w*", $DB{$street};
419             my (@from, @to);
420             while (@data) {
421             @from = splice( @data, 0, 2 ) if $data[0] > 1_000_000;
422             shift @data while @data and $data[0] < 1_000_000;
423             last unless @data;
424             my @to = splice( @data, 0, 2 );
425              
426             my %found = (fips => $fips);
427             @found{qw{ zip street type prefix suffix }}
428             = split "/", substr($street, 1), 5;
429             @found{qw{ city state }} = @$args{qw{ city state }};
430             @found{qw{ frlat frlong tolat tolong }}
431             = map( $_ / 1_000_000, @from, @to );
432             $found{$_} *= -1 for qw/frlong tolong/;
433             push @segments, \%found;
434              
435             @from = @to;
436             }
437             }
438              
439             $class->add_city_and_state( @segments );
440             return $class->filter_ranges( $args, @segments );
441             }
442              
443             =item Geo::Coder::US->lookup_intersection( $intersection_spec )
444              
445             Given an intersection specifier, return all of the intersections in the
446             database between the two streets specified, plus a latitude and longitude
447             for each intersection. You probably want to just use geocode() instead,
448             which also parses an address string and determines whether it's a proper
449             address or an intersection automatically.
450              
451             =cut
452              
453             sub lookup_intersection {
454             my ($class, $args) = @_;
455             my (@points1, @points2, %results);
456             my %subargs = %$args;
457              
458             $subargs{$_} = $args->{$_ . 1} for (qw( prefix street suffix type ));
459             push @points1,
460             [$_->{frlat}, $_->{frlong}, $_],
461             [$_->{tolat}, $_->{tolong}, $_]
462             for $class->find_segments(\%subargs);
463              
464             $subargs{$_} = $args->{$_ . 2} for (qw( prefix street suffix type ));
465             push @points2,
466             [$_->{frlat}, $_->{frlong}, $_],
467             [$_->{tolat}, $_->{tolong}, $_]
468             for $class->find_segments(\%subargs);
469              
470             return unless @points1 and @points2;
471              
472             %subargs = %$args;
473              
474             for my $x (@points1) {
475             for my $y (@points2) {
476             if (abs($x->[0] - $y->[0]) < SNAP_DISTANCE and
477             abs($x->[1] - $y->[1]) < SNAP_DISTANCE) {
478             my ($st1, $st2, %target) = ($x->[2], $y->[2]);
479             $target{lat} = $x->[0];
480             $target{long} = $x->[1];
481             $target{$_ . 1} = $st1->{$_} for (qw( prefix type suffix ));
482             $target{street1} = $st1->{street};
483             $target{$_ . 2} = $st2->{$_} for (qw( prefix type suffix ));
484             $target{street2} = $st2->{street};
485             $target{$_} = $st1->{$_} || $st2->{$_} for qw/zip city state/;
486             $results{"$target{lat}:$target{long}"} = \%target;
487             }
488             }
489             }
490              
491             return values %results;
492             }
493              
494             1;
495             __END__