File Coverage

blib/lib/Geo/Coder/US/Import.pm
Criterion Covered Total %
statement 1 3 33.3
branch n/a
condition n/a
subroutine 1 1 100.0
pod n/a
total 2 4 50.0


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             Geo::Coder::US::Import - Import TIGER/Line data into a Geo::Coder::US database
4              
5             =head1 SYNOPSIS
6              
7             use Geo::Coder::US::Import;
8              
9             Geo::Coder::US->set_db( "/path/to/geocoder.db", 1 );
10              
11             Geo::Coder::US::Import->load_tiger_data( "TGR06075" );
12              
13             Geo::Coder::US::Import->load_fips_data( "All_fips55.txt" );
14              
15             =head1 DESCRIPTION
16              
17             Geo::Coder::US::Import provides methods for importing TIGER/Line data
18             into a BerkeleyDB database for use with Geo::Coder::US.
19              
20             Instead of using this module directly, you may want to use one of the
21             included utility scripts in the eg/ directory of this distribution.
22             The import_tiger.pl script imports uncompressed TIGER/Line files from a
23             given location:
24              
25             $ perl eg/import_tiger.pl geocoder.db /path/to/tiger/files/TGRnnnnn
26              
27             Be sure to leave off the .RT? extensions or import_tiger.pl will complain.
28              
29             The import_tiger_zip.pl script imports compressed TIGER/Line data by
30             using L<Archive::Zip> to extract only the needed files from the ZIP file
31             into a temporary directory, which it cleans up for you afterwards. This
32             is the B<recommended> method of data import, as it can handle multiple
33             ZIP files at once:
34              
35             $ perl eg/import_tiger_zip.pl geocoder.db /path/to/tiger/zips/*.zip
36              
37             Both of these import scripts need to cache a lot of data in memory, so
38             you may find that you need one or two hundred megs of RAM for the import
39             to run to completion. The import process takes about 6 hours to import
40             all 4 gigabytes of compressed TIGER/Line data on a 2 GHz Linux machine,
41             and it appears to be mostly processor bound. The final BerkeleyDB database
42             produced by such an import tops out around 750 megabytes.
43              
44             One way of avoiding the RAM bloat on import is to use xargs to run
45             import_tiger_zip.pl on each TIGER/Line ZIP separately:
46              
47             $ find ~/tiger -name '*.zip' | \
48             xargs -n1 perl eg/import_tiger_zip.pl geocoder.db
49              
50             Similarly, you can import FIPS-55 place name data into a
51             Geo::Coder::US database with eg/import_fips.pl:
52              
53             $ perl eg/import_fips.pl geocoder.db All_fips55.txt
54              
55             Note that you can make a perfectly good geocoder for a particular
56             region of the US by simply importing only the TIGER/Line and FIPS-55
57             files for the region you're interested in. You only need to import all
58             of the TIGER/Line data sets in the event that you want a geocoder for
59             the whole US.
60              
61             =cut
62              
63             package Geo::Coder::US::Import;
64              
65 2     2   3268 use Geo::Coder::US;
  0            
  0            
66             use Geo::StreetAddress::US;
67             use Geo::TigerLine::Record::1;
68             use Geo::TigerLine::Record::4;
69             use Geo::TigerLine::Record::5;
70             use Geo::TigerLine::Record::6;
71             use Geo::TigerLine::Record::C;
72             use Geo::Fips55;
73             use Carp;
74             use strict;
75             use warnings;
76              
77             my (%place, %street, %seg, %tlid, %feat, %alt,
78             %fips_to_zip, %zip_to_fips,
79             %place_type, %place_name);
80              
81             =head1 CLASS METHODS
82              
83             =over 4
84              
85             =item load_tiger_data( $tiger_basename )
86              
87             Loads all data from the specified TIGER/Line data set in order of the
88             following record types: C, 1, 6, 4, 5. This ordering ensures that record
89             references are set correctly. You may prefix $tiger_basename with an
90             absolute or relative path, but B<do not> provide the .RT? filename suffix
91             as part of $tiger_basename or load_tiger_data() will become cranky.
92              
93             Note that you B<must> first call Geo::Coder::US->set_db() with a second
94             argument with a true value, or set_db() won't open the database for
95             writing.
96              
97             =item load_fips_data( $fips_file )
98              
99             Loads all the data from the specified FIPS-55 gazetteer file. This
100             provides additional or alternate place name data to supplement
101             TIGER/Line.
102              
103             =cut
104              
# Move a directional that is embedded in a record's feature name into the
# dedicated prefix/suffix fields.  The record is modified in place.
#
#   $record - hash reference with at least the keys fename, fedirp, fedirs
#
# A leading directional is only extracted when {fedirp} is empty, and a
# trailing one only when {fedirs} is empty.  A directional is never taken
# if it is the whole name: the prefix must be followed by more text and
# the suffix must be preceded by some.
sub _fixup_directionals {
    my $record = shift;

    # Leading directional, either spelled out or abbreviated.
    unless ( $record->{fedirp} ) {
        if ( $record->{fename} =~
            s/^($Geo::StreetAddress::US::Addr_Match{direct})\s+(?=\S)//ios )
        {
            my $found = $1;

            # Use the mapped form when the table has one for this spelling,
            # otherwise keep what was matched, upper-cased.
            $record->{fedirp} =
                $Geo::StreetAddress::US::Directional{ lc $found }
                || uc $found;
        }
    }

    # Trailing directional, same treatment.
    unless ( $record->{fedirs} ) {
        if ( $record->{fename} =~
            s/(?<=\S)\s+($Geo::StreetAddress::US::Addr_Match{direct})$//ios )
        {
            my $found = $1;
            $record->{fedirs} =
                $Geo::StreetAddress::US::Directional{ lc $found }
                || uc $found;
        }
    }
}
121              
# Append one address range to a cached street segment.
#
#   $tlid - key of the segment in %seg
#   $side - "r" stores the range on the right side, anything else on the left
#   $from - first address of the range
#   $to   - last address of the range
#
# Every non-digit character is removed from both addresses before they
# are stored.
sub _add_range {
    my ($tlid, $side, $from, $to) = @_;

    # The loop variable aliases $from and $to, so both are cleaned in place.
    for my $address ($from, $to) {
        $address =~ s/\D//go;
    }

    # each value in %seg is [lat, lon, lat, lon, [right side], [left side]]
    my $slot = $side eq "r" ? 4 : 5;
    push @{ $seg{$tlid}[$slot] }, $from, $to;
}
130              
# Callback for TIGER/Line Record Type 1.
#
# Records without a feature name, or whose feature class code does not
# start with "A", are ignored.  For the rest, the segment end points are
# cached in %seg, and each side that has a place (or county subdivision)
# code, an address range and a usable five-digit ZIP contributes:
#
#   - an address range on the segment (%seg),
#   - a "/zip/name/type/prefix/suffix" street key (%tlid, %street, %place),
#   - FIPS <-> ZIP associations (%fips_to_zip, %zip_to_fips).
sub _type_1 {
    my $record = shift;

    return unless $record->{fename};
    return unless $record->{cfcc} =~ /^A/o;

    my $tlid = $record->{tlid};

    # each value in %seg is [lat, lon, lat, lon, [right side], [left side]]
    unless ( $seg{$tlid} ) {
        my @endpoints =
            map { abs($_) } @$record{qw{ frlat frlong tolat tolong }};
        $seg{$tlid} = [ @endpoints, [], [] ];
    }

    # fix up direction prefix embedded in feature name
    _fixup_directionals($record);

    SIDE:
    for my $side (qw( r l )) {

        # prefer the place code, fall back to the county subdivision
        my $local_code =
            $record->{"place$side"} || $record->{"cousub$side"};
        next SIDE unless $local_code;
        my $fips = $record->{"state$side"} . $local_code;

        my $from = $record->{"fradd$side"};
        my $to   = $record->{"toadd$side"};
        my $zip  = $record->{"zip$side"};

        next SIDE unless $from and $to and $zip;
        next SIDE unless $zip =~ /^\d{5}$/os;
        next SIDE if $zip eq '99999';

        _add_range( $tlid, $side, $from, $to );

        my @name_parts = @$record{qw{ fename fetype fedirp fedirs }};
        my $key = join( "/", "", $zip, @name_parts );

        $tlid{"$tlid$side"} = $key;
        $street{$key}{$tlid}++;
        $place{$key} ||= $fips;

        $fips_to_zip{$fips}{$zip}++;

        # Only places with a known type may claim a ZIP.  A type 'C'
        # place always takes it; any other type takes it only when the
        # ZIP is unclaimed or its current holder is not of type 'C'.
        my $type = $place_type{$fips};
        next SIDE unless $type;

        my $holder = $zip_to_fips{$zip};
        if (   $type eq 'C'
            or not $holder
            or $place_type{$holder} ne 'C' )
        {
            $zip_to_fips{$zip} = $fips;
        }
    }
}
175              
# Callback for TIGER/Line Record Type 4.
#
# For each of the record's feat1 .. feat5 fields that holds a true value,
# remember the record's TLID under that value in %feat.
sub _type_4 {
    my $record = shift;

    for my $n (1 .. 5) {
        my $feature = $record->{"feat$n"} or next;
        push @{ $feat{$feature} }, $record->{tlid};
    }
}
181              
# Callback for TIGER/Line Record Type 5.
#
# Looks up the TLIDs that Record Type 4 registered for this record's
# feature id.  For every side of those segments whose main street key is
# already stored in the database, it records in %alt that this record's
# name, under the same ZIP, refers to that main street.
#
#   %alt key:     "/zip/fename/fetype/fedirp/fedirs" built from this record
#   %alt sub-key: the main street key with its leading "/zip" removed
sub _type_5 {
    my $record = shift;
    my $ids = $feat{$record->{feat}} or return;

    # The name fields do not depend on the segment or the side, so
    # normalize them once here rather than on every loop iteration.
    _fixup_directionals($record);
    my @name_parts = @$record{qw{ fename fetype fedirp fedirs }};

    for my $id (@$ids) {
        for my $side ("r", "l") {
            my $main = $tlid{"$id$side"} or next;
            next unless exists $Geo::Coder::US::DB{$main};

            # Split "/zip/rest" into its ZIP and the remainder; skip any
            # key that does not have that shape instead of building an
            # entry from undefined pieces.
            my ($zip, $rt1) = $main =~ /^\/(\d+)(\/.+)/os or next;

            my $rt5 = join("/", "", $zip, @name_parts);
            $alt{$rt5}{$rt1}++;
        }
    }
}
197              
# Callback for TIGER/Line Record Type 6.
#
# Adds the record's right and left address ranges to a segment that is
# already cached in %seg.  Records for unknown segments are ignored, as
# is any side that lacks either end of its range.
sub _type_6 {
    my $record = shift;
    my $tlid   = $record->{tlid};
    return unless exists $seg{$tlid};

    for my $side ("r", "l") {
        # Only the range itself is read here; this callback does not
        # look at the ZIP code.
        my ($from, $to) = @$record{"fradd$side", "toadd$side"};
        next unless $from and $to;
        _add_range( $tlid, $side, $from, $to );
    }
}
209              
# Callback for TIGER/Line Record Type C.
#
# Handles only records whose FIPS class code starts with one of
# C, D, E, F, T or U (inhabited places) and which have a name, a FIPS
# code and a state code.  For those it:
#
#   - stores the class letter in %place_type,
#   - stores the name, with parenthesized remarks removed, in %place_name,
#     followed by ", <state>" when %Geo::StreetAddress::US::State_FIPS
#     has a true entry for the state code,
#   - writes the state+FIPS code => cleaned name mapping to the database.
sub _type_C {
    my $record = shift;

    my ($class_letter) = $record->{fipscc} =~ /^([CDEFTU])/o; # inhabited place
    return unless $class_letter;
    return unless $record->{name} and $record->{fips} and $record->{state};

    my $state_code = $record->{state};
    my $fips       = $state_code . $record->{fips};
    $place_type{$fips} = $class_letter;

    $record->{name} =~ s/\s*\(.+\)\s*//gos; # cleanup bits with parens
    my $name = $record->{name};

    my $state =
        exists $Geo::StreetAddress::US::State_FIPS{$state_code}
        ? $Geo::StreetAddress::US::State_FIPS{$state_code}
        : undef;
    $place_name{$fips} = $state ? "$name, $state" : $name;

    # map fips->name
    $Geo::Coder::US::DB{$fips} = $name;
}
229              
# Pack a list of street segments into one string of BER-compressed
# integers (pack template "w").
#
# Each segment is [frlat, frlong, tolat, tolong, [right side], [left side]].
# Per segment the output holds:
#
#   frlat, frlong, the right-side range values,
#   a 0 followed by the left-side range values (only if there are any),
#   tolat, tolong - unless the next segment starts exactly at this
#                   segment's end point, in which case they are omitted.
#
# Returns the packed string, which is empty (not undef) when no segments
# are given, so callers can concatenate or measure it without warnings.
sub _compress_segments {
    my @segments = @_;
    my $thunk    = "";

    for my $i (0 .. $#segments) {
        my ($frlat, $frlong, $tolat, $tolong, $right, $left) =
            @{ $segments[$i] };

        $thunk .= pack("w*", $frlat, $frlong, @$right);
        $thunk .= pack("w*", 0, @$left) if @$left;

        # The end point is redundant when the following segment begins
        # there; reading past the last index yields undef, not an error.
        my $following = $segments[$i + 1];
        next if $following
            and $following->[0] == $tolat
            and $following->[1] == $tolong;

        $thunk .= pack("w*", $tolat, $tolong);
    }

    return $thunk;
}
243              
# Open one TIGER/Line record file and pass every record to a callback.
#
#   $parser   - name of the record class whose parse_file() does the work
#   $file     - full file name, including the .RT? suffix
#   $callback - code reference invoked by parse_file()
#   $required - true: croak when the file cannot be opened;
#               false: only warn (carp) and carry on
#
# Returns true when the file was parsed, nothing when it was skipped.
sub _parse_tiger_file {
    my ($parser, $file, $callback, $required) = @_;

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode, and the handle is private to this call.
    my $fh;
    unless ( open $fh, '<', $file ) {
        my $problem = "can't read $file: $!";
        croak $problem if $required;
        carp $problem;
        return;
    }

    $parser->parse_file( $fh, $callback );
    close $fh;
    return 1;
}

# Class method: import one TIGER/Line data set into the database.
#
#   $source - path and base name of the data set, without the .RT? suffix
#
# Record types are read in the order C, 1, 6, 4, 5.  The street, place and
# ZIP entries are written to the database after type 6, and the alternate
# street names gathered from types 4 and 5 are written at the end.
#
# Croaks if Geo::Coder::US::DB is not tied to a database, or if the .RTC
# or .RT1 file cannot be read.  A missing .RT6, .RT4 or .RT5 file only
# produces a warning.  The in-memory caches are emptied when the import
# has run to completion.
sub load_tiger_data {
    my ($class, $source) = @_;

    my $DB = \%Geo::Coder::US::DB;
    croak "No database specified" unless tied( %$DB );

    _parse_tiger_file( "Geo::TigerLine::Record::C", "$source.RTC", \&_type_C, 1 );
    _parse_tiger_file( "Geo::TigerLine::Record::1", "$source.RT1", \&_type_1, 1 );
    _parse_tiger_file( "Geo::TigerLine::Record::6", "$source.RT6", \&_type_6 );

    while (my ($path, $tlids) = each %street) {
        my @segments = @seg{keys %$tlids};
        my @thunk;

        # Pack the segments in four different orders and keep whichever
        # encoding turns out shortest.

        # right side first, ascending
        $thunk[0] = _compress_segments( sort {
            ($a->[4][0] || $a->[5][0]) <=> ($b->[4][0] || $b->[5][0])
        } @segments );
        # right side first, descending
        $thunk[1] = _compress_segments( sort {
            ($b->[4][0] || $b->[5][0]) <=> ($a->[4][0] || $a->[5][0])
        } @segments );
        # left side first, ascending
        $thunk[2] = _compress_segments( sort {
            ($a->[5][0] || $a->[4][0]) <=> ($b->[5][0] || $b->[4][0])
        } @segments );
        # left side first, descending
        $thunk[3] = _compress_segments( sort {
            ($b->[5][0] || $b->[4][0]) <=> ($a->[5][0] || $a->[4][0])
        } @segments );

        @thunk = sort { length($a) <=> length($b) } @thunk;
        $DB->{$path} = pack("w", $place{$path}) . $thunk[0];
    }

    # place name -> zip codes mapping
    while (my ($fips, $zips) = each %fips_to_zip) {
        my $place = $place_name{$fips} or next;
        # make sure place->fips mapping doesn't get duplicates
        if ( exists $DB->{$place} ) {
            $zips->{$_}++ for unpack("w*", $DB->{$place});
        }
        $DB->{$place} = pack("w*", keys %$zips);
    }

    # ZIP code -> FIPS mapping
    $DB->{$_} = pack "w", $zip_to_fips{$_} for keys %zip_to_fips;

    # Types 4 and 5 come last: _type_5 only keeps alternate names for
    # streets whose main entry is already in the database.
    _parse_tiger_file( "Geo::TigerLine::Record::4", "$source.RT4", \&_type_4 );
    _parse_tiger_file( "Geo::TigerLine::Record::5", "$source.RT5", \&_type_5 );

    # alternate street name -> main street names; existing true entries win
    $DB->{$_} ||= join ",", keys %{$alt{$_}} for keys %alt;

    # drop everything cached for this data set
    %tlid = %street = %place = %seg = %feat = %alt
          = %place_type = %place_name
          = %zip_to_fips = %fips_to_zip = ();
}
318              
# Callback for FIPS-55 gazetteer records.
#
# Handles records that have a name and a state and whose class starts
# with C, U, T or Z1.  For each of the record's part_of and other_name
# codes that (combined with the state FIPS code) is already a key in the
# database, the record's "Name, ST" string is stored as an extra name for
# that code - unless the name starts with a digit or is already present.
sub _fips55 {
    my $record = shift;
    my $DB     = \%Geo::Coder::US::DB;

    return unless $record->{name};
    return unless $record->{state};
    return unless $record->{class} =~ /^[CUT]|^Z1/o;

    # "Name, ST" without any parenthesized remarks
    my $name = "$record->{name}, $record->{state}";
    $name =~ s/\s*\(.+\)\s*//gos; # cleanup bits with parens

    for my $type (qw( part_of other_name )) {
        my $code = $record->{$type} or next;

        my $fips = sprintf("%02d%05d", $record->{state_fips}, $code);
        next unless exists $DB->{$fips};

        next if $name =~ /^\d/o;
        next if exists $DB->{$name};

        $DB->{$name} = pack "w", $fips;
    }
}
338              
# Class method: import a FIPS-55 gazetteer file into the database.
#
#   $source - name of the FIPS-55 file
#
# Croaks if Geo::Coder::US::DB is not tied to a database or if the file
# cannot be opened.  Every record is handed to _fips55().
sub load_fips_data {
    my ($class, $source) = @_;
    croak "No database specified" unless tied( %Geo::Coder::US::DB );

    # Three-argument open with a lexical handle: the file name is taken
    # literally and the handle is closed again once parsing is done.
    open my $fh, '<', $source or croak "can't read $source: $!";
    Geo::Fips55->parse_file( $fh, \&_fips55 );
    close $fh;
}
346              
347             =item load_rtC( $tiger_basename )
348              
349             =item load_rt5( $tiger_basename )
350              
351             =item load_rt1( $tiger_basename )
352              
353             =item load_rt4( $tiger_basename )
354              
355             =item load_rt6( $tiger_basename )
356              
357             Each of these methods loads all records from the TIGER/Line record type
358             specified, with the following exceptions: Type C data is only loaded for
359             records with a FIPS-55 class code beginning with C, D, E, F, T, U or Z
360             (i.e. inhabited places). Type 1 data is only loaded for records with a
361             Census Feature Class Code beginning with A (i.e. street data). Also, Type
362             1 data for which no feature name or FIPS place and/or county subdivision
363             is found are not loaded. Finally, Type 6 data lacking a matching Type
364             1 record in the database are not loaded.
365              
366             You may prefix $tiger_basename with an absolute or relative path, but
367             B<do not> provide the .RT? filename suffix as part of $tiger_basename
368             or the load_rt*() methods will become cranky.
369              
370             =back
371              
372             =head1 BUGS
373              
374             The import throws away probably useful data on the assumption that it's
375             not. Similarly, it imports a lot of data you may never use. Mea culpa.
376             Patches welcome.
377              
378             Also, you will encounter from time to time errors from your DBI driver
379             about duplicate keys for certain records. I think the TIGER/Line data has
380             the odd duplicated TLID in Record Type 1, even though it's not supposed
381             to. These errors are annoying but not fatal, and can probably be ignored.
382              
383             The import process can take up huge amounts of RAM. Be forewarned. If
384             anyone really needs it, the data cached in memory by the import process
385             could be buffered to disk, but this would slow down the import process
386             considerably (I think). Contact me if you really want to try this --
387             it might be faster for you to just download a binary version of the
388             fully imported database.
389              
390             Right now, I can't afford to make the full 750 megabyte database freely
391             downloadable from my website -- the bandwidth charges would eat me
392             alive. Contact me if you can offer funding or mirroring.
393              
394             =head1 SEE ALSO
395              
396             Geo::Coder::US(3pm), Geo::StreetAddress::US(3pm), Geo::TigerLine(3pm),
397             Geo::Fips55(3pm), DB_File(3pm), Archive::Zip(3pm)
398              
399             eg/import_tiger.pl, eg/import_tiger_zip.pl, eg/import_fips.pl
400              
401             You can download the latest TIGER/Line data (as of this writing) from:
402              
403             L
404              
405             You can get the latest FIPS-55 data from:
406              
407             L
408              
409             If you have copious spare time, you can slog through the TIGER/Line 2003
410             and FIPS-55-3 technical manuals:
411              
412             L
413              
414             L
415              
416             The TIGER/Line 2004 FE schema is more or less unchanged from 2003.
417              
418             Finally, a few words about FIPS-55-3 class codes:
419              
420             L
421              
422             =head1 APPRECIATION
423              
424             Considerable thanks are due to Michael Schwern
425             for writing the very useful Geo::TigerLine package, which does all
426             the heavy lifting for this module.
427              
428             =head1 AUTHOR
429              
430             Schuyler Erle
431              
432             =head1 LICENSE
433              
434             See L<Geo::Coder::US> for licensing details.
435              
436             =cut
437              
438             1;