File Coverage

blib/lib/WWW/Scraper/Wikipedia/ISO3166.pm
Criterion Covered Total %
statement 18 29 62.0
branch 0 4 0.0
condition 0 4 0.0
subroutine 6 9 66.6
pod 1 3 33.3
total 25 49 51.0


line stmt bran cond sub pod time code
1             package WWW::Scraper::Wikipedia::ISO3166;
2              
3             require v5.10.1;
4 1     1   382 use strict;
  1         1  
  1         23  
5 1     1   2 use warnings;
  1         7  
  1         20  
6              
7 1     1   418 use File::ShareDir;
  1         4374  
  1         37  
8 1     1   4 use File::Spec;
  1         1  
  1         13  
9              
10 1     1   472 use Moo;
  1         9132  
  1         3  
11              
12 1     1   1509 use Types::Standard qw/Int Str/;
  1         44475  
  1         7  
13              
14             has config_file =>
15             (
16             default => sub{return '.htwww.scraper.wikipedia.iso3166.conf'},
17             is => 'rw',
18             isa => Str,
19             required => 0,
20             );
21              
22             has data_file =>
23             (
24             default => sub{return 'data/en.wikipedia.org.wiki.ISO_3166-2'},
25             is => 'rw',
26             isa => Str,
27             required => 0,
28             );
29              
30             has share_dir =>
31             (
32             default => sub{return ''},
33             is => 'rw',
34             isa => Str,
35             required => 0,
36             );
37              
38             has sqlite_file =>
39             (
40             default => sub{return 'www.scraper.wikipedia.iso3166.sqlite'},
41             is => 'rw',
42             isa => Str,
43             required => 0,
44             );
45              
46             has verbose =>
47             (
48             default => sub{return 0},
49             is => 'rw',
50             isa => Int,
51             required => 0,
52             );
53              
54             our $VERSION = '1.04';
55              
56             # -----------------------------------------------
57              
58             sub BUILD
59             {
60 0     0 0   my($self, $arg) = @_;
61 0           (my $package = __PACKAGE__) =~ s/::/-/g;
62 0 0         my($dir_name) = $ENV{AUTHOR_TESTING} ? 'share' : File::ShareDir::dist_dir($package);
63              
64 0           $self -> config_file(File::Spec -> catfile($dir_name, $self -> config_file) );
65 0           $self -> sqlite_file(File::Spec -> catfile($dir_name, $self -> sqlite_file) );
66              
67             } # End of BUILD.
68              
69             # -----------------------------------------------
70              
71             sub log
72             {
73 0     0 1   my($self, $level, $s) = @_;
74 0   0       $level ||= 'debug';
75 0   0       $s ||= '';
76              
77 0 0         print "$level: $s. \n" if ($self -> verbose);
78              
79             } # End of log.
80              
81             # -----------------------------------------------
82              
83             sub run
84             {
85 0     0 0   my($self) = @_;
86              
87             # Return 0 for success and 1 for failure.
88              
89 0           return 0;
90              
91             } # End of run.
92              
93             # -----------------------------------------------
94              
95             1;
96              
97             =pod
98              
99             =head1 NAME
100              
101             WWW::Scraper::Wikipedia::ISO3166 - Gently scrape Wikipedia for ISO3166-2 data
102              
103             =encoding utf-8
104              
105             =head1 Synopsis
106              
107             Wikipedia I<has been scraped>. You do not need to run the scripts which download pages from there.
108              
109             Just use the SQLite database shipped with this module, as discussed next.
110              
111             =head2 Methods which return hashrefs
112              
113             use WWW::Scraper::Wikipedia::ISO3166::Database;
114              
115             my($database) = WWW::Scraper::Wikipedia::ISO3166::Database -> new;
116             my($countries) = $database -> read_countries_table;
117             my($subcountries) = $database -> read_subcountries_table;
118             ...
119              
120             Each key in %$countries and %$subcountries points to a hashref of all columns for the given key.
121              
122             So, $$countries{13} points to this hashref:
123              
124             {
125             id => 13,
126             code2 => 'AU',
127             code3 => '',
128             fc_name => 'australia',
129             has_subcountries => 'Yes',
130             name => 'Australia',
131             timestamp => '2012-05-08 04:04:43',
132             }
133              
134             One element of %$subcountries is $$subcountries{4276}:
135              
136             {
137             id => 4276,
138             country_id => 13,
139             code => 'AU-VIC',
140             fc_name => 'victoria',
141             name => 'Victoria',
142             sequence => 5,
143             timestamp => '2012-05-08 04:05:27',
144             }
145              
146             =head3 Warnings
147              
148             # 1: These hashrefs use the table's primary key as the hashref's key. In the case of the I<countries>
149             table, the primary key is the country's id, and is used as subcountries.country_id. But, in the case of
150             the I<subcountries> table, the id does not have any meaning apart from being a db primary key.
151             See L for details.
152              
153             # 2: Do not assume subcountry names are unique within a country.
154              
155             L.
156              
157             =head2 Scripts which output to a file
158              
159             All scripts respond to the -h option.
160              
161             Some examples:
162              
163             shell>perl scripts/export.as.csv.pl -c countries.csv -s subcountries.csv
164             shell>perl scripts/export.as.html.pl -w iso.3166-2.html
165              
166             This file is on-line at: L.
167              
168             shell>perl scripts/report.statistics.pl
169              
170             Output statistics:
171             countries_in_db => 249.
172             has_subcounties => 199.
173             subcountries_in_db => 4593.
174             subcountry_files_downloaded => 249.
175              
176             =head1 Description
177              
178             C<WWW::Scraper::Wikipedia::ISO3166> is a pure Perl module.
179              
180             It is used to download various ISO3166-related pages from Wikipedia, and to then import data
181             (scraped from those pages) into an SQLite database.
182              
183             The pages have already been downloaded, so that phase only needs to be run when pages are updated.
184              
185             Likewise, the data has been imported.
186              
187             This means you would normally only ever use the database in read-only mode.
188              
189             Its components are:
190              
191             =over 4
192              
193             =item o scripts/get.country.page.pl
194              
195             1: Downloads the ISO3166-1_alpha-3 page from Wikipedia.
196              
197             Input: L.
198              
199             Output: data/en.wikipedia.org.wiki.ISO_3166-2.3.html.
200              
201             2: Downloads the ISO3166-2 page from Wikipedia.
202              
203             Input: L.
204              
205             Output: data/en.wikipedia.org.wiki.ISO_3166-2.html.
206              
207             =item o scripts/populate.countries.pl
208              
209             Imports country data into an SQLite database.
210              
211             Inputs: data/en.wikipedia.org.wiki.ISO_3166-2.html, data/en.wikipedia.org.wiki.ISO_3166-2.3.html.
212              
213             Output: share/www.scraper.wikipedia.iso3166.sqlite.
214              
215             =item o scripts/get.subcountry.page.pl and scripts/get.subcountry.pages.pl
216              
217             Downloads each country's corresponding subcountries page.
218              
219             Source: http://en.wikipedia.org/wiki/ISO_3166:$code2.html.
220              
221             Output: data/en.wikipedia.org.wiki.ISO_3166-2.$code2.html.
222              
223             =item o scripts/populate.subcountry.pl and scripts/populate.subcountries.pl
224              
225             Imports subcountry data into the database.
226              
227             Source: data/en.wikipedia.org.wiki.ISO_3166-2.$code2.html.
228              
229             Output: share/www.scraper.wikipedia.iso3166.sqlite.
230              
231             Note: When the distro is installed, this SQLite file is installed too.
232             See L</Where is the database?> for details.
233              
234             =item o scripts/export.as.csv.pl -c c.csv -s s.csv
235              
236             Exports the country and subcountry data as CSV.
237              
238             Input: share/www.scraper.wikipedia.iso3166.sqlite.
239              
240             Output: data/countries.csv and data/subcountries.csv.
241              
242             =item o scripts/export.as.html -w c.html
243              
244             Exports the country and subcountry data as HTML.
245              
246             Input: share/www.scraper.wikipedia.iso3166.sqlite.
247              
248             Output: data/iso.3166-2.html.
249              
250             On-line: L.
251              
252             =item o scripts/get.statoids.pl
253              
254             Downloads some pages from L in case one day we need to convert from FIPS to ISO 3166-2.
255              
256             See data/List_of_FIPS_region_codes_*.html.
257              
258             =item o scripts/populate.fips.codes.pl
259              
260             This reads the files output by scripts/get.statoids.pl and produces 2 reports, data/wikipedia.fips.codes.txt
261             and data/wikipedia.fips.mismatch.log. These are discussed in L</What FIPS data is included?>.
262              
263             =item o scripts/test.nfc.pl
264              
265             See the FAQ entry "Why did you use Unicode::Normalize's NFC() for sorting?" for a discussion of this script.
266              
267             =back
268              
269             =head1 Constructor and initialization
270              
271             new(...) returns an object of type C<WWW::Scraper::Wikipedia::ISO3166>.
272              
273             This is the class's constructor.
274              
275             Usage: C<< WWW::Scraper::Wikipedia::ISO3166 -> new() >>.
276              
277             This method takes a hash of options.
278              
279             Call C<new()> as C<< new(option_1 => value_1, option_2 => value_2, ...) >>.
280              
281             Available options (these are also methods):
282              
283             =over 4
284              
285             =item o config_file => $file_name
286              
287             The name of the file containing config info, such as I and I.
288             These are used by L.
289              
290             The code prefixes this name with the directory returned by L<File::ShareDir/dist_dir>.
291              
292             Default: .htwww.scraper.wikipedia.iso3166.conf.
293              
294             =item o sqlite_file => $file_name
295              
296             The name of the SQLite database of country and subcountry data.
297              
298             The code prefixes this name with the directory returned by L<File::ShareDir/dist_dir>.
299              
300             Default: www.scraper.wikipedia.iso3166.sqlite.
301              
302             =item o verbose => $integer
303              
304             Print more or less information.
305              
306             Default: 0 (print nothing).
307              
308             =back
309              
310             =head1 Distributions
311              
312             This module is available as a Unix-style distro (*.tgz).
313              
314             Install WWW::Scraper::Wikipedia::ISO3166 as you would for any C<Perl> module:
315              
316             Run:
317              
318             cpanm WWW::Scraper::Wikipedia::ISO3166
319              
320             or run:
321              
322             sudo cpan WWW::Scraper::Wikipedia::ISO3166
323              
324             or unpack the distro, and then run:
325              
326             perl Makefile.PL
327             make (or dmake)
328             make test
329             make install
330              
331             See L for details.
332              
333             See L for
334             help on unpacking and installing.
335              
336             =head1 Methods
337              
338             =head2 config_file($file_name)
339              
340             Get or set the name of the config file.
341              
342             The code prefixes this name with the directory returned by L<File::ShareDir/dist_dir>.
343              
344             Also, I<config_file> is an option to L</new()>.
345              
346             =head2 log($level => $s)
347              
348             Print $s at log level $level, if ($self -> verbose);
349              
350             Since $self -> verbose defaults to 0, nothing is printed by default.
351              
352             =head2 new()
353              
354             See L</Constructor and initialization>.
355              
356             =head2 sqlite_file($file_name)
357              
358             Get or set the name of the database file.
359              
360             The code prefixes this name with the directory returned by L<File::ShareDir/dist_dir>.
361              
362             Also, I<sqlite_file> is an option to L</new()>.
363              
364             =head2 verbose($integer)
365              
366             Get or set the verbosity level.
367              
368             Also, I<verbose> is an option to L</new()>.
369              
370             =head1 FAQ
371              
372             =head2 Design faults in ISO3166
373              
374             Where ISO3166 uses Country Name, I would have used Long Name and Short Name.
375              
376             Then we'd have:
377              
378             Long Name: Bolivia, Plurinational State of
379             Short Name: Bolivia
380              
381             This distro uses the value directly from Wikipedia, which is what I have called 'Long Name', for
382             all country and subcountry names.
383              
384             =head2 Where is the database?
385              
386             It is shipped in share/www.scraper.wikipedia.iso3166.sqlite.
387              
388             It is installed into the distro's shared dir, as returned by L<File::ShareDir/dist_dir>.
389             On my machine that's:
390              
391             /home/ron/perl5/perlbrew/perls/perl-5.14.2/lib/site_perl/5.14.2/auto/share/dist/WWW-Scraper-Wikipedia-ISO3166/www.scraper.wikipedia.iso3166.sqlite.
392              
393             =head2 What is the database schema?
394              
395             A single SQLite file holds 2 tables, I<countries> and I<subcountries>:
396              
397             countries subcountries
398             --------- ------------
399             id id
400             code2 country_id
401             code3 code
402             fc_name fc_name
403             has_subcountries name
404             name sequence
405             timestamp timestamp
406              
407             I<countries.code3> has a couple of special cases. 2 countries have no value for code3:
408             Libyan Arab Jamahiriya and Sint Maarten.
409             3-letter codes which almost match: LBY => Libya and MAF => Saint Martin (French part).
410              
411             I<subcountries.country_id> points to I<countries.id>.
412              
413             I<fc_name> is output from calling fc(decode('utf8', $name) ).
414              
415             For decode(), see L.
416              
417             For fc(), see L.
418              
419             $name is from a Wikipedia page.
420              
421             I<has_subcountries> is 'Yes' or 'No'.
422              
423             I<name> is output from calling decode('utf8', $name).
424              
425             I<sequence> is a number (1 .. N) indicating the order in which subcountry names appear in the list
426             on that subcountry's Wikipedia page.
427              
428             See the source code of L for details of the SQL
429             used to create the tables.
430              
431             =head2 What do I do if I find a mistake in the data?
432              
433             What data? What mistake? How do you know it's wrong?
434              
435             Also, you must decide what exactly you were expecting the data to be.
436              
437             If the problem is the ISO data, report it to them.
438              
439             If the problem is the Wikipedia data, get agreement from everyone concerned and update Wikipedia.
440              
441             If the problem is the output from my code, try to identify the bug in the code and report it via the
442             usual mechanism. See L</Support>.
443              
444             If the problem is with your computer's display of the data, consider (in alphabetical order):
445              
446             =over 4
447              
448             =item o CSV
449              
450             Does the file display correctly in 'Emacs'? On the screen using 'less'?
451              
452             scripts/export.as.csv.pl uses: use open ':utf8';
453              
454             Is that not working?
455              
456             =item o DBD::SQLite
457              
458             Did you set the sqlite_unicode attribute? Use something like:
459              
460             my($dsn) = 'dbi:SQLite:dbname=www.scraper.wikipedia.iso3166.sqlite'; # Sample only.
461             my($attributes) = {AutoCommit => 1, RaiseError => 1, sqlite_unicode => 1};
462             my($dbh) = DBI -> connect($dsn, '', '', $attributes);
463              
464             The SQLite file ships in the share/ directory of the distro, and must be found by File::ShareDir
465             at run time.
466              
467             Did you set the foreign_keys pragma (if needed)? Use:
468              
469             $dbh -> do('PRAGMA foreign_keys = ON');
470              
471             =item o HTML
472              
473             The template htdocs/assets/templates/www/scraper/wikipedia/iso3166/iso3166.report.tx which ships with
474             this distro contains this line:
475              
476            
477              
478             Is that not working?
479              
480             =item o Locale
481              
482             Here's my setup:
483              
484             shell>locale
485             LANG=en_AU.utf8
486             LANGUAGE=
487             LC_CTYPE="en_AU.utf8"
488             LC_NUMERIC="en_AU.utf8"
489             LC_TIME="en_AU.utf8"
490             LC_COLLATE="en_AU.utf8"
491             LC_MONETARY="en_AU.utf8"
492             LC_MESSAGES="en_AU.utf8"
493             LC_PAPER="en_AU.utf8"
494             LC_NAME="en_AU.utf8"
495             LC_ADDRESS="en_AU.utf8"
496             LC_TELEPHONE="en_AU.utf8"
497             LC_MEASUREMENT="en_AU.utf8"
498             LC_IDENTIFICATION="en_AU.utf8"
499             LC_ALL=
500              
501             =item o OS
502              
503             Unicode is a moving target. Perhaps your OS's installed version of unicode files needs updating.
504              
505             =item o SQLite
506              
507             Both Oracle and SQLite.org ship a program called sqlite3. They are not compatible.
508             Which one are you using? I use the one from SQLite.org.
509              
510             AFAICT, sqlite3 does not have command line options, or options while running, to set unicode or pragmas.
511              
512             =back
513              
514             =head2 Why did you use L<Unicode::Normalize>'s NFC() for sorting?
515              
516             This question implies why not use NFD() instead.
517              
518             Run scripts/test.nfc.pl, and the output is:
519              
520             code2 => AX
521             code3 => ALA
522             fc_name => åland islands
523             has_subcountries => No
524             id => 15
525             name => Åland Islands
526             timestamp => 2012-05-13 23:37:20
527              
528             And this (Åland Islands) is what Wikipedia displays. So, NFC() it is.
529              
530             See L, and specifically prescription # 1.
531              
532             See also section 1.2 Normalization Forms in L.
533              
534             See also L.
535              
536             =head2 What is $ENV{AUTHOR_TESTING} used for?
537              
538             When this env var is 1, scripts output to share/*.sqlite within the distro's dir. That's how I populate the
539             database tables. After installation, the database is elsewhere, and read-only, so you don't want the scripts
540             writing to that copy anyway.
541              
542             At run-time, L<File::ShareDir/dist_dir> is used to find the installed version of *.sqlite.
543              
544             =head2 What FIPS data is included?
545              
546             Firstly, scripts/get.fips.pages.pl downloads some Wikipedia data, into data/List_of_FIPS_region_codes_*.html.
547              
548             Secondly, the latter files are parsed by scripts/populate.fips.codes.pl and the 2 reports are in
549             data/wikipedia.fips.codes.txt, and data/wikipedia.fips.mismatch.log.
550              
551             This data is I<not> written into the SQLite database yet, but it's available in case it's included
552             one day.
553              
554             =head1 Wikipedia's Terms of Use
555              
556             See L.
557              
558             Also, since I'm distributing copies of Wikipedia-sourced material, reformatted but not changed by editing,
559             I hereby give notice that their material is released under CC-BY-SA.
560             See L for that licence.
561              
562             =head1 References
563              
564             In no particular order:
565              
566             L
567              
568             L
569              
570             L
571              
572             L
573              
574             L
575              
576             This is a complex set of XML files concerning currency, postal, etc, formats and other details for various countries
577             and/or languages.
578              
579             For Debian etc users: /usr/share/xml/iso-codes/iso_3166_2.xml, as installed from the iso-codes package, with:
580              
581             sudo apt-get install iso-codes
582              
583             L
584              
585             L
586              
587             L
588              
589             Check the Monthly Archives at Perl.com, starting in April 2012, for a series of Unicode-specific articles by
590             Tom Christiansen.
591              
592             L
593              
594             L
595              
596             =head1 Repository
597              
598             L
599              
600             =head1 Support
601              
602             Email the author, or log a bug on RT:
603              
604             L.
605              
606             =head1 Author
607              
608             C<WWW::Scraper::Wikipedia::ISO3166> was written by Ron Savage I<E<lt>ron@savage.net.auE<gt>> in 2012.
609              
610             Home page: L.
611              
612             =head1 Copyright
613              
614             Australian copyright (c) 2012 Ron Savage.
615              
616             All Programs of mine are 'OSI Certified Open Source Software';
617             you can redistribute them and/or modify them under the terms of
618             The Artistic License, a copy of which is available at:
619             http://www.opensource.org/licenses/index.html
620              
621              
622             =cut