File Coverage

blib/lib/WWW/Scraper/Wikipedia/ISO3166.pm
Criterion Covered Total %
statement 21 33 63.6
branch 0 10 0.0
condition n/a
subroutine 7 9 77.7
pod 1 2 50.0
total 29 54 53.7


line stmt bran cond sub pod time code
1             package WWW::Scraper::Wikipedia::ISO3166;
2              
3             require v5.10.1;
4 1     1   384 use strict;
  1         1  
  1         21  
5 1     1   3 use warnings;
  1         5  
  1         18  
6              
7 1     1   403 use File::ShareDir;
  1         3868  
  1         35  
8 1     1   4 use File::Spec;
  1         1  
  1         14  
9              
10 1     1   638 use Log::Handler;
  1         27910  
  1         3  
11              
12 1     1   458 use Moo;
  1         5507  
  1         4  
13              
14 1     1   1315 use Types::Standard qw/Any Int Str/;
  1         43222  
  1         5  
15              
16             has config_file =>
17             (
18             default => sub{return '.htwww.scraper.wikipedia.iso3166.conf'},
19             is => 'rw',
20             isa => Str,
21             required => 0,
22             );
23              
24             has data_file =>
25             (
26             default => sub{return 'data/en.wikipedia.org.wiki.ISO_3166-1'},
27             is => 'rw',
28             isa => Str,
29             required => 0,
30             );
31              
32             has logger =>
33             (
34             default => sub{return undef},
35             is => 'rw',
36             isa => Any,
37             required => 0,
38             );
39              
40             has maxlevel =>
41             (
42             default => sub{return 'notice'},
43             is => 'rw',
44             isa => Str,
45             required => 0,
46             );
47              
48             has minlevel =>
49             (
50             default => sub{return 'error'},
51             is => 'rw',
52             isa => Str,
53             required => 0,
54             );
55              
56             has share_dir =>
57             (
58             default => sub{return ''},
59             is => 'rw',
60             isa => Str,
61             required => 0,
62             );
63              
64             has sqlite_file =>
65             (
66             default => sub{return 'www.scraper.wikipedia.iso3166.sqlite'},
67             is => 'rw',
68             isa => Str,
69             required => 0,
70             );
71              
72             our $VERSION = '2.00';
73              
74             # -----------------------------------------------
75              
76             sub BUILD
77             {
78 0     0 0   my($self) = @_;
79 0           (my $package = __PACKAGE__) =~ s/::/-/g;
80 0 0         my($dir_name) = $ENV{AUTHOR_TESTING} ? 'share' : File::ShareDir::dist_dir($package);
81              
82 0 0         if (! defined $self -> logger)
83             {
84 0           $self -> logger(Log::Handler -> new);
85 0           $self -> logger -> add
86             (
87             screen =>
88             {
89             maxlevel => $self -> maxlevel,
90             message_layout => '%m',
91             minlevel => $self -> minlevel,
92             utf8 => 1,
93             }
94             );
95             }
96              
97 0           $self -> config_file(File::Spec -> catfile($dir_name, $self -> config_file) );
98 0           $self -> sqlite_file(File::Spec -> catfile($dir_name, $self -> sqlite_file) );
99              
100             } # End of BUILD.
101              
102             # -----------------------------------------------
103              
104             sub log
105             {
106 0     0 1   my($self, $level, $s) = @_;
107 0 0         $level = 'notice' if (! defined $level);
108 0 0         $s = '' if (! defined $s);
109              
110 0 0         $self -> logger -> $level($s) if ($self -> logger);
111              
112             } # End of log.
113              
114             # -----------------------------------------------
115              
116             1;
117              
118             =pod
119              
120             =head1 NAME
121              
122             WWW::Scraper::Wikipedia::ISO3166 - Gently scrape Wikipedia for ISO3166-2 data
123              
124             =encoding utf-8
125              
126             =head1 Synopsis
127              
128             Wikipedia I<has already been scraped>. You do not need to run the scripts which download pages from there.
129              
130             Just use the SQLite database shipped with this module, share/www.scraper.wikipedia.iso3166.sqlite.
131              
132             See scripts/export.*.pl and scripts/get.*.pl for sample code.
133              
134             =head2 Methods which return hashrefs
135              
136             use WWW::Scraper::Wikipedia::ISO3166::Database;
137              
138             my($database) = WWW::Scraper::Wikipedia::ISO3166::Database -> new;
139             my($countries) = $database -> read_countries_table;
140             my($subcountries) = $database -> read_subcountries_table;
141             my($categories) = $database -> read_subcountry_categories_table;
142             my($types) = $database -> read_subcountry_info_table;
143             ...
144              
145             Each key in the returned hashrefs points to a hashref of all columns for the given key.
146              
147             So, $$countries{13} points to this hashref:
148              
149             {
150             id => 13,
151             code2 => 'AU',
152             code3 => '',
153             fc_name => 'australia',
154             has_subcountries => 'Yes',
155             name => 'Australia',
156             number => '036',
157             timestamp => '2012-05-08 04:04:43',
158             }
159              
160             One element of %$subcountries is $$subcountries{941}:
161              
162             {
163             id => 941,
164             country_id => 13,
165             code => 'AU-VIC',
166             fc_name => 'victoria',
167             name => 'Victoria',
168             sequence => 7,
169             subcountry_category_id => 8,
170             timestamp => '2012-05-08 04:05:27',
171             }
172              
173             =head3 Warnings
174              
175             These hashrefs use the table's primary key as the hashref's key. In the case of the I<countries>
176             table, the primary key is the country's id, and is used as subcountries.country_id. But, in the case of
177             the I<subcountries> table, the id does not have any meaning apart from being a db primary key.
178             See L</What is the database schema?> for details.
179              
180             =head2 Scripts which output to a file
181              
182             Note: Many of these programs respond to the -h command line switch, but not create.tables.pl nor
183             drop.tables.pl.
184              
185             Some examples:
186              
187             shell> perl scripts/export.as.csv.pl -c countries.csv -s subcountries.csv
188             shell> perl scripts/export.as.html.pl -w iso.3166-2.html
189             shell> perl -Ilib scripts/populate.countries.pl -maxlevel debug
190             shell> perl -Ilib scripts/populate.subcountries.pl -maxlevel debug
191              
192             The HTML file is on-line at: L.
193              
194             shell>perl scripts/report.statistics.pl
195              
196             Output statistics:
197             countries_in_db => 249
198             has_subcounties => 200
199             subcountries_in_db => 5297
200             subcountry_files_downloaded => 249
201             subcountry_info_in_db => 352
202              
203             See also scripts/report.*.pl and t/report.t.
204              
205             =head1 Description
206              
207             C<WWW::Scraper::Wikipedia::ISO3166> is a pure Perl module.
208              
209             It is used to download various ISO3166-related pages from Wikipedia, and to then import data
210             (scraped from those pages) into an SQLite database.
211              
212             The pages have already been downloaded, so that phase only needs to be run when pages are updated.
213              
214             Likewise, the data has been imported.
215              
216             This means you would normally only ever use the database in read-only mode.
217              
218             Note: Many of these programs respond to the -h command line switch, but not create.tables.pl nor
219             drop.tables.pl.
220              
221             Scripts, all shipped in scripts/:
222              
223             =over 4
224              
225             =item o build.database.sh
226              
227             Mainly for use by me. It runs:
228              
229             =over 4
230              
231             =item o perl -Ilib scripts/drop.tables.pl
232              
233             =item o perl -Ilib scripts/create.tables.pl
234              
235             =item o perl -Ilib scripts/populate.countries.pl -maxlevel debug
236              
237             =item o perl -Ilib scripts/populate.subcountries.pl -maxlevel debug
238              
239             =item o perl -Ilib scripts/export.as.html.pl -w data/iso.3166-2.html
240              
241             =item o cp data/iso.3166-2.html $DR/
242              
243             $DR is my web site's RAMdisk-based doc root.
244              
245             =item perl -Ilib scripts/export.as.csv.pl \
246              
247             =back
248              
249             =item o check.downloads.pl
250              
251             Ensure each subcountry file has been downloaded, and report any which haven't been. Also report
252             any unexpected subcountry files found in data/.
253              
254             =item o perl -Ilib scripts/create.tables.pl
255              
256             =item o perl -Ilib scripts/drop.tables.pl
257              
258             =item o export.as.csv.pl -country_file c.csv -subcountry_file s.csv -subcountry_info_file i.csv
259              
260             Exports the country, subcountry and subcountry info data as CSV.
261              
262             Input: share/www.scraper.wikipedia.iso3166.sqlite.
263              
264             Output: data/countries.csv and data/subcountries.csv.
265              
266             =item o export.as.html -w c.html
267              
268             Exports the country and subcountry data as HTML.
269              
270             Input: share/www.scraper.wikipedia.iso3166.sqlite.
271              
272             Output: data/iso.3166-2.html.
273              
274             On-line: L.
275              
276             =item o find.db.pl
277              
278             After installation, this will print the path to www.scraper.wikipedia.iso3166.sqlite.
279              
280             =item o get.country.pages.pl
281              
282             1: Downloads the ISO3166-1 and ISO3166-2 pages from Wikipedia.
283              
284             Input: L and
285             .
286              
287             Output: data/en.wikipedia.org.wiki.ISO_3166-1.html and data/en.wikipedia.org.wiki.ISO_3166-2.html.
288              
289             =item o get.subcountry.page.pl and scripts/get.subcountry.pages.pl
290              
291             Downloads each country's corresponding subcountries page.
292              
293             Source: http://en.wikipedia.org/wiki/ISO_3166:$code2.html.
294              
295             Output: data/en.wikipedia.org.wiki.ISO_3166-2.$code2.html.
296              
297             =item o pod2html.sh
298              
299             For use by the author. It converts each *.pm file into the corresponding *.html file, and outputs
300             them to my web server's doc root.
301              
302             =item o populate.countries.pl
303              
304             Imports country data into an SQLite database.
305              
306             Input: data/en.wikipedia.org.wiki.ISO_3166-1.html, data/en.wikipedia.org.wiki.ISO_3166-2.html.
307              
308             Output: share/www.scraper.wikipedia.iso3166.sqlite.
309              
310             =item o populate.subcountry.pl and scripts/populate.subcountries.pl
311              
312             Imports subcountry data into the database.
313              
314             Source: data/en.wikipedia.org.wiki.ISO_3166-2.$code2.html.
315              
316             Output: share/www.scraper.wikipedia.iso3166.sqlite.
317              
318             Note: When the distro is installed, this SQLite file is installed too.
319             See L for details.
320              
321             =item o report.Australian.statistics.pl
322              
323             A simple test program. See also the next script.
324              
325             Run it with the '-max info' command line options.
326              
327             =item o report.statistics.pl
328              
329             A simple test program. See also the previous script.
330              
331             Run it with the '-max info' command line options.
332              
333             =back
334              
335             =head1 Constructor and initialization
336              
337             new(...) returns an object of type C<WWW::Scraper::Wikipedia::ISO3166>.
338              
339             This is the class's constructor.
340              
341             Usage: C<< WWW::Scraper::Wikipedia::ISO3166 -> new() >>.
342              
343             This method takes a hash of options.
344              
345             Call C<new()> as C<< new(option_1 => value_1, option_2 => value_2, ...) >>.
346              
347             Available options (these are also methods):
348              
349             =over 4
350              
351             =item o config_file => $file_name
352              
353             The name of the file containing config info, such as I and I.
354             These are used by L.
355              
356             The code prefixes this name with the directory returned by L.
357              
358             Default: .htwww.scraper.wikipedia.iso3166.conf.
359              
360             =item o logger => $aLoggerObject
361              
362             Specify a logger compatible with L, for the lexer and parser to use.
363              
364             Default: A logger of type L which writes to the screen.
365              
366             To disable logging, just set 'logger' to the empty string (not undef).
367              
368             =item o maxlevel => $logOption1
369              
370             This option affects L.
371              
372             Possible values for C<maxlevel> and C<minlevel> are:
373              
374             =over 4
375              
376             =item o debug
377              
378             Generates the maximum amount of output.
379              
380             =item o info
381              
382             =item o notice
383              
384             By default, C<notice> is the highest level used.
385              
386             =item o warning, warn
387              
388             =item o error, err
389              
390             By default, C<error> is the lowest level used.
391              
392             =item o critical, crit
393              
394             =item o alert
395              
396             =item o emergency, emerg
397              
398             =back
399              
400             See the L docs.
401              
402             Default: 'notice'.
403              
404             =item o minlevel => $logOption2
405              
406             This option affects L.
407              
408             See the L docs.
409              
410             Default: 'error'.
411              
412             No lower levels are used.
413              
414             =item o sqlite_file => $file_name
415              
416             The name of the SQLite database of country and subcountry data.
417              
418             The code prefixes this name with the directory returned by L.
419              
420             Default: www.scraper.wikipedia.iso3166.sqlite.
421              
422             =back
423              
424             =head1 Distributions
425              
426             This module is available as a Unix-style distro (*.tgz).
427              
428             Install WWW::Scraper::Wikipedia::ISO3166 as you would for any C module:
429              
430             Run:
431              
432             cpanm WWW::Scraper::Wikipedia::ISO3166
433              
434             or run:
435              
436             sudo cpan WWW::Scraper::Wikipedia::ISO3166
437              
438             or unpack the distro, and then run:
439              
440             perl Makefile.PL
441             make (or dmake)
442             make test
443             make install
444              
445             See L for details.
446              
447             See L for
448             help on unpacking and installing.
449              
450             =head1 Methods
451              
452             =head2 config_file($file_name)
453              
454             Get or set the name of the config file.
455              
456             The code prefixes this name with the directory returned by L.
457              
458             C is an option to L.
459              
460             =head2 log($level, $s)
461              
462             If a logger is defined, this logs the message $s at level $level.
463              
464             =head2 logger([$logger_object])
465              
466             Here, the [] indicate an optional parameter.
467              
468             Get or set the logger object.
469              
470             To disable logging, just set 'logger' to the empty string (not undef), in the call to L.
471              
472             This logger is passed to other modules.
473              
474             C is a parameter to L. See L for details.
475              
476             =head2 maxlevel([$string])
477              
478             Here, the [] indicate an optional parameter.
479              
480             Get or set the value used by the logger object.
481              
482             This option is only used if an object of type L<Log::Handler> is created.
483             See L.
484              
485             C is a parameter to L. See L for details.
486              
487             =head2 minlevel([$string])
488              
489             Here, the [] indicate an optional parameter.
490              
491             Get or set the value used by the logger object.
492              
493             This option is only used if an object of type L is created.
494             See L.
495              
496             C is a parameter to L. See L for details.
497              
498             =head2 new()
499              
500             See L.
501              
502             =head2 sqlite_file($file_name)
503              
504             Get or set the name of the database file.
505              
506             The code prefixes this name with the directory returned by L.
507              
508             C is an option to L.
509              
510             =head1 FAQ
511              
512             =head2 Design faults in ISO3166
513              
514             Where ISO3166 uses Country Name, I would have used C<Long Name> and C<Short Name>.
515              
516             Then we'd have:
517              
518             Long Name: Bolivia (Plurinational State of)
519             Short Name: Bolivia
520              
521             This distro uses the values directly from Wikipedia, which is what I have called C<Long Name> above,
522             for all country and subcountry names.
523              
524             =head2 Are any names modified by the code?
525              
526             Yes. The HTML entity &#39; is converted into a single quote.
527              
528             =head2 Where is the database?
529              
530             It is shipped in share/www.scraper.wikipedia.iso3166.sqlite.
531              
532             It is installed into the distro's shared dir, as returned by L.
533             Run scripts/find.db.pl to see what dir it is on your machine.
534              
535             On my machine it's:
536              
537             /home/ron/perl5/perlbrew/perls/perl-5.20.2/lib/site_perl/5.20.2/auto/share/dist/WWW-Scraper-Wikipedia-ISO3166/www.scraper.wikipedia.iso3166.sqlite
538              
539             =head2 What is the database schema?
540              
541             A single SQLite file holds 4 tables:
542              
543             countries subcountries subcountry_categories subcountry_info
544             --------- ------------ --------------------- ---------------
545             id id id id
546             code2 country_id name country_id
547             code3 subcountry_category_id timestamp name
548             fc_name fc_name sequence
549             has_subcountries code timestamp
550             name name
551             number sequence
552             timestamp timestamp
553              
554             An SVG image of the schema is shipped as data/www.scraper.wikipedia.iso3166.schema.svg,
555             and is L.
556              
557             The schema of the C<countries> table is basically taken straight from the big table on
558             L. Likewise for the subcountry_info table,
559             it's taken from L.
560              
561             I points to I.
562              
563             I<fc_name> is output from calling fc($name). It's in UTF-8.
564              
565             For decode(), see L.
566              
567             For fc(), see L.
568              
569             $name is from a Wikipedia page.
570              
571             I<has_subcountries> is 'Yes' or 'No'.
572              
573             I<name> is in UTF-8.
574              
575             I<number> is the 3-digit number from the ISO_3166-1 page.
576              
577             I<sequence> is a number (1 .. N) indicating the order in which records for the same country_id
578             should be accessed.
579              
580             See the source code of L for details of the SQL
581             used to create the tables.
582              
583             Lastly, in L, there are 4 methods for reading the 4
584             tables, as well as various more general methods.
585              
586             =head2 A Warning about Creating the Database
587              
588             See also L below.
589              
590             If you run scripts/drop.tables.pl and scripts/create.tables.pl before running
591             scripts/populate.countries.pl and scripts/populate.subcountries.pl, then the primary keys in the
592             tables will start from 1. This is good because it preempts a source of confusion.
593              
594             Without that step, L will simply increment the primary keys starting from 1
595             more than was previously used.
596              
597             =head2 What do I do if I find a mistake in the data?
598              
599             What data? What mistake? How do you know it's wrong?
600              
601             Also, you must decide what exactly you were expecting the data to be.
602              
603             If the problem is the ISO data, report it to them.
604              
605             If the problem is the Wikipedia data, get agreement from everyone concerned and update Wikipedia.
606              
607             If the problem is the output from my code, try to identify the bug in the code and report it via the
608             usual mechanism. See L.
609              
610             If the problem is with your computer's display of the data, consider (in alphabetical order):
611              
612             =over 4
613              
614             =item o CSV
615              
616             Does the file display correctly in 'Emacs'? On the screen using 'less'?
617              
618             scripts/export.as.csv.pl uses: use open ':utf8';
619              
620             Is that not working?
621              
622             =item o DBD::SQLite
623              
624             Did you set the sqlite_unicode attribute? Use something like:
625              
626             my($dsn) = 'dbi:SQLite:dbname=www.scraper.wikipedia.iso3166.sqlite'; # Sample only.
627             my($attributes) = {AutoCommit => 1, RaiseError => 1, sqlite_unicode => 1};
628             my($dbh) = DBI -> connect($dsn, '', '', $attributes);
629              
630             The SQLite file ships in the share/ directory of the distro, and must be found by File::ShareDir
631             at run time.
632              
633             Did you set the foreign_keys pragma (if needed)? Use:
634              
635             $dbh -> do('PRAGMA foreign_keys = ON');
636              
637             =item o HTML
638              
639             The template htdocs/assets/templates/www/scraper/wikipedia/iso3166/iso3166.report.tx which ships with
640             this distro contains this line:
641              
642             <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
643              
644             Is that not working?
645              
646             =item o Locale
647              
648             Here's my setup:
649              
650             shell>locale
651             LANG=en_AU.utf8
652             LANGUAGE=
653             LC_CTYPE="en_AU.utf8"
654             LC_NUMERIC="en_AU.utf8"
655             LC_TIME="en_AU.utf8"
656             LC_COLLATE="en_AU.utf8"
657             LC_MONETARY="en_AU.utf8"
658             LC_MESSAGES="en_AU.utf8"
659             LC_PAPER="en_AU.utf8"
660             LC_NAME="en_AU.utf8"
661             LC_ADDRESS="en_AU.utf8"
662             LC_TELEPHONE="en_AU.utf8"
663             LC_MEASUREMENT="en_AU.utf8"
664             LC_IDENTIFICATION="en_AU.utf8"
665             LC_ALL=
666              
667             =item o OS
668              
669             Unicode is a moving target. Perhaps your OS's installed version of unicode files needs updating.
670              
671             =item o SQLite
672              
673             Both Oracle and SQLite.org ship a program called sqlite3. They are not compatible.
674             Which one are you using? I use the one from SQLite.org.
675              
676             AFAICT, sqlite3 does not have command line options, or options while running, to set unicode or pragmas.
677              
678             =back
679              
680             =head2 Why did you use L's fc() for sorting?
681              
682             See L, and specifically prescription # 1.
683              
684             See also section 1.2 Normalization Forms in L.
685              
686             See also L.
687              
688             =head2 What is $ENV{AUTHOR_TESTING} used for?
689              
690             When this env var is 1, scripts output to share/*.sqlite within the distro's dir. That's how I populate the
691             database tables. After installation, the database is elsewhere, and read-only, so you don't want the scripts
692             writing to that copy anyway.
693              
694             At run-time, L is used to find the installed version of *.sqlite.
695              
696             =head1 Wikipedia's Terms of Use
697              
698             See L.
699              
700             Also, since I'm distributing copies of Wikipedia-sourced material, reformatted but not changed by editing,
701             I hereby give notice that their material is released under CC-BY-SA.
702             See L for that licence.
703              
704             =head1 See Also
705              
706             L by Sullivan Beck.
707              
708             =head1 References
709              
710             In no particular order:
711              
712             L
713              
714             L
715              
716             L
717              
718             L
719              
720             L
721              
722             This is a complex set of XML files concerning currency, postal, etc, formats and other details for various countries
723             and/or languages.
724              
725             For Debian etc users: /usr/share/xml/iso-codes/iso_3166_2.xml, as installed from the iso-codes package, with:
726              
727             sudo apt-get install iso-codes
728              
729             L
730              
731             L
732              
733             L
734              
735             Check the Monthly Archives at Perl.com, starting in April 2012, for a series of Unicode-specific articles by
736             Tom Christiansen.
737              
738             L
739              
740             L
741              
742             =head1 Repository
743              
744             L
745              
746             =head1 Support
747              
748             Email the author, or log a bug on RT:
749              
750             L.
751              
752             =head1 Author
753              
754             C<WWW::Scraper::Wikipedia::ISO3166> was written by Ron Savage I<E<lt>ron@savage.net.auE<gt>> in 2012.
755              
756             Home page: L.
757              
758             =head1 Copyright
759              
760             Australian copyright (c) 2012 Ron Savage.
761              
762             All Programs of mine are 'OSI Certified Open Source Software';
763             you can redistribute them and/or modify them under the terms of
764             The Artistic License, a copy of which is available at:
765             http://www.opensource.org/licenses/index.html
766              
767              
768             =cut