| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package WWW::Scraper::Wikipedia::ISO3166; |
|
2
|
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
require v5.10.1; |
|
4
|
1
|
|
|
1
|
|
13591
|
use strict; |
|
|
1
|
|
|
|
|
12
|
|
|
|
1
|
|
|
|
|
25
|
|
|
5
|
1
|
|
|
1
|
|
3
|
use warnings; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
21
|
|
|
6
|
|
|
|
|
|
|
|
|
7
|
1
|
|
|
1
|
|
760
|
use File::ShareDir; |
|
|
1
|
|
|
|
|
4671
|
|
|
|
1
|
|
|
|
|
41
|
|
|
8
|
1
|
|
|
1
|
|
5
|
use File::Spec; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
15
|
|
|
9
|
|
|
|
|
|
|
|
|
10
|
1
|
|
|
1
|
|
486
|
use Moo; |
|
|
1
|
|
|
|
|
9511
|
|
|
|
1
|
|
|
|
|
3
|
|
|
11
|
|
|
|
|
|
|
|
|
12
|
1
|
|
|
1
|
|
1547
|
use Types::Standard qw/Int Str/; |
|
|
1
|
|
|
|
|
47097
|
|
|
|
1
|
|
|
|
|
7
|
|
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
# Name of the config file. BUILD() prefixes this name with the shared directory.
has config_file =>
(
	is       => 'rw',
	isa      => Str,
	required => 0,
	default  => sub { '.htwww.scraper.wikipedia.iso3166.conf' },
);
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
# Base name of the downloaded data files. Not referenced by the code in this file.
has data_file =>
(
	is       => 'rw',
	isa      => Str,
	required => 0,
	default  => sub { 'data/en.wikipedia.org.wiki.ISO_3166-2' },
);
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
# A directory name, defaulting to the empty string. Not referenced by the code in this file.
has share_dir =>
(
	is       => 'rw',
	isa      => Str,
	required => 0,
	default  => sub { '' },
);
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
# Name of the SQLite database file. BUILD() prefixes this name with the shared directory.
has sqlite_file =>
(
	is       => 'rw',
	isa      => Str,
	required => 0,
	default  => sub { 'www.scraper.wikipedia.iso3166.sqlite' },
);
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
# Verbosity flag. log() prints only when this is true; the default of 0 prints nothing.
has verbose =>
(
	is       => 'rw',
	isa      => Int,
	required => 0,
	default  => sub { 0 },
);
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
our $VERSION = '1.03'; |
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
# ----------------------------------------------- |
|
57
|
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
# Anchor the config_file and sqlite_file attributes inside the shared directory.
#
# The shared directory is 'share' when $ENV{AUTHOR_TESTING} is true, and
# otherwise whatever File::ShareDir::dist_dir() returns for this distro.
# NOTE(review): presumably Moo calls this automatically after new() - confirm.
sub BUILD
{
	my($self, $arg) = @_;

	# The distro name is the package name with each '::' replaced by '-'.
	my($dist_name) = join('-', split(/::/, __PACKAGE__) );

	my($share_dir);

	if ($ENV{AUTHOR_TESTING})
	{
		$share_dir = 'share';
	}
	else
	{
		$share_dir = File::ShareDir::dist_dir($dist_name);
	}

	# Replace each bare file name with its path inside the shared directory.
	my($config_path) = File::Spec -> catfile($share_dir, $self -> config_file);

	$self -> config_file($config_path);

	my($sqlite_path) = File::Spec -> catfile($share_dir, $self -> sqlite_file);

	$self -> sqlite_file($sqlite_path);

} # End of BUILD.
|
68
|
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
# ----------------------------------------------- |
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
# Print "$level: $s. \n" to the currently selected output handle, but only
# when the verbose attribute is true.
#
# Parameters:
#   $level: Log level label. Any false value (undef, '', 0) becomes 'debug'.
#   $s:     Message text. Only undef becomes ''.
#
# Returns the value of the final print statement; it is not meant to be used.
sub log
{
	my($self, $level, $s) = @_;
	$level ||= 'debug';

	# Use //= rather than ||= so that a legitimate message of 0 or '0' is
	# printed as-is instead of being replaced by the empty string.
	$s //= '';

	print "$level: $s. \n" if ($self -> verbose);

} # End of log.
|
80
|
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
# ----------------------------------------------- |
|
82
|
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
# Stub entry point: does no work and always reports success.
# Returns 0 for success (1 is reserved for failure).
sub run
{
	my $self    = shift;
	my $success = 0;

	return $success;

} # End of run.
|
92
|
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
# ----------------------------------------------- |
|
94
|
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
1; |
|
96
|
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
=pod |
|
98
|
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
=head1 NAME |
|
100
|
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
WWW::Scraper::Wikipedia::ISO3166 - Gently scrape Wikipedia for ISO3166-2 data |
|
102
|
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
=encoding utf-8 |
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
=head1 Synopsis |
|
106
|
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
Wikipedia I<has been scraped>. You do not need to run the scripts which download pages from there. |
|
108
|
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
Just use the SQLite database shipped with this module, as discussed next. |
|
110
|
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
=head2 Methods which return hashrefs |
|
112
|
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
use WWW::Scraper::Wikipedia::ISO3166::Database; |
|
114
|
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
my($database) = WWW::Scraper::Wikipedia::ISO3166::Database -> new; |
|
116
|
|
|
|
|
|
|
my($countries) = $database -> read_countries_table; |
|
117
|
|
|
|
|
|
|
my($subcountries) = $database -> read_subcountries_table; |
|
118
|
|
|
|
|
|
|
... |
|
119
|
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
Each key in %$countries and %$subcountries points to a hashref of all columns for the given key. |
|
121
|
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
So, $$countries{13} points to this hashref: |
|
123
|
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
{ |
|
125
|
|
|
|
|
|
|
id => 13, |
|
126
|
|
|
|
|
|
|
code2 => 'AU', |
|
127
|
|
|
|
|
|
|
code3 => '', |
|
128
|
|
|
|
|
|
|
fc_name => 'australia', |
|
129
|
|
|
|
|
|
|
has_subcountries => 'Yes', |
|
130
|
|
|
|
|
|
|
name => 'Australia', |
|
131
|
|
|
|
|
|
|
timestamp => '2012-05-08 04:04:43', |
|
132
|
|
|
|
|
|
|
} |
|
133
|
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
One element of %$subcountries is $$subcountries{4276}: |
|
135
|
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
{ |
|
137
|
|
|
|
|
|
|
id => 4276, |
|
138
|
|
|
|
|
|
|
country_id => 13, |
|
139
|
|
|
|
|
|
|
code => 'AU-VIC', |
|
140
|
|
|
|
|
|
|
fc_name => 'victoria', |
|
141
|
|
|
|
|
|
|
name => 'Victoria', |
|
142
|
|
|
|
|
|
|
sequence => 5, |
|
143
|
|
|
|
|
|
|
timestamp => '2012-05-08 04:05:27', |
|
144
|
|
|
|
|
|
|
} |
|
145
|
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
=head3 Warnings |
|
147
|
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
# 1: These hashrefs use the table's primary key as the hashref's key. In the case of the I<countries> |
|
149
|
|
|
|
|
|
|
table, the primary key is the country's id, and is used as subcountries.country_id. But, in the case of |
|
150
|
|
|
|
|
|
|
the I<subcountries> table, the id does not have any meaning apart from being a db primary key. |
|
151
|
|
|
|
|
|
|
See L</What is the database schema?> for details. |
|
152
|
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
# 2: Do not assume subcountry names are unique within a country. |
|
154
|
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
L. |
|
156
|
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
=head2 Scripts which output to a file |
|
158
|
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
All scripts respond to the -h option. |
|
160
|
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
Some examples: |
|
162
|
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
shell>perl scripts/export.as.csv.pl -c countries.csv -s subcountries.csv |
|
164
|
|
|
|
|
|
|
shell>perl scripts/export.as.html.pl -w iso.3166-2.html |
|
165
|
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
This file is on-line at: L. |
|
167
|
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
shell>perl scripts/report.statistics.pl |
|
169
|
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
Output statistics: |
|
171
|
|
|
|
|
|
|
countries_in_db => 249. |
|
172
|
|
|
|
|
|
|
has_subcounties => 199. |
|
173
|
|
|
|
|
|
|
subcountries_in_db => 4593. |
|
174
|
|
|
|
|
|
|
subcountry_files_downloaded => 249. |
|
175
|
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
=head1 Description |
|
177
|
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
C<WWW::Scraper::Wikipedia::ISO3166> is a pure Perl module. |
|
179
|
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
It is used to download various ISO3166-related pages from Wikipedia, and to then import data |
|
181
|
|
|
|
|
|
|
(scraped from those pages) into an SQLite database. |
|
182
|
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
The pages have already been downloaded, so that phase only needs to be run when pages are updated. |
|
184
|
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
Likewise, the data has been imported. |
|
186
|
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
This means you would normally only ever use the database in read-only mode. |
|
188
|
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
Its components are: |
|
190
|
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
=over 4 |
|
192
|
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
=item o scripts/get.country.page.pl |
|
194
|
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
1: Downloads the ISO3166-1_alpha-3 page from Wikipedia. |
|
196
|
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
Input: L<http://en.wikipedia.org/wiki/ISO_3166-1_alpha-3>. |
|
198
|
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
Output: data/en.wikipedia.org.wiki.ISO_3166-2.3.html. |
|
200
|
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
2: Downloads the ISO3166-2 page from Wikipedia. |
|
202
|
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
Input: L<http://en.wikipedia.org/wiki/ISO_3166-2>. |
|
204
|
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
Output: data/en.wikipedia.org.wiki.ISO_3166-2.html. |
|
206
|
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
=item o scripts/populate.countries.pl |
|
208
|
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
Imports country data into an SQLite database. |
|
210
|
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
Inputs: data/en.wikipedia.org.wiki.ISO_3166-2.html, data/en.wikipedia.org.wiki.ISO_3166-2.3.html. |
|
212
|
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
Output: share/www.scraper.wikipedia.iso3166.sqlite. |
|
214
|
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
=item o scripts/get.subcountry.page.pl and scripts/get.subcountry.pages.pl |
|
216
|
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
Downloads each country's corresponding subcountries page. |
|
218
|
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
Source: http://en.wikipedia.org/wiki/ISO_3166:$code2.html. |
|
220
|
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
Output: data/en.wikipedia.org.wiki.ISO_3166-2.$code2.html. |
|
222
|
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
=item o scripts/populate.subcountry.pl and scripts/populate.subcountries.pl |
|
224
|
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
Imports subcountry data into the database. |
|
226
|
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
Source: data/en.wikipedia.org.wiki.ISO_3166-2.$code2.html. |
|
228
|
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
Output: share/www.scraper.wikipedia.iso3166.sqlite. |
|
230
|
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
Note: When the distro is installed, this SQLite file is installed too. |
|
232
|
|
|
|
|
|
|
See L</Where is the database?> for details. |
|
233
|
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
=item o scripts/export.as.csv.pl -c c.csv -s s.csv |
|
235
|
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
Exports the country and subcountry data as CSV. |
|
237
|
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
Input: share/www.scraper.wikipedia.iso3166.sqlite. |
|
239
|
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
Output: data/countries.csv and data/subcountries.csv. |
|
241
|
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
=item o scripts/export.as.html.pl -w c.html |
|
243
|
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
Exports the country and subcountry data as HTML. |
|
245
|
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
Input: share/www.scraper.wikipedia.iso3166.sqlite. |
|
247
|
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
Output: data/iso.3166-2.html. |
|
249
|
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
On-line: L. |
|
251
|
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
=item o scripts/get.statoids.pl |
|
253
|
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
Downloads some pages from L in case one day we need to convert from FIPS to ISO 3166-2. |
|
255
|
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
See data/List_of_FIPS_region_codes_*.html. |
|
257
|
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
=item o scripts/populate.fips.codes.pl |
|
259
|
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
This reads the files output by scripts/get.statoids.pl and produces 2 reports, data/wikipedia.fips.codes.txt |
|
261
|
|
|
|
|
|
|
and data/wikipedia.fips.mismatch.log. These are discussed in L</What FIPS data is included?>. |
|
262
|
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
=item o scripts/test.nfc.pl |
|
264
|
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
See L</Why did you use Unicode::Normalize's NFC() for sorting?> for a discussion of this script. |
|
266
|
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
=back |
|
268
|
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
=head1 Constructor and initialization |
|
270
|
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
new(...) returns an object of type C<WWW::Scraper::Wikipedia::ISO3166>. |
|
272
|
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
This is the class's constructor. |
|
274
|
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
Usage: C<< WWW::Scraper::Wikipedia::ISO3166 -> new() >>. |
|
276
|
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
This method takes a hash of options. |
|
278
|
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
Call C<new()> as C<< new(option_1 => value_1, option_2 => value_2, ...) >>. |
|
280
|
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
Available options (these are also methods): |
|
282
|
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
=over 4 |
|
284
|
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
=item o config_file => $file_name |
|
286
|
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
The name of the file containing config info, such as I and I. |
|
288
|
|
|
|
|
|
|
These are used by L. |
|
289
|
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
The code prefixes this name with the directory returned by L<File::ShareDir/dist_dir()>. |
|
291
|
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
Default: .htwww.scraper.wikipedia.iso3166.conf. |
|
293
|
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
=item o sqlite_file => $file_name |
|
295
|
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
The name of the SQLite database of country and subcountry data. |
|
297
|
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
The code prefixes this name with the directory returned by L<File::ShareDir/dist_dir()>. |
|
299
|
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
Default: www.scraper.wikipedia.iso3166.sqlite. |
|
301
|
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
=item o verbose => $integer |
|
303
|
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
Print more or less information. |
|
305
|
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
Default: 0 (print nothing). |
|
307
|
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
=back |
|
309
|
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
=head1 Distributions |
|
311
|
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
This module is available as a Unix-style distro (*.tgz). |
|
313
|
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
Install WWW::Scraper::Wikipedia::ISO3166 as you would for any C module: |
|
315
|
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
Run: |
|
317
|
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
cpanm WWW::Scraper::Wikipedia::ISO3166 |
|
319
|
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
or run: |
|
321
|
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
sudo cpan WWW::Scraper::Wikipedia::ISO3166 |
|
323
|
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
or unpack the distro, and then run: |
|
325
|
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
perl Makefile.PL |
|
327
|
|
|
|
|
|
|
make (or dmake) |
|
328
|
|
|
|
|
|
|
make test |
|
329
|
|
|
|
|
|
|
make install |
|
330
|
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
See L for details. |
|
332
|
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
See L for |
|
334
|
|
|
|
|
|
|
help on unpacking and installing. |
|
335
|
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
=head1 Methods |
|
337
|
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
=head2 config_file($file_name) |
|
339
|
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
Get or set the name of the config file. |
|
341
|
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
The code prefixes this name with the directory returned by L<File::ShareDir/dist_dir()>. |
|
343
|
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
Also, I<config_file> is an option to L</new()>. |
|
345
|
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
=head2 log($level => $s) |
|
347
|
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
Print $s at log level $level, if ($self -> verbose); |
|
349
|
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
Since $self -> verbose defaults to 0, nothing is printed by default. |
|
351
|
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
=head2 new() |
|
353
|
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
See L</Constructor and initialization>. |
|
355
|
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
=head2 sqlite_file($file_name) |
|
357
|
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
Get or set the name of the database file. |
|
359
|
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
The code prefixes this name with the directory returned by L<File::ShareDir/dist_dir()>. |
|
361
|
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
Also, I<sqlite_file> is an option to L</new()>. |
|
363
|
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
=head2 verbose($integer) |
|
365
|
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
Get or set the verbosity level. |
|
367
|
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
Also, I<verbose> is an option to L</new()>. |
|
369
|
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
=head1 FAQ |
|
371
|
|
|
|
|
|
|
|
|
372
|
|
|
|
|
|
|
=head2 Design faults in ISO3166 |
|
373
|
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
Where ISO3166 uses Country Name, I would have used Long Name and Short Name. |
|
375
|
|
|
|
|
|
|
|
|
376
|
|
|
|
|
|
|
Then we'd have: |
|
377
|
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
Long Name: Bolivia, Plurinational State of |
|
379
|
|
|
|
|
|
|
Short Name: Bolivia |
|
380
|
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
This distro uses the value directly from Wikipedia, which is what I have called 'Long Name', for |
|
382
|
|
|
|
|
|
|
all country and subcountry names. |
|
383
|
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
=head2 Where is the database? |
|
385
|
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
It is shipped in share/www.scraper.wikipedia.iso3166.sqlite. |
|
387
|
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
It is installed into the distro's shared dir, as returned by L<File::ShareDir/dist_dir()>. |
|
389
|
|
|
|
|
|
|
On my machine that's: |
|
390
|
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
/home/ron/perl5/perlbrew/perls/perl-5.14.2/lib/site_perl/5.14.2/auto/share/dist/WWW-Scraper-Wikipedia-ISO3166/www.scraper.wikipedia.iso3166.sqlite. |
|
392
|
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
=head2 What is the database schema? |
|
394
|
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
A single SQLite file holds 2 tables, I<countries> and I<subcountries>: |
|
396
|
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
countries subcountries |
|
398
|
|
|
|
|
|
|
--------- ------------ |
|
399
|
|
|
|
|
|
|
id id |
|
400
|
|
|
|
|
|
|
code2 country_id |
|
401
|
|
|
|
|
|
|
code3 code |
|
402
|
|
|
|
|
|
|
fc_name fc_name |
|
403
|
|
|
|
|
|
|
has_subcountries name |
|
404
|
|
|
|
|
|
|
name sequence |
|
405
|
|
|
|
|
|
|
timestamp timestamp |
|
406
|
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
I<code3> has a couple of special cases. 2 countries have no value for code3: |
|
408
|
|
|
|
|
|
|
Libyan Arab Jamahiriya and Sint Maarten. |
|
409
|
|
|
|
|
|
|
3-letter codes which almost match: LBY => Libya and MAF => Saint Martin (French part). |
|
410
|
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
I<subcountries.country_id> points to I<countries.id>. |
|
412
|
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
I<fc_name> is output from calling fc(decode('utf8', $name) ). |
|
414
|
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
For decode(), see L<Encode>. |
|
416
|
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
For fc(), see L<Unicode::CaseFold>. |
|
418
|
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
$name is from a Wikipedia page. |
|
420
|
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
I<has_subcountries> is 'Yes' or 'No'. |
|
422
|
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
I<name> is output from calling decode('utf8', $name). |
|
424
|
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
I<sequence> is a number (1 .. N) indicating the order in which subcountry names appear in the list |
|
426
|
|
|
|
|
|
|
on that subcountry's Wikipedia page. |
|
427
|
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
See the source code of L for details of the SQL |
|
429
|
|
|
|
|
|
|
used to create the tables. |
|
430
|
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
=head2 What do I do if I find a mistake in the data? |
|
432
|
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
What data? What mistake? How do you know it's wrong? |
|
434
|
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
Also, you must decide what exactly you were expecting the data to be. |
|
436
|
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
If the problem is the ISO data, report it to them. |
|
438
|
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
If the problem is the Wikipedia data, get agreement from everyone concerned and update Wikipedia. |
|
440
|
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
If the problem is the output from my code, try to identify the bug in the code and report it via the |
|
442
|
|
|
|
|
|
|
usual mechanism. See L</Support>. |
|
443
|
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
If the problem is with your computer's display of the data, consider (in alphabetical order): |
|
445
|
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
=over 4 |
|
447
|
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
=item o CSV |
|
449
|
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
Does the file display correctly in 'Emacs'? On the screen using 'less'? |
|
451
|
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
scripts/export.as.csv.pl uses: use open ':utf8'; |
|
453
|
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
Is that not working? |
|
455
|
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
=item o DBD::SQLite |
|
457
|
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
Did you set the sqlite_unicode attribute? Use something like: |
|
459
|
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
my($dsn) = 'dbi:SQLite:dbname=www.scraper.wikipedia.iso3166.sqlite'; # Sample only. |
|
461
|
|
|
|
|
|
|
my($attributes) = {AutoCommit => 1, RaiseError => 1, sqlite_unicode => 1}; |
|
462
|
|
|
|
|
|
|
my($dbh) = DBI -> connect($dsn, '', '', $attributes); |
|
463
|
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
The SQLite file ships in the share/ directory of the distro, and must be found by File::ShareDir |
|
465
|
|
|
|
|
|
|
at run time. |
|
466
|
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
Did you set the foreign_keys pragma (if needed)? Use: |
|
468
|
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
$dbh -> do('PRAGMA foreign_keys = ON'); |
|
470
|
|
|
|
|
|
|
|
|
471
|
|
|
|
|
|
|
=item o HTML |
|
472
|
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
The template htdocs/assets/templates/www/scraper/wikipedia/iso3166/iso3166.report.tx which ships with |
|
474
|
|
|
|
|
|
|
this distro contains this line: |
|
475
|
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
Is that not working? |
|
479
|
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
=item o Locale |
|
481
|
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
Here's my setup: |
|
483
|
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
shell>locale |
|
485
|
|
|
|
|
|
|
LANG=en_AU.utf8 |
|
486
|
|
|
|
|
|
|
LANGUAGE= |
|
487
|
|
|
|
|
|
|
LC_CTYPE="en_AU.utf8" |
|
488
|
|
|
|
|
|
|
LC_NUMERIC="en_AU.utf8" |
|
489
|
|
|
|
|
|
|
LC_TIME="en_AU.utf8" |
|
490
|
|
|
|
|
|
|
LC_COLLATE="en_AU.utf8" |
|
491
|
|
|
|
|
|
|
LC_MONETARY="en_AU.utf8" |
|
492
|
|
|
|
|
|
|
LC_MESSAGES="en_AU.utf8" |
|
493
|
|
|
|
|
|
|
LC_PAPER="en_AU.utf8" |
|
494
|
|
|
|
|
|
|
LC_NAME="en_AU.utf8" |
|
495
|
|
|
|
|
|
|
LC_ADDRESS="en_AU.utf8" |
|
496
|
|
|
|
|
|
|
LC_TELEPHONE="en_AU.utf8" |
|
497
|
|
|
|
|
|
|
LC_MEASUREMENT="en_AU.utf8" |
|
498
|
|
|
|
|
|
|
LC_IDENTIFICATION="en_AU.utf8" |
|
499
|
|
|
|
|
|
|
LC_ALL= |
|
500
|
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
=item o OS |
|
502
|
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
Unicode is a moving target. Perhaps your OS's installed version of unicode files needs updating. |
|
504
|
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
=item o SQLite |
|
506
|
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
Both Oracle and SQLite.org ship a program called sqlite3. They are not compatible. |
|
508
|
|
|
|
|
|
|
Which one are you using? I use the one from SQLite.org. |
|
509
|
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
AFAICT, sqlite3 does not have command line options, or options while running, to set unicode or pragmas. |
|
511
|
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
=back |
|
513
|
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
=head2 Why did you use L<Unicode::Normalize>'s NFC() for sorting? |
|
515
|
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
This question implies why not use NFD() instead. |
|
517
|
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
Run scripts/test.nfc.pl, and the output is: |
|
519
|
|
|
|
|
|
|
|
|
520
|
|
|
|
|
|
|
code2 => AX |
|
521
|
|
|
|
|
|
|
code3 => ALA |
|
522
|
|
|
|
|
|
|
fc_name => åland islands |
|
523
|
|
|
|
|
|
|
has_subcountries => No |
|
524
|
|
|
|
|
|
|
id => 15 |
|
525
|
|
|
|
|
|
|
name => Åland Islands |
|
526
|
|
|
|
|
|
|
timestamp => 2012-05-13 23:37:20 |
|
527
|
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
And this (Åland Islands) is what Wikipedia displays. So, NFC() it is. |
|
529
|
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
See L, and specifically prescription # 1. |
|
531
|
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
See also section 1.2 Normalization Forms in L. |
|
533
|
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
See also L. |
|
535
|
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
=head2 What is $ENV{AUTHOR_TESTING} used for? |
|
537
|
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
When this env var is 1, scripts output to share/*.sqlite within the distro's dir. That's how I populate the |
|
539
|
|
|
|
|
|
|
database tables. After installation, the database is elsewhere, and read-only, so you don't want the scripts |
|
540
|
|
|
|
|
|
|
writing to that copy anyway. |
|
541
|
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
At run-time, L<File::ShareDir> is used to find the installed version of *.sqlite. |
|
543
|
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
=head2 What FIPS data is included? |
|
545
|
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
Firstly, scripts/get.fips.pages.pl downloads some Wikipedia data, into data/List_of_FIPS_region_codes_*.html. |
|
547
|
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
Secondly, the latter files are parsed by scripts/populate.fips.codes.pl and the 2 reports are in |
|
549
|
|
|
|
|
|
|
data/wikipedia.fips.codes.txt, and data/wikipedia.fips.mismatch.log. |
|
550
|
|
|
|
|
|
|
|
|
551
|
|
|
|
|
|
|
This data is I<not> written into the SQLite database yet, but it's available in case it's included |
|
552
|
|
|
|
|
|
|
one day. |
|
553
|
|
|
|
|
|
|
|
|
554
|
|
|
|
|
|
|
=head1 Wikipedia's Terms of Use |
|
555
|
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
See L. |
|
557
|
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
Also, since I'm distributing copies of Wikipedia-sourced material, reformatted but not changed by editing, |
|
559
|
|
|
|
|
|
|
I hereby give notice that their material is released under CC-BY-SA. |
|
560
|
|
|
|
|
|
|
See L for that licence. |
|
561
|
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
=head1 References |
|
563
|
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
In no particular order: |
|
565
|
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
L |
|
567
|
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
L |
|
569
|
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
L |
|
571
|
|
|
|
|
|
|
|
|
572
|
|
|
|
|
|
|
L |
|
573
|
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
L |
|
575
|
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
This is a complex set of XML files concerning currency, postal, etc, formats and other details for various countries |
|
577
|
|
|
|
|
|
|
and/or languages. |
|
578
|
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
For Debian etc users: /usr/share/xml/iso-codes/iso_3166_2.xml, as installed from the iso-codes package, with: |
|
580
|
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
sudo apt-get install iso-codes |
|
582
|
|
|
|
|
|
|
|
|
583
|
|
|
|
|
|
|
L |
|
584
|
|
|
|
|
|
|
|
|
585
|
|
|
|
|
|
|
L |
|
586
|
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
L |
|
588
|
|
|
|
|
|
|
|
|
589
|
|
|
|
|
|
|
Check the Monthly Archives at Perl.com, starting in April 2012, for a series of Unicode-specific articles by |
|
590
|
|
|
|
|
|
|
Tom Christiansen. |
|
591
|
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
L |
|
593
|
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
L |
|
595
|
|
|
|
|
|
|
|
|
596
|
|
|
|
|
|
|
=head1 Repository |
|
597
|
|
|
|
|
|
|
|
|
598
|
|
|
|
|
|
|
L |
|
599
|
|
|
|
|
|
|
|
|
600
|
|
|
|
|
|
|
=head1 Support |
|
601
|
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
Email the author, or log a bug on RT: |
|
603
|
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
L. |
|
605
|
|
|
|
|
|
|
|
|
606
|
|
|
|
|
|
|
=head1 Author |
|
607
|
|
|
|
|
|
|
|
|
608
|
|
|
|
|
|
|
C<WWW::Scraper::Wikipedia::ISO3166> was written by Ron Savage I<E<lt>ron@savage.net.auE<gt>> in 2012. |
|
609
|
|
|
|
|
|
|
|
|
610
|
|
|
|
|
|
|
Home page: L. |
|
611
|
|
|
|
|
|
|
|
|
612
|
|
|
|
|
|
|
=head1 Copyright |
|
613
|
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
Australian copyright (c) 2012 Ron Savage. |
|
615
|
|
|
|
|
|
|
|
|
616
|
|
|
|
|
|
|
All Programs of mine are 'OSI Certified Open Source Software'; |
|
617
|
|
|
|
|
|
|
you can redistribute them and/or modify them under the terms of |
|
618
|
|
|
|
|
|
|
The Artistic License, a copy of which is available at: |
|
619
|
|
|
|
|
|
|
http://www.opensource.org/licenses/index.html |
|
620
|
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
=cut |