File Coverage

blib/lib/Lingua/EN/GivenNames.pm
Criterion Covered Total %
statement 27 53 50.9
branch 0 6 0.0
condition 0 12 0.0
subroutine 9 12 75.0
pod 2 2 100.0
total 38 85 44.7


line stmt bran cond sub pod time code
1             package Lingua::EN::GivenNames;
2              
3 1     1   21027 use feature qw/say unicode_strings/;
  1         2  
  1         167  
4 1     1   786 use open qw(:std :utf8);
  1         1281  
  1         5  
5 1     1   106 use strict;
  1         7  
  1         39  
6 1     1   6 use warnings;
  1         2  
  1         24  
7 1     1   3 use warnings qw(FATAL utf8);
  1         1  
  1         32  
8              
9 1     1   779 use Config::Tiny;
  1         974  
  1         26  
10              
11 1     1   972 use File::ShareDir;
  1         7151  
  1         57  
12 1     1   9 use File::Spec;
  1         1  
  1         23  
13              
14 1     1   849 use Hash::FieldHash ':all';
  1         1688  
  1         859  
15              
16             fieldhash my %config => 'config';
17             fieldhash my %config_file => 'config_file';
18             fieldhash my %data_dir => 'data_dir';
19             fieldhash my %sex => 'sex';
20             fieldhash my %share_dir => 'share_dir';
21             fieldhash my %sqlite_file => 'sqlite_file';
22             fieldhash my %verbose => 'verbose';
23              
24             our $VERSION = '1.00';
25              
26             # -----------------------------------------------
27              
28             sub _init
29             {
30 0     0     my($self, $arg) = @_;
31 0   0       $$arg{config_file} ||= '.ht.lingua.en.givennames.conf'; # Caller can set.
32 0           $$arg{data_dir} = 'data';
33 0   0       $$arg{sex} ||= ''; # Caller can set.
34 0           $$arg{share_dir} = '';
35 0   0       $$arg{sqlite_file} ||= 'lingua.en.givennames.sqlite'; # Caller can set.
36 0   0       $$arg{verbose} ||= 0; # Caller can set.
37 0           $self = from_hash($self, $arg);
38 0           (my $package = __PACKAGE__) =~ s/::/-/g;
39              
40 0 0         $self -> share_dir($ENV{AUTHOR_TESTING} ? 'share' : File::ShareDir::dist_dir($package) );
41 0           $self -> config_file(File::Spec -> catfile($self -> share_dir, $self -> config_file) );
42 0           $self -> config(Config::Tiny -> read($self -> config_file) );
43              
44 0 0         die Config::Tiny -> errstr if (Config::Tiny -> errstr);
45              
46 0           $self -> sqlite_file(File::Spec -> catfile($self -> share_dir, $self -> sqlite_file) );
47              
48 0           binmode STDOUT;
49              
50 0           $self -> log(debug => 'Config file: ' . $self -> config_file);
51 0           $self -> log(debug => 'SQLite file: ' . $self -> sqlite_file);
52              
53 0           return $self;
54              
55             } # End of _init.
56              
57             # -----------------------------------------------
58              
59             sub log
60             {
61 0     0 1   my($self, $level, $s) = @_;
62 0   0       $level ||= 'debug';
63 0   0       $s ||= '';
64              
65 0 0         say "$level: $s" if ($self -> verbose);
66              
67             } # End of log.
68              
69             # -----------------------------------------------
70              
71             sub new
72             {
73 0     0 1   my($class, %arg) = @_;
74 0           my($self) = bless {}, $class;
75 0           $self = $self -> _init(\%arg);
76              
77 0           return $self;
78              
79             } # End of new.
80              
81             # -----------------------------------------------
82              
83             1;
84              
85             =pod
86              
87             =head1 NAME
88              
89             Lingua::EN::GivenNames - An SQLite database of derivations of English given names
90              
91             =head1 Synopsis
92              
93             L I for English given names. You do not need to run the script
94             which downloads pages from there. That web site, though, does have names for 13 other languages, if you wish
95             to adapt this distro for a different language.
96              
97             So, just use the SQLite database shipped with this module, as discussed next, or scripts/export.pl to output to
98             CSV or HTML.
99              
100             The database has been exported as L.
101             This on-line version was created with scripts/export.pl's I switch set to 1.
102              
103             The database is also shipped as data/given.names.csv and data/given.names.html, although this latter page
104             was created with scripts/export.pl's I switch set to 0.
105              
106             =head2 Basic Usage
107              
108             This is the simplest way to access the data.
109              
110             use Lingua::EN::GivenNames::Database;
111              
112             my($database) = Lingua::EN::GivenNames::Database -> new;
113              
114             # $names is an arrayref of hashrefs.
115              
116             my($names) = $database -> read_names_table;
117              
118             Each element in @$names contains a hashref of data for 1 record in the database, and has these keys
119             (in alphabetical order):
120              
121             {
122             derivation => The derivation,
123             fc_name => The case-folded name,
124             form => The form,
125             id => The primary key of this record,
126             kind => The kind,
127             meaning => The meaning,
128             name => The name,
129             original => The original (name),
130             rating => The rating (reliability indicator),
131             sex => The sex,
132             source => The source (language or name),
133             }
134              
135             The most important fields are: name, sex and derivation.
136              
137             Here, sex means the classification of the name into I<female> or I<male> within the web site which was scraped
138             to provide the given name data.
139              
140             See L entries for details.
141              
142             =head2 Scripts which output to a file
143              
144             scripts/export.pl responds to the -h option.
145              
146             Some examples, with output files that happen to be the defaults:
147              
148             shell>perl scripts/export.pl -csv_file given.names.csv
149             shell>perl scripts/export.pl -web_page_file given.names.html -j 1
150              
151             =head1 Description
152              
153             C<Lingua::EN::GivenNames> is a pure Perl module.
154              
155             It is used to download various English given names-related pages from 20000-names.com, and to then
156             import data scraped from those pages into an SQLite database.
157              
158             The pages have already been downloaded, so that phase only needs to be run when pages are updated.
159             Likewise, the data has been imported.
160              
161             This means you would normally only ever use the database in read-only mode, as per the L.
162              
163             =head1 Constructor and initialization
164              
165             new(...) returns an object of type C<Lingua::EN::GivenNames>.
166              
167             This is the class's constructor.
168              
169             Usage: C<< Lingua::EN::GivenNames -> new() >>.
170              
171             This method takes a hash of options.
172              
173             Call C<new()> as C<< new(option_1 => value_1, option_2 => value_2, ...) >>.
174              
175             Available options (these are also methods):
176              
177             =over 4
178              
179             =item o config_file => $file_name
180              
181             The name of the file containing config info, such as I and I, as used by various modules.
182              
183             The code prefixes this name with the directory returned by L on the end-user's
184             machine, and prefixes it with a simple 'share' on the author's machine (i.e. when $ENV{AUTHOR_TESTING} is 1).
185              
186             Default: .ht.lingua.en.givennames.conf.
187              
188             =item o sex => $male_or_female
189              
190             Some scripts (scripts/extract.derivations.pl and scripts/get.name.pages.pl) set this parameter to 'male' or
191             'female' as needed. See scripts/import.sh for details.
192              
193             Default: ''.
194              
195             =item o sqlite_file => $file_name
196              
197             The name of the SQLite database of given name data.
198              
199             The code prefixes this name with the directory returned by L or with 'share',
200             as explained under I just above.
201              
202             Default: lingua.en.givennames.sqlite.
203              
204             =item o verbose => $integer
205              
206             Print more or less information.
207              
208             Default: 0 (print nothing).
209              
210             =back
211              
212             =head1 Distributions
213              
214             This module is available as a Unix-style distro (*.tgz).
215              
216             Install Lingua::EN::GivenNames as you would for any C<Perl> module:
217              
218             Run:
219              
220             cpanm Lingua::EN::GivenNames
221              
222             or run:
223              
224             sudo cpan Lingua::EN::GivenNames
225              
226             or unpack the distro, and then run:
227              
228             perl Makefile.PL
229             make (or dmake)
230             make test
231             make install
232              
233             See L for details.
234              
235             See L for
236             help on unpacking and installing.
237              
238             =head1 Methods
239              
240             =head2 config()
241              
242             Returns the hashref of config data as read by L. Used like this:
243              
244             my($config) = $self -> config;
245             my($css_url) = $$config{_}{css_url}; # Note the '_' hash key!
246              
247             =head2 config_file($file_name)
248              
249             Get or set the name of the config file.
250              
251             The code prefixes this name with the directory returned by L.
252              
253             Also, I is an option to L.
254              
255             =head2 data_dir()
256              
257             Returns the name of the data dir within the distro, which is the constant 'data'.
258              
259             =head2 log($level => $s)
260              
261             Print $s at log level $level, if ($self -> verbose);
262              
263             Since $self -> verbose defaults to 0, nothing is printed by default.
264              
265             =head2 new()
266              
267             See L.
268              
269             =head2 sex($male_or_female)
270              
271             Gets and sets the sex attribute, as used by scripts/extract.derivations.pl and scripts/get.name.pages.pl.
272              
273             Also, I is an option to L.
274              
275             =head2 share_dir()
276              
277             Returns the name of the share dir. When $ENV{AUTHOR_TESTING} is 1, this will be 'share', within the distro.
278             And when $ENV{AUTHOR_TESTING} is 0 (i.e. on an end-user machine), it will be the directory returned by
279             L.
280              
281             =head2 sqlite_file($file_name)
282              
283             Get or set the name of the database file.
284              
285             The code prefixes this name with the directory returned by L.
286              
287             Also, I is an option to L.
288              
289             =head2 verbose($integer)
290              
291             Get or set the verbosity level.
292              
293             Also, I is an option to L.
294              
295             =head1 FAQ
296              
297             =head2 What does L's read_names_table() return?
298              
299             It returns an arrayref of hashrefs.
300              
301             Each element in the arrayref contains data for 1 record built from the names table, and has these keys
302             (in alphabetical order):
303              
304             {
305             derivation => The derivation,
306             fc_name => The case-folded name,
307             form => The form,
308             id => The primary key of this record,
309             kind => The kind,
310             meaning => The meaning,
311             name => The name,
312             original => The original (name),
313             rating => The rating (reliability indicator),
314             sex => The sex,
315             source => The source (language or name),
316             }
317              
318             Details:
319              
320             =over 4
321              
322             =item o derivation
323              
324             This is the name field from the derivations table.
325              
326             =item o fc_name
327              
328             This is the case-folded version of the name field (below).
329              
330             =item o form
331              
332             This is the name field from the forms table.
333              
334             =item o id
335              
336             This is the primary key in the names table.
337              
338             =item o kind
339              
340             This is the name field from the kinds table.
341              
342             =item o meaning
343              
344             This is the name field from the meanings table.
345              
346             =item o name
347              
348             This is, finally, the name itself.
349              
350             =item o original
351              
352             This is the name field from the originals table.
353              
354             =item o rating
355              
356             This is the name field from the ratings table.
357              
358             =item o sex
359              
360             This is the name field from the sexes table.
361              
362             The value is the classification of the name into I<female> or I<male> within the web site which was scraped
363             to provide the given name data.
364              
365             =item o source
366              
367             This is the name field from the sources table.
368              
369             =back
370              
371             =head2 Are the input web pages difficult to process?
372              
373             Yes! Some pages contain names in various character encodings, making the derivation analysis very
374             difficult.
375              
376             Examples of the many, many things to watch out for are:
377              
378             =over 4
379              
380             =item o data/female_english_names.htm line 4913
381              
382             =item o data/female_english_names_05.htm line 3284
383              
384             =item o The hex char \xC2
385              
386             This appears all over the place.
387              
388             =item o Nested web pages
389              
390             The pages contain the names in a table of 1 row and 1 column, within which is a long list
391             of the <li> entries I parse.
392              
    393             But elsewhere on the pages, entire web pages have been jammed into table cells. Thanx FrontPage!
    394              
    395             =back
    396              
    397             =head2 Where is the database?
    398              
    399             It is shipped in share/lingua.en.givennames.sqlite.
    400              
    401             It is installed into the distro's shared dir, as returned by L.
    402             On my machine that's:
    403              
    404             /home/ron/perl5/perlbrew/perls/perl-5.14.2/lib/site_perl/5.14.2/auto/share/dist/Lingua-EN-GivenNames/lingua.en.givennames.sqlite.
    405              
    406             =head2 Where is the config file?
    407              
    408             It is shipped in share/.ht.lingua.en.givennames.conf.
    409              
    410             It is installed into the distro's shared dir, along with the database.
    411              
    412             =head2 What is the database schema?
    413              
    414             See data/schema.png.
    415              
    416             The table names are: forms, kinds, meanings, names, originals, ratings, sexes and sources,
    417             with names being the main table.
    418              
    419             These are the columns in the names table:
    420              
    421             =over 4
    422              
    423             =item o derivation_id
    424              
    425             This is a foreign key pointing to the id column of the derivations table. See data/schema.png.
    426              
    427             The name field in the derivations table is constructed from various fields in the input,
    428             in one of the following ways. These fields are extracted from the input using capturing parentheses
    429             in regexps.
    430              
    431             =over 4
    432              
    433             =item o qq|$$item{kind} $$item{form}, $$item{rating} $$item{meaning}|
    434              
    435             That is, for a given name, the kind field in the input is put into the kinds table, and the
    436             id which results from that insertion goes into the kind_id field in the names table. Likewise for the
    437             other components in this derivation.
    438              
    439             This is used when the regexp in L sub parse_derivations()
    440             is type 'c', and hence there is no field in the input which can be extracted and put into the
    441             originals table. In this case, the name field in the originals table is '-'. The id in the originals
    442             table will, in this case, be 1 and the original_id field in the names table will also be 1.
    443             Note: whenever the name field in the originals table is '-', then the name in the sources table is
    444             also '-'.
    445              
    446             =item o qq|$$item{kind} $$item{form} of $$item{source} $$item{original}, $$item{rating} $$item{meaning}|
    447              
    448             This is used for regexp types 'a', 'b' and 'd', when a meaningful value for original can be extracted
    449             from the input.
    450              
    451             =back
    452              
    453             In other words, when extracting data from the various tables, if you wish to reconstruct the value
    454             in the derivations table from the foreign keys in the names table, then one of these syntaxes must
    455             be used to build the original derivation scraped from the web pages. To save you that effort is
    456             of course why the derivations table is provided, and which is accessed via the derivation_id in the
    457             names table.
    458              
    459             =item o fc_name
    460              
    461             This is the case-folded version of the name field (below).
    462              
    463             =item o form_id
    464              
    465             This is a foreign key pointing to the id column of the forms table.
    466              
    467             If we say the name 'Tonya' is the English equivalent of the Italian/Spanish 'Tonia', then the
    468             'equivalent' component comes from the forms table.
    469              
    470             =item o id
    471              
    472             This is the primary key.
    473              
    474             =item o kind_id
    475              
    476             This is a foreign key pointing to the id column of the kinds table.
    477              
    478             If we say the name 'Tonya' is the English equivalent of the Italian/Spanish 'Tonia', then the
    479             'English' component of that derivation comes from the kinds table.
    480              
    481             =item o meaning_id
    482              
    483             This is a foreign key pointing to the id column of the meanings table.
    484              
    485             Given the derivation of Tonya as 'English equivalent of Italian/Spanish Tonia, a short form of Latin Antonia, possibly meaning "invaluable"',
    486             then the component "invaluable" comes from the meanings table.
    487              
    488             =item o name
    489              
    490             This is the name itself.
    491              
    492             =item o original_id
    493              
    494             This is a foreign key pointing to the id column of the originals table.
    495              
    496             Given the derivation of Tonya as 'English equivalent of Italian/Spanish Tonia, a short form of Latin Antonia, possibly meaning "invaluable"',
    497             then the component 'Tonia, a short form of Latin Antonia' comes from the originals table.
    498              
    499             =item o rating_id
    500              
    501             This is a foreign key pointing to the id column of the ratings table.
    502              
    503             The value in the ratings table gives an indicator of the reliability of the meaning of the name,
    504             where the meaning comes from the meanings table.
    505              
    506             The value will be one of:
    507              
    508             =over 4
    509              
    510             =item o meaning
    511              
    512             It just means what it means.
    513              
    514             =item o meaning both
    515              
    516             That is, the name has 2 meanings.
    517              
    518             Thus the name 'Bonny' means both "good" and "pretty".
    519              
    520             =item o meaning either
    521              
    522             That is, there is doubt as to which of the 2 meanings is most reliable. The name field in the
    523             corresponding meanings table will have 2 separate meanings in double-quotes.
    524              
    525             Thus the name 'Ailward' has the meaning "noble guard" or "elf guard".
    526              
    527             =item o meaning simply
    528              
    529             Thus the name 'Brande' means simply "brandy".
    530              
    531             =item o possibly meaning
    532              
    533             Thus the name 'Raelene' possibly means "sunbeam".
    534              
    535             =back
    536              
    537             =item o sex_id
    538              
    539             This is a foreign key pointing to the id column of the sexes table.
    540              
    541             The value in the sexes table, female or male, is how the web site classified the name.
    542             So, female means the name came from one of the data/female_english_names*.htm files. Likewise for male.
    543              
    544             =item o source_id
    545              
    546             This is a foreign key pointing to the id column of the sources table.
    547              
    548             The value in the sources table is often a language, e.g. 'Italian/Spanish'.
    549              
    550             Thus when we say the name 'Tonya' is the English equivalent of the Italian/Spanish 'Tonia', this means
    551             'Tonya' is sourced from 'Tonia' in Italian/Spanish.
    552              
    553             =back
    554              
    555             =head2 What do I do if I find a mistake in the data?
    556              
    557             What data? What mistake? How do you know it's wrong?
    558              
    559             Also, you must decide what exactly you were expecting the data to be.
    560              
    561             Firstly, report your claim to the webmaster at L<20000-names.com>.
    562              
    563             Note: The input data is partially free-form, as per the original web pages, and commentary
    564             as used on those pages I.
    565              
    566             So, perhaps the solution lies in making the regexps in L smarter.
    567              
    568             Another possibility is to pre-process one or both of the input files data/derivations.raw and
    569             data/derivations.csv before they are processed. The next question discusses how to intervene in the
    570             data flow.
    571              
    572             =head2 How do the scripts and modules interact to produce the data?
    573              
    574             Recall from above that the web site L<20000-names.com> I. The output files from that
    575             step are in data/*.htm.
    576              
    577             The database tables are created with:
    578              
    579             scripts/drop.tables.pl
    580             scripts/create.tables.pl
    581              
    582             Then the data is processed with (see scripts/import.sh):
    583              
    584             Input files: data/*.htm
    585             Reader: scripts/extract.derivations.pl
    586             Output file: data/derivations.raw
    587             Reader: scripts/parse.derivations.pl
    588             Output file: data/derivations.csv
    589             Reader: scripts/import.derivations.pl
    590             Output file: share/lingua.en.givennames.sqlite (when $ENV{AUTHOR_TESTING} == 1)
    591             Reader: scripts/export.pl
    592             Output file: data/given.names.html
    593              
    594             Scripts (in alphabetical order):
    595              
    596             =over 4
    597              
    598             =item o scripts/create.tables.pl
    599              
    600             Creates all the database tables. Remember to run drop.tables.pl first if the tables already exist.
    601              
    602             =item o scripts/drop.tables.pl
    603              
    604             Drops all the database tables. Then run create.tables.pl immediately afterwards.
    605              
    606             =item o scripts/export.pl
    607              
    608             This script obviously reads the database and outputs the expected data. It uses
    609             L, and command line options -csv_file or -web_page_file.
    610              
    611             =item o scripts/extract.derivations.pl
    612              
    613             This script is run once each for 20 pages of female names and once each for 17 pages of male names.
    614             It uses L.
    615              
    616             =item o scripts/extract.parse.sh
    617              
    618             Run scripts/extract.derivations.pl and then scripts/parse.derivations.pl on one page for one sex.
    619             This script is used only by the author while developing the module.
    620              
    621             =item o scripts/get.name.pages.pl
    622              
    623             This script is run once to get 20 pages of female names and once to get 17 pages of male names.
    624             It uses L.
    625              
    626             =item o scripts/import.derivations.pl
    627              
    628             This script actually writes the database tables. It uses L.
    629              
    630             =item o scripts/import.sh
    631              
    632             That sequence of commands (above) is performed by scripts/import.sh.
    633              
    634             To re-create the database, do this:
    635              
    636             =over 4
    637              
    638             =item o shell> AUTHOR_TESTING=1
    639              
    640             This will tell the code to write to share/lingua.en.givennames.sqlite, rather than to the installed database.
    641             The latter is probably read-only, anyway.
    642              
    643             =item o shell> export AUTHOR_TESTING
    644              
    645             =item o shell> scripts/import.sh
    646              
    647             This runs all the appropriate scripts in one hit. The output is worth examining to get some idea of what happens.
    648              
    649             =back
    650              
    651             =item o scripts/parse.derivations.pl
    652              
    653             Besides outputting data/derivations.csv, this script also outputs data/mismatches.log and
    654             data/parse.log. It uses L.
    655              
    656             See L for more about the mismatches file.
    657              
    658             Also, this script uses data/unparsable.txt to skip some names. Further, it currently skips names which
    659             are not all ASCII characters.
    660              
    661             =item o scripts/pod2html.sh
    662              
    663             A bash script to convert all *.pm files into HTML under my web server's doc root.
    664              
    665             =item o scripts/report.name.pl
    666              
    667             Takes a '-name $name' parameter. Samples:
    668              
    669             1) perl -Ilib scripts/report.name.pl -n Abaegayle
    670              
    671             derivation Variant spelling of English Abigail, meaning "father rejoices"
    672             fc_name abaegayle
    673             form spelling
    674             id 8
    675             kind Variant
    676             meaning "father rejoices"
    677             name Abaegayle
    678             original Abigail
    679             rating meaning
    680             sex female
    681             source English
    682              
    683             Consult L for the 6 ways to spell
    684             Abagail.
    685              
    686             2) perl scripts/report.name.pl -n Zoe
    687              
    688             derivation Greek name, meaning "life"
    689             fc_name zoe
    690             form name
    691             id 3962
    692             kind Greek
    693             meaning "life"
    694             name Zoe
    695             original -
    696             rating meaning
    697             sex female
    698             source -
    699              
    700             =item o scripts/report.statistics.pl
    701              
    702             Currently prints these database statistics:
    703              
    704             Table Records
    705             derivations 3062
    706             forms 15
    707             kinds 52
    708             meanings 1356
    709             names 3967
    710             originals 2393
    711             ratings 5
    712             sexes 2
    713             sources 56
    714              
    715             =item o scripts/report.stop.words.pl
    716              
    717             This uses Lingua::EN::StopWordList to report any stop words which happened to be picked up by the regexps
    718             used to parse the web page data.
    719              
    720             Currently prints this report:
    721              
    722             Table 'sources' contains these stop words: of
    723             Table 'forms' contains these stop words: from, name
    724              
    725             =item o scripts/test.pattern.pl
    726              
    727             This is code I use to test new regexps before putting them into production in sub parse_derivations()
    728             in L.
    729              
    730             =back
    731              
    732             =head2 What is $ENV{AUTHOR_TESTING} used for?
    733              
    734             When this env var is 1, scripts output to share/*.sqlite within the distro's dir. That's how I populate the
    735             database tables. After installation, the database is elsewhere, and read-only, so you don't want the scripts
    736             writing to that copy anyway.
    737              
    738             After end-user installation, L is used to find the installed version of *.sqlite.
    739              
    740             =head2 TODO
    741              
    742             Mismatches, output from analyzing the web pages, are shipped in data/mismatches.log. The next step is to
    743             extend the list of regexps in L's sub parse_derivations() to
    744             capture more derivations.
    745              
    746             The mismatch file is sorted and reformatted compared to the data/derivations.*, to make it easy to use to
    747             build new regexps.
    748              
    749             =head2 Why don't you use Perl6::Slurp to read files?
    750              
    751             Because I found it (V 0.051000) did not respect the 'raw' file encoding option I specified.
    752              
    753             =head1 Non-English names
    754              
    755             The web site L<20000-names.com> has names in various other languages, for those wishing to adapt
    756             this code to deal with those cases.
    757              
    758             =head1 Support
    759              
    760             Email the author, or log a bug on RT:
    761              
    762             L.
    763              
    764             =head1 Author
    765              
    766             C<Lingua::EN::GivenNames> was written by Ron Savage <ron@savage.net.au> in 2012.
    767              
    768             Home page: L.
    769              
    770             =head1 Copyright
    771              
    772             Australian copyright (c) 2012 Ron Savage.
    773              
    774             All Programs of mine are 'OSI Certified Open Source Software';
    775             you can redistribute them and/or modify them under the terms of
    776             The Artistic License, a copy of which is available at:
    777             http://www.opensource.org/licenses/index.html
    778              
    779              
    780             =cut