File Coverage

blib/lib/Wiktionary/Parser.pm
Criterion Covered Total %
statement 55 83 66.2
branch 11 24 45.8
condition 1 2 50.0
subroutine 9 14 64.2
pod 2 8 25.0
total 78 131 59.5


line stmt bran cond sub pod time code
1             package Wiktionary::Parser;
2              
3 2     2   40554 use strict;
  2         4  
  2         72  
4 2     2   10 use warnings;
  2         3  
  2         53  
5 2     2   1217 use Data::Dumper;
  2         10456  
  2         122  
6              
7 2     2   4018 use MediaWiki::API;
  2         165042  
  2         77  
8 2     2   1729 use Wiktionary::Parser::Document;
  2         13  
  2         87  
9 2     2   2042 use Carp::Always;
  2         1512  
  2         4020  
10              
11             our $VERSION = 0.11;
12              
13             my $CACHE;
14              
15             sub new {
16 1     1 1 20 my $class = shift;
17 1         2 my %args = @_;
18              
19 1   50     8 $args{wiktionary_url} ||= 'http://en.wiktionary.org/w/api.php';
20              
21 1         3 my $self = bless \%args, $class;
22              
23 1         6 $self->{cache} = 0; # 1: cache content locally, 0: don't
24 1         6 $self->{mediawiki_client} = MediaWiki::API->new({ api_url => $self->get_wiktionary_url() });
25              
26 1         16678 return $self;
27             }
28              
29              
30             # create the base url
31             # add http if it hasn't already been prepended
32             sub get_wiktionary_url {
33 1     1 0 2 my $self = shift;
34 1         2 my $url = $self->{wiktionary_url};
35 1         2 $url =~ s|/$||;
36 1 50       18 return sprintf("%s$url", $url =~ m|^https?://| ? '' : 'http://');
37             }
38              
39              
40             sub get_document {
41 0     0 1 0 my $self = shift;
42 0         0 my %args = @_;
43 0         0 my $title = $args{title};
44              
45 0 0       0 if ($self->get_cache($title)) {
46 0         0 return $self->get_cache($title);
47             }
48              
49 0         0 my $page_data = $self->get_page_data(
50             title => $title,
51             );
52 0         0 my $content = $page_data->{'*'};
53              
54 0 0       0 return unless $content;
55              
56 0         0 my $document = $self->parse_page_content(
57             content => $content,
58             title => $title,
59             );
60              
61 0         0 $self->set_cache($title,$document);
62              
63 0         0 return $document;
64             }
65              
66              
67             sub get_cache {
68 0     0 0 0 my $self = shift;
69 0 0       0 return unless $self->{cache};
70 0         0 my $title = shift;
71 0         0 return $CACHE->{ $self->get_wiktionary_url() }{$title};
72             }
73              
74             sub set_cache {
75 0     0 0 0 my $self = shift;
76 0 0       0 return unless $self->{cache};
77 0         0 my $title = shift;
78 0         0 my $document = shift;
79 0         0 $CACHE->{ $self->get_wiktionary_url() }{$title} = $document;
80             }
81              
82              
83             sub parse_page_content {
84 1     1 0 80 my $self = shift;
85 1         6 my %args = @_;
86 1         4 my $document_content = $args{content};
87 1         3 my $title = $args{title};
88              
89 1 50       7 die 'document not defined' unless defined $document_content;
90              
91 1         250 my @lines = split(/\n/,$document_content);
92 1         23 my %sections = ();
93 1         3 my @section_number = ();
94              
95 1         14 my $document = Wiktionary::Parser::Document->new(
96             title => $title,
97             # pass in a parser if we want this document to follow
98             # links to other documents (e.g. wikisaurus)
99             # and include metadata from them
100             parser => $self,
101             );
102              
103 1         7 my $current_section = $document->create_section(
104             section_number => 0,
105             header => $title,
106             );
107              
108 1         3 for my $line (@lines) {
109 646         810 chomp $line;
110 646 100       1158 next unless $line;
111 577 100       3432 if ($line =~ m/^(==+)([^=]+)/) {
    100          
    100          
112 40         82 my $markup = $1;
113 40         65 my $header = $2;
114 40         60 my $n = length($markup);
115 40         65 $section_number[$n-2]++;
116 40         70 $#section_number = $n-2;
117 40 50       63 my $section_number = join('.',map {$_ || ()} @section_number);
  79         248  
118              
119 40         156 $current_section = $document->create_section(
120             section_number => $section_number,
121             header => $header,
122             );
123             } elsif ($line =~ m/^\[\[(Category:[^\]]+)\]\]/) {
124             # e.g. [[Category:Animals]]
125 17         52 $document->add_category(category => $1);
126             } elsif ($line =~ m/^\[\[([a-z]+:$title)\]\]/) {
127             # e.g. [[de:dog]]
128 51         142 $document->add_language_link(tag => $1);
129             } else {
130 469         1457 $current_section->add_content($line);
131             }
132             }
133              
134 1         52 return $document;
135             }
136              
137             sub get_page_data {
138 0     0 0   my $self = shift;
139 0           my %args = @_;
140 0           my $title = $args{title};
141            
142 0 0         die 'title is not defined' unless defined $title;
143              
144 0           my $page_data = $self->get_mediawiki_client()->get_page({title => $title});
145              
146 0           return $page_data;
147             }
148              
149             sub get_mediawiki_client {
150 0     0 0   my $self = shift;
151 0           return $self->{mediawiki_client};
152             }
153              
154              
155              
156              
157             1;
158              
159             =head1 Name
160              
161             Wiktionary::Parser - Client and Parser of content from the Wiktionary API
162              
163             =head1 Synopsis
164              
165             This package may be used to query the Wiktionary API (en.wiktionary.org/w/api.php) for documents by title. It parses the resulting MediaWiki document and provides access to data structures containing word senses, translations, synonyms, parts of speech, etc. It also provides access to the raw content of each MediaWiki section should you wish to extract other data on your own, or build on top of this package.
166              
167             The repository for this package is available on github at https://github.com/clbecker/perl-wiktionary-parser
168             And on CPAN at: http://search.cpan.org/~clbecker/Wiktionary-Parser/
169              
170             =head1 Usage
171              
172             my $parser = Wiktionary::Parser->new();
173            
174             my $document = $parser->get_document(title => 'bunny');
175            
176             my $translation_hashref = $document->get_translations();
177             my $word_sense_hashref = $document->get_word_senses();
178             my $parts_of_speech_hashref = $document->get_parts_of_speech();
179             my $pronunciations_hashref = $document->get_pronunciations();
180             my $synonyms_hashref = $document->get_synonyms();
181             my $hyponyms_hashref = $document->get_hyponyms();
182             my $hypernyms_hashref = $document->get_hypernyms();
183             my $antonyms_hashref = $document->get_antonyms();
184             my $derived_terms_hashref = $document->get_derived_terms();
185              
186             my $section_hashref = $document->get_sections();
187             my $sub_document = $document->get_sub_document(title => 'string or regex');
188             my $table_of_contents_arrayref = $document->get_table_of_contents();
189              
190              
191              
192             =head2 Methods for Wiktionary::Parser
193              
194             =over
195              
196             =item B<new>
197              
198             Create an instance of the Wiktionary::Parser. This object is used to contact the Wiktionary API, and parse the results.
199              
200             my $parser = Wiktionary::Parser->new();
201              
202              
203             =item B<get_document> (title => TITLE)
204              
205             Contacts the wiktionary API, and downloads the page with the given title. It then parses the content and returns a Wiktionary::Parser::Document object that you can call further methods on.
206              
207             my $document = $parser->get_document(title => 'orange');
208              
209             =back
210              
211              
212             =head2 Methods for Wiktionary::Parser::Document
213              
214             See https://github.com/clbecker/perl-wiktionary-parser/wiki for details and examples on methods for the Wiktionary::Parser::Document object.
215              
216             =over
217              
218              
219             =item B<get_derived_terms>
220              
221             Returns a reference to a hash mapping language to a list of derived terms and phrases
222              
223             my $derived_words = $document->get_derived_terms();
224              
225             print Dumper $derived_words;
226            
227             $VAR1 = {
228             'en' => [
229             'bergamot orange',
230             'bitter orange',
231             'blood orange',
232             'burnt orange',
233             ...
234             ],
235             ...
236             }
237              
238             =item B<get_parts_of_speech>
239              
240             Returns a reference to a hash mapping language to a list of parts of speech. See https://github.com/clbecker/perl-wiktionary-parser/wiki/Parts-of-speech for details.
241              
242              
243             my $parts_of_speech = $document->get_parts_of_speech();
244            
245             print Dumper $parts_of_speech;
246            
247             $VAR1 = {
248             'en' => {
249             'language' => 'English',
250             'part_of_speech' => [
251             'noun',
252             'adj',
253             'verb'
254             ]
255             },
256             'sv' => {
257             'language' => 'Swedish',
258             'part_of_speech' => [
259             'adjective',
260             'noun'
261             ]
262             },
263             ...
264             }
265              
266              
267             =item B<get_pronunciations>
268              
269             Returns a reference to a hash mapping language to pronunciation metadata. See https://github.com/clbecker/perl-wiktionary-parser/wiki/pronunciations for details.
270              
271             my $pronunciations = $document->get_pronunciations();
272            
273             =item B<get_translations>
274              
275             Returns a reference to a hash mapping word sense to language to translated words
276              
277             my $translations = $document->get_translations();
278              
279             print Dumper $translations;
280            
281             $VAR1 = {
282             'fruit' => {
283             'tr' => {
284             'language' => 'Turkish',
285             'translations' => [
286             'portakal'
287             ],
288             'part_of_speech' => 'noun'
289             },
290             'fr' => {
291             'language' => 'French',
292             'translations' => [
293             'orange'
294             ],
295             'part_of_speech' => 'noun'
296             },
297             'da' => {
298             'language' => 'Danish',
299             'translations' => [
300             'appelsin'
301             ],
302             'part_of_speech' => 'noun'
303             },
304             ...
305             },
306             ...
307             }
308              
309              
310             =item B<get_word_senses>
311              
312             Returns an arrayref containing a list of word senses
313              
314              
315             my $word_senses = $document->get_word_senses();
316            
317             print Dumper $word_senses;
318            
319             $VAR1 = [
320             'tree',
321             'colour',
322             'fruit',
323             ...
324             ]
325              
326             =item B<get_synonyms>
327              
328             Returns a reference to a hash mapping language and word sense to a list of synonyms
329              
330             my $synonyms = $document->get_synonyms();
331              
332             # Synonyms of 'cat'
333             print Dumper $synonyms;
334            
335             $VAR1 = {
336             'en' => {
337             'language' => 'English',
338             'sense' => {
339             'domestic species' => [
340             'housecat',
341             'kitten',
342             'kitty',
343             ...
344             ],
345             ...
346             },
347             }
348             }
349              
350             =item B<get_hyponyms>
351              
352             Returns a reference to a hash mapping language and word sense to a list of hyponyms
353              
354             my $hyponyms = $document->get_hyponyms();
355              
356             =item B<get_hypernyms>
357              
358             Returns a reference to a hash mapping language and word sense to a list of hypernyms
359              
360             my $hypernyms = $document->get_hypernyms();
361              
362             =item B<get_antonyms>
363              
364             Returns a reference to a hash mapping language and word sense to a list of antonyms
365              
366             my $antonyms = $document->get_antonyms();
367              
368              
369              
370             =item B<get_section> (number => SECTION_NUMBER)
371              
372             Given the section number, returns the corresponding Wiktionary::Parser::Section object. Numbers correspond to those in the table of contents shown on a mediawiki page.
373              
374             my $section = $document->get_section(number => '1.2');
375              
376             =item B
377              
378             Returns a reference to a hash of Wiktionary::Parser::Section objects. These provide access to the data for each section of the document.
379             The format of the hash is { $section_number => object } e.g. {'1.2.1' => $obj}
380              
381             =item B<get_sections> (title => STRING_OR_REGEX)
382              
383             Given a string or regular expression, this will return an array of Section objects containing any sections that match the given title pattern.
384              
385             This returns a list containing section(s) with 'english' in the title (case insensitive). In most cases this will just return the 'English' section, in some cases you'll also get the 'Old English' section too.
386              
387             my $sections = $document->get_sections(title => 'english');
388              
389             If you want to get only the "English" section, use this pattern:
390              
391             my $sections = $document->get_sections(title => '^english$');
392              
393             This returns an array of all etymology, pronunciation, and synonym sections
394             my $sections = $document->get_sections(title => 'etymology|pronunciation|synonyms');
395              
396              
397             =item B<get_sub_document> (title => STRING_OR_REGEX)
398              
399             Given a string or regular expression, this will return a Wiktionary::Parser::Document object consisting of just the matching sections, and their child sections. This can be used if you're just interested in certain parts of a document.
400              
401              
402             This returns a document object containing just the sections with 'english' in the title (case insensitive). In most cases this will just return the 'English' section, in some cases you'll also get the 'Old English' section too.
403              
404             my $sub_document = $document->get_sub_document(title => 'english');
405              
406             If you want to get only the "English" section, use this pattern:
407              
408             my $sub_document = $document->get_sub_document(title => '^english$');
409              
410             # To verify what sections you have, you can print out the table of contents for this sub document.
411             print Dumper $sub_document->get_table_of_contents();
412              
413              
414             =item B<get_part_of_speech_section>
415              
416             Returns an array of Wiktionary::Parser::Section objects representing all the sections on the page that cover a part of speech. This currently includes all sections that match the following header:
417              
418              
419             Parts of Speech used in Wiktionary include:
420            
421             noun
422             verb
423             adjective
424             adverb
425             pronoun
426             preposition
427             article
428             conjunction
429             determiner
430             interjection
431             symbol
432              
433              
434             my $sections = $document->get_part_of_speech_section();
435              
436              
437             =item B<get_translation_sections>
438              
439             Returns an array of Wiktionary::Parser::Section objects consisting of all Translation sections in the document.
440              
441             my $sections = $document->get_translation_sections();
442              
443              
444             =item B<get_synonym_sections>
445              
446             Returns an array of Wiktionary::Parser::Section objects consisting of all Synonym sections in the document.
447              
448             my $sections = $document->get_synonym_sections();
449              
450              
451             =item B<get_hyponym_sections>
452              
453             Returns an array of Wiktionary::Parser::Section objects consisting of all Hyponym sections in the document.
454              
455             my $sections = $document->get_hyponym_sections();
456              
457              
458             =item B<get_hypernym_sections>
459              
460             Returns an array of Wiktionary::Parser::Section objects consisting of all Hypernym sections in the document.
461              
462             my $sections = $document->get_hypernym_sections();
463              
464              
465             =item B<get_antonym_sections>
466              
467             Returns an array of Wiktionary::Parser::Section objects consisting of all Antonym sections in the document.
468              
469             my $sections = $document->get_antonym_sections();
470              
471             =item B<get_pronunciation_sections>
472              
473             Returns an array of Wiktionary::Parser::Section objects consisting of all Pronunciation sections in the document.
474              
475             my $sections = $document->get_pronunciation_sections();
476              
477              
478              
479              
480             =item B<get_table_of_contents>
481              
482             Returns an arrayref containing section numbers and names. Mostly helpful for informational / debugging purposes when you need a summary of what's in your document object.
483              
484             my $table_of_contents = $document->get_table_of_contents();
485              
486              
487             print Dumper $table_of_contents;
488              
489             $VAR1 = [
490             '1,english',
491             '1.1,etymology',
492             '1.2,pronunciation',
493             '1.2.1,usage notes',
494             '1.3,noun',
495             '1.3.1,derived terms',
496             '1.3.2,translations',
497             '1.4,adjective',
498             '1.4.1,translations',
499             ...
500             ]
501              
502             =item B
503              
504             Return the document title. ( i.e. the argument you used in $parser->get_document(title => $title) )
505              
506             =item B
507              
508             Return an array of all categories this page falls under. (These are usually the links that appear at the bottom of a wiki page)
509              
510              
511             =back
512              
513             =head2 Methods for Wiktionary::Parser::Section
514              
515             =over
516              
517             =item B
518              
519             Returns an arrayref containing lines of text from the section of the document
520              
521             =item B
522              
523             Returns the section heading name
524              
525             =item B
526              
527             Returns the number of this section (e.g. 1.2.1)
528              
529             =item B
530              
531             Return the Wiktionary::Parser::Section instance of the parent section. e.g. if you call this on section 1.2.1, it'll return the object for section 1.2
532              
533             =item B
534              
535             Returns the language that this section falls under.
536              
537             =item B
538              
539             Returns the part of speech that this section falls under.
540              
541             =item B
542              
543             Return an arrayref containing all sections above this one in the hierarchy.
544              
545             =item B
546              
547             This returns a Wiktionary::Parser::Document object containing the current section and all its child sections.
548              
549             =item B
550              
551             Returns an array of all sections below this one in the hierarchy
552              
553             =back
554              
555             =cut