File Coverage

blib/lib/Wiktionary/Parser.pm

Criterion	Covered	Total	%
statement	55	83	66.2
branch	11	24	45.8
condition	1	2	50.0
subroutine	9	14	64.2
pod	2	8	25.0
total	78	131	59.5

line	stmt	bran	cond	sub	pod	time	code
1							package Wiktionary::Parser;
2
3	2			2		40554	use strict;
	2					4
	2					72
4	2			2		10	use warnings;
	2					3
	2					53
5	2			2		1217	use Data::Dumper;
	2					10456
	2					122
6
7	2			2		4018	use MediaWiki::API;
	2					165042
	2					77
8	2			2		1729	use Wiktionary::Parser::Document;
	2					13
	2					87
9	2			2		2042	use Carp::Always;
	2					1512
	2					4020
10
11							our $VERSION = 0.11;
12
13							my $CACHE;
14
15							sub new {
16	1			1	1	20	my $class = shift;
17	1					2	my %args = @_;
18
19	1		50			8	$args{wiktionary_url} \|\|= 'http://en.wiktionary.org/w/api.php';
20
21	1					3	my $self = bless \%args, $class;
22
23	1					6	$self->{cache} = 0; # 1: cache content locally, 0: don't
24	1					6	$self->{mediawiki_client} = MediaWiki::API->new({ api_url => $self->get_wiktionary_url() });
25
26	1					16678	return $self;
27							}
28
29
30							# create the base url
31							# add http if it hasn't already been prepended
32							sub get_wiktionary_url {
33	1			1	0	2	my $self = shift;
34	1					2	my $url = $self->{wiktionary_url};
35	1					2	$url =~ s\|/$\|\|;
36	1	50				18	return sprintf("%s$url", $url =~ m\|^https?://\| ? '' : 'http://');
37							}
38
39
40							sub get_document {
41	0			0	1	0	my $self = shift;
42	0					0	my %args = @_;
43	0					0	my $title = $args{title};
44
45	0	0				0	if ($self->get_cache($title)) {
46	0					0	return $self->get_cache($title);
47							}
48
49	0					0	my $page_data = $self->get_page_data(
50							title => $title,
51							);
52	0					0	my $content = $page_data->{'*'};
53
54	0	0				0	return unless $content;
55
56	0					0	my $document = $self->parse_page_content(
57							content => $content,
58							title => $title,
59							);
60
61	0					0	$self->set_cache($title,$document);
62
63	0					0	return $document;
64							}
65
66
67							sub get_cache {
68	0			0	0	0	my $self = shift;
69	0	0				0	return unless $self->{cache};
70	0					0	my $title = shift;
71	0					0	return $CACHE->{ $self->get_wiktionary_url() }{$title};
72							}
73
74							sub set_cache {
75	0			0	0	0	my $self = shift;
76	0	0				0	return unless $self->{cache};
77	0					0	my $title = shift;
78	0					0	my $document = shift;
79	0					0	$CACHE->{ $self->get_wiktionary_url() }{$title} = $document;
80							}
81
82
83							sub parse_page_content {
84	1			1	0	80	my $self = shift;
85	1					6	my %args = @_;
86	1					4	my $document_content = $args{content};
87	1					3	my $title = $args{title};
88
89	1	50				7	die 'document not defined' unless defined $document_content;
90
91	1					250	my @lines = split(/\n/,$document_content);
92	1					23	my %sections = ();
93	1					3	my @section_number = ();
94
95	1					14	my $document = Wiktionary::Parser::Document->new(
96							title => $title,
97							# pass in a parser if we want this document to follow
98							# links to other documents (e.g. wikisaurus)
99							# and include metadata from them
100							parser => $self,
101							);
102
103	1					7	my $current_section = $document->create_section(
104							section_number => 0,
105							header => $title,
106							);
107
108	1					3	for my $line (@lines) {
109	646					810	chomp $line;
110	646	100				1158	next unless $line;
111	577	100				3432	if ($line =~ m/^(==+)([^=]+)/) {
		100
		100
112	40					82	my $markup = $1;
113	40					65	my $header = $2;
114	40					60	my $n = length($markup);
115	40					65	$section_number[$n-2]++;
116	40					70	$#section_number = $n-2;
117	40	50				63	my $section_number = join('.',map {$_ \|\| ()} @section_number);
	79					248
118
119	40					156	$current_section = $document->create_section(
120							section_number => $section_number,
121							header => $header,
122							);
123							} elsif ($line =~ m/^\[\[(Category:[^\]]+)\]\]/) {
124							# e.g. [[Category:Animals]]
125	17					52	$document->add_category(category => $1);
126							} elsif ($line =~ m/^\[\[([a-z]+:$title)\]\]/) {
127							# e.g. [[de:dog]]
128	51					142	$document->add_language_link(tag => $1);
129							} else {
130	469					1457	$current_section->add_content($line);
131							}
132							}
133
134	1					52	return $document;
135							}
136
137							sub get_page_data {
138	0			0	0		my $self = shift;
139	0						my %args = @_;
140	0						my $title = $args{title};
141
142	0	0					die 'title is not defined' unless defined $title;
143
144	0						my $page_data = $self->get_mediawiki_client()->get_page({title => $title});
145
146	0						return $page_data;
147							}
148
149							sub get_mediawiki_client {
150	0			0	0		my $self = shift;
151	0						return $self->{mediawiki_client};
152							}
153
154
155
156
157							1;
158
159							=head1 Name
160
161							Wiktionary::Parser - Client and Parser of content from the Wiktionary API
162
163							=head1 Synopsis
164
165							This package may be used to query the Wiktionary API (en.wiktionary.org/w/api.php) for documents by title. It parses the resulting MediaWiki document and provides access to data structures containing word senses, translations, synonyms, parts of speech, etc. It also provides access to the raw content of each MediaWiki section should you wish to extract other data on your own, or build on top of this package.
166
167							The repository for this package is available on github at https://github.com/clbecker/perl-wiktionary-parser
168							And on CPAN at: http://search.cpan.org/~clbecker/Wiktionary-Parser/
169
170							=head1 Usage
171
172							my $parser = Wiktionary::Parser->new();
173
174							my $document = $parser->get_document(title => 'bunny');
175
176							my $translation_hashref = $document->get_translations();
177							my $word_sense_hashref = $document->get_word_senses();
178							my $parts_of_speech_hashref = $document->get_parts_of_speech();
179							my $pronunciations_hashref = $document->get_pronunciations();
180							my $synonyms_hashref = $document->get_synonyms();
181							my $hyponyms_hashref = $document->get_hyponyms();
182							my $hypernyms_hashref = $document->get_hypernyms();
183							my $antonyms_hashref = $document->get_antonyms();
184							my $derived_terms_hashref = $document->get_derived_terms();
185
186							my $section_hashref = $document->get_sections();
187							my $sub_document = $document->get_sub_document(title => 'string or regex');
188							my $table_of_contents_arrayref = $document->get_table_of_contents();
189
190
191
192							=head2 Methods for Wiktionary::Parser
193
194							=over
195
196							=item B<new>
197
198							Create an instance of the Wiktionary::Parser. This object is used to contact the Wiktionary API, and parse the results.
199
200							my $parser = Wiktionary::Parser->new();
201
202
203							=item B<get_document> (title => TITLE)
204
205							Contacts the wiktionary API, and downloads the page with the given title. It then parses the content and returns a Wiktionary::Parser::Document object that you can call further methods on.
206
207							my $document = $parser->get_document(title => 'orange');
208
209							=back
210
211
212							=head2 Methods for Wiktionary::Parser::Document
213
214							See https://github.com/clbecker/perl-wiktionary-parser/wiki for details and examples on methods for the Wiktionary::Parser::Document object.
215
216							=over
217
218
219							=item B<get_derived_terms>
220
221							Returns a reference to a hash mapping language to a list of derived terms and phrases
222
223							my $derived_terms = $document->get_derived_terms();
224
225							print Dumper $derived_terms;
226
227							$VAR1 = {
228							'en' => [
229							'bergamot orange',
230							'bitter orange',
231							'blood orange',
232							'burnt orange',
233							...
234							],
235							...
236							}
237
238							=item B<get_parts_of_speech>
239
240							Returns a reference to a hash mapping language to a list of parts of speech. See https://github.com/clbecker/perl-wiktionary-parser/wiki/Parts-of-speech for details.
241
242
243							my $parts_of_speech = $document->get_parts_of_speech();
244
245							print Dumper $parts_of_speech;
246
247							$VAR1 = {
248							'en' => {
249							'language' => 'English',
250							'part_of_speech' => [
251							'noun',
252							'adj',
253							'verb'
254							]
255							},
256							'sv' => {
257							'language' => 'Swedish',
258							'part_of_speech' => [
259							'adjective',
260							'noun'
261							]
262							},
263							...
264							}
265
266
267							=item B<get_pronunciations>
268
269							Returns a reference to a hash mapping language to pronunciation metadata. See https://github.com/clbecker/perl-wiktionary-parser/wiki/pronunciations for details.
270
271							my $pronunciations = $document->get_pronunciations();
272
273							=item B<get_translations>
274
275							Returns a reference to a hash mapping word sense to language to translated words
276
277							my $translations = $document->get_translations();
278
279							print Dumper $translations;
280
281							$VAR1 = {
282							'fruit' => {
283							'tr' => {
284							'language' => 'Turkish',
285							'translations' => [
286							'portakal'
287							],
288							'part_of_speech' => 'noun'
289							},
290							'fr' => {
291							'language' => 'French',
292							'translations' => [
293							'orange'
294							],
295							'part_of_speech' => 'noun'
296							},
297							'da' => {
298							'language' => 'Danish',
299							'translations' => [
300							'appelsin'
301							],
302							'part_of_speech' => 'noun'
303							},
304							...
305							},
306							...
307							}
308
309
310							=item B<get_word_senses>
311
312							Returns an arrayref containing a list of word senses
313
314
315							my $word_senses = $document->get_word_senses();
316
317							print Dumper $word_senses;
318
319							$VAR1 = [
320							'tree',
321							'colour',
322							'fruit',
323							...
324							]
325
326							=item B<get_synonyms>
327
328							Returns a reference to a hash mapping language and word sense to a list of synonyms
329
330							my $synonyms = $document->get_synonyms();
331
332							# Synonyms of 'cat'
333							print Dumper $synonyms;
334
335							$VAR1 = {
336							'en' => {
337							'language' => 'English',
338							'sense' => {
339							'domestic species' => [
340							'housecat',
341							'kitten',
342							'kitty',
343							...
344							],
345							...
346							},
347							}
348							}
349
350							=item B<get_hyponyms>
351
352							Returns a reference to a hash mapping language and word sense to a list of hyponyms
353
354							my $hyponyms = $document->get_hyponyms();
355
356							=item B<get_hypernyms>
357
358							Returns a reference to a hash mapping language and word sense to a list of hypernyms
359
360							my $hypernyms = $document->get_hypernyms();
361
362							=item B<get_antonyms>
363
364							Returns a reference to a hash mapping language and word sense to a list of antonyms
365
366							my $antonyms = $document->get_antonyms();
367
368
369
370							=item B<get_section> (number => SECTION_NUMBER)
371
372							Given the section number, returns the corresponding Wiktionary::Parser::Section object. Numbers correspond to those in the table of contents shown on a mediawiki page.
373
374							my $section = $document->get_section(number => '1.2');
375
376							=item B
377
378							Returns a reference to a hash of Wiktionary::Parser::Section objects. These provide access to the data for each section of the document.
379							The format of the hash is { $section_number => object } e.g. {'1.2.1' => $obj}
380
381							=item B<get_sections> (title => STRING_OR_REGEX)
382
383							Given a string or regular expression, this will return an array of Section objects containing any sections that match the given title pattern.
384
385							This returns a list containing section(s) with 'english' in the title (case insensitive). In most cases this will just return the 'English' section; in some cases you'll also get the 'Old English' section.
386
387							my $sections = $document->get_sections(title => 'english');
388
389							If you want to get only the "English" section, use this pattern:
390
391							my $sections = $document->get_sections(title => '^english$');
392
393							This returns an array of all etymology, pronunciation, and synonym sections
394							my $sections = $document->get_sections(title => 'etymology\|pronunciation\|synonyms');
395
396
397							=item B<get_sub_document> (title => STRING_OR_REGEX)
398
399							Given a string or regular expression, this will return a Wiktionary::Parser::Document object consisting of just the matching sections, and their child sections. This can be used if you're just interested in certain parts of a document.
400
401
402							This returns a document object containing just the sections with 'english' in the title (case insensitive). In most cases this will just return the 'English' section; in some cases you'll also get the 'Old English' section.
403
404							my $sub_document = $document->get_sub_document(title => 'english');
405
406							If you want to get only the "English" section, use this pattern:
407
408							my $sub_document = $document->get_sub_document(title => '^english$');
409
410							# To verify what sections you have, you can print out the table of contents for this sub document.
411							print Dumper $sub_document->get_table_of_contents();
412
413
414							=item B
415
416							Returns an array of Wiktionary::Parser::Section objects representing all the sections on the page that cover a part of speech. This currently includes all sections that match one of the following headers:
417
418
419							Parts of Speech used in Wiktionary include:
420
421							noun
422							verb
423							adjective
424							adverb
425							pronoun
426							preposition
427							article
428							conjunction
429							determiner
430							interjection
431							symbol
432
433
434							my $sections = $document->get_part_of_speech_section();
435
436
437							=item B<get_translation_sections>
438
439							Returns an array of Wiktionary::Parser::Section objects consisting of all Translation sections in the document.
440
441							my $sections = $document->get_translation_sections();
442
443
444							=item B<get_synonym_sections>
445
446							Returns an array of Wiktionary::Parser::Section objects consisting of all Synonym sections in the document.
447
448							my $sections = $document->get_synonym_sections();
449
450
451							=item B<get_hyponym_sections>
452
453							Returns an array of Wiktionary::Parser::Section objects consisting of all Hyponym sections in the document.
454
455							my $sections = $document->get_hyponym_sections();
456
457
458							=item B<get_hypernym_sections>
459
460							Returns an array of Wiktionary::Parser::Section objects consisting of all Hypernym sections in the document.
461
462							my $sections = $document->get_hypernym_sections();
463
464
465							=item B<get_antonym_sections>
466
467							Returns an array of Wiktionary::Parser::Section objects consisting of all Antonym sections in the document.
468
469							my $sections = $document->get_antonym_sections();
470
471							=item B<get_pronunciation_sections>
472
473							Returns an array of Wiktionary::Parser::Section objects consisting of all Pronunciation sections in the document.
474
475							my $sections = $document->get_pronunciation_sections();
476
477
478
479
480							=item B<get_table_of_contents>
481
482							Returns an arrayref containing section numbers and names. Mostly helpful for informational / debugging purposes when you need a summary of what's in your document object.
483
484							my $table_of_contents = $document->get_table_of_contents();
485
486
487							print Dumper $table_of_contents;
488
489							$VAR1 = [
490							'1,english',
491							'1.1,etymology',
492							'1.2,pronunciation',
493							'1.2.1,usage notes',
494							'1.3,noun',
495							'1.3.1,derived terms',
496							'1.3.2,translations',
497							'1.4,adjective',
498							'1.4.1,translations',
499							...
500							]
501
502							=item B
503
504							Return the document title. ( i.e. the argument you used in $parser->get_document(title => $title) )
505
506							=item B
507
508							Return an array of all categories this page falls under. (These are usually the links that appear at the bottom of a wiki page)
509
510
511							=back
512
513							=head2 Methods for Wiktionary::Parser::Section
514
515							=over
516
517							=item B
518
519							Returns an arrayref containing lines of text from the section of the document
520
521							=item B
522
523							Returns the section heading name
524
525							=item B
526
527							Returns the number of this section (e.g. 1.2.1)
528
529							=item B
530
531							Return the Wiktionary::Parser::Section instance of the parent section. e.g. if you call this on section 1.2.1, it'll return the object for section 1.2
532
533							=item B
534
535							Returns the language that this section falls under.
536
537							=item B
538
539							Returns the part of speech that this section falls under.
540
541							=item B
542
543							Return an arrayref containing all sections above this one in the hierarchy.
544
545							=item B
546
547							This returns a Wiktionary::Parser::Document object containing the current section and all its child sections.
548
549							=item B
550
551							Returns an array of all sections below this one in the hierarchy
552
553							=back
554
555							=cut