File Coverage

blib/lib/Lingua/EN/Tagger.pm

Criterion	Covered	Total	%
statement	300	327	91.7
branch	95	134	70.9
condition	22	29	75.8
subroutine	36	38	94.7
pod	12	14	85.7
total	465	542	85.7

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::EN::Tagger;
2
3							our $VERSION = '0.29';
4
5	10			10		848865	use warnings;
	10					102
	10					376
6	10			10		102	use strict;
	10					28
	10					309
7
8	10			10		207	use 5.008000;
	10					33
9
10	10			10		95	use Carp;
	10					41
	10					715
11	10			10		71	use File::Spec;
	10					22
	10					316
12	10			10		4853	use FileHandle;
	10					110294
	10					73
13	10			10		9644	use HTML::TokeParser;
	10					116870
	10					382
14	10			10		5413	use Lingua::Stem::En;
	10					25743
	10					531
15	10			10		6908	use Storable;
	10					36588
	10					675
16	10			10		6877	use Memoize;
	10					28122
	10					4666
17
18							# Class variables
19							our %_LEXICON; # this holds the word lexicon
20							our %_HMM; # this holds the hidden markov model for English grammar
21							our $MNP; # this holds the compiled maximal noun phrase regex
22							our ($lexpath, $word_path, $tag_path);
23							our ($NUM, $GER, $NNP, $ADJ, $PART, $NN, $PREP, $DET, $PAREN, $QUOT, $SEN, $WORD);
24
25							BEGIN { # REGEX SETUP
26							sub get_exp {
27	120			120	0	265	my ($tag) = @_;
28	120	50				353	return unless defined $tag;
29	10			10		7052	return qr\|<$tag>[^<]+\s*\|;
	10					185
	10					168
	120					2621
30							}
31
32	10			10		57	$NUM = get_exp('cd');
33	10					65	$GER = get_exp('vbg');
34	10					72	$ADJ = get_exp('jj[rs]*');
35	10					52	$PART = get_exp('vbn');
36	10					46	$NN = get_exp('nn[sp]*');
37	10					61	$NNP = get_exp('nnp');
38	10					58	$PREP = get_exp('in');
39	10					32	$DET = get_exp('det');
40	10					32	$PAREN= get_exp('[lr]rb');
41	10					40	$QUOT = get_exp('ppr');
42	10					36	$SEN = get_exp('pp');
43	10					40	$WORD = get_exp('\p{IsWord}+');
44
45	10					240045	($lexpath) = __FILE__ =~ /(.*)\.pm/;
46	10					254	$word_path = File::Spec->catfile($lexpath, 'pos_words.hash');
47	10					97	$tag_path = File::Spec->catfile($lexpath, 'pos_tags.hash');
48
49	10					119	memoize(\&Lingua::EN::Tagger::stem,
50							TIE => [ 'Memoize::ExpireLRU',
51							CACHESIZE => 1000,
52							]);
53
54	10					3707	memoize(\&Linua::EN::Tagger::_assign_tag,
55							TIE => ['Memoize::ExpireLRU',
56							CACHESIZE => 10000,
57							]);
58							}
59
60
61							######################################################################
62
63							=head1 NAME
64
65							Lingua::EN::Tagger - Part-of-speech tagger for English natural language processing.
66
67
68							=head1 SYNOPSIS
69
70							# Create a parser object
71							my $p = new Lingua::EN::Tagger;
72
73							# Add part of speech tags to a text
74							my $tagged_text = $p->add_tags($text);
75
76							...
77
78							# Get a list of all nouns and noun phrases with occurrence counts
79							my %word_list = $p->get_words($text);
80
81							...
82
83							# Get a readable version of the tagged text
84							my $readable_text = $p->get_readable($text);
85
86
87							=head1 DESCRIPTION
88
89							The module is a probability based, corpus-trained tagger that assigns POS tags to
90							English text based on a lookup dictionary and a set of probability values. The tagger
91							assigns appropriate tags based on conditional probabilities - it examines the
92							preceding tag to determine the appropriate tag for the current word.
93							Unknown words are classified according to word morphology or can be set to
94							be treated as nouns or other parts of speech.
95
96							The tagger also extracts as many nouns and noun phrases as it can, using a
97							set of regular expressions.
98
99							=head1 CONSTRUCTOR
100
101							=over
102
103
104							=item new %PARAMS
105
106							Class constructor. Takes a hash with the following parameters (shown with default
107							values):
108
109							=over
110
111							=item unknown_word_tag => ''
112
113							Tag to assign to unknown words
114
115							=item stem => 0
116
117							Stem single words using Lingua::Stem::EN
118
119							=item weight_noun_phrases => 0
120
121							When returning occurrence counts for a noun phrase, multiply the value
122							by the number of words in the NP.
123
124							=item longest_noun_phrase => 5
125
126							Will ignore noun phrases longer than this threshold. This affects
127							only the get_words() and get_nouns() methods.
128
129							=item relax => 0
130
131							Relax the Hidden Markov Model: this may improve accuracy for
132							uncommon words, particularly words used polysemously
133
134							=back
135
136							=cut
137
138							######################################################################
139
140							sub new {
141	10			10	1	7246	my ($class, %params) = @_;
142	10					168	my $self = {unknown_word_tag => '',
143							stem => 0,
144							weight_noun_phrases => 0,
145							longest_noun_phrase => 5,
146							lc => 1,
147							tag_lex => 'tags.yml',
148							word_lex => 'words.yml',
149							unknown_lex => 'unknown.yml',
150							word_path => $word_path,
151							tag_path => $tag_path,
152							relax => 0,
153							debug => 0,
154							%params};
155
156	10					38	bless $self, $class;
157
158	10	50	33			552	unless (-f $self->{'word_path'} and -f $self->{'tag_path'}){
159	0	0				0	carp "Couldn't locate POS lexicon, creating new one" if $self->{'debug'};
160	0					0	$self->install();
161							} else {
162	10					41	%_LEXICON = %{retrieve($self->{'word_path'})}; # A hash of words and corresponding parts of speech
	10					119
163	10					2013728	%_HMM = %{retrieve($self->{'tag_path'})}; # A hash of adjacent part of speech tags and the probability of each
	10					199
164							}
165
166	10					15754	$MNP = $self->_get_max_noun_regex();
167	10					82	$self->_reset();
168
169	10					120	return $self;
170							}
171
172							######################################################################
173
174							=back
175
176							=head1 METHODS
177
178							=over
179
180							=item add_tags TEXT
181
182							Examine the string provided and return it fully tagged (XML style)
183
184							=cut
185
186							######################################################################
187							sub add_tags {
188	36			36	1	147	my ($self, $text) = @_;
189
190	36					180	my $tags = $self->add_tags_incrementally($text);
191	36					254	$self->_reset;
192	36					214	return $tags;
193							}
194
195							######################################################################
196
197							=item add_tags_incrementally TEXT
198
199							Examine the string provided and return it fully tagged (XML style) but
200							do not reset the internal part-of-speech state between invocations.
201
202							=cut
203
204							######################################################################
205							sub add_tags_incrementally {
206	36			36	1	130	my ($self, $text) = @_;
207
208	36	100				145	return unless $self->_valid_text($text);
209
210	28					161	my @text = $self->_clean_text($text);
211	28					209	my $t = $self->{'current_tag'}; # shortcut
212							my (@tags) =
213							map {
214	28					114	$t = $self->_assign_tag($t, $self->_clean_word($_))
215	1966		50			7651	\|\| $self->{'unknown_word_tag'} \|\| 'nn';
216	1966					12049	"<$t>$_"
217							} @text;
218	28					147	$self->{'current_tag'} = $t;
219	28					1403	return join ' ', @tags;
220							}
221
222							######################################################################
223
224							=item get_words TEXT
225
226							Given a text string, return as many nouns and
227							noun phrases as possible. Applies L and involves three stages:
228
229							=over
230
231							* Tag the text
232							* Extract all the maximal noun phrases
233							* Recursively extract all noun phrases from the MNPs
234
235							=back
236
237							=cut
238
239							######################################################################
240							sub get_words {
241	2			2	1	33	my ($self, $text) = @_;
242
243	2	50				14	return unless $self->_valid_text($text);
244
245	2					14	my $tagged = $self->add_tags($text);
246
247	2	50				17	if($self->{'longest_noun_phrase'} <= 1){
248	0					0	return $self->get_nouns($tagged);
249							} else {
250	2					15	return $self->get_noun_phrases($tagged);
251							}
252							}
253
254
255							######################################################################
256
257							=item get_readable TEXT
258
259							Return an easy-on-the-eyes tagged version of a text string. Applies
260							L and reformats to be easier to read.
261
262							=cut
263
264							######################################################################
265							sub get_readable {
266	15			15	1	8023	my ($self, $text) = @_;
267
268	15	50				92	return unless $self->_valid_text($text);
269
270	15					87	my $tagged = $self->add_tags($text);
271	15					1830	$tagged =~ s/<\p{IsLower}+>([^<]+)<\/(\p{IsLower}+)>/$1\/\U$2/go;
272	15					242	return $tagged;
273							}
274
275
276							######################################################################
277
278							=item get_sentences TEXT
279
280							Returns an anonymous array of sentences (without POS tags) from a text.
281
282							=cut
283
284							######################################################################
285							sub get_sentences {
286	1			1	1	913	my ($self, $text) = @_;
287
288	1	50				7	return unless $self->_valid_text($text);
289	1					8	my $tagged = $self->add_tags($text);
290	1					3	my @sentences;
291							{
292	1					2	local $self->{'lc'};
	1					4
293	1					3	$self->{'lc'} = 0;
294	1					21	@sentences = map {$self->_strip_tags($_)}
	8					23
295							split /<\/pp>/, $tagged;
296							}
297
298	1					5	foreach (@sentences){
299	8					21	s/ ('s?) /$1 /g;
300	8					33	s/ ([\$\(\[\{]) / $1/g;
301	8					63	s/ (\P{IsWord}+) /$1 /g;
302	8					15	s/ (`+) / $1/g;
303	8					45	s/ (\P{IsWord}+)$/$1/;
304	8					15	s/^(`+) /$1/;
305	8					14	s/^([\$\(\[\{]) /$1/g;
306							}
307	1					9	return \@sentences;
308							}
309
310
311							###########################################
312							# _valid_text TEXT
313							#
314							# Check whether the text is a valid string
315							###########################################
316							sub _valid_text {
317	448			448		1085	my ($self, $text) = @_;
318	448	100				2609	if(!defined $text){
		50
		100
319							# $text is undefined, nothing to parse
320	4	50				27	carp "method call on uninitialized variable" if $self->{'debug'};
321	4					23	return undef;
322							} elsif (ref $text){
323							# $text is a scalar reference, don't parse
324	0	0				0	carp "method call on a scalar reference" if $self->{'debug'};
325	0					0	return undef;
326							} elsif ($text =~ /^\s*$/){
327							# $text is defined as an empty string, nothing to parse
328	4					30	return undef;
329							} else {
330							# $text is valid
331	440					1777	return 1;
332							}
333							}
334
335
336							sub lower_case {
337	0			0	0	0	my ($self, $lc) = @_;
338	0	0				0	if($lc){
339	0					0	$self->{'lc'} = 1;
340							} else {
341	0					0	$self->{'lc'} = 0;
342							}
343							}
344
345							#####################################################################
346							# _strip_tags TEXT
347							#
348							# Return a text string with the XML-style part-of-speech tags removed.
349							#####################################################################
350							sub _strip_tags {
351	355			355		912	my ($self, $text) = @_;
352	355	50				948	return unless $self->_valid_text($text);
353
354	355					3090	$text =~ s/<[^>]+>//gs;
355	355					1660	$text =~ s/\s+/ /gs;
356	355					1301	$text =~ s/^\s*//;
357	355					2145	$text =~ s/\s*$//;
358	355	100				1155	if($self->{'lc'}){
359	347					1327	return lc($text);
360							} else {
361	8					22	return $text;
362							}
363							}
364
365
366							#####################################################################
367							# _clean_text TEXT
368							#
369							# Strip the provided text of HTML-style tags and separate off
370							# any punctuation in preparation for tagging
371							#####################################################################
372							sub _clean_text {
373	30			30		123	my ($self, $text) = @_;
374	30	50				121	return unless $self->_valid_text($text);
375
376							# Strip out any markup and convert entities to their proper form
377	30					96	my $html_parser;
378	30					201	utf8::decode($text);
379	30					332	$html_parser = HTML::TokeParser->new(\$text);
380
381	30					7338	my $cleaned_text = $html_parser->get_text;
382	30					3641	while($html_parser->get_token){
383	0					0	$cleaned_text .= ($html_parser->get_text)." ";
384							}
385
386							# Tokenize the text (splitting on punctuation as you go)
387	30					1679	my @tokenized = map {$self->_split_punct($_)}
	1789					4849
388							split /\s+/, $cleaned_text;
389	30					605	my @words = $self->_split_sentences(\@tokenized);
390	30					1190	return @words;
391							}
392
393
394							#####################################################################
395							# _split_sentences ARRAY_REF
396							#
397							# This handles all of the trailing periods, keeping those that
398							# belong on abbreviations and removing those that seem to be
399							# at the end of sentences. This method makes some assumptions
400							# about the use of capitalization in the incoming text
401							#####################################################################
402							sub _split_sentences {
403	36			36		5159	my ($self, $array_ref) = @_;
404	36					104	my @tokenized = @{$array_ref};
	36					530
405
406	36					330	my @PEOPLE = qw/jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys supt det mssrs rev/;
407	36					224	my @ARMY = qw/col gen lt cmdr adm capt sgt cpl maj brig/;
408	36					177	my @INST = qw/dept univ assn bros ph.d/;
409	36					440	my @PLACE = qw/arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy hwy hway la pde pd plz pl rd st tce/;
410	36					191	my @COMP = qw/mfg inc ltd co corp/;
411	36					649	my @STATE = qw/ala ariz ark cal calif colo col conn del fed fla ga ida id ill ind ia kans kan ken ky la me md is mass mich minn miss mo mont neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt va wash wis wisc wy wyo usafa alta man ont que sask yuk/;
412	36					252	my @MONTH = qw/jan feb mar apr may jun jul aug sep sept oct nov dec/;
413	36					163	my @MISC = qw/vs etc no esp/;
414	36					188	my %ABBR = map {$_, 0}
	5076					15313
415							(@PEOPLE, @ARMY, @INST, @PLACE, @COMP, @STATE, @MONTH, @MISC);
416
417	36					645	my @words;
418	36					225	for(0 .. $#tokenized){
419	2052	100	100			12709	if (defined $tokenized[$_ + 1]
			100
420							and $tokenized[$_ + 1] =~ /[\p{IsUpper}\W]/
421							and $tokenized[$_] =~ /^(.+)\.$/){
422
423							# Don't separate the period off words that
424							# meet any of the following conditions:
425							# 1. It is defined in one of the lists above
426							# 2. It is only one letter long: Alfred E. Sloan
427							# 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
428	79	100	100			1107	unless(defined $ABBR{lc $1}
			100
429							or $1 =~ /^\p{IsLower}$/i
430							or $1 =~ /^\p{IsLower}(?:\.\p{IsLower})+$/i){
431	59					305	push @words, ($1, '.');
432	59					198	next;
433							}
434							}
435	1993					5873	push @words, $tokenized[$_];
436							}
437
438							# If the final word ends in a period...
439	36	100	66			380	if(defined $words[$#words] and $words[$#words] =~ /^(.*\p{IsWord})\.$/){
440	7					45	$words[$#words] = $1;
441	7					33	push @words, '.';
442							}
443
444	36					2841	return @words;
445							}
446
447
448							###########################################################################
449							# _split_punct TERM
450							#
451							# Separate punctuation from words, where appropriate. This leaves trailing
452							# periods in place to be dealt with later. Called by the _clean_text method.
453							###########################################################################
454							sub _split_punct {
455	1799			1799		4245	local $_ = $_[1];
456
457							# If there's no punctuation, return immediately
458	1799	100				8272	return $_ if /^\p{IsWord}+$/;
459
460							# Sanity checks
461	372					996	s/\W{10,}/ /og; # get rid of long trails of non-word characters
462
463							# Put quotes into a standard format
464	372					856	s/`(?!`)(?=.*\p{IsWord})/` /og; # Shift left quotes off text
465	372					1022	s/"(?=.*\p{IsWord})/ `` /og; # Convert left quotes to ``
466	372					938	s/(?
467	372					893	s/"/ '' /og; # Convert (remaining) quotes to ''
468	372					911	s/(?<=\p{IsWord})'(?!')(?=\P{IsWord}\|$)/ ' /go; # Separate right single quotes
469
470							# Handle all other punctuation
471	372					866	s/--+/ - /go; # Convert and separate dashes
472	372					1193	s/,(?!\p{IsDigit})/ , /go; # Shift commas off everything but numbers
473	372					884	s/:$/ :/go; # Shift semicolons off
474	372					893	s/(\.\.\.+)/ $1 /; # Shift ellipses off
475	372					1114	s/([$\[\{\}\]$])/ $1 /go; # Shift off brackets
476	372					1068	s/([\!\?#\$%;~\|])/ $1 /go; # Shift off other ``standard'' punctuation
477
478							# English-specific contractions
479	372					1066	s/(?<=\p{IsAlpha})'([dms])\b/ '$1/go; # Separate off 'd 'm 's
480	372					897	s/n't\b/ n't/go; # Separate off n't
481	372					886	s/'(ve\|ll\|re)\b/ '$1/go; # Separate off 've, 'll, 're
482
483	372					2007	return split;
484							}
485
486
487							#####################################################################
488							# _assign_tag TAG, WORD (memoized)
489							#
490							# Given a preceding tag TAG, assign a tag to WORD.
491							# Called by the choose_tag method.
492							# This subroutine is a modified version of the Viterbi algorithm
493							# for part of speech tagging
494							#####################################################################
495							sub _assign_tag {
496	1966			1966		5580	my ($self, $prev_tag, $word) = @_;
497
498	1966	50	33			10395	if ($self->{'unknown_word_tag'} and $word eq "-unknown-"){
		100
499							# If the 'unknown_word_tag' value is defined,
500							# classify unknown words accordingly
501	0					0	return $self->{'unknown_word_tag'};
502							} elsif ($word eq "-sym-"){
503							# If this is a symbol, tag it as a symbol
504	8					36	return "sym";
505							}
506
507	1958					4911	my $best_so_far = 0;
508
509	1958					5024	my $w = $_LEXICON{$word};
510	1958					4923	my $t = \%_HMM;
511
512							##############################################################
513							# TAG THE TEXT
514							# What follows is a modified version of the Viterbi algorithm
515							# which is used in most POS taggers
516							##############################################################
517	1958					4277	my $best_tag;
518
519	1958					4117	foreach my $tag (keys %{$t->{$prev_tag}}){
	1958					26932
520							# With the $self->{'relax'} var set, this method
521							# will also include any `open classes' of POS tags
522	79164					136866	my $pw;
523	79164	100	100			129831	if(defined ${$w->{$tag}}){
	79164	100				323536
524	4799					9055	$pw = ${$w->{$tag}};
	4799					12890
525							} elsif ($self->{'relax'} and $tag =~ /^(?:jj\|nn\|rb\|vb)/){
526	2119					3929	$pw = 0;
527							} else {
528	72246					166629	next;
529							}
530
531							# Bayesian logic:
532							# P = P($tag \| $prev_tag) * P($tag \| $word)
533	6918					25712	my $probability = $t->{$prev_tag}{$tag} * ($pw + 1);
534
535							# Set the tag with maximal probability
536	6918	100				21341	if($probability > $best_so_far) {
537	3144					6472	$best_so_far = $probability;
538	3144					7781	$best_tag = $tag;
539							}
540							}
541
542	1958					18059	return $best_tag;
543							}
544
545
546							############################################################################
547							# _reset
548							#
549							# this subroutine will reset the preceding tag to a sentence ender (PP).
550							# This prepares the first word of a new sentence to be tagged correctly.
551							############################################################################
552							sub _reset {
553	46			46		173	my ($self) = @_;
554	46					185	$self->{'current_tag'} = 'pp';
555							}
556
557
558							#####################################################################
559							# _clean_word WORD
560							#
561							# This subroutine determines whether a word should be considered in its
562							# lower or upper case form. This is useful in considering proper nouns
563							# and words that begin sentences. Called by L.
564							#####################################################################
565							sub _clean_word {
566	1966			1966		6037	my ($self, $word) = @_;
567
568	1966	100				9002	if (defined $_LEXICON{$word}) {
		50
569							# seen this word as it appears (lower or upper case)
570	1823					7365	return $word;
571
572							} elsif (defined $_LEXICON{lcfirst $word}) {
573							# seen this word only as lower case
574	0					0	return lcfirst $word;
575
576							} else {
577							# never seen this word. guess.
578	143					6557	return $self->_classify_unknown_word($word);
579							}
580							}
581
582
583							#####################################################################
584							# _classify_unknown_word WORD
585							#
586							# This changes any word not appearing in the lexicon to identifiable
587							# classes of words handled by a simple unknown word classification
588							# metric. Called by the _clean_word method.
589							#####################################################################
590							sub _classify_unknown_word {
591	143			143		397	my ($self, $word) = @_;
592
593	143					406	local $_ = $word;
594
595	143	100				2938	if(m/[\(\{\[]/){ # Left brackets
		100
		100
		50
		50
		50
		100
		100
		100
		100
		100
		50
		50
		50
596	14					68	$word = "LRB";
597
598							} elsif(m/[\)\]\}]/o){ # Right brackets
599	14					57	$word = "RRB";
600
601							} elsif (m/^-?(?:\p{IsDigit}+(?:\.\p{IsDigit}*)?\|\.\p{IsDigit}+)$/){ # Floating point number
602	6					24	$word = "NUM";
603
604							} elsif (m/^\p{IsDigit}+[\p{IsDigit}\/:-]+\p{IsDigit}$/){ # Other number constructs
605	0					0	$word = "NUM";
606
607							} elsif (m/^-?\p{IsDigit}+\p{IsWord}+$/o){ # Ordinal number
608	0					0	$word = "ORD";
609
610							} elsif (m/^\p{IsUpper}[\p{IsUpper}\.-]*$/o) { # Abbreviation (all caps)
611	0					0	$word = "-abr-";
612
613							} elsif (m/\p{IsWord}-\p{IsWord}/o){ # Hyphenated word
614	21					157	my ($h_suffix) = m/-([^-]+)$/;
615
616	21	100	66			111	if ($h_suffix and defined ${$_LEXICON{$h_suffix}{'jj'} }){
	21					161
617							# last part of this is defined as an adjective
618	10					31	$word = "-hyp-adj-";
619							} else {
620							# last part of this is not defined as an adjective
621	11					38	$word = "-hyp-";
622							}
623
624							} elsif (m/^\W+$/o){ # Symbol
625	8					24	$word = "-sym-";
626
627							} elsif ($_ eq ucfirst) { # Capitalized word
628	20					6686	$word = "-cap-";
629
630							} elsif (m/ing$/o) { # Ends in 'ing'
631	10					29	$word = "-ing-";
632
633							} elsif(m/s$/o) { # Ends in 's'
634	16					52	$word = "-s-";
635
636							} elsif (m/tion$/o){ # Ends in 'tion'
637	0					0	$word = "-tion-";
638
639							} elsif (m/ly$/o){ # Ends in 'ly'
640	0					0	$word = "-ly-";
641
642							} elsif (m/ed$/o){ # Ends in 'ed'
643	0					0	$word = "-ed-";
644
645							} else { # Completely unknown
646	34					118	$word = "-unknown-";
647							}
648
649	143					716	return $word;
650							}
651
652
653							#####################################################################
654							# stem WORD (memoized)
655							#
656							# Returns the word stem as given by L. This can be
657							# turned off with the class parameter 'stem' => 0.
658							#####################################################################
659							sub stem {
660	236			236	1	748	my ($self, $word) = @_;
661	236	100				868	return $word unless $self->{'stem'};
662
663	4					33	my $stemref = Lingua::Stem::En::stem(-words => [ $word ]);
664	4					1394	return $stemref->[0];
665							}
666
667
668							#####################################################################
669							# _get_max_noun_regex
670							#
671							# This returns a compiled regex for extracting maximal noun phrases
672							# from a POS-tagged text.
673							#####################################################################
674							sub _get_max_noun_regex {
675	10			10		1725	my $regex = qr/
676							(?:$NUM)?(?:$GER\|$ADJ\|$PART)* # optional number, gerund - adjective -participle
677							(?:$NN)+ # Followed by one or more nouns
678							(?:
679							(?:$PREP)*(?:$DET)?(?:$NUM)? # Optional preposition, determinant, cardinal
680							(?:$GER\|$ADJ\|$PART)* # Optional gerund-adjective -participle
681							(?:$NN)+ # one or more nouns
682							)*
683							/xo;
684	10					92	return $regex;
685							}
686
687
688							######################################################################
689
690							=item get_proper_nouns TAGGED_TEXT
691
692							Given a POS-tagged text, this method returns a hash of all proper nouns
693							and their occurrence frequencies. The method is greedy and will
694							return multi-word phrases, if possible, so it would find ``Linguistic
695							Data Consortium'' as a single unit, rather than as three individual
696							proper nouns. This method does not stem the found words.
697
698							=cut
699
700							######################################################################
701							sub get_proper_nouns {
702	1			1	1	6	my ($self, $text) = @_;
703
704	1	50				6	return unless $self->_valid_text($text);
705
706	1					58	my @trimmed = map {$self->_strip_tags($_)}
	4					18
707							($text =~ /($NNP+)/gs);
708	1					6	my %nnp;
709	1					4	foreach my $n (@trimmed) {
710	4	50				17	next unless length($n) < 100; # sanity check on word length
711	4	50				25	$nnp{$n}++ unless $n =~ /^\s*$/;
712							}
713
714
715							# Now for some fancy resolution stuff...
716	1					7	foreach (keys %nnp){
717	4					22	my @words = split /\s/;
718
719							# Let's say this is an organization's name --
720							# (and it's got at least three words)
721							# is there a corresponding acronym in this hash?
722	4	100				22	if (scalar @words > 2){
723							# Make a (naive) acronym out of this name
724	1					5	my $acronym = join '', map{/^(\p{IsWord})\p{IsWord}*$/} @words;
	3					18
725	1	50				10	if (defined $nnp{$acronym}){
726							# If that acronym has been seen,
727							# remove it and add the values to
728							# the full name
729	0					0	$nnp{$_} += $nnp{$acronym};
730	0					0	delete $nnp{$acronym};
731							}
732							}
733							}
734
735	1					15	return %nnp;
736							}
737
738
739							######################################################################
740
741							=item get_nouns TAGGED_TEXT
742
743							Given a POS-tagged text, this method returns all nouns and their
744							occurrence frequencies.
745
746							=cut
747
748							######################################################################
749							sub get_nouns {
750	2			2	1	11	my ($self, $text) = @_;
751
752	2	50				14	return unless $self->_valid_text($text);
753
754	2					262	my @trimmed = map {$self->_strip_tags($_)}
	80					266
755							($text =~ /($NN)/gs);
756
757	2					26	my %return;
758	2					10	foreach my $n (@trimmed) {
759	80					284	$n = $self->stem($n);
760	80	50				261	next unless length($n) < 100; # sanity check on word length
761	80	50				456	$return{$n}++ unless $n =~ /^\s*$/;
762							}
763
764	2					109	return %return;
765							}
766
767
768							######################################################################
769
770							=item get_max_noun_phrases TAGGED_TEXT
771
772							Given a POS-tagged text, this method returns only the maximal noun phrases.
773							May be called directly, but is also used by L
774
775							=cut
776
777							######################################################################
778							sub get_max_noun_phrases {
779	2			2	1	10	my ($self, $text) = @_;
780
781	2	50				12	return unless $self->_valid_text($text);
782
783	2					1346	my @mn_phrases = map {$self->_strip_tags($_)}
	42					122
784							($text =~ /($MNP)/gs);
785
786	2					25	my %return;
787	2					11	foreach my $p (@mn_phrases) {
788	42	100				148	$p = $self->stem($p)
789							unless $p =~ /\s/; # stem single words
790	42	50				177	$return{$p}++ unless $p =~ /^\s*$/;
791							}
792
793	2					59	return %return;
794							}
795
796
797							######################################################################
798
799							=item get_noun_phrases TAGGED_TEXT
800
801							Similar to get_words, but requires a POS-tagged text as an argument.
802
803							=cut
804
805							######################################################################
806							sub get_noun_phrases {
807	4			4	1	33	my ($self, $text) = @_;
808
809	4	50				21	return unless $self->_valid_text($text);
810
811	4					11	my $found;
812	4					163	my $phrase_ext = qr/(?:$PREP\|$DET\|$NUM)+/xo;
813
814							# Find MNPs in the text, one sentence at a time
815							# Record and split if the phrase is extended by a (?:$PREP\|$DET\|$NUM)
816	4	100				2277	my @mn_phrases = map {$found->{$_}++ if m/$phrase_ext/; split /$phrase_ext/}
	84					697
	84					867
817							($text =~ /($MNP)/gs);
818
819	4					36	foreach(@mn_phrases){
820							# Split the phrase into an array of words, and
821							# create a loop for each word, shortening the
822							# phrase by removing the word in the first position
823							# Record the phrase and any single nouns that are found
824	124					359	my @words = split;
825
826	124					321	for(0 .. $#words){
827	188	100				621	$found->{join(" ", @words)}++ if scalar @words > 1;
828	188					376	my $w = shift @words;
829	188	100				1323	$found->{$w}++ if $w =~ /$NN/;
830							}
831							}
832
833	4					12	my %return;
834	4					11	foreach(keys %{$found}){
	4					72
835	220					595	my $k = $self->_strip_tags($_);
836	220					509	my $v = $found->{$_};
837
838							# We weight by the word count to favor long noun phrases
839	220					940	my @space_count = $k =~ /\s+/go;
840	220					512	my $word_count = scalar @space_count + 1;
841
842							# Throttle MNPs if necessary
843	220	50				691	next if $word_count > $self->{'longest_noun_phrase'};
844
845	220	100				704	$k = $self->stem($k) unless $word_count > 1; # stem single words
846	220					465	my $multiplier = 1;
847	220	100				570	$multiplier = $word_count if $self->{'weight_noun_phrases'};
848	220					873	$return{$k} += ($multiplier * $v);
849							}
850
851	4					368	return %return;
852							}
853
854
855							######################################################################
856
857							=item install
858
859							Reads some included corpus data and saves it in a stored hash on the
860							local file system. This is called automatically if the tagger can't
861							find the stored lexicon.
862
863							=cut
864
865							######################################################################
866							sub install {
867	0			0	1	0	my ($self) = @_;
868
869	0	0				0	carp "Creating part-of-speech lexicon" if $self->{'debug'};
870	0					0	$self->_load_tags($self->{'tag_lex'});
871	0					0	$self->_load_words($self->{'word_lex'});
872	0					0	$self->_load_words($self->{'unknown_lex'});
873	0					0	store \%_LEXICON, $self->{'word_path'};
874	0					0	store \%_HMM, $self->{'tag_path'};
875							}
876
877
878							########################################################
879							# LOAD THE 2-GRAMS INTO A HASH FROM YAML DATA
880							#
881							# This is a naive (but fast) YAML data parser. It will
882							# load a YAML document with a collection of key: value
883							# entries ({pos tag}: {probability}) mapped onto
884							# single keys ({tag}). Each map is expected to be on a
885							# single line; i.e., det: { jj: 0.2, nn: 0.5, vb: 0.0002 }
886							#########################################################
887							sub _load_tags {
888	1			1		6	my ($self, $lexicon) = @_;
889
890	1					32	my $path = File::Spec->catfile($lexpath, $lexicon);
891	1	50				13	my $fh = new FileHandle $path or die "Could not open $path: $!";
892	1					235	while(<$fh>){
893	45	100				622	next unless my ($key, $data) = m/^"?([^\{"]+)"?: \{ (.*) \}/;
894	44					2200	my %tags = split /[:,]\s+/, $data;
895	44					504	foreach(keys %tags){
896	1397					4712	$_HMM{$key}{$_} = $tags{$_};
897							}
898							}
899	1					15	$fh->close;
900							}
901
902
903							#########################################################
904							# LOAD THE WORD LEXICON INTO A HASH FROM YAML DATA
905							#
906							# This is a naive (but fast) YAML data parser. It will
907							# load a YAML document with a collection of key: value
908							# entries ({pos tag}: {count}) mapped onto single
909							# keys ({word}). Each map is expected to be on a
910							# single line; i.e., key: { jj: 103, nn: 34, vb: 1 }
911							#########################################################
912							sub _load_words {
913	2			2		1814	my ($self, $lexicon) = @_;
914
915	2					53	my $path = File::Spec->catfile($lexpath, $lexicon);
916
917	2	50				25	my $fh = new FileHandle $path or die "Could not open $path: $!";
918	2					370	while(<$fh>){
919	43830	100				276022	next unless my ($key, $data) = m/^"?([^\{"]+)"?: \{ (.*) \}/;
920	43828					185739	my %tags = split /[:,]\s+/, $data;
921	43828					126925	foreach(keys %tags){
922	51989					280024	$_LEXICON{$key}{$_} = \$tags{$_};
923							}
924							}
925	2					23	$fh->close;
926							}
927
928
929							############################
930							# RETURN TRUE
931							############################
932							1;
933
934
935							__END__