File Coverage

blib/lib/Lingua/EN/Tagger.pm

Criterion	Covered	Total	%
statement	300	327	91.7
branch	95	134	70.9
condition	22	29	75.8
subroutine	36	38	94.7
pod	12	14	85.7
total	465	542	85.7

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::EN::Tagger;
2
3							our $VERSION = '0.30';
4
5	10			10		695985	use warnings;
	10					104
	10					343
6	10			10		48	use strict;
	10					20
	10					221
7
8	10			10		279	use 5.008000;
	10					48
9
10	10			10		72	use Carp;
	10					36
	10					739
11	10			10		66	use File::Spec;
	10					19
	10					304
12	10			10		4835	use FileHandle;
	10					101999
	10					51
13	10			10		8901	use HTML::TokeParser;
	10					105579
	10					347
14	10			10		5356	use Lingua::Stem::En;
	10					23772
	10					488
15	10			10		6782	use Storable;
	10					32691
	10					577
16	10			10		6467	use Memoize;
	10					25341
	10					4114
17
18							# Class variables
19							our %_LEXICON; # this holds the word lexicon
20							our %_HMM; # this holds the hidden markov model for English grammar
21							our $MNP; # this holds the compiled maximal noun phrase regex
22							our ($lexpath, $word_path, $tag_path);
23							our ($NUM, $GER, $NNP, $ADJ, $PART, $NN, $PREP, $DET, $PAREN, $QUOT, $SEN, $WORD);
24
25							BEGIN { # REGEX SETUP
26							sub get_exp {
27	120			120	0	246	my ($tag) = @_;
28	120	50				275	return unless defined $tag;
29	10			10		6374	return qr\|<$tag>[^<]+\s*\|;
	10					199
	10					161
	120					2374
30							}
31
32	10			10		50	$NUM = get_exp('cd');
33	10					73	$GER = get_exp('vbg');
34	10					57	$ADJ = get_exp('jj[rs]*');
35	10					42	$PART = get_exp('vbn');
36	10					36	$NN = get_exp('nn[sp]*');
37	10					59	$NNP = get_exp('nnp');
38	10					35	$PREP = get_exp('in');
39	10					37	$DET = get_exp('det');
40	10					31	$PAREN= get_exp('[lr]rb');
41	10					38	$QUOT = get_exp('ppr');
42	10					29	$SEN = get_exp('pp');
43	10					44	$WORD = get_exp('\p{IsWord}+');
44
45	10					219899	($lexpath) = __FILE__ =~ /(.*)\.pm/;
46	10					258	$word_path = File::Spec->catfile($lexpath, 'pos_words.hash');
47	10					84	$tag_path = File::Spec->catfile($lexpath, 'pos_tags.hash');
48
49	10					105	memoize(\&Lingua::EN::Tagger::stem,
50							TIE => [ 'Memoize::ExpireLRU',
51							CACHESIZE => 1000,
52							]);
53
54	10					3244	memoize(\&Linua::EN::Tagger::_assign_tag,
55							TIE => ['Memoize::ExpireLRU',
56							CACHESIZE => 10000,
57							]);
58							}
59
60
61							######################################################################
62
63							=head1 NAME
64
65							Lingua::EN::Tagger - Part-of-speech tagger for English natural language processing.
66
67
68							=head1 SYNOPSIS
69
70							# Create a parser object
71							my $p = new Lingua::EN::Tagger;
72
73							# Add part of speech tags to a text
74							my $tagged_text = $p->add_tags($text);
75
76							...
77
78							# Get a list of all nouns and noun phrases with occurrence counts
79							my %word_list = $p->get_words($text);
80
81							...
82
83							# Get a readable version of the tagged text
84							my $readable_text = $p->get_readable($text);
85
86
87							=head1 DESCRIPTION
88
89							The module is a probability based, corpus-trained tagger that assigns POS tags to
90							English text based on a lookup dictionary and a set of probability values. The tagger
91							assigns appropriate tags based on conditional probabilities - it examines the
92							preceding tag to determine the appropriate tag for the current word.
93							Unknown words are classified according to word morphology or can be set to
94							be treated as nouns or other parts of speech.
95
96							The tagger also extracts as many nouns and noun phrases as it can, using a
97							set of regular expressions.
98
99							=head1 CONSTRUCTOR
100
101							=over
102
103
104							=item new %PARAMS
105
106							Class constructor. Takes a hash with the following parameters (shown with default
107							values):
108
109							=over
110
111							=item unknown_word_tag => ''
112
113							Tag to assign to unknown words
114
115							=item stem => 0
116
117							Stem single words using Lingua::Stem::EN
118
119							=item weight_noun_phrases => 0
120
121							When returning occurrence counts for a noun phrase, multiply the value
122							by the number of words in the NP.
123
124							=item longest_noun_phrase => 5
125
126							Will ignore noun phrases longer than this threshold. This affects
127							only the get_words() and get_nouns() methods.
128
129							=item relax => 0
130
131							Relax the Hidden Markov Model: this may improve accuracy for
132							uncommon words, particularly words used polysemously
133
134							=back
135
136							=cut
137
138							######################################################################
139
140							sub new {
141	10			10	1	6015	my ($class, %params) = @_;
142	10					142	my $self = {unknown_word_tag => '',
143							stem => 0,
144							weight_noun_phrases => 0,
145							longest_noun_phrase => 5,
146							lc => 1,
147							tag_lex => 'tags.yml',
148							word_lex => 'words.yml',
149							unknown_lex => 'unknown.yml',
150							word_path => $word_path,
151							tag_path => $tag_path,
152							relax => 0,
153							debug => 0,
154							%params};
155
156	10					29	bless $self, $class;
157
158	10	50	33			528	unless (-f $self->{'word_path'} and -f $self->{'tag_path'}){
159	0	0				0	carp "Couldn't locate POS lexicon, creating new one" if $self->{'debug'};
160	0					0	$self->install();
161							} else {
162	10					34	%_LEXICON = %{retrieve($self->{'word_path'})}; # A hash of words and corresponding parts of speech
	10					65
163	10					1695635	%_HMM = %{retrieve($self->{'tag_path'})}; # A hash of adjacent part of speech tags and the probability of each
	10					269
164							}
165
166	10					11635	$MNP = $self->_get_max_noun_regex();
167	10					71	$self->_reset();
168
169	10					118	return $self;
170							}
171
172							######################################################################
173
174							=back
175
176							=head1 METHODS
177
178							=over
179
180							=item add_tags TEXT
181
182							Examine the string provided and return it fully tagged (XML style)
183
184							=cut
185
186							######################################################################
187							sub add_tags {
188	36			36	1	108	my ($self, $text) = @_;
189
190	36					134	my $tags = $self->add_tags_incrementally($text);
191	36					186	$self->_reset;
192	36					167	return $tags;
193							}
194
195							######################################################################
196
197							=item add_tags_incrementally TEXT
198
199							Examine the string provided and return it fully tagged (XML style) but
200							do not reset the internal part-of-speech state between invocations.
201
202							=cut
203
204							######################################################################
205							sub add_tags_incrementally {
206	36			36	1	90	my ($self, $text) = @_;
207
208	36	100				112	return unless $self->_valid_text($text);
209
210	28					117	my @text = $self->_clean_text($text);
211	28					148	my $t = $self->{'current_tag'}; # shortcut
212							my (@tags) =
213							map {
214	28					78	$t = $self->_assign_tag($t, $self->_clean_word($_))
215	1966		50			4985	\|\| $self->{'unknown_word_tag'} \|\| 'nn';
216	1966					7050	"<$t>$_"
217							} @text;
218	28					134	$self->{'current_tag'} = $t;
219	28					1061	return join ' ', @tags;
220							}
221
222							######################################################################
223
224							=item get_words TEXT
225
226							Given a text string, return as many nouns and
227							noun phrases as possible. Applies L and involves three stages:
228
229							=over
230
231							* Tag the text
232							* Extract all the maximal noun phrases
233							* Recursively extract all noun phrases from the MNPs
234
235							=back
236
237							=cut
238
239							######################################################################
240							sub get_words {
241	2			2	1	28	my ($self, $text) = @_;
242
243	2	50				12	return unless $self->_valid_text($text);
244
245	2					10	my $tagged = $self->add_tags($text);
246
247	2	50				22	if($self->{'longest_noun_phrase'} <= 1){
248	0					0	return $self->get_nouns($tagged);
249							} else {
250	2					15	return $self->get_noun_phrases($tagged);
251							}
252							}
253
254
255							######################################################################
256
257							=item get_readable TEXT
258
259							Return an easy-on-the-eyes tagged version of a text string. Applies
260							L and reformats to be easier to read.
261
262							=cut
263
264							######################################################################
265							sub get_readable {
266	15			15	1	7131	my ($self, $text) = @_;
267
268	15	50				64	return unless $self->_valid_text($text);
269
270	15					67	my $tagged = $self->add_tags($text);
271	15					1352	$tagged =~ s/<\p{IsLower}+>([^<]+)<\/(\p{IsLower}+)>/$1\/\U$2/go;
272	15					176	return $tagged;
273							}
274
275
276							######################################################################
277
278							=item get_sentences TEXT
279
280							Returns an anonymous array of sentences (without POS tags) from a text.
281
282							=cut
283
284							######################################################################
285							sub get_sentences {
286	1			1	1	720	my ($self, $text) = @_;
287
288	1	50				6	return unless $self->_valid_text($text);
289	1					6	my $tagged = $self->add_tags($text);
290	1					3	my @sentences;
291							{
292	1					3	local $self->{'lc'};
	1					4
293	1					3	$self->{'lc'} = 0;
294	1					24	@sentences = map {$self->_strip_tags($_)}
	8					20
295							split /<\/pp>/, $tagged;
296							}
297
298	1					5	foreach (@sentences){
299	8					21	s/ ('s?) /$1 /g;
300	8					35	s/ ([\$\(\[\{]) / $1/g;
301	8					63	s/ (\P{IsWord}+) /$1 /g;
302	8					17	s/ (`+) / $1/g;
303	8					48	s/ (\P{IsWord}+)$/$1/;
304	8					15	s/^(`+) /$1/;
305	8					14	s/^([\$\(\[\{]) /$1/g;
306							}
307	1					10	return \@sentences;
308							}
309
310
311							###########################################
312							# _valid_text TEXT
313							#
314							# Check whether the text is a valid string
315							###########################################
316							sub _valid_text {
317	448			448		715	my ($self, $text) = @_;
318	448	100				1698	if(!defined $text){
		50
		100
319							# $text is undefined, nothing to parse
320	4	50				20	carp "method call on uninitialized variable" if $self->{'debug'};
321	4					17	return undef;
322							} elsif (ref $text){
323							# $text is a scalar reference, don't parse
324	0	0				0	carp "method call on a scalar reference" if $self->{'debug'};
325	0					0	return undef;
326							} elsif ($text =~ /^\s*$/){
327							# $text is defined as an empty string, nothing to parse
328	4					19	return undef;
329							} else {
330							# $text is valid
331	440					1127	return 1;
332							}
333							}
334
335
336							sub lower_case {
337	0			0	0	0	my ($self, $lc) = @_;
338	0	0				0	if($lc){
339	0					0	$self->{'lc'} = 1;
340							} else {
341	0					0	$self->{'lc'} = 0;
342							}
343							}
344
345							#####################################################################
346							# _strip_tags TEXT
347							#
348							# Return a text string with the XML-style part-of-speech tags removed.
349							#####################################################################
350							sub _strip_tags {
351	355			355		619	my ($self, $text) = @_;
352	355	50				597	return unless $self->_valid_text($text);
353
354	355					1979	$text =~ s/<[^>]+>//gs;
355	355					1048	$text =~ s/\s+/ /gs;
356	355					798	$text =~ s/^\s*//;
357	355					1388	$text =~ s/\s*$//;
358	355	100				694	if($self->{'lc'}){
359	347					802	return lc($text);
360							} else {
361	8					28	return $text;
362							}
363							}
364
365
366							#####################################################################
367							# _clean_text TEXT
368							#
369							# Strip the provided text of HTML-style tags and separate off
370							# any punctuation in preparation for tagging
371							#####################################################################
372							sub _clean_text {
373	30			30		100	my ($self, $text) = @_;
374	30	50				93	return unless $self->_valid_text($text);
375
376							# Strip out any markup and convert entities to their proper form
377	30					69	my $html_parser;
378	30					155	utf8::decode($text);
379	30					254	$html_parser = HTML::TokeParser->new(\$text);
380
381	30					5581	my $cleaned_text = $html_parser->get_text;
382	30					2689	while($html_parser->get_token){
383	0					0	$cleaned_text .= ($html_parser->get_text)." ";
384							}
385
386							# Tokenize the text (splitting on punctuation as you go)
387	30					1226	my @tokenized = map {$self->_split_punct($_)}
	1789					3042
388							split /\s+/, $cleaned_text;
389	30					337	my @words = $self->_split_sentences(\@tokenized);
390	30					926	return @words;
391							}
392
393
394							#####################################################################
395							# _split_sentences ARRAY_REF
396							#
397							# This handles all of the trailing periods, keeping those that
398							# belong on abbreviations and removing those that seem to be
399							# at the end of sentences. This method makes some assumptions
400							# about the use of capitalization in the incoming text
401							#####################################################################
402							sub _split_sentences {
403	36			36		3666	my ($self, $array_ref) = @_;
404	36					70	my @tokenized = @{$array_ref};
	36					322
405
406	36					234	my @PEOPLE = qw/jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys supt det mssrs rev/;
407	36					174	my @ARMY = qw/col gen lt cmdr adm capt sgt cpl maj brig/;
408	36					114	my @INST = qw/dept univ assn bros ph.d/;
409	36					306	my @PLACE = qw/arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy hwy hway la pde pd plz pl rd st tce/;
410	36					120	my @COMP = qw/mfg inc ltd co corp/;
411	36					533	my @STATE = qw/ala ariz ark cal calif colo col conn del fed fla ga ida id ill ind ia kans kan ken ky la me md is mass mich minn miss mo mont neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt va wash wis wisc wy wyo usafa alta man ont que sask yuk/;
412	36					204	my @MONTH = qw/jan feb mar apr may jun jul aug sep sept oct nov dec/;
413	36					208	my @MISC = qw/vs etc no esp/;
414	36					130	my %ABBR = map {$_, 0}
	5076					10740
415							(@PEOPLE, @ARMY, @INST, @PLACE, @COMP, @STATE, @MONTH, @MISC);
416
417	36					430	my @words;
418	36					169	for(0 .. $#tokenized){
419	2052	100	100			8225	if (defined $tokenized[$_ + 1]
			100
420							and $tokenized[$_ + 1] =~ /[\p{IsUpper}\W]/
421							and $tokenized[$_] =~ /^(.+)\.$/){
422
423							# Don't separate the period off words that
424							# meet any of the following conditions:
425							# 1. It is defined in one of the lists above
426							# 2. It is only one letter long: Alfred E. Sloan
427							# 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
428	79	100	100			759	unless(defined $ABBR{lc $1}
			100
429							or $1 =~ /^\p{IsLower}$/i
430							or $1 =~ /^\p{IsLower}(?:\.\p{IsLower})+$/i){
431	59					182	push @words, ($1, '.');
432	59					124	next;
433							}
434							}
435	1993					3594	push @words, $tokenized[$_];
436							}
437
438							# If the final word ends in a period...
439	36	100	66			313	if(defined $words[$#words] and $words[$#words] =~ /^(.*\p{IsWord})\.$/){
440	7					33	$words[$#words] = $1;
441	7					22	push @words, '.';
442							}
443
444	36					2140	return @words;
445							}
446
447
448							###########################################################################
449							# _split_punct TERM
450							#
451							# Separate punctuation from words, where appropriate. This leaves trailing
452							# periods in place to be dealt with later. Called by the _clean_text method.
453							###########################################################################
454							sub _split_punct {
455	1799			1799		2620	local $_ = $_[1];
456
457							# If there's no punctuation, return immediately
458	1799	100				4802	return $_ if /^\p{IsWord}+$/;
459
460							# Sanity checks
461	372					615	s/\W{10,}/ /og; # get rid of long trails of non-word characters
462
463							# Put quotes into a standard format
464	372					568	s/`(?!`)(?=.*\p{IsWord})/` /og; # Shift left quotes off text
465	372					614	s/"(?=.*\p{IsWord})/ `` /og; # Convert left quotes to ``
466	372					569	s/(?
467	372					595	s/"/ '' /og; # Convert (remaining) quotes to ''
468	372					594	s/(?<=\p{IsWord})'(?!')(?=\P{IsWord}\|$)/ ' /go; # Separate right single quotes
469
470							# Handle all other punctuation
471	372					548	s/--+/ - /go; # Convert and separate dashes
472	372					732	s/,(?!\p{IsDigit})/ , /go; # Shift commas off everything but numbers
473	372					586	s/:$/ :/go; # Shift semicolons off
474	372					556	s/(\.\.\.+)/ $1 /; # Shift ellipses off
475	372					672	s/([$\[\{\}\]$])/ $1 /go; # Shift off brackets
476	372					667	s/([\!\?#\$%;~\|])/ $1 /go; # Shift off other ``standard'' punctuation
477
478							# English-specific contractions
479	372					692	s/(?<=\p{IsAlpha})'([dms])\b/ '$1/go; # Separate off 'd 'm 's
480	372					551	s/n't\b/ n't/go; # Separate off n't
481	372					599	s/'(ve\|ll\|re)\b/ '$1/go; # Separate off 've, 'll, 're
482
483	372					1203	return split;
484							}
485
486
487							#####################################################################
488							# _assign_tag TAG, WORD (memoized)
489							#
490							# Given a preceding tag TAG, assign a tag to WORD.
491							# Called by the choose_tag method.
492							# This subroutine is a modified version of the Viterbi algorithm
493							# for part of speech tagging
494							#####################################################################
495							sub _assign_tag {
496	1966			1966		3541	my ($self, $prev_tag, $word) = @_;
497
498	1966	50	33			6527	if ($self->{'unknown_word_tag'} and $word eq "-unknown-"){
		100
499							# If the 'unknown_word_tag' value is defined,
500							# classify unknown words accordingly
501	0					0	return $self->{'unknown_word_tag'};
502							} elsif ($word eq "-sym-"){
503							# If this is a symbol, tag it as a symbol
504	8					38	return "sym";
505							}
506
507	1958					2936	my $best_so_far = 0;
508
509	1958					3671	my $w = $_LEXICON{$word};
510	1958					3164	my $t = \%_HMM;
511
512							##############################################################
513							# TAG THE TEXT
514							# What follows is a modified version of the Viterbi algorithm
515							# which is used in most POS taggers
516							##############################################################
517	1958					2789	my $best_tag;
518
519	1958					2774	foreach my $tag (keys %{$t->{$prev_tag}}){
	1958					16969
520							# With the $self->{'relax'} var set, this method
521							# will also include any `open classes' of POS tags
522	79164					91947	my $pw;
523	79164	100	100			91318	if(defined ${$w->{$tag}}){
	79164	100				227224
524	4799					6054	$pw = ${$w->{$tag}};
	4799					8662
525							} elsif ($self->{'relax'} and $tag =~ /^(?:jj\|nn\|rb\|vb)/){
526	2119					2997	$pw = 0;
527							} else {
528	72246					105189	next;
529							}
530
531							# Bayesian logic:
532							# P = P($tag \| $prev_tag) * P($tag \| $word)
533	6918					16013	my $probability = $t->{$prev_tag}{$tag} * ($pw + 1);
534
535							# Set the tag with maximal probability
536	6918	100				13839	if($probability > $best_so_far) {
537	3196					4337	$best_so_far = $probability;
538	3196					5150	$best_tag = $tag;
539							}
540							}
541
542	1958					9208	return $best_tag;
543							}
544
545
546							############################################################################
547							# _reset
548							#
549							# this subroutine will reset the preceding tag to a sentence ender (PP).
550							# This prepares the first word of a new sentence to be tagged correctly.
551							############################################################################
552							sub _reset {
553	46			46		145	my ($self) = @_;
554	46					172	$self->{'current_tag'} = 'pp';
555							}
556
557
558							#####################################################################
559							# _clean_word WORD
560							#
561							# This subroutine determines whether a word should be considered in its
562							# lower or upper case form. This is useful in considering proper nouns
563							# and words that begin sentences. Called by L.
564							#####################################################################
565							sub _clean_word {
566	1966			1966		3794	my ($self, $word) = @_;
567
568	1966	100				6696	if (defined $_LEXICON{$word}) {
		50
569							# seen this word as it appears (lower or upper case)
570	1823					4208	return $word;
571
572							} elsif (defined $_LEXICON{lcfirst $word}) {
573							# seen this word only as lower case
574	0					0	return lcfirst $word;
575
576							} else {
577							# never seen this word. guess.
578	143					6636	return $self->_classify_unknown_word($word);
579							}
580							}
581
582
583							#####################################################################
584							# _classify_unknown_word WORD
585							#
586							# This changes any word not appearing in the lexicon to identifiable
587							# classes of words handled by a simple unknown word classification
588							# metric. Called by the _clean_word method.
589							#####################################################################
590							sub _classify_unknown_word {
591	143			143		308	my ($self, $word) = @_;
592
593	143					283	local $_ = $word;
594
595	143	100				2369	if(m/[\(\{\[]/){ # Left brackets
		100
		100
		50
		50
		50
		100
		100
		100
		100
		100
		50
		50
		50
596	14					63	$word = "LRB";
597
598							} elsif(m/[\)\]\}]/o){ # Right brackets
599	14					44	$word = "RRB";
600
601							} elsif (m/^-?(?:\p{IsDigit}+(?:\.\p{IsDigit}*)?\|\.\p{IsDigit}+)$/){ # Floating point number
602	6					19	$word = "NUM";
603
604							} elsif (m/^\p{IsDigit}+[\p{IsDigit}\/:-]+\p{IsDigit}$/){ # Other number constructs
605	0					0	$word = "NUM";
606
607							} elsif (m/^-?\p{IsDigit}+\p{IsWord}+$/o){ # Ordinal number
608	0					0	$word = "ORD";
609
610							} elsif (m/^\p{IsUpper}[\p{IsUpper}\.-]*$/o) { # Abbreviation (all caps)
611	0					0	$word = "-abr-";
612
613							} elsif (m/\p{IsWord}-\p{IsWord}/o){ # Hyphenated word
614	21					112	my ($h_suffix) = m/-([^-]+)$/;
615
616	21	100	66			90	if ($h_suffix and defined ${$_LEXICON{$h_suffix}{'jj'} }){
	21					149
617							# last part of this is defined as an adjective
618	10					27	$word = "-hyp-adj-";
619							} else {
620							# last part of this is not defined as an adjective
621	11					26	$word = "-hyp-";
622							}
623
624							} elsif (m/^\W+$/o){ # Symbol
625	8					21	$word = "-sym-";
626
627							} elsif ($_ eq ucfirst) { # Capitalized word
628	20					6507	$word = "-cap-";
629
630							} elsif (m/ing$/o) { # Ends in 'ing'
631	10					34	$word = "-ing-";
632
633							} elsif(m/s$/o) { # Ends in 's'
634	16					40	$word = "-s-";
635
636							} elsif (m/tion$/o){ # Ends in 'tion'
637	0					0	$word = "-tion-";
638
639							} elsif (m/ly$/o){ # Ends in 'ly'
640	0					0	$word = "-ly-";
641
642							} elsif (m/ed$/o){ # Ends in 'ed'
643	0					0	$word = "-ed-";
644
645							} else { # Completely unknown
646	34					91	$word = "-unknown-";
647							}
648
649	143					521	return $word;
650							}
651
652
653							#####################################################################
654							# stem WORD (memoized)
655							#
656							# Returns the word stem as given by L. This can be
657							# turned off with the class parameter 'stem' => 0.
658							#####################################################################
659							sub stem {
660	236			236	1	408	my ($self, $word) = @_;
661	236	100				561	return $word unless $self->{'stem'};
662
663	4					32	my $stemref = Lingua::Stem::En::stem(-words => [ $word ]);
664	4					1180	return $stemref->[0];
665							}
666
667
668							#####################################################################
669							# _get_max_noun_regex
670							#
671							# This returns a compiled regex for extracting maximal noun phrases
672							# from a POS-tagged text.
673							#####################################################################
674							sub _get_max_noun_regex {
675	10			10		1545	my $regex = qr/
676							(?:$NUM)?(?:$GER\|$ADJ\|$PART)* # optional number, gerund - adjective -participle
677							(?:$NN)+ # Followed by one or more nouns
678							(?:
679							(?:$PREP)*(?:$DET)?(?:$NUM)? # Optional preposition, determinant, cardinal
680							(?:$GER\|$ADJ\|$PART)* # Optional gerund-adjective -participle
681							(?:$NN)+ # one or more nouns
682							)*
683							/xo;
684	10					65	return $regex;
685							}
686
687
688							######################################################################
689
690							=item get_proper_nouns TAGGED_TEXT
691
692							Given a POS-tagged text, this method returns a hash of all proper nouns
693							and their occurrence frequencies. The method is greedy and will
694							return multi-word phrases, if possible, so it would find ``Linguistic
695							Data Consortium'' as a single unit, rather than as three individual
696							proper nouns. This method does not stem the found words.
697
698							=cut
699
700							######################################################################
701							sub get_proper_nouns {
702	1			1	1	4	my ($self, $text) = @_;
703
704	1	50				4	return unless $self->_valid_text($text);
705
706	1					42	my @trimmed = map {$self->_strip_tags($_)}
	4					10
707							($text =~ /($NNP+)/gs);
708	1					3	my %nnp;
709	1					4	foreach my $n (@trimmed) {
710	4	50				18	next unless length($n) < 100; # sanity check on word length
711	4	50				16	$nnp{$n}++ unless $n =~ /^\s*$/;
712							}
713
714
715							# Now for some fancy resolution stuff...
716	1					8	foreach (keys %nnp){
717	4					49	my @words = split /\s/;
718
719							# Let's say this is an organization's name --
720							# (and it's got at least three words)
721							# is there a corresponding acronym in this hash?
722	4	100				19	if (scalar @words > 2){
723							# Make a (naive) acronym out of this name
724	1					4	my $acronym = join '', map{/^(\p{IsWord})\p{IsWord}*$/} @words;
	3					14
725	1	50				11	if (defined $nnp{$acronym}){
726							# If that acronym has been seen,
727							# remove it and add the values to
728							# the full name
729	0					0	$nnp{$_} += $nnp{$acronym};
730	0					0	delete $nnp{$acronym};
731							}
732							}
733							}
734
735	1					12	return %nnp;
736							}
737
738
739							######################################################################
740
741							=item get_nouns TAGGED_TEXT
742
743							Given a POS-tagged text, this method returns all nouns and their
744							occurrence frequencies.
745
746							=cut
747
748							######################################################################
749							sub get_nouns {
750	2			2	1	8	my ($self, $text) = @_;
751
752	2	50				9	return unless $self->_valid_text($text);
753
754	2					171	my @trimmed = map {$self->_strip_tags($_)}
	80					144
755							($text =~ /($NN)/gs);
756
757	2					19	my %return;
758	2					9	foreach my $n (@trimmed) {
759	80					138	$n = $self->stem($n);
760	80	50				157	next unless length($n) < 100; # sanity check on word length
761	80	50				238	$return{$n}++ unless $n =~ /^\s*$/;
762							}
763
764	2					70	return %return;
765							}
766
767
768							######################################################################
769
770							=item get_max_noun_phrases TAGGED_TEXT
771
772							Given a POS-tagged text, this method returns only the maximal noun phrases.
773							May be called directly, but is also used by L
774
775							=cut
776
777							######################################################################
778							sub get_max_noun_phrases {
779	2			2	1	8	my ($self, $text) = @_;
780
781	2	50				9	return unless $self->_valid_text($text);
782
783	2					951	my @mn_phrases = map {$self->_strip_tags($_)}
	42					80
784							($text =~ /($MNP)/gs);
785
786	2					16	my %return;
787	2					7	foreach my $p (@mn_phrases) {
788	42	100				110	$p = $self->stem($p)
789							unless $p =~ /\s/; # stem single words
790	42	50				127	$return{$p}++ unless $p =~ /^\s*$/;
791							}
792
793	2					51	return %return;
794							}
795
796
797							######################################################################
798
799							=item get_noun_phrases TAGGED_TEXT
800
801							Similar to get_words, but requires a POS-tagged text as an argument.
802
803							=cut
804
805							######################################################################
806							sub get_noun_phrases {
807	4			4	1	14	my ($self, $text) = @_;
808
809	4	50				18	return unless $self->_valid_text($text);
810
811	4					10	my $found;
812	4					201	my $phrase_ext = qr/(?:$PREP\|$DET\|$NUM)+/xo;
813
814							# Find MNPs in the text, one sentence at a time
815							# Record and split if the phrase is extended by a (?:$PREP\|$DET\|$NUM)
816	4	100				1691	my @mn_phrases = map {$found->{$_}++ if m/$phrase_ext/; split /$phrase_ext/}
	84					602
	84					640
817							($text =~ /($MNP)/gs);
818
819	4					26	foreach(@mn_phrases){
820							# Split the phrase into an array of words, and
821							# create a loop for each word, shortening the
822							# phrase by removing the word in the first position
823							# Record the phrase and any single nouns that are found
824	124					285	my @words = split;
825
826	124					225	for(0 .. $#words){
827	188	100				472	$found->{join(" ", @words)}++ if scalar @words > 1;
828	188					280	my $w = shift @words;
829	188	100				1050	$found->{$w}++ if $w =~ /$NN/;
830							}
831							}
832
833	4					17	my %return;
834	4					10	foreach(keys %{$found}){
	4					71
835	220					408	my $k = $self->_strip_tags($_);
836	220					351	my $v = $found->{$_};
837
838							# We weight by the word count to favor long noun phrases
839	220					604	my @space_count = $k =~ /\s+/go;
840	220					348	my $word_count = scalar @space_count + 1;
841
842							# Throttle MNPs if necessary
843	220	50				457	next if $word_count > $self->{'longest_noun_phrase'};
844
845	220	100				472	$k = $self->stem($k) unless $word_count > 1; # stem single words
846	220					312	my $multiplier = 1;
847	220	100				404	$multiplier = $word_count if $self->{'weight_noun_phrases'};
848	220					581	$return{$k} += ($multiplier * $v);
849							}
850
851	4					286	return %return;
852							}
853
854
855							######################################################################
856
857							=item install
858
859							Reads some included corpus data and saves it in a stored hash on the
860							local file system. This is called automatically if the tagger can't
861							find the stored lexicon.
862
863							=cut
864
865							######################################################################
866							sub install {
867	0			0	1	0	my ($self) = @_;
868
869	0	0				0	carp "Creating part-of-speech lexicon" if $self->{'debug'};
870	0					0	$self->_load_tags($self->{'tag_lex'});
871	0					0	$self->_load_words($self->{'word_lex'});
872	0					0	$self->_load_words($self->{'unknown_lex'});
873	0					0	store \%_LEXICON, $self->{'word_path'};
874	0					0	store \%_HMM, $self->{'tag_path'};
875							}
876
877
878							########################################################
879							# LOAD THE 2-GRAMS INTO A HASH FROM YAML DATA
880							#
881							# This is a naive (but fast) YAML data parser. It will
882							# load a YAML document with a collection of key: value
883							# entries ({pos tag}: {probability}) mapped onto
884							# single keys ({tag}). Each map is expected to be on a
885							# single line; i.e., det: { jj: 0.2, nn: 0.5, vb: 0.0002 }
886							#########################################################
887							sub _load_tags {
888	1			1		4	my ($self, $lexicon) = @_;
889
890	1					27	my $path = File::Spec->catfile($lexpath, $lexicon);
891	1	50				10	my $fh = new FileHandle $path or die "Could not open $path: $!";
892	1					210	while(<$fh>){
893	45	100				311	next unless my ($key, $data) = m/^"?([^\{"]+)"?: \{ (.*) \}/;
894	44					1309	my %tags = split /[:,]\s+/, $data;
895	44					285	foreach(keys %tags){
896	1397					2735	$_HMM{$key}{$_} = $tags{$_};
897							}
898							}
899	1					9	$fh->close;
900							}
901
902
903							#########################################################
904							# LOAD THE WORD LEXICON INTO A HASH FROM YAML DATA
905							#
906							# This is a naive (but fast) YAML data parser. It will
907							# load a YAML document with a collection of key: value
908							# entries ({pos tag}: {count}) mapped onto single
909							# keys ({word}). Each map is expected to be on a
910							# single line; i.e., key: { jj: 103, nn: 34, vb: 1 }
911							#########################################################
912							sub _load_words {
913	2			2		903	my ($self, $lexicon) = @_;
914
915	2					36	my $path = File::Spec->catfile($lexpath, $lexicon);
916
917	2	50				16	my $fh = new FileHandle $path or die "Could not open $path: $!";
918	2					273	while(<$fh>){
919	43830	100				201910	next unless my ($key, $data) = m/^"?([^\{"]+)"?: \{ (.*) \}/;
920	43828					135837	my %tags = split /[:,]\s+/, $data;
921	43828					95813	foreach(keys %tags){
922	51989					243264	$_LEXICON{$key}{$_} = \$tags{$_};
923							}
924							}
925	2					18	$fh->close;
926							}
927
928
929							############################
930							# RETURN TRUE
931							############################
932							1;
933
934
935							__END__