File Coverage

blib/lib/Treex/Tool/EnglishMorpho/Lemmatizer.pm
Criterion Covered Total %
statement 47 176 26.7
branch 12 236 5.0
condition 4 12 33.3
subroutine 11 18 61.1
pod 1 1 100.0
total 75 443 16.9


line stmt bran cond sub pod time code
1             package Treex::Tool::EnglishMorpho::Lemmatizer;
2             $Treex::Tool::EnglishMorpho::Lemmatizer::VERSION = '2.20151102';
3 1     1   834 use strict;
  1         2  
  1         40  
4 1     1   7 use warnings;
  1         2  
  1         37  
5 1     1   853 use Moose;
  1         164054  
  1         9  
6 1     1   8002 use Treex::Core::Common;
  1         570075  
  1         5  
7 1     1   6200 use Treex::Core::Resource qw(require_file_from_share);
  1         2  
  1         60  
8 1     1   5 use File::Slurp;
  1         3  
  1         74  
9 1     1   5 use utf8;
  1         1  
  1         5  
10              
11             has 'exceptions_filename' => (
12             is => 'ro',
13             init_arg => undef,
14             default => sub {
15             return require_file_from_share('data/models/lemmatizer/en/exceptions2014_11_21.tsv');
16             },
17             );
18              
19             has 'negation_filename' => (
20             is => 'ro',
21             init_arg => undef,
22             default => sub {
23             return require_file_from_share('data/models/lemmatizer/en/negation');
24             },
25             );
26              
27             has 'exceptions' => (
28             is => 'ro',
29             builder => '_build_exceptions',
30             init_arg => undef,
31             lazy => 1,
32             );
33              
34             has 'negation' => (
35             is => 'ro',
36             builder => '_build_negation',
37             init_arg => undef,
38             lazy => 1,
39             );
40              
41             has 'cut_negation' => (
42             isa => 'Bool',
43             default => 1,
44             reader => 'cut_negation',
45             );
46              
47             has 'lowercase_proper_names' => (
48             isa => 'Bool',
49             default => 0,
50             reader => 'lowercase_proper_names',
51             );
52              
53             my $V = qr/[aeiou]/;
54             my $VY = qr/[aeiouy]/;
55             my $C = qr/[bcdfghjklmnpqrstvwxyz]/;
56             my $CXY = qr/[bcdfghjklmnpqrstvwxz]/;
57             my $S = qr/([sxz]|[cs]h)/;
58             my $S2 = qr/(ss|zz)/;
59             my $PRE = qr/(be|ex|in|mis|pre|pro|re)/;
60              
61             #The most importat sub:
62             #Input: word form and POS tag (Penn style)
63             #Output: lemma and was_negative_prefix
64             sub lemmatize {
65 1     1 1 225132 my ( $self, $word, $tag ) = @_;
66 1         3 my $negative_prefix = 0;
67              
68 1 50 33     21 if ( ( $tag !~ /^NNP/ || $self->lowercase_proper_names ) && $word ne 'I' ) {
      33        
69 0         0 $word = lc $word;
70             }
71              
72 1         57 my $entry = $self->exceptions->{$tag}{$word};
73 1 50       6 if ($entry) {
74 0         0 return @$entry;
75              
76             }
77             else {
78 1 50       56 if ( $self->cut_negation ) {
79 1         10 ( $word, $negative_prefix ) = $self->_cut_negative_prefix( $word, $tag );
80             }
81 1         7 return ( $self->_lemmatize_by_rules( $word, $tag ), $negative_prefix );
82             }
83             }
84              
85             sub _cut_negative_prefix {
86 1     1   4 my ( $self, $word, $tag ) = @_;
87              
88             # We are interested only in adjectives, adverbs and nouns.
89             # English verbs are negated usually by "not" (don't,...).
90             # Proper nouns (NNP,NNPS) are also left unchanged (Disney, Intel, Irvin... Non-IBM).
91 1 50 33     11 if ( $tag =~ /^(J.*|R.*|NN|NNS)$/ and $word =~ $self->negation ) {
92 0         0 $word =~ s/^(un|in|im|non-?|dis-?|il|ir)//;
93 0         0 return ( $word, 1 );
94             }
95 1         4 return ( $word, 0 );
96             }
97              
98             sub _lemmatize_NNS_NNPS {
99 0     0   0 my ( $self, $word ) = @_;
100 0 0       0 return $word if $word =~ s/men$/man/; #over 600 words (in BNC)
101 0 0       0 return $word if $word =~ s/shoes$/shoe/;
102 0 0       0 return $word if $word =~ s/wives$/wife/;
103 0 0       0 return $word if $word =~ s/(${C}us)es$/$1/; #buses bonuses
104              
105 0 0       0 return $word if $word =~ s/(${V}se)s$/$1/;
106 0 0       0 return $word if $word =~ s/(.${CXY}z)es$/$1/;
107 0 0       0 return $word if $word =~ s/(${VY}ze)s$/$1/;
108 0 0       0 return $word if $word =~ s/($S2)es$/$1/;
109 0 0       0 return $word if $word =~ s/(.${V}rse)s$/$1/;
110 0 0       0 return $word if $word =~ s/onses$/onse/;
111 0 0       0 return $word if $word =~ s/($S)es$/$1/;
112              
113 0 0       0 return $word if $word =~ s/(.$C)ies$/$1y/; #ponies vs ties
114 0 0       0 return $word if $word =~ s/(${CXY}o)es$/$1/;
115 0 0       0 return $word if $word =~ s/s$//;
116 0         0 return $word;
117             }
118              
119             sub _lemmatize_VBG { ## no critic (Subroutines::ProhibitExcessComplexity) this is complex
120 0     0   0 my ( $self, $word ) = @_;
121 0 0       0 return $word if $word =~ s/(${CXY}z)ing$/$1/;
122 0 0       0 return $word if $word =~ s/(${VY}z)ing$/$1e/;
123 0 0       0 return $word if $word =~ s/($S2)ing$/$1/;
124 0 0       0 return $word if $word =~ s/($C${V}ll)ing$/$1/;
125 0 0       0 return $word if $word =~ s/($C${V}($CXY)\2)ing$/$1/; #cancel-ling vs call-ing - exception is needed
126 0 0       0 return $word if $word =~ s/^($CXY)ing$/$1/;
127 0 0       0 return $word if $word =~ s/^($PRE*$C${V}ng)ing$/$1/;
128 0 0       0 return $word if $word =~ s/icking$/ick/;
129 0 0       0 return $word if $word =~ s/(${C}in)ing$/$1e/;
130 0 0       0 return $word if $word =~ s/($C$V[npwx])ing$/$1/;
131 0 0       0 return $word if $word =~ s/(qu$V${C})ing$/$1e/;
132 0 0       0 return $word if $word =~ s/(u${V}d)ing$/$1e/;
133 0 0       0 return $word if $word =~ s/(${C}let)ing$/$1e/;
134 0 0       0 return $word if $word =~ s/^($PRE*$C+[ei]t)ing$/$1e/;
135 0 0       0 return $word if $word =~ s/([ei]t)ing$/$1/;
136 0 0       0 return $word if $word =~ s/($PRE$CXY${CXY}eat)ing$/$1/;
137 0 0       0 return $word if $word =~ s/($V$CXY${CXY}eat)ing$/$1e/;
138 0 0       0 return $word if $word =~ s/(.[eo]at)ing$/$1/; #treating vs creating
139 0 0       0 return $word if $word =~ s/(.${V}at)ing$/$1e/;
140 0 0       0 return $word if $word =~ s/($V$V[cgsv])ing$/$1e/; #announcing increasing
141 0 0       0 return $word if $word =~ s/($V$V$C)ing$/$1/;
142 0 0       0 return $word if $word =~ s/(.[rw]l)ing$/$1/;
143 0 0       0 return $word if $word =~ s/(.th)ing$/$1e/;
144 0 0       0 return $word if $word =~ s/($CXY[cglsv])ing$/$1e/; #involving
145 0 0       0 return $word if $word =~ s/($CXY$CXY)ing$/$1/; #reporting
146 0 0       0 return $word if $word =~ s/uing$/ue/;
147 0 0       0 return $word if $word =~ s/($VY$VY)ing$/$1/;
148 0 0       0 return $word if $word =~ s/ying$/y/;
149 0 0       0 return $word if $word =~ s/(${CXY}o)ing$/$1/;
150 0 0       0 return $word if $word =~ s/^($PRE*$C+or)ing$/$1e/;
151 0 0       0 return $word if $word =~ s/($C[clt]or)ing$/$1e/;
152 0 0       0 return $word if $word =~ s/([eo]r)ing$/$1/; #offering
153 0 0       0 return $word if $word =~ s/ing$/e/;
154 0         0 return $word;
155             }
156              
157             sub _lemmatize_VBD_VBN { ## no critic (Subroutines::ProhibitExcessComplexity) this is complex
158 0     0   0 my ( $self, $word ) = @_;
159 0 0       0 return $word if $word =~ s/en$/e/;
160 0 0       0 return $word if $word =~ s/(${CXY}z)ed$/$1/;
161 0 0       0 return $word if $word =~ s/(${VY}z)ed$/$1e/;
162 0 0       0 return $word if $word =~ s/($S2)ed$/$1/;
163 0 0       0 return $word if $word =~ s/($C${V}ll)ed$/$1/;
164 0 0       0 return $word if $word =~ s/($C${V}($CXY)\2)ed$/$1/; #cancel-led vs call-ed - wordlist is needed
165 0 0       0 return $word if $word =~ s/^($CXY)ed$/$1/;
166 0 0       0 return $word if $word =~ s/^($PRE*$C${V}ng)ed$/$1/;
167 0 0       0 return $word if $word =~ s/icked$/ick/;
168 0 0       0 return $word if $word =~ s/(${C}(in|[clnt]or))ed$/$1e/;
169 0 0       0 return $word if $word =~ s/($C$V[npwx])ed$/$1/;
170 0 0       0 return $word if $word =~ s/^($PRE*$C+or)ed$/$1e/;
171 0 0       0 return $word if $word =~ s/([eo]r)ed$/$1/;
172 0 0       0 return $word if $word =~ s/(${C})ied$/$1y/;
173 0 0       0 return $word if $word =~ s/(qu$V${C})ed$/$1e/;
174 0 0       0 return $word if $word =~ s/(u${V}d)ed$/$1e/;
175 0 0       0 return $word if $word =~ s/(${C}let)ed$/$1e/;
176 0 0       0 return $word if $word =~ s/^($PRE*$C+[ei]t)ed$/$1e/;
177 0 0       0 return $word if $word =~ s/([ei]t)ed$/$1/;
178 0 0       0 return $word if $word =~ s/($PRE$CXY${CXY}eat)ed$/$1/;
179 0 0       0 return $word if $word =~ s/($V$CXY${CXY}eat)ed$/$1e/;
180 0 0       0 return $word if $word =~ s/(.[eo]at)ed$/$1/; #treated vs created
181 0 0       0 return $word if $word =~ s/(.${V}at)ed$/$1e/;
182 0 0       0 return $word if $word =~ s/($V$V[cgsv])ed$/$1e/; #announced
183 0 0       0 return $word if $word =~ s/($V$V$C)ed$/$1/;
184 0 0       0 return $word if $word =~ s/(.[rw]l)ed$/$1/;
185 0 0       0 return $word if $word =~ s/(.th)ed$/$1e/;
186 0 0       0 return $word if $word =~ s/ued$/ue/;
187 0 0       0 return $word if $word =~ s/($CXY[cglsv])ed$/$1e/; #involved
188 0 0       0 return $word if $word =~ s/($CXY$CXY)ed$/$1/; #reported
189 0 0       0 return $word if $word =~ s/($VY$VY)ed$/$1/;
190 0 0       0 return $word if $word =~ s/ed$/e/;
191 0         0 return $word;
192             }
193              
194             sub _lemmatize_VBZ {
195 0     0   0 my ( $self, $word ) = @_;
196 0 0       0 return $word if $word =~ s/(${V}se)s$/$1/;
197 0 0       0 return $word if $word =~ s/(.${CXY}z)es$/$1/;
198 0 0       0 return $word if $word =~ s/(${VY}ze)s$/$1/;
199 0 0       0 return $word if $word =~ s/($S2)es$/$1/;
200 0 0       0 return $word if $word =~ s/(.${V}rse)s$/$1/;
201 0 0       0 return $word if $word =~ s/onses$/onse/;
202 0 0       0 return $word if $word =~ s/($S)es$/$1/;
203              
204 0 0       0 return $word if $word =~ s/(.$C)ies$/$1y/; #tries, relies vs lies
205 0 0       0 return $word if $word =~ s/(${CXY}o)es$/$1/; #does, undergoes
206 0 0       0 return $word if $word =~ s/(.)s$/$1/;
207 0         0 return $word;
208             }
209              
210             sub _lemmatize_JJR_RBR {
211 0     0   0 my ( $self, $word ) = @_;
212 0 0       0 return $word if $word =~ s/([^e]ll)er$/$1/; #smaller
213 0 0       0 return $word if $word =~ s/($C)\1er$/$1/; #bigger
214 0 0       0 return $word if $word =~ s/ier$/y/; #earlier
215 0 0       0 return $word if $word =~ s/($V$V$C)er$/$1/; #weaker
216 0 0       0 return $word if $word =~ s/($C$V[npwx])er$/$1/; #lower
217 0 0       0 return $word if $word =~ s/($V$C)er$/$1e/; #nicer wider
218 0 0       0 return $word if $word =~ s/([bcdfghjklmpqrstvwxz][cglsv])er$/$1e/; #larger,stranger vs stronger, younger
219 0 0       0 return $word if $word =~ s/([ue])er$/$1e/; #freer
220 0 0       0 return $word if $word =~ s/er$//; #harder
221 0         0 return $word;
222             }
223              
224             sub _lemmatize_JJS_RBS {
225 0     0   0 my ( $self, $word ) = @_;
226 0 0       0 return $word if $word =~ s/([^e]ll)est$/$1/; #smallest
227 0 0       0 return $word if $word =~ s/(.)\1est$/$1/; #biggest
228 0 0       0 return $word if $word =~ s/iest$/y/; #earliest
229 0 0       0 return $word if $word =~ s/($V$V$C)est$/$1/; #weakest
230 0 0       0 return $word if $word =~ s/($C$V[npwx])est$/$1/; #lowest
231 0 0       0 return $word if $word =~ s/($V$C)est$/$1e/; #nicest widest
232 0 0       0 return $word if $word =~ s/([bcdfghjklmpqrstvwxz][cglsv])est$/$1e/; #largest vs strongest
233 0 0       0 return $word if $word =~ s/(.{3,})est$/$1/; #hardest
234 0         0 return $word;
235             }
236              
237             sub _lemmatize_by_rules {
238 1     1   4 my ( $self, $word, $tag ) = @_;
239              
240 1 50       18 my $lemma = $tag =~ /NNP?S/
    50          
    50          
    50          
    50          
    50          
241             ? $self->_lemmatize_NNS_NNPS($word)
242             : $tag =~ /^VBG/ ? $self->_lemmatize_VBG($word)
243             : $tag =~ /VB[DN]/ ? $self->_lemmatize_VBD_VBN($word)
244             : $tag eq 'VBZ' ? $self->_lemmatize_VBZ($word)
245             : $tag =~ /JJR|RBR/ ? $self->_lemmatize_JJR_RBR($word)
246             : $tag =~ /JJS|RBS/ ? $self->_lemmatize_JJS_RBS($word)
247             : $word
248             ;
249 1 50       5 return $word if $lemma eq ''; # Otherwise e.g. "est"->""
250 1         8 return $lemma;
251             }
252              
253             sub _build_exceptions {
254 1     1   28 my $self = shift;
255 1         2 my %exceptions;
256 1         58 log_debug( $self->exceptions_filename );
257 1 50       79 open my $ex_file, "<:encoding(utf-8)", $self->exceptions_filename or log_fatal($!);
258 1         126 while (<$ex_file>) {
259 5610         11535 chomp;
260 5610         24033 my ( $word, $tag, $lemma, $negative_prefix ) = split /\t/;
261              
262 5610   33     15788 $negative_prefix = ( defined $negative_prefix and $negative_prefix eq '1' );
263 5610         35333 $exceptions{$tag}{$word} = [ $lemma, $negative_prefix ];
264             }
265 1         45 close $ex_file;
266 1         91 return \%exceptions;
267             }
268              
269             sub _build_negation {
270 0     0     my $self = shift;
271 0           my $pattern = '';
272 0           my @lines = read_file( $self->negation_filename, binmode => ':encoding(utf-8)', err_mode => 'log_fatal' );
273              
274             # or log_fatal('Cannot load lemmatization exceptions from ' . $self->negation_filename);
275 0           chomp(@lines);
276 0           $pattern = join '|', @lines;
277              
278             #$pattern =~ s/-/\-/g;
279 0           my $negation = qr/^($pattern)/;
280 0           return $negation;
281             }
282              
283             1;
284              
285             __END__
286              
287             Cutting off negative prefixes is quite discutable.
288             Even if we filter out cases when:
289             a) a word starts with (un|in|im|dis|il|ir) but it is not a prefix (Intel, disaster,...)
290             b) it is a prefix but not negative (indoor, impress,...)
291             Still there are other cases, when etymologicaly it is a negative prefix, but...
292             unease, uneasily, uneasiness,... is definitelly not a negation of ease, easily, easiness
293              
294             indiscriminately ??
295             indiscriminate ??
296              
297             =pod
298              
299             =head1 NAME
300              
301             Treex::Tool::EnglishMorpho::Lemmatizer - rule based lemmatizer for English
302              
303             =head1 VERSION
304              
305             version 2.20151102
306              
307             =head1 SYNOPSIS
308              
309             use Treex::Tool::EnglishMorpho::Lemmatizer;
310             my $lemmatizer = Treex::Tool::EnglishMorpho::Lemmatizer->new();
311             my ($word, $tag) = qw( goes VBZ );
312             my ($lemma, $neg) = $lemmatizer->lemmatize($word, $tag);
313             # $lemma = 'go', $neg = 0
314             ($lemma, $neg) = $lemmatizer->lemmatize('unhappy', 'JJ');
315             # $lemma = 'happy', $neg = 1
316              
317             =head1 METHODS
318              
319             =over 4
320              
321             =item lemmatize
322              
323             Accepts pair of word and tag.
324             Produces pair with its lemma and indication if word was negation
325              
326             =back
327              
328             =head1 DESCRIPTION
329              
330             Covers:
331              
332             =over
333              
334             =item * noun -s (dogs -> dog, ponies -> pony,..., mice -> mouse)
335              
336             =item * verb -s (does -> do,...)
337              
338             =item * verb -ing
339              
340             =item * verb -ed, -en
341              
342             =item * adjective/adverb -er
343              
344             =item * adjective/adverb -est
345              
346             =item * cut off negative prefixes (un|in|im|non|dis|il|ir)
347              
348             =back
349              
350             =head2 Input requirements
351              
352             =over
353              
354             =item Tokenization
355              
356             I<doesn't> should be tokenized as two words: I<does> and I<n't>
357             (It will be lemmatized as I<do> and I<not>).
358              
359             =item Tagging
360              
361             Correct tagging (Penn style) is quite crucial for Lemmatizer to work.
362             For example it doesn't change words with tags NN and NNP
363             (it changes only NNS and NNPS). So (I<pence>, NN) -> I<pence>,
364             but (I<pence>, NNS) -> I<penny>.
365              
366             =back
367              
368             =head2 Differences from the previous implementation
369              
370             Modul C<PEDT::MorphologyAnalysis> uses Morpha (written in Flex)
371             and in some cases gives different lemmatization.
372              
373             =over
374              
375             =item Adverbs and adjectives.
376              
377             Morpha leaves comparatives and superlatives unchanged.
378             C<PEDT::MorphologyAnalysis> does only basic analysis (I<later> -> I<lat>).
379              
380             =item Capitalization of proper names
381              
382             =item Changes of NN
383              
384             =item Latin words
385              
386             Declination of words with latin origin is not covered by any Lemmatizer
387             rules on purpose.
388             There are few widely known english words with latin origin which are
389             (or should be) covered by exception files (f.e. indices NNS -> index).
390             In my opinion, it is better, especially for translation purposes,
391             to leave the other latin words unchanged. Mostly they will have the same
392             form also in the target language (biological terms like Spheniscidae).
393             BTW: Errors made by Morpha latin fallbacks are sometimes funny:
394             sci-fi -> sci-fus, Mitsubishi -> mitsubishus, Shanghai -> shanghaus,...
395              
396             =back
397              
398             =head1 TODO
399              
400             =over
401              
402             =item * this POD documentation !!!
403              
404             =item * better list of exceptions
405              
406             =item * change exceptions format from tsv to stored perl hash
407              
408             =back
409              
410             =head1 AUTHOR
411              
412             Martin Popel <popel@ufal.mff.cuni.cz>
413              
414             =head1 COPYRIGHT
415              
416             Copyright © 2008 - 2011 by Institute of Formal and Applied Linguistics, Charles University in Prague
417              
418             This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.