File Coverage

blib/lib/NLP/GATE/AnnotationSet.pm
Criterion Covered Total %
statement 62 133 46.6
branch 15 56 26.7
condition 14 81 17.2
subroutine 10 16 62.5
pod 10 10 100.0
total 111 296 37.5


line stmt bran cond sub pod time code
1             package NLP::GATE::AnnotationSet;
2              
3 4     4   26 use warnings;
  4         8  
  4         118  
4 4     4   78 use strict;
  4         8  
  4         113  
5 4     4   20 use Carp;
  4         6  
  4         7261  
6              
7             #use Tree::RB;
8              
9             =head1 NAME
10              
11             NLP::GATE::AnnotationSet - A class for representing GATE-like annotation sets
12              
13             =head1 VERSION
14              
15             Version 0.6
16              
17             =cut
18              
19             our $VERSION = '0.6';
20              
21             =head1 SYNOPSIS
22              
23             use NLP::GATE::AnnotationSet;
24             my $annset = NLP::GATE::AnnotationSet->new();
25             $annset->add($annotation);
26             $newannset = $annset->get($type[,$featuremap]);
27             $arrayref = $annset->getAsArrayRef();
28             $ann = $annset->getByIndex();
29             $ann = $annset->size();
30              
31             =head1 DESCRIPTION
32              
33             This is a simple class representing a annotation set for documents
34             in the format the GATE software (http://gate.ac.uk/) uses.
35              
36             An annotation set can contain any number of NLP::GATE::Annotation objects.
37             Currently, there is no code to make sure that annotations are only added
38             once.
39              
40             Annotation sets behave a bit like arrays in that each annotation can be
41             addressed by an index and each set always contains a known number of
42             annotations.
43              
44             TODO: use the offset indices in method getByOffset()
45              
46             =head1 METHODS
47              
48             =head2 new()
49              
50             Create a new annotation set. The name of the annotationset is not a property of the
51             set, instead, each set is associated with a name when stored with a NLP::GATE::Document
52             object using the setAnnotationSet() method.
53              
54             =cut
55              
56             sub new {
57 8     8 1 4744 my $class = shift;
58 8   33     79 my $self = bless {
59             anns => [],
60             index_offset_from => undef,
61             }, ref($class) || $class;
62 8         74 return $self;
63             }
64              
65              
66              
67             =head2 add($annotation)
68              
69             Add an annotation object to the annotation set.
70              
71             =cut
72              
73             sub add {
74 5     5 1 16 my $self = shift;
75 5         7 my $ann = shift;
76 5 50       17 croak "Expected a NLP::GATE::Annotation for add, got a ",(ref $ann) unless(ref $ann eq "NLP::GATE::Annotation");
77 5         9 push @{$self->{anns}},$ann;
  5         13  
78 5         16 return $self;
79             }
80              
81             =head2 getByIndex($n)
82              
83             Return the annotation for index $n or signal an error.
84              
85             =cut
86              
87             sub getByIndex {
88 3     3 1 1199 my $self = shift;
89 3         5 my $n = shift;
90 3         4 my $s = scalar @{$self->{anns}};
  3         6  
91 3 50       10 carp "Need an index for getByIndex!" unless defined($n);
92 3 50 33     18 carp "Index not within range (0-$s)!" if($n < 0 || $n >= $s);
93 3         10 return $self->{anns}->[$n];
94             }
95              
96             =head2 get($type[,$featureset[,$matchtype]])
97              
98             Return a new annotation set containing all the annotations from this set
99             that match the given type, and if specified, all the feature/value pairs given
100             in the $featureset hash map reference.
101             If no annotations match, an empty annotation set will be returned.
102              
103             The parameter $matchtype specifies how features are matched: "exact" will
104             do an exact string comparison, "nocase" will compare after converting both
105             strings to lower case using perl's lc function, and "regexp" will interpret
106             the string given in the parameter as a regular expression. Default is "exact".
107              
108             If some feature is specified in the featureset it MUST occur in the feature
109             set of the annotation AND satisfy the testing matchtype method of testing for
110             equality.
111              
112             The annotations in the new set will be the same as in the original set,
113             so changing the annotation objects will change them in both sets!
114              
115             =cut
116              
117             sub get {
118 6     6 1 12659 my $self = shift;
119 6         12 my $type = shift;
120 6         9 my $features = shift;
121 6   100     45 my $matchtype = lc(shift||"") || "exact";
122 6         21 my $newset = NLP::GATE::AnnotationSet->new();
123             # $type is undef, do not check type,
124             # if $features is undef, do not check features
125             # if both are undef, this will a new annotation set with all the
126             # annotations of the original set
127 6         12 foreach my $ann (@{$self->{anns}}) {
  6         17  
128 6         10 my $cond1 = 0;
129 6         9 my $cond2 = 0;
130 6 100       31 if(!defined($type)) {
    100          
131 1         2 $cond1 = 1;
132             } elsif($ann->getType() eq $type) {
133 4         10 $cond1 = 1;
134             }
135 6 100       13 if(!defined($features)) {
136 3         5 $cond2 = 1;
137             } else {
138             # if we have a feature map, all features in the feature map
139             # must have the same value as in the annotation
140             # In other words, if one feature has a different value, the condition fails
141 3         5 $cond2 = 1;
142 3         11 foreach my $k (keys %$features) {
143 3 50 66     31 if($matchtype eq "exact" &&
    100 100        
    50 33        
      33        
144             $ann->getFeature($k) ne $features->{$k}) {
145 0         0 $cond2 = 0;
146 0         0 last;
147             } elsif($matchtype eq "nocase" &&
148             lc($ann->getFeature($k)) ne lc($features->{$k})) {
149 1         2 $cond2 = 0;
150 1         2 last;
151             } elsif($matchtype eq "regexp" &&
152             defined($ann->getFeature($k)) &&
153             $ann->getFeature($k) !~ /$features->{$k}/) {
154 0         0 $cond2 = 0;
155 0         0 last;
156             }
157             }
158             }
159 6 100 100     33 if($cond1 && $cond2) {
160 4         11 $newset->add($ann);
161             }
162             }
163 6         20 return $newset;
164             }
165              
166             =head2 getByOffset(from,to,type,featureset,$featurematchtype,$rangematchtype)
167              
168             Return all the annotations that span the given offset range, optionally
169             filtering in addition by type and features.
170             This method requires an offset range and in addition filters annotation
171             as the get method does.
172              
173             If from one of the parameters is undef, any value is allowed for the match
174             to be successful.
175              
176             The parameter $featurematchtype specifies how features are matched: "exact" will
177             do an exact string comparison, "nocase" will compare after converting both
178             strings to lower case using perl's lc function, and "regexp" will interpret
179             the string given in the parameter as a regular expression. Default is "exact".
180              
181             The $rangematchtype argument specifies how offsets will be compared, if
182             they are specified (case does not matter):
183             "COVER" - any annotation with a from less than or equal than $from and a
184             to greater than or equal than $to: annotations that contain this range
185             "EXACT" - any annotation with from and to offsets exactly as specified.
186             This is the default: annotations that are co-extensive with this range
187             "WITHIN" - any annotation that lies fully within the range
188             "OVERLAP" - any annotation that overlaps with the given range
189              
190             For example to find an annotation that fully contains the text from offset
191             12 to offset 17, use getByOffset(12,17,undef,undef,"cover").
192              
193             =cut
194             sub getByOffset {
195 0     0 1 0 my $self = shift;
196 0         0 my $from = shift;
197 0         0 my $to = shift;
198 0         0 my $type = shift;
199 0         0 my $features = shift;
200 0   0     0 my $featurematchtype = shift || "exact";
201 0         0 $featurematchtype = lc($featurematchtype);
202 0   0     0 my $rangematchtype = shift || "exact";
203 0         0 $rangematchtype = lc($rangematchtype);
204 0         0 my $newset = NLP::GATE::AnnotationSet->new();
205             #print STDERR "Looking for annotation in range $from to $to\n";
206 0         0 foreach my $ann (@{$self->{anns}}) {
  0         0  
207 0         0 my $cond1 = 0;
208 0         0 my $cond2 = 0;
209 0         0 my $cond3 = 0;
210 0         0 my $cond4 = 0;
211             #print STDERR "Checking annotation ",$ann->getType(),"/",$ann->getFrom(),"/",$ann->getTo(),"\n";
212 0 0       0 if(!defined($type)) {
    0          
213 0         0 $cond1 = 1;
214             } elsif($ann->getType() eq $type) {
215 0         0 $cond1 = 1;
216             }
217 0 0       0 if(!defined($features)) {
218 0         0 $cond2 = 1;
219             } else {
220             # if we have a feature map, all features in the feature map
221             # must have the same value as in the annotation
222             # In other words, if one feature has a different value, the condition fails
223 0         0 $cond2 = 1;
224 0         0 foreach my $k (keys %$features) {
225 0 0 0     0 if($featurematchtype eq "exact" &&
    0 0        
    0 0        
226             $ann->getFeature($k) ne $features->{$k}) {
227 0         0 $cond2 = 0;
228 0         0 last;
229             } elsif($featurematchtype eq "nocase" &&
230             lc($ann->getFeature($k)) ne lc($features->{$k})) {
231 0         0 $cond2 = 0;
232 0         0 last;
233             } elsif($featurematchtype eq "regexp" &&
234             $ann->getFeature($k) =~ /$features->{$k}/) {
235 0         0 $cond2 = 0;
236 0         0 last;
237             }
238             }
239             }
240 0 0 0     0 if(!defined($from)) {
    0 0        
    0 0        
    0          
241 0         0 $cond3 = 1;
242             } elsif($rangematchtype eq "exact" && $ann->getFrom() == $from) {
243 0         0 $cond3 = 1;
244             } elsif($rangematchtype eq "cover" && $ann->getFrom() <= $from) {
245 0         0 $cond3 = 1;
246             } elsif($rangematchtype eq "within" && $ann->getFrom() >= $from) {
247             #print STDERR "From matches for ",$ann->getType(),"/",$ann->getFrom(),"/",$ann->getTo(),"\n";
248 0         0 $cond3 = 1;
249             }
250 0 0 0     0 if(!defined($to)) {
    0 0        
    0 0        
    0          
251 0         0 $cond4 = 1;
252             } elsif($rangematchtype eq "exact" && $ann->getTo() == $to) {
253 0         0 $cond4 = 1;
254             } elsif($rangematchtype eq "cover" && $ann->getTo() >= $to) {
255 0         0 $cond4 = 1;
256             } elsif($rangematchtype eq "within" && $ann->getTo() <= $to) {
257             #print STDERR "To matches for ",$ann->getType(),"/",$ann->getFrom(),"/",$ann->getTo(),"\n";
258 0         0 $cond4 = 1;
259             }
260             # overlap is successful if either with have both to and from and
261             # either to or from or both of the annotation are within the given
262             # range, or one of to or from is undefined
263 0 0 0     0 if($rangematchtype eq "overlap" & defined($from) && defined($to)) {
    0 0        
      0        
264 0 0 0     0 if(($ann->getTo() >= $from && $ann->getTo() <= $to) ||
      0        
      0        
265             ($ann->getFrom() >= $from && $ann->getFrom() <= $to)) {
266 0         0 $cond3 = 1;
267 0         0 $cond4 = 1;
268             }
269             } elsif($rangematchtype eq "overlap" && (!defined($from) || !defined($to))) {
270 0         0 $cond3 = 1;
271 0         0 $cond4 = 1;
272             }
273 0 0 0     0 if($cond1 && $cond2 && $cond3 && $cond4) {
      0        
      0        
274 0         0 $newset->add($ann);
275             }
276             }
277 0         0 return $newset;
278             }
279              
280              
281             =head2 getAsArrayRef()
282              
283             Return an array reference whose elements are the Annotation objects in this
284             set.
285              
286             =cut
287              
288             sub getAsArrayRef {
289 1     1 1 6 my $self = shift;
290 1         2 my @arr;
291 1         2 foreach my $a ( @{$self->{anns}}) {
  1         3  
292 1         5 push @arr,$a;
293             }
294 1         4 return \@arr;
295             }
296              
297              
298             =head2 getAsArray()
299              
300             Return an array whose elements are the Annotation objects in this
301             set.
302              
303             =cut
304              
305             sub getAsArray {
306 1     1 1 719 my $self = shift;
307 1         3 my @arr = ();
308 1         3 foreach my $a ( @{$self->{anns}}) {
  1         4  
309 1         4 push @arr,$a;
310             }
311 1         4 return @arr;
312             }
313              
314              
315             =head2 size()
316              
317             Return the number of annotations in the set
318              
319             =cut
320             sub size {
321 3     3 1 499 my $self = shift;
322 3         4 return scalar @{$self->{anns}};
  3         21  
323             }
324              
325              
326             =head2 getTypes()
327              
328             Return an array of all different types in the set.
329              
330             NOTE: this will currently go through all annotations in the set and collect the types.
331             No caching of type names is done in this function or during creation of the set.
332              
333             =cut
334             sub getTypes() {
335 0     0 1   my $self = shift;
336 0           my $types = {};
337 0           foreach my $ann ( $self->getAsArray() ) {
338 0           $types->{$ann->getType()} = 1;
339             }
340 0           return keys %$types;
341             }
342              
343             =head2 indexByOffsetFrom ()
344              
345             Creates an index for the set that will speed up the retrieval of annotations
346             by offset or offset interval.
347             Unlike in GATE, this is not called automatically but must be explicitly
348             requested before doing the retrieval.
349              
350             If an index already exist it is discarded and a new index is built.
351              
352             =cut
353             sub indexByOffsetFrom {
354 0     0 1   my $self = shift;
355 0     0     my $indexfrom = Tree::RB->new(sub {$_[0] <=> $_[1]});
  0            
356 0     0     my $indexto = Tree::RB->new(sub {$_[0] <=> $_[1]});
  0            
357 0           my $i = 0;
358 0           foreach my $ann ( $self->getAsArray() ) {
359 0           $indexfrom->put($ann->getFrom(),$i);
360 0           $indexto->put($ann->getTo(),$i++);
361             }
362 0           $self->{index_offset_from} = $indexfrom;
363 0           $self->{index_offset_to} = $indexto;
364             }
365              
366              
367             ### This is only for efficiency when direct access to the internal
368             ### representation is needed for read access only
369             sub _getArrayRef {
370 0     0     my $self = shift;
371 0           return $self->{anns};
372             }
373              
374              
375             =head1 AUTHOR
376              
377             Johann Petrak, C<< >>
378              
379             =head1 BUGS
380              
381             Please report any bugs or feature requests to
382             C, or through the web interface at
383             L.
384             I will be notified, and then you'll automatically be notified of progress on
385             your bug as I make changes.
386              
387              
388             =head1 SUPPORT
389              
390             You can find documentation for this module with the perldoc command.
391              
392             perldoc NLP::GATE
393              
394             You can also look for information at:
395              
396             =over 4
397              
398             =item * AnnoCPAN: Annotated CPAN documentation
399              
400             L
401              
402             =item * CPAN Ratings
403              
404             L
405              
406             =item * RT: CPAN's request tracker
407              
408             L
409              
410             =item * Search CPAN
411              
412             L
413              
414             =back
415              
416              
417             =cut
418             1; # End of NLP::GATE::AnnotationSet