File Coverage

blib/lib/NLP/GATE/Document.pm
Criterion Covered Total %
statement 13 15 86.6
branch n/a
condition n/a
subroutine 5 5 100.0
pod n/a
total 18 20 90.0


line stmt bran cond sub pod time code
1             package NLP::GATE::Document;
2              
3 4     4   22 use warnings;
  4         38  
  4         154  
4 4     4   21 use strict;
  4         7  
  4         117  
5 4     4   22 use Carp;
  4         42  
  4         296  
6              
7 4     4   10488 use XML::Writer;
  4         51104  
  4         112  
8 4     4   2097 use XML::LibXML;
  0            
  0            
9              
10             use NLP::GATE::Annotation;
11             use NLP::GATE::AnnotationSet;
12              
13             =head1 NAME
14              
15             NLP::GATE::Document - Class for manipulating GATE-like documents
16              
17             =head1 VERSION
18              
19             Version 0.6
20              
21             =cut
22              
23             our $VERSION = '0.6';
24              
25             =head1 SYNOPSIS
26              
27             use NLP::GATE::Document;
28             my $doc = NLP::GATE::Document->new();
29             $doc->setText($text);
30             $doc->setFeature($name,"featvalue");
31             $doc->setFeatureType($name,$type);
32             $annset = $doc->getAnnotationSet($setname);
33             $doc->setAnnotationSet($set,$setname);
34             $feature = $doc->getFeature($name);
35             $type = $doc->getFeatureType($name);
36             $xml = $doc->toXML();
37             $doc->fromXMLFile($filename);
38             $doc->fromXML($string);
39              
40             =head1 DESCRIPTION
41              
42             This is a simple class representing a document with annotations and
43             features similar to how documents are represented in GATE.
44             The class can produce a string representation of the document that
45             is in XML format and should be readable by GATE.
46              
47             All setter functions return the original Document object.
48              
49             =head1 METHODS
50              
51             =head2 new()
52              
53             Create a new document. Currently only can be used without parameters
54             and will always create a new empty document.
55              
56             =cut
57              
58             sub new {
59             my $class = shift;
60             my $self = bless {
61             text => "",
62             annotationsets => {},
63             features => {},
64             featuretypes => {},
65             }, ref($class) || $class;
66             $self->{annotationsets}->{""} = NLP::GATE::AnnotationSet->new();
67             return $self;
68             }
69              
70             =head2 setText($text)
71              
72             Set the text of the document. Note that annotations will remain unchanged
73             unless you explicitly remove them (see setAnnotation) and might point to
74             non-existing or incorrect text after the text is changed.
75              
76             =cut
77              
78             sub setText {
79             my $self = shift;
80             my $text = shift;
81             $self->{text} = $text;
82             return $self;
83             }
84              
85             =head2 appendText($theText)
86              
87             Append text to the current text content of the document.
88             In scalar context, returns the document object.
89             In array context, returns the from and to offsets of the
90             newly added text. This can be used to add annotations
91             for that text snipped more easily.
92              
93             =cut
94              
95             sub appendText {
96             my $self = shift;
97             my $text = shift;
98             my $from = length($self->{text});
99             my $to = $from + length($text);
100             $self->{text} .= $text;
101             if(wantarray) {
102             return ($from,$to);
103             } else {
104             return $self;
105             }
106             }
107              
108             =head2 getText()
109              
110             Get the plain text of the document.
111              
112             =cut
113              
114             sub getText {
115             my $self = shift;
116             return $self->{text} || "";
117             }
118              
119             =head2 getTextForAnnotation($annotation)
120              
121             Get the text spanned by the given annotation
122              
123             TODO: no sanity checks yet!
124              
125             =cut
126             sub getTextForAnnotation {
127             my $self = shift;
128             my $ann = shift;
129             return substr($self->{text},$ann->getFrom(),$ann->getTo()-$ann->getFrom());
130             }
131              
132             =head2 getAnnotationSet ($name)
133              
134             Return the annotation set with that name. Return undef
135             if no set with such a name is found.
136              
137             This is more straightforward than the original Java implementation in GATE:
138             passing an empty string or undef as $name will return the default
139             annotation set.
140              
141             =cut
142             sub getAnnotationSet {
143             my $self = shift;
144             my $setname = shift || "";
145             return $self->{annotationsets}->{$setname};
146             }
147              
148              
149             =head2 getAnnotationSetNames
150              
151             Return a list of known annotation set names. This will include an entry that is the empty string
152             that stands for the default annotation set.
153              
154             =cut
155             sub getAnnotationSetNames {
156             my $self = shift;
157             return keys %{$self->{annotationsets}};
158             }
159              
160             =head2 setAnnotationSet ($set[,$name])
161              
162             Store the annotation set object with the document under the given
163             annotation set name. If the name is the empty string or undef, the
164             default annotation set is stored or replaced.
165             Any existing annotation set with that name will be destroyed (unless the
166             object to replace it is the original set object).
167              
168             =cut
169             sub setAnnotationSet {
170             my $self = shift;
171             my $set = shift;
172             my $name = shift || "";
173             croak "Expected a NLP::GATE::AnnotationSet for setAnnotationSet, got ",(ref $set) unless(ref $set eq "NLP::GATE::AnnotationSet");
174             $self->{annotationsets}->{$name} = $set;
175             return $self;
176             }
177              
178             =head2 setFeature($name,$value)
179              
180             Add or replace the document feature of the given name with the new
181             value.
182             Make sure you at least add the usual GATE standard features to a document:
183              
184             setFeature('gate.SourceURL','created from String');
185              
186             =cut
187              
188             sub setFeature {
189             my $self = shift;
190             my $name = shift;
191             my $value = shift;
192             $self->{features}->{$name} = $value;
193             return $self;
194             }
195              
196             =head2 getFeature($name)
197              
198             Return the value of the document feature with that name.
199              
200             =cut
201              
202             sub getFeature {
203             my $self = shift;
204             my $name = shift;
205             return $self->{features}->{$name};
206             }
207              
208             =head2 setFeatureType($name,$type)
209              
210             Set the Java type for the feature.
211              
212             =cut
213              
214             sub setFeatureType {
215             my $self = shift;
216             my $name = shift;
217             my $type = shift;
218             $self->{featuretypes}->{$name} = $type;
219             return $self;
220             }
221              
222             =head2 getFeatureType($name)
223              
224             Return the Java type for a feature. If the type has never been set,
225             the default is java.lang.String.
226              
227             =cut
228              
229             sub getFeatureType {
230             my $self = shift;
231             my $name = shift;
232             return $self->{featuretypes}->{$name} || "java.lang.String";
233             }
234              
235              
236             =head2 fromXMLFile($filename)
237              
238             Read a GATE document from an XML file.
239             All content of the current object, including features, annotations and
240             text is discarded.
241              
242             =cut
243              
244             sub fromXMLFile {
245             my $self = shift;
246             my $filepath = shift;
247             my $parser = XML::LibXML->new();
248             _setParserOptions($parser);
249             my $doc = undef;
250             ## the parse_file method outputs a very strange error message when the file is
251             ## not found - therefore we catch all errors and add a little note just to make
252             ## sure the user checks this possible cause.
253             eval {
254             $doc = $parser->parse_file( $filepath );
255             };
256             if ($@) {
257             croak "Got the following error when trying to parse $filepath: $@\n(Make sure the file exists!)";
258             }
259             _parseXML($self,$doc);
260             return $self;
261             }
262              
263             =head2 fromXML($string)
264              
265             Read a GATE document from a string that contains a GATE document into the
266             current object. All previous content of the object is discarded.
267             The XML string has to be encoded in UTF8 for now.
268              
269             =cut
270              
271             sub fromXML {
272             my $self = shift;
273             my $xml = shift;
274             my $parser = XML::LibXML->new();
275             _setParserOptions($parser);
276             my $doc = $parser->parse_string($xml);
277             _parseXML($self,$doc);
278             return $self;
279             }
280              
281              
282             sub _setParserOptions {
283             my $parser = shift;
284             $parser->validation(0);
285             $parser->recover(0);
286             $parser->expand_entities(1);
287             $parser->keep_blanks(1);
288             $parser->pedantic_parser(1);
289             $parser->line_numbers(1);
290             $parser->load_ext_dtd(0);
291             $parser->complete_attributes(0);
292             $parser->expand_xinclude(0);
293             $parser->no_network(1);
294             }
295              
296             sub _parseXML {
297             my $self = shift;
298             my $doc = shift;
299             my $root = $doc->getDocumentElement();
300             ## process the document features
301             my $i = 0;
302             for my $feature ($doc->findnodes("/GateDocument/GateDocumentFeatures/Feature")) {
303             my @n = $feature->findnodes("Name");
304             my @v = $feature->findnodes("Value");
305             #my @t = $feature->findnodes('Value/@className');
306             if(@n && @v) {
307             unless(scalar @n == 1) {
308             croak "Strange document format: not exactly one Name element for document feature!";
309             }
310             unless(scalar @v == 1) {
311             croak "Strange document format: not exactly one Value element for document feature!";
312             }
313             my $fname = $n[0]->textContent();
314             $self->{features}->{$fname} = $v[0]->textContent();
315             $self->{featuretypes}->{$fname} = $v[0]->getAttribute("className");
316             }
317             }
318             ## process the document text and create a map of node ids to text offsets
319             my %nodemap = ();
320             my $offset = 0;
321             my $text = "";
322             for my $el ($doc->findnodes("/GateDocument/TextWithNodes")) {
323             foreach my $c ($el->childNodes()) {
324             if($c->nodeType() == 1) { # element node
325             ## get the attribute id
326             my $nodeid = _getAttr($c,"id");
327             $nodemap{$nodeid} = $offset;
328             } elsif($c->nodeType() == 3 || $c->nodeType() == 4) {
329             ## 3: text
330             ## 4: cdata
331             my $t = $c->textContent();
332             $offset += length($t);
333             $text .= $t;
334             } else {
335             croak "Invalid node type encountered: ",$c->nodeType(),"\n";
336             }
337             }
338             }
339             $self->{text} = $text;
340             ## process the annotation features, replacing node ids with offset information
341             for my $annset ($doc->findnodes("/GateDocument/AnnotationSet")) {
342             ## figure out the name, then create a new annotation set
343             my $name = _getAttr($annset,"Name","");
344             my $myannset = NLP::GATE::AnnotationSet->new();
345              
346             ## find all the annotations in that annotation set
347             for my $ann ($annset->findnodes("Annotation")) {
348             # get attributes Id, Type, StartNode, EndNode
349             my $annId = _getAttr($ann,"Id");
350             my $annType = _getAttr($ann,"Type");
351             my $annStartNode = _getAttr($ann,"StartNode");
352             my $annEndNode = _getAttr($ann,"EndNode");
353             my $from = $nodemap{$annStartNode};
354             my $to = $nodemap{$annEndNode};
355             my $myann = NLP::GATE::Annotation->new($annType,$from,$to);
356             for my $feature ($ann->findnodes("Feature")) {
357             my @n = $feature->findnodes("Name");
358             my @v = $feature->findnodes("Value");
359             my @t = $feature->findnodes('Value/@className');
360             if(@n && @v) {
361             unless(scalar @n == 1) {
362             croak "Strange document format: not exactly one Name element for document feature!";
363             }
364             unless(scalar @v == 1) {
365             croak "Strange document format: not exactly one Value element for document feature!";
366             }
367             my $fname = $n[0]->textContent();
368             $myann->setFeature($fname,$v[0]->textContent());
369             $myann->setFeatureType($fname,$v[0]->getAttribute("className"));
370             }
371             } # for feature
372             $myannset->add($myann);
373             } # for ann
374             $self->{annotationsets}->{$name} = $myannset;
375             } # for annset
376              
377             }
378              
379             sub _getAttr {
380             ## TODO: use node->getAttribute(name) instead!
381             my $el = shift;
382             my $attrname = shift;
383             my $default = shift;
384             my $val = $el->getAttribute($attrname);
385             if(defined($val)) {
386             return $val;
387             } elsif(defined($default)) {
388             return $default;
389             } else {
390             croak "Attribute $attrname not found in element ",$el->toString()," and no default";
391             }
392             }
393              
394              
395              
396             =head2 toXML()
397              
398             Create an actual XML representation that can be used by GATE from the internal
399             representation of the document.
400              
401             =cut
402              
403             sub toXML {
404             my $self = shift;
405             my $ret = "";
406             my $xml = new XML::Writer(OUTPUT => \$ret, ENCODING => "utf-8");
407             $xml->xmlDecl();
408             $xml->startTag("GateDocument");
409             $ret .= "\n";
410             $xml->comment("The document's features");
411             $ret .= "\n";
412             $xml->startTag("GateDocumentFeatures");
413             $ret .= "\n";
414             _outputFeatures($xml,$self->{features},$self->{featuretypes},\$ret);
415             $xml->endTag("GateDocumentFeatures");
416             $ret .= "\n";
417             $xml->comment("The document content area with serialized nodes");
418             $ret .= "\n";
419             $xml->startTag("TextWithNodes");
420             # $ret .= "\n";
421             my @offsets = $self->_getOffsets();
422             my $lastoffset = 0;
423             foreach my $offset ( @offsets ) {
424             # is there any text between the last node and this one?
425             if($lastoffset < $offset) {
426             $xml->characters($self->_getText($lastoffset,$offset));
427             $lastoffset = $offset;
428             }
429             $xml->emptyTag("Node",'id' => $offset);
430             }
431             ## if there is text after the last node, output it too
432             if($lastoffset < length($self->{text})) {
433             $xml->characters($self->_getText($lastoffset,length($self->{text})));
434             }
435             $xml->endTag("TextWithNodes");
436             $ret .= "\n";
437             # the default annotation set is always there
438              
439             # use a unique id for all annotations over all sets
440             my $annid = 0;
441             $xml->comment("The default annotation set");
442             $ret .= "\n";
443             $xml->startTag("AnnotationSet");
444             $ret .= "\n";
445             foreach my $ann ( @{$self->{annotationsets}->{""}->_getArrayRef()} ) {
446             $xml->startTag("Annotation",
447             'Id' => $annid,
448             'Type' => $ann->getType(),
449             'StartNode' => $ann->getFrom(),
450             'EndNode' => $ann->getTo()
451             );
452             $ret .= "\n";
453             _outputFeatures($xml,$ann->_getFeatures(),$ann->_getFeatureTypes(),\$ret);
454             $xml->endTag("Annotation");
455             $ret .= "\n";
456             $annid++;
457             }
458             $xml->endTag("AnnotationSet");
459             $ret .= "\n";
460             # optionally, there might be additional named annotation sets
461             # these use AnnotationSet with Name="name" attributes
462             foreach my $annsetname ( keys %{$self->{annotationsets}} ) {
463             next if $annsetname eq ""; # we already have processed the default set
464             my $annset = $self->{annotationsets}->{$annsetname};
465             $xml->startTag("AnnotationSet", 'Name' => $annsetname);
466             $ret .= "\n";
467             foreach my $ann ( @{$annset->_getArrayRef()} ) {
468             $xml->startTag("Annotation",
469             'Id' => $annid,
470             'Type' => $ann->getType(),
471             'StartNode' => $ann->getFrom(),
472             'EndNode' => $ann->getTo()
473             );
474             $ret .= "\n";
475             _outputFeatures($xml,$ann->_getFeatures(),$ann->_getFeatureTypes,\$ret);
476             $xml->endTag("Annotation");
477             $ret .= "\n";
478             $annid++;
479             }
480             $xml->endTag("AnnotationSet");
481             $ret .= "\n";
482             }
483             $xml->endTag("GateDocument");
484             $xml->end();
485             return $ret;
486             }
487              
488             sub _outputFeatures {
489             my $xml = shift;
490             my $featurehashref = shift;
491             my $featuretypes = shift;
492             my $ret = shift;
493             $ret = ${$ret};
494             foreach my $feature ( keys %{$featurehashref} ) {
495             $xml->startTag("Feature");
496             $ret .= "\n";
497             $xml->startTag("Name",'className' => 'java.lang.String');
498             $xml->characters($feature);
499             $xml->endTag("Name");
500             $xml->startTag("Value",'className' => $featuretypes->{$feature} || "java.lang.String");
501             $xml->characters($featurehashref->{$feature});
502             $xml->endTag("Value");
503             $xml->endTag("Feature");
504             $ret .= "\n";
505             }
506             }
507              
508             ## this will generate an array with offsets for all from and
509             ## to offsets needed for the annotations.
510             sub _getOffsets {
511             my $self = shift;
512             my %offsets = ();
513             # for each annotation set
514             foreach my $annset ( keys %{$self->{annotationsets}} ) {
515             # for each annotation
516             #print STDERR "Annset: $annset\n";
517             foreach my $ann ( @{$self->{annotationsets}->{$annset}->_getArrayRef()} ) {
518             # add the from and two offsets to the offset hash
519             $offsets{$ann->getFrom()} = 1;
520             $offsets{$ann->getTo()} = 1;
521             }
522             }
523             # convert the offset hash to an array of offsets
524             my @offsets = keys %offsets;
525             # sort by offset
526             @offsets = sort {$a <=> $b} @offsets;
527             # return the array of offsets
528             #print "OFFSETS: ",join(",",@offsets),"\n";
529             return @offsets;
530             }
531              
532             ## get the text starting at offset from and going to the character before
533             ## offset to
534             sub _getText {
535             my $self = shift;
536             my $from = shift;
537             my $to = shift;
538             return substr($self->{text},$from,$to-$from);
539             }
540             =head1 AUTHOR
541              
542             Johann Petrak, C<< >>
543              
544             =head1 BUGS
545              
546             Please report any bugs or feature requests to
547             C, or through the web interface at
548             L.
549             I will be notified, and then you'll automatically be notified of progress on
550             your bug as I make changes.
551              
552             =head1 SUPPORT
553              
554             You can find documentation for this module with the perldoc command.
555              
556             perldoc NLP::GATE
557              
558             You can also look for information at:
559              
560             =over 4
561              
562             =item * AnnoCPAN: Annotated CPAN documentation
563              
564             L
565              
566             =item * CPAN Ratings
567              
568             L
569              
570             =item * RT: CPAN's request tracker
571              
572             L
573              
574             =item * Search CPAN
575              
576             L
577              
578             =back
579              
580             =head1 ACKNOWLEDGEMENTS
581              
582             =head1 COPYRIGHT & LICENSE
583              
584             Copyright 2007 Johann Petrak, all rights reserved.
585              
586             This program is free software; you can redistribute it and/or modify it
587             under the same terms as Perl itself.
588              
589             =cut
590              
591             1; # End of NLP::GATE::Document