File Coverage

blib/lib/NLP/GATE/Document.pm

Criterion	Covered	Total	%
statement	13	15	86.6
branch			n/a
condition			n/a
subroutine	5	5	100.0
pod			n/a
total	18	20	90.0

line	stmt	sub	time	code
1				package NLP::GATE::Document;
2
3	4	4	22	use warnings;
	4		38
	4		154
4	4	4	21	use strict;
	4		7
	4		117
5	4	4	22	use Carp;
	4		42
	4		296
6
7	4	4	10488	use XML::Writer;
	4		51104
	4		112
8	4	4	2097	use XML::LibXML;
	0
	0
9
10				use NLP::GATE::Annotation;
11				use NLP::GATE::AnnotationSet;
12
13				=head1 NAME
14
15				NLP::GATE::Document - Class for manipulating GATE-like documents
16
17				=head1 VERSION
18
19				Version 0.6
20
21				=cut
22
23				our $VERSION = '0.6';
24
25				=head1 SYNOPSIS
26
27				use NLP::GATE::Document;
28				my $doc = NLP::GATE::Document->new();
29				$doc->setText($text);
30				$doc->setFeature($name,"featvalue");
31				$doc->setFeatureType($name,$type);
32				$annset = $doc->getAnnotationSet($setname);
33				$doc->setAnnotationSet($set,$setname);
34				$feature = $doc->getFeature($name);
35				$type = $doc->getFeatureType($name);
36				$xml = $doc->toXML();
37				$doc->fromXMLFile($filename);
38				$doc->fromXML($string);
39
40				=head1 DESCRIPTION
41
42				This is a simple class representing a document with annotations and
43				features similar to how documents are represented in GATE.
44				The class can produce a string representation of the document that
45				is in XML format and should be readable by GATE.
46
47				All setter functions return the original Document object.
48
49				=head1 METHODS
50
51				=head2 new()
52
53				Create a new document. Currently only can be used without parameters
54				and will always create a new empty document.
55
56				=cut
57
58				sub new {
59				my $class = shift;
60				my $self = bless {
61				text => "",
62				annotationsets => {},
63				features => {},
64				featuretypes => {},
65				}, ref($class) \|\| $class;
66				$self->{annotationsets}->{""} = NLP::GATE::AnnotationSet->new();
67				return $self;
68				}
69
70				=head2 setText($text)
71
72				Set the text of the document. Note that annotations will remain unchanged
73				unless you explicitly remove them (see setAnnotation) and might point to
74				non-existing or incorrect text after the text is changed.
75
76				=cut
77
78				sub setText {
79				my $self = shift;
80				my $text = shift;
81				$self->{text} = $text;
82				return $self;
83				}
84
85				=head2 appendText($theText)
86
87				Append text to the current text content of the document.
88				In scalar context, returns the document object.
89				In array context, returns the from and to offsets of the
90				newly added text. This can be used to add annotations
91				for that text snipped more easily.
92
93				=cut
94
95				sub appendText {
96				my $self = shift;
97				my $text = shift;
98				my $from = length($self->{text});
99				my $to = $from + length($text);
100				$self->{text} .= $text;
101				if(wantarray) {
102				return ($from,$to);
103				} else {
104				return $self;
105				}
106				}
107
108				=head2 getText()
109
110				Get the plain text of the document.
111
112				=cut
113
114				sub getText {
115				my $self = shift;
116				return $self->{text} \|\| "";
117				}
118
119				=head2 getTextForAnnotation($annotation)
120
121				Get the text spanned by the given annotation
122
123				TODO: no sanity checks yet!
124
125				=cut
126				sub getTextForAnnotation {
127				my $self = shift;
128				my $ann = shift;
129				return substr($self->{text},$ann->getFrom(),$ann->getTo()-$ann->getFrom());
130				}
131
132				=head2 getAnnotationSet ($name)
133
134				Return the annotation set with that name. Return undef
135				if no set with such a name is found.
136
137				This is more straightforward than the original Java implementation in GATE:
138				passing an empty string or undef as $name will return the default
139				annotation set.
140
141				=cut
142				sub getAnnotationSet {
143				my $self = shift;
144				my $setname = shift \|\| "";
145				return $self->{annotationsets}->{$setname};
146				}
147
148
149				=head2 getAnnotationSetNames
150
151				Return a list of known annotation set names. This will include an entry that is the empty string
152				that stands for the default annotation set.
153
154				=cut
155				sub getAnnotationSetNames {
156				my $self = shift;
157				return keys %{$self->{annotationsets}};
158				}
159
160				=head2 setAnnotationSet ($set[,$name])
161
162				Store the annotation set object with the document under the given
163				annotation set name. If the name is the empty string or undef, the
164				default annotation set is stored or replaced.
165				Any existing annotation set with that name will be destroyed (unless the
166				object to replace it is the original set object).
167
168				=cut
169				sub setAnnotationSet {
170				my $self = shift;
171				my $set = shift;
172				my $name = shift \|\| "";
173				croak "Expected a NLP::GATE::AnnotationSet for setAnnotationSet, got ",(ref $set) unless(ref $set eq "NLP::GATE::AnnotationSet");
174				$self->{annotationsets}->{$name} = $set;
175				return $self;
176				}
177
178				=head2 setFeature($name,$value)
179
180				Add or replace the document feature of the given name with the new
181				value.
182				Make sure you at least add the usual GATE standard features to a document:
183
184				setFeature('gate.SourceURL','created from String');
185
186				=cut
187
188				sub setFeature {
189				my $self = shift;
190				my $name = shift;
191				my $value = shift;
192				$self->{features}->{$name} = $value;
193				return $self;
194				}
195
196				=head2 getFeature($name)
197
198				Return the value of the document feature with that name.
199
200				=cut
201
202				sub getFeature {
203				my $self = shift;
204				my $name = shift;
205				return $self->{features}->{$name};
206				}
207
208				=head2 setFeatureType($name,$type)
209
210				Set the Java type for the feature.
211
212				=cut
213
214				sub setFeatureType {
215				my $self = shift;
216				my $name = shift;
217				my $type = shift;
218				$self->{featuretypes}->{$name} = $type;
219				return $self;
220				}
221
222				=head2 getFeatureType($name)
223
224				Return the Java type for a feature. If the type has never been set,
225				the default is java.lang.String.
226
227				=cut
228
229				sub getFeatureType {
230				my $self = shift;
231				my $name = shift;
232				return $self->{featuretypes}->{$name} \|\| "java.lang.String";
233				}
234
235
236				=head2 fromXMLFile($filename)
237
238				Read a GATE document from an XML file.
239				All content of the current object, including features, annotations and
240				text is discarded.
241
242				=cut
243
244				sub fromXMLFile {
245				my $self = shift;
246				my $filepath = shift;
247				my $parser = XML::LibXML->new();
248				_setParserOptions($parser);
249				my $doc = undef;
250				## the parse_file method outputs a very strange error message when the file is
251				## not found - therefore we catch all errors and add a little note just to make
252				## sure the user checks this possible cause.
253				eval {
254				$doc = $parser->parse_file( $filepath );
255				};
256				if ($@) {
257				croak "Got the following error when trying to parse $filepath: $@\n(Make sure the file exists!)";
258				}
259				_parseXML($self,$doc);
260				return $self;
261				}
262
263				=head2 fromXML($string)
264
265				Read a GATE document from a string that contains a GATE document into the
266				current object. All previous content of the object is discarded.
267				The XML string has to be encoded in UTF8 for now.
268
269				=cut
270
271				sub fromXML {
272				my $self = shift;
273				my $xml = shift;
274				my $parser = XML::LibXML->new();
275				_setParserOptions($parser);
276				my $doc = $parser->parse_string($xml);
277				_parseXML($self,$doc);
278				return $self;
279				}
280
281
282				sub _setParserOptions {
283				my $parser = shift;
284				$parser->validation(0);
285				$parser->recover(0);
286				$parser->expand_entities(1);
287				$parser->keep_blanks(1);
288				$parser->pedantic_parser(1);
289				$parser->line_numbers(1);
290				$parser->load_ext_dtd(0);
291				$parser->complete_attributes(0);
292				$parser->expand_xinclude(0);
293				$parser->no_network(1);
294				}
295
296				sub _parseXML {
297				my $self = shift;
298				my $doc = shift;
299				my $root = $doc->getDocumentElement();
300				## process the document features
301				my $i = 0;
302				for my $feature ($doc->findnodes("/GateDocument/GateDocumentFeatures/Feature")) {
303				my @n = $feature->findnodes("Name");
304				my @v = $feature->findnodes("Value");
305				#my @t = $feature->findnodes('Value/@className');
306				if(@n && @v) {
307				unless(scalar @n == 1) {
308				croak "Strange document format: not exactly one Name element for document feature!";
309				}
310				unless(scalar @v == 1) {
311				croak "Strange document format: not exactly one Value element for document feature!";
312				}
313				my $fname = $n[0]->textContent();
314				$self->{features}->{$fname} = $v[0]->textContent();
315				$self->{featuretypes}->{$fname} = $v[0]->getAttribute("className");
316				}
317				}
318				## process the document text and create a map of node ids to text offsets
319				my %nodemap = ();
320				my $offset = 0;
321				my $text = "";
322				for my $el ($doc->findnodes("/GateDocument/TextWithNodes")) {
323				foreach my $c ($el->childNodes()) {
324				if($c->nodeType() == 1) { # element node
325				## get the attribute id
326				my $nodeid = _getAttr($c,"id");
327				$nodemap{$nodeid} = $offset;
328				} elsif($c->nodeType() == 3 \|\| $c->nodeType() == 4) {
329				## 3: text
330				## 4: cdata
331				my $t = $c->textContent();
332				$offset += length($t);
333				$text .= $t;
334				} else {
335				croak "Invalid node type encountered: ",$c->nodeType(),"\n";
336				}
337				}
338				}
339				$self->{text} = $text;
340				## process the annotation features, replacing node ids with offset information
341				for my $annset ($doc->findnodes("/GateDocument/AnnotationSet")) {
342				## figure out the name, then create a new annotation set
343				my $name = _getAttr($annset,"Name","");
344				my $myannset = NLP::GATE::AnnotationSet->new();
345
346				## find all the annotations in that annotation set
347				for my $ann ($annset->findnodes("Annotation")) {
348				# get attributes Id, Type, StartNode, EndNode
349				my $annId = _getAttr($ann,"Id");
350				my $annType = _getAttr($ann,"Type");
351				my $annStartNode = _getAttr($ann,"StartNode");
352				my $annEndNode = _getAttr($ann,"EndNode");
353				my $from = $nodemap{$annStartNode};
354				my $to = $nodemap{$annEndNode};
355				my $myann = NLP::GATE::Annotation->new($annType,$from,$to);
356				for my $feature ($ann->findnodes("Feature")) {
357				my @n = $feature->findnodes("Name");
358				my @v = $feature->findnodes("Value");
359				my @t = $feature->findnodes('Value/@className');
360				if(@n && @v) {
361				unless(scalar @n == 1) {
362				croak "Strange document format: not exactly one Name element for document feature!";
363				}
364				unless(scalar @v == 1) {
365				croak "Strange document format: not exactly one Value element for document feature!";
366				}
367				my $fname = $n[0]->textContent();
368				$myann->setFeature($fname,$v[0]->textContent());
369				$myann->setFeatureType($fname,$v[0]->getAttribute("className"));
370				}
371				} # for feature
372				$myannset->add($myann);
373				} # for ann
374				$self->{annotationsets}->{$name} = $myannset;
375				} # for annset
376
377				}
378
379				sub _getAttr {
380				## TODO: use node->getAttribute(name) instead!
381				my $el = shift;
382				my $attrname = shift;
383				my $default = shift;
384				my $val = $el->getAttribute($attrname);
385				if(defined($val)) {
386				return $val;
387				} elsif(defined($default)) {
388				return $default;
389				} else {
390				croak "Attribute $attrname not found in element ",$el->toString()," and no default";
391				}
392				}
393
394
395
396				=head2 toXML()
397
398				Create an actual XML representation that can be used by GATE from the internal
399				representation of the document.
400
401				=cut
402
403				sub toXML {
404				my $self = shift;
405				my $ret = "";
406				my $xml = new XML::Writer(OUTPUT => \$ret, ENCODING => "utf-8");
407				$xml->xmlDecl();
408				$xml->startTag("GateDocument");
409				$ret .= "\n";
410				$xml->comment("The document's features");
411				$ret .= "\n";
412				$xml->startTag("GateDocumentFeatures");
413				$ret .= "\n";
414				_outputFeatures($xml,$self->{features},$self->{featuretypes},\$ret);
415				$xml->endTag("GateDocumentFeatures");
416				$ret .= "\n";
417				$xml->comment("The document content area with serialized nodes");
418				$ret .= "\n";
419				$xml->startTag("TextWithNodes");
420				# $ret .= "\n";
421				my @offsets = $self->_getOffsets();
422				my $lastoffset = 0;
423				foreach my $offset ( @offsets ) {
424				# is there any text between the last node and this one?
425				if($lastoffset < $offset) {
426				$xml->characters($self->_getText($lastoffset,$offset));
427				$lastoffset = $offset;
428				}
429				$xml->emptyTag("Node",'id' => $offset);
430				}
431				## if there is text after the last node, output it too
432				if($lastoffset < length($self->{text})) {
433				$xml->characters($self->_getText($lastoffset,length($self->{text})));
434				}
435				$xml->endTag("TextWithNodes");
436				$ret .= "\n";
437				# the default annotation set is always there
438
439				# use a unique id for all annotations over all sets
440				my $annid = 0;
441				$xml->comment("The default annotation set");
442				$ret .= "\n";
443				$xml->startTag("AnnotationSet");
444				$ret .= "\n";
445				foreach my $ann ( @{$self->{annotationsets}->{""}->_getArrayRef()} ) {
446				$xml->startTag("Annotation",
447				'Id' => $annid,
448				'Type' => $ann->getType(),
449				'StartNode' => $ann->getFrom(),
450				'EndNode' => $ann->getTo()
451				);
452				$ret .= "\n";
453				_outputFeatures($xml,$ann->_getFeatures(),$ann->_getFeatureTypes(),\$ret);
454				$xml->endTag("Annotation");
455				$ret .= "\n";
456				$annid++;
457				}
458				$xml->endTag("AnnotationSet");
459				$ret .= "\n";
460				# optionally, there might be additional named annotation sets
461				# these use AnnotationSet with Name="name" attributes
462				foreach my $annsetname ( keys %{$self->{annotationsets}} ) {
463				next if $annsetname eq ""; # we already have processed the default set
464				my $annset = $self->{annotationsets}->{$annsetname};
465				$xml->startTag("AnnotationSet", 'Name' => $annsetname);
466				$ret .= "\n";
467				foreach my $ann ( @{$annset->_getArrayRef()} ) {
468				$xml->startTag("Annotation",
469				'Id' => $annid,
470				'Type' => $ann->getType(),
471				'StartNode' => $ann->getFrom(),
472				'EndNode' => $ann->getTo()
473				);
474				$ret .= "\n";
475				_outputFeatures($xml,$ann->_getFeatures(),$ann->_getFeatureTypes,\$ret);
476				$xml->endTag("Annotation");
477				$ret .= "\n";
478				$annid++;
479				}
480				$xml->endTag("AnnotationSet");
481				$ret .= "\n";
482				}
483				$xml->endTag("GateDocument");
484				$xml->end();
485				return $ret;
486				}
487
488				sub _outputFeatures {
489				my $xml = shift;
490				my $featurehashref = shift;
491				my $featuretypes = shift;
492				my $ret = shift;
493				$ret = ${$ret};
494				foreach my $feature ( keys %{$featurehashref} ) {
495				$xml->startTag("Feature");
496				$ret .= "\n";
497				$xml->startTag("Name",'className' => 'java.lang.String');
498				$xml->characters($feature);
499				$xml->endTag("Name");
500				$xml->startTag("Value",'className' => $featuretypes->{$feature} \|\| "java.lang.String");
501				$xml->characters($featurehashref->{$feature});
502				$xml->endTag("Value");
503				$xml->endTag("Feature");
504				$ret .= "\n";
505				}
506				}
507
508				## this will generate an array with offsets for all from and
509				## to offsets needed for the annotations.
510				sub _getOffsets {
511				my $self = shift;
512				my %offsets = ();
513				# for each annotation set
514				foreach my $annset ( keys %{$self->{annotationsets}} ) {
515				# for each annotation
516				#print STDERR "Annset: $annset\n";
517				foreach my $ann ( @{$self->{annotationsets}->{$annset}->_getArrayRef()} ) {
518				# add the from and two offsets to the offset hash
519				$offsets{$ann->getFrom()} = 1;
520				$offsets{$ann->getTo()} = 1;
521				}
522				}
523				# convert the offset hash to an array of offsets
524				my @offsets = keys %offsets;
525				# sort by offset
526				@offsets = sort {$a <=> $b} @offsets;
527				# return the array of offsets
528				#print "OFFSETS: ",join(",",@offsets),"\n";
529				return @offsets;
530				}
531
532				## get the text starting at offset from and going to the character before
533				## offset to
534				sub _getText {
535				my $self = shift;
536				my $from = shift;
537				my $to = shift;
538				return substr($self->{text},$from,$to-$from);
539				}
540				=head1 AUTHOR
541
542				Johann Petrak, C<< >>
543
544				=head1 BUGS
545
546				Please report any bugs or feature requests to
547				C, or through the web interface at
548				L.
549				I will be notified, and then you'll automatically be notified of progress on
550				your bug as I make changes.
551
552				=head1 SUPPORT
553
554				You can find documentation for this module with the perldoc command.
555
556				perldoc NLP::GATE
557
558				You can also look for information at:
559
560				=over 4
561
562				=item * AnnoCPAN: Annotated CPAN documentation
563
564				L
565
566				=item * CPAN Ratings
567
568				L
569
570				=item * RT: CPAN's request tracker
571
572				L
573
574				=item * Search CPAN
575
576				L
577
578				=back
579
580				=head1 ACKNOWLEDGEMENTS
581
582				=head1 COPYRIGHT & LICENSE
583
584				Copyright 2007 Johann Petrak, all rights reserved.
585
586				This program is free software; you can redistribute it and/or modify it
587				under the same terms as Perl itself.
588
589				=cut
590
591				1; # End of NLP::GATE::Document