File Coverage

blib/lib/Treex/Core/Document.pm
Criterion Covered Total %
statement 245 279 87.8
branch 61 90 67.7
condition 13 20 65.0
subroutine 35 41 85.3
pod 15 23 65.2
total 369 453 81.4


line stmt bran cond sub pod time code
1             package Treex::Core::Document;
2             $Treex::Core::Document::VERSION = '2.20210102';
3 24     24   967342 use Moose;
  24         7923477  
  24         196  
4 24     24   188370 use Treex::Core::Common;
  24         93  
  24         154  
5 24     24   153259 use Treex::Core::Config;
  24         87  
  24         896  
6 24     24   13473 use Treex::Core::DocZone;
  24         95  
  24         1164  
7 24     24   14547 use Treex::Core::Bundle;
  24         107  
  24         1218  
8              
9 24     24   232 use Treex::PML;
  24         62  
  24         3824  
10             Treex::PML::UseBackends('PMLBackend');
11             Treex::PML::AddResourcePath( Treex::Core::Config->pml_schema_dir() );
12              
13             with 'Treex::Core::WildAttr';
14              
15 24     24   199 use Scalar::Util qw( weaken reftype );
  24         70  
  24         1583  
16              
17 24     24   183 use PerlIO::via::gzip;
  24         65  
  24         669  
18 24     24   159 use Storable;
  24         58  
  24         1938  
19 24     24   191 use Digest::MD5 qw(md5_hex);
  24         66  
  24         1196  
20 24     24   183 use Lingua::Interset::FeatureStructure;
  24         56  
  24         11791  
21              
22             has loaded_from => ( is => 'rw', isa => 'Str', default => '' );
23             has path => ( is => 'rw', isa => 'Str' );
24             has file_stem => ( is => 'rw', isa => 'Str', default => 'noname' );
25             has file_number => ( is => 'rw', isa => 'Str', builder => 'build_file_number' );
26             has compress => ( is => 'rw', isa => 'Bool', default => undef, documentation => 'compression to .gz' );
27             has storable => (
28             is => 'rw',
29             isa => 'Bool',
30             default => undef,
31             documentation => 'using Storable with gz compression instead of Treex::PML'
32             );
33              
34             has _hash => ( is => 'rw', isa => 'Str' );
35              
36             sub get_hash {
37 0     0 0 0 my $self = shift;
38 0 0       0 if ( ! defined($self->_hash) ) {
39 0         0 $Storable::canonical = 1;
40 0         0 $self->_set_hash(md5_hex(Storable::nfreeze($self)));
41 0         0 $Storable::canonical = 0;
42             }
43 0         0 return $self->_hash;
44             }
45              
46             sub set_hash {
47 0     0 0 0 my ($self, $hash) = @_;
48              
49 0         0 $self->_set_hash($hash);
50              
51 0         0 return;
52             }
53              
54             has _pmldoc => (
55             isa => 'Treex::PML::Document',
56             is => 'rw',
57             init_arg => 'pml_doc',
58             writer => '_set_pmldoc',
59             handles => {
60             set_filename => 'changeFilename',
61             map { $_ => $_ }
62             qw( clone writeFile writeTo filename URL
63             changeFilename changeURL fileFormat changeFileFormat
64             backend changeBackend encoding changeEncoding userData
65             changeUserData metaData changeMetaData listMetaData
66             appData changeAppData listAppData
67              
68             documentRootData
69              
70             FS changeFS
71              
72             hint changeHint pattern_count pattern patterns
73             changePatterns tail changeTail
74              
75             trees changeTrees treeList tree delete_tree lastTreeNo notSaved
76             currentTreeNo currentNode nodes value_line value_line_list
77             determine_node_type )
78             },
79             builder => '_create_empty_pml_doc',
80             );
81              
82             has _index => (
83             is => 'rw',
84             default => sub { return {} },
85             );
86              
87             has _backref => (
88             is => 'rw',
89             default => sub { return {} },
90             );
91              
92             has _latest_node_number => ( # for generating document-unique IDs
93             is => 'rw',
94             default => 0,
95             );
96              
97 24     24   230 use Treex::PML::Factory;
  24         56  
  24         99543  
98             my $factory = Treex::PML::Factory->new();
99              
100             my $highest_file_number = 1;
101              
102             # the description attribute is stored inside the meta structures of pml documents,
103             # that is why it is not realized as a regular Moose attribute
104              
105             sub set_description {
106 2     2 0 15 my ( $self, $attr_value ) = @_;
107              
108             return Treex::PML::Node::set_attr(
109             $self->metaData('pml_root')->{meta},
110 2         10 'description', $attr_value
111             );
112             }
113              
114             sub description {
115 4     4 1 178 my $self = shift;
116 4         20 return Treex::PML::Node::attr( $self->metaData('pml_root')->{meta}, 'description' );
117             }
118              
119             sub build_file_number {
120 31     31 0 1063 return sprintf "%03d", $highest_file_number++;
121             }
122              
123             # Full filename without the extension
124             sub full_filename {
125 2 50   2 1 18 log_fatal 'Incorrect number of arguments' if @_ != 1;
126 2         6 my $self = shift;
127 2         5 my $path = '';
128 2 50 33     68 if (defined $self->path && $self->path ne ''){
129 0         0 $path = $self->path;
130 0 0       0 $path .= '/' if $path !~ m{/$};
131             }
132 2         94 return $path . $self->file_stem . $self->file_number;
133             }
134              
135             sub BUILD {
136 32     32 0 92 my $self = shift;
137 32         115 my ($params_rf) = @_;
138 32         76 my $pmldoc;
139              
140 32 50       127 if ( defined $params_rf ) {
141              
142             # creating Treex::Core::Document from an already existing Treex::PML::Document instance
143 32 50       247 if ( $params_rf->{pmldoc} ) {
    100          
144 0         0 $pmldoc = $params_rf->{pmldoc};
145             }
146              
147             # loading Treex::Core::Document from a file
148             elsif ( $params_rf->{filename} ) {
149              
150 10 50       63 if ( $params_rf->{filename} =~ /.streex$/ ) {
151 0         0 log_fatal 'Storable (.streex) docs must be retrieved by Treex::Core::Document->retrieve_storable($filename)';
152             }
153              
154             else {
155              
156             # If the file contains invalid PML (e.g. unknown afun value)
157             # Treex::PML fails with die.
158             # TODO: we should rather catch the die message and report it via log_fatal
159 10         28 $pmldoc = eval {
160             # In r10421, ZŽ added here recover => 1:
161             # $factory->createDocumentFromFile( $params_rf->{filename}, { recover => 1 });
162             # However, if the file contains invalid PML (e.g. unknown afun value), the recover=>1 option
163             # results in returning a $pmldoc which seems to be OK, but it contains no bundles,
164             # so Treex crashes on subsequent blocks which is misleading for users.
165             # If we really want to be fault-tolerant, it seems we would need to set Treex::PML::Instance::Reader::STRICT=0,
166             # but I don't know enough about PML internals and I think it's better to make such errors fatal.
167             # Martin Popel
168             # 2018-03-05: Dan Zeman: adding recover=>1 again.
169             # Random strange errors happen with visually correct files on cluster in the PML backend ("extra content after document end").
170             # It is paralyzing my work.
171 10         97 $factory->createDocumentFromFile( $params_rf->{filename}, {'recover' => 1} );
172             };
173 10 50       8865133 if ($Treex::PML::FSError) {
174 0         0 log_warn "Treex::PML::FSError: $Treex::PML::FSError";
175 0         0 log_warn "Trying to process the document anyway.";
176             }
177 10 0       60 log_fatal "Error while loading " . $params_rf->{filename} . ( $@ ? "\n$@" : '' )
    50          
178             if !defined $pmldoc;
179             }
180             }
181             }
182              
183             # constructing treex document from an existing file
184 32 100       136 if ($pmldoc) {
185 10         645 $self->_set_pmldoc($pmldoc);
186              
187             # ensuring Treex::Core types (partially copied from the factory)
188             # $doczone hashref will be reused as the blessed instance variable
189 10         549 for my $doczone ($self->get_all_zones()){
190 3         820 Treex::Core::DocZone->new($doczone);
191             }
192              
193 10         308 $self->_rebless_and_index();
194             }
195              
196 32         222 $self->deserialize_wild;
197 32         173 foreach my $bundle ( $self->get_bundles ) {
198 12         246 $bundle->deserialize_wild;
199 12         68 foreach my $bundlezone ( $bundle->get_all_zones ) {
200 14         1269 foreach my $tree ( $bundlezone->get_all_trees ){
201 32 100       2684 my $ordered = $tree->type->get_structure_name =~ /[at]-(root|node)/ ? 1 : 0;
202 32         574 my $correct_ord = 0;
203 32 100       235 my @nodes = $tree->get_descendants( { add_self => 1, ($ordered ? (ordered => 1) : ()) } );
204 32         119 foreach my $node (@nodes){
205             # normalize ord, so there are no gaps
206 42 100       553 if ($ordered){
207 28         921 $node->_set_ord($correct_ord);
208 28         60 $correct_ord++;
209             }
210 42         248 $node->deserialize_wild;
211             # Interset, if present, must be deserialized after wild because it relies on wild to take care of the 'other' feature.
212 42 100       861 if ( $node->DOES('Treex::Core::Node::Interset') ) {
213 28         11880 $node->deserialize_iset;
214             }
215             }
216             }
217             }
218             }
219              
220 32         3263 return;
221             }
222              
223             sub _rebless_and_index {
224 10     10   34 my $self = shift;
225 10         54 foreach my $bundle ( $self->get_bundles ) {
226 12         298 bless $bundle, 'Treex::Core::Bundle';
227              
228 12         416 $bundle->_set_document($self);
229              
230 12 100       57 if ( defined $bundle->{zones} ) {
231 8         68 foreach my $zone ( map { $_->value() } $bundle->{zones}->elements ) {
  14         163  
232              
233             # $zone hashref will be reused as the blessed instance variable
234 14         166 Treex::Core::BundleZone->new($zone);
235 14         3423 $zone->_set_bundle($bundle);
236              
237 14         66 foreach my $tree ( $zone->get_all_trees ) {
238 32         66 my $layer;
239 32 50       144 if ( $tree->type->get_structure_name =~ /(\S)-(root|node|nonterminal|terminal)/ ) {
240 32         672 $layer = uc($1);
241             }
242             else {
243 0         0 log_fatal "Unexpected member in zone structure: " . $tree->type->get_structure_name;
244             }
245 32         132 foreach my $node ( $tree, $tree->descendants ) { # must still call Treex::PML::Node's API
246 42         15695 bless $node, "Treex::Core::Node::$layer";
247 42         243 $self->index_node_by_id( $node->get_id, $node );
248 42 100 100     193 if ($layer eq 'A' && $node->{iset}){
249 3         10 $node->{iset} = Lingua::Interset::FeatureStructure->new(%{$node->{iset}});
  3         38  
250             }
251             }
252 32         30116 $tree->_set_zone($zone);
253             }
254             }
255             }
256             }
257 10         63 return;
258             }
259              
260             sub _pml_attribute_hash {
261 0     0   0 my $self = shift;
262 0         0 return $self->metaData('pml_root')->{meta};
263             }
264              
265             #my $_treex_schema_file = Treex::PML::ResolvePath( '.', 'treex_schema.xml', 1 );
266             my $_treex_schema_file = Treex::Core::Config->pml_schema_dir . "/" . 'treex_schema.xml';
267             if ( not -f $_treex_schema_file ) {
268             log_fatal "Can't find PML schema $_treex_schema_file";
269             }
270              
271             my $_treex_schema = Treex::PML::Schema->new( { filename => $_treex_schema_file } );
272              
273             sub _create_empty_pml_doc { ## no critic (ProhibitUnusedPrivateSubroutines)
274 32     32   504 my $fsfile = Treex::PML::Document->create
275             (
276             name => "x", #$filename, ???
277             FS => Treex::PML::FSFormat->new(
278             {
279             'deepord' => ' N' # ???
280             }
281             ),
282             trees => [],
283             backend => 'PMLBackend',
284             encoding => "utf-8",
285             );
286              
287 32         14980 $fsfile->changeMetaData( 'schema-url', 'treex_schema.xml' );
288 32         427 $fsfile->changeMetaData( 'schema', $_treex_schema );
289 32         463 $fsfile->changeMetaData( 'pml_root', { meta => {}, bundles => undef, } );
290 32         1305 return $fsfile;
291             }
292              
293             # --- INDEXING
294              
295             sub index_node_by_id {
296 414     414 1 8711 my $self = shift;
297 414         1953 my ( $id, $node ) = pos_validated_list(
298             \@_,
299             { isa => 'Treex::Type::Id' },
300             { isa => 'Maybe[Treex::Core::Node]' }, #jde to takhle?
301             );
302 414         12337 my $index = $self->_index;
303 414 100       1034 if ( defined $node ) {
304 303         948 $index->{$id} = $node;
305 303         1313 weaken $index->{$id};
306              
307 303         1179 my $refs = $node->_get_referenced_ids;
308 303         527 foreach my $type ( keys %{$refs} ) {
  303         885  
309 4         19 $self->index_backref( $type, $id, $refs->{$type} );
310             }
311             }
312             else {
313 111         310 delete $index->{$id};
314             }
315 414         1247 return;
316             }
317              
318             # Add references to the reversed references list
319             sub index_backref {
320 24     24 0 78 my ( $self, $type, $source, $targets ) = @_;
321 24         671 my $backref = $self->_backref;
322              
323 24         51 foreach my $target ( @{$targets} ) {
  24         64  
324 30 100       80 next if ( !defined($target) );
325 25   100     125 my $target_backrefs = $backref->{$target} // {};
326 25         65 $backref->{$target} = $target_backrefs;
327              
328 25 100       86 $target_backrefs->{$type} = [] if ( !$target_backrefs->{$type} );
329 25         41 push @{ $target_backrefs->{$type} }, $source;
  25         78  
330             }
331 24         76 return;
332             }
333              
334             # Remove references from the reversed references list
335             sub remove_backref {
336 10     10 0 35 my ( $self, $type, $source, $targets ) = @_;
337 10         298 my $backref = $self->_backref;
338              
339 10         23 foreach my $target ( @{$targets} ) {
  10         25  
340 16 50       43 next if ( !defined($target) );
341 16         34 my $target_backrefs = $backref->{$target};
342 16 50       37 next if ( !$target_backrefs );
343              
344 16         22 $target_backrefs->{$type} = [ grep { $_ ne $source } @{ $target_backrefs->{$type} } ];
  14         50  
  16         42  
345             }
346 10         33 return;
347             }
348              
349             # Return a hash of references ( type->[nodes] ) leading to the node with the given id
350             sub get_references_to_id {
351 24     24 1 65 my ( $self, $id ) = @_;
352 24         645 my $backref = $self->_backref;
353              
354 24 100       92 return if ( !$backref->{$id} );
355 22         65 return $backref->{$id}; # TODO clone ?
356             }
357              
358             # Remove all references and backreferences leading to the $node (calls remove_reference() on the source nodes)
359             sub _remove_references_to_node {
360 69     69   134 my ( $self, $node ) = @_;
361 69         1633 my $id = $node->id;
362 69         1803 my $backref = $self->_backref;
363              
364             # First, delete backreferences to the $node
365 69         208 my $refs = $node->_get_referenced_ids();
366 69         114 foreach my $type ( keys %{$refs} ) {
  69         197  
367 1         6 $self->remove_backref( $type, $id, $refs->{$type} );
368             }
369              
370             # Second, delete references to the $node
371 69 100       248 return if ( !$backref->{$id} );
372 4         9 my $node_backref = $backref->{$id};
373              
374 4         8 foreach my $type ( keys %{$node_backref} ) {
  4         18  
375 6         11 foreach my $source ( @{ $node_backref->{$type} } ) {
  6         19  
376 6         19 $self->get_node_by_id($source)->remove_reference( $type, $id );
377             }
378             }
379              
380             # Third, delete backreferences from the $node
381 4         13 delete $backref->{$id};
382 4         18 return;
383             }
384              
385             sub id_is_indexed {
386 183     183 1 311 my $self = shift;
387 183         826 my ($id) = pos_validated_list(
388             \@_,
389             { isa => 'Treex::Type::Id' },
390             );
391 183         5705 return ( defined $self->_index->{$id} );
392             }
393              
394             sub get_node_by_id {
395 71     71 1 1100 my $self = shift;
396 71         298 my ($id) = pos_validated_list(
397             \@_,
398             { isa => 'Treex::Type::Id' },
399             );
400 71 50       2122 if ( defined $self->_index->{$id} ) {
401 71         1804 return $self->_index->{$id};
402             }
403             else {
404 0         0 log_fatal "ID not indexed: id=\"$id\"";
405              
406             # This is something very fatal. Treex assumes every node ID to
407             # be valid and pointing to a node *in the given document*.
408             # (It is fine to have a node with no a/lex.rf
409             # attribute, but if the attribute is there, the value
410             # has to be an ID within the document.)
411             #
412             # If your data violates the requirement and your IDs point to
413             # a different document, the only hack we suggest is to drop such
414             # references...
415             }
416 0         0 return;
417             }
418              
419             sub get_all_node_ids {
420 0 0   0 1 0 log_fatal('Incorrect number of arguments') if @_ != 1;
421 0         0 my $self = shift;
422 0         0 return ( keys %{ $self->_index } );
  0         0  
423             }
424              
425             # -------------------------------------- ACCESS TO BUNDLES -------------------
426              
427             sub get_bundles {
428 98 50   98 1 5269 log_fatal('Incorrect number of arguments') if @_ != 1;
429 98         207 my $self = shift;
430 98         524 return $self->trees;
431             }
432              
433             sub create_bundle {
434 28     28 1 2455 my ( $self, $arg_ref ) = @_;
435 28         782 my $fsfile = $self->_pmldoc();
436 28         79 my $new_bundle;
437             my $position_of_new;
438              
439 28 100 66     139 if ( $arg_ref and ( $arg_ref->{after} or $arg_ref->{before} ) ) {
      66        
440 2 100       7 my $reference_bundle = ( $arg_ref->{after} ) ? $arg_ref->{after} : $arg_ref->{before};
441 2         9 my $position_of_reference = $reference_bundle->get_position;
442 2 100       6 $position_of_new = $position_of_reference + ( $arg_ref->{after} ? 1 : 0 );
443             }
444              
445             else { # default: append at the end of the document
446 26         97 $position_of_new = scalar( $self->get_bundles() );
447             }
448              
449 28         596 $new_bundle = $fsfile->new_tree($position_of_new);
450 28         1981 $new_bundle->set_type_by_name( $fsfile->metaData('schema'), 'bundle.type' );
451 28         2375 bless $new_bundle, "Treex::Core::Bundle"; # is this correct/sufficient with Moose ????
452 28         1073 $new_bundle->_set_document($self);
453              
454 28         153 $new_bundle->set_id( "s" . ( $fsfile->lastTreeNo + 1 ) );
455              
456 28         114 return $new_bundle;
457             }
458              
459             # ----------------------- ACCESS TO ZONES ------------------------------------
460              
461             sub create_zone {
462 4     4 1 685 my $self = shift;
463 4         48 my ( $language, $selector ) = pos_validated_list(
464             \@_,
465             { isa => 'Treex::Type::LangCode' },
466             { isa => 'Treex::Type::Selector', default => '' },
467             );
468              
469 4         59 my $new_zone = Treex::Core::DocZone->new(
470             {
471             'language' => $language,
472             'selector' => $selector
473             }
474             );
475              
476 4         1199 my $new_element = Treex::PML::Seq::Element->new( 'zone', $new_zone );
477              
478 4         43 my $meta = $self->metaData('pml_root')->{meta};
479 4 100       75 if ( defined $meta->{zones} ) {
480 2         11 $meta->{zones}->unshift_element_obj($new_element);
481             }
482             else {
483 2         20 $meta->{zones} = Treex::PML::Seq->new( [$new_element] );
484             }
485              
486 4         81 return $new_zone;
487             }
488              
489             sub get_all_zones {
490 15     15 0 48 my $self = shift;
491 15         101 my $meta = $self->metaData('pml_root')->{meta};
492 15 100       384 return if !$meta->{zones};
493              
494             # Each element is a pair [$name, $value]. We need just the values.
495 6         34 return map {$_->[1]} $meta->{zones}->elements;
  16         118  
496             }
497              
498             sub get_zone {
499 4     4 1 16 my $self = shift;
500 4         35 my ( $language, $selector ) = pos_validated_list(
501             \@_,
502             { isa => 'Treex::Type::LangCode' },
503             { isa => 'Treex::Type::Selector', default => '' },
504             );
505              
506 4         18 foreach my $zone ($self->get_all_zones()) {
507 12 100 66     311 return $zone if $zone->language eq $language && $zone->selector eq $selector;
508             }
509 0         0 return;
510             }
511              
512             sub get_or_create_zone {
513 0     0 1 0 my $self = shift;
514 0         0 my ( $language, $selector ) = pos_validated_list(
515             \@_,
516             { isa => 'Treex::Type::LangCode' },
517             { isa => 'Treex::Type::Selector', default => '' },
518             );
519              
520 0         0 my $fs_zone = $self->get_zone( $language, $selector );
521 0 0       0 if ( not defined $fs_zone ) {
522 0         0 $fs_zone = $self->create_zone( $language, $selector );
523             }
524 0         0 return $fs_zone;
525             }
526              
527             # -------------------- LOADING AND SAVING ------------------------------------
528              
529             sub load {
530 0     0 1 0 my $self = shift;
531 0         0 return $self->_pmldoc->load(@_);
532              
533             # TODO: this is unfinished: should be somehow connected with the code in BUILD
534             }
535              
536             sub save {
537 11     11 1 1180 my $self = shift;
538 11         38 my ($filename) = @_;
539              
540 11 100       79 if ( $filename =~ /\.streex$/ ) {
541 1 50   1   55 open( my $F, ">:via(gzip)", $filename ) or log_fatal $!;
  1         9  
  1         2  
  1         8  
542 1         3957 print $F Storable::nfreeze($self);
543 1         17617 close $F;
544              
545             # using Storable::nstore_fd($self,*$F) emits 'Inappropriate ioctl for device'
546             }
547              
548             else {
549 10         56 $self->_serialize_all_wild();
550 10         295 return $self->_pmldoc->save(@_);
551             }
552              
553 1         1443 return;
554             }
555              
556             sub _serialize_all_wild {
557 10     10   31 my ($self) = @_;
558 10         67 $self->serialize_wild;
559 10         45 foreach my $bundle ( $self->get_bundles ) {
560 12         307 $bundle->serialize_wild;
561 12         63 foreach my $bundlezone ( $bundle->get_all_zones ) {
562 14         116 foreach my $node ( map { $_->get_descendants( { add_self => 1 } ) } $bundlezone->get_all_trees ) {
  32         201  
563             # Interset, if present, must be serialized before wild because it relies on wild to take care of the 'other' feature.
564 42 100       1079 if ( $node->DOES('Treex::Core::Node::Interset') ) {
565 28         13301 $node->serialize_iset;
566             }
567 42         5846 $node->serialize_wild;
568             }
569             }
570             }
571 10         68 return;
572             }
573              
574             sub retrieve_storable {
575 1     1 1 14 my ( $class, $file ) = @_; # $file stands for a file name, but it can be also file handle (needed by the TrEd backend for .streex)
576              
577 1         2 my $FILEHANDLE;
578 1         3 my $opened = 0;
579              
580 1 50 33     10 if ( ref($file) and reftype($file) eq 'GLOB' ) {
581 0         0 $FILEHANDLE = $file;
582             }
583             else {
584 1 50       9 log_fatal "filename=$file, but Treex::Core::Document->retrieve_storable(\$filename) can be used only for .streex files"
585             unless $file =~ /\.streex$/;
586 1 50       26 open $FILEHANDLE, "<:via(gzip)", $file or log_fatal($!);
587 1         2235 $opened = 1;
588             }
589              
590 1         2 my $serialized;
591              
592             # reading it this way is silly, but both slurping the file or
593             # using Storable::retrieve_fd lead to errors when used with via(gzip)
594 1         9 while (<$FILEHANDLE>) {
595 5641         407497 $serialized .= $_;
596             }
597              
598 1 50       32 if ( $opened ) {
599 1         9 close($FILEHANDLE);
600             }
601              
602             # my $retrieved_doc = Storable::retrieve_fd(*$FILEHANDLE) or log_fatal($!);
603 1 50       238 my $retrieved_doc = Storable::thaw($serialized) or log_fatal $!;
604              
605 1 50       6474 if ( not ref($file) ) {
606 1         74 $retrieved_doc->set_loaded_from($file);
607 1         32 my ( $volume, $dirs, $file_name ) = File::Spec->splitpath($file);
608 1         41 $retrieved_doc->set_path( $volume . $dirs );
609              
610             # $retrieved_doc->changeFilename($file); # why this doesn't affect the name displayed in TrEd?
611             }
612              
613             # *.streex files saved before r8789 (2012-05-29) have no PML types with nodes, let's fix it
614             # TODO: delete this hack as soon as no such old streex files are needed.
615 1         7 foreach my $bundle ( $retrieved_doc->get_bundles() ) {
616 2         51 foreach my $bundlezone ( $bundle->get_all_zones() ) {
617 1         13 foreach my $node ( map { $_->get_descendants() } $bundlezone->get_all_trees() ) {
  1         16  
618              
619             # skip this hack if we are dealing with a new streex file
620             #return $retrieved_doc if $node->type;
621             # This shortcut does not work since old files have only *some* nodes without types
622 1         10 $node->fix_pml_type();
623             }
624             }
625             }
626              
627 1         8 return $retrieved_doc;
628             }
629              
630             __PACKAGE__->meta->make_immutable;
631              
632             1;
633              
634             __END__
635              
636              
637              
638             =for Pod::Coverage BUILD build_file_number description set_description
639              
640             =encoding utf-8
641              
642             =head1 NAME
643              
644             Treex::Core::Document - representation of a text and its linguistic analyses in the Treex framework
645              
646             =head1 VERSION
647              
648             version 2.20210102
649              
650             =head1 DESCRIPTION
651              
652             A document consists of a sequence of bundles, mirroring a sequence
653             of natural language sentences (typically, but not necessarily,
654             originating from the same text). Attributes (attribute-value pairs)
655             can be attached to a document as a whole.
656              
657             Note that the references from the bundles to the containing document are weak,
658             so make sure you always keep a reference to the document in scope to prevent
659             the contents of the document from being garbage-collected.
660              
661             =head1 ATTRIBUTES
662              
663             C<Treex::Core::Document>'s instances have the following attributes:
664              
665             =over 4
666              
667             =item description
668              
669             Textual description of the file's content that is stored in the file.
670              
671             =item loaded_from
672              
673             =item path
674              
675             =item file_stem
676              
677             =item file_number
678              
679             =back
680              
681             The attributes can be accessed using semi-affordance accessors:
682             getters have the same names as attributes, while setters start with
683             C<set_>. For example, the attribute C<path> has a getter C<path()> and a setter C<set_path($path)>
684              
685              
686              
687             =head1 METHODS
688              
689             =head2 Constructor
690              
691             =over 4
692              
693             =item my $new_document = Treex::Core::Document->new;
694              
695             creates a new empty document object.
696              
697             =item my $new_document = Treex::Core::Document->new( { pmldoc => $pmldoc } );
698              
699             creates a C<Treex::Core::Document> instance from an already existing L<Treex::PML::Document> instance
700              
701             =item my $new_document = Treex::Core::Document->new( { filename => $filename } );
702              
703             loads a C<Treex::Core::Document> instance from a .treex file
704              
705             =back
706              
707              
708             =head2 Access to zones
709              
710             Document zones are instances of L<Treex::Core::DocZone>, parametrized
711             by language code and possibly also by another free label
712             called selector, whose purpose is to distinguish zones for the same language
713             but from a different source.
714              
715             =over 4
716              
717             =item my $zone = $doc->create_zone( $langcode, ?$selector );
718              
719             =item my $zone = $doc->get_zone( $langcode, ?$selector );
720              
721             =item my $zone = $doc->get_or_create_zone( $langcode, ?$selector );
722              
723             =back
724              
725              
726             =head2 Access to bundles
727              
728             =over 4
729              
730             =item my @bundles = $document->get_bundles();
731              
732             Returns the array of bundles contained in the document.
733              
734              
735             =item my $new_bundle = $document->create_bundle();
736              
737             Creates a new empty bundle and appends it
738             at the end of the document.
739              
740             =item my $new_bundle = $document->new_bundle_before( $existing_bundle );
741              
742             Creates a new empty bundle and inserts it
743             in front of the existing bundle.
744              
745             =item my $new_bundle = $document->new_bundle_after( $existing_bundle );
746              
747             Creates a new empty bundle and inserts it
748             after the existing bundle.
749              
750             =back
751              
752              
753             =head2 Node indexing
754              
755             =over 4
756              
757             =item $document->index_node_by_id( $id, $node );
758              
759             The node is added to the document's indexing table C<id2node> (it is done
760             automatically in L<Treex::Core::Node::set_attr()|Treex::Core::Node/set_attr>
761             if the attribute name is 'C<id>'). When using C<undef> in the place of the
762             second argument, the entry for the given id is deleted from the hash.
763              
764              
765             =item my $node = $document->get_node_by_id( $id );
766              
767             Return the node which has the value C<$id> in its 'C<id>' attribute,
768             no matter to which tree and to which bundle in the given document
769             the node belongs to.
770              
771             It is prohibited in Treex for IDs to point outside of the current document.
772             In rare cases where your data has such links, we recommend you to split the
773             documents differently or hack it by dropping the problematic links.
774              
775             =item $document->id_is_indexed( $id );
776              
777             Return C<true> if the given C<id> is already present in the indexing table.
778              
779             =item $document->get_all_node_ids();
780              
781             Return the array of all node identifiers indexed in the document.
782              
783             =item $document->get_references_to_id( $id );
784              
785             Return all references leading to the given node id in a hash (keys are reference types, e.g. 'alignment',
786             'a/lex.rf' etc., values are arrays of nodes referencing this node).
787              
788             =item $document->remove_refences_to_id( $id );
789              
790             Remove all references to the given node id (calls remove_reference() on each referencing node).
791              
792             =back
793              
794             =head2 Serializing
795              
796             =over 4
797              
798             =item my $document = load($filename, \%opts)
799              
800             Loads document from C<$filename> given C<%opts> using L<Treex::PML::Document::load()>
801              
802             =item $document->save($filename)
803              
804             Saves document to C<$filename> using L<Treex::PML::Document::save()>,
805             or by the Storable module if the file's extension is .streex.gz.
806              
807             =item Treex::Core::Document->retrieve_storable($filename)
808              
809             Loading a document from the .streex (Storable) format.
810              
811             =back
812              
813             =head2 Other
814              
815             =over 4
816              
817             =item my $filename = $doc->full_filename;
818              
819             full filename without the extension
820              
821             =back
822              
823              
824             =head1 AUTHOR
825              
826             Zdeněk Žabokrtský <zabokrtsky@ufal.mff.cuni.cz>
827              
828             Martin Popel <popel@ufal.mff.cuni.cz>
829              
830             Ondřej Dušek <odusek@ufal.mff.cuni.cz>
831              
832             =head1 COPYRIGHT AND LICENSE
833              
834             Copyright © 2011-2012 by Institute of Formal and Applied Linguistics, Charles University in Prague
835              
836             This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.