File Coverage

blib/lib/Treex/Core/Block.pm
Criterion Covered Total %
statement 67 216 31.0
branch 10 76 13.1
condition 0 18 0.0
subroutine 16 26 61.5
pod 8 13 61.5
total 101 349 28.9


line stmt bran cond sub pod time code
1             package Treex::Core::Block;
2             $Treex::Core::Block::VERSION = '2.20210102';
3 3     3   106210 use Moose;
  3         483274  
  3         25  
4 3     3   18152 use Treex::Core::Common;
  3         10  
  3         20  
5 3     3   18280 use Treex::Core::Resource;
  3         8  
  3         169  
6 3     3   27 use Digest::MD5 qw(md5_hex);
  3         9  
  3         178  
7 3     3   28 use Storable;
  3         8  
  3         190  
8 3     3   23 use Time::HiRes;
  3         8  
  3         34  
9 3     3   1999 use App::whichpm 'which_pm';
  3         1906  
  3         181  
10 3     3   23 use Readonly;
  3         7  
  3         149  
11 3     3   23 use List::MoreUtils qw(uniq);
  3         9  
  3         46  
12              
13             has selector => ( is => 'ro', isa => 'Str', default => '' );
14             has language => ( is => 'ro', isa => 'Str', default => 'all' );
15              
16             has scenario => (
17             is => 'ro',
18             isa => 'Treex::Core::Scenario',
19             writer => '_set_scenario',
20             weak_ref => 1,
21             );
22              
23             has select_bundles => (
24             is => 'ro',
25             default => 0,
26             documentation => 'apply process_bundle only on the specified bundles,'
27             . ' e.g. "1-4,6,8-12". The default is 0 which means all bundles. Useful for debugging.',
28             );
29              
30             has if_missing_zone => (
31             is => 'ro',
32             isa => enum( [qw(fatal warn ignore create)] ),
33             default => 'fatal',
34             documentation => 'What to do if process_zone is to be called on a zone'
35             . ' (specified by parameters language and selector) that is missing in a given bundle?',
36             );
37              
38             has if_missing_tree => (
39             is => 'ro',
40             isa => enum( [qw(fatal warn ignore create)] ),
41             default => 'fatal',
42             documentation => 'What to do if process_[atnp]tree is to be called on a tree'
43             . ' that is missing in a given zone?',
44             );
45              
46             has if_missing_bundles => (
47             is => 'ro',
48             isa => enum( [qw(fatal warn ignore)] ),
49             default => 'fatal',
50             documentation => 'What to do if process_document is to be called on a document'
51             . ' with no bundles?',
52             );
53              
54              
55              
56             has report_progress => (
57             is => 'ro',
58             isa => 'Str',
59             default => 0,
60             documentation => 'Report which bundle (TODO: zone,tree,node) is being processed via log_info. Useful for debugging.',
61             );
62              
63             has [qw(_is_bundle_selected _is_language_selected _is_selector_selected)] => ( is => 'rw' );
64              
65             has _hash => ( is => 'rw', isa => 'Str' );
66              
67             has is_started => ( is => 'ro', isa => 'Bool', writer => '_set_is_started', default => 0 );
68              
69             Readonly our $DOCUMENT_PROCESSED => 1;
70             Readonly our $DOCUMENT_FROM_CACHE => 2;
71              
72              
73             # For load_other_block() and get_or_load_other_block()
74             # TODO this could also be in Scenario instead of Block...
75              
76             # new other block of same name replaces old one here
77             has _loaded_other_blocks => ( is => 'rw', isa => 'HashRef', default => sub { {} } );
78             # all loaded other blocks, no replacing
79             has _loaded_other_blocks_array => ( is => 'rw', isa => 'ArrayRef', default => sub { [] } );
80              
81             sub zone_label {
82 0     0 1 0 my ($self) = @_;
83 0 0       0 my $label = $self->language or return;
84 0 0 0     0 if ( defined $self->selector && $self->selector ne '' ) {
85 0         0 $label .= '_' . $self->selector;
86             }
87 0         0 return $label;
88             }
89              
90             # TODO
91             # has robust => ( is=> 'ro', isa=>'Bool', default=>0,
92             # documentation=>'no fatal errors in robust mode');
93              
94             sub BUILD {
95 12     12 0 40566 my $self = shift;
96              
97 12 50       475 if ( $self->select_bundles ) {
98 0 0       0 log_fatal 'select_bundles=' . $self->select_bundles . ' does not match /^\d+(-\d+)?(,\d+(-\d+)?)*$/'
99             if $self->select_bundles !~ /^\d+(-\d+)?(,\d+(-\d+)?)*$/;
100 0         0 my %selected;
101 0         0 foreach my $span ( split /,/, $self->select_bundles ) {
102 0 0       0 if ( $span =~ /(\d+)-(\d+)/ ) {
103 0         0 @selected{ $1 .. $2 } = ( $1 .. $2 );
104             }
105             else {
106 0         0 $selected{$span} = 1;
107             }
108             }
109 0         0 $self->_set_is_bundle_selected( \%selected );
110             }
111              
112 12 100       383 if ( $self->language ne 'all' ) {
113 11         288 my @codes = split /,/, $self->language;
114 11         27 my %selected;
115 11         30 for my $code (@codes) {
116 11 50       73 log_fatal "'$code' is not a valid ISO 639 language code"
117             if !Treex::Core::Types::is_lang_code($code);
118 11         37 $selected{$code} = 1;
119             }
120 11         462 $self->_set_is_language_selected( \%selected );
121             }
122              
123 12 50       365 if ( $self->selector ne 'all' ) {
124 12 100       317 if ( $self->selector eq '' ) {
125 11         481 $self->_set_is_selector_selected( { q{} => 1 } );
126             }
127             else {
128 1         27 my @selectors = split /,/, $self->selector;
129 1         3 my %selected;
130 1         4 for my $selector (@selectors) {
131 1 50       7 log_fatal "'$selector' is not a valid selector name"
132             if $selector !~ /^[a-z\d]*$/i;
133 1         6 $selected{$selector} = 1;
134             }
135 1         37 $self->_set_is_selector_selected( \%selected );
136             }
137             }
138              
139 12         46 return;
140             }
141              
142             sub _compute_hash {
143 0     0   0 my $self = shift;
144              
145 0         0 my $md5 = Digest::MD5->new();
146              
147             # compute block parameters hash
148 0         0 my $params_str = "";
149             map {
150 0         0 $params_str .= $_ . "=" . $self->{$_};
151              
152             # log_warn("\t\t" . $_ . "=" . $self->{$_} . " - " . ref($self->{$_}));
153             }
154             sort # in canonical form
155 0         0 grep { !ref( $self->{$_} ) } # no references
156 0         0 grep { defined( $self->{$_} ) } # value has to be defined
157 0         0 grep { !/(scenario|block)/ }
158 0         0 keys %{$self};
  0         0  
159              
160             # Digest::MD5 cannot handle Unicode strings (it dies with "Wide character in subroutine entry")
161 3     3   5861 use Encode;
  3         8  
  3         8744  
162 0         0 $md5->add(Encode::encode_utf8($params_str));
163              
164             # compute block source code hash
165 0         0 my ( $block_filename, $block_version ) = which_pm( $self->get_block_name() );
166 0 0       0 open( my $block_fh, "<", $block_filename ) or log_fatal("Can't open '$block_filename': $!");
167 0         0 binmode($block_fh);
168 0         0 $md5->addfile($block_fh);
169 0         0 close($block_fh);
170              
171 0         0 $self->_set_hash( $md5->hexdigest );
172              
173 0         0 return;
174             }
175              
176             sub get_hash {
177 0     0 0 0 my $self = shift;
178 0 0       0 if (!$self->_hash){
179 0         0 $self->_compute_hash();
180             }
181 0         0 return $self->_hash;
182             }
183              
184             sub require_files_from_share {
185 2     2 1 5 my ( $self, @rel_paths ) = @_;
186 2         11 my $my_name = 'the block ' . $self->get_block_name();
187             return map {
188 2         6 log_info $self->get_block_name() . " requires file " . $_;
  0         0  
189 0         0 Treex::Core::Resource::require_file_from_share( $_, $my_name )
190             } @rel_paths;
191             }
192              
193             sub get_required_share_files {
194 2     2 1 5 my ($self) = @_;
195              
196             # By default there are no required share files.
197             # The purpose of this method is to be overriden if needed.
198 2         12 return ();
199             }
200              
201             sub process_document {
202 0     0 1 0 my $self = shift;
203 0         0 my ($document) = pos_validated_list(
204             \@_,
205             { isa => 'Treex::Core::Document' },
206             );
207              
208 0 0 0     0 if ( !$document->get_bundles() && $self->if_missing_bundles =~ /fatal|warn/){
209 0         0 my $message = "There are no bundles in the document and block " . $self->get_block_name() .
210             " doesn't override the method process_document. You can use prepend 'Util::SetGlobal if_missing_bundles=ignore' to allow processing empty documents. ";
211 0 0       0 log_fatal($message) if $self->if_missing_bundles eq 'fatal';
212 0         0 log_warn($message);
213             }
214              
215 0         0 my $bundleNo = 1;
216 0         0 foreach my $bundle ( $document->get_bundles() ) {
217 0 0 0     0 if ( !$self->select_bundles || $self->_is_bundle_selected->{$bundleNo} ) {
218 0         0 $self->process_bundle( $bundle, $bundleNo );
219             }
220 0         0 $bundleNo++;
221             }
222 0         0 return 1;
223             }
224              
225             sub _apply_function_on_each_zone {
226 0     0   0 my ($self, $doc, $function, @function_params) = @_;
227 0         0 my %zones;
228              
229             # When using "all", we must collect the zones used in the whole document.
230 0 0 0     0 if ($self->language eq 'all' || $self->selector eq 'all'){
231 0         0 foreach my $bundle ($doc->get_bundles){
232 0         0 foreach my $zone ($bundle->get_all_zones()){
233 0         0 $zones{$zone->get_label()} = 1;
234             }
235             }
236             }
237             # Otherwise, we can make a Cartesian product of lang(uage)s and sel(ector)s
238             else {
239 0         0 foreach my $lang (keys %{$self->_is_language_selected}){
  0         0  
240 0         0 foreach my $sel (keys %{$self->_is_selector_selected}){
  0         0  
241 0         0 $zones{$lang . '_' . $sel} = 1;
242             }
243             }
244             }
245              
246 0         0 my $orig_language = $self->language;
247 0         0 my $orig_selector = $self->selector;
248 0         0 foreach my $label (keys %zones){
249 0         0 my ($lang, $sel) = split /_/, $label;
250              
251             # pretend this block was called with only this one language and selector
252 0         0 $self->{language} = $lang;
253 0         0 $self->{selector} = $sel;
254 0         0 $function->(@function_params);
255             }
256 0         0 $self->{language} = $orig_language;
257 0         0 $self->{selector} = $orig_selector;
258 0         0 return;
259             }
260              
261             sub process_bundle {
262 0     0 1 0 my ( $self, $bundle, $bundleNo ) = @_;
263 0 0       0 if ($self->report_progress){
264 0         0 log_info "Processing bundle $bundleNo";
265             }
266              
267 0         0 my @zones = $bundle->get_all_zones();
268              
269 0 0       0 if ($self->if_missing_zone eq 'create') {
270 0         0 my (@langs, @sels);
271 0 0       0 if ($self->language eq 'all') {
272 0         0 @langs = uniq map{$_->language} @zones;
  0         0  
273             } else {
274 0         0 @langs = keys %{$self->_is_language_selected};
  0         0  
275             }
276 0 0       0 if ($self->selector eq 'all') {
277 0         0 @sels = uniq map{$_->selector} @zones;
  0         0  
278             } else {
279 0         0 @sels = keys %{$self->_is_selector_selected};
  0         0  
280             }
281              
282             # Cartesian product of lang(uage)s and sel(ector)s
283 0         0 @zones = map {my $l = $_; map{$bundle->get_or_create_zone($l, $_)} @sels} @langs;
  0         0  
  0         0  
  0         0  
284             } else {
285 0         0 @zones = $self->get_selected_zones(@zones);
286             }
287              
288 0 0 0     0 if (!@zones && $self->if_missing_zone =~ /fatal|warn/) {
289 0         0 my $message = "No zone (language="
290             . $self->language
291             . ", selector="
292             . $self->selector
293             . ") was found in a bundle and block " . $self->get_block_name()
294             . " doesn't override the method process_bundle";
295 0 0       0 log_fatal($message) if $self->if_missing_zone eq 'fatal';
296 0         0 log_warn($message);
297             }
298              
299 0         0 foreach my $zone (@zones) {
300 0         0 $self->process_zone( $zone, $bundleNo );
301             }
302 0         0 return;
303             }
304              
305             sub get_selected_zones {
306 1     1 0 5 my ( $self, @zones ) = @_;
307 1 50       33 if ( $self->language ne 'all') {
308 1         5 @zones = grep { $self->_is_language_selected->{ $_->language } } @zones;
  1         34  
309             }
310 1 50       30 if ( $self->selector ne 'all') {
311 1         4 @zones = grep { $self->_is_selector_selected->{ $_->selector } } @zones;
  1         34  
312             }
313              
314 1         4 return @zones;
315             }
316              
317             sub _try_process_layer {
318 0     0   0 my ( $self, $zone, $layer, $bundleNo ) = @_;
319 0         0 my $meta = $self->meta;
320              
321 0 0       0 if ( my $m = $meta->find_method_by_name("process_${layer}tree") ) {
322 0 0       0 if (!$zone->has_tree($layer)){
323 0 0       0 if ($self->if_missing_tree eq 'create'){
324 0         0 $zone->create_tree($layer);
325             } else {
326 0         0 return 0;
327             }
328             }
329              
330             #$self->process_atree($tree, $bundleNo);
331 0         0 $m->execute( $self, $zone->get_tree($layer), $bundleNo );
332 0         0 return 1;
333             }
334              
335 0 0       0 if ( my $m = $meta->find_method_by_name("process_${layer}node") ) {
336 0 0       0 if (!$zone->has_tree($layer)){
337 0 0       0 if ($self->if_missing_tree eq 'create'){
338 0         0 $zone->create_tree($layer);
339             } else {
340 0         0 return 0;
341             }
342             }
343 0         0 my $tree = $zone->get_tree($layer);
344              
345             # process_ptree should be executed also on the root node (usually the S phrase)
346 0 0       0 my @opts = $layer eq 'p' ? ( { add_self => 1 } ) : ();
347 0         0 foreach my $node ( $tree->get_descendants(@opts) ) {
348             # Skip nodes deleted by previous process_Xnode() call.
349 0 0       0 next if ref $node eq 'Treex::Core::Node::Deleted';
350              
351             #$self->process_anode($node, $bundleNo);
352 0         0 $m->execute( $self, $node, $bundleNo );
353             }
354 0         0 return 1;
355             }
356              
357 0         0 return 0;
358             }
359              
360             sub process_zone {
361 0     0 1 0 my ( $self, $zone, $bundleNo ) = @_;
362 0         0 my $overriden = 0;
363              
364 0         0 for my $layer (qw(a t n p)) {
365 0 0       0 if ($self->_try_process_layer( $zone, $layer, $bundleNo )){
366 0         0 $overriden++;
367             }
368             }
369              
370 0 0 0     0 if (!$overriden && $self->if_missing_tree =~ /fatal|warn/){
371             my $message = "At least one of the methods /process_(document|bundle|zone|[atnp](tree|node))/ "
372             . "must be overriden and the corresponding [atnp] trees must be present in bundles.\n"
373             . "The zone '" . $zone->get_label() . "' contains trees ( "
374 0         0 . ( join ',', map { $_->get_layer() } $zone->get_all_trees() ) . ").";
  0         0  
375 0 0       0 log_fatal($message) if $self->if_missing_tree eq 'fatal';
376 0         0 log_warn($message);
377             }
378              
379 0         0 return;
380             }
381              
382             sub process_start {
383 2     2 1 6 my ($self) = @_;
384              
385 2         15 $self->require_files_from_share( $self->get_required_share_files() );
386              
387 2         4 return;
388             }
389              
390             sub process_end {
391             my ($self) = @_;
392              
393             # default implementation is empty, but can be overriden
394             return;
395             }
396              
397             after 'process_end' => sub {
398             my ($self) = @_;
399             foreach my $other_block (@{$self->_loaded_other_blocks_array}) {
400             if ( $other_block->is_started ) {
401             $other_block->process_end();
402             }
403             }
404             $self->_set_is_started(0);
405             };
406              
407             sub get_block_name {
408 2     2 1 5 my $self = shift;
409 2         9 return ref($self);
410             }
411              
412             sub load_other_block {
413 0     0 0   my ($self, $other_block_name, $params_hash_ref) = @_;
414              
415 0           my $other_block_full_name = "Treex::Block::$other_block_name";
416              
417             # CONSTRUCT PARAMETERS HASH
418             # global params (TODO: do that?)
419 0           my %params = %{$self->scenario->_global_params};
  0            
420             # overridden by selected (TODO: all?) block params
421 0           $params{language} = $self->language;
422 0           $params{selector} = $self->selector;
423 0           $params{scenario} = $self->scenario;
424             # overridden by locally set params
425 0           @params{ keys %$params_hash_ref } = values %$params_hash_ref;
426              
427             # CREATE IT and start it
428 0 0         eval "use $other_block_full_name; 1;" or
429             log_fatal "Treex::Core::Block->get_other_block: " .
430             "Can't use block $other_block_name!\n$@\n";
431 0           my $other_block;
432 0 0         eval {
433 0           $other_block = $other_block_full_name->new( \%params );
434 0           1;
435             } or log_fatal "Treex::Core::Block->get_other_block: " .
436             "Can't initialize block $other_block_name!\n$@\n";
437 0           $other_block->process_start();
438              
439             # this may replace older block with same name
440 0           $self->_loaded_other_blocks->{$other_block_name} = $other_block;
441             # this not
442 0           push @{$self->_loaded_other_blocks_array}, $other_block;
  0            
443              
444 0           return $other_block;
445             }
446              
447             sub get_or_load_other_block {
448 0     0 0   my ($self, $other_block_name, $params_hash_ref) = @_;
449              
450             my $other_block =
451             exists ($self->_loaded_other_blocks->{$other_block_name})
452             ?
453 0 0         $self->_loaded_other_blocks->{$other_block_name}
454             :
455             $self->load_other_block($other_block_name, $params_hash_ref)
456             ;
457              
458 0           return $other_block;
459             }
460              
461             1;
462              
463             __END__
464              
465             =for Pod::Coverage BUILD build_language
466              
467             =encoding utf-8
468              
469             =head1 NAME
470              
471             Treex::Core::Block - the basic data-processing unit in the Treex framework
472              
473             =head1 VERSION
474              
475             version 2.20210102
476              
477             =head1 SYNOPSIS
478              
479             package Treex::Block::My::Block;
480             use Moose;
481             use Treex::Core::Common;
482             extends 'Treex::Core::Block';
483              
484             sub process_bundle {
485             my ( $self, $bundle) = @_;
486              
487             # bundle processing
488              
489             }
490              
491             =head1 DESCRIPTION
492              
493             C<Treex::Core::Block> is a base class serving as a common ancestor of
494             all Treex blocks.
495             C<Treex::Core::Block> can't be used directly in any scenario.
496             Use it's descendants which implement one of the methods
497             C<process_document()>, C<process_bundle()>, C<process_zone()>,
498             C<process_[atnp]tree()> or C<process_[atnp]node()>.
499              
500              
501             =head1 CONSTRUCTOR
502              
503             =over 4
504              
505             =item my $block = Treex::Block::My::Block->new();
506              
507             Instance of a block derived from C<Treex::Core::Block> can be created
508             by the constructor (optionally, a reference to a hash of block parameters
509             can be specified as the constructor's argument, see L</BLOCK PARAMETRIZATION>).
510             However, it is not likely to appear in your code since block initialization
511             is usually invoked automatically when initializing a scenario.
512              
513             =back
514              
515             =head1 METHODS FOR BLOCK EXECUTION
516              
517             You must override one of the following methods:
518              
519             =over 4
520              
521             =item $block->process_document($document);
522              
523             Applies the block instance on the given instance of
524             L<Treex::Core::Document>. The default implementation
525             iterates over all bundles in a document and calls C<process_bundle()>. So in
526             most cases you don't need to override this method.
527              
528             =item $block->process_bundle($bundle);
529              
530             Applies the block instance on the given bundle
531             (L<Treex::Core::Bundle>).
532              
533             =item $block->process_zone($zone);
534              
535             Applies the block instance on the given bundle zone
536             (L<Treex::Core::BundleZone>). Unlike
537             C<process_document> and C<process_bundle>, C<process_zone> requires block
538             attribute C<language> (and possibly also C<selector>) to be specified.
539              
540             =item $block->process_I<X>tree($tree);
541              
542             Here I<X> stands for a,t,n or p.
543             This method is executed on the root node of a tree on a given layer (a,t,n,p).
544              
545             =item $block->process_I<X>node($node);
546              
547             Here I<X> stands for a,t,n or p.
548             This method is executed on the every node of a tree on a given layer (a,t,n,p).
549             Note that for layers a, t, and n, this method is not executed on the root node
550             (because the root node is just a "technical" root without the attributes of regular nodes).
551             However, C<process_pnode> is executed also on the root node
552             (because its a regular non-terminal node with a phrase attribute, usually C<S>).
553              
554             =back
555              
556             =head2 $block->process_start();
557              
558             This method is called before all documents are processed.
559             This method is responsible for loading required models.
560              
561             =head2 $block->process_end();
562              
563             This method is called after all documents are processed.
564             The default implementation is empty, but derived classes can override it
565             to e.g. print some final summaries, statistics etc.
566             Overriding this method is preferable to both
567             standard Perl END blocks (where you cannot access C<$self> and instance attributes),
568             and DEMOLISH (which is not called in some cases, e.g. C<treex --watch>).
569              
570              
571              
572             =head1 BLOCK PARAMETRIZATION
573              
574             =over 4
575              
576             =item my $block = BlockGroup::My_Block->new({$name1=>$value1,$name2=>$value2...});
577              
578             Block instances can be parametrized by a hash containing parameter name/value
579             pairs.
580              
581             =item my $param_value = $block->get_parameter($param_name);
582              
583             Parameter values used in block construction can
584             be revealed by C<get_parameter> method (but cannot be changed).
585              
586             =back
587              
588             =head1 MISCEL
589              
590             =over 4
591              
592             =item my $langcode_selector = $block->zone_label();
593              
594             =item my $block_name = $block->get_block_name();
595              
596             It returns the name of the block module.
597              
598             =item my @needed_files = $block->get_required_share_files();
599              
600             If a block requires some files to be present in the shared part of Treex,
601             their list (with relative paths starting in
602             L<Treex::Core::Config-E<gt>share_dir|Treex::Core::Config/share_dir>) can be
603             specified by redefining by this method. By default, an empty list is returned.
604             Presence of the files is automatically checked in the block constructor. If
605             some of the required file is missing, the constructor tries to download it
606             from L<http://ufallab.ms.mff.cuni.cz>.
607              
608             This method should be used especially for downloading statistical models,
609             but not for installed tools or libraries.
610              
611             sub get_required_share_files {
612             my $self = shift;
613             return (
614             'data/models/mytool/'.$self->language.'/features.gz',
615             'data/models/mytool/'.$self->language.'/weights.tsv',
616             );
617             }
618              
619             =item require_files_from_share()
620              
621             This method checks existence of files given as parameters, it tries to download them if they are not present
622              
623             =back
624              
625             =head1 SEE ALSO
626              
627             L<Treex::Core::Node>,
628             L<Treex::Core::Bundle>,
629             L<Treex::Core::Document>,
630             L<Treex::Core::Scenario>,
631              
632             =head1 AUTHOR
633              
634             Zdeněk Žabokrtský <zabokrtsky@ufal.mff.cuni.cz>
635              
636             Martin Popel <popel@ufal.mff.cuni.cz>
637              
638             =head1 COPYRIGHT AND LICENSE
639              
640             Copyright © 2011-2012 by Institute of Formal and Applied Linguistics, Charles University in Prague
641              
642             This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.