File Coverage

blib/lib/HPCI.pm
Criterion Covered Total %
statement 16 18 88.8
branch n/a
condition n/a
subroutine 6 6 100.0
pod n/a
total 22 24 91.6


line stmt bran cond sub pod time code
1             package HPCI;
2             ### HPCI.pm ###################################################################
3              
4             ### INCLUDES ##################################################################
5              
6             # safe Perl
7 18     18   4221041 use warnings;
  18         59  
  18         635  
8 18     18   110 use strict;
  18         42  
  18         366  
9 18     18   93 use Carp;
  18         46  
  18         1039  
10 18     18   8949 use Module::Load;
  18         18161  
  18         116  
11 18     18   10542 use Module::Load::Conditional qw(can_load);
  18         398776  
  18         1320  
12 18     18   21897 use List::MoreUtils qw(uniq);
  0            
  0            
13              
14             our @extra_roles; # registry filled by add_extra_role; each entry is an arrayref of its arguments
15              
16             sub add_extra_role {
               # Class method: HPCI->add_extra_role( $cluster, $level, $role ).
               # Records one registration so that get_extra_roles can find it later.
17             # next line is documentation
18             # my ($cluster, $level, $role) = @_;
19             shift; # get rid of HPCI class name
               # the remaining arguments are stored as-is (no validation) as one entry
20             push @extra_roles, [ @_ ];
21             }
22              
23             sub get_extra_roles {
               # Return the list of roles registered with add_extra_role that apply
               # to the given cluster and level (an empty list if none match).
               # NOTE: unlike add_extra_role, no class name is shifted off @_ here,
               # so the first argument is taken to be the cluster itself.
24             my ($target_cluster, $target_level) = @_;
25             my @roles;
26             for my $role_bunch (@extra_roles) {
27             my ($cluster, $level, $roles) = @$role_bunch;
               # a registration for 'ALL' applies to every cluster
28             next unless $cluster eq 'ALL' || $cluster eq $target_cluster;
               # the level must match exactly (string comparison)
29             next unless $level eq $target_level;
               # a registration may hold a single role or an arrayref of roles
30             push @roles, ref $roles ? @$roles : $roles;
31             }
32             return @roles;
33             }
34              
35             my $default_attrs = {}; # file-scoped default attributes; group() starts from a copy of these
36              
37             sub add_default_attrs {
               # Class method: merge additional default attributes into $default_attrs.
               # Accepts either a single hashref or a key/value list.
38             shift; # get rid of HPCI class name
39             my $newhash = ref($_[0]) eq 'HASH' ? shift : { @_ };
               # nested hashes are merged key by key; any other value replaces the old one
40             _merge_hash( $default_attrs, $newhash );
41             }
42              
43             sub _merge_hash {
               # Recursively merge the contents of hashref $new into hashref $target,
               # modifying $target in place.
               #   $target - hashref that receives the merged values
               #   $new    - hashref whose values are merged in (and win on conflict)
               #   $path   - optional arrayref of the keys descended through so far;
               #             used only to build the error message
               # Croaks if either $target or $new is not a plain HASH reference,
               # which also happens when one side holds a hash at some key and the
               # other side holds a defined non-hash value at that same key.
44             my( $target, $new, $path ) = @_;
45             $path ||= [];
46             croak "not a hash when merging attribute hash{".join('}{',@$path)."}"
47             unless ref($target) eq 'HASH' && ref($new) eq 'HASH';
48             while (my($k,$v) = each %$new) {
               # recurse when either side has a hash at this key
49             if (ref($v) eq 'HASH' || (exists $target->{$k} && ref($target->{$k}) eq 'HASH')) {
               # create the sub-hash if the target has no defined value for this key yet
50             $target->{$k} //= {};
51             _merge_hash( $target->{$k}, $v, [ @$path, $k ] );
52             }
53             else {
               # plain assignment: non-hash references (e.g. arrays) are shared, not copied
54             $target->{$k} = $v;
55             }
56             }
57             }
58              
59             sub explist {
               # Flatten the argument list by one level and return the result:
               #   - an ARRAY reference is replaced by its elements
               #   - any other defined value is passed through unchanged
               #   - undef is dropped
               # Nested array references inside an expanded array are not expanded further.
60             return (
61             map {
62             ref($_) eq 'ARRAY' ? @$_
63             : defined($_) ? ( $_ )
64             : ( )
65             } @_
66             );
67             }
68              
69             # Collect the env_keys lists from every argument hash, in argument order.
70             # Each hash's env_keys entry is removed (deleted) from that hash as it is read.
71             # Duplicate keys are collapsed, keeping only the *LAST* occurrence, so the
72             # relative order given in args takes precedence for keys that appear in
73             # both default and args.
74             # So, the order is:
75             # [ keys that are only in default in the order they were specified in default ]
76             # [ then keys that are in args in the order they were specified in args ]
77             # No complaint is made if the same key is specified twice in either default
78             # or args, the earlier one(s) are simply ignored.
79             sub keylist {
80             my @keys;
81             for my $arg (@_) {
               # delete modifies the hash that was passed in; a missing entry means no keys
82             my $keys = (delete $arg->{env_keys}) // [];
83             push @keys, @$keys;
84             }
               # uniq keeps the first occurrence, so reversing before and after keeps the last one
85             return (reverse uniq reverse @keys);
86             }
87              
88             sub group {
               # Class method: HPCI->group( \%args ) or HPCI->group( key => value, ... ).
               # Builds the final attribute hash and returns a new object of class
               # HPCD::<cluster>::Group constructed from it.
               # Attributes are merged in this order, later sources overriding earlier ones:
               #   1. the defaults registered with add_default_attrs
               #   2. env_key_specific entries selected by the env_keys list
               #   3. cluster_specific entries for the chosen cluster
               #   4. the arguments passed to this call
               # Croaks if the arguments are neither a single hashref nor an even-length
               # list, or if no cluster is given in the arguments or the defaults.
               # NOTE(review): when the caller passes a hashref it is used directly, and
               # its env_keys and cluster_specific entries are deleted below, so the
               # caller's hash is modified.
89             my $pkg = shift;
90             my $args =
91             scalar(@_) == 1 && ref($_[0]) eq 'HASH' ? shift
92             : scalar(@_) % 2 == 0 ? { @_ }
93             : croak("HPCI->group() requires a hashref or a hash in list form");
94             # copy the default attributes as a start
95             my $use_args = {};
96             _merge_hash( $use_args, $default_attrs );
97              
98             # pull out the env_keys (if any)
99             my @keys = keylist( $use_args, $args );
                # collect the env_key_specific hashes that exist, defaults first and then args
100             my @key_specific = map { $_->{env_key_specific} // () } $use_args, $args;
101              
102             # merge any specified env_key list that has a value available
                # keys are applied in @keys order, so a later key overrides an earlier one
103             for my $key (@keys) {
104             for my $key_spec (@key_specific) {
105             if (my $spec_args = $key_spec->{$key}) {
106             _merge_hash( $use_args, $spec_args );
107             }
108             }
109             }
110              
                # an explicit cluster argument wins over one from the defaults or env_key_specific
111             my $cluster = $args->{cluster} // $use_args->{cluster}
112             // croak("HPCI->group() requires a cluster key in the argument hash");
113              
                # cluster_specific is removed from both hashes so it is not passed on to the
                # constructor; only the entry for the chosen cluster (if any) is merged in
114             for my $arg_set ($use_args, $args) {
115             if (my $spec_args = delete $arg_set->{cluster_specific}) {
116             _merge_hash( $use_args, $spec_args->{$cluster} // {} );
117             }
118             }
                # the explicit arguments are merged last so that they take precedence
119             _merge_hash( $use_args, $args );
                # load the cluster driver's Group class at run time and delegate construction to it
120             my $clmod = "HPCD::${cluster}::Group";
121             load $clmod;
122             return $clmod->new($use_args);
123             }
124              
125              
126             sub _trigger_mkdir {
                # Create the directory $dir and log one info message for each item
                # that mkpath returns.
                # NOTE(review): presumably mkpath returns the directories it actually
                # created, so nothing is logged when $dir already exists - confirm.
127             my $self = shift; # an object with a log
128             my $dir = shift; # a Path::Class::Dir object
129             $self->info( "Created directory: $_" ) for $dir->mkpath;
130             }
131              
132             =head1 NAME
133              
134             HPCI
135              
136             =head1 VERSION
137              
138             Version 0.51
139              
140             =cut
141              
142             our $VERSION = '0.51'; # keep in sync with the VERSION section of the POD above
143              
                # True if the optional site module HPCI::LocalConfig was loaded, false otherwise.
144             our $LocalConfigFound;
145              
                # try to load HPCI::LocalConfig at module load time; undef means no minimum version
146             $LocalConfigFound = can_load( modules => { 'HPCI::LocalConfig' => undef });
147              
148             if (!$LocalConfigFound) {
149             my $err = $Module::Load::Conditional::ERROR;
                # stay silent for the "Could not find or check module" message (presumably the
                # module is simply not installed); report any other load failure on STDERR
150             if (defined $err && $err !~ /^Could not find or check module /) {
151             print STDERR "Conditional load of HPCI::LocalConfig failed. Error is:\n";
152             print STDERR "$err\n";
153             }
154             }
155              
156             =head1 SYNOPSIS
157              
158             use HPCI;
159              
160             my $group = HPCI->group(
161             cluster => ($ENV{HPCI_CLUSTER} // 'uni'),
162             ...
163             );
164             $group->stage(
165             name => 'analysis_A',
166             command => '...'
167             );
168             $group->stage(
169             name => 'analysis_B',
170             command => '...'
171             );
172             $group->stage(
173             name => 'analysis_C',
174             command => '...'
175             );
176             $group->stage(
177             name => 'report',
178             command => '...'
179             );
180             $group->add_deps(
181             pre_reqs => [ qw(analysis_A analysis_B analysis_C) ],
182             dep => 'report'
183             );
184              
185             my $status_info = $group->execute;
186              
187             my $exit_status = 0;
188             for my $stage ( qw(analysis_A analysis_B analysis_C report) ) {
189             if (my $stat = $status_info->{$stage}[-1]{exit_status}) {
190             $exit_status ||= $stat;
191             print STDERR "Stage $stage failed, status $stat!\n";
192             }
193             }
194              
195             exit($exit_status); # 0 only if all stages completed without error
196              
197             =head1 OVERVIEW
198              
199             HPCI (High Performance Computing Interface) provides an interface to
200             a range of types of computer aggregations (clusters, clouds, ...).
201             (The rest of this document will use I<cluster> henceforth to refer
202             to any type of aggregation that is supported by HPCI.)
203              
204             A cluster is defined as a software interface that allows running
205             multiple programs on separate compute elements (nodes).
206              
207             HPCI uses an HPCD (High Performance Computing Driver) module
208             to translate its standard interface into the appropriate access
209             mechanisms for the type of cluster that is selected. (If you have
210             used the DBI/DBD modules for accessing databases, this will seem
211             very familiar.)
212              
213             The goal of this HPCI/HPCD split is to allow users to write
214             programs that make use of cluster facilities in a portable manner.
215             If there is a reason to run the same program using a different
216             type of cluster, it should only require changing the cluster
217             definition attributes provided to one parent object creation; the
218             rest of code need not know or care about the changed cluster type.
219             Programs which are likely to be run on different cluster types will
220             usually be written to get the cluster attribute information from
221             a configuration file, or command line arguments - so the program
222             itself need not change at all.
223              
224             Running a program on different types of clusters can happen for a
225             number of reasons. An organization might have access to multiple
226             types of cluster, such as an in-house cluster plus an external cloud.
227             Scholarly research often shares programs both to allow similar
228             research and to validate existing research results.
229              
230             HPCD modules can provide cluster-specific extensions. That can
231             either be a different kind of functionality, or it can be as simple
232             as allowing the terminology familiar to users of that cluster type
233             to be used in place of the generic terminology provided by HPCI.
234             However, using such extensions makes it harder to move to a
235             different cluster type. So, actually making use of such extensions
236             must be considered carefully.
237              
238             =head1 The life cycle of a B<group>
239              
240             A B<group> is the main mechanism for using HPCI. It is an object that
241             manages a group of computation steps (called B<stage>s), distributing them
242             across the cluster and keeping track of various housekeeping details like
243             when each stage can be run, checking for the result of each completed stage
244             run, deciding whether a failure should cause a stage to be retried or to
245             prevent other stages from being executed, and collecting the status for each
246             stage.
247              
248             The life cycle of running a group of commands on a cluster is:
249              
250             =over 4
251              
252             =item create group
253              
254             A B<group> object is created using the HPCI "class method" B<group>.
255             HPCI isn't really a class, it just appears to be one. Its B<group>
256             "class method" actually delegates creation of a group object to
257             the HPCD module that is indicated by the I<cluster> attribute
258             and it returns a cluster-specific group object that supports the
259             HPCI interface.
260              
261             =item create stages
262              
263             A B<stage> is created for each command that is to be executed on a
264             separate node of the cluster. This is created using the B<group>
265             object's method B<stage>.
266              
267             =item define dependency ordering between the stages
268              
269             An important reason for running a group of jobs on a cluster is the
270             ability to use multiple computers to run portions of the computation
271             at the same time, rather than having them compete for the resources
272             of a single computer. However, often some stages will depend
273             upon the output of other stages. Such a dependent stage cannot
274             start executing until all pre-requisite stages have completed.
275             Specifying such dependency requirements is done with the B<group>
276             method B<add_deps>.
277              
278             =item execution
279              
280             Finally, the B<group> method B<execute> will run the entire set
281             of stages. It does not return until all stages have completed (or
282             have been skipped). Each stage will normally be run once, however
283             it is possible for some stages to be retried under some
284             failure conditions.
285             A failure of one stage (after retry possibilities have been exhausted)
286             can be a trigger for
287             completely skipping the execution of other stages. Each separate
288             execution of a stage (original or retry) is managed with an internal object
289             called a job - but a user program won't see job objects directly.
290              
291             As many stages as possible are run simultaneously. This is limited by
292             the specified dependencies, by cluster-specific driver limits, and by
293             user-specified limits on concurrent execution.
294              
295             =back
296              
297             The objects that calling code deals with directly are a group object to
298             manage a group of stages, and a stage object for each separately run job.
299             Internally, there are also job objects for each retry of a stage, and a
300             log object for logging the execution process (alternately, the user can
301             provide their own Log4Perl compatible log object for HPCI to use - this may be
302             of use if you wish to merge logging of multiple groups and/or of other
303             processing within your program together in a single log).
304              
305             There are also some facilities to provide local customization of the standard
306             usage of HPCI (see "Local Customization" below).
307              
308             =head1 Output Tree Layout
309              
310             There are a number of output files and directories created during a group execution.
311              
312             The default layout of these is:
313              
314             <base_dir> "."
315             <group_dir> <base_dir>/<name>-<YYYYMMDD-hhmmss>
316             <log> <group_dir>/<name>.log
317             <stage_dir> <group_dir>/<stage_name>
318             <script_file> <stage_dir>/script.sh
319             <job_dir> <stage_dir>/<retry_number>
320             stdout
321             stderr
322             final_retry symlink to final <job_dir>
323              
324             Many of these files/directories can be re-assigned to different
325             location using group or stage attributes - shown above is the
326             default layout. Commonly, you will specifically use the I<base_dir>
327             attribute to choose a location other than the current directory for
328             placing the tree; or else use the I<group_dir> attribute if you want
329             to choose a location that does not create a sub-directory for you.
330             (If this is an already existing directory that is being re-used you
331             may end up with a mixture of old and new contents that are hard to
332             figure out.)
333              
334             =over 4
335              
336             =item base_dir
337              
338             The top level of all the generated output. It defaults to ".",
339             but can be specified explicitly when the group is created with
340             the attribute B<base_dir>.
341              
342             =item group_dir
343              
344             By default, a new directory is created under B<base_dir>. Its name
345             is I<name>-I<YYYYMMDD>-I<hhmmss> - the name of the group along with
346             a timestamp of when the execution started. This can be over-ridden
347             when the group is created by providing the group attribute B<group_dir>.
348              
349             =item log
350              
351             The automatically provided log is written to the file I<"group.log">
352             directly under I<group_dir>. This logs information about the
353             execution of the entire group of stages. See B<Logging Attributes
354             of group object> below for ways of changing the default setting.
355              
356             =item stage_dir
357              
358             Each stage creates a sub-directory beneath I<group_dir> with the
359             same name as the stage. An alternate name can be used by providing
360             the B<dir> attribute when the stage object is created.
361              
362             =item script_file
363              
364             The script created to be executed on the cluster node. This wraps
365             the specified command with additional logic to pass on environment
366             and config info, and to set output redirection. It is called
367             "script.sh" and placed in I<stage_dir>.
368              
369             =item job_dir
370              
371             A sub-directory is created under I<stage_dir> for each attempt to
372             run the command. Usually, there will only be a single attempt.
373             However, if the cluster driver provides mechanisms for detecting
374             recoverable issues and then retries a command there can be more
375             than one attempt; or alternately, if a pre-requisite stage
376             fails there might be no attempt made (in that case, though,
377             the entire I<stage_dir> directory would not even get created).
378             These directories are simply named with the retry number ("0",
379             "1", ...).
380              
381             =item stdout/stderr
382              
383             Within each I<job_dir>, the files "stdout" and "stderr" collect
384             the standard output and standard error output from that (re)try
385             attempt to run the command.
386              
387             =item final_retry
388              
389             A symlink named "final_retry" is created within I<stage_dir> that
390             points to the I<job_dir> of the final (re)try. Since you often
391             don't care as much about the initial run tries as you do about the
392             last one, this symlink provides a consistent access path to that
393             final retry.
394              
395             =back
396              
397             =head1 HPCI "Class" Methods
398              
399             You can pretend that B<HPCI> is a class with one primary class
400             method named B<group>.
401              
402             There are a few other class methods used for localization purposes; they
403             are described below in "Local Customization".
404              
405             =head2 B<group> method
406              
407             The B<group> method creates and returns a group object, which
408             you can treat like a B<HPCI::Group> object. (In fact, it really
409             returns an object of class B<HPCD::I<cluster>::Group>, but if you
410             ignore that fact then you can trivially have your program run on
411             some other cluster type.)
412              
413             =head2 B<group> object
414              
415             The descriptions of attributes and methods for the B<group> object given here describe
416             the generic attributes and how they are treated for all cluster types.
417             Individual cluster drivers can modify this behaviour and can provide
418             additional attributes and methods for cluster-specific purposes.
419              
420             =head3 Cluster-Related Attributes of B<group> object
421              
422             The one necessary attribute is B<cluster>. For some specific
423             cluster types there may be additional attributes required for
424             connecting to the cluster software (authentication, usage
425             class info, etc.).
426              
427             =over 4
428              
429             =item cluster
430              
431             The B<cluster> attribute specifies which type of cluster is to be used.
432             This is the only required attribute. (Some cluster types may have
433             additional attributes that are required for specifying connection
434             and authentication info.)
435              
436             =item cluster_specific
437              
438             The attribute B<cluster_specific> is optional. If provided, it should
439             contain a hashref of hashrefs. If the value specified for the I<cluster>
440             attribute is present as a key in the B<cluster_specific>
441             hash, the corresponding value will be used as a set of attribute values
442             when the group is created. Its elements will replace or augment any values
443             for the same attribute name provided to the group method. This will normally be
444             used if the program can be dynamically configured for different cluster
445             types, and there are different arg settings required for the different
446             types of cluster.
447              
448             =back
449              
450             =head3 Basic Attributes of B<group> object
451              
452             =over 4
453              
454             =item name
455              
456             The B<name> you give to a group is used for creating the directory
457             where output is stored, and also in log messages. A default name
458             "default_group_name" is provided if you do not specify an explicit
459             name. Using the default name is adequate in simple programs which
460             only create one group, but for more complicated programs giving
461             separate names to each group is necessary to easily identify the
462             output of each group. The value of B<name> may also be used by
463             the cluster-specific driver to provide an identifier name (or the
464             basis of one) to the underlying cluster, if it needs one.
465              
466             =item stage_defaults
467              
468             The attribute B<stage_defaults> is optional. If provided, it should
469             contain a hashref. This hash will be used as default values for
470             every stage created by this group.
471              
472             =back
473              
474             =head3 Directory Layout Attributes of B<group> object
475              
476             =over 4
477              
478             =item base_dir
479              
480             If none of the other directory layout attributes are used to
481             over-ride this, this attribute specifies the directory in which
482             all output directories and files will be created. This is
483             usually an existing directory; it defaults to the current
484             directory ".".
485              
486             =item group_dir
487              
488             This directory is usually created to contain the outputs of the
489             group execution. By default, it is directly under B<base_dir> with
490             a name that consists of the group name attribute and a timestamp
491             (e.g. "T_Definition-20150521-153256").
492              
493             If you provide an explicit value for this parameter, then it
494             should not be an existing directory containing previous results.
495             (If it is, the log file will be appended to the previous one, but
496             the stage directories will over-write equivalently named directories
497             and files that are created in this run, while leaving unchanged any
498             that did not recur, so you'll have a mix of old and new contents.)
499             The names of files and directories created under B<group_dir> are
500             chosen to be consistent and easy to find automatically.
501              
502             =back
503              
504             =head3 Logging Attributes of B<group> object
505              
506             An HPCI group logs its activities using a Log::Log4perl logger.
507             The logger can either be provided by the caller, or else HPCI will
508             create its own.
509              
510             =over 4
511              
512             =item log
513              
514             This is a Log::Log4perl::Logger object. If it is provided as an
515             attribute to the B<group> creation call, it will be used as it is,
516             and the other logging attributes will be ignored.
517              
518             If it is not provided by the user, a new Log::Log4perl::Logger
519             object will be created using the attributes below to define where
520             it is logged to. This created logger will send all log entries to
521             a file, as well as sending all info and higher log entries to stderr.
522              
523             =item log_path
524              
525             If this attribute is provided (and the B<log> attribute is not
526             provided) it will be used as the full pathname of a file where the
527             log will be written. If it is not provided, it will use the path
528             B<log_dir>/B<log_file> by default.
529              
530             =item log_dir
531              
532             If neither B<log> nor B<log_path> is provided, this attribute can
533             be used to specify the directory where the log file is to be written.
534             By default, it uses B<group_dir>.
535              
536             =item log_file
537              
538             If neither B<log> nor B<log_path> is provided, this attribute can
539             be used to specify the file name to be written in the log directory.
540             By default, it uses the constant name "group.log".
541              
542             =item log_level
543              
544             You can provide this attribute to change the default log level setting from "info" to any of I<debug info warn error fatal>.
545              
546             =item log_no_stderr, log_no_file
547              
548             Normally, the default log is written to both stderr and to the log file.
549             Either of those can be suppressed by setting the corresponding attribute to a true value.
550             These attributes have no effect if the user provides their own logger instead of using the default one.
551              
552             =back
553              
554             =head3 Operational Attributes of B<group> object
555              
556             =over 4
557              
558             =item max_concurrent
559              
560             This attribute specifies the maximum number of stages that will
561             be executing at one time. The default setting of 0 allows as
562             many stages as possible (all those that are not waiting for a
563             pre-requisite stage to complete) to run at the same time.
564              
565             =item status
566              
567             This attribute is set internally while stages are executed.
568             It contains the final result status from each stage run that
569             has completed. The B<execute> method returns this value when
570             execution completes, so you will usually not need to access it
571             explicitly yourself.
572              
573             This value is a hashref (indexed by stage name). The values are
574             arrayrefs (indexed by run number 0..n). For each run, there is
575             a hash. The key B<exit_status> contains the exit status of the run.
576             If the stage was never run, B<exit_status> instead contains a text
577             message listing the reason that it was skipped.
578              
579             =back
580              
581             =head3 Environment Passing Attributes of B<group> object
582              
583             You can set up a set of environment variables that will be provided to
584             all stages. (You can also set variables that are only for individual
585             stages - if so, they will modify any set you provide in the group.)
586              
587             See B<HPCI::Env> for a description of these.
588              
589             =head3 Method B<stage> of B<group> object
590              
591             The method B<stage> is used to create a new stage object.
592             Its characteristics are described below.
593              
594             The B<group> object keeps track of all B<stage> objects created
595             within that group so that they can all be managed properly when the
596             B<execute> method is invoked.
597              
598             =head3 Method B<add_deps> of B<group> object
599              
600             The method B<add_deps> is used to specify pre-requisite/dependent
601             relationships. It takes either a hashref or a list containing
602             pairs. One of the keys must be either B<pre_req> or B<pre_reqs>,
603             another must be either B<dep> or B<deps>.
604              
605             The value for each of these keys can be either a scalar, or an arrayref
606             of scalar values. A scalar value can be either a B<stage> object (a reference),
607             the exact name of a stage object (a string), or a pattern that matches
608             the name of zero or more stages (a regexp).
609              
610             HPCI will ensure that the stage or all of the stages specified for pre_req
611             or pre_reqs have completed execution before any of the dep (or deps) stages
612             is allowed to start executing.
613              
614             The plural forms are provided for convenience - often the output
615             file from one preparation stage is required by many others, or the
616             output from many processing stages is needed by a stage that merges
617             results into a summary report. Rather than having to loop over the
618             pre_reqs and deps and calling B<add_deps> individually for every
619             individual dependency, a single call will handle the entire combination.
620              
621             Allowing a regexp to match no stages at all makes it possible to write
622             an add_deps call for stages that are optional - no dependency will be
623             added if the optional stage was not created this run.
624              
625             While it is recommended for code readability that you use the singular
626             form (B<dep> or B<pre_req>) if you are providing a single stage, and the
627             plural form (B<deps> or B<pre_reqs>) if you are providing a list of
628             stages, either can be used.
629              
630             The B<add_deps> method can be called multiple times. HPCI will
631             accumulate the dependencies appropriately.
632              
633             It is an error to provide a sequence of dependencies that form
634             a cycle in which a stage directly or indirectly has itself as a
635             pre-requisite. (Such a stage could never run. HPCI will detect
636             when all remaining stages are blocked by pre-requisites and abort,
637             but that might be after numerous stages have already been executed.)
638              
639             =head3 Method B<execute> of B<group> object
640              
641             The B<execute> method is the final goal of building the group.
642             It schedules the execution of individual stages. It waits for
643             pre-requisites before running a stage. It provides for re-running
644             a stage if a soft failure has occurred that allows a retry. If a
645             failure that cannot be retried occurs, it can skip scheduling dependent
646             stages, or even stop scheduling all new stages.
647              
648             =head2 Stage Object
649              
650             =head3 Attributes
651              
652             =over 4
653              
654             =item name
655              
656             A unique B<name> attribute must be provided for stages. It is a string.
657             There is no default value provided.
658              
659             =item command
660              
661             The B<command> attribute must be provided before the group is
662             executed. It can either be provided as a string attribute when the
663             stage is created, or by using one of
664             the command-setting methods provided by the stage class.
665              
666             See B<HPCI::Stage> for more details about the command setting
667             methods.
668              
669             =item dir
670              
671             The B<dir> attribute is optional. It specifies the directory
672             in which files related to the stage are placed. By default,
673             it is I<group_dir>/I<stage_name>. You will usually not need to
674             change this.
675              
676             =item cluster
677              
678             The B<cluster> attribute is automatically passed on from the B<group>
679             to each B<stage>. You are not likely to need this.
680              
681             =item group
682              
683             The B<group> that created a stage is automatically passed on (as a weak
684             reference) to the stage. You are not likely to need to use this attribute
685             in user code.
686              
687             =item resources_required
688              
689             =item retry_resources_required
690              
691             The B<resources_required> and B<retry_resources_required> are used to
692             define resources that will be required by the stage when it executes.
693             These attributes are somewhat cluster specific - each cluster has
694             its own set of requirements for how a job submission must specify
695             the sort of resources that it will require.
696              
697             The B<resources_required> attribute is a hash, specifying the
698             value for each resource that is to be considered.
699              
700             The B<retry_resources_required> attribute is also a hash. For
701             each resource, you can specify an array of values. If the cluster
702             driver is able to detect that a run failed because the resource
703             was inadequate, it will retry the run with the next larger value
704             from this list.
705              
706             See B<HPCI::Stage> for more details about resources.
707              
708             =item force_retries
709              
710             This attribute specifies an integer number of times to retry the
711             stage before concluding that it has actually failed. You might use
712             this if your cluster has some nodes that work differently from
713             others and a stage might fail on one type of node but succeed on
714             another.
715              
716             These retries are done after any cluster-specific retry mechanisms
717             have been used.
718              
719             The default value for this attribute is 0 (zero), giving no forced
720             retries unless you specifically ask for them.
721              
722             =item failure_action ('abort_group', 'abort_deps'*, or 'ignore')
723              
724             Specifies the action to take if this stage fails (terminates with
725             a non-zero status).
726              
727             There are three string values that it can have:
728              
729             =over 4
730              
731             =item - abort_deps (default)
732              
733             If the stage fails, then any stages which depend upon it
734             (recursively) are not run. The group continues executing until
735             all stages which are not dependent upon this stage (including those
736             that have not yet been initiated) complete execution.
737              
738             =item - abort_group
739              
740             If the stage fails, then no other stages are started. The group
741             simply waits until stages that have already been started complete
742             and then returns.
743              
744             =item - ignore
745              
746             Execution continues unchanged, any dependent stages will be run when they are
747             no longer blocked.
748              
749             =back
750              
751             =item abort_group_on_failure abort_deps_on_failure ignore_failure
752              
753             As an alternative to providing a value to the failure_action attribute
754             when you create a stage, you can instead provide one of the pseudo-attributes
755             'abort_group_on_failure', 'abort_deps_on_failure', or 'ignore_failure' with
756             a true value to specify 'abort_group', 'abort_deps', or 'ignore' respectively.
757              
758             =item state
759              
760             The B<state> is mostly an internal attribute but after the group has
761             finished execution you can use this to check whether the stage was
762             run successfully. After execution, B<state> will either be 'pass' or
763             'fail'.
764              
765             =item Environment passing attributes
766              
767             You can set up a set of environment variables that will be provided to
768             this stage. It will use the set defined for the group as a basis (if such a set was
769             defined for the group), but that set can be changed for individual stages
770             or you can have no group default and only provide a set to specific stages
771             as needed. See B<HPCI::Env> for further details.
772              
773             =back
774              
775             =head3 Methods
776              
777             =head4 command creation
778              
779             There are a number of helper methods to assist in building different
780             types of commands to be provided for the B<command> attribute.
781             See B<HPCI::Stage> for details.
782              
783             =head1 Local Configuration
784              
785             TODO: write this section
786             - describe the HPCI::LocalConfig module
787             - describe the mechanism for adding extra roles to group, stage, etc.
788              
789             =head1 Additional
790              
791             This is an early public release of HPCI, and at present, there are
792             only two drivers available.
793              
794             Only one cluster type is directly included within the HPCI package.
795             The cluster type B<HPCD::uni> runs on a "cluster" of only one
796             machine. It simply uses fork to submit individual stages and has
797             facility for retries and timeouts. This is the default cluster
798             type used for testing, as it will work natively on all types of
799             Unix systems. It is also possible to use this driver as a fallback,
800             in cases where the only available "real" cluster is not accessible
801             for some reason.
802              
803             Additionally, there is the B<HPCD::SGE> driver available on CPAN.
804             It has seen heavy use within Boutros Lab.
805              
806             Now that these packages have been released, it is likely new
807             cluster drivers will be written. People interested in developing
808             drivers for additional cluster types should contact the authors
809             of this package to co-ordinate releases, features needed, etc. at
810             B<mailto:BoutrosLabSoftware@oicr.on.ca>.
811              
812             Additionally, you may wish to subscribe to the email list mentioned
813             at B<https://lists.oicr.on.ca/mailman/listinfo/hpci-discuss>.
814             This is expected to be a low volume discussion group, although the
815             future will tell what the actual volume will be.
816              
817             As additional capabilities of new cluster types are addressed, and as
818             different control needs used at other organizations are identified,
819             this interface will surely change. As far as possible, such changes
820             will be done in an upwardly compatible manner, but until a few more
821             drivers have been integrated there is the possibility of changes
822             that are not fully backward compatible. Watch the release notes
823             for warnings of such issues. At some point there will be a 1.0.0
824             release, at which point this expectation of (limited) incompatible
825             future change will be dropped. After that point, incompatible
826             changes will only be made for critical reasons.
827              
828             The reasons for separate distribution of cluster-specific HPCD
829             packages are fairly obvious:
830              
831             =over 4
832              
833             =item -
834              
835             The maintainers of the HPCI package do not have access to every
836             possible cluster type, and it is unlikely that anyone will have access
837             to all supported cluster types from one location, so the driver
838             modules will need to be tested separately anyhow.
839              
840             =item -
841              
842             A user of HPCI is equally not going to have need to access every
843             type of cluster that exists, so they will probably prefer to only
844             download the driver modules that they actually need.
845              
846             =back
847              
848             =head1 SEE ALSO
849              
850             =over 4
851              
852             =item HPCI::Group
853              
854             Describes the interface common to all B<HPCI Group>
855             objects, regardless of the particular type of cluster that
856             is actually being used to run the stages. In the future, the
857             common interface may change somewhat as support for additional
858             cluster types is added and a better understanding of the common
859             features is achieved.
860              
861             =item HPCI::Stage
862              
863             Describes the interface common to all
864             B<HPCI Stage> objects, regardless of the
865             particular type of cluster that is actually being used to
866             run the stages. The common interface may change somewhat
867             as support for additional cluster types is added and a better
868             understanding of the common features is achieved.
869              
870             =item HPCI::Logger
871              
872             Describes the logger parameters in more detail.
873              
874             =item HPCI::Env
875              
876             Describes the environment passing parameters in more detail.
877              
878             =item HPCD::I<$cluster>::Group
879              
880             Describes the group interface unique to a specific type of cluster,
881             including any limitations or extensions to the generic interface.
882              
883             =item HPCD::I<$cluster>::Stage
884              
885             Describes the stage interface unique to a specific type of cluster,
886             including any limitations or extensions to the generic interface.
887              
888             =back
889              
890             =head1 AUTHOR
891              
892             Christopher Lalansingh - Boutros Lab
893              
894             John Macdonald - Boutros Lab
895              
896             =head1 ACKNOWLEDGEMENTS
897              
898             Paul Boutros, PhD, PI - Boutros Lab
899              
900             The Ontario Institute for Cancer Research
901              
902             =cut
903              
904             1;
905