File Coverage

blib/lib/Cluster/Init.pm
Criterion Covered Total %
statement 19 21 90.4
branch n/a
condition n/a
subroutine 7 7 100.0
pod n/a
total 26 28 92.8


line stmt bran cond sub pod time code
1             package Cluster::Init;
2             #
3              
4             #
5             # The Design
6             # ==========
7             #
8             # A collection of event-driven DFA or finite state machines; each machine
9             # is its own object.
10             #
11             # Daemon machine started first, daemon starts group machines, group machines
12             # start process machines, process machines start and stop processes.
13             #
14             # Client talks to daemon via UNIX domain socket.
15             #
16 17     17   154147 use strict;
  17         53  
  17         879  
17 17     17   101 use warnings;
  17         37  
  17         620  
18 17     17   18407 use Data::Dump qw(dump);
  17         186018  
  17         1519  
19 17     17   19502 use Carp::Assert;
  17         25162  
  17         128  
20 17     17   31400 use IO::Socket;
  17         571683  
  17         95  
21 17     17   29036 use POSIX qw(:signal_h :errno_h :sys_wait_h);
  17         167268  
  17         133  
22 0           use IPC::LDT qw(
23             LDT_OK
24             LDT_CLOSED
25             LDT_READ_INCOMPLETE
26             LDT_WRITE_INCOMPLETE
27 17     17   65083 );
  0            
28             use Cluster::Init::DB;
29             use Cluster::Init::Conf;
30             use Cluster::Init::Util qw(debug);
31             use Cluster::Init::Daemon;
32             use base qw(Cluster::Init::Util);
33              
34             our $VERSION = "0.215";
35              
36             my $debug=$ENV{DEBUG} || 0;  # debug flag taken from the DEBUG environment variable (0 when unset or false)
37              
38             my $cltab="/etc/cltab";  # default path of the cltab configuration file; read by getconf()
39              
40              
41             =head1 NAME
42              
43             Cluster::Init - Clusterwide "init", spawn cluster applications
44              
45             =head1 SYNOPSIS
46              
47             use Cluster::Init;
48              
49             unless (fork())
50             {
51             Cluster::Init->daemon;
52             exit 0;
53             }
54              
55             my $client = Cluster::Init->client;
56              
57             # spawn all apps for resource group "foo", runlevel "run"
58             $client->tell("foo","run");
59              
60             # spawn all apps for resource group "foo", runlevel "runmore"
61             # (this stops everything started by runlevel "run")
62             $client->tell("foo","runmore");
63              
64             # spawn all apps for resource group "bar", runlevel "3"
65             # (this does *not* stop or otherwise affect anything in "foo")
66             $client->tell("bar",3);
67              
68             =head1 DESCRIPTION
69              
70             This module provides basic B<init> functionality, giving you a single
71             inittab-like file to manage initialization and daemon startup across a
72             cluster or collection of machines.
73              
74             This module is used by B<OpenMosix::HA>, for instance, to provide high
75             availability with failure detection, automatic migration, and restart
76             of applications running in a cluster. B<OpenMosix::HA> provides you
77             with the ability to build 24x7 mission-critical, high-performance
78             server farms using only commodity hardware. See L.
79              
80             I wrote the original version of this module to provide a more flexible
81             interface between IBM's AIX HACMP cluster manager and managed
82             applications. This provided a cleaner configuration, much faster
83             configuration changes, and respawn ability for individual daemons.
84              
85             Other uses are possible, including non-cluster environments -- use
86             your imagination. Generically, what you get in this package is an
87             application-level "init" written in Perl, with added ability to
88             configure resource groups, status file output, and a 'test' runmode
89             (see below).
90              
91             Commercial support for this module is available: see L.
92              
93             =head1 QUICK START
94              
95             See L for cluster management
96             techniques, including clean ways to install, replicate, and update
97             nodes.
98              
99             See L for an explanation of terms.
100              
101             Much of the following work is done for you if you're running
102             B on an openMosix cluster -- see L.
103              
104             To use B (without B) to manage your
105             cluster-hosted processes, on either a high-throughput computing
106             cluster or a high-availability cluster:
107              
108             =over 4
109              
110             =item *
111              
112             Install B<Cluster::Init> on each node.
113              
114             =item *
115              
116             Create L<"/etc/cltab">.
117              
118             =item *
119              
120             Replicate L<"/etc/cltab"> to all nodes.
121              
122             =item *
123              
124             Run 'C<clinit -d>' on each node. Putting this in F</etc/inittab> as a
125             "respawn" process would be a good idea, or you could have it started
126             as a managed process under HACMP, VCS, Linux-HA etc.
127              
128             =item *
129              
130             Run 'C' on each node where you want resource
131             group I to be running at runlevel I.
132              
133             =item *
134              
135             Check current status in L<"/var/run/clinit/clstat"> on each node. (Or
136             use B, which collates this for you across all nodes.)
137              
138             =back
139              
140             =head1 INSTALLATION
141              
142             Use Perl's normal sequence:
143              
144             perl Makefile.PL
145             make
146             make test
147             make install
148              
149             You'll need to install this module on each node in the cluster.
150              
151             This module includes a script, L, which will be installed when
152             you run 'make install'. See the output of C to
153             find out which directory the script is installed in.
154              
155             =head1 CONCEPTS
156              
157             =over 4
158              
159             =item Cluster
160              
161             A group of machines administered as a single unit and offering a
162             common set of services. See I,
163             I, and I.
164              
165             =item Computing Cluster
166              
167             See I.
168              
169             =item Enterprise Cluster
170              
171             A well-administered B (see
172             L), in which each machine, whether
173             desktop or server, provides scalable commodity services. Any machine
174             or group of machines can be easily and quickly replaced, with
175             minimal user impact, without restoring from backups, with no advance
176             notice or unique preparation. May include elements of both I
177             availability> and I clusters.
178              
179             =item High-Availability Cluster
180              
181             (Also B.) A cluster of machines optimized for providing
182             high uptime and minimal user impact in case of hardware failure, in
183             return for increased per-node expense and complexity. Normally
184             includes shared disk, unattended failover of filesystem mounts and IP
185             and MAC addresses, and automatic daemon restart on the surviving
186             node(s). Suitable for applications such as NFS and database servers,
187             and other services which normally cannot be replicated easily.
188              
189             Examples of HA cluster platforms include OpenMosix::HA, Linux-HA, AIX
190             HACMP, and Veritas VCS.
191              
192             Due to the expense of providing the per-node redundancy required for
193             high availability, HA clusters are normally not scalable to the
194             hundreds of nodes typically needed for high-throughput applications.
195             OpenMosix::HA is the exception to this rule; it provides an HA layer
196             on top of a high-throughput openMosix cluster.
197              
198             =item High-Throughput Cluster
199              
200             A cluster of machines optimized for cheaply delivering large
201             quantities of work in a short time, in return for reduced per-process
202             reliability. May include features such as process checkpointing and
203             migration, high-speed interconnects, or distributed shared memory.
204             Some high-throughput clusters are optimized for scavenging unused
205             cycles on desktop machines. Most high-throughput clusters are
206             suitable for supercomputing-class applications which can be
207             parallelized across dozens, hundreds, or even thousands of nodes.
208              
209             Examples of high-throughput cluster platforms include OpenMosix::HA,
210             openMosix itself, Linux Beowulf, and Condor.
211              
212             Due to the internode dependencies inherent in distributed shared
213             memory or migration of interactive processes, high-throughput clusters
214             normally do not meet the needs of high availability -- they are
215             intended for brute-force problem solving where the death of a single
216             process out of thousands is not significant. High-throughput clusters
217             are not typically designed to provide mission-critical interactive
218             services to the public.
219              
220             The one (known) exception is OpenMosix::HA -- it provides high
221             availability for both interactive and batch processes running on a
222             high-throughput openMosix cluster.
223              
224             =item Resource Group
225              
226             A collection of applications and physical resources (like filesystem
227             mounts) which need to execute together on the same cluster node.
228             Resource groups allow easy migration of applications between nodes.
229              
230             Cluster::Init supports resource groups explicitly. Resource groups
231             are configured in L<"/etc/cltab">.
232              
233             For example, B, F, and the
234             F directory might make up a resource group -- they
235             all need to be present on the same node. From L<"/etc/cltab">, you
236             could spawn the scripts which update F, mount F,
237             and then start B itself.
238              
239             Another example; Apache, a virtual IP address, and the filesystem
240             containing the HTML document tree might together constitute a resource
241             group. To start this resource group, you might need to mount the
242             filesystem, ifconfig the virtual IP, and start httpd. This sequence
243             can easily be specified in F</etc/cltab>.
244              
245             =back
246              
247             =head1 UTILITIES
248              
249             =head2 clinit
250              
251             Cluster::Init includes B, a script which is intended to be a
252             bolt-in cluster init tool. The script is called like C or
253             C, with the addition of a new "resource group" argument. See
254             the output of C.
255              
256             The first time you execute B you will need to use the C<-d>
257             flag only, to start the B daemon. This flag does not
258             automatically background the daemon though -- this is so it will work
259             as a "respawn" entry in F. If you're testing from the
260             command line or running from a shell script, use 'C'.
261              
262             Once you have the daemon running, use B I the C<-d>
263             flag. This will cause it to run as a client only, talking to the
264             daemon via a UNIX domain socket. At this point you will use B
265             in roughly the same way you would use the UNIX B, in this
266             case commanding resource groups to switch to different runlevels.
267             That's it!
268              
269             Use the C<-k> flag to tell the daemon and all child processes to shut
270             down gracefully.
271              
272             =head1 PUBLIC METHODS
273              
274             =head2 daemon()
275              
276             # start a Cluster::Init server daemon
277             Cluster::Init->daemon (
278             'cltab' => '/etc/cltab',
279             'socket' => '/var/run/clinit/clinit.s',
280             'clstat' => '/var/run/clinit/clstat'
281             );
282              
283             The server-side constructor. You'll likely want to fork before
284             calling this method -- it does not return until you issue a
285             L from a L process. See the L source code
286             for an example.
287              
288             Accepts an optional hash containing the paths to the configuration
289             file, socket, and status output file. You can also specify 'socket'
290             and 'clstat' locations in L.
291              
292             The daemon opens and listens on a UNIX domain socket,
293             L by default. The L will
294             communicate with the daemon via this socket.
295              
296             =cut
297              
# daemon(%args) -- server-side constructor and entry point.
#
# Blesses the caller's key/value pairs into an object of the invoking
# class, builds a configuration through getconf() in 'server' context
# (the caller's arguments are forwarded after the context pair), passes
# that configuration to Cluster::Init::Daemon->new, and then runs the
# event loop via loop().  Returns 1 after loop() returns.
sub daemon
{
    my ($class, @args) = @_;
    my $self = bless +{ @args }, $class;

    my $conf = $self->getconf(context => 'server', @args);

    # The object returned by the Daemon constructor is not retained here.
    Cluster::Init::Daemon->new(conf => $conf);

    $self->loop();
    return 1;
}
308              
309             =head2 client()
310              
311             # create a Cluster::Init client object
312             my $client = Cluster::Init->client (
313             'cltab' => '/etc/cltab',
314             'socket' => '/var/run/clinit/clinit.s',
315             'clstat' => '/var/run/clinit/clstat'
316             );
317              
318             The client-side constructor.
319              
320             Accepts an optional hash containing the paths to the configuration
321             file, socket, and status output file. You can also specify 'socket'
322             and 'clstat' locations in L.
323              
324             Returns a B object. You'll normally call the resulting
325             object's L method one or more times after this. See the
326             L source code for example usage.
327              
328             The client looks for the L on a UNIX domain socket,
329             L by default.
330              
331             =cut
332              
# client(%args) -- client-side constructor.
#
# Blesses the caller's key/value pairs into an object of the invoking
# class, builds a configuration through getconf() in 'client' context
# (the caller's arguments are forwarded after the context pair), and
# stores the configured 'socket' value in $self->{socket}, replacing any
# 'socket' value passed by the caller.  Returns the new object.
sub client
{
    my ($class, @args) = @_;
    my $self = bless +{ @args }, $class;

    my $conf = $self->getconf(context => 'client', @args);
    $self->{'socket'} = $conf->get('socket');

    return $self;
}
342              
343             =head2 tell()
344              
345             # tell resource group "mygroup" to change to runlevel "newlevel"
346             $client->tell("mygroup", "newlevel");
347              
348             # cause Cluster::Init daemon to re-read cltab
349             $client->tell(":::ALL:::", ":::REREAD:::");
350              
351             Tells a running L to change a resource group to a new runlevel.
352             Called as a method on an object returned by L. See the
353             L source code for example usage.
354              
355             At this time, this method returns a string containing a success or
356             failure message. I don't use this string in B, so it
357             isn't very refined -- it doesn't give you much you can use to detect
358             failure programmatically, for example. For a better solution, see
359             L.
360              
361             The C<tell(":::ALL:::", ":::REREAD:::")> usage is only a convention;
362             in fact, any call to C with true values for group and level
363             will cause a re-read, regardless of whether the values provided match
364             any actual group or runlevel.
365              
366             =cut
367              
368              
# tell($group, $level) -- send a group/runlevel command to the daemon.
#
# Connects to the UNIX domain socket whose path is stored in
# $self->{socket}, sends a {group => ..., level => ...} message through
# IPC::LDT in object mode, and waits for a reply.
#
# Returns the 'msg' field of the reply.
# Dies if the socket path is unset or is not a socket (via affirm), if
# the connection cannot be made, or if the connection is reported closed
# (LDT_CLOSED) while waiting for the reply.  A failed send only warns.
sub tell
{
    my ($self, $group, $level) = @_;

    my $socket = $self->{'socket'};
    affirm { $socket };
    affirm { -S $socket };

    # Direct class-method calls instead of indirect object syntax
    # ("new Class (...)"), which perl can mis-parse.
    my $client = IO::Socket::UNIX->new
    (
        Peer => $socket,
        Type => SOCK_STREAM
    ) || die $!;
    my $ldt = IPC::LDT->new(handle=>$client, objectMode=>1);

    # send command
    debug("sending command $group $level");
    $ldt->send({group=>$group,level=>$level}) || warn $ldt->{'msg'};
    debug("command sent");

    # get response: the list assignment is evaluated in scalar context,
    # so the loop repeats until receive() returns at least one element.
    my $res;
    until (($res)=$ldt->receive)
    {
        die $ldt->{msg} if $ldt->{rc} == LDT_CLOSED;
    }
    return $res->{msg};
}
395              
396              
397             =head2 status()
398              
399             # return status of all running groups
400             my $text=$client->status();
401              
402             # filter by group and level
403             my $text=$client->status(group=>'foo',level=>'bar');
404              
405             # provide nonstandard path to clstat
406             my $text=$client->status(group=>'foo',level=>'bar',clstat=>'/tmp/clstat');
407              
408             This method will read L<"/var/run/clinit/clstat"> for you, returning
409             the group status lines it contains as a string. All arguments are optional. If you provide 'group' or
410             'level', then output will be filtered accordingly. If you specify
411             'clstat', then the status file at the given pathname will be read
412             (this is handy if you need to query multiple Cluster::Init status
413             files in a shared cluster filesystem, and is what B
414             does).
415              
416             In addition to the usual $obj->status() syntax, the status() method
417             can also be called as a class method, as in
418             Cluster::Init->status(clstat=>'/tmp/clstat'). The 'clstat' argument
419             is required in this case. Again, this is handy if you want to query a
420             running Cluster::Init on another machine via a shared filesystem, without
421             creating a Cluster::Init object or daemon here.
422              
423             =cut
424              
# status(%parm) -- return the status of running resource groups as text.
#
# Named parameters (all optional when a configuration is loaded):
#   group  - only report the group with this name
#   level  - only report groups currently at this level
#   clstat - path of the status file to read; defaults to the 'clstat'
#            value of the loaded configuration
#
# May be called on an object or as a class method (Class->status(...)).
# When called as a class method no configuration is loaded, so 'clstat'
# must be supplied.
#
# Each line of the status file is split on whitespace into object type,
# group name, level and state; only "Cluster::Init::Group" lines are
# reported.  The group name is omitted from the output when filtering by
# group, the level is omitted when filtering by level, and the trailing
# newline is omitted when filtering by both.
#
# Returns the collected text, or "" if the status file does not exist.
# Dies if no status file path can be determined or the file cannot be
# opened.
sub status
{
    my $self = shift;
    my %parm = @_;

    # allow this to be called as Cluster::Init->status(...)
    $self = bless({}, $self) unless ref($self);

    # Plain assignments: "my $x = ... if ..." has undefined behaviour.
    my $group = $parm{'group'} ? $parm{'group'} : undef;
    my $level = defined($parm{'level'}) ? $parm{'level'} : undef;

    # Only consult the configuration when one has been loaded, so that a
    # class-method call without 'clstat' reaches the explicit error below.
    my $clstat = $parm{'clstat'};
    $clstat ||= $self->conf('clstat') if $self->{conf};
    die "need to specify clstat" unless $clstat;
    return "" unless -f $clstat;

    my $out = "";
    open(my $fh, '<', $clstat) || die $!;
    while (my $line = <$fh>)
    {
        chomp $line;
        my ($obj, $name, $stlevel, $state) = split ' ', $line;
        next unless defined($obj) && $obj eq "Cluster::Init::Group";
        if ($group)
        {
            next unless $group eq $name;
        }
        if (defined($level))
        {
            next unless $level eq $stlevel;
        }
        # Use defined() for the level so that a filter of '0' is treated
        # the same way here as in the test above.
        $out .= "$name " unless $group;
        $out .= "$stlevel " unless defined($level);
        $out .= $state;
        $out .= "\n" unless $group && defined($level);
    }
    close($fh);
    return $out;
}
458              
459             =head2 shutdown()
460              
461             # tell daemon to gracefully stop all child processes and exit
462             $client->shutdown();
463              
464             Causes daemon to stop all child processes and exit. Processes will be
465             sent SIGINT, SIGTERM, then SIGKILL at intervals of several seconds;
466             the daemon will not exit until the last process has stopped -- this
467             method will always return sooner.
468              
469             =cut
470              
# shutdown() -- ask the daemon to shut down.
#
# Sends the ":::ALL:::" / ":::SHUTDOWN:::" group/level pair through
# tell() and returns whatever tell() returns.
sub shutdown
{
    my ($self) = @_;
    return $self->tell(":::ALL:::", ":::SHUTDOWN:::");
}
476              
# getconf(%args) -- build and remember the configuration object.
#
# Creates a Cluster::Init::Conf object, passing the cltab path followed
# by the caller's arguments, stores it in $self->{conf} and returns it.
#
# The cltab path is $self->{cltab} when that is set, otherwise the
# file-level default $cltab.  The default itself is left untouched, so a
# path given to one object does not become the default for objects
# created later in the same process.
sub getconf
{
    my ($self, @args) = @_;
    my $path = $self->{cltab} || $cltab;
    my $conf = Cluster::Init::Conf->new(cltab => $path, @args);
    $self->{conf} = $conf;
    return $conf;
}
485              
# conf($var) -- read one value from the loaded configuration.
#
# Returns the result of calling get($var) on the object stored in
# $self->{conf}.  Returns nothing (undef in scalar context) when no
# configuration has been loaded, instead of failing with a method call
# on an undefined value.
# Dies with "can't set conf here" if any extra argument is passed; this
# accessor is read-only.
sub conf
{
    my $self = shift;
    my $var  = shift;
    die "can't set conf here" if @_;
    my $conf = $self->{conf};
    return unless defined $conf;
    return $conf->get($var);
}
494              
# loop() -- run the event loop.
#
# Calls Event::loop() and passes its result to debug() when that result
# is true.
# NOTE(review): this file does not load the Event module itself;
# presumably another module loads it -- confirm.
sub loop
{
    my $result = Event::loop();
    debug($result) if $result;
}
500              
501             =head1 FILES
502              
503             =head2 /etc/cltab
504              
505             The main B configuration file. Identical in format to
506             F, with a new "resource group" column added. See
507             F in the B distribution for an example.
508              
509             The path and name of this file can be changed: see L and
510             L.
511              
512             This file must be replicated across all hosts in the cluster by some
513             means of your own. On openMosix clusters, B will
514             replicate this file for you. See L
515             for ways to do this in other environments.
516              
517             You can specify tests to be performed during startup of a resource
518             group: In addition to the init-style runmodes of 'once', 'wait',
519             'respawn', and 'off', B supports a 'test' runmode. If
520             the return code of a 'test' command is anything other than zero, then
521             the resource group as a whole is marked as 'FAILED' in
522             L. For example, the 'test' runmode is used by
523             B to test a node for eligibility before attempting to
524             start a resource group there.
525              
526             You can specify different locations for L
527             and L in L, like this:
528            
529             # location of socket
530             :::socket:/tmp/clinit.s
531             # location of status file
532             :::clstat:/tmp/clstat
533              
534             Settings found in L override those found in the
535             L or L constructor arguments.
536              
537             =head2 /var/run/clinit/clstat
538              
539             Plain-text file showing the status of all running resource groups.
540             Any time B changes the runlevel of a resource group, it
541             will update this file. This file can be read directly or via the
542             L method.
543              
544             The path and name of this file can be changed: see L,
545             L, and L.
546              
547             =head2 /var/run/clinit/clinit.s
548              
549             A UNIX domain socket used by L to communicate with
550             L.
551              
552             The path and name of this file can be changed: see L,
553             L, and L.
554              
555             =head1 BUGS
556              
557             See TODO file for a more comprehensive and current list. The most
558             significant outstanding bugs right now are:
559              
560             =over 4
561              
562             =item *
563              
564             Perl 5.8 incompatibility -- blows chunks with a scalar dereference
565             error. This module won't work at all on 5.8 until I get a chance to
566             fix this.
567              
568             =item *
569              
570             Runlevel of '0' (zero) is broken right now; groups named '0' will
571             probably never be supported either. If you pass a '0' as an argument
572             to tell() (either group or level), then whatever you intended to
573             happen is not going to happen.
574              
575             If you're just trying to force a re-read of cltab, then use some
576             nonexistent group and level; I use C<tell(":::ALL:::", ":::REREAD:::")> or
577             somesuch, as mentioned in L<"tell()">.
578              
579             If you're just trying to shut a single group off, use something like
580             C. This will stop all of that group's
581             processes gracefully, assuming that there is no real runlevel '999'
582             configured for that group.
583              
584             =item *
585              
586             Deleting a group from cltab without stopping it first will cause the
587             group's processes to be sent SIGKILL -- they will not be stopped
588             gracefully with SIGINT or SIGTERM. Better to send
589             C to stop it gracefully first, as mentioned above.
590              
591             =item *
592              
593             Duplicate tags in cltab are detected but not enough useful
594             exceptions are generated.
595              
596             =item *
597              
598             Intermittent failure line 35 t/0232stop.t -- indicator error as far as
599             I can tell; just re-run C for now.
600              
601             =back
602              
603             =head1 SUPPORT
604              
605             Commercial support for this module is available at
606             L. On that web site, you'll also find
607             pointers to the latest version, a community mailing list, other
608             cluster management software, etc. You can also find help for general
609             infrastructure (and cluster) administration at
610             L.
611              
612             =head1 AUTHOR
613              
614             Steve Traugott
615             CPAN ID: STEVEGT
616             stevegt@TerraLuna.Org
617             http://www.stevegt.com
618              
619             =head1 COPYRIGHT
620              
621             Copyright (c) 2003 Steve Traugott. All rights reserved.
622             This program is free software; you can redistribute
623             it and/or modify it under the same terms as Perl itself.
624              
625             The full text of the license can be found in the
626             LICENSE file included with this module.
627              
628             =head1 SEE ALSO
629              
630             L,
631             L,
632             L,
633             B,
634             B,
635             B.
636              
637             =cut
638              
639             1;
640              
641             __END__