source: fedkit/splitter.pl @ 8034579

axis_examplecompt_changesinfo-opsversion-1.30version-2.00version-3.01version-3.02
Last change on this file since 8034579 was 8034579, checked in by Ted Faber <faber@…>, 16 years ago

event repeater starts remotely

  • Property mode set to 100644
File size: 37.9 KB
Line 
1#!/usr/bin/perl
2
3use strict;
4
5use Getopt::Std;
6use IO::File;
7use IO::Dir;
8use IO::Pipe;
9use File::Copy;
10
11my @scripts = ("fed_bootstrap", "federate.sh", "smbmount.pl", 
12    "make_hosts", "fed-tun.pl", "fed_evrepeater" );
13my $local_script_dir = ".";
14my($pid, $gid);                 # Process and group IDs for calling parse.tcl
15my $splitter_config;            # Configuration file
16my $debug;                      # True if thecalled in debug mode
17my $verbose;                    # True for extra progress reports
18my $startem;                    # If true, start the sub-experiments
19my $eid;                        # Experiment ID
20my $tcl;                        # The experiment description (topology file)
21my $master;                     # Master testbed
22my $tmpdir;                     # tmp files
23my $tb_config;                  # testbed configurations
24my $smb_share;                  # Share to mount from the master
25my $project_user;               # User to mount project dirs as
26my($gw_pubkey, $gw_pubkey_base);# Connector pubkey (full path & basename)
27my($gw_secretkey, $gw_secretkey_base);# Connector secret key (full path &
28                                # basename)
29my($keytype);                   # Type (DSA or RSA) of generated gateway keys
30my $tcl_splitter;               # tcl program to split experiments
31                                # (changed during devel)
32my $tclsh;                      # tclsh to call directly (changed during devel)
33my $muxmax;                     # Maximum number of links/lans over 1 gw pair
34my @tarfiles;                   # Tarfiles in use by this experiment
35my @rpms;                       # Rpms in use by this experiment
36my $timeout;                    # The timeout to use for experiment swap ins
37my %opts;                       # Parsed options
38
39my $tbparams = {};              # Map of the per-testbed parameters from the
40                                # testbeds file.  It is a reference to a hash
41                                # of hashes (because it's passed around a bunch
42                                # and it's nicer to have one access pattern
43                                # throughout the script, in the main loop and
44                                # the subroutines).  That access is exemplified
45                                # by  $tbparams->{'deter'}->{'domain'} which is
46                                # the domain parameter of the DETER testbed. 
47my $fail_soft;                  # Do not swap failed sub-experiments out
48my $max_children=1;             # Maximum number of simultaneous swap-ins
49
50# Parse the config file.  The format is a colon-separated parameter name
51# followed by the value of that parameter to the end of the line.  This parses
52# that format and puts the parameters into the referenced hash.  Parameter
53# names are mapped to lower case, parameter values are unchanged.  Returns 0 on
54# failure (e.g. file open) and 1 on success.
55sub parse_config {
56    my($file, $href) = @_;
57    my $fh = new IO::File($file);
58       
59    unless ($fh) {
60        warn "Can't open $file: $!\n";
61        return 0;
62    }
63
64    while (<$fh>) {
65        next if /^\s*#/ || /^\s*$/;     # Skip comments & blanks
66        chomp;
67        /^([^:]+):\s*(.*)/ && do {
68            my $key = $1; 
69
70            $key =~ tr/A-Z/a-z/;
71            $href->{$key} = $2;
72            next;
73        };
74        warn "Unparasble line in $file: $_\n";
75    }
76    $fh->close();   # It will close when it goes out of scope, but...
77    return 1;
78}
79
80# Parse an easier-to-read testbeds file (the original was comma-separated
81# unadorned strings).  The format is a testbed scope as [testbed] followed by
82# the colon-separated attribute-value pairs for the testbed.  Right now these
83# go into a set of global hashes indexed by testbed, but that should probably
84# change.
85
86sub parse_testbeds {
87    my($file, $tbparams) = @_;      # Testbeds file and parameter hash
88    my $fh = new IO::File($file);  # Testbeds filehandle
89    my $tb;                         # Current testbed
90    # Convert attribute in the file to tbparams hash key
91    my %attr_to_hash = (
92        "opsnode" => "host",
93        "user" => "user",
94        "domain" => "domain",
95        "project" => "project",
96        "connectortype" => "gwtype",
97        "slavenodestartcmd" => "expstart",
98        "slaveconnectorstartcmd" => "gwstart",
99        "masternodestartcmd" => "mexpstart",
100        "masterconnectorstartcmd" => "mgwstart",
101        "connectorimage" => "gwimage",
102        "fileserver" => "fs",
103        "boss" => "boss",
104        "eventserver" => "eventserver",
105        "tunnelcfg" => "tun"
106    );
107
108
109    unless ($fh) {
110        warn "Can't open $file: $!\n";
111        return 0;
112    }
113
114    while (<$fh>) {
115        next if /^\s*#/ || /^\s*$/;     # Skip comments & blanks
116        chomp;
117        /^\s*\[(.*)\]/ && do {
118            $tb = $1;
119            $tbparams->{$tb} = {} unless $tbparams->{$tb};
120            next;
121        };
122
123        /^([^:]+):\s*(.*)/ && do {
124            unless ($tb) {
125                warn "Ignored attribute definition before testbed " .
126                    "defined in $file: $_\n";
127                next;
128            }
129            my $key = $1; 
130            $key =~ tr/A-Z/a-z/;
131            my $var = $attr_to_hash{$key};
132
133            if ($var) { $tbparams->{$tb}->{$var} = $2; }
134            else { warn "Unknown keyword $key in $file\n"; }
135
136            next;
137        };
138        warn "Unparasble line in $file: $_\n";
139    }
140    $fh->close();   # It will close when it goes out of scope, but...
141    return 1;
142}
143
144# Generate SSH keys for use by the gateways.  The parameters are the type and
145# the filename for the private key.  The pubkey will be stored in a filename
146# with the same name as the private key but with .pub appended.  Type can be
147# dsa or rsa.
148
149sub generate_ssh_keys {
150    my($type, $dest) = @_;
151
152    $type =~ tr/A-Z/a-z/;
153    return 0 if $type !~ /(rsa|dsa)/;
154    system("/usr/bin/ssh-keygen -t $type -N \"\" -f $dest");
155    return $@ ? 0 : 1;
156}
157
158# use scp to transfer a file, reporting true if successful and false otherwise.
159# Parameters are the local file name, the ssh host destination (either hostname
160# oe user@host), and an optional destination file name or directory.  If no
161# destination is given, the file is transferred to the given user's home
162# directory.  If only a machine is given in the ssh host destination, the
163# current user is used.
164sub scp_file {
165    my($file, $user, $host, $dest) = @_;
166
167    # XXX system with a relative pathname is sort of gross
168    system("scp $file $user\@$host:$dest");
169    if ($?) {
170        warn "scp failed $?\n";
171        return 0;
172    }
173    else { return 1; }
174}
175
176# use ssh to execute the given command on the machine (and as the user) in
177# $where.  Parameters are the ssh destination directive ($where) and the
178# command to execute, and a prefix to be placed on a message generated if the
179# command fails.   On failure print a warning if a warning prefix was given and
180# return false.  If timeout is given fork a process and set an alarm of that
181# many seconds.  Timeouts also return 0;
182sub ssh_cmd {
183    my($user, $host, $cmd, $wname, $timeout) = @_;
184    my $pid;                # Child pid
185
186    $timeout = 0 unless $timeout;   # Force default timeout
187
188    if ( $pid = fork () ) {
189        # Parent process
190        # The eval acts as a signal catcher.  If the alarm goes off inside
191        # the eval, the die will put "alarm\n" into $@, otherwise the
192        # return value of the execution in the child process will be used.
193        my $rv = eval {
194            local $SIG{'ALRM'} = sub{ die "alarm\n"; };
195            my $rv;
196
197            alarm $timeout;
198            $rv = waitpid($pid, 0);
199            alarm 0;
200            $rv;
201        };
202
203        # If the eval succeeded, $@ will be null and we can use $rv, which
204        # is the return code from the subprocess.  If the eval timed out,
205        # print a warning and assume the best.
206        if ($@ eq "alarm\n" ) {
207            warn "$wname timed out - pid $pid still live\n";
208            return 1;
209        }
210        else {
211            return $rv;
212        }
213    }
214    else {
215        # Child process
216        exec("ssh $user\@$host $cmd");
217        exit 0;
218    }
219}
220
221# Ship local copies of the federation scripts out to the given host.  If any of
222# the script transfers fails, return 0.  The scripts to transfer are from the
223# global @scripts and are found locally in $local_script_dir (another global).
224sub ship_scripts {
225    my($host, $user, $dest_dir) = @_;       # Where, who, where remotely
226    my $s;
227
228    &ssh_cmd($user, $host, "mkdir -p $dest_dir");
229    for $s (@scripts) {
230        &scp_file("$local_script_dir/$s", $user, $host, $dest_dir) || 
231            return 0;
232    }
233    return 1;
234}
235
236# Ship per-testbed configuration generated by this script to the remote /proj
237# directories on the remote testbeds
238sub ship_configs {
239    my($host, $user, $src_dir, $dest_dir) = @_;     # Where, who, where remotely
240    my($d, $f);
241
242    $d = IO::Dir->new($src_dir) || return 0;
243
244    # All directories under $tmpdir are 770 so we can delete them later.
245    &ssh_cmd($user, $host, "mkdir -p $dest_dir") || return 0;
246    &ssh_cmd($user, $host, "chmod 770 $dest_dir") || return 0;
247    while ( $f = $d->read()) {
248        next if $f =~ /^\./;
249        if ( -d "$src_dir/$f" ) {
250            &ship_configs($host, $user, "$src_dir/$f", "$dest_dir/$f") || 
251                return 0;
252        }
253        else {
254            &scp_file("$src_dir/$f", $user, $host, $dest_dir) || return 0;
255        }
256    }
257    return 1;
258}
259
260# Start a sub section of the experiment on a given testbed.  The testbed and
261# the user to start the experiment as are pulled from the global per-testbed
262# hash, passed in as $tbparams, as is the project name on the remote testbed.
263# Parameters are the testbed and the experiment id.  Configuration files are
264# scp-ed over to the target testbed from the global $tmpdir/$tb directory.
265# Then the current state of the experiment determined using expinfo.  From that
266# state, the experiment is either created, modified or spapped in.  If
267# everything succeeds, true is returned.  If the global verbose is set progress
268# messages are printed.
269sub start_segment {
270    my($tb, $eid, $tbparams, $timeout) = @_;# testbed, experiment ID,
271                                            # per-testbed parameters and remote
272                                            # swap-in timeout
273    my $host =                              # Host name of remote ops (FQDN)
274        $tbparams->{$tb}->{'host'} . $tbparams->{$tb}->{'domain'};
275    my $user = $tbparams->{$tb}->{'user'};  # user to pass to ssh
276    my $pid = $tbparams->{$tb}->{'project'};# remote project to start the
277                                            # experiment under
278    my $tclfile = "./$eid.$tb.tcl";         # Local tcl file with the
279                                            # sub-experiment
280    my $proj_dir = "/proj/$pid/exp/$eid/tmp";  # Where to stash federation stuff
281    my $tarfiles_dir = "/proj/$pid/tarfiles/$eid";  # Where to stash tarfiles
282    my $rpms_dir = "/proj/$pid/rpms/$eid";  # Where to stash rpms
283    my $to_hostname = "$proj_dir/hosts";    # remote hostnames file
284    my $state;                              # State of remote experiment
285    my $status = new IO::Pipe;              # The pipe to get status
286
287    # Determine the status of the remote experiment
288    $status->reader("ssh $user\@$host /usr/testbed/bin/expinfo $pid $eid") || 
289        die "Can't ssh to $user\@$host:$!\n";
290
291    # XXX: this is simple now.  Parsing may become more complex
292    while (<$status>) {
293        /State: (\w+)/ && ($state = $1);
294        /No\s+such\s+experiment/ && ($state = "none");
295    }
296    $status->close();
297    print "$tb: $state\n";
298
299    # Copy the experiment definition data over
300    print "transferring subexperiment to $tb\n" if $verbose;
301    &scp_file("$tmpdir/$tb/$tclfile", $user, $host) || return 0;
302    # Clear out any old experiment data; if not deleted, copies over it by
303    # different users will fail.
304    # (O /bin/csh, how evil thou art.  The -c and the escaped single quotes
305    # force the /bin/sh interpretation of the trailing * (which we need to keep
306    # tmp around))  Again, this needs to be done more properly once we have a
307    # non-ssh interface here.)
308    print "clearing experiment subdirs on $tb\n" if $verbose;
309    &ssh_cmd($user, $host, "/bin/sh -c \\'/bin/rm -rf $proj_dir/*\\'") || 
310        return 0;
311    print "clearing experiment tarfiles subdirs on $tb\n" if $verbose;
312    &ssh_cmd($user, $host, "/bin/rm -rf $tarfiles_dir/") || 
313        return 0;
314    print "creating tarfiles subdir $tarfiles_dir on $tb\n" if $verbose;
315    &ssh_cmd($user, $host, "mkdir -p $tarfiles_dir", "create tarfiles") || 
316        return 0;
317    print "clearing experiment rpms subdirs on $tb\n" if $verbose;
318    &ssh_cmd($user, $host, "/bin/rm -rf $rpms_dir/") || 
319        return 0;
320    print "creating rpms subdir $rpms_dir on $tb\n" if $verbose;
321    &ssh_cmd($user, $host, "mkdir -p $rpms_dir", "create rpms") || 
322        return 0;
323    # Remote experiment is active.  Modify it.
324    if ($state eq "active") {
325        print "Transferring federation support files to $tb\n" if $verbose;
326        # First copy new scripts and hostinfo into the remote /proj
327        &scp_file("$tmpdir/hostnames", $user, $host, $to_hostname) ||
328            return 0;
329        &ship_scripts($host, $user, $proj_dir) || return 0;
330        &ship_configs($host, $user, "$tmpdir/$tb", $proj_dir) || return 0;
331
332        if ( -d "$tmpdir/tarfiles") {
333            &ship_configs($host, $user, "$tmpdir/tarfiles", $tarfiles_dir) || 
334                return 0;
335        }
336
337        if ( -d "$tmpdir/rpms") {
338            &ship_configs($host, $user, "$tmpdir/rpms", $rpms_dir) || 
339                return 0;
340        }
341
342        print "Modifying $eid in place on $tb\n" if $verbose;
343        &ssh_cmd($user, $host, "/usr/testbed/bin/modexp -r -s -w $pid " . 
344            "$eid $tclfile", "modexp", $timeout) || return 0;
345        return 1;
346    }
347
348    # Remote experiment is swapped out, modify it and swap it in.
349    if ($state eq "swapped") {
350        print "Transferring federation support files to $tb\n" if $verbose;
351        # First copy new scripts and hostinfo into the remote /proj (because
352        # the experiment exists, the directory tree should be there.
353        &scp_file("$tmpdir/hostnames", $user, $host, $to_hostname) ||
354            return 0;
355        &ship_scripts($host, $user, $proj_dir) || return 0;
356        &ship_configs($host, $user, "$tmpdir/$tb", $proj_dir) || return 0;
357        if ( -d "$tmpdir/tarfiles") {
358            &ship_configs($host, $user, "$tmpdir/tarfiles", $tarfiles_dir) || 
359                return 0;
360        }
361
362        if ( -d "$tmpdir/rpms") {
363            &ship_configs($host, $user, "$tmpdir/rpms", $rpms_dir) || 
364                return 0;
365        }
366
367        print "Modifying $eid on $tb\n" if $verbose;
368        &ssh_cmd($user, $host, "/usr/testbed/bin/modexp -w $pid $eid $tclfile", 
369            "modexp") || return 0;
370        print "Swapping $eid in on $tb\n" if $verbose;
371        # Now start up
372        &ssh_cmd($user, $host, "/usr/testbed/bin/swapexp -w $pid $eid in", 
373            "swapexp", $timeout) || return 0;
374        return 1;
375    }
376
377    # No remote experiment.  Create one.  We do this in 2 steps so we can put
378    # the configuration files and scripts into the new experiment directories.
379    if ($state eq "none") {
380
381        if ( -d "$tmpdir/tarfiles") {
382            # Tarfiles have to exist for the creation to work
383            print "copying tarfiles to $tb\n";
384            &ship_configs($host, $user, "$tmpdir/tarfiles", $tarfiles_dir) || 
385                return 0;
386        }
387
388        if ( -d "$tmpdir/rpms") {
389            &ship_configs($host, $user, "$tmpdir/rpms", $rpms_dir) || 
390                return 0;
391        }
392
393        print "Creating $eid on $tb\n" if $verbose;
394        &ssh_cmd($user, $host, "/usr/testbed/bin/startexp -i -f -w -p " . 
395            "$pid -e $eid $tclfile", "startexp") || return 0;
396        # After startexp succeeds, the per-experiment directories exist on the
397        # remote testbed.
398        print "Transferring federation support files to $tb\n" if $verbose;
399        # First copy new scripts and hostinfo into the remote /proj
400        &scp_file("$tmpdir/hostnames", $user, $host, $to_hostname) ||
401            return 0;
402        &ship_scripts($host, $user, $proj_dir) || return 0;
403        &ship_configs($host, $user, "$tmpdir/$tb", $proj_dir) || return 0;
404        # Now start up
405        print "Swapping $eid in on $tb\n" if $verbose;
406        &ssh_cmd($user, $host, "/usr/testbed/bin/swapexp -w $pid $eid in", 
407            "swapexp", $timeout) || return 0;
408        return 1;
409    }
410
411    # Every branch for a known state returns.  If execution gets here, the
412    # state is unknown.
413    warn "unknown state: $state\n";
414    return 0;
415}
416
417# Swap out a sub-experiment - probably because another has failed.  Arguments
418# are testbed and experiment.  Most of the control flow is similar to
419# start_segment, though much simpler.
420sub stop_segment {
421    my($tb, $eid, $tbparams) = @_;          # testbed, experiment ID and
422                                            # per-testbed parameters
423    my $user = $tbparams->{$tb}->{'user'};  # testbed user
424    my $host =                              # Ops node
425        $tbparams->{$tb}->{'host'} . $tbparams->{$tb}->{'domain'};
426    my $pid = $tbparams->{$tb}->{'project'};# testbed project
427
428    print "Stopping $eid on $tb\n" if $verbose;
429    &ssh_cmd($user, $host, "/usr/testbed/bin/swapexp -w $pid $eid out", 
430        "swapexp (out)") || return 0;
431    return 1;
432}
433
434
435$pid = $gid = "dummy";              # Default project and group to pass to
436                                    # $tcl_splitter above.  These are total
437                                    # dummy arguments;  the splitter doesn't
438                                    # use them at all, but we supply them to
439                                    # keep our changes to the parser minimal.
440# Argument processing.
441getopts('Ft:c:p:f:ndvN', \%opts);
442$splitter_config = $opts{'c'} || "./splitter.conf";
443$debug = $opts{'d'};
444$verbose = $opts{'v'} || $opts{'d'};
445
446&parse_config("$splitter_config", \%opts) || 
447    die "Cannot read config file $splitter_config: $!\n";
448
449warn "-N does nothing now.  Only one testbeds format supported.\n"
450    if $opts{'N'};
451$fail_soft = $opts{'F'} || $opts{'failsoft'};
452$startem = $opts{'n'} ? 0 : 1;          # If true, start the sub-experiments
453$timeout = $opts{'t'} || $opts{'timeout'};
454$eid = $opts{'experiment'};             # Experiment ID
455$tcl = $opts{'f'} || shift;             # The experiment description
456$master = $opts{'master'};              # Master testbed
457$tmpdir = $opts{'tmpdir'} || $opts{'tempdir'}|| "/tmp"; # tmp files
458$tb_config = $opts{'testbeds'} || "./testbeds"; # testbed configurations
459$local_script_dir = $opts{'scriptdir'}; # Local scripts
460$muxmax  = $opts{'muxlimit'} || 3;      # Number of connections muxed on one
461                                        # gateway
462
463$max_children = $opts{'p'} || $opts{'maxchildren'} 
464    if $opts{'p'} || $opts{'maxchildren'};
465
466$smb_share = $opts{'smbshare'} ||       # Share to mount from the master
467    die "Must give an SMB share\n";
468$project_user = $opts{'smbuser'} ||     # User to mount project dirs as
469    die "Must give an SMB user\n";
470
471# tcl program to split experiments (changed during devel)
472$tcl_splitter = $opts{'tclparse'} || "/usr/testbed/lib/ns2ir/parse.tcl";
473# tclsh to call directly (changed during devel)
474$tclsh = $opts{'tclsh'} || "/usr/local/bin/otclsh";
475
476# Prefix to avoid collisions
477$tmpdir .= "/split$$";
478
479print "Temp files are in $tmpdir\n" if $verbose;
480# Create a workspace
481unless (-d "$tmpdir") {
482    mkdir("$tmpdir") || die "Can't create $tmpdir: $!";
483}
484
485# If the keys are given, use them.  Otherwise create a set under $tmpdir
486
487if ( $opts{'gatewatpubkey'} && $opts{'gatewaysecretkey'}) {
488    $gw_pubkey = $opts{'gatewaypubkey'};
489    $gw_secretkey = $opts{'gatewaysecretkey'};
490}
491else {
492    $keytype = $opts{'gatewaykeytype'} || "rsa";
493    mkdir("$tmpdir/keys") || die "Can't create temoprary key dir: $!\n";
494    $gw_pubkey = "$tmpdir/keys/fed.$keytype.pub";
495    $gw_secretkey = "$tmpdir/keys/fed.$keytype";
496    print "Generating $keytype keys\n" if $verbose;
497    generate_ssh_keys($keytype, $gw_secretkey) || 
498        die "Cannot generate kets:$@\n";
499}
500# Generate the basenames
501($gw_pubkey_base = $gw_pubkey) =~ s#.*/##;
502($gw_secretkey_base = $gw_secretkey) =~ s#.*/##;
503
504
505
506# Validate scripts directory
507for my $s (@scripts) {
508    die "$local_script_dir/$s not in local script directory. Try -d\n"
509        unless -r "$local_script_dir/$s";
510}
511
512die "Must supply file, master and experiment" unless $master && $tcl && $eid;
513
514&parse_testbeds($tb_config, $tbparams) ||
515    die "Cannot testbed congfigurations from $tb_config: $!\n";
516
517# Open a pipe to the splitter program and start it parsing the experiments
518my $pipe = new IO::Pipe;
519# NB no more -p call on parse call.
520$pipe->reader("$tclsh $tcl_splitter -s -x $muxmax -m $master  $pid $gid $eid $tcl") || 
521    die "Cannot execute $tclsh $tcl_splitter -s -x $muxmax -m $master $pid $gid $eid $tcl:$!\n";
522
523# Parsing variables
524my $ctb;                        # Current testbed
525my %allocated;                  # If allocated{$tb} > 0, $tb is in use
526my $destfile;                   # File that the sub-experiment tcl file is
527                                # being written to, or "" if none.  Also used
528                                # for hostnames file.
529my $desthandle;                 # File handle for distfile
530my $gateways;                   # when gateway lists are being processed this
531                                # is the testbed whose gateways are being
532                                # gathered.
533my $control_gateway;            # Control net gateway for the current testbed
534my %active_end;                 # If active_end{"a-b"} > 0 then a is the active
535                                # end of the a <-> b connector pair.
536
537# Parse the splitter output.  This loop creates the sub experiments, gateway
538# configurations and hostnames file
539while (<$pipe>) {
540    # Start of a sub-experiment
541    /^#\s+Begin\s+Testbed\s+\((\w+)\)/ && do {
542        $ctb = $1;
543
544        # If we know the testbed, start collecting its sub experiment tcl
545        # description.  If not, warn the caller and ignore the configuration of
546        # this testbed.
547        if ($tbparams->{$ctb}) {
548            $allocated{$ctb}++; # Keep track of the testbeds allocated
549
550            unless (-d "$tmpdir/$ctb") {
551                mkdir("$tmpdir/$ctb") || die "Can't create $tmpdir/$ctb: $!";
552            }
553            $destfile = "$tmpdir/$ctb/$eid.$ctb.tcl";
554
555            $desthandle = new IO::File(">$destfile") || 
556                die "Cannot open $destfile:$!\n";
557        }
558        else { 
559            warn "No such testbed $ctb\n";
560            $destfile = "";
561        }
562        next;
563    };
564
565    # End of that experiment
566    /^#\s+End\s+Testbed\s+\((\w+)\)/ && do {
567        # Simple syntax check and close out this experiment's tcl description
568        die "Mismatched testbed markers ($1, $ctb)\n" unless ($1 eq $ctb);
569        $desthandle->close();
570        $destfile = $ctb = "";
571        next;
572    };
573
574    # Beginning of a gateway set
575    /^#\s+Begin\s+gateways\s+\((\w+)\)/ && do {
576        $gateways = $1;
577        # If we've heard of this tb, create the config lines for it one at a
578        # time.
579        if ($allocated{$gateways}) {
580            # Just in case.  This directory should already have been created
581            # above.
582            unless (-d "$tmpdir/$gateways") {
583                mkdir("$tmpdir/$gateways") || 
584                    die "Can't create $tmpdir/$gateways: $!";
585            }
586        }
587        else {
588            warn "Gateways given (and ignored) for testbed not in use: " .
589                "$gateways\n";
590            $gateways = 0;
591        }
592        next;
593    };
594    # End of the gateways section.  Output the client config for this testbed
595    /^#\s+End\s+gateways\s+\((\w+)\)/ && do {
596        die "Mismatched gateway markers ($1, $gateways)\n" 
597            unless !$gateways || $gateways == $1;
598
599        if ($control_gateway ) {
600            # Client config
601            my $cc = new IO::File(">$tmpdir/$gateways/client.conf");
602            my $master_project = $tbparams->{$master}->{'project'};
603            die "Can't open $tmpdir/$gateways/client.conf: $!\n" unless $cc;
604            print $cc "ControlGateway: $control_gateway\n";
605            print $cc "SMBShare: $smb_share\n";
606            print $cc "ProjectUser: $project_user\n";
607            print $cc "ProjectName: $master_project\n";
608            $cc->close();
609        }
610        else { warn "No control gateway for $gateways?\n"; }
611           
612        $gateways = 0;
613        next;
614    };
615    # Beginning of the hostnames list.  Collection is always in the hostnames
616    # file.
617    /^#\s+Begin\s+hostnames/ && do {
618        $destfile = "$tmpdir/hostnames";
619        $desthandle = new IO::File(">$destfile") || 
620            die "Can't open $destfile:$!\n";
621        next;
622    };
623    # end of the hostnames list.
624    /^#\s+End\s+hostnames/ && do {
625        $desthandle->close();
626        $destfile = "";
627        next;
628    };
629
630    # Generate gateway configuration info, one file per line
631    $gateways && do {
632        chomp;
633        my($dtb, $myname, $desthost, $type) = split(" ", $_);
634
635        # Many of these are to simplify print statements
636        my $sdomain =                           # domain for the source
637            $tbparams->{$gateways}->{'domain'};
638        my $ddomain =                           # domain for the destination
639            $tbparams->{$dtb}->{'domain'};
640        my $sproject =                          # Project of the source
641            $tbparams->{$gateways}->{'project'};
642        my $dproject =                          # Project of the destination
643            $tbparams->{$dtb}->{'project'};
644        my $fs =                                # Master fs node (FQDN)
645            $tbparams->{$master}->{'fs'} .  $tbparams->{$master}->{'domain'};
646        my $boss =                              # Master boss node (FQDN)
647            $tbparams->{$master}->{'boss'} .  $tbparams->{$master}->{'domain'};
648        my $event_server =                      # Master event-server (FQDN)
649            $tbparams->{$master}->{'eventserver'} . 
650            $tbparams->{$master}->{'domain'};
651        my $remote_script_dir =                 # Remote fed script location
652            "/proj/" . $dproject . "/exp/$eid/tmp";
653        my $local_script_dir =                  # Local fed script location
654            "/proj/" . $sproject . "/exp/$eid/tmp";
655        my $active;                             # Is this the active side of
656                                                # the connector?
657
658        $sdomain = ".$eid." . $tbparams->{$gateways}->{'project'} . "$sdomain";
659        $ddomain = ".$eid." . $tbparams->{$dtb}->{'project'} . "$ddomain";
660
661        my $conf_file = "$myname$sdomain.gw.conf";
662        my $remote_conf_file = "$desthost$ddomain.gw.conf";
663        # translate to lower case so the `hostname` hack for specifying
664        # configuration files works.
665        $conf_file =~ tr/A-Z/a-z/;
666        $remote_conf_file =~ tr/A-Z/a-z/;
667
668        # If either end of this link is in the master side of the testbed, that
669        # side is the active end. Otherwise the first testbed encountered in
670        # the file will be the active end.  The $active_end variable keeps
671        # track of those decisions
672        if ( $dtb eq $master ) { $active = "false"; }
673        elsif ($gateways eq $master ) { $active = "true"; }
674        elsif ( $active_end{"$dtb-$gateways"} ) { $active="false"; }
675        else { $active_end{"$gateways-$dtb"}++; $active = "true"; }
676
677        # This is used to create the client configuration.
678        $control_gateway = "$myname$sdomain"
679            if $type =~ /(control|both)/;
680
681        # Write out the file
682        my $gwconfig = new IO::File(">$tmpdir/$gateways/$conf_file")|| 
683            die "can't open $tmpdir/$gateways/$conf_file: $!\n";
684
685        print $gwconfig "Active: $active\n";
686        print $gwconfig "TunnelCfg: " . $tbparams->{$gateways}->{'tun'} . "\n";
687        print $gwconfig "BossName: $boss\n";
688        print $gwconfig "FsName: $fs\n";
689        print $gwconfig "EventServerName: $event_server\n";
690        print $gwconfig "Type: $type\n";
691        print $gwconfig "RemoteScriptDir: $remote_script_dir\n";
692        print $gwconfig "EventRepeater: $local_script_dir/fed_evrepeater\n";
693        print $gwconfig "RemoteExperiment: $dproject/$eid\n";
694        print $gwconfig "LocalExperiment: $sproject/$eid\n";
695        print $gwconfig "RemoteConfigFile: " . 
696            "$remote_script_dir/$remote_conf_file\n";
697        print $gwconfig "Peer: $desthost$ddomain\n";
698        print $gwconfig "Pubkeys: " . 
699            "/proj/$sproject/exp/$eid/tmp/$gw_pubkey_base\n";
700        print $gwconfig "Privkeys: " .
701            "/proj/$sproject/exp/$eid/tmp/$gw_secretkey_base\n";
702        $gwconfig->close();
703
704        # This testbed has a gateway (most will) so make a copy of the keys it
705        # needs in this testbed's subdirectory.  start_segment will transfer
706        # them.
707        unless (-r "$tmpdir/$gateways/$gw_pubkey_base" ) {
708            copy($gw_pubkey, "$tmpdir/$gateways/$gw_pubkey_base") ||
709                die "Can't copy pubkeys ($gw_pubkey to " . 
710                    "$tmpdir/$gateways/$gw_pubkey_base): $!\n";
711        }
712        if ($active eq "true" ) {
713            unless (-r "$tmpdir/$gateways/$gw_secretkey_base" ) {
714                copy($gw_secretkey, "$tmpdir/$gateways/$gw_secretkey_base") ||
715                    die "Can't copy secret keys ($gw_secretkey to " . 
716                        "$tmpdir/$gateways/$gw_secretkey_base): $!\n";
717            }
718        }
719
720        #done processing gateway entry, ready for next line
721        next; 
722    };
723    (/^#\s+Begin\s+tarfiles/../^#\s+End\s+tarfiles/) && do {
724        next if /^#/;
725        chomp;
726        push(@tarfiles, $_);
727        next;
728    };
729    (/^#\s+Begin\s+rpms/../^#\s+End\s+rpms/) && do {
730        next if /^#/;
731        chomp;
732        push(@rpms, $_);
733        next;
734    };
735
736    next unless $destfile;  # Unidentified testbed, ignore config
737    # local copies that can be used in the substitutions below
738    my $gwtype = $tbparams->{$ctb}->{'gwtype'};
739    my $gwimage = $tbparams->{$ctb}->{'gwimage'};
740    my $mgwstart = $tbparams->{$ctb}->{'mgwstart'};
741    my $mexpstart = $tbparams->{$ctb}->{'mexpstart'};
742    my $gwstart = $tbparams->{$ctb}->{'gwstart'};
743    my $expstart = $tbparams->{$ctb}->{'expstart'};
744    my $project = $tbparams->{$ctb}->{'project'};
745
746    # Substitute variables
747    s/GWTYPE/$gwtype/g;
748    s/GWIMAGE/$gwimage/g;
749    if ($ctb eq $master ) {
750        s/GWSTART/$mgwstart/g;
751        s/EXPSTART/$mexpstart/g;
752    }
753    else {
754        s/GWSTART/$gwstart/g;
755        s/EXPSTART/$expstart/g;
756    }
757    # XXX: oh is this bad
758    s#GWCONF#FEDDIR\`hostname\`.gw.conf#g;
759    s#PROJDIR#/proj/$project/#g;
760    s#EID#$eid#g;
761    s#FEDDIR#/proj/$project/exp/$eid/tmp/#g;
762    print $desthandle $_;
763}
764$pipe->close();
765die "No nodes in master testbed ($master)\n" unless $allocated{$master};
766
767# Copy tarfiles and rpms needed at remote sites to the staging directories.
768# Start_segment will distribute them
769for my $t  (@tarfiles) {
770    die "tarfile '$t' unreadable: $!\n" unless -r $t;
771    unless (-d "$tmpdir/tarfiles") {
772        mkdir("$tmpdir/tarfiles") || 
773            die "Can't create $tmpdir/tarfiles:$!\n";
774    }
775    copy($t, "$tmpdir/tarfiles") || 
776        die "Can't copy $t to  $tmpdir/tarfiles:$!\n";
777}
778
779for my $r  (@rpms) {
780    die "rpm '$r' unreadable: $!\n" unless -r $r;
781    unless (-d "$tmpdir/rpms") {
782        mkdir("$tmpdir/rpms") || 
783            die "Can't create $tmpdir/rpms:$!\n";
784    }
785    copy($r, "$tmpdir/rpms") || 
786        die "Can't copy $r to  $tmpdir/rpms:$!\n";
787}
788
789exit(0) unless $startem;
790
791my %started;                # If $started{$tb} then $tb successfully started
792my %child;                  # If $child{$pid} then a process with that pid is
793                            # working on a starting a segment
794my $nworking = 0;           # Number of children working on swapin
795my $pid;                    # Scratch variable for pids
796
797# Start up the slave sub-experiments first
798TESTBED:
799for my $tb  (keys %allocated) {
800    if ( $tb ne $master ) {
801        while ( $nworking == $max_children ) {
802            print "Waiting for a child process to complete\n" if $verbose;
803            if (($pid = wait()) != -1 ) {
804                # The $? >> 8 is the exit code of the subprocess, which is
805                # non-zero if the &start_segment routine failed.
806                my $exit_code = ($? >> 8);
807
808                print "Child $pid completed exit code ($exit_code)\n"
809                    if $verbose;
810                $nworking--;
811                $started{$child{$pid}}++ unless $exit_code;
812                if ($child{$pid} ) { delete $child{$pid}; }
813                else { warn "Reaped a pid we did not start?? ($pid)\n"; }
814                last TESTBED if $exit_code;
815            }
816            else { warn "wait returned without reaping: $!\n"; }
817        }
818        if ( $pid = fork() ) {
819            # Parent process
820            $nworking ++;
821            $child{$pid} = $tb;
822            print "Started process $pid to start testbed $tb\n"
823                if $verbose;
824        }
825        else {
826            # Child.  Note that we reverse the sense of the return code when it
827            # becomes an exit value.  Zero exit values indicate success.
828            exit(!&start_segment($tb, $eid, $tbparams, $timeout));
829        }
830    }
831}
832
833# Now wait for any still running processes.
834while ( $nworking ) {
835    print "Waiting for a child process to complete ($nworking running)\n" 
836        if $verbose;
837    if (($pid = wait()) != -1 ) {
838        # The $? >> 8 is the exit code of the subprocess, which is
839        # non-zero if the &start_segment routine failed.
840        my $exit_code = ($? >> 8);
841
842        print "Child $pid completed exit code ($exit_code)\n"
843            if $verbose;
844        $nworking--;
845        $started{$child{$pid}}++ unless $exit_code;
846        if ($child{$pid} ) { delete $child{$pid}; }
847        else { warn "Reaped a pid we did not start?? ($pid)\n"; }
848    }
849    else { warn "wait returned without reaping: $!\n"; }
850}
851
852# Now the master
853if (&start_segment($master, $eid, $tbparams, $timeout)) { 
854    $started{$master}++;
855}
856
857# If any testbed failed, swap the rest out.
858if ( !$fail_soft && scalar(keys %started) != scalar(keys %allocated)) {
859    for my $tb (keys %started) { &stop_segment($tb, $eid, $tbparams); }
860    print "Error starting experiment\n";
861    exit(1);
862}
863print "Experiment started\n";
864print "Deleting $tmpdir (-d to leave them in place)\n" if $verbose && !$debug;
865system("rm -rf $tmpdir") unless $debug;
866exit(0);    # set the exit value
867
868=pod
869
870=head1 NAME
871
872B<splitter.pl>
873
874=head1 SYNOPSIS
875
876B<splitter.pl> [B<-ndF>] [B<-t> I<secs>] [B<-c> F<config_file>]
877    [B<-f> F<experiment_tcl>] [B<-p> I<max_procs>] [F<experiment_tcl>]
878
879=head1 DESCRIPTION
880
881B<splitter.pl> invokes the DETER experiment parser to split an annotated
882experiment into multiple sub-experments and instantiates the sub-experiments on
883their intended testbeds.  Annotation is accomplished using the
884tb-set-node-testbed command, added to the parser.
885
886Much of the script's behavior depends on the configuration file, specified with
887the B<-c> flag and defaulting to F<./splitter.conf>.
888
889The testbed labels supplied in the B<tb-set-node-testbed> command are
890meaningful based on their presence in the testbeds file.  that file can be
891specified in the configuration file using the B<Testbeds> directive, and
892defaults to F<./testbeds>.  The syntax is described below.
893
894Most of the intermediate files are staged in a sub-directory of a temporary
895files directory and deleted at the end of the script.  Specifying the B<-d>
896flag on the command line avoids the deletion for debbugging.  By default the
897temporary files directory is directory is F</tmp> and can be reset in the
898configuration file using the B<Tmpdir> directive.  Intermediate files are
899stored under a subdirectory formed by adding the process ID of the splitter
900process.  For example, if the temporary files directory is F</tmp> and the
901B<splitter.pl> process ID is 2323, the temporary files will be stored in
902F</tmp/split2323/>.
903
904The expreriment is split out into one experiment description per testbed in the
905temporary directory named as F<experiment.testbed.tcl> where the experiment is
906the experiment ID given in the configuration file, and the testbed is the
907tb-set-node-testbed parameter for the nodes in the file.
908
909If the B<-n> option is absent the sub-experiments are then instantiated on
910their testbeds.  (Here B<-n> is analogous to its use in L<make(1)>).
911Per-testbed parameters are set in the testbeds file.  Sub-experiments on
912slave testbeds are instantiated in a random order, but the master testbed is
913currently instantiated last.
914
915Slave testbeds can be swapped in in parallel by specifying the B<-p> parameter
916and the maximum number of simultaneous processes to start.
917
918Scripts to start federation (the federation kit) are copied into the local
919experiment's tmp file - e.g., F</proj/DETER/exp/simple-split/tmp>.  These are
920taken from the directory given by the B<ScriptDir> directive in the
921configuration file.
922
923If B<-t> is given the parameter is treated as a parameter to B<Timeout> in
924F<splitter.conf>.
925
926If any sub-experiment fails to instantiate, the other sub-exeriments are
927swapped out.  B<-F> avoids this swap out, which can also be specified as
928B<SoftFail: true> in F<splitter.conf>
929
930=head2 Configuration File
931
932The configuration file is a simple set of colon-separated parameters and
933values.  A configuration file must be present, either specified in the B<-c>
934flag or the default F<./splitter.conf>.  All the parameter names are case
935insensitive, but should not include any whitespace.  Parameter values may
936include whitespace, but no newlines.
937
938Possible parameters are:
939
940=over 5
941
942=item Experiment
943
944The name of the experiment on the various testbeds
945
946=item Master
947
948The master testbed label from the testbeds file, described below.
949
950=item Testbeds
951
952The testbeds file described below, giving per-testbed parameters.  If this
953directive is absent the testbeds file defaults to F<./testbeds>
954
955=item ScriptDir
956
957Location of the default federation scripts, i.e. the federation kit.
958
959=item GatewayPubkey
960
961=item GatewaySecretKey
962
963The names of the files containing secret and public keys to use in setting up
964tunnels between testbeds.  If given they are used, otherwise keys are generated.
965
966=item GatewayKeyType
967
968This controls the kind of SSH keys generated to configure the geatways.  If
969given this must be B<dsa> or B<rsa>, and it defaults to B<rsa>.  The parameter
970is csase insensitive.
971
972=item TmpDir
973
974=item TempDir
975
976The directory where temporary files are created.  These are synonyms, but
977should both be specified, B<TmpDir> has priority.  If neither is specified,
978F</tmp> is used.
979
980=item SMBShare
981
982The SMB share on the master testbed that will be exported to remote clients.
983
984=item SMBUser
985
986The experiment user to mount project directories as.  This user needs to be a
987member of the exported experiment - that is one of the users in the project
988containing this experiment on the master testbed.
989
990=item Timeout
991
992Value in seconds after which a swap-in operatioin will be considered a success.
993Often long swap-ins will hang when there are partial failures.  This works
994around this issue.  (This behavior can be requested on the command line by
995specifying B<-t> I<secs>.)
996
997=item FailSoft
998
999If not set, failure of any sub experiment swaps the rest out.  Setting this to
1000any value avoids this swap out.  (This behavior can be requested on the command
1001line by specifying B<-F>.)
1002
1003=item MuxLimit
1004
1005The maximum bumber of links/lans carried by one gateway pair
1006
1007=item Tclparse
1008
1009The pathname to the experiment parsing program.  Only developers should set
1010this.
1011
1012=item Tclsh
1013
1014The pathname to the local oTcl shell.  Only developers should set
1015this.
1016
1017=back
1018
1019=head2 Testbeds file
1020
1021The configuration file (F<./testbeds> unless overridden by B<-c>) is a file of
1022scoped attribute-value pairs where each attribute is specified on a separate
1023line of the configuration file.  Each testbed's parameters are preceeded by the
1024testbed label in brackets ([]) on a line by itself.  After that the parameters
1025are specified as parameter: value.  This is essentially the same format as the
1026configuration file.  Parameters are:
1027
1028=over 4
1029
1030=item User
1031
1032The user under which to make requests to this testbed.  The user running
1033B<splitter.pl> must be able to authenicate as this user under L<ssh(1)> to this
1034testbed.
1035
1036=item OpsNode
1037
1038The host name of the testbed's ops node.  The user calling B<splitter.pl> must
1039be able to execute commands on this host via L<ssh(1)>.
1040
1041=item Domain
1042
1043The domain of nodes in this testbed (including the ops host).  This parameter
1044should always start with a period.
1045
1046=item Project
1047
1048The project under which to instantiate sub-experiments on this testbed.
1049
1050=item ConnectorType
1051
1052The node type for inter-testbed connector nodes on this testbed.
1053
1054=item SlaveNodeStartCmd
1055
1056The start command to run on experimental nodes when this testbed is used as a
1057slave.  In all the start commands the following string substitutions are made:
1058
1059=over 10
1060
1061=item FEDDIR
1062
1063The local experiment's federation scripts directory.  Each local experiment
1064will have this replaced by the scripts directory on its local boss.
1065
1066=item GWCONF
1067
1068The full pathname of the gateway configuration file.  As with FEDDIR, this is
1069on the local boss.
1070
1071=item PROJDIR
1072
1073The project directory on the local boss.
1074
1075=item EID
1076
1077The local experiment name.
1078
1079=back
1080
1081All startcmds specified in F<testbeds> undergo these expansions.
1082
1083=item SlaveConnectorStartCmd
1084
1085The start command to run on gateway nodes when this testbed is used as a slave.
1086The same string substitutions are made in this command as in SlaveNodeStartCmd.
1087
1088=item MasterNodeStartCmd
1089
1090The start command to run on experimental nodes when this testbed is used as a
1091master.  The same string substitutions are made in this command as in
1092SlaveNodeStartCmd.
1093
1094=item MasterConnectorStartCmd
1095
1096The start command to run on gateway nodes when this testbed is used as a
1097master.  The same string substitutions are made in this command as in
1098SlaveNodeStartCmd.
1099
1100=item ConnectorImage
1101
1102The disk image to be loaded on a gateway node on this testbed.
1103
1104=item FileServer
1105
1106The node in the master testbed from which filesystems are mounted.
1107
1108=item Boss
1109
1110The node in the master testbed that controls the testbed.
1111
1112=item TunnelCfg
1113
1114True if the connector needs to do DETER federation.  This parameter will
1115probably be removed.
1116
1117
1118=back
1119
1120=head1 ENVIRONMENT
1121
1122B<splitter.pl> does not directly make use of environment variables, but calls
1123out to L<ssh(1)> and (indirectly) to L<sh(1)>, which may be influenced by the
1124environment.
1125
1126=head1 BUGS
1127
1128A deprecated B<-N> flag was used to select testbeds file format.  Only one
1129format is supported now, and B<-N> generates a warning, but otherwise does not
1130affect B<splitter.pl>.
1131
1132=head1 SEE ALSO
1133
1134L<sh(1)>, L<ssh(1)>
1135
1136=cut
Note: See TracBrowser for help on using the repository browser.