source: fedkit/splitter.pl @ 4abace9

axis_examplecompt_changesinfo-opsversion-1.30version-2.00version-3.01version-3.02
Last change on this file since 4abace9 was 4abace9, checked in by Ted Faber <faber@…>, 16 years ago

Changes to imporve reliability of routes coming up. Restructuring of
federate leaves the stsyem in a state where we can safely restart the router
(specifically, talking to the local boss again) and we restart it in the
final topology. This introduces a lag in routing actually being established.
It may be useful to add a lag to the startcmd to take this into account.

  • Property mode set to 100644
File size: 37.2 KB
Line 
1#!/usr/bin/perl
2
3use strict;
4
5use Getopt::Std;
6use IO::File;
7use IO::Dir;
8use IO::Pipe;
9use File::Copy;
10
11my @scripts = ("fed_bootstrap", "federate.sh", "smbmount.pl", 
12    "make_hosts", "fed-tun.pl");
13my $local_script_dir = ".";
14my($pid, $gid);                 # Process and group IDs for calling parse.tcl
15my $splitter_config;            # Configuration file
16my $debug;                      # True if thecalled in debug mode
17my $verbose;                    # True for extra progress reports
18my $startem;                    # If true, start the sub-experiments
19my $eid;                        # Experiment ID
20my $tcl;                        # The experiment description (topology file)
21my $master;                     # Master testbed
22my $tmpdir;                     # tmp files
23my $tb_config;                  # testbed configurations
24my $smb_share;                  # Share to mount from the master
25my $project_user;               # User to mount project dirs as
26my($gw_pubkey, $gw_pubkey_base);# Connector pubkey (full path & basename)
27my($gw_secretkey, $gw_secretkey_base);# Connector secret key (full path &
28                                # basename)
29my($keytype);                   # Type (DSA or RSA) of generated gateway keys
30my $tcl_splitter;               # tcl program to split experiments
31                                # (changed during devel)
32my $tclsh;                      # tclsh to call directly (changed during devel)
33my $muxmax;                     # Maximum number of links/lans over 1 gw pair
34my @tarfiles;                   # Tarfiles in use by this experiment
35my @rpms;                       # Rpms in use by this experiment
36my $timeout;                    # The timeout to use for experiment swap ins
37my %opts;                       # Parsed options
38
39my $tbparams = {};              # Map of the per-testbed parameters from the
40                                # testbeds file.  It is a reference to a hash
41                                # of hashes (because it's passed around a bunch
42                                # and it's nicer to have one access pattern
43                                # throughout the script, in the main loop and
44                                # the subroutines).  That access is exemplified
45                                # by  $tbparams->{'deter'}->{'domain'} which is
46                                # the domain parameter of the DETER testbed. 
47my $fail_soft;                  # Do not swap failed sub-experiments out
48my $max_children=1;             # Maximum number of simultaneous swap-ins
49
50# Parse the config file.  The format is a colon-separated parameter name
51# followed by the value of that parameter to the end of the line.  This parses
52# that format and puts the parameters into the referenced hash.  Parameter
53# names are mapped to lower case, parameter values are unchanged.  Returns 0 on
54# failure (e.g. file open) and 1 on success.
55sub parse_config {
56    my($file, $href) = @_;
57    my $fh = new IO::File($file);
58       
59    unless ($fh) {
60        warn "Can't open $file: $!\n";
61        return 0;
62    }
63
64    while (<$fh>) {
65        next if /^\s*#/ || /^\s*$/;     # Skip comments & blanks
66        chomp;
67        /^([^:]+):\s*(.*)/ && do {
68            my $key = $1; 
69
70            $key =~ tr/A-Z/a-z/;
71            $href->{$key} = $2;
72            next;
73        };
74        warn "Unparasble line in $file: $_\n";
75    }
76    $fh->close();   # It will close when it goes out of scope, but...
77    return 1;
78}
79
80# Parse an easier-to-read testbeds file (the original was comma-separated
81# unadorned strings).  The format is a testbed scope as [testbed] followed by
82# the colon-separated attribute-value pairs for the testbed.  Right now these
83# go into a set of global hashes indexed by testbed, but that should probably
84# change.
85
86sub parse_testbeds {
87    my($file, $tbparams) = @_;      # Testbeds file and parameter hash
88    my $fh = new IO::File($file);  # Testbeds filehandle
89    my $tb;                         # Current testbed
90    # Convert attribute in the file to tbparams hash key
91    my %attr_to_hash = (
92        "opsnode" => "host",
93        "user" => "user",
94        "domain" => "domain",
95        "project" => "project",
96        "connectortype" => "gwtype",
97        "slavenodestartcmd" => "expstart",
98        "slaveconnectorstartcmd" => "gwstart",
99        "masternodestartcmd" => "mexpstart",
100        "masterconnectorstartcmd" => "mgwstart",
101        "connectorimage" => "gwimage",
102        "fileserver" => "fs",
103        "boss" => "boss",
104        "tunnelcfg" => "tun"
105    );
106
107
108    unless ($fh) {
109        warn "Can't open $file: $!\n";
110        return 0;
111    }
112
113    while (<$fh>) {
114        next if /^\s*#/ || /^\s*$/;     # Skip comments & blanks
115        chomp;
116        /^\s*\[(.*)\]/ && do {
117            $tb = $1;
118            $tbparams->{$tb} = {} unless $tbparams->{$tb};
119            next;
120        };
121
122        /^([^:]+):\s*(.*)/ && do {
123            unless ($tb) {
124                warn "Ignored attribute definition before testbed " .
125                    "defined in $file: $_\n";
126                next;
127            }
128            my $key = $1; 
129            $key =~ tr/A-Z/a-z/;
130            my $var = $attr_to_hash{$key};
131
132            if ($var) { $tbparams->{$tb}->{$var} = $2; }
133            else { warn "Unknown keyword $key in $file\n"; }
134
135            next;
136        };
137        warn "Unparasble line in $file: $_\n";
138    }
139    $fh->close();   # It will close when it goes out of scope, but...
140    return 1;
141}
142
143# Generate SSH keys for use by the gateways.  The parameters are the type and
144# the filename for the private key.  The pubkey will be stored in a filename
145# with the same name as the private key but with .pub appended.  Type can be
146# dsa or rsa.
147
148sub generate_ssh_keys {
149    my($type, $dest) = @_;
150
151    $type =~ tr/A-Z/a-z/;
152    return 0 if $type !~ /(rsa|dsa)/;
153    system("/usr/bin/ssh-keygen -t $type -N \"\" -f $dest");
154    return $@ ? 0 : 1;
155}
156
157# use scp to transfer a file, reporting true if successful and false otherwise.
158# Parameters are the local file name, the ssh host destination (either hostname
159# oe user@host), and an optional destination file name or directory.  If no
160# destination is given, the file is transferred to the given user's home
161# directory.  If only a machine is given in the ssh host destination, the
162# current user is used.
163sub scp_file {
164    my($file, $user, $host, $dest) = @_;
165
166    # XXX system with a relative pathname is sort of gross
167    system("scp $file $user\@$host:$dest");
168    if ($?) {
169        warn "scp failed $?\n";
170        return 0;
171    }
172    else { return 1; }
173}
174
175# use ssh to execute the given command on the machine (and as the user) in
176# $where.  Parameters are the ssh destination directive ($where) and the
177# command to execute, and a prefix to be placed on a message generated if the
178# command fails.   On failure print a warning if a warning prefix was given and
179# return false.  If timeout is given fork a process and set an alarm of that
180# many seconds.  Timeouts also return 0;
181sub ssh_cmd {
182    my($user, $host, $cmd, $wname, $timeout) = @_;
183    my $pid;                # Child pid
184
185    $timeout = 0 unless $timeout;   # Force default timeout
186
187    if ( $pid = fork () ) {
188        # Parent process
189        # The eval acts as a signal catcher.  If the alarm goes off inside
190        # the eval, the die will put "alarm\n" into $@, otherwise the
191        # return value of the execution in the child process will be used.
192        my $rv = eval {
193            local $SIG{'ALRM'} = sub{ die "alarm\n"; };
194            my $rv;
195
196            alarm $timeout;
197            $rv = waitpid($pid, 0);
198            alarm 0;
199            $rv;
200        };
201
202        # If the eval succeeded, $@ will be null and we can use $rv, which
203        # is the return code from the subprocess.  If the eval timed out,
204        # print a warning and assume the best.
205        if ($@ eq "alarm\n" ) {
206            warn "$wname timed out - pid $pid still live\n";
207            return 1;
208        }
209        else {
210            return $rv;
211        }
212    }
213    else {
214        # Child process
215        exec("ssh $user\@$host $cmd");
216        exit 0;
217    }
218}
219
220# Ship local copies of the federation scripts out to the given host.  If any of
221# the script transfers fails, return 0.  The scripts to transfer are from the
222# global @scripts and are found locally in $local_script_dir (another global).
223sub ship_scripts {
224    my($host, $user, $dest_dir) = @_;       # Where, who, where remotely
225    my $s;
226
227    &ssh_cmd($user, $host, "mkdir -p $dest_dir");
228    for $s (@scripts) {
229        &scp_file("$local_script_dir/$s", $user, $host, $dest_dir) || 
230            return 0;
231    }
232    return 1;
233}
234
235# Ship per-testbed configuration generated by this script to the remote /proj
236# directories on the remote testbeds
237sub ship_configs {
238    my($host, $user, $src_dir, $dest_dir) = @_;     # Where, who, where remotely
239    my($d, $f);
240
241    $d = IO::Dir->new($src_dir) || return 0;
242
243    # All directories under $tmpdir are 770 so we can delete them later.
244    &ssh_cmd($user, $host, "mkdir -p $dest_dir") || return 0;
245    &ssh_cmd($user, $host, "chmod 770 $dest_dir") || return 0;
246    while ( $f = $d->read()) {
247        next if $f =~ /^\./;
248        if ( -d "$src_dir/$f" ) {
249            &ship_configs($host, $user, "$src_dir/$f", "$dest_dir/$f") || 
250                return 0;
251        }
252        else {
253            &scp_file("$src_dir/$f", $user, $host, $dest_dir) || return 0;
254        }
255    }
256    return 1;
257}
258
259# Start a sub section of the experiment on a given testbed.  The testbed and
260# the user to start the experiment as are pulled from the global per-testbed
261# hash, passed in as $tbparams, as is the project name on the remote testbed.
262# Parameters are the testbed and the experiment id.  Configuration files are
263# scp-ed over to the target testbed from the global $tmpdir/$tb directory.
264# Then the current state of the experiment determined using expinfo.  From that
265# state, the experiment is either created, modified or spapped in.  If
266# everything succeeds, true is returned.  If the global verbose is set progress
267# messages are printed.
268sub start_segment {
269    my($tb, $eid, $tbparams, $timeout) = @_;# testbed, experiment ID,
270                                            # per-testbed parameters and remote
271                                            # swap-in timeout
272    my $host =                              # Host name of remote ops (FQDN)
273        $tbparams->{$tb}->{'host'} . $tbparams->{$tb}->{'domain'};
274    my $user = $tbparams->{$tb}->{'user'};  # user to pass to ssh
275    my $pid = $tbparams->{$tb}->{'project'};# remote project to start the
276                                            # experiment under
277    my $tclfile = "./$eid.$tb.tcl";         # Local tcl file with the
278                                            # sub-experiment
279    my $proj_dir = "/proj/$pid/exp/$eid/tmp";  # Where to stash federation stuff
280    my $tarfiles_dir = "/proj/$pid/tarfiles/$eid";  # Where to stash tarfiles
281    my $rpms_dir = "/proj/$pid/rpms/$eid";  # Where to stash rpms
282    my $to_hostname = "$proj_dir/hosts";    # remote hostnames file
283    my $state;                              # State of remote experiment
284    my $status = new IO::Pipe;              # The pipe to get status
285
286    # Determine the status of the remote experiment
287    $status->reader("ssh $user\@$host /usr/testbed/bin/expinfo $pid $eid") || 
288        die "Can't ssh to $user\@$host:$!\n";
289
290    # XXX: this is simple now.  Parsing may become more complex
291    while (<$status>) {
292        /State: (\w+)/ && ($state = $1);
293        /No\s+such\s+experiment/ && ($state = "none");
294    }
295    $status->close();
296    print "$tb: $state\n";
297
298    # Copy the experiment definition data over
299    print "transferring subexperiment to $tb\n" if $verbose;
300    &scp_file("$tmpdir/$tb/$tclfile", $user, $host) || return 0;
301    # Clear out any old experiment data; if not deleted, copies over it by
302    # different users will fail.
303    # (O /bin/csh, how evil thou art.  The -c and the escaped single quotes
304    # force the /bin/sh interpretation of the trailing * (which we need to keep
305    # tmp around))  Again, this needs to be done more properly once we have a
306    # non-ssh interface here.)
307    print "clearing experiment subdirs on $tb\n" if $verbose;
308    &ssh_cmd($user, $host, "/bin/sh -c \\'/bin/rm -rf $proj_dir/*\\'") || 
309        return 0;
310    print "clearing experiment tarfiles subdirs on $tb\n" if $verbose;
311    &ssh_cmd($user, $host, "/bin/rm -rf $tarfiles_dir/") || 
312        return 0;
313    print "creating tarfiles subdir $tarfiles_dir on $tb\n" if $verbose;
314    &ssh_cmd($user, $host, "mkdir -p $tarfiles_dir", "create tarfiles") || 
315        return 0;
316    print "clearing experiment rpms subdirs on $tb\n" if $verbose;
317    &ssh_cmd($user, $host, "/bin/rm -rf $rpms_dir/") || 
318        return 0;
319    print "creating rpms subdir $rpms_dir on $tb\n" if $verbose;
320    &ssh_cmd($user, $host, "mkdir -p $rpms_dir", "create rpms") || 
321        return 0;
322    # Remote experiment is active.  Modify it.
323    if ($state eq "active") {
324        print "Transferring federation support files to $tb\n" if $verbose;
325        # First copy new scripts and hostinfo into the remote /proj
326        &scp_file("$tmpdir/hostnames", $user, $host, $to_hostname) ||
327            return 0;
328        &ship_scripts($host, $user, $proj_dir) || return 0;
329        &ship_configs($host, $user, "$tmpdir/$tb", $proj_dir) || return 0;
330
331        if ( -d "$tmpdir/tarfiles") {
332            &ship_configs($host, $user, "$tmpdir/tarfiles", $tarfiles_dir) || 
333                return 0;
334        }
335
336        if ( -d "$tmpdir/rpms") {
337            &ship_configs($host, $user, "$tmpdir/rpms", $rpms_dir) || 
338                return 0;
339        }
340
341        print "Modifying $eid in place on $tb\n" if $verbose;
342        &ssh_cmd($user, $host, "/usr/testbed/bin/modexp -r -s -w $pid " . 
343            "$eid $tclfile", "modexp", $timeout) || return 0;
344        return 1;
345    }
346
347    # Remote experiment is swapped out, modify it and swap it in.
348    if ($state eq "swapped") {
349        print "Transferring federation support files to $tb\n" if $verbose;
350        # First copy new scripts and hostinfo into the remote /proj (because
351        # the experiment exists, the directory tree should be there.
352        &scp_file("$tmpdir/hostnames", $user, $host, $to_hostname) ||
353            return 0;
354        &ship_scripts($host, $user, $proj_dir) || return 0;
355        &ship_configs($host, $user, "$tmpdir/$tb", $proj_dir) || return 0;
356        if ( -d "$tmpdir/tarfiles") {
357            &ship_configs($host, $user, "$tmpdir/tarfiles", $tarfiles_dir) || 
358                return 0;
359        }
360
361        if ( -d "$tmpdir/rpms") {
362            &ship_configs($host, $user, "$tmpdir/rpms", $rpms_dir) || 
363                return 0;
364        }
365
366        print "Modifying $eid on $tb\n" if $verbose;
367        &ssh_cmd($user, $host, "/usr/testbed/bin/modexp -w $pid $eid $tclfile", 
368            "modexp") || return 0;
369        print "Swapping $eid in on $tb\n" if $verbose;
370        # Now start up
371        &ssh_cmd($user, $host, "/usr/testbed/bin/swapexp -w $pid $eid in", 
372            "swapexp", $timeout) || return 0;
373        return 1;
374    }
375
376    # No remote experiment.  Create one.  We do this in 2 steps so we can put
377    # the configuration files and scripts into the new experiment directories.
378    if ($state eq "none") {
379
380        if ( -d "$tmpdir/tarfiles") {
381            # Tarfiles have to exist for the creation to work
382            print "copying tarfiles to $tb\n";
383            &ship_configs($host, $user, "$tmpdir/tarfiles", $tarfiles_dir) || 
384                return 0;
385        }
386
387        if ( -d "$tmpdir/rpms") {
388            &ship_configs($host, $user, "$tmpdir/rpms", $rpms_dir) || 
389                return 0;
390        }
391
392        print "Creating $eid on $tb\n" if $verbose;
393        &ssh_cmd($user, $host, "/usr/testbed/bin/startexp -i -f -w -p " . 
394            "$pid -e $eid $tclfile", "startexp") || return 0;
395        # After startexp succeeds, the per-experiment directories exist on the
396        # remote testbed.
397        print "Transferring federation support files to $tb\n" if $verbose;
398        # First copy new scripts and hostinfo into the remote /proj
399        &scp_file("$tmpdir/hostnames", $user, $host, $to_hostname) ||
400            return 0;
401        &ship_scripts($host, $user, $proj_dir) || return 0;
402        &ship_configs($host, $user, "$tmpdir/$tb", $proj_dir) || return 0;
403        # Now start up
404        print "Swapping $eid in on $tb\n" if $verbose;
405        &ssh_cmd($user, $host, "/usr/testbed/bin/swapexp -w $pid $eid in", 
406            "swapexp", $timeout) || return 0;
407        return 1;
408    }
409
410    # Every branch for a known state returns.  If execution gets here, the
411    # state is unknown.
412    warn "unknown state: $state\n";
413    return 0;
414}
415
416# Swap out a sub-experiment - probably because another has failed.  Arguments
417# are testbed and experiment.  Most of the control flow is similar to
418# start_segment, though much simpler.
419sub stop_segment {
420    my($tb, $eid, $tbparams) = @_;          # testbed, experiment ID and
421                                            # per-testbed parameters
422    my $user = $tbparams->{$tb}->{'user'};  # testbed user
423    my $host =                              # Ops node
424        $tbparams->{$tb}->{'host'} . $tbparams->{$tb}->{'domain'};
425    my $pid = $tbparams->{$tb}->{'project'};# testbed project
426
427    print "Stopping $eid on $tb\n" if $verbose;
428    &ssh_cmd($user, $host, "/usr/testbed/bin/swapexp -w $pid $eid out", 
429        "swapexp (out)") || return 0;
430    return 1;
431}
432
433
434$pid = $gid = "dummy";              # Default project and group to pass to
435                                    # $tcl_splitter above.  These are total
436                                    # dummy arguments;  the splitter doesn't
437                                    # use them at all, but we supply them to
438                                    # keep our changes to the parser minimal.
439# Argument processing.
440getopts('Ft:c:p:f:ndvN', \%opts);
441$splitter_config = $opts{'c'} || "./splitter.conf";
442$debug = $opts{'d'};
443$verbose = $opts{'v'} || $opts{'d'};
444
445&parse_config("$splitter_config", \%opts) || 
446    die "Cannot read config file $splitter_config: $!\n";
447
448warn "-N does nothing now.  Only one testbeds format supported.\n"
449    if $opts{'N'};
450$fail_soft = $opts{'F'} || $opts{'failsoft'};
451$startem = $opts{'n'} ? 0 : 1;          # If true, start the sub-experiments
452$timeout = $opts{'t'} || $opts{'timeout'};
453$eid = $opts{'experiment'};             # Experiment ID
454$tcl = $opts{'f'} || shift;             # The experiment description
455$master = $opts{'master'};              # Master testbed
456$tmpdir = $opts{'tmpdir'} || $opts{'tempdir'}|| "/tmp"; # tmp files
457$tb_config = $opts{'testbeds'} || "./testbeds"; # testbed configurations
458$local_script_dir = $opts{'scriptdir'}; # Local scripts
459$muxmax  = $opts{'muxlimit'} || 3;      # Number of connections muxed on one
460                                        # gateway
461
462$max_children = $opts{'p'} || $opts{'maxchildren'} 
463    if $opts{'p'} || $opts{'maxchildren'};
464
465$smb_share = $opts{'smbshare'} ||       # Share to mount from the master
466    die "Must give an SMB share\n";
467$project_user = $opts{'smbuser'} ||     # User to mount project dirs as
468    die "Must give an SMB user\n";
469
470# tcl program to split experiments (changed during devel)
471$tcl_splitter = $opts{'tclparse'} || "/usr/testbed/lib/ns2ir/parse.tcl";
472# tclsh to call directly (changed during devel)
473$tclsh = $opts{'tclsh'} || "/usr/local/bin/otclsh";
474
475# Prefix to avoid collisions
476$tmpdir .= "/split$$";
477
478print "Temp files are in $tmpdir\n" if $verbose;
479# Create a workspace
480unless (-d "$tmpdir") {
481    mkdir("$tmpdir") || die "Can't create $tmpdir: $!";
482}
483
484# If the keys are given, use them.  Otherwise create a set under $tmpdir
485
486if ( $opts{'gatewatpubkey'} && $opts{'gatewaysecretkey'}) {
487    $gw_pubkey = $opts{'gatewaypubkey'};
488    $gw_secretkey = $opts{'gatewaysecretkey'};
489}
490else {
491    $keytype = $opts{'gatewaykeytype'} || "rsa";
492    mkdir("$tmpdir/keys") || die "Can't create temoprary key dir: $!\n";
493    $gw_pubkey = "$tmpdir/keys/fed.$keytype.pub";
494    $gw_secretkey = "$tmpdir/keys/fed.$keytype";
495    print "Generating $keytype keys\n" if $verbose;
496    generate_ssh_keys($keytype, $gw_secretkey) || 
497        die "Cannot generate kets:$@\n";
498}
499# Generate the basenames
500($gw_pubkey_base = $gw_pubkey) =~ s#.*/##;
501($gw_secretkey_base = $gw_secretkey) =~ s#.*/##;
502
503
504
505# Validate scripts directory
506for my $s (@scripts) {
507    die "$local_script_dir/$s not in local script directory. Try -d\n"
508        unless -r "$local_script_dir/$s";
509}
510
511die "Must supply file, master and experiment" unless $master && $tcl && $eid;
512
513&parse_testbeds($tb_config, $tbparams) ||
514    die "Cannot testbed congfigurations from $tb_config: $!\n";
515
516# Open a pipe to the splitter program and start it parsing the experiments
517my $pipe = new IO::Pipe;
518# NB no more -p call on parse call.
519$pipe->reader("$tclsh $tcl_splitter -s -x $muxmax -m $master  $pid $gid $eid $tcl") || 
520    die "Cannot execute $tclsh $tcl_splitter -s -x $muxmax -m $master $pid $gid $eid $tcl:$!\n";
521
522# Parsing variables
523my $ctb;                        # Current testbed
524my %allocated;                  # If allocated{$tb} > 0, $tb is in use
525my $destfile;                   # File that the sub-experiment tcl file is
526                                # being written to, or "" if none.  Also used
527                                # for hostnames file.
528my $desthandle;                 # File handle for distfile
529my $gateways;                   # when gateway lists are being processed this
530                                # is the testbed whose gateways are being
531                                # gathered.
532my $control_gateway;            # Control net gateway for the current testbed
533my %active_end;                 # If active_end{"a-b"} > 0 then a is the active
534                                # end of the a <-> b connector pair.
535
536# Parse the splitter output.  This loop creates the sub experiments, gateway
537# configurations and hostnames file
538while (<$pipe>) {
539    # Start of a sub-experiment
540    /^#\s+Begin\s+Testbed\s+\((\w+)\)/ && do {
541        $ctb = $1;
542
543        # If we know the testbed, start collecting its sub experiment tcl
544        # description.  If not, warn the caller and ignore the configuration of
545        # this testbed.
546        if ($tbparams->{$ctb}) {
547            $allocated{$ctb}++; # Keep track of the testbeds allocated
548
549            unless (-d "$tmpdir/$ctb") {
550                mkdir("$tmpdir/$ctb") || die "Can't create $tmpdir/$ctb: $!";
551            }
552            $destfile = "$tmpdir/$ctb/$eid.$ctb.tcl";
553
554            $desthandle = new IO::File(">$destfile") || 
555                die "Cannot open $destfile:$!\n";
556        }
557        else { 
558            warn "No such testbed $ctb\n";
559            $destfile = "";
560        }
561        next;
562    };
563
564    # End of that experiment
565    /^#\s+End\s+Testbed\s+\((\w+)\)/ && do {
566        # Simple syntax check and close out this experiment's tcl description
567        die "Mismatched testbed markers ($1, $ctb)\n" unless ($1 eq $ctb);
568        $desthandle->close();
569        $destfile = $ctb = "";
570        next;
571    };
572
573    # Beginning of a gateway set
574    /^#\s+Begin\s+gateways\s+\((\w+)\)/ && do {
575        $gateways = $1;
576        # If we've heard of this tb, create the config lines for it one at a
577        # time.
578        if ($allocated{$gateways}) {
579            # Just in case.  This directory should already have been created
580            # above.
581            unless (-d "$tmpdir/$gateways") {
582                mkdir("$tmpdir/$gateways") || 
583                    die "Can't create $tmpdir/$gateways: $!";
584            }
585        }
586        else {
587            warn "Gateways given (and ignored) for testbed not in use: " .
588                "$gateways\n";
589            $gateways = 0;
590        }
591        next;
592    };
593    # End of the gateways section.  Output the client config for this testbed
594    /^#\s+End\s+gateways\s+\((\w+)\)/ && do {
595        die "Mismatched gateway markers ($1, $gateways)\n" 
596            unless !$gateways || $gateways == $1;
597
598        if ($control_gateway ) {
599            # Client config
600            my $cc = new IO::File(">$tmpdir/$gateways/client.conf");
601            my $master_project = $tbparams->{$master}->{'project'};
602            die "Can't open $tmpdir/$gateways/client.conf: $!\n" unless $cc;
603            print $cc "ControlGateway: $control_gateway\n";
604            print $cc "SMBShare: $smb_share\n";
605            print $cc "ProjectUser: $project_user\n";
606            print $cc "ProjectName: $master_project\n";
607            $cc->close();
608        }
609        else { warn "No control gateway for $gateways?\n"; }
610           
611        $gateways = 0;
612        next;
613    };
614    # Beginning of the hostnames list.  Collection is always in the hostnames
615    # file.
616    /^#\s+Begin\s+hostnames/ && do {
617        $destfile = "$tmpdir/hostnames";
618        $desthandle = new IO::File(">$destfile") || 
619            die "Can't open $destfile:$!\n";
620        next;
621    };
622    # end of the hostnames list.
623    /^#\s+End\s+hostnames/ && do {
624        $desthandle->close();
625        $destfile = "";
626        next;
627    };
628
629    # Generate gateway configuration info, one file per line
630    $gateways && do {
631        chomp;
632        my($dtb, $myname, $desthost, $type) = split(" ", $_);
633
634        # Many of these are to simplify print statements
635        my $sdomain =                           # domain for the source
636            $tbparams->{$gateways}->{'domain'};
637        my $ddomain =                           # domain for the destination
638            $tbparams->{$dtb}->{'domain'};
639        my $sproject =                          # Project of the destination
640            $tbparams->{$gateways}->{'project'};
641        my $fs =                                # Master fs node (FQDN)
642            $tbparams->{$master}->{'fs'} .  $tbparams->{$master}->{'domain'};
643        my $boss =                              # Master boss node (FQDN)
644            $tbparams->{$master}->{'boss'} .  $tbparams->{$master}->{'domain'};
645        my $remote_script_dir =                 # Remote fed script location
646            "/proj/" . $tbparams->{$dtb}->{'project'} . "/exp/$eid/tmp";
647        my $active;                             # Is this the active side of
648                                                # the connector?
649
650        $sdomain = ".$eid." . $tbparams->{$gateways}->{'project'} . "$sdomain";
651        $ddomain = ".$eid." . $tbparams->{$dtb}->{'project'} . "$ddomain";
652
653        my $conf_file = "$myname$sdomain.gw.conf";
654        # translate to lower case so the `hostname` hack for specifying
655        # configuration files works.
656        $conf_file =~ tr/A-Z/a-z/;
657
658        # If either end of this link is in the master side of the testbed, that
659        # side is the active end. Otherwise the first testbed encountered in
660        # the file will be the active end.  The $active_end variable keeps
661        # track of those decisions
662        if ( $dtb eq $master ) { $active = "false"; }
663        elsif ($gateways eq $master ) { $active = "true"; }
664        elsif ( $active_end{"$dtb-$gateways"} ) { $active="false"; }
665        else { $active_end{"$gateways-$dtb"}++; $active = "true"; }
666
667        # This is used to create the client configuration.
668        $control_gateway = "$myname$sdomain"
669            if $type =~ /(control|both)/;
670
671        # Write out the file
672        my $gwconfig = new IO::File(">$tmpdir/$gateways/$conf_file")|| 
673            die "can't open $tmpdir/$gateways/$conf_file: $!\n";
674
675        print $gwconfig "Active: $active\n";
676        print $gwconfig "TunnelCfg: " . $tbparams->{$gateways}->{'tun'} . "\n";
677        print $gwconfig "BossName: $boss\n";
678        print $gwconfig "FsName: $fs\n";
679        print $gwconfig "Type: $type\n";
680        print $gwconfig "RemoteScriptDir: $remote_script_dir\n";
681        print $gwconfig "Peer: $desthost$ddomain\n";
682        print $gwconfig "Pubkeys: " . 
683            "/proj/$sproject/exp/$eid/tmp/$gw_pubkey_base\n";
684        print $gwconfig "Privkeys: " .
685            "/proj/$sproject/exp/$eid/tmp/$gw_secretkey_base\n";
686        $gwconfig->close();
687
688        # This testbed has a gateway (most will) so make a copy of the keys it
689        # needs in this testbed's subdirectory.  start_segment will transfer
690        # them.
691        unless (-r "$tmpdir/$gateways/$gw_pubkey_base" ) {
692            copy($gw_pubkey, "$tmpdir/$gateways/$gw_pubkey_base") ||
693                die "Can't copy pubkeys ($gw_pubkey to " . 
694                    "$tmpdir/$gateways/$gw_pubkey_base): $!\n";
695        }
696        if ($active eq "true" ) {
697            unless (-r "$tmpdir/$gateways/$gw_secretkey_base" ) {
698                copy($gw_secretkey, "$tmpdir/$gateways/$gw_secretkey_base") ||
699                    die "Can't copy secret keys ($gw_secretkey to " . 
700                        "$tmpdir/$gateways/$gw_secretkey_base): $!\n";
701            }
702        }
703
704        #done processing gateway entry, ready for next line
705        next; 
706    };
707    (/^#\s+Begin\s+tarfiles/../^#\s+End\s+tarfiles/) && do {
708        next if /^#/;
709        chomp;
710        push(@tarfiles, $_);
711        next;
712    };
713    (/^#\s+Begin\s+rpms/../^#\s+End\s+rpms/) && do {
714        next if /^#/;
715        chomp;
716        push(@rpms, $_);
717        next;
718    };
719
720    next unless $destfile;  # Unidentified testbed, ignore config
721    # local copies that can be used in the substitutions below
722    my $gwtype = $tbparams->{$ctb}->{'gwtype'};
723    my $gwimage = $tbparams->{$ctb}->{'gwimage'};
724    my $mgwstart = $tbparams->{$ctb}->{'mgwstart'};
725    my $mexpstart = $tbparams->{$ctb}->{'mexpstart'};
726    my $gwstart = $tbparams->{$ctb}->{'gwstart'};
727    my $expstart = $tbparams->{$ctb}->{'expstart'};
728    my $project = $tbparams->{$ctb}->{'project'};
729
730    # Substitute variables
731    s/GWTYPE/$gwtype/g;
732    s/GWIMAGE/$gwimage/g;
733    if ($ctb eq $master ) {
734        s/GWSTART/$mgwstart/g;
735        s/EXPSTART/$mexpstart/g;
736    }
737    else {
738        s/GWSTART/$gwstart/g;
739        s/EXPSTART/$expstart/g;
740    }
741    # XXX: oh is this bad
742    s#GWCONF#FEDDIR\`hostname\`.gw.conf#g;
743    s#PROJDIR#/proj/$project/#g;
744    s#EID#$eid#g;
745    s#FEDDIR#/proj/$project/exp/$eid/tmp/#g;
746    print $desthandle $_;
747}
748$pipe->close();
749die "No nodes in master testbed ($master)\n" unless $allocated{$master};
750
751# Copy tarfiles and rpms needed at remote sites to the staging directories.
752# Start_segment will distribute them
753for my $t  (@tarfiles) {
754    die "tarfile '$t' unreadable: $!\n" unless -r $t;
755    unless (-d "$tmpdir/tarfiles") {
756        mkdir("$tmpdir/tarfiles") || 
757            die "Can't create $tmpdir/tarfiles:$!\n";
758    }
759    copy($t, "$tmpdir/tarfiles") || 
760        die "Can't copy $t to  $tmpdir/tarfiles:$!\n";
761}
762
763for my $r  (@rpms) {
764    die "rpm '$r' unreadable: $!\n" unless -r $r;
765    unless (-d "$tmpdir/rpms") {
766        mkdir("$tmpdir/rpms") || 
767            die "Can't create $tmpdir/rpms:$!\n";
768    }
769    copy($r, "$tmpdir/rpms") || 
770        die "Can't copy $r to  $tmpdir/rpms:$!\n";
771}
772
773exit(0) unless $startem;
774
775my %started;                # If $started{$tb} then $tb successfully started
776my %child;                  # If $child{$pid} then a process with that pid is
777                            # working on a starting a segment
778my $nworking = 0;           # Number of children working on swapin
779my $pid;                    # Scratch variable for pids
780
781# Start up the slave sub-experiments first
782TESTBED:
783for my $tb  (keys %allocated) {
784    if ( $tb ne $master ) {
785        while ( $nworking == $max_children ) {
786            print "Waiting for a child process to complete\n" if $verbose;
787            if (($pid = wait()) != -1 ) {
788                # The $? >> 8 is the exit code of the subprocess, which is
789                # non-zero if the &start_segment routine failed.
790                my $exit_code = ($? >> 8);
791
792                print "Child $pid completed exit code ($exit_code)\n"
793                    if $verbose;
794                $nworking--;
795                $started{$child{$pid}}++ unless $exit_code;
796                if ($child{$pid} ) { delete $child{$pid}; }
797                else { warn "Reaped a pid we did not start?? ($pid)\n"; }
798                last TESTBED if $exit_code;
799            }
800            else { warn "wait returned without reaping: $!\n"; }
801        }
802        if ( $pid = fork() ) {
803            # Parent process
804            $nworking ++;
805            $child{$pid} = $tb;
806            print "Started process $pid to start testbed $tb\n"
807                if $verbose;
808        }
809        else {
810            # Child.  Note that we reverse the sense of the return code when it
811            # becomes an exit value.  Zero exit values indicate success.
812            exit(!&start_segment($tb, $eid, $tbparams, $timeout));
813        }
814    }
815}
816
817# Now wait for any still running processes.
818while ( $nworking ) {
819    print "Waiting for a child process to complete ($nworking running)\n" 
820        if $verbose;
821    if (($pid = wait()) != -1 ) {
822        # The $? >> 8 is the exit code of the subprocess, which is
823        # non-zero if the &start_segment routine failed.
824        my $exit_code = ($? >> 8);
825
826        print "Child $pid completed exit code ($exit_code)\n"
827            if $verbose;
828        $nworking--;
829        $started{$child{$pid}}++ unless $exit_code;
830        if ($child{$pid} ) { delete $child{$pid}; }
831        else { warn "Reaped a pid we did not start?? ($pid)\n"; }
832    }
833    else { warn "wait returned without reaping: $!\n"; }
834}
835
836# Now the master
837if (&start_segment($master, $eid, $tbparams, $timeout)) { 
838    $started{$master}++;
839}
840
841# If any testbed failed, swap the rest out.
842if ( !$fail_soft && scalar(keys %started) != scalar(keys %allocated)) {
843    for my $tb (keys %started) { &stop_segment($tb, $eid, $tbparams); }
844    print "Error starting experiment\n";
845    exit(1);
846}
847print "Experiment started\n";
848print "Deleting $tmpdir (-d to leave them in place)\n" if $verbose && !$debug;
849system("rm -rf $tmpdir") unless $debug;
850exit(0);    # set the exit value
851
852=pod
853
854=head1 NAME
855
856B<splitter.pl>
857
858=head1 SYNOPSIS
859
860B<splitter.pl> [B<-ndF>] [B<-t> I<secs>] [B<-c> F<config_file>]
861    [B<-f> F<experiment_tcl>] [B<-p> I<max_procs>] [F<experiment_tcl>]
862
863=head1 DESCRIPTION
864
865B<splitter.pl> invokes the DETER experiment parser to split an annotated
866experiment into multiple sub-experments and instantiates the sub-experiments on
867their intended testbeds.  Annotation is accomplished using the
868tb-set-node-testbed command, added to the parser.
869
870Much of the script's behavior depends on the configuration file, specified with
871the B<-c> flag and defaulting to F<./splitter.conf>.
872
873The testbed labels supplied in the B<tb-set-node-testbed> command are
874meaningful based on their presence in the testbeds file.  that file can be
875specified in the configuration file using the B<Testbeds> directive, and
876defaults to F<./testbeds>.  The syntax is described below.
877
878Most of the intermediate files are staged in a sub-directory of a temporary
879files directory and deleted at the end of the script.  Specifying the B<-d>
880flag on the command line avoids the deletion for debbugging.  By default the
881temporary files directory is directory is F</tmp> and can be reset in the
882configuration file using the B<Tmpdir> directive.  Intermediate files are
883stored under a subdirectory formed by adding the process ID of the splitter
884process.  For example, if the temporary files directory is F</tmp> and the
885B<splitter.pl> process ID is 2323, the temporary files will be stored in
886F</tmp/split2323/>.
887
888The expreriment is split out into one experiment description per testbed in the
889temporary directory named as F<experiment.testbed.tcl> where the experiment is
890the experiment ID given in the configuration file, and the testbed is the
891tb-set-node-testbed parameter for the nodes in the file.
892
893If the B<-n> option is absent the sub-experiments are then instantiated on
894their testbeds.  (Here B<-n> is analogous to its use in L<make(1)>).
895Per-testbed parameters are set in the testbeds file.  Sub-experiments on
896slave testbeds are instantiated in a random order, but the master testbed is
897currently instantiated last.
898
899Slave testbeds can be swapped in in parallel by specifying the B<-p> parameter
900and the maximum number of simultaneous processes to start.
901
902Scripts to start federation (the federation kit) are copied into the local
903experiment's tmp file - e.g., F</proj/DETER/exp/simple-split/tmp>.  These are
904taken from the directory given by the B<ScriptDir> directive in the
905configuration file.
906
907If B<-t> is given the parameter is treated as a parameter to B<Timeout> in
908F<splitter.conf>.
909
910If any sub-experiment fails to instantiate, the other sub-exeriments are
911swapped out.  B<-F> avoids this swap out, which can also be specified as
912B<SoftFail: true> in F<splitter.conf>
913
914=head2 Configuration File
915
916The configuration file is a simple set of colon-separated parameters and
917values.  A configuration file must be present, either specified in the B<-c>
918flag or the default F<./splitter.conf>.  All the parameter names are case
919insensitive, but should not include any whitespace.  Parameter values may
920include whitespace, but no newlines.
921
922Possible parameters are:
923
924=over 5
925
926=item Experiment
927
928The name of the experiment on the various testbeds
929
930=item Master
931
932The master testbed label from the testbeds file, described below.
933
934=item Testbeds
935
936The testbeds file described below, giving per-testbed parameters.  If this
937directive is absent the testbeds file defaults to F<./testbeds>
938
939=item ScriptDir
940
941Location of the default federation scripts, i.e. the federation kit.
942
943=item GatewayPubkey
944
945=item GatewaySecretKey
946
947The names of the files containing secret and public keys to use in setting up
948tunnels between testbeds.  If given they are used, otherwise keys are generated.
949
950=item GatewayKeyType
951
952This controls the kind of SSH keys generated to configure the geatways.  If
953given this must be B<dsa> or B<rsa>, and it defaults to B<rsa>.  The parameter
954is csase insensitive.
955
956=item TmpDir
957
958=item TempDir
959
960The directory where temporary files are created.  These are synonyms, but
961should both be specified, B<TmpDir> has priority.  If neither is specified,
962F</tmp> is used.
963
964=item SMBShare
965
966The SMB share on the master testbed that will be exported to remote clients.
967
968=item SMBUser
969
970The experiment user to mount project directories as.  This user needs to be a
971member of the exported experiment - that is one of the users in the project
972containing this experiment on the master testbed.
973
974=item Timeout
975
976Value in seconds after which a swap-in operatioin will be considered a success.
977Often long swap-ins will hang when there are partial failures.  This works
978around this issue.  (This behavior can be requested on the command line by
979specifying B<-t> I<secs>.)
980
981=item FailSoft
982
983If not set, failure of any sub experiment swaps the rest out.  Setting this to
984any value avoids this swap out.  (This behavior can be requested on the command
985line by specifying B<-F>.)
986
987=item MuxLimit
988
989The maximum bumber of links/lans carried by one gateway pair
990
991=item Tclparse
992
993The pathname to the experiment parsing program.  Only developers should set
994this.
995
996=item Tclsh
997
998The pathname to the local oTcl shell.  Only developers should set
999this.
1000
1001=back
1002
1003=head2 Testbeds file
1004
1005The configuration file (F<./testbeds> unless overridden by B<-c>) is a file of
1006scoped attribute-value pairs where each attribute is specified on a separate
1007line of the configuration file.  Each testbed's parameters are preceeded by the
1008testbed label in brackets ([]) on a line by itself.  After that the parameters
1009are specified as parameter: value.  This is essentially the same format as the
1010configuration file.  Parameters are:
1011
1012=over 4
1013
1014=item User
1015
1016The user under which to make requests to this testbed.  The user running
1017B<splitter.pl> must be able to authenicate as this user under L<ssh(1)> to this
1018testbed.
1019
1020=item OpsNode
1021
1022The host name of the testbed's ops node.  The user calling B<splitter.pl> must
1023be able to execute commands on this host via L<ssh(1)>.
1024
1025=item Domain
1026
1027The domain of nodes in this testbed (including the ops host).  This parameter
1028should always start with a period.
1029
1030=item Project
1031
1032The project under which to instantiate sub-experiments on this testbed.
1033
1034=item ConnectorType
1035
1036The node type for inter-testbed connector nodes on this testbed.
1037
1038=item SlaveNodeStartCmd
1039
1040The start command to run on experimental nodes when this testbed is used as a
1041slave.  In all the start commands the following string substitutions are made:
1042
1043=over 10
1044
1045=item FEDDIR
1046
1047The local experiment's federation scripts directory.  Each local experiment
1048will have this replaced by the scripts directory on its local boss.
1049
1050=item GWCONF
1051
1052The full pathname of the gateway configuration file.  As with FEDDIR, this is
1053on the local boss.
1054
1055=item PROJDIR
1056
1057The project directory on the local boss.
1058
1059=item EID
1060
1061The local experiment name.
1062
1063=back
1064
1065All startcmds specified in F<testbeds> undergo these expansions.
1066
1067=item SlaveConnectorStartCmd
1068
1069The start command to run on gateway nodes when this testbed is used as a slave.
1070The same string substitutions are made in this command as in SlaveNodeStartCmd.
1071
1072=item MasterNodeStartCmd
1073
1074The start command to run on experimental nodes when this testbed is used as a
1075master.  The same string substitutions are made in this command as in
1076SlaveNodeStartCmd.
1077
1078=item MasterConnectorStartCmd
1079
1080The start command to run on gateway nodes when this testbed is used as a
1081master.  The same string substitutions are made in this command as in
1082SlaveNodeStartCmd.
1083
1084=item ConnectorImage
1085
1086The disk image to be loaded on a gateway node on this testbed.
1087
1088=item FileServer
1089
1090The node in the master testbed from which filesystems are mounted.
1091
1092=item Boss
1093
1094The node in the master testbed that controls the testbed.
1095
1096=item TunnelCfg
1097
1098True if the connector needs to do DETER federation.  This parameter will
1099probably be removed.
1100
1101
1102=back
1103
1104=head1 ENVIRONMENT
1105
1106B<splitter.pl> does not directly make use of environment variables, but calls
1107out to L<ssh(1)> and (indirectly) to L<sh(1)>, which may be influenced by the
1108environment.
1109
1110=head1 BUGS
1111
1112A deprecated B<-N> flag was used to select testbeds file format.  Only one
1113format is supported now, and B<-N> generates a warning, but otherwise does not
1114affect B<splitter.pl>.
1115
1116=head1 SEE ALSO
1117
1118L<sh(1)>, L<ssh(1)>
1119
1120=cut
Note: See TracBrowser for help on using the repository browser.