Changeset 5479c80


Ignore:
Timestamp:
Sep 29, 2007 5:45:48 PM (17 years ago)
Author:
Ted Faber <faber@…>
Branches:
axis_example, compt_changes, info-ops, master, version-1.30, version-2.00, version-3.01, version-3.02
Children:
e06b224
Parents:
d53dda5
Message:

Timeout critical sshes in setting up experiments. The commands don't seem to
terminate when partial swap-ins are accomplished, or perhaps after long
swap-ins. This provides a way to move on.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • fedkit/splitter.pl

    rd53dda5 r5479c80  
    3030my $tclsh;                      # tclsh to call directly (changed during devel)
    3131my @tarfiles;                   # Tarfiles in use by this experiment
     32my $timeout;                    # The timeout to use for experiment swap ins
    3233my %opts;                       # Parsed options
    3334
     
    158159# command to execute, and a prefix to be placed on a message generated if the
    159160# command fails.   On failure print a warning if a warning prefix was given and
    160 # return false.
     161# return false.  If timeout is given fork a process and set an alarm of that
     162# many seconds.  Timeouts also return 0;
    161163sub ssh_cmd {
    162     my($user, $host, $cmd, $wname) = @_;
    163 
    164     # XXX system with a relative pathname is sort of gross
    165     system ("ssh $user\@$host $cmd");
    166     if ($?) {
    167         warn "$wname failed $?\n" if $wname;
    168         return 0;
    169     }
    170     else { return 1; }
     164    my($user, $host, $cmd, $wname, $timeout) = @_;
     165    my $pid;                # Child pid
     166
     167    if ( $timeout ) {
     168        if ( $pid = fork () ) {
     169            # Parent process
     170            # The eval acts as a signal catcher.  If the alarm goes off inside
     171            # the eval, the die will put "alarm\n" into $@, otherwise the
     172            # return value of the execution in the child process will be used.
     173            my $rv = eval {
     174                local $SIG{'ALRM'} = sub{ die "alarm\n"; };
     175                my $rv;
     176
     177                alarm $timeout;
     178                $rv = waitpid($pid, 0);
     179                alarm 0;
     180                $rv;
     181            };
     182
     183            # If the eval succeeded, $@ will be null and we can use $rv, which
     184            # is the return code from the subprocess.  If the eval timed out,
     185            # print a warning and assume the best.
     186            if ($@ eq "alarm\n" ) {
     187                warn "$wname timed out - pid $pid still live\n";
     188                return 1;
     189            }
     190            else {
     191                return $rv;
     192            }
     193        }
     194        else {
     195            # Child process
     196            # XXX system with a relative pathname is sort of gross
     197            exec("ssh $user\@$host $cmd");
     198            exit 0;
     199        }
     200    }
     201    else {
     202        # No timeout, do all the work here.
     203        # XXX system with a relative pathname is sort of gross
     204        system ("ssh $user\@$host $cmd");
     205        if ($?) {
     206            warn "$wname failed $?\n" if $wname;
     207            return 0;
     208        }
     209        else { return 1; }
     210    }
    171211}
    172212
     
    220260# messages are printed.
    221261sub start_segment {
    222     my($tb, $eid, $tbparams) = @_;          # testbed, experiment ID, and
    223                                             # per-testbed parameters
     262    my($tb, $eid, $tbparams, $timeout) = @_;# testbed, experiment ID,
     263                                            # per-testbed parameters and remote
     264                                            # swap-in timeout
    224265    my $host =                              # Host name of remote ops (FQDN)
    225266        $tbparams->{$tb}->{'host'} . $tbparams->{$tb}->{'domain'};
     
    280321        print "Modifying $eid in place on $tb\n" if $verbose;
    281322        &ssh_cmd($user, $host, "/usr/testbed/bin/modexp -r -s -w $pid " .
    282             "$eid $tclfile", "modexp") || return 0;
     323            "$eid $tclfile", "modexp", $timeout) || return 0;
    283324        return 1;
    284325    }
     
    304345        # Now start up
    305346        &ssh_cmd($user, $host, "/usr/testbed/bin/swapexp -w $pid $eid in",
    306             "swapexp") || return 0;
     347            "swapexp", $timeout) || return 0;
    307348        return 1;
    308349    }
     
    330371        print "Swapping $eid in on $tb\n" if $verbose;
    331372        &ssh_cmd($user, $host, "/usr/testbed/bin/swapexp -w $pid $eid in",
    332             "swapexp") || return 0;
     373            "swapexp", $timeout) || return 0;
    333374        return 1;
    334375    }
     
    364405                                    # keep our changes to the parser minimal.
    365406# Argument processing.
    366 getopts('c:f:ndvN', \%opts);
     407getopts('t:c:f:ndvN', \%opts);
    367408$splitter_config = $opts{'c'} || "./splitter.conf";
    368409$debug = $opts{'d'};
     
    374415
    375416$startem = $opts{'n'} ? 0 : 1;          # If true, start the sub-experiments
     417$timeout = $opts{'t'} || $opts{'timeout'};
    376418$eid = $opts{'experiment'};             # Experiment ID
    377419$tcl = $opts{'f'} || shift;             # The experiment description
     
    541583            $cc->close();
    542584        }
    543         else { warn "No control gateway for $gateways?"; }
     585        else { warn "No control gateway for $gateways?\n"; }
    544586           
    545587        $gateways = 0;
     
    695737for my $tb  (keys %allocated) {
    696738    if ($tb ne $master) {
    697         if (&start_segment($tb, $eid, $tbparams)) { $started{$tb}++; }
     739        if (&start_segment($tb, $eid, $tbparams, $timeout)) { $started{$tb}++; }
    698740        else { last TESTBED; }
    699741    }
     
    701743
    702744# Now the master
    703 if (&start_segment($master, $eid, $tbparams)) {
     745if (&start_segment($master, $eid, $tbparams, $timeout)) {
    704746    $started{$master}++;
    705747}
Note: See TracChangeset for help on using the changeset viewer.