Show
Ignore:
Timestamp:
09/29/07 17:45:48 (5 years ago)
Author:
Ted Faber <faber@…>
Children:
e06b224874a3a79af314dde1704f9ec9842e8743
Parents:
d53dda59ec0075b44350bcc347aa87f075a73452
git-committer:
Ted Faber <faber@isi.edu> / 2007-09-30T00:45:48Z+0000
Message:

Timeout critical sshes in setting up experiments. The commands don't seem to
terminate when partial swap-ins are accomplished, or perhaps after long
swap-ins. This provides a way to move on.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • fedkit/splitter.pl

    rbc01820 r5479c80  
    3030my $tclsh;                      # tclsh to call directly (changed during devel) 
    3131my @tarfiles;                   # Tarfiles in use by this experiment 
     32my $timeout;                    # The timeout to use for experiment swap ins 
    3233my %opts;                       # Parsed options 
    3334 
     
    158159# command to execute, and a prefix to be placed on a message generated if the 
    159160# command fails.   On failure print a warning if a warning prefix was given and 
    160 # return false. 
     161# return false.  If timeout is given fork a process and set an alarm of that 
     162# many seconds.  Timeouts also return 0; 
    161163sub ssh_cmd { 
    162     my($user, $host, $cmd, $wname) = @_; 
    163  
    164     # XXX system with a relative pathname is sort of gross 
    165     system ("ssh $user\@$host $cmd"); 
    166     if ($?) { 
    167         warn "$wname failed $?\n" if $wname; 
    168         return 0; 
    169     } 
    170     else { return 1; } 
     164    my($user, $host, $cmd, $wname, $timeout) = @_; 
     165    my $pid;                # Child pid 
     166 
     167    if ( $timeout ) { 
     168        if ( $pid = fork () ) { 
     169            # Parent process 
     170            # The eval acts as a signal catcher.  If the alarm goes off inside 
     171            # the eval, the die will put "alarm\n" into $@, otherwise the 
     172            # return value of the execution in the child process will be used. 
     173            my $rv = eval { 
     174                local $SIG{'ALRM'} = sub{ die "alarm\n"; }; 
     175                my $rv; 
     176 
     177                alarm $timeout; 
     178                $rv = waitpid($pid, 0); 
     179                alarm 0; 
     180                $rv; 
     181            }; 
     182 
     183            # If the eval succeeded, $@ will be null and we can use $rv, which 
     184            # is the return code from the subprocess.  If the eval timed out, 
     185            # print a warning and assume the best. 
     186            if ($@ eq "alarm\n" ) { 
     187                warn "$wname timed out - pid $pid still live\n"; 
     188                return 1; 
     189            } 
     190            else { 
     191                return $rv; 
     192            } 
     193        } 
     194        else { 
     195            # Child process 
     196            # XXX system with a relative pathname is sort of gross 
     197            exec("ssh $user\@$host $cmd"); 
     198            exit 0; 
     199        } 
     200    } 
     201    else { 
     202        # No timeout, do all the work here. 
     203        # XXX system with a relative pathname is sort of gross 
     204        system ("ssh $user\@$host $cmd"); 
     205        if ($?) { 
     206            warn "$wname failed $?\n" if $wname; 
     207            return 0; 
     208        } 
     209        else { return 1; } 
     210    } 
    171211} 
    172212 
     
    220260# messages are printed. 
    221261sub start_segment { 
    222     my($tb, $eid, $tbparams) = @_;          # testbed, experiment ID, and  
    223                                             # per-testbed parameters 
     262    my($tb, $eid, $tbparams, $timeout) = @_;# testbed, experiment ID,  
     263                                            # per-testbed parameters and remote 
     264                                            # swap-in timeout 
    224265    my $host =                              # Host name of remote ops (FQDN) 
    225266        $tbparams->{$tb}->{'host'} . $tbparams->{$tb}->{'domain'}; 
     
    280321        print "Modifying $eid in place on $tb\n" if $verbose; 
    281322        &ssh_cmd($user, $host, "/usr/testbed/bin/modexp -r -s -w $pid " .  
    282             "$eid $tclfile", "modexp") || return 0; 
     323            "$eid $tclfile", "modexp", $timeout) || return 0; 
    283324        return 1; 
    284325    } 
     
    304345        # Now start up 
    305346        &ssh_cmd($user, $host, "/usr/testbed/bin/swapexp -w $pid $eid in",  
    306             "swapexp") || return 0; 
     347            "swapexp", $timeout) || return 0; 
    307348        return 1; 
    308349    } 
     
    330371        print "Swapping $eid in on $tb\n" if $verbose; 
    331372        &ssh_cmd($user, $host, "/usr/testbed/bin/swapexp -w $pid $eid in",  
    332             "swapexp") || return 0; 
     373            "swapexp", $timeout) || return 0; 
    333374        return 1; 
    334375    } 
     
    364405                                    # keep our changes to the parser minimal. 
    365406# Argument processing. 
    366 getopts('c:f:ndvN', \%opts); 
     407getopts('t:c:f:ndvN', \%opts); 
    367408$splitter_config = $opts{'c'} || "./splitter.conf"; 
    368409$debug = $opts{'d'}; 
     
    374415 
    375416$startem = $opts{'n'} ? 0 : 1;          # If true, start the sub-experiments 
     417$timeout = $opts{'t'} || $opts{'timeout'}; 
    376418$eid = $opts{'experiment'};             # Experiment ID 
    377419$tcl = $opts{'f'} || shift;             # The experiment description 
     
    541583            $cc->close(); 
    542584        } 
    543         else { warn "No control gateway for $gateways?"; } 
     585        else { warn "No control gateway for $gateways?\n"; } 
    544586             
    545587        $gateways = 0; 
     
    695737for my $tb  (keys %allocated) { 
    696738    if ($tb ne $master) { 
    697         if (&start_segment($tb, $eid, $tbparams)) { $started{$tb}++; } 
     739        if (&start_segment($tb, $eid, $tbparams, $timeout)) { $started{$tb}++; } 
    698740        else { last TESTBED; } 
    699741    } 
     
    701743 
    702744# Now the master 
    703 if (&start_segment($master, $eid, $tbparams)) {  
     745if (&start_segment($master, $eid, $tbparams, $timeout)) {  
    704746    $started{$master}++; 
    705747}