Context Navigation

source: fedkit/splitter.pl @ e863014

axis_examplecompt_changesinfo-opsversion-1.30version-2.00version-3.01version-3.02

Last change on this file since e863014 was e863014, checked in by Ted Faber <faber@…>, 17 years ago

Allow slave testbeds to be swapped in concurrently.

-p <max_procs> sets the maximum number of subnets we can do at once.

Property mode set to 100644

File size: 35.8 KB

Line
1	#!/usr/bin/perl
2
3	use strict;
4
5	use Getopt::Std;
6	use IO::File;
7	use IO::Dir;
8	use IO::Pipe;
9	use File::Copy;
10
11	my @scripts = ("federate.sh", "smbmount.pl", "make_hosts", "fed-tun.pl");
12	my $local_script_dir = ".";
13	my($pid, $gid); # Process and group IDs for calling parse.tcl
14	my $splitter_config; # Configuration file
15	my $debug; # True if thecalled in debug mode
16	my $verbose; # True for extra progress reports
17	my $startem; # If true, start the sub-experiments
18	my $eid; # Experiment ID
19	my $tcl; # The experiment description (topology file)
20	my $master; # Master testbed
21	my $tmpdir; # tmp files
22	my $tb_config; # testbed configurations
23	my $smb_share; # Share to mount from the master
24	my $project_user; # User to mount project dirs as
25	my($gw_pubkey, $gw_pubkey_base);# Connector pubkey (full path & basename)
26	my($gw_secretkey, $gw_secretkey_base);# Connector secret key (full path &
27	# basename)
28	my $tcl_splitter; # tcl program to split experiments
29	# (changed during devel)
30	my $tclsh; # tclsh to call directly (changed during devel)
31	my @tarfiles; # Tarfiles in use by this experiment
32	my @rpms; # Rpms in use by this experiment
33	my $timeout; # The timeout to use for experiment swap ins
34	my %opts; # Parsed options
35
36	my $tbparams = {}; # Map of the per-testbed parameters from the
37	# testbeds file. It is a reference to a hash
38	# of hashes (because it's passed around a bunch
39	# and it's nicer to have one access pattern
40	# throughout the script, in the main loop and
41	# the subroutines). That access is exemplified
42	# by $tbparams->{'deter'}->{'domain'} which is
43	# the domain parameter of the DETER testbed.
44	my $fail_soft; # Do not swap failed sub-experiments out
45	my $max_children=1; # Maximum number of simultaneous swap-ins
46
47	# Parse the config file. The format is a colon-separated parameter name
48	# followed by the value of that parameter to the end of the line. This parses
49	# that format and puts the parameters into the referenced hash. Parameter
50	# names are mapped to lower case, parameter values are unchanged. Returns 0 on
51	# failure (e.g. file open) and 1 on success.
52	sub parse_config {
53	my($file, $href) = @_;
54	my $fh = new IO::File($file);
55
56	unless ($fh) {
57	warn "Can't open $file: $!\n";
58	return 0;
59	}
60
61	while (<$fh>) {
62	next if /^\s#/ \|\| /^\s$/; # Skip comments & blanks
63	chomp;
64	/^([^:]+):\s(.)/ && do {
65	my $key = $1;
66
67	$key =~ tr/A-Z/a-z/;
68	$href->{$key} = $2;
69	next;
70	};
71	warn "Unparasble line in $file: $_\n";
72	}
73	$fh->close(); # It will close when it goes out of scope, but...
74	return 1;
75	}
76
77	# Parse an easier-to-read testbeds file (the original was comma-separated
78	# unadorned strings). The format is a testbed scope as [testbed] followed by
79	# the colon-separated attribute-value pairs for the testbed. Right now these
80	# go into a set of global hashes indexed by testbed, but that should probably
81	# change.
82
83	sub parse_testbeds {
84	my($file, $tbparams) = @_; # Testbeds file and parameter hash
85	my $fh = new IO::File($file); # Testbeds filehandle
86	my $tb; # Current testbed
87	# Convert attribute in the file to tbparams hash key
88	my %attr_to_hash = (
89	"opsnode" => "host",
90	"user" => "user",
91	"domain" => "domain",
92	"project" => "project",
93	"connectortype" => "gwtype",
94	"slavenodestartcmd" => "expstart",
95	"slaveconnectorstartcmd" => "gwstart",
96	"masternodestartcmd" => "mexpstart",
97	"masterconnectorstartcmd" => "mgwstart",
98	"connectorimage" => "gwimage",
99	"fileserver" => "fs",
100	"boss" => "boss",
101	"tunnelcfg" => "tun"
102	);
103
104
105	unless ($fh) {
106	warn "Can't open $file: $!\n";
107	return 0;
108	}
109
110	while (<$fh>) {
111	next if /^\s#/ \|\| /^\s$/; # Skip comments & blanks
112	chomp;
113	/^\s\[(.)\]/ && do {
114	$tb = $1;
115	$tbparams->{$tb} = {} unless $tbparams->{$tb};
116	next;
117	};
118
119	/^([^:]+):\s(.)/ && do {
120	unless ($tb) {
121	warn "Ignored attribute definition before testbed " .
122	"defined in $file: $_\n";
123	next;
124	}
125	my $key = $1;
126	$key =~ tr/A-Z/a-z/;
127	my $var = $attr_to_hash{$key};
128
129	if ($var) { $tbparams->{$tb}->{$var} = $2; }
130	else { warn "Unknown keyword $key in $file\n"; }
131
132	next;
133	};
134	warn "Unparasble line in $file: $_\n";
135	}
136	$fh->close(); # It will close when it goes out of scope, but...
137	return 1;
138	}
139
140
141	# use scp to transfer a file, reporting true if successful and false otherwise.
142	# Parameters are the local file name, the ssh host destination (either hostname
143	# oe user@host), and an optional destination file name or directory. If no
144	# destination is given, the file is transferred to the given user's home
145	# directory. If only a machine is given in the ssh host destination, the
146	# current user is used.
147	sub scp_file {
148	my($file, $user, $host, $dest) = @_;
149
150	# XXX system with a relative pathname is sort of gross
151	system("scp $file $user\@$host:$dest");
152	if ($?) {
153	warn "scp failed $?\n";
154	return 0;
155	}
156	else { return 1; }
157	}
158
159	# use ssh to execute the given command on the machine (and as the user) in
160	# $where. Parameters are the ssh destination directive ($where) and the
161	# command to execute, and a prefix to be placed on a message generated if the
162	# command fails. On failure print a warning if a warning prefix was given and
163	# return false. If timeout is given fork a process and set an alarm of that
164	# many seconds. Timeouts also return 0;
165	sub ssh_cmd {
166	my($user, $host, $cmd, $wname, $timeout) = @_;
167	my $pid; # Child pid
168
169	$timeout = 0 unless $timeout; # Force default timeout
170
171	if ( $pid = fork () ) {
172	# Parent process
173	# The eval acts as a signal catcher. If the alarm goes off inside
174	# the eval, the die will put "alarm\n" into $@, otherwise the
175	# return value of the execution in the child process will be used.
176	my $rv = eval {
177	local $SIG{'ALRM'} = sub{ die "alarm\n"; };
178	my $rv;
179
180	alarm $timeout;
181	$rv = waitpid($pid, 0);
182	alarm 0;
183	$rv;
184	};
185
186	# If the eval succeeded, $@ will be null and we can use $rv, which
187	# is the return code from the subprocess. If the eval timed out,
188	# print a warning and assume the best.
189	if ($@ eq "alarm\n" ) {
190	warn "$wname timed out - pid $pid still live\n";
191	return 1;
192	}
193	else {
194	return $rv;
195	}
196	}
197	else {
198	# Child process
199	exec("ssh $user\@$host $cmd");
200	exit 0;
201	}
202	}
203
204	# Ship local copies of the federation scripts out to the given host. If any of
205	# the script transfers fails, return 0. The scripts to transfer are from the
206	# global @scripts and are found locally in $local_script_dir (another global).
207	sub ship_scripts {
208	my($host, $user, $dest_dir) = @_; # Where, who, where remotely
209	my $s;
210
211	&ssh_cmd($user, $host, "mkdir -p $dest_dir");
212	for $s (@scripts) {
213	&scp_file("$local_script_dir/$s", $user, $host, $dest_dir) \|\|
214	return 0;
215	}
216	return 1;
217	}
218
219	# Ship per-testbed configuration generated by this script to the remote /proj
220	# directories on the remote testbeds
221	sub ship_configs {
222	my($host, $user, $src_dir, $dest_dir) = @_; # Where, who, where remotely
223	my($d, $f);
224
225	$d = IO::Dir->new($src_dir) \|\| return 0;
226
227	# All directories under $tmpdir are 770 so we can delete them later.
228	&ssh_cmd($user, $host, "mkdir -p $dest_dir") \|\| return 0;
229	&ssh_cmd($user, $host, "chmod 770 $dest_dir") \|\| return 0;
230	while ( $f = $d->read()) {
231	next if $f =~ /^\./;
232	if ( -d "$src_dir/$f" ) {
233	&ship_configs($host, $user, "$src_dir/$f", "$dest_dir/$f") \|\|
234	return 0;
235	}
236	else {
237	&scp_file("$src_dir/$f", $user, $host, $dest_dir) \|\| return 0;
238	}
239	}
240	return 1;
241	}
242
243	# Start a sub section of the experiment on a given testbed. The testbed and
244	# the user to start the experiment as are pulled from the global per-testbed
245	# hash, passed in as $tbparams, as is the project name on the remote testbed.
246	# Parameters are the testbed and the experiment id. Configuration files are
247	# scp-ed over to the target testbed from the global $tmpdir/$tb directory.
248	# Then the current state of the experiment determined using expinfo. From that
249	# state, the experiment is either created, modified or spapped in. If
250	# everything succeeds, true is returned. If the global verbose is set progress
251	# messages are printed.
252	sub start_segment {
253	my($tb, $eid, $tbparams, $timeout) = @_;# testbed, experiment ID,
254	# per-testbed parameters and remote
255	# swap-in timeout
256	my $host = # Host name of remote ops (FQDN)
257	$tbparams->{$tb}->{'host'} . $tbparams->{$tb}->{'domain'};
258	my $user = $tbparams->{$tb}->{'user'}; # user to pass to ssh
259	my $pid = $tbparams->{$tb}->{'project'};# remote project to start the
260	# experiment under
261	my $tclfile = "./$eid.$tb.tcl"; # Local tcl file with the
262	# sub-experiment
263	my $proj_dir = "/proj/$pid/exp/$eid/tmp"; # Where to stash federation stuff
264	my $tarfiles_dir = "/proj/$pid/tarfiles/$eid"; # Where to stash tarfiles
265	my $rpms_dir = "/proj/$pid/rpms/$eid"; # Where to stash rpms
266	my $to_hostname = "$proj_dir/hosts"; # remote hostnames file
267	my $state; # State of remote experiment
268	my $status = new IO::Pipe; # The pipe to get status
269
270	# Determine the status of the remote experiment
271	$status->reader("ssh $user\@$host /usr/testbed/bin/expinfo $pid $eid") \|\|
272	die "Can't ssh to $user\@$host:$!\n";
273
274	# XXX: this is simple now. Parsing may become more complex
275	while (<$status>) {
276	/State: (\w+)/ && ($state = $1);
277	/No\s+such\s+experiment/ && ($state = "none");
278	}
279	$status->close();
280	print "$tb: $state\n";
281
282	# Copy the experiment definition data over
283	print "transferring subexperiment to $tb\n" if $verbose;
284	&scp_file("$tmpdir/$tb/$tclfile", $user, $host) \|\| return 0;
285	# Clear out any old experiment data; if not deleted, copies over it by
286	# different users will fail.
287	# (O /bin/csh, how evil thou art. The -c and the escaped single quotes
288	# force the /bin/sh interpretation of the trailing * (which we need to keep
289	# tmp around)) Again, this needs to be done more properly once we have a
290	# non-ssh interface here.)
291	print "clearing experiment subdirs on $tb\n" if $verbose;
292	&ssh_cmd($user, $host, "/bin/sh -c \\'/bin/rm -rf $proj_dir/*\\'") \|\|
293	return 0;
294	print "clearing experiment tarfiles subdirs on $tb\n" if $verbose;
295	&ssh_cmd($user, $host, "/bin/rm -rf $tarfiles_dir/") \|\|
296	return 0;
297	print "creating tarfiles subdir $tarfiles_dir on $tb\n" if $verbose;
298	&ssh_cmd($user, $host, "mkdir -p $tarfiles_dir", "create tarfiles") \|\|
299	return 0;
300	print "clearing experiment rpms subdirs on $tb\n" if $verbose;
301	&ssh_cmd($user, $host, "/bin/rm -rf $rpms_dir/") \|\|
302	return 0;
303	print "creating rpms subdir $rpms_dir on $tb\n" if $verbose;
304	&ssh_cmd($user, $host, "mkdir -p $rpms_dir", "create rpms") \|\|
305	return 0;
306	# Remote experiment is active. Modify it.
307	if ($state eq "active") {
308	print "Transferring federation support files to $tb\n" if $verbose;
309	# First copy new scripts and hostinfo into the remote /proj
310	&scp_file("$tmpdir/hostnames", $user, $host, $to_hostname) \|\|
311	return 0;
312	&ship_scripts($host, $user, $proj_dir) \|\| return 0;
313	&ship_configs($host, $user, "$tmpdir/$tb", $proj_dir) \|\| return 0;
314
315	if ( -d "$tmpdir/tarfiles") {
316	&ship_configs($host, $user, "$tmpdir/tarfiles", $tarfiles_dir) \|\|
317	return 0;
318	}
319
320	if ( -d "$tmpdir/rpms") {
321	&ship_configs($host, $user, "$tmpdir/rpms", $rpms_dir) \|\|
322	return 0;
323	}
324
325	print "Modifying $eid in place on $tb\n" if $verbose;
326	&ssh_cmd($user, $host, "/usr/testbed/bin/modexp -r -s -w $pid " .
327	"$eid $tclfile", "modexp", $timeout) \|\| return 0;
328	return 1;
329	}
330
331	# Remote experiment is swapped out, modify it and swap it in.
332	if ($state eq "swapped") {
333	print "Transferring federation support files to $tb\n" if $verbose;
334	# First copy new scripts and hostinfo into the remote /proj (because
335	# the experiment exists, the directory tree should be there.
336	&scp_file("$tmpdir/hostnames", $user, $host, $to_hostname) \|\|
337	return 0;
338	&ship_scripts($host, $user, $proj_dir) \|\| return 0;
339	&ship_configs($host, $user, "$tmpdir/$tb", $proj_dir) \|\| return 0;
340	if ( -d "$tmpdir/tarfiles") {
341	&ship_configs($host, $user, "$tmpdir/tarfiles", $tarfiles_dir) \|\|
342	return 0;
343	}
344
345	if ( -d "$tmpdir/rpms") {
346	&ship_configs($host, $user, "$tmpdir/rpms", $rpms_dir) \|\|
347	return 0;
348	}
349
350	print "Modifying $eid on $tb\n" if $verbose;
351	&ssh_cmd($user, $host, "/usr/testbed/bin/modexp -w $pid $eid $tclfile",
352	"modexp") \|\| return 0;
353	print "Swapping $eid in on $tb\n" if $verbose;
354	# Now start up
355	&ssh_cmd($user, $host, "/usr/testbed/bin/swapexp -w $pid $eid in",
356	"swapexp", $timeout) \|\| return 0;
357	return 1;
358	}
359
360	# No remote experiment. Create one. We do this in 2 steps so we can put
361	# the configuration files and scripts into the new experiment directories.
362	if ($state eq "none") {
363
364	if ( -d "$tmpdir/tarfiles") {
365	# Tarfiles have to exist for the creation to work
366	print "copying tarfiles to $tb\n";
367	&ship_configs($host, $user, "$tmpdir/tarfiles", $tarfiles_dir) \|\|
368	return 0;
369	}
370
371	if ( -d "$tmpdir/rpms") {
372	&ship_configs($host, $user, "$tmpdir/rpms", $rpms_dir) \|\|
373	return 0;
374	}
375
376	print "Creating $eid on $tb\n" if $verbose;
377	&ssh_cmd($user, $host, "/usr/testbed/bin/startexp -i -f -w -p " .
378	"$pid -e $eid $tclfile", "startexp") \|\| return 0;
379	# After startexp succeeds, the per-experiment directories exist on the
380	# remote testbed.
381	print "Transferring federation support files to $tb\n" if $verbose;
382	# First copy new scripts and hostinfo into the remote /proj
383	&scp_file("$tmpdir/hostnames", $user, $host, $to_hostname) \|\|
384	return 0;
385	&ship_scripts($host, $user, $proj_dir) \|\| return 0;
386	&ship_configs($host, $user, "$tmpdir/$tb", $proj_dir) \|\| return 0;
387	# Now start up
388	print "Swapping $eid in on $tb\n" if $verbose;
389	&ssh_cmd($user, $host, "/usr/testbed/bin/swapexp -w $pid $eid in",
390	"swapexp", $timeout) \|\| return 0;
391	return 1;
392	}
393
394	# Every branch for a known state returns. If execution gets here, the
395	# state is unknown.
396	warn "unknown state: $state\n";
397	return 0;
398	}
399
400	# Swap out a sub-experiment - probably because another has failed. Arguments
401	# are testbed and experiment. Most of the control flow is similar to
402	# start_segment, though much simpler.
403	sub stop_segment {
404	my($tb, $eid, $tbparams) = @_; # testbed, experiment ID and
405	# per-testbed parameters
406	my $user = $tbparams->{$tb}->{'user'}; # testbed user
407	my $host = # Ops node
408	$tbparams->{$tb}->{'host'} . $tbparams->{$tb}->{'domain'};
409	my $pid = $tbparams->{$tb}->{'project'};# testbed project
410
411	print "Stopping $eid on $tb\n" if $verbose;
412	&ssh_cmd($user, $host, "/usr/testbed/bin/swapexp -w $pid $eid out",
413	"swapexp (out)") \|\| return 0;
414	return 1;
415	}
416
417
418	$pid = $gid = "dummy"; # Default project and group to pass to
419	# $tcl_splitter above. These are total
420	# dummy arguments; the splitter doesn't
421	# use them at all, but we supply them to
422	# keep our changes to the parser minimal.
423	# Argument processing.
424	getopts('Ft:c:p:f:ndvN', \%opts);
425	$splitter_config = $opts{'c'} \|\| "./splitter.conf";
426	$debug = $opts{'d'};
427	$verbose = $opts{'v'} \|\| $opts{'d'};
428
429	&parse_config("$splitter_config", \%opts) \|\|
430	die "Cannot read config file $splitter_config: $!\n";
431
432	warn "-N does nothing now. Only one testbeds format supported.\n"
433	if $opts{'N'};
434	$fail_soft = $opts{'F'} \|\| $opts{'failsoft'};
435	$startem = $opts{'n'} ? 0 : 1; # If true, start the sub-experiments
436	$timeout = $opts{'t'} \|\| $opts{'timeout'};
437	$eid = $opts{'experiment'}; # Experiment ID
438	$tcl = $opts{'f'} \|\| shift; # The experiment description
439	$master = $opts{'master'}; # Master testbed
440	$tmpdir = $opts{'tmpdir'} \|\| $opts{'tempdir'}\|\| "/tmp"; # tmp files
441	$tb_config = $opts{'testbeds'} \|\| "./testbeds"; # testbed configurations
442	$local_script_dir = $opts{'scriptdir'}; # Local scripts
443
444	$max_children = $opts{'p'} \|\| $opts{'maxchildren'}
445	if $opts{'p'} \|\| $opts{'maxchildren'};
446
447	$smb_share = $opts{'smbshare'} \|\| # Share to mount from the master
448	die "Must give an SMB share\n";
449	$project_user = $opts{'smbuser'} \|\| # User to mount project dirs as
450	die "Must give an SMB user\n";
451
452	# For now specify these. We may want to generate them later.
453	$gw_pubkey = $opts{'gatewaypubkey'};
454	($gw_pubkey_base = $gw_pubkey) =~ s#.*/##;
455	$gw_secretkey = $opts{'gatewaysecretkey'};
456	($gw_secretkey_base = $gw_secretkey) =~ s#.*/##;
457
458	# tcl program to split experiments (changed during devel)
459	$tcl_splitter = $opts{'tclparse'} \|\| "/usr/testbed/lib/ns2ir/parse.tcl";
460	# tclsh to call directly (changed during devel)
461	$tclsh = $opts{'tclsh'} \|\| "/usr/local/bin/otclsh";
462
463	# Prefix to avoid collisions
464	$tmpdir .= "/split$$";
465
466	print "Temp files are in $tmpdir\n" if $verbose;
467	# Create a workspace
468	unless (-d "$tmpdir") {
469	mkdir("$tmpdir") \|\| die "Can't create $tmpdir: $!";
470	}
471
472	# Validate scripts directory
473	for my $s (@scripts) {
474	die "$local_script_dir/$s not in local script directory. Try -d\n"
475	unless -r "$local_script_dir/$s";
476	}
477
478	die "Must supply file, master and experiment" unless $master && $tcl && $eid;
479
480	&parse_testbeds($tb_config, $tbparams) \|\|
481	die "Cannot testbed congfigurations from $tb_config: $!\n";
482
483	# Open a pipe to the splitter program and start it parsing the experiments
484	my $pipe = new IO::Pipe;
485	# NB no more -p call on parse call.
486	$pipe->reader("$tclsh $tcl_splitter -s -m $master $pid $gid $eid $tcl") \|\|
487	die "Cannot execute $tclsh $tcl_splitter -s -m $master $pid $gid $eid $tcl:$!\n";
488
489	# Parsing variables
490	my $ctb; # Current testbed
491	my %allocated; # If allocated{$tb} > 0, $tb is in use
492	my $destfile; # File that the sub-experiment tcl file is
493	# being written to, or "" if none. Also used
494	# for hostnames file.
495	my $desthandle; # File handle for distfile
496	my $gateways; # when gateway lists are being processed this
497	# is the testbed whose gateways are being
498	# gathered.
499	my $control_gateway; # Control net gateway for the current testbed
500	my %active_end; # If active_end{"a-b"} > 0 then a is the active
501	# end of the a <-> b connector pair.
502
503	# Parse the splitter output. This loop creates the sub experiments, gateway
504	# configurations and hostnames file
505	while (<$pipe>) {
506	# Start of a sub-experiment
507	/^#\s+Begin\s+Testbed\s+$(\w+)$/ && do {
508	$ctb = $1;
509
510	# If we know the testbed, start collecting its sub experiment tcl
511	# description. If not, warn the caller and ignore the configuration of
512	# this testbed.
513	if ($tbparams->{$ctb}) {
514	$allocated{$ctb}++; # Keep track of the testbeds allocated
515
516	unless (-d "$tmpdir/$ctb") {
517	mkdir("$tmpdir/$ctb") \|\| die "Can't create $tmpdir/$ctb: $!";
518	}
519	$destfile = "$tmpdir/$ctb/$eid.$ctb.tcl";
520
521	$desthandle = new IO::File(">$destfile") \|\|
522	die "Cannot open $destfile:$!\n";
523	}
524	else {
525	warn "No such testbed $ctb\n";
526	$destfile = "";
527	}
528	next;
529	};
530
531	# End of that experiment
532	/^#\s+End\s+Testbed\s+$(\w+)$/ && do {
533	# Simple syntax check and close out this experiment's tcl description
534	die "Mismatched testbed markers ($1, $ctb)\n" unless ($1 eq $ctb);
535	$desthandle->close();
536	$destfile = $ctb = "";
537	next;
538	};
539
540	# Beginning of a gateway set
541	/^#\s+Begin\s+gateways\s+$(\w+)$/ && do {
542	$gateways = $1;
543	# If we've heard of this tb, create the config lines for it one at a
544	# time.
545	if ($allocated{$gateways}) {
546	# Just in case. This directory should already have been created
547	# above.
548	unless (-d "$tmpdir/$gateways") {
549	mkdir("$tmpdir/$gateways") \|\|
550	die "Can't create $tmpdir/$gateways: $!";
551	}
552	}
553	else {
554	warn "Gateways given (and ignored) for testbed not in use: " .
555	"$gateways\n";
556	$gateways = 0;
557	}
558	next;
559	};
560	# End of the gateways section. Output the client config for this testbed
561	/^#\s+End\s+gateways\s+$(\w+)$/ && do {
562	die "Mismatched gateway markers ($1, $gateways)\n"
563	unless !$gateways \|\| $gateways == $1;
564
565	if ($control_gateway ) {
566	# Client config
567	my $cc = new IO::File(">$tmpdir/$gateways/client.conf");
568	my $master_project = $tbparams->{$master}->{'project'};
569	die "Can't open $tmpdir/$gateways/client.conf: $!\n" unless $cc;
570	print $cc "ControlGateway: $control_gateway\n";
571	print $cc "SMBShare: $smb_share\n";
572	print $cc "ProjectUser: $project_user\n";
573	print $cc "ProjectName: $master_project\n";
574	$cc->close();
575	}
576	else { warn "No control gateway for $gateways?\n"; }
577
578	$gateways = 0;
579	next;
580	};
581	# Beginning of the hostnames list. Collection is always in the hostnames
582	# file.
583	/^#\s+Begin\s+hostnames/ && do {
584	$destfile = "$tmpdir/hostnames";
585	$desthandle = new IO::File(">$destfile") \|\|
586	die "Can't open $destfile:$!\n";
587	next;
588	};
589	# end of the hostnames list.
590	/^#\s+End\s+hostnames/ && do {
591	$desthandle->close();
592	$destfile = "";
593	next;
594	};
595
596	# Generate gateway configuration info, one file per line
597	$gateways && do {
598	chomp;
599	my($dtb, $myname, $desthost, $type) = split(" ", $_);
600
601	# Many of these are to simplify print statements
602	my $sdomain = # domain for the source
603	$tbparams->{$gateways}->{'domain'};
604	my $ddomain = # domain for the destination
605	$tbparams->{$dtb}->{'domain'};
606	my $sproject = # Project of the destination
607	$tbparams->{$gateways}->{'project'};
608	my $fs = # Master fs node (FQDN)
609	$tbparams->{$master}->{'fs'} . $tbparams->{$master}->{'domain'};
610	my $boss = # Master boss node (FQDN)
611	$tbparams->{$master}->{'boss'} . $tbparams->{$master}->{'domain'};
612	my $remote_script_dir = # Remote fed script location
613	"/proj/" . $tbparams->{$dtb}->{'project'} . "/exp/$eid/tmp";
614	my $active; # Is this the active side of
615	# the connector?
616
617	$sdomain = ".$eid." . $tbparams->{$gateways}->{'project'} . "$sdomain";
618	$ddomain = ".$eid." . $tbparams->{$dtb}->{'project'} . "$ddomain";
619
620	my $conf_file = "$myname$sdomain.gw.conf";
621	# translate to lower case so the `hostname` hack for specifying
622	# configuration files works.
623	$conf_file =~ tr/A-Z/a-z/;
624
625	# If either end of this link is in the master side of the testbed, that
626	# side is the active end. Otherwise the first testbed encountered in
627	# the file will be the active end. The $active_end variable keeps
628	# track of those decisions
629	if ( $dtb eq $master ) { $active = "false"; }
630	elsif ($gateways eq $master ) { $active = "true"; }
631	elsif ( $active_end{"$dtb-$gateways"} ) { $active="false"; }
632	else { $active_end{"$gateways-$dtb"}++; $active = "true"; }
633
634	# This is used to create the client configuration.
635	$control_gateway = "$myname$sdomain"
636	if $type =~ /(control\|both)/;
637
638	# Write out the file
639	my $gwconfig = new IO::File(">$tmpdir/$gateways/$conf_file")\|\|
640	die "can't open $tmpdir/$gateways/$conf_file: $!\n";
641
642	print $gwconfig "Active: $active\n";
643	print $gwconfig "TunnelCfg: " . $tbparams->{$gateways}->{'tun'} . "\n";
644	print $gwconfig "BossName: $boss\n";
645	print $gwconfig "FsName: $fs\n";
646	print $gwconfig "Type: $type\n";
647	print $gwconfig "RemoteScriptDir: $remote_script_dir\n";
648	print $gwconfig "Peer: $desthost$ddomain\n";
649	print $gwconfig "Pubkeys: " .
650	"/proj/$sproject/exp/$eid/tmp/$gw_pubkey_base\n";
651	print $gwconfig "Privkeys: " .
652	"/proj/$sproject/exp/$eid/tmp/$gw_secretkey_base\n";
653	$gwconfig->close();
654
655	# This testbed has a gateway (most will) so make a copy of the keys it
656	# needs in this testbed's subdirectory. start_segment will transfer
657	# them.
658	unless (-r "$tmpdir/$gateways/$gw_pubkey_base" ) {
659	copy($gw_pubkey, "$tmpdir/$gateways/$gw_pubkey_base") \|\|
660	die "Can't copy pubkeys ($gw_pubkey to " .
661	"$tmpdir/$gateways/$gw_pubkey_base): $!\n";
662	}
663	if ($active eq "true" ) {
664	unless (-r "$tmpdir/$gateways/$gw_secretkey_base" ) {
665	copy($gw_secretkey, "$tmpdir/$gateways/$gw_secretkey_base") \|\|
666	die "Can't copy secret keys ($gw_secretkey to " .
667	"$tmpdir/$gateways/$gw_secretkey_base): $!\n";
668	}
669	}
670
671	#done processing gateway entry, ready for next line
672	next;
673	};
674	(/^#\s+Begin\s+tarfiles/../^#\s+End\s+tarfiles/) && do {
675	next if /^#/;
676	chomp;
677	push(@tarfiles, $_);
678	next;
679	};
680	(/^#\s+Begin\s+rpms/../^#\s+End\s+rpms/) && do {
681	next if /^#/;
682	chomp;
683	push(@rpms, $_);
684	next;
685	};
686
687	next unless $destfile; # Unidentified testbed, ignore config
688	# local copies that can be used in the substitutions below
689	my $gwtype = $tbparams->{$ctb}->{'gwtype'};
690	my $gwimage = $tbparams->{$ctb}->{'gwimage'};
691	my $mgwstart = $tbparams->{$ctb}->{'mgwstart'};
692	my $mexpstart = $tbparams->{$ctb}->{'mexpstart'};
693	my $gwstart = $tbparams->{$ctb}->{'gwstart'};
694	my $expstart = $tbparams->{$ctb}->{'expstart'};
695	my $project = $tbparams->{$ctb}->{'project'};
696
697	# Substitute variables
698	s/GWTYPE/$gwtype/g;
699	s/GWIMAGE/$gwimage/g;
700	if ($ctb eq $master ) {
701	s/GWSTART/$mgwstart/g;
702	s/EXPSTART/$mexpstart/g;
703	}
704	else {
705	s/GWSTART/$gwstart/g;
706	s/EXPSTART/$expstart/g;
707	}
708	# XXX: oh is this bad
709	s#GWCONF#FEDDIR\`hostname\`.gw.conf#g;
710	s#PROJDIR#/proj/$project/#g;
711	s#EID#$eid#g;
712	s#FEDDIR#/proj/$project/exp/$eid/tmp/#g;
713	print $desthandle $_;
714	}
715	$pipe->close();
716	die "No nodes in master testbed ($master)\n" unless $allocated{$master};
717
718	# Copy tarfiles and rpms needed at remote sites to the staging directories.
719	# Start_segment will distribute them
720	for my $t (@tarfiles) {
721	die "tarfile '$t' unreadable: $!\n" unless -r $t;
722	unless (-d "$tmpdir/tarfiles") {
723	mkdir("$tmpdir/tarfiles") \|\|
724	die "Can't create $tmpdir/tarfiles:$!\n";
725	}
726	copy($t, "$tmpdir/tarfiles") \|\|
727	die "Can't copy $t to $tmpdir/tarfiles:$!\n";
728	}
729
730	for my $r (@rpms) {
731	die "rpm '$r' unreadable: $!\n" unless -r $r;
732	unless (-d "$tmpdir/rpms") {
733	mkdir("$tmpdir/rpms") \|\|
734	die "Can't create $tmpdir/rpms:$!\n";
735	}
736	copy($r, "$tmpdir/rpms") \|\|
737	die "Can't copy $r to $tmpdir/rpms:$!\n";
738	}
739
740	exit(0) unless $startem;
741
742	my %started; # If $started{$tb} then $tb successfully started
743	my %child; # If $child{$pid} then a process with that pid is
744	# working on a starting a segment
745	my $nworking = 0; # Number of children working on swapin
746	my $pid; # Scratch variable for pids
747
748	# Start up the slave sub-experiments first
749	TESTBED:
750	for my $tb (keys %allocated) {
751	if ( $tb ne $master ) {
752	while ( $nworking == $max_children ) {
753	print "Waiting for a child process to complete\n" if $verbose;
754	if (($pid = wait()) != -1 ) {
755	# The $? >> 8 is the exit code of the subprocess, which is
756	# non-zero if the &start_segment routine failed.
757	my $exit_code = ($? >> 8);
758
759	print "Child $pid completed exit code ($exit_code)\n"
760	if $verbose;
761	$nworking--;
762	$started{$child{$pid}}++ unless $exit_code;
763	if ($child{$pid} ) { delete $child{$pid}; }
764	else { warn "Reaped a pid we did not start?? ($pid)\n"; }
765	last TESTBED if $exit_code;
766	}
767	else { warn "wait returned without reaping: $!\n"; }
768	}
769	if ( $pid = fork() ) {
770	# Parent process
771	$nworking ++;
772	$child{$pid} = $tb;
773	print "Started process $pid to start testbed $tb\n"
774	if $verbose;
775	}
776	else {
777	# Child. Note that we reverse the sense of the return code when it
778	# becomes an exit value. Zero exit values indicate success.
779	exit(!&start_segment($tb, $eid, $tbparams, $timeout));
780	}
781	}
782	}
783
784	# Now wait for any still running processes.
785	while ( $nworking ) {
786	print "Waiting for a child process to complete ($nworking running)\n"
787	if $verbose;
788	if (($pid = wait()) != -1 ) {
789	# The $? >> 8 is the exit code of the subprocess, which is
790	# non-zero if the &start_segment routine failed.
791	my $exit_code = ($? >> 8);
792
793	print "Child $pid completed exit code ($exit_code)\n"
794	if $verbose;
795	$nworking--;
796	$started{$child{$pid}}++ unless $exit_code;
797	if ($child{$pid} ) { delete $child{$pid}; }
798	else { warn "Reaped a pid we did not start?? ($pid)\n"; }
799	}
800	else { warn "wait returned without reaping: $!\n"; }
801	}
802
803	# Now the master
804	if (&start_segment($master, $eid, $tbparams, $timeout)) {
805	$started{$master}++;
806	}
807
808	# If any testbed failed, swap the rest out.
809	if ( !$fail_soft && scalar(keys %started) != scalar(keys %allocated)) {
810	for my $tb (keys %started) { &stop_segment($tb, $eid, $tbparams); }
811	print "Error starting experiment\n";
812	exit(1);
813	}
814	print "Experiment started\n";
815	print "Deleting $tmpdir (-d to leave them in place)\n" if $verbose && !$debug;
816	system("rm -rf $tmpdir") unless $debug;
817	exit(0); # set the exit value
818
819	=pod
820
821	=head1 NAME
822
823	B<splitter.pl>
824
825	=head1 SYNOPSIS
826
827	B<splitter.pl> [B<-ndF>] [B<-t> I<secs>] [B<-c> F<config_file>]
828	[B<-f> F<experiment_tcl>] [B<-p> I<max_procs>] [F<experiment_tcl>]
829
830	=head1 DESCRIPTION
831
832	B<splitter.pl> invokes the DETER experiment parser to split an annotated
833	experiment into multiple sub-experments and instantiates the sub-experiments on
834	their intended testbeds. Annotation is accomplished using the
835	tb-set-node-testbed command, added to the parser.
836
837	Much of the script's behavior depends on the configuration file, specified with
838	the B<-c> flag and defaulting to F<./splitter.conf>.
839
840	The testbed labels supplied in the B<tb-set-node-testbed> command are
841	meaningful based on their presence in the testbeds file. that file can be
842	specified in the configuration file using the B<Testbeds> directive, and
843	defaults to F<./testbeds>. The syntax is described below.
844
845	Most of the intermediate files are staged in a sub-directory of a temporary
846	files directory and deleted at the end of the script. Specifying the B<-d>
847	flag on the command line avoids the deletion for debbugging. By default the
848	temporary files directory is directory is F</tmp> and can be reset in the
849	configuration file using the B<Tmpdir> directive. Intermediate files are
850	stored under a subdirectory formed by adding the process ID of the splitter
851	process. For example, if the temporary files directory is F</tmp> and the
852	B<splitter.pl> process ID is 2323, the temporary files will be stored in
853	F</tmp/split2323/>.
854
855	The expreriment is split out into one experiment description per testbed in the
856	temporary directory named as F<experiment.testbed.tcl> where the experiment is
857	the experiment ID given in the configuration file, and the testbed is the
858	tb-set-node-testbed parameter for the nodes in the file.
859
860	If the B<-n> option is absent the sub-experiments are then instantiated on
861	their testbeds. (Here B<-n> is analogous to its use in L<make(1)>).
862	Per-testbed parameters are set in the testbeds file. Sub-experiments on
863	slave testbeds are instantiated in a random order, but the master testbed is
864	currently instantiated last.
865
866	Slave testbeds can be swapped in in parallel by specifying the B<-p> parameter
867	and the maximum number of simultaneous processes to start.
868
869	Scripts to start federation (the federation kit) are copied into the local
870	experiment's tmp file - e.g., F</proj/DETER/exp/simple-split/tmp>. These are
871	taken from the directory given by the B<ScriptDir> directive in the
872	configuration file.
873
874	If B<-t> is given the parameter is treated as a parameter to B<Timeout> in
875	F<splitter.conf>.
876
877	If any sub-experiment fails to instantiate, the other sub-exeriments are
878	swapped out. B<-F> avoids this swap out, which can also be specified as
879	B<SoftFail: true> in F<splitter.conf>
880
881	=head2 Configuration File
882
883	The configuration file is a simple set of colon-separated parameters and
884	values. A configuration file must be present, either specified in the B<-c>
885	flag or the default F<./splitter.conf>. All the parameter names are case
886	insensitive, but should not include any whitespace. Parameter values may
887	include whitespace, but no newlines.
888
889	Possible parameters are:
890
891	=over 5
892
893	=item Experiment
894
895	The name of the experiment on the various testbeds
896
897	=item Master
898
899	The master testbed label from the testbeds file, described below.
900
901	=item Testbeds
902
903	The testbeds file described below, giving per-testbed parameters. If this
904	directive is absent the testbeds file defaults to F<./testbeds>
905
906	=item ScriptDir
907
908	Location of the default federation scripts, i.e. the federation kit.
909
910	=item GatewayPubkey
911
912	=item GatewaySecretKey
913
914	The names of the files containing secret and public keys to use in setting up
915	tunnels between testbeds. These will eventually be automatically generated.
916
917	=item TmpDir
918
919	=item TempDir
920
921	The directory where temporary files are created. These are synonyms, but
922	should both be specified, B<TmpDir> has priority. If neither is specified,
923	F</tmp> is used.
924
925	=item SMBShare
926
927	The SMB share on the master testbed that will be exported to remote clients.
928
929	=item SMBUser
930
931	The experiment user to mount project directories as. This user needs to be a
932	member of the exported experiment - that is one of the users in the project
933	containing this experiment on the master testbed.
934
935	=item Tclparse
936
937	The pathname to the experiment parsing program. Only developers should set
938	this.
939
940	=item Tclsh
941
942	The pathname to the local oTcl shell. Only developers should set
943	this.
944
945	=item Timeout
946
947	Value in seconds after which a swap-in operatioin will be considered a success.
948	Often long swap-ins will hang when there are partial failures. This works
949	around this issue. (This behavior can be requested on the command line by
950	specifying B<-t> I<secs>.)
951
952	=item FailSoft
953
954	If not set, failure of any sub experiment swaps the rest out. Setting this to
955	any value avoids this swap out. (This behavior can be requested on the command
956	line by specifying B<-F>.)
957
958	=back
959
960	=head2 Testbeds file
961
962	The configuration file (F<./testbeds> unless overridden by B<-c>) is a file of
963	scoped attribute-value pairs where each attribute is specified on a separate
964	line of the configuration file. Each testbed's parameters are preceeded by the
965	testbed label in brackets ([]) on a line by itself. After that the parameters
966	are specified as parameter: value. This is essentially the same format as the
967	configuration file. Parameters are:
968
969	=over 4
970
971	=item User
972
973	The user under which to make requests to this testbed. The user running
974	B<splitter.pl> must be able to authenicate as this user under L<ssh(1)> to this
975	testbed.
976
977	=item OpsNode
978
979	The host name of the testbed's ops node. The user calling B<splitter.pl> must
980	be able to execute commands on this host via L<ssh(1)>.
981
982	=item Domain
983
984	The domain of nodes in this testbed (including the ops host). This parameter
985	should always start with a period.
986
987	=item Project
988
989	The project under which to instantiate sub-experiments on this testbed.
990
991	=item ConnectorType
992
993	The node type for inter-testbed connector nodes on this testbed.
994
995	=item SlaveNodeStartCmd
996
997	The start command to run on experimental nodes when this testbed is used as a
998	slave. In all the start commands the following string substitutions are made:
999
1000	=over 10
1001
1002	=item FEDDIR
1003
1004	The local experiment's federation scripts directory. Each local experiment
1005	will have this replaced by the scripts directory on its local boss.
1006
1007	=item GWCONF
1008
1009	The full pathname of the gateway configuration file. As with FEDDIR, this is
1010	on the local boss.
1011
1012	=item PROJDIR
1013
1014	The project directory on the local boss.
1015
1016	=item EID
1017
1018	The local experiment name.
1019
1020	=back
1021
1022	All startcmds specified in F<testbeds> undergo these expansions.
1023
1024	=item SlaveConnectorStartCmd
1025
1026	The start command to run on gateway nodes when this testbed is used as a slave.
1027	The same string substitutions are made in this command as in SlaveNodeStartCmd.
1028
1029	=item MasterNodeStartCmd
1030
1031	The start command to run on experimental nodes when this testbed is used as a
1032	master. The same string substitutions are made in this command as in
1033	SlaveNodeStartCmd.
1034
1035	=item MasterConnectorStartCmd
1036
1037	The start command to run on gateway nodes when this testbed is used as a
1038	master. The same string substitutions are made in this command as in
1039	SlaveNodeStartCmd.
1040
1041	=item ConnectorImage
1042
1043	The disk image to be loaded on a gateway node on this testbed.
1044
1045	=item FileServer
1046
1047	The node in the master testbed from which filesystems are mounted.
1048
1049	=item Boss
1050
1051	The node in the master testbed that controls the testbed.
1052
1053	=item TunnelCfg
1054
1055	True if the connector needs to do DETER federation. This parameter will
1056	probably be removed.
1057
1058
1059	=back
1060
1061	=head1 ENVIRONMENT
1062
1063	B<splitter.pl> does not directly make use of environment variables, but calls
1064	out to L<ssh(1)> and (indirectly) to L<sh(1)>, which may be influenced by the
1065	environment.
1066
1067	=head1 BUGS
1068
1069	A deprecated B<-N> flag was used to select testbeds file format. Only one
1070	format is supported now, and B<-N> generates a warning, but otherwise does not
1071	affect B<splitter.pl>.
1072
1073	=head1 SEE ALSO
1074
1075	L<sh(1)>, L<ssh(1)>
1076
1077	=cut

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: fedkit/splitter.pl @ e863014

Download in other formats: