Context Navigation

← Previous Changeset
Next Changeset →

Changeset bd3e314

Timestamp:

Jul 24, 2009 1:22:34 PM (16 years ago)

Author:

Ted Faber <faber@…>

Branches:

axis_example, compt_changes, info-ops, master, version-1.30, version-2.00, version-3.01, version-3.02

Children:

Parents:

Message:

Asynchronous creation and logging. These are the fedd changes. Fedd_client next.

Location:

Files:

: 2 edited

federation/experiment_control.py (modified) (18 diffs)
wsdl/fedd_types.xsd (modified) (5 diffs)

Legend:

: Unmodified
: Added
: Removed

fedd/federation/experiment_control.py

-                      r728001e
+                      rbd3e314
         self.thread_with_rv = experiment_control_local.pooled_thread
         self.thread_pool = experiment_control_local.thread_pool
+        self.list_log = experiment_control_local.list_log
         self.cert_file = config.get("experiment_control", "cert_file")
 …
                 return True
+    def allocate_resources(self, allocated, master, eid, expid, expcert,
+            tbparams, tmpdir, alloc_log=None):
+        started = { }           # Testbeds where a sub-experiment started
+                                # successfully
+        # XXX
+        fail_soft = False
+        log = alloc_log or self.log
+        thread_pool = self.thread_pool(self.nthreads)
+        threads = [ ]
+        for tb in [ k for k in allocated.keys() if k != master]:
+            # Create and start a thread to start the segment, and save it to
+            # get the return value later
+            thread_pool.wait_for_slot()
+            t  = self.pooled_thread(\
+                    target=self.start_segment(log=log,
+                        keyfile=self.ssh_privkey_file, debug=self.debug),
+                    args=(tb, eid, tbparams, tmpdir, 0), name=tb,
+                    pdata=thread_pool, trace_file=self.trace_file)
+            threads.append(t)
+            t.start()
+        # Wait until all finish
+        thread_pool.wait_for_all_done()
+        # If none failed, start the master
+        failed = [ t.getName() for t in threads if not t.rv ]
+        if len(failed) == 0:
+            starter = self.start_segment(log=log,
+                    keyfile=self.ssh_privkey_file, debug=self.debug)
+            if not starter(master, eid, tbparams, tmpdir):
+                failed.append(master)
+        succeeded = [tb for tb in allocated.keys() if tb not in failed]
+        # If one failed clean up, unless fail_soft is set
+        if failed:
+            if not fail_soft:
+                thread_pool.clear()
+                for tb in succeeded:
+                    # Create and start a thread to stop the segment
+                    thread_pool.wait_for_slot()
+                    t  = self.pooled_thread(\
+                            target=self.stop_segment(log=log,
+                                keyfile=self.ssh_privkey_file,
+                                debug=self.debug),
+                            args=(tb, eid, tbparams), name=tb,
+                            pdata=thread_pool, trace_file=self.trace_file)
+                    t.start()
+                # Wait until all finish
+                thread_pool.wait_for_all_done()
+                # release the allocations
+                for tb in tbparams.keys():
+                    self.release_access(tb, tbparams[tb]['allocID'])
+                # Remove the placeholder
+                self.state_lock.acquire()
+                self.state[eid]['experimentStatus'] = 'failed'
+                if self.state_filename: self.write_state()
+                self.state_lock.release()
+                #raise service_error(service_error.federant,
+                #    "Swap in failed on %s" % ",".join(failed))
+                log.error("Swap in failed on %s" % ",".join(failed))
+                return
+        else:
+            log.info("[start_segment]: Experiment %s active" % eid)
+        log.debug("[start_experiment]: removing %s" % tmpdir)
+        # Walk up tmpdir, deleting as we go
+        for path, dirs, files in os.walk(tmpdir, topdown=False):
+            for f in files:
+                os.remove(os.path.join(path, f))
+            for d in dirs:
+                os.rmdir(os.path.join(path, d))
+        os.rmdir(tmpdir)
+        # Insert the experiment into our state and update the disk copy
+        self.state_lock.acquire()
+        self.state[expid]['experimentStatus'] = 'active'
+        self.state[eid] = self.state[expid]
+        if self.state_filename: self.write_state()
+        self.state_lock.release()
+        return
     def create_experiment(self, req, fid):
         """
 …
         pid = "dummy"
         gid = "dummy"
-        # XXX
-        fail_soft = False
         try:
             os.mkdir(tmpdir+"/keys")
 …
             raise service_error(service_error.req, "No experiment description")
+        # Generate an ID for the experiment (slice) and a certificate that the
+        # allocator can use to prove they own it.  We'll ship it back through
+        # the encrypted connection.
+        (expid, expcert) = generate_fedid("test", dir=tmpdir, log=self.log)
         if req.has_key('experimentID') and \
                 req['experimentID'].has_key('localname'):
 …
             while (self.state.has_key(eid)):
                 eid += random.choice(string.ascii_letters)
+            # To avoid another thread picking this localname
+            self.state[eid] = "placeholder"
+            # Initial state
+            self.state[eid] = {
+                    'experimentID' : \
+                            [ { 'localname' : eid }, {'fedid': expid } ],
+                    'experimentStatus': 'starting',
+                    'experimentAccess': { 'X509' : expcert },
+                    'owner': fid,
+                    'log' : [],
+                }
+            self.state[expid] = self.state[eid]
+            if self.state_filename: self.write_state()
             self.state_lock.release()
         else:
 …
                 for i in range(0,5):
                     eid += random.choice(string.ascii_letters)
+            # To avoid another thread picking this localname
+            self.state[eid] = "placeholder"
+            # Initial state
+            self.state[eid] = {
+                    'experimentID' : \
+                            [ { 'localname' : eid }, {'fedid': expid } ],
+                    'experimentStatus': 'starting',
+                    'experimentAccess': { 'X509' : expcert },
+                    'owner': fid,
+                    'log' : [],
+                }
+            self.state[expid] = self.state[eid]
+            if self.state_filename: self.write_state()
             self.state_lock.release()
 …
             allocated = { }         # Testbeds we can access
-            started = { }           # Testbeds where a sub-experiment started
-                                    # successfully
             # Objects to parse the splitter output (defined above)
             parse_current_testbed = self.current_testbed(eid, tmpdir,
 …
                 raise service_error(service_error.internal,
                         "Failed to generate visualization")
             # save federant information
 …
+                    }
+            self.state_lock.acquire()
+            self.state[eid]['vtopo'] = vtopo
+            self.state[eid]['vis'] = vis
+            self.state[expid]['federant'] = \
+                    [ tbparams[tb]['federant'] for tb in tbparams.keys() \
+                        if tbparams[tb].has_key('federant') ]
+            if self.state_filename: self.write_state()
+            self.state_lock.release()
             # Copy tarfiles and rpms needed at remote sites into a staging area
 …
             # If something goes wrong in the parse (usually an access error)
             # clear the placeholder state.  From here on out the code delays
+            # exceptions.
+            # exceptions.  Failing at this point returns a fault to the remote
+            # caller.
             self.state_lock.acquire()
             del self.state[eid]
+            del self.state[expid]
+            if self.state_filename: self.write_state()
             self.state_lock.release()
             raise e
+        thread_pool = self.thread_pool(self.nthreads)
+        threads = [ ]
+        for tb in [ k for k in allocated.keys() if k != master]:
+            # Create and start a thread to start the segment, and save it to
+            # get the return value later
+            thread_pool.wait_for_slot()
+            t  = self.pooled_thread(\
+                    target=self.start_segment(log=self.log,
+                        keyfile=self.ssh_privkey_file, debug=self.debug),
+                    args=(tb, eid, tbparams, tmpdir, 0), name=tb,
+                    pdata=thread_pool, trace_file=self.trace_file)
+            threads.append(t)
+            t.start()
+        # Wait until all finish
+        thread_pool.wait_for_all_done()
+        # If none failed, start the master
+        failed = [ t.getName() for t in threads if not t.rv ]
+        if len(failed) == 0:
+            starter = self.start_segment(log=self.log,
+                    keyfile=self.ssh_privkey_file, debug=self.debug)
+            if not starter(master, eid, tbparams, tmpdir):
+                failed.append(master)
+        succeeded = [tb for tb in allocated.keys() if tb not in failed]
+        # If one failed clean up, unless fail_soft is set
+        if failed:
+            if not fail_soft:
+                thread_pool.clear()
+                for tb in succeeded:
+                    # Create and start a thread to stop the segment
+                    thread_pool.wait_for_slot()
+                    t  = self.pooled_thread(\
+                            target=self.stop_segment(log=self.log,
+                                keyfile=self.ssh_privkey_file,
+                                debug=self.debug),
+                            args=(tb, eid, tbparams), name=tb,
+                            pdata=thread_pool, trace_file=self.trace_file)
+                    t.start()
+                # Wait until all finish
+                thread_pool.wait_for_all_done()
+                # release the allocations
+                for tb in tbparams.keys():
+                    self.release_access(tb, tbparams[tb]['allocID'])
+                # Remove the placeholder
+                self.state_lock.acquire()
+                del self.state[eid]
+                self.state_lock.release()
+                raise service_error(service_error.federant,
+                    "Swap in failed on %s" % ",".join(failed))
+        else:
+            self.log.info("[start_segment]: Experiment %s started" % eid)
+        # Generate an ID for the experiment (slice) and a certificate that the
+        # allocator can use to prove they own it.  We'll ship it back through
+        # the encrypted connection.
+        (expid, expcert) = generate_fedid("test", dir=tmpdir, log=self.log)
+        self.log.debug("[start_experiment]: removing %s" % tmpdir)
+        # Walk up tmpdir, deleting as we go
+        for path, dirs, files in os.walk(tmpdir, topdown=False):
+            for f in files:
+                os.remove(os.path.join(path, f))
+            for d in dirs:
+                os.rmdir(os.path.join(path, d))
+        os.rmdir(tmpdir)
+        # The deepcopy prevents the allocation ID and other binaries from being
+        # translated into other formats
+        resp = { 'federant' : [ copy.deepcopy(tbparams[tb]['federant']) \
+                for tb in tbparams.keys() \
+                    if tbparams[tb].has_key('federant') ],\
+                    'vtopo': vtopo,\
+                    'vis' : vis,
+                    'experimentID' : [\
+                            { 'fedid': copy.copy(expid) }, \
+                            { 'localname': eid },\
+                        ],\
+                    'experimentAccess': { 'X509' : expcert },\
+                }
+        # remove the allocationID info from each federant
+        for f in resp['federant']:
+            if f.has_key('allocID'): del f['allocID']
+        # Insert the experiment into our state and update the disk copy
+        self.state_lock.acquire()
+        self.state[expid] = { 'federant' : [ tbparams[tb]['federant'] \
+                for tb in tbparams.keys() \
+                    if tbparams[tb].has_key('federant') ],\
+                    'vtopo': vtopo,\
+                    'vis' : vis,
+                    'owner': fid,
+                    'experimentID' : [\
+                            { 'fedid': expid }, { 'localname': eid },\
+                        ],\
+                }
+        self.state[eid] = self.state[expid]
+        if self.state_filename: self.write_state()
+        self.state_lock.release()
+        # Start the background swapper and return the starting state.  From
+        # here on out, the state will stick around a while.
+        # Let users touch the state
         self.auth.set_attribute(fid, expid)
         self.auth.set_attribute(expid, expid)
+        if not failed:
+            return resp
+        else:
+            raise service_error(service_error.partial, \
+                    "Partial swap in on %s" % ",".join(succeeded))
+        # Create a logger that logs to the experiment's state object as well as
+        # to the main log file.
+        alloc_log = logging.getLogger('fedd.experiment_control.%s' % eid)
+        h = logging.StreamHandler(self.list_log(self.state[eid]['log']))
+        # XXX: there should be a global one of these rather than repeating the
+        # code.
+        h.setFormatter(logging.Formatter("%(asctime)s %(name)s %(message)s",
+                    '%d %b %y %H:%M:%S'))
+        alloc_log.addHandler(h)
+        # Start a thread to do the resource allocation
+        t  = Thread(target=self.allocate_resources,
+                args=(allocated, master, eid, expid, expcert, tbparams,
+                    tmpdir, alloc_log),
+                name=eid)
+        t.start()
+        rv = {
+                'experimentID': [
+                    {'localname' : eid }, { 'fedid': copy.copy(expid) }
+                ],
+                'experimentStatus': 'started',
+                'experimentAccess': { 'X509' : expcert }
+            }
+        return rv
     def check_experiment_access(self, fid, key):
 …
         """
         rv = None
+        state = None
         req = req.get('VtopoRequestBody', None)
 …
         self.state_lock.acquire()
         if self.state.has_key(key):
+            rv = { 'experiment' : {keytype: key },\
+                    'vtopo': self.state[key]['vtopo'],\
+                }
+            if self.state[key].has_key('vtopo'):
+                rv = { 'experiment' : {keytype: key },\
+                        'vtopo': self.state[key]['vtopo'],\
+                    }
+            else:
+                state = self.state[key]['experimentStatus']
         self.state_lock.release()
         if rv: return rv
+        else: raise service_error(service_error.req, "No such experiment")
+        else:
+            if state:
+                raise service_error(service_error.partial,
+                        "Not ready: %s" % state)
+            else:
+                raise service_error(service_error.req, "No such experiment")
     def get_vis(self, req, fid):
 …
         """
         rv = None
+        state = None
         req = req.get('VisRequestBody', None)
 …
         self.state_lock.acquire()
         if self.state.has_key(key):
+            rv =  { 'experiment' : {keytype: key },\
+                    'vis': self.state[key]['vis'],\
+                    }
+            if self.state[key].has_key('vis'):
+                rv =  { 'experiment' : {keytype: key },\
+                        'vis': self.state[key]['vis'],\
+                        }
+            else:
+                state = self.state[key]['experimentStatus']
         self.state_lock.release()
         if rv: return rv
+        else: raise service_error(service_error.req, "No such experiment")
+        else:
+            if state:
+                raise service_error(service_error.partial,
+                        "Not ready: %s" % state)
+            else:
+                raise service_error(service_error.req, "No such experiment")
     def get_info(self, req, fid):
 …
             rv = copy.deepcopy(self.state[key])
         self.state_lock.release()
+        # Remove the owner info
+        del rv['owner']
+        # remove the allocationID info from each federant
+        for f in rv['federant']:
+            if f.has_key('allocID'): del f['allocID']
+        if rv: return rv
+        else: raise service_error(service_error.req, "No such experiment")
+        if rv:
+            # Remove the owner info (should always be there, but...)
+            if rv.has_key('owner'): del rv['owner']
+            # Convert the log into the allocationLog parameter and remove the
+            # log entry (with defensive programming)
+            if rv.has_key('log'):
+                rv['allocationLog'] = "".join(rv['log'])
+                del rv['log']
+            if rv['experimentStatus'] != 'active':
+                if rv.has_key('federant'): del rv['federant']
+            else:
+                # remove the allocationID info from each federant
+                for f in rv.get('federant', []):
+                    if f.has_key('allocID'): del f['allocID']
+            return rv
+        else:
+            raise service_error(service_error.req, "No such experiment")
 …
             # It releases the lock to do the deallocations and reacquires it to
             # remove the experiment state when the termination is complete.
+            # First make sure that the experiment creation is complete.
+            if fed_exp.has_key('experimentStatus'):
+                if fed_exp['experimentStatus'] == 'started':
+                    self.state_lock.release()
+                    raise service_error(service_error.partial,
+                            'Experiment still being created')
+            else:
+                # No status??? trouble
+                self.state_lock.release()
+                raise service_error(service_error.internal,
+                        "Experiment has no status!?")
             ids = []
             #  experimentID is a list of dicts that are self-describing
 …
             # Construct enough of the tbparams to make the stop_segment calls
             # work
             for fed in fed_exp['federant']:
+            for fed in fed_exp.get('federant', []):
                 try:
                     for e in fed['name']:
 …
                 # Create and start a thread to stop the segment
                 thread_pool.wait_for_slot()
+                t  = self.pooled_thread(target=self.stop_segment,
+                t  = self.pooled_thread(\
+                        target=self.stop_segment(log=self.log,
+                            keyfile=self.ssh_privkey_file, debug=self.debug),
                         args=(tb, tbparams[tb]['eid'], tbparams), name=tb,
                         pdata=thread_pool, trace_file=self.trace_file)

fedd/wsdl/fedd_types.xsd

-                      r728001e
+                      rbd3e314
       <xsd:enumeration value="max"/>
       <xsd:enumeration value="average"/>
+    </xsd:restriction>
+  </xsd:simpleType>
+  <xsd:simpleType name="statusType">
+    <xsd:annotation>
+      <xsd:documentation>
+        The current state of the experiment.
+      </xsd:documentation>
+    </xsd:annotation>
+    <xsd:restriction base="xsd:string">
+      <xsd:enumeration value="active"/>
+      <xsd:enumeration value="starting"/>
+      <xsd:enumeration value="terminating"/>
+      <xsd:enumeration value="failed"/>
     </xsd:restriction>
   </xsd:simpleType>
 …
     <xsd:annotation>
       <xsd:documentation>
+        The reply to a successful creation request.  Includes the
+        information about federants hosting sub-experiments for service
+        access as well as virtual topology and visualization
+        information.  All that information is relative to the requester.
+        ExperimentAccess includes credentials with which one can access
+        the experiment.  These may include a public key necessary to
+        prove possession of the credential and should be treated with
+        care.
+      </xsd:documentation>
+    </xsd:annotation>
+    <xsd:sequence>
+      <xsd:element name="federant" type="tns:federatedExperimentType"
+        minOccurs="1" maxOccurs="unbounded"/>
+      <xsd:element name="vtopo" type="tns:vtopoType" minOccurs="0"
+        maxOccurs="1"/>
+      <xsd:element name="vis" type="tns:visType" minOccurs="0"
+        maxOccurs="1"/>
+        Returned to let the caller know that the request is underway and what
+        credentials will eventually be able to be used to access them.
+      </xsd:documentation>
+    </xsd:annotation>
+    <xsd:sequence>
       <xsd:element name="experimentID" type="tns:IDType" minOccurs="1"
         maxOccurs="unbounded"/>
+      <xsd:element name="experimentStatus" type="tns:statusType"/>
       <xsd:element name="experimentAccess" type="tns:accessType" minOccurs="0"
         maxOccurs="1"/>
 …
         A combined topology, visualization, and federant request.
         Different information may be returned based on the user's rights
+        to see the topology.  </xsd:documentation>
+        to see the topology.
+      </xsd:documentation>
     </xsd:annotation>
     <xsd:sequence>
 …
     <xsd:annotation>
       <xsd:documentation>
+        Information on an instantiated experiment.  A createResponse
+        without the secret information.  Different information may be
+        returned based on the user's rights to see the topology.
+        Information on an instantiated experiment.  Different information may
+        be returned based on the user's rights to see the topology.  Includes
+        the information about federants hosting sub-experiments for service
+        access as well as virtual topology and visualization information.  All
+        that information is relative to the requester.  ExperimentAccess
+        includes credentials with which one can access the experiment.  These
+        may include a public key necessary to prove possession of the
+        credential and should be treated with care.
       </xsd:documentation>
     </xsd:annotation>
     <xsd:sequence>
       <xsd:element name="federant" type="tns:federatedExperimentType"
         minOccurs="1" maxOccurs="unbounded"/>
+        minOccurs="0" maxOccurs="unbounded"/>
       <xsd:element name="vtopo" type="tns:vtopoType" minOccurs="0"
         maxOccurs="1"/>
 …
       <xsd:element name="experimentID" type="tns:IDType" minOccurs="1"
         maxOccurs="unbounded"/>
+      <xsd:element name="allocationLog" type="xsd:string" minOccurs="0"
+        maxOccurs="1"/>
+      <xsd:element name="experimentStatus" type="tns:statusType"/>
+      <xsd:element name="experimentAccess" type="tns:accessType" minOccurs="0"
+        maxOccurs="1"/>
     </xsd:sequence>
   </xsd:complexType>

Note: See TracChangeset for help on using the changeset viewer.

Download in other formats: