source: fedd/federation/desktop_access.py @ 5dbcc93

Last change on this file since 5dbcc93 was 5dbcc93, checked in by Ted Faber <faber@…>, 12 years ago

Last few tricks. Manage hosts, make whole script sudo

  • Property mode set to 100644
File size: 20.2 KB
Line 
1#!/usr/local/bin/python
2
3import os,sys
4import re
5import string
6import copy
7import pickle
8import logging
9import random
10import subprocess
11
12from util import *
13from deter import fedid, generate_fedid
14from authorizer import authorizer, abac_authorizer
15from service_error import service_error
16from remote_service import xmlrpc_handler, soap_handler, service_caller
17
18from deter import topdl
19
20from access import access_base
21
# Make log messages disappear if no one configures a fedd logger.  This is
# something of an incantation: a handler that throws every record away is
# attached to the fedd.access logger, so that logger always has a handler
# even if no module above us has set one up.  It's an extra belt for the
# suspenders.
class nullHandler(logging.Handler):
    """Logging handler that silently discards every record."""

    def emit(self, record):
        return None


fl = logging.getLogger("fedd.access")
fl.addHandler(nullHandler())
31
32
33# The plug-in itself.
34class access(access_base):
35    """
    This is the desktop access plug-in for fedd.  It responds to the
    RequestAccess, ReleaseAccess, StartSegment and TerminateSegment requests
    and keeps internal state.  It attaches a single desktop to a federated
    experiment through an ssh tunnel and supports only one connection at a
    time.  It makes use of the general routines in access.access_base.

    Detailed comments are in the code.
42    """
43    def __init__(self, config=None, auth=None):
44        """
45        Initializer.  Pulls parameters out of the ConfigParser's access
46        section, and initializes simple internal state.  This version reads a
47        maximum integer to assign from the configuration file, while most other
48        configuration entries  are read by the base class. 
49
50        An access database in the cannonical format is also read as well as a
51        state database that is a hash of internal state.  Routines to
52        manipulate these are in the base class, but specializations appear
53        here.
54
55        The access database maps users to a simple string.
56        """
57
58        # Calling the base initializer, which reads canonical configuration
59        # information and initializes canonical members.
60        access_base.__init__(self, config, auth)
61        # Reading the maximum integer parameter from the configuration file
62
63        self.src_addr = config.get('access', 'interface_address')
64        self.router = config.get('access', 'gateway')
65        self.hostname = config.get('access', 'hostname')
66        # Storage for ephemeral ssh keys and host files
67        self.localdir = config.get('access', 'localdir')
68        self.ssh_identity = None
69
70        # hostname is the name of the ssh endpoint for the other side.  That
71        # side needs it to set up routing tables.  If hostname is not
72        # available, but an IP address is, use that.
73        if self.hostname is None:
74            if  self.src_addr is None:
75                raise service_error(service_error.server_config,
76                        'Hostname or interface_address must be set in config')
77            self.hostname = self.src_addr
78       
79        self.ssh_port = config.get('access', 'ssh_port', '22')
80
81        # authorization information
82        self.auth_type = config.get('access', 'auth_type') \
83                or 'abac'
84        self.auth_dir = config.get('access', 'auth_dir')
85        accessdb = config.get("access", "accessdb")
86        # initialize the authorization system.  We make a call to
87        # read the access database that maps from authorization information
88        # into local information.  The local information is parsed by the
89        # translator above.
90        if self.auth_type == 'abac':
91            #  Load the current authorization state
92            self.auth = abac_authorizer(load=self.auth_dir)
93            self.access = [ ]
94            if accessdb:
95                try:
96                    self.read_access(accessdb)
97                except EnvironmentError, e:
98                    self.log.error("Cannot read %s: %s" % \
99                            (config.get("access", "accessdb"), e))
100                    raise e
101        else:
102            raise service_error(service_error.internal, 
103                    "Unknown auth_type: %s" % self.auth_type)
104
105        # The superclass has read the state, but if this is the first run ever,
106        # we must initialise the running flag.  This plugin only supports one
107        # connection, so StartSegment will fail when self.state['running'] is
108        # true.
109        self.state_lock.acquire()
110        if 'running' not in self.state:
111            self.state['running'] = False
112        self.state_lock.release()
113
114        # These dictionaries register the plug-in's local routines for handline
115        # these four messages with the server code above.  There's a version
116        # for SOAP and XMLRPC, depending on which interfaces the plugin
117        # supports.  There's rarely a technical reason not to support one or
118        # the other - the plugin code almost never deals with the transport -
119        # but if a plug-in writer wanted to disable XMLRPC, they could leave
120        # the self.xmlrpc_services dictionary empty.
121        self.soap_services = {\
122            'RequestAccess': soap_handler("RequestAccess", self.RequestAccess),
123            'ReleaseAccess': soap_handler("ReleaseAccess", self.ReleaseAccess),
124            'StartSegment': soap_handler("StartSegment", self.StartSegment),
125            'TerminateSegment': soap_handler("TerminateSegment", 
126                self.TerminateSegment),
127            }
128        self.xmlrpc_services =  {\
129            'RequestAccess': xmlrpc_handler('RequestAccess',
130                self.RequestAccess),
131            'ReleaseAccess': xmlrpc_handler('ReleaseAccess',
132                self.ReleaseAccess),
133            'StartSegment': xmlrpc_handler("StartSegment", self.StartSegment),
134            'TerminateSegment': xmlrpc_handler('TerminateSegment',
135                self.TerminateSegment),
136            }
137        self.call_SetValue = service_caller('SetValue', log=self.log)
138        self.call_GetValue = service_caller('GetValue', log=self.log)
139
    # ReleaseAccess comes from the base class; this is a slightly modified
    # version of the base class's RequestAccess that adds a fedAttr to force
    # this side to be active.
143    def RequestAccess(self, req, fid):
144        """
145        Handle an access request.  Success here maps the requester into the
146        local access control space and establishes state about that user keyed
147        to a fedid.  We also save a copy of the certificate underlying that
148        fedid so this allocation can access configuration information and
149        shared parameters on the experiment controller.
150        """
151
152        self.log.info("RequestAccess called by %s" % fid)
153        # The dance to get into the request body
154        if req.has_key('RequestAccessRequestBody'):
155            req = req['RequestAccessRequestBody']
156        else:
157            raise service_error(service_error.req, "No request!?")
158
159        # Base class lookup routine.  If this fails, it throws a service
160        # exception denying access that triggers a fault response back to the
161        # caller.
162        found,  owners, proof = self.lookup_access(req, fid)
163        self.log.info(
164                "[RequestAccess] Access granted local creds %s" % found)
165        # Make a fedid for this allocation
166        allocID, alloc_cert = generate_fedid(subj="alloc", log=self.log)
167        aid = unicode(allocID)
168
169        # Store the data about this allocation:
170        self.state_lock.acquire()
171        self.state[aid] = { }
172        self.state[aid]['user'] = found
173        self.state[aid]['owners'] = owners
174        self.state[aid]['auth'] = set()
175        # Authorize the creating fedid and the principal representing the
176        # allocation to manipulate it.
177        self.append_allocation_authorization(aid, 
178                ((fid, allocID), (allocID, allocID)))
179        self.write_state()
180        self.state_lock.release()
181
182        # Create a directory to stash the certificate in, ans stash it.
183        try:
184            f = open("%s/%s.pem" % (self.certdir, aid), "w")
185            print >>f, alloc_cert
186            f.close()
187        except EnvironmentError, e:
188            raise service_error(service_error.internal, 
189                    "Can't open %s/%s : %s" % (self.certdir, aid, e))
190        self.log.debug('[RequestAccess] Returning allocation ID: %s' % allocID)
191        msg = { 
192                'allocID': { 'fedid': allocID }, 
193                'fedAttr': [{ 'attribute': 'nat_portals', 'value': 'True' }],
194                'proof': proof.to_dict()
195                }
196        return msg
197
198    def validate_topology(self, top):
199        '''
200        Validate the topology.  Desktops can only be single connections.
201        Though the topology will include a portal and a node, the access
202        controller will implement both on one node.
203
204        As more capabilities are added to the contoller the constraints here
205        will relax.
206        '''
207
208        comps = []
209        for e in top.elements:
210            if isinstance(e, topdl.Computer): comps.append(e)
211        if len(comps) > 2: 
212            raise service_error(service_error.req,
213                    "Desktop only supports 1-node subexperiments")
214
215        portals = 0
216        for c in comps:
217            if c.get_attribute('portal') is not None: 
218                portals += 1
219                continue
220            if len(c.interface) > 1:
221                raise service_error(service_error.req,
222                        "Desktop Node has more than one interface")
223            i  = c.interface[0]
224            if len(i.subs) > 1: 
225                raise service_error(service_error.req,
226                        "Desktop Node has more than one substate on interface")
227            sub = i.subs[0]
228            for i in sub.interfaces:
229                if i.element not in comps:
230                    raise service_error(service_error.req,
231                            "Desktop Node connected to non-portal")
232
233        if portals > 1:
234            raise service_error(service_error.req,
235                    "Desktop segment has more than one portal")
236        return True
237
238    def validate_connInfo(self, connInfo):
239        if len(connInfo) != 1: 
240            raise service_error(service_error.req,
241                    "Desktop segment requests multiple connections")
242        if connInfo[0]['type'] != 'ssh':
243            raise service_error(service_error.req,
244                    "Desktop segment requires ssh connecton")
245        return True
246
247    def export_store_info(self, certfile, connInfo):
248        '''
249        Tell the other portal node where to reach this desktop.  The other side
250        uses this information to set up routing, though the ssh_port is unused
251        as the Desktop always initiates ssh connections.
252        '''
253        values = { 'peer': self.hostname, 'ssh_port': self.ssh_port }
254        for c in connInfo:
255            for p in c.get('parameter', []):
256                if p.get('type','') == 'input': continue
257                pname = p.get('name', '')
258                key = p.get('key', '')
259                surl = p.get('store', '')
260                if pname not in values:
261                    self.log('Unknown export parameter: %s'  % pname)
262                    continue
263                val = values[pname]
264                req = { 'name': key, 'value': val }
265                self.log.debug('Setting %s (%s) to %s on %s' % \
266                        (pname, key,  val, surl))
267                self.call_SetValue(surl, req, certfile)
268
269    def set_route(self, dest, script, gw=None, src=None):
270        if sys.platform.startswith('freebsd'):
271            if src is not None and gw is not None:
272                raise service_error(service_error.internal, 
273                        'FreeBSD will not route based on src address')
274            elif src is not None:
275                raise service_error(service_error.internal, 
276                        'FreeBSD will not route based on src address')
277            elif gw is not None:
278                print >>script, 'route add %s %s' % (dest, gw)
279        elif sys.platform.startswith('linux'):
280            if src is not None and gw is not None:
281                print >>script, 'ip route add %s via %s src %s' % \
282                        (dest, gw, src)
283            elif src is not None:
284                print >>script, 'ip route add %s src %s' % \
285                        (dest, src)
286            elif gw is not None:
287                print >>script, 'ip route add %s via %s' % (dest, gw)
288        else:
289            raise service_error(service_error.internal, 
290                    'Unknown platform %s' % sys.platform)
291
292    def unset_route(self, dest, script):
293        rv = 0
294        if sys.platform.startswith('freebsd'):
295            print >>script, 'route delete %s' % dest
296        elif sys.platform.startswith('linux'):
297            print >>script, 'ip route delete %s' % dest
298
299    def find_a_peer(self, addr): 
300        '''
301        Find another node in the experiment that's on our subnet.  This is a
302        hack to handle the problem that we really cannot require the desktop to
303        dynamically route.  Will be improved by distributing static routes.
304        '''
305
306        peer = None
307        hosts = os.path.join(self.localdir, 'hosts')
308        p = addr.rfind('.')
309        if p == -1:
310            raise service_error(service_error.req, 'bad address in topology')
311        prefix = addr[0:p]
312        addr_re = re.compile('(%s.\\d+)' % prefix)
313        try:
314            f = open(hosts, 'r')
315            for line in f:
316                m = addr_re.search(line)
317                if m is not None and m.group(1) != addr:
318                    peer = m.group(1)
319                    break
320            else:
321                raise service_error(service_error.req, 
322                        'No other nodes in this subnet??')
323        except EnvironmentError, e:
324            raise service_error(service_error.internal, 
325                    'Cannot open %s: %s' % (e.filename, e.strerror))
326        return peer
327
328
329
330
331    def configure_desktop(self, top, connInfo):
332        '''
333        Build the connection.  Establish routing to the peer if using a
334        separate interface, wait until the other end confirms setup, establish
335        the ssh layer-two tunnel (tap), assign the in-experiment IP address to
336        the tunnel and establish routing to the experiment through the tap.
337        '''
338
339
340        # get the peer and ssh port from the portal and our IP from the other
341        peer = None
342        port = None
343        my_addr = None
344        for e in top.elements:
345            if not isinstance(e, topdl.Computer): continue
346            if e.get_attribute('portal') is None: 
347                # there should be one interface with one IPv4 address
348                if len(e.interface) <1 :
349                    raise service_error(service_error.internal,
350                            'No interface on experiment node!?!?')
351                my_addr = e.interface[0].get_attribute('ip4_address')
352            else:
353                for ci in connInfo:
354                    if ci.get('portal', '') != e.name: continue
355                    peer = ci.get('peer')
356                    port = '22'
357                    for a in ci.get('fedAttr', []):
358                        if a['attribute'] == 'ssh_port': port = a['value']
359
360        # XXX scan hosts for IP addresses and compose better routing entry
361       
362        if not all([peer, port, my_addr]):
363            raise service_error(service_error.req, 
364                    'Cannot find all config parameters %s %s %s' % (peer, port, my_addr))
365
366        exp_peer = self.find_a_peer(my_addr)
367
368        cscript = os.path.join(self.localdir, 'connect')
369        dscript = os.path.join(self.localdir, 'disconnect')
370        local_hosts = os.path.join(self.localdir, 'hosts')
371        try:
372            f = open(cscript, 'w')
373            print >>f, '#!/bin/sh'
374            # This picks the outgoing interface to the experiment using the
375            # routing system.
376            self.set_route(peer, f, self.router, self.src_addr)
377            # Wait until the other end reports that it is configured py placing
378            # a file this end can access into its local file system.  Try once
379            # a minute.
380            print >>f,'while ! /usr/bin/scp -o "StrictHostKeyChecking no" -i %s %s:/usr/local/federation/etc/prep_done /dev/null; do' % (self.ssh_identity, peer)
381            print >>f, 'sleep 60; done'
382            print >>f, ('ssh -w 0:0 -p %s -o "Tunnel ethernet" ' + \
383                    '-o "StrictHostKeyChecking no" -i %s %s perl -I/usr/local/federation/lib /usr/local/federation/bin/setup_bridge.pl --tapno=0 --addr=%s &') % \
384                    (port, self.ssh_identity, peer, my_addr)
385            # This should give the tap a a chance to come up
386            print >>f,'sleep 10'
387            # Add experiment nodes to hosts
388            print >>f, 'cp /etc/hosts /etc/hosts.DETER.fedd.hold'
389            print >>f, 'echo "#--- BEGIN FEDD ADDITIONS ---" >> /etc/hosts'
390            print >>f, 'cat %s >> /etc/hosts' % local_hosts
391            print >>f, 'echo "#--- END FEDD ADDITIONS ---" >> /etc/hosts'
392            # Assign tap address and route experiment connections through it.
393            print >>f, 'ifconfig tap0 %s netmask 255.255.255.0 up' % \
394                    my_addr
395            self.set_route('10.0.0.0/8', f, exp_peer)
396            f.close()
397            os.chmod(cscript, 0755)
398            f = open(dscript, 'w')
399            print >>f, '#!/bin/sh'
400            print >>f, 'ifconfig tap0 destroy'
401            self.unset_route(peer, f)
402            self.unset_route('10.0.0.0/8', f)
403            print >>f, 'mv /etc/hosts.DETER.fedd.hold /etc/hosts'
404            f.close()
405            os.chmod(dscript, 0755)
406        except EnvironmentError, e:
407            raise service_error(service_error.internal, 
408                    'Cannot create connect %s: %s' % (e.filename, e.strerror))
409        script_log = open('/tmp/connect.log', 'w')
410        subprocess.Popen(['sudo', '/bin/sh', cscript], stdout=script_log, stderr=script_log)
411        return True
412
413    def StartSegment(self, req, fid):
414        """
415        Start a segment.  In this simple skeleton, this means to parse the
416        request and assign an unassigned integer to it.  We store the integer
417        in the persistent state.
418        """
419        try:
420            req = req['StartSegmentRequestBody']
421            # Get the request topology.  If not present, a KeyError is thrown.
422            topref = req['segmentdescription']['topdldescription']
423            # The fedid of the allocation we're attaching resources to
424            auth_attr = req['allocID']['fedid']
425        except KeyError:
426            raise service_error(service_error.req, "Badly formed request")
427
428        # String version of the allocation ID for keying
429        aid = "%s" % auth_attr
430        # Authorization check
431        access_ok, proof = self.auth.check_attribute(fid, auth_attr, 
432                with_proof=True)
433        if not access_ok:
434            raise service_error(service_error.access, "Access denied", 
435                    proof=proof)
436        else:
437            # See if this is a replay of an earlier succeeded StartSegment -
438            # sometimes SSL kills 'em.  If so, replay the response rather than
439            # redoing the allocation.
440            self.state_lock.acquire()
441            # Test and set :-)
442            running = self.state['running']
443            self.state['running'] = True
444            retval = self.state[aid].get('started', None)
445            self.state_lock.release()
446            if retval:
447                self.log.warning(
448                        "[StartSegment] Duplicate StartSegment for %s: " \
449                                % aid + \
450                        "replaying response")
451                return retval
452            if running:
453                self.log.debug('[StartSegment] already running')
454                raise service_error(service_error.federant,
455                        'Desktop is already in an experiment')
456
457        certfile = "%s/%s.pem" % (self.certdir, aid)
458
459        # Convert the topology into topdl data structures.  Again, the
460        # skeletion doesn't do anything with it, but this is how one parses a
461        # topology request.
462        if topref: topo = topdl.Topology(**topref)
463        else:
464            raise service_error(service_error.req, 
465                    "Request missing segmentdescription'")
466
467        err = None
468        try:
469            self.validate_topology(topo)
470
471            # The attributes of the request.  The ones we care about are the ssh
472            # keys to operate the tunnel.
473            attrs = req.get('fedAttr', [])
474            for a in attrs:
475                # Save the hosts and ssh_privkeys to our local dir
476                if a['attribute'] in ('hosts', 'ssh_secretkey'):
477                    self.log.debug('Getting %s from %s' % \
478                            (a['attribute'], a['value']))
479                    get_url(a['value'], certfile, self.localdir, log=self.log)
480                    base = os.path.basename(a['value'])
481                    if a['attribute'] == 'ssh_secretkey':
482                        self.ssh_identity = os.path.join(self.localdir, base)
483                    os.chmod(os.path.join(self.localdir, base), 0600)
484                else:
485                    self.log.debug('Ignoring attribute %s' % a['attribute'])
486
487            # Gather connection information and exchange parameters.
488            connInfo = req.get('connection', [])
489            self.validate_connInfo(connInfo)
490            self.export_store_info(certfile, connInfo)
491            self.import_store_info(certfile, connInfo)
492
493            #build it
494            self.configure_desktop(topo, connInfo)
495        except service_error, e:
496            err = e
497
498        # Save the information
499        if err is None:
500            # It's possible that the StartSegment call gets retried (!).  if
501            # the 'started' key is in the allocation, we'll return it rather
502            # than redo the setup.  The integer allocation was saved when we
503            # made it.
504            self.state_lock.acquire()
505            self.state[aid]['started'] = { 
506                    'allocID': req['allocID'],
507                    'allocationLog': "Allocatation complete",
508                    'segmentdescription': { 'topdldescription': topo.to_dict() },
509                    'proof': proof.to_dict(),
510                    }
511            retval = copy.deepcopy(self.state[aid]['started'])
512            self.write_state()
513            self.state_lock.release()
514        else:
515            # Something bad happened - clear the "running" flag so we can try
516            # again
517            self.state_lock.acquire()
518            self.state['running'] = False
519            self.state_lock.release()
520            raise err
521
522        return retval
523
524    def TerminateSegment(self, req, fid):
525        """
526        Remove the resources associated with th eallocation and stop the music.
527        In this example, this simply means removing the integer we allocated.
528        """
529        # Gather the same access information as for Start Segment
530        try:
531            req = req['TerminateSegmentRequestBody']
532        except KeyError:
533            raise service_error(service_error.req, "Badly formed request")
534
535        auth_attr = req['allocID']['fedid']
536        aid = "%s" % auth_attr
537
538        self.log.debug("Terminate request for %s" %aid)
539        # Check authorization
540        access_ok, proof = self.auth.check_attribute(fid, auth_attr, 
541                with_proof=True)
542        if not access_ok:
543            raise service_error(service_error.access, "Access denied", 
544                    proof=proof)
545        cscript = os.path.join(self.localdir, 'connect')
546        dscript = os.path.join(self.localdir, 'disconnect')
547        # Do the work of disconnecting
548        if os.path.exists(dscript):
549            self.log.debug('calling %s' % dscript)
550            rv = subprocess.call(['sudo', '/bin/sh', dscript])
551            if rv != 0:
552                self.log.warning('%s had an error: %d' % (dscript, rv))
553        else:
554            self.log.warn('No disconnection script!?')
555
556        try:
557            for bfn in os.listdir(self.localdir):
558                fn = os.path.join(self.localdir, bfn)
559                self.log.debug('Removing %s' % fn)
560                if os.path.exists(fn):
561                    os.remove(fn)
562        except EnvironmentError, e:
563            self.log.warn('Failed to remove %s: %s' % (e.filename, e.strerror))
564
565        self.ssh_identity = None
566
567        self.state_lock.acquire()
568        self.state['running'] = False
569        self.state_lock.release()
570   
571        return { 'allocID': req['allocID'], 'proof': proof.to_dict() }
Note: See TracBrowser for help on using the repository browser.