From b907303a39a69880d7db4e606b73fcb5a41f7bc1 Mon Sep 17 00:00:00 2001 From: HW42 Date: Sat, 21 Oct 2017 05:57:57 +0200 Subject: [PATCH] qubes/vm: Improve stopped event handling The previous version did not ensure that the stopped/shutdown event was handled before a new VM start. This can easily lead to problems like in QubesOS/qubes-issues#3164. This improved version now ensures that the stopped/shutdown events are handled before a new VM start. Additionally this version should be more robust against unreliable events from libvirt. It handles missing, duplicated and delayed stopped events. Instead of one 'domain-shutdown' event there are now 'domain-stopped' and 'domain-shutdown'. The later is generated after the former. This way it's easy to run code after the VM is shutdown including the stop of it's storage. --- qubes/app.py | 2 +- qubes/tests/integ/basic.py | 1 - qubes/vm/dispvm.py | 28 ++++---- qubes/vm/qubesvm.py | 132 +++++++++++++++++++++++++++++++------ 4 files changed, 126 insertions(+), 37 deletions(-) diff --git a/qubes/app.py b/qubes/app.py index bdc2c904..cc7fa68a 100644 --- a/qubes/app.py +++ b/qubes/app.py @@ -1183,7 +1183,7 @@ class Qubes(qubes.PropertyHolder): return if event == libvirt.VIR_DOMAIN_EVENT_STOPPED: - vm.fire_event('domain-shutdown') + vm.on_libvirt_domain_stopped() @qubes.events.handler('domain-pre-delete') def on_domain_pre_deleted(self, event, vm): diff --git a/qubes/tests/integ/basic.py b/qubes/tests/integ/basic.py index 7a9761f8..cec5ec75 100644 --- a/qubes/tests/integ/basic.py +++ b/qubes/tests/integ/basic.py @@ -137,7 +137,6 @@ class TC_00_Basic(qubes.tests.SystemTestCase): self.test_failure_reason = 'domain-shutdown event received twice' self.domain_shutdown_handled = True - @unittest.expectedFailure def test_201_shutdown_event_race(self): '''Regression test for 3164 - pure events edition''' vmname = self.make_vm_name('appvm') diff --git a/qubes/vm/dispvm.py b/qubes/vm/dispvm.py index 399a3df3..3afd9226 100644 --- a/qubes/vm/dispvm.py +++ b/qubes/vm/dispvm.py @@ -129,18 +129,18 @@ class DispVM(qubes.vm.qubesvm.QubesVM): raise qubes.exc.QubesValueError(self, 'Cannot change template of Disposable VM') + @qubes.events.handler('domain-shutdown') @asyncio.coroutine - def on_domain_shutdown_coro(self): - '''Coroutine for executing cleanup after domain shutdown. + def on_domain_shutdown(self, _event, **_kwargs): + yield from self._auto_cleanup() - This override default action defined in QubesVM.on_domain_shutdown_coro - ''' - with (yield from self.startup_lock): - yield from self.storage.stop() - if self.auto_cleanup and self in self.app.domains: - yield from self.remove_from_disk() - del self.app.domains[self] - self.app.save() + @asyncio.coroutine + def _auto_cleanup(self): + '''Do auto cleanup if enabled''' + if self.auto_cleanup and self in self.app.domains: + yield from self.remove_from_disk() + del self.app.domains[self] + self.app.save() @classmethod @asyncio.coroutine @@ -206,10 +206,6 @@ class DispVM(qubes.vm.qubesvm.QubesVM): yield from super(DispVM, self).start(**kwargs) except: - # cleanup also on failed startup; there is potential race with - # self.on_domain_shutdown_coro, so check if wasn't already removed - if self.auto_cleanup and self in self.app.domains: - yield from self.remove_from_disk() - del self.app.domains[self] - self.app.save() + # Cleanup also on failed startup + yield from self._auto_cleanup() raise diff --git a/qubes/vm/qubesvm.py b/qubes/vm/qubesvm.py index 241aaf87..e2bb993e 100644 --- a/qubes/vm/qubesvm.py +++ b/qubes/vm/qubesvm.py @@ -151,9 +151,33 @@ class QubesVM(qubes.vm.mix.net.NetVMMixin, qubes.vm.BaseVM): *other arguments are as in :py:meth:`start`* + .. event:: domain-stopped (subject, event) + + Fired when domain has been stopped. + + This event is emitted before ``'domain-shutdown'`` and will trigger + the cleanup in QubesVM. So if you require that the cleanup has + already run use ``'domain-shutdown'``. + + Note that you can receive this event as soon as you received + ``'domain-pre-start'``. This also can be emitted in case of a + startup failure, before or after ``'domain-start-failed'``. + + Handler for this event can be asynchronous (a coroutine). + + :param subject: Event emitter (the qube object) + :param event: Event name (``'domain-stopped'``) + .. event:: domain-shutdown (subject, event) - Fired when domain has been shut down. + Fired when domain has been shut down. It is generated after + ``'domain-stopped'``. + + Note that you can receive this event as soon as you received + ``'domain-pre-start'``. This also can be emitted in case of a + startup failure, before or after ``'domain-start-failed'``. + + Handler for this event can be asynchronous (a coroutine). :param subject: Event emitter (the qube object) :param event: Event name (``'domain-shutdown'``) @@ -643,6 +667,17 @@ class QubesVM(qubes.vm.mix.net.NetVMMixin, qubes.vm.BaseVM): self._libvirt_domain = None self._qdb_connection = None + # We assume a fully halted VM here. The 'domain-init' handler will + # check if the VM is already running. + self._domain_stopped_event_received = True + self._domain_stopped_event_handled = True + + self._domain_stopped_future = None + + # Internal lock to ensure ordering between _domain_stopped_coro() and + # start(). This should not be accessed anywhere else. + self._domain_stopped_lock = asyncio.Lock() + if xml is None: # we are creating new VM and attributes came through kwargs assert hasattr(self, 'qid') @@ -712,6 +747,8 @@ class QubesVM(qubes.vm.mix.net.NetVMMixin, qubes.vm.BaseVM): if not self.app.vmm.offline_mode and self.is_running(): self.start_qdb_watch() + self._domain_stopped_event_received = False + self._domain_stopped_event_handled = False @qubes.events.handler('property-set:label') def on_property_set_label(self, event, name, newvalue, oldvalue=None): @@ -797,6 +834,30 @@ class QubesVM(qubes.vm.mix.net.NetVMMixin, qubes.vm.BaseVM): if self.get_power_state() != 'Halted': return + with (yield from self._domain_stopped_lock): + # Don't accept any new stopped event's till a new VM has been + # created. If we didn't received any stopped event or it wasn't + # handled yet we will handle this in the next lines. + self._domain_stopped_event_received = True + + if self._domain_stopped_future is not None: + # Libvirt stopped event was already received, so cancel the + # future. If it didn't generate the Qubes events yet we + # will do it below. + self._domain_stopped_future.cancel() + self._domain_stopped_future = None + + if not self._domain_stopped_event_handled: + # No Qubes domain-stopped events have been generated yet. + # So do this now. + + # Set this immediately such that we don't generate events + # twice if an exception gets thrown. + self._domain_stopped_event_handled = True + + yield from self.fire_event_async('domain-stopped') + yield from self.fire_event_async('domain-shutdown') + self.log.info('Starting {}'.format(self.name)) yield from self.fire_event_async('domain-pre-start', @@ -816,14 +877,17 @@ class QubesVM(qubes.vm.mix.net.NetVMMixin, qubes.vm.BaseVM): qmemman_client = yield from asyncio.get_event_loop().\ run_in_executor(None, self.request_memory, mem_required) + yield from self.storage.start() + except Exception as exc: # let anyone receiving domain-pre-start know that startup failed yield from self.fire_event_async('domain-start-failed', reason=str(exc)) + if qmemman_client: + qmemman_client.close() raise try: - yield from self.storage.start() self._update_libvirt_domain() self.libvirt_domain.createWithFlags( @@ -833,12 +897,16 @@ class QubesVM(qubes.vm.mix.net.NetVMMixin, qubes.vm.BaseVM): # let anyone receiving domain-pre-start know that startup failed yield from self.fire_event_async('domain-start-failed', reason=str(exc)) + self.storage.stop() raise finally: if qmemman_client: qmemman_client.close() + self._domain_stopped_event_received = False + self._domain_stopped_event_handled = False + try: yield from self.fire_event_async('domain-spawn', start_guid=start_guid) @@ -870,24 +938,50 @@ class QubesVM(qubes.vm.mix.net.NetVMMixin, qubes.vm.BaseVM): return self - @asyncio.coroutine - def on_domain_shutdown_coro(self): - '''Coroutine for executing cleanup after domain shutdown. - Do not allow domain to be started again until this finishes. - ''' - with (yield from self.startup_lock): - try: - yield from self.storage.stop() - except qubes.storage.StoragePoolException: - self.log.exception('Failed to stop storage for domain %s', - self.name) + def on_libvirt_domain_stopped(self): + ''' Handle VIR_DOMAIN_EVENT_STOPPED events from libvirt. - @qubes.events.handler('domain-shutdown') - def on_domain_shutdown(self, _event, **_kwargs): - '''Cleanup after domain shutdown''' - # TODO: ensure that domain haven't been started _before_ this - # coroutine got a chance to acquire a lock - asyncio.ensure_future(self.on_domain_shutdown_coro()) + This is not a Qubes event handler. Instead we do some sanity checks + and synchronization with start() and then emits Qubes events. + ''' + + state = self.get_power_state() + if state not in ['Halted', 'Crashed', 'Dying']: + self.log.warning('Stopped event from libvirt received,' + ' but domain is in state {}!'.format(state)) + # ignore this unexpected event + return + + if self._domain_stopped_event_received: + self.log.warning('Duplicated stopped event from libvirt received!') + # ignore this unexpected event + return + + self._domain_stopped_event_received = True + self._domain_stopped_future = \ + asyncio.ensure_future(self._domain_stopped_coro()) + + @asyncio.coroutine + def _domain_stopped_coro(self): + with (yield from self._domain_stopped_lock): + assert not self._domain_stopped_event_handled + + # Set this immediately such that we don't generate events twice if + # an exception gets thrown. + self._domain_stopped_event_handled = True + + yield from self.fire_event_async('domain-stopped') + yield from self.fire_event_async('domain-shutdown') + + @qubes.events.handler('domain-stopped') + @asyncio.coroutine + def on_domain_stopped(self, _event, **_kwargs): + '''Cleanup after domain was stopped''' + try: + yield from self.storage.stop() + except qubes.storage.StoragePoolException: + self.log.exception('Failed to stop storage for domain %s', + self.name) @asyncio.coroutine def shutdown(self, force=False, wait=False):