qubes/vm: Improve stopped event handling

The previous version did not ensure that the stopped/shutdown event was
handled before a new VM start. This can easily lead to problems like in
QubesOS/qubes-issues#3164.

This improved version now ensures that the stopped/shutdown events are
handled before a new VM start.

Additionally this version should be more robust against unreliable
events from libvirt. It handles missing, duplicated and delayed stopped
events.

Instead of one 'domain-shutdown' event there are now 'domain-stopped'
and 'domain-shutdown'. The later is generated after the former. This way
it's easy to run code after the VM is shutdown including the stop of
it's storage.
This commit is contained in:
HW42 2017-10-21 05:57:57 +02:00
parent 682d9503ee
commit b907303a39
4 changed files with 126 additions and 37 deletions

View File

@ -1183,7 +1183,7 @@ class Qubes(qubes.PropertyHolder):
return
if event == libvirt.VIR_DOMAIN_EVENT_STOPPED:
vm.fire_event('domain-shutdown')
vm.on_libvirt_domain_stopped()
@qubes.events.handler('domain-pre-delete')
def on_domain_pre_deleted(self, event, vm):

View File

@ -137,7 +137,6 @@ class TC_00_Basic(qubes.tests.SystemTestCase):
self.test_failure_reason = 'domain-shutdown event received twice'
self.domain_shutdown_handled = True
@unittest.expectedFailure
def test_201_shutdown_event_race(self):
'''Regression test for 3164 - pure events edition'''
vmname = self.make_vm_name('appvm')

View File

@ -129,18 +129,18 @@ class DispVM(qubes.vm.qubesvm.QubesVM):
raise qubes.exc.QubesValueError(self,
'Cannot change template of Disposable VM')
@qubes.events.handler('domain-shutdown')
@asyncio.coroutine
def on_domain_shutdown_coro(self):
'''Coroutine for executing cleanup after domain shutdown.
def on_domain_shutdown(self, _event, **_kwargs):
yield from self._auto_cleanup()
This override default action defined in QubesVM.on_domain_shutdown_coro
'''
with (yield from self.startup_lock):
yield from self.storage.stop()
if self.auto_cleanup and self in self.app.domains:
yield from self.remove_from_disk()
del self.app.domains[self]
self.app.save()
@asyncio.coroutine
def _auto_cleanup(self):
'''Do auto cleanup if enabled'''
if self.auto_cleanup and self in self.app.domains:
yield from self.remove_from_disk()
del self.app.domains[self]
self.app.save()
@classmethod
@asyncio.coroutine
@ -206,10 +206,6 @@ class DispVM(qubes.vm.qubesvm.QubesVM):
yield from super(DispVM, self).start(**kwargs)
except:
# cleanup also on failed startup; there is potential race with
# self.on_domain_shutdown_coro, so check if wasn't already removed
if self.auto_cleanup and self in self.app.domains:
yield from self.remove_from_disk()
del self.app.domains[self]
self.app.save()
# Cleanup also on failed startup
yield from self._auto_cleanup()
raise

View File

@ -151,9 +151,33 @@ class QubesVM(qubes.vm.mix.net.NetVMMixin, qubes.vm.BaseVM):
*other arguments are as in :py:meth:`start`*
.. event:: domain-stopped (subject, event)
Fired when domain has been stopped.
This event is emitted before ``'domain-shutdown'`` and will trigger
the cleanup in QubesVM. So if you require that the cleanup has
already run use ``'domain-shutdown'``.
Note that you can receive this event as soon as you received
``'domain-pre-start'``. This also can be emitted in case of a
startup failure, before or after ``'domain-start-failed'``.
Handler for this event can be asynchronous (a coroutine).
:param subject: Event emitter (the qube object)
:param event: Event name (``'domain-stopped'``)
.. event:: domain-shutdown (subject, event)
Fired when domain has been shut down.
Fired when domain has been shut down. It is generated after
``'domain-stopped'``.
Note that you can receive this event as soon as you received
``'domain-pre-start'``. This also can be emitted in case of a
startup failure, before or after ``'domain-start-failed'``.
Handler for this event can be asynchronous (a coroutine).
:param subject: Event emitter (the qube object)
:param event: Event name (``'domain-shutdown'``)
@ -643,6 +667,17 @@ class QubesVM(qubes.vm.mix.net.NetVMMixin, qubes.vm.BaseVM):
self._libvirt_domain = None
self._qdb_connection = None
# We assume a fully halted VM here. The 'domain-init' handler will
# check if the VM is already running.
self._domain_stopped_event_received = True
self._domain_stopped_event_handled = True
self._domain_stopped_future = None
# Internal lock to ensure ordering between _domain_stopped_coro() and
# start(). This should not be accessed anywhere else.
self._domain_stopped_lock = asyncio.Lock()
if xml is None:
# we are creating new VM and attributes came through kwargs
assert hasattr(self, 'qid')
@ -712,6 +747,8 @@ class QubesVM(qubes.vm.mix.net.NetVMMixin, qubes.vm.BaseVM):
if not self.app.vmm.offline_mode and self.is_running():
self.start_qdb_watch()
self._domain_stopped_event_received = False
self._domain_stopped_event_handled = False
@qubes.events.handler('property-set:label')
def on_property_set_label(self, event, name, newvalue, oldvalue=None):
@ -797,6 +834,30 @@ class QubesVM(qubes.vm.mix.net.NetVMMixin, qubes.vm.BaseVM):
if self.get_power_state() != 'Halted':
return
with (yield from self._domain_stopped_lock):
# Don't accept any new stopped event's till a new VM has been
# created. If we didn't received any stopped event or it wasn't
# handled yet we will handle this in the next lines.
self._domain_stopped_event_received = True
if self._domain_stopped_future is not None:
# Libvirt stopped event was already received, so cancel the
# future. If it didn't generate the Qubes events yet we
# will do it below.
self._domain_stopped_future.cancel()
self._domain_stopped_future = None
if not self._domain_stopped_event_handled:
# No Qubes domain-stopped events have been generated yet.
# So do this now.
# Set this immediately such that we don't generate events
# twice if an exception gets thrown.
self._domain_stopped_event_handled = True
yield from self.fire_event_async('domain-stopped')
yield from self.fire_event_async('domain-shutdown')
self.log.info('Starting {}'.format(self.name))
yield from self.fire_event_async('domain-pre-start',
@ -816,14 +877,17 @@ class QubesVM(qubes.vm.mix.net.NetVMMixin, qubes.vm.BaseVM):
qmemman_client = yield from asyncio.get_event_loop().\
run_in_executor(None, self.request_memory, mem_required)
yield from self.storage.start()
except Exception as exc:
# let anyone receiving domain-pre-start know that startup failed
yield from self.fire_event_async('domain-start-failed',
reason=str(exc))
if qmemman_client:
qmemman_client.close()
raise
try:
yield from self.storage.start()
self._update_libvirt_domain()
self.libvirt_domain.createWithFlags(
@ -833,12 +897,16 @@ class QubesVM(qubes.vm.mix.net.NetVMMixin, qubes.vm.BaseVM):
# let anyone receiving domain-pre-start know that startup failed
yield from self.fire_event_async('domain-start-failed',
reason=str(exc))
self.storage.stop()
raise
finally:
if qmemman_client:
qmemman_client.close()
self._domain_stopped_event_received = False
self._domain_stopped_event_handled = False
try:
yield from self.fire_event_async('domain-spawn',
start_guid=start_guid)
@ -870,24 +938,50 @@ class QubesVM(qubes.vm.mix.net.NetVMMixin, qubes.vm.BaseVM):
return self
@asyncio.coroutine
def on_domain_shutdown_coro(self):
'''Coroutine for executing cleanup after domain shutdown.
Do not allow domain to be started again until this finishes.
'''
with (yield from self.startup_lock):
try:
yield from self.storage.stop()
except qubes.storage.StoragePoolException:
self.log.exception('Failed to stop storage for domain %s',
self.name)
def on_libvirt_domain_stopped(self):
''' Handle VIR_DOMAIN_EVENT_STOPPED events from libvirt.
@qubes.events.handler('domain-shutdown')
def on_domain_shutdown(self, _event, **_kwargs):
'''Cleanup after domain shutdown'''
# TODO: ensure that domain haven't been started _before_ this
# coroutine got a chance to acquire a lock
asyncio.ensure_future(self.on_domain_shutdown_coro())
This is not a Qubes event handler. Instead we do some sanity checks
and synchronization with start() and then emits Qubes events.
'''
state = self.get_power_state()
if state not in ['Halted', 'Crashed', 'Dying']:
self.log.warning('Stopped event from libvirt received,'
' but domain is in state {}!'.format(state))
# ignore this unexpected event
return
if self._domain_stopped_event_received:
self.log.warning('Duplicated stopped event from libvirt received!')
# ignore this unexpected event
return
self._domain_stopped_event_received = True
self._domain_stopped_future = \
asyncio.ensure_future(self._domain_stopped_coro())
@asyncio.coroutine
def _domain_stopped_coro(self):
with (yield from self._domain_stopped_lock):
assert not self._domain_stopped_event_handled
# Set this immediately such that we don't generate events twice if
# an exception gets thrown.
self._domain_stopped_event_handled = True
yield from self.fire_event_async('domain-stopped')
yield from self.fire_event_async('domain-shutdown')
@qubes.events.handler('domain-stopped')
@asyncio.coroutine
def on_domain_stopped(self, _event, **_kwargs):
'''Cleanup after domain was stopped'''
try:
yield from self.storage.stop()
except qubes.storage.StoragePoolException:
self.log.exception('Failed to stop storage for domain %s',
self.name)
@asyncio.coroutine
def shutdown(self, force=False, wait=False):