qubes/vm: Improve stopped event handling
The previous version did not ensure that the stopped/shutdown event was handled before a new VM start. This can easily lead to problems like in QubesOS/qubes-issues#3164. This improved version now ensures that the stopped/shutdown events are handled before a new VM start. Additionally this version should be more robust against unreliable events from libvirt. It handles missing, duplicated and delayed stopped events. Instead of one 'domain-shutdown' event there are now 'domain-stopped' and 'domain-shutdown'. The later is generated after the former. This way it's easy to run code after the VM is shutdown including the stop of it's storage.
This commit is contained in:
parent
682d9503ee
commit
b907303a39
@ -1183,7 +1183,7 @@ class Qubes(qubes.PropertyHolder):
|
||||
return
|
||||
|
||||
if event == libvirt.VIR_DOMAIN_EVENT_STOPPED:
|
||||
vm.fire_event('domain-shutdown')
|
||||
vm.on_libvirt_domain_stopped()
|
||||
|
||||
@qubes.events.handler('domain-pre-delete')
|
||||
def on_domain_pre_deleted(self, event, vm):
|
||||
|
@ -137,7 +137,6 @@ class TC_00_Basic(qubes.tests.SystemTestCase):
|
||||
self.test_failure_reason = 'domain-shutdown event received twice'
|
||||
self.domain_shutdown_handled = True
|
||||
|
||||
@unittest.expectedFailure
|
||||
def test_201_shutdown_event_race(self):
|
||||
'''Regression test for 3164 - pure events edition'''
|
||||
vmname = self.make_vm_name('appvm')
|
||||
|
@ -129,18 +129,18 @@ class DispVM(qubes.vm.qubesvm.QubesVM):
|
||||
raise qubes.exc.QubesValueError(self,
|
||||
'Cannot change template of Disposable VM')
|
||||
|
||||
@qubes.events.handler('domain-shutdown')
|
||||
@asyncio.coroutine
|
||||
def on_domain_shutdown_coro(self):
|
||||
'''Coroutine for executing cleanup after domain shutdown.
|
||||
def on_domain_shutdown(self, _event, **_kwargs):
|
||||
yield from self._auto_cleanup()
|
||||
|
||||
This override default action defined in QubesVM.on_domain_shutdown_coro
|
||||
'''
|
||||
with (yield from self.startup_lock):
|
||||
yield from self.storage.stop()
|
||||
if self.auto_cleanup and self in self.app.domains:
|
||||
yield from self.remove_from_disk()
|
||||
del self.app.domains[self]
|
||||
self.app.save()
|
||||
@asyncio.coroutine
|
||||
def _auto_cleanup(self):
|
||||
'''Do auto cleanup if enabled'''
|
||||
if self.auto_cleanup and self in self.app.domains:
|
||||
yield from self.remove_from_disk()
|
||||
del self.app.domains[self]
|
||||
self.app.save()
|
||||
|
||||
@classmethod
|
||||
@asyncio.coroutine
|
||||
@ -206,10 +206,6 @@ class DispVM(qubes.vm.qubesvm.QubesVM):
|
||||
|
||||
yield from super(DispVM, self).start(**kwargs)
|
||||
except:
|
||||
# cleanup also on failed startup; there is potential race with
|
||||
# self.on_domain_shutdown_coro, so check if wasn't already removed
|
||||
if self.auto_cleanup and self in self.app.domains:
|
||||
yield from self.remove_from_disk()
|
||||
del self.app.domains[self]
|
||||
self.app.save()
|
||||
# Cleanup also on failed startup
|
||||
yield from self._auto_cleanup()
|
||||
raise
|
||||
|
@ -151,9 +151,33 @@ class QubesVM(qubes.vm.mix.net.NetVMMixin, qubes.vm.BaseVM):
|
||||
|
||||
*other arguments are as in :py:meth:`start`*
|
||||
|
||||
.. event:: domain-stopped (subject, event)
|
||||
|
||||
Fired when domain has been stopped.
|
||||
|
||||
This event is emitted before ``'domain-shutdown'`` and will trigger
|
||||
the cleanup in QubesVM. So if you require that the cleanup has
|
||||
already run use ``'domain-shutdown'``.
|
||||
|
||||
Note that you can receive this event as soon as you received
|
||||
``'domain-pre-start'``. This also can be emitted in case of a
|
||||
startup failure, before or after ``'domain-start-failed'``.
|
||||
|
||||
Handler for this event can be asynchronous (a coroutine).
|
||||
|
||||
:param subject: Event emitter (the qube object)
|
||||
:param event: Event name (``'domain-stopped'``)
|
||||
|
||||
.. event:: domain-shutdown (subject, event)
|
||||
|
||||
Fired when domain has been shut down.
|
||||
Fired when domain has been shut down. It is generated after
|
||||
``'domain-stopped'``.
|
||||
|
||||
Note that you can receive this event as soon as you received
|
||||
``'domain-pre-start'``. This also can be emitted in case of a
|
||||
startup failure, before or after ``'domain-start-failed'``.
|
||||
|
||||
Handler for this event can be asynchronous (a coroutine).
|
||||
|
||||
:param subject: Event emitter (the qube object)
|
||||
:param event: Event name (``'domain-shutdown'``)
|
||||
@ -643,6 +667,17 @@ class QubesVM(qubes.vm.mix.net.NetVMMixin, qubes.vm.BaseVM):
|
||||
self._libvirt_domain = None
|
||||
self._qdb_connection = None
|
||||
|
||||
# We assume a fully halted VM here. The 'domain-init' handler will
|
||||
# check if the VM is already running.
|
||||
self._domain_stopped_event_received = True
|
||||
self._domain_stopped_event_handled = True
|
||||
|
||||
self._domain_stopped_future = None
|
||||
|
||||
# Internal lock to ensure ordering between _domain_stopped_coro() and
|
||||
# start(). This should not be accessed anywhere else.
|
||||
self._domain_stopped_lock = asyncio.Lock()
|
||||
|
||||
if xml is None:
|
||||
# we are creating new VM and attributes came through kwargs
|
||||
assert hasattr(self, 'qid')
|
||||
@ -712,6 +747,8 @@ class QubesVM(qubes.vm.mix.net.NetVMMixin, qubes.vm.BaseVM):
|
||||
|
||||
if not self.app.vmm.offline_mode and self.is_running():
|
||||
self.start_qdb_watch()
|
||||
self._domain_stopped_event_received = False
|
||||
self._domain_stopped_event_handled = False
|
||||
|
||||
@qubes.events.handler('property-set:label')
|
||||
def on_property_set_label(self, event, name, newvalue, oldvalue=None):
|
||||
@ -797,6 +834,30 @@ class QubesVM(qubes.vm.mix.net.NetVMMixin, qubes.vm.BaseVM):
|
||||
if self.get_power_state() != 'Halted':
|
||||
return
|
||||
|
||||
with (yield from self._domain_stopped_lock):
|
||||
# Don't accept any new stopped event's till a new VM has been
|
||||
# created. If we didn't received any stopped event or it wasn't
|
||||
# handled yet we will handle this in the next lines.
|
||||
self._domain_stopped_event_received = True
|
||||
|
||||
if self._domain_stopped_future is not None:
|
||||
# Libvirt stopped event was already received, so cancel the
|
||||
# future. If it didn't generate the Qubes events yet we
|
||||
# will do it below.
|
||||
self._domain_stopped_future.cancel()
|
||||
self._domain_stopped_future = None
|
||||
|
||||
if not self._domain_stopped_event_handled:
|
||||
# No Qubes domain-stopped events have been generated yet.
|
||||
# So do this now.
|
||||
|
||||
# Set this immediately such that we don't generate events
|
||||
# twice if an exception gets thrown.
|
||||
self._domain_stopped_event_handled = True
|
||||
|
||||
yield from self.fire_event_async('domain-stopped')
|
||||
yield from self.fire_event_async('domain-shutdown')
|
||||
|
||||
self.log.info('Starting {}'.format(self.name))
|
||||
|
||||
yield from self.fire_event_async('domain-pre-start',
|
||||
@ -816,14 +877,17 @@ class QubesVM(qubes.vm.mix.net.NetVMMixin, qubes.vm.BaseVM):
|
||||
qmemman_client = yield from asyncio.get_event_loop().\
|
||||
run_in_executor(None, self.request_memory, mem_required)
|
||||
|
||||
yield from self.storage.start()
|
||||
|
||||
except Exception as exc:
|
||||
# let anyone receiving domain-pre-start know that startup failed
|
||||
yield from self.fire_event_async('domain-start-failed',
|
||||
reason=str(exc))
|
||||
if qmemman_client:
|
||||
qmemman_client.close()
|
||||
raise
|
||||
|
||||
try:
|
||||
yield from self.storage.start()
|
||||
self._update_libvirt_domain()
|
||||
|
||||
self.libvirt_domain.createWithFlags(
|
||||
@ -833,12 +897,16 @@ class QubesVM(qubes.vm.mix.net.NetVMMixin, qubes.vm.BaseVM):
|
||||
# let anyone receiving domain-pre-start know that startup failed
|
||||
yield from self.fire_event_async('domain-start-failed',
|
||||
reason=str(exc))
|
||||
self.storage.stop()
|
||||
raise
|
||||
|
||||
finally:
|
||||
if qmemman_client:
|
||||
qmemman_client.close()
|
||||
|
||||
self._domain_stopped_event_received = False
|
||||
self._domain_stopped_event_handled = False
|
||||
|
||||
try:
|
||||
yield from self.fire_event_async('domain-spawn',
|
||||
start_guid=start_guid)
|
||||
@ -870,24 +938,50 @@ class QubesVM(qubes.vm.mix.net.NetVMMixin, qubes.vm.BaseVM):
|
||||
|
||||
return self
|
||||
|
||||
@asyncio.coroutine
|
||||
def on_domain_shutdown_coro(self):
|
||||
'''Coroutine for executing cleanup after domain shutdown.
|
||||
Do not allow domain to be started again until this finishes.
|
||||
'''
|
||||
with (yield from self.startup_lock):
|
||||
try:
|
||||
yield from self.storage.stop()
|
||||
except qubes.storage.StoragePoolException:
|
||||
self.log.exception('Failed to stop storage for domain %s',
|
||||
self.name)
|
||||
def on_libvirt_domain_stopped(self):
|
||||
''' Handle VIR_DOMAIN_EVENT_STOPPED events from libvirt.
|
||||
|
||||
@qubes.events.handler('domain-shutdown')
|
||||
def on_domain_shutdown(self, _event, **_kwargs):
|
||||
'''Cleanup after domain shutdown'''
|
||||
# TODO: ensure that domain haven't been started _before_ this
|
||||
# coroutine got a chance to acquire a lock
|
||||
asyncio.ensure_future(self.on_domain_shutdown_coro())
|
||||
This is not a Qubes event handler. Instead we do some sanity checks
|
||||
and synchronization with start() and then emits Qubes events.
|
||||
'''
|
||||
|
||||
state = self.get_power_state()
|
||||
if state not in ['Halted', 'Crashed', 'Dying']:
|
||||
self.log.warning('Stopped event from libvirt received,'
|
||||
' but domain is in state {}!'.format(state))
|
||||
# ignore this unexpected event
|
||||
return
|
||||
|
||||
if self._domain_stopped_event_received:
|
||||
self.log.warning('Duplicated stopped event from libvirt received!')
|
||||
# ignore this unexpected event
|
||||
return
|
||||
|
||||
self._domain_stopped_event_received = True
|
||||
self._domain_stopped_future = \
|
||||
asyncio.ensure_future(self._domain_stopped_coro())
|
||||
|
||||
@asyncio.coroutine
|
||||
def _domain_stopped_coro(self):
|
||||
with (yield from self._domain_stopped_lock):
|
||||
assert not self._domain_stopped_event_handled
|
||||
|
||||
# Set this immediately such that we don't generate events twice if
|
||||
# an exception gets thrown.
|
||||
self._domain_stopped_event_handled = True
|
||||
|
||||
yield from self.fire_event_async('domain-stopped')
|
||||
yield from self.fire_event_async('domain-shutdown')
|
||||
|
||||
@qubes.events.handler('domain-stopped')
|
||||
@asyncio.coroutine
|
||||
def on_domain_stopped(self, _event, **_kwargs):
|
||||
'''Cleanup after domain was stopped'''
|
||||
try:
|
||||
yield from self.storage.stop()
|
||||
except qubes.storage.StoragePoolException:
|
||||
self.log.exception('Failed to stop storage for domain %s',
|
||||
self.name)
|
||||
|
||||
@asyncio.coroutine
|
||||
def shutdown(self, force=False, wait=False):
|
||||
|
Loading…
Reference in New Issue
Block a user