From 021047f950b0f8c2f147d75bd4dcc056f6757441 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= Date: Fri, 13 Oct 2017 01:11:55 +0200 Subject: [PATCH 1/4] storage/lvm: remove old volume only after successfully cloning new one In some cases, it may happen that new volume (`self._vid_snap`) does not exists. This is normally an error, but even in such a case, do not remove the only remaining instance of volume (`self.vid`). Instead, rename it temporarily and remove only after new volume is successfully cloned. Fixes QubesOS/qubes-issues#3164 --- qubes/storage/lvm.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/qubes/storage/lvm.py b/qubes/storage/lvm.py index 4f121f6c..e67b0ccf 100644 --- a/qubes/storage/lvm.py +++ b/qubes/storage/lvm.py @@ -271,10 +271,22 @@ class ThinVolume(qubes.storage.Volume): qubes_lvm(cmd, self.log) self._remove_revisions() - cmd = ['remove', self.vid] - qubes_lvm(cmd, self.log) - cmd = ['clone', self._vid_snap, self.vid] + # TODO: when converting this function to coroutine, this _must_ be + # under a lock + # remove old volume only after _successful_ clone of the new one + cmd = ['rename', self.vid, self.vid + '-tmp'] qubes_lvm(cmd, self.log) + try: + cmd = ['clone', self._vid_snap, self.vid] + qubes_lvm(cmd, self.log) + except: + # restore original volume + cmd = ['rename', self.vid + '-tmp', self.vid] + qubes_lvm(cmd, self.log) + raise + else: + cmd = ['remove', self.vid + '-tmp'] + qubes_lvm(cmd, self.log) def create(self): @@ -494,6 +506,8 @@ def qubes_lvm(cmd, log=logging.getLogger('qubes.storage.lvm')): lvm_cmd = ["lvextend", "-L%s" % size, cmd[1]] elif action == 'activate': lvm_cmd = ['lvchange', '-ay', cmd[1]] + elif action == 'rename': + lvm_cmd = ['lvrename', cmd[1], cmd[2]] else: raise NotImplementedError('unsupported action: ' + action) if lvm_is_very_old: From 145ccfb34e46627623818989ea16fc05ccbe8f6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= Date: Mon, 16 Oct 2017 00:43:10 +0200 Subject: [PATCH 2/4] storage/lvm: fix Volume.verify() This function is supposed to raise an exception if something is wrong, not just return False. Document this. --- qubes/storage/__init__.py | 3 +++ qubes/storage/lvm.py | 16 +++++++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/qubes/storage/__init__.py b/qubes/storage/__init__.py index 5bc5f1a9..12853fb2 100644 --- a/qubes/storage/__init__.py +++ b/qubes/storage/__init__.py @@ -276,6 +276,9 @@ class Volume(object): def verify(self): ''' Verifies the volume. + This function is supposed to either return :py:obj:`True`, or raise + an exception. + This can be implemented as a coroutine.''' raise self._not_implemented("verify") diff --git a/qubes/storage/lvm.py b/qubes/storage/lvm.py index e67b0ccf..b8acbb5d 100644 --- a/qubes/storage/lvm.py +++ b/qubes/storage/lvm.py @@ -455,11 +455,21 @@ class ThinVolume(qubes.storage.Volume): def verify(self): ''' Verifies the volume. ''' + if not self.save_on_stop and not self.snap_on_start: + # volatile volumes don't need any files + return True + if self.source is not None: + vid = str(self.source) + else: + vid = self.vid try: - vol_info = size_cache[self.vid] - return vol_info['attr'][4] == 'a' + vol_info = size_cache[vid] + if vol_info['attr'][4] != 'a': + raise qubes.storage.StoragePoolException( + 'volume {} not active'.format(vid)) except KeyError: - return False + raise qubes.storage.StoragePoolException( + 'volume {} missing'.format(vid)) def block_device(self): From 08583af1ff7c0fa0b12009921aed059e78996c99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= Date: Mon, 16 Oct 2017 00:44:52 +0200 Subject: [PATCH 3/4] storage/lvm: make sure volume cache is refreshed after changes Even if start/stop fails, still refresh the cache. The failed operation might still modified something. --- qubes/storage/lvm.py | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/qubes/storage/lvm.py b/qubes/storage/lvm.py index b8acbb5d..85e216da 100644 --- a/qubes/storage/lvm.py +++ b/qubes/storage/lvm.py @@ -432,25 +432,28 @@ class ThinVolume(qubes.storage.Volume): def start(self): - if self.snap_on_start or self.save_on_stop: - if not self.save_on_stop or not self.is_dirty(): - self._snapshot() - else: - self._reset() - - reset_cache() + try: + if self.snap_on_start or self.save_on_stop: + if not self.save_on_stop or not self.is_dirty(): + self._snapshot() + else: + self._reset() + finally: + reset_cache() return self def stop(self): - if self.save_on_stop: - self._commit() - if self.snap_on_start or self.save_on_stop: - cmd = ['remove', self._vid_snap] - qubes_lvm(cmd, self.log) - else: - cmd = ['remove', self.vid] - qubes_lvm(cmd, self.log) - reset_cache() + try: + if self.save_on_stop: + self._commit() + if self.snap_on_start or self.save_on_stop: + cmd = ['remove', self._vid_snap] + qubes_lvm(cmd, self.log) + else: + cmd = ['remove', self.vid] + qubes_lvm(cmd, self.log) + finally: + reset_cache() return self def verify(self): From c8519a700fcc171ee2d902781ccc855f70cd2dcd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= Date: Mon, 16 Oct 2017 00:48:25 +0200 Subject: [PATCH 4/4] tests: add regression test for #3164 This is a race condition, so to make it more likely to fail (if it's broken), make some things manually. In normal circumstances this order of actions is also possible, just less likely to happen. But as seen in the bug report, happens from time to time. QubesOS/qubes-issues#3164 --- qubes/tests/integ/basic.py | 46 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/qubes/tests/integ/basic.py b/qubes/tests/integ/basic.py index 67b5f44c..0be1a572 100644 --- a/qubes/tests/integ/basic.py +++ b/qubes/tests/integ/basic.py @@ -31,9 +31,12 @@ import tempfile import time import unittest +import collections + import qubes import qubes.firewall import qubes.tests +import qubes.storage import qubes.vm.appvm import qubes.vm.qubesvm import qubes.vm.standalonevm @@ -80,6 +83,49 @@ class TC_00_Basic(qubes.tests.SystemTestCase): self.loop.run_until_complete(asyncio.sleep(0.1)) self.assertTrue(flag) + def _test_200_on_domain_start(self, vm, event, **_kwargs): + '''Simulate domain crash just after startup''' + vm.libvirt_domain.destroy() + + def test_200_shutdown_event_race(self): + '''Regression test for 3164''' + vmname = self.make_vm_name('appvm') + + self.vm = self.app.add_new_vm(qubes.vm.appvm.AppVM, + name=vmname, template=self.app.default_template, + label='red') + # help the luck a little - don't wait for qrexec to easier win the race + self.vm.features['qrexec'] = False + self.loop.run_until_complete(self.vm.create_on_disk()) + # another way to help the luck a little - make sure the private + # volume is first in (normally unordered) dict - this way if any + # volume action fails, it will be at or after private volume - not + # before (preventing private volume action) + old_volumes = self.vm.volumes + self.vm.volumes = collections.OrderedDict() + self.vm.volumes['private'] = old_volumes.pop('private') + self.vm.volumes.update(old_volumes.items()) + del old_volumes + + self.loop.run_until_complete(self.vm.start()) + + # kill it the way it does not give a chance for domain-shutdown it + # execute + self.vm.libvirt_domain.destroy() + + # now, lets try to start the VM again, before domain-shutdown event + # got handled (#3164), and immediately trigger second domain-shutdown + self.vm.add_handler('domain-start', self._test_200_on_domain_start) + self.loop.run_until_complete(self.vm.start()) + + # and give a chance for both domain-shutdown handlers to execute + self.loop.run_until_complete(asyncio.sleep(1)) + with self.assertNotRaises(qubes.exc.QubesException): + # if the above caused two domain-shutdown handlers being called + # one after another, private volume is gone + self.loop.run_until_complete(self.vm.storage.verify()) + + class TC_01_Properties(qubes.tests.SystemTestCase): # pylint: disable=attribute-defined-outside-init def setUp(self):