浏览代码

Merge remote-tracking branch 'qubesos/pr/188'

* qubesos/pr/188:
  file-reflink, a storage driver optimized for CoW filesystems
  Make AppVM/DispVM root volume rw to avoid CoW-on-CoW
Marek Marczykowski-Górecki 6 年之前
父节点
当前提交
209af07fd0
共有 9 个文件被更改,包括 446 次插入13 次删除
  1. 12 7
      doc/qubes-storage.rst
  2. 1 1
      qubes/app.py
  3. 423 0
      qubes/storage/reflink.py
  4. 4 2
      qubes/tests/storage.py
  5. 1 1
      qubes/tests/storage_file.py
  6. 1 1
      qubes/vm/appvm.py
  7. 1 1
      qubes/vm/dispvm.py
  8. 1 0
      rpm_spec/core-dom0.spec
  9. 2 0
      setup.py

+ 12 - 7
doc/qubes-storage.rst

@@ -9,14 +9,18 @@ possible to register additional 3rd-party drivers.
 Domain's storage volumes:
 
  - `root` - this is where operating system is installed. The volume is
-   available read-write to :py:class:`~qubes.vm.templatevm.TemplateVM` and
-   :py:class:`~qubes.vm.standalonevm.StandaloneVM`, and read-only to others
-   (:py:class:`~qubes.vm.appvm.AppVM` and :py:class:`~qubes.vm.dispvm.DispVM`).
+   available read-write to all domain classes. It could be made read-only for
+   :py:class:`~qubes.vm.appvm.AppVM` and :py:class:`~qubes.vm.dispvm.DispVM` to
+   implement an untrusted storage domain in the future, but doing so will cause
+   such VMs to set up a device-mapper based copy-on-write layer that redirects
+   writes to the `volatile` volume, whose storage driver may already do CoW,
+   leading to an inefficient CoW-on-CoW setup. For this reason, `root` is
+   currently read-write in all cases.
  - `private` - this is where domain's data live. The volume is available
    read-write to all domain classes (including :py:class:`~qubes.vm.dispvm.DispVM`,
    but data written there is discarded on domain shutdown).
  - `volatile` - this is used for any data that do not need to persist. This includes
-   swap, copy-on-write layer for `root` volume etc.
+   swap, copy-on-write layer for a future read-only `root` volume etc.
  - `kernel` - domain boot files - operating system kernel, initial ramdisk,
    kernel modules etc. This volume is provided read-only and should be provided by
    a storage pool respecting :py:attr:`qubes.vm.qubesvm.QubesVM.kernel` property.
@@ -26,11 +30,12 @@ Storage pool concept
 
 Storage pool is responsible for managing its volumes. Qubes have defined
 storage pool driver API, allowing to put domains storage in various places. By
-default two drivers are provided: :py:class:`qubes.storage.file.FilePool`
-(named `file`) and :py:class:`qubes.storage.lvm.ThinPool` (named `lvm_thin`).
+default three drivers are provided: :py:class:`qubes.storage.file.FilePool`
+(named `file`), :py:class:`qubes.storage.reflink.ReflinkPool` (named
+`file-reflink`), and :py:class:`qubes.storage.lvm.ThinPool` (named `lvm_thin`).
 But the API allows implementing a variety of other drivers (like additionally
 encrypted storage, external disk, drivers using special features of some
-filesystems like btrfs, etc).
+filesystems, etc).
 
 Most of the storage API focuses on storage volumes. Each volume has at least those
 properties:

+ 1 - 1
qubes/app.py

@@ -628,7 +628,7 @@ def _default_pool(app):
 
         # not a thin volume? look for file pools
         for pool in app.pools.values():
-            if pool.config.get('driver', None) != 'file':
+            if pool.config.get('driver', None) not in ('file', 'file-reflink'):
                 continue
             if pool.config['dir_path'] == qubes.config.qubes_base_dir:
                 return pool

+ 423 - 0
qubes/storage/reflink.py

@@ -0,0 +1,423 @@
+#
+# The Qubes OS Project, https://www.qubes-os.org/
+#
+# Copyright (C) 2018 Rusty Bird <rustybird@net-c.com>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, see <https://www.gnu.org/licenses/>.
+#
+
+''' Driver for handling VM images as files, without any device-mapper
+    involvement. A reflink-capable filesystem is strongly recommended,
+    but not required.
+'''
+
+import collections
+import errno
+import fcntl
+import glob
+import logging
+import os
+import re
+import subprocess
+import tempfile
+from contextlib import contextmanager, suppress
+
+import qubes.storage
+
+# os.stat() reports st_blocks as a count of 512-byte blocks
+BLKSIZE = 512
+FICLONE = 1074041865  # 0x40049409, see ioctl_ficlone manpage
+# Named after this module (qubes.storage.reflink), so that the logger is
+# a descendant of the 'qubes' logger rather than of an unrelated 'qube'.
+LOGGER = logging.getLogger('qubes.storage.reflink')
+
+
+class ReflinkPool(qubes.storage.Pool):
+    ''' Storage pool keeping its volume images as files below dir_path. '''
+
+    driver = 'file-reflink'
+    _known_dir_path_prefixes = ['appvms', 'vm-templates']
+
+    def __init__(self, dir_path, setup_check='yes', revisions_to_keep=1,
+                 **kwargs):
+        ''' Store the absolute dir_path and pass setup_check through
+            qubes.property.bool; the filesystem is not touched here.
+        '''
+        super().__init__(revisions_to_keep=revisions_to_keep, **kwargs)
+        self.dir_path = os.path.abspath(dir_path)
+        self.setup_check = qubes.property.bool(None, None, setup_check)
+        self._volumes = {}  # vid -> ReflinkVolume, filled by init_volume()
+
+    def setup(self):
+        ''' Create the pool directory and the known subdirectories.
+            Unless setup_check is off, refuse a directory for which
+            is_reflink_supported() returns False.
+        '''
+        newly_created = _make_dir(self.dir_path)
+        reflink_ok = (not self.setup_check
+                      or is_reflink_supported(self.dir_path))
+        if not reflink_ok:
+            if newly_created:
+                # don't leave behind a directory that we made ourselves
+                _remove_empty_dir(self.dir_path)
+            raise qubes.storage.StoragePoolException(
+                'The filesystem for {!r} does not support reflinks. If you'
+                ' can live with VM startup delays and wasted disk space, pass'
+                ' the "setup_check=no" option.'.format(self.dir_path))
+        for subdir in self._known_dir_path_prefixes:
+            _make_dir(os.path.join(self.dir_path, subdir))
+        return self
+
+    def init_volume(self, vm, volume_config):
+        ''' Build a ReflinkVolume from volume_config, register it under
+            its vid and return it. 'pool' is always set to this pool;
+            'revisions_to_keep' and 'vid' are filled in only if absent.
+        '''
+        # Fail closed on any strange VM dir_path_prefix, just in case
+        # /etc/udev/rules/00-qubes-ignore-devices.rules needs updating
+        prefix = vm.dir_path_prefix
+        assert prefix in self._known_dir_path_prefixes, \
+               'Unknown dir_path_prefix {!r}'.format(prefix)
+
+        volume_config['pool'] = self
+        volume_config.setdefault('revisions_to_keep', self.revisions_to_keep)
+        if 'vid' not in volume_config:
+            volume_config['vid'] = os.path.join(prefix, vm.name,
+                                                volume_config['name'])
+        vid = volume_config['vid']
+        new_volume = ReflinkVolume(**volume_config)
+        self._volumes[vid] = new_volume
+        return new_volume
+
+    def list_volumes(self):
+        ''' Return the registered volumes as a new list. '''
+        return [*self._volumes.values()]
+
+    def get_volume(self, vid):
+        ''' Return the volume registered under vid (KeyError if none). '''
+        return self._volumes[vid]
+
+    def destroy(self):
+        ''' Nothing to do. '''
+
+    @property
+    def config(self):
+        ''' Pool settings as a dict: name, dir_path, driver and
+            revisions_to_keep.
+        '''
+        settings = {}
+        settings['name'] = self.name
+        settings['dir_path'] = self.dir_path
+        settings['driver'] = ReflinkPool.driver
+        settings['revisions_to_keep'] = self.revisions_to_keep
+        return settings
+
+    @property
+    def size(self):
+        ''' f_frsize * f_blocks of the filesystem holding dir_path. '''
+        fs_stat = os.statvfs(self.dir_path)
+        return fs_stat.f_frsize * fs_stat.f_blocks
+
+    @property
+    def usage(self):
+        ''' f_frsize * (f_blocks - f_bfree) of the filesystem holding
+            dir_path.
+        '''
+        fs_stat = os.statvfs(self.dir_path)
+        used_blocks = fs_stat.f_blocks - fs_stat.f_bfree
+        return fs_stat.f_frsize * used_blocks
+
+
+class ReflinkVolume(qubes.storage.Volume):
+    ''' Volume stored as image files below the pool's dir_path.
+
+        For a volume with vid V, the files used are:
+          V.img             clean image (_path_clean)
+          V-dirty.img       dirty image (_path_dirty, also path)
+          V.img@<revision>  saved revisions (_path_revision)
+    '''
+
+    def create(self):
+        ''' Create an empty sparse clean image, but only for a volume
+            that is save_on_stop and not snap_on_start.
+        '''
+        if self.save_on_stop and not self.snap_on_start:
+            _create_sparse_file(self._path_clean, self.size)
+        return self
+
+    def verify(self):
+        ''' Return True if the image this volume starts from exists, or
+            if it does not start from any image. Otherwise raise
+            StoragePoolException.
+        '''
+        if self.snap_on_start:
+            # pylint: disable=protected-access
+            img = self.source._path_clean
+        elif self.save_on_stop:
+            img = self._path_clean
+        else:
+            img = None
+
+        if img is None or os.path.exists(img):
+            return True
+        raise qubes.storage.StoragePoolException(
+            'Missing image file {!r} for volume {!s}'.format(img, self.vid))
+
+    def remove(self):
+        ''' Drop volume object from pool; remove volume images from
+            oldest to newest; remove empty VM directory.
+        '''
+        # The pool indexes its volumes by vid (see init_volume), so that
+        # is the key to drop; a missing entry is not an error.
+        # pylint: disable=protected-access
+        self.pool._volumes.pop(self.vid, None)
+
+        self._prune_revisions(keep=0)
+        _remove_file(self._path_clean)
+        _remove_file(self._path_dirty)
+
+        try:
+            _remove_empty_dir(os.path.dirname(self._path_dirty))
+        except OSError as ex:
+            # A directory still holding other files is left in place.
+            # Compare errno values by equality, not identity.
+            if ex.errno != errno.ENOTEMPTY:
+                raise
+
+        return self
+
+    def is_outdated(self):
+        ''' Return whether the source's clean image has a newer mtime
+            than our own clean image. Always False if the volume is not
+            snap_on_start, or if either file is missing.
+        '''
+        if self.snap_on_start:
+            with suppress(FileNotFoundError):
+                # pylint: disable=protected-access
+                return (os.path.getmtime(self.source._path_clean) >
+                        os.path.getmtime(self._path_clean))
+        return False
+
+    def is_dirty(self):
+        ''' Truthy if the volume is save_on_stop and a dirty image
+            exists.
+        '''
+        return self.save_on_stop and os.path.exists(self._path_dirty)
+
+    def start(self):
+        ''' Prepare the dirty image:
+
+            - snap_on_start: first copy the source's clean image to our
+              clean image
+            - an existing dirty image of a save_on_stop volume is kept
+            - otherwise the dirty image is a copy of the clean image or,
+              for a volume that is neither save_on_stop nor
+              snap_on_start, a new empty sparse file
+        '''
+        if self.snap_on_start:
+            # pylint: disable=protected-access
+            _copy_file(self.source._path_clean, self._path_clean)
+        if self.is_dirty():  # implies self.save_on_stop
+            return self
+        if self.save_on_stop or self.snap_on_start:
+            _copy_file(self._path_clean, self._path_dirty)
+        else:
+            _create_sparse_file(self._path_dirty, self.size)
+        return self
+
+    def stop(self):
+        ''' Commit the dirty image if save_on_stop. Otherwise remove
+            it, along with the clean image if snap_on_start.
+        '''
+        if self.save_on_stop:
+            self._commit()
+        else:
+            _remove_file(self._path_dirty)
+            if self.snap_on_start:
+                _remove_file(self._path_clean)
+        return self
+
+    def _commit(self):
+        ''' Save the clean image as a revision, prune old revisions,
+            then rename the dirty image to the clean image.
+        '''
+        self._add_revision()
+        self._prune_revisions()
+        _rename_file(self._path_dirty, self._path_clean)
+
+    def _add_revision(self):
+        ''' Copy the clean image to a revision named after its ctime,
+            unless revisions are disabled or the image uses no disk
+            space.
+        '''
+        # Integers are compared by value; "is 0" would only work by
+        # accident of the interpreter caching small integers.
+        if self.revisions_to_keep == 0:
+            return
+        if _get_file_disk_usage(self._path_clean) == 0:
+            return
+        ctime = os.path.getctime(self._path_clean)
+        revision = qubes.storage.isodate(int(ctime)) + 'Z'
+        _copy_file(self._path_clean, self._path_revision(revision))
+
+    def _prune_revisions(self, keep=None):
+        ''' Remove all but the last `keep` revisions (default:
+            revisions_to_keep), in the order given by self.revisions.
+        '''
+        if keep is None:
+            keep = self.revisions_to_keep
+        # For keep == 0, the slice end "-0" must become None in order
+        # to select every revision instead of none.
+        # pylint: disable=invalid-unary-operand-type
+        for revision in list(self.revisions.keys())[:(-keep) or None]:
+            _remove_file(self._path_revision(revision))
+
+    def revert(self, revision=None):
+        ''' Make the given revision (default: the last one listed by
+            self.revisions) the clean image, after saving the current
+            clean image as a revision of its own. Raise
+            StoragePoolException if an explicitly given revision does
+            not exist.
+        '''
+        if revision is None:
+            revision = list(self.revisions.keys())[-1]
+        elif not os.path.exists(self._path_revision(revision)):
+            raise qubes.storage.StoragePoolException(
+                'Missing revision {!r} for volume {!s}'.format(
+                    revision, self.vid))
+        self._add_revision()
+        _rename_file(self._path_revision(revision), self._path_clean)
+        return self
+
+    def resize(self, size):
+        ''' Expand a read-write volume image; notify any corresponding
+            loop devices of the size change.
+        '''
+        if not self.rw:
+            raise qubes.storage.StoragePoolException(
+                'Cannot resize: {!s} is read-only'.format(self.vid))
+
+        if size < self.size:
+            raise qubes.storage.StoragePoolException(
+                'For your own safety, shrinking of {!s} is disabled.'
+                ' If you really know what you are doing,'
+                ' use "truncate" manually.'.format(self.vid))
+
+        try:  # assume volume is not (cleanly) stopped ...
+            _resize_file(self._path_dirty, size)
+        except FileNotFoundError:  # ... but it actually is.
+            _resize_file(self._path_clean, size)
+
+        self.size = size
+
+        # resize any corresponding loop devices
+        out = _cmd('losetup', '--associated', self._path_dirty)
+        for match in re.finditer(br'^(/dev/loop[0-9]+): ', out, re.MULTILINE):
+            loop_dev = match.group(1).decode('ascii')
+            _cmd('losetup', '--set-capacity', loop_dev)
+
+        return self
+
+    def _require_save_on_stop(self, method_name):
+        ''' Raise NotImplementedError, mentioning method_name, unless
+            the volume is save_on_stop.
+        '''
+        if not self.save_on_stop:
+            raise NotImplementedError(
+                'Cannot {!s}: {!s} is not save_on_stop'.format(
+                    method_name, self.vid))
+
+    def export(self):
+        ''' Return the path of the clean image (save_on_stop only). '''
+        self._require_save_on_stop('export')
+        return self._path_clean
+
+    def import_data(self):
+        ''' Create an empty sparse dirty image and return its path
+            (save_on_stop only).
+        '''
+        self._require_save_on_stop('import_data')
+        _create_sparse_file(self._path_dirty, self.size)
+        return self._path_dirty
+
+    def import_data_end(self, success):
+        ''' Commit the dirty image if success, else remove it. '''
+        if success:
+            self._commit()
+        else:
+            _remove_file(self._path_dirty)
+        return self
+
+    def import_volume(self, src_volume):
+        ''' Copy the exported image of src_volume to our dirty image
+            and commit it (save_on_stop only). If copying fails, the
+            dirty image is removed and the exception propagates.
+        '''
+        self._require_save_on_stop('import_volume')
+        try:
+            _copy_file(src_volume.export(), self._path_dirty)
+        except BaseException:  # clean up on any failure, then re-raise
+            self.import_data_end(False)
+            raise
+        self.import_data_end(True)
+        return self
+
+    def _path_revision(self, revision):
+        ''' Return the path of the image file for a revision name. '''
+        return self._path_clean + '@' + revision
+
+    @property
+    def _path_clean(self):
+        ''' <pool dir_path>/<vid>.img '''
+        return os.path.join(self.pool.dir_path, self.vid + '.img')
+
+    @property
+    def _path_dirty(self):
+        ''' <pool dir_path>/<vid>-dirty.img '''
+        return os.path.join(self.pool.dir_path, self.vid + '-dirty.img')
+
+    @property
+    def path(self):
+        ''' Path of the dirty image. '''
+        return self._path_dirty
+
+    @property
+    def revisions(self):
+        ''' Return an OrderedDict mapping each revision name found on
+            disk (files matching <clean image>@*Z, sorted by name) to
+            its timestamp, i.e. the name minus the trailing 'Z'.
+        '''
+        revision_to_timestamp = collections.OrderedDict()
+        prefix = self._path_revision('')
+        for filename in sorted(glob.glob(glob.escape(prefix) + '*Z')):
+            revision = filename[len(prefix):]
+            timestamp = revision[:-1]
+            revision_to_timestamp[revision] = timestamp
+        return revision_to_timestamp
+
+    @property
+    def usage(self):
+        ''' Return volume disk usage from the VM's perspective. It is
+            usually much lower from the host's perspective due to CoW.
+        '''
+        # Prefer the dirty image, fall back to the clean one, and
+        # report 0 if neither exists.
+        with suppress(FileNotFoundError):
+            return _get_file_disk_usage(self._path_dirty)
+        with suppress(FileNotFoundError):
+            return _get_file_disk_usage(self._path_clean)
+        return 0
+
+
+@contextmanager
+def _replace_file(dst):
+    ''' Context manager for atomically (re)placing dst: yields an open
+        tempfile named dst + '~' + random suffix, in the same directory
+        (which is created if missing). On normal exit from the block,
+        the tempfile is flushed, fsynced, closed and renamed to dst.
+        On any exception, it is closed and removed instead.
+    '''
+    staging_path = dst + '~'
+    parent_dir = os.path.dirname(staging_path)
+    name_prefix = os.path.basename(staging_path)
+    _make_dir(parent_dir)
+    staged = tempfile.NamedTemporaryFile(
+        dir=parent_dir, prefix=name_prefix, delete=False)
+    try:
+        yield staged
+        # the caller's block succeeded: persist the data, then publish
+        staged.flush()
+        os.fsync(staged.fileno())
+        staged.close()
+        _rename_file(staged.name, dst)
+    except BaseException:
+        # covers failures of the caller's block as well as of the
+        # flush/fsync/rename above
+        staged.close()
+        _remove_file(staged.name)
+        raise
+
+def _get_file_disk_usage(path):
+    ''' Return the number of bytes actually allocated on disk for a
+        file (as opposed to its logical size).
+    '''
+    allocated_blocks = os.stat(path).st_blocks
+    return allocated_blocks * BLKSIZE
+
+def _fsync_dir(path):
+    ''' Open the directory at path read-only, fsync it, and close it
+        again (even if the fsync fails).
+    '''
+    open_flags = os.O_RDONLY | os.O_DIRECTORY
+    fd = os.open(path, open_flags)
+    try:
+        os.fsync(fd)
+    finally:
+        os.close(fd)
+
+def _make_dir(path):
+    ''' Create directory path and fsync its parent. Return True if we
+        created it, False if that failed with FileExistsError.
+    '''
+    try:
+        os.mkdir(path)
+        _fsync_dir(os.path.dirname(path))
+        LOGGER.info('Created directory: %s', path)
+    except FileExistsError:
+        return False
+    return True
+
+def _remove_file(path):
+    ''' Remove the file at path and fsync its directory; a
+        FileNotFoundError is silently ignored.
+    '''
+    try:
+        os.remove(path)
+        _fsync_dir(os.path.dirname(path))
+        LOGGER.info('Removed file: %s', path)
+    except FileNotFoundError:
+        return
+
+def _remove_empty_dir(path):
+    ''' rmdir path and fsync its parent; a FileNotFoundError is
+        silently ignored. Other errors (such as the directory not
+        being empty) propagate to the caller.
+    '''
+    try:
+        os.rmdir(path)
+        _fsync_dir(os.path.dirname(path))
+        LOGGER.info('Removed empty directory: %s', path)
+    except FileNotFoundError:
+        return
+
+def _rename_file(src, dst):
+    ''' Rename src to dst, then fsync the destination directory and,
+        if it is a different one, the source directory.
+    '''
+    os.rename(src, dst)
+    dirs_to_sync = [os.path.dirname(dst)]
+    src_parent = os.path.dirname(src)
+    if src_parent not in dirs_to_sync:
+        dirs_to_sync.append(src_parent)
+    for directory in dirs_to_sync:
+        _fsync_dir(directory)
+    LOGGER.info('Renamed file: %s -> %s', src, dst)
+
+def _resize_file(path, size):
+    ''' Truncate the existing file at path to size bytes. The file is
+        opened without creating it, so a missing file raises
+        FileNotFoundError.
+    '''
+    with open(path, 'rb+') as image:
+        image.truncate(size)
+
+def _create_sparse_file(path, size):
+    ''' Put a new file of the given size at path, by truncating an
+        empty tempfile to that size and letting _replace_file() move
+        it into place.
+    '''
+    with _replace_file(path) as staged:
+        staged.truncate(size)
+        LOGGER.info('Created sparse file: %s', staged.name)
+
+def _copy_file(src, dst):
+    ''' Copy src to dst using cp with --reflink=auto and
+        --sparse=always, going through a tempfile that replaces dst.
+        Raise FileNotFoundError up front if src does not exist.
+    '''
+    src_exists = os.path.exists(src)
+    if not src_exists:
+        raise FileNotFoundError(src)
+    with _replace_file(dst) as staged:
+        LOGGER.info('Copying file: %s -> %s', src, staged.name)
+        cp_command = ('cp', '--sparse=always', '--reflink=auto',
+                      src, staged.name)
+        _cmd(*cp_command)
+
+def _cmd(*args):
+    ''' Run the command given by args and wait for it. On exit status
+        0, return its captured stdout as bytes. On any other status,
+        raise a StoragePoolException whose message includes the
+        captured stderr and stdout.
+    '''
+    try:
+        completed = subprocess.run(args, check=True,
+                                   stdout=subprocess.PIPE,
+                                   stderr=subprocess.PIPE)
+    except subprocess.CalledProcessError as ex:
+        details = '{!s} err={!r} out={!r}'.format(ex, ex.stderr, ex.stdout)
+        raise qubes.storage.StoragePoolException(details) from ex
+    else:
+        return completed.stdout
+
+def is_reflink_supported(dst_dir, src_dir=None):
+    ''' Return whether destination directory supports reflink copies
+        from source directory. (A temporary file is created in each
+        directory, using O_TMPFILE if possible.)
+
+        src_dir defaults to dst_dir. The check is a FICLONE ioctl
+        between the two temporary files: True if it succeeds, False if
+        it raises OSError. Both files are closed before returning.
+    '''
+    if src_dir is None:
+        src_dir = dst_dir
+    with tempfile.TemporaryFile(dir=dst_dir) as dst, \
+         tempfile.TemporaryFile(dir=src_dir) as src:
+        # don't let any filesystem get clever with empty files
+        src.write(b'foo')
+        # The write above only fills Python's buffer; without a flush,
+        # the file would still be empty when the ioctl looks at it.
+        src.flush()
+
+        try:
+            fcntl.ioctl(dst.fileno(), FICLONE, src.fileno())
+            return True
+        except OSError:
+            return False

+ 4 - 2
qubes/tests/storage.py

@@ -101,8 +101,10 @@ class TC_00_Pool(SystemTestCase):
             self.app.get_pool('foo-bar')
 
     def test_001_all_pool_drivers(self):
-        """ The only predefined pool driver is file """
-        self.assertCountEqual(['linux-kernel', 'lvm_thin', 'file'], pool_drivers())
+        """ Expect all our pool drivers (and only them) """
+        self.assertCountEqual(
+            ['linux-kernel', 'lvm_thin', 'file', 'file-reflink'],
+            pool_drivers())
 
     def test_002_get_pool_klass(self):
         """ Expect the default pool to be `FilePool` """

+ 1 - 1
qubes/tests/storage_file.py

@@ -274,7 +274,7 @@ class TC_01_FileVolumes(qubes.tests.QubesTestCase):
         expected = template_dir + '/root.img:' + \
                    template_dir + '/root-cow.img:' + \
                    vm_dir + '/root-cow.img'
-        self.assertVolumePath(vm, 'root', expected, rw=False)
+        self.assertVolumePath(vm, 'root', expected, rw=True)
         expected = vm_dir + '/private.img:' + \
             vm_dir + '/private-cow.img'
         self.assertVolumePath(vm, 'private', expected, rw=True)

+ 1 - 1
qubes/vm/appvm.py

@@ -45,7 +45,7 @@ class AppVM(qubes.vm.qubesvm.QubesVM):
                 'name': 'root',
                 'snap_on_start': True,
                 'save_on_stop': False,
-                'rw': False,
+                'rw': True,
                 'source': None,
             },
             'private': {

+ 1 - 1
qubes/vm/dispvm.py

@@ -47,7 +47,7 @@ class DispVM(qubes.vm.qubesvm.QubesVM):
                 'name': 'root',
                 'snap_on_start': True,
                 'save_on_stop': False,
-                'rw': False,
+                'rw': True,
                 'source': None,
             },
             'private': {

+ 1 - 0
rpm_spec/core-dom0.spec

@@ -269,6 +269,7 @@ fi
 %{python3_sitelib}/qubes/storage/__pycache__/*
 %{python3_sitelib}/qubes/storage/__init__.py
 %{python3_sitelib}/qubes/storage/file.py
+%{python3_sitelib}/qubes/storage/reflink.py
 %{python3_sitelib}/qubes/storage/kernels.py
 %{python3_sitelib}/qubes/storage/lvm.py
 

+ 2 - 0
setup.py

@@ -82,12 +82,14 @@ if __name__ == '__main__':
             ],
             'qubes.storage': [
                 'file = qubes.storage.file:FilePool',
+                'file-reflink = qubes.storage.reflink:ReflinkPool',
                 'linux-kernel = qubes.storage.kernels:LinuxKernel',
                 'lvm_thin = qubes.storage.lvm:ThinPool',
             ],
             'qubes.tests.storage': [
                 'test = qubes.tests.storage:TestPool',
                 'file = qubes.storage.file:FilePool',
+                'file-reflink = qubes.storage.reflink:ReflinkPool',
                 'linux-kernel = qubes.storage.kernels:LinuxKernel',
                 'lvm_thin = qubes.storage.lvm:ThinPool',
             ],