From 36eb7f923fdde034965e91037eb88ab33674db33 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?=
 <marmarek@invisiblethingslab.com>
Date: Tue, 4 Oct 2016 21:54:29 +0200
Subject: [PATCH] qubes/tarwriter: add simple sparse-tar writer module

tar can't write an archive with the _contents_ of a block device. We need
this to back up LVM-based disk images. To avoid dumping the image to a file
first, create a simple tar archiver just for this purpose.

Python is not the fastest possible technology; it's 3 times slower than
the equivalent written in C. But it's much easier to read, much less
error-prone, and still processes a 1GB image in under 1s (CPU time, leaving
aside actual disk reads). So, it's acceptable.
---
 qubes/tarwriter.py       | 206 +++++++++++++++++++++++++++++++++++++++
 qubes/tests/__init__.py  |   1 +
 qubes/tests/tarwriter.py | 147 ++++++++++++++++++++++++++++
 rpm_spec/core-dom0.spec  |   2 +
 4 files changed, 356 insertions(+)
 create mode 100644 qubes/tarwriter.py
 create mode 100644 qubes/tests/tarwriter.py

diff --git a/qubes/tarwriter.py b/qubes/tarwriter.py
new file mode 100644
index 00000000..eccf639b
--- /dev/null
+++ b/qubes/tarwriter.py
@@ -0,0 +1,206 @@
+#!/usr/bin/python2
+# -*- encoding: utf8 -*-
+#
+# The Qubes OS Project, http://www.qubes-os.org
+#
+# Copyright (C) 2016 Marek Marczykowski-Górecki
+#                               <marmarek@invisiblethingslab.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+import argparse
+import functools
+import subprocess
+import tarfile
+import io
+
+BUF_SIZE = 409600
+
+
+class TarSparseInfo(tarfile.TarInfo):
+    def __init__(self, name="", sparsemap=None):
+        super(TarSparseInfo, self).__init__(name)
+        if sparsemap is not None:
+            self.type = tarfile.GNUTYPE_SPARSE
+            self.sparsemap = list(sparsemap)
+            # compact size
+            self.size = functools.reduce(lambda x, y: x+y[1], sparsemap, 0)
+        else:
+            self.sparsemap = []
+
+    @property
+    def realsize(self):
+        if len(self.sparsemap):
+            return self.sparsemap[-1][0] + self.sparsemap[-1][1]
+        else:
+            return self.size
+
+    def sparse_header_chunk(self, index):
+        if index < len(self.sparsemap):
+            return ''.join([
+                tarfile.itn(self.sparsemap[index][0], 12, tarfile.GNU_FORMAT),
+                tarfile.itn(self.sparsemap[index][1], 12, tarfile.GNU_FORMAT),
+            ])
+        else:
+            return '\0' * 12 * 2
+
+    def get_gnu_header(self):
+        '''Part placed in 'prefix' field of posix header'''
+
+        parts = [
+            tarfile.itn(self.mtime, 12, tarfile.GNU_FORMAT),  # atime
+            tarfile.itn(self.mtime, 12, tarfile.GNU_FORMAT),  # ctime
+            tarfile.itn(0, 12, tarfile.GNU_FORMAT),  # offset
+            tarfile.stn('', 4),  # longnames
+            '\0',  # unused_pad2
+        ]
+        parts += [self.sparse_header_chunk(i) for i in range(4)]
+        parts += [
+            '\1' if len(self.sparsemap) > 4 else '\0',  # isextended
+            tarfile.itn(self.realsize, 12, tarfile.GNU_FORMAT),  # realsize
+        ]
+        return ''.join(parts)
+
+    def get_info(self, encoding, errors):
+        info = super(TarSparseInfo, self).get_info(encoding, errors)
+        # place GNU extension fields into the 'prefix' slot of the header
+        info['prefix'] = self.get_gnu_header()
+        return info
+
+    def tobuf(self, format=tarfile.DEFAULT_FORMAT, encoding=tarfile.ENCODING,
+            errors="strict"):
+        # pylint: disable=redefined-builtin
+        header_buf = super(TarSparseInfo, self).tobuf(format, encoding, errors)
+        if len(self.sparsemap) > 4:
+            return header_buf + ''.join(self.create_ext_sparse_headers())
+        else:
+            return header_buf
+
+    def create_ext_sparse_headers(self):
+        for ext_hdr in range(4, len(self.sparsemap), 21):
+            sparse_parts = [self.sparse_header_chunk(i) for i in
+                range(ext_hdr, ext_hdr+21)]
+            sparse_parts += '\1' if ext_hdr+21 < len(self.sparsemap) else '\0'
+            yield tarfile.stn(''.join(sparse_parts), 512)
+
+
+def get_sparse_map(input_file):
+    '''
+    Return a map of the regions of the file where actual data is present,
+    ignoring zeroed blocks. The last entry of the map always extends to the
+    end of the file, even if that entry is zero-size (file ends with zeros).
+
+    This function is performance critical.
+
+    :param input_file: binary file object supporting readinto()
+    :return: iterable of (offset, size)
+    '''
+    zero_block = bytearray(tarfile.BLOCKSIZE)
+    buf = bytearray(BUF_SIZE)
+    in_data_block = False
+    data_block_start = 0
+    buf_start_offset = 0
+    while True:
+        buf_len = input_file.readinto(buf)
+        if not buf_len:
+            break
+        for offset in range(0, buf_len, tarfile.BLOCKSIZE):
+            if buf[offset:offset+tarfile.BLOCKSIZE] == zero_block:
+                if in_data_block:
+                    in_data_block = False
+                    yield (data_block_start,
+                        buf_start_offset+offset-data_block_start)
+            else:
+                if not in_data_block:
+                    in_data_block = True
+                    data_block_start = buf_start_offset+offset
+        buf_start_offset += buf_len
+    if in_data_block:
+        yield (data_block_start, buf_start_offset-data_block_start)
+    else:
+        # always emit last slice to the input end - otherwise extracted file
+        # will be truncated
+        yield (buf_start_offset, 0)
+
+
+def copy_sparse_data(input_stream, output_stream, sparse_map):
+    '''Copy data blocks from input to output according to sparse_map
+
+    :param input_stream: io.IOBase input instance
+    :param output_stream: io.IOBase output instance
+    :param sparse_map: iterable of (offset, size)
+    '''
+
+    buf = bytearray(BUF_SIZE)
+
+    for chunk in sparse_map:
+        input_stream.seek(chunk[0])
+        left = chunk[1]
+        while left:
+            if left > BUF_SIZE:
+                read = input_stream.readinto(buf)
+                output_stream.write(buf[:read])
+            else:
+                buf_trailer = input_stream.read(left)
+                read = len(buf_trailer)
+                output_stream.write(buf_trailer)
+            left -= read
+            if not read:
+                raise Exception('premature EOF')
+
+def finalize(output):
+    '''Write EOF blocks'''
+    output.write('\0' * 512)
+    output.write('\0' * 512)
+
+def main(args=None):
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--override-name', action='store', dest='override_name',
+        help='use this name in tar header')
+    parser.add_argument('--use-compress-program', default=None,
+        metavar='COMMAND', action='store', dest='use_compress_program',
+        help='Filter data through COMMAND.')
+    parser.add_argument('input_file',
+        help='input file name')
+    parser.add_argument('output_file', default='-', nargs='?',
+        help='output file name')
+    args = parser.parse_args(args)
+    input_file = io.open(args.input_file, 'rb')
+    sparse_map = list(get_sparse_map(input_file))
+    header_name = args.input_file
+    if args.override_name:
+        header_name = args.override_name
+    tar_info = TarSparseInfo(header_name, sparse_map)
+    if args.output_file == '-':
+        output = io.open('/dev/stdout', 'wb')
+    else:
+        output = io.open(args.output_file, 'wb')
+    if args.use_compress_program:
+        compress = subprocess.Popen([args.use_compress_program],
+            stdin=subprocess.PIPE, stdout=output)
+        output = compress.stdin
+    else:
+        compress = None
+    output.write(tar_info.tobuf(tarfile.GNU_FORMAT))
+    copy_sparse_data(input_file, output, sparse_map)
+    finalize(output)
+    input_file.close()
+    output.close()
+    if compress is not None:
+        compress.wait()
+        return compress.returncode
+    return 0
+
+if __name__ == '__main__':
+    main()
diff --git a/qubes/tests/__init__.py b/qubes/tests/__init__.py
index 9dc7f3fa..5649190e 100644
--- a/qubes/tests/__init__.py
+++ b/qubes/tests/__init__.py
@@ -809,6 +809,7 @@ def load_tests(loader, tests, pattern): # pylint: disable=unused-argument
             'qubes.tests.vm.mix.net',
             'qubes.tests.vm.adminvm',
             'qubes.tests.app',
+            'qubes.tests.tarwriter',
             'qubes.tests.tools.qvm_device',
             'qubes.tests.tools.qvm_firewall',
             'qubes.tests.tools.qvm_ls',
diff --git a/qubes/tests/tarwriter.py b/qubes/tests/tarwriter.py
new file mode 100644
index 00000000..d95144e7
--- /dev/null
+++ b/qubes/tests/tarwriter.py
@@ -0,0 +1,147 @@
+#!/usr/bin/python2
+# -*- encoding: utf8 -*-
+#
+# The Qubes OS Project, http://www.qubes-os.org
+#
+# Copyright (C) 2016 Marek Marczykowski-Górecki 
+#                               <marmarek@invisiblethingslab.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+import os
+import subprocess
+import tempfile
+
+import shutil
+
+import qubes.tarwriter
+import qubes.tests
+
+
+class TC_00_TarWriter(qubes.tests.QubesTestCase):
+    def setUp(self):
+        super(TC_00_TarWriter, self).setUp()
+        self.input_path = tempfile.mktemp()
+        self.output_path = tempfile.mktemp()
+        self.extract_dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        if os.path.exists(self.input_path):
+            os.unlink(self.input_path)
+        if os.path.exists(self.output_path):
+            os.unlink(self.output_path)
+        if os.path.exists(self.extract_dir):
+            shutil.rmtree(self.extract_dir)
+        return super(TC_00_TarWriter, self).tearDown()
+
+    def assertTarExtractable(self, expected_name=None):
+        if expected_name is None:
+            expected_name = self.input_path
+        with self.assertNotRaises(subprocess.CalledProcessError):
+            tar_output = subprocess.check_output(
+                ['tar', 'xvf', self.output_path],
+                cwd=self.extract_dir,
+                stderr=subprocess.STDOUT)
+        expected_output = expected_name + '\n'
+        if expected_name[0] == '/':
+            expected_output = (
+                'tar: Removing leading `/\' from member names\n' +
+                expected_output)
+        self.assertEqual(tar_output, expected_output)
+        extracted_path = os.path.join(self.extract_dir,
+            expected_name.lstrip('/'))
+        with self.assertNotRaises(subprocess.CalledProcessError):
+            subprocess.check_call(
+                ['diff', '-q', self.input_path, extracted_path])
+        # make sure the file is still sparse
+        orig_stat = os.stat(self.input_path)
+        extracted_stat = os.stat(extracted_path)
+        self.assertEqual(orig_stat.st_blocks, extracted_stat.st_blocks)
+        self.assertEqual(orig_stat.st_size, extracted_stat.st_size)
+
+    def write_sparse_chunks(self, num_chunks):
+        with open(self.input_path, 'w') as f:
+            for i in range(num_chunks):
+                f.seek(8192 * i)
+                f.write('a' * 4096)
+
+    def test_000_simple(self):
+        self.write_sparse_chunks(1)
+        with open(self.input_path, 'w') as f:
+            f.write('a' * 4096)
+        qubes.tarwriter.main([self.input_path, self.output_path])
+        self.assertTarExtractable()
+
+    def test_001_simple_sparse2(self):
+        self.write_sparse_chunks(2)
+        qubes.tarwriter.main([self.input_path, self.output_path])
+        self.assertTarExtractable()
+
+    def test_002_simple_sparse3(self):
+        # tar header contains info about 4 chunks, check for off-by-one errors
+        self.write_sparse_chunks(3)
+        qubes.tarwriter.main([self.input_path, self.output_path])
+        self.assertTarExtractable()
+
+    def test_003_simple_sparse4(self):
+        # tar header contains info about 4 chunks, check for off-by-one errors
+        self.write_sparse_chunks(4)
+        qubes.tarwriter.main([self.input_path, self.output_path])
+        self.assertTarExtractable()
+
+    def test_004_simple_sparse5(self):
+        # tar header contains info about 4 chunks, check for off-by-one errors
+        self.write_sparse_chunks(5)
+        qubes.tarwriter.main([self.input_path, self.output_path])
+        self.assertTarExtractable()
+
+    def test_005_simple_sparse24(self):
+        # tar header contains info about 4 chunks, next header contains 21 of
+        # them, check for off-by-one errors
+        self.write_sparse_chunks(24)
+        qubes.tarwriter.main([self.input_path, self.output_path])
+        self.assertTarExtractable()
+
+    def test_006_simple_sparse25(self):
+        # tar header contains info about 4 chunks, next header contains 21 of
+        # them, check for off-by-one errors
+        self.write_sparse_chunks(25)
+        qubes.tarwriter.main([self.input_path, self.output_path])
+        self.assertTarExtractable()
+
+    def test_007_simple_sparse26(self):
+        # tar header contains info about 4 chunks, next header contains 21 of
+        # them, check for off-by-one errors
+        self.write_sparse_chunks(26)
+        qubes.tarwriter.main([self.input_path, self.output_path])
+        self.assertTarExtractable()
+
+    def test_010_override_name(self):
+        self.write_sparse_chunks(1)
+        qubes.tarwriter.main(['--override-name',
+            'different-name', self.input_path, self.output_path])
+        self.assertTarExtractable(expected_name='different-name')
+
+    def test_011_empty(self):
+        self.write_sparse_chunks(0)
+        qubes.tarwriter.main([self.input_path, self.output_path])
+        self.assertTarExtractable()
+
+    def test_012_gzip(self):
+        self.write_sparse_chunks(0)
+        qubes.tarwriter.main([
+            '--use-compress-program=gzip', self.input_path, self.output_path])
+        with self.assertNotRaises(subprocess.CalledProcessError):
+            subprocess.check_call(['gzip', '--test', self.output_path])
+        self.assertTarExtractable()
diff --git a/rpm_spec/core-dom0.spec b/rpm_spec/core-dom0.spec
index 9b0bb0e4..ecfd4391 100644
--- a/rpm_spec/core-dom0.spec
+++ b/rpm_spec/core-dom0.spec
@@ -220,6 +220,7 @@ fi
 %{python_sitelib}/qubes/exc.py*
 %{python_sitelib}/qubes/log.py*
 %{python_sitelib}/qubes/rngdoc.py*
+%{python_sitelib}/qubes/tarwriter.py*
 %{python_sitelib}/qubes/utils.py*
 
 %dir %{python_sitelib}/qubes/vm
@@ -290,6 +291,7 @@ fi
 %{python_sitelib}/qubes/tests/storage.py*
 %{python_sitelib}/qubes/tests/storage_file.py*
 %{python_sitelib}/qubes/tests/storage_lvm.py*
+%{python_sitelib}/qubes/tests/tarwriter.py*
 
 %dir %{python_sitelib}/qubes/tests/vm
 %{python_sitelib}/qubes/tests/vm/__init__.py*