From 36eb7f923fdde034965e91037eb88ab33674db33 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?=
Date: Tue, 4 Oct 2016 21:54:29 +0200
Subject: [PATCH] qubes/tarwriter: add simple sparse-tar writer module

tar can't write an archive with the _contents_ of a block device. We
need this to back up LVM-based disk images. To avoid dumping the image
to a file first, create a simple tar archiver just for this purpose.

Python is not the fastest possible technology; it's 3 times slower than
the equivalent written in C. But it's much easier to read, much less
error-prone, and still processes a 1GB image in under 1s (CPU time,
leaving aside actual disk reads). So, it's acceptable.
---
 qubes/tarwriter.py       | 243 +++++++++++++++++++++++++++++++++++++++
 qubes/tests/__init__.py  |   1 +
 qubes/tests/tarwriter.py | 159 ++++++++++++++++++++++++++
 rpm_spec/core-dom0.spec  |   2 +
 4 files changed, 405 insertions(+)
 create mode 100644 qubes/tarwriter.py
 create mode 100644 qubes/tests/tarwriter.py

diff --git a/qubes/tarwriter.py b/qubes/tarwriter.py
new file mode 100644
index 00000000..eccf639b
--- /dev/null
+++ b/qubes/tarwriter.py
@@ -0,0 +1,243 @@
+#!/usr/bin/python2
+# -*- encoding: utf8 -*-
+#
+# The Qubes OS Project, http://www.qubes-os.org
+#
+# Copyright (C) 2016 Marek Marczykowski-Górecki
+#
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''Simple writer of GNU sparse tar archives, usable on block devices.'''
+
+import argparse
+import functools
+import subprocess
+import sys
+import tarfile
+import io
+
+BUF_SIZE = 409600
+
+
+class TarSparseInfo(tarfile.TarInfo):
+    '''TarInfo variant emitting a GNU sparse file header.
+
+    :param name: member name stored in the header
+    :param sparsemap: iterable of (offset, size) describing data chunks;
+        None means a regular, non-sparse member
+    '''
+
+    def __init__(self, name="", sparsemap=None):
+        super(TarSparseInfo, self).__init__(name)
+        if sparsemap is not None:
+            self.type = tarfile.GNUTYPE_SPARSE
+            self.sparsemap = list(sparsemap)
+            # compact size (data chunks only); computed from the stored list,
+            # because 'sparsemap' may be a one-shot iterator already consumed
+            self.size = functools.reduce(
+                lambda x, y: x+y[1], self.sparsemap, 0)
+        else:
+            self.sparsemap = []
+
+    @property
+    def realsize(self):
+        '''Size of the file after extraction, holes included'''
+        if self.sparsemap:
+            return self.sparsemap[-1][0] + self.sparsemap[-1][1]
+        return self.size
+
+    def sparse_header_chunk(self, index):
+        '''Encode (offset, size) of sparsemap entry *index*
+
+        Indexes past the end of the map yield NUL padding of the same length.
+        '''
+        if index < len(self.sparsemap):
+            return ''.join([
+                tarfile.itn(self.sparsemap[index][0], 12, tarfile.GNU_FORMAT),
+                tarfile.itn(self.sparsemap[index][1], 12, tarfile.GNU_FORMAT),
+            ])
+        return '\0' * 12 * 2
+
+    def get_gnu_header(self):
+        '''Part placed in 'prefix' field of posix header'''
+
+        parts = [
+            tarfile.itn(self.mtime, 12, tarfile.GNU_FORMAT),  # atime
+            tarfile.itn(self.mtime, 12, tarfile.GNU_FORMAT),  # ctime
+            tarfile.itn(0, 12, tarfile.GNU_FORMAT),  # offset
+            tarfile.stn('', 4),  # longnames
+            '\0',  # unused_pad2
+        ]
+        parts += [self.sparse_header_chunk(i) for i in range(4)]
+        parts += [
+            '\1' if len(self.sparsemap) > 4 else '\0',  # isextended
+            tarfile.itn(self.realsize, 12, tarfile.GNU_FORMAT),  # realsize
+        ]
+        return ''.join(parts)
+
+    def get_info(self, encoding, errors):
+        '''Return header fields, with GNU sparse data in 'prefix' field'''
+        info = super(TarSparseInfo, self).get_info(encoding, errors)
+        # place GNU extension into 'prefix' field
+        info['prefix'] = self.get_gnu_header()
+        return info
+
+    def tobuf(self, format=tarfile.DEFAULT_FORMAT, encoding=tarfile.ENCODING,
+            errors="strict"):
+        # pylint: disable=redefined-builtin
+        '''Return header block(s), including extended sparse headers'''
+        header_buf = super(TarSparseInfo, self).tobuf(format, encoding, errors)
+        if len(self.sparsemap) > 4:
+            return header_buf + ''.join(self.create_ext_sparse_headers())
+        return header_buf
+
+    def create_ext_sparse_headers(self):
+        '''Yield 512-byte extension headers for map entries past the 4th
+
+        Each one carries up to 21 entries followed by 'isextended' flag.
+        '''
+        for ext_hdr in range(4, len(self.sparsemap), 21):
+            sparse_parts = [self.sparse_header_chunk(i) for i in
+                range(ext_hdr, ext_hdr+21)]
+            # flag whether yet another extension header follows
+            sparse_parts.append(
+                '\1' if ext_hdr+21 < len(self.sparsemap) else '\0')
+            yield tarfile.stn(''.join(sparse_parts), 512)
+
+
+def get_sparse_map(input_file):
+    '''
+    Return map of the file where actual data is present, ignoring zeroed
+    blocks. Last entry of the map spans to the end of file, even if that
+    part is zero-size (when file ends with zeros).
+
+    This function is performance critical.
+
+    :param input_file: binary file object supporting readinto()
+    :return: iterable of (offset, size)
+    '''
+    zero_block = bytearray(tarfile.BLOCKSIZE)
+    buf = bytearray(BUF_SIZE)
+    in_data_block = False
+    data_block_start = 0
+    buf_start_offset = 0
+    while True:
+        buf_len = input_file.readinto(buf)
+        if not buf_len:
+            break
+        for offset in range(0, buf_len, tarfile.BLOCKSIZE):
+            if buf[offset:offset+tarfile.BLOCKSIZE] == zero_block:
+                if in_data_block:
+                    in_data_block = False
+                    yield (data_block_start,
+                        buf_start_offset+offset-data_block_start)
+            else:
+                if not in_data_block:
+                    in_data_block = True
+                    data_block_start = buf_start_offset+offset
+        buf_start_offset += buf_len
+    if in_data_block:
+        yield (data_block_start, buf_start_offset-data_block_start)
+    else:
+        # always emit last slice to the input end - otherwise extracted file
+        # will be truncated
+        yield (buf_start_offset, 0)
+
+
+def copy_sparse_data(input_stream, output_stream, sparse_map):
+    '''Copy data blocks from input to output according to sparse_map
+
+    :param input_stream: io.IOBase input instance
+    :param output_stream: io.IOBase output instance
+    :param sparse_map: iterable of (offset, size)
+    :raises EOFError: when input ends before all the mapped data is read
+    '''
+
+    buf = bytearray(BUF_SIZE)
+
+    for chunk in sparse_map:
+        input_stream.seek(chunk[0])
+        left = chunk[1]
+        while left:
+            if left > BUF_SIZE:
+                read = input_stream.readinto(buf)
+                output_stream.write(buf[:read])
+            else:
+                buf_trailer = input_stream.read(left)
+                read = len(buf_trailer)
+                output_stream.write(buf_trailer)
+            left -= read
+            if not read:
+                raise EOFError('premature EOF')
+
+
+def finalize(output):
+    '''Write EOF marker: two zero-filled blocks'''
+    output.write('\0' * tarfile.BLOCKSIZE * 2)
+
+
+def main(args=None):
+    '''Archive a single file (or block device) as a sparse tar member
+
+    :param args: list of command line arguments; None means sys.argv[1:]
+    :return: exit code of the compress program if used, 0 otherwise
+    '''
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--override-name', action='store',
+        dest='override_name', help='use this name in tar header')
+    parser.add_argument('--use-compress-program', default=None,
+        metavar='COMMAND', action='store', dest='use_compress_program',
+        help='Filter data through COMMAND.')
+    parser.add_argument('input_file',
+        help='input file name')
+    parser.add_argument('output_file', default='-', nargs='?',
+        help='output file name')
+    args = parser.parse_args(args)
+    header_name = args.override_name or args.input_file
+    if args.output_file == '-':
+        output_path = '/dev/stdout'
+    else:
+        output_path = args.output_file
+    compress = None
+    # 'with' closes the files also when archiving fails midway
+    with io.open(args.input_file, 'rb') as input_file:
+        sparse_map = list(get_sparse_map(input_file))
+        tar_info = TarSparseInfo(header_name, sparse_map)
+        with io.open(output_path, 'wb') as output_file:
+            if args.use_compress_program:
+                compress = subprocess.Popen([args.use_compress_program],
+                    stdin=subprocess.PIPE, stdout=output_file)
+                output = compress.stdin
+            else:
+                output = output_file
+            try:
+                output.write(tar_info.tobuf(tarfile.GNU_FORMAT))
+                copy_sparse_data(input_file, output, sparse_map)
+                finalize(output)
+            finally:
+                if compress is not None:
+                    # closing stdin signals EOF to the compress program;
+                    # always reap it, to not leave a zombie process behind
+                    try:
+                        compress.stdin.close()
+                    finally:
+                        compress.wait()
+    if compress is not None:
+        return compress.returncode
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/qubes/tests/__init__.py b/qubes/tests/__init__.py
index 9dc7f3fa..5649190e 100644
--- a/qubes/tests/__init__.py
+++ b/qubes/tests/__init__.py
@@ -809,6 +809,7 @@ def load_tests(loader, tests, pattern): # pylint: disable=unused-argument
             'qubes.tests.vm.mix.net',
             'qubes.tests.vm.adminvm',
             'qubes.tests.app',
+            'qubes.tests.tarwriter',
             'qubes.tests.tools.qvm_device',
             'qubes.tests.tools.qvm_firewall',
             'qubes.tests.tools.qvm_ls',
diff --git a/qubes/tests/tarwriter.py b/qubes/tests/tarwriter.py
new file mode 100644
index 00000000..d95144e7
--- /dev/null
+++ b/qubes/tests/tarwriter.py
@@ -0,0 +1,159 @@
+#!/usr/bin/python2
+# -*- encoding: utf8 -*-
+#
+# The Qubes OS Project, http://www.qubes-os.org
+#
+# Copyright (C) 2016 Marek Marczykowski-Górecki
+#
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+import os
+import subprocess
+import tempfile
+
+import shutil
+
+import qubes.tarwriter
+import qubes.tests
+
+
+class TC_00_TarWriter(qubes.tests.QubesTestCase):
+    '''Check that archives made by qubes.tarwriter are extractable by tar'''
+
+    def setUp(self):
+        super(TC_00_TarWriter, self).setUp()
+        # mkstemp() instead of deprecated, race-prone mktemp()
+        input_fd, self.input_path = tempfile.mkstemp()
+        os.close(input_fd)
+        output_fd, self.output_path = tempfile.mkstemp()
+        os.close(output_fd)
+        self.extract_dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        if os.path.exists(self.input_path):
+            os.unlink(self.input_path)
+        if os.path.exists(self.output_path):
+            os.unlink(self.output_path)
+        if os.path.exists(self.extract_dir):
+            shutil.rmtree(self.extract_dir)
+        return super(TC_00_TarWriter, self).tearDown()
+
+    def assertTarExtractable(self, expected_name=None):
+        '''Extract output archive with tar and compare it with the input
+
+        :param expected_name: member name expected in the archive,
+            defaults to the input path
+        '''
+        if expected_name is None:
+            expected_name = self.input_path
+        with self.assertNotRaises(subprocess.CalledProcessError):
+            tar_output = subprocess.check_output(
+                ['tar', 'xvf', self.output_path],
+                cwd=self.extract_dir,
+                stderr=subprocess.STDOUT)
+        expected_output = expected_name + '\n'
+        if expected_name.startswith('/'):
+            expected_output = (
+                'tar: Removing leading `/\' from member names\n' +
+                expected_output)
+        self.assertEqual(tar_output, expected_output)
+        extracted_path = os.path.join(self.extract_dir,
+            expected_name.lstrip('/'))
+        with self.assertNotRaises(subprocess.CalledProcessError):
+            subprocess.check_call(
+                ['diff', '-q', self.input_path, extracted_path])
+        # make sure the file is still sparse
+        orig_stat = os.stat(self.input_path)
+        extracted_stat = os.stat(extracted_path)
+        self.assertEqual(orig_stat.st_blocks, extracted_stat.st_blocks)
+        self.assertEqual(orig_stat.st_size, extracted_stat.st_size)
+
+    def write_sparse_chunks(self, num_chunks):
+        '''Write *num_chunks* data chunks of 4096 bytes each, placed every
+        8192 bytes, to the input file'''
+        with open(self.input_path, 'wb') as f:
+            for i in range(num_chunks):
+                f.seek(8192 * i)
+                f.write(b'a' * 4096)
+
+    def test_000_simple(self):
+        # single chunk at offset 0 - a regular file without holes
+        self.write_sparse_chunks(1)
+        qubes.tarwriter.main([self.input_path, self.output_path])
+        self.assertTarExtractable()
+
+    def test_001_simple_sparse2(self):
+        self.write_sparse_chunks(2)
+        qubes.tarwriter.main([self.input_path, self.output_path])
+        self.assertTarExtractable()
+
+    def test_002_simple_sparse3(self):
+        # tar header contains info about 4 chunks, check for off-by-one errors
+        self.write_sparse_chunks(3)
+        qubes.tarwriter.main([self.input_path, self.output_path])
+        self.assertTarExtractable()
+
+    def test_003_simple_sparse4(self):
+        # tar header contains info about 4 chunks, check for off-by-one errors
+        self.write_sparse_chunks(4)
+        qubes.tarwriter.main([self.input_path, self.output_path])
+        self.assertTarExtractable()
+
+    def test_004_simple_sparse5(self):
+        # tar header contains info about 4 chunks, check for off-by-one errors
+        self.write_sparse_chunks(5)
+        qubes.tarwriter.main([self.input_path, self.output_path])
+        self.assertTarExtractable()
+
+    def test_005_simple_sparse24(self):
+        # tar header contains info about 4 chunks, next header contains 21 of
+        # them, check for off-by-one errors
+        self.write_sparse_chunks(24)
+        qubes.tarwriter.main([self.input_path, self.output_path])
+        self.assertTarExtractable()
+
+    def test_006_simple_sparse25(self):
+        # tar header contains info about 4 chunks, next header contains 21 of
+        # them, check for off-by-one errors
+        self.write_sparse_chunks(25)
+        qubes.tarwriter.main([self.input_path, self.output_path])
+        self.assertTarExtractable()
+
+    def test_007_simple_sparse26(self):
+        # tar header contains info about 4 chunks, next header contains 21 of
+        # them, check for off-by-one errors
+        self.write_sparse_chunks(26)
+        qubes.tarwriter.main([self.input_path, self.output_path])
+        self.assertTarExtractable()
+
+    def test_010_override_name(self):
+        self.write_sparse_chunks(1)
+        qubes.tarwriter.main(['--override-name',
+            'different-name', self.input_path, self.output_path])
+        self.assertTarExtractable(expected_name='different-name')
+
+    def test_011_empty(self):
+        self.write_sparse_chunks(0)
+        qubes.tarwriter.main([self.input_path, self.output_path])
+        self.assertTarExtractable()
+
+    def test_012_gzip(self):
+        self.write_sparse_chunks(0)
+        retcode = qubes.tarwriter.main([
+            '--use-compress-program=gzip', self.input_path, self.output_path])
+        self.assertEqual(retcode, 0)
+        with self.assertNotRaises(subprocess.CalledProcessError):
+            subprocess.check_call(['gzip', '--test', self.output_path])
+        self.assertTarExtractable()
diff --git a/rpm_spec/core-dom0.spec b/rpm_spec/core-dom0.spec
index 9b0bb0e4..ecfd4391 100644
--- a/rpm_spec/core-dom0.spec
+++ b/rpm_spec/core-dom0.spec
@@ -220,6 +220,7 @@ fi
 %{python_sitelib}/qubes/exc.py*
 %{python_sitelib}/qubes/log.py*
 %{python_sitelib}/qubes/rngdoc.py*
+%{python_sitelib}/qubes/tarwriter.py*
 %{python_sitelib}/qubes/utils.py*
 
 %dir %{python_sitelib}/qubes/vm
@@ -290,6 +291,7 @@ fi
 %{python_sitelib}/qubes/tests/storage.py*
 %{python_sitelib}/qubes/tests/storage_file.py*
 %{python_sitelib}/qubes/tests/storage_lvm.py*
+%{python_sitelib}/qubes/tests/tarwriter.py*
 
 %dir %{python_sitelib}/qubes/tests/vm
 %{python_sitelib}/qubes/tests/vm/__init__.py*