qubes/tarwriter: add simple sparse-tar writer module
tar can't write an archive with the _contents_ of a block device. We need this to back up LVM-based disk images. To avoid dumping the image to a file first, create a simple tar archiver just for this purpose. Python is not the fastest possible technology; it's 3 times slower than an equivalent written in C. But it's much easier to read, much less error-prone, and still processes a 1GB image in under 1s (CPU time, leaving aside actual disk reads). So, it's acceptable.
This commit is contained in:
parent
278a5340dc
commit
36eb7f923f
206
qubes/tarwriter.py
Normal file
206
qubes/tarwriter.py
Normal file
@ -0,0 +1,206 @@
|
||||
#!/usr/bin/python2
|
||||
# -*- encoding: utf8 -*-
|
||||
#
|
||||
# The Qubes OS Project, http://www.qubes-os.org
|
||||
#
|
||||
# Copyright (C) 2016 Marek Marczykowski-Górecki
|
||||
# <marmarek@invisiblethingslab.com>
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along
|
||||
# with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
import argparse
|
||||
import functools
|
||||
import subprocess
|
||||
import tarfile
|
||||
import io
|
||||
|
||||
# Size in bytes of the reusable buffer used when scanning the input for zeroed
# blocks and when copying data into the archive. 409600 is a multiple of the
# 512-byte tar block size (800 blocks).
BUF_SIZE = 409600
|
||||
|
||||
|
||||
class TarSparseInfo(tarfile.TarInfo):
    '''Tar member header for a file stored in GNU sparse format.

    The sparse map is a list of ``(offset, size)`` entries describing the
    regions of the original file that are stored in the archive. The first
    4 entries are placed in the main header; remaining ones are emitted in
    extension header blocks holding 21 entries each.

    NOTE: string handling here follows the Python 2 ``tarfile`` helpers
    (``stn(s, length)``, ``get_info(encoding, errors)``).
    '''

    def __init__(self, name="", sparsemap=None):
        '''
        :param name: member name placed in the tar header
        :param sparsemap: iterable of (offset, size), or None for a
            non-sparse entry
        '''
        super(TarSparseInfo, self).__init__(name)
        if sparsemap is not None:
            self.type = tarfile.GNUTYPE_SPARSE
            # Materialize first: sparsemap may be a one-shot iterator, so
            # every later computation must use the stored list.
            self.sparsemap = list(sparsemap)
            # compact size - sum of the stored data regions only
            self.size = sum(chunk[1] for chunk in self.sparsemap)
        else:
            self.sparsemap = []

    @property
    def realsize(self):
        '''Size of the original file: end of the last sparse map entry, or
        plain ``size`` when there is no sparse map.'''
        if self.sparsemap:
            last = self.sparsemap[-1]
            return last[0] + last[1]
        return self.size

    def sparse_header_chunk(self, index):
        '''Return the 24-byte (offset, size) header field for map entry
        ``index``; zero-filled when ``index`` is past the end of the map.'''
        if index < len(self.sparsemap):
            return ''.join([
                tarfile.itn(self.sparsemap[index][0], 12, tarfile.GNU_FORMAT),
                tarfile.itn(self.sparsemap[index][1], 12, tarfile.GNU_FORMAT),
            ])
        return '\0' * 12 * 2

    def get_gnu_header(self):
        '''Part placed in 'prefix' field of posix header'''

        parts = [
            tarfile.itn(self.mtime, 12, tarfile.GNU_FORMAT),  # atime
            tarfile.itn(self.mtime, 12, tarfile.GNU_FORMAT),  # ctime
            tarfile.itn(0, 12, tarfile.GNU_FORMAT),  # offset
            tarfile.stn('', 4),  # longnames
            '\0',  # unused_pad2
        ]
        # the main header has room for the first 4 sparse map entries
        parts += [self.sparse_header_chunk(i) for i in range(4)]
        parts += [
            '\1' if len(self.sparsemap) > 4 else '\0',  # isextended
            tarfile.itn(self.realsize, 12, tarfile.GNU_FORMAT),  # realsize
        ]
        return ''.join(parts)

    def get_info(self, encoding, errors):
        '''Return the header fields dict with the GNU sparse fields stored
        under the 'prefix' key.'''
        info = super(TarSparseInfo, self).get_info(encoding, errors)
        # place GNU extension into the 'prefix' field of the header
        info['prefix'] = self.get_gnu_header()
        return info

    def tobuf(self, format=tarfile.DEFAULT_FORMAT, encoding=tarfile.ENCODING,
            errors="strict"):
        '''Return the serialized header, followed by extension sparse
        headers when the map has more than 4 entries.'''
        # pylint: disable=redefined-builtin
        header_buf = super(TarSparseInfo, self).tobuf(format, encoding, errors)
        if len(self.sparsemap) > 4:
            return header_buf + ''.join(self.create_ext_sparse_headers())
        return header_buf

    def create_ext_sparse_headers(self):
        '''Yield 512-byte extension headers, each describing up to 21 sparse
        map entries beyond the 4 kept in the main header.'''
        for ext_hdr in range(4, len(self.sparsemap), 21):
            sparse_parts = [self.sparse_header_chunk(i) for i in
                range(ext_hdr, ext_hdr + 21)]
            # isextended flag: set when yet another extension header follows
            sparse_parts.append(
                '\1' if ext_hdr + 21 < len(self.sparsemap) else '\0')
            yield tarfile.stn(''.join(sparse_parts), 512)
|
||||
|
||||
|
||||
def get_sparse_map(input_file, buf_size=None):
    '''
    Return map of the file where actual data is present, ignoring zero-ed
    blocks. Last entry of the map spans to the end of file, even if that part is
    zero-size (when file ends with zeros).

    This function is performance critical.

    :param input_file: io.File object
    :param buf_size: size of the read buffer in bytes; defaults to the
        module-level BUF_SIZE. Should be a multiple of the tar block size.
    :return: iterable of (offset, size)
    '''
    if buf_size is None:
        buf_size = BUF_SIZE
    block_size = tarfile.BLOCKSIZE
    zero_block = bytearray(block_size)
    buf = bytearray(buf_size)
    in_data_block = False
    data_block_start = 0
    buf_start_offset = 0
    while True:
        buf_len = input_file.readinto(buf)
        if not buf_len:
            break
        for offset in range(0, buf_len, block_size):
            block_end = offset + block_size
            if block_end <= buf_len:
                is_zero = buf[offset:block_end] == zero_block
            else:
                # Partial trailing block: compare only the bytes actually
                # read, the rest of the buffer holds stale data from the
                # previous read.
                is_zero = (buf[offset:buf_len] ==
                    zero_block[:buf_len - offset])
            if is_zero:
                if in_data_block:
                    in_data_block = False
                    yield (data_block_start,
                        buf_start_offset + offset - data_block_start)
            elif not in_data_block:
                in_data_block = True
                data_block_start = buf_start_offset + offset
        buf_start_offset += buf_len
    if in_data_block:
        yield (data_block_start, buf_start_offset - data_block_start)
    else:
        # always emit last slice to the input end - otherwise extracted file
        # will be truncated
        yield (buf_start_offset, 0)
|
||||
|
||||
|
||||
def copy_sparse_data(input_stream, output_stream, sparse_map, buf_size=None):
    '''Copy data blocks from input to output according to sparse_map

    :param input_stream: io.IOBase input instance
    :param output_stream: io.IOBase output instance
    :param sparse_map: iterable of (offset, size)
    :param buf_size: size of the copy buffer in bytes; defaults to the
        module-level BUF_SIZE
    :raises EOFError: when input ends before a mapped region is fully read
    '''
    if buf_size is None:
        buf_size = BUF_SIZE
    buf = bytearray(buf_size)

    for chunk in sparse_map:
        input_stream.seek(chunk[0])
        left = chunk[1]
        while left:
            if left > buf_size:
                # more than a full buffer remains - reuse the preallocated
                # buffer; a short read is handled by the returned count
                read = input_stream.readinto(buf) or 0
                data = buf[:read]
            else:
                # tail of the region - never read past its end
                data = input_stream.read(left)
                read = len(data)
            if not read:
                raise EOFError('premature EOF')
            output_stream.write(data)
            left -= read
|
||||
|
||||
def finalize(output):
    '''Write EOF blocks

    A tar archive is terminated by two zero-filled blocks.

    :param output: binary output stream
    '''
    # bytes literal: identical to the str literal on Python 2, and the only
    # form accepted by binary streams on Python 3
    output.write(b'\0' * tarfile.BLOCKSIZE * 2)
|
||||
|
||||
def main(args=None):
    '''Write a sparse tar archive containing a single input file.

    :param args: list of command line arguments; None means sys.argv
    :return: exit code of the compress program if one was used, 0 otherwise
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('--override-name', action='store', dest='override_name',
        help='use this name in tar header')
    parser.add_argument('--use-compress-program', default=None,
        metavar='COMMAND', action='store', dest='use_compress_program',
        help='Filter data through COMMAND.')
    parser.add_argument('input_file',
        help='input file name')
    parser.add_argument('output_file', default='-', nargs='?',
        help='output file name')
    args = parser.parse_args(args)

    header_name = args.override_name or args.input_file
    if args.output_file == '-':
        output_path = '/dev/stdout'
    else:
        output_path = args.output_file

    compress = None
    with io.open(args.input_file, 'rb') as input_file:
        # scan the input before touching the output, so a failure here does
        # not leave a truncated output file behind
        sparse_map = list(get_sparse_map(input_file))
        tar_info = TarSparseInfo(header_name, sparse_map)
        with io.open(output_path, 'wb') as output_file:
            output = output_file
            if args.use_compress_program:
                compress = subprocess.Popen([args.use_compress_program],
                    stdin=subprocess.PIPE, stdout=output_file)
                output = compress.stdin
            try:
                output.write(tar_info.tobuf(tarfile.GNU_FORMAT))
                copy_sparse_data(input_file, output, sparse_map)
                finalize(output)
            finally:
                if compress is not None:
                    # close the pipe so the compress program sees EOF, then
                    # reap it even when writing failed
                    compress.stdin.close()
                    compress.wait()
    if compress is not None:
        return compress.returncode
    return 0
|
||||
|
||||
if __name__ == '__main__':
    import sys
    # propagate main()'s result (exit code of the compress program, if any)
    # as the process exit status instead of discarding it
    sys.exit(main())
|
@ -809,6 +809,7 @@ def load_tests(loader, tests, pattern): # pylint: disable=unused-argument
|
||||
'qubes.tests.vm.mix.net',
|
||||
'qubes.tests.vm.adminvm',
|
||||
'qubes.tests.app',
|
||||
'qubes.tests.tarwriter',
|
||||
'qubes.tests.tools.qvm_device',
|
||||
'qubes.tests.tools.qvm_firewall',
|
||||
'qubes.tests.tools.qvm_ls',
|
||||
|
147
qubes/tests/tarwriter.py
Normal file
147
qubes/tests/tarwriter.py
Normal file
@ -0,0 +1,147 @@
|
||||
#!/usr/bin/python2
|
||||
# -*- encoding: utf8 -*-
|
||||
#
|
||||
# The Qubes OS Project, http://www.qubes-os.org
|
||||
#
|
||||
# Copyright (C) 2016 Marek Marczykowski-Górecki
|
||||
# <marmarek@invisiblethingslab.com>
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along
|
||||
# with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
|
||||
import shutil
|
||||
|
||||
import qubes.tarwriter
|
||||
import qubes.tests
|
||||
|
||||
|
||||
class TC_00_TarWriter(qubes.tests.QubesTestCase):
    '''Tests for qubes.tarwriter: archive sparse input files and verify that
    tar extracts them back identical and still sparse.'''

    def setUp(self):
        super(TC_00_TarWriter, self).setUp()
        # mkstemp() creates the files atomically, unlike the race-prone
        # (and deprecated) mktemp(); only the paths are needed here
        input_fd, self.input_path = tempfile.mkstemp()
        os.close(input_fd)
        output_fd, self.output_path = tempfile.mkstemp()
        os.close(output_fd)
        self.extract_dir = tempfile.mkdtemp()

    def tearDown(self):
        if os.path.exists(self.input_path):
            os.unlink(self.input_path)
        if os.path.exists(self.output_path):
            os.unlink(self.output_path)
        if os.path.exists(self.extract_dir):
            shutil.rmtree(self.extract_dir)
        return super(TC_00_TarWriter, self).tearDown()

    def assertTarExtractable(self, expected_name=None):
        '''Extract the output archive with tar and compare the result with
        the input file: member name, content, size and allocated blocks.

        :param expected_name: member name expected in the archive; defaults
            to the input file path
        '''
        if expected_name is None:
            expected_name = self.input_path
        with self.assertNotRaises(subprocess.CalledProcessError):
            tar_output = subprocess.check_output(
                ['tar', 'xvf', self.output_path],
                cwd=self.extract_dir,
                stderr=subprocess.STDOUT)
        expected_output = expected_name + '\n'
        if expected_name[0] == '/':
            # tar strips the leading slash and warns about it
            expected_output = (
                'tar: Removing leading `/\' from member names\n' +
                expected_output)
        self.assertEqual(tar_output, expected_output)
        extracted_path = os.path.join(self.extract_dir,
            expected_name.lstrip('/'))
        with self.assertNotRaises(subprocess.CalledProcessError):
            subprocess.check_call(
                ['diff', '-q', self.input_path, extracted_path])
        # make sure the file is still sparse
        orig_stat = os.stat(self.input_path)
        extracted_stat = os.stat(extracted_path)
        self.assertEqual(orig_stat.st_blocks, extracted_stat.st_blocks)
        self.assertEqual(orig_stat.st_size, extracted_stat.st_size)

    def write_sparse_chunks(self, num_chunks):
        '''Fill the input file with num_chunks 4096-byte data chunks, each
        starting at a multiple of 8192 (leaving a hole after each).'''
        with open(self.input_path, 'w') as f:
            for i in range(num_chunks):
                f.seek(8192 * i)
                f.write('a' * 4096)

    def test_000_simple(self):
        # a single chunk at offset 0 - plain 4096 bytes of data, no holes
        self.write_sparse_chunks(1)
        qubes.tarwriter.main([self.input_path, self.output_path])
        self.assertTarExtractable()

    def test_001_simple_sparse2(self):
        self.write_sparse_chunks(2)
        qubes.tarwriter.main([self.input_path, self.output_path])
        self.assertTarExtractable()

    def test_002_simple_sparse3(self):
        # tar header contains info about 4 chunks, check for off-by-one errors
        self.write_sparse_chunks(3)
        qubes.tarwriter.main([self.input_path, self.output_path])
        self.assertTarExtractable()

    def test_003_simple_sparse4(self):
        # tar header contains info about 4 chunks, check for off-by-one errors
        self.write_sparse_chunks(4)
        qubes.tarwriter.main([self.input_path, self.output_path])
        self.assertTarExtractable()

    def test_004_simple_sparse5(self):
        # tar header contains info about 4 chunks, check for off-by-one errors
        self.write_sparse_chunks(5)
        qubes.tarwriter.main([self.input_path, self.output_path])
        self.assertTarExtractable()

    def test_005_simple_sparse24(self):
        # tar header contains info about 4 chunks, next header contains 21 of
        # them, check for off-by-one errors
        self.write_sparse_chunks(24)
        qubes.tarwriter.main([self.input_path, self.output_path])
        self.assertTarExtractable()

    def test_006_simple_sparse25(self):
        # tar header contains info about 4 chunks, next header contains 21 of
        # them, check for off-by-one errors
        self.write_sparse_chunks(25)
        qubes.tarwriter.main([self.input_path, self.output_path])
        self.assertTarExtractable()

    def test_007_simple_sparse26(self):
        # tar header contains info about 4 chunks, next header contains 21 of
        # them, check for off-by-one errors
        self.write_sparse_chunks(26)
        qubes.tarwriter.main([self.input_path, self.output_path])
        self.assertTarExtractable()

    def test_010_override_name(self):
        self.write_sparse_chunks(1)
        qubes.tarwriter.main(['--override-name',
            'different-name', self.input_path, self.output_path])
        self.assertTarExtractable(expected_name='different-name')

    def test_011_empty(self):
        self.write_sparse_chunks(0)
        qubes.tarwriter.main([self.input_path, self.output_path])
        self.assertTarExtractable()

    def test_012_gzip(self):
        self.write_sparse_chunks(0)
        qubes.tarwriter.main([
            '--use-compress-program=gzip', self.input_path, self.output_path])
        with self.assertNotRaises(subprocess.CalledProcessError):
            subprocess.check_call(['gzip', '--test', self.output_path])
        self.assertTarExtractable()
|
@ -220,6 +220,7 @@ fi
|
||||
%{python_sitelib}/qubes/exc.py*
|
||||
%{python_sitelib}/qubes/log.py*
|
||||
%{python_sitelib}/qubes/rngdoc.py*
|
||||
%{python_sitelib}/qubes/tarwriter.py*
|
||||
%{python_sitelib}/qubes/utils.py*
|
||||
|
||||
%dir %{python_sitelib}/qubes/vm
|
||||
@ -290,6 +291,7 @@ fi
|
||||
%{python_sitelib}/qubes/tests/storage.py*
|
||||
%{python_sitelib}/qubes/tests/storage_file.py*
|
||||
%{python_sitelib}/qubes/tests/storage_lvm.py*
|
||||
%{python_sitelib}/qubes/tests/tarwriter.py*
|
||||
|
||||
%dir %{python_sitelib}/qubes/tests/vm
|
||||
%{python_sitelib}/qubes/tests/vm/__init__.py*
|
||||
|
Loading…
Reference in New Issue
Block a user