core-admin/qubes/tarwriter.py
Marek Marczykowski-Górecki 397a8263bd
tarwriter: use new PAX format, instead of old GNU for sparse files
The old format has many issues and is discouraged by tar developers. The
most important one here is a header that may contain non-ASCII characters,
which results in a UnicodeDecodeError (the tarfile module requires header
parts to be valid UTF-8).
The PAX format is much cleaner, as it uses a standard mechanism for extended
headers.
2017-07-21 03:14:06 +02:00
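A minimal sketch (separate from tarwriter.py, with a made-up member name) of why PAX matters here: with format=tarfile.PAX_FORMAT, member names travel in UTF-8 extended header records, so a non-ASCII name survives a round trip through Python's tarfile module:

    import io
    import tarfile

    buf = io.BytesIO()
    with tarfile.open(fileobj=buf, mode='w', format=tarfile.PAX_FORMAT) as tar:
        info = tarfile.TarInfo('zażółć.img')   # non-ASCII member name
        tar.addfile(info)                      # zero-length member is enough here
    buf.seek(0)
    with tarfile.open(fileobj=buf, mode='r') as tar:
        print(tar.getnames())                  # ['zażółć.img']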


#
# The Qubes OS Project, http://www.qubes-os.org
#
# Copyright (C) 2016 Marek Marczykowski-Górecki
# <marmarek@invisiblethingslab.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import argparse
import functools
import os
import subprocess
import tarfile
import io
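
# Read buffer size used when scanning for data and copying it; a multiple of
# tarfile.BLOCKSIZE (512), so the block comparisons in get_sparse_map() never
# straddle a buffer boundary.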
BUF_SIZE = 409600


class TarSparseInfo(tarfile.TarInfo):
    def __init__(self, name="", sparsemap=None):
        super(TarSparseInfo, self).__init__(name)
        if sparsemap is not None:
            self.type = tarfile.REGTYPE
            self.sparsemap = sparsemap
            self.sparsemap_buf = self.format_sparse_map()
            # compact size
            self.size = functools.reduce(lambda x, y: x+y[1], sparsemap,
                0) + len(self.sparsemap_buf)
            self.pax_headers['GNU.sparse.major'] = '1'
            self.pax_headers['GNU.sparse.minor'] = '0'
            self.pax_headers['GNU.sparse.name'] = name
            self.pax_headers['GNU.sparse.realsize'] = str(self.realsize)
            self.name = '{}/GNUSparseFile.{}/{}'.format(
                os.path.dirname(name), os.getpid(), os.path.basename(name))
        else:
            self.sparsemap = []
            self.sparsemap_buf = b''

    @property
    def realsize(self):
        if self.sparsemap:
            return self.sparsemap[-1][0] + self.sparsemap[-1][1]
        return self.size

    def format_sparse_map(self):
        sparsemap_txt = (str(len(self.sparsemap)) + '\n' +
            ''.join('{}\n{}\n'.format(*entry) for entry in self.sparsemap))
        sparsemap_txt_len = len(sparsemap_txt)
        if sparsemap_txt_len % tarfile.BLOCKSIZE:
            padding = '\0' * (tarfile.BLOCKSIZE -
                sparsemap_txt_len % tarfile.BLOCKSIZE)
        else:
            padding = ''
        return (sparsemap_txt + padding).encode()

    def tobuf(self, format=tarfile.PAX_FORMAT, encoding=tarfile.ENCODING,
            errors="strict"):
        # pylint: disable=redefined-builtin
        header_buf = super(TarSparseInfo, self).tobuf(format, encoding, errors)
        return header_buf + self.sparsemap_buf
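
# Rough layout of a member produced with the class above (a sketch derived
# from the code, following the PAX 1.0 sparse convention used by GNU tar):
#   - PAX extended header records: GNU.sparse.major/minor/name/realsize
#   - ustar header with the mangled name dir/GNUSparseFile.<pid>/<basename>
#     and size = encoded sparse map + data runs
#   - sparse map as decimal text: entry count, then offset/length pairs, one
#     number per line, NUL-padded to a 512-byte boundary
#   - the non-zero data runs themselves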


def get_sparse_map(input_file):
    '''
    Return a map of the file where actual data is present, ignoring zeroed
    blocks. The last entry of the map spans to the end of the file, even if
    that part is zero-sized (when the file ends with zeros).

    This function is performance critical.

    :param input_file: io.File object
    :return: iterable of (offset, size)
    '''
    zero_block = bytearray(tarfile.BLOCKSIZE)
    buf = bytearray(BUF_SIZE)
    in_data_block = False
    data_block_start = 0
    buf_start_offset = 0
    while True:
        buf_len = input_file.readinto(buf)
        if not buf_len:
            break
        for offset in range(0, buf_len, tarfile.BLOCKSIZE):
            if buf[offset:offset+tarfile.BLOCKSIZE] == zero_block:
                if in_data_block:
                    in_data_block = False
                    yield (data_block_start,
                        buf_start_offset+offset-data_block_start)
            else:
                if not in_data_block:
                    in_data_block = True
                    data_block_start = buf_start_offset+offset
        buf_start_offset += buf_len
    if in_data_block:
        yield (data_block_start, buf_start_offset-data_block_start)
    else:
        # always emit the last slice up to the end of input - otherwise the
        # extracted file would be truncated
        yield (buf_start_offset, 0)
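
# Example (hypothetical input): for a 1 MiB file whose only non-zero bytes sit
# in the first 512-byte block, get_sparse_map() yields
#   [(0, 512), (1048576, 0)]
# i.e. one data run plus the final zero-length entry marking the file size.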


def copy_sparse_data(input_stream, output_stream, sparse_map):
    '''Copy data blocks from input to output according to sparse_map

    :param input_stream: io.IOBase input instance
    :param output_stream: io.IOBase output instance
    :param sparse_map: iterable of (offset, size)
    '''

    buf = bytearray(BUF_SIZE)

    for chunk in sparse_map:
        input_stream.seek(chunk[0])
        left = chunk[1]
        while left:
            if left > BUF_SIZE:
                read = input_stream.readinto(buf)
                output_stream.write(buf[:read])
            else:
                buf_trailer = input_stream.read(left)
                read = len(buf_trailer)
                output_stream.write(buf_trailer)
            left -= read
            if not read:
                raise Exception('premature EOF')
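
# For the hypothetical map above, copy_sparse_data() would write only the 512
# data bytes (the final zero-length entry contributes nothing), so the member's
# data area is the encoded sparse map followed by the non-zero runs.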


def finalize(output):
    '''Write EOF blocks'''
    # the end of a tar archive is marked by two consecutive zero-filled
    # 512-byte blocks
    output.write(b'\0' * 512)
    output.write(b'\0' * 512)


def main(args=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--override-name', action='store', dest='override_name',
        help='use this name in tar header')
    parser.add_argument('--use-compress-program', default=None,
        metavar='COMMAND', action='store', dest='use_compress_program',
        help='Filter data through COMMAND.')
    parser.add_argument('input_file',
        help='input file name')
    parser.add_argument('output_file', default='-', nargs='?',
        help='output file name')
    args = parser.parse_args(args)

    input_file = io.open(args.input_file, 'rb')
    sparse_map = list(get_sparse_map(input_file))
    header_name = args.input_file
    if args.override_name:
        header_name = args.override_name
    tar_info = TarSparseInfo(header_name, sparse_map)
    if args.output_file == '-':
        output = io.open('/dev/stdout', 'wb')
    else:
        output = io.open(args.output_file, 'wb')
    if args.use_compress_program:
        compress = subprocess.Popen([args.use_compress_program],
            stdin=subprocess.PIPE, stdout=output)
        output = compress.stdin
    else:
        compress = None
    output.write(tar_info.tobuf(tarfile.PAX_FORMAT))
    copy_sparse_data(input_file, output, sparse_map)
    finalize(output)
    input_file.close()
    output.close()
    if compress is not None:
        compress.wait()
        return compress.returncode
    return 0


if __name__ == '__main__':
    main()
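
# Example command-line use (hypothetical paths, assuming the qubes package is
# importable):
#
#   python3 -m qubes.tarwriter --override-name private.img \
#       /var/lib/qubes/appvms/work/private.img private.img.tar
#
# GNU tar understands the resulting PAX sparse member, e.g.:
#
#   tar -xf private.img.tar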