core-admin/qubes/tarwriter.py

#!/usr/bin/python2
# -*- encoding: utf8 -*-
#
# The Qubes OS Project, http://www.qubes-os.org
#
# Copyright (C) 2016 Marek Marczykowski-Górecki
#                               <marmarek@invisiblethingslab.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import argparse
import functools
import subprocess
import tarfile
import io

BUF_SIZE = 409600


class TarSparseInfo(tarfile.TarInfo):
    def __init__(self, name="", sparsemap=None):
        super(TarSparseInfo, self).__init__(name)
        if sparsemap is not None:
            self.type = tarfile.GNUTYPE_SPARSE
            self.sparsemap = list(sparsemap)
            # compact size
            self.size = functools.reduce(lambda x, y: x+y[1], sparsemap, 0)
        else:
            self.sparsemap = []

    @property
    def realsize(self):
        if len(self.sparsemap):
            return self.sparsemap[-1][0] + self.sparsemap[-1][1]
        else:
            return self.size

    def sparse_header_chunk(self, index):
        if index < len(self.sparsemap):
            return ''.join([
                tarfile.itn(self.sparsemap[index][0], 12, tarfile.GNU_FORMAT),
                tarfile.itn(self.sparsemap[index][1], 12, tarfile.GNU_FORMAT),
            ])
        else:
            return '\0' * 12 * 2

    def get_gnu_header(self):
        '''Part placed in 'prefix' field of posix header'''

        parts = [
            tarfile.itn(self.mtime, 12, tarfile.GNU_FORMAT),  # atime
            tarfile.itn(self.mtime, 12, tarfile.GNU_FORMAT),  # ctime
            tarfile.itn(0, 12, tarfile.GNU_FORMAT),  # offset
            tarfile.stn('', 4),  # longnames
            '\0',  # unused_pad2
        ]
        parts += [self.sparse_header_chunk(i) for i in range(4)]
        parts += [
            '\1' if len(self.sparsemap) > 4 else '\0',  # isextended
            tarfile.itn(self.realsize, 12, tarfile.GNU_FORMAT),  # realsize
        ]
        return ''.join(parts)

    def get_info(self, encoding, errors):
        info = super(TarSparseInfo, self).get_info(encoding, errors)
        # place GNU extension into
        info['prefix'] = self.get_gnu_header()
        return info

    def tobuf(self, format=tarfile.DEFAULT_FORMAT, encoding=tarfile.ENCODING,
            errors="strict"):
        # pylint: disable=redefined-builtin
        header_buf = super(TarSparseInfo, self).tobuf(format, encoding, errors)
        if len(self.sparsemap) > 4:
            return header_buf + ''.join(self.create_ext_sparse_headers())
        else:
            return header_buf

    def create_ext_sparse_headers(self):
        for ext_hdr in range(4, len(self.sparsemap), 21):
            sparse_parts = [self.sparse_header_chunk(i) for i in
                range(ext_hdr, ext_hdr+21)]
            sparse_parts += '\1' if ext_hdr+21 < len(self.sparsemap) else '\0'
            yield tarfile.stn(''.join(sparse_parts), 512)


def get_sparse_map(input_file):
    '''
    Return map of the file where actual data is present, ignoring zero-ed
    blocks. Last entry of the map spans to the end of file, even if that part is
    zero-size (when file ends with zeros).

    This function is performance critical.

    :param input_file: io.File object
    :return: iterable of (offset, size)
    '''
    zero_block = bytearray(tarfile.BLOCKSIZE)
    buf = bytearray(BUF_SIZE)
    in_data_block = False
    data_block_start = 0
    buf_start_offset = 0
    while True:
        buf_len = input_file.readinto(buf)
        if not buf_len:
            break
        for offset in range(0, buf_len, tarfile.BLOCKSIZE):
            if buf[offset:offset+tarfile.BLOCKSIZE] == zero_block:
                if in_data_block:
                    in_data_block = False
                    yield (data_block_start,
                        buf_start_offset+offset-data_block_start)
            else:
                if not in_data_block:
                    in_data_block = True
                    data_block_start = buf_start_offset+offset
        buf_start_offset += buf_len
    if in_data_block:
        yield (data_block_start, buf_start_offset-data_block_start)
    else:
        # always emit last slice to the input end - otherwise extracted file
        # will be truncated
        yield (buf_start_offset, 0)


def copy_sparse_data(input_stream, output_stream, sparse_map):
    '''Copy data blocks from input to output according to sparse_map

    :param input_stream: io.IOBase input instance
    :param output_stream: io.IOBase output instance
    :param sparse_map: iterable of (offset, size)
    '''

    buf = bytearray(BUF_SIZE)

    for chunk in sparse_map:
        input_stream.seek(chunk[0])
        left = chunk[1]
        while left:
            if left > BUF_SIZE:
                read = input_stream.readinto(buf)
                output_stream.write(buf[:read])
            else:
                buf_trailer = input_stream.read(left)
                read = len(buf_trailer)
                output_stream.write(buf_trailer)
            left -= read
            if not read:
                raise Exception('premature EOF')

def finalize(output):
    '''Write EOF blocks'''
    output.write('\0' * 512)
    output.write('\0' * 512)

def main(args=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--override-name', action='store', dest='override_name',
        help='use this name in tar header')
    parser.add_argument('--use-compress-program', default=None,
        metavar='COMMAND', action='store', dest='use_compress_program',
        help='Filter data through COMMAND.')
    parser.add_argument('input_file',
        help='input file name')
    parser.add_argument('output_file', default='-', nargs='?',
        help='output file name')
    args = parser.parse_args(args)
    input_file = io.open(args.input_file, 'rb')
    sparse_map = list(get_sparse_map(input_file))
    header_name = args.input_file
    if args.override_name:
        header_name = args.override_name
    tar_info = TarSparseInfo(header_name, sparse_map)
    if args.output_file == '-':
        output = io.open('/dev/stdout', 'wb')
    else:
        output = io.open(args.output_file, 'wb')
    if args.use_compress_program:
        compress = subprocess.Popen([args.use_compress_program],
            stdin=subprocess.PIPE, stdout=output)
        output = compress.stdin
    else:
        compress = None
    output.write(tar_info.tobuf(tarfile.GNU_FORMAT))
    copy_sparse_data(input_file, output, sparse_map)
    finalize(output)
    input_file.close()
    output.close()
    if compress is not None:
        compress.wait()
        return compress.returncode
    return 0

if __name__ == '__main__':
    main()