tarwriter: use new PAX format, instead of old GNU for sparse files

The old format has many issues and is discouraged by the tar developers. In
this case the most important one is that the header may contain non-ASCII
characters, which results in a UnicodeDecodeError (the tarfile module
requires header parts to be in UTF-8).
The PAX format is much cleaner, as it uses the standard mechanism for
extended headers.
This commit is contained in:
Marek Marczykowski-Górecki 2017-07-21 03:14:06 +02:00
parent abdad8c2b2
commit 397a8263bd
No known key found for this signature in database
GPG Key ID: 063938BA42CFA724
2 changed files with 27 additions and 51 deletions

View File

@ -19,6 +19,7 @@
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import argparse import argparse
import functools import functools
import os
import subprocess import subprocess
import tarfile import tarfile
import io import io
@ -29,12 +30,21 @@ class TarSparseInfo(tarfile.TarInfo):
def __init__(self, name="", sparsemap=None): def __init__(self, name="", sparsemap=None):
super(TarSparseInfo, self).__init__(name) super(TarSparseInfo, self).__init__(name)
if sparsemap is not None: if sparsemap is not None:
self.type = tarfile.GNUTYPE_SPARSE self.type = tarfile.REGTYPE
self.sparsemap = list(sparsemap) self.sparsemap = sparsemap
self.sparsemap_buf = self.format_sparse_map()
# compact size # compact size
self.size = functools.reduce(lambda x, y: x+y[1], sparsemap, 0) self.size = functools.reduce(lambda x, y: x+y[1], sparsemap,
0) + len(self.sparsemap_buf)
self.pax_headers['GNU.sparse.major'] = '1'
self.pax_headers['GNU.sparse.minor'] = '0'
self.pax_headers['GNU.sparse.name'] = name
self.pax_headers['GNU.sparse.realsize'] = str(self.realsize)
self.name = '{}/GNUSparseFile.{}/{}'.format(
os.path.dirname(name), os.getpid(), os.path.basename(name))
else: else:
self.sparsemap = [] self.sparsemap = []
self.sparsemap_buf = b''
@property @property
def realsize(self): def realsize(self):
@ -42,56 +52,22 @@ class TarSparseInfo(tarfile.TarInfo):
return self.sparsemap[-1][0] + self.sparsemap[-1][1] return self.sparsemap[-1][0] + self.sparsemap[-1][1]
return self.size return self.size
def sparse_header_chunk(self, index): def format_sparse_map(self):
if index < len(self.sparsemap): sparsemap_txt = (str(len(self.sparsemap)) + '\n' +
return b''.join([ ''.join('{}\n{}\n'.format(*entry) for entry in self.sparsemap))
tarfile.itn(self.sparsemap[index][0], 12, tarfile.GNU_FORMAT), sparsemap_txt_len = len(sparsemap_txt)
tarfile.itn(self.sparsemap[index][1], 12, tarfile.GNU_FORMAT), if sparsemap_txt_len % tarfile.BLOCKSIZE:
]) padding = '\0' * (tarfile.BLOCKSIZE -
return b'\0' * 12 * 2 sparsemap_txt_len % tarfile.BLOCKSIZE)
else:
padding = ''
return (sparsemap_txt + padding).encode()
def get_gnu_header(self): def tobuf(self, format=tarfile.PAX_FORMAT, encoding=tarfile.ENCODING,
'''Part placed in 'prefix' field of posix header'''
parts = [
tarfile.itn(self.mtime, 12, tarfile.GNU_FORMAT), # atime
tarfile.itn(self.mtime, 12, tarfile.GNU_FORMAT), # ctime
tarfile.itn(0, 12, tarfile.GNU_FORMAT), # offset
tarfile.stn('', 4, tarfile.ENCODING, 'surrogateescape'), #longnames
b'\0', # unused_pad2
]
parts += [self.sparse_header_chunk(i) for i in range(4)]
parts += [
b'\1' if len(self.sparsemap) > 4 else b'\0', # isextended
tarfile.itn(self.realsize, 12, tarfile.GNU_FORMAT), # realsize
]
return b''.join(parts)
def get_info(self):
info = super(TarSparseInfo, self).get_info()
# place GNU extension into
info['prefix'] = self.get_gnu_header().decode(tarfile.ENCODING)
return info
def tobuf(self, format=tarfile.DEFAULT_FORMAT, encoding=tarfile.ENCODING,
errors="strict"): errors="strict"):
# pylint: disable=redefined-builtin # pylint: disable=redefined-builtin
header_buf = super(TarSparseInfo, self).tobuf(format, encoding, errors) header_buf = super(TarSparseInfo, self).tobuf(format, encoding, errors)
if len(self.sparsemap) > 4: return header_buf + self.sparsemap_buf
return header_buf + b''.join(self.create_ext_sparse_headers())
return header_buf
def create_ext_sparse_headers(self):
for ext_hdr in range(4, len(self.sparsemap), 21):
sparse_parts = [
self.sparse_header_chunk(i).decode(
tarfile.ENCODING, 'surrogateescape')
for i in range(ext_hdr, ext_hdr+21)]
sparse_parts.append(
'\1' if ext_hdr+21 < len(self.sparsemap) else '\0')
yield tarfile.stn(''.join(sparse_parts), 512,
tarfile.ENCODING, 'surrogateescape')
def get_sparse_map(input_file): def get_sparse_map(input_file):
''' '''
@ -190,7 +166,7 @@ def main(args=None):
output = compress.stdin output = compress.stdin
else: else:
compress = None compress = None
output.write(tar_info.tobuf(tarfile.GNU_FORMAT)) output.write(tar_info.tobuf(tarfile.PAX_FORMAT))
copy_sparse_data(input_file, output, sparse_map) copy_sparse_data(input_file, output, sparse_map)
finalize(output) finalize(output)
input_file.close() input_file.close()

View File

@ -1,7 +1,7 @@
# #
# The Qubes OS Project, http://www.qubes-os.org # The Qubes OS Project, http://www.qubes-os.org
# #
# Copyright (C) 2016 Marek Marczykowski-Górecki # Copyright (C) 2016 Marek Marczykowski-Górecki
# <marmarek@invisiblethingslab.com> # <marmarek@invisiblethingslab.com>
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify