tarwriter: use the new PAX format instead of the old GNU format for sparse files
The old format has many issues and is discouraged by the tar developers. In this case the most important issue is that the header may contain non-ASCII characters, which results in a UnicodeDecodeError (the tarfile module requires header parts to be valid UTF-8). The PAX format is much cleaner, as it uses the standard mechanism for extended headers.
This commit is contained in:
parent
abdad8c2b2
commit
397a8263bd
@ -19,6 +19,7 @@
|
|||||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
import argparse
|
import argparse
|
||||||
import functools
|
import functools
|
||||||
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
import tarfile
|
import tarfile
|
||||||
import io
|
import io
|
||||||
@ -29,12 +30,21 @@ class TarSparseInfo(tarfile.TarInfo):
|
|||||||
def __init__(self, name="", sparsemap=None):
|
def __init__(self, name="", sparsemap=None):
|
||||||
super(TarSparseInfo, self).__init__(name)
|
super(TarSparseInfo, self).__init__(name)
|
||||||
if sparsemap is not None:
|
if sparsemap is not None:
|
||||||
self.type = tarfile.GNUTYPE_SPARSE
|
self.type = tarfile.REGTYPE
|
||||||
self.sparsemap = list(sparsemap)
|
self.sparsemap = sparsemap
|
||||||
|
self.sparsemap_buf = self.format_sparse_map()
|
||||||
# compact size
|
# compact size
|
||||||
self.size = functools.reduce(lambda x, y: x+y[1], sparsemap, 0)
|
self.size = functools.reduce(lambda x, y: x+y[1], sparsemap,
|
||||||
|
0) + len(self.sparsemap_buf)
|
||||||
|
self.pax_headers['GNU.sparse.major'] = '1'
|
||||||
|
self.pax_headers['GNU.sparse.minor'] = '0'
|
||||||
|
self.pax_headers['GNU.sparse.name'] = name
|
||||||
|
self.pax_headers['GNU.sparse.realsize'] = str(self.realsize)
|
||||||
|
self.name = '{}/GNUSparseFile.{}/{}'.format(
|
||||||
|
os.path.dirname(name), os.getpid(), os.path.basename(name))
|
||||||
else:
|
else:
|
||||||
self.sparsemap = []
|
self.sparsemap = []
|
||||||
|
self.sparsemap_buf = b''
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def realsize(self):
|
def realsize(self):
|
||||||
@ -42,56 +52,22 @@ class TarSparseInfo(tarfile.TarInfo):
|
|||||||
return self.sparsemap[-1][0] + self.sparsemap[-1][1]
|
return self.sparsemap[-1][0] + self.sparsemap[-1][1]
|
||||||
return self.size
|
return self.size
|
||||||
|
|
||||||
def sparse_header_chunk(self, index):
|
def format_sparse_map(self):
|
||||||
if index < len(self.sparsemap):
|
sparsemap_txt = (str(len(self.sparsemap)) + '\n' +
|
||||||
return b''.join([
|
''.join('{}\n{}\n'.format(*entry) for entry in self.sparsemap))
|
||||||
tarfile.itn(self.sparsemap[index][0], 12, tarfile.GNU_FORMAT),
|
sparsemap_txt_len = len(sparsemap_txt)
|
||||||
tarfile.itn(self.sparsemap[index][1], 12, tarfile.GNU_FORMAT),
|
if sparsemap_txt_len % tarfile.BLOCKSIZE:
|
||||||
])
|
padding = '\0' * (tarfile.BLOCKSIZE -
|
||||||
return b'\0' * 12 * 2
|
sparsemap_txt_len % tarfile.BLOCKSIZE)
|
||||||
|
else:
|
||||||
|
padding = ''
|
||||||
|
return (sparsemap_txt + padding).encode()
|
||||||
|
|
||||||
def get_gnu_header(self):
|
def tobuf(self, format=tarfile.PAX_FORMAT, encoding=tarfile.ENCODING,
|
||||||
'''Part placed in 'prefix' field of posix header'''
|
|
||||||
|
|
||||||
parts = [
|
|
||||||
tarfile.itn(self.mtime, 12, tarfile.GNU_FORMAT), # atime
|
|
||||||
tarfile.itn(self.mtime, 12, tarfile.GNU_FORMAT), # ctime
|
|
||||||
tarfile.itn(0, 12, tarfile.GNU_FORMAT), # offset
|
|
||||||
tarfile.stn('', 4, tarfile.ENCODING, 'surrogateescape'), #longnames
|
|
||||||
b'\0', # unused_pad2
|
|
||||||
]
|
|
||||||
parts += [self.sparse_header_chunk(i) for i in range(4)]
|
|
||||||
parts += [
|
|
||||||
b'\1' if len(self.sparsemap) > 4 else b'\0', # isextended
|
|
||||||
tarfile.itn(self.realsize, 12, tarfile.GNU_FORMAT), # realsize
|
|
||||||
]
|
|
||||||
return b''.join(parts)
|
|
||||||
|
|
||||||
def get_info(self):
|
|
||||||
info = super(TarSparseInfo, self).get_info()
|
|
||||||
# place GNU extension into
|
|
||||||
info['prefix'] = self.get_gnu_header().decode(tarfile.ENCODING)
|
|
||||||
return info
|
|
||||||
|
|
||||||
def tobuf(self, format=tarfile.DEFAULT_FORMAT, encoding=tarfile.ENCODING,
|
|
||||||
errors="strict"):
|
errors="strict"):
|
||||||
# pylint: disable=redefined-builtin
|
# pylint: disable=redefined-builtin
|
||||||
header_buf = super(TarSparseInfo, self).tobuf(format, encoding, errors)
|
header_buf = super(TarSparseInfo, self).tobuf(format, encoding, errors)
|
||||||
if len(self.sparsemap) > 4:
|
return header_buf + self.sparsemap_buf
|
||||||
return header_buf + b''.join(self.create_ext_sparse_headers())
|
|
||||||
return header_buf
|
|
||||||
|
|
||||||
def create_ext_sparse_headers(self):
|
|
||||||
for ext_hdr in range(4, len(self.sparsemap), 21):
|
|
||||||
sparse_parts = [
|
|
||||||
self.sparse_header_chunk(i).decode(
|
|
||||||
tarfile.ENCODING, 'surrogateescape')
|
|
||||||
for i in range(ext_hdr, ext_hdr+21)]
|
|
||||||
sparse_parts.append(
|
|
||||||
'\1' if ext_hdr+21 < len(self.sparsemap) else '\0')
|
|
||||||
yield tarfile.stn(''.join(sparse_parts), 512,
|
|
||||||
tarfile.ENCODING, 'surrogateescape')
|
|
||||||
|
|
||||||
|
|
||||||
def get_sparse_map(input_file):
|
def get_sparse_map(input_file):
|
||||||
'''
|
'''
|
||||||
@ -190,7 +166,7 @@ def main(args=None):
|
|||||||
output = compress.stdin
|
output = compress.stdin
|
||||||
else:
|
else:
|
||||||
compress = None
|
compress = None
|
||||||
output.write(tar_info.tobuf(tarfile.GNU_FORMAT))
|
output.write(tar_info.tobuf(tarfile.PAX_FORMAT))
|
||||||
copy_sparse_data(input_file, output, sparse_map)
|
copy_sparse_data(input_file, output, sparse_map)
|
||||||
finalize(output)
|
finalize(output)
|
||||||
input_file.close()
|
input_file.close()
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
#
|
#
|
||||||
# The Qubes OS Project, http://www.qubes-os.org
|
# The Qubes OS Project, http://www.qubes-os.org
|
||||||
#
|
#
|
||||||
# Copyright (C) 2016 Marek Marczykowski-Górecki
|
# Copyright (C) 2016 Marek Marczykowski-Górecki
|
||||||
# <marmarek@invisiblethingslab.com>
|
# <marmarek@invisiblethingslab.com>
|
||||||
#
|
#
|
||||||
# This program is free software; you can redistribute it and/or modify
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
Loading…
Reference in New Issue
Block a user