tarwriter.py 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207
  1. #
  2. # The Qubes OS Project, http://www.qubes-os.org
  3. #
  4. # Copyright (C) 2016 Marek Marczykowski-Górecki
  5. # <marmarek@invisiblethingslab.com>
  6. #
  7. # This program is free software; you can redistribute it and/or modify
  8. # it under the terms of the GNU General Public License as published by
  9. # the Free Software Foundation; either version 2 of the License, or
  10. # (at your option) any later version.
  11. #
  12. # This program is distributed in the hope that it will be useful,
  13. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. # GNU General Public License for more details.
  16. #
  17. # You should have received a copy of the GNU General Public License along
  18. # with this program; if not, write to the Free Software Foundation, Inc.,
  19. # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  20. import argparse
  21. import functools
  22. import subprocess
  23. import tarfile
  24. import io
# Size in bytes of the reusable buffer that get_sparse_map() and
# copy_sparse_data() read the input into (800 tar blocks of 512 bytes).
BUF_SIZE = 409600
  26. class TarSparseInfo(tarfile.TarInfo):
  27. def __init__(self, name="", sparsemap=None):
  28. super(TarSparseInfo, self).__init__(name)
  29. if sparsemap is not None:
  30. self.type = tarfile.GNUTYPE_SPARSE
  31. self.sparsemap = list(sparsemap)
  32. # compact size
  33. self.size = functools.reduce(lambda x, y: x+y[1], sparsemap, 0)
  34. else:
  35. self.sparsemap = []
  36. @property
  37. def realsize(self):
  38. if len(self.sparsemap):
  39. return self.sparsemap[-1][0] + self.sparsemap[-1][1]
  40. else:
  41. return self.size
  42. def sparse_header_chunk(self, index):
  43. if index < len(self.sparsemap):
  44. return b''.join([
  45. tarfile.itn(self.sparsemap[index][0], 12, tarfile.GNU_FORMAT),
  46. tarfile.itn(self.sparsemap[index][1], 12, tarfile.GNU_FORMAT),
  47. ])
  48. else:
  49. return b'\0' * 12 * 2
  50. def get_gnu_header(self):
  51. '''Part placed in 'prefix' field of posix header'''
  52. parts = [
  53. tarfile.itn(self.mtime, 12, tarfile.GNU_FORMAT), # atime
  54. tarfile.itn(self.mtime, 12, tarfile.GNU_FORMAT), # ctime
  55. tarfile.itn(0, 12, tarfile.GNU_FORMAT), # offset
  56. tarfile.stn('', 4, tarfile.ENCODING, 'surrogateescape'), #longnames
  57. b'\0', # unused_pad2
  58. ]
  59. parts += [self.sparse_header_chunk(i) for i in range(4)]
  60. parts += [
  61. b'\1' if len(self.sparsemap) > 4 else b'\0', # isextended
  62. tarfile.itn(self.realsize, 12, tarfile.GNU_FORMAT), # realsize
  63. ]
  64. return b''.join(parts)
  65. def get_info(self):
  66. info = super(TarSparseInfo, self).get_info()
  67. # place GNU extension into
  68. info['prefix'] = self.get_gnu_header().decode(tarfile.ENCODING)
  69. return info
  70. def tobuf(self, format=tarfile.DEFAULT_FORMAT, encoding=tarfile.ENCODING,
  71. errors="strict"):
  72. # pylint: disable=redefined-builtin
  73. header_buf = super(TarSparseInfo, self).tobuf(format, encoding, errors)
  74. if len(self.sparsemap) > 4:
  75. return header_buf + b''.join(self.create_ext_sparse_headers())
  76. else:
  77. return header_buf
  78. def create_ext_sparse_headers(self):
  79. for ext_hdr in range(4, len(self.sparsemap), 21):
  80. sparse_parts = [
  81. self.sparse_header_chunk(i).decode(
  82. tarfile.ENCODING, 'surrogateescape')
  83. for i in range(ext_hdr, ext_hdr+21)]
  84. sparse_parts.append(
  85. '\1' if ext_hdr+21 < len(self.sparsemap) else '\0')
  86. yield tarfile.stn(''.join(sparse_parts), 512,
  87. tarfile.ENCODING, 'surrogateescape')
  88. def get_sparse_map(input_file):
  89. '''
  90. Return map of the file where actual data is present, ignoring zero-ed
  91. blocks. Last entry of the map spans to the end of file, even if that part is
  92. zero-size (when file ends with zeros).
  93. This function is performance critical.
  94. :param input_file: io.File object
  95. :return: iterable of (offset, size)
  96. '''
  97. zero_block = bytearray(tarfile.BLOCKSIZE)
  98. buf = bytearray(BUF_SIZE)
  99. in_data_block = False
  100. data_block_start = 0
  101. buf_start_offset = 0
  102. while True:
  103. buf_len = input_file.readinto(buf)
  104. if not buf_len:
  105. break
  106. for offset in range(0, buf_len, tarfile.BLOCKSIZE):
  107. if buf[offset:offset+tarfile.BLOCKSIZE] == zero_block:
  108. if in_data_block:
  109. in_data_block = False
  110. yield (data_block_start,
  111. buf_start_offset+offset-data_block_start)
  112. else:
  113. if not in_data_block:
  114. in_data_block = True
  115. data_block_start = buf_start_offset+offset
  116. buf_start_offset += buf_len
  117. if in_data_block:
  118. yield (data_block_start, buf_start_offset-data_block_start)
  119. else:
  120. # always emit last slice to the input end - otherwise extracted file
  121. # will be truncated
  122. yield (buf_start_offset, 0)
  123. def copy_sparse_data(input_stream, output_stream, sparse_map):
  124. '''Copy data blocks from input to output according to sparse_map
  125. :param input_stream: io.IOBase input instance
  126. :param output_stream: io.IOBase output instance
  127. :param sparse_map: iterable of (offset, size)
  128. '''
  129. buf = bytearray(BUF_SIZE)
  130. for chunk in sparse_map:
  131. input_stream.seek(chunk[0])
  132. left = chunk[1]
  133. while left:
  134. if left > BUF_SIZE:
  135. read = input_stream.readinto(buf)
  136. output_stream.write(buf[:read])
  137. else:
  138. buf_trailer = input_stream.read(left)
  139. read = len(buf_trailer)
  140. output_stream.write(buf_trailer)
  141. left -= read
  142. if not read:
  143. raise Exception('premature EOF')
  144. def finalize(output):
  145. '''Write EOF blocks'''
  146. output.write(b'\0' * 512)
  147. output.write(b'\0' * 512)
  148. def main(args=None):
  149. parser = argparse.ArgumentParser()
  150. parser.add_argument('--override-name', action='store', dest='override_name',
  151. help='use this name in tar header')
  152. parser.add_argument('--use-compress-program', default=None,
  153. metavar='COMMAND', action='store', dest='use_compress_program',
  154. help='Filter data through COMMAND.')
  155. parser.add_argument('input_file',
  156. help='input file name')
  157. parser.add_argument('output_file', default='-', nargs='?',
  158. help='output file name')
  159. args = parser.parse_args(args)
  160. input_file = io.open(args.input_file, 'rb')
  161. sparse_map = list(get_sparse_map(input_file))
  162. header_name = args.input_file
  163. if args.override_name:
  164. header_name = args.override_name
  165. tar_info = TarSparseInfo(header_name, sparse_map)
  166. if args.output_file == '-':
  167. output = io.open('/dev/stdout', 'wb')
  168. else:
  169. output = io.open(args.output_file, 'wb')
  170. if args.use_compress_program:
  171. compress = subprocess.Popen([args.use_compress_program],
  172. stdin=subprocess.PIPE, stdout=output)
  173. output = compress.stdin
  174. else:
  175. compress = None
  176. output.write(tar_info.tobuf(tarfile.GNU_FORMAT))
  177. copy_sparse_data(input_file, output, sparse_map)
  178. finalize(output)
  179. input_file.close()
  180. output.close()
  181. if compress is not None:
  182. compress.wait()
  183. return compress.returncode
  184. return 0
  185. if __name__ == '__main__':
  186. main()