tarwriter.py 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180
  1. #
  2. # The Qubes OS Project, http://www.qubes-os.org
  3. #
  4. # Copyright (C) 2016 Marek Marczykowski-Górecki
  5. # <marmarek@invisiblethingslab.com>
  6. #
  7. # This program is free software; you can redistribute it and/or modify
  8. # it under the terms of the GNU General Public License as published by
  9. # the Free Software Foundation; either version 2 of the License, or
  10. # (at your option) any later version.
  11. #
  12. # This program is distributed in the hope that it will be useful,
  13. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. # GNU General Public License for more details.
  16. #
  17. # You should have received a copy of the GNU General Public License along
  18. # with this program; if not, write to the Free Software Foundation, Inc.,
  19. # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  20. import argparse
  21. import functools
  22. import os
  23. import subprocess
  24. import tarfile
  25. import io
  26. BUF_SIZE = 409600
  27. class TarSparseInfo(tarfile.TarInfo):
  28. def __init__(self, name="", sparsemap=None):
  29. super(TarSparseInfo, self).__init__(name)
  30. if sparsemap is not None:
  31. self.type = tarfile.REGTYPE
  32. self.sparsemap = sparsemap
  33. self.sparsemap_buf = self.format_sparse_map()
  34. # compact size
  35. self.size = functools.reduce(lambda x, y: x+y[1], sparsemap,
  36. 0) + len(self.sparsemap_buf)
  37. self.pax_headers['GNU.sparse.major'] = '1'
  38. self.pax_headers['GNU.sparse.minor'] = '0'
  39. self.pax_headers['GNU.sparse.name'] = name
  40. self.pax_headers['GNU.sparse.realsize'] = str(self.realsize)
  41. self.name = '{}/GNUSparseFile.{}/{}'.format(
  42. os.path.dirname(name), os.getpid(), os.path.basename(name))
  43. else:
  44. self.sparsemap = []
  45. self.sparsemap_buf = b''
  46. @property
  47. def realsize(self):
  48. if self.sparsemap:
  49. return self.sparsemap[-1][0] + self.sparsemap[-1][1]
  50. return self.size
  51. def format_sparse_map(self):
  52. sparsemap_txt = (str(len(self.sparsemap)) + '\n' +
  53. ''.join('{}\n{}\n'.format(*entry) for entry in self.sparsemap))
  54. sparsemap_txt_len = len(sparsemap_txt)
  55. if sparsemap_txt_len % tarfile.BLOCKSIZE:
  56. padding = '\0' * (tarfile.BLOCKSIZE -
  57. sparsemap_txt_len % tarfile.BLOCKSIZE)
  58. else:
  59. padding = ''
  60. return (sparsemap_txt + padding).encode()
  61. def tobuf(self, format=tarfile.PAX_FORMAT, encoding=tarfile.ENCODING,
  62. errors="strict"):
  63. # pylint: disable=redefined-builtin
  64. header_buf = super(TarSparseInfo, self).tobuf(format, encoding, errors)
  65. return header_buf + self.sparsemap_buf
  66. def get_sparse_map(input_file):
  67. '''
  68. Return map of the file where actual data is present, ignoring zero-ed
  69. blocks. Last entry of the map spans to the end of file, even if that part is
  70. zero-size (when file ends with zeros).
  71. This function is performance critical.
  72. :param input_file: io.File object
  73. :return: iterable of (offset, size)
  74. '''
  75. zero_block = bytearray(tarfile.BLOCKSIZE)
  76. buf = bytearray(BUF_SIZE)
  77. in_data_block = False
  78. data_block_start = 0
  79. buf_start_offset = 0
  80. while True:
  81. buf_len = input_file.readinto(buf)
  82. if not buf_len:
  83. break
  84. for offset in range(0, buf_len, tarfile.BLOCKSIZE):
  85. if buf[offset:offset+tarfile.BLOCKSIZE] == zero_block:
  86. if in_data_block:
  87. in_data_block = False
  88. yield (data_block_start,
  89. buf_start_offset+offset-data_block_start)
  90. else:
  91. if not in_data_block:
  92. in_data_block = True
  93. data_block_start = buf_start_offset+offset
  94. buf_start_offset += buf_len
  95. if in_data_block:
  96. yield (data_block_start, buf_start_offset-data_block_start)
  97. else:
  98. # always emit last slice to the input end - otherwise extracted file
  99. # will be truncated
  100. yield (buf_start_offset, 0)
  101. def copy_sparse_data(input_stream, output_stream, sparse_map):
  102. '''Copy data blocks from input to output according to sparse_map
  103. :param input_stream: io.IOBase input instance
  104. :param output_stream: io.IOBase output instance
  105. :param sparse_map: iterable of (offset, size)
  106. '''
  107. buf = bytearray(BUF_SIZE)
  108. for chunk in sparse_map:
  109. input_stream.seek(chunk[0])
  110. left = chunk[1]
  111. while left:
  112. if left > BUF_SIZE:
  113. read = input_stream.readinto(buf)
  114. output_stream.write(buf[:read])
  115. else:
  116. buf_trailer = input_stream.read(left)
  117. read = len(buf_trailer)
  118. output_stream.write(buf_trailer)
  119. left -= read
  120. if not read:
  121. raise Exception('premature EOF')
  122. def finalize(output):
  123. '''Write EOF blocks'''
  124. output.write(b'\0' * 512)
  125. output.write(b'\0' * 512)
  126. def main(args=None):
  127. parser = argparse.ArgumentParser()
  128. parser.add_argument('--override-name', action='store', dest='override_name',
  129. help='use this name in tar header')
  130. parser.add_argument('--use-compress-program', default=None,
  131. metavar='COMMAND', action='store', dest='use_compress_program',
  132. help='Filter data through COMMAND.')
  133. parser.add_argument('input_file',
  134. help='input file name')
  135. parser.add_argument('output_file', default='-', nargs='?',
  136. help='output file name')
  137. args = parser.parse_args(args)
  138. input_file = io.open(args.input_file, 'rb')
  139. sparse_map = list(get_sparse_map(input_file))
  140. header_name = args.input_file
  141. if args.override_name:
  142. header_name = args.override_name
  143. tar_info = TarSparseInfo(header_name, sparse_map)
  144. if args.output_file == '-':
  145. output = io.open('/dev/stdout', 'wb')
  146. else:
  147. output = io.open(args.output_file, 'wb')
  148. if args.use_compress_program:
  149. compress = subprocess.Popen([args.use_compress_program],
  150. stdin=subprocess.PIPE, stdout=output)
  151. output = compress.stdin
  152. else:
  153. compress = None
  154. output.write(tar_info.tobuf(tarfile.PAX_FORMAT))
  155. copy_sparse_data(input_file, output, sparse_map)
  156. finalize(output)
  157. input_file.close()
  158. output.close()
  159. if compress is not None:
  160. compress.wait()
  161. return compress.returncode
  162. return 0
  163. if __name__ == '__main__':
  164. main()