In a remote server, due to some limitations, I have generated tarfiles split by 2000 MB using the command as stated here:
tar -cvzf - tdd*20210914*.csv | split -b 2000M - archives/20210914.tar.gz.part
Now, I have a list of files: [20210914.tar.gz.partaa, 20210914.tar.gz.partab, 20210914.tar.gz.partac]
, and need to extract all the partfiles in a windows machine, using python.
Using tar.extractall()
:
def extract(infile : str, path : str):
tar = tarfile.open(infile, "r:gz")
tar.extractall(path = path)
tar.close()
extract("20210914.tar.gz.partaa", path = "tmp") # where file is first file
However, I am getting EOFError: Compressed file ended before the end-of-stream marker was reached
which is expected as (I suppose) there are two more files that need to be extracted.
My question: how to modify the function to read all files, and extract them in the same directory?
I've tried to directly pass the second file into the function, but the following error is raised:
OSError Traceback (most recent call last)
~\.conda\envs\python37\lib\tarfile.py in gzopen(cls, name, mode, fileobj, compresslevel, **kwargs)
1643 try:
-> 1644 t = cls.taropen(name, mode, fileobj, **kwargs)
1645 except OSError:
~\.conda\envs\python37\lib\tarfile.py in taropen(cls, name, mode, fileobj, **kwargs)
1620 raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
-> 1621 return cls(name, mode, fileobj, **kwargs)
1622
~\.conda\envs\python37\lib\tarfile.py in __init__(self, name, mode, fileobj, format, tarinfo, dereference, ignore_zeros, encoding, errors, pax_headers, debug, errorlevel, copybufsize)
1483 self.firstmember = None
-> 1484 self.firstmember = self.next()
1485
~\.conda\envs\python37\lib\tarfile.py in next(self)
2286 try:
-> 2287 tarinfo = self.tarinfo.fromtarfile(self)
2288 except EOFHeaderError as e:
~\.conda\envs\python37\lib\tarfile.py in fromtarfile(cls, tarfile)
1093
-> 1094 buf = tarfile.fileobj.read(BLOCKSIZE)
1095 obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
~\.conda\envs\python37\lib\gzip.py in read(self, size)
286 raise OSError(errno.EBADF, "read() on write-only GzipFile object")
--> 287 return self._buffer.read(size)
288
~\.conda\envs\python37\lib\_compression.py in readinto(self, b)
67 with memoryview(b) as view, view.cast("B") as byte_view:
---> 68 data = self.read(len(byte_view))
69 byte_view[:len(data)] = data
~\.conda\envs\python37\lib\gzip.py in read(self, size)
473 self._init_read()
--> 474 if not self._read_gzip_header():
475 self._size = self._pos
~\.conda\envs\python37\lib\gzip.py in _read_gzip_header(self)
421 if magic != b'\037\213':
--> 422 raise OSError('Not a gzipped file (%r)' % magic)
423
OSError: Not a gzipped file (b'|\x19')
During handling of the above exception, another exception occurred:
ReadError Traceback (most recent call last)
<ipython-input-77-29d5169be949> in <module>
----> 1 extract("20210914.tar.gz.partab", path = "tmp") # where file is first file
<ipython-input-75-60cd4e78bf4e> in extract(infile, path, chunk, **kwargs)
1 def extract(infile : str, path : str, chunk : int = 2000, **kwargs):
----> 2 tar = tarfile.open(infile, "r:gz")
3 tar.extractall(path = path)
4 tar.close()
~\.conda\envs\python37\lib\tarfile.py in open(cls, name, mode, fileobj, bufsize, **kwargs)
1589 else:
1590 raise CompressionError("unknown compression type %r" % comptype)
-> 1591 return func(name, filemode, fileobj, **kwargs)
1592
1593 elif "|" in mode:
~\.conda\envs\python37\lib\tarfile.py in gzopen(cls, name, mode, fileobj, compresslevel, **kwargs)
1646 fileobj.close()
1647 if mode == 'r':
-> 1648 raise ReadError("not a gzip file")
1649 raise
1650 except:
ReadError: not a gzip file