I have a Python routine which archives file recordings into a GZipped tarball. The output file appears to be far larger than the source files, and I cannot work out why. As an example of the scale of the issue, 6GB of call recordings are generating an archive of 10GB.
There appear to be no errors in the script and the output .gz file is readable and appears OK apart from the huge size.
Excerpt from my script as follows:
# construct tar filename and open file
client_fileid = client_id + "_" + dt.datetime.now().strftime("%Y%m%d_%H%M%S")
tarname = tar_path + "/" + client_fileid + ".tar.gz"
print "Opening tar file %s " % (tarname), "\n"
try:
tar = tarfile.open (tarname, "w:gz")
except:
print "Error opening tar file: %s" % sys.exc_info()[0]
sql="""SELECT number, er.id, e.id, flow, filename, filesize, unread, er.cr_date, callerid,
length, callid, info, party FROM extension_recording er, extension e, client c
WHERE er.extension_id = e.id AND e.client_id = c.id AND c.parent_client_id = %s
AND DATE(er.cr_date) BETWEEN '%s' AND '%s'""" % (client_id, start_date, end_date)
rows = cur.execute(sql)
recordings = cur.fetchall()
if rows == 0: sys.exit("No recordings for selected date range - exiting")
for recording in recordings: # loop through recordings cursor
try:
ext_len = len(str(recording[0]))
# add preceding zeroes if the ext no starts with 0 or 00
if ext_len == 2: extension_no = "0" + str(recording[0])
elif ext_len == 1: extension_no = "00" + str(recording[0])
else: extension_no = str(recording[0])
filename = recording[4]
extended_no = client_id + "*%s" % (extension_no)
sourcedir = recording_path + "/" + extended_no
tardir = extended_no + "/" + filename
complete_name = sourcedir + "/" + filename
tar.add(complete_name, arcname=tardir) # add to tar archive
except:
print "Error '%s' writing to tar file %s" % (sys.exc_info()[1], csvfullfilename)