OK - first attempt at posting to Stack Overflow... but I've searched everywhere, cannot find an answer, am sure I'm on the wrong track, and feel this is a question with general applicability, so here goes.
I deal with a lot of different instruments with a lot of different file types - but they all generally use a record-based protocol for storing data. That is, each file contains a sequence of records, each record with a record-type header that includes a synchronization pattern, what kind of record it is, and the size of the record. I want to build a generic class for reading record-based files that I can inherit from and customize for each file type I deal with. Files are usually very large (1GB), so efficiency is key - which also means I'm probably out of my league.
Here is my class. I'm using file position markers to jump all around in the file, and am concerned that I'm building a very slow class - which is a big deal given the size and quantity of data. Wrong track?
class RecordBasedFile(object):
    """Iterate over the records of a sync-delimited binary file.

    The file is treated as a sequence of records, each starting with
    ``syncpattern``.  Header fields are located by byte offsets measured
    from the first byte of that pattern.  Subclasses adapt the reader to
    a concrete file format by overriding the class attributes below.

    Iterating yields ``(record_id, data)`` tuples.  ``data`` holds the
    record's bytes when ``record_id`` is listed in ``self.recordList``;
    otherwise the record is skipped without being read and ``data`` is
    ``False``.

    The file handle must be opened in binary mode and support
    ``read``, ``seek`` and ``tell``.
    """

    # Byte sequence marking the start of every record.  A bytes literal so
    # that it compares equal to binary reads on Python 3 (on Python 2 it is
    # the same object type as the plain string it replaces).
    syncpattern = b'\xff\xff\x00\x00'
    # Number of bytes read per chunk while scanning for ``syncpattern``.
    maxrecsize = 1024 * 64
    # Offset from the sync pattern to the little-endian uint32 record ID.
    recordIDfromSync = 28
    # Offset from the sync pattern to the little-endian uint32 record size.
    recordSizefromSync = 4
    # Offset from the sync pattern to the first byte returned by record().
    # NOTE(review): default 0 assumes the size field counts the whole
    # record starting at the sync pattern - override per file format.
    recordDatafromSync = 0
    # Bytes between the end of a record's data and the next sync pattern.
    # NOTE(review): default 0 assumes records are packed back to back.
    endofRecordtoSync = 0

    def __init__(self, fileHandle):
        """Wrap ``fileHandle`` and advance it to the first sync pattern."""
        self.fh = fileHandle
        # Record IDs whose data should be returned while iterating.
        self.recordList = []
        self.sync()

    def sync(self):
        """Seek forward to the next occurrence of ``syncpattern``.

        The search starts at the current position and proceeds in chunks
        of ``maxrecsize`` bytes until the pattern is found or the file is
        exhausted.

        Returns:
            True with the file positioned on the first byte of the
            pattern, or False (file positioned at EOF) if none was found.
        """
        pattern = self.syncpattern
        overlap = len(pattern) - 1
        base = self.fh.tell()  # absolute position of the next chunk
        tail = b''             # trailing bytes carried over from the last chunk
        while True:
            chunk = self.fh.read(self.maxrecsize)
            if not chunk:
                return False
            window = tail + chunk
            hit = window.find(pattern)
            if hit >= 0:
                # ``window`` begins len(tail) bytes before ``base``.
                self.fh.seek(base - len(tail) + hit)
                return True
            base += len(chunk)
            # Keep enough bytes to detect a pattern split across chunks.
            tail = window[-overlap:] if overlap else b''

    def checkSync(self):
        """Return True if ``syncpattern`` starts at the current position.

        The file position is left unchanged, including at end of file
        where fewer bytes than the pattern length can be read.
        """
        pos = self.fh.tell()
        matched = self.fh.read(len(self.syncpattern)) == self.syncpattern
        self.fh.seek(pos)
        return matched

    def recordID(self):
        """Return the uint32 ID of the record at the current position."""
        return self.getField('<I', self.recordIDfromSync)

    def recordSize(self):
        """Return the uint32 size field of the record at the current position."""
        return self.getField('<I', self.recordSizefromSync)

    def record(self, yieldRecord=True):
        """Consume the record at the current position.

        Args:
            yieldRecord: when true, read and return the record's data;
                when false, seek past the data without reading it.

        Returns:
            The data bytes, or False when ``yieldRecord`` is false.  The
            file is left ``endofRecordtoSync`` bytes past the end of the
            data, i.e. where the next sync pattern is expected.
        """
        start = self.fh.tell()
        size = self.recordSize()
        dataStart = start + self.recordDatafromSync
        if yieldRecord:
            self.fh.seek(dataStart)
            output = self.fh.read(size)
        else:
            self.fh.seek(dataStart + size)
            output = False
        self.fh.seek(self.endofRecordtoSync, 1)
        return output

    def fmtLength(self, Fmt):
        """Return the number of bytes described by the struct format ``Fmt``."""
        import struct  # local import: no module-level import block is visible

        return struct.calcsize(Fmt)

    def getField(self, fmt, offset):
        """Read one field relative to the current position without moving it.

        Args:
            fmt: ``struct`` format string describing the field.
            offset: byte offset of the field from the current position.

        Returns:
            The first value unpacked from the field.

        Raises:
            struct.error: if the file ends before the field is complete.
        """
        import struct  # local import: no module-level import block is visible

        pos = self.fh.tell()
        self.fh.seek(pos + offset)
        raw = self.fh.read(self.fmtLength(fmt))
        # Restore the position before unpacking so a short read at EOF
        # does not leave the file handle displaced.
        self.fh.seek(pos)
        return struct.unpack(fmt, raw)[0]

    def __iter__(self):
        """Return self; the reader is its own iterator."""
        return self

    def next(self):
        """Return ``(record_id, data)`` for the next record.

        ``data`` is the record's bytes if ``record_id`` is in
        ``self.recordList``, else False.  If the position after the
        record is not on a sync pattern, the reader resynchronises by
        searching forward.

        Raises:
            StopIteration: when the file ends before a complete header
                can be read.
        """
        import struct  # local import: no module-level import block is visible

        start = self.fh.tell()
        try:
            rid = self.recordID()
            record = self.record(yieldRecord=rid in self.recordList)
        except struct.error:
            # Header truncated by end of file: no more records.
            raise StopIteration
        if self.fh.tell() <= start:
            # A zero-length (or backwards) record would be returned
            # forever; step past its sync pattern and search onwards.
            self.fh.seek(start + 1)
        if not self.checkSync():
            self.sync()
        return rid, record

    # Python 3 spelling of the iterator protocol.
    __next__ = next