I'm adapting this code from this thread: Reading huge sas dataset in python
When I test-run it with my own dataset:
import pandas as pd
import pyreadstat

filename = 'df_53.SAS7BDAT'
CHUNKSIZE = 50000
offset = 0

# Pick the right reader function and keep it in the variable getChunk
if filename.lower().endswith('sas7bdat'):
    getChunk = pyreadstat.read_sas7bdat
else:
    getChunk = pyreadstat.read_xport

# Read the first chunk
allChunk, _ = getChunk(row['/data/research1/test/cc/53/'], row_limit=CHUNKSIZE, row_offset=offset)  # <-- raises NameError: name 'row' is not defined
allChunk = allChunk.astype('category')

while True:
    offset += CHUNKSIZE
    # for xpt data, use pyreadstat.read_xport()
    chunk, _ = pyreadstat.read_sas7bdat(filename, row_limit=CHUNKSIZE, row_offset=offset)
    if chunk.empty:
        break  # an empty chunk means the entire file has been read
    chunk = chunk.astype('category')  # chunk must be categorical before union_categoricals
    for eachCol in chunk:  # re-align each column's categories across both frames
        colUnion = pd.api.types.union_categoricals([allChunk[eachCol], chunk[eachCol]])
        allChunk[eachCol] = pd.Categorical(allChunk[eachCol], categories=colUnion.categories)
        chunk[eachCol] = pd.Categorical(chunk[eachCol], categories=colUnion.categories)
    allChunk = pd.concat([allChunk, chunk])  # append each chunk to the resulting dataframe
I received an error that says:
name 'row' is not defined
Does anyone have any idea how to fix the error?
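For context on the categorical bookkeeping in the loop above, here is a minimal, self-contained sketch (toy data, unrelated to the SAS file) of why both frames are re-encoded on the union of their categories before each concat: concatenating categoricals whose category sets differ silently falls back to object dtype.

import pandas as pd
from pandas.api.types import union_categoricals

# two toy "chunks" whose category sets differ
a = pd.Series(['x', 'y'], dtype='category')
b = pd.Series(['y', 'z'], dtype='category')

# naive concat degrades to object dtype because the categories differ
print(pd.concat([a, b]).dtype)  # object

# re-encode both sides on the union of their categories first
cats = union_categoricals([a, b]).categories
a = pd.Series(pd.Categorical(a, categories=cats))
b = pd.Series(pd.Categorical(b, categories=cats))
print(pd.concat([a, b]).dtype)  # category

This is the same pattern the loop applies column by column before appending each chunk.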