The goal of the code is simply to one-hot encode (OHE) two columns and write the remaining columns to the new file exactly as they appear in the original. But the Dur column, as shown in the image, is somehow getting corrupted when it is written to the second file and ends up with more content than it should. I didn't want to impose a fixed width on the field, because the original file is very large and probably has lines with both longer and shorter values, which could complicate the analysis later.
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

def opendataset():
    # Read the original netflow file (comma-separated by default).
    file = pd.read_csv('originalfiletest.binetflow')
    return file

def writefile():
    # Write the encoded dataset, keeping the untouched columns as they are.
    df.to_csv('newfiletest.binetflow',
              columns=['Dur', 'Proto', 'State', 'TotBytes',
                       'average_packet_size', 'average_bits_psecond'],
              index=False)

def writebackupproto():
    # Backup mapping between the original Proto values and their encoding.
    df.to_csv('fieldprotobackup.binetflow', columns=['Proto2', 'Proto'], index=False)

def writebackupstate():
    # Backup mapping between the original State values and their encoding.
    df.to_csv('fieldstatebackup.binetflow', columns=['State2', 'State'], index=False)

df = opendataset()

# Keep copies of the original values before they are overwritten.
df['State2'] = df['State']
df['Proto2'] = df['Proto']

le = LabelEncoder()
dfle = df  # note: this is an alias, not a copy, so dfle and df are the same object

# Label-encode State to integers, then one-hot encode State and Proto
# (Proto is encoded directly from strings, which needs scikit-learn >= 0.20).
dfle.State = le.fit_transform(dfle.State)
X = dfle[['State']].values
Y = dfle[['Proto']].values

ohe = OneHotEncoder()
OnehotX = ohe.fit_transform(X).toarray()
OnehotY = ohe.fit_transform(Y).toarray()

dx = pd.DataFrame(data=OnehotX)
dy = pd.DataFrame(data=OnehotY)

# Collapse each one-hot row into a single bit string such as "0100".
dfle['State'] = dx.apply(lambda x: ''.join(x.dropna().astype(int).astype(str)), axis=1)
dfle['Proto'] = dy.apply(lambda y: ''.join(y.dropna().astype(int).astype(str)), axis=1)

writefile()
writebackupproto()
writebackupstate()
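
For comparison, here is a minimal sketch of the same idea (one-hot encode State and Proto, keep the other columns untouched) using pandas.get_dummies instead of the LabelEncoder/OneHotEncoder pair. The file names and the join_bits helper are illustrative assumptions, not part of the original script, and this is not meant as the fix for the Dur issue.

import pandas as pd

# Illustrative file name; replace with the real input.
df = pd.read_csv('originalfiletest.binetflow')

def join_bits(onehot_frame):
    # Collapse a one-hot frame (one column per category) into bit strings like "0100".
    return onehot_frame.astype(int).astype(str).apply(''.join, axis=1)

for col in ['State', 'Proto']:
    df[col + '2'] = df[col]                       # keep the original value as a backup
    df[col] = join_bits(pd.get_dummies(df[col]))  # replace the column with its bit string

df.to_csv('newfiletest.binetflow',
          columns=['Dur', 'Proto', 'State', 'TotBytes',
                   'average_packet_size', 'average_bits_psecond'],
          index=False)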