I am using the dataset available at this link together with machine learning algorithms to classify network traffic into 75 classes based on 87 features (columns). The dataset consists of 3,577,296 instances (rows).
First I index the label, standardize the columns with continuous values, and apply feature selection; then I use ML algorithms for classification: Logistic Regression, Random Forest, Decision Tree, and Naive Bayes.
All the algorithms result in low accuracy (0.59 with Decision Tree and 0.005 with Naive Bayes). What could be the reason behind this low accuracy?
I would appreciate any help. Thanks.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
spark = (SparkSession.builder
         .config("spark.driver.memory", "15g")
         .appName("SDN Data")
         .getOrCreate())
df = (spark.read.format("csv")
      .option("header", "true")
      .option("inferSchema", "true")
      .load("D:/PHD Project/Paper_3/Datasets_Download/IP Network Traffic Flows Labeled with 75 Apps/Dataset-Unicauca-Version2-87Atts.csv"))
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol = "L7Protocol", outputCol = "label")
df_1 = indexer.fit(df).transform(df)
df_1 = df_1.drop('ProtocolName')
df_1.printSchema()
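As a sanity check worth adding here (it was not in my original run): with 75 classes, a heavy skew toward a few applications would by itself cap the accuracy on the minority classes, so the label distribution is worth printing:

# Inspect how balanced the 75 classes are after indexing
df_1.groupBy("label").count().orderBy("count", ascending=False).show(75, truncate=False)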
# Keep only numeric columns, and exclude both the indexed label and the raw
# L7Protocol column it was derived from; including either would leak the
# target into the features, and string columns would break VectorAssembler
allFeatures = [c for (c, t) in df_1.dtypes
               if t in ("int", "bigint", "float", "double")
               and c not in ("label", "L7Protocol")]
from pyspark.ml.feature import UnivariateFeatureSelector
from pyspark.ml.feature import VectorAssembler
import numpy as np
import pandas as pd
vec_assembler = VectorAssembler(inputCols=allFeatures,
                                outputCol="features", handleInvalid="skip")
vec_df = vec_assembler.transform(df_1)
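Because handleInvalid="skip" silently drops any row containing a null or NaN, it is worth checking how many of the 3,577,296 rows actually survive the assembly (a quick check, not part of my original run):

# Rows with nulls/NaNs are dropped silently by handleInvalid="skip";
# a large drop here would mean the models train on far less data than expected
print("rows after assembling:", vec_df.count())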
from pyspark.ml.feature import StandardScaler
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
withStd=True, withMean=False)
scalerModel = scaler.fit(vec_df)
scaledData = scalerModel.transform(vec_df)
selector = UnivariateFeatureSelector(featuresCol="scaledFeatures",
                                     outputCol="selectedFeatures", labelCol="label")
# With the default selectionMode "numTopFeatures", a threshold of 20 keeps
# the 20 best-scoring features (ANOVA F-test for continuous features with
# a categorical label); the original line break here was a syntax error
selector.setFeatureType("continuous").setLabelType("categorical") \
    .setSelectionThreshold(20)
model = selector.fit(scaledData)
selectedFeatures = model.selectedFeatures
# Map the selected indices back to column names; the indices refer to the
# assembler's input list (allFeatures), so that same list must be used here
selectedFeatures_Sama = np.array(allFeatures)[selectedFeatures].tolist()
print(selectedFeatures_Sama)
vectorAssembler = VectorAssembler(inputCols=selectedFeatures_Sama,
                                  outputCol="features", handleInvalid="skip")
# Create the stages list; note the scaler is not part of this final
# pipeline, so the model trains on the unscaled selected columns
MyStages = [indexer, vectorAssembler]
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=MyStages)
pModel = pipeline.fit(df)
df_2 = pModel.transform(df)
trainDF, testDF = df_2.randomSplit([0.8, 0.2], seed=42)  # fixed seed for reproducibility
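Note that the indexer, scaler, and selector above were all fit on the full dataset before this split, so the test rows influenced the preprocessing. A leakage-free variant (a sketch reusing the same pipeline object; seed=42 is arbitrary) would split first and fit on the training portion only:

# Split the raw data first, then fit all preprocessing on the training
# rows only, so no test information leaks into the fitted stages
trainRaw, testRaw = df.randomSplit([0.8, 0.2], seed=42)
pModel_clean = pipeline.fit(trainRaw)
trainDF_clean = pModel_clean.transform(trainRaw)
testDF_clean = pModel_clean.transform(testRaw)

For a fully clean setup, the feature selection itself would also need to be redone on the training split.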
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(featuresCol="features", labelCol="label",
                            maxDepth=10)
dtModel = dt.fit(trainDF)
predictions = dtModel.transform(testDF)
predictions.select("probability", "label", "prediction").show(5, truncate=False)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# The evaluator's default metric is "f1", not accuracy; request accuracy explicitly
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                              labelCol="label",
                                              metricName="accuracy")
evaluator.evaluate(predictions)
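The numbers quoted above are easier to interpret when several metrics are printed side by side; a small follow-up using the same evaluator:

# Compare several multiclass metrics on the same predictions; a large gap
# between accuracy and f1 usually points at class imbalance
for metric in ["accuracy", "f1", "weightedPrecision", "weightedRecall"]:
    print(metric, evaluator.setMetricName(metric).evaluate(predictions))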