
I use the dataset available at this link and machine learning algorithms to classify 75 network traffic classes based on 87 features (columns). The dataset consists of 3,577,296 instances (rows).

First I index the label, standardize the columns that have continuous values, apply feature selection, and then use ML algorithms for classification: Logistic Regression, Random Forest, Decision Tree & Naive Bayes.

All the algorithms result in low accuracy (0.59 using the Decision Tree and 0.005 using Naive Bayes). What could be the reason behind these low accuracies?

Please, I need help. There is no reason to vote this down. Thanks.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import pandas as pd
spark = SparkSession \
    .builder.config("spark.driver.memory", "15g") \
    .appName('SDN Data') \
    .getOrCreate()

df = (spark.read.format("csv")
          .option('header', 'true')
          .option("inferSchema", "true")
          .load("D:/PHD Project/Paper_3/Datasets_Download/IP Network Traffic Flows Labeled with 75 Apps/Dataset-Unicauca-Version2-87Atts.csv"))

from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol = "L7Protocol", outputCol = "label")
df_1 = indexer.fit(df).transform(df)
df_1 = df_1.drop('ProtocolName')
df_1.printSchema()
allFeatures = df_1.columns  # note: this list still includes the indexed "label" column

from pyspark.ml.feature import UnivariateFeatureSelector
from pyspark.ml.feature import VectorAssembler
import numpy as np
import pandas as pd

vec_assembler = VectorAssembler(inputCols=allFeatures, outputCol="features", handleInvalid="skip")


vec_df = vec_assembler.transform(df_1)

from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=False)

scalerModel = scaler.fit(vec_df)


scaledData = scalerModel.transform(vec_df)

selector = UnivariateFeatureSelector(featuresCol="scaledFeatures", outputCol="selectedFeatures", labelCol="label")

# With the default selectionMode ("numTopFeatures"), a threshold of 20 keeps the 20 best-scoring features
selector.setFeatureType("continuous").setLabelType("categorical").setSelectionThreshold(20)


model = selector.fit(scaledData)


model.getFeaturesCol()

selectedFeatures=model.selectedFeatures

selectedFeatures_Sama=np.array(df_1.columns)[model.selectedFeatures]
np.array(df_1.columns)[selectedFeatures]

from pyspark.ml.feature import VectorAssembler

vectorAssembler = VectorAssembler(inputCols=selectedFeatures_Sama, outputCol="features", handleInvalid="skip")

# Create stages list
MyStages=[indexer,vectorAssembler]

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=MyStages)

pModel = pipeline.fit(df)

df_2 = pModel.transform(df)

trainDF, testDF = df_2.randomSplit([0.8, 0.2])

from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(featuresCol='features', labelCol='label', maxDepth=10)

dtModel = dt.fit(trainDF)

predictions = dtModel.transform(testDF)

predictions.select("probability", "label", "prediction").show()

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)
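
Note that MulticlassClassificationEvaluator uses f1 as its default metric, not accuracy. A minimal sketch for reporting accuracy explicitly (the evaluator_acc name is mine):

evaluator_acc = MulticlassClassificationEvaluator(
    predictionCol="prediction", labelCol="label", metricName="accuracy")
print(evaluator_acc.evaluate(predictions))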

1 Answer


You don't have an algorithm in your pipeline. You are only fitting your pipeline with an indexer and a VectorAssembler:

# Create stages list
MyStages=[indexer,vectorAssembler]

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=MyStages)

pModel = pipeline.fit(df)

df_2 = pModel.transform(df)
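
To fix it, append the classifier as the final stage. A minimal sketch, reusing the indexer and vectorAssembler defined in the question (splitting the raw df before fitting, and the seed value, are my choices, not taken from the question):

from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(featuresCol="features", labelCol="label", maxDepth=10)

# The classifier must be a stage; otherwise the pipeline only prepares features
pipeline = Pipeline(stages=[indexer, vectorAssembler, dt])

# Fit every stage on the training split only, then score the held-out split
trainDF, testDF = df.randomSplit([0.8, 0.2], seed=42)
pModel = pipeline.fit(trainDF)
predictions = pModel.transform(testDF)

Fitting the whole pipeline on trainDF alone also keeps the fitted stages (here the StringIndexer) from seeing test rows.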
JAdel