# Load the CSV, treating the first row as the header and letting Spark infer
# the column types.
df = spark.read.csv('test.csv', header=True, inferSchema=True)

# Randomly split the rows into training and testing sets (weights 0.8 / 0.2).
trainingDF, testingDF = df.randomSplit([0.8, 0.2])
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
# Column names taken from the testing split's schema.
names = testingDF.schema.names

# NOTE(review): x and y are collected to the driver once per column but are
# never used by the regression below; only the values from the last iteration
# survive the loop. Confirm whether this loop is still needed.
for i, name in enumerate(names):
    # y: the current column; x: every other column.
    y = trainingDF.select(name).collect()
    x = trainingDF.select([c for c in trainingDF.columns if c != name]).collect()

# Pack the predictor columns into a single vector column named "features",
# which is the column LinearRegression reads by default.
numericCols = ["age", "race", "dpros", "dcaps", "psa", "vol", "gleason"]
assembler = VectorAssembler(inputCols=numericCols, outputCol="features")  # Step 1
df = assembler.transform(trainingDF)

# LinearRegression's default label column is "label", so rename the target.
df = df.withColumnRenamed("CAPSULE", "label")

lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
lrModel = lr.fit(df)

# LinearRegressionModel has no `rsquared` attribute (accessing it raises
# AttributeError). The R^2 of the fit on the training data is exposed on the
# model's training summary instead.
rsq = lrModel.summary.r2
I am trying to calculate the R-squared value of my model. In the stats library there is a `.rsquared` attribute that gives the R-squared value, but due to project requirements I have to avoid using the stats library.
How can I calculate R-squared in PySpark?
I am getting this error:
AttributeError: 'LinearRegressionModel' object has no attribute 'rsquared'