# Sample rows of (name, age); the ages are 1 through 5.
l = [
    ('Alice2', 1),
    ('Alice3', 2),
    ('Alice3', 3),
    ('Alice1', 4),
    ('Alice3', 5),
]
# NOTE(review): `spark` is not defined in this snippet -- presumably an
# existing SparkSession created elsewhere; confirm before running.
df = spark.createDataFrame(l, ['name', 'age'])
df.show()
# Ask for the 0.5 quantile of "age". The third argument is the relative
# error passed to approxQuantile (0 here).
# print(...) with a single argument works on both Python 2 and Python 3;
# the original `print expr` statement is a SyntaxError on Python 3.
print(df.approxQuantile("age", [0.5], 0))
The result is [4.0]. Why is it not [3.0]? A quantile of 0.5 should be the median.
# Sample rows of (name, age); the ages are 1 through 5.
l = [
    ('Alice2', 1),
    ('Alice3', 2),
    ('Alice3', 3),
    ('Alice1', 4),
    ('Alice3', 5),
]
# NOTE(review): `spark` is not defined in this snippet -- presumably an
# existing SparkSession created elsewhere; confirm before running.
df = spark.createDataFrame(l, ['name', 'age'])
df.show()
# Ask for the 0.5 quantile of "age". The third argument is the relative
# error passed to approxQuantile (0 here).
# print(...) with a single argument works on both Python 2 and Python 3;
# the original `print expr` statement is a SyntaxError on Python 3.
print(df.approxQuantile("age", [0.5], 0))
The result is [4.0]. Why is it not [3.0]? A quantile of 0.5 should be the median.