Creating clusters from 3D data through HDBSCAN

Question

I have a problem, I have big data set of 15000 points, those points represent the airplanes over Europe and I have latitudes, longitudes and altitudes. I am trying to create program that will take points from specific country and then create clusters of those data and plot them, and eventually the last function will be to create new points in those clusters. I am already in the phase in which I create clusters and plot them.

The problem is that whenever I create clusters I creates kind of layer based cluster, which means that even through the points is horizontally very far from his cluster vertically it is very close if not the same height, and the program just puts these points in one layer in one cluster. I have points with altitude from 0 to 12000 meters and it just creates few cluster made as a layer and puts all the points in that layer into that cluster no matter how far are the horizontally. I would like to create more spherical clusters. I dont know if you can see it well but there are just layers of cluster through whole horizontal area. This is my code

import easygui
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.path import Path
from matplotlib.patches import PathPatch
from shapely.geometry import Point, Polygon
import matplotlib.path as mplPath
import plotly.graph_objs as go
from plotly.offline import plot
from plotly.subplots import make_subplots
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
import hdbscan
from sklearn.mixture import GaussianMixture

# otevírání excelu
#path = easygui.fileopenbox()
df = pd.read_excel("1finalnidata.xlsx", sheet_name='50vzorek', usecols=["time", "icao24", "lat", "lon", "geoaltitude"],na_values=["NULL"])

# ukládání hodnot  excelu
k2 = df["time"].values
m2 = df["icao24"].values
n2 = df["lat"].values
b2 = df["lon"].values
z2 = df["geoaltitude"].values

# načítání excelu s daty prostorů
#path = easygui.fileopenbox()
df = pd.read_excel("FinalniProstoryPYT.xlsx", sheet_name='Sheet1', usecols=["aid", "lat", "lon", "seznam"])

id = df["aid"].values
lat = df["lat"].values
lon = df["lon"].values
sez = df["seznam"].values
sez = sez[:38]

# prázdné proměnné s x,y souřadnicemi prostoru
airspaceLON = []
airspaceLAT = []

check = None
poly = []
b = 0
planedatID = []
planedatAS = []
soux = []
souy = []
souz = []
print("3")

# algoritmus pro projíždění jednotlivých prostorů a určování ze kterého prostoru bod je
for i in range(0, len(sez)):
    check = sez[i]
    for j in range(0, len(id)):
        if id[j] == check:
            airspaceLON.append(lon[j])
            airspaceLAT.append(lat[j])

        if j == 8861:
            poly = list(zip(airspaceLAT, airspaceLON))
            poly_path = mplPath.Path(np.array(poly))

            for x in range(0, len(n2)):
                point = (n2[x], b2[x])
                if poly_path.contains_point(point):
                    planedatID.append(m2[x])
                    planedatAS.append(check)
                    soux.append(n2[x])
                    souy.append(b2[x])
                    souz.append(z2[x])

                else:
                    poly.clear()
                    airspaceLON.clear()
                    airspaceLAT.clear()
                poly.clear()
                airspaceLON.clear()
                airspaceLAT.clear()

matice5 = np.column_stack((planedatID, planedatAS))
souPuvBod = list(zip(souy, soux))


polyg1 = []
polyg2 = []

for j in range(0, len(id)):
    if id[j] == "FRA":
        polyg1.append(lon[j])
        polyg2.append(lat[j])

poly1 = list(zip(polyg1, polyg2))

pointsINpoly=[]

polygon = Polygon(np.array(poly1))

poly_path1 = mplPath.Path(np.array(poly1))

bod1=[]
bod2=[]
bod3=[]

for x in range(0, len(souPuvBod)):
    point = (souy[x], soux[x])
    if poly_path1.contains_point(point):
        bod1.append(souy[x])
        bod2.append(soux[x])
        bod3.append(souz[x])

data = np.column_stack((bod1, bod2, bod3))

# run HDBSCAN clustering algorithm
clusterer = hdbscan.HDBSCAN(min_cluster_size=40, min_samples=5, metric='euclidean')
labels = clusterer.fit_predict(np.column_stack((bod1, bod2, bod3)))

# get the number of clusters
num_clusters = len(set(labels)) - (1 if -1 in labels else 0)

# create a 3D scatter plot
fig = make_subplots(rows=1, cols=1, specs=[[{'type':'scatter3d'}]])

# create a list of colors for each label
colors = ['rgb({},{},{})'.format(int(255/num_clusters*i), 0, int(255/num_clusters*(num_clusters-i-1))) for i in range(num_clusters)]

# create a scatter plot for each label
for i in range(num_clusters):
    x = []
    y = []
    z = []
    for j in range(len(bod1)):
        if labels[j] == i:
            x.append(bod1[j])
            y.append(bod2[j])
            z.append(bod3[j])
    fig.add_trace(go.Scatter3d(x=x, y=y, z=z, mode='markers', marker=dict(color=colors[i], size=3)),row=1, col=1)

# set the layout
fig.update_layout(title='Data for Specific Polygon in 3D', scene=dict(xaxis=dict(title='Longitude'), yaxis=dict(title='Latitude'), zaxis=dict(title='Altitude')))
plot(fig, filename='3d_scatter_with_polygon.html')  # save the plot as an HTML file

I was trying to change the parameters of the clustering, but everytime It creates clusters in these layers. Is there something I could do, like some other parameters or other clustering method ? I would just have to say that I would like to cluster more vertically, not just these horizontal layers.

Creating clusters from 3D data through HDBSCAN

0 Answers0