I need to create a graph like this which have two relationships, continent-country, country-city. I have 3 columns: city, country, continent, but not sure how to get it into this graph.
Below is an example of another graph with only two columns, country & city. metro_name
is the city.
metro = spark.read.csv("metro.csv", header='true').withColumnRenamed("name","metro_name")
country = spark.read.csv("country.csv", header='true').withColumnRenamed("name","country_name")
continent = spark.read.csv("continent.csv", header='true').withColumnRenamed("name","continent_name")
metro_country = spark.read.csv("metro_country.csv", header='true')
country_continent = spark.read.csv("country_continent.csv", header='true')
mc_vertices = country.select(col("country_name").alias("id"))
mc_edges = country.join(metro_country, country.country_id == metro_country.country_id).join(metro, metro_country.metro_id == metro.metro_id).select(col("country_name").alias("src"),col("metro_name").alias("dst"))
mc = GraphFrame(mc_vertices, mc_edges)
# display graph
import networkx as nx
mc_gp = nx.from_pandas_edgelist(mc.edges.toPandas(),'src','dst')
nx.draw(mc_gp, with_labels = True, node_size = 12, font_size = 12, edge_color = "red")
I have tried:
# gets graph that has country, city, continent
mcc_vertices = mc_vertices
mcc_edges = mc_edges.join(cc_edges, mc_edges.src == cc_edges.src).select(mc_edges["src"],mc_edges["dst"],cc_edges["dst"].alias("continent_name"))
mcc = GraphFrame(mcc_vertices, mcc_edges)
# display the graph
mcc_gp = nx.from_pandas_edgelist(mcc.edges.toPandas(),'continent_name','src','dst')
nx.draw(mcc_gp, with_labels = True, node_size = 12, font_size = 12, edge_color = "red")
# gets graph that only has "North America"
northamerica_vertices = mcc_edges.filter(mcc_edges.continent_name == "North America").select(col("src").alias("id")).distinct()
northamerica_edges = mcc_edges.filter(mcc_edges.continent_name == "North America")
northamerica = GraphFrame(northamerica_vertices, northamerica_edges)
northamerica_gp = nx.from_pandas_edgelist(northamerica.edges.toPandas(),'src','dst')
nx.draw(northamerica_gp, with_labels = True, node_size = 40, font_size = 10, edge_color = "red")