I'd like to build a minimum encompassing taxonomic tree for a given set of wordnet synsets. For a set of 2 synsets the tree would be one where they are both children nodes of their lowest common hypernym.
For the following set:
[{'name': 'tench.n.01'},
{'name': 'goldfish.n.01'},
{'name': 'great_white_shark.n.01'},
{'name': 'tiger_shark.n.01'},
{'name': 'hammerhead.n.03'}]
The required result is:
{'name': 'fish.n.01',
'children': [{'name': 'cyprinid.n.01',
'children': [{'name': 'tench.n.01'}, {'name': 'goldfish.n.01'}]},
{'name': 'shark.n.01',
'children': [{'name': 'tiger_shark.n.01'},
{'name': 'great_white_shark.n.01'},
{'name': 'hammerhead.n.03'}]}]}
I had some success with relatively small sets. Once I try larger sets things start to break down.
E.g. for a 30 long set I got a tree that can be visualized as follows:
One can see for example that the great gray owl is not classified under bird.
Code example
Below I give a reproducible example in python of what I got so far:
Define tree building function
# import nltk
# nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from itertools import combinations
import pandas as pd
def synset_tree(synsets):
# find similarities between all leaf nodes
synsets_sim = []
for i,j in combinations(range(len(synsets)),2):
synsets_sim.append(pd.DataFrame({'syn1':[synsets[i]["name"]], 'syn2':[synsets[j]["name"]],
'sim':[wn.synset(synsets[i]["name"]).path_similarity(wn.synset(synsets[j]["name"]))]}))
synsets_sim = pd.concat(synsets_sim, axis=0)
while len(synsets)>1:
synsets_sim = synsets_sim.sort_values('sim', ascending=False)
# Find common ancestor of 2 closest leaf nodes
common_hype = wn.synset(synsets_sim.syn1.iloc[0]).lowest_common_hypernyms(wn.synset(synsets_sim.syn2.iloc[0]))[0].name()
# extract 2 leaf nodes
syn_dict1 = list(filter(lambda x: x["name"] == synsets_sim.syn1.iloc[0], synsets))[0]
syn_dict2 = list(filter(lambda x: x["name"] == synsets_sim.syn2.iloc[0], synsets))[0]
# remove lead nodes from leaf node list
synsets = [syn_dict for syn_dict in synsets if syn_dict not in [syn_dict1, syn_dict2]]
# The common hypernym will replace the 2 leaf nodes. Calculate it's similarity to all remaining leaf nodes.
new_sim = []
for i in range(len(synsets)):
new_sim.append(pd.DataFrame({'syn1':[synsets[i]["name"]], 'syn2':[common_hype],
'sim':[wn.synset(synsets[i]["name"]).path_similarity(wn.synset(common_hype))]}))
if len(new_sim) > 0:
new_sim = pd.concat(new_sim, axis=0)
new_sim = new_sim[new_sim.sim<1]
synsets_sim = pd.concat([synsets_sim, new_sim],axis=0)
# Add children of the nodes being removed to the common hypernym
if common_hype == syn_dict1["name"]:
if syn_dict1.get("children"):
common_hype = {"name":common_hype, "children":[syn_dict2] + syn_dict1.get("children")}
else:
common_hype = {"name":common_hype, "children":[syn_dict2]}
synsets_sim = synsets_sim[~((synsets_sim.syn1 == syn_dict2["name"]) | (synsets_sim.syn2 == syn_dict2["name"]))]
elif common_hype == syn_dict2["name"]:
if syn_dict2.get("children"):
common_hype = {"name":common_hype, "children":[syn_dict1] + syn_dict2.get("children")}
else:
common_hype = {"name":common_hype, "children":[syn_dict1]}
synsets_sim = synsets_sim[~((synsets_sim.syn1 == syn_dict1["name"]) | (synsets_sim.syn2 == syn_dict1["name"]))]
elif common_hype in [x["name"] for x in synsets]:
for i in range(len(synsets)):
if common_hype == synsets[i]["name"]:
if synsets[i]["children"]:
synsets[i]["children"] = synsets[i]["children"] + [syn_dict1, syn_dict2]
else:
synsets[i]["children"] = [syn_dict1, syn_dict2]
else:
common_hype = {"name":common_hype, "children":[syn_dict1, syn_dict2]}
synsets_sim = synsets_sim[~((synsets_sim.syn1 == syn_dict1["name"]) | (synsets_sim.syn2 == syn_dict1["name"]))]
synsets_sim = synsets_sim[~((synsets_sim.syn1 == syn_dict2["name"]) | (synsets_sim.syn2 == syn_dict2["name"]))]
synsets.append(common_hype)
return synsets[0]
Input set that works
synsets = [{'name': 'tench.n.01'},
{'name': 'goldfish.n.01'},
{'name': 'great_white_shark.n.01'},
{'name': 'tiger_shark.n.01'},
{'name': 'hammerhead.n.03'},
{'name': 'electric_ray.n.01'},
{'name': 'stingray.n.01'},
{'name': 'cock.n.05'},
{'name': 'hen.n.02'},
{'name': 'ostrich.n.02'},
{'name': 'brambling.n.01'},
{'name': 'goldfinch.n.02'},
{'name': 'house_finch.n.01'},
{'name': 'junco.n.01'},
{'name': 'indigo_bunting.n.01'},
{'name': 'robin.n.02'},
{'name': 'bulbul.n.01'},
{'name': 'jay.n.02'},
{'name': 'magpie.n.01'},
{'name': 'chickadee.n.01'},
{'name': 'water_ouzel.n.01'},
{'name': 'kite.n.04'},
{'name': 'bald_eagle.n.01'},
{'name': 'vulture.n.01'},
{'name': 'great_grey_owl.n.01'},
{'name': 'european_fire_salamander.n.01'},
{'name': 'common_newt.n.01'},
{'name': 'eft.n.01'},
{'name': 'spotted_salamander.n.01'},
{'name': 'axolotl.n.01'}]
wow = synset_tree(synsets[:5])
wow
{'name': 'fish.n.01',
'children': [{'name': 'cyprinid.n.01',
'children': [{'name': 'tench.n.01'}, {'name': 'goldfish.n.01'}]},
{'name': 'shark.n.01',
'children': [{'name': 'tiger_shark.n.01'},
{'name': 'great_white_shark.n.01'},
{'name': 'hammerhead.n.03'}]}]}
Input set that fails
wow = synset_tree(synsets) # this gives the tree which produces the above image