2

I have a function which counts co-occurrences between center and context words within reviews.

def get_coocs(x):
    """Build a center-word -> context-word-count mapping for one review.

    The text ``x`` is tokenised with ``nltk.word_tokenize`` and lower-cased.
    Every token found in ``cent_vocab`` (the nouns) becomes a key; each key
    maps to its own copy of the counts of all tokens found in ``cont_vocab``
    (the verbs/adjectives) across the whole review.

    Returns:
        dict: ``{center_word: {context_word: count, ...}, ...}``. Empty when
        the review contains no token from ``cent_vocab``.
    """
    # Pre-processing: tokenise, then normalise case.
    lowered = [token.lower() for token in nltk.word_tokenize(x)]

    # Review-wide counts of the context words (verbs/adjectives).
    context_counts = Counter(
        token for token in lowered if token in cont_vocab
    )

    # One entry per center word (noun); each gets an independent dict copy
    # so later mutation of one entry does not affect the others.
    return {
        center: dict(context_counts)
        for center in lowered
        if center in cent_vocab
    }

# Compute the co-occurrence mapping of every review in the 'comments' column.
coocs = df['comments'].apply(get_coocs)

My dict of dicts looks like this:

{'host': {'is': 3, 'most': 1, 'amazing': 1},
{'time': {'had': 1, 'such': 1, 'great': 1},
{'room': {'very': 2, 'professional': 1},
{'way': {'is': 3, 'recommended': 1, 'provided': 2}

But when I try to convert it into a dataframe, with the nouns as columns, the verbs/adjectives as the index, and the corresponding co-occurrence counts as values, I end up with this:

def cooc_dict2df(coocs):
    """Convert a noun -> {context word: count} mapping into a DataFrame.

    Args:
        coocs: Mapping whose keys are the center words (nouns) and whose
            values are dicts of context-word (verb/adjective) counts.

    Returns:
        pandas.DataFrame with one column per noun and one row per context
        word. Noun/context-word pairs absent from ``coocs`` are 0.
    """
    # orient='columns' turns the outer keys (nouns) into columns and the
    # inner keys (verbs/adjectives) into the index; orient='index' would
    # put the nouns on the rows instead.
    coocdf = pd.DataFrame.from_dict(dict(coocs), orient='columns')

    # A context word missing from a noun's dict never co-occurred with it,
    # so the gap left by the alignment is a count of zero, not "unknown".
    return coocdf.fillna(0)

dataframe

I've attempted other solutions but I still can't seem to get what I want.

ThePyGuy
  • 17,779
  • 5
  • 18
  • 45
RDTJr
  • 185
  • 1
  • 9

1 Answer

0

You could try this:

# Toy data: each noun maps to a list of per-review context-word counts
coocs = {
    "host": [
        {"is": 3, "most": 8, "amazing": 1},
        {"had": 5, "such": 7, "great": 9},
        {"very": 3, "recommended": 1, "provided": 2},
    ],
    "time": [
        {"is": 2, "most": 9, "amazing": 7},
        {"had": 6, "such": 6, "great": 8},
        {"very": 2, "recommended": 3, "provided": 4},
    ],
    "room": [
        {"is": 7, "most": 1, "amazing": 2},
        {"had": 7, "such": 5, "great": 8},
        {"very": 1, "recommended": 5, "provided": 4},
    ],
    "way": [
        {"is": 1, "most": 6, "amazing": 9},
        {"had": 8, "such": 4, "great": 9},
        {"very": 7, "recommended": 7, "provided": 1},
    ],
}

# Sum the per-review counts of each noun into one Series per noun.
# Building a frame from the list of dicts puts one review per row and one
# context word per column; the column-wise sum skips the gaps. Working per
# noun avoids matching columns by name prefix, which would mix up nouns such
# as "way" and "wayside".
totals = {
    noun: pd.DataFrame(review_counts).sum()
    for noun, review_counts in coocs.items()
}

# Align the nouns on the union of their context words. A word that never
# appeared with a noun has a count of 0. Cast to float so the dtype is the
# same whether or not any gap had to be filled, and sort the index
# alphabetically.
new_df = pd.DataFrame(totals).fillna(0).astype(float).sort_index()


print(new_df)
# Outputs
            host    time    room    way
amazing     1.0     7.0     2.0     9.0
great       9.0     8.0     8.0     9.0
had         5.0     6.0     7.0     8.0
is          3.0     2.0     7.0     1.0
most        8.0     9.0     1.0     6.0
provided    2.0     4.0     4.0     1.0
recommended 1.0     3.0     5.0     7.0
such        7.0     6.0     5.0     4.0
very        3.0     2.0     1.0     7.0
Laurent
  • 12,287
  • 7
  • 21
  • 37