One possible solution:
# Split every string in 'col' on single spaces and build one 0/1 indicator
# column per distinct word.
words = df['col']
df = words.str.get_dummies(sep=' ')
print(df)
A123 B234 C345 D555 data more test
0 1 1 1 0 1 0 1
1 1 1 1 1 1 0 1
2 1 1 0 0 1 0 1
3 1 1 1 0 1 1 0
Alternative:
from sklearn.preprocessing import MultiLabelBinarizer
# Same indicator table as above, built with scikit-learn instead of
# str.get_dummies: split each string into a list of words, binarize the
# lists, and label the resulting matrix with the learned classes.
mlb = MultiLabelBinarizer()
tokens = df['col'].str.split()
encoded = mlb.fit_transform(tokens)
df = pd.DataFrame(encoded, columns=mlb.classes_, index=df.index)
print(df)
A123 B234 C345 D555 data more test
0 1 1 1 0 1 0 1
1 1 1 1 1 1 0 1
2 1 1 0 0 1 0 1
3 1 1 1 0 1 1 0
Get all combinations of the columns (words), from length `min_length` up to the maximum length (all columns):
from itertools import combinations
a = df.columns
min_length = 3

# Enumerate column combinations from the largest size (all columns) down to
# min_length, so longer combinations come first in `comb`.
comb = []
for size in range(len(a), min_length - 1, -1):
    comb.extend(combinations(a, size))
In a list comprehension, count the rows in which all words of each combination occur:
# One record per combination: the joined words, the number of rows in which
# all of the selected indicator columns are non-zero, and the combination size.
records = [
    (', '.join(words), df.loc[:, list(words)].all(axis=1).sum(), len(words))
    for words in comb
]
df1 = pd.DataFrame(records, columns=['words', 'count', 'len'])

TOP = 2
# The TOP largest distinct values found in the 'count' column.
distinct_counts = sorted(df1['count'].unique())
TOP_count = distinct_counts[-TOP:]

# Keep only combinations whose count is one of those values, most frequent
# (and, within a count, longest) first.
is_top = df1['count'].isin(TOP_count)
df1 = df1[is_top].sort_values(by=['count', 'len'], ascending=False)
print(df1)
words count len
66 A123, B234, data 4 3
30 A123, B234, C345, data 3 4
37 A123, B234, data, test 3 4
64 A123, B234, C345 3 3
68 A123, B234, test 3 3
70 A123, C345, data 3 3
77 A123, data, test 3 3
80 B234, C345, data 3 3
87 B234, data, test 3 3
EDIT:
Pure python solution:
from itertools import combinations, takewhile
from collections import Counter
min_length = 3
d = Counter()

# Tally every combination of at least min_length words for each row.
# NOTE: combinations() keeps the word order of the row, so the same words
# written in a different order (or repeated) in another row are counted
# under a different key.
for row_words in df['col'].str.split():
    for size in range(len(row_words), min_length - 1, -1):
        d.update(combinations(row_words, size))
#print (d)
#https://stackoverflow.com/a/26831143
def get_items_upto_count(dct, n):
    """Return the ``n`` most common items of a Counter, plus any ties.

    The items are taken in ``most_common()`` order (highest count first).
    Every item whose count is greater than or equal to the count of the
    n-th item is returned, so the result may hold more than ``n`` entries
    when several items share the n-th count.

    Args:
        dct: A ``collections.Counter`` (anything providing ``most_common()``).
        n: Rank of the item whose count is used as the cut-off (1-based).

    Returns:
        A list of ``(item, count)`` tuples. The list is empty when ``dct``
        is empty or ``n`` is not positive; when ``n`` exceeds the number of
        items, all items are returned.
    """
    if n <= 0:
        # data[n - 1] would silently wrap around to the end of the list.
        return []
    data = dct.most_common()
    if not data:
        return []
    # Clamp the rank so asking for more items than exist does not raise
    # IndexError; the cut-off then becomes the smallest count.
    val = data[min(n, len(data)) - 1][1]
    # data is sorted by count descending, so the qualifying items form a prefix.
    return list(takewhile(lambda x: x[1] >= val, data))
# Take the 2 most common combinations (plus ties with the 2nd) and show them
# as a two-column table: the combination tuple and its count.
top_n = 2
L = get_items_upto_count(d, top_n)
s = pd.DataFrame(data=L, columns=['val', 'count'])
print(s)
val count
0 (A123, B234, data) 4
1 (A123, B234, C345, data) 3
2 (A123, B234, test, data) 3
3 (A123, B234, C345) 3
4 (A123, B234, test) 3
5 (A123, C345, data) 3
6 (A123, test, data) 3
7 (B234, C345, data) 3
8 (B234, test, data) 3