The issue is that you are treating all the TOPICs as one. If you want individual sections, use the groupby code from the original answer: get a set of all names first, then compare that set against the defaultdict keys to find the words missing from each section:
# Print, for each TOPIC section of doc1, every word's value scaled by that
# topic's weight from doc2; words absent from a section are reported as 0.
# Written to run unchanged on Python 2.7 and Python 3.
from collections import defaultdict
from itertools import groupby

with open("doc1") as f, open("doc2") as f2:
    # one weight per TOPIC, consumed lazily in file order
    values = (float(tok) for tok in f2.read().split())
    # find every word in every TOPIC
    all_words = {line.split()[0] for line in f
                 if line.strip() and not line.startswith("TOPIC")}
    f.seek(0)  # reset pointer so the file can be iterated a second time
    # the key is True for blank lines, so groupby splits on the empty lines
    for is_blank, group in groupby(f, key=lambda x: not x.strip()):
        if is_blank:
            continue
        next(group)  # skip the "TOPIC..." header line of this section
        # get matching float from values; kept in its own name so the
        # open file handle `f` is never rebound
        weight = next(values)
        # fresh accumulator per section so the TOPICs are not merged
        d = defaultdict(float)
        # iterate over the group
        for s in group:
            name, val = s.split()
            d[name] += float(val) * weight
        # get difference in all_words vs words in current TOPIC
        # giving 0 as default for missing values
        for word in all_words.difference(d):
            d[word] = 0
        for name, prob in d.items():
            print("Prob for {} is {}".format(name, prob))
To store all the output you can add the dicts to a list:
# Build `out`: a list with one {word: weighted value} dict per TOPIC section
# of doc1, each padded with 0 for words that only occur in other sections.
# Written to run unchanged on Python 2.7 and Python 3.
from collections import defaultdict
from itertools import groupby

with open("doc1") as f, open("doc2") as f2:
    # one weight per TOPIC, consumed lazily in file order
    values = (float(tok) for tok in f2.read().split())
    all_words = {line.split()[0] for line in f
                 if line.strip() and not line.startswith("TOPIC")}
    f.seek(0)  # rewind so the file can be iterated a second time
    out = []
    # the key is True for blank lines, so groupby splits on the empty lines
    for is_blank, group in groupby(f, key=lambda x: not x.strip()):
        if is_blank:
            continue
        next(group)  # skip the "TOPIC..." header line of this section
        # get matching float from values; kept in its own name so the
        # open file handle `f` is never rebound
        weight = next(values)
        # fresh accumulator per section
        d = defaultdict(float)
        # iterate over the group
        for s in group:
            name, val = s.split()
            d[name] += float(val) * weight
        # words seen elsewhere in the file but not in this section get 0
        for word in all_words.difference(d):
            d[word] = 0
        out.append(d)
Then iterate over the list:
# Print every stored word/value pair, one per-TOPIC dict at a time.
for top in out:
    # items() instead of the Python 2-only iteritems(), so this loop
    # works on both Python 2 and Python 3
    for k, v in top.items():
        print("Prob for {} is {}".format(k, v))
Or forget the defaultdict and use dict.fromkeys:
# Build `out`: a list with one {word: weighted value} dict per TOPIC section
# of doc1. Every dict is pre-filled with all words at 0.0, so words missing
# from a section need no separate pass.
# Written to run unchanged on Python 2.7 and Python 3.
from itertools import groupby

with open("doc1") as f, open("doc2") as f2:
    # one weight per TOPIC, consumed lazily in file order
    values = (float(tok) for tok in f2.read().split())
    # a list (duplicates included) is fine here: dict.fromkeys collapses
    # repeated words into a single key
    all_words = [line.split()[0] for line in f
                 if line.strip() and not line.startswith("TOPIC")]
    f.seek(0)  # rewind so the file can be iterated a second time
    out = []
    # the key is True for blank lines, so groupby splits on the empty lines
    for is_blank, group in groupby(f, key=lambda x: not x.strip()):
        if is_blank:
            continue
        next(group)  # skip the "TOPIC..." header line of this section
        # get matching float from values; kept in its own name so the
        # open file handle `f` is never rebound
        weight = next(values)
        # same 0.0 default for every section, so all values are floats
        d = dict.fromkeys(all_words, 0.0)
        # iterate over the group
        for s in group:
            name, val = s.split()
            d[name] += float(val) * weight
        out.append(d)
If you always want the missing words at the end, use a collections.OrderedDict with the first approach, adding the missing values at the end of the dict:
# Build `out`: a list with one OrderedDict per TOPIC section of doc1, holding
# the section's own words first (in file order) and then, with value 0, the
# words that only occur in other sections. Finally print every pair.
# Written to run unchanged on Python 2.7 and Python 3.
from collections import OrderedDict
from itertools import groupby

with open("doc1") as f, open("doc2") as f2:
    # one weight per TOPIC, consumed lazily in file order
    values = (float(tok) for tok in f2.read().split())
    all_words = {line.split()[0] for line in f
                 if line.strip() and not line.startswith("TOPIC")}
    f.seek(0)  # rewind so the file can be iterated a second time
    out = []
    # the key is True for blank lines, so groupby splits on the empty lines
    for is_blank, group in groupby(f, key=lambda x: not x.strip()):
        if is_blank:
            continue
        next(group)  # skip the "TOPIC..." header line of this section
        # get matching float from values; kept in its own name so the
        # open file handle `f` is never rebound
        weight = next(values)
        # create the dict BEFORE filling it: it must exist for the first
        # section too, otherwise the setdefault call raises NameError
        d = OrderedDict()
        # iterate over the group
        for s in group:
            name, val = s.split()
            # setdefault keeps the first value if a word repeats in a section
            d.setdefault(name, float(val) * weight)
        # appended after the section's own words, so they end up last
        for word in all_words.difference(d):
            d[word] = 0
        out.append(d)

for top in out:
    for name, prob in top.items():
        print("Prob for {} is {}".format(name, prob))
Finally to store in order and by topic:
# Build `out`: an OrderedDict mapping each TOPIC header line of doc1 to an
# OrderedDict of {word: weighted value}, with the words missing from that
# section appended at the end with value 0. Then print it topic by topic.
# Written to run unchanged on Python 2.7 and Python 3.
from collections import OrderedDict
from itertools import groupby

with open("doc1") as f, open("doc2") as f2:
    # one weight per TOPIC, consumed lazily in file order
    values = (float(tok) for tok in f2.read().split())
    all_words = {line.split()[0] for line in f
                 if line.strip() and not line.startswith("TOPIC")}
    f.seek(0)  # rewind so the file can be iterated a second time
    out = OrderedDict()
    # the key is True for blank lines, so groupby splits on the empty lines
    for is_blank, group in groupby(f, key=lambda x: not x.strip()):
        if is_blank:
            continue
        # first line of the section is the header; it becomes the key
        topic = next(group).rstrip()
        # create OrderedDict for each topic
        probs = out[topic] = OrderedDict()
        # get matching float from values; kept in its own name so the
        # open file handle `f` is never rebound
        weight = next(values)
        # iterate over the group
        for s in group:
            name, val = s.split()
            # setdefault keeps the first value if a word repeats in a section
            probs.setdefault(name, float(val) * weight)
        # find words missing from TOPIC and set to 0
        for word in all_words.difference(probs):
            probs[word] = 0

for topic, probs in out.items():
    print(topic)  # each TOPIC
    # distinct loop names so the outer topic/probs pair is not clobbered
    for name, prob in probs.items():
        print("Prob for {} is {}".format(name, prob))  # the OrderedDict items
    print("\n")
doc1:
TOPIC:topic_0 5892.0
site 0.0371690427699
Internet 0.0261371350984
online 0.0229124236253
web 0.0218940936864
say 0.0159538357094
image 0.015105227427
TOPIC:topic_1 12366.0
Mr 0.150331554262
s 0.0517548115801
say 0.0451237263464
president 0.0153647096879
tell 0.0135856380398
BBC 0.0135856380398
doc2:
0.345 0.566667
Output:
TOPIC:topic_0 5892.0
Prob for site is 0.0128233197556
Prob for Internet is 0.00901731160895
Prob for online is 0.00790478615073
Prob for web is 0.00755346232181
Prob for say is 0.00550407331974
Prob for image is 0.00521130346231
Prob for BBC is 0
Prob for Mr is 0
Prob for s is 0
Prob for president is 0
Prob for tell is 0
TOPIC:topic_1 12366.0
Prob for Mr is 0.085187930859
Prob for s is 0.0293277438137
Prob for say is 0.0255701266375
Prob for president is 0.00870667394471
Prob for tell is 0.0076985327511
Prob for BBC is 0.0076985327511
Prob for web is 0
Prob for image is 0
Prob for online is 0
Prob for site is 0
Prob for Internet is 0
You can apply the exact same logic using a regular for loop; the groupby just does all the grouping work for you.
If you actually just want to write to a file, then the code is even simpler:
# Write prob.txt: for each TOPIC section of doc1, the header line, then each
# word with its value scaled by the topic's weight from doc2, then every word
# that only occurs in other sections with value 0, then a blank line.
# Written to run unchanged on Python 2.7 and Python 3.
from itertools import groupby

with open("doc1") as f, open("doc2") as f2, open("prob.txt", "w") as f3:
    # one weight per TOPIC, consumed lazily in file order
    values = (float(tok) for tok in f2.read().split())
    all_words = {line.split()[0] for line in f
                 if line.strip() and not line.startswith("TOPIC")}
    f.seek(0)  # rewind so the file can be iterated a second time
    # the key is True for blank lines, so groupby splits on the empty lines
    for is_blank, group in groupby(f, key=lambda x: not x.strip()):
        if is_blank:
            continue
        topic, words = next(group), []
        flt = next(values)
        # normalise the header to exactly one trailing newline, so a header
        # that is the last line of the file cannot fuse with the next line
        f3.write(topic.rstrip("\n") + "\n")
        for s in group:
            name, val = s.split()
            words.append(name)
            f3.write("{} {}\n".format(name, float(val) * flt))
        # words seen elsewhere in the file but not in this section get 0
        for word in all_words.difference(words):
            f3.write("{} {}\n".format(word, 0))
        f3.write("\n")
prob.txt:
TOPIC:topic_0 5892.0
site 0.0128233197556
Internet 0.00901731160895
online 0.00790478615073
web 0.00755346232181
say 0.00550407331974
image 0.00521130346231
BBC 0
Mr 0
s 0
president 0
tell 0
TOPIC:topic_1 12366.0
Mr 0.085187930859
s 0.0293277438137
say 0.0255701266375
president 0.00870667394471
tell 0.0076985327511
BBC 0.0076985327511
web 0
image 0
online 0
site 0
Internet 0