I have a list of strings, and on every string I am doing some changes that you can see in wordify(). To speed this up, I split the list into sublists using chunked() (the number of sublists is the number of CPU cores minus 1). That way I get a nested list that looks like [[,,],[,,],[,,],[,,]].
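For example, with six made-up lines and three cores, the split looks like this:

>>> chunked(["a,1", "b,2", "c,3", "d,4", "e,5", "f,6"], 3)
[['a,1', 'b,2'], ['c,3', 'd,4'], ['e,5', 'f,6']]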
What I am trying to achieve:
I want to run wordify() on each of these sublists simultaneously, with every worker returning its processed sublist as a separate list. I want to wait until all processes finish and then join these sublists into one list. The approach below does not work.
import multiprocessing
from multiprocessing import Pool
from contextlib import closing

def readFiles():
    # Read all lines of the input file into a single list.
    words = []
    with open("somefile.txt") as f:
        words += f.readlines()
    return words

def chunked(words, num_cpu):
    # Split words into num_cpu sublists of (roughly) equal length.
    avg = len(words) / float(num_cpu)
    out = []
    last = 0.0
    while last < len(words):
        out.append(words[int(last):int(last + avg)])
        last += avg
    return out

def wordify(chunk, wl):
    # Keep only the part of each line before the first comma.
    wl.append([line.split(",", 1)[0] for line in chunk])
    return wl

if __name__ == '__main__':
    num_cpu = multiprocessing.cpu_count() - 1
    words = readFiles()
    chunks = chunked(words, num_cpu)  # renamed from "chunked" so the function is not shadowed
    wordlist = []
    wordify(words, wordlist)  # works
    with closing(Pool(processes=num_cpu)) as p:
        p.map(wordify, chunks, wordlist)  # fails
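From reading the multiprocessing docs, I suspect I need a one-argument worker that returns its result, so that Pool.map can collect one list per chunk, which I would then flatten myself. Something like this untested sketch (wordify_chunk and the sample data are names and values I made up):

from contextlib import closing
from multiprocessing import Pool

def wordify_chunk(chunk):
    # One-argument worker: returns its result instead of appending to a
    # list that only exists in the parent process.
    return [line.split(",", 1)[0] for line in chunk]

if __name__ == '__main__':
    # Made-up chunks standing in for chunked(readFiles(), num_cpu).
    chunks = [["a,1", "b,2"], ["c,3", "d,4"]]
    with closing(Pool(processes=2)) as p:
        results = p.map(wordify_chunk, chunks)  # blocks until every chunk is done
    wordlist = [word for sub in results for word in sub]  # join into one list
    print(wordlist)  # ['a', 'b', 'c', 'd']

Is this the right direction, or am I misunderstanding how Pool.map passes arguments?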