I've got a script that loads, resamples, renames and saves audio data to a new (or the same) location.
For the past two days, I've been trying to run this script on Google Cloud. With 8CPUs, this operation should take about 8 hours. I got 6 hours in today, and it went pear-shaped.
Unfortunately I keep running into a system error at some random point in the process:
RemoteTraceback:
"""
Traceback (most recent call last):
File "/opt/conda/lib/python3.7/multiprocessing/pool.py", line 121, in worker
result = (True, func(*args, **kwds))
File "/opt/conda/lib/python3.7/multiprocessing/pool.py", line 47, in starmapstar
return list(itertools.starmap(args[0], args[1]))
File "/home/jupyter/jn-kaggle/birdsong/who-said-what/wsw/preprocessing.py", line 27, in write_audio
with sf.SoundFile(path, 'w', sr, channels=1, format='WAV') as f:
File "/opt/conda/lib/python3.7/site-packages/soundfile.py", line 629, in __init__
self._file = self._open(file, mode_int, closefd)
File "/opt/conda/lib/python3.7/site-packages/soundfile.py", line 1184, in _open
"Error opening {0!r}: ".format(self.name))
File "/opt/conda/lib/python3.7/site-packages/soundfile.py", line 1357, in _error_check
raise RuntimeError(prefix + _ffi.string(err_str).decode('utf-8', 'replace'))
RuntimeError: Error opening '/home/jupyter/jn-kaggle/birdsong/data/resampled/solsan/XC448920.wav': System error.
"""
Now, I've read that this system error is typically because the file path doesn't exist. However, since I am creating the file BEFORE attempting to open it, I don't believe this is possible:
os.makedirs(os.path.dirname(path), exist_ok=True)
with sf.SoundFile(path, 'w', sr, channels=1, format=format) as f:
f.write(audio)
So, I don't think it's an issue with getting the path wrong, since it is defined and created.
Here is the full script:
def resample_all(old_path, new_path, sr):
# Get every folder in a directory
folders = [d for d in os.scandir(old_path) if os.path.isdir(d.path)]
n_folders = len(folders)
# for each folder, get each file
for i, folder in enumerate(folders):
dirname = folder.name
# get every file from
files = [f.name for f in os.scandir(folder) if os.path.isfile(f.path)]
# rename name file with .wav extension
renamed = map(rename, files)
# get original path of every file
r_paths = [f.path for f in os.scandir(folder)]
# get path to write to for every file.
w_paths = [os.path.join(new_path, dirname, f) for f in renamed]
with Pool(os.cpu_count()) as p:
# resample audio
data = p.starmap(read_audio, zip(r_paths, [sr] * len(r_paths)))
# save audio
aud, srs = zip(*data)
p.starmap(write_audio, zip(w_paths, aud, srs))
The three helper functions which are being passed off to the multiprocessing.Pool
are:
def read_audio(file_path, sr=22050):
with warnings.catch_warnings():
warnings.simplefilter("ignore", UserWarning)
return librosa.load(file_path, sr=sr)
def write_audio(path, audio, sr, format='WAV'):
os.makedirs(os.path.dirname(path), exist_ok=True)
with sf.SoundFile(path, 'w', sr, channels=1, format=format) as f:
f.write(audio)
def rename(file_path, ext='.wav'):
return os.path.splitext(file_path)[0] + ext