Running ffprobe through subprocess is way too slow for a 16 TB hard drive full of movies. I need something faster for getting the bitrate.

Question

from subprocess import Popen, PIPE
from win32api import GetFileAttributes
from win32con import FILE_ATTRIBUTE_HIDDEN as HIDDEN, FILE_ATTRIBUTE_SYSTEM as SYSTEM
from datetime import datetime as dt
from mimetypes import guess_type
import os
from timer import Timelapse


# Media root on this machine; it is stripped from every directory path so the
# resulting names are relative.
ROOTPATH = 'D:\\Media'
# strftime pattern used for the 'mtime' value of each video entry.
FORMAT = '%Y-%m-%d %H:%M:%S'
# Folder names that get removed from the os.walk traversal.
FOLDER_NAMES = [
    'Downloads',
    'DVDFab',
    'Notifications',
    'TextAloud',
    'Wallpaper Engine',
    'Log',
    'Logs',
    'Temp'
]


def get_bitrate(fullname: str):
    """Return the bit rate ffprobe reports for *fullname* as a digit string.

    ffprobe is asked for the ``format=bit_rate`` entry and every numeric
    character of its standard output is concatenated.  An empty string is
    returned as soon as ffprobe writes anything to standard error.
    """
    command = [
        'ffprobe',
        '-v', 'error',
        '-hide_banner',
        '-of', 'default=noprint_wrappers=0',
        '-print_format', 'flat',
        '-select_streams', 'v:0',
        '-show_entries', 'format=bit_rate',
        f'{fullname}',
    ]
    process = Popen(command, stdout=PIPE, stderr=PIPE)
    stdout, stderr = process.communicate()
    if stderr:
        return ''
    # Keep only the digits of e.g. format.bit_rate="2638330".
    return ''.join(filter(str.isnumeric, stdout.decode()))


def get_folder_content() -> dict[str, dict[str, dict[str, str | int] | None]]:
    """Scan ROOTPATH and describe every folder that should be synced.

    Returns:
        A dict keyed by folder path relative to ROOTPATH.  Each value maps a
        file name to a dict with 'mimetype', 'filesize', 'mtime' and
        'bitrate' for video files, or to None for plain-text files (*.srt
        subtitles).  Hidden/system files and files whose MIME type cannot be
        guessed are left out.
    """
    out = {}
    # Walk ROOTPATH itself (not a second hard-coded literal) so the prefix
    # removed below always matches what is being walked.
    for directory, directories, filenames in os.walk(ROOTPATH):
        # Prune in place: os.walk only skips folders that are removed from
        # the very list object it handed out.
        directories[:] = [name for name in directories if name not in FOLDER_NAMES]

        # Folders that hold nothing but other folders are skipped; there is
        # no point in adding these to the dict.
        if directories and not filenames:
            continue

        # The files are synced to another computer where the media root
        # differs, so ROOTPATH (and the leading backslash) is removed.
        dirname = directory.removeprefix(ROOTPATH).lstrip('\\')

        # Registered even when no file qualifies, so that empty folders are
        # still created on the other side.
        out[dirname] = entries = {}
        for filename in filenames:
            fullname = os.path.join(directory, filename)
            # One attribute lookup per file is enough.
            if GetFileAttributes(fullname) & (HIDDEN | SYSTEM):
                continue

            mimetype, _ = guess_type(fullname)
            if mimetype is None:
                continue

            if 'video' in mimetype:
                entries[filename] = {
                    'mimetype': mimetype,
                    'filesize': os.path.getsize(fullname),
                    'mtime': dt.fromtimestamp(os.path.getmtime(fullname)).strftime(FORMAT),
                    'bitrate': get_bitrate(fullname)
                }
            elif 'text/plain' in mimetype:  # *.srt subtitle files
                entries[filename] = None
    return out


if __name__ == '__main__':
    # Time one full scan of the media tree; Timelapse is imported from `timer`.
    timelapse = Timelapse()
    content = get_folder_content()
    # Presumably reports the elapsed time -- Timelapse is not shown here.
    timelapse.print()

I have a media folder that I want to generate a JSON file for, so it can be synced to another computer, and ffprobe is taking way too long. Any ideas on a faster way to get the bitrate of a video file?

Some of the modules I've looked at are way outdated, and I'm looking for a module that is still in active development that I can use to get the bitrate of movies. Something faster than running ffprobe through subprocess.

Ξένη Γήινος · Answer 1 · 2023-07-09T14:53:55.583

You are doing it wrong; it is not that Python is slow, it is your inexperience.

First you didn't include the necessary imports, you need to add the following:

from subprocess import Popen, PIPE

Otherwise the code wouldn't run.

Then, observe the output of your subprocess call, it is in the following format:

output = b'format.bit_rate="2638330"\r\n'

The above is a string encoded in ASCII, you want the number in it and convert to integer, that is easy to do:

int(''.join(i for i in output.decode() if i.isdigit()))

First we decode the byte sequence, then we just accumulate the characters that are digits, then we convert the accumulated string to integer.

But that is inefficient, we can just use string operations to convert to integer. Observe that the number comes after the equals sign and is inside two quotes, we can just split twice and using indexing to get the number.

int(output.decode().split('="')[1].split('"')[0])

Performance:


In [17]: int(''.join(i for i in output.decode() if i.isdigit()))
Out[17]: 2638330

In [18]: %timeit int(''.join(i for i in output.decode() if i.isdigit()))
2.8 µs ± 130 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)

In [19]: int(output.decode().split('="')[1].split('"')[0])
Out[19]: 2638330

In [20]: %timeit int(output.decode().split('="')[1].split('"')[0])
595 ns ± 22.7 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)

Fixed code:

from subprocess import Popen, PIPE


def get_bitrate(fullname: str) -> int:
    """Return the bit rate ffprobe reports for *fullname* as an integer.

    Args:
        fullname: Path of the media file handed to ffprobe.

    Returns:
        The value of ffprobe's ``format.bit_rate`` entry, or 0 when ffprobe
        prints no such entry or a non-numeric value (for example "N/A").
    """
    output, _ = Popen([
        'ffprobe',
        '-v', 'error',
        '-hide_banner',
        '-of', 'default=noprint_wrappers=0',
        '-print_format', 'flat',
        '-select_streams', 'v:0',
        '-show_entries', 'format=bit_rate',
        f'{fullname}'
    ], stdout=PIPE, stderr=PIPE).communicate()

    # Expected output looks like: format.bit_rate="2638330"
    # partition() yields '' instead of raising when the marker is missing.
    value = output.decode().partition('="')[2].partition('"')[0]
    return int(value) if value.isdecimal() else 0

Edit

You claim your code is still slow but I assure you this has absolutely nothing to do with this, the problem is elsewhere, since you know very little of programming, you must have made a number of mistakes somewhere else, which you weren't able to identify as cause and mistakenly thought this to be the cause.

First ffprobe doesn't do any processing at all, it just retrieves information about media files, that's it.

But media files are stored in secondary devices, that are persistent storage, meaning after reboot the data is retained, but to do anything with the files, you need to load the files into primary memory first, loading into RAM is inherently slow, it is I/O bound, it will always use milliseconds, no matter what you use, SSD is much faster than HDD, but still very slow.

Update

Now that you have posted more code, I can make the Python portion of the code much faster, and I just spent some time doing so. Unfortunately, it is still slow. As for asking for alternatives: first, seeking software recommendations is strictly off-topic here and will cause your question to be closed.

Secondly, there aren't really any alternatives. ffmpeg is open source and really efficient, so most video processing software uses it one way or another. I don't think you can find anything that doesn't use ffmpeg under the hood, so, sorry, there isn't anything like what you imagined.

Now look at this:

    for name in directories:
        if name in ['Downloads', 'Notifications', 'TextAloud', 'Wallpaper Engine', 'Log', 'Logs', 'Temp']:
            directories.remove(name)

What is the purpose of this? directories isn't used anywhere in the following code, yet you are doing this pointless task, it should be removed.

And yet, if you want to remove some elements from a list, your method is not very efficient. First don't use a list to do membership checking, use a set, a list uses linear search and is O(n), the more elements the longer it takes to look for something, because it just iterates through the list one by one and do equality checking.

Use a set to do membership checking, it uses a hashtable and looking up it is O(1), it is very fast no matter how many elements you have.

And mutating a list in place is slow, it would be faster to use a list comprehension and leave out the elements you want to ignore.

So write it like this should you need it (in this case you don't):

# Slice assignment replaces the contents of the existing list object, so
# whoever handed that list out (os.walk, for pruning) sees the filtered result;
# a plain `directories = [...]` would only rebind the local name.
directories[:] = [name for name in directories if name not in {'Downloads', 'Notifications', 'TextAloud', 'Wallpaper Engine', 'Log', 'Logs', 'Temp'}]

Then don't use str.count to check for substrings, it is slow because it doesn't short circuit, it doesn't stop as soon as a substring is found, it loops through the whole string to count all occurrences of the substring.

So using str.count creates a huge amount of unnecessary computation when you only want to know if a substring is contained in a string.

Use a in b for that purpose. For example, 'am' in 'example'. Now since the mimetypes are like this: 'video/mp4', they start with 'video', use str.startswith, if you know a suffix, use str.endswith instead.

Performance:

In [104]: %timeit 'am' in 'example'
46.8 ns ± 0.573 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)

In [105]: %timeit 'example'.count('am')
142 ns ± 0.693 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)

In [106]: %timeit 'example'.startswith('ex')
122 ns ± 1.78 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)

We should use str.startswith here but since you only care about performance, use membership checking, since we know the format of mimetypes.

if bool(GetFileAttributes(fullname) & (is_hidden | is_system)):

There is absolutely no need to use explicit bool casting here, if conditions are automatically converted to bool, remove the casting.

Now the following might be too advanced for you, because filesystem lookup is very slow, we need to save file object information to RAM to avoid unnecessary filesystem calls, don't use os.walk, use os.scandir instead, it returns os.DirEntry which contains file information.

To access the stats use DirEntry.stat(), the output is like this:

os.stat_result(st_mode=33206, st_ino=0, st_dev=0, st_nlink=0, st_uid=0, st_gid=0, st_size=8591, st_atime=1649586624, st_mtime=1648480031, st_ctime=1649586624)

Size is st_size and last modified time is st_mtime, to access name of DirEntry use DirEntry.name, and to access full name use DirEntry.path, and you can use a simple recursive function to do what you want.

Refactored code:

from collections import deque
from subprocess import Popen, PIPE
from win32api import GetFileAttributes
from win32con import FILE_ATTRIBUTE_HIDDEN as is_hidden, FILE_ATTRIBUTE_SYSTEM as is_system
from datetime import datetime as dt
from mimetypes import guess_type
import json
import os
import time

# Folder handed to get_videoinfo() as the starting point of the scan.
ROOTPATH = 'D:\\Media'
# strftime pattern used for the 'mtime' value of each video entry.
FORMAT = '%Y-%m-%d %H:%M:%S'


def get_bitrate(fullname: str) -> int:
    """Return the bit rate ffprobe reports for *fullname* as an integer.

    Args:
        fullname: Path of the media file handed to ffprobe.

    Returns:
        The value of ffprobe's ``format.bit_rate`` entry.  0 is returned when
        ffprobe writes to standard error, prints nothing usable, or prints a
        non-numeric value such as "N/A".
    """
    output, err = Popen([
        'ffprobe',
        '-v', 'error',
        '-hide_banner',
        '-of', 'default=noprint_wrappers=0',
        '-print_format', 'flat',
        '-select_streams', 'v:0',
        '-show_entries', 'format=bit_rate',
        f'{fullname}'
    ], stdout=PIPE, stderr=PIPE).communicate()
    if err:
        return 0
    try:
        # Expected output looks like: format.bit_rate="2638330"
        return int(output.decode().split('="')[1].split('"')[0])
    except (IndexError, ValueError):
        # IndexError: no key="value" line at all; ValueError: value is not
        # a number.  Both are treated like the stderr case above.
        return 0

# Start time of the run; subtracted from time.time() after the scan to get the elapsed time.
timestamp = time.time()

def get_videoinfo(folder):
    """Recursively collect metadata for the video files below *folder*.

    Args:
        folder: A path string or an os.DirEntry of the directory to scan.

    Returns:
        A one-item dict ``{name: content}``.  *name* is the entry's name for
        an os.DirEntry, otherwise *folder* itself.  *content* maps each video
        file name to a dict with 'mimetype', 'filesize', 'mtime' and
        'bitrate', and each sub-folder name to that sub-folder's content.
    """
    out = {}
    # The with-block closes the scandir iterator even when an exception
    # escapes part-way through the directory.
    with os.scandir(folder) as entries:
        for entry in entries:
            if entry.is_file():
                # Hidden and system files are ignored.
                if GetFileAttributes(entry.path) & (is_hidden | is_system):
                    continue
                mimetype, _ = guess_type(entry)
                if mimetype is not None and 'video' in mimetype:
                    stat = entry.stat()
                    out[entry.name] = {
                        'mimetype': mimetype,
                        'filesize': stat.st_size,
                        'mtime': dt.fromtimestamp(stat.st_mtime).strftime(FORMAT),
                        'bitrate': get_bitrate(entry.path)
                    }
            elif entry.is_dir():
                # The recursive call returns {sub-folder name: content}.
                out |= get_videoinfo(entry)

    name = folder.name if isinstance(folder, os.DirEntry) else folder
    return {name: out}

# Dump the collected tree as JSON.
print(json.dumps(get_videoinfo(ROOTPATH)))
# Elapsed time, shown as minutes:seconds:microseconds by formatting the
# duration as if it were a timestamp.
print(dt.fromtimestamp(time.time() - timestamp).strftime('%M:%S:%f'))

And please don't steal my code again by editing your question to use my code and then claiming it is still slow. That is plagiarism, and do I need to remind you that it is bad? It also invalidates my advice, because you copied my code and "fixed" the issues I addressed, so that my answer is no longer relevant.

Just stop, you're not helping me. I was asking for something other than FFmpeg and you're stuck on FFmpeg. — phpjunkie, Jul 12 '23 at 14:09

phpjunkie · Answer 2 · 2023-07-12T22:28:33.547

Here is my script once again with some modifications to compare the execution time between mine and yours.

from subprocess import Popen, PIPE
from win32api import GetFileAttributes
from win32con import FILE_ATTRIBUTE_HIDDEN as HIDDEN, FILE_ATTRIBUTE_SYSTEM as SYSTEM
from datetime import datetime as dt
from mimetypes import guess_type
import os
from timer import Timelapse


# Media root on this machine; it is stripped from every directory path so the
# resulting names are relative.
ROOTPATH = 'D:\\Media'
# strftime pattern used for the 'mtime' value of each video entry.
FORMAT = '%Y-%m-%d %H:%M:%S'
# Folder names that get removed from the os.walk traversal.
FOLDER_NAMES = [
    'Downloads',
    'Notifications',
    'TextAloud',
    'Wallpaper Engine',
    'Log',
    'Logs',
    'Temp'
]


def get_bitrate(fullname: str):
    """Ask ffprobe for the bit rate of *fullname* and return its digits.

    The result is a string made of every numeric character found in
    ffprobe's standard output, or '' when ffprobe wrote to standard error.
    """
    ffprobe = Popen(
        [
            'ffprobe',
            '-v', 'error',
            '-hide_banner',
            '-of', 'default=noprint_wrappers=0',
            '-print_format', 'flat',
            '-select_streams', 'v:0',
            '-show_entries', 'format=bit_rate',
            f'{fullname}',
        ],
        stdout=PIPE,
        stderr=PIPE,
    )
    raw, problems = ffprobe.communicate()
    # Only decode when ffprobe did not complain.
    digits = '' if problems else ''.join(ch for ch in raw.decode() if ch.isnumeric())
    return digits


def get_folder_content() -> dict[str, dict[str, dict[str, str | int] | None]]:
    """Map every folder below ROOTPATH to the video files it contains.

    Returns:
        A dict keyed by folder path relative to ROOTPATH.  Each value maps a
        video file name to a dict with 'mimetype', 'filesize' and 'mtime'.
        Hidden/system files and non-video files are left out; folders
        without any video still get an (empty) entry.
    """
    output = {}
    # Walk ROOTPATH itself (not a second hard-coded literal) so the prefix
    # removed below always matches what is being walked.
    for directory, directories, filenames in os.walk(ROOTPATH):
        # Prune in place: os.walk only skips folders that are removed from
        # the very list object it handed out.
        directories[:] = [name for name in directories if name not in FOLDER_NAMES]

        # Folders that hold nothing but other folders are skipped.  Their
        # parents get created anyway by os.makedirs on the other computer.
        if directories and not filenames:
            continue

        # The files are synced to another computer where the media root is
        # %USERPROFILE%\Media, so ROOTPATH is removed.  The leading backslash
        # is stripped too: with it, os.path.join on the other side would drop
        # its own root and everything would land on the root of C:.
        dirname = directory.removeprefix(ROOTPATH).lstrip('\\')

        # Registered even when no file qualifies, so that empty folders are
        # still created on the other side.
        output[dirname] = videos = {}
        for filename in filenames:
            fullname = os.path.join(directory, filename)
            mimetype, _ = guess_type(fullname)
            # Cheap string test first: the attribute lookup below hits the
            # file system and is only needed for video files.
            if mimetype is None or 'video' not in mimetype:
                continue
            if GetFileAttributes(fullname) & (HIDDEN | SYSTEM):
                continue
            videos[filename] = {
                'mimetype': mimetype,
                'filesize': os.path.getsize(fullname),
                'mtime': dt.fromtimestamp(os.path.getmtime(fullname)).strftime(FORMAT),
                # 'bitrate': get_bitrate(fullname)
            }
    return output


if __name__ == '__main__':
    # Time one full scan of the media tree; Timelapse is imported from `timer`.
    timelapse = Timelapse()
    content = get_folder_content()
    # Presumably reports the elapsed time -- Timelapse is not shown here.
    timelapse.print()

Here is your script with some modifications so that both scripts are skipping and processing the same data.

from subprocess import Popen, PIPE
from win32api import GetFileAttributes
from win32con import FILE_ATTRIBUTE_HIDDEN as HIDDEN, FILE_ATTRIBUTE_SYSTEM as SYSTEM
from datetime import datetime as dt
from mimetypes import guess_type
import os
from timer import Timelapse


# Folder handed to get_videoinfo() as the starting point of the scan.
ROOTPATH = 'D:\\Media'
# strftime pattern used for the 'mtime' value of each video entry.
FORMAT = '%Y-%m-%d %H:%M:%S'
# Sub-folder names that get_videoinfo() does not descend into.
FOLDER_NAMES = [
    'Downloads',
    'Notifications',
    'TextAloud',
    'Wallpaper Engine',
    'Log',
    'Logs',
    'Temp'
]


def get_bitrate(fullname: str):
    """Return the quoted value ffprobe prints for the bit rate of *fullname*.

    The value is returned as a string exactly as it appears between the
    quotes of ffprobe's ``key="value"`` output.  The integer 0 is returned
    instead when ffprobe wrote anything to standard error.
    """
    args = [
        'ffprobe',
        '-v', 'error',
        '-hide_banner',
        '-of', 'default=noprint_wrappers=0',
        '-print_format', 'flat',
        '-select_streams', 'v:0',
        '-show_entries', 'format=bit_rate',
        f'{fullname}',
    ]
    stdout, stderr = Popen(args, stdout=PIPE, stderr=PIPE).communicate()
    if stderr:
        return 0
    # Text after the first '="' marker, cut at the closing quote.
    after_marker = stdout.decode().split('="')[1]
    return after_marker.split('"')[0]


def get_videoinfo(folder):
    """Recursively collect metadata for the video files below *folder*.

    Args:
        folder: A path string or an os.DirEntry of the directory to scan.

    Returns:
        A one-item dict ``{name: content}``.  *name* is the entry's path for
        an os.DirEntry, otherwise *folder* itself.  *content* maps each video
        file name to a dict with 'mimetype', 'filesize' and 'mtime', and
        each scanned sub-folder path to that sub-folder's content.
        Sub-folders listed in FOLDER_NAMES are not scanned.
    """
    out = {}
    # The with-block closes the scandir iterator even when an exception
    # escapes part-way through the directory.
    with os.scandir(folder) as entries:
        for entry in entries:
            if entry.is_file():
                # Hidden and system files are ignored.
                if GetFileAttributes(entry.path) & (HIDDEN | SYSTEM):
                    continue
                mimetype, _ = guess_type(entry.path)
                if mimetype is not None and 'video' in mimetype:
                    stat = entry.stat()
                    out[entry.name] = {
                        'mimetype': mimetype,
                        'filesize': stat.st_size,
                        'mtime': dt.fromtimestamp(stat.st_mtime).strftime(FORMAT),
                        # 'bitrate': get_bitrate(entry.path)
                    }
            # entry.name is the last path component, so no manual splitting
            # on backslashes is needed.
            elif entry.is_dir() and entry.name not in FOLDER_NAMES:
                out |= get_videoinfo(entry.path)

    name = folder.path if isinstance(folder, os.DirEntry) else folder
    return {name: out}


if __name__ == '__main__':
    # Time one full scan starting at ROOTPATH; Timelapse is imported from `timer`.
    timelapse = Timelapse()
    get_videoinfo(ROOTPATH)
    # Presumably reports the elapsed time -- Timelapse is not shown here.
    timelapse.print()

With the bit rate in the dict here are the differences in execution time.

Mine:
01:00.322172
01:00.453455
01:00.304837

Yours:
01:00.408499
01:00.018996
01:00.017061

Here are the differences without the bit rate in the dict.

Mine:
00:00.192922
00:00.182526
00:00.196380

Yours:
00:00.092782
00:00.092743
00:00.091568

Yours is a tiny bit faster, but the whole reason I'm using os.walk is because os.walk makes it easy to skip files and folders I don't want processed, simply by removing them from the dirnames list and the filenames list.

Running ffprobe through subprocess is way too slow for a 16 TB hard drive full of movies. I need something faster for getting the bitrate.

2 Answers2

Edit

Update