I have read the argument that sometimes implementing things with CUDA on the GPU takes more time than doing it with the CPU because of:
- The time to allocate device memory
- The time to transfer data to that memory and back
Alright, so I have written a script (two, actually) in which the above costs are excluded from the time measurement. It is not ideal, but I measure only the time consumed by the kernel: no transfer, no allocation.
Also, I use a kernel that does nothing. So no complex operator to delay us.
However, even then the kernel takes 10 times longer than an OpenCV operation done on the CPU.
Here is the PyCUDA script:
import cv2
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import argparse
# Benchmark script: for every frame of a video, upload the frame to the GPU,
# launch a CUDA kernel that performs no work, download the frame again, and
# write it (resized) to an MP4 file.  Only the span between the two CUDA
# events around the kernel launch is accumulated into the reported average.
parser = argparse.ArgumentParser()
parser.add_argument('--show', action='store_true', help="show the video while making")
parser.add_argument('--resize', type=int, default=800, help="if resize is needed")
parser.add_argument('--noconvert', action='store_true', help="avoid rgb conversion")
args = parser.parse_args()
print(args)

# Input H.264 file and output MP4 file; replace with your own paths.
input_video_path = '70secsmovie.h264'
output_video_path = 'output_video_cuda.mp4'

video_capture = cv2.VideoCapture(input_video_path)
if not video_capture.isOpened():
    # SystemExit with a message writes it to stderr and exits with status 1,
    # so callers and shell scripts can detect the failure.
    raise SystemExit("Failed to open the video file.")

# Output width comes from --resize; the height follows the source aspect ratio.
desired_width = args.resize
frame_width = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
# Keep the frame rate as a float: truncating e.g. 29.97 to 29 would change the
# playback speed of the written file.
fps = video_capture.get(cv2.CAP_PROP_FPS)
aspect_ratio = frame_width / frame_height
desired_height = int(desired_width / aspect_ratio)

codec = cv2.VideoWriter_fourcc(*'mp4v')
output_video = cv2.VideoWriter(output_video_path, codec, fps, (desired_width, desired_height))

# Compile the CUDA kernel.  Its body is intentionally empty so that the
# measurement reflects launch cost only.
mod = SourceModule("""
__global__ void draw_rectangle_kernel(unsigned char *image, int image_width, int x, int y, int width, int height, unsigned char *color)
{
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= y && row < y + height && col >= x && col < x + width)
{
// Perform no operation
}
}
""")
draw_rectangle_kernel = mod.get_function("draw_rectangle_kernel")

# One thread per pixel: 16x16 blocks, with the grid rounded up to cover the frame.
block_dim_x, block_dim_y = 16, 16
grid_dim_x = (frame_width + block_dim_x - 1) // block_dim_x
grid_dim_y = (frame_height + block_dim_y - 1) // block_dim_y

# Rectangle properties (modify as desired).
x, y, width, height = 100, 100, 200, 150
color = np.array([0, 255, 0], dtype=np.uint8)

# Upload the colour once, outside the timed region.  Passing cuda.In(color) at
# launch time makes PyCUDA copy the array to the device before every launch,
# which would put a host-to-device transfer inside the "kernel only" timing.
color_gpu = cuda.mem_alloc(color.nbytes)
cuda.memcpy_htod(color_gpu, color)

frame_count = 0
total_kernel_seconds = 0.0
start = cuda.Event()
end = cuda.Event()

# The device frame buffer is allocated on first use and reused afterwards; it
# is reallocated only if a frame with a different byte size shows up.
frame_gpu = None
frame_gpu_nbytes = 0

try:
    while True:
        ret, frame = video_capture.read()
        # A failed read marks the end of the video.
        if not ret:
            break
        frame_count += 1

        if not args.noconvert:
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        else:
            frame_rgb = frame

        # Upload the frame (not timed).
        if frame_gpu is None or frame_gpu_nbytes != frame_rgb.nbytes:
            frame_gpu = cuda.mem_alloc(frame_rgb.nbytes)
            frame_gpu_nbytes = frame_rgb.nbytes
        cuda.memcpy_htod(frame_gpu, frame_rgb)

        # Timed region: only the kernel launch sits between the two events.
        # No synchronize is needed on the start event; waiting on the end
        # event is enough before reading the elapsed time.
        start.record()
        draw_rectangle_kernel(
            frame_gpu,
            np.int32(frame_width),
            np.int32(x),
            np.int32(y),
            np.int32(width),
            np.int32(height),
            color_gpu,
            block=(block_dim_x, block_dim_y, 1),
            grid=(grid_dim_x, grid_dim_y),
        )
        end.record()
        end.synchronize()

        # Download the frame (not timed).
        frame_with_rectangle_rgb = np.empty_like(frame_rgb)
        cuda.memcpy_dtoh(frame_with_rectangle_rgb, frame_gpu)

        # time_till() reports milliseconds; accumulate seconds.
        total_kernel_seconds += start.time_till(end) * 1e-3

        if not args.noconvert:
            frame_with_rectangle_bgr = cv2.cvtColor(frame_with_rectangle_rgb, cv2.COLOR_RGB2BGR)
        else:
            frame_with_rectangle_bgr = frame_with_rectangle_rgb

        resized_frame = cv2.resize(frame_with_rectangle_bgr, (desired_width, desired_height))
        output_video.write(resized_frame)

        if args.show:
            cv2.imshow('Modified Frame', resized_frame)
            # Press 'q' to stop early.
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    # Release the capture/writer even if processing failed part-way, so the
    # output file is finalized and windows are closed.
    video_capture.release()
    output_video.release()
    if args.show:
        cv2.destroyAllWindows()

print("Total frames processed:", frame_count)
if frame_count:
    print("Operation took ", (total_kernel_seconds / frame_count))
else:
    # Avoid dividing by zero when the input yielded no frames.
    print("No frames were read; no average to report.")
And here, for comparison, is the OpenCV script:
import cv2
import argparse
# Benchmark script: for every frame of a video, draw a rectangle with OpenCV on
# the CPU, write the resized frame to an MP4 file, and report the average time
# spent in cv2.rectangle per frame.
parser = argparse.ArgumentParser()
parser.add_argument('--show', action='store_true', help="show the video while making")
args = parser.parse_args()
print(args)

# Input H.264 file and output MP4 file; replace with your own paths.
input_video_path = '70secsmovie.h264'
output_video_path = 'output_video.mp4'

video_capture = cv2.VideoCapture(input_video_path)
if not video_capture.isOpened():
    # SystemExit with a message writes it to stderr and exits with status 1,
    # so callers and shell scripts can detect the failure.
    raise SystemExit("Failed to open the video file.")

# Output width is fixed; the height follows the source aspect ratio.
desired_width = 800
frame_width = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
# Keep the frame rate as a float: truncating e.g. 29.97 to 29 would change the
# playback speed of the written file.
fps = video_capture.get(cv2.CAP_PROP_FPS)
codec = cv2.VideoWriter_fourcc(*'mp4v')
aspect_ratio = frame_width / frame_height
desired_height = int(desired_width / aspect_ratio)

output_video = cv2.VideoWriter(output_video_path, codec, fps, (desired_width, desired_height))

# Rectangle properties (modify as desired).  They do not change between
# frames, so they are defined once outside the loop.
x, y, width, height = 100, 100, 200, 150
top_left = (x, y)
bottom_right = (x + width, y + height)
rectangle_color = (0, 255, 0)
rectangle_thickness = 2

tick_frequency = cv2.getTickFrequency()
frame_count = 0
total_draw_seconds = 0.0

try:
    while True:
        ret, frame = video_capture.read()
        # A failed read marks the end of the video.
        if not ret:
            break
        frame_count += 1

        # Timed region: only the rectangle drawing call.
        start = cv2.getTickCount()
        cv2.rectangle(frame, top_left, bottom_right, rectangle_color, rectangle_thickness)
        end = cv2.getTickCount()
        total_draw_seconds += (end - start) / tick_frequency

        resized_frame = cv2.resize(frame, (desired_width, desired_height))
        output_video.write(resized_frame)

        if args.show:
            cv2.imshow('Modified Frame', resized_frame)
            # Press 'q' to stop early.
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    # Release the capture/writer even if processing failed part-way, so the
    # output file is finalized and windows are closed.
    video_capture.release()
    output_video.release()
    if args.show:
        cv2.destroyAllWindows()

print("Total frames processed:", frame_count)
if frame_count:
    print("Operation took ", (total_draw_seconds / frame_count))
else:
    # Avoid dividing by zero when the input yielded no frames.
    print("No frames were read; no average to report.")
With that, the OpenCV script takes
Total frames processed: 704
Operation took 3.119159232954547e-05
while the PyCUDA script took
Total frames processed: 704
Operation took 0.0003763223639266063
How can pycuda (or cuda for that matter) be useful for image processing?