I have tried splitting the video file into chunks and transcribing each chunk with Whisper, but I have three problems: the segment start times in the 2nd chunk begin again from 0 instead of continuing from where chunk 1 ended; no chunk file is produced when the section is silent; and when the video is split into two chunks, the last sentence of chunk 1 and the first sentence of chunk 2 overlap, so the same sentence appears twice. Please suggest some solutions. Here is my code:
import openai
import math
import os
import subprocess
openai.api_key = "sk-oijfowiejfo"
filename = 'test.mp4'
# Constants
max_bytes = 26214400 # From Whisper error message
overlap_seconds = 5
silence_threshold = -40 # Adjust this threshold as needed (in dB)
# Get the bit rate directly from the file
bit_rate = float(subprocess.check_output(
    ["ffprobe", "-v", "quiet", "-show_entries", "format=bit_rate", "-of",
     "default=noprint_wrappers=1:nokey=1", filename]).strip())
# Estimate the duration of each chunk
chunk_duration_s = (max_bytes * 8.0) / bit_rate * 0.9
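# Rough example of this estimate, assuming a ~1 Mbps file (my numbers, not measured from test.mp4):
# 26214400 bytes * 8 / 1_000_000 bps ≈ 209.7 s, times the 0.9 safety margin ≈ 188.7 s per chunk.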
# Get the duration of the audio file
audio_duration_s = float(subprocess.check_output(
    ["ffprobe", "-v", "quiet", "-show_entries", "format=duration", "-of",
     "default=noprint_wrappers=1:nokey=1", filename]).strip())
# Calculate the number of chunks
num_chunks = math.ceil(audio_duration_s / (chunk_duration_s - overlap_seconds))
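# Example with the assumed numbers above: a 600 s video with ~188.7 s chunks and a 5 s overlap
# gives ceil(600 / (188.7 - 5)) = ceil(3.27) = 4 chunks.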
transcriptions = []
output_folder = "chunks"
os.makedirs(output_folder, exist_ok=True)
# Get the file extension from the filename
file_extension = os.path.splitext(filename)[1]
# Silence detection function
def detect_silence(file):
    # silence_threshold is already negative, so don't prepend another "-"
    cmd = ["ffmpeg", "-i", file, "-af",
           "silencedetect=noise=" + str(silence_threshold) + "dB:d=0.5",
           "-f", "null", "-"]
    result = subprocess.run(cmd, capture_output=True, text=True)
    output = result.stderr  # silencedetect reports on stderr
    silence_segments = []
    for line in output.split('\n'):
        if "silence_start" in line:
            start_time = float(line.split(':')[1])
            silence_segments.append(start_time)
    return silence_segments
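# detect_silence expects ffmpeg to print lines like the following on stderr
# (the hex address varies per run):
# [silencedetect @ 0x...] silence_start: 123.456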
# Split audio file based on silence
def split_audio_file(file, start_time, end_time, chunk_index):
    chunk_file = os.path.join(output_folder, f"chunk_{chunk_index}{file_extension}")
    subprocess.call(["ffmpeg", "-i", file, "-ss", str(start_time), "-to", str(end_time), "-y", chunk_file])
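# For the first chunk this amounts to roughly:
#   ffmpeg -i test.mp4 -ss 0 -to <end_time> -y chunks/chunk_1.mp4
# (-ss/-to are in seconds; ffmpeg re-encodes here since no codec is specified)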
# Perform silence detection and split audio into chunks
silence_segments = detect_silence(filename)
start_time = 0
for i in range(num_chunks):
    end_time = start_time + chunk_duration_s
    # Adjust end time based on silence detection
    for silence_time in silence_segments:
        if start_time < silence_time < end_time:
            end_time = silence_time
            break
    split_audio_file(filename, start_time, end_time, i + 1)
    start_time = end_time - overlap_seconds
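# Because start_time is moved back by overlap_seconds, consecutive chunks share
# about 5 seconds of audio; this is where the duplicated sentence comes from.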
# Transcribe the chunks
previous_chunk_end = 0
for i in range(num_chunks):
    chunk_file = os.path.join(output_folder, f"chunk_{i + 1}{file_extension}")
    with open(chunk_file, "rb") as file:
        chunk_duration = end_time - start_time
        transcription = openai.Audio.transcribe("whisper-1", file, response_format="verbose_json")
        transcriptions.append(transcription)
    # Adjust the start and end times for the next chunk based on the current chunk's duration
    previous_chunk_end = end_time
    start_time = previous_chunk_end - overlap_seconds
    end_time = start_time + chunk_duration_s
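# Note: nothing here shifts the timestamps returned by Whisper, so every chunk's
# segments start again at 0 (chunk_duration is computed but never used); this is
# the timestamp problem described above.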
# Save transcriptions to a file
with open("transcriptionssteve_1.txt", "w") as file:
    for idx, transcription in enumerate(transcriptions):
        file.write(f"Chunk {idx + 1}:\n{transcription}\n\n")
# Combine the transcriptions into a single string
combined_transcription = " ".join(chunk["text"] for chunk in transcriptions)
# Write the combined transcription to a file
with open("transcriptions_combined_steve_1.txt", "w") as file:
    file.write(combined_transcription)
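Because of the overlap, the same sentence appears at the end of one chunk and at the start of the next, so combined_transcription contains it twice, as the output below shows.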
Output (part of chunk 1 and the beginning of chunk 2):
Chunk 1:
{
"avg_logprob": -0.20832061767578125,
"compression_ratio": 1.3933333333333333,
"end": 565.56,
"id": 136,
"no_speech_prob": 1.994591912080068e-06,
"seek": 55456,
"start": 560.4799999999999,
"temperature": 0.0,
"text": " It made an impression on me, and since then, for the past 33 years, I've looked in the",
"tokens": [
467,
1027,
364,
9995,
322,
385,
11,
293,
1670,
550,
11,
337,
264,
1791,
11816,
924,
11,
286,
600,
2956,
294,
264
],
"transient": false
},
{
"avg_logprob": -0.45558141407213715,
"compression_ratio": 1.0704225352112675,
"end": 585.56,
"id": 137,
"no_speech_prob": 0.00018518016440793872,
"seek": 56556,
"start": 565.56,
"temperature": 0.0,
"text": " mirror every morning and asked myself if today were the last day of my life.",
"tokens": [
50364,
8013,
633,
2446,
293,
2351,
2059,
498,
965,
645,
264,
1036,
786,
295,
452,
993,
13,
51364
],
"transient": false
}
],
Chunk 2:
{
"duration": 339.1,
"language": "english",
"segments": [
{
"avg_logprob": -0.22957308330233134,
"compression_ratio": 1.6341463414634145,
"end": 2.7600000000000002,
"id": 0,
"no_speech_prob": 0.0005702849011868238,
"seek": 0,
"start": 0.0,
"temperature": 0.0,
"text": " I've looked in the mirror every morning and asked myself,",
"tokens": [
286,
600,
2956,
294,
264,
8013,
633,
2446,
293,
2351,
2059,
11
],
"transient": false
},
This is the video I used for transcription: https://www.youtube.com/watch?v=UF8uR6Z6KLc&ab_channel=Stanford