I am splitting a video file into chunks and transcribing each chunk with Whisper, but I am running into three problems: the segment timestamps in the second chunk start again from 0 instead of continuing from where the first chunk ended; sometimes a chunk file is not created at all when the split point falls in a silent region; and because the chunks overlap, the last sentence of chunk 1 is repeated as the first sentence of chunk 2. Can anyone suggest solutions? Here is my code:

import openai
import math
import os
import subprocess

openai.api_key = "sk-oijfowiejfo"
filename = 'test.mp4'

# Constants
max_bytes = 26214400  # From Whisper error message
overlap_seconds = 5
silence_threshold = -40  # Adjust this threshold as needed (in dB)

# Get the bit rate directly from the file
bit_rate = float(subprocess.check_output(
    ["ffprobe", "-v", "quiet", "-show_entries", "format=bit_rate", "-of",
     "default=noprint_wrappers=1:nokey=1", filename]).strip())

# Estimate the duration of each chunk
chunk_duration_s = (max_bytes * 8.0) / bit_rate * 0.9
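# For example, with a 1 Mbps stream this works out to roughly
# (26214400 * 8 / 1e6) * 0.9 ≈ 189 seconds per chunk.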

# Get the duration of the audio file
audio_duration_s = float(subprocess.check_output(
    ["ffprobe", "-v", "quiet", "-show_entries", "format=duration", "-of",
     "default=noprint_wrappers=1:nokey=1", filename]).strip())

# Calculate the number of chunks
num_chunks = math.ceil(audio_duration_s / (chunk_duration_s - overlap_seconds))

transcriptions = []

output_folder = "chunks"
os.makedirs(output_folder, exist_ok=True)

# Get the file extension from the filename
file_extension = os.path.splitext(filename)[1]

# Silence detection: run ffmpeg's silencedetect filter and parse its stderr
def detect_silence(file):
    # silence_threshold is already negative, so it must not get another "-"
    cmd = ["ffmpeg", "-i", file, "-af",
           f"silencedetect=noise={silence_threshold}dB:d=0.5",
           "-f", "null", "-"]
    result = subprocess.run(cmd, capture_output=True, text=True)
    output = result.stderr
    silence_segments = []
    # Typical stderr line (exact format can vary between ffmpeg versions):
    #   [silencedetect @ 0x55d3...] silence_start: 241.487
    for line in output.split('\n'):
        if "silence_start" in line:
            start_time = float(line.split("silence_start:")[1])
            silence_segments.append(start_time)
    return silence_segments

# Split audio file based on silence
def split_audio_file(file, start_time, end_time, chunk_index):
    chunk_file = os.path.join(output_folder, f"chunk_{chunk_index}{file_extension}")
    subprocess.call(["ffmpeg", "-i", file, "-ss", str(start_time), "-to", str(end_time), "-y", chunk_file])
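# Note: with -ss placed after -i (as above), ffmpeg decodes from the start of
# the file for every chunk, which is accurate but slow. Moving -ss before -i
# is generally much faster, but then -to is measured from the seek point
# rather than from the start of the file.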

# Perform silence detection and split the video into chunks
silence_segments = detect_silence(filename)
start_time = 0
for i in range(num_chunks):
    end_time = start_time + chunk_duration_s
    # Snap the end time to the last detected silence before it, so the chunk
    # ends at a pause instead of mid-sentence (taking the first silence in
    # range can produce a nearly empty chunk)
    candidates = [t for t in silence_segments if start_time < t < end_time]
    if candidates:
        end_time = max(candidates)
    split_audio_file(filename, start_time, end_time, i + 1)
    start_time = end_time - overlap_seconds

# Transcribe the chunks. Note that Whisper returns timestamps relative to
# the start of each chunk file, not to the original video.
for i in range(num_chunks):
    chunk_file = os.path.join(output_folder, f"chunk_{i + 1}{file_extension}")
    with open(chunk_file, "rb") as file:
        transcription = openai.Audio.transcribe(
            "whisper-1", file, response_format="verbose_json")
        transcriptions.append(transcription)

# Save transcriptions to a file
with open("transcriptionssteve_1.txt", "w") as file:
    for idx, transcription in enumerate(transcriptions):
        file.write(f"Chunk {idx + 1}:\n{transcription}\n\n")

# Combine the transcriptions into a single string
combined_transcription = " ".join(chunk["text"] for chunk in transcriptions)

# Write the combined transcription to a file
with open("transcriptions_combined_steve_1.txt", "w") as file:
    file.write(combined_transcription)
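One idea I have been considering for the timestamp problem: since Whisper's timestamps are relative to each chunk file, they could be shifted into global video time after transcription. Below is only a sketch; chunk_start_times is a hypothetical list that I would have to record in the splitting loop (the start_time passed to split_audio_file for each chunk):

# Sketch: shift each chunk's segment timestamps into global video time.
# chunk_start_times is hypothetical -- it would need to be recorded in the
# splitting loop above (the start_time of each chunk, in seconds).
for i, transcription in enumerate(transcriptions):
    offset = chunk_start_times[i]
    for segment in transcription["segments"]:
        segment["start"] += offset
        segment["end"] += offset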

Output from the main script above (end of chunk 1, then start of chunk 2):

Chunk 1 (last segments shown):
    {
      "avg_logprob": -0.20832061767578125,
      "compression_ratio": 1.3933333333333333,
      "end": 565.56,
      "id": 136,
      "no_speech_prob": 1.994591912080068e-06,
      "seek": 55456,
      "start": 560.4799999999999,
      "temperature": 0.0,
      "text": " It made an impression on me, and since then, for the past 33 years, I've looked in the",
      "tokens": [
        467,
        1027,
        364,
        9995,
        322,
        385,
        11,
        293,
        1670,
        550,
        11,
        337,
        264,
        1791,
        11816,
        924,
        11,
        286,
        600,
        2956,
        294,
        264
      ],
      "transient": false
    },
    {
      "avg_logprob": -0.45558141407213715,
      "compression_ratio": 1.0704225352112675,
      "end": 585.56,
      "id": 137,
      "no_speech_prob": 0.00018518016440793872,
      "seek": 56556,
      "start": 565.56,
      "temperature": 0.0,
      "text": " mirror every morning and asked myself if today were the last day of my life.",
      "tokens": [
        50364,
        8013,
        633,
        2446,
        293,
        2351,
        2059,
        498,
        965,
        645,
        264,
        1036,
        786,
        295,
        452,
        993,
        13,
        51364
      ],
      "transient": false
    }
  ],

Chunk 2:
{
  "duration": 339.1,
  "language": "english",
  "segments": [
    {
      "avg_logprob": -0.22957308330233134,
      "compression_ratio": 1.6341463414634145,
      "end": 2.7600000000000002,
      "id": 0,
      "no_speech_prob": 0.0005702849011868238,
      "seek": 0,
      "start": 0.0,
      "temperature": 0.0,
      "text": " I've looked in the mirror every morning and asked myself,",
      "tokens": [
        286,
        600,
        2956,
        294,
        264,
        8013,
        633,
        2446,
        293,
        2351,
        2059,
        11
      ],
      "transient": false
    },

This is the video I used for transcription: https://www.youtube.com/watch?v=UF8uR6Z6KLc&ab_channel=Stanford
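For the overlap problem, I am wondering whether I could drop any segment of a later chunk that starts inside the overlap window already covered by the previous chunk, once the timestamps have been shifted into global time. Again only a sketch, reusing the hypothetical chunk_start_times list from above:

def merge_transcriptions(transcriptions, chunk_start_times, overlap_seconds):
    # Assumes segment timestamps were already shifted into global video time.
    merged = []
    for i, t in enumerate(transcriptions):
        # Every chunk after the first repeats the last overlap_seconds of the
        # previous chunk, so skip segments that start inside that window.
        cutoff = chunk_start_times[i] + overlap_seconds if i > 0 else 0.0
        for seg in t["segments"]:
            if seg["start"] >= cutoff:
                merged.append(seg["text"].strip())
    return " ".join(merged)

This is only approximate, since a segment can straddle the overlap boundary. Would that be good enough, or is there a better way to deduplicate the overlapping sentences?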
