I worked on code published on GitHub https://github.com/jrterven/audio-visual-dataset/blob/master/extract_detailed_text_watson.py the code was design to use 5 words but I want to change it to 1 word so I try do that in the code but there is an error in the run as shown below
found 2 files
Processing video: health_news_2.mp4
video resolution: 608 x 1080
video framerate: 29.97002997002997
entry: <class 'dict'> {'link': 'build_Dataset', 'text': 'صورة', 'conf': 0.61, 'start': 2.07, 'end': 2.55, 'bounding_box': []}
s_sec, s_millisec: 2.0 69.99999999999984
entry: <class 'dict'> {'link': 'build_Dataset', 'text': 'مجهرية', 'conf': 0.97, 'start': 2.55, 'end': 3.24, 'bounding_box': []}
s_sec, s_millisec: 2.0 549.9999999999998
/Users/shaimaa/opt/anaconda3/lib/python3.8/site-packages/torch/nn/functional.py:780: UserWarning: Note that order of the arguments: ceil_mode and return_indices will changeto match the args list in nn.MaxPool2d in a future release.
warnings.warn("Note that order of the arguments: ceil_mode and return_indices will change"
entry: <class 'dict'> {'link': 'build_Dataset', 'text': 'مجهرية', 'conf': 0.97, 'start': 2.55, 'end': 3.24, 'bounding_box': [230, 126, 131, 171]}
ffmpeg version 4.4.1 Copyright (c) 2000-2021 the FFmpeg developers
built with Apple clang version 13.0.0 (clang-1300.0.29.3)
configuration: --prefix=/usr/local/Cellar/ffmpeg/4.4.1_3 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags= --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libbluray --enable-libdav1d --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --enable-libsoxr --enable-libzmq --enable-libzimg --disable-libjack --disable-indev=jack --enable-avresample --enable-videotoolbox
libavutil 56. 70.100 / 56. 70.100
libavcodec 58.134.100 / 58.134.100
libavformat 58. 76.100 / 58. 76.100
libavdevice 58. 13.100 / 58. 13.100
libavfilter 7.110.100 / 7.110.100
libavresample 4. 0. 0 / 4. 0. 0
libswscale 5. 9.100 / 5. 9.100
libswresample 3. 9.100 / 3. 9.100
libpostproc 55. 9.100 / 55. 9.100
Input #0, mov,mp4,m4a,3gp,3g2,mj2, from 'build_Dataset/news/health_news_2.mp4':
Metadata:
major_brand : isom
minor_version : 512
compatible_brands: isomiso2avc1mp41
encoder : Lavf58.45.100
Duration: 00:04:50.88, start: 0.000000, bitrate: 603 kb/s
Stream #0:0(und): Video: h264 (High) (avc1 / 0x31637661), yuv420p(tv, bt709), 608x1080 [SAR 1:1 DAR 76:135], 468 kb/s, 29.97 fps, 29.97 tbr, 30k tbn, 59.94 tbc (default)
Metadata:
handler_name : ISO Media file produced by Google Inc.
vendor_id : [0][0][0][0]
Stream #0:1(eng): Audio: aac (LC) (mp4a / 0x6134706D), 44100 Hz, stereo, fltp, 127 kb/s (default)
Metadata:
handler_name : ISO Media file produced by Google Inc.
vendor_id : [0][0][0][0]
Stream mapping:
Stream #0:0 -> #0:0 (h264 (native) -> h264 (libx264))
Stream #0:1 -> #0:1 (copy)
Press [q] to stop, [?] for help
[libx264 @ 0x7f7fe8810800] using SAR=1/1
[libx264 @ 0x7f7fe8810800] using cpu capabilities: MMX2 SSE2Fast SSSE3 SSE4.2 AVX FMA3 BMI2 AVX2
[libx264 @ 0x7f7fe8810800] profile High, level 1.1, 4:2:0, 8-bit
[libx264 @ 0x7f7fe8810800] 264 - core 163 r3060 5db6aa6 - H.264/MPEG-4 AVC codec - Copyleft 2003-2021 - http://www.videolan.org/x264.html - options: cabac=1 ref=3 deblock=1:0:0 analyse=0x3:0x113 me=hex subme=7 psy=1 psy_rd=1.00:0.00 mixed_ref=1 me_range=16 chroma_me=1 trellis=1 8x8dct=1 cqm=0 deadzone=21,11 fast_pskip=1 chroma_qp_offset=-2 threads=5 lookahead_threads=1 sliced_threads=0 nr=0 decimate=1 interlaced=0 bluray_compat=0 constrained_intra=0 bframes=3 b_pyramid=2 b_adapt=1 b_bias=0 direct=1 weightb=1 open_gop=0 weightp=2 keyint=250 keyint_min=25 scenecut=40 intra_refresh=0 rc_lookahead=40 rc=crf mbtree=1 crf=23.0 qcomp=0.60 qpmin=0 qpmax=69 qpstep=4 ip_ratio=1.40 aq=1:1.00
Output #0, mp4, to '/Users/shaimaa/Downloads/LIP_Reading/Code/audio-visual-dataset-master/results_news/news/health_news_2/1.mp4':
Metadata:
major_brand : isom
minor_version : 512
compatible_brands: isomiso2avc1mp41
encoder : Lavf58.76.100
Stream #0:0(und): Video: h264 (avc1 / 0x31637661), yuv420p(tv, bt709, progressive), 130x170 [SAR 1:1 DAR 13:17], q=2-31, 29.97 fps, 30k tbn (default)
Metadata:
handler_name : ISO Media file produced by Google Inc.
vendor_id : [0][0][0][0]
encoder : Lavc58.134.100 libx264
Side data:
cpb: bitrate max/min/avg: 0/0/0 buffer size: 0 vbv_delay: N/A
Stream #0:1(eng): Audio: aac (LC) (mp4a / 0x6134706D), 44100 Hz, stereo, fltp, 127 kb/s (default)
Metadata:
handler_name : ISO Media file produced by Google Inc.
vendor_id : [0][0][0][0]
frame= 21 fps=0.0 q=-1.0 Lsize= 18kB time=00:00:00.67 bitrate= 222.3kbits/s dup=1 drop=0 speed=4.86x
video:5kB audio:11kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 13.600434%
[libx264 @ 0x7f7fe8810800] frame I:1 Avg QP:23.12 size: 1388
[libx264 @ 0x7f7fe8810800] frame P:5 Avg QP:25.05 size: 401
[libx264 @ 0x7f7fe8810800] frame B:15 Avg QP:30.82 size: 85
[libx264 @ 0x7f7fe8810800] consecutive B-frames: 4.8% 0.0% 0.0% 95.2%
[libx264 @ 0x7f7fe8810800] mb I I16..4: 3.0% 80.8% 16.2%
[libx264 @ 0x7f7fe8810800] mb P I16..4: 1.4% 2.6% 0.4% P16..4: 51.3% 19.2% 6.3% 0.0% 0.0% skip:18.8%
[libx264 @ 0x7f7fe8810800] mb B I16..4: 0.0% 0.1% 0.1% B16..8: 31.9% 2.7% 0.5% direct: 0.7% skip:64.0% L0:31.9% L1:64.7% BI: 3.4%
[libx264 @ 0x7f7fe8810800] 8x8 transform intra:76.4% inter:84.5%
[libx264 @ 0x7f7fe8810800] coded y,uvDC,uvAC intra: 75.0% 89.4% 42.3% inter: 7.0% 6.6% 0.3%
[libx264 @ 0x7f7fe8810800] i16 v,h,dc,p: 0% 40% 10% 50%
[libx264 @ 0x7f7fe8810800] i8 v,h,dc,ddl,ddr,vr,hd,vl,hu: 15% 32% 8% 5% 7% 8% 7% 10% 9%
[libx264 @ 0x7f7fe8810800] i4 v,h,dc,ddl,ddr,vr,hd,vl,hu: 21% 26% 9% 7% 7% 10% 4% 10% 6%
[libx264 @ 0x7f7fe8810800] i8c dc,h,v,p: 38% 34% 20% 8%
[libx264 @ 0x7f7fe8810800] Weighted P-Frames: Y:0.0% UV:0.0%
[libx264 @ 0x7f7fe8810800] ref P L0: 68.6% 9.7% 15.4% 6.2%
[libx264 @ 0x7f7fe8810800] ref B L0: 91.5% 5.4% 3.1%
[libx264 @ 0x7f7fe8810800] ref B L1: 89.9% 10.1%
[libx264 @ 0x7f7fe8810800] kb/s:53.25
entry: <class 'dict'> {'link': 'build_Dataset', 'text': 'تظهر', 'conf': 1.0, 'start': 3.24, 'end': 3.66, 'bounding_box': []}
s_sec, s_millisec: 3.0 240.00000000000023
entry: <class 'dict'> {}
Traceback (most recent call last):
File "extract_subvideos.py", line 424, in <module>
main(args)
File "extract_subvideos.py", line 140, in main
s_sec, s_millisec = divmod(float(entry['start']), 1)
KeyError: 'start'
In this part of the code I change in it an only the number of word
fa = FaceAlignment()
videos_directory = args.videos_dir
results_dir = args.results_dir
vids_name = args.category
vid_proc_name = args.log_file
dataset_annotation_file = args.ann_file
if args.save_videos == 'True':
save_videos = True
else:
save_videos = False
# Create video window
cv2.namedWindow('Vid')
# load or create list with processed files
processed_files = []
videos_processed_exists = os.path.isfile(
os.path.join(results_dir, vid_proc_name))
if not videos_processed_exists:
with open(os.path.join(results_dir, vid_proc_name), "w") as fp:
for pfiles in processed_files:
print(pfiles, file=fp)
else:
with open(os.path.join(results_dir, vid_proc_name)) as fp:
processed_files = fp.read().splitlines()
# Create annotation file the first time
annotation_exists = os.path.isfile(os.path.join(
results_dir, dataset_annotation_file))
if not annotation_exists:
try:
with open(os.path.join(
results_dir, dataset_annotation_file), 'w') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
writer.writeheader()
except IOError:
print("Error creating annotaton file. I/O error")
# Get json files list names in videos directory
files_list = []
for ann_file in os.listdir(os.path.join(videos_directory, vids_name)):
if ann_file.endswith(".json"):
files_list.append(ann_file[0:-5])
files_list = natsorted(files_list)
num_files = len(files_list)
print('found', num_files, 'files')
# traverse all the files
stop_videos = False
for file in files_list:
if stop_videos:
break
# check if current video is not in alredy processed
if file in processed_files:
print(file, 'has already been processed. Skipping it.')
continue
num_output_video = 0
# Search for the video files in videos_directory
video_name = file + '.mp4'
print('Processing video:', video_name)
if save_videos:
# create output directory
output_dir = os.path.join(results_dir, vids_name, file)
if not os.path.isdir(output_dir):
os.mkdir(output_dir)
# Load watson results
with open(os.path.join(
videos_directory, vids_name, file + '.json')) as f:
stt_results = json.load(f)
# Extract all the words with confidence >90
words_data = extract_words_from_watson_results(stt_results, max_words=5)
# Start the video capture
cap = cv2.VideoCapture(os.path.join(
videos_directory, vids_name, video_name))
# Extract video metadata
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
fps = cap.get(cv2.CAP_PROP_FPS)
print('video resolution:', width, ' x ', height)
print('video framerate:', fps)
frame_count = 0
fps_processing = 30.0 # fps holder
t = cv2.getTickCount() # initiate the tickCounter
count = 0
for entry in words_data:
# Extract speech to text data
print('entry:', type(entry), entry)
s_sec, s_millisec = divmod(float(entry['start']), 1)
e_sec, e_millisec = divmod(float(entry['end']), 1)
s_min = 0
e_min = 0
s_millisec = s_millisec * 1000
e_millisec = e_millisec * 1000
print('s_sec, s_millisec:', s_sec, s_millisec)
if s_sec >= 60:
s_min = math.floor(s_sec / 60.0)
s_sec = s_sec % 60
if e_sec >= 60:
e_min = math.floor(e_sec / 60.0)
e_sec = e_sec % 60
# Determine video frames involved in stt entry
min_frame = s_min*fps*60 + (s_sec*fps)
max_frame = e_min*fps*60 + (e_sec*fps)
# go to min_frame
cap.set(cv2.CAP_PROP_POS_FRAMES, min_frame)
frame_count = min_frame
# read frames from min_frame to max_frame
num_people = 0
valid_video = True
landmarks = []
angles = []
consecutive_frames_no_people = 0
while frame_count < max_frame:
if count == 0:
t = cv2.getTickCount()
# capture next frame
ret, frame = cap.read()
if not ret:
continue
frame_count += 1
# resize frame for faster processing
if frame.shape[0] <= 0 or frame.shape[1] <= 0:
continue
frame_small = cv2.resize(frame, (0, 0), fx=scale, fy=scale,
interpolation=cv2.INTER_LINEAR)
# detect faces and landmarjs
fa.update_features(frame_small)
landmarks.append(fa.get_mouth_features(scale=scale))
num_people = fa.get_num_people()
angles.append(fa.get_yaw())
# if it detects less than or more than 1 person
# go to next subtitle
if num_people != 1:
consecutive_frames_no_people += 1
if consecutive_frames_no_people >= max_bad_frames:
print(consecutive_frames_no_people,
' frames without 1 person. Skiping to next subtitle')
valid_video = False
break
# if only one person in the scene
if num_people == 1:
consecutive_frames_no_people = 0
fa.renderMouth(frame_small)
# Put fps at which we are processing camera feed on frame
cv2.putText(frame_small, "{0:.2f}-fps".format(fps_processing),
(50, height-50), cv2.FONT_HERSHEY_COMPLEX,
1, (0, 0, 255), 2)
# Display the image
cv2.imshow('Vid',frame_small)
# Read keyboard and exit if ESC was pressed
k = cv2.waitKey(1) & 0xFF
if k ==27:
exit()
elif k == ord('q'):
stop_videos = True
# increment frame counter
count = count + 1
# calculate fps at an interval of 100 frames
if (count == 30):
t = (cv2.getTickCount() - t)/cv2.getTickFrequency()
fps_processing = 30.0/t
count = 0
# if this was a valid video
if valid_video and len(landmarks) > 0:
num_output_video += 1
entry['mouth3d'] = landmarks
entry['angle'] = angles
if save_videos:
s_hr = 0
e_hr = 0
if s_min >= 60:
s_hr = math.floor(s_min / 60)
s_min = s_min % 60
if e_min >= 60:
e_hr = math.floor(e_min / 60)
e_min = e_min % 60
# cut and crop video
# ffmpeg -i input.mp4 -ss hh:mm:ss -filter:v crop=w:h:x:y -c:a copy -to hh:mm:ss output.mp4
ss = "{0:02d}:{1:02d}:{2:02d}.{3:03d}".format(
s_hr, s_min, int(s_sec), math.ceil(s_millisec))
es = "{0:02d}:{1:02d}:{2:02d}.{3:03d}".format(
e_hr, e_min, int(e_sec), math.ceil(e_millisec))
crop = "crop={0:1d}:{1:1d}:{2:1d}:{3:1d}".format(
bbw, bbh, bbx1, bby1)
out_name = os.path.join(output_dir, str(num_output_video))
subprocess.call(['ffmpeg', #'-hide_banner', '-loglevel', 'panic',
'-i', os.path.join(
videos_directory, vids_name, video_name),
'-ss', ss,
'-filter:v', crop, '-c:a', 'copy',
'-to', es, out_name +'.mp4'])
# save recognized speech
text_file = open(out_name +'.txt', "w")
text_file.write(entry['text'] + '\n')
text_file.write(str(entry['conf']))
text_file.close()
# append results to annotation file
append_annotation_file(os.path.join(
results_dir, dataset_annotation_file), words_data)
# save name of processed file
processed_files.append(file)
with open(os.path.join(results_dir, vid_proc_name), "w") as fp:
for p_file in processed_files:
print(p_file, file=fp)
# Release resources
cap.release()
cv2.destroyAllWindows()
def extract_text_conf_ts(s_idx, max_words, num_words, timestamps, conf, link):
text = ''
avg_conf = 0
start = timestamps[int(s_idx * max_words)][1]
end = timestamps[int(s_idx * max_words + num_words-1)][2]
for w_idx in range(num_words):
text = text + ' ' + timestamps[int(s_idx*max_words + w_idx)][0]
avg_conf += conf[int(s_idx*max_words + w_idx)][1]
avg_conf = round(avg_conf/num_words, 2)
if len(text.strip()) >= 4:
out_entry = {'link': link, 'text': text.strip(), 'conf': avg_conf,
'start':start, 'end': end, 'mouth3d': [],
'angle': [] }
else:
out_entry = {}
return out_entry
def extract_words_from_watson_results(stt_results, max_words=5):
data = stt_results['results']
link = stt_results['link']
link = link.rsplit('/', 1)[-1]
out_data = []
for sentence_idx, ann in enumerate(data):
data_ann = ann['alternatives'][0]
text = data_ann['transcript']
conf = data_ann['word_confidence']
timestamps = data_ann['timestamps']
num_words = len(timestamps)
num_splits = num_words//max_words
rest = num_words%max_words
if num_words < max_words:
maxx_words = num_words
else:
maxx_words = max_words
for s_idx in range(num_splits):
out_entry = extract_text_conf_ts(s_idx, maxx_words, maxx_words,
timestamps, conf, link)
out_data.append(out_entry)
if rest > 0:
out_entry = extract_text_conf_ts(num_splits, maxx_words, rest,
timestamps, conf, link)
if out_entry:
out_data.append(out_entry)
return out_data
def append_annotation_file(csv_file, data):
try:
with open(csv_file, 'a') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
for entry in data:
writer.writerow(entry)
except IOError:
print("I/O error")
def remove_accents(input_str):
nfkd_form = unicodedata.normalize('NFKD', input_str)
only_ascii = nfkd_form.encode('ASCII', 'ignore')
return only_ascii
if __name__== "__main__":
# Parse input arguments
parser = argparse.ArgumentParser(description='Extract subvideos')
parser.add_argument('--dir', dest='videos_dir',
help='Directory with videos', type=str)
parser.add_argument('--cat', dest='category',
help='Video category', type=str)
parser.add_argument('--vids_log', dest='log_file',
help='Name of log file', type=str)
parser.add_argument('--results_dir', dest='results_dir',
help='Directory with results', type=str)
parser.add_argument('--ann_file', dest='ann_file',
help='Annotations file (csv)', type=str)
parser.add_argument('--save_videos', dest='save_videos',
help='Save videos', type=str, default='False')
args = parser.parse_args()
main(args)