
I'm trying to run the coco-ssd model using TensorFlow.js in a React app (NOT React Native).

I trialled it with images and got the desired outcome, but now I want to run it on a video so I can track certain objects throughout the video.

The issue I am facing right now is that the model only returns a single prediction for the entire video. It seems it registers the first frame of the video and then nothing more.

In the component I am using to run this I have two video elements. Video1 is the video preview from the file upload. Right next to it I have video2, which is overlaid with a canvas element so that I can draw the bounding boxes, etc.

Initially I tried to simply pass the video1 element to the model, but this did nothing. Then I did some research into one of the TensorFlow.js demos and saw they were capturing the stream from the video element and assigning it to the srcObject of video2, as sketched below.
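
In essence the pattern looks like this (a minimal sketch; `video1` and `video2` stand in for the two elements):

// Mirror video1's frames into video2 via a captured MediaStream
const stream = video1.captureStream();
video2.srcObject = stream;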

This has worked, but so far it only captures a single prediction for the first frame. The component takes a dataFile (the raw result from an input element) and the already preloaded TensorFlow.js models as props.

Here is my component:

import React, { useEffect, useRef, useState } from "react";
import { connect } from "react-redux";
import Loader from "../loader";

function ResultsComponent(props) {
  const { dataFile, models } = props;
  const videoRef = useRef(null);
  const videoStreamRef = useRef(null);
  const canvasRef = useRef(null);
  const [videoObj, setVideoObj] = useState(null);
  const [changingVideo, setChangingVideo] = useState(false);
  const [videoPlaying, setVideoPlaying] = useState(false);
  const [streamReady, setStreamReady] = useState(false);

  useEffect(() => {
    async function runPredictions() {
      if (videoPlaying && videoObj && streamReady && videoStreamRef.current) {
        const { coco_ssd } = models;
        const _videoStream = videoStreamRef.current;
        const predictions = await coco_ssd.detect(_videoStream);
        console.log(predictions);
      }
    }

    runPredictions();
  }, [videoPlaying, videoObj, models, streamReady]);

  useEffect(() => {
    async function detect() {
      if (videoObj) {
        const _video = videoRef.current;
        const _videoStream = videoStreamRef.current;

        _video.onloadeddata = () => {
          _videoStream.srcObject = _video.captureStream(0);
          setStreamReady(true);
        };

        _video.onplay = () => {
          console.log("Started");
          setVideoPlaying(true);
        };

        _video.onended = () => {
          console.log("ended");
          setVideoPlaying(false);
          setStreamReady(false);
        };

        _video.onpause = () => {
          console.log("paused");
          setVideoPlaying(false);
        };

        const videoWidth = _videoStream.width;
        const videoHeight = _videoStream.height;

        // Set canvas width
        canvasRef.current.width = videoWidth;
        canvasRef.current.height = videoHeight;

        _video.src = URL.createObjectURL(videoObj);
        _video.load();
      }
    }

    detect();
  }, [videoObj]);

  useEffect(() => {
    if (dataFile) {
      setVideoPlaying(false);
      setChangingVideo(true);
      setVideoObj(dataFile);
      setChangingVideo(false);
    }
  }, [dataFile, setChangingVideo, setVideoObj]);

  return (
    <div>
      {changingVideo ? (
        <Loader />
      ) : (
        <>
          {videoObj ? (
            <div>
              <video
                id="video-input-video-src"
                ref={videoRef}
                controls
                width="440"
                height="480"
                muted
                style={{
                  position: "absolute",
                  marginLeft: "auto",
                  marginRight: "auto",
                  top: "1em",
                  left: "10px",
                  textAlign: "left",
                  zIndex: 9,
                }}
              />
              <video
                id="video-input-video-stream"
                ref={videoStreamRef}
                width="440"
                height="480"
                playsInline
                autoPlay
                muted
                style={{
                  position: "absolute",
                  marginLeft: "auto",
                  marginRight: "auto",
                  top: "1em",
                  right: "10px",
                  textAlign: "right",
                  zIndex: 9,
                }}
              />

              <canvas
                id="webcam-input-canvas"
                ref={canvasRef}
                style={{
                  position: "absolute",
                  marginLeft: "auto",
                  marginRight: "auto",
                  top: "1em",
                  right: "10px",
                  textAlign: "right",
                  zIndex: 9,
                  width: 440,
                  height: 420,
                }}
              />
            </div>
          ) : null}
        </>
      )}
    </div>
  );
}

const mapStateToProps = (state) => {
  const { models } = state;
  return { models };
};

export default connect(mapStateToProps, null)(ResultsComponent);

Any ideas regarding this would be much appreciated!

hyprstack
  • Can't you use `setInterval` to run the prediction code: check if the video is loaded, running, etc., then get the current frame and pass it to the model. I also found [this answer](https://stackoverflow.com/questions/17044567/get-frame-change-in-video-html5), but I believe 16ms may be too fast for the model. – Mohamed abdelmagid Sep 23 '21 at 09:04
  • I can use `setInterval`, but would rather avoid it if possible; given it can be done without `setInterval`, I would prefer to do it that way. I would also like to understand why my approach isn't working as expected. – hyprstack Sep 23 '21 at 09:06

2 Answers


The module can take a video element as input, but that is only a wrapper - it draws a snapshot onto a temp canvas, reads the pixels from it and runs the analysis.

And there is nothing to trigger it on the next frame; it only gets triggered once when the video is loaded.

You need to run a loop, either using a fixed timer with setTimeout() (not recommended), or running the next frame as soon as detection completes using setImmediate() (ok) or requestAnimationFrame() (recommended).
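
A minimal sketch of such a loop, assuming `model` is a loaded coco-ssd instance and `video` is the playing video element (both names are placeholders):

// Run detection on the current frame, then schedule the next pass
async function detectLoop(model, video) {
  if (video.paused || video.ended) return; // stop when playback stops
  const predictions = await model.detect(video); // analyses the current frame
  console.log(predictions); // draw bounding boxes here instead
  requestAnimationFrame(() => detectLoop(model, video));
}

// Kick off once playback starts
video.onplay = () => detectLoop(model, video);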

Vladimir Mandic
  • I have noticed that the bbox coordinates I get for the video input do not match those of the displayed video. I am setting width and height on my video element, but wonder if the bbox coords correspond to the temp canvas used by the model, and if those relate to the video's original size rather than the values I set? – hyprstack Sep 23 '21 at 12:54
  • No, `bbox` is not dependent on the internally used temp canvas. It can either be `raw`, meaning a relative float value in the range `0..1`, or scaled to the input. However, do check the format of the `bbox` - I don't use pre-packaged libraries so I don't know for `coco-ssd`, but quite often the tfjs `bbox` format is `[y1,x1,y2,x2]`, NOT `[x,y,width,height]` – Vladimir Mandic Sep 23 '21 at 18:33
  • Got it working. It was using the original video width and height. I had to scale the resulting coords from the bbox by the aspect ratio of the original video and my defined width for my video output... I will post my updated code as an answer for anyone else coming across this issue. – hyprstack Sep 23 '21 at 18:40

Edit note: Following Vladimir's comment about race conditions, I added a minimum elapsed time of 350ms between detections and a busy flag.

Got it working in the end.

Ended up listening to the `ontimeupdate` event of the video to run the prediction.

Also had to calculate the ratio between the original video width and my video element width so that I could maintain the aspect ratio when drawing bounding boxes around the predicted objects, as sketched below.
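
The scaling boils down to something like this (a rough sketch; `videoEl`, `bbox` and `ctx` are placeholders for the video element, a single prediction's box, and the canvas context):

// coco-ssd returns bbox as [x, y, width, height] in the video's intrinsic pixels,
// so scale each value from the intrinsic size to the displayed size
const xRatio = videoEl.width / videoEl.videoWidth; // displayed width / intrinsic width
const yRatio = xRatio; // same ratio on both axes keeps the aspect ratio
const [x, y, w, h] = bbox;
ctx.strokeRect(x * xRatio, y * yRatio, w * xRatio, h * yRatio);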

The updated result is as follows:

import React, { useEffect, useRef, useState } from "react";
import { connect } from "react-redux";
import Loader from "../loader";

function ResultsComponent(props) {
  const { dataFile, models } = props;
  const videoRef = useRef(null);
  const videoStreamRef = useRef(null);
  const canvasRef = useRef(null);
  const [videoObj, setVideoObj] = useState(null);
  const [changingVideo, setChangingVideo] = useState(false);
  const [videoPlaying, setVideoPlaying] = useState(false);
  const [streamReady, setStreamReady] = useState(false);
  const [xRatio, setXratio] = useState(1);
  const [yRatio, setYratio] = useState(1);
  const busyRef = useRef(false); // a ref, not state, so the ontimeupdate closure always sees the current value

  useEffect(() => {
    function drawBbox(predictions) {
      const canvas = canvasRef.current;
      const ctx = canvas.getContext("2d");
      // Clear the previous frame's boxes once, before drawing the new ones
      ctx.clearRect(0, 0, canvas.width, canvas.height);

      predictions.forEach((pred) => {
        const { bbox, class: _class, score } = pred;
        // Scale the bbox from the video's intrinsic size to the displayed size
        const [x, y, width, height] = bbox;
        const _x = x * xRatio;
        const _y = y * yRatio;
        const _width = width * xRatio;
        const _height = height * yRatio;

        ctx.lineWidth = 3;
        ctx.strokeStyle = "#ec0707";
        ctx.font = "18px serif";
        ctx.fillStyle = "#ec0707";

        ctx.strokeRect(_x, _y, _width, _height);
        ctx.fillText(
          `${_class} ${(score * 100).toFixed(2)}%`,
          _x,
          _y - 10,
          _width
        );
      });
    }

    async function runPredictions() {
      if (videoPlaying && videoObj && streamReady && videoStreamRef.current) {
        const { coco_ssd } = models;
        const _videoStream = videoStreamRef.current;
        let lastTime = -1;

        _videoStream.ontimeupdate = async (event) => {
          const { target } = event;
          const { currentTime } = _videoStream;

          // Throttle to roughly one detection per 350ms and skip if one is already in flight
          if (currentTime > lastTime + 0.35 && !busyRef.current) {
            lastTime = currentTime;
            busyRef.current = true;
            const predictions = await coco_ssd.detect(target);
            drawBbox(predictions);
            busyRef.current = false;
          }
        };
      }
    }

    runPredictions();
  }, [videoPlaying, videoObj, models, streamReady, xRatio, yRatio]);

  useEffect(() => {
    async function detect() {
      if (videoObj) {
        const _video = videoRef.current;
        const _videoStream = videoStreamRef.current;

        _video.onloadeddata = () => {
          const widthRatio = _video.width / _video.videoWidth;
          const _height = widthRatio * _video.videoHeight;
          _videoStream.height = _height;
          canvasRef.current.height = _height;
          console.log(_video.videoWidth);
          console.log(_video.videoHeight);
          setXratio(widthRatio);
          setYratio(_height / _video.videoHeight);
          setStreamReady(true);
          _videoStream.srcObject = _video.captureStream(0);
        };

        _video.onplay = () => {
          console.log("Started");
          setVideoPlaying(true);
          _videoStream.play();
          // Set canvas width
          canvasRef.current.width = _videoStream.width;
          canvasRef.current.height = _videoStream.height;
        };

        _video.onended = () => {
          console.log("ended");
          _videoStream.pause();
          // Do clean-up
          _videoStream.currentTime = 0;
          _videoStream.srcObject = null;
          setVideoPlaying(false);
          const canvas = canvasRef.current;
          const ctx = canvas.getContext("2d");
          ctx.clearRect(0, 0, canvas.width, canvas.height);
        };

        _video.onpause = () => {
          console.log("paused");
          setVideoPlaying(false);
          _videoStream.pause();
        };

        _video.src = URL.createObjectURL(videoObj);
        _video.load();
      }
    }

    detect();
  }, [videoObj]);

  useEffect(() => {
    if (dataFile) {
      setVideoPlaying(false);
      setChangingVideo(true);
      setVideoObj(dataFile);
      setChangingVideo(false);
    }
  }, [dataFile, setChangingVideo, setVideoObj]);

  return (
    <div>
      {changingVideo ? (
        <Loader />
      ) : (
        <>
          {videoObj ? (
            <div>
              <video
                id="video-input-video-src"
                ref={videoRef}
                controls
                width="480"
                height="440"
                muted
                style={{
                  position: "absolute",
                  marginLeft: "auto",
                  marginRight: "auto",
                  top: "10px",
                  left: "10px",
                  textAlign: "left",
                  zIndex: 9,
                }}
              />
              <video
                id="video-input-video-stream"
                ref={videoStreamRef}
                width="480"
                playsInline
                muted
                style={{
                  position: "absolute",
                  marginLeft: "auto",
                  marginRight: "auto",
                  top: "6em",
                  right: "10px",
                  textAlign: "right",
                  zIndex: 9,
                }}
              />

              <canvas
                id="webcam-input-canvas"
                ref={canvasRef}
                style={{
                  position: "absolute",
                  marginLeft: "auto",
                  marginRight: "auto",
                  top: "6em",
                  right: "10px",
                  textAlign: "right",
                  zIndex: 9,
                  width: 480,
                }}
              />
            </div>
          ) : null}
        </>
      )}
    </div>
  );
}

const mapStateToProps = (state) => {
  const { models } = state;
  return { models };
};

export default connect(mapStateToProps, null)(ResultsComponent);
hyprstack
  • `video.ontimeupdate` will trigger a new detection regardless of detection status - it's basically a guaranteed race condition where multiple detections run in parallel for no reason. At a minimum, implement a **busy** flag that you set before calling detect and clear after, so you can skip detect calls while busy – Vladimir Mandic Sep 23 '21 at 20:06