I am conducting an experiment in which 3 fixed, synchronized cameras record an individual moving in a limited space (entirely covered by the field of view of each camera). I use a pre-trained AI model to detect more than 20 pose landmarks in every frame, which amounts to hundreds of 2D points per video. My goal is to represent these landmarks as a 3D point cloud using OpenCV (Python). I know this is feasible with 2 cameras using cv2.triangulatePoints(). I used this method to obtain one 3D point set for each possible pair of cameras (camera 1 - camera 2, camera 2 - camera 3, ...), but it seems these sets cannot easily be merged, because they are expressed in different coordinate systems. How can I "triangulate" 3D points using the data from all 3 cameras, rather than from an arbitrary pair of them?
I tried to merge the 3D sets using the extrinsic matrices obtained through cv2.recoverPose(), only to realize that this is not feasible. I then ran cv2.solvePnP() and got a rotation and a translation, but I do not know exactly what to make of them.
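For reference, my current understanding (which may be wrong) is that cv2.solvePnP() returns the transform from world (object) coordinates to camera coordinates, so the camera's pose in world coordinates should be its inverse. A minimal sketch of how I interpret its output, where objectPoints (Nx3) and imagePoints (Nx2) are hypothetical correspondences and K is the intrinsic matrix:

import cv2
import numpy as np

# objectPoints: Nx3 world coordinates; imagePoints: matching Nx2 pixel coordinates (hypothetical)
ok, rvec, tvec = cv2.solvePnP(objectPoints, imagePoints, K, None)
R, _ = cv2.Rodrigues(rvec)  # rvec is a Rodrigues rotation vector
# x_cam = R @ x_world + tvec, i.e. [R|t] maps world coordinates to camera coordinates
Rt = np.vstack((np.hstack((R, tvec)), [[0, 0, 0, 1]]))
cameraPoseInWorld = np.linalg.inv(Rt)  # camera-to-world transform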
Here is the type of code I use:
import cv2
import numpy as np

def getProjectionMat(pts_l, pts_r, K_l, K_r):
    # Normalize the 2D correspondences with each camera's intrinsics
    pts_l_norm = cv2.undistortPoints(np.expand_dims(pts_l, axis=1), cameraMatrix=K_l, distCoeffs=None)
    pts_r_norm = cv2.undistortPoints(np.expand_dims(pts_r, axis=1), cameraMatrix=K_r, distCoeffs=None)
    # Estimate the essential matrix and recover the relative pose (R, t)
    E, mask = cv2.findEssentialMat(pts_l_norm, pts_r_norm, focal=1.0, pp=(0., 0.), method=cv2.RANSAC, prob=0.999, threshold=3.0)
    points, R, t, mask = cv2.recoverPose(E, pts_l_norm, pts_r_norm)
    # Left camera at the origin, right camera at [R|t]
    M_r = np.hstack((R, t))
    M_l = np.hstack((np.eye(3, 3), np.zeros((3, 1))))
    P_l = np.dot(K_l, M_l)
    P_r = np.dot(K_r, M_r)
    return P_l, P_r, R, t

def get3DPoints(points1, points2, P1, P2):
    # Triangulate to homogeneous coordinates, then dehomogenize
    point_4d_hom = cv2.triangulatePoints(P1, P2, np.expand_dims(points1, axis=1), np.expand_dims(points2, axis=1))
    point_4d = point_4d_hom / np.tile(point_4d_hom[-1, :], (4, 1))
    return point_4d[:3, :].T
# ptpL1 are 2D points of camera 1, ptpL2 are corresponding points in camera 2, etc.
# K1, K2, etc. are intrinsic matrices
P1_1, P2_1, xR1, xt1 = getProjectionMat(ptpL1, ptpL2, K1, K2)  # projection between cams 1 and 2
P1_2, P2_2, xR2, xt2 = getProjectionMat(ptpL2, ptpL3, K2, K3)  # ... between cams 2 and 3
P1_3, P2_3, xR3, xt3 = getProjectionMat(ptpL1, ptpL3, K1, K3)  # ... between cams 1 and 3
# get2DPointForCam(x, markOfInterest) returns the 2D coordinates of the given landmark for cam x
for markOfInterest in marksOfInterest:
    point2D1 = get2DPointForCam(1, markOfInterest)
    point2D2 = get2DPointForCam(2, markOfInterest)
    point2D3 = get2DPointForCam(3, markOfInterest)
    point3D1 = get3DPoints(point2D1, point2D2, P1_1, P2_1)[0]
    point3D2 = get3DPoints(point2D2, point2D3, P1_2, P2_2)[0]
    point3D3 = get3DPoints(point2D1, point2D3, P1_3, P2_3)[0]
    # problem: point3D1 and point3D3 are close, but point3D2 has a completely different value.
    # Still, when projected onto camera 2, it is well positioned.
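I suspect part of the problem is that cv2.recoverPose() only recovers the translation up to an unknown scale (the returned t has unit norm), so each camera pair's reconstruction lives in its own, arbitrarily scaled coordinate system. A quick sanity check, sketched with the variables above:

# These norms are all ~1.0 whatever the true baselines between the cameras,
# so the three pairwise reconstructions cannot share a metric scale.
print(np.linalg.norm(xt1), np.linalg.norm(xt2), np.linalg.norm(xt3))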
UPDATE: following @Micka's suggestion, I (re)tried to use cv2.solvePnP() to compute each camera's pose relative to some pose landmarks, hoping that this would allow me to convert the triangulations of each camera pair into a common coordinate system. However, this fails again (see the following code, where the problematic results are commented):
# getProjectionMat() is unchanged from above
def triangulate3DPoints(points1, points2, P1, P2):
    # Same as get3DPoints() above: triangulate, then dehomogenize
    point_4d_hom = cv2.triangulatePoints(P1, P2, np.expand_dims(points1, axis=1), np.expand_dims(points2, axis=1))
    point_4d = point_4d_hom / np.tile(point_4d_hom[-1, :], (4, 1))
    return point_4d[:3, :].T

def getPoseMatrixFromPNP(points3D, points2D, K):
    # solvePnP returns the world-to-camera transform (rvec, tvec)
    _, vector_rotation, vector_translation = cv2.solvePnP(np.array(points3D), np.array(points2D), K, np.zeros((4, 1)), flags=cv2.SOLVEPNP_ITERATIVE)
    matrix_rotation, _ = cv2.Rodrigues(vector_rotation)
    # Invert to obtain the camera-to-world pose
    return np.linalg.inv(np.vstack((np.hstack((matrix_rotation, vector_translation)), np.array([[0, 0, 0, 1]]))))

def toHomogenous(vec):
    return np.array([vec[0], vec[1], vec[2], 1])

def project_3D_point_to_2D(point_3D, P):
    point_3D_homogeneous = np.hstack((point_3D, np.ones(1)))
    point_2D_homogeneous = np.dot(P, point_3D_homogeneous.T).T
    point_2D = point_2D_homogeneous[:2] / point_2D_homogeneous[2]
    return point_2D
P1_1, P2_1, xR1, xt1 = getProjectionMat(ptpL1, ptpL2, K1, K2)
P1_2, P2_2, xR2, xt2 = getProjectionMat(ptpL2, ptpL3, K2, K3)
P1_3, P2_3, xR3, xt3 = getProjectionMat(ptpL1, ptpL3, K1, K3)

marksOfInterest = [mp.solutions.pose.PoseLandmark.NOSE]  # for example (mp is mediapipe)

# c12Dpoints, c22Dpoints and c32Dpoints will contain the 2D coordinates of each landmark for cams 1, 2 and 3 respectively
c12Dpoints = []
c22Dpoints = []
c32Dpoints = []
# resTriangulated will contain the 3D coordinates of each landmark, as estimated with cv2.triangulatePoints() between cams 1 and 2
resTriangulated = []
for markOfInterest in marksOfInterest:
    marks1 = getSpecificLandMarks(landMarks1, markOfInterest)
    marks2 = getSpecificLandMarks(landMarks2, markOfInterest)
    marks3 = getSpecificLandMarks(landMarks3, markOfInterest)
    for idFrame, e1 in enumerate(marks1):
        e2 = marks2[idFrame]
        e3 = marks3[idFrame]
        if e1 is not None and e2 is not None and e3 is not None:
            # The landmarks are normalized, so convert them to pixel coordinates
            p1 = [[e1.x * width, e1.y * height]]
            p2 = [[e2.x * width, e2.y * height]]
            p3 = [[e3.x * width, e3.y * height]]
            triangulated = triangulate3DPoints(p1, p2, P1_1, P2_1)[0]
            resTriangulated.append(triangulated)
            c12Dpoints.append(p1)
            c22Dpoints.append(p2)
            c32Dpoints.append(p3)

pose1 = getPoseMatrixFromPNP(resTriangulated, c12Dpoints, K1)
pose2 = getPoseMatrixFromPNP(resTriangulated, c22Dpoints, K2)
pose3 = getPoseMatrixFromPNP(resTriangulated, c32Dpoints, K3)
posList = []
for markOfInterest in marksOfInterest:
    marks1 = getSpecificLandMarks(landMarks1, markOfInterest)
    marks2 = getSpecificLandMarks(landMarks2, markOfInterest)
    marks3 = getSpecificLandMarks(landMarks3, markOfInterest)
    for idFrame, e1 in enumerate(marks1):
        e2 = marks2[idFrame]
        e3 = marks3[idFrame]
        if e1 is not None and e2 is not None and e3 is not None:
            p1 = [[e1.x * width, e1.y * height]]
            p2 = [[e2.x * width, e2.y * height]]
            p3 = [[e3.x * width, e3.y * height]]
            triangulated1 = triangulate3DPoints(p1, p2, P1_1, P2_1)[0]
            triangulated2 = triangulate3DPoints(p2, p3, P1_2, P2_2)[0]
            triangulated3 = triangulate3DPoints(p1, p3, P1_3, P2_3)[0]
            # Each pair triangulates in its own left camera's frame, so map each result
            # to "world" coordinates with the corresponding camera pose
            worldCoordP1 = (pose1 @ toHomogenous(triangulated1).reshape(4, 1).flatten())[:-1]
            worldCoordP2 = (pose2 @ toHomogenous(triangulated2).reshape(4, 1).flatten())[:-1]
            worldCoordP3 = (pose1 @ toHomogenous(triangulated3).reshape(4, 1).flatten())[:-1]
            # problem: worldCoordP2 is very different from worldCoordP1 and worldCoordP3
            # (the latter two differ somewhat too). The three values should be close, since
            # they represent the same point (although estimated differently) in the same
            # coordinate system.
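For clarity, what I am ultimately after is something like the following N-view linear (DLT) triangulation, sketched under the assumption that I had 3x4 projection matrices P1, P2, P3 all expressed in one common world frame, which is exactly what I am missing:

import numpy as np

def triangulateNViews(points2D, projections):
    # Linear (DLT) triangulation: each view contributes two rows to A,
    # derived from x ~ P X; the solution is the right singular vector of A
    # associated with the smallest singular value.
    A = []
    for (x, y), P in zip(points2D, projections):
        A.append(x * P[2] - P[0])
        A.append(y * P[2] - P[1])
    _, _, Vt = np.linalg.svd(np.asarray(A))
    X = Vt[-1]
    return X[:3] / X[3]

# Hypothetical usage with one landmark observed by the three cameras:
# point3D = triangulateNViews([p1[0], p2[0], p3[0]], [P1, P2, P3])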