I want to estimate the pose of the current image relative to a matched image in a database. The images come from a moving robot with reasonably accurate odometry, so I can take two consecutive images that are both similar to the database image and have a good estimate of the relative pose between them. My plan is to use that relative pose to triangulate the 3D positions of the keypoints matched between the two current images with triangulatePoints, and then use solvePnPRansac on the keypoints that also match the database image to estimate its pose relative to the first current image.
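To make sure I have the triangulatePoints convention right, I first checked a synthetic round trip. This is a minimal sketch with made-up numbers (the rounded intrinsics and the test point are just for illustration), based on my assumption that the projection matrices are P = K [R | t] mapping world coordinates to pixel coordinates:

import numpy as np
import cv2
K = np.array([[455.5, 0.0, 482.7], [0.0, 340.6, 254.5], [0.0, 0.0, 1.0]])
# Two cameras: identity rotation, the second translated 2 units along z
P0 = K @ np.hstack((np.eye(3), np.array([[0.0], [0.0], [0.0]])))
P2 = K @ np.hstack((np.eye(3), np.array([[0.0], [0.0], [2.0]])))
# A synthetic world point in front of both cameras
Xw = np.array([[0.5], [-0.3], [10.0], [1.0]])
# Project it into each image (ideal pinhole, no distortion)
x0 = (P0 @ Xw)[:2] / (P0 @ Xw)[2]
x2 = (P2 @ Xw)[:2] / (P2 @ Xw)[2]
# Triangulating from the two projections should give back [0.5, -0.3, 10.0]
Xh = cv2.triangulatePoints(P0, P2, x0, x2)
print(Xh[:3] / Xh[3])

This round trip does recover the point, so if the convention is right the problem should be in the data I feed into the real pipeline.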
I have tried to implement this in OpenCV with Python as shown below. For now I do not use the database image at all; I am simply checking whether I can recover the odometry that I enforced, but unfortunately the current output is garbage (translations on the order of 1e+10).
import numpy as np
import cv2
# Fisheye camera intrinsic matrix and distortion coefficients
K=np.array([[455.5000196386718, 0.0, 482.65324003911945], [0.0, 340.6409393462825, 254.5063795692748], [0.0, 0.0, 1.0]])
D=np.array([[-0.018682808343432777], [-0.044315351694893736], [0.047678551616171246], [-0.018283908577445218]])
orb = cv2.ORB_create(nfeatures=1000)
bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=False)
img0 = cv2.imread("0cm.png")
img2 = cv2.imread("2cm.png")
# Find keypoints and match them up
kp0, des0 = orb.detectAndCompute(img0, None)
kp2, des2 = orb.detectAndCompute(img2, None)
matches = bf.knnMatch(des0, des2, k=2)
# Find good matches using the ratio test
ratio_thresh = 0.8
good_matches = []
for m, n in matches:
    if m.distance < ratio_thresh * n.distance:
        good_matches.append(m)
# Convert from keypoints to points
pts0 = np.float32([kp0[m.queryIdx].pt for m in good_matches]).reshape(-1, 1, 2)
pts2 = np.float32([kp2[m.trainIdx].pt for m in good_matches]).reshape(-1, 1, 2)
# Remove the fisheye distortion from the points; P=K reprojects the
# undistorted points back into pixel coordinates so K stays valid below
pts0 = cv2.fisheye.undistortPoints(pts0, K, D, P=K)
pts2 = cv2.fisheye.undistortPoints(pts2, K, D, P=K)
# Keep only the matches that are consistent with the epipolar geometry
# TODO: find a more efficient way to apply the mask
E, mask = cv2.findEssentialMat(pts0, pts2, K, method=cv2.RANSAC, prob=0.999, threshold=1.0)
# The recovered pose is only used for its inlier mask here; the odometry
# pose is enforced further down
_, R_est, t_est, mask = cv2.recoverPose(E, pts0, pts2, cameraMatrix=K, mask=mask)
pts0_m = []
pts2_m = []
for i in range(len(mask)):
    if mask[i] == 1:
        pts0_m.append(pts0[i])
        pts2_m.append(pts2[i])
pts0 = np.array(pts0_m).T.reshape(2, -1)
pts2 = np.array(pts2_m).T.reshape(2, -1)
# Set up the projection matrices from the enforced odometry:
# identity rotation and a 2 cm translation along z for the second frame
R = np.eye(3)
t0 = np.array([[0], [0], [0]])
t2 = np.array([[0], [0], [2]])
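# NOTE (assumption on my part): with P = K [R | t] mapping world points into
# each image, t is the position of the world origin expressed in that
# camera's frame, not the camera's position in the world; a sign mix-up
# here would be one place for things to go wrong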
P0 = np.dot(K, np.concatenate((R, t0), axis=1))
P2 = np.dot(K, np.concatenate((R, t2), axis=1))
# Triangulate the keypoints' homogeneous world coordinates, taking the
# img0 camera as the world origin
X = cv2.triangulatePoints(P0, P2, pts0, pts2)
# Convert from homogeneous coordinates, TODO: find a more efficient way
X /= X[3]
objPts = X.T[:,:3]
# Find the pose of the second frame (distCoeffs=None because the points
# were already undistorted above)
_, rvec, tvec, inliers = cv2.solvePnPRansac(objPts, pts2.T, K, None)
print(rvec)
print(tvec)
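For reference, this is the kind of sanity check I was planning to run on the triangulation before trusting the PnP step (a rough sketch reusing X, pts0, pts2, P0 and P2 from the code above):

# Reproject the triangulated points into both views; a large RMS error here
# would mean the assumed poses and the matches are already inconsistent
# before solvePnPRansac is even involved
def rms_reproj_error(P, X_h, pts):
    proj = P @ X_h               # 3xN homogeneous image points
    proj = proj[:2] / proj[2]    # 2xN pixel coordinates
    return np.sqrt(np.mean(np.sum((proj - pts) ** 2, axis=0)))

print("RMS reprojection error img0:", rms_reproj_error(P0, X, pts0))
print("RMS reprojection error img2:", rms_reproj_error(P2, X, pts2))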
Is there something wrong with my code or my approach (or both)?