I am building a stereo system with two cameras and the MiDaS deep learning model, which returns a depth map. Both cameras are calibrated at the same time, and the calibration provides the stereo rectification maps.
Code:
import numpy as np
import cv2 as cv
import glob
from matplotlib import pyplot as plt
################ FIND CHESSBOARD CORNERS - OBJECT POINTS AND IMAGE POINTS #############################
chessboardSize = (6,4) # number of inner corners per row and column
frameSize = (640,480)
# Termination criteria
criteria = (cv.TERM_CRITERIA_EPS + cv.TERM_CRITERIA_MAX_ITER, 30, 0.001)
# Prepare object points, like (0,0,0), (1,0,0), (2,0,0) ..., (5,3,0)
objp = np.zeros((chessboardSize[0] * chessboardSize[1], 3), np.float32)
objp[:,:2] = np.mgrid[0:chessboardSize[0],0:chessboardSize[1]].T.reshape(-1,2)
objp = objp * 16 # size of cell in mm
#print(objp)
# Arrays to store object points and image points from all the images.
objpoints = [] # 3d point in real world space
imgpointsL = [] # 2d points in image plane.
imgpointsR = [] # 2d points in image plane.
imagesLeft = sorted(glob.glob('images/stereoLeft/*.png'))
imagesRight = sorted(glob.glob('images/stereoRight/*.png'))
for imgLeft, imgRight in zip(imagesLeft, imagesRight):
    imgL = cv.imread(imgLeft)
    imgR = cv.imread(imgRight)
    grayL = cv.cvtColor(imgL, cv.COLOR_BGR2GRAY)
    grayR = cv.cvtColor(imgR, cv.COLOR_BGR2GRAY)

    # Find the chessboard corners
    retL, cornersL = cv.findChessboardCorners(grayL, chessboardSize, None)
    retR, cornersR = cv.findChessboardCorners(grayR, chessboardSize, None)

    # If found in both images, add object points and refined image points
    if retL and retR:
        objpoints.append(objp)

        cornersL = cv.cornerSubPix(grayL, cornersL, (11,11), (-1,-1), criteria)
        imgpointsL.append(cornersL)

        cornersR = cv.cornerSubPix(grayR, cornersR, (11,11), (-1,-1), criteria)
        imgpointsR.append(cornersR)

        # Draw and display the corners
        cv.drawChessboardCorners(imgL, chessboardSize, cornersL, retL)
        cv.imshow('img left', imgL)
        cv.drawChessboardCorners(imgR, chessboardSize, cornersR, retR)
        cv.imshow('img right', imgR)
        cv.waitKey(500)

cv.destroyAllWindows()
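# (Optional sanity check, not in my original script: make sure enough image
# pairs actually contributed corners before calibrating.)
print("Using {} valid image pairs for calibration".format(len(objpoints)))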
############## CALIBRATION #######################################################
retL, cameraMatrixL, distL, rvecsL, tvecsL = cv.calibrateCamera(objpoints, imgpointsL, frameSize, None, None)
heightL, widthL, channelsL = imgL.shape
newCameraMatrixL, roi_L = cv.getOptimalNewCameraMatrix(cameraMatrixL, distL, (widthL, heightL), 1, (widthL, heightL))
retR, cameraMatrixR, distR, rvecsR, tvecsR = cv.calibrateCamera(objpoints, imgpointsR, frameSize, None, None)
heightR, widthR, channelsR = imgR.shape
newCameraMatrixR, roi_R = cv.getOptimalNewCameraMatrix(cameraMatrixR, distR, (widthR, heightR), 1, (widthR, heightR))
print(cameraMatrixL)
print(newCameraMatrixL)
print()
print(cameraMatrixR)
print(newCameraMatrixR)
########## Stereo Vision Calibration #############################################
flags = 0
flags |= cv.CALIB_FIX_INTRINSIC
# Here we fix the intrinsic camera matrices so that only Rot, Trns, Emat and Fmat are calculated.
# Hence intrinsic parameters are the same
criteria_stereo = (cv.TERM_CRITERIA_EPS + cv.TERM_CRITERIA_MAX_ITER, 30, 0.001)
# This step estimates the transformation between the two cameras and calculates the Essential and Fundamental matrices
retStereo, newCameraMatrixL, distL, newCameraMatrixR, distR, rot, trans, essentialMatrix, fundamentalMatrix = cv.stereoCalibrate(objpoints, imgpointsL, imgpointsR, newCameraMatrixL, distL, newCameraMatrixR, distR, grayL.shape[::-1], criteria=criteria_stereo, flags=flags)
# Reprojection Error
mean_error = 0
for i in range(len(objpoints)):
    imgpoints2, _ = cv.projectPoints(objpoints[i], rvecsL[i], tvecsL[i], newCameraMatrixL, distL)
    error = cv.norm(imgpointsL[i], imgpoints2, cv.NORM_L2) / len(imgpoints2)
    mean_error += error
print("Total error: {}".format(mean_error / len(objpoints)))
########## Stereo Rectification #################################################
rectifyScale = 1  # alpha: 1 keeps all pixels, 0 crops to the valid region only
rectL, rectR, projMatrixL, projMatrixR, Q, roi_L, roi_R = cv.stereoRectify(newCameraMatrixL, distL, newCameraMatrixR, distR, grayL.shape[::-1], rot, trans, alpha=rectifyScale, newImageSize=(0,0))
print(Q)
stereoMapL = cv.initUndistortRectifyMap(newCameraMatrixL, distL, rectL, projMatrixL, grayL.shape[::-1], cv.CV_16SC2)
stereoMapR = cv.initUndistortRectifyMap(newCameraMatrixR, distR, rectR, projMatrixR, grayR.shape[::-1], cv.CV_16SC2)
print("Saving parameters!")
cv_file = cv.FileStorage('stereoMap.xml', cv.FILE_STORAGE_WRITE)
cv_file.write('stereoMapL_x',stereoMapL[0])
cv_file.write('stereoMapL_y',stereoMapL[1])
cv_file.write('stereoMapR_x',stereoMapR[0])
cv_file.write('stereoMapR_y',stereoMapR[1])
cv_file.write('q', Q)
cv_file.release()
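For reference, a minimal sketch of how the saved maps can be sanity-checked (not part of my script above; the test image paths are placeholders for a pair captured by both cameras at the same time). After remapping, horizontal lines should pass through the same features in both rectified images:
import numpy as np
import cv2 as cv

cv_file = cv.FileStorage('stereoMap.xml', cv.FILE_STORAGE_READ)
stereoMapL_x = cv_file.getNode('stereoMapL_x').mat()
stereoMapL_y = cv_file.getNode('stereoMapL_y').mat()
stereoMapR_x = cv_file.getNode('stereoMapR_x').mat()
stereoMapR_y = cv_file.getNode('stereoMapR_y').mat()
cv_file.release()

frameL = cv.imread('images/test_left.png')   # placeholder path
frameR = cv.imread('images/test_right.png')  # placeholder path

rectifiedL = cv.remap(frameL, stereoMapL_x, stereoMapL_y, cv.INTER_LINEAR)
rectifiedR = cv.remap(frameR, stereoMapR_x, stereoMapR_y, cv.INTER_LINEAR)

# Stack side by side and draw horizontal lines to compare the rectified views
pair = np.hstack([rectifiedL, rectifiedR])
for y in range(0, pair.shape[0], 40):
    cv.line(pair, (0, y), (pair.shape[1] - 1, y), (0, 255, 0), 1)
cv.imshow('rectified pair', pair)
cv.waitKey(0)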
There is 10 cm between the cameras and they share almost the same baseline; one sits a bit deeper along the Z axis than the other.
Both camera matrices and the total calibration error seem to be good:
[[829.23334573 0. 320.16373913]
[ 0. 618.52774177 250.13711108]
[ 0. 0. 1. ]]
[[605.63713704 0. 320.44688502]
[ 0. 608.28836475 253.3957634 ]
[ 0. 0. 1. ]]
Total error: 0.40797200269886297
After I am done with the calibration, I proceed to the 3D reconstruction. I use the deep learning model to create a depth map and the Q matrix from the previous stereo calibration to do the 3D reprojection.
Code:
import numpy as np
import cv2
import time
from matplotlib import pyplot as plt
# Function that downsamples the image reduce_factor times
def downsample_image(image, reduce_factor):
    for i in range(0, reduce_factor):
        # Check if image is color or grayscale
        if len(image.shape) > 2:
            row, col = image.shape[:2]
        else:
            row, col = image.shape
        image = cv2.pyrDown(image, dstsize=(col // 2, row // 2))
    return image
path_model = "models/"
# Read Network
model_name = "model-f46da743.onnx"; # MiDaS v2.1 Large
#model_name = "model-small.onnx"; # MiDaS v2.1 Small
# Load the DNN model
model = cv2.dnn.readNet(path_model + model_name)
if model.empty():
    print("Could not load the neural net! - Check path")
# Set backend and target to CUDA to use GPU
#model.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
#model.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA)
cv_file = cv2.FileStorage()
cv_file.open('stereoMap.xml', cv2.FILE_STORAGE_READ)
Q = cv_file.getNode('q').mat()
print(Q)
# # Webcam
# cap = cv2.VideoCapture(0)
# # Read in the image
# success, img = cap.read()
# cv2.imshow('image', img)
# cv2.waitKey(0)
# img = downsample_image(img, 3)
# cv2.imshow('image', img)
# cv2.waitKey(0)
img = cv2.imread('images/test_r.png')
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
imgHeight, imgWidth, channels = img.shape
# Create Blob from Input Image
# MiDaS v2.1 Large ( Scale : 1 / 255, Size : 384 x 384, Mean Subtraction : ( 123.675, 116.28, 103.53 ), Channels Order : RGB )
blob = cv2.dnn.blobFromImage(img, 1/255., (384,384), (123.675, 116.28, 103.53), True, False)
# MiDaS v2.1 Small ( Scale : 1 / 255, Size : 256 x 256, Mean Subtraction : ( 123.675, 116.28, 103.53 ), Channels Order : RGB )
#blob = cv2.dnn.blobFromImage(img, 1/255., (256,256), (123.675, 116.28, 103.53), True, False)
# Set input to the model
model.setInput(blob)
# Make forward pass in model
output = model.forward()
output = output[0,:,:]
output = cv2.resize(output, (imgWidth, imgHeight))
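# (Optional check, not in my original script: print the raw model output range
# before it gets normalized to [0, 1] below.)
print("Raw model output range:", output.min(), output.max())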
# Normalize the output
output = cv2.normalize(output, None, 0, 1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)
plt.imshow(output,'gray')
plt.show()
# -------------------------------------------------------------------------------------
#Reproject points into 3D
points_3D = cv2.reprojectImageTo3D(output, Q, handleMissingValues=False)
print(len(points_3D))
#Get rid of points with value 0 (i.e no depth)
mask_map = output > output.min()
#Mask colors and points.
output_points = points_3D[mask_map]
print(len(output_points))
output_colors = img[mask_map]
# Function to create point cloud file
def create_output(vertices, colors, filename):
    colors = colors.reshape(-1, 3)
    vertices = np.hstack([vertices.reshape(-1, 3), colors])

    ply_header = '''ply
format ascii 1.0
element vertex %(vert_num)d
property float x
property float y
property float z
property uchar red
property uchar green
property uchar blue
end_header
'''
    with open(filename, 'w') as f:
        f.write(ply_header % dict(vert_num=len(vertices)))
        np.savetxt(f, vertices, '%f %f %f %d %d %d')
output_file = 'reconstructedMono.ply'
#Generate point cloud
create_output(output_points, output_colors, output_file)
# cap.release()
cv2.destroyAllWindows()
The depth map seems to be decent and gives values between 0 and 1.

Now, for the final result, which is the point cloud, I get an odd result that does not make sense.

I could even say I get no result at this point. I am confused about what the issue is: is it the Q matrix, or the depth map given by the model?
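In case the viewer matters: a minimal sketch of how I can open the generated .ply (assuming the open3d package is installed; MeshLab or any other viewer shows the same thing):
import open3d as o3d

# Load the point cloud written by create_output and display it
pcd = o3d.io.read_point_cloud('reconstructedMono.ply')
print(pcd)  # number of points loaded
o3d.visualization.draw_geometries([pcd])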
A tiny piece of additional information, in case it helps: printing the length of output_points gives 307199, which is one point less than the 640x480 maximum.
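For completeness, a quick numeric check that could be added right after output_points is computed (not part of the script above), to see the spread of the reprojected coordinates:
# Spread of the reprojected points along each axis
print("X range:", output_points[:, 0].min(), output_points[:, 0].max())
print("Y range:", output_points[:, 1].min(), output_points[:, 1].max())
print("Z range:", output_points[:, 2].min(), output_points[:, 2].max())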
