I am trying to generate synthetic data with Blender for disparity prediction. I use the Z pass in Cycles as the depth ground truth, but I found that the Z-pass value is always 1.09 times the actual depth.
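For context, the depth maps will eventually be converted to disparity roughly like this (just a sketch on my side; f_px and baseline_m are placeholders for my actual stereo rig), so any constant factor in the depth propagates directly into the disparity ground truth:

import numpy as np

# Standard rectified-stereo relation: disparity = focal_length_px * baseline / depth.
# f_px and baseline_m are placeholders, not values from the script below.
def depth_to_disparity(depth_m, f_px, baseline_m):
    return f_px * baseline_m / np.asarray(depth_m, dtype=np.float32)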
Here is a demo. I use the functions from 3x4 camera matrix from blender camera (linked in the code below) to get the extrinsic data.
import bpy
import os
os.environ["OPENCV_IO_ENABLE_OPENEXR"]="1"
import cv2
from mathutils import Matrix, Vector
import numpy as np
################################################################
#from https://blender.stackexchange.com/questions/38009/3x4-camera-matrix-from-blender-camera
#---------------------------------------------------------------
# 3x4 P matrix from Blender camera
#---------------------------------------------------------------

# BKE_camera_sensor_size
def get_sensor_size(sensor_fit, sensor_x, sensor_y):
    if sensor_fit == 'VERTICAL':
        return sensor_y
    return sensor_x

# BKE_camera_sensor_fit
def get_sensor_fit(sensor_fit, size_x, size_y):
    if sensor_fit == 'AUTO':
        if size_x >= size_y:
            return 'HORIZONTAL'
        else:
            return 'VERTICAL'
    return sensor_fit
# Build intrinsic camera parameters from Blender camera data
#
# See notes on this in
# blender.stackexchange.com/questions/15102/what-is-blenders-camera-projection-matrix-model
# as well as
# https://blender.stackexchange.com/a/120063/3581
def get_calibration_matrix_K_from_blender(camd):
    if camd.type != 'PERSP':
        raise ValueError('Non-perspective cameras not supported')
    scene = bpy.context.scene
    f_in_mm = camd.lens
    scale = scene.render.resolution_percentage / 100
    resolution_x_in_px = scale * scene.render.resolution_x
    resolution_y_in_px = scale * scene.render.resolution_y
    sensor_size_in_mm = get_sensor_size(camd.sensor_fit, camd.sensor_width, camd.sensor_height)
    sensor_fit = get_sensor_fit(
        camd.sensor_fit,
        scene.render.pixel_aspect_x * resolution_x_in_px,
        scene.render.pixel_aspect_y * resolution_y_in_px
    )
    pixel_aspect_ratio = scene.render.pixel_aspect_y / scene.render.pixel_aspect_x
    if sensor_fit == 'HORIZONTAL':
        view_fac_in_px = resolution_x_in_px
    else:
        view_fac_in_px = pixel_aspect_ratio * resolution_y_in_px
    pixel_size_mm_per_px = sensor_size_in_mm / f_in_mm / view_fac_in_px
    s_u = 1 / pixel_size_mm_per_px
    s_v = 1 / pixel_size_mm_per_px / pixel_aspect_ratio

    # Parameters of intrinsic calibration matrix K
    u_0 = resolution_x_in_px / 2 - camd.shift_x * view_fac_in_px
    v_0 = resolution_y_in_px / 2 + camd.shift_y * view_fac_in_px / pixel_aspect_ratio
    skew = 0  # only use rectangular pixels

    K = Matrix(
        ((s_u, skew, u_0),
         (  0,  s_v, v_0),
         (  0,    0,   1)))
    return K
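
# Worked numbers for my settings (my own arithmetic, assuming the default
# 36 mm sensor width, sensor_fit 'AUTO', and square pixels at 1920x1080):
#   lens = 25 mm -> s_u = s_v = 25 / 36 * 1920 ≈ 1333.3
#   lens = 50 mm -> s_u = s_v = 50 / 36 * 1920 ≈ 2666.7
#   u_0 = 960, v_0 = 540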
# Returns camera rotation and translation matrices from Blender.
#
# There are 3 coordinate systems involved:
#    1. The World coordinates: "world"
#       - right-handed
#    2. The Blender camera coordinates: "bcam"
#       - x is horizontal
#       - y is up
#       - right-handed: negative z look-at direction
#    3. The desired computer vision camera coordinates: "cv"
#       - x is horizontal
#       - y is down (to align to the actual pixel coordinates
#         used in digital images)
#       - right-handed: positive z look-at direction
def get_3x4_RT_matrix_from_blender(cam):
    # bcam stands for blender camera
    R_bcam2cv = Matrix(
        ((1,  0,  0),
         (0, -1,  0),
         (0,  0, -1)))

    # Transpose since the rotation is object rotation,
    # and we want coordinate rotation
    # R_world2bcam = cam.rotation_euler.to_matrix().transposed()
    # T_world2bcam = -1*R_world2bcam @ location
    #
    # Use matrix_world instead to account for all constraints
    location, rotation = cam.matrix_world.decompose()[0:2]
    R_world2bcam = rotation.to_matrix().transposed()

    # Convert camera location to translation vector used in coordinate changes
    # T_world2bcam = -1*R_world2bcam @ cam.location
    # Use location from matrix_world to account for constraints:
    T_world2bcam = -1 * R_world2bcam @ location

    # Build the coordinate transform matrix from world to computer vision camera
    R_world2cv = R_bcam2cv @ R_world2bcam
    T_world2cv = R_bcam2cv @ T_world2bcam

    # Put into 3x4 matrix
    RT = Matrix((
        R_world2cv[0][:] + (T_world2cv[0],),
        R_world2cv[1][:] + (T_world2cv[1],),
        R_world2cv[2][:] + (T_world2cv[2],)
    ))
    return RT
def get_3x4_P_matrix_from_blender(cam):
    K = get_calibration_matrix_K_from_blender(cam.data)
    RT = get_3x4_RT_matrix_from_blender(cam)
    # Returns K and RT separately (the full P would be K @ RT)
    return K, RT
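
# My own sanity-check helper (not part of the linked answer): project a world
# point through K @ RT and dehomogenize to pixel coordinates. With the camera
# at [0, 0, 1] looking down -Z, the world origin should land on the principal
# point (u_0, v_0) = (960, 540).
def project_world_point(K, RT, p_world):
    P = np.array(K) @ np.array(RT)  # 3x4 projection matrix
    p = P @ np.array([p_world[0], p_world[1], p_world[2], 1.0])
    return p[0] / p[2], p[1] / p[2]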
#################################################################
#config set
camera_pos=[0,0,1] #change this
lens=25 #change this
bpy.context.scene.render.engine ='CYCLES'
bpy.context.scene.cycles.device = 'GPU'
bpy.context.preferences.addons['cycles'].preferences.compute_device_type='CUDA'
bpy.context.preferences.addons['cycles'].preferences.get_devices()
bpy.context.scene.cycles.samples = 4 #low sample count for a fast test render
bpy.context.scene.render.resolution_x = 1920
bpy.context.scene.render.resolution_y = 1080
bpy.context.scene.use_nodes = True
bpy.context.scene.view_layers["ViewLayer"].use_pass_z = True
for obj in bpy.data.objects:
    bpy.data.objects.remove(obj, do_unlink=True)
#add plane
bpy.ops.mesh.primitive_plane_add(size=3, enter_editmode=False, align='WORLD', location=[0,0,0], scale=(1, 1, 1))
#add camera
bpy.ops.object.camera_add(enter_editmode=False, align='WORLD', location=camera_pos, rotation=(0, 0, 0), scale=(1, 1, 1))
bpy.context.object.data.lens = lens
#compositor setup for saving the depth pass, then render
tree = bpy.context.scene.node_tree
links = tree.links
for n in tree.nodes:
    tree.nodes.remove(n)
rl = tree.nodes.new('CompositorNodeRLayers')
output_path='test'
if not os.path.exists(output_path):
    os.makedirs(output_path)
# set up the depth file output
depth_output = tree.nodes.new(type='CompositorNodeOutputFile')
depth_output.format.file_format = 'OPEN_EXR'
depth_output.base_path = output_path
links.new(rl.outputs['Depth'], depth_output.inputs['Image'])
scene = bpy.context.scene
scene.camera = bpy.data.objects['Camera']
bpy.ops.render.render()
K, RT=get_3x4_P_matrix_from_blender(bpy.data.objects['Camera'])
print('extrinsic matrix: ', RT)
depth=cv2.imread('test/Image0001.exr', cv2.IMREAD_ANYDEPTH)
print('depth: ', np.unique(depth))
This code just adds a plane at [0,0,0], adds a camera at some position ([0,0,1] for example), renders, and prints the extrinsic matrix and the Z-pass values:
extrinsic matrix: <Matrix 3x4 (1.0000, 0.0000, 0.0000, 0.0000)
(0.0000, -1.0000, 0.0000, 0.0000)
(0.0000, 0.0000, -1.0000, 1.0000)>
z-pass: [1.09]
The experiment results are:
- camera at [0,0,1], focal length 50 mm: the real depth is 1 but the Z pass is 1.09
- camera at [0,0,0.5], focal length 50 mm: the real depth is 0.5 but the Z pass is 0.545
- camera at [0,0,1], focal length 25 mm: the real depth is 1 but the Z pass is 1.09
My observation: the real depth should be the distance between the plane and the camera origin, and the Z pass is always 1.09 times the real depth, regardless of camera position and focal length.
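In case it is relevant: if the Z pass stored the distance along each camera ray instead of the perpendicular Z depth (just a guess on my part, and it would not explain a constant factor across focal lengths), I would convert it back with the intrinsics roughly like this:

import numpy as np

def ray_distance_to_planar_depth(dist, fx, fy, cx, cy):
    # dist: HxW array of per-pixel distances from the camera center
    # fx, fy, cx, cy: the s_u, s_v, u_0, v_0 values from the K matrix above
    h, w = dist.shape
    u, v = np.meshgrid(np.arange(w), np.arange(h))
    scale = np.sqrt(1.0 + ((u - cx) / fx) ** 2 + ((v - cy) / fy) ** 2)
    return dist / scale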
I am very new to Blender. Where does the "1.09" come from, and how can I get the ground-truth depth?
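In case it helps with reproducing this, here is how I would dump the raw contents of the EXR (a sketch using the same cv2/numpy imports as the script above; I have not drawn any conclusions from it):

raw = cv2.imread('test/Image0001.exr', cv2.IMREAD_UNCHANGED)  # keep all channels and float precision
print('raw shape and dtype: ', raw.shape, raw.dtype)
print('raw unique values: ', np.unique(raw))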
Thanks in advance!