Object recognition performance with ORB

I have a sample project where I am trying to recognize, and draw a box around, a circuit board. Ideally I want the result to be something close to this: https://www.youtube.com/watch?v=-ZNYoL8rzPY. I'm hoping that people with more experience and intuition can suggest changes to my algorithm and code to improve the results.

I am using OpenCV with Python and have been playing around with the ORB detector, which, as I understand it, is free to use, unlike SIFT or SURF, and also seems fairly efficient. However, my code isn't matching well. It runs fast in real time, but below are some sample match attempts:

[screenshot: ORB matches drawn between the template (left) and the live scene (right)]

On the left is my (static) template; on the right is the real-time scene. I've taken a few screenshots that are indicative of my results: not that many matches, some mismatches, and the code is never able to draw a nice bounding box around the object as it is (hopefully) supposed to. I've also tried more vertical/bird's-eye pictures of the template, with similarly poor results. The resolution of both the template and the scene is 320x240.

[screenshot: another match attempt showing the same sparse, noisy matches]
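For reference, here is a minimal sketch (adapted from the OpenCV feature-matching tutorial, not yet wired into my script) of two changes I'm considering to get more, and cleaner, matches: asking ORB for more keypoints, and pruning mismatches with Lowe's ratio test instead of crossCheck. The nfeatures value is just a guess; ORB's default is 500, which may be too few at 320x240.

import cv2

# Request more keypoints than ORB's default of 500 (nfeatures=1000 is a guess).
detector = cv2.ORB_create(nfeatures=1000)
kp1, des1 = detector.detectAndCompute(img1, None) # img1 = template image
kp2, des2 = detector.detectAndCompute(img2, None) # img2 = scene frame

# crossCheck must stay False (the default) so knnMatch can return pairs.
bf = cv2.BFMatcher(cv2.NORM_HAMMING)
pairs = bf.knnMatch(des1, des2, k=2) # the two best candidates per descriptor

# Lowe's ratio test: keep a match only if it is clearly better than the
# runner-up; 0.75 is the threshold used in the OpenCV tutorial.
good = []
for pair in pairs:
    if len(pair) == 2 and pair[0].distance < 0.75 * pair[1].distance:
        good.append(pair[0])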

My full code is below. It uses the ORB detector, as mentioned. I am open to switching approaches/algorithms, but I'd also like to understand why I should expect better performance from doing so. Thanks in advance for any help or advice; hopefully others will benefit as well.

import sys
import time

import cv2
import numpy as np


def ORB_recognizer(detector, kp1, des1, img2):

    # ms1..ms8 below are timestamps for profiling each stage

    ms1 = time.time()*1000.0

    kp2, des2 = detector.detectAndCompute(img2,None)

    ms2 = time.time()*1000.0

    # Create a BFMatcher object. ORB descriptors are binary strings, so they
    # must be compared with Hamming distance (NORM_HAMMING), not L2.
    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    # crossCheck defaults to False; with True, the matcher returns only pairs
    # (i,j) where the i-th descriptor in set A has the j-th descriptor in
    # set B as its best match and vice versa.
    # Match descriptors.

    ms3 = time.time()*1000.0

    matches = bf.match(des1,des2)

    ms4 = time.time()*1000.0

    # Sort matches by descriptor distance (best first).
    matches = sorted(matches, key = lambda x:x.distance)

    MIN_MATCH_COUNT = 10
    good = matches[:MIN_MATCH_COUNT]

    ms5 = time.time()*1000.0

    if len(good) >= 4: # findHomography needs at least 4 point pairs
        src_pts = np.float32([ kp1[m.queryIdx].pt for m in good ]).reshape(-1,1,2)
        dst_pts = np.float32([ kp2[m.trainIdx].pt for m in good ]).reshape(-1,1,2)
        M, mask = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)
    else:
        M, mask = None, None

    matchesMask = mask.ravel().tolist() if mask is not None else None

    ms6 = time.time()*1000.0

    if M is not None:
        # Project the template's corners into the scene and draw the outline.
        # img1 is the template image, read at module level in MAIN.
        h,w,d = img1.shape
        pts = np.float32([ [0,0],[0,h-1],[w-1,h-1],[w-1,0] ]).reshape(-1,1,2)
        dst = cv2.perspectiveTransform(pts, M)
        img2 = cv2.polylines(img2, [np.int32(dst)], True, 255, 3, cv2.LINE_AA)
    else:
        print "Not enough matches are found - %d/%d" % (len(good), MIN_MATCH_COUNT)

    ms7 = time.time()*1000.0

    draw_params = dict(matchColor = (0,255,0), # draw matches in green color
                       singlePointColor = None,
                       matchesMask = matchesMask, # draw only inliers
                       flags = 2)

    img3 = cv2.drawMatches(img1,kp1,img2,kp2,good,None,**draw_params)

    ms8 = time.time()*1000.0

    #img3 = drawMatches2(img2,kp2,img1,kp1,matches[:K], None, **draw_params) # this version of function flips order of train,query image
    #plt.imshow(img3, 'gray'),plt.show()

    return img3, [ms2-ms1, ms3-ms2, ms4-ms3, ms5-ms4, ms6-ms5, ms7-ms6, ms8-ms7] # I was profiling the time

################
##### MAIN #####
################
# Example usage:
# python Obj_rec_testing.py template.jpg 1 320 240  

cv2.ocl.setUseOpenCL(False)

img1 = cv2.imread(sys.argv[1]) # queryImage (the template)
source = int(sys.argv[2]) # e.g. 1 for USB webcam
width = int(sys.argv[3])
height = int(sys.argv[4])

detector = cv2.ORB_create() 
kp1, des1 = detector.detectAndCompute(img1,None)

cap = cv2.VideoCapture(source) # e.g. 1 for a USB webcam
cap.set(cv2.CAP_PROP_FRAME_WIDTH, width)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, height)

while(True):
    # Capture frame-by-frame
    ret, frame = cap.read()

    # Our operations on the frame come here
    #gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    new_img, elapsed_times = ORB_recognizer(detector, kp1, des1, frame)

    # Display the resulting frame
    cv2.imshow('frame',new_img) #works

    print elapsed_times

    # KEY PRESS FUNCTIONALITY
    # Call waitKey once per frame; calling it twice can swallow key presses.
    key = cv2.waitKey(1) & 0xFF
    if key == ord('q'):
        break

    if key == ord('s'):
        print "Saving image..."
        cv2.imwrite("saved_image.jpg", frame)

# When everything is done, release the capture
cap.release()
cv2.destroyAllWindows()
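Finally, since I said I'm open to switching algorithms: one alternative I've seen suggested is AKAZE, which as far as I know is also patent-free and produces binary descriptors by default, so Hamming distance still applies and the rest of my pipeline could stay the same. A rough sketch, assuming OpenCV 3.x where cv2.AKAZE_create() is available:

import cv2

img1 = cv2.imread("template.jpg") # same template as above

# Swap ORB for AKAZE; everything downstream (match, findHomography,
# drawMatches) would be unchanged.
detector = cv2.AKAZE_create() # default parameters, untuned
kp1, des1 = detector.detectAndCompute(img1, None)

# AKAZE's default descriptor (MLDB) is binary, so NORM_HAMMING still applies.
bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)

I'd still like to understand whether AKAZE should actually find more or better keypoints on a low-texture circuit board at 320x240, or whether the bottleneck is elsewhere.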
