We all love the incredibly powerful features of the MediaPipe AI library. But, let's face it, the data is very hard to interpret, and the returned data structures are complex and poorly documented. The purpose of this lesson is to show you how to parse the complex data structures coming from MediaPipe and to create simple, intuitive arrays for the landmark data. We create classes in Python which do the parsing, and the classes are easy to work with. With these classes, you can use MediaPipe simply and get simple-to-understand data structures back. The video explains what we are doing and how we are doing it. For your convenience, the resulting code is posted below.
import cv2
import mediapipe as mp

print(cv2.__version__)


def _frame_size(frame):
    """Return ``(width, height)`` in pixels of an OpenCV image array."""
    frame_height, frame_width = frame.shape[:2]
    return frame_width, frame_height


class mpFaceMesh:
    """Run MediaPipe FaceMesh on a frame and return landmarks as pixel tuples."""

    def __init__(self, still=False, numFaces=3, tol1=.5, tol2=.5, drawMesh=True):
        """Create the FaceMesh solution.

        Args:
            still: Passed as ``static_image_mode``.
            numFaces: Passed as ``max_num_faces``.
            tol1: Passed as ``min_detection_confidence``.
            tol2: Passed as ``min_tracking_confidence``.
            drawMesh: When truthy, ``Marks`` draws the mesh onto the frame.
        """
        face_mesh = mp.solutions.face_mesh
        # Keyword arguments: the positional order of FaceMesh() is not stable
        # across MediaPipe releases, the keyword names are.
        self.myFaceMesh = face_mesh.FaceMesh(
            static_image_mode=still,
            max_num_faces=numFaces,
            min_detection_confidence=tol1,
            min_tracking_confidence=tol2,
        )
        self.myDraw = mp.solutions.drawing_utils
        self.draw = drawMesh
        # NOTE(review): newer MediaPipe releases no longer expose
        # FACE_CONNECTIONS; FACEMESH_CONTOURS is used as its replacement
        # there -- confirm against the installed version.
        self._connections = getattr(face_mesh, 'FACE_CONNECTIONS', None)
        if self._connections is None:
            self._connections = face_mesh.FACEMESH_CONTOURS
        self._specCircle = self.myDraw.DrawingSpec(
            thickness=0, circle_radius=0, color=(0, 0, 255))
        self._specLine = self.myDraw.DrawingSpec(
            thickness=3, circle_radius=2, color=(255, 0, 0))

    def Marks(self, frame):
        """Return one list of ``(x, y)`` pixel tuples per detected face.

        The frame is expected in BGR order (it is converted to RGB before
        processing). An empty list is returned when no face is found. When
        drawing is enabled, ``frame`` is modified in place.
        """
        frameWidth, frameHeight = _frame_size(frame)
        frameRGB = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = self.myFaceMesh.process(frameRGB)
        facesMeshLandmarks = []
        if results.multi_face_landmarks is not None:
            for faceMesh in results.multi_face_landmarks:
                # Landmark coordinates are scaled by the frame size to get
                # integer pixel positions.
                facesMeshLandmarks.append([
                    (int(lm.x * frameWidth), int(lm.y * frameHeight))
                    for lm in faceMesh.landmark
                ])
                if self.draw:
                    self.myDraw.draw_landmarks(
                        frame, faceMesh, self._connections,
                        self._specCircle, self._specLine)
        return facesMeshLandmarks


class mpFace:
    """Run MediaPipe FaceDetection and return bounding boxes in pixels."""

    def __init__(self):
        """Create the FaceDetection solution with its default settings."""
        self.myFace = mp.solutions.face_detection.FaceDetection()

    def Marks(self, frame):
        """Return a list of ``(topLeft, bottomRight)`` pixel-corner pairs.

        Each corner is an ``(x, y)`` tuple of ints derived from the
        detection's relative bounding box. The list is empty when nothing is
        detected.
        """
        frameWidth, frameHeight = _frame_size(frame)
        frameRGB = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = self.myFace.process(frameRGB)
        faceBoundBoxs = []
        if results.detections is not None:
            for face in results.detections:
                bBox = face.location_data.relative_bounding_box
                topLeft = (int(bBox.xmin * frameWidth),
                           int(bBox.ymin * frameHeight))
                bottomRight = (int((bBox.xmin + bBox.width) * frameWidth),
                               int((bBox.ymin + bBox.height) * frameHeight))
                faceBoundBoxs.append((topLeft, bottomRight))
        return faceBoundBoxs


class mpPose:
    """Run MediaPipe Pose and return the body landmarks as pixel tuples."""

    def __init__(self, still=False, upperBody=False, smoothData=True,
                 tol1=.5, tol2=.5):
        """Create the Pose solution.

        Args:
            still: Passed as ``static_image_mode``.
            upperBody: When truthy, passed as ``upper_body_only``.
                NOTE(review): that option exists only in older MediaPipe
                releases; it is omitted when falsy so the default call works
                regardless of version.
            smoothData: Passed as ``smooth_landmarks``.
            tol1: Passed as ``min_detection_confidence``.
            tol2: Passed as ``min_tracking_confidence``.
        """
        options = {
            'static_image_mode': still,
            'smooth_landmarks': smoothData,
            'min_detection_confidence': tol1,
            'min_tracking_confidence': tol2,
        }
        if upperBody:
            options['upper_body_only'] = upperBody
        self.myPose = mp.solutions.pose.Pose(**options)

    def Marks(self, frame):
        """Return the pose landmarks as a list of ``(x, y)`` pixel tuples.

        The list is empty when no pose is detected.
        """
        frameWidth, frameHeight = _frame_size(frame)
        frameRGB = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = self.myPose.process(frameRGB)
        if not results.pose_landmarks:
            return []
        return [(int(lm.x * frameWidth), int(lm.y * frameHeight))
                for lm in results.pose_landmarks.landmark]


class mpHands:
    """Run MediaPipe Hands and return landmarks plus handedness labels."""

    def __init__(self, maxHands=2, tol1=.5, tol2=.5):
        """Create the Hands solution.

        Args:
            maxHands: Passed as ``max_num_hands``.
            tol1: Passed as ``min_detection_confidence``.
            tol2: Passed as ``min_tracking_confidence``.
        """
        # Keyword arguments keep tol1/tol2 from landing in a different
        # parameter on MediaPipe releases whose positional order differs.
        self.hands = mp.solutions.hands.Hands(
            static_image_mode=False,
            max_num_hands=maxHands,
            min_detection_confidence=tol1,
            min_tracking_confidence=tol2,
        )

    def Marks(self, frame):
        """Return ``(myHands, handsType)`` for the given BGR frame.

        ``myHands`` holds one list of ``(x, y)`` pixel tuples per detected
        hand; ``handsType`` holds the matching classification labels. Both
        are empty when no hand is detected.
        """
        frameWidth, frameHeight = _frame_size(frame)
        myHands = []
        handsType = []
        frameRGB = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = self.hands.process(frameRGB)
        if results.multi_hand_landmarks is not None:
            for hand in results.multi_handedness:
                handsType.append(hand.classification[0].label)
            for handLandMarks in results.multi_hand_landmarks:
                myHands.append([
                    (int(landMark.x * frameWidth),
                     int(landMark.y * frameHeight))
                    for landMark in handLandMarks.landmark
                ])
        return myHands, handsType


width = 1280
height = 720
cam = cv2.VideoCapture(3, cv2.CAP_DSHOW)
cam.set(cv2.CAP_PROP_FRAME_WIDTH, width)
cam.set(cv2.CAP_PROP_FRAME_HEIGHT, height)
cam.set(cv2.CAP_PROP_FPS, 30)
cam.set(cv2.CAP_PROP_FOURCC, cv2.VideoWriter_fourcc(*'MJPG'))

findHands = mpHands(2)
findFace = mpFace()
findPose = mpPose()
findMesh = mpFaceMesh(drawMesh=True)

font = cv2.FONT_HERSHEY_SIMPLEX
fontColor = (0, 0, 255)
fontSize = .1
fontThick = 1

cv2.namedWindow('Trackbars')
cv2.moveWindow('Trackbars', width + 50, 0)
cv2.resizeWindow('Trackbars', 400, 150)

try:
    while True:
        grabbed, frame = cam.read()
        if not grabbed:
            # No frame was delivered (camera missing or stream ended);
            # stop instead of handing None to cv2.resize.
            break
        frame = cv2.resize(frame, (width, height))

        # Run every detector before any overlay is drawn by this loop.
        handsLM, handsType = findHands.Marks(frame)
        faceLoc = findFace.Marks(frame)
        poseLM = findPose.Marks(frame)
        facesMeshLM = findMesh.Marks(frame)

        if poseLM:
            # NOTE(review): 13-16 should be the elbows and wrists in the
            # MediaPipe pose landmark map -- confirm against its docs.
            for ind in [13, 14, 15, 16]:
                cv2.circle(frame, poseLM[ind], 20, (0, 255, 0), -1)

        for face in faceLoc:
            cv2.rectangle(frame, face[0], face[1], (255, 0, 0), 3)

        for hand, handType in zip(handsLM, handsType):
            # The label is drawn at landmark 8 of the hand.
            cv2.putText(frame, handType, hand[8], font, 2, fontColor, 2)

        for faceMeshLM in facesMeshLM:
            # Number every mesh point so indices can be read off the image.
            for cnt, lm in enumerate(faceMeshLM):
                cv2.putText(frame, str(cnt), lm, font, fontSize, fontColor,
                            fontThick)

        cv2.imshow('my WEBcam', frame)
        cv2.moveWindow('my WEBcam', 0, 0)
        if cv2.waitKey(1) & 0xff == ord('q'):
            break
finally:
    # Release the camera and close the windows even if a frame raised.
    cam.release()
    cv2.destroyAllWindows()