In this video lesson we learn how to add speech to our NVIDIA Jetson Nano. We demonstrate how the Jetson can not only recognize an item, but can also audibly speak the name of the item it sees. The video takes you through the process step-by-step, and shows you how to make it all work together properly. For your convenience, the code we developed is included below.
"""Classify webcam frames with GoogleNet on a Jetson Nano and speak new items.

The main loop grabs frames with OpenCV, classifies them with
``jetson.inference.imageNet`` and overlays the result on the preview window.
A background thread turns the current item name into speech with gTTS and
plays it with the ``mpg123`` command-line player.
"""
import os
import threading
import time

# Third-party imports are kept in the same relative order as the original
# listing (jetson modules before cv2).
import jetson.inference
import jetson.utils
import numpy as np
from gtts import gTTS
import cv2

print(cv2.__version__)

# State shared between the main loop and the speech thread.
# ``speak`` is raised by the main loop when ``item`` should be spoken and is
# lowered by the speech thread once playback has finished (or failed).
speak = True
item = 'Welcome to My Identify. Are you Ready to Rumble?'
confidence = 0
itemOld = ''

# Requested capture size; the camera may or may not honour it.
width = 1280
height = 720
flip = 2

# Uncomment the next two lines to use the Pi camera instead of a webcam
# (and comment out the webcam VideoCapture line further down).
# camSet='nvarguscamerasrc ! video/x-raw(memory:NVMM), width=3264, height=2464, format=NV12, framerate=21/1 ! nvvidconv flip-method='+str(flip)+' ! video/x-raw, width='+str(width)+', height='+str(height)+', format=BGRx ! videoconvert ! video/x-raw, format=BGR ! appsink'
# cam= cv2.VideoCapture(camSet)


def sayItem():
    """Speak the global ``item`` each time the main loop sets ``speak``.

    Intended to run forever in a daemon thread. While there is nothing to
    say it sleeps briefly instead of spinning, so it does not compete with
    the frame loop for CPU time. ``speak`` is always reset afterwards, even
    when synthesis or playback fails, because the main loop only classifies
    while ``speak`` is False.
    """
    global speak
    while True:
        if not speak:
            time.sleep(0.05)
            continue
        try:
            output = gTTS(text=item, lang='en', slow=False)
            output.save('output.mp3')
            os.system('mpg123 output.mp3')
        except Exception as err:
            # Thread boundary: report the problem and keep the thread alive
            # so later items can still be spoken.
            print('Speech failed:', err)
        finally:
            speak = False


speechThread = threading.Thread(target=sayItem, daemon=True)
speechThread.start()

# Webcam capture. If this device does not work, try '/dev/video0' instead.
cam = cv2.VideoCapture('/dev/video1')
cam.set(cv2.CAP_PROP_FRAME_WIDTH, width)
cam.set(cv2.CAP_PROP_FRAME_HEIGHT, height)

net = jetson.inference.imageNet('googlenet')
font = cv2.FONT_HERSHEY_SIMPLEX
timeMark = time.time()
fpsFilter = 0

try:
    while True:
        ret, frame = cam.read()
        if not ret:
            # No frame delivered (camera missing or unplugged): stop cleanly
            # instead of crashing inside cvtColor.
            print('Could not read a frame from the camera; exiting.')
            break

        # Use the size of the frame actually delivered, which can differ
        # from the requested width/height.
        frameH, frameW = frame.shape[:2]
        img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGBA).astype(np.float32)
        img = jetson.utils.cudaFromNumpy(img)

        # Only classify while nothing is being spoken, so ``item`` is not
        # changed underneath the speech thread.
        if not speak:
            classID, confidence = net.Classify(img, frameW, frameH)
            if confidence >= .5:
                item = net.GetClassDesc(classID)
                if item != itemOld:
                    speak = True
            else:
                item = ''
            itemOld = item

        # Low-pass filtered frames-per-second estimate.
        now = time.time()
        dt = now - timeMark
        timeMark = now
        if dt > 0:
            fpsFilter = .95 * fpsFilter + .05 * (1 / dt)

        cv2.putText(frame, str(round(fpsFilter, 1)) + ' fps ' + item + ' ' + str(round(confidence, 2)), (0, 30), font, 1, (0, 0, 255), 2)
        cv2.imshow('nanoCam', frame)
        cv2.moveWindow('nanoCam', 0, 0)
        if cv2.waitKey(1) == ord('q'):
            break
finally:
    # Release the camera and close the preview window even if the loop
    # exits through an exception.
    cam.release()
    cv2.destroyAllWindows()