
What are your_path_to_yolox_l.onnx and your_path_to_dw-ll_ucoco_384.onnx? #121

JV-X opened this issue Jan 7, 2025 · 6 comments

JV-X commented Jan 7, 2025

I set up the echomimic environment locally and tried to generate a digital human from my own photo and audio, but the result was severely distorted. In the last reply of #35 I saw that a demo.ipynb script was provided for "aligning" the image, so I copied the code from demo.ipynb into a file named fix.py, changed refimg_path and audio_path to my own photo and audio, and ran it. I hit the following error:

(echomimic) hygx@hygx:~/code/echomimic_v2$ python fix.py
dwpose_detector init ok cuda
Traceback (most recent call last):
  File "/home/hygx/code/echomimic_v2/fix.py", line 271, in <module>
    detected_poses, height, width, ori_frame = get_img_pose(refimg_path, max_frame=None)
  File "/home/hygx/code/echomimic_v2/fix.py", line 254, in get_img_pose
    detected_poses = [dwprocessor(frame)]
  File "/home/hygx/code/echomimic_v2/src/models/dwpose/dwpose_detector.py", line 32, in __call__
    self.pose_estimation = Wholebody(*self.args)
  File "/home/hygx/code/echomimic_v2/src/models/dwpose/wholebody.py", line 15, in __init__
    self.session_det = ort.InferenceSession(
  File "/home/hygx/anaconda3/envs/echomimic/lib/python3.10/site-packages/onnxruntime/capi/onnxruntime_inference_collection.py", line 465, in __init__
    self._create_inference_session(providers, provider_options, disabled_optimizers)
  File "/home/hygx/anaconda3/envs/echomimic/lib/python3.10/site-packages/onnxruntime/capi/onnxruntime_inference_collection.py", line 526, in _create_inference_session
    sess = C.InferenceSession(session_options, self._model_path, True, self._read_config_from_model)
onnxruntime.capi.onnxruntime_pybind11_state.NoSuchFile: [ONNXRuntimeError] : 3 : NO_SUCHFILE : Load model from your_path_to_yolox_l.onnx failed:Load model your_path_to_yolox_l.onnx failed. File doesn't exist

The code in src/models/dwpose/dwpose_detector.py defines the detector like this:

dwpose_detector = DWposeDetector(
    model_det="your_path_to_yolox_l.onnx",
    model_pose="your_path_to_dw-ll_ucoco_384.onnx",
    device=device)
print('dwpose_detector init ok', device)

What are these two onnx files, and where should I get them?

Thank you for your reply.


JV-X commented Jan 7, 2025

I solved this; the files can be downloaded here: https://huggingface.co/yzd-v/DWPose/tree/main
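
For anyone else who lands here, a minimal sketch of fetching the two models with huggingface_hub and pointing the detector at them (the filenames below are the ones hosted in that repo; adjust if they differ):

# sketch: download the DWPose ONNX weights, then use the returned local paths
from huggingface_hub import hf_hub_download

model_det = hf_hub_download(repo_id="yzd-v/DWPose", filename="yolox_l.onnx")
model_pose = hf_hub_download(repo_id="yzd-v/DWPose", filename="dw-ll_ucoco_384.onnx")

# then replace the placeholders in src/models/dwpose/dwpose_detector.py:
# dwpose_detector = DWposeDetector(
#     model_det=model_det,
#     model_pose=model_pose,
#     device=device)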


nitinmukesh commented Jan 7, 2025

@JV-X

I am looking for the same. Could you please share the Python script (fix.py) with me so I can extract the pose locally?


JV-X commented Jan 8, 2025

@JV-X

I am looking for the same. Could you please share the Python script (fix.py) with me so I can extract the pose locally?

In fact, it is the code in demo.ipynb:

# reference image aligned
refimg_path = './assets/halfbody_demo/refimag/x.jpg'
audio_path = './assets/halfbody_demo/audio/chinese/x.wav'
using_video_driving = False
if not using_video_driving:
    pose_path = './assets/halfbody_demo/pose/good'

import sys
from src.utils.img_utils import pil_to_cv2, cv2_to_pil, center_crop_cv2, pils_from_video, save_videos_from_pils, save_video_from_cv2_list
from PIL import Image
import cv2
from IPython import embed
import numpy as np
import copy
from src.utils.motion_utils import motion_sync
import pathlib
import torch
import pickle
from glob import glob
import os
from src.models.dwpose.dwpose_detector import dwpose_detector as dwprocessor
from src.models.dwpose.util import draw_pose
import decord
from tqdm import tqdm
from moviepy.editor import AudioFileClip, VideoFileClip
from multiprocessing.pool import ThreadPool

##################################
process_num = 100  # 1266

start = 0
end = process_num + start
#################################
MAX_SIZE = 768

def convert_fps(src_path, tgt_path, tgt_fps=24, tgt_sr=16000):
    clip = VideoFileClip(src_path)
    new_clip = clip.set_fps(tgt_fps)
    if tgt_fps is not None:
        audio = new_clip.audio
        audio = audio.set_fps(tgt_sr)
        new_clip = new_clip.set_audio(audio)
    if '.mov' in tgt_path:
        tgt_path = tgt_path.replace('.mov', '.mp4')
    new_clip.write_videofile(tgt_path, codec='libx264', audio_codec='aac')
    
def get_video_pose(
        video_path: str, 
        sample_stride: int=1,
        max_frame=None):

    # read input video
    vr = decord.VideoReader(video_path, ctx=decord.cpu(0))
    sample_stride *= max(1, int(vr.get_avg_fps() / 24))

    frames = vr.get_batch(list(range(0, len(vr), sample_stride))).asnumpy()
    # print(frames[0])
    if max_frame is not None:
        frames = frames[0:max_frame,:,:]
    height, width, _ = frames[0].shape
    detected_poses = [dwprocessor(frm) for frm in frames]
    dwprocessor.release_memory()

    return detected_poses, height, width, frames

def resize_and_pad(img, max_size):
    img_new = np.zeros((max_size, max_size, 3)).astype('uint8')
    imh, imw = img.shape[0], img.shape[1]
    half = max_size // 2
    if imh > imw:
        imh_new = max_size
        imw_new = int(round(imw/imh * imh_new))
        half_w = imw_new // 2
        rb, re = 0, max_size
        cb = half-half_w
        ce = cb + imw_new
    else:
        imw_new = max_size
        imh_new = int(round(imh/imw * imw_new))
        half_h = imh_new // 2
        cb, ce = 0, max_size
        rb = half-half_h
        re = rb + imh_new

    img_resize = cv2.resize(img, (imw_new, imh_new))
    img_new[rb:re,cb:ce,:] = img_resize
    return img_new

def resize_and_pad_param(imh, imw, max_size):
    half = max_size // 2
    if imh > imw:
        imh_new = max_size
        imw_new = int(round(imw/imh * imh_new))
        half_w = imw_new // 2
        rb, re = 0, max_size
        cb = half-half_w
        ce = cb + imw_new
    else:
        imw_new = max_size
        imh_new = int(round(imh/imw * imw_new))
        # NOTE: overrides the ratio-based height above; the crops produced
        # upstream are square, so this amounts to a full-canvas resize
        imh_new = max_size

        half_h = imh_new // 2
        cb, ce = 0, max_size
        rb = half-half_h
        re = rb + imh_new

    return imh_new, imw_new, rb, re, cb, ce

def get_pose_params(detected_poses, max_size):
    print('get_pose_params...')
    # pose rescale 
    w_min_all, w_max_all, h_min_all, h_max_all = [], [], [], []
    mid_all = []
    for num, detected_pose in enumerate(detected_poses):
        detected_poses[num]['num'] = num
        candidate_body = detected_pose['bodies']['candidate']
        score_body = detected_pose['bodies']['score']
        candidate_face = detected_pose['faces']
        score_face = detected_pose['faces_score']
        candidate_hand = detected_pose['hands']
        score_hand = detected_pose['hands_score']

        # face
        if candidate_face.shape[0] > 1:
            index = 0
            candidate_face = candidate_face[index]
            score_face = score_face[index]
            detected_poses[num]['faces'] = candidate_face.reshape(1, candidate_face.shape[0], candidate_face.shape[1])
            detected_poses[num]['faces_score'] = score_face.reshape(1, score_face.shape[0])
        else:
            candidate_face = candidate_face[0]
            score_face = score_face[0]

        # body
        if score_body.shape[0] > 1:
            tmp_score = []
            for k in range(0, score_body.shape[0]):
                tmp_score.append(score_body[k].mean())
            index = np.argmax(tmp_score)
            candidate_body = candidate_body[index*18:(index+1)*18,:]
            score_body = score_body[index]
            score_hand = score_hand[(index*2):(index*2+2),:]
            candidate_hand = candidate_hand[(index*2):(index*2+2),:,:]
        else:
            score_body = score_body[0]
        all_pose = np.concatenate((candidate_body, candidate_face))
        all_score = np.concatenate((score_body, score_face))
        all_pose = all_pose[all_score>0.8]

        body_pose = np.concatenate((candidate_body,))
        mid_ = body_pose[1, 0]

        face_pose = candidate_face
        hand_pose = candidate_hand

        h_min, h_max = np.min(face_pose[:,1]), np.max(body_pose[:7,1])

        h_ = h_max - h_min
        
        mid_w = mid_
        w_min = mid_w - h_ // 2
        w_max = mid_w + h_ // 2
        
        w_min_all.append(w_min)
        w_max_all.append(w_max)
        h_min_all.append(h_min)
        h_max_all.append(h_max)
        mid_all.append(mid_w)

    w_min = np.min(w_min_all)
    w_max = np.max(w_max_all)
    h_min = np.min(h_min_all)
    h_max = np.max(h_max_all)
    mid = np.mean(mid_all)

    margin_ratio = 0.25
    h_margin = (h_max-h_min)*margin_ratio
    
    h_min = max(h_min-h_margin*0.65, 0)
    h_max = min(h_max+h_margin*0.05, 1)

    h_new = h_max - h_min
    
    # NOTE: height and width are module-level globals assigned by the
    # get_img_pose() call at the bottom of the script
    h_min_real = int(h_min*height)
    h_max_real = int(h_max*height)
    mid_real = int(mid*width)

    height_new = h_max_real-h_min_real+1
    width_new = height_new
    w_min_real = mid_real - width_new // 2
    if w_min_real < 0:
        w_min_real = 0
        width_new = mid_real * 2

    w_max_real = w_min_real + width_new
    w_min = w_min_real / width
    w_max = w_max_real / width

    imh_new, imw_new, rb, re, cb, ce = resize_and_pad_param(height_new, width_new, max_size)
    res = {'draw_pose_params': [imh_new, imw_new, rb, re, cb, ce], 
           'pose_params': [w_min, w_max, h_min, h_max],
           'video_params': [h_min_real, h_max_real, w_min_real, w_max_real],
           }
    return res

def save_pose_params_item(input_items):
    detected_pose, pose_params, draw_pose_params, save_dir = input_items
    w_min, w_max, h_min, h_max = pose_params
    num = detected_pose['num']
    candidate_body = detected_pose['bodies']['candidate']
    candidate_face = detected_pose['faces'][0]
    candidate_hand = detected_pose['hands']
    candidate_body[:,0] = (candidate_body[:,0]-w_min)/(w_max-w_min)
    candidate_body[:,1] = (candidate_body[:,1]-h_min)/(h_max-h_min)
    candidate_face[:,0] = (candidate_face[:,0]-w_min)/(w_max-w_min)
    candidate_face[:,1] = (candidate_face[:,1]-h_min)/(h_max-h_min)
    candidate_hand[:,:,0] = (candidate_hand[:,:,0]-w_min)/(w_max-w_min)
    candidate_hand[:,:,1] = (candidate_hand[:,:,1]-h_min)/(h_max-h_min)
    detected_pose['bodies']['candidate'] = candidate_body
    detected_pose['faces'] = candidate_face.reshape(1, candidate_face.shape[0], candidate_face.shape[1])
    detected_pose['hands'] = candidate_hand
    detected_pose['draw_pose_params'] = draw_pose_params
    np.save(save_dir+'/'+str(num)+'.npy', detected_pose)

def save_pose_params(detected_poses, pose_params, draw_pose_params, ori_video_path):
    save_dir = ori_video_path.replace('video', 'pose/')
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    input_list = []
    
    for i, detected_pose in enumerate(detected_poses):
        input_list.append([detected_pose, pose_params, draw_pose_params, save_dir])

    pool = ThreadPool(8)
    pool.map(save_pose_params_item, input_list)
    pool.close()
    pool.join()
    return save_dir

def get_img_pose(
        img_path: str,
        sample_stride: int=1,
        max_frame=None):

    # read input img
    frame = cv2.imread(img_path)
    height, width, _ = frame.shape
    short_size = min(height, width)
    resize_ratio = max(MAX_SIZE / short_size, 1.0)
    frame = cv2.resize(frame, (int(resize_ratio * width), int(resize_ratio * height)))
    height, width, _ = frame.shape
    detected_poses = [dwprocessor(frame)]
    dwprocessor.release_memory()

    return detected_poses, height, width, frame

def save_aligned_img(ori_frame, video_params, max_size):
    h_min_real, h_max_real, w_min_real, w_max_real = video_params
    img = ori_frame[h_min_real:h_max_real,w_min_real:w_max_real,:]
    img_aligned = resize_and_pad(img, max_size=max_size)
    print('aligned img shape:', img_aligned.shape)
    save_dir = './assets/refimg_aligned'

    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(save_dir, 'aligned.png')
    cv2.imwrite(save_path, img_aligned)
    return save_path

detected_poses, height, width, ori_frame = get_img_pose(refimg_path, max_frame=None)
res_params = get_pose_params(detected_poses, MAX_SIZE)
refimg_aligned_path = save_aligned_img(ori_frame, res_params['video_params'], MAX_SIZE)
print('aligned reference image saved to:', refimg_aligned_path)
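
If the script runs cleanly, the aligned image is written to ./assets/refimg_aligned/aligned.png. A quick sanity check on the output (nothing repo-specific, just reloading the file):

# verify the aligned reference image exists and has the expected size
import cv2
img = cv2.imread('./assets/refimg_aligned/aligned.png')
assert img is not None, 'aligned image was not written'
print(img.shape)  # expected: (768, 768, 3), i.e. MAX_SIZE x MAX_SIZE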


nitinmukesh commented Jan 8, 2025

@JV-X

Thank you.


JV-X commented Jan 9, 2025

@JV-X

Thank you.

You're welcome.

@nitinmukesh

@JV-X

Did you get good results with your own video? I tried, and there are problems.

mengrang added the done label Jan 10, 2025