olivebot/test/ovr_lipsync/test_olipsync.py
xszyou 875ed28b58 初始上传
初始上传
2023-12-12 00:03:36 +08:00

109 lines
3.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import subprocess
import time
import os
os.environ['PATH'] += os.pathsep + os.path.join(os.getcwd(), "test", "ovr_lipsync", "ffmpeg", "bin")
from pydub import AudioSegment
import json
def list_files(dir_path):
    """Print the path of every file found beneath ``dir_path``.

    The tree is traversed with ``os.walk``; each file is printed on its own
    line as the walked directory joined with the file name.
    """
    for parent, _subdirs, names in os.walk(dir_path):
        for name in names:
            full_path = os.path.join(parent, name)
            print(full_path)
class LipSyncGenerator:
    """Produce a viseme timeline for an MP3 file via the ProcessWAV.exe tool.

    The MP3 is converted to a 44.1 kHz WAV, the executable is run on it, its
    output lines are collected, and the recognised viseme names are kept.
    ``consolidate_visemes`` then collapses runs of identical visemes into
    ``{"Lip": name, "Time": milliseconds}`` entries.
    """

    # Milliseconds credited to each viseme line printed by the executable
    # (presumably one line per ~30 fps analysis frame -- TODO confirm).
    FRAME_MS = 33
    # Segments shorter than this are folded into the segment before them.
    MIN_SEGMENT_MS = 30

    def __init__(self):
        # Viseme names accepted by filter(); every other output line of the
        # executable is discarded.
        self.viseme_em = [
            "sil", "PP", "FF", "TH", "DD",
            "kk", "CH", "SS", "nn", "RR",
            "aa", "E", "ih", "oh", "ou"]
        # Raw output lines collected by run_exe_and_get_output().
        self.viseme = []
        self.exe_path = os.path.join(
            os.getcwd(), "test", "ovr_lipsync", "ovr_lipsync_exe", "ProcessWAV.exe")

    def convert_mp3_to_wav(self, mp3_filepath):
        """Convert ``mp3_filepath`` to a 44.1 kHz WAV file next to it.

        Returns the path of the written WAV file (same path with the
        extension replaced by ``.wav``).
        """
        audio = AudioSegment.from_mp3(mp3_filepath)
        # Use set_frame_rate to set the sample rate.
        audio = audio.set_frame_rate(44100)
        # splitext only strips a real file extension, so a dot inside a
        # directory name can no longer truncate the path.
        wav_filepath = os.path.splitext(mp3_filepath)[0] + ".wav"
        audio.export(wav_filepath, format="wav")
        return wav_filepath

    def run_exe_and_get_output(self, arguments):
        """Run the executable with ``arguments`` and collect its output lines.

        Each line of combined stdout/stderr is stripped, decoded and appended
        to ``self.viseme``. Returns the process exit code.
        """
        command = [self.exe_path] + list(arguments)
        # The context manager closes the pipe and waits for the process, so
        # there is no polling loop and no leaked file handle.
        with subprocess.Popen(command, stdout=subprocess.PIPE,
                              stderr=subprocess.STDOUT) as process:
            for raw_line in process.stdout:
                # Undecodable bytes are replaced rather than raising; such
                # lines are not viseme names and get dropped by filter().
                self.viseme.append(raw_line.strip().decode(errors="replace"))
        return process.returncode

    def filter(self, viseme):
        """Return the entries of ``viseme`` that are known viseme names.

        Order and duplicates are preserved; anything not listed in
        ``self.viseme_em`` is removed.
        """
        return [name for name in viseme if name in self.viseme_em]

    def generate_visemes(self, mp3_filepath):
        """Return the list of viseme names the executable reports for an MP3."""
        wav_filepath = self.convert_mp3_to_wav(mp3_filepath)
        arguments = ["--print-viseme-name", wav_filepath]
        # Start from a clean buffer so a second call does not also return the
        # visemes collected for a previous file.
        self.viseme = []
        self.run_exe_and_get_output(arguments)
        return self.filter(self.viseme)

    def consolidate_visemes(self, viseme_list):
        """Collapse consecutive identical visemes into timed segments.

        Returns a list of ``{"Lip": name, "Time": ms}`` dicts where ``ms`` is
        the run length multiplied by ``FRAME_MS``. A segment shorter than
        ``MIN_SEGMENT_MS`` is merged into the preceding segment; if it has no
        predecessor it is kept rather than silently dropped.
        """
        if not viseme_list:
            return []

        # Pass 1: run-length encode, crediting FRAME_MS per entry.
        segments = []
        for name in viseme_list:
            if segments and segments[-1]["Lip"] == name:
                segments[-1]["Time"] += self.FRAME_MS
            else:
                segments.append({"Lip": name, "Time": self.FRAME_MS})

        # Pass 2: fold too-short segments into the one before them.
        merged = []
        for segment in segments:
            if segment["Time"] < self.MIN_SEGMENT_MS and merged:
                merged[-1]["Time"] += segment["Time"]
            else:
                merged.append(segment)
        return merged
if __name__ == "__main__":
    # Manual smoke test: generate visemes for one sample MP3, print them with
    # the elapsed time, and write the resulting interface message to
    # notepad.txt.
    mp3_path = r"E:\github\Fay assistant\samples\sample-1701224060795.mp3"
    # convert_mp3_to_wav() writes the WAV next to the MP3 with the same stem.
    wav_path = os.path.splitext(mp3_path)[0] + ".wav"

    start_time = time.time()
    lip_sync_generator = LipSyncGenerator()
    viseme_list = lip_sync_generator.generate_visemes(mp3_path)
    print(viseme_list)
    consolidated_visemes = lip_sync_generator.consolidate_visemes(viseme_list)
    print(json.dumps(consolidated_visemes))
    print(time.time() - start_time)

    audio = AudioSegment.from_file(wav_path)
    audio_length = len(audio)  # duration in milliseconds

    # Build the message as a dict and serialise it with json.dumps so the
    # output is valid JSON: the hand-written template was missing a comma,
    # rendered the lip list as a Python repr and left backslashes unescaped.
    interface_message = {
        "Topic": "Unreal",
        "Data": {
            "Key": "audio",
            "Value": mp3_path,
            "Text": "我叫Fay,我今年18岁很年青。",
            "Lips": consolidated_visemes,
            "Time": audio_length / 1000,
            "Type": "interact",
        },
    }
    interface_text = json.dumps(interface_message, ensure_ascii=False, indent=4)
    # Explicit UTF-8 so the non-ASCII text is written correctly regardless of
    # the platform's default encoding.
    with open("notepad.txt", "w", encoding="utf-8") as f:
        f.write(interface_text)