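# Socket-driven ChatGPT-to-speech server: a UI client connects over a local
# TCP socket, supplies a ChatGPT session token and a speaker ID, and then
# sends questions; each answer is synthesized with a VITS model and played.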
from scipy.io.wavfile import write
from mel_processing import spectrogram_torch
from text import text_to_sequence, _clean_text
from models import SynthesizerTrn
import utils
import commons
import sys
import re
from torch import no_grad, LongTensor
import logging
from winsound import PlaySound


######Socket######
import socket

ip_port = ('127.0.0.1', 9000)
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.bind(ip_port)
s.listen(5)
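# The companion UI connects to this local listener (127.0.0.1:9000); all
# token/question traffic in __main__ flows over the accepted client socket.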

####################################
# CHATGPT INITIALIZE
from pyChatGPT import ChatGPT

idmessage = """ID Speaker
0 綾地寧々
1 在原七海
2 小茸
3 唐乐吟
"""
speakerID = 0
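# Speaker IDs for the bundled model; speakerID is replaced in __main__ by
# the value sent from the UI client.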


def get_input():
    # prompt for input
    print("You:")
    user_input = input()
    return user_input


def get_token():
    token = input("Copy your token from ChatGPT and press Enter \n")
    return token


################################################


logging.getLogger('numba').setLevel(logging.WARNING)


def ex_print(text, escape=False):
    if escape:
        print(text.encode('unicode_escape').decode())
    else:
        print(text)


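# Map raw text to the integer symbol sequence VITS expects; optionally
# intersperse blank tokens when the config sets add_blank.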
def get_text(text, hps, cleaned=False):
    if cleaned:
        text_norm = text_to_sequence(text, hps.symbols, [])
    else:
        text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = LongTensor(text_norm)
    return text_norm


def ask_if_continue():
    while True:
        answer = input('Continue? (y/n): ')
        if answer == 'y':
            break
        elif answer == 'n':
            sys.exit(0)


def print_speakers(speakers, escape=False):
    if len(speakers) > 100:
        return
    print('ID\tSpeaker')
    for id, name in enumerate(speakers):
        ex_print(str(id) + '\t' + name, escape)


def get_speaker_id(message):
    speaker_id = input(message)
    try:
        speaker_id = int(speaker_id)
    except ValueError:
        print(str(speaker_id) + ' is not a valid ID!')
        sys.exit(1)
    return speaker_id


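# Pop an inline [LABEL=value] tag (e.g. [LENGTH=1.2]) out of the text and
# return the parsed value (or the default) plus the stripped text.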
def get_label_value(text, label, default, warning_name='value'):
    value = re.search(rf'\[{label}=(.+?)\]', text)
    if value:
        try:
            text = re.sub(rf'\[{label}=(.+?)\]', '', text, 1)
            value = float(value.group(1))
        except ValueError:
            print(f'Invalid {warning_name}!')
            sys.exit(1)
    else:
        value = default
    return value, text


def get_label(text, label):
    if f'[{label}]' in text:
        return True, text.replace(f'[{label}]', '')
    else:
        return False, text


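# Synthesize inputString with the Chinese VITS model under .\model\CN and
# write the result to output.wav.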
def generateSound(inputString):
    if '--escape' in sys.argv:
        escape = True
    else:
        escape = False

    # model = input('Path of a VITS model: ')
    # config = input('Path of a config file: ')
    model = r".\model\CN\model.pth"
    config = r".\model\CN\config.json"

    hps_ms = utils.get_hparams_from_file(config)
    n_speakers = hps_ms.data.n_speakers if 'n_speakers' in hps_ms.data.keys() else 0
    n_symbols = len(hps_ms.symbols) if 'symbols' in hps_ms.keys() else 0
    speakers = hps_ms.speakers if 'speakers' in hps_ms.keys() else ['0']
    use_f0 = hps_ms.data.use_f0 if 'use_f0' in hps_ms.data.keys() else False
    emotion_embedding = hps_ms.data.emotion_embedding if 'emotion_embedding' in hps_ms.data.keys() else False

    net_g_ms = SynthesizerTrn(
        n_symbols,
        hps_ms.data.filter_length // 2 + 1,
        hps_ms.train.segment_size // hps_ms.data.hop_length,
        n_speakers=n_speakers,
        emotion_embedding=emotion_embedding,
        **hps_ms.model)
    _ = net_g_ms.eval()
    utils.load_checkpoint(model, net_g_ms)

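    # Speaker-to-speaker conversion kept from the original CLI; the TTS
    # branches below pin choice to 't', so this is only reached via the
    # '[VC]' path of the hubert branch.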
    def voice_conversion():
        audio_path = input('Path of an audio file to convert:\n')
        print_speakers(speakers)
        audio = utils.load_audio_to_torch(
            audio_path, hps_ms.data.sampling_rate)

        original_id = get_speaker_id('Original speaker ID: ')
        target_id = get_speaker_id('Target speaker ID: ')
        out_path = input('Path to save: ')

        y = audio.unsqueeze(0)

        spec = spectrogram_torch(y, hps_ms.data.filter_length,
                                 hps_ms.data.sampling_rate, hps_ms.data.hop_length, hps_ms.data.win_length,
                                 center=False)
        spec_lengths = LongTensor([spec.size(-1)])
        sid_src = LongTensor([original_id])

        with no_grad():
            sid_tgt = LongTensor([target_id])
            audio = net_g_ms.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[
                0][0, 0].data.cpu().float().numpy()
        return audio, out_path

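    # Text-to-speech path; the original interactive prompts are left
    # commented out and choice is pinned to 't'.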
    if n_symbols != 0:
        if not emotion_embedding:
            # while True:
            if True:
                # choice = input('TTS or VC? (t/v):')
                choice = 't'
                if choice == 't':
                    # text = input('Text to read: ')
                    text = inputString
                    if text == '[ADVANCED]':
                        # text = input('Raw text:')
                        text = "我不会说"  # "I won't say it"
                        # print('Cleaned text is:')
                        # ex_print(_clean_text(
                        #     text, hps_ms.data.text_cleaners), escape)
                        # continue

                    length_scale, text = get_label_value(
                        text, 'LENGTH', 1, 'length scale')
                    noise_scale, text = get_label_value(
                        text, 'NOISE', 0.667, 'noise scale')
                    noise_scale_w, text = get_label_value(
                        text, 'NOISEW', 0.8, 'deviation of noise')
                    cleaned, text = get_label(text, 'CLEANED')

                    stn_tst = get_text(text, hps_ms, cleaned=cleaned)

                    # print_speakers(speakers, escape)
                    # speaker_id = get_speaker_id('Speaker ID: ')
                    speaker_id = speakerID
                    # out_path = input('Path to save: ')
                    out_path = "output.wav"

                    with no_grad():
                        x_tst = stn_tst.unsqueeze(0)
                        x_tst_lengths = LongTensor([stn_tst.size(0)])
                        sid = LongTensor([speaker_id])
                        audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
                                               noise_scale_w=noise_scale_w, length_scale=length_scale)[0][
                            0, 0].data.cpu().float().numpy()

                elif choice == 'v':
                    audio, out_path = voice_conversion()

                write(out_path, hps_ms.data.sampling_rate, audio)
                print('Successfully saved!')
                # ask_if_continue()
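        # Same TTS flow, but conditioned on a w2v2 emotion embedding extracted
        # from a reference clip (or loaded from a cached .npy).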
        else:
            import os
            import librosa
            import numpy as np
            from torch import FloatTensor
            import audonnx
            w2v2_folder = input('Path of a w2v2 dimensional emotion model: ')
            w2v2_model = audonnx.load(os.path.dirname(w2v2_folder))
            # while True:
            if True:
                # choice = input('TTS or VC? (t/v):')
                choice = 't'
                if choice == 't':
                    # text = input('Text to read: ')
                    text = inputString
                    if text == '[ADVANCED]':
                        # text = input('Raw text:')
                        text = "我不会说"  # "I won't say it"
                        # print('Cleaned text is:')
                        # ex_print(_clean_text(
                        #     text, hps_ms.data.text_cleaners), escape)
                        # continue

                    length_scale, text = get_label_value(
                        text, 'LENGTH', 1, 'length scale')
                    noise_scale, text = get_label_value(
                        text, 'NOISE', 0.667, 'noise scale')
                    noise_scale_w, text = get_label_value(
                        text, 'NOISEW', 0.8, 'deviation of noise')
                    cleaned, text = get_label(text, 'CLEANED')

                    stn_tst = get_text(text, hps_ms, cleaned=cleaned)

                    # print_speakers(speakers, escape)
                    # speaker_id = get_speaker_id('Speaker ID: ')
                    speaker_id = speakerID

                    emotion_reference = input('Path of an emotion reference: ')
                    if emotion_reference.endswith('.npy'):
                        emotion = np.load(emotion_reference)
                        emotion = FloatTensor(emotion).unsqueeze(0)
                    else:
                        audio16000, sampling_rate = librosa.load(
                            emotion_reference, sr=16000, mono=True)
                        emotion = w2v2_model(audio16000, sampling_rate)[
                            'hidden_states']
                        emotion_reference = re.sub(
                            r'\..*$', '', emotion_reference)
                        np.save(emotion_reference, emotion.squeeze(0))
                        emotion = FloatTensor(emotion)

                    # out_path = input('Path to save: ')
                    out_path = "output.wav"

                    with no_grad():
                        x_tst = stn_tst.unsqueeze(0)
                        x_tst_lengths = LongTensor([stn_tst.size(0)])
                        sid = LongTensor([speaker_id])
                        audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
                                               noise_scale_w=noise_scale_w,
                                               length_scale=length_scale, emotion_embedding=emotion)[0][
                            0, 0].data.cpu().float().numpy()

                elif choice == 'v':
                    audio, out_path = voice_conversion()

                write(out_path, hps_ms.data.sampling_rate, audio)
                print('Successfully saved!')
                # ask_if_continue()
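    # No text symbols in the config: fall back to hubert-soft voice conversion.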
    else:
        model = input('Path of a hubert-soft model: ')
        from hubert_model import hubert_soft
        hubert = hubert_soft(model)

        while True:
            audio_path = input('Path of an audio file to convert:\n')

            if audio_path != '[VC]':
                import librosa
                if use_f0:
                    audio, sampling_rate = librosa.load(
                        audio_path, sr=hps_ms.data.sampling_rate, mono=True)
                    audio16000 = librosa.resample(
                        audio, orig_sr=sampling_rate, target_sr=16000)
                else:
                    audio16000, sampling_rate = librosa.load(
                        audio_path, sr=16000, mono=True)

                # print_speakers(speakers, escape)
                target_id = get_speaker_id('Target speaker ID: ')
                out_path = input('Path to save: ')
                length_scale, out_path = get_label_value(
                    out_path, 'LENGTH', 1, 'length scale')
                noise_scale, out_path = get_label_value(
                    out_path, 'NOISE', 0.1, 'noise scale')
                noise_scale_w, out_path = get_label_value(
                    out_path, 'NOISEW', 0.1, 'deviation of noise')

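                # Extract hubert-soft content units; with use_f0, estimate
                # pitch via pyin and write the scaled f0 into unit channel 0.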
                from torch import inference_mode, FloatTensor
                import numpy as np
                with inference_mode():
                    units = hubert.units(FloatTensor(audio16000).unsqueeze(
                        0).unsqueeze(0)).squeeze(0).numpy()
                    if use_f0:
                        f0_scale, out_path = get_label_value(
                            out_path, 'F0', 1, 'f0 scale')
                        f0 = librosa.pyin(audio, sr=sampling_rate,
                                          fmin=librosa.note_to_hz('C0'),
                                          fmax=librosa.note_to_hz('C7'),
                                          frame_length=1780)[0]
                        target_length = len(units[:, 0])
                        f0 = np.nan_to_num(np.interp(np.arange(0, len(f0) * target_length, len(f0)) / target_length,
                                                     np.arange(0, len(f0)), f0)) * f0_scale
                        units[:, 0] = f0 / 10

                stn_tst = FloatTensor(units)
                with no_grad():
                    x_tst = stn_tst.unsqueeze(0)
                    x_tst_lengths = LongTensor([stn_tst.size(0)])
                    sid = LongTensor([target_id])
                    audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
                                           noise_scale_w=noise_scale_w, length_scale=length_scale)[0][
                        0, 0].data.float().numpy()

            else:
                audio, out_path = voice_conversion()

            write(out_path, hps_ms.data.sampling_rate, audio)
            print('Successfully saved!')
            # ask_if_continue()


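# Accept one UI connection, then serve it: first receive the ChatGPT session
# token and the speaker ID, then answer one question per round trip; each
# answer is spoken via generateSound and echoed back to the client.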
if __name__ == "__main__": | ||
print("链接已生成,等待UI连接") | ||
client, client_addr = s.accept() | ||
print("链接已建立,等待接受token") | ||
|
||
total_data = bytes() | ||
while True: | ||
data = client.recv(1024) | ||
total_data += data | ||
if len(data) < 1024: | ||
break | ||
session_token = total_data.decode() | ||
print(session_token) | ||
if(session_token): | ||
print("收到token:"+ session_token) | ||
api = ChatGPT(session_token) | ||
speakerID = int(client.recv(1024).decode()) | ||
|
||
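    # Request loop: messages are framed by a short read (a recv returning
    # fewer than 1024 bytes ends the message).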
    while True:
        total_data = bytes()
        while True:
            data = client.recv(1024)
            total_data += data
            if len(data) < 1024:
                break
        question = total_data.decode()
        print("Question Received: " + question)
        if len(question) > 0:
            resp = api.send_message(str(question))
            answer = resp["message"].replace('\n', '')
            print("ChatGPT:")
            print(answer)
            generateSound("[ZH]" + answer + "[ZH]")
            client.send(answer.encode())
            PlaySound(r'.\output.wav', flags=1)