Commit

update
cjyaddone committed Dec 19, 2022
1 parent 6bd8854 commit 5c5403c
Showing 1 changed file with 386 additions and 0 deletions.
ChatWaifuServer.py
@@ -0,0 +1,386 @@
from scipy.io.wavfile import write
from mel_processing import spectrogram_torch
from text import text_to_sequence, _clean_text
from models import SynthesizerTrn
import utils
import commons
import sys
import re
from torch import no_grad, LongTensor
import logging
from winsound import PlaySound


######Socket######
import socket

ip_port = ('127.0.0.1', 9000)
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.bind(ip_port)
s.listen(5)
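# Wire protocol (implemented in the __main__ block below): the companion UI
# connects to 127.0.0.1:9000, sends the ChatGPT session token, then the
# speaker ID, then one question per round; the server replies with ChatGPT's
# answer text and plays the synthesized output.wav locally.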

####################################
# CHATGPT INITIALIZE
from pyChatGPT import ChatGPT

idmessage = """ID Speaker
0 綾地寧々
1 在原七海
2 小茸
3 唐乐吟
"""
speakerID = 0


def get_input():
    # prompt for input
    print("You:")
    user_input = input()
    return user_input


def get_token():
    token = input("Copy your token from ChatGPT and press Enter \n")
    return token


################################################


logging.getLogger('numba').setLevel(logging.WARNING)


def ex_print(text, escape=False):
    if escape:
        print(text.encode('unicode_escape').decode())
    else:
        print(text)


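# get_text turns the input text into the LongTensor of symbol IDs expected by
# the VITS model, optionally interspersing blank tokens when the config sets
# add_blank.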
def get_text(text, hps, cleaned=False):
    if cleaned:
        text_norm = text_to_sequence(text, hps.symbols, [])
    else:
        text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = LongTensor(text_norm)
    return text_norm


def ask_if_continue():
    while True:
        answer = input('Continue? (y/n): ')
        if answer == 'y':
            break
        elif answer == 'n':
            sys.exit(0)


def print_speakers(speakers, escape=False):
    if len(speakers) > 100:
        return
    print('ID\tSpeaker')
    for id, name in enumerate(speakers):
        ex_print(str(id) + '\t' + name, escape)


def get_speaker_id(message):
    speaker_id = input(message)
    try:
        speaker_id = int(speaker_id)
    except:
        print(str(speaker_id) + ' is not a valid ID!')
        sys.exit(1)
    return speaker_id


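# get_label_value extracts an inline tag such as [LENGTH=1.2] from the text.
# For example, get_label_value('[LENGTH=1.2]你好', 'LENGTH', 1) returns
# (1.2, '你好'); if the tag is absent, the default value is returned and the
# text is left unchanged.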
def get_label_value(text, label, default, warning_name='value'):
    value = re.search(rf'\[{label}=(.+?)\]', text)
    if value:
        try:
            text = re.sub(rf'\[{label}=(.+?)\]', '', text, 1)
            value = float(value.group(1))
        except:
            print(f'Invalid {warning_name}!')
            sys.exit(1)
    else:
        value = default
    return value, text


def get_label(text, label):
    if f'[{label}]' in text:
        return True, text.replace(f'[{label}]', '')
    else:
        return False, text


def generateSound(inputString):
    if '--escape' in sys.argv:
        escape = True
    else:
        escape = False

    # model = input('Path of a VITS model: ')
    # config = input('Path of a config file: ')
    model = r".\model\CN\model.pth"
    config = r".\model\CN\config.json"

    hps_ms = utils.get_hparams_from_file(config)
    n_speakers = hps_ms.data.n_speakers if 'n_speakers' in hps_ms.data.keys() else 0
    n_symbols = len(hps_ms.symbols) if 'symbols' in hps_ms.keys() else 0
    speakers = hps_ms.speakers if 'speakers' in hps_ms.keys() else ['0']
    use_f0 = hps_ms.data.use_f0 if 'use_f0' in hps_ms.data.keys() else False
    emotion_embedding = hps_ms.data.emotion_embedding if 'emotion_embedding' in hps_ms.data.keys() else False

    net_g_ms = SynthesizerTrn(
        n_symbols,
        hps_ms.data.filter_length // 2 + 1,
        hps_ms.train.segment_size // hps_ms.data.hop_length,
        n_speakers=n_speakers,
        emotion_embedding=emotion_embedding,
        **hps_ms.model)
    _ = net_g_ms.eval()
    utils.load_checkpoint(model, net_g_ms)

    def voice_conversion():
        audio_path = input('Path of an audio file to convert:\n')
        print_speakers(speakers)
        audio = utils.load_audio_to_torch(
            audio_path, hps_ms.data.sampling_rate)

        original_id = get_speaker_id('Original speaker ID: ')
        target_id = get_speaker_id('Target speaker ID: ')
        out_path = input('Path to save: ')

        y = audio.unsqueeze(0)

        spec = spectrogram_torch(y, hps_ms.data.filter_length,
                                 hps_ms.data.sampling_rate, hps_ms.data.hop_length, hps_ms.data.win_length,
                                 center=False)
        spec_lengths = LongTensor([spec.size(-1)])
        sid_src = LongTensor([original_id])

        with no_grad():
            sid_tgt = LongTensor([target_id])
            audio = net_g_ms.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[
                0][0, 0].data.cpu().float().numpy()
        return audio, out_path

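    # Three paths follow: plain multi-speaker TTS when the model defines text
    # symbols but no emotion embedding, emotion-conditioned TTS when it does,
    # and hubert-soft voice conversion when no symbols are defined.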
    if n_symbols != 0:
        if not emotion_embedding:
            # while True:
            if True:
                # choice = input('TTS or VC? (t/v):')
                choice = 't'
                if choice == 't':
                    # text = input('Text to read: ')
                    text = inputString
                    if text == '[ADVANCED]':
                        # text = input('Raw text:')
                        text = "我不会说"  # "I can't say that"
                        # print('Cleaned text is:')
                        # ex_print(_clean_text(
                        #     text, hps_ms.data.text_cleaners), escape)
                        # continue

                    length_scale, text = get_label_value(
                        text, 'LENGTH', 1, 'length scale')
                    noise_scale, text = get_label_value(
                        text, 'NOISE', 0.667, 'noise scale')
                    noise_scale_w, text = get_label_value(
                        text, 'NOISEW', 0.8, 'deviation of noise')
                    cleaned, text = get_label(text, 'CLEANED')

                    stn_tst = get_text(text, hps_ms, cleaned=cleaned)

                    # print_speakers(speakers, escape)
                    # speaker_id = get_speaker_id('Speaker ID: ')
                    speaker_id = speakerID
                    # out_path = input('Path to save: ')
                    out_path = "output.wav"

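                    # Batch of one: run VITS inference and pull the first output
                    # waveform out as a float numpy array for scipy's write().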
                    with no_grad():
                        x_tst = stn_tst.unsqueeze(0)
                        x_tst_lengths = LongTensor([stn_tst.size(0)])
                        sid = LongTensor([speaker_id])
                        audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
                                               noise_scale_w=noise_scale_w, length_scale=length_scale)[0][
                            0, 0].data.cpu().float().numpy()

                elif choice == 'v':
                    audio, out_path = voice_conversion()

                write(out_path, hps_ms.data.sampling_rate, audio)
                print('Successfully saved!')
                # ask_if_continue()
        else:
            import os
            import librosa
            import numpy as np
            from torch import FloatTensor
            import audonnx
            w2v2_folder = input('Path of a w2v2 dimensional emotion model: ')
            w2v2_model = audonnx.load(os.path.dirname(w2v2_folder))
            # while True:
            if True:
                # choice = input('TTS or VC? (t/v):')
                choice = 't'
                if choice == 't':
                    # text = input('Text to read: ')
                    text = inputString
                    if text == '[ADVANCED]':
                        # text = input('Raw text:')
                        text = "我不会说"  # "I can't say that"
                        # print('Cleaned text is:')
                        # ex_print(_clean_text(
                        #     text, hps_ms.data.text_cleaners), escape)
                        # continue

                    length_scale, text = get_label_value(
                        text, 'LENGTH', 1, 'length scale')
                    noise_scale, text = get_label_value(
                        text, 'NOISE', 0.667, 'noise scale')
                    noise_scale_w, text = get_label_value(
                        text, 'NOISEW', 0.8, 'deviation of noise')
                    cleaned, text = get_label(text, 'CLEANED')

                    stn_tst = get_text(text, hps_ms, cleaned=cleaned)

                    # print_speakers(speakers, escape)
                    # speaker_id = get_speaker_id('Speaker ID: ')
                    speaker_id = speakerID

                    emotion_reference = input('Path of an emotion reference: ')
                    if emotion_reference.endswith('.npy'):
                        emotion = np.load(emotion_reference)
                        emotion = FloatTensor(emotion).unsqueeze(0)
                    else:
                        audio16000, sampling_rate = librosa.load(
                            emotion_reference, sr=16000, mono=True)
                        emotion = w2v2_model(audio16000, sampling_rate)[
                            'hidden_states']
                        emotion_reference = re.sub(
                            r'\..*$', '', emotion_reference)
                        np.save(emotion_reference, emotion.squeeze(0))
                        emotion = FloatTensor(emotion)

                    # out_path = input('Path to save: ')
                    out_path = "output.wav"

                    with no_grad():
                        x_tst = stn_tst.unsqueeze(0)
                        x_tst_lengths = LongTensor([stn_tst.size(0)])
                        sid = LongTensor([speaker_id])
                        audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
                                               noise_scale_w=noise_scale_w,
                                               length_scale=length_scale, emotion_embedding=emotion)[0][
                            0, 0].data.cpu().float().numpy()

                elif choice == 'v':
                    audio, out_path = voice_conversion()

                write(out_path, hps_ms.data.sampling_rate, audio)
                print('Successfully saved!')
                # ask_if_continue()
    else:
        model = input('Path of a hubert-soft model: ')
        from hubert_model import hubert_soft
        hubert = hubert_soft(model)

        while True:
            audio_path = input('Path of an audio file to convert:\n')

            if audio_path != '[VC]':
                import librosa
                if use_f0:
                    audio, sampling_rate = librosa.load(
                        audio_path, sr=hps_ms.data.sampling_rate, mono=True)
                    audio16000 = librosa.resample(
                        audio, orig_sr=sampling_rate, target_sr=16000)
                else:
                    audio16000, sampling_rate = librosa.load(
                        audio_path, sr=16000, mono=True)

                # print_speakers(speakers, escape)
                target_id = get_speaker_id('Target speaker ID: ')
                out_path = input('Path to save: ')
                length_scale, out_path = get_label_value(
                    out_path, 'LENGTH', 1, 'length scale')
                noise_scale, out_path = get_label_value(
                    out_path, 'NOISE', 0.1, 'noise scale')
                noise_scale_w, out_path = get_label_value(
                    out_path, 'NOISEW', 0.1, 'deviation of noise')

                from torch import inference_mode, FloatTensor
                import numpy as np
                with inference_mode():
                    units = hubert.units(FloatTensor(audio16000).unsqueeze(
                        0).unsqueeze(0)).squeeze(0).numpy()
                    if use_f0:
                        f0_scale, out_path = get_label_value(
                            out_path, 'F0', 1, 'f0 scale')
                        f0 = librosa.pyin(audio, sr=sampling_rate,
                                          fmin=librosa.note_to_hz('C0'),
                                          fmax=librosa.note_to_hz('C7'),
                                          frame_length=1780)[0]
                        target_length = len(units[:, 0])
                        f0 = np.nan_to_num(np.interp(np.arange(0, len(f0) * target_length, len(f0)) / target_length,
                                                     np.arange(0, len(f0)), f0)) * f0_scale
                        units[:, 0] = f0 / 10

                stn_tst = FloatTensor(units)
                with no_grad():
                    x_tst = stn_tst.unsqueeze(0)
                    x_tst_lengths = LongTensor([stn_tst.size(0)])
                    sid = LongTensor([target_id])
                    audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
                                           noise_scale_w=noise_scale_w, length_scale=length_scale)[0][
                        0, 0].data.float().numpy()

            else:
                audio, out_path = voice_conversion()

            write(out_path, hps_ms.data.sampling_rate, audio)
            print('Successfully saved!')
            # ask_if_continue()


if __name__ == "__main__":
    print("链接已生成,等待UI连接")  # "Listener created, waiting for the UI to connect"
    client, client_addr = s.accept()
    print("链接已建立,等待接受token")  # "Connection established, waiting for the token"

    total_data = bytes()
    while True:
        data = client.recv(1024)
        total_data += data
        if len(data) < 1024:
            break
    session_token = total_data.decode()
    print(session_token)
    if session_token:
        print("收到token:" + session_token)  # "Token received: ..."
        api = ChatGPT(session_token)
        speakerID = int(client.recv(1024).decode())

        while True:
            total_data = bytes()
            while True:
                data = client.recv(1024)
                total_data += data
                if len(data) < 1024:
                    break
            question = total_data.decode()
            print("Question Received: " + question)
            if len(question) > 0:
                resp = api.send_message(str(question))
                answer = resp["message"].replace('\n', '')
                print("ChatGPT:")
                print(answer)
                generateSound("[ZH]" + answer + "[ZH]")
                client.send(answer.encode())
                PlaySound(r'.\output.wav', flags=1)

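# A minimal, hypothetical client-side sketch (not part of this file) of the
# exchange the server above expects from the companion UI. The names, buffer
# sizes, and the short pause between sends are illustrative assumptions only.
#
#   import socket, time
#   session_token = '...your ChatGPT session token...'
#   c = socket.create_connection(('127.0.0.1', 9000))
#   c.sendall(session_token.encode())     # 1) ChatGPT session token
#   time.sleep(0.5)                       #    sent separately from the speaker ID
#   c.sendall(b'0')                       # 2) speaker ID (e.g. 0 = 綾地寧々)
#   c.sendall('你好'.encode())            # 3) one question per round
#   print(c.recv(4096).decode())          #    the answer text comes back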
