avatardatatec.studio

Free AI web copilot to create summaries, insights and extended knowledge. Download it here.

3747

Abstract

from bark.generation import load_codec_model, generate_text_semantic
from encodec.utils import convert_audio

import torchaudio
import torch

device = 'cuda' # or 'cpu'
model = load_codec_model(use_gpu=True if device == 'cuda' else False)

from hubert.hubert_manager import HuBERTManager
hubert_manager = HuBERTManager()
hubert_manager.make_sure_hubert_installed()
hubert_manager.make_sure_tokenizer_installed()

from hubert.pre_kmeans_hubert import CustomHubert
from hubert.customtokenizer import CustomTokenizer
hubert_model = CustomHubert(checkpoint_path='data/models/hubert/hubert.pt').to(device)

tokenizer = CustomTokenizer.load_from_checkpoint('data/models/hubert/tokenizer.pth').to(device) # Automatically uses the right layers

###############################
# mount google driver, prepare .wav file
###############################
from google.colab import drive
drive.mount('/content/drive')

# Please create a folder "AI_Voice_Lab" and a sub-folder "Voice_Output" in your google driver at first.
%cd /content/drive/MyDrive/AI_Voice_Lab/

lab_folder = "/content/drive/MyDrive/AI_Voice_Lab/"
audio_filepath = lab_folder + "ylcn.wav" # the audio you want to clone (under 13 seconds)

voice_name = 'output' # whatever you want the name of the voice to be
output_path = lab_folder + "Voice_Output/" + voice_name + '.npz'

###############################
# Load and pre-process the audio waveform
# Extract discrete codes from EnCodec
# move codes to cpu
# move semantic tokens to cpu
# .npz file will be saved to output_path.
# The output_path will be used later as history prompt when clone your voice.
###############################

wav, sr = torchaudio.load(audio_filepath)
wav = convert_audio(wav, sr, model.sample_rate, model.channels)
wav = wav.to(device)

semantic_vectors = hubert_model.forward(wav, input_sample_hz=model.sample_rate)
semantic_tokens = tokenizer.get_token(semantic_vectors)

with torch.no_grad():
    encoded_frames = model.encode(wav.unsqueeze(0))
codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze()

Options

# [n_q, T]
codes = codes.cpu().numpy()
semantic_tokens = semantic_tokens.cpu().numpy()

import numpy as np
np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)

###############################
# Enter your prompt and speaker here
# download and load all models
# simple generation audio array
###############################
from bark.api import generate_audio
from transformers import BertTokenizer
from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic

text_prompt = "你们吃了没"

preload_models(
    text_use_gpu=True,
    text_use_small=False,
    coarse_use_gpu=True,
    coarse_use_small=False,
    fine_use_gpu=True,
    fine_use_small=False,
    codec_use_gpu=True,
    force_reload=False
)

audio_array = generate_audio(text_prompt, history_prompt=output_path, text_temp=0.7, waveform_temp=0.7)

###############################
# generation with more control#
###############################
x_semantic = generate_text_semantic(
    text_prompt,
    history_prompt=output_path,
    temp=0.7,
    top_k=50,
    top_p=0.95,
)

x_coarse_gen = generate_coarse(
    x_semantic,
    history_prompt=output_path,
    temp=0.7,
    top_k=50,
    top_p=0.95,
)
x_fine_gen = generate_fine(
    x_coarse_gen,
    history_prompt=output_path,
    temp=0.5,
)
audio_array = codec_decode(x_fine_gen)

###############################
# play audio                  #
###############################
from IPython.display import Audio
Audio(audio_array, rate=SAMPLE_RATE)

###############################
# save audio to google driver #
###############################
from scipy.io.wavfile import write as write_wav
cloned_voice_file_path = lab_folder + "Voice_Output/cloned_audio.wav"
write_wav(cloned_voice_file_path, SAMPLE_RATE, audio_array)

I hope you enjoyed today’s content.

You are welcome to my network:

Follow me on Medium

Your claps 👏 keep me writing high-quality articles. Thank you!

How to Generate Your Own Voices with a 12-Second Sampler

Photo by Md Mahdi on Unsplash

This article shows how to generate a voice from a text prompt on Colab. Before the voice file is generated, a 12-second voice recording is used as the reference sample.

During the experiment, I trained the model with Chinese. The generated voice sounds like it comes from another person. Maybe I should try English, or try multiple times.

However, it might make sense to share the experience here.

The solution was inspired by bark-with-voice-clone.

Table of Contents

Use Case

Colab Project

Use Case

Given a text prompt, I would like to generate a voice file that sounds like me.

Play cloned voice and save voice file

Colab Project

Precondition

  1. Create a folder “AI_Voice_Lab” and a subfolder “Voice_Output” on Google Drive.
  2. Upload your voice file (.wav) to folder “AI_Voice_Lab”. The voice file must be under 13 seconds.
  3. Colab account. GPU or CPU can be used for the project.
Google Drive
GPU Usage

Colab Project Source Code

You can find the Colab project, with the output of each code block, in my GitHub repository.

The related source code follows:

###############################
# https://medium.com/@datatec.studio/
# https://www.reddit.com/r/singularity/comments/12udgzh/bark_text2speechbut_with_custom_voice_cloning/
# https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer
# https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer
#
# Install python package
# Install hubert model
# Load HuBERT for semantic tokens
# Load the HuBERT model
# Load the CustomTokenizer model
###############################
!pip install bark encodec torchaudio transformers fairseq audiolm_pytorch

!git clone https://github.com/serp-ai/bark-with-voice-clone
%cd bark-with-voice-clone/
!pip install git+https://github.com/suno-ai/bark.git

from bark.generation import load_codec_model, generate_text_semantic
from encodec.utils import convert_audio

import torchaudio
import torch

device = 'cuda' # or 'cpu'
model = load_codec_model(use_gpu=True if device == 'cuda' else False)


from hubert.hubert_manager import HuBERTManager
hubert_manager = HuBERTManager()
hubert_manager.make_sure_hubert_installed()
hubert_manager.make_sure_tokenizer_installed()

from hubert.pre_kmeans_hubert import CustomHubert
from hubert.customtokenizer import CustomTokenizer
hubert_model = CustomHubert(checkpoint_path='data/models/hubert/hubert.pt').to(device)

tokenizer = CustomTokenizer.load_from_checkpoint('data/models/hubert/tokenizer.pth').to(device)  # Automatically uses the right layers

###############################
# mount google driver, prepare .wav file
###############################
from google.colab import drive
drive.mount('/content/drive')

# Please create a folder "AI_Voice_Lab" and a sub-folder "Voice_Output" in your google driver at first.
%cd /content/drive/MyDrive/AI_Voice_Lab/

lab_folder = "/content/drive/MyDrive/AI_Voice_Lab/"
audio_filepath = lab_folder + "ylcn.wav" # the audio you want to clone (under 13 seconds)

voice_name = 'output' # whatever you want the name of the voice to be
output_path = lab_folder + "Voice_Output/" + voice_name + '.npz'

###############################
# Build the voice prompt from the reference recording:
#   1. load the audio and convert it to the codec model's sample rate / channels
#   2. compute semantic tokens (HuBERT vectors -> CustomTokenizer)
#   3. compute the codec codes
#   4. move both to the CPU as numpy arrays and save them into one .npz file
# output_path is used later as the history prompt when cloning the voice.
###############################

wav, sr = torchaudio.load(audio_filepath)
wav = convert_audio(wav, sr, model.sample_rate, model.channels).to(device)

semantic_vectors = hubert_model.forward(wav, input_sample_hz=model.sample_rate)
semantic_tokens = tokenizer.get_token(semantic_vectors).cpu().numpy()

with torch.no_grad():
    encoded_frames = model.encode(wav.unsqueeze(0))
# Keep the first element of every encoded frame and join them along the last axis
frame_codes = [frame[0] for frame in encoded_frames]
codes = torch.cat(frame_codes, dim=-1).squeeze().cpu().numpy()  # [n_q, T]

import numpy as np
np.savez(
    output_path,
    fine_prompt=codes,
    coarse_prompt=codes[:2, :],  # only the first two rows of the codes
    semantic_prompt=semantic_tokens,
)


###############################
# Set the text prompt, download and load all models,
# then generate an audio array with a single call
###############################
from bark.api import generate_audio
from transformers import BertTokenizer
from bark.generation import (
    SAMPLE_RATE,
    preload_models,
    codec_decode,
    generate_coarse,
    generate_fine,
    generate_text_semantic,
)

text_prompt = "你们吃了没"

# Every stage: GPU requested, "small" variant disabled, no forced reload
preload_options = dict(
    text_use_gpu=True,
    text_use_small=False,
    coarse_use_gpu=True,
    coarse_use_small=False,
    fine_use_gpu=True,
    fine_use_small=False,
    codec_use_gpu=True,
    force_reload=False,
)
preload_models(**preload_options)

# The saved .npz voice prompt is passed as the history prompt
audio_array = generate_audio(
    text_prompt,
    history_prompt=output_path,
    text_temp=0.7,
    waveform_temp=0.7,
)

###############################
# Generation with more control: run the semantic, coarse and fine
# stages one by one, then decode the result to audio.
# NOTE: this rebinds audio_array.
###############################
# Sampling settings shared by the semantic and the coarse stage
sampling = dict(temp=0.7, top_k=50, top_p=0.95)

x_semantic = generate_text_semantic(text_prompt, history_prompt=output_path, **sampling)
x_coarse_gen = generate_coarse(x_semantic, history_prompt=output_path, **sampling)

# The fine stage only takes a temperature
x_fine_gen = generate_fine(x_coarse_gen, history_prompt=output_path, temp=0.5)

audio_array = codec_decode(x_fine_gen)

###############################
# Play the generated audio    #
###############################
from IPython.display import Audio
# Build the audio player object for the generated samples at Bark's sample rate
Audio(data=audio_array, rate=SAMPLE_RATE)

###############################
# Save audio to Google Drive  #
###############################
from scipy.io.wavfile import write as write_wav
# The .wav file goes into the "Voice_Output" sub-folder of the lab folder
cloned_voice_file_path = f"{lab_folder}Voice_Output/cloned_audio.wav"
write_wav(cloned_voice_file_path, SAMPLE_RATE, audio_array)

I hope you enjoyed today’s content.

You are welcome to my network:

Follow me on Medium

Your claps 👏 keep me writing high-quality articles. Thank you!

AI
Bert
Development
Voices
Llm
Recommended from ReadMedium