All in one! 合并main分支和fast_inference_分支 (#1490)

* 合并main分支和fast_inference_分支 * 修复一些bug
2024-08-20 22:19:04 +08:00
parent 9c75f35ece
commit 52c50c6c81
12 changed files with 2653 additions and 76 deletions
--- a/GPT_SoVITS/AR/models/t2s_model.py
+++ b/GPT_SoVITS/AR/models/t2s_model.py
@@ -1,11 +1,10 @@
 # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_model.py
 # reference: https://github.com/lifeiteng/vall-e
+import math
+from typing import List, Optional
 import torch
-import random
-import numpy as np
-
 from tqdm import tqdm
-from typing import List
+
 from AR.models.utils import make_pad_mask
 from AR.models.utils import (
    topk_sampling,
@@ -37,6 +36,34 @@ default_config = {
    "EOS": 1024,
 }

+# @torch.jit.script ## 使用的话首次推理会非常慢，而且推理速度不稳定
+# Efficient implementation equivalent to the following:
+def scaled_dot_product_attention(query:torch.Tensor, key:torch.Tensor, value:torch.Tensor, attn_mask:Optional[torch.Tensor]=None, scale:Optional[torch.Tensor]=None) -> torch.Tensor:
+    B, H, L, S =query.size(0), query.size(1), query.size(-2), key.size(-2)
+    if scale is None:
+        scale_factor = torch.tensor(1 / math.sqrt(query.size(-1)))
+    else:
+        scale_factor = scale
+    attn_bias = torch.zeros(B, H, L, S, dtype=query.dtype, device=query.device)
+
+    if attn_mask is not None:
+        if attn_mask.dtype == torch.bool:
+            attn_bias.masked_fill_(attn_mask, float("-inf"))
+        else:
+            attn_bias += attn_mask
+    attn_weight = query @ key.transpose(-2, -1) * scale_factor
+    attn_weight += attn_bias
+    attn_weight = torch.softmax(attn_weight, dim=-1)
+
+    if attn_mask is not None:
+        if attn_mask.dtype == torch.bool:
+            attn_weight.masked_fill_(attn_mask, 0)
+        else:
+            attn_mask[attn_mask!=float("-inf")] =0
+            attn_mask[attn_mask==float("-inf")] =1
+            attn_weight.masked_fill_(attn_mask, 0)
+
+    return attn_weight @ value

@torch.jit.script
 class T2SMLP:
@@ -84,61 +111,112 @@ class T2SBlock:
        self.norm_b2 = norm_b2
        self.norm_eps2 = norm_eps2

-    def process_prompt(self, x, attn_mask: torch.Tensor):
-        q, k, v = F.linear(x, self.qkv_w, self.qkv_b).chunk(3, dim=-1)
+        self.false = torch.tensor(False, dtype=torch.bool)
+
+    @torch.jit.ignore
+    def to_mask(self, x:torch.Tensor, padding_mask:Optional[torch.Tensor]):
+        if padding_mask is None:
+            return x
+        
+        if padding_mask.dtype == torch.bool:
+            return x.masked_fill(padding_mask, 0)
+        else:
+            return x * padding_mask
+        
+    def process_prompt(self, x:torch.Tensor, attn_mask : torch.Tensor, padding_mask:Optional[torch.Tensor]=None, torch_sdpa:bool=True):
+
+            
+        q, k, v = F.linear(self.to_mask(x, padding_mask), self.qkv_w, self.qkv_b).chunk(3, dim=-1)

        batch_size = q.shape[0]
        q_len = q.shape[1]
        kv_len = k.shape[1]
-
-        k_cache = k
-        v_cache = v
+        
+        q = self.to_mask(q, padding_mask)
+        k_cache = self.to_mask(k, padding_mask)
+        v_cache = self.to_mask(v, padding_mask)

        q = q.view(batch_size, q_len, self.num_heads, -1).transpose(1, 2)
        k = k_cache.view(batch_size, kv_len, self.num_heads, -1).transpose(1, 2)
        v = v_cache.view(batch_size, kv_len, self.num_heads, -1).transpose(1, 2)

-        attn = F.scaled_dot_product_attention(q, k, v, ~attn_mask)
+        if torch_sdpa:
+            attn = F.scaled_dot_product_attention(q, k, v, ~attn_mask)
+        else:
+            attn = scaled_dot_product_attention(q, k, v, attn_mask)

-        attn = attn.permute(2, 0, 1, 3).reshape(batch_size, -1, self.hidden_dim)
-        attn = F.linear(attn, self.out_w, self.out_b)
+        attn = attn.permute(2, 0, 1, 3).reshape(batch_size*q_len, self.hidden_dim)
+        attn = attn.view(q_len, batch_size, self.hidden_dim).transpose(1, 0)
+        attn = F.linear(self.to_mask(attn, padding_mask), self.out_w, self.out_b)

-        x = F.layer_norm(
-            x + attn, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1
-        )
-        x = F.layer_norm(
-            x + self.mlp.forward(x),
-            [self.hidden_dim],
-            self.norm_w2,
-            self.norm_b2,
-            self.norm_eps2,
-        )
+        if padding_mask is not None:
+            for i in range(batch_size):
+                # mask = padding_mask[i,:,0]
+                if self.false.device!= padding_mask.device:
+                    self.false = self.false.to(padding_mask.device)
+                idx = torch.where(padding_mask[i,:,0]==self.false)[0]
+                x_item = x[i,idx,:].unsqueeze(0)
+                attn_item = attn[i,idx,:].unsqueeze(0)
+                x_item = x_item + attn_item
+                x_item = F.layer_norm(
+                    x_item, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1
+                )
+                x_item = x_item + self.mlp.forward(x_item)
+                x_item = F.layer_norm(
+                    x_item,
+                    [self.hidden_dim],
+                    self.norm_w2,
+                    self.norm_b2,
+                    self.norm_eps2,
+                )
+                x[i,idx,:] = x_item.squeeze(0)
+            x = self.to_mask(x, padding_mask)
+        else:
+            x = x + attn
+            x = F.layer_norm(
+                x, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1
+            )
+            x = x + self.mlp.forward(x)
+            x = F.layer_norm(
+                x,
+                [self.hidden_dim],
+                self.norm_w2,
+                self.norm_b2,
+                self.norm_eps2,
+            )
        return x, k_cache, v_cache
-
-    def decode_next_token(self, x, k_cache, v_cache):
+    
+    def decode_next_token(self, x:torch.Tensor, k_cache:torch.Tensor, v_cache:torch.Tensor, attn_mask:Optional[torch.Tensor]=None, torch_sdpa:bool=True):
        q, k, v = F.linear(x, self.qkv_w, self.qkv_b).chunk(3, dim=-1)

        k_cache = torch.cat([k_cache, k], dim=1)
        v_cache = torch.cat([v_cache, v], dim=1)
-        kv_len = k_cache.shape[1]
-
+        
        batch_size = q.shape[0]
        q_len = q.shape[1]
+        kv_len = k_cache.shape[1]

        q = q.view(batch_size, q_len, self.num_heads, -1).transpose(1, 2)
        k = k_cache.view(batch_size, kv_len, self.num_heads, -1).transpose(1, 2)
        v = v_cache.view(batch_size, kv_len, self.num_heads, -1).transpose(1, 2)

-        attn = F.scaled_dot_product_attention(q, k, v)

-        attn = attn.permute(2, 0, 1, 3).reshape(batch_size, -1, self.hidden_dim)
+        if torch_sdpa:
+            attn = F.scaled_dot_product_attention(q, k, v)
+        else:
+            attn = scaled_dot_product_attention(q, k, v, attn_mask)
+
+        attn = attn.permute(2, 0, 1, 3).reshape(batch_size*q_len, self.hidden_dim)
+        attn = attn.view(q_len, batch_size, self.hidden_dim).transpose(1, 0)
        attn = F.linear(attn, self.out_w, self.out_b)

+        x = x + attn
        x = F.layer_norm(
-            x + attn, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1
+            x, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1
        )
+        x = x + self.mlp.forward(x)
        x = F.layer_norm(
-            x + self.mlp.forward(x),
+            x,
            [self.hidden_dim],
            self.norm_w2,
            self.norm_b2,
@@ -149,25 +227,32 @@ class T2SBlock:

@torch.jit.script
 class T2STransformer:
-    def __init__(self, num_blocks: int, blocks: List[T2SBlock]):
-        self.num_blocks: int = num_blocks
+    def __init__(self, num_blocks : int, blocks: List[T2SBlock]):
+        self.num_blocks : int = num_blocks
        self.blocks = blocks

    def process_prompt(
-            self, x, attn_mask: torch.Tensor):
-        k_cache: List[torch.Tensor] = []
-        v_cache: List[torch.Tensor] = []
+        self, x:torch.Tensor, attn_mask : torch.Tensor,
+        padding_mask : Optional[torch.Tensor]=None, 
+        torch_sdpa:bool=True
+        ):
+        k_cache : List[torch.Tensor] = []
+        v_cache : List[torch.Tensor] = []
        for i in range(self.num_blocks):
-            x, k_cache_, v_cache_ = self.blocks[i].process_prompt(x, attn_mask)
+            x, k_cache_, v_cache_ = self.blocks[i].process_prompt(x, attn_mask, padding_mask, torch_sdpa)
            k_cache.append(k_cache_)
            v_cache.append(v_cache_)
        return x, k_cache, v_cache

    def decode_next_token(
-            self, x, k_cache: List[torch.Tensor], v_cache: List[torch.Tensor]
+        self, x:torch.Tensor, 
+        k_cache: List[torch.Tensor], 
+        v_cache: List[torch.Tensor], 
+        attn_mask : Optional[torch.Tensor]=None,
+        torch_sdpa:bool=True
    ):
        for i in range(self.num_blocks):
-            x, k_cache[i], v_cache[i] = self.blocks[i].decode_next_token(x, k_cache[i], v_cache[i])
+            x, k_cache[i], v_cache[i] = self.blocks[i].decode_next_token(x, k_cache[i], v_cache[i], attn_mask, torch_sdpa)
        return x, k_cache, v_cache


@@ -235,7 +320,7 @@ class Text2SemanticDecoder(nn.Module):
                layer.linear2.weight,
                layer.linear2.bias
            )
-            # (layer.self_attn.in_proj_weight, layer.self_attn.in_proj_bias)
+
            block = T2SBlock(
                self.num_head,
                self.model_dim,
@@ -253,7 +338,7 @@ class Text2SemanticDecoder(nn.Module):
            )

            blocks.append(block)
-
+        
        self.t2s_transformer = T2STransformer(self.num_layers, blocks)

    def make_input_data(self, x, x_lens, y, y_lens, bert_feature):
@@ -283,7 +368,7 @@ class Text2SemanticDecoder(nn.Module):
            (0, y_len),
            value=True,
        )
-
+        # x_attn_mask[:, x_len]=False
        y_attn_mask = F.pad(
            torch.triu(
                torch.ones(y_len, y_len, dtype=torch.bool, device=x.device),
@@ -344,7 +429,7 @@ class Text2SemanticDecoder(nn.Module):

        A_logits, R_logits = get_batch_logps(logits, reject_logits, targets, reject_targets)
        loss_2, _, _ = dpo_loss(A_logits, R_logits, 0, 0, 0.2, reference_free=True)
-
+        
        loss = loss_1 + loss_2

        return loss, acc
@@ -488,16 +573,225 @@ class Text2SemanticDecoder(nn.Module):
        # 错位
        return targets[:, :-1], targets[:, 1:]

-    def infer_panel(
-            self,
-            x,  #####全部文本token
-            x_lens,
-            prompts,  ####参考音频token
-            bert_feature,
-            top_k: int = -100,
-            top_p: int = 100,
-            early_stop_num: int = -1,
-            temperature: float = 1.0,
+    def infer_panel_batch_infer(
+        self,
+        x:List[torch.LongTensor],  #####全部文本token
+        x_lens:torch.LongTensor,
+        prompts:torch.LongTensor,  ####参考音频token
+        bert_feature:List[torch.LongTensor],
+        top_k: int = -100,
+        top_p: int = 100,
+        early_stop_num: int = -1,
+        temperature: float = 1.0,
+        repetition_penalty: float = 1.35,
+        **kwargs,
+    ):
+        if prompts is None:
+            print("Warning: Prompt free is not supported batch_infer! switch to naive_infer")
+            return self.infer_panel_naive_batched(x, x_lens, prompts, bert_feature, top_k=top_k, top_p=top_p, early_stop_num=early_stop_num, temperature=temperature, **kwargs)
+
+
+        max_len = kwargs.get("max_len",x_lens.max())
+        x_list = []
+        for x_item, bert_item in zip(x, bert_feature):
+            # max_len = max(max_len, x_item.shape[0], bert_item.shape[1])
+            x_item = self.ar_text_embedding(x_item.unsqueeze(0))
+            x_item = x_item + self.bert_proj(bert_item.transpose(0, 1).unsqueeze(0))
+            x_item = self.ar_text_position(x_item).squeeze(0)
+            x_item = F.pad(x_item,(0,0,0,max_len-x_item.shape[0]),value=0) if x_item.shape[0]<max_len else x_item
+            x_list.append(x_item)
+        x = torch.stack(x_list, dim=0)
+
+
+        # AR Decoder
+        y = prompts
+        
+        x_len = x.shape[1]
+        x_attn_mask = torch.zeros((x_len, x_len), dtype=torch.bool)
+        stop = False
+
+        k_cache = None
+        v_cache = None
+        ###################  first step ##########################
+        if y is not None:
+            y_emb = self.ar_audio_embedding(y)
+            y_len = y_emb.shape[1]
+            prefix_len = y.shape[1]
+            y_lens = torch.LongTensor([y_emb.shape[1]]*y_emb.shape[0]).to(x.device)
+            y_pos = self.ar_audio_position(y_emb)
+            xy_pos = torch.concat([x, y_pos], dim=1)
+            ref_free = False
+        else:
+            y_emb = None
+            y_len = 0
+            prefix_len = 0
+            y_lens = torch.LongTensor([y_len]*x.shape[0]).to(x.device)
+            y_pos = None
+            xy_pos = x
+            y = torch.zeros(x.shape[0], 0, dtype=torch.int, device=x.device)
+            ref_free = True
+
+
+        ##### create mask #####
+        bsz = x.shape[0]
+        src_len = x_len + y_len
+        y_paddind_mask = make_pad_mask(y_lens, y_len)
+        x_paddind_mask = make_pad_mask(x_lens, max_len)
+        
+        # (bsz, x_len + y_len)
+        xy_padding_mask = torch.concat([x_paddind_mask, y_paddind_mask], dim=1)
+
+        x_mask = F.pad(
+            x_attn_mask,
+            (0, y_len),  ###xx的纯0扩展到xx纯0+xy纯1，(x,x+y)
+            value=True,
+        )
+        y_mask = F.pad(  ###yy的右上1扩展到左边xy的0,(y,x+y)
+            torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1), 
+            (x_len, 0),
+            value=False,
+        )
+        
+        xy_mask = torch.concat([x_mask, y_mask], dim=0).view(1 , src_len, src_len).repeat(bsz, 1, 1).to(x.device)
+        _xy_padding_mask = xy_padding_mask.view(bsz, 1, src_len).repeat(1, src_len, 1)
+        
+        for i in range(bsz):
+            l = x_lens[i]
+            _xy_padding_mask[i,l:max_len,:]=True
+            
+        xy_attn_mask = xy_mask.logical_or(_xy_padding_mask)
+        xy_attn_mask = xy_attn_mask.unsqueeze(1).expand(-1, self.num_head, -1, -1)
+        xy_attn_mask = xy_attn_mask.bool()
+        xy_padding_mask = xy_padding_mask.view(bsz, src_len, 1).expand(-1, -1, self.model_dim)
+
+        ###### decode #####
+        y_list = [None]*y.shape[0]
+        batch_idx_map = list(range(y.shape[0]))
+        idx_list = [None]*y.shape[0]
+        for idx in tqdm(range(1500)):
+            if idx == 0:
+                xy_dec, k_cache, v_cache = self.t2s_transformer.process_prompt(xy_pos, xy_attn_mask, xy_padding_mask, False)
+            else:
+                xy_dec, k_cache, v_cache = self.t2s_transformer.decode_next_token(xy_pos, k_cache, v_cache, xy_attn_mask, False)
+            logits = self.ar_predict_layer(
+                xy_dec[:, -1]
+            )
+
+            if idx == 0:
+                xy_attn_mask = F.pad(xy_attn_mask[:,:,-1].unsqueeze(-2),(0,1),value=False)
+                logits = logits[:, :-1]
+            else:
+                xy_attn_mask = F.pad(xy_attn_mask,(0,1),value=False)
+
+            samples = sample(
+                    logits, y, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty, temperature=temperature
+                )[0]
+
+            y = torch.concat([y, samples], dim=1)
+            
+            ####### 移除batch中已经生成完毕的序列,进一步优化计算量
+            tokens = torch.argmax(logits, dim=-1)
+            reserved_idx_of_batch_for_y = None
+            if (self.EOS in samples[:, 0]) or \
+                (self.EOS in tokens):  ###如果生成到EOS，则停止
+                    l1 = samples[:, 0]==self.EOS
+                    l2 = tokens==self.EOS
+                    l = l1.logical_or(l2)
+                    removed_idx_of_batch_for_y = torch.where(l==True)[0].tolist()
+                    reserved_idx_of_batch_for_y = torch.where(l==False)[0]
+                    # batch_indexs = torch.tensor(batch_idx_map, device=y.device)[removed_idx_of_batch_for_y]
+                    for i in removed_idx_of_batch_for_y:
+                        batch_index = batch_idx_map[i]
+                        idx_list[batch_index] = idx - 1
+                        y_list[batch_index] = y[i, :-1]
+                
+                    batch_idx_map = [batch_idx_map[i] for i in reserved_idx_of_batch_for_y.tolist()]
+                
+            # 只保留batch中未生成完毕的序列 
+            if reserved_idx_of_batch_for_y is not None:
+                # index = torch.LongTensor(batch_idx_map).to(y.device)
+                y = torch.index_select(y, dim=0, index=reserved_idx_of_batch_for_y)
+                xy_attn_mask = torch.index_select(xy_attn_mask, dim=0, index=reserved_idx_of_batch_for_y)
+                if k_cache is not None :
+                    for i in range(len(k_cache)):
+                        k_cache[i] = torch.index_select(k_cache[i], dim=0, index=reserved_idx_of_batch_for_y)
+                        v_cache[i] = torch.index_select(v_cache[i], dim=0, index=reserved_idx_of_batch_for_y)
+                
+                
+            if (early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num) or idx==1499:
+                print("use early stop num:", early_stop_num)
+                stop = True
+                for i, batch_index in enumerate(batch_idx_map):
+                    batch_index = batch_idx_map[i]
+                    idx_list[batch_index] = idx
+                    y_list[batch_index] = y[i, :-1]
+                
+            if not (None in idx_list):
+                stop = True
+                
+            if stop:
+                if y.shape[1]==0:
+                    y = torch.concat([y, torch.zeros_like(samples)], dim=1)
+                    print("bad zero prediction")
+                print(f"T2S Decoding EOS [{prefix_len} -> {y.shape[1]}]")
+                break
+
+            ####################### update next step ###################################
+            y_emb = self.ar_audio_embedding(y[:, -1:])
+            xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[:, y_len + idx].to( dtype= y_emb.dtype,device=y_emb.device)            
+
+        if (None in idx_list):
+            for i in range(x.shape[0]):
+                if idx_list[i] is None:
+                    idx_list[i] = 1500-1  ###如果没有生成到EOS，就用最大长度代替
+                    
+        if ref_free:
+            return y_list, [0]*x.shape[0]
+        # print(idx_list)
+        return y_list, idx_list
+    
+    def infer_panel_naive_batched(self,
+        x:List[torch.LongTensor],  #####全部文本token
+        x_lens:torch.LongTensor,
+        prompts:torch.LongTensor,  ####参考音频token
+        bert_feature:List[torch.LongTensor],
+        top_k: int = -100,
+        top_p: int = 100,
+        early_stop_num: int = -1,
+        temperature: float = 1.0,
+        repetition_penalty: float = 1.35,
+        **kwargs
+        ):
+        y_list = []
+        idx_list = []
+        for i in range(len(x)):
+            y, idx = self.infer_panel_naive(x[i].unsqueeze(0), 
+                                                  x_lens[i], 
+                                                  prompts[i].unsqueeze(0) if prompts is not None else None, 
+                                                  bert_feature[i].unsqueeze(0), 
+                                                  top_k, 
+                                                  top_p, 
+                                                  early_stop_num, 
+                                                  temperature,
+                                                  repetition_penalty,
+                                                  **kwargs)
+            y_list.append(y[0])
+            idx_list.append(idx)
+        
+        return y_list, idx_list
+    
+    def infer_panel_naive(
+        self,
+        x:torch.LongTensor,  #####全部文本token
+        x_lens:torch.LongTensor,
+        prompts:torch.LongTensor,  ####参考音频token
+        bert_feature:torch.LongTensor,
+        top_k: int = -100,
+        top_p: int = 100,
+        early_stop_num: int = -1,
+        temperature: float = 1.0,
+        repetition_penalty: float = 1.35,
+        **kwargs
    ):
        x = self.ar_text_embedding(x)
        x = x + self.bert_proj(bert_feature.transpose(1, 2))
@@ -528,9 +822,10 @@ class Text2SemanticDecoder(nn.Module):
            y_pos = None
            xy_pos = x
            y = torch.zeros(x.shape[0], 0, dtype=torch.int, device=x.device)
-            prompts = y
            ref_free = True

+        bsz = x.shape[0]
+        src_len = x_len + y_len
        x_attn_mask_pad = F.pad(
            x_attn_mask,
            (0, y_len),  ###xx的纯0扩展到xx纯0+xy纯1，(x,x+y)
@@ -541,13 +836,15 @@ class Text2SemanticDecoder(nn.Module):
            (x_len, 0),
            value=False,
        )
-        xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0).to(
-            x.device
-        )
+        xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0)\
+                                                .unsqueeze(0)\
+                                                .expand(bsz*self.num_head, -1, -1)\
+                                                .view(bsz, self.num_head, src_len, src_len)\
+                                                .to(device=x.device, dtype=torch.bool)

        for idx in tqdm(range(1500)):
            if xy_attn_mask is not None:
-                xy_dec, k_cache, v_cache = self.t2s_transformer.process_prompt(xy_pos, xy_attn_mask)
+                xy_dec, k_cache, v_cache = self.t2s_transformer.process_prompt(xy_pos, xy_attn_mask, None)
            else:
                xy_dec, k_cache, v_cache = self.t2s_transformer.decode_next_token(xy_pos, k_cache, v_cache)

@@ -558,9 +855,10 @@ class Text2SemanticDecoder(nn.Module):
            if idx == 0:
                xy_attn_mask = None
                logits = logits[:, :-1]
+
            samples = sample(
-                logits[0], y, top_k=top_k, top_p=top_p, repetition_penalty=1.35, temperature=temperature
-            )[0].unsqueeze(0)
+                logits, y, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty, temperature=temperature
+            )[0]

            y = torch.concat([y, samples], dim=1)

@@ -583,4 +881,20 @@ class Text2SemanticDecoder(nn.Module):

        if ref_free:
            return y[:, :-1], 0
-        return y[:, :-1], idx - 1
+        return y[:, :-1], idx - 1
+    
+    
+    def infer_panel(
+        self,
+        x:torch.LongTensor,  #####全部文本token
+        x_lens:torch.LongTensor,
+        prompts:torch.LongTensor,  ####参考音频token
+        bert_feature:torch.LongTensor,
+        top_k: int = -100,
+        top_p: int = 100,
+        early_stop_num: int = -1,
+        temperature: float = 1.0,
+        repetition_penalty: float = 1.35,
+        **kwargs
+    ):
+        return self.infer_panel_naive(x, x_lens, prompts, bert_feature, top_k, top_p, early_stop_num, temperature, repetition_penalty, **kwargs)