1. 分块与编码
首先,我们将原始词表划分为多个“芯片”(chips),每个芯片代表一个特定长度的词汇子集。接着,根据每个芯片的长度和编码规则生成新的词表。这一过程确保了即使在词表被限制的情况下,也能覆盖尽可能多的有效词汇。
2. 模糊推理能力
由于采用了基于上下文的方式对原始词表进行采样,这种方法使得模型能够在预测时实现模糊推理。也就是说,即便当前使用的词表不包含所有可能的词汇,模型依然可以通过对相似或相关词汇区域的推断来进行合理的预测。
3. 动态调整词表
该方法允许根据输入文本的不同动态调整词表的内容。这意味着模型可以根据当前上下文的需求重新定义其有效词汇集,从而提供了更大的灵活性和适应性。
import pandas as pd
import numpy as np
from jieba import lcut
import polars as pl

# Special tokens occupy the first ids of the vocabulary, in this order.
# The strings are runtime data and must not be translated or altered.
SPECIAL_TOKENS = [
    "<|词表开始|>",
    "<|序列开始|>",
    "<|序列结束|>",
    "<|填充|>",
    "<|chips_start|>",
    "<|区长度|>",
    # The original list had no comma after the previous entry, so Python fused
    # it with the next literal into one token and every later id shifted by
    # one, breaking the marker ids used by the encoder below.
    "<|区码1|>",
    "<|区码2|>",
    "<|区码3|>",
    "<|chips_end|>",
    "<|user|>",
    "<|agent|>",
]

# Marker ids are derived from the list so they can never drift from it
# (they evaluate to 5, 6, 7, 8 and 9, the values the encoder always used).
CHIP_LEN_MARK = SPECIAL_TOKENS.index("<|区长度|>")
CHIPS_1_MARK = SPECIAL_TOKENS.index("<|区码1|>")
CHIPS_2_MARK = SPECIAL_TOKENS.index("<|区码2|>")
CHIPS_3_MARK = SPECIAL_TOKENS.index("<|区码3|>")
CHIPS_END_MARK = SPECIAL_TOKENS.index("<|chips_end|>")

# Upper bound on the size of the reduced vocabulary.
MAX_NEW_VOC_SIZE = 8192
# Chip codes are written as base-8192 digits.
CHIP_BASE = 8192
# Added to every chip index before it is written to the header.
# NOTE(review): with 12 special tokens, chip 0 is written as 11, which equals
# the id of "<|agent|>" -- confirm whether the offset should be 12.
CHIP_ID_OFFSET = 11


def get_new_voc(voc_len, text_src_id, vocab=None):
    """Select the vocabulary chips touched by a sequence and build a reduced vocabulary.

    The vocabulary is cut into ``2 ** j`` equal-length chips for ``j`` in
    ``14..22`` (coarsest first).  The first partition for which
    ``number_of_used_chips * chip_len`` does not exceed ``MAX_NEW_VOC_SIZE``
    is used.

    Args:
        voc_len: Number of rows in the full vocabulary.
        text_src_id: Array-like of original ``voc_id`` values for the
            sequence.  NaN entries (tokens missing from the vocabulary after
            a left join) are ignored when choosing chips.
        vocab: polars DataFrame with ``voc`` and ``voc_id`` columns.  When
            omitted, the module-level ``voc`` is used, as the original code
            did.

    Returns:
        ``(chips, chip_len, new_voc)``: the sorted list of used chip indices,
        the chip length, and the filtered vocabulary with an extra
        ``new_voc_id`` column numbering its rows from 0.

    Raises:
        ValueError: If no partition keeps the reduced vocabulary within
            ``MAX_NEW_VOC_SIZE`` (previously the function returned ``None``
            and the caller failed while unpacking).
    """
    if vocab is None:
        vocab = voc  # noqa: F821 -- legacy global created by the script entry point

    ids = np.asarray(text_src_id, dtype=np.float64)
    ids = ids[~np.isnan(ids)].astype(np.int64)

    for j in range(14, 23):
        chip_len = voc_len // 2 ** j
        if chip_len == 0:
            # The vocabulary is smaller than 2 ** j; finer partitions would
            # divide by zero.
            break
        chips = np.unique(ids // chip_len).tolist()
        if len(chips) * chip_len > MAX_NEW_VOC_SIZE:
            continue
        # Use the original vocabulary, the chip length and the chip codes to
        # obtain the new vocabulary.
        new_voc = vocab.filter((vocab["voc_id"] // chip_len).is_in(chips))
        new_voc = new_voc.with_columns(
            pl.Series("new_voc_id", list(range(len(new_voc))), dtype=pl.Int64)
        )
        return chips, chip_len, new_voc

    raise ValueError(
        "no chip partition keeps the reduced vocabulary within "
        f"{MAX_NEW_VOC_SIZE} entries"
    )


def _encode_chip_header(chips, chip_len):
    """Return the header ids describing ``chip_len`` and the used chips.

    Layout: ``[5, chip_len, 6, *one_digit_codes]``, then optionally
    ``[7, *high_digits, *low_digits]`` for two-digit codes and
    ``[8, *high_digits, *middle_digits, *low_digits]`` for three-digit codes,
    then ``9``.  Digits are base ``CHIP_BASE`` and stored plane by plane.
    """
    codes = np.asarray(chips, dtype=np.int64) + CHIP_ID_OFFSET
    one_digit = codes[codes < CHIP_BASE]
    two_digit = codes[(codes >= CHIP_BASE) & (codes < CHIP_BASE ** 2)]
    three_digit = codes[(codes >= CHIP_BASE ** 2) & (codes < CHIP_BASE ** 3)]

    header = [CHIP_LEN_MARK, chip_len, CHIPS_1_MARK] + one_digit.tolist()
    if two_digit.size > 0:
        header += (
            [CHIPS_2_MARK]
            + (two_digit // CHIP_BASE).tolist()
            + (two_digit % CHIP_BASE).tolist()
        )
    if three_digit.size > 0:
        header += (
            [CHIPS_3_MARK]
            + (three_digit // CHIP_BASE ** 2).tolist()
            # The middle digit needs the modulo; the original wrote the
            # un-reduced quotient here, which does not recombine correctly.
            + (three_digit // CHIP_BASE % CHIP_BASE).tolist()
            + (three_digit % CHIP_BASE).tolist()
        )
    header.append(CHIPS_END_MARK)
    return header


if __name__ == '__main__':
    # NOTE: unpickling can execute arbitrary code; only load a trusted file.
    voc = pd.read_pickle("voc_26B.pkl")
    voc = pd.DataFrame(
        {
            "voc": SPECIAL_TOKENS + voc["voc"].values.tolist(),
            "voc_id": list(range(len(SPECIAL_TOKENS) + len(voc))),
        }
    )
    voc = pl.from_pandas(voc)

    # Raw segments keep the literal backslashes of "\(k\)" without relying on
    # invalid escape sequences; the resulting string is unchanged.
    text = (
        r"tical applications, if you want to determine the optimal number of "
        r"clusters \(k\), you can iterate through a series of possible \(k\) "
        r"values, calculate the corresponding Silhouette Coefficients, and "
        r"choose the \(k\) value with the highest Silhouette Coefficient as "
        "the final number of clusters . \n"
        "This method helps identify cluster structures that are both compact "
        "and well-separated."
    )
    text = lcut(text)
    text_src_id_src = pl.DataFrame({"voc": text})
    text_src_id = text_src_id_src.join(voc, on="voc", how="left")
    text_src_id = text_src_id["voc_id"].to_numpy()
    # Disabled in the original (would map unknown tokens to id 3):
    # text_src_id[np.isnan(text_src_id)] = 3
    # Earlier idea kept from the original notes: split the vocabulary into
    # 2**8 parts and record the part index each token of the sequence falls
    # in, i.e. the start and end positions of those parts:
    # np.array(list(set(text_src_id //(len(voc)//2**8))))

    chips, chip_len, new_voc = get_new_voc(len(voc), text_src_id, voc)
    # NOTE(review): tokens missing from the vocabulary yield null/NaN entries
    # here -- confirm how they should be encoded.
    token_id = (
        text_src_id_src.join(
            new_voc.select(["voc", "new_voc_id"]), on="voc", how="left"
        )["new_voc_id"]
        .to_numpy()
        .tolist()
    )

    # Generate the new encoding: chip header followed by the re-numbered ids.
    out = _encode_chip_header(chips, chip_len) + token_id

    # Decoding goes the other way: use the original vocabulary, the chip
    # length and the chip codes to rebuild the new vocabulary, recover the
    # original token_id, then look the token up in the original vocabulary.
    #
    # Sample id list kept from the original notes:
    # [0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 19, 21, 22, 23,
    #  24, 25, 28, 29, 31, 32, 33, 35, 36, 38, 40, 47, 48, 49, 50, 53, 54, 56,
    #  63, 64, 65, 66, 68, 70, 72, 73, 74, 76, 80, 85, 88, 90, 91, 92, 102,
    #  103, 105, 107, 113, 115, 119, 123, 126, 129, 130, 134, 137, 141, 143,
    #  144, 155, 156, 162, 164, 173, 175, 195, 204, 205, 223, 233, 249, 259,
    #  267, 270, 272, 277, 281, 287, 295, 299, 329, 337, 370, 373, 375, 383,
    #  388, 391, 403, 406, 428, 451, 513, 519, 536, 541, 583, 587, 608, 617,
    #  635, 674, 679, 725, 873, 881, 1023, 1092, 1110, 1336, 1391, 1447, 1639,
    #  1738, 1856, 1961, 2117, 3034, 4019, 4737, 6366, 6411, 6714, 7760, 8172,
    #  14524]
    #
    # Header token order:
    # <|chips_start|> <|区长度|> <|区码1|> <|区码2|> <|区码3|> <|chips_end|>
    #
    # Decoding sketch from the original (never enabled, untested).
    # NOTE(review): it subtracts 11 from each digit, whereas the encoder adds
    # 11 before splitting into digits -- reconcile before enabling.
    # chips = out[:out.index(9)]
    # if 8 in chips:
    #     chips3 = chips[out.index(8):]
    #     chips3 = np.array(chips3).reshape([3, -1]) - 11
    #     chips3 = chips3[0, :] * 8192 ** 2 + chips3[1, :] * 8192 + chips3[2, :]
    # if 7 in chips:
    #     chips2 = chips[out.index(7):out.index(8)]
    #     chips2 = np.array(chips2).reshape([2, -1]) - 11
    #     chips2 = chips2[0, :] * 8192 + chips2[1, :]
    # if 6 in chips:
    #     chips1 = chips[out.index(6):out.index(7)]
    #     chips1 = np.array(chips1) - 11
    # chip_len = out[out.index(5) + 1]
    # chips = chips1.tolist() + chips2.tolist() + chips3.tolist()