1. 数据的质量
2. 准备数据
2.1 准备环境
python">import pandas as pd
import datasets
from pprint import pprint
from transformers import AutoTokenizer
2.2 对文本进行标记化处理
python"># 调用预训练模型的分词器
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")
text = "Hi, how are you?"
encoded_text = tokenizer(text)["input_ids"]
[12764, 13, 849, 403, 368, 32]
python">decoded_text = tokenizer.decode(encoded_text)
print("Decoded tokens back into text: ", decoded_text)
Decoded tokens back into text: Hi, how are you?
python">list_texts = ["Hi, how are you?", "I'm good", "Yes"]
encoded_texts = tokenizer(list_texts)
print("Encoded several texts: ", encoded_texts["input_ids"])
Encoded several texts: [[12764, 13, 849, 403, 368, 32], [42, 1353, 1175], [4374]]
2.3 填充或者截断
python">tokenizer.pad_token = tokenizer.eos_token
encoded_texts_longest = tokenizer(list_texts, padding=True)
print("Using padding: ", encoded_texts_longest["input_ids"])
Using padding: [[12764, 13, 849, 403, 368, 32], [42, 1353, 1175, 0, 0, 0], [4374, 0, 0, 0, 0, 0]]
python">encoded_texts_truncation = tokenizer(list_texts, max_length=3, truncation=True)
print("Using truncation: ", encoded_texts_truncation["input_ids"])
python">Using truncation: [[12764, 13, 849], [42, 1353, 1175], [4374]]
python">tokenizer.truncation_side = "left"
encoded_texts_truncation_left = tokenizer(list_texts, max_length=3, truncation=True)
print("Using left-side truncation: ", encoded_texts_truncation_left["input_ids"])
Using left-side truncation: [[403, 368, 32], [42, 1353, 1175], [4374]]
python">encoded_texts_both = tokenizer(list_texts, max_length=3, truncation=True, padding=True)
print("Using both padding and truncation: ", encoded_texts_both["input_ids"])
Using both padding and truncation: [[403, 368, 32], [42, 1353, 1175], [4374, 0, 0]]
2.4 准备指令数据集
import pandas as pd

# Load the instruction dataset (JSON Lines) and turn it into a
# {column: {row_index: value}} mapping.
filename = "lamini_docs.jsonl"
instruction_dataset_df = pd.read_json(filename, lines=True)
examples = instruction_dataset_df.to_dict()

# Concatenate the first prompt/response pair, whichever column naming the
# dataset happens to use.
if "question" in examples and "answer" in examples:
    text = examples["question"][0] + examples["answer"][0]
elif "instruction" in examples and "response" in examples:
    text = examples["instruction"][0] + examples["response"][0]
elif "input" in examples and "output" in examples:
    text = examples["input"][0] + examples["output"][0]
else:
    text = examples["text"][0]

# The blank line before "### Answer:" is part of the template: the printed
# datapoint shows '\n' '\n' '### Answer:' after the question text.
prompt_template = """### Question:
{question}

### Answer:"""

# Wrap every question in the prompt template and keep the answer untouched.
num_examples = len(examples["question"])
finetuning_dataset = [
    {
        "question": prompt_template.format(question=examples["question"][i]),
        "answer": examples["answer"][i],
    }
    for i in range(num_examples)
]

from pprint import pprint
print("One datapoint in the finetuning dataset:")
pprint(finetuning_dataset[0])
One datapoint in the finetuning dataset:
{'answer': 'Lamini has documentation on Getting Started, Authentication, '
           'Question Answer Model, Python Library, Batching, Error Handling, '
           'Advanced topics, and class documentation on LLM Engine available '
           'at https://lamini-ai.github.io/.',
 'question': '### Question:\n'
             'What are the different types of documents available in the '
             'repository (e.g., installation guide, API documentation, '
             "developer's guide)?\n"
             '\n'
             '### Answer:'}
2.5 对单个示例进行标记化
# Tokenize one prompt+answer pair; return_tensors="np" yields NumPy arrays.
text = finetuning_dataset[0]["question"] + finetuning_dataset[0]["answer"]
tokenized_inputs = tokenizer(
    text,
    return_tensors="np",
    padding=True,
)
print(tokenized_inputs["input_ids"])
python">[[ 4118 19782 27 187 1276 403 253 1027 3510 273 7177 2130 275 253 18491 313 70 15 72 904 12692 7102 13 8990 10097 13 13722 434 7102 6177 187 187 4118 37741 27 45 4988 74 556 10097 327 27669 11075 264 13 5271 23058 13 19782 37741 10031 13 13814 11397 13 378 16464 13 11759 10535 1981 13 21798 12989 13 285 966 10097 327 21708 46 10797 2130 387 5987 1358 77 4988 74 14 2284 15 7280 15 900 14206]]
# Cap the sequence length at 2048 tokens, but never exceed the actual length
# of the already-tokenized example.
max_length = 2048
max_length = min(
    tokenized_inputs["input_ids"].shape[1],
    max_length,
)

# Re-tokenize with truncation so the result is at most max_length tokens long.
tokenized_inputs = tokenizer(
    text,
    return_tensors="np",
    truncation=True,
    max_length=max_length,
)

# Bare expression: displayed as the notebook cell's output.
tokenized_inputs["input_ids"]
array([[ 4118, 19782, 27, 187, 1276, 403, 253, 1027, 3510,273, 7177, 2130, 275, 253, 18491, 313, 70, 15,72, 904, 12692, 7102, 13, 8990, 10097, 13, 13722,434, 7102, 6177, 187, 187, 4118, 37741, 27, 45,4988, 74, 556, 10097, 327, 27669, 11075, 264, 13,5271, 23058, 13, 19782, 37741, 10031, 13, 13814, 11397,13, 378, 16464, 13, 11759, 10535, 1981, 13, 21798,12989, 13, 285, 966, 10097, 327, 21708, 46, 10797,2130, 387, 5987, 1358, 77, 4988, 74, 14, 2284,15, 7280, 15, 900, 14206]])
2.6 对指令数据集进行标记化
def tokenize_function(examples):
    """Tokenize the first example of a batch with the module-level tokenizer.

    Only index 0 of each column is read, so this is meant to be mapped over a
    dataset one example at a time (batched mode with a batch size of 1).

    Args:
        examples: Mapping of column name to a sequence of values. Supported
            column pairs are ``question``/``answer``,
            ``instruction``/``response`` and ``input``/``output``; otherwise
            a ``text`` column is used.

    Returns:
        The tokenizer output for the concatenated text, as NumPy arrays,
        truncated to at most 2048 tokens.

    Side effects:
        Sets ``pad_token`` to ``eos_token`` and ``truncation_side`` to
        ``"left"`` on the shared ``tokenizer`` object.
    """
    # Build the training text from whichever prompt/response columns exist.
    if "question" in examples and "answer" in examples:
        text = examples["question"][0] + examples["answer"][0]
    elif "instruction" in examples and "response" in examples:
        text = examples["instruction"][0] + examples["response"][0]
    elif "input" in examples and "output" in examples:
        text = examples["input"][0] + examples["output"][0]
    else:
        text = examples["text"][0]

    # First pass: tokenize without truncation to learn the real token count.
    tokenizer.pad_token = tokenizer.eos_token
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        padding=True,
    )

    # Never ask for more tokens than the text has, and never more than 2048.
    max_length = min(
        tokenized_inputs["input_ids"].shape[1],
        2048,
    )

    # Second pass: truncate from the left, so the end of the text is kept.
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=max_length,
    )

    return tokenized_inputs
# Load the JSON Lines file as a Hugging Face dataset (single "train" split).
finetuning_dataset_loaded = datasets.load_dataset("json", data_files=filename, split="train")

# tokenize_function reads only element 0 of each batch, hence batch_size=1.
tokenized_dataset = finetuning_dataset_loaded.map(
    tokenize_function,
    batched=True,
    batch_size=1,
    drop_last_batch=True,
)
print(tokenized_dataset)
Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask'],
    num_rows: 1400
})
python">tokenized_dataset = tokenized_dataset.add_column("labels", tokenized_dataset["input_ids"])
2.7 对数据集进行划分
python">split_dataset = tokenized_dataset.train_test_split(test_size=0.1, shuffle=True, seed=123)
DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 140
    })
})
2.8 测试自己的数据集加载
python">finetuning_dataset_path = "lamini/lamini_docs"
finetuning_dataset = datasets.load_dataset(finetuning_dataset_path)
python">DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 140
    })
})
python">taylor_swift_dataset = "lamini/taylor_swift"
bts_dataset = "lamini/bts"
open_llms = "lamini/open_llms"
dataset_swiftie = datasets.load_dataset(taylor_swift_dataset)
{'question': 'What is the most popular Taylor Swift song among millennials? How does this song relate to the millennial generation? What is the significance of this song in the millennial culture?', 'answer': 'Taylor Swift\'s "Shake It Off" is the most popular song among millennials. This song relates to the millennial generation as it is an anthem of self-acceptance and embracing one\'s individuality. The song\'s message of not letting others bring you down and to just dance it off resonates with the millennial culture, which is often characterized by a strong sense of individuality and a rejection of societal norms. Additionally, the song\'s upbeat and catchy melody makes it a perfect fit for the millennial generation, which is known for its love of pop music.', 'input_ids': [1276, 310, 253, 954, 4633, 11276, 24619, 4498, 2190, 24933, 8075, 32, 1359, 1057, 436, 4498, 14588, 281, 253, 24933, 451, 5978, 32, 1737, 310, 253, 8453, 273, 436, 4498, 275, 253, 24933, 451, 4466, 32, 37979, 24619, 434, 346, 2809, 640, 733, 5566, 3, 310, 253, 954, 4633, 4498, 2190, 24933, 8075, 15, 831, 4498, 7033, 281, 253, 24933, 451, 5978, 347, 352, 310, 271, 49689, 273, 1881, 14, 14764, 593, 285, 41859, 581, 434, 2060, 414, 15, 380, 4498, 434, 3935, 273, 417, 13872, 2571, 3324, 368, 1066, 285, 281, 816, 11012, 352, 745, 8146, 684, 342, 253, 24933, 451, 4466, 13, 534, 310, 2223, 7943, 407, 247, 2266, 3282, 273, 2060, 414, 285, 247, 18235, 273, 38058, 22429, 15, 9157, 13, 253, 4498, 434, 598, 19505, 285, 5834, 90, 40641, 2789, 352, 247, 3962, 4944, 323, 253, 24933, 451, 5978, 13, 534, 310, 1929, 323, 697, 2389, 273, 1684, 3440, 15], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [1276, 310, 253, 954, 4633, 11276, 24619, 4498, 2190, 24933, 8075, 32, 1359, 1057, 436, 4498, 14588, 281, 253, 24933, 451, 5978, 32, 1737, 310, 253, 8453, 273, 436, 4498, 275, 253, 24933, 451, 4466, 32, 37979, 24619, 434, 346, 2809, 640, 733, 5566, 3, 310, 253, 954, 4633, 4498, 2190, 24933, 8075, 15, 831, 4498, 7033, 281, 253, 24933, 451, 5978, 347, 352, 310, 271, 49689, 273, 1881, 14, 14764, 593, 285, 41859, 581, 434, 2060, 414, 15, 380, 4498, 434, 3935, 273, 417, 13872, 2571, 3324, 368, 1066, 285, 281, 816, 11012, 352, 745, 8146, 684, 342, 253, 24933, 451, 4466, 13, 534, 310, 2223, 7943, 407, 247, 2266, 3282, 273, 2060, 414, 285, 247, 18235, 273, 38058, 22429, 15, 9157, 13, 253, 4498, 434, 598, 19505, 285, 5834, 90, 40641, 2789, 352, 247, 3962, 4944, 323, 253, 24933, 451, 5978, 13, 534, 310, 1929, 323, 697, 2389, 273, 1684, 3440, 15]}
3. 总结