Unsloth + Llama 3本机微调大模型指南 | 安全快速训练个性化模型
本文是Youtube影片的辅助文档,在观看完成Youtube原影片后,使用本文档效果会更佳。
本机训练最大的好处就是数据安全有保障。无论是个人 企业 还是机构,数据都是最宝贵的产之一,传到云端无论怎么加密都会承担泄露风险,而用本机训练模型则安全经济便捷。
平台软件安装
下载conda安装文件
wget https://repo.anaconda.com/archive/Anaconda3-2024.02-1-Linux-x86_64.sh
验证哈希码
sha256sum Anaconda3-2024.02-1-Linux-x86_64.sh
运行Conda安装脚本
bash Anaconda3-2024.02-1-Linux-x86_64.sh
启用Conda环境变量
source ~/.bashrc
安装Git
sudo apt install git
安装cuda加速驱动
sudo apt install nvidia-cuda-toolkit
安装c语言编译器
sudo apt install build-essential
安装Unsloth
创建conda虚拟环境
conda create --name unsloth_env python=3.10
激活conda虚拟环境
conda activate unsloth_env
安装pytorch cuda xformers
conda install pytorch-cuda=12.1 pytorch cudatoolkit xformers -c pytorch -c nvidia -c xformers
pip安装unsloth和依赖包
pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
pip install --no-deps trl peft accelerate bitsandbytes
微调代码
train_text.py
from unsloth import FastLanguageModel import torch from trl import SFTTrainer from transformers import TrainingArguments from datasets import load_dataset max_seq_length = 2048 # Supports RoPE Scaling interally, so choose any! # Get LAION dataset url = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl" dataset = load_dataset("json", data_files = {"train" : url}, split = "train") # local_file = "/home/larry/Downloads/my_file.json"; # dataset = load_dataset("json", data_files = {"train" : local_file}, split = "train") # dataset = dataset.map(formatting_prompts_func, batched = True,) # 4bit pre quantized models we support for 4x faster downloading + no OOMs. fourbit_models = [ "unsloth/mistral-7b-bnb-4bit", "unsloth/mistral-7b-instruct-v0.2-bnb-4bit", "unsloth/llama-2-7b-bnb-4bit", "unsloth/gemma-7b-bnb-4bit", "unsloth/gemma-7b-it-bnb-4bit", # Instruct version of Gemma 7b "unsloth/gemma-2b-bnb-4bit", "unsloth/gemma-2b-it-bnb-4bit", # Instruct version of Gemma 2b "unsloth/llama-3-8b-bnb-4bit", # [NEW] 15 Trillion token Llama-3 "unsloth/Phi-3-mini-4k-instruct-bnb-4bit", ] # More models at https://huggingface.co/unsloth model, tokenizer = FastLanguageModel.from_pretrained( model_name = "unsloth/llama-3-8b-bnb-4bit", max_seq_length = max_seq_length, dtype = None, load_in_4bit = True, ) # Do model patching and add fast LoRA weights model = FastLanguageModel.get_peft_model( model, r = 16, target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",], lora_alpha = 16, lora_dropout = 0, # Supports any, but = 0 is optimized bias = "none", # Supports any, but = "none" is optimized # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes! use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context random_state = 3407, max_seq_length = max_seq_length, use_rslora = False, # We support rank stabilized LoRA loftq_config = None, # And LoftQ ) trainer = SFTTrainer( model = model, train_dataset = dataset, dataset_text_field = "text", max_seq_length = max_seq_length, tokenizer = tokenizer, args = TrainingArguments( per_device_train_batch_size = 2, gradient_accumulation_steps = 4, warmup_steps = 10, max_steps = 60, fp16 = not torch.cuda.is_bf16_supported(), bf16 = torch.cuda.is_bf16_supported(), logging_steps = 1, output_dir = "outputs", optim = "adamw_8bit", seed = 3407, ), ) trainer.train() # model.save_pretrained("my_model") # save # Go to https://github.com/unslothai/unsloth/wiki for advanced tips like # (1) Saving to GGUF / merging to 16bit for vLLM # (2) Continued training from a saved LoRA adapter # (3) Adding an evaluation loop / OOMs # (4) Cutomized chat templates
train_instruction.py
from unsloth import FastLanguageModel import torch from trl import SFTTrainer from transformers import TrainingArguments from datasets import load_dataset max_seq_length = 2048 # Supports RoPE Scaling interally, so choose any! # 4bit pre quantized models we support for 4x faster downloading + no OOMs. fourbit_models = [ "unsloth/mistral-7b-bnb-4bit", "unsloth/mistral-7b-instruct-v0.2-bnb-4bit", "unsloth/llama-2-7b-bnb-4bit", "unsloth/gemma-7b-bnb-4bit", "unsloth/gemma-7b-it-bnb-4bit", # Instruct version of Gemma 7b "unsloth/gemma-2b-bnb-4bit", "unsloth/gemma-2b-it-bnb-4bit", # Instruct version of Gemma 2b "unsloth/llama-3-8b-bnb-4bit", # [NEW] 15 Trillion token Llama-3 "unsloth/Phi-3-mini-4k-instruct-bnb-4bit", ] # More models at https://huggingface.co/unsloth model, tokenizer = FastLanguageModel.from_pretrained( model_name = "unsloth/llama-3-8b-bnb-4bit", max_seq_length = max_seq_length, dtype = None, load_in_4bit = True, ) # Do model patching and add fast LoRA weights model = FastLanguageModel.get_peft_model( model, r = 16, target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",], lora_alpha = 16, lora_dropout = 0, # Supports any, but = 0 is optimized bias = "none", # Supports any, but = "none" is optimized # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes! use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context random_state = 3407, max_seq_length = max_seq_length, use_rslora = False, # We support rank stabilized LoRA loftq_config = None, # And LoftQ ) EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. ### Instruction: {} ### Input: {} ### Response: {}""" def formatting_prompts_func(examples): instructions = examples["instruction"] inputs = examples["input"] outputs = examples["output"] texts = [] for instruction, input, output in zip(instructions, inputs, outputs): # Must add EOS_TOKEN, otherwise your generation will go on forever! text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN texts.append(text) return { "text" : texts, } # Get local dataset local_file = "/home/larry/Downloads/my_file.json"; dataset = load_dataset("json", data_files = {"train" : local_file}, split = "train") dataset = dataset.map(formatting_prompts_func, batched = True,) trainer = SFTTrainer( model = model, train_dataset = dataset, dataset_text_field = "text", max_seq_length = max_seq_length, tokenizer = tokenizer, args = TrainingArguments( per_device_train_batch_size = 2, gradient_accumulation_steps = 4, warmup_steps = 10, max_steps = 60, fp16 = not torch.cuda.is_bf16_supported(), bf16 = torch.cuda.is_bf16_supported(), logging_steps = 1, output_dir = "outputs", optim = "adamw_8bit", seed = 3407, ), ) trainer.train() model.save_pretrained("my_model") # save # Go to https://github.com/unslothai/unsloth/wiki for advanced tips like # (1) Saving to GGUF / merging to 16bit for vLLM # (2) Continued training from a saved LoRA adapter # (3) Adding an evaluation loop / OOMs # (4) Cutomized chat templates
text.py
from unsloth import FastLanguageModel from transformers import TextStreamer max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally! dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+ load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False. model, tokenizer = FastLanguageModel.from_pretrained( model_name = "my_model", # the model that you trained max_seq_length = max_seq_length, dtype = dtype, load_in_4bit = load_in_4bit, ) FastLanguageModel.for_inference(model) # Enable native 2x faster inference alpaca_prompt = """Based on given instruction and context, generate an appropriate response. ### Instruction: {} ### Input: {} ### Response: {}""" inputs = tokenizer( [ alpaca_prompt.format( "小林最擅长的什么领域的工作?", # instruction "", # input "", # output - leave this blank for generation! ) ], return_tensors = "pt").to("cuda") text_streamer = TextStreamer(tokenizer) _ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128) inputs = tokenizer( [ alpaca_prompt.format( "小林AI实验室的好评率是多少", # instruction "", # input "", # output - leave this blank for generation! ) ], return_tensors = "pt").to("cuda") text_streamer = TextStreamer(tokenizer) _ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128) inputs = tokenizer( [ alpaca_prompt.format( "小林在Youtube开频道的初衷是什么?", # instruction "", # input "", # output - leave this blank for generation! ) ], return_tensors = "pt").to("cuda") text_streamer = TextStreamer(tokenizer) _ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)