Can you train new or forbidden knowledge into an LLM? Let’s find out as I throw 1 gigabyte of scraped, cleaned, plaintext KiwiFarms posts at Mistral 7B. I go over my experience fine-tuning Mistral 7B on a few large datasets of scraped text, including English-language song lyrics and a huge KiwiFarms post dataset.
Video Link: https://youtu.be/9bl1mJImj10
Video Resources
Jupyter Notebook .ipynb Uploaded to GDrive
https://drive.google.com/file/d/1mnew-Y1DQ0Z7AGxulF04Xur1w7SHhj3q/view?usp=sharing
Finetuning LLMs with LoRA and QLoRA: Insights from Hundreds of Experiments by Sebastian Raschka
https://lightning.ai/pages/community/lora-insights/
Can LLMs learn from a single example?
https://www.fast.ai/posts/2023-09-04-learning-jumps/
LM Evaluation Harness
https://github.com/EleutherAI/lm-evaluation-harness
Convert with Calibre
https://gist.github.com/rohshall/8980b8f73374c767dbe0a82bcf8ae86c
Calibre
https://calibre-ebook.com/
Unstructured IO
https://github.com/Unstructured-IO
QLoRA
https://github.com/artidoro/qlora
PEFT
https://github.com/huggingface/peft
Bitsandbytes
https://github.com/TimDettmers/bitsandbytes
Original LongLoRA merge script
https://github.com/dvlab-research/LongLoRA/blob/main/merge_lora_weights_and_save_hf_model.py
OpenLLM Leaderboard
https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard
LM Eval Harness example command:
python main.py --model hf-causal-experimental --model_args pretrained="/home/nano/textgen/models/mistral-books-br-2048-v2-7300",low_cpu_mem_usage=True,load_in_4bit=True,bnb_4bit_use_double_quant=True,bnb_4bit_quant_type="nf4",bnb_4bit_compute_dtype=bfloat16 --tasks arithmetic_2ds,arithmetic_4ds,truthfulqa_mc --batch_size 8 --num_fewshot 0 --output_path "/home/nano/textgen/models/mistral-books-br-2048-v2-7300-arith-truthfulqa_mc.json"
Text Generation WebUI
https://github.com/oobabooga/text-generation-webui
Code for Jupyter Notebook (because WordPress won’t allow me to upload a .ipynb, and I don’t feel like figuring out how to fix that right now)
# Import libraries
import os
import argparse
import torch
import torch.nn as nn
from datasets import load_dataset,Features,Value,load_from_disk
import transformers
from functools import partial
from transformers import MistralForCausalLM, MistralModel, MistralConfig, AutoConfig, AutoModelForCausalLM, AutoTokenizer, set_seed, Trainer, TrainingArguments, BitsAndBytesConfig, \
DataCollatorForLanguageModeling
import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
from flash_attn import flash_attn_qkvpacked_func, flash_attn_func # not called directly below; imported here as a check that flash-attn is installed
seed = 5318008
set_seed(seed)
#!pip install -U git+https://github.com/huggingface/transformers
#https://github.com/TimDettmers/bitsandbytes
#!pip install bitsandbytes
#https://github.com/huggingface/peft
#!pip install peft
#https://github.com/Dao-AILab/flash-attention
#!pip install -U flash-attn --no-build-isolation
#Set custom tokens for Mistral
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"
def load_model(model_name, bnb_config):
n_gpus = torch.cuda.device_count()
max_memory = f'{12000}MB'
configuration = MistralConfig()
model = AutoModelForCausalLM.from_pretrained(
model_name,
config=configuration,
quantization_config=bnb_config,
device_map="auto", # dispatch efficiently the model on the available ressources
max_memory = {i: max_memory for i in range(n_gpus)},
#Requires Flash Attention 2 installation
use_flash_attention_2=True,
)
max_length = 2048
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="right", pad_to_multiple_of=max_length,
model_max_length=max_length,use_fast=False)
special_tokens_dict = dict()
if tokenizer.pad_token is None:
special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
if tokenizer.eos_token is None:
special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
if tokenizer.bos_token is None:
special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN
if tokenizer.unk_token is None:
special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN
num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))
#Freeze all layers and cast the layer norms to float32 for stability; the output of the last layer is also cast to float32 for the same reason.
#https://colab.research.google.com/drive/1jCkpikz0J2o20FBQmYmAGdiKmJGOMo-o?usp=sharing#scrollTo=9fTSZntA1iUG
for param in model.parameters():
param.requires_grad = False # freeze the model - train adapters later
if param.ndim == 1:
# cast the small parameters (e.g. layernorm) to fp32 for stability
param.data = param.data.to(torch.float32)
model.gradient_checkpointing_enable() # reduce number of stored activations
model.enable_input_require_grads()
if num_new_tokens > 0:
print(num_new_tokens)
input_embeddings = model.get_input_embeddings().weight.data
output_embeddings = model.get_output_embeddings().weight.data
input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
input_embeddings[-num_new_tokens:] = input_embeddings_avg
output_embeddings[-num_new_tokens:] = output_embeddings_avg
# Needed for LLaMA tokenizer
#tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id
return model, tokenizer
def create_bnb_config():
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16,
)
return bnb_config
def create_peft_config(modules):
"""
Create Parameter-Efficient Fine-Tuning config for your model
:param modules: Names of the modules to apply Lora to
"""
config = LoraConfig(
r=32, # dimension of the updated matrices
lora_alpha=64, # parameter for scaling
target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj","lm_head"],
#target_modules=find_all_linear_names(model)
lora_dropout=0.05, # dropout probability for layers
bias="none",
task_type="CAUSAL_LM",
)
print(modules)
return config
# SOURCE https://github.com/artidoro/qlora/blob/main/qlora.py
def find_all_linear_names(model):
cls = bnb.nn.Linear4bit #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
lora_module_names = set()
for name, module in model.named_modules():
if isinstance(module, cls):
names = name.split('.')
lora_module_names.add(names[0] if len(names) == 1 else names[-1])
# if 'lm_head' in lora_module_names: # needed for 16-bit
# lora_module_names.remove('lm_head')
return list(lora_module_names)
def print_trainable_parameters(model):
"""
Prints the number of trainable parameters in the model.
"""
trainable_params = 0
all_param = 0
for _, param in model.named_parameters():
all_param += param.numel()
if param.requires_grad:
trainable_params += param.numel()
print(
f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
)
from unidecode import unidecode
model_name = "mistralai/Mistral-7B-v0.1"
#model_name = "/home/nano/textgen/models/mistralai_Mistral-7B-v0.1"
bnb_config = create_bnb_config()
model, tokenizer = load_model(model_name, bnb_config)
context_feat = Features({'text': Value(dtype='string', id=None)})
data = load_dataset('csv',data_files={'train': ['/mnt/d/test/train.csv'],
'test': ['/mnt/d/test/val.csv']},features=context_feat)
# tokenize_fn joins every document in the batch into one long string separated by EOS tokens,
# tokenizes it without truncation, pads to a multiple of the context length, and
# reshapes the result into fixed-size blocks of context_length tokens
def tokenize_fn(tokenizer, example):
context_length = tokenizer.model_max_length
outputs = tokenizer(
tokenizer.eos_token.join(example["text"]),
truncation=False,
return_tensors="pt",
pad_to_multiple_of=context_length,
padding=True,
)
return {"input_ids": outputs["input_ids"].view(-1, context_length)}
data = data.map(partial(tokenize_fn,tokenizer),batched=True, batch_size=1000, num_proc=2, remove_columns=["text"])
#data.save_to_disk("/mnt/d/test/tokenized-dataset-books-v2/")
data = load_from_disk("/mnt/d/test/tokenized-dataset-books-v2/") # reloads a previously tokenized dataset; on a first run, uncomment the save above and skip this line
print(tokenizer.decode(data["train"][1505]["input_ids"]))
output_dir = "/mnt/d/test-books2/"
import os
os.environ["WANDB_DISABLED"] = "true"
#tokenizer.pad_token_id = tokenizer.eos_token_id
def train(model, tokenizer, dataset, output_dir):
# Apply preprocessing to the model to prepare it by
# 1 - Enabling gradient checkpointing to reduce memory usage during fine-tuning
model.enable_input_require_grads() # required for gradient checkpointing
model.gradient_checkpointing_enable()
# 2 - Using the prepare_model_for_kbit_training method from PEFT
model = prepare_model_for_kbit_training(model)
# Get lora module names
modules = find_all_linear_names(model)
# Create PEFT config for these modules and wrap the model to PEFT
peft_config = create_peft_config(modules)
model = get_peft_model(model, peft_config)
# Print information about the percentage of trainable parameters
print_trainable_parameters(model)
# Training parameters
trainer = Trainer(
model=model,
train_dataset=dataset,
args=TrainingArguments(
per_device_train_batch_size=1,
gradient_accumulation_steps=1,
warmup_steps=2,
#max_steps=15,
report_to="tensorboard",
save_steps=100,
num_train_epochs=4,
learning_rate=2e-5,
fp16=True,
logging_steps=1,
output_dir=output_dir,
optim="paged_adamw_8bit",
),
data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False # re-enable for inference to speed up predictions for similar inputs
### SOURCE https://github.com/artidoro/qlora/blob/main/qlora.py
# Verifying the datatypes before training
dtypes = {}
for _, p in model.named_parameters():
dtype = p.dtype
if dtype not in dtypes: dtypes[dtype] = 0
dtypes[dtype] += p.numel()
total = 0
for k, v in dtypes.items(): total+= v
for k, v in dtypes.items():
print(k, v, v/total)
do_train = True
# Launch training
print("Training...")
if do_train:
train_result = trainer.train(resume_from_checkpoint=True)
# train_result = trainer.train()
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()
print(metrics)
###
# Saving model
print("Saving last checkpoint of the model...")
os.makedirs(output_dir, exist_ok=True)
trainer.model.save_pretrained(output_dir)
# Free memory for merging weights
del model
del trainer
torch.cuda.empty_cache()
train(model, tokenizer, data["train"], output_dir)
Test LoRA
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
#Set custom tokens for Mistral
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"
tokenizer = AutoTokenizer.from_pretrained("/home/nano/textgen/models/mistralai_Mistral-7B-v0.1", padding_side="left",use_fast=False)
special_tokens_dict = dict()
if tokenizer.pad_token is None:
special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
if tokenizer.eos_token is None:
special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
if tokenizer.bos_token is None:
special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN
if tokenizer.unk_token is None:
special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN
num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
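Note that the training cell above deletes model after saving the adapter, so if this Test LoRA section is run in a fresh session, the base model has to be reloaded and the saved adapter attached before generating. A minimal sketch, assuming the earlier cells (imports and create_bnb_config) have been run and that you want the final adapter written to output_dir rather than an intermediate checkpoint folder:
# Reload the 4-bit base model and attach the trained LoRA adapter
# (path below is the output_dir used during training; point it at a specific checkpoint folder if you prefer)
from peft import PeftModel
base_model = AutoModelForCausalLM.from_pretrained(
"mistralai/Mistral-7B-v0.1",
quantization_config=create_bnb_config(),
device_map="auto",
)
base_model.resize_token_embeddings(len(tokenizer)) # account for the [PAD] token added above
model = PeftModel.from_pretrained(base_model, "/mnt/d/test-books2/")
model.eval()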
# Specify input
text = "Test input prompt here"
# Specify device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.config.use_cache = True # re-enable the KV cache for inference to speed up generation
# Tokenize input text
inputs = tokenizer(text, return_tensors="pt").to(device)
streamer = TextStreamer(tokenizer)
# Get answer
# (Adjust max_new_tokens variable as you wish (maximum number of tokens the model can generate to answer the input))
outputs = model.generate(input_ids=inputs["input_ids"].to(device), #do_sample=True,
#top_k=50,
#top_p=0.95,
#temperature=0.1,
streamer=streamer, attention_mask=inputs["attention_mask"], max_new_tokens=2048)
# Decode output & print it
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
Create Dataset
import os
import re
import glob
import pandas as pd
from tqdm.auto import tqdm
file_name_and_text = {}
base_path = "/mnt/e/br2/"
import_files = os.listdir(base_path)
with tqdm() as bar:
for file in import_files:
print(file)
with open(base_path+file, "r") as target_file:
file_name_and_text[file] = target_file.read()
bar.update(1)
file_data = (pd.DataFrame.from_dict(file_name_and_text, orient='index').reset_index().rename(index = str, columns = {'index': 'file_name', 0: 'text'}))
print((file_data["text"][9]))
file_data["text"] = file_data["text"].str.replace(r'\n\n\n\n\n\n\n\n\n', "\n", regex=True)
file_data["text"] = file_data["text"].str.replace(r'\n\n\n\n\n\n\n\n', "\n", regex=True)
file_data["text"] = file_data["text"].str.replace(r'\n\n\n\n\n\n\n', "\n", regex=True)
file_data["text"] = file_data["text"].str.replace(r'\n\n\n\n\n\n', "\n", regex=True)
file_data["text"] = file_data["text"].str.replace(r'\n\n\n\n\n', "\n", regex=True)
file_data["text"] = file_data["text"].str.replace(r'\n\n\n', "\n", regex=True)
data["train"] = data["train"].str.replace(r'\n\n\n\n\n\n\n\n\n', "\n", regex=True)
def remove_html_tags(text):
clean = re.compile('<.*?>')
return re.sub(clean, '', text)
file_data["text"] = file_data["text"].apply(lambda x: remove_html_tags(str(x)))
header = ["text"]
file_data.to_csv("docs-test.csv", columns=header)
#Source: From here, maybe
#https://www.kaggle.com/code/nourhanaboelsoaoud/english-to-french-translation or
#https://github.com/UtkarshGarg-UG/Deep-Learning-Projects/blob/main/NLP/Custom%20Dataset/loading%20custom%20dataset%20(text).ipynb I have no idea. Maybe I wrote it in a fugue state.
import numpy as np
val_frac = 0.01 #percentage of data in val
val_split_idx = int(len(file_data)*val_frac) #index on which to split
data_idx = list(range(len(file_data))) #create a list of ints till len of data
#To shuffle, or not to shuffle. If you plan to run eval, shuffle, probably, so the distribution in the eval dataset makes sense.
#I am doing linear training, where I want the data to be seen in a particular order, so I cannot shuffle, and I don't
#want to bother pulling non-duplicate data out for the eval set, so no eval. Who cares, anyway. It's all subjective.
#np.random.shuffle(data_idx)
#get indexes for validation and train
val_idx, train_idx = data_idx[:val_split_idx], data_idx[val_split_idx:]
print('len of train: ', len(train_idx))
print('len of val: ', len(val_idx))
#create the sets
train = file_data.iloc[train_idx].reset_index().drop('index',axis=1)
print(train.info(verbose=False, memory_usage="deep"))
train.replace('', np.nan, inplace=True)
train.dropna(inplace=True)
train.to_csv("/mnt/d/test/train-books2.csv")
val = file_data.iloc[val_idx].reset_index().drop('index',axis=1)
print(val.info(verbose=False, memory_usage="deep"))
val.replace('', np.nan, inplace=True)
val.dropna(inplace=True)
val.to_csv("/mnt/d/test/val-books2.csv")
Convert PDF to Text (OCR Method) using Unstructured
import glob
import pandas as pd
from unstructured.cleaners.core import clean
from unstructured.partition.auto import partition
from unstructured.documents.elements import Title, NarrativeText
from unstructured.staging.base import convert_to_dataframe
all_pdfs = glob.glob("/mnt/d/gametheory/*.pdf")
print("Total number of files: ", len(all_pdfs))
tdf_headers = ["text"]
file_data = pd.DataFrame(columns=tdf_headers)
output_s = ""
lst = []
for pdf in all_pdfs:
elements = partition(pdf, content_type="application/pdf")
elements = [clean(str(el)) for el in elements]
for element in elements:
ele_s = str(element)
lst.append(ele_s)
df_extended = pd.DataFrame(lst, columns=file_data.columns)
file_data = pd.concat([file_data, df_extended])
import numpy as np
val_frac = 0.01 #percentage of data in val
val_split_idx = int(len(file_data)*val_frac) #index on which to split
data_idx = list(range(len(file_data))) #create a list of ints till len of data
#np.random.shuffle(data_idx)
#get indexes for validation and train
val_idx, train_idx = data_idx[:val_split_idx], data_idx[val_split_idx:]
print('len of train: ', len(train_idx))
print('len of val: ', len(val_idx))
#create the sets
train = file_data.iloc[train_idx].reset_index().drop('index',axis=1)
print(train.info(verbose=False, memory_usage="deep"))
train.replace('', np.nan, inplace=True)
train.dropna(inplace=True)
train.to_csv("/mnt/d/gametheory/train-gt.csv")
val = file_data.iloc[val_idx].reset_index().drop('index',axis=1)
print(val.info(verbose=False, memory_usage="deep"))
val.replace('', np.nan, inplace=True)
val.dropna(inplace=True)
val.to_csv("/mnt/d/gametheory/val-gt.csv")
Merge LoRA Script
# Written by Yukang Chen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import torch
import argparse
import transformers
from peft import PeftModel
from typing import Dict
IGNORE_INDEX = -100
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"
def parse_config():
parser = argparse.ArgumentParser(description='arg parser')
parser.add_argument('--base_model', type=str, default="/data/pretrained-models/llama-7b-hf")
parser.add_argument('--peft_model', type=str, default=None, help='')
parser.add_argument('--context_size', type=int, default=-1, help='context size during fine-tuning')
parser.add_argument('--save_path', type=str, default=None, help='')
parser.add_argument('--cache_dir', type=str, default=None, help='./cache_dir')
args = parser.parse_args()
return args
def smart_tokenizer_and_embedding_resize(
special_tokens_dict: Dict,
tokenizer: transformers.PreTrainedTokenizer,
model: transformers.PreTrainedModel,
):
"""Resize tokenizer and embedding.
Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
"""
num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))
if num_new_tokens > 0:
input_embeddings = model.get_input_embeddings().weight.data
output_embeddings = model.get_output_embeddings().weight.data
input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
input_embeddings[-num_new_tokens:] = input_embeddings_avg
output_embeddings[-num_new_tokens:] = output_embeddings_avg
def main(args):
device = "cuda:0"
torch.cuda.set_device(device)
print("base model", args.base_model)
print("peft model", args.peft_model)
# Load model and tokenizer
model = transformers.AutoModelForCausalLM.from_pretrained(
args.base_model,
cache_dir=args.cache_dir,
torch_dtype=torch.float16,
device_map={"": "cpu"}
)
tokenizer = transformers.AutoTokenizer.from_pretrained(
args.base_model,
cache_dir=args.cache_dir,
model_max_length=args.context_size,
padding_side="right",
use_fast=False,
)
special_tokens_dict = dict()
if tokenizer.pad_token is None:
special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
if tokenizer.eos_token is None:
special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
if tokenizer.bos_token is None:
special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN
if tokenizer.unk_token is None:
special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN
smart_tokenizer_and_embedding_resize(
special_tokens_dict=special_tokens_dict,
tokenizer=tokenizer,
model=model,
)
trainable_params = os.path.join(args.peft_model, "trainable_params.bin")
if os.path.isfile(trainable_params):
model.load_state_dict(torch.load(trainable_params, map_location=model.device), strict=False)
model = PeftModel.from_pretrained(
model,
args.peft_model,
#device_map="auto",
torch_dtype=torch.float16,
device_map={"": "cpu"}
)
model = model.merge_and_unload()
model.save_pretrained(args.save_path, max_shard_size="400MB")
tokenizer.save_pretrained(args.save_path)
if __name__ == "__main__":
args = parse_config()
main(args)
#python llmerge.py --base_model /home/nano/textgen/models/mistralai_Mistral-7B-v0.1 --peft_model /home/nano/gpt/mistral-lyrics-pop-2048-v1.1/checkpoint-3900 --save_path /home/nano/textgen/models/mistral-lyrics-pop-3900
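After merging, it's worth confirming that the merged checkpoint loads and generates on its own, without PEFT in the loop. A rough sketch, assuming the --save_path from the example command above:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
merged_path = "/home/nano/textgen/models/mistral-lyrics-pop-3900" # --save_path from the command above
tok = AutoTokenizer.from_pretrained(merged_path, use_fast=False)
merged = AutoModelForCausalLM.from_pretrained(merged_path, torch_dtype=torch.float16, device_map="auto")
inputs = tok("Test input prompt here", return_tensors="pt").to(merged.device)
print(tok.decode(merged.generate(**inputs, max_new_tokens=64)[0], skip_special_tokens=True))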