Can you train new or forbidden knowledge into an LLM? Let’s find out as I throw 1 gigabyte of scraped, cleaned, plaintext KiwiFarms posts at Mistral 7B. I go over my experience fine-tuning Mistral 7B on a few large datasets of scraped text, including English-language song lyrics and a huge KiwiFarms post dataset.
Video Link: https://youtu.be/9bl1mJImj10
Video Resources
Jupyter Notebook .ipynb Uploaded to GDrive
https://drive.google.com/file/d/1mnew-Y1DQ0Z7AGxulF04Xur1w7SHhj3q/view?usp=sharing
Finetuning LLMs with LoRA and QLoRA: Insights from Hundreds of Experiments by Sebastian Raschka
https://lightning.ai/pages/community/lora-insights/
Can LLMs learn from a single example?
https://www.fast.ai/posts/2023-09-04-learning-jumps/
LM Evaluation Harness
https://github.com/EleutherAI/lm-evaluation-harness
Convert with Calibre
https://gist.github.com/rohshall/8980b8f73374c767dbe0a82bcf8ae86c
Calibre
https://calibre-ebook.com/
Unstructured IO
https://github.com/Unstructured-IO
QLoRA
https://github.com/artidoro/qlora
PEFT
https://github.com/huggingface/peft
Bitsandbytes
https://github.com/TimDettmers/bitsandbytes
Original LongLoRA merge script
https://github.com/dvlab-research/LongLoRA/blob/main/merge_lora_weights_and_save_hf_model.py
OpenLLM Leaderboard
https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard
LM Eval Harness example command:
python main.py --model hf-causal-experimental --model_args pretrained="/home/nano/textgen/models/mistral-books-br-2048-v2-7300",low_cpu_mem_usage=True,load_in_4bit=True,bnb_4bit_use_double_quant=True,bnb_4bit_quant_type="nf4",bnb_4bit_compute_dtype=bfloat16 --tasks arithmetic_2ds,arithmetic_4ds,truthfulqa_mc --batch_size 8 --num_fewshot 0 --output_path "/home/nano/textgen/models/mistral-books-br-2048-v2-7300-arith-truthfulqa_mc.json"
Text Generation WebUI
https://github.com/oobabooga/text-generation-webui
Code for Jupyter Notebook (because WordPress won’t allow me to upload a .ipynb, and I don’t feel like figuring out how to fix that right now)
# Import libraries
import os
import argparse
import torch
import torch.nn as nn
from datasets import load_dataset,Features,Value,load_from_disk
import transformers
from functools import partial
from transformers import MistralForCausalLM, MistralModel, MistralConfig, AutoConfig, AutoModelForCausalLM, AutoTokenizer, set_seed, Trainer, TrainingArguments, BitsAndBytesConfig, \
DataCollatorForLanguageModeling
import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
from flash_attn import flash_attn_qkvpacked_func, flash_attn_func # not called directly below; imported here as a check that flash-attn is installed
seed = 5318008
set_seed(seed)
#!pip install -U git+https://github.com/huggingface/transformers
#https://github.com/TimDettmers/bitsandbytes
#!pip install bitsandbytes
#https://github.com/huggingface/peft
#!pip install peft
#https://github.com/Dao-AILab/flash-attention
#!pip install -U flash-attn --no-build-isolation
#Set custom tokens for Mistral
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"
def load_model(model_name, bnb_config):
n_gpus = torch.cuda.device_count()
max_memory = f'{12000}MB'
configuration = MistralConfig()
model = AutoModelForCausalLM.from_pretrained(
model_name,
config=configuration,
quantization_config=bnb_config,
device_map="auto", # dispatch efficiently the model on the available ressources
max_memory = {i: max_memory for i in range(n_gpus)},
#Requires Flash Attention 2 installation
use_flash_attention_2=True,
)
max_length = 2048
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="right", pad_to_multiple_of=max_length,
model_max_length=max_length,use_fast=False)
special_tokens_dict = dict()
if tokenizer.pad_token is None:
special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
if tokenizer.eos_token is None:
special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
if tokenizer.bos_token is None:
special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN
if tokenizer.unk_token is None:
special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN
num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))
#Freeze all layers and cast the layer norms to float32 for stability; the output of the last layer is also cast to float32 for the same reason.
#https://colab.research.google.com/drive/1jCkpikz0J2o20FBQmYmAGdiKmJGOMo-o?usp=sharing#scrollTo=9fTSZntA1iUG
for param in model.parameters():
param.requires_grad = False # freeze the model - train adapters later
if param.ndim == 1:
# cast the small parameters (e.g. layernorm) to fp32 for stability
param.data = param.data.to(torch.float32)
model.gradient_checkpointing_enable() # reduce number of stored activations
model.enable_input_require_grads()
if num_new_tokens > 0:
print(num_new_tokens)
input_embeddings = model.get_input_embeddings().weight.data
output_embeddings = model.get_output_embeddings().weight.data
input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
input_embeddings[-num_new_tokens:] = input_embeddings_avg
output_embeddings[-num_new_tokens:] = output_embeddings_avg
# Needed for LLaMA tokenizer
#tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id
return model, tokenizer
def create_bnb_config():
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16,
)
return bnb_config
def create_peft_config(modules):
"""
Create Parameter-Efficient Fine-Tuning config for your model
:param modules: Names of the modules to apply Lora to
"""
config = LoraConfig(
r=32, # dimension of the updated matrices
lora_alpha=64, # parameter for scaling
target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj","lm_head"],
#target_modules=find_all_linear_names(model)
lora_dropout=0.05, # dropout probability for layers
bias="none",
task_type="CAUSAL_LM",
)
print(modules)
return config
# SOURCE https://github.com/artidoro/qlora/blob/main/qlora.py
def find_all_linear_names(model):
cls = bnb.nn.Linear4bit #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
lora_module_names = set()
for name, module in model.named_modules():
if isinstance(module, cls):
names = name.split('.')
lora_module_names.add(names[0] if len(names) == 1 else names[-1])
# if 'lm_head' in lora_module_names: # needed for 16-bit
# lora_module_names.remove('lm_head')
return list(lora_module_names)
def print_trainable_parameters(model):
"""
Prints the number of trainable parameters in the model.
"""
trainable_params = 0
all_param = 0
for _, param in model.named_parameters():
all_param += param.numel()
if param.requires_grad:
trainable_params += param.numel()
print(
f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
)
from unidecode import unidecode
model_name = "mistralai/Mistral-7B-v0.1"
#model_name = "/home/nano/textgen/models/mistralai_Mistral-7B-v0.1"
bnb_config = create_bnb_config()
model, tokenizer = load_model(model_name, bnb_config)
context_feat = Features({'text': Value(dtype='string', id=None)})
data = load_dataset('csv',data_files={'train': ['/mnt/d/test/train.csv'],
'test': ['/mnt/d/test/val.csv']},features=context_feat)
# tokenize_fn joins every document in the batch into one long string separated by EOS tokens,
# tokenizes it without truncation, pads to a multiple of the context length, and
# reshapes the result into fixed-size blocks of context_length tokens
def tokenize_fn(tokenizer, example):
context_length = tokenizer.model_max_length
outputs = tokenizer(
tokenizer.eos_token.join(example["text"]),
truncation=False,
return_tensors="pt",
pad_to_multiple_of=context_length,
padding=True,
)
return {"input_ids": outputs["input_ids"].view(-1, context_length)}
data = data.map(partial(tokenize_fn,tokenizer),batched=True, batch_size=1000, num_proc=2, remove_columns=["text"])
#data.save_to_disk("/mnt/d/test/tokenized-dataset-books-v2/")
data = load_from_disk("/mnt/d/test/tokenized-dataset-books-v2/") # reloads a previously tokenized dataset; on a first run, uncomment the save above and skip this line
print(tokenizer.decode(data["train"][1505]["input_ids"]))
output_dir = "/mnt/d/test-books2/"
import os
os.environ["WANDB_DISABLED"] = "true"
#tokenizer.pad_token_id = tokenizer.eos_token_id
def train(model, tokenizer, dataset, output_dir):
# Apply preprocessing to the model to prepare it by
# 1 - Enabling gradient checkpointing to reduce memory usage during fine-tuning
model.enable_input_require_grads() # required for gradient checkpointing
model.gradient_checkpointing_enable()
# 2 - Using the prepare_model_for_kbit_training method from PEFT
model = prepare_model_for_kbit_training(model)
# Get lora module names
modules = find_all_linear_names(model)
# Create PEFT config for these modules and wrap the model to PEFT
peft_config = create_peft_config(modules)
model = get_peft_model(model, peft_config)
# Print information about the percentage of trainable parameters
print_trainable_parameters(model)
# Training parameters
trainer = Trainer(
model=model,
train_dataset=dataset,
args=TrainingArguments(
per_device_train_batch_size=1,
gradient_accumulation_steps=1,
warmup_steps=2,
#max_steps=15,
report_to="tensorboard",
save_steps=100,
num_train_epochs=4,
learning_rate=2e-5,
fp16=True,
logging_steps=1,
output_dir=output_dir,
optim="paged_adamw_8bit",
),
data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False # re-enable for inference to speed up predictions for similar inputs
### SOURCE https://github.com/artidoro/qlora/blob/main/qlora.py
# Verifying the datatypes before training
dtypes = {}
for _, p in model.named_parameters():
dtype = p.dtype
if dtype not in dtypes: dtypes[dtype] = 0
dtypes[dtype] += p.numel()
total = 0
for k, v in dtypes.items(): total+= v
for k, v in dtypes.items():
print(k, v, v/total)
do_train = True
# Launch training
print("Training...")
if do_train:
train_result = trainer.train(resume_from_checkpoint=True)
# train_result = trainer.train()
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()
print(metrics)
###
# Saving model
print("Saving last checkpoint of the model...")
os.makedirs(output_dir, exist_ok=True)
trainer.model.save_pretrained(output_dir)
# Free memory for merging weights
del model
del trainer
torch.cuda.empty_cache()
train(model, tokenizer, data["train"], output_dir)
Test LoRA
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
#Set custom tokens for Mistral
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"
tokenizer = AutoTokenizer.from_pretrained("/home/nano/textgen/models/mistralai_Mistral-7B-v0.1", padding_side="left",use_fast=False)
special_tokens_dict = dict()
if tokenizer.pad_token is None:
special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
if tokenizer.eos_token is None:
special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
if tokenizer.bos_token is None:
special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN
if tokenizer.unk_token is None:
special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN
num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
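Note that the training cell above deletes model after saving the adapter, so if this Test LoRA section is run in a fresh session, the base model has to be reloaded and the saved adapter attached before generating. A minimal sketch, assuming the earlier cells (imports and create_bnb_config) have been run and that you want the final adapter written to output_dir rather than an intermediate checkpoint folder:
# Reload the 4-bit base model and attach the trained LoRA adapter
# (path below is the output_dir used during training; point it at a specific checkpoint folder if you prefer)
from peft import PeftModel
base_model = AutoModelForCausalLM.from_pretrained(
"mistralai/Mistral-7B-v0.1",
quantization_config=create_bnb_config(),
device_map="auto",
)
base_model.resize_token_embeddings(len(tokenizer)) # account for the [PAD] token added above
model = PeftModel.from_pretrained(base_model, "/mnt/d/test-books2/")
model.eval()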
# Specify input
text = "Test input prompt here"
# Specify device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.config.use_cache = True # re-enable the KV cache for inference to speed up generation
# Tokenize input text
inputs = tokenizer(text, return_tensors="pt").to(device)
streamer = TextStreamer(tokenizer)
# Get answer
# (Adjust max_new_tokens variable as you wish (maximum number of tokens the model can generate to answer the input))
outputs = model.generate(input_ids=inputs["input_ids"].to(device), #do_sample=True,
#top_k=50,
#top_p=0.95,
#temperature=0.1,
streamer=streamer, attention_mask=inputs["attention_mask"], max_new_tokens=2048)
# Decode output & print it
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
Create Dataset
import os
import re
import glob
import pandas as pd
from tqdm.auto import tqdm
file_name_and_text = {}
base_path = "/mnt/e/br2/"
import_files = os.listdir(base_path)
with tqdm() as bar:
for file in import_files:
print(file)
with open(base_path+file, "r") as target_file:
file_name_and_text[file] = target_file.read()
bar.update(1)
file_data = (pd.DataFrame.from_dict(file_name_and_text, orient='index').reset_index().rename(index = str, columns = {'index': 'file_name', 0: 'text'}))
print((file_data["text"][9]))
file_data["text"] = file_data["text"].str.replace(r'\n\n\n\n\n\n\n\n\n', "\n", regex=True)
file_data["text"] = file_data["text"].str.replace(r'\n\n\n\n\n\n\n\n', "\n", regex=True)
file_data["text"] = file_data["text"].str.replace(r'\n\n\n\n\n\n\n', "\n", regex=True)
file_data["text"] = file_data["text"].str.replace(r'\n\n\n\n\n\n', "\n", regex=True)
file_data["text"] = file_data["text"].str.replace(r'\n\n\n\n\n', "\n", regex=True)
file_data["text"] = file_data["text"].str.replace(r'\n\n\n', "\n", regex=True)
data["train"] = data["train"].str.replace(r'\n\n\n\n\n\n\n\n\n', "\n", regex=True)
def remove_html_tags(text):
clean = re.compile('<.*?>')
return re.sub(clean, '', text)
file_data["text"] = file_data["text"].apply(lambda x: remove_html_tags(str(x)))
header = ["text"]
file_data.to_csv("docs-test.csv", columns=header)
#Source: From here, maybe
#https://www.kaggle.com/code/nourhanaboelsoaoud/english-to-french-translation or
#https://github.com/UtkarshGarg-UG/Deep-Learning-Projects/blob/main/NLP/Custom%20Dataset/loading%20custom%20dataset%20(text).ipynb I have no idea. Maybe I wrote it in a fugue state.
import numpy as np
val_frac = 0.01 #percentage of data in val
val_split_idx = int(len(file_data)*val_frac) #index on which to split
data_idx = list(range(len(file_data))) #create a list of ints till len of data
#To shuffle, or not to shuffle. If you plan to run eval, shuffle, probably, so the distribution in the eval dataset makes sense.
#I am doing linear training, where I want the data to be seen in a particular order, so I cannot shuffle, and I don't
#want to bother pulling non-duplicate data out for the eval set, so no eval. Who cares, anyway. It's all subjective.
#np.random.shuffle(data_idx)
#get indexes for validation and train
val_idx, train_idx = data_idx[:val_split_idx], data_idx[val_split_idx:]
print('len of train: ', len(train_idx))
print('len of val: ', len(val_idx))
#create the sets
train = file_data.iloc[train_idx].reset_index().drop('index',axis=1)
print(train.info(verbose=False, memory_usage="deep"))
train.replace('', np.nan, inplace=True)
train.dropna(inplace=True)
train.to_csv("/mnt/d/test/train-books2.csv")
val = file_data.iloc[val_idx].reset_index().drop('index',axis=1)
print(val.info(verbose=False, memory_usage="deep"))
val.replace('', np.nan, inplace=True)
val.dropna(inplace=True)
val.to_csv("/mnt/d/test/val-books2.csv")
Convert PDF to Text (OCR Method) using Unstructured
import glob
import pandas as pd
from unstructured.cleaners.core import clean
from unstructured.partition.auto import partition
from unstructured.documents.elements import Title, NarrativeText
from unstructured.staging.base import convert_to_dataframe
all_pdfs = glob.glob("/mnt/d/gametheory/*.pdf")
print("Total number of files: ", len(all_pdfs))
tdf_headers = ["text"]
file_data = pd.DataFrame(columns=tdf_headers)
output_s = ""
lst = []
for pdf in all_pdfs:
elements = partition(pdf, content_type="application/pdf")
elements = [clean(str(el)) for el in elements]
for element in elements:
ele_s = str(element)
lst.append(ele_s)
df_extended = pd.DataFrame(lst, columns=file_data.columns)
file_data = pd.concat([file_data, df_extended])
import numpy as np
val_frac = 0.01 #percentage of data in val
val_split_idx = int(len(file_data)*val_frac) #index on which to split
data_idx = list(range(len(file_data))) #create a list of ints till len of data
#np.random.shuffle(data_idx)
#get indexes for validation and train
val_idx, train_idx = data_idx[:val_split_idx], data_idx[val_split_idx:]
print('len of train: ', len(train_idx))
print('len of val: ', len(val_idx))
#create the sets
train = file_data.iloc[train_idx].reset_index().drop('index',axis=1)
print(train.info(verbose=False, memory_usage="deep"))
train.replace('', np.nan, inplace=True)
train.dropna(inplace=True)
train.to_csv("/mnt/d/gametheory/train-gt.csv")
val = file_data.iloc[val_idx].reset_index().drop('index',axis=1)
print(val.info(verbose=False, memory_usage="deep"))
val.replace('', np.nan, inplace=True)
val.dropna(inplace=True)
val.to_csv("/mnt/d/gametheory/val-gt.csv")
Merge LoRA Script
# Written by Yukang Chen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import torch
import argparse
import transformers
from peft import PeftModel
from typing import Dict
IGNORE_INDEX = -100
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"
def parse_config():
parser = argparse.ArgumentParser(description='arg parser')
parser.add_argument('--base_model', type=str, default="/data/pretrained-models/llama-7b-hf")
parser.add_argument('--peft_model', type=str, default=None, help='')
parser.add_argument('--context_size', type=int, default=-1, help='context size during fine-tuning')
parser.add_argument('--save_path', type=str, default=None, help='')
parser.add_argument('--cache_dir', type=str, default=None, help='./cache_dir')
args = parser.parse_args()
return args
def smart_tokenizer_and_embedding_resize(
special_tokens_dict: Dict,
tokenizer: transformers.PreTrainedTokenizer,
model: transformers.PreTrainedModel,
):
"""Resize tokenizer and embedding.
Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
"""
num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))
if num_new_tokens > 0:
input_embeddings = model.get_input_embeddings().weight.data
output_embeddings = model.get_output_embeddings().weight.data
input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
input_embeddings[-num_new_tokens:] = input_embeddings_avg
output_embeddings[-num_new_tokens:] = output_embeddings_avg
def main(args):
device = "cuda:0"
torch.cuda.set_device(device)
print("base model", args.base_model)
print("peft model", args.peft_model)
# Load model and tokenizer
model = transformers.AutoModelForCausalLM.from_pretrained(
args.base_model,
cache_dir=args.cache_dir,
torch_dtype=torch.float16,
device_map={"": "cpu"}
)
tokenizer = transformers.AutoTokenizer.from_pretrained(
args.base_model,
cache_dir=args.cache_dir,
model_max_length=args.context_size,
padding_side="right",
use_fast=False,
)
special_tokens_dict = dict()
if tokenizer.pad_token is None:
special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
if tokenizer.eos_token is None:
special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
if tokenizer.bos_token is None:
special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN
if tokenizer.unk_token is None:
special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN
smart_tokenizer_and_embedding_resize(
special_tokens_dict=special_tokens_dict,
tokenizer=tokenizer,
model=model,
)
trainable_params = os.path.join(args.peft_model, "trainable_params.bin")
if os.path.isfile(trainable_params):
model.load_state_dict(torch.load(trainable_params, map_location=model.device), strict=False)
model = PeftModel.from_pretrained(
model,
args.peft_model,
#device_map="auto",
torch_dtype=torch.float16,
device_map={"": "cpu"}
)
model = model.merge_and_unload()
model.save_pretrained(args.save_path, max_shard_size="400MB")
tokenizer.save_pretrained(args.save_path)
if __name__ == "__main__":
args = parse_config()
main(args)
#python llmerge.py --base_model /home/nano/textgen/models/mistralai_Mistral-7B-v0.1 --peft_model /home/nano/gpt/mistral-lyrics-pop-2048-v1.1/checkpoint-3900 --save_path /home/nano/textgen/models/mistral-lyrics-pop-3900
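After merging, it's worth confirming that the merged checkpoint loads and generates on its own, without PEFT in the loop. A rough sketch, assuming the --save_path from the example command above:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
merged_path = "/home/nano/textgen/models/mistral-lyrics-pop-3900" # --save_path from the command above
tok = AutoTokenizer.from_pretrained(merged_path, use_fast=False)
merged = AutoModelForCausalLM.from_pretrained(merged_path, torch_dtype=torch.float16, device_map="auto")
inputs = tok("Test input prompt here", return_tensors="pt").to(merged.device)
print(tok.decode(merged.generate(**inputs, max_new_tokens=64)[0], skip_special_tokens=True))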