Fine Tuning Mistral 7B

Can you train new or forbidden knowledge into an LLM? Let’s find out as I throw 1 gigabyte of scraped, cleaned, plaintext KiwiFarms posts at Mistral 7B. I go over my experience fine-tuning Mistral 7B on a few large datasets of scraped text, including English-language song lyrics and a huge KiwiFarms post dataset.

Video Link: https://youtu.be/9bl1mJImj10

Video Resources

Jupyter Notebook .ipynb Uploaded to GDrive

https://drive.google.com/file/d/1mnew-Y1DQ0Z7AGxulF04Xur1w7SHhj3q/view?usp=sharing

Finetuning LLMs with LoRA and QLoRA: Insights from Hundreds of Experiments by Sebastian Raschka
https://lightning.ai/pages/community/lora-insights/


Can LLMs learn from a single example?
https://www.fast.ai/posts/2023-09-04-learning-jumps/

LM Evaluation Harness
https://github.com/EleutherAI/lm-evaluation-harness


Convert with Calibre
https://gist.github.com/rohshall/8980b8f73374c767dbe0a82bcf8ae86c

Calibre
https://calibre-ebook.com/
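
The two Calibre links above are for turning ebooks into plaintext training data. As a minimal sketch (assuming Calibre is installed and ebook-convert is on your PATH; the input directory is made up for illustration), batch conversion from Python looks roughly like this:

import glob
import subprocess
from pathlib import Path

# Convert every EPUB in a folder to plaintext with Calibre's ebook-convert CLI
for epub in glob.glob("/mnt/d/books/*.epub"):  # illustrative input directory
    txt = str(Path(epub).with_suffix(".txt"))
    subprocess.run(["ebook-convert", epub, txt], check=True)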

Unstructured IO
https://github.com/Unstructured-IO

QLoRA
https://github.com/artidoro/qlora

PEFT
https://github.com/huggingface/peft

Bitsandbytes
https://github.com/TimDettmers/bitsandbytes

Original LongLoRA merge script
https://github.com/dvlab-research/LongLoRA/blob/main/merge_lora_weights_and_save_hf_model.py

OpenLLM Leaderboard
https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard

LM Eval Harness example command:
python main.py --model hf-causal-experimental --model_args pretrained="/home/nano/textgen/models/mistral-books-br-2048-v2-7300",low_cpu_mem_usage=True,load_in_4bit=True,bnb_4bit_use_double_quant=True,bnb_4bit_quant_type="nf4",bnb_4bit_compute_dtype=bfloat16 --tasks arithmetic_2ds,arithmetic_4ds,truthfulqa_mc --batch_size 8 --num_fewshot 0 --output_path "/home/nano/textgen/models/mistral-books-br-2048-v2-7300-arith-truthfulqa_mc.json"

Text Generation WebUI
https://github.com/oobabooga/text-generation-webui
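
The QLoRA, PEFT, and bitsandbytes repos above are the pieces that actually do the 4-bit fine-tuning. As a minimal sketch of how they fit together (the rank, alpha, and dropout values here are illustrative, not necessarily the exact settings from my runs), loading Mistral 7B in 4-bit NF4 and attaching a LoRA adapter looks like this:

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# 4-bit NF4 quantization with double quantization, computing in bfloat16 (QLoRA-style)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    quantization_config=bnb_config,
    device_map="auto",
)
model = prepare_model_for_kbit_training(model)

# Attach a LoRA adapter to the attention projections; r/alpha/dropout are illustrative
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # only the adapter weights train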

Code for Jupyter Notebook (because WordPress won’t allow me to upload a .ipynb, and I don’t feel like figuring out how to fix that right now)

LoRA-Mistral0.1b

Convert PDF to Text (OCR Method) using Unstructured

import glob
import numpy as np
import pandas as pd
from unstructured.cleaners.core import clean
from unstructured.partition.auto import partition

all_pdfs = glob.glob("/mnt/d/gametheory/*.pdf")
print("Total number of files: ", len(all_pdfs))

# Partition each PDF into elements, clean each element, and collect one row of text per element
lst = []
for pdf in all_pdfs:
    elements = partition(pdf, content_type="application/pdf")
    lst.extend(clean(str(el)) for el in elements)
file_data = pd.DataFrame(lst, columns=["text"])

val_frac = 0.01  # fraction of the data to hold out for validation
val_split_idx = int(len(file_data) * val_frac)  # index on which to split
data_idx = list(range(len(file_data)))  # list of row indices

#np.random.shuffle(data_idx)

# get indexes for validation and train
val_idx, train_idx = data_idx[:val_split_idx], data_idx[val_split_idx:]
print('len of train: ', len(train_idx))
print('len of val: ', len(val_idx))

# create the sets, dropping empty rows before writing each CSV
train = file_data.iloc[train_idx].reset_index(drop=True)
print(train.info(verbose=False, memory_usage="deep"))
train.replace('', np.nan, inplace=True)
train.dropna(inplace=True)
train.to_csv("/mnt/d/gametheory/train-gt.csv")

val = file_data.iloc[val_idx].reset_index(drop=True)
print(val.info(verbose=False, memory_usage="deep"))
val.replace('', np.nan, inplace=True)
val.dropna(inplace=True)
val.to_csv("/mnt/d/gametheory/val-gt.csv")
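
These train/val CSVs are what the fine-tune consumes. As a hedged sketch of the next step (the Hugging Face datasets library isn’t shown in the notebook cell above, and the 2048 here just matches the context length in my checkpoint names), loading and tokenizing them looks like this:

from datasets import load_dataset
from transformers import AutoTokenizer

# Load the CSVs written above; each row is one cleaned text element
ds = load_dataset(
    "csv",
    data_files={
        "train": "/mnt/d/gametheory/train-gt.csv",
        "validation": "/mnt/d/gametheory/val-gt.csv",
    },
)

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

def tokenize(batch):
    # Truncate to the training context length (2048, as in the checkpoint names)
    return tokenizer(batch["text"], truncation=True, max_length=2048)

tokenized = ds.map(tokenize, batched=True, remove_columns=ds["train"].column_names)
print(tokenized)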

Merge LoRA Script

# Written by Yukang Chen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import torch
import argparse
import transformers
from peft import PeftModel
from typing import Dict

IGNORE_INDEX = -100
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"

def parse_config():
    parser = argparse.ArgumentParser(description='arg parser')
    parser.add_argument('--base_model', type=str, default="/data/pretrained-models/llama-7b-hf")
    parser.add_argument('--peft_model', type=str, default=None, help='path to the LoRA/PEFT adapter checkpoint')
    parser.add_argument('--context_size', type=int, default=-1, help='context size during fine-tuning')
    parser.add_argument('--save_path', type=str, default=None, help='where to write the merged model')
    parser.add_argument('--cache_dir', type=str, default=None, help='./cache_dir')
    args = parser.parse_args()
    return args

def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
    model: transformers.PreTrainedModel,
):
    """Resize tokenizer and embedding.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
    """
    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    if num_new_tokens > 0:
        input_embeddings = model.get_input_embeddings().weight.data
        output_embeddings = model.get_output_embeddings().weight.data

        input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
        output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)

        input_embeddings[-num_new_tokens:] = input_embeddings_avg
        output_embeddings[-num_new_tokens:] = output_embeddings_avg

def main(args):
    device = "cuda:0"
    torch.cuda.set_device(device)

    print("base model", args.base_model)
    print("peft model", args.peft_model)

    # Load model and tokenizer
    model = transformers.AutoModelForCausalLM.from_pretrained(
        args.base_model,
        cache_dir=args.cache_dir,
        torch_dtype=torch.float16,
        device_map={"": "cpu"}
    )

    tokenizer = transformers.AutoTokenizer.from_pretrained(
        args.base_model,
        cache_dir=args.cache_dir,
        model_max_length=args.context_size,
        padding_side="right",
        use_fast=False,
    )
    special_tokens_dict = dict()
    if tokenizer.pad_token is None:
        special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
    if tokenizer.eos_token is None:
        special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
    if tokenizer.bos_token is None:
        special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN
    if tokenizer.unk_token is None:
        special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN

    smart_tokenizer_and_embedding_resize(
        special_tokens_dict=special_tokens_dict,
        tokenizer=tokenizer,
        model=model,
    )

    trainable_params = os.path.join(args.peft_model, "trainable_params.bin")
    if os.path.isfile(trainable_params):
        model.load_state_dict(torch.load(trainable_params, map_location=model.device), strict=False)
    model = PeftModel.from_pretrained(
        model,
        args.peft_model,
        #device_map="auto",
        torch_dtype=torch.float16,
        device_map={"": "cpu"}
    )
    model = model.merge_and_unload()
    model.save_pretrained(args.save_path, max_shard_size="400MB")
    tokenizer.save_pretrained(args.save_path)

if __name__ == "__main__":
    args = parse_config()
    main(args)
#python llmerge.py --base_model /home/nano/textgen/models/mistralai_Mistral-7B-v0.1 --peft_model /home/nano/gpt/mistral-lyrics-pop-2048-v1.1/checkpoint-3900 --save_path /home/nano/textgen/models/mistral-lyrics-pop-3900
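
To sanity-check a merge before pointing Text Generation WebUI at the folder, something like this works (the save path reuses the example command above; the prompt is made up):

import torch
import transformers

merged = "/home/nano/textgen/models/mistral-lyrics-pop-3900"

# Load the merged model and tokenizer and generate a few tokens
model = transformers.AutoModelForCausalLM.from_pretrained(
    merged, torch_dtype=torch.float16, device_map="auto"
)
tokenizer = transformers.AutoTokenizer.from_pretrained(merged)
inputs = tokenizer("The song begins", return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(out[0], skip_special_tokens=True))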
