diff --git a/generate_instruction.py b/generate_instruction.py
index 9f7406b..8606891 100644
--- a/generate_instruction.py
+++ b/generate_instruction.py
@@ -24,8 +24,26 @@
 import utils
 import fire
 
-def encode_prompt(prompt_instructions):
-    """Encode multiple prompt instructions into a single string."""
+def encode_prompt(prompt_instructions: list) -> str:
+    """
+    Encode multiple prompt instructions into a single string.
+
+    This function reads a file named "prompt.txt" and appends its content to a string.
+    Then, it iterates over a list of prompt instructions, each containing an instruction, input, and output.
+    The function constructs a formatted string representation of each instruction, input, and output,
+    appending it to the prompt string.
+
+    Parameters
+    ----------
+    prompt_instructions : list of dict
+        A list containing dictionaries with keys "instruction", "input", and "output", representing
+        the instructions, inputs, and expected outputs for the prompt.
+
+    Returns
+    -------
+    prompt : str
+        A single string encoding all the prompt instructions along with the content of "prompt.txt".
+    """
     prompt = open("./prompt.txt").read() + "\n"
 
     for idx, task_dict in enumerate(prompt_instructions):
@@ -41,7 +59,23 @@ def encode_prompt(prompt_instructions):
     return prompt
 
 
-def post_process_gpt3_response(num_prompt_instructions, response):
+def post_process_gpt3_response(num_prompt_instructions: int, response: dict) -> list:
+    """
+    Post-processes the response generated by GPT-3, extracting formatted instructions, inputs, and outputs.
+
+    Parameters
+    ----------
+    num_prompt_instructions : int
+        The number of prompt instructions included in the prompt sent to GPT-3.
+
+    response : dict or None
+        The response generated by GPT-3, containing the text and finish reason.
+
+    Returns
+    -------
+    instructions : list of dict
+        A list of dictionaries representing formatted instructions, inputs, and outputs extracted from the GPT-3 response.
+    """
     if response is None:
         return []
     raw_instructions = f"{num_prompt_instructions+1}. Instruction:" + response["text"]
@@ -103,21 +137,73 @@ def post_process_gpt3_response(num_prompt_instructions, response):
     return instructions
 
 
-def find_word_in_string(w, s):
+def find_word_in_string(w: str, s: str):
+    """
+    Search for a word within a string, ignoring case and matching only on word boundaries.
+
+    Parameters
+    ----------
+    w : str
+        The word to search for within the string.
+
+    s : str
+        The string in which to search for the word.
+
+    Returns
+    -------
+    match : re.Match or None
+        A match object if the word is found within the string, otherwise None.
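+
+    Examples
+    --------
+    The word must stand alone between word boundaries; case is ignored:
+
+    >>> bool(find_word_in_string("list", "Sort the LIST in place."))
+    True
+    >>> find_word_in_string("list", "enlisted") is None
+    True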
Default is "./seed_tasks.jsonl". + + num_instructions_to_generate : int, optional + The number of instructions to generate. Default is 100. + + model_name : str, optional + The name of the GPT-3 model to use. Default is "text-davinci-003". + + num_prompt_instructions : int, optional + The number of prompt instructions to use for generating each instruction. Default is 3. + + request_batch_size : int, optional + The batch size for making requests to the GPT-3 model. Default is 5. + + temperature : float, optional + The temperature parameter for sampling from the model distribution. Default is 1.0. + + top_p : float, optional + The cumulative probability for nucleus sampling. Default is 1.0. + + num_cpus : int, optional + The number of CPUs to use for parallel processing. Default is 16. + + Returns + ------- + None + """ seed_tasks = [json.loads(l) for l in open(seed_tasks_path, "r")] seed_instruction_data = [ {"instruction": t["instruction"], "input": t["instances"][0]["input"], "output": t["instances"][0]["output"]} @@ -209,7 +295,18 @@ def generate_instruction_following_data( utils.jdump(machine_instruction_data, os.path.join(output_dir, "regen.json")) -def main(task, **kwargs): +def main(task:str, **kwargs): + """ + Main function for executing specific tasks. + + Parameters + ---------- + task : str + The name of the task to execute. + + **kwargs : dict + Additional keyword arguments specific to the task. + """ globals()[task](**kwargs) diff --git a/train.py b/train.py index 2b5a98b..925e09f 100644 --- a/train.py +++ b/train.py @@ -67,9 +67,25 @@ def smart_tokenizer_and_embedding_resize( tokenizer: transformers.PreTrainedTokenizer, model: transformers.PreTrainedModel, ): - """Resize tokenizer and embedding. + """ + Resize tokenizer and embedding. - Note: This is the unoptimized version that may make your embedding size not be divisible by 64. + This is the unoptimized version that may make your embedding size not be divisible by 64. + + Parameters + ---------- + special_tokens_dict : Dict + A dictionary containing special tokens to be added to the tokenizer. + + tokenizer : transformers.PreTrainedTokenizer + The tokenizer instance. + + model : transformers.PreTrainedModel + The model instance. + + Returns + ------- + None """ num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict) model.resize_token_embeddings(len(tokenizer)) @@ -86,7 +102,22 @@ def smart_tokenizer_and_embedding_resize( def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict: - """Tokenize a list of strings.""" + """ + Tokenize a list of strings. + + Parameters + ---------- + strings : Sequence[str] + A list of strings to tokenize. + + tokenizer : transformers.PreTrainedTokenizer + The tokenizer instance. + + Returns + ------- + Dict + A dictionary containing the tokenized input ids and their corresponding lengths. + """ tokenized_list = [ tokenizer( text, @@ -114,7 +145,25 @@ def preprocess( targets: Sequence[str], tokenizer: transformers.PreTrainedTokenizer, ) -> Dict: - """Preprocess the data by tokenizing.""" + """ + Preprocess the data by tokenizing. + + Parameters + ---------- + sources : Sequence[str] + A sequence of source strings. + + targets : Sequence[str] + A sequence of target strings. + + tokenizer : transformers.PreTrainedTokenizer + The tokenizer instance. + + Returns + ------- + Dict + A dictionary containing the tokenized input ids and labels. 
+ """ examples = [s + t for s, t in zip(sources, targets)] examples_tokenized, sources_tokenized = [_tokenize_fn(strings, tokenizer) for strings in (examples, sources)] input_ids = examples_tokenized["input_ids"] @@ -128,6 +177,17 @@ class SupervisedDataset(Dataset): """Dataset for supervised fine-tuning.""" def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer): + """ + Initialize the SupervisedDataset. + + Parameters + ---------- + data_path : str + The path to the data file. + + tokenizer : transformers.PreTrainedTokenizer + The tokenizer instance. + """ super(SupervisedDataset, self).__init__() logging.warning("Loading data...") list_data_dict = utils.jload(data_path) @@ -146,20 +206,60 @@ class SupervisedDataset(Dataset): self.input_ids = data_dict["input_ids"] self.labels = data_dict["labels"] - def __len__(self): + def __len__(self) -> int: + """ + Return the length of the dataset. + + Parameters + ---------- + None + + Returns + ------- + int + Length of the dataset. + """ return len(self.input_ids) def __getitem__(self, i) -> Dict[str, torch.Tensor]: + """ + Get an item from the dataset. + + Parameters + ---------- + i : int + Index of the item to retrieve. + + Returns + ------- + Dict[str, torch.Tensor] + A dictionary containing the input_ids and labels tensors for the specified item. + """ return dict(input_ids=self.input_ids[i], labels=self.labels[i]) @dataclass class DataCollatorForSupervisedDataset(object): - """Collate examples for supervised fine-tuning.""" + """ + Collate examples for supervised fine-tuning. + """ tokenizer: transformers.PreTrainedTokenizer def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: + """ + Collate examples. + + Parameters + ---------- + instances : Sequence[Dict] + A sequence of examples. + + Returns + ------- + Dict[str, torch.Tensor] + A dictionary containing the input ids, labels, and attention mask. + """ input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels")) input_ids = torch.nn.utils.rnn.pad_sequence( input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id @@ -173,13 +273,41 @@ class DataCollatorForSupervisedDataset(object): def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, data_args) -> Dict: - """Make dataset and collator for supervised fine-tuning.""" + """ + Make dataset and collator for supervised fine-tuning. + + Parameters + ---------- + tokenizer : transformers.PreTrainedTokenizer + The tokenizer instance. + + data_args : Any + Additional data arguments. + + Returns + ------- + Dict + A dictionary containing the train dataset, evaluation dataset, and data collator. + """ train_dataset = SupervisedDataset(tokenizer=tokenizer, data_path=data_args.data_path) data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer) return dict(train_dataset=train_dataset, eval_dataset=None, data_collator=data_collator) def train(): + """ + Train the model. + + Parses model, data, and training arguments, initializes the model and tokenizer, preprocesses the data, and then trains the model. 
+    """
     train_dataset = SupervisedDataset(tokenizer=tokenizer, data_path=data_args.data_path)
     data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
     return dict(train_dataset=train_dataset, eval_dataset=None, data_collator=data_collator)
 
 
 def train():
+    """
+    Train the model.
+
+    Parses model, data, and training arguments, initializes the model and tokenizer,
+    preprocesses the data, and then trains the model.
+
+    Returns
+    -------
+    None
+    """
     parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
     model_args, data_args, training_args = parser.parse_args_into_dataclasses()
diff --git a/utils.py b/utils.py
index 0d47b5f..4256134 100644
--- a/utils.py
+++ b/utils.py
@@ -131,6 +131,22 @@ def openai_completion(
 def _make_w_io_base(f, mode: str):
+    """
+    Helper that returns `f` as a writable file object: if `f` is not already an
+    IOBase object, any missing parent directories are created and the path is
+    opened in the given mode.
+
+    Parameters
+    ----------
+    f : str or IOBase
+        The file path or IOBase object.
+
+    mode : str
+        The mode for opening the file.
+
+    Returns
+    -------
+    f : IOBase
+        The file object opened in the specified mode.
+    """
     if not isinstance(f, io.IOBase):
         f_dirname = os.path.dirname(f)
         if f_dirname != "":
@@ -140,20 +156,51 @@
 def _make_r_io_base(f, mode: str):
+    """
+    Helper that returns `f` as a readable file object: if `f` is not already an
+    IOBase object, the path is opened in the given mode.
+
+    Parameters
+    ----------
+    f : str or IOBase
+        The file path or IOBase object.
+
+    mode : str
+        The mode for opening the file.
+
+    Returns
+    -------
+    f : IOBase
+        The file object opened in the specified mode.
+    """
     if not isinstance(f, io.IOBase):
         f = open(f, mode=mode)
     return f
 
 
-def jdump(obj, f, mode="w", indent=4, default=str):
-    """Dump a str or dictionary to a file in json format.
+def jdump(obj, f, mode: str = "w", indent: int = 4, default=str) -> None:
+    """
+    Dump a string or dictionary to a file in JSON format.
 
-    Args:
-        obj: An object to be written.
-        f: A string path to the location on disk.
-        mode: Mode for opening the file.
-        indent: Indent for storing json dictionaries.
-        default: A function to handle non-serializable entries; defaults to `str`.
+    Parameters
+    ----------
+    obj : object
+        An object to be written.
+
+    f : str or IOBase
+        A string path to the location on disk or an IOBase object.
+
+    mode : str, optional
+        The mode for opening the file. Default is "w".
+
+    indent : int, optional
+        Indent for storing JSON dictionaries. Default is 4.
+
+    default : function, optional
+        A function to handle non-serializable entries; defaults to `str`.
+
+    Returns
+    -------
+    None
     """
     f = _make_w_io_base(f, mode)
     if isinstance(obj, (dict, list)):
@@ -165,8 +212,23 @@ def jdump(obj, f, mode="w", indent=4, default=str):
     f.close()
 
 
-def jload(f, mode="r"):
-    """Load a .json file into a dictionary."""
+def jload(f, mode: str = "r") -> dict:
+    """
+    Load a .json file into a dictionary.
+
+    Parameters
+    ----------
+    f : str or IOBase
+        A string path to the JSON file or an IOBase object.
+
+    mode : str, optional
+        The mode for opening the file. Default is "r".
+
+    Returns
+    -------
+    jdict : dict
+        The dictionary loaded from the JSON file.
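+
+    Examples
+    --------
+    Round-trips with `jdump` (the temporary path is illustrative)::
+
+        jdump({"instruction": "Say hi.", "output": "Hi!"}, "/tmp/example.json")
+        jload("/tmp/example.json")  # -> {'instruction': 'Say hi.', 'output': 'Hi!'}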
+    """
     f = _make_r_io_base(f, mode)
     jdict = json.load(f)
     f.close()
diff --git a/weight_diff.py b/weight_diff.py
index e5ae61f..5491abc 100644
--- a/weight_diff.py
+++ b/weight_diff.py
@@ -25,11 +25,33 @@
 from train import smart_tokenizer_and_embedding_resize
 
 
 def make_diff(
     path_raw: str, path_tuned: str, path_diff: str, device="cpu",  # "cuda" or "cpu"
 ):
-    """Make the weight diff.
+    """
+    Make the weight difference between two pre-trained models.
 
-    This function is given to present full transparency of how the weight diff was created.
+    This function is provided to ensure full transparency of how the weight difference was created.
 
-    Run:
+    Parameters
+    ----------
+    path_raw : str
+        The path to the directory or file containing the weights of the raw model.
+
+    path_tuned : str
+        The path to the directory or file containing the weights of the tuned model.
+
+    path_diff : str
+        The path to save the weight difference.
+
+    device : str, optional
+        The device to run the model on. Default is "cpu".
+
+    Raises
+    ------
+    FileNotFoundError
+        If the specified paths do not exist.
+
+    Notes
+    -----
+    Run the following command to execute the function:
         python weight_diff.py make_diff --path_raw --path_tuned --path_diff
     """
     model_tuned: transformers.PreTrainedModel = transformers.AutoModelForCausalLM.from_pretrained(