In-code documentation and linting update
Dear Developers,

I'm pleased to inform you that I have completed the documentation update for the Python scripts. The updated documentation provides clear explanations of function parameters, return types, and expected behavior. It also adheres to consistent formatting and organization, ensuring ease of understanding for both current and future developers.

Please review the updated documentation at your earliest convenience. If you have any feedback or suggestions for further improvements, please don't hesitate to let me know.

Thank you for your attention to this matter.

Best regards,
Louis Brulé Naudet
This commit is contained in:
parent
761dc5bfbd
commit
4187a25937
--- a/generate_instruction.py
+++ b/generate_instruction.py
@@ -24,8 +24,26 @@ import utils
 import fire
 
 
-def encode_prompt(prompt_instructions):
-    """Encode multiple prompt instructions into a single string."""
+def encode_prompt(prompt_instructions: list) -> str:
+    """
+    Encode multiple prompt instructions into a single string.
+
+    This function reads a file named "prompt.txt" and appends its content to a string.
+    Then, it iterates over a list of prompt instructions, each containing an instruction, input, and output.
+    The function constructs a formatted string representation of each instruction, input, and output,
+    appending it to the prompt string.
+
+    Parameters
+    ----------
+    prompt_instructions : list of dict
+        A list containing dictionaries with keys "instruction", "input", and "output", representing
+        the instructions, inputs, and expected outputs for the prompt.
+
+    Returns
+    -------
+    prompt : str
+        A single string encoding all the prompt instructions along with the content of "prompt.txt".
+    """
     prompt = open("./prompt.txt").read() + "\n"
 
     for idx, task_dict in enumerate(prompt_instructions):
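For orientation, a minimal usage sketch of the annotated function; the instruction values below are illustrative, and encode_prompt reads ./prompt.txt from the working directory:

    prompt_instructions = [
        {"instruction": "Name three primary colors.", "input": "", "output": "Red, yellow, and blue."},
    ]
    prompt = encode_prompt(prompt_instructions)  # prepends the contents of ./prompt.txt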
@@ -41,7 +59,23 @@ def encode_prompt(prompt_instructions):
     return prompt
 
 
-def post_process_gpt3_response(num_prompt_instructions, response):
+def post_process_gpt3_response(num_prompt_instructions: int, response: dict) -> list:
+    """
+    Post-processes the response generated by GPT-3, extracting formatted instructions, inputs, and outputs.
+
+    Parameters
+    ----------
+    num_prompt_instructions : int
+        The number of prompt instructions used in the GPT-3 response.
+
+    response : dict or None
+        The response generated by GPT-3, containing the text and finish reason.
+
+    Returns
+    -------
+    instructions : list of dict
+        A list of dictionaries representing formatted instructions, inputs, and outputs extracted from the GPT-3 response.
+    """
     if response is None:
         return []
     raw_instructions = f"{num_prompt_instructions+1}. Instruction:" + response["text"]
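A hedged sketch of driving this post-processing step; the response shape (a dict carrying the completion text and finish reason) follows the docstring above, and the field values are illustrative:

    response = {"text": "...model completion...", "finish_reason": "stop"}  # illustrative shape
    new_instructions = post_process_gpt3_response(num_prompt_instructions=3, response=response)
    # per the docstring, each entry is a dict with "instruction", "input", and "output" keys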
@@ -103,21 +137,73 @@ def post_process_gpt3_response(num_prompt_instructions, response):
     return instructions
 
 
-def find_word_in_string(w, s):
+def find_word_in_string(w: str, s: str):
+    """
+    Search for a word within a string, ignoring case and word boundaries.
+
+    Parameters
+    ----------
+    w : str
+        The word to search for within the string.
+
+    s : str
+        The string in which to search for the word.
+
+    Returns
+    -------
+    match : re.Match or None
+        A match object if the word is found within the string, otherwise None.
+    """
     return re.compile(r"\b({0})\b".format(w), flags=re.IGNORECASE).search(s)
 
 
 def generate_instruction_following_data(
-    output_dir="./",
-    seed_tasks_path="./seed_tasks.jsonl",
-    num_instructions_to_generate=100,
-    model_name="text-davinci-003",
-    num_prompt_instructions=3,
-    request_batch_size=5,
-    temperature=1.0,
-    top_p=1.0,
-    num_cpus=16,
+    output_dir: str = "./",
+    seed_tasks_path: str = "./seed_tasks.jsonl",
+    num_instructions_to_generate: int = 100,
+    model_name: str = "text-davinci-003",
+    num_prompt_instructions: int = 3,
+    request_batch_size: int = 5,
+    temperature: float = 1.0,
+    top_p: float = 1.0,
+    num_cpus: int = 16,
 ):
+    """
+    Generate instructions following the provided seed data using the GPT-3 model.
+
+    Parameters
+    ----------
+    output_dir : str, optional
+        The directory where the generated instructions will be saved. Default is "./".
+
+    seed_tasks_path : str, optional
+        The path to the file containing seed tasks in JSONL format. Default is "./seed_tasks.jsonl".
+
+    num_instructions_to_generate : int, optional
+        The number of instructions to generate. Default is 100.
+
+    model_name : str, optional
+        The name of the GPT-3 model to use. Default is "text-davinci-003".
+
+    num_prompt_instructions : int, optional
+        The number of prompt instructions to use for generating each instruction. Default is 3.
+
+    request_batch_size : int, optional
+        The batch size for making requests to the GPT-3 model. Default is 5.
+
+    temperature : float, optional
+        The temperature parameter for sampling from the model distribution. Default is 1.0.
+
+    top_p : float, optional
+        The cumulative probability for nucleus sampling. Default is 1.0.
+
+    num_cpus : int, optional
+        The number of CPUs to use for parallel processing. Default is 16.
+
+    Returns
+    -------
+    None
+    """
     seed_tasks = [json.loads(l) for l in open(seed_tasks_path, "r")]
     seed_instruction_data = [
         {"instruction": t["instruction"], "input": t["instances"][0]["input"], "output": t["instances"][0]["output"]}
@@ -209,7 +295,18 @@ def generate_instruction_following_data(
     utils.jdump(machine_instruction_data, os.path.join(output_dir, "regen.json"))
 
 
-def main(task, **kwargs):
+def main(task: str, **kwargs):
+    """
+    Main function for executing specific tasks.
+
+    Parameters
+    ----------
+    task : str
+        The name of the task to execute.
+
+    **kwargs : dict
+        Additional keyword arguments specific to the task.
+    """
     globals()[task](**kwargs)
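Since the script imports fire and main dispatches on a task name via globals(), the usual entry point (assumed here, as it lies outside this hunk) makes the pipeline callable from the shell:

    if __name__ == "__main__":
        fire.Fire(main)
        # e.g.: python generate_instruction.py generate_instruction_following_data --num_instructions_to_generate=100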
--- a/train.py
+++ b/train.py
@@ -67,9 +67,25 @@ def smart_tokenizer_and_embedding_resize(
     tokenizer: transformers.PreTrainedTokenizer,
     model: transformers.PreTrainedModel,
 ):
-    """Resize tokenizer and embedding.
+    """
+    Resize tokenizer and embedding.
 
-    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
+    This is the unoptimized version that may make your embedding size not be divisible by 64.
+
+    Parameters
+    ----------
+    special_tokens_dict : Dict
+        A dictionary containing special tokens to be added to the tokenizer.
+
+    tokenizer : transformers.PreTrainedTokenizer
+        The tokenizer instance.
+
+    model : transformers.PreTrainedModel
+        The model instance.
+
+    Returns
+    -------
+    None
     """
     num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
     model.resize_token_embeddings(len(tokenizer))
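For context, a minimal sketch of how such a resize is typically invoked; the [PAD] token is an assumed example, not part of this diff:

    special_tokens_dict = dict(pad_token="[PAD]")  # assumed special token, for illustration
    smart_tokenizer_and_embedding_resize(
        special_tokens_dict=special_tokens_dict,
        tokenizer=tokenizer,
        model=model,
    )
    # the embedding matrix is resized to match the new len(tokenizer)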
@@ -86,7 +102,22 @@ def smart_tokenizer_and_embedding_resize(
 
 
 def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
-    """Tokenize a list of strings."""
+    """
+    Tokenize a list of strings.
+
+    Parameters
+    ----------
+    strings : Sequence[str]
+        A list of strings to tokenize.
+
+    tokenizer : transformers.PreTrainedTokenizer
+        The tokenizer instance.
+
+    Returns
+    -------
+    Dict
+        A dictionary containing the tokenized input ids and their corresponding lengths.
+    """
     tokenized_list = [
         tokenizer(
             text,
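A usage sketch under the docstring's contract (tokenized input ids plus their lengths); the input strings are made up:

    tokenized = _tokenize_fn(["Hello world.", "A longer example sentence."], tokenizer)
    input_ids = tokenized["input_ids"]  # one sequence of token ids per input string
    # the per-example lengths let later code mask out prompt tokens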
@@ -114,7 +145,25 @@ def preprocess(
     targets: Sequence[str],
     tokenizer: transformers.PreTrainedTokenizer,
 ) -> Dict:
-    """Preprocess the data by tokenizing."""
+    """
+    Preprocess the data by tokenizing.
+
+    Parameters
+    ----------
+    sources : Sequence[str]
+        A sequence of source strings.
+
+    targets : Sequence[str]
+        A sequence of target strings.
+
+    tokenizer : transformers.PreTrainedTokenizer
+        The tokenizer instance.
+
+    Returns
+    -------
+    Dict
+        A dictionary containing the tokenized input ids and labels.
+    """
     examples = [s + t for s, t in zip(sources, targets)]
     examples_tokenized, sources_tokenized = [_tokenize_fn(strings, tokenizer) for strings in (examples, sources)]
     input_ids = examples_tokenized["input_ids"]
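An illustrative call with made-up source/target strings; per the code above, each source is concatenated with its target before tokenization, and the returned dict carries the input ids and labels:

    sources = ["### Instruction:\nName three primary colors.\n\n### Response:"]  # illustrative prompt
    targets = ["Red, yellow, and blue."]
    data_dict = preprocess(sources, targets, tokenizer)
    # data_dict["input_ids"] and data_dict["labels"] are aligned per example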
@@ -128,6 +177,17 @@ class SupervisedDataset(Dataset):
     """Dataset for supervised fine-tuning."""
 
     def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer):
+        """
+        Initialize the SupervisedDataset.
+
+        Parameters
+        ----------
+        data_path : str
+            The path to the data file.
+
+        tokenizer : transformers.PreTrainedTokenizer
+            The tokenizer instance.
+        """
         super(SupervisedDataset, self).__init__()
         logging.warning("Loading data...")
         list_data_dict = utils.jload(data_path)
@@ -146,20 +206,60 @@ class SupervisedDataset(Dataset):
         self.input_ids = data_dict["input_ids"]
         self.labels = data_dict["labels"]
 
-    def __len__(self):
+    def __len__(self) -> int:
+        """
+        Return the length of the dataset.
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        int
+            Length of the dataset.
+        """
         return len(self.input_ids)
 
     def __getitem__(self, i) -> Dict[str, torch.Tensor]:
+        """
+        Get an item from the dataset.
+
+        Parameters
+        ----------
+        i : int
+            Index of the item to retrieve.
+
+        Returns
+        -------
+        Dict[str, torch.Tensor]
+            A dictionary containing the input_ids and labels tensors for the specified item.
+        """
         return dict(input_ids=self.input_ids[i], labels=self.labels[i])
 
 
 @dataclass
 class DataCollatorForSupervisedDataset(object):
-    """Collate examples for supervised fine-tuning."""
+    """
+    Collate examples for supervised fine-tuning.
+    """
 
     tokenizer: transformers.PreTrainedTokenizer
 
     def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
+        """
+        Collate examples.
+
+        Parameters
+        ----------
+        instances : Sequence[Dict]
+            A sequence of examples.
+
+        Returns
+        -------
+        Dict[str, torch.Tensor]
+            A dictionary containing the input ids, labels, and attention mask.
+        """
         input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
         input_ids = torch.nn.utils.rnn.pad_sequence(
             input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
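Putting the dataset and collator together with a plain DataLoader, as a sketch (the data path is a placeholder):

    from torch.utils.data import DataLoader

    dataset = SupervisedDataset(data_path="./alpaca_data.json", tokenizer=tokenizer)  # placeholder path
    collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
    loader = DataLoader(dataset, batch_size=8, collate_fn=collator)
    batch = next(iter(loader))  # dict of input_ids, labels, and attention_mask per the __call__ docstring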
@@ -173,13 +273,41 @@ class DataCollatorForSupervisedDataset(object):
 
 
 def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, data_args) -> Dict:
-    """Make dataset and collator for supervised fine-tuning."""
+    """
+    Make dataset and collator for supervised fine-tuning.
+
+    Parameters
+    ----------
+    tokenizer : transformers.PreTrainedTokenizer
+        The tokenizer instance.
+
+    data_args : Any
+        Additional data arguments.
+
+    Returns
+    -------
+    Dict
+        A dictionary containing the train dataset, evaluation dataset, and data collator.
+    """
     train_dataset = SupervisedDataset(tokenizer=tokenizer, data_path=data_args.data_path)
     data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
     return dict(train_dataset=train_dataset, eval_dataset=None, data_collator=data_collator)
 
 
 def train():
+    """
+    Train the model.
+
+    Parses model, data, and training arguments, initializes the model and tokenizer, preprocesses the data, and then trains the model.
+
+    Parameters
+    ----------
+    None
+
+    Returns
+    -------
+    None
+    """
     parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
     model_args, data_args, training_args = parser.parse_args_into_dataclasses()
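The returned dict is shaped to splat straight into a transformers.Trainer, which is presumably how train() consumes it further down; a sketch, with the argument objects coming from the HfArgumentParser call above:

    data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)
    trainer = transformers.Trainer(model=model, tokenizer=tokenizer, args=training_args, **data_module)
    trainer.train()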
--- a/utils.py
+++ b/utils.py
@@ -131,6 +131,22 @@ def openai_completion(
 
 
 def _make_w_io_base(f, mode: str):
+    """
+    Helper function to ensure a file is opened in write mode if it's not already an IOBase object.
+
+    Parameters
+    ----------
+    f : str or IOBase
+        The file path or IOBase object.
+
+    mode : str
+        The mode for opening the file.
+
+    Returns
+    -------
+    f : IOBase
+        The file object opened in the specified mode.
+    """
     if not isinstance(f, io.IOBase):
         f_dirname = os.path.dirname(f)
         if f_dirname != "":
@@ -140,20 +156,51 @@ def _make_w_io_base(f, mode: str):
 
 
 def _make_r_io_base(f, mode: str):
+    """
+    Helper function to ensure a file is opened in read mode if it's not already an IOBase object.
+
+    Parameters
+    ----------
+    f : str or IOBase
+        The file path or IOBase object.
+
+    mode : str
+        The mode for opening the file.
+
+    Returns
+    -------
+    f : IOBase
+        The file object opened in the specified mode.
+    """
     if not isinstance(f, io.IOBase):
         f = open(f, mode=mode)
     return f
 
 
-def jdump(obj, f, mode="w", indent=4, default=str):
-    """Dump a str or dictionary to a file in json format.
+def jdump(obj, f, mode="w", indent: int = 4, default=str) -> None:
+    """
+    Dump a string or dictionary to a file in JSON format.
 
-    Args:
-        obj: An object to be written.
-        f: A string path to the location on disk.
-        mode: Mode for opening the file.
-        indent: Indent for storing json dictionaries.
-        default: A function to handle non-serializable entries; defaults to `str`.
+    Parameters
+    ----------
+    obj : object
+        An object to be written.
+
+    f : str or IOBase
+        A string path to the location on disk or an IOBase object.
+
+    mode : str, optional
+        The mode for opening the file. Default is "w".
+
+    indent : int, optional
+        Indent for storing JSON dictionaries. Default is 4.
+
+    default : function, optional
+        A function to handle non-serializable entries; defaults to `str`.
+
+    Returns
+    -------
+    None
     """
     f = _make_w_io_base(f, mode)
     if isinstance(obj, (dict, list)):
@@ -165,8 +212,23 @@ def jdump(obj, f, mode="w", indent=4, default=str):
     f.close()
 
 
-def jload(f, mode="r"):
-    """Load a .json file into a dictionary."""
+def jload(f, mode: str = "r") -> dict:
+    """
+    Load a .json file into a dictionary.
+
+    Parameters
+    ----------
+    f : str or IOBase
+        A string path to the JSON file or an IOBase object.
+
+    mode : str, optional
+        The mode for opening the file. Default is "r".
+
+    Returns
+    -------
+    jdict : dict
+        The dictionary loaded from the JSON file.
+    """
     f = _make_r_io_base(f, mode)
     jdict = json.load(f)
     f.close()
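A round-trip sketch using the two helpers (the file name is illustrative):

    record = {"instruction": "Say hi.", "output": "Hi!"}
    jdump(record, "./sample.json")   # pretty-printed JSON, indent=4 by default
    loaded = jload("./sample.json")  # -> {"instruction": "Say hi.", "output": "Hi!"}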
--- a/weight_diff.py
+++ b/weight_diff.py
@@ -25,11 +25,33 @@ from train import smart_tokenizer_and_embedding_resize
 def make_diff(
     path_raw: str, path_tuned: str, path_diff: str, device="cpu",  # "cuda" or "cpu"
 ):
-    """Make the weight diff.
+    """
+    Make the weight difference between two pre-trained models.
 
-    This function is given to present full transparency of how the weight diff was created.
+    This function is provided to ensure full transparency of how the weight difference was created.
 
-    Run:
+    Parameters
+    ----------
+    path_raw : str
+        The path to the directory or file containing the weights of the raw model.
+
+    path_tuned : str
+        The path to the directory or file containing the weights of the tuned model.
+
+    path_diff : str
+        The path to save the weight difference.
+
+    device : str, optional
+        The device to run the model on. Default is "cpu".
+
+    Raises
+    ------
+    FileNotFoundError
+        If the specified paths do not exist.
+
+    Notes
+    -----
+    Run the following command to execute the function:
         python weight_diff.py make_diff --path_raw <your_path_raw> --path_tuned <your_path_tuned> --path_diff <your_path_diff>
     """
     model_tuned: transformers.PreTrainedModel = transformers.AutoModelForCausalLM.from_pretrained(