# retriever.py

import json
import os
import time
from argparse import ArgumentParser
from glob import glob
from pathlib import Path

import torch
import torch.nn.functional as F
from config import TDArgs
from gen_pr_items import PR_ITEMS

from llama import Llama

from tokenizer import Tokenizer

# Absolute path of the directory that contains this module; the retriever
# writes its output JSON beneath this directory.
REPO_ROOT = Path(__file__).resolve().parent


class Retriever:
    """
    Embed each parsed PR item with the model and score it against the
    unit test index produced by an Indexer experiment.

    The constructor loads the index shards and builds the model;
    ``retrieve`` scores every item against every index row and writes the
    per-test mean cosine similarity to a JSON file.
    """

    def __init__(self, experiment_name, pr_parse_format):
        """
        Load the experiment's index artifacts and build the embedding model.

        Args:
            experiment_name: Name of the Indexer experiment. Artifacts are
                read from ``assets/<experiment_name>``, resolved against the
                current working directory.
            pr_parse_format: Key into ``PR_ITEMS`` selecting the callable
                that produces the items to embed.

        Raises:
            FileNotFoundError: If no index shard (``unittest_index_*.pt``)
                is found for the experiment.
            ValueError: If the number of index shards differs from the
                number of mapping files.
        """
        self.experiment_name = experiment_name
        self.config = TDArgs()
        assets_path = os.path.join("assets", self.experiment_name)
        self.items = PR_ITEMS[pr_parse_format]()
        self.output_filename = (
            f"{self.experiment_name}-{pr_parse_format.lower()}-output.json"
        )

        # Init Rank/Device
        try:
            self.local_rank = int(os.environ["LOCAL_RANK"])
            self.world_size = int(os.environ["WORLD_SIZE"])
        except KeyError:
            # LOCAL_RANK may not be set if torchrun/torchx is not being used
            self.local_rank = 0
            self.world_size = 1

        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Artifacts generated by the indexer, sorted so that the i-th shard
        # lines up with the i-th mapping file:
        # 1. Indexes of unittests (*.pt)
        # 2. Mapping from indices to unittest names (*.json)
        embeddings_files = sorted(glob(f"{assets_path}/unittest_index_*.pt"))
        mapping_files = sorted(
            glob(f"{assets_path}/unittest_index_mapping_*.json")
        )

        # Fail early with a clear message instead of an opaque torch.cat
        # error (no shards) or an IndexError / misaligned index (mismatch).
        if not embeddings_files:
            raise FileNotFoundError(
                f"No unittest index files found in {assets_path}"
            )
        if len(embeddings_files) != len(mapping_files):
            raise ValueError(
                f"Found {len(embeddings_files)} index files but "
                f"{len(mapping_files)} mapping files in {assets_path}"
            )

        # Read the artifact files and concatenate them into:
        # 1. self.embeddings - the entire index as a single pytorch tensor
        # 2. self.unittest_names - a single list where position i holds the
        #    test name used for row i of self.embeddings
        embeddings = []
        self.unittest_names = []
        for embeddings_file, mapping_file in zip(
            embeddings_files, mapping_files
        ):
            # map_location lets shards saved on a GPU load on CPU-only hosts.
            embeddings.append(
                torch.load(embeddings_file, map_location=self.device)
            )

            with open(mapping_file) as f:
                test_map = json.load(f)
            # NOTE(review): assumes test_map["mapping"] is a list of test
            # names in row order -- confirm against the indexer output.
            self.unittest_names.extend(test_map["mapping"])

        self.embeddings = torch.cat(embeddings).to(self.device)
        print(self.embeddings.shape)

        generator = Llama.build(
            ckpt_dir=os.path.expanduser(self.config.model_ckpt_dir),
            tokenizer_path=os.path.expanduser(self.config.tokenizer_path),
            max_seq_len=self.config.max_context_len,
            max_batch_size=self.config.max_batch_size,
            use_kv_cache=False,
            model_parallel_size=1,
        )
        self.model = generator.model.to(self.device)
        self.tokenizer = Tokenizer(self.config)

    def retrieve(self) -> None:
        """
        Score every item against the index and save the averaged result.

        Each item is tokenized, truncated/padded to ``max_context_len``, run
        through the model, sum-pooled over dimension 1, and compared to every
        index row with cosine similarity. Scores that share a test name are
        collected across rows and items and averaged; the resulting
        ``{test_name: mean_score}`` mapping is passed to ``save_outputs``.
        """
        self.model.eval()
        max_len = self.config.max_context_len
        pad_id = self.tokenizer.pad_id

        # test name -> every similarity score observed for that test
        mapping = {}

        # NOTE(review): the original author suspected autocast is required
        # for CPU inference with half-precision floats -- unconfirmed.
        with torch.autocast(self.device), torch.no_grad():
            for item in self.items:
                tokens = self.tokenizer.encode(item)[:max_len]

                # Fixed-width input: real tokens first, padding after.
                tensor = torch.full((1, max_len), pad_id, dtype=torch.long)
                tensor[0, : len(tokens)] = torch.tensor(
                    tokens, dtype=torch.long
                )
                # 1.0 where there is a real token, 0.0 on padding.
                attn_mask = torch.where(tensor == pad_id, 0.0, 1.0)

                _, embedding = self.model.forward(
                    tensor.to(self.device),
                    0,
                    output_last_hidden_state=True,
                    attn_mask=attn_mask.to(self.device),
                )
                pooled_embedding = torch.sum(embedding, dim=1)
                similarity = F.cosine_similarity(
                    self.embeddings, pooled_embedding
                )

                # One bulk transfer to Python floats instead of a
                # device-synchronizing .item() call per index row.
                for ind, score in enumerate(similarity.tolist()):
                    test = self.unittest_names[ind]
                    mapping.setdefault(test, []).append(score)

        # condense: average all scores collected for each test
        averaged = {
            test: sum(scores) / len(scores)
            for test, scores in mapping.items()
        }
        self.save_outputs(averaged)

    def save_outputs(self, mapping):
        """
        Write ``mapping`` as JSON to
        ``<REPO_ROOT>/assets/mappings/<self.output_filename>``.

        Args:
            mapping: JSON-serializable dict of test name to score.
        """
        # Create the directory the file is actually written to. The previous
        # code created "assets/mappings" relative to the current working
        # directory, which broke when the script was launched elsewhere.
        output_dir = REPO_ROOT / "assets" / "mappings"
        output_dir.mkdir(parents=True, exist_ok=True)
        with open(output_dir / self.output_filename, "w") as f:
            json.dump(mapping, f)
        print(f"Made output file assets/mappings/{self.output_filename}")


def main():
    """
    Command-line entry point.

    Parses ``--experiment-name`` and ``--pr-parse-format``, builds a
    ``Retriever`` for them, runs retrieval, and prints the elapsed time.
    """
    parser = ArgumentParser("Retriever")
    parser.add_argument(
        "--experiment-name",
        type=str,
        required=True,
        help="Uses artifacts from the specified Indexer Experiment",
    )
    parser.add_argument(
        "--pr-parse-format",
        type=str,
        choices=PR_ITEMS.keys(),
        required=True,
        help="Specify what method to parse information from a PR",
    )

    args = parser.parse_args()

    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    retriever = Retriever(args.experiment_name, args.pr_parse_format)
    retriever.retrieve()
    elapsed = time.perf_counter() - start

    print(f"Total time to retrieve: {elapsed} seconds")


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()