From f4671386c1c43b07034c688cdca117dfa5442498 Mon Sep 17 00:00:00 2001
From: Michael Kranzlein <8162250+mkranzlein@users.noreply.github.com>
Date: Sun, 18 Feb 2024 15:03:15 -0500
Subject: [PATCH] Added notebook outlining inference on new data

---
 scripts/notebooks/model_inference.ipynb | 147 ++++++++++++++++++++++++
 1 file changed, 147 insertions(+)
 create mode 100644 scripts/notebooks/model_inference.ipynb

diff --git a/scripts/notebooks/model_inference.ipynb b/scripts/notebooks/model_inference.ipynb
new file mode 100644
index 0000000..725db8a
--- /dev/null
+++ b/scripts/notebooks/model_inference.ipynb
@@ -0,0 +1,147 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%cd -q ../.."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Using device: cuda\n"
+     ]
+    }
+   ],
+   "source": [
+    "import json\n",
+    "\n",
+    "import torch\n",
+    "from transformers import BertTokenizerFast\n",
+    "\n",
+    "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
+    "print('Using device:', device)\n",
+    "\n",
+    "# Load models and tokenizer; map_location places the checkpoints on `device`, so files saved from a GPU session also open on CPU-only machines.\n",
+    "sentence_model = torch.load(\"models/curiam/sentence_level_model_nohipool.pt\", map_location=device)\n",
+    "token_model = torch.load(\"models/curiam/working_model_nohipool.pt\", map_location=device)\n",
+    "bert_tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', do_lower_case=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "with open(\"data/curiam.json\", \"r\", encoding=\"utf-8\") as f:\n",
+    "    json_data = json.load(f)\n",
+    "\n",
+    "# Build `documents` from the parsed JSON:\n",
+    "#   documents[d][s] is the list of lowercased token strings of sentence s in document d.\n",
+    "# Only each token's \"text\" field is read in this cell.\n",
+    "documents = []\n",
+    "\n",
+    "for raw_document in json_data:\n",
+    "    doc_sentences = []\n",
+    "    for sentence in raw_document[\"sentences\"]:\n",
+    "        doc_sentences.append([token[\"text\"].lower() for token in sentence[\"tokens\"]])\n",
+    "    documents.append(doc_sentences)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Sentence: This is a sentence\n",
+      "Token               FT  MC  DQ  LeS \n",
+      "[CLS]               N   N   N   N   \n",
+      "this                N   N   N   N   \n",
+      "is                  N   N   N   N   \n",
+      "a                   N   N   N   N   \n",
+      "sentence            N   N   N   N   \n",
+      "[SEP]               N   N   N   N   \n"
+     ]
+    }
+   ],
+   "source": [
+    "def predict_sentence_toks(sentence: list[str]):\n",
+    "    \"\"\"Print a table with one row per tokenizer token (special tokens included) and a Y/N flag for each of FT, MC, DQ, LeS.\"\"\"\n",
+    "    y = bert_tokenizer(sentence, is_split_into_words=True, return_attention_mask=True, return_token_type_ids=True, add_special_tokens=True, return_tensors=\"pt\")\n",
+    "    # Send inputs to the notebook-level `device` instead of calling .cuda(), so this also runs without a GPU.\n",
+    "    with torch.no_grad():  # inference only, no gradients needed\n",
+    "        output = token_model(y[\"input_ids\"].to(device), mask=y[\"attention_mask\"].to(device), token_type_ids=y[\"token_type_ids\"].to(device))\n",
+    "    sigmoid_outputs = torch.sigmoid(output)\n",
+    "    print(\"Sentence:\", \" \".join(sentence))\n",
+    "    print(f\"{'Token':<20}{'FT':<4}{'MC':<4}{'DQ':<4}{'LeS':<4}\")\n",
+    "    for token, preds in zip(bert_tokenizer.convert_ids_to_tokens(y[\"input_ids\"][0]), sigmoid_outputs[0]):\n",
+    "        # One flag per label: Y when the sigmoid score is above 0.5, otherwise N.\n",
+    "        flags = [\"Y\" if pred > .5 else \"N\" for pred in preds]\n",
+    "        print(f\"{token:<20}\" + \"\".join(f\"{flag:<4}\" for flag in flags))\n",
+    "\n",
+    "\n",
+    "sample_sentence = [\"This\", \"is\", \"a\", \"sentence\"]\n",
+    "predict_sentence_toks(sample_sentence)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def predict_meta_sentence(sample):\n",
+    "    \"\"\"Print the sentence model's Y/N prediction for each of the FT, MC, DQ and LeS labels of `sample`, a list of word strings.\"\"\"\n",
+    "    y = bert_tokenizer(sample, is_split_into_words=True, return_attention_mask=True, return_token_type_ids=True, add_special_tokens=True, return_tensors=\"pt\")\n",
+    "    # Send inputs to the notebook-level `device` instead of calling .cuda(), so this also runs without a GPU.\n",
+    "    with torch.no_grad():  # inference only, no gradients needed\n",
+    "        output = sentence_model(\n",
+    "            y[\"input_ids\"].to(device), mask=y[\"attention_mask\"].to(device), token_type_ids=y[\"token_type_ids\"].to(device)\n",
+    "        )\n",
+    "    sigmoid_outputs = torch.sigmoid(output)\n",
+    "    print(' '.join(sample))\n",
+    "    # Fixed-width columns, the same formatting approach predict_sentence_toks uses for its table.\n",
+    "    print(f\"{'FT':<4}{'MC':<4}{'DQ':<4}{'LeS':<4}\")\n",
+    "    # NOTE(review): the cutoff is inclusive (>= .5) here but exclusive (> .5) in predict_sentence_toks; confirm which is intended.\n",
+    "    flags = [\"Y\" if pred >= .5 else \"N\" for pred in sigmoid_outputs[0]]\n",
+    "    print(\"\".join(f\"{flag:<4}\" for flag in flags))\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "hipool",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.18"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}