yandexdataschool · germanKoch · Jan 7, 2024
diff --git a/week02_classification/seminar.ipynb b/week02_classification/seminar.ipynb
@@ -301,13 +301,16 @@
    "source": [
     "UNK_IX, PAD_IX = map(token_to_id.get, [UNK, PAD])\n",
     "\n",
-    "def as_matrix(sequences, max_len=None):\n",
+    "def as_matrix(sequences, max_len=None, min_len=None):\n",
     "    \"\"\" Convert a list of tokens into a matrix with padding \"\"\"\n",
     "    if isinstance(sequences[0], str):\n",
     "        sequences = list(map(str.split, sequences))\n",
     "        \n",
     "    max_len = min(max(map(len, sequences)), max_len or float('inf'))\n",
-    "    \n",
+    "\n",
+    "    if min_len is not None and min_len > max_len:\n",
+    "        max_len = min_len\n",
+    "        \n",
     "    matrix = np.full((len(sequences), max_len), np.int32(PAD_IX))\n",
     "    for i,seq in enumerate(sequences):\n",
     "        row_ix = [token_to_id.get(word, UNK_IX) for word in seq[:max_len]]\n",
@@ -407,14 +410,14 @@
     "    return batch_tensors\n",
     "\n",
     "\n",
-    "def make_batch(data, max_len=None, word_dropout=0, device=device):\n",
+    "def make_batch(data, max_len=None, min_len=None, word_dropout=0, device=device):\n",
     "    \"\"\"\n",
     "    Creates a keras-friendly dict from the batch data.\n",
     "    :param word_dropout: replaces token index with UNK_IX with this probability\n",
     "    :returns: a dict with {'title' : int64[batch, title_max_len]\n",
     "    \"\"\"\n",
     "    batch = {}\n",
-    "    batch[\"Title\"] = as_matrix(data[\"Title\"].values, max_len)\n",
-    "    batch[\"FullDescription\"] = as_matrix(data[\"FullDescription\"].values, max_len)\n",
+    "    batch[\"Title\"] = as_matrix(data[\"Title\"].values, max_len, min_len)\n",
+    "    batch[\"FullDescription\"] = as_matrix(data[\"FullDescription\"].values, max_len, min_len)\n",
     "    batch['Categorical'] = categorical_vectorizer.transform(data[categorical_columns].apply(dict, axis=1))\n",
     "    \n",
@@ -627,7 +630,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def explain(model, sample, col_name='Title'):\n",
+    "def explain(model, sample, col_name='Title', min_len=None):\n",
     "    \"\"\" Computes the effect each word had on model predictions \"\"\"\n",
     "    sample = dict(sample)\n",
     "    sample_col_tokens = [tokens[token_to_id.get(tok, 0)] for tok in sample[col_name].split()]\n",
@@ -637,7 +640,7 @@
     "        data_drop_one_token.loc[drop_i, col_name] = ' '.join(UNK if i == drop_i else tok\n",
     "                                                   for i, tok in enumerate(sample_col_tokens)) \n",
     "\n",
-    "    *predictions_drop_one_token, baseline_pred = model(make_batch(data_drop_one_token, device=device)).detach().cpu()\n",
+    "    *predictions_drop_one_token, baseline_pred = model(make_batch(data_drop_one_token, min_len=min_len, device=device)).detach().cpu()\n",
     "    diffs = baseline_pred - torch.Tensor(predictions_drop_one_token)\n",
     "    return list(zip(sample_col_tokens, diffs))"
    ]
@@ -681,10 +684,10 @@
    "outputs": [],
    "source": [
     "i = 36605\n",
-    "tokens_and_weights = explain(model, data.loc[i], \"Title\")\n",
+    "tokens_and_weights = explain(model, data.loc[i], col_name=\"Title\", min_len=3)\n",
     "draw_html([(tok, weight * 5) for tok, weight in tokens_and_weights], font_style='font-size:20px;');\n",
     "\n",
-    "tokens_and_weights = explain(model, data.loc[i], \"FullDescription\")\n",
+    "tokens_and_weights = explain(model, data.loc[i], col_name=\"FullDescription\", min_len=3)\n",
     "draw_html([(tok, weight * 10) for tok, weight in tokens_and_weights]);"
    ]
   },
@@ -695,10 +698,10 @@
    "outputs": [],
    "source": [
     "i = 12077\n",
-    "tokens_and_weights = explain(model, data.loc[i], \"Title\")\n",
+    "tokens_and_weights = explain(model, data.loc[i], col_name=\"Title\", min_len=3)\n",
     "draw_html([(tok, weight * 5) for tok, weight in tokens_and_weights], font_style='font-size:20px;');\n",
     "\n",
-    "tokens_and_weights = explain(model, data.loc[i], \"FullDescription\")\n",
+    "tokens_and_weights = explain(model, data.loc[i], col_name=\"FullDescription\", min_len=3)\n",
     "draw_html([(tok, weight * 10) for tok, weight in tokens_and_weights]);"
    ]
   },
@@ -712,10 +715,10 @@
     "print(\"Index:\", i)\n",
     "print(\"Salary (gbp):\", np.expm1(model(make_batch(data.iloc[i: i+1], device=device)).detach().cpu()))\n",
     "\n",
-    "tokens_and_weights = explain(model, data.loc[i], \"Title\")\n",
+    "tokens_and_weights = explain(model, data.loc[i], col_name=\"Title\", min_len=3)\n",
     "draw_html([(tok, weight * 5) for tok, weight in tokens_and_weights], font_style='font-size:20px;');\n",
     "\n",
-    "tokens_and_weights = explain(model, data.loc[i], \"FullDescription\")\n",
+    "tokens_and_weights = explain(model, data.loc[i], col_name=\"FullDescription\", min_len=3)\n",
     "draw_html([(tok, weight * 10) for tok, weight in tokens_and_weights]);"
    ]
   },