Code to enrich candidate data

WomenPlusPlus · Oct 28, 2023 · 25b3846 · 25b3846
1 parent 31228e8
commit 25b3846
Showing 1 changed file with 61 additions and 0 deletions.
diff --git a/notebooks/enrich_data.ipynb b/notebooks/enrich_data.ipynb
@@ -0,0 +1,61 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import nltk\n",
+    "import random\n",
+    "import re\n",
+    "import yaml\n",
+    "import notebook_utils as nu\n",
+    "from nltk.tokenize import sent_tokenize\n",
+    "\n",
+    "nltk.download('punkt')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the two input CSV files (paths are relative to the notebook's working directory).\n",
+    "job_df = pd.read_csv('IT_entry_swiss_jobs_with_embeddings.csv')\n",
+    "candidates_df = pd.read_csv('IT_candidates_with_embeddings.csv')\n",
+    "\n",
+    "# Number of trailing sentences removed from every job description.\n",
+    "N_TRAILING_SENTENCES = 5\n",
+    "\n",
+    "# Build one text snippet per job description: split it into sentences, drop the\n",
+    "# trailing ones and join the rest. Despite its name, all_sentences holds one\n",
+    "# joined string per description, not individual sentences.\n",
+    "all_sentences = []\n",
+    "# dropna(): a missing description is read as NaN (a float), which sent_tokenize rejects.\n",
+    "for job_description in job_df['job_description'].dropna():\n",
+    "    sentences = sent_tokenize(job_description)\n",
+    "    # Require strictly more sentences than are dropped: with exactly\n",
+    "    # N_TRAILING_SENTENCES the slice below is empty and '' would join the pool.\n",
+    "    if len(sentences) > N_TRAILING_SENTENCES:\n",
+    "        description_text = \" \".join(sentences[:-N_TRAILING_SENTENCES])\n",
+    "        all_sentences.append(description_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# NOTE(review): select_and_append_sentences is neither defined nor imported in this\n",
+    "# notebook, so this cell raises NameError on a fresh kernel. It may live in\n",
+    "# notebook_utils (imported as nu) -- TODO confirm and import or qualify it.\n",
+    "\n",
+    "# Seed the generator so the enriched CSV is reproducible across runs.\n",
+    "# Presumably the helper draws from the stdlib random module -- TODO confirm.\n",
+    "random.seed(42)\n",
+    "\n",
+    "# Per the helper's intent (not visible here): randomly pick n_sentences_to_select\n",
+    "# entries from all_sentences, append them to each candidate's existing 'experience'\n",
+    "# text, and perform word replacements.\n",
+    "n_sentences_to_select = 15\n",
+    "\n",
+    "# Build a new frame instead of overwriting candidates_df['experience'], so that\n",
+    "# re-running this cell does not append snippets a second time.\n",
+    "candidates_enriched_df = candidates_df.assign(\n",
+    "    experience=candidates_df['experience'].apply(\n",
+    "        lambda experience: select_and_append_sentences(experience, all_sentences, n_sentences_to_select)\n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "# Write to a new file so the input CSV stays untouched.\n",
+    "candidates_enriched_df.to_csv('IT_candidates_with_embeddings-v2.csv', index=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}