Skip to content

Commit

Permalink
Code to enrich candidate data
Browse files Browse the repository at this point in the history
  • Loading branch information
patw47 authored Oct 28, 2023
1 parent 31228e8 commit 25b3846
Showing 1 changed file with 61 additions and 0 deletions.
61 changes: 61 additions & 0 deletions notebooks/enrich_data.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import nltk\n",
"import random\n",
"import re\n",
"import yaml\n",
"import notebook_utils as nu\n",
"from nltk.tokenize import sent_tokenize\n",
"\n",
"# sent_tokenize loads the Punkt sentence models. Older NLTK releases read the\n",
"# 'punkt' resource, while NLTK 3.8.2+ reads 'punkt_tab' instead, so fetch both\n",
"# to keep the notebook runnable on a fresh kernel with either version.\n",
"for punkt_resource in ('punkt', 'punkt_tab'):\n",
"    nltk.download(punkt_resource)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"job_df = pd.read_csv('IT_entry_swiss_jobs_with_embeddings.csv')\n",
"candidates_df = pd.read_csv('IT_candidates_with_embeddings.csv')\n",
"\n",
"# Number of trailing sentences cut from every job description before it is pooled.\n",
"# NOTE(review): presumably these trailing sentences are closing boilerplate - confirm.\n",
"n_trailing_sentences_to_drop = 5\n",
"\n",
"# Build the pool of texts used later to enrich the candidates. Each entry is one\n",
"# job description with its trailing sentences removed, joined back into a single\n",
"# string (so an entry is a multi-sentence text, not an individual sentence).\n",
"all_sentences = []\n",
"# dropna(): a missing description is read as NaN (a float), which sent_tokenize cannot process.\n",
"for job_description in job_df['job_description'].dropna():\n",
"    sentences = sent_tokenize(job_description)\n",
"    # Require strictly more sentences than are dropped; otherwise nothing is left\n",
"    # and an empty string would be added to the pool.\n",
"    if len(sentences) > n_trailing_sentences_to_drop:\n",
"        kept_sentences = sentences[:len(sentences) - n_trailing_sentences_to_drop]\n",
"        all_sentences.append(' '.join(kept_sentences))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# How many pooled texts are passed to the helper for each candidate.\n",
"n_sentences_to_select = 15\n",
"\n",
"# Replace every candidate's 'experience' value with the helper's result, row by row.\n",
"# NOTE(review): select_and_append_sentences is not defined in the cells visible here -\n",
"# presumably it comes from notebook_utils (imported as nu) or a removed cell; confirm.\n",
"# According to the original author's comment it randomly selects texts from the pool,\n",
"# appends them to the existing content and performs word replacements.\n",
"candidates_df['experience'] = candidates_df.apply(\n",
"    lambda candidate: select_and_append_sentences(\n",
"        candidate['experience'], all_sentences, n_sentences_to_select\n",
"    ),\n",
"    axis=1,\n",
")\n",
"\n",
"# Write the enriched candidates to a new CSV file, leaving out the DataFrame index.\n",
"candidates_df.to_csv('IT_candidates_with_embeddings-v2.csv', index=False)"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit 25b3846

Please sign in to comment.