From d9bdf10b8a5317bbd901b0b9ecf4902bed58310a Mon Sep 17 00:00:00 2001 From: Jolene Date: Mon, 23 Oct 2023 13:44:46 +0800 Subject: [PATCH 1/7] cleaned up folders --- .../clean.ipynb | 0 .../new => 01ReformattingGA4}/new.py | 0 .../clean_umami.ipynb | 0 .../parsing urls.ipynb | 0 .../demo.ipynb | 0 .../main.ipynb | 0 .../analysis of the sql data.ipynb | 2 +- .../presenting data.ipynb | 12 +- .../collating fields.ipynb | 385 ------ howard's advisory stuff/getting data.ipynb | 907 --------------- howard's advisory stuff/new/clean.ipynb | 1032 ----------------- huan_yao_code/new.py | 46 - 12 files changed, 7 insertions(+), 2377 deletions(-) rename {huan_yao_code => 01ReformattingGA4}/clean.ipynb (100%) rename {howard's advisory stuff/new => 01ReformattingGA4}/new.py (100%) rename {howard's advisory stuff => 02ReformattingUmami}/clean_umami.ipynb (100%) rename {howard's advisory stuff/exported_csv => 02ReformattingUmami}/parsing urls.ipynb (100%) rename {analysis_application_data => 03AnalysisApplicationData}/demo.ipynb (100%) rename {analysis_application_data => 03AnalysisApplicationData}/main.ipynb (100%) rename {howard's advisory stuff/exported_csv => 04PostgreSQLDumpFIle}/analysis of the sql data.ipynb (99%) rename {howard's advisory stuff => 04PostgreSQLDumpFIle}/presenting data.ipynb (99%) delete mode 100644 howard's advisory stuff/collating fields.ipynb delete mode 100644 howard's advisory stuff/getting data.ipynb delete mode 100644 howard's advisory stuff/new/clean.ipynb delete mode 100644 huan_yao_code/new.py diff --git a/huan_yao_code/clean.ipynb b/01ReformattingGA4/clean.ipynb similarity index 100% rename from huan_yao_code/clean.ipynb rename to 01ReformattingGA4/clean.ipynb diff --git a/howard's advisory stuff/new/new.py b/01ReformattingGA4/new.py similarity index 100% rename from howard's advisory stuff/new/new.py rename to 01ReformattingGA4/new.py diff --git a/howard's advisory stuff/clean_umami.ipynb b/02ReformattingUmami/clean_umami.ipynb similarity 
index 100% rename from howard's advisory stuff/clean_umami.ipynb rename to 02ReformattingUmami/clean_umami.ipynb diff --git a/howard's advisory stuff/exported_csv/parsing urls.ipynb b/02ReformattingUmami/parsing urls.ipynb similarity index 100% rename from howard's advisory stuff/exported_csv/parsing urls.ipynb rename to 02ReformattingUmami/parsing urls.ipynb diff --git a/analysis_application_data/demo.ipynb b/03AnalysisApplicationData/demo.ipynb similarity index 100% rename from analysis_application_data/demo.ipynb rename to 03AnalysisApplicationData/demo.ipynb diff --git a/analysis_application_data/main.ipynb b/03AnalysisApplicationData/main.ipynb similarity index 100% rename from analysis_application_data/main.ipynb rename to 03AnalysisApplicationData/main.ipynb diff --git a/howard's advisory stuff/exported_csv/analysis of the sql data.ipynb b/04PostgreSQLDumpFIle/analysis of the sql data.ipynb similarity index 99% rename from howard's advisory stuff/exported_csv/analysis of the sql data.ipynb rename to 04PostgreSQLDumpFIle/analysis of the sql data.ipynb index 7bca8ec..ff6aaa9 100644 --- a/howard's advisory stuff/exported_csv/analysis of the sql data.ipynb +++ b/04PostgreSQLDumpFIle/analysis of the sql data.ipynb @@ -3085,7 +3085,7 @@ "outputs": [ { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ "
" ] diff --git a/howard's advisory stuff/presenting data.ipynb b/04PostgreSQLDumpFIle/presenting data.ipynb similarity index 99% rename from howard's advisory stuff/presenting data.ipynb rename to 04PostgreSQLDumpFIle/presenting data.ipynb index 3135aed..2d8111a 100644 --- a/howard's advisory stuff/presenting data.ipynb +++ b/04PostgreSQLDumpFIle/presenting data.ipynb @@ -59,7 +59,7 @@ "outputs": [ { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ "
" ] @@ -170,7 +170,7 @@ }, { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ "
" ] @@ -192,7 +192,7 @@ "outputs": [ { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ "
" ] @@ -248,7 +248,7 @@ "outputs": [ { "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAABIQAAAE6CAYAAABnKomUAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAA/30lEQVR4nO3de7wWVb348c8XlYtAmrHBO5tQBElD2uo5KKmhWGkHJI6CpthJ84aYHEMNTDsZYAYpXjAz1N/RSA01854mUtER0BDwhqIkIimQeUG2Cq7fHzN78wDPvgB7c3s+79drv/Y8a2bWrJlnLuv5zpo1kVJCkiRJkiRJpaPJpi6AJEmSJEmSNi4DQpIkSZIkSSXGgJAkSZIkSVKJMSAkSZIkSZJUYgwISZIkSZIklZhtN3UBANq0aZPKy8s3dTEkSZIkSZK2Gk8//fSSlFJZsXGbRUCovLycGTNmbOpiSJIkSZIkbTUi4u81jfORMUmSJEmSpBJjQEiSJEmSJKnEGBBSgzj44INp3bo122+/PRUVFUyZMgWAq666ivLycpo1a0aHDh245ppris5/2WWXERFr/VUZOXIku+++Oy1btuSEE07gvffeqx537733stdee9G8eXMOP/xwXnvtNQBmz55Nly5d2HHHHRk7dmz19EOGDGHUqFGNsRkkSZIkSdoiREppU5eBioqKZB9CW7bzzz+f/fffn3/84x9ccsklfP7zn+eBBx6gU6dOdOjQgf/+7/9m1KhRLFy4kNdff5099thjtfnnzJnDnDlzAFi6dCmDBw/mgAMO4JlnnmHSpEn079+fvn37UlFRwYgRIzj33HMZN24c//jHPygvL2fffffl29/+Nj/4wQ844IADmDJlCieeeCJz586lR48ejB8/nvfee4/58+fTp08fZs+eTbNmzTbFppIkSZIkaaOIiKdTShXFxtlCSA1i7NixfOMb36BXr140a9aMJk2a8OmnnwKw2267ceSRR7LzzjvTrFkzmjdvvtb8X/jCFxgwYAADBgxg+fLlAJx55pkATJ48GYALLriA4cOHs/POO3PLLbcAMHHiRD766CMuvvhizj33XI477jj+9Kc/MW/ePJYtW0Z5eTk9evRgxYoVVFZWMnToUEaPHm0wSJIkSZJU0jaLt4xpy/fuu+9SVpa9yW7HHXfkpptuYp999mH06NFcfPHFdO7cmSZNmnDzzTdXT1dMSokbb7yRz3zmM5x44okAtG3bFsgCQ02bNmXJkiWsWLGCpUuXVj8etttuuwGw++67A/Dqq68yaNAgjj/+eCZNmkTfvn2ZOnUqlZWV9OvXr9G2gyRJkiRJWwJbCKlBtGrVikcffZRx48ZRWVnJD3/4QxYvXsw111xDt27duPfee/niF7/I4MGDeeONN2rM54knnuDll1/mW9/6Fq1atQLgrLPOonPnzowYMYKDDjqouoVRsZZGVY9ARgT9+vVj3rx5TJ8+nYkTJzJs2DCuvvpqhg8fTvv27enVqxeLFi1qhK0hSZIkSdLmzYCQGsS2227LUUcdxbnnnstBBx3EE088wRNPPMHChQvp168fffr0oV+/frz//vv89a9/BaCyspKPP/54tXxuuOEGYNXjYgBt2rTh2WefZfr06cydO5ddd92VPffck5YtW9KhQweA6iDTwoULAarT27dvT0VFBePHj6dnz540bdqUkSNHVnd6PW7cuEbcKpIkSZIkbZ58ZEwb7JFHHuHOO++kR48eLFiwgKlTp9KuXbvqoMxtt93GLrvswu233w5Ap06dAGjRogVdu3at7kz67bff5t577+WQQw5hv/32q87/zTff5Nprr6VTp048/PDDzJ07tzqQM2DAAC666CKuuOIK3nrrLe655x4OPfRQOnbsWD3/kiVLGDduHNOmTePtt98GYMKECcybN4/u3bs3/gaSJEm
SJGkzYwshbbCddtqJp556isGDB3PVVVdx6KGH8vvf/54DDzyQMWPG8NFHH3HOOefw0Ucfce211/LFL36xaD4TJkzgk08+Wa11EECTJk245557OOOMM5gyZQqXXnopgwcPBmCXXXZh4sSJ/Otf/+KCCy7ggAMOqO5wusoll1zCkCFDKCsro2vXrpx99tmMGTOGnXbaqTofSZIkSZJKia+dlyRJkiRJ2grV9tp5HxlrYOUXPbCpiyDVav7oYzZ1ESRJkiRJm5iPjEmSJEmSJJUYA0KSJEmSJEklxoCQJEmSJElSiTEgJEmSJEmSVGIMCEmSJEmSJJUYA0KSJEmSJEklxoCQJEmSJElSiTEgJEmSJEmSVGIMCEmSJEmSJJUYA0KSJEmSJEklxoCQJEmSJElSiTEgJEmSJEmSVGIMCEmSJEmSJJUYA0KSJEmSJEklxoCQJEmSJElSiTEgJEmSJEmSVGIMCEmSJEmSJJUYA0KSJEmSJEklps6AUETsERFPRMQLEfFcRJyXp+8UEX+IiJfz/58tmOfiiHglIl6KiKMbcwUkSZIkSZK0burTQmgF8N8ppS7AvwHnRMS+wEXA4ymlvYHH88/k4wYAXYGvAtdHxDaNUXhJkiRJkiStuzoDQimlRSmlZ/Lh94EXgN2APsCt+WS3An3z4T7Ab1JKH6WUXgNeAQ5q4HJLkiRJkiRpPa1TH0IRUQ4cADwFtEspLYIsaAS0zSfbDVhQMNsbedqaeX03ImZExIzFixevR9ElSZIkSZK0PuodEIqIVsAk4Hsppfdqm7RIWlorIaUbU0oVKaWKsrKy+hZDkiRJkiRJG6heAaGI2I4sGHR7SunuPPmtiNglH78L8Hae/gawR8HsuwNvNkxxJUmSJEmStKHq85axAH4FvJBSGlsw6j5gUD48CPhdQfqAiGgWER2AvYFpDVdkSZIkSZIkbYht6zHNIcDJwOyImJmn/QAYDdwZEd8BXgf+EyCl9FxE3Ak8T/aGsnNSSisbuuCSJEmSJElaP3UGhFJKf6Z4v0AAvWqY5yfATzagXJIkSZIkSWok6/SWMUmSJEmSJG35DAhJkiRJkiSVGANCkiRJkiRJJcaAkCRJkiRJUokxICRJkiRJklRiDAhJkiRJkiSVGANCkiRJkiRJJcaAkCRJkiRJUokxICRJkiRJklRiDAhJkiRJkiSVGANCkiRJkiRJJcaAkCRJkiRJUokxICRJkiRJklRiDAhJkiRJkiSVGANCkiRJkiRJJcaAkCRJkiRJUokxICRJkiRJklRiDAhJkiRJkiSVGANCkiRJkiRJJcaAkCRJkiRJUokxICRJkiRJklRiDAhJkiRJkiSVGANCkiRJkiRJJcaAkCRJkiRJUokxICRJkiRJklRiDAhJkiRJkiSVGANCkiRJkiRJJcaAkCRJkiRJUokxICRJkiRJklRiDAhJkiRJkiSVGANCkiRJkiRJJcaAkCRJkiRJUokxICRJkiRJklRiDAhJkiRJkiSVGANCkiRJkiRJJcaAkCRJkiRJUokxICRJkiRJklRiDAhJkiRJkiSVGANCkiRJkiRJJcaAkCRJkiRJUokxICRJkiRJklRiDAhJkiRJkiSVGANCkiRJkiRJJcaAkCRJkiRJUokxICRJkiRJklRiDAhJkiRJkiSVGANCkiRJkiRJJcaAkCRJkiRJUokxICRJkiRJklRiDAhJkiRJkiSVmDoDQhExISLejog5BWmXRcTCiJiZ/329YNzFEfFKRLwUEUc3VsElSZIkSZK0furTQugW4KtF0n+eUuqW/z0IEBH7AgOArvk810fENg1VWEmSJEmSJG24OgNCKaUpwD/rmV8f4DcppY9SSq8BrwAHbUD5JEmSJEmS1MA2pA+hwRExK3+k7LN52m7AgoJp3sjT1hIR342IGRExY/HixRtQDEmSJEmSJK2L9Q0IjQc6At2ARcCYPD2KTJuKZZBSujGlVJFSqigrK1vPYkiSJEmSJGldrVdAKKX
0VkppZUrpU+CXrHos7A1gj4JJdwfe3LAiSpIkSZIkqSGtV0AoInYp+HgcUPUGsvuAARHRLCI6AHsD0zasiJIkSZIkSWpI29Y1QURMBA4H2kTEG8ClwOER0Y3scbD5wBkAKaXnIuJO4HlgBXBOSmllo5RckiRJkiRJ66XOgFBKaWCR5F/VMv1PgJ9sSKEkSZIkSZLUeDbkLWOSJEmSJEnaAhkQkiRJkiRJKjEGhCRJkiRJkkqMASFJkiRJkqQSY0BIkiRJkiSpxBgQkiRJkiRJKjEGhCRJkiRJkkqMASFJkiRJkqQSY0BIkiRJkiSpxBgQkiRJkiRJKjEGhCRJkiRJkkqMASFJkiRJkqQSY0BIkiRJkiSpxBgQkiRJkiRJKjEGhCRJkiRJkkqMASFJkiRJkqQSY0BIkiRJkiSpxBgQkqTNxMsvv8wRRxzB5z73OVq3bs1RRx3FvHnzABgyZAjt2rUjIjj22GNrzCOlxMUXX8yuu+5K8+bN6dy5M3fccUed4wAWLFhAnz59aNmyJTvssAMnnXQSALNnz6ZLly7suOOOjB07tnr6IUOGMGrUqMbYFJIkSZIamQEhSdpMLFy4kE8//ZQf/ehHfPvb3+axxx7jtNNOqx4/YMCAOvN47LHHGD16NLvssgtXXnklCxcu5NRTT+WTTz6pdVxKieOOO44//OEPfP/73+enP/0pZWVlAIwaNYqWLVtyyimncOGFF7J8+XJeeOEFHn74YYYOHdpo20OSJElS49l2UxdAkpTp0aMHTz75ZPXn22+/neeeew6AcePGMX/+fMaNG1drHp9++ikAHTt25KijjuKKK67g448/pkmTJrWOe+KJJ3j66acZPnw4F110Ec2aNSMiAFi2bBnl5eX06NGDa665hsrKSoYOHcro0aNp1qxZY2wKSZIkSY3MFkKStJlo2rRp9fCMGTP45z//yZe//OV1yqN3796cc8453HXXXXTp0oWlS5fy61//mm222abWcc8//zwAkyZNYvvtt+czn/lMdfBp0KBB3HvvvQwcOJC+ffsydepUKisr6devX8OtvCRJkqSNyoCQJG1mXnrpJfr06UN5eTnXXHPNOs9722230bt3b+6++27atWvHqaeeyrJly2od99FHHwGw3Xbbcc8999ChQwe+973vMXfuXPr168e8efOYPn06EydOZNiwYVx99dUMHz6c9u3b06tXLxYtWtQYm0KSJElSIzEgJEmbkeeff57DDjuMbbfdlj/+8Y/ssssudc5TWVnJxx9/DMB9993Hu+++y8knn8xxxx3HkUceycKFC3n++edrHVdeXg7AMcccQ58+fTjmmGNIKfHaa68B0L59eyoqKhg/fjw9e/akadOmjBw5kilTpgDU+SibJEmSpM2LfQhJ0mZiwYIFHH744fzzn//k8ssv56mnnuKpp55iwIABPPDAA8yZM6d6uptuuonDDjuMvffemxYtWtC1a1fmzJlDx44dARg/fjzLly/n/vvvp2nTpnTo0IHXX3+9xnFf+MIXaNu2LZMmTWKvvfbit7/9La1ateKAAw6oLt+SJUsYN24c06ZN4+233wZgwoQJzJs3j+7du2/krSVJkiRpQ9hCSJI2E/PmzWPx4sWsXLmSiy++mIEDBzJw4EAArrzySi666CIAZs2axemnn85f/vKXtfLo168fw4YNY/78+Zx77rnstNNO3HbbbbRp06bWcS1atOC3v/0tzZo145xzzmH77bfn7rvvpm3bttV5X3LJJQwZMoSysjK6du3K2WefzZgxY9hpp50YPHjwxtlIkiRJkhpEpJQ2dRmoqKhIM2bM2NTFaBDlFz2wqYsg1Wr+6GM2dREkSZIkSRtBRDydUqooNs5HxiRtlgyuaktggFWSJElbKh8ZkyRJkiRJKjEGhCRJkiRJkkqMASFJkiRJkqQSY0BIkiRJkiSpxBgQkiRJkiRJKjEGhCRJkiRJkkqMASFJkiRJkqQSY0BIkiRtdYYMGUK7du2
ICI499tjq9AkTJtCxY0datGjB0UcfzcKFC4vOv3jxYrp160bLli1p3bo1hx12GHPmzKkeP3LkSHbffXdatmzJCSecwHvvvVc9LiJW++vbty8As2fPpkuXLuy4446MHTt2tbKOGjWqgbeAJElS7QwISZKkrdKAAQNW+zxjxgxOO+00dtttN6644gomT57MWWedVeP8X/va17j++us566yzmDJlCkOHDgVg0qRJDB8+nAMPPJAf/OAH3HnnnYwYMWK1eb/5zW8yceJEJk6cyAUXXADAqFGjaNmyJaeccgoXXnghy5cv54UXXuDhhx+uzluSJGljMSAkSZK2OuPGjeP8889fLe3JJ58kpcQZZ5zBkCFD6N69O/fffz9Lly5da/6ysjIuv/xyvv71r/OVr3wFgCZNsmrT5MmTAbjgggsYPnw4O++8M7fccstq8++777584xvfYMCAARx66KEALFu2jPLycnr06MGKFSuorKxk6NChjB49mmbNmjXwFpAkSaqdASFJklQS2rZtC8Cf//xnXnzxRV5++WVSSsyfP7/o9LNnz6Zt27Z87WtfY7fdduOqq65aLZ/Jkyczffp0lixZwvvvv79aYOnyyy+nVatWtG/fnvvvvx+AQYMGce+99zJw4ED69u3L1KlTqayspF+/fo230pIkSTUwICRJkkrC8ccfzyGHHMINN9xAly5d+PjjjwFo3rx50en32msvHnnkEX784x/z5ptv8tOf/hSAs846i86dOzNixAgOOuig6vmr/l944YXcfffd3HjjjbzzzjsMHDiQDz/8kH79+jFv3jymT5/OxIkTGTZsGFdffTXDhw+nffv29OrVi0WLFm2ELSFJkmRASJIklYhmzZoxZcoUZs6cyZw5czj44INp3rw5n//85wGorKysDhIBtGrVit69ezNixAj22GMP7rzzTgDatGnDs88+y/Tp05k7dy677rore+65Jy1btgRg9OjR9O3bl9NPP52jjjqKDz74gAULFgDQvn17KioqGD9+PD179qRp06aMHDmSKVOmANmjbpIkSRvDtpu6AJIkSQ3tgQceqH4r2IIFC7jpppvo2bMn119/PQcccADTp0/nscceY+jQobRo0QKAFi1a0LVrV+bMmcPNN9/MzJkz6datG7NmzeL111/nwAMPBODNN9/k2muvpVOnTjz88MPMnTu3OpDz4IMPctttt3H44Yfzzjvv8NBDD1FWVkaHDh2qy7ZkyRLGjRvHtGnTePvtt4Hs7Wfz5s2je/fuG3MzSZKkEmZASJIkbXWuvPJKnnzySQBmzZrF6aefzq9+9SuefPJJfvGLX9CyZUsGDx7MyJEji85fVlbGgw8+yA033ECrVq049thjq18V36RJE+655x5effVVPve5z3HppZcyePBgIGsBtGjRIoYNG8bKlSupqKhgzJgxNG3atDrvSy65hCFDhlBWVkZZWRlnn302Y8aMoVOnTtX5SJIkNbZIKW3qMlBRUZFmzJixqYvRIMovemBTF0Gq1fzRx2zqItSLx5K2BFvK8SRJkqTSFBFPp5Qqio2zhZAkSVs5A6za3BlclSRp47NTaUmSJEmSpBJjQEiSJEmSJKnEGBCSJEmSJEkqMQaEJEmSJEmSSowBIUmSJEmSpBJjQEiSJEmSJKnE1BkQiogJEfF2RMwpSNspIv4QES/n/z9bMO7iiHglIl6KiKMbq+CSJEmSJElaP/VpIXQL8NU10i4CHk8p7Q08nn8mIvYFBgBd83muj4htGqy0kiRJkiRJ2mB1BoRSSlOAf66R3Ae4NR++FehbkP6blNJHKaXXgFeAgxqmqJIkSZIkSWoI69uHULuU0iKA/H/bPH03YEHBdG/kaWuJiO9GxIyImLF48eL1LIYkSZIkSZLWVUN3Kh1F0lKxCVNKN6aUKlJKFWVlZQ1cDEmSJEmSJNVkfQNCb0XELgD5/7fz9DeAPQqm2x14c/2LJ0mSJEmSpIa2vgGh+4BB+fAg4HcF6QM
iollEdAD2BqZtWBElSZIkSZLUkLata4KImAgcDrSJiDeAS4HRwJ0R8R3gdeA/AVJKz0XEncDzwArgnJTSykYquyRJkiRJktZDnQGhlNLAGkb1qmH6nwA/2ZBCSZIkSZIkqfE0dKfSkiRJkiRJ2swZEJIkSZIkSSoxBoQkSZIkSZJKjAEhSZIkSZKkEmNASJIkSZIkqcQYEJIkSZIkSSoxBoQkSZIkSZJKjAEhSZIkSZKkEmNASJIkSZIkqcQYEJIkSZIkSSoxBoQkSZIkSZJKjAEhSZIkSZKkEmNASJIkSZIkqcQYEJIkSZIkSSoxBoQkSZIkSZJKjAEhSZIkSZKkEmNASJIkSZIkqcQYEJIkSZIkSSoxBoQkSZIkSZJKjAEhSZIkSZKkEmNASJIkSZIkqcQYEJIkSZIkSSoxBoQkSZIkSZJKjAEhSZIkSZKkEmNASJIkSZIkqcQYEJIkSZIkSSoxBoQkSZIkSZJKjAEhSZIkSZKkEmNASJIkSZIkqcQYEJIkSZIkSSoxBoQkSZIkSZJKjAEhSZIkSZKkEmNASJIkSZIkqcQYEJIkSZIkSSoxBoQkSZIkSZJKjAEhSZIkSUUNGTKEdu3aEREce+yx1el/+ctf2H///WnWrBndu3fnmWeeqTWfxYsX06ZNGyKCn/3sZ9XpI0eOZPfdd6dly5accMIJvPfeewCklLj44ovZddddad68OZ07d+aOO+4AYPbs2XTp0oUdd9yRsWPHrlbWUaNGNeTqS9JWzYCQJEmSpBoNGDBgtc+VlZV885vf5P333+fnP/85b731Fv3792flypU15nHeeeexfPny1dImTZrE8OHDOfDAA/nBD37AnXfeyYgRIwB47LHHGD16NLvssgtXXnklCxcu5NRTT+WTTz5h1KhRtGzZklNOOYULL7yQ5cuX88ILL/Dwww8zdOjQht8AkrSVMiAkSZIkqahx48Zx/vnnr5b20EMP8dZbb3H22Wdz9tln853vfIfXXnuNyZMnF83joYce4ve//z0XXnjhaulV019wwQUMHz6cnXfemVtuuQWATz/9FICOHTty1FFHscMOO9C6dWuaNGnCsmXLKC8vp0ePHqxYsYLKykqGDh3K6NGjadasWYOuvyRtzQwISZIkSaq31157DYDddtsNgN133x2AV199da1pP/jgA84880xGjRrFnnvuudq4tm3bAllgaPr06SxZsoT333+fpUuX0rt3b8455xzuuusuunTpwtKlS/n1r3/NNttsw6BBg7j33nsZOHAgffv2ZerUqVRWVtKvX7/GXG1J2uoYEJIkSZK03lJKAETEWuOuuOIKtt9+e3r37s3bb78NwNKlS3nnnXc466yz6Ny5MyNGjOCggw6iefPmADRv3pyXXnqJ2267jd69e3P33XfTrl07Tj31VJYtW0a/fv2YN28e06dPZ+LEiQwbNoyrr76a4cOH0759e3r16sWiRYs23gaQpC2UASFJkiRJ9dahQwcA3njjDQAWLly4WnplZSUff/wxAAsWLODFF19kn332qX5kbPTo0Vx33XW0adOGZ599lunTpzN37lx23XVX9txzT1q2bMl9993Hu+++y8knn8xxxx3HkUceycKFC3n++ecBaN++PRUVFYwfP56ePXvStGlTRo4cyZQpU4DsUTdJUu223dQFkCRJkrR5euCBB5gzZw6QBXduuukmDj74YNq2bcv48eNp3bo1v/rVrygvL+fwww8HoEWLFnTt2pU5c+YwePDg6reTTZ48meuuu45TTjmF/v378+abb3LttdfSqVMnHn74YebOnVsdyOnYsSMA48ePZ/ny5dx///00bdq0OugEsGTJEsaNG8e0adOqWx9NmDCBefPm0b179421iSRpi2ULIUmSJElFXXnllVx00UUAzJo1i9NPP52nn36au+66i1atWnHeeefRtm1b7rrrLrbZZpu15q+oqKB///7079+fiooKAPbbbz86d+5MkyZNuOeeezjjjDO
YMmUKl156KYMHDwagX79+DBs2jPnz53Puueey0047cdttt9GmTZvqvC+55BKGDBlCWVkZXbt25eyzz2bMmDHstNNO1flIm7MJEybQsWNHWrRowdFHH13d2q6YxYsX06ZNGyKCn/3sZ9Vp3bp1o2XLlrRu3ZrDDjusOoAL0L9/fz772c8SEasdE7Nnz6ZLly7suOOOjB07tjp9yJAhjBo1qhHWVJurqHrmd1OqqKhIM2bM2NTFaBDlFz2wqYsg1Wr+6GM2dRHqxWNJWwKPJ6lheCxJDWNLOZYEM2bM4KCDDuLQQw+lf//+fP/73+foo4/mvvvuKzr9iSeeyO9+9zs+/PBDrrzySi644AIWL17M2LFj6dy5M8899xxXXnklRx11FI8++igAJ510Es2aNePmm2/mnHPO4dprr63Oa+7cufTo0YPx48fz3nvvMX/+fPr06cPs2bN9W99WJiKeTilVFBtnCyFJkiRJkjaiJ598kpQSZ5xxBkOGDKF79+7cf//9LF26dK1pH3roIX7/+99X98NVpaysjMsvv5yvf/3rfOUrXwGgSZNVP/Fvv/12TjnllLXyW7ZsGeXl5fTo0YMVK1ZQWVnJ0KFDGT16tMGgEmMfQpIkSZIkbURt27YF4M9//jNf+tKXePnll0kpMX/+fD73uc9VT/fBBx9w5plnMmrUKFq1arVWPrNnz+aAAw4AYLfdduOqq66qc9mDBg3i+OOPZ9KkSfTt25epU6dSWVlJv379GmbltMWwhZAkSZIkSRvR8ccfzyGHHMINN9xAly5dqt/M17x589Wmu+KKK9h+++3p3bt3defpS5cu5Z133gFgr7324pFHHuHHP/4xb775Jj/96U/rXHa/fv2YN28e06dPZ+LEiQwbNoyrr76a4cOH0759e3r16sWiRYsaeI21OTIgJEmSJEnSRtSsWTOmTJnCzJkzmTNnDgcffDDNmzfn85//PJWVldUBogULFvDiiy+yzz77VD8yNnr0aK677joAWrVqRe/evRkxYgR77LEHd955Z72W3759eyoqKhg/fjw9e/akadOmjBw5kilTpgBUv/FPWzcfGZMkSZIkaSNauXIlQ4cO5YADDmD69Ok89thjDB06lBYtWhARdO3alTlz5jB48GCOPfZYACZPnsx1113HKaecQv/+/bn55puZOXMm3bp1Y9asWbz++usceOCB1cu44447qHp50/PPP89NN93EMcccwy677ALAkiVLGDduHNOmTatufTRhwgTmzZtH9+7dN/IW0aZgQEiSJEmSpI0oInjyySf5xS9+QcuWLRk8eDAjR45ca7qKigoqKrIXRH3wwQcA7LfffnTu3JlXXnmFBx98kBtuuIFWrVpx7LHHrvYa+QsvvJC///3vADzxxBPVf1UBoUsuuYQhQ4ZQVlZGWVkZZ599NmPGjKFTp06rvaZeWy9fO9/AfB2pNndbyutIPZa0JfB4khqGx5LUMLaUYwk8nrT525KOp9rU9tr5DWohFBHzgfeBlcCKlFJFROwE3AGUA/OB41NK72zIciRJkiRJktRwGqJT6SNSSt0KIk4XAY+nlPYGHs8/S5IkSZIkaTPRGG8Z6wPcmg/fCvRthGVIkiRJkiRpPW1oQCgBj0bE0xHx3TytXUppEUD+v22xGSPiuxExIyJmLF68eAOLIUmSJEmSpPra0LeMHZJSejMi2gJ/iIgX6ztjSulG4EbIOpXewHJIkiRJkiSpnjaohVBK6c38/9vAPcBBwFsRsQtA/v/tDS2kJEmSJEmSGs56B4QiomVEtK4aBnoDc4D7gEH5ZIOA321oISVJkiRJktRwNuSRsXbAPRFRlc+vU0oPR8R04M6I+A7wOvCfG15MSZIkSZIkNZT1DgillF4FvlgkfSnQa0MKJUmSJEmSpMbTGK+dlyRJkiRJ0mbMgJAkSZIkSVKJMSAkSZIkSZJUYgwISZIkSZIklRgDQpIkSZIkSSXGgJAkSZIkSVKJMSAkSZIkSZJ
UYgwISZIkSZIklRgDQpIkSZIkSSXGgJAkSZIkSVKJMSAkSZIkSZJUYgwISZIkSZIklRgDQpIkSZIkSSXGgJAkSZIkSVKJMSAkSZIkSZJUYgwISZIkSZIklRgDQpIkSZIkSSXGgJAkSZIkSVKJMSAkSZIkSZJUYgwISZIkSZIklRgDQpIkSZIkSSXGgJAkSZIkSVKJMSAkSZIkSZJUYgwISZIkSZIklRgDQpIkSZIkSSXGgJAkSZIkSVKJMSAkSZIkSZJUYgwISZIkSZIklRgDQpIkSZIkSSXGgJAkSZIkSVKJMSAkSZIkSZJUYgwISZIkSZIklRgDQpIkSZIkSSXGgJAkSZIkSVKJMSAkSZIkSZJUYgwISZIkSZIklRgDQpIkSZIkSSXGgJAkSZIkSVKJMSAkSZIkSZJUYgwISZIkSZIklRgDQpIkSZIkSSXGgJAkSZIkSVKJMSAkSZIkSZJUYgwISZIkSZIklRgDQpIkSZIkSSXGgJAkSZIkSVKJMSAkSZIkSZJUYgwISZIkSZIklRgDQpIkSZIkSSXGgJAkSZIkSVKJabSAUER8NSJeiohXIuKixlqOJEmSJEmS1k2jBIQiYhvgOuBrwL7AwIjYtzGWJUmSJEmSpHXTWC2EDgJeSSm9mlL6GPgN0KeRliVJkiRJkqR1ECmlhs80oj/w1ZTSafnnk4GDU0qDC6b5LvDd/OM+wEsNXhBtLdoASzZ1IaStgMeS1DA8lqSG4/EkNQyPJdWkfUqprNiIbRtpgVEkbbXIU0rpRuDGRlq+tiIRMSOlVLGpyyFt6TyWpIbhsSQ1HI8nqWF4LGl9NNYjY28AexR83h14s5GWJUmSJEmSpHXQWAGh6cDeEdEhIpoCA4D7GmlZkiRJkiRJWgeN8shYSmlFRAwGHgG2ASaklJ5rjGWpJPhoodQwPJakhuGxJDUcjyepYXgsaZ01SqfSkiRJkiRJ2nw11iNjkiRJkiRJ2kwZEJIkSZIkSSoxBoQ2gohIETGm4PMFEXFZHfMcHhE9Cj6fGRGnNHC5yiNiTg3Lvn+NtFsion8d+f1HRFzUkGVcI/9dI+K3+XC3iPh6A+XbLCIei4iZEXHCGuP+LSKeyse9UPW9Nfa6CiJieEQ8FxGz8u1/cJ5+U0Tsu6nLtz7y4+jDiGhdkHZ1fo5osynL1pgi4nsRsX0N4+YXrnux808N801tyDIWyb96P4uIHzTmsrRxRcQHjZDn5IjwVb8lYn3qdZuDiNgvv57OjIh/RsRr+fBj65DHqRFxbQOV57KIuGA9562IiHHrOW/R49Xr0dal2O+c+uxzG7JvrUPZphaU8cQGzHdiXm8+v8i4UyJiTl63fn59j70i+dbrmlrTb9mafo9q42mUTqW1lo+AfhExKqW0pJ7zHA58AEwFSCnd0EhlazAppftogLfJRcS2KaUVRfJ/E6gKSnUDKoAHN3R5wAHAdimlbkXG3Qocn1J6NiK2AfbJy9Ig69qQatpuW6KI+HfgWKB7SumjvILWFCCldNomLVwREbFNSmllPSd/BegD3BYRTYAjgIWNVrjNw/eA24APGyrDlFKPuqeqXW3f2xr72Q+AkRu6PElbjfWp121yKaXZZPUnIuIW4P6U0m83ZZnWV0ppBjBjU5cDvB5tbRpq36qtXl6wz5QDJwK/boDl7Qz0SCm1LzLua2R1sd4ppTcjojlw8oYucx3Ktu2W8Fu2VNlCaONYQdbre7Fo7TfyFih/y1uptIuIcuBM4Pz8zk3Pwoh23jrm//II8D0R8dk8fXJEXBER0yJibkT0zNPLI+JPEfFM/rdBF678DsqP8rxmR0TnPP3UiLg2InbIp2mSp28fEQsiYruI6BgRD0fE03mZqua9JSLGRsQTwBURcVjBXay/RUTrqghyRDQF/gc4IR9/QkS8HBFleV5NIuKVWKPFRUTsFBH35tvt/yJi/4hoS/ZDtVueV8c1VrctsAggpbQypfR84boWlH1cREyNiFcjb0mVl+P
6PBJ/f0Q8WDDuhxExPV+fGyMiCr7Dq/K85kTEQTWVPU+/LJ//UeD/RURZREzK854eEYdsyHe9Ce0CLEkpfQSQUlqSBwRXu7MXER9ExE8i4tl8u7TL0zvmn6dHxP9EfvciIlpFxOMF+26fPL08Il6MiFvzbfzbyFu0RESvfB+cHRETIqJZnj4//x7/DPxnRPSOiL/med8VEa1qWLeJQFVLtMOBv5CdI8jzvTc/Pp6LiO8WpNe0rmudQ/L0soj4Q16eX0TE36uOiYj4VmTniZn5uG0KlnFFvvzHIuKgfHu/GhH/kU+zTURcmW/bWRFxRp5+eD7tb/NteXtkhgC7Ak9EdnzXW75/Tygow5DC7ZH/vyMKWgtGdjx+s45yPhERvwZmR0TLiHgg365zIm8lWLWfRcRooEW+rW6PiB9HxHkFy/tJYbm0ZYp1v662iIjf5NPfAbQoyGtgfr6YExFXFKQXPYa1RVqnel2eXvR8VtM5JWq/Xr0QEb/MrxOPRkSLfNyB+T751/z8V6+77lHD9SvPb2q+z06LVa1bd42sLvdyRPy0IJ+arlPt83WZlf/fs0gZajoGi65TFLTeybfVzfl2mhUR38zTx0fEjHw7/ag+26KWbeT1aCtTy/n98Mjq7U0iq+vtWDDPK5H9Vita34616+VdY1V9a1ZE7J1PV9WqZjTQMx9/fmS/jboVLO8vkdf5C9KaF+zvf4uII/JRjwJt87x6rrG6FwMXVNWlU0qVKaVf5vmdnq/Ds/k6VdV/b8mPoSfyff6w/Bh4IbKAcmGZxkR2/ng8Vv0emxwRIyPiSeC8WP237Jfy5f0VOKcgn+0j4s58W90R2bm0qs5f33q21lVKyb9G/iNr6fMZYD6wA3ABcFk+7rNQ/ba304Ax+fBlZAcua34GZgGH5cP/A1yVD08umP/rwGP58PZA83x4b2BGPlwOzClS3sPJ7hoVpt0C9M+H5wPn5sNnAzflw6cC1+bDvwOOyIdPKJjmcWDvfPhg4I8F+d8PbJN//j1wSD7ciqw1W3V5C5eVf74U+F4+3BuYVGS9rgEuzYe/AsysaX0L5vkh8A5wD3BGwXYsXNdbgLvIAqz7Aq/k6f3JWjA1AXbO86nahjsVLON/gW8UfIe/zIe/XLC+NZX9MuBpoEX++dfAofnwnsALm3r/X89jphUwE5gLXE++vxdso4p8OBVsu58CI/Lh+4GB+fCZwAf58LbAZ/LhNmStdSLft1LBPjeB7DhtDiwAOuXp/69gP5sPDCvIawrQMv98IfDDIut1S75f/B/Zsf9L4LA8rzaF+wbZD8w5wOfqWNeaziHXAhfnw1/N528DdCE7vrbLx10PnFKwjK/lw/eQVS62A75YsM99t2DZzcjuonUgO47eBXYn2+f/WrAvVq9fkW2y2jgKjkey/Xtqvpw2wNKCcld9p8cBt+bDTfPvq0Ud5VwGdMjHfZP8mMs/71BkP/ugYHw58Ew+3ASYV/Ud+bdl/BV+nwVp63pdHQpMyIf3JwsQVJAFP18HysjON38E+ubTFT2G/dvy/lj/et1a57OazinUfr1aAXTLx90JfCsfnkPWQgCyH5pr1fEK1uEWsutR0etXfj59FTgwT/9MXqZT8/QdyK6Rfwf2qG0fJ7vmDMqH/wu4t2Cb1FW3LbpOrH6tuKJq+qrvIP9fdT3dhuxY3j//PJn8/L7GNpmP16Ot5o8iv3PW2OcmU/z8Xvi9Xw18Ox8+uGCaovVt1q6XXwOcVLBPVKV/sOay8s+DWLXvdyL/zbbGOvw3cHM+3JnsmtO82PoWzPPPqv2pyLjPFQxfzqrfeLcAvyE77/QB3gP2y/e1p1l1DkoF6/hDVv0+mgxcX8O2Lzzer2TVcX0B8It8+AusurbWq57t3/r92UJoI0kpvUf2Y3LNyP3uwCMRMRv4PtC1tnwiYgdgx5TSk3nSrWSBgyp35/+fJjsxQFbh+GW+jLvIgha1Frce6cWWU+gOVrWCGADckUdyewB3RcR
M4BdkLUGq3JVWNZf9CzA2v9OxY6r7UagJQNVzqf8F3FxkmkPJgi+klP4IfC7fnjVKKf0P2YnoUbImnQ/XMOm9KaVPU9aCqOqu76H5On2aUvoH8ETB9EfkUe/ZZAGewu99Yr7sKcBn8jsTtZX9vpTS8nz4SODafPvel8/fmi1MSukD4EtklajFZPvPqUUm/Zgs+AOr74v/Travw+rNcAMYGRGzgMeA3Vj1fS1IKf0lH76NbJvvA7yWUpqbp695vN2R//83suPqL/m2HwS0r2UV7yY7Lg4G/rTGuCER8SxZ0GgPsiBubeta0znkULILOSmlh8kCkgC9yLbt9LysvYDPFyyjah+fDTyZUvokH65aXm/glHzep8h+uFSVcVpK6Y2U0qdkAb2qeWpT7HxTmPZASumjlD2W8Tarvq8qDwFfiazl1teAKfnxUFc5XytYzyPzu4Q9U0rv1lrYlOYDSyPigHwZf0spLa3HemoztZ7X1S+TnSdIKc0iq9wCHAhMTiktzq9btxfkVdMxrC3Qetbr1jqf1XJOqe169VpKaWY+/DRQntcVWqeUqvqzqe8jKDVdv/YBFqWUpletb0Fd7PGU0rsppUrgeVZd72q7JleV53/Jrk/VajoG12GdjgSuq/qQUqq63h0fEc8AfyP7Htan/uv1aMvVKL9n8uHa6tuF9fK/Aj+IiAuB9gXpNbkLODYitiP7PXNLkWkKfxO8SBaU7VRHvrX5Qt4yaTZwEqufs36fUkpk++ZbKaXZeR3vOVZtr09ZtV2q6s9V7mANRY73/11j3arqrnNYdW1d13q21oF9CG1cVwHPsHqw4hpgbErpvog4nCx6uiE+yv+vZNX3ez7wFtld/iZAZR15LCW7w1VoJ6DwOfliyyl0HzAqInYi+/H5R6Al8K9UvK8eyO6SAJBSGh0RD5BF7P8vIo6srdwppQUR8VZEfIXsR/ZJRSaLYrPWlGdB3vOA8RHxS2BxRHyuyGQfFQzHGv9XL0T23O71ZHd7FkTWEWXzWsqU6ij7soK0JsC/1+OCs9nLg4OTgcn5RWoQa18YP8kvVFDzvljoJLI7919KKX0SEfNZte3ru90LVW37AP6QUhpYx/RVfkN2Lrg1pfRpZE8Mkp8DjiT7Dj+MiMkF5atpXWs6h9RU9siXe3GRcYXL+JR8v87LuG3B/OemlB5ZLdNs2YXHQX2+D1h1vqk6v9R0rimaZ0qpMt9OR5NV2ibWo5yF55q5EfElsnPNqIh4NA8E1+YmsrvkO5MFo7V1q+l6V+z6Uds5Y13PV9r8XcW61etqOp8VO6fUdr1aM58W1H29qknR61f+mEpNdaSa1qO++3idda+CstV3utXyjIgOZK0NDkwpvZM/4tK8yLyFvB5tXWr6PfNawee6fs/8FdgrfwyqL1kLGqihvp3X5wq/019HxFPAMWSB4tPyG7tF5XW/P5C1yDme7Kb0mtbnWH+OVb/H1nQLWUvWZ/Obr4cXjKvaPp+y+v7/KfU7vpcVGb/W8brGuJrS16WerXVgC6GNKKX0T7Kmvd8pSN6BVR3KDipIfx9Yq2VHfrfgnYJnQ08GnlxzujXsQHaX59N8+m3qmP5lsufDu0D27Df5IyN1zFdYzg+AaWRNLe9PWf877wGvRcR/5vlGRHyx2PwR0TGPQl9B1rS28xqTFNs+N5FFpu9MxTvmm0IeKMovwkvyMtUoIo6Jql/r2d2clcC/apunwJ+Bb0b2DHI7Vp1gqyokS/JWU2u+va3qmfFDgXfz77y+ZX8UGFxQ/m71LOtmJSL2ifw561w3sjsg9fV/ZE2vIbujU2UH4O28cn0Eq99d2DOyzqwBBpJ9fy+S3XndK0+v6Xj7P+CQqunyZ6BrvFuTUnodGE4WGCy0A/BOXiHoTHZHpC41nUP+TFaZICJ6s6pS9DjQP7L+s4isf6p1ucvyCHBWfveKiOgUES3rmKfo+Sw3mbxjw8j6MvoWq7emq4/fAN8Geublq3c5I2JX4MOU0m3
Az4DuRfL/pCqf3D1kj+EdWLA8baHW87paeE7+AtljY5Dd/T8sItrk+/PAeuSlLdQ61utqU+ycUtv1qlhZ3gHej4iq68aA2qYvUNP160WyuuCBeXrrghsD62pqQXlOIrs+FZa96DG4Duu0Zt3ns2SPuC0D3s3rYF+rRzkn4/Voq5H/FlkUEb0gq++Qbas/1zrj6nkksm08luyxsKoWWPWqb0fE54FXU0rjyG6W77/GJDX9nhkHTM/PMWsqvP50Intk7aU6VmUU8NPIOp4msjcsV7VubE22nbaj+A31ujRh1W+ZE6lj+6aU/kV2XFa1JCpcZmHddV+yR9RgHevZWjfendr4xlBwAiG7c3RXRCwk29k75Om/B34bWSeC566RxyDghsg6/XqV7MJTm+uBSXkg5gmKR2urpeytTt8Cbs5bs3wCnFZX09Ui7iBr+nh4QdpJZK1tRpA9yvYb4Nki834vrwCtJGuO/BCrP172BHBR3mxwVErpDrIT7c0Uf1wMsm19c2TNrz+kfhW1k4GfR8SHZM+xnpRSWrkqRlSrSWSP48wh6wvnKbIAz7/y1kazyZ5Xn77GfO9E9jrKz5A1F12Xsg8Brsun25bsonFmfQq7mWkFXBNZc/EVZH0nfLfWOVb3PbK3eP038ABZ3zaQPb7x+4iYQRbgfLFgnheAQRHxC7Kg6Pj8bt+3yY7Rbcm+q7XekpBSWpzfVZkYeafTwAiy772olNIviiQ/DJyZf38vkZ0T6nIZxc8hP8rLcwLZD9JFwPsppSX58fdoZB2/f0LWoV99A243kfdbkAdLF5PdNavNjcBDEbEopXTEGuN+THZOeJbsDtDD5I/irINHyR7duC+l9PE6lnM/4MqI+JRsW5xVQ/lnRcQzKaWTUkofR9ZB9r9qCD5r87Z9RLxR8Hks635dHc+qc/JMshsgpJQWRcTFZNeoAB5MKf2ugcuvzUt963U1quGcUtv1qibfIesiYBlZcKPOeltN16+8tcoJZNfiFsByshas62MIMCEivk92Li52fNV0DNZnnS4nq/vMIas3/iildHdE/I2sZcSrZF0R1MXr0dbnFLJ9Y0z++Ud5y/91cQdZ/e/UgrT61rdPAL4VEZ8A/yDrH6vQLGBFvs/dklL6eUrp6Yh4j5p/z1xPdqzMJqsjn5r/dqtxBVJKD+aB0cfyfTCxqkXZJWS/Uf5O9ttkXbuaWAZ0jYinyY7PE+qYHrLje0L++6owkHk9cGu+Xf9Gtn3eXZ96tuqvqtM7aYsXWS/0P08prdmz/iYTEa1SSh9E9pjZNLJOi/9Ry/STyTpc2yxepbqlyiuUy1NKKSIGkHUw3aeW6cvJWrJ9YWOVsbHlF8yVKaUVkbV8Gl/L45paB3kg7RngP1NKL2/q8kjasjXUOaWqzpEPXwTsklI6r4GKuUlsjevUkLwebX3ylmKTgc750x0lI2+Zt11+Q7YjWav2TgXBVTUCWwhpq5BXEs5i/Zo6Nqb781YuTYEf1xYMUoP6Ellnf0H2iN9/1T75VmlP4M68svgxcPomLs9WIW/CfD9wj5VvSRuqgc8px+Qt1LYlu9t/6gbmtznYGtepQXg92vpExCnAT4ChpRYMym0PPJE/vhbAWQaDGp8thCRJkiRJkkqMnUpLkiRJkiSVGANCkiRtxiLivyJidkTMiog5+csGapr28Ii4v4GWOznvm62+0/eMiOciYmbeCW3huJV5etXfRQ1RxoYQERURMW4Tl+GyiLighvSF+TabExH/UUc+8yOiTT48Nf9fHhEnFkyz3utb0z4REcdGxN8i4tmIeD4izqgjn6nrs3xJktSw7ENIkqTNVETsDgwHuqeU3o2IVkDZJi5WTU4CfpZSKvZmlOWba6fmeSf+m3NH/j9PKf0sIroAf4qItvXpWyKl1CMfLCd7FfCv8/QGXd+8r4cbgYNSSm/kHdqX17NskiRpE7KFkCRJm6+2wPvABwAppQ9SSq8BRMReEfFY3irjmfyNHACtIuK
3EfFiRNyed65ORPTKW3HMjogJVa9urSm9JsWmj4jTgOOBH0bE7fVduYg4MCKm5uswLSJaR0TziLg5z/9vEXFEPu2pEXF3RDwcES9HxE8L8hmYTz8nIq4oSP8gIq6IiKfzbXVQ3srl1arWNoWtqiKiVcGyZ0XENyNim4i4Jc97dkScX2Q9vhERT+XlfSyy1/tWtfCZULDMIQXzDI+IlyLiMWCfurZVSukFslcMt6lpfdco0wf54GigZ97K6Py61jdPHx8RMyJr8fWjOorWmuwG49K8nB+llF7K82kXEffk3++zEdFjjbIREd+PiOn58n+Up5VHxAsR8cu8DI9G3uqspv2+WD6SJKl2BoQkSdp8PQu8BbyW/3D/RsG424HrUkpfBHoAi/L0A4DvAfsCnwcOiYjmwC3ACSml/ch+wJ9VU3pNhalp+pTSTcB9wPdTSsXe9tgiVn9k7ISIaArcAZyXr8ORwHLgHIA8/4HArflyAboBJwD7ASdExB6RvaL3CuAr+fgDI6JvPn1LYHJK6UtkgbXLgaOA44D/KVLOS4B3U0r7pZT2B/6Y57lbSukLeZmKtYD6M/BvKaUDgN8AwwrGdQaOBg4CLo2I7SLiS8AAsu+qH3BgkTxXExEHA58C29WyvsVcBPwppdQtpfTzeqwvwPCUUgWwP3BYROxfU+YppX+Sffd/j4iJEXFSZG83BBgHPJl/v92B59ZYp97A3mTbphvwpYj4cj56b7L9uyvZ2yK/maevtd/XkY8kSaqBASFJkjZTKaWVwFeB/sBc4Od5q5PWZEGKe/LpKlNKH+azTUspvZE/VjST7PGdfYDXUkpz82luBb5cS3pN1nX6KsvzgETV3x15XotSStPzdXgvpbQCOBT43zztRbJXTXfK83k8pfRuSqkSeB5oTxZMmZxSWpzPf3tBmT4GHs6HZ5MFJz7Jh8uLlPNI4LqqDymld4BXgc9HxDUR8VXgvSLz7Q48EhGzge8DXQvGPZC3mlkCvA20A3qSvSr6w5TSe2QBlZqcHxEzgZ+RBcMqalnfdVVsfQGOj4hngL/l67JvbZmklE4DegHTgAuACfmorwDj82lWppTeXWPW3vnf34BnyIJne+fjXkspzcyHnwbKa9nva8tHkiTVwICQJEmbsZSZllIaRdaq5JtA1DLLRwXDK8la8dQ0fW35NMT0deWV1nEZ67JuAJ+klKqW8WnV/HmwrFg/imuVKQ+SfBGYTNZ66aYi810DXJu3IDoDaF4wrliZWXM5tfh5HkTrmVL6E438HUREB7KgTq+81dADrL4+RaWUZuctkI5iVWue+ix/VEGgcK+U0q/yceu6H9eUjyRJqoEBIUmSNlMRsWtEdC9I6gb8PW9V8kbVo0KR9eOzfS1ZvUjWwmKv/PPJwJO1pK9rPuvjRWDXiDgwX4fWEbEtMIWsg2oiohOwJ/BSLfk8RfZYU5uI2IbsMbP1LdOjwOCqDxHx2cje2tUkpTSJ7BGr7kXm2wFYmA8PqsdypgDHRUSLvNXLN+qaocC6ru/7ZP38FLPW+gKfAZYB7+Z9IX2ttsLk/RAdXpDUjaxVF8Dj5I8gRtYX02fWmP0R4L8i6yydiNgtItrWtKxa9vt1ykeSJGUMCEmStPnaDvhZZB1EzyR7ZOi8fNzJwJCImAVMBXauKZP8EatvA3fljzV9CtxQU/q65lOP9VizD6HRKaWP8/W5JiKeBf5A1hLlemCbPP87gFNTSh/VlHFKaRFwMfAEWZ9Lz6SUflePMhVzOfDZyDprfhY4AtgNmJxv/1vyZa3pMrJt8idgSV0LSSk9Q7ZuM4FJwJ/qW8D1WN9ZwIq8E+Y1O8Rea31TSs+SPXr1HNmjX3+po0gBDIusg+yZwI+AU/Nx5wFH5N/l06z+KB0ppUfJ3n7213ya31Jz8KrKWvv9euYjSVLJi1UtqSVJkiRJklQKbCEkSZIkSZJUYgwISZIkSZIklRgDQpIkSZIkSSXGgJAkSZIkSVKJMSAkSZI
kSZJUYgwISZIkSZIklRgDQpIkSZIkSSXm/wOT9eHUjvEIlQAAAABJRU5ErkJggg==\n", + "image/png": "", "text/plain": [ "
" ] @@ -298,7 +298,7 @@ "outputs": [ { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ "
" ] @@ -376,7 +376,7 @@ }, { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ "
" ] diff --git a/howard's advisory stuff/collating fields.ipynb b/howard's advisory stuff/collating fields.ipynb deleted file mode 100644 index 3acca0a..0000000 --- a/howard's advisory stuff/collating fields.ipynb +++ /dev/null @@ -1,385 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 108, - "id": "f7ebf578", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import os \n", - "filesdir = os.path.dirname('C:/Users/howar/Downloads/Telegram Desktop/umami_preprocessed/')\n", - "files = os.listdir(filesdir)\n", - "file = 'present.csv'\n", - "filedir = os.path.join(filesdir , file)\n", - "data = pd.read_csv(filedir)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 109, - "id": "9a8a8df6", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
visitor_urlvaluepercentindustriesorganisationschoolcourse of studysearch
0visitor_urlvaluepercentNaNNaNNaNNaNNaN
1/?size=n_20_n2780025%NaNNaNNaNNaNNaN
2/2780012%NaNNaNNaNNaNNaN
3/?size=n_20_n&filters[0][field]=industries&fil...136001%Banking and FinanceNaNNaNNaNNaN
4/?current=n_2_n&size=n_20_n136001%NaNNaNNaNNaNNaN
...........................
187/?size=n_20_n&filters[0][field]=industries&fil...520%SecurityNaNNaNNaNNaN
188/?size=n_20_n&filters[0][field]=organisation&f...510%NaNMcKinsey & CompanyNaNNaNNaN
189/?current=n_2_n&size=n_20_n&filters[0][field]=...510%NaNBoston\\n Consulting GroupNaNNaNNaN
190/?current=n_4_n&size=n_20_n&filters[0][field]=...510%Data\\n Science & AnalyticsNaNNaNNaNNaN
191/?current=n_5_n&size=n_20_n&filters[0][field]=...510%Data\\n Science & AnalyticsNaNNaNNaNNaN
\n", - "

192 rows × 8 columns

\n", - "
" - ], - "text/plain": [ - " visitor_url value percent \\\n", - "0 visitor_url value percent \n", - "1 /?size=n_20_n 27800 25% \n", - "2 / 27800 12% \n", - "3 /?size=n_20_n&filters[0][field]=industries&fil... 13600 1% \n", - "4 /?current=n_2_n&size=n_20_n 13600 1% \n", - ".. ... ... ... \n", - "187 /?size=n_20_n&filters[0][field]=industries&fil... 52 0% \n", - "188 /?size=n_20_n&filters[0][field]=organisation&f... 51 0% \n", - "189 /?current=n_2_n&size=n_20_n&filters[0][field]=... 51 0% \n", - "190 /?current=n_4_n&size=n_20_n&filters[0][field]=... 51 0% \n", - "191 /?current=n_5_n&size=n_20_n&filters[0][field]=... 51 0% \n", - "\n", - " industries organisation school \\\n", - "0 NaN NaN NaN \n", - "1 NaN NaN NaN \n", - "2 NaN NaN NaN \n", - "3 Banking and Finance NaN NaN \n", - "4 NaN NaN NaN \n", - ".. ... ... ... \n", - "187 Security NaN NaN \n", - "188 NaN McKinsey & Company NaN \n", - "189 NaN Boston\\n Consulting Group NaN \n", - "190 Data\\n Science & Analytics NaN NaN \n", - "191 Data\\n Science & Analytics NaN NaN \n", - "\n", - " course of study search \n", - "0 NaN NaN \n", - "1 NaN NaN \n", - "2 NaN NaN \n", - "3 NaN NaN \n", - "4 NaN NaN \n", - ".. ... ... \n", - "187 NaN NaN \n", - "188 NaN NaN \n", - "189 NaN NaN \n", - "190 NaN NaN \n", - "191 NaN NaN \n", - "\n", - "[192 rows x 8 columns]" - ] - }, - "execution_count": 109, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data" - ] - }, - { - "cell_type": "code", - "execution_count": 155, - "id": "fc76fd3c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
value
search
\"harish kumar malaga\"83
gic110
google266
harish112
healthcare123
investment banking136
product52
\n", - "
" - ], - "text/plain": [ - " value\n", - "search \n", - "\"harish kumar malaga\" 83\n", - "gic 110\n", - "google 266\n", - "harish 112\n", - "healthcare 123\n", - "investment banking 136\n", - "product 52" - ] - }, - "execution_count": 155, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data = pd.read_csv(filedir)\n", - "data = data.drop(data[data['search'].isnull()].index) #dropping Nan for column, industries onwards\n", - "data = data.loc[:,['value', 'search']]\n", - "data.iloc[:,0] = data.iloc[:,0].apply(int)\n", - "data = data.groupby(by='search').sum()\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": 156, - "id": "bc639281", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'value': {'\"harish kumar malaga\"': 83,\n", - " 'gic': 110,\n", - " 'google': 266,\n", - " 'harish': 112,\n", - " 'healthcare': 123,\n", - " 'investment banking': 136,\n", - " 'product': 52}}" - ] - }, - "execution_count": 156, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data.to_dict()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "45bcd10d", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/howard's advisory stuff/getting data.ipynb b/howard's advisory stuff/getting data.ipynb deleted file mode 100644 index ef0a829..0000000 --- a/howard's advisory stuff/getting data.ipynb +++ /dev/null @@ -1,907 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 3, - "id": "a53dc1ba", - "metadata": {}, - "outputs": 
[ - { - "data": { - "text/plain": [ - "['2020w1.csv',\n", - " '2020w2.csv',\n", - " '2020w3.csv',\n", - " '2021w1.csv',\n", - " '2022.csv',\n", - " 'E-Scholars Guidebook AY2324.pdf',\n", - " 'umami_dumpfile_v2',\n", - " 'umami_preprocessed',\n", - " 'umami_preprocessed.zip']" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#gettin files\n", - "import os \n", - "filesdir = os.path.dirname('C:/Users/howar/Downloads/Telegram Desktop/')\n", - "files = os.listdir(filesdir)\n", - "files" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "e75f9905", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Mentor 1 2\n", - "Mentor 2 4\n", - "Mentor 3 10\n", - "Mentor 4 540\n", - "Unnamed: 4 1045\n", - "dtype: int64\n" - ] - }, - { - "data": { - "text/plain": [ - "{'-',\n", - " '.',\n", - " '1.\\tJayasutha Samuthiran',\n", - " 'A',\n", - " 'Aaron Ng',\n", - " 'Aaron Tay',\n", - " 'Adrian Chew',\n", - " 'Adrian Liew',\n", - " 'Akshay Maheshwari',\n", - " 'Alayana Ling',\n", - " 'Alayna Liang',\n", - " 'Alayna Ling',\n", - " 'Alayna Ying',\n", - " 'Alvin Seo',\n", - " 'Alvin Tan',\n", - " 'Alvin Tan, LinkedIn',\n", - " 'Alvona Loh',\n", - " 'Alvona Loh (Doctor)',\n", - " 'Alvona Loh (Doctor, Institute of Mental Health)',\n", - " 'Alvona Loh (Institute of Mental Health)',\n", - " 'Alvona Zhang',\n", - " 'Amra Naidoo',\n", - " 'Amra Naidoo, Co-Founder, Accelerating Asia',\n", - " 'Andrew Pang',\n", - " 'Andrew Tan',\n", - " 'Andrew tan',\n", - " 'Andrezj Surzyn',\n", - " 'Andrzej Surzyn',\n", - " 'Ang Eng Hieang',\n", - " 'Anira Perera',\n", - " 'Arrchana Muruganantham',\n", - " 'Arrchana Muruganatham',\n", - " 'Bhargav Sriganesh',\n", - " 'Bianca Stringuini',\n", - " 'Biance Stringuini',\n", - " 'Billy Loh',\n", - " 'Brain Liu',\n", - " 'Brian Liu',\n", - " 'Brian Liu: Lazada Group',\n", - " 'Bryan Low',\n", - " 'C',\n", - " 'Carol Chuah',\n", - " 'Carol 
Soon',\n", - " 'Celine Teoh',\n", - " 'Celine Toh',\n", - " 'Celine Toh, McKinsey',\n", - " 'Cheah Wui Ling',\n", - " 'Chia Lih Wei',\n", - " 'Chin Hui Lim',\n", - " 'Chin Hui Lin',\n", - " 'Choon Hong Tay',\n", - " 'Chow Yeh Wah',\n", - " 'Chow Yew Wah',\n", - " 'Choy Mun Kit',\n", - " 'Chris Chia',\n", - " 'Chris Tan',\n", - " 'Christian Cadeo',\n", - " 'Christian Caedo',\n", - " 'Christine Kim',\n", - " 'Chua Ruo Mei',\n", - " 'Chua Seng Tat',\n", - " 'Chua Seng Tat-',\n", - " 'Chun Huey Yei',\n", - " 'Cindy Chng',\n", - " 'Clarence Quek',\n", - " 'Cliff Hartono',\n", - " 'Clinton Yip',\n", - " 'Clinton yip',\n", - " 'Colin Lee',\n", - " 'Crystal Koh',\n", - " 'Damian Ngiam',\n", - " 'Damian Ngiam (Tiktok)',\n", - " 'Damson Capital',\n", - " 'Daniel Teh',\n", - " 'David Chua',\n", - " 'Davis Tan',\n", - " 'Davis Tan, Senior Associate, Rajah & Tann',\n", - " 'Debby Caroline',\n", - " 'Derrick Sim',\n", - " 'Desmond Koh',\n", - " 'Desmond Koh (Managing Director (SEA) at BNP Paribas Wealth Management)',\n", - " 'Desmond koh',\n", - " 'Divya Jagtiani',\n", - " 'Dr Alvona Loh',\n", - " 'Dr Carol Soon',\n", - " 'Dr Cheah Wui Ling',\n", - " 'Dr Geraldine Tan',\n", - " 'Dr Hamid Rahmatullah Bin Abd Razak',\n", - " 'Dr Hamid Razak',\n", - " 'Dr Janice Soo',\n", - " 'Dr Li Jingmei',\n", - " 'Dr Mala Satkunanantham',\n", - " 'Dr Neo Mei Lin',\n", - " 'Dr. Alvona Loh',\n", - " 'Dr. Cheah Wei Ling',\n", - " 'Dr. Cheah Wui Ling',\n", - " 'Dr. Geraldine Tan',\n", - " 'Dr. 
Neo Mei Lin',\n", - " 'Duo Geng',\n", - " 'Duo Geng Goh',\n", - " 'E',\n", - " 'Edmond Twohill',\n", - " 'Edmund Teo',\n", - " 'Edmund Twohill',\n", - " 'Edmund twohill',\n", - " 'Edward Yee',\n", - " 'Edwin Lee',\n", - " 'Edwin Lee from DXC Tech',\n", - " 'Eileen Lee',\n", - " 'Emil Tan',\n", - " 'Esther Seah',\n", - " 'Eugene Wee',\n", - " 'Felicia Ng',\n", - " 'Felicia ng',\n", - " 'Fong Yoong Kheong',\n", - " 'Foo Wan Xuan',\n", - " 'Foo Wan Xuan (Wildlife Reserves Singapore)',\n", - " 'G',\n", - " 'Genie Gan',\n", - " 'Geraldine Tan',\n", - " 'Geraldine Tan (Director and Principal Psychologist)',\n", - " 'Goh Duo Deng',\n", - " 'Goh Duo Geng',\n", - " 'Goh Duo Seng',\n", - " 'Grace Lee',\n", - " 'Grace Lee Khoo',\n", - " 'Grace Lee-Khoo',\n", - " 'Grace Ng',\n", - " 'Grace Ng (Senior Scriptwriter)',\n", - " \"Grace Ng (Senior Scriptwriter, Mediacorp Studios' English Drama Productions)\",\n", - " 'Grace Tong',\n", - " 'Grace Zhu',\n", - " 'Grace zhu - ministry on social development (disability)',\n", - " 'Guo Duo Sheng',\n", - " 'Hamid Razak',\n", - " 'Hannah Lim',\n", - " 'Harsh Raghuvir',\n", - " 'Hemaa Sakar',\n", - " 'Hemaa Sekar',\n", - " 'Heng Kai Le',\n", - " 'I Naishad Kai Ren',\n", - " \"I couldn't really find anyone specifically in the biotech/microbio area, it was more data science, but if there are others available I dont mind!\",\n", - " 'Irnina Wong',\n", - " 'Isabel Lee',\n", - " 'Jacklyn Seow',\n", - " 'Jacky Yap',\n", - " 'Jaclyn Seow',\n", - " 'Jaclyn Seow (Head of ESG and Impact at Openspace Ventures)',\n", - " 'Jacyln Seow',\n", - " 'Jamie Kloor',\n", - " 'Janice Loh',\n", - " 'Janice Soo',\n", - " 'Jared Kang',\n", - " 'Jared Kang, State Counsel, AGC',\n", - " 'Jaslyn Seah',\n", - " 'Jason Ong',\n", - " 'Jason Ong (Hubspot)',\n", - " 'Jayasutha Samuthiran',\n", - " 'Jeanne Tai',\n", - " 'Jee Soo Lee',\n", - " 'Jeremy Chia',\n", - " 'Jerviel',\n", - " 'Jerviel Human resource - awwa',\n", - " 'Jerviel Lim',\n", - " 'Jessica Tan',\n", - " 
'Jiang Xin Yu',\n", - " 'Jiang Xinyu',\n", - " 'Jiaxi Zhang',\n", - " 'Jillian Lye',\n", - " 'Jimmy Ong',\n", - " 'Jimmy Sia',\n", - " 'Jimmys Sim',\n", - " 'Jocelyn - pathlight',\n", - " 'Jocelyn Low',\n", - " 'Johanna Tay',\n", - " 'John Wu',\n", - " 'Johnathan Kuek',\n", - " 'Jonathan Ang',\n", - " 'Jonathan Kuek',\n", - " 'Jonathan Lee',\n", - " 'Jonathan Lee, Associate General Counsel, Facebook',\n", - " 'Joshua AU',\n", - " 'Joshua Au',\n", - " 'Joshua Au (Head of the Data Centre)',\n", - " 'Joshua Tan',\n", - " 'Juhi Ramireddi',\n", - " 'Justin Ho',\n", - " 'Justin Ho (Vice President, JP Morgan Chase)',\n", - " 'Kamil Haque',\n", - " 'Kanitha Jagatheson',\n", - " 'Karen Sim',\n", - " 'Karen Sim (Senior Sustainability Strategist at Forum for the Future)',\n", - " 'Karpagam Venkkatesan',\n", - " 'Kenneth Bok',\n", - " 'Kenneth Lau',\n", - " 'Kenneth Law',\n", - " 'Kenneth Tay',\n", - " 'Kenny Sng',\n", - " 'Kevin Low',\n", - " 'Kevin Low (Infrastructure Asia)',\n", - " 'Kia Liang Fua',\n", - " 'Koh Ching Ching',\n", - " 'Kok Ching Ching',\n", - " 'Kristene Chan',\n", - " 'Kush Sagar',\n", - " 'Kush Sagar, Facebook',\n", - " 'Lai-Yee Soh',\n", - " 'Lavinia Thanapathy',\n", - " 'Leanne Thachil',\n", - " 'Lee Boon Pin',\n", - " 'Lee Hui Min',\n", - " 'Lee Jee Soo',\n", - " 'Lee Jee Soo, BCG',\n", - " 'Lee Keng Leong',\n", - " 'Lee Sing Kok',\n", - " 'Lee Sing-Kok',\n", - " 'Lee Sing-Kok (SK)',\n", - " 'Lee Wei An',\n", - " 'Lee Wei An from Uber Eats',\n", - " 'Lee Zhihan',\n", - " 'Leon Khee Pay',\n", - " 'Leon Khee Pay (Multi-Asset Product Executive, Schroders)',\n", - " 'Leon Toh',\n", - " 'Leslie Lim',\n", - " 'Li Jingmei',\n", - " 'Li Jingmei (Senior Research Scientist)',\n", - " 'Lim Boon Pin',\n", - " 'Lim Boon Pin (Policy & Planning at Municipal Services Office)',\n", - " 'Lim Huishan',\n", - " 'Lim Wei Jie',\n", - " 'Lim Wei Jie (Foreword)',\n", - " 'Ling Han',\n", - " 'Lionel Chong',\n", - " 'Lionel Choong',\n", - " 'Lionel Tan',\n", - " 'Loke Jia 
Li',\n", - " 'Low Siew Ling',\n", - " 'Mala Satkunanantham',\n", - " 'Malminderjit Singh',\n", - " 'Mark Lee',\n", - " 'Mark Tang',\n", - " 'Mark Yong',\n", - " 'Matthew Wong',\n", - " 'Mavis Tan',\n", - " 'Mavis Tan, Senior Consultant E&Y',\n", - " 'Mdm Eileen Lee',\n", - " 'Mdm Jayasutha Samuthiran',\n", - " 'Mdm Vivian Yeong',\n", - " 'Meera Sachdeva',\n", - " 'Melissa Low',\n", - " 'Melissa Low ( Energy Studies Institute )',\n", - " 'Melissa Luki',\n", - " 'Melissa Luki (Cistri)',\n", - " 'Michael Ngo',\n", - " 'Michelle Carvalho',\n", - " 'Mike Brown',\n", - " 'Minh (Mark) Vu',\n", - " 'Miss Grace Tong',\n", - " 'Miss Janice Soo',\n", - " 'Miss Jayasutha Samuthiran',\n", - " 'Miss Jeanne Tai',\n", - " 'Miss Jillian Lye',\n", - " 'Miss Johanna Tay',\n", - " 'Miss Mala Satkunanantham',\n", - " 'Miss Nicole Teoh',\n", - " 'Miss Shennon Ho',\n", - " 'Miss Stephanie Siow',\n", - " 'Miss Valerie Lim',\n", - " 'Mr Aaron Ng',\n", - " 'Mr Adrian Liew',\n", - " 'Mr Akshay Maheshwari',\n", - " 'Mr Alvin Seo',\n", - " 'Mr Alvin Tan',\n", - " 'Mr Andrew Pang',\n", - " 'Mr Andrzej Surzyn',\n", - " 'Mr Andrzej Surzyn (Cerberus Capital Management)',\n", - " 'Mr Bhargav Sriganesh',\n", - " 'Mr Billy Loh',\n", - " 'Mr Brian Liu',\n", - " 'Mr Chris Tan',\n", - " 'Mr Christian Cadeo',\n", - " 'Mr Cliff Hartono',\n", - " 'Mr David Chua',\n", - " 'Mr Desmond Koh',\n", - " 'Mr Edward Yee',\n", - " 'Mr Emil Tan',\n", - " 'Mr Eugene Wee',\n", - " 'Mr Hamid Razak',\n", - " 'Mr Jared Kang',\n", - " 'Mr Jimmy Sia',\n", - " 'Mr John Wu',\n", - " 'Mr Jonathan Kuek',\n", - " 'Mr Jonathan Lee',\n", - " 'Mr Kenneth Lau',\n", - " 'Mr Kenny Sng',\n", - " 'Mr Kevin Low',\n", - " 'Mr Kush Sagar',\n", - " 'Mr Leon Toh',\n", - " 'Mr Leslie Lim',\n", - " 'Mr Lim Boon Pin',\n", - " 'Mr Lim Wei Jie',\n", - " 'Mr Lionel Choong',\n", - " 'Mr Loke Jia Li',\n", - " 'Mr Mark Yong',\n", - " 'Mr Naishad Kai-Ren I',\n", - " 'Mr Nakul Asjia',\n", - " 'Mr Navjeev Singh',\n", - " 'Mr Nesh Sooriyan',\n", - " 'Mr 
Rahul Daswani',\n", - " 'Mr Raymond Tay',\n", - " 'Mr Rondy Krish',\n", - " 'Mr Sanjay Nair',\n", - " 'Mr Simon Phua',\n", - " 'Mr Tay Choon Hong',\n", - " 'Mr Teddy Low',\n", - " 'Mr Xu Si Han',\n", - " 'Mr Yujie Tag',\n", - " 'Mr Yuvan Mohan',\n", - " 'Mr. Alvin Tan',\n", - " 'Mr. Bhargav Sriganesh',\n", - " 'Mr. Davis Tan',\n", - " 'Mr. Desmond Koh',\n", - " 'Mr. Kenneth Tay',\n", - " 'Mr. Leon Toh',\n", - " 'Mrinalini Vekatachalam',\n", - " 'Mrinalini Venkatachalam',\n", - " 'Ms Alayna Ling',\n", - " 'Ms Alvona Loh',\n", - " 'Ms Amra Naidoo',\n", - " 'Ms Carol Chuah',\n", - " 'Ms Celine Toh',\n", - " 'Ms Cheah Wui Ling',\n", - " 'Ms Chin Hui Lin',\n", - " 'Ms Christine Kim',\n", - " 'Ms Crystal Koh',\n", - " 'Ms Eileen Lee',\n", - " 'Ms Esther Seah',\n", - " 'Ms Foo Wan Xuan',\n", - " 'Ms Genie Gan',\n", - " 'Ms Geraldine Tan',\n", - " 'Ms Grace Ng',\n", - " 'Ms Grace Zhu',\n", - " 'Ms Jaclyn Seow',\n", - " 'Ms Jamie Kloor',\n", - " 'Ms Janice Soo',\n", - " 'Ms Jaslyn Seah',\n", - " 'Ms Jessica Tan',\n", - " 'Ms Jiaxi Zhang',\n", - " 'Ms Jillian Lye',\n", - " 'Ms Jocelyn Low',\n", - " 'Ms Juhi Ramireddi',\n", - " 'Ms Kanitha Jagatheson',\n", - " 'Ms Karen Sim (Forum for the future)',\n", - " 'Ms Koh Ching Ching',\n", - " 'Ms Lavinia Thanapathy',\n", - " 'Ms Leanne Thachil',\n", - " 'Ms Lee Jee Soo',\n", - " 'Ms Li Jingmei',\n", - " 'Ms Ling Han',\n", - " 'Ms Mala Satkunanantham',\n", - " 'Ms Mavis Tan',\n", - " 'Ms Meera Sachdeva',\n", - " 'Ms Melissa Low',\n", - " 'Ms Nadia Yeo',\n", - " 'Ms Neo Mei Lin',\n", - " 'Ms Nicole Teoh (Changi Airports International)',\n", - " 'Ms Quek Jiahui',\n", - " 'Ms Rashi Tulshyan',\n", - " 'Ms Regine Chan',\n", - " 'Ms Ruo Mei Chua',\n", - " 'Ms Samantha Thian',\n", - " 'Ms Samyukta Venkatraman',\n", - " 'Ms Serene Goh',\n", - " 'Ms Stephanie Siow',\n", - " 'Ms Sulin Tan',\n", - " 'Ms Valerie Lim',\n", - " 'Ms Velda Wong',\n", - " 'Ms Vivian Yeong',\n", - " 'Ms. Celine Toh',\n", - " 'Ms. Felicia Ng',\n", - " 'Ms. 
Vicki Wong',\n", - " 'Muhammad Ashiq Chu',\n", - " 'Musarrat Maisha Reza',\n", - " 'Musarrat Maisha Reza (lecturer)',\n", - " 'N.A',\n", - " 'N.A.',\n", - " 'N.i.l',\n", - " 'NIL',\n", - " 'Nadia Yeo',\n", - " 'Nadia Yeo, Deputy Director (Legislation and Policy Advisory, Ministry of Home Affairs (MHA)',\n", - " 'Naishad I',\n", - " 'Naishad Kai Ren',\n", - " 'Naishad Kai-Ren I',\n", - " 'Nakul Asija',\n", - " 'Nakul Asija, the founder of The Gosto Foods Co.',\n", - " 'Nakul Asjia',\n", - " 'Nanthinee Jevanandam',\n", - " 'Nanthinee jevanandam',\n", - " 'Narjeev Singh',\n", - " 'Nathan Ong',\n", - " 'Nathan ong',\n", - " 'Navjeev Singh',\n", - " 'Neo Mei Lin',\n", - " 'Neo Meilin',\n", - " 'Nesh Sooriyan',\n", - " 'Nicole Teoh',\n", - " 'Nicole Teoh (Asset Management Analyst, Changi Airports International)',\n", - " 'Nil',\n", - " 'None',\n", - " \"None as I'm looking for a PM mentor and she's the only one\",\n", - " 'Not sure yet',\n", - " 'Peng Jing Kai',\n", - " 'Peng Jingkai',\n", - " 'Professor Cheah Wui Ling',\n", - " 'Quek Jia Hui',\n", - " 'Quek Jiahui',\n", - " 'Rahul Daswani',\n", - " 'Rashi Tulshyan',\n", - " 'Raymond Tay',\n", - " 'Rebecca Tan',\n", - " 'Regine Chan',\n", - " 'Remus Tan',\n", - " 'Rick Liu',\n", - " 'Rondy Krish',\n", - " 'Ruo Mei Chua',\n", - " 'Samantha Thian',\n", - " 'Samantha Thian (Seastainable)',\n", - " 'Samyukta Venkatraman',\n", - " 'Sanjay Nair',\n", - " 'Sarah Song',\n", - " 'Sarkunan Chandra',\n", - " 'Seah Wen Yan Jasyln',\n", - " 'Seastainable',\n", - " 'Serene Goh',\n", - " 'Shankar Venugopal',\n", - " 'Shanthakumar Bannirchelvam',\n", - " 'Shanthakumar Bannirchelvam (Managing Partner at Global Impact Partners)',\n", - " 'Shennon Ho',\n", - " 'Shermin Ho',\n", - " 'Simon Gwozdz',\n", - " 'Simon Phua',\n", - " 'Simon Phua, Facebook',\n", - " 'Soh Lai Yee',\n", - " 'Soh Lai Yi',\n", - " 'Sophia Tan',\n", - " 'Stephanie Siow',\n", - " 'Stephanie Sutanto',\n", - " 'Stephanie Sutanto: Rakuten Viki',\n", - " 'Sugidha 
Nithiananthan',\n", - " 'Suhaimi bin Zainal Shah',\n", - " 'Suhaimi bin Zainal Shah- Educational Technology Officer- Ministry of Education Singapore',\n", - " 'Sujatha Selvakumar',\n", - " 'Sujatha Selvakumar (Legislative Assistant, Singapore Parliament)',\n", - " 'Sulin Tan',\n", - " 'Sulin Tan: Carousell',\n", - " 'Tag Yujie',\n", - " 'Tan Chia Boon',\n", - " 'Tan Su Lin',\n", - " 'Tan Wei Xiong',\n", - " 'Tay Choon Hong',\n", - " 'Teddy Low',\n", - " 'Teddy Low, Assistant Director for Training Programmes, MFA',\n", - " 'Three is enough',\n", - " 'Tristan Theurier',\n", - " 'Valerie Lim',\n", - " 'Valerie Lim (Gojek)',\n", - " 'Velda Wong',\n", - " 'Vera Chng',\n", - " 'Vera Chng (Communications and Engagement Planning Officer [Strategic Communications], Ministry of Education)',\n", - " 'Vicki Wong',\n", - " 'Vivian Yeong',\n", - " 'Vivian Yeong @Verz Design',\n", - " 'Vivian yeong',\n", - " 'Walton Zhang',\n", - " 'Wendy Cheong',\n", - " 'Wendy Cheong (Duke-NUS Medical School)',\n", - " 'Wilson Tang',\n", - " 'Wong Yoke Yong',\n", - " 'Xin Yu Jiang',\n", - " 'XinYu Jiang',\n", - " 'Xingyu Jiang',\n", - " 'Xinyu Blair Jiang',\n", - " 'Xinyu Jiang',\n", - " 'Xu Si Han',\n", - " 'Ye-Her Wu',\n", - " 'Yeher Wu',\n", - " 'Yip Ren Kai',\n", - " 'Yu Jie',\n", - " 'Yu Jie Tag',\n", - " 'Yuan Ning (Mock)',\n", - " 'Yuan Ning Mock',\n", - " 'Yujie Tag',\n", - " 'Yujie Tag (Associate, McKinsey and Co)',\n", - " 'Yujie Tang',\n", - " 'Yuvan Mohan',\n", - " 'Yuvaraj Anandan',\n", - " 'alvona loh',\n", - " 'celine toh',\n", - " 'christine kim',\n", - " 'edward yee',\n", - " 'g',\n", - " 'geraldine tan',\n", - " 'grace tong',\n", - " 'jacky yap',\n", - " 'jonathan kuek',\n", - " 'justin ho',\n", - " 'kamil haque',\n", - " 'karen sim',\n", - " 'mala satkunanantham',\n", - " 'miss alvona loh',\n", - " 'mr hamid razak',\n", - " 'na',\n", - " nan,\n", - " 'nil',\n", - " 'regine chan',\n", - " '–'}" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": 
"execute_result" - } - ], - "source": [ - "#getting list of mentors\n", - "\n", - "import pandas as pd\n", - "file = '2020w1.csv'\n", - "filedir = os.path.join(filesdir , file)\n", - "data = pd.read_csv(filedir)\n", - "print(data.isnull().sum())\n", - "data.dropna(how='all', inplace=True)\n", - "mentors = set()\n", - "\n", - "for column in range(4):\n", - " for mentor in data.iloc[:, column]:\n", - " mentors.add(mentor)\n", - " \n", - "mentors" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "e132af95", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
01
0NaN548
1Regine Chan85
2Alayna Ling78
3Celine Toh75
4Geraldine Tan72
.........
511Jimmys Sim1
512Celine Teoh1
513Johnathan Kuek1
514Christian Caedo1
515Leon Khee Pay (Multi-Asset Product Executive, ...1
\n", - "

516 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " 0 1\n", - "0 NaN 548\n", - "1 Regine Chan 85\n", - "2 Alayna Ling 78\n", - "3 Celine Toh 75\n", - "4 Geraldine Tan 72\n", - ".. ... ...\n", - "511 Jimmys Sim 1\n", - "512 Celine Teoh 1\n", - "513 Johnathan Kuek 1\n", - "514 Christian Caedo 1\n", - "515 Leon Khee Pay (Multi-Asset Product Executive, ... 1\n", - "\n", - "[516 rows x 2 columns]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#finding frequency of mentors\n", - "\n", - "frequency_dic = {x:0 for x in mentors}\n", - "print(type(frequency_dic))\n", - "for column in range(4):\n", - " for mentor in data.iloc[:, column]:\n", - " frequency_dic[mentor] += 1\n", - "\n", - "sorted_data = pd.DataFrame(sorted(list(frequency_dic.items()),key=lambda x: x[1], reverse=True))\n", - "sorted_data\n" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "bb5b80df", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "#plotting frequency of mentors\n", - "import matplotlib.pyplot as plt\n", - "\n", - "plt.bar(sorted_data.iloc[1:200,0], sorted_data.iloc[1:200,1])" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "c22a2500", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
01
0NaN548
1Regine Chan85
2Alayna Ling78
3Celine Toh75
4Geraldine Tan72
5-55
6Desmond Koh52
7Janice Soo51
8Alvin Tan51
9Alvona Loh51
\n", - "
" - ], - "text/plain": [ - " 0 1\n", - "0 NaN 548\n", - "1 Regine Chan 85\n", - "2 Alayna Ling 78\n", - "3 Celine Toh 75\n", - "4 Geraldine Tan 72\n", - "5 - 55\n", - "6 Desmond Koh 52\n", - "7 Janice Soo 51\n", - "8 Alvin Tan 51\n", - "9 Alvona Loh 51" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sorted_data[:10]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6fed40cc", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/howard's advisory stuff/new/clean.ipynb b/howard's advisory stuff/new/clean.ipynb deleted file mode 100644 index b37bdce..0000000 --- a/howard's advisory stuff/new/clean.ipynb +++ /dev/null @@ -1,1032 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Reformating data from GA4" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from urllib.parse import urlparse, parse_qs, unquote" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.read_csv('data-export.csv')\n", - "df.head(15)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Ignore Rows with Metadata" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Landing pageSessionsUsersNew usersAverage engagement time per sessionConversionsTotal revenue
0NaN373582121521361105.798329700
1/%3Fsize=n_20_n43730724036.3821510300
2(not set)23415114.52564102600
3/%3Fsize=n_60_n1710000
4/%3Fcurrent=n_2_n&q=software%20&size=n_20_n169980.87500
\n", - "
" - ], - "text/plain": [ - " Landing page Sessions Users New users \\\n", - "0 NaN 37358 21215 21361 \n", - "1 /%3Fsize=n_20_n 437 307 240 \n", - "2 (not set) 234 151 1 \n", - "3 /%3Fsize=n_60_n 17 1 0 \n", - "4 /%3Fcurrent=n_2_n&q=software%20&size=n_20_n 16 9 9 \n", - "\n", - " Average engagement time per session Conversions Total revenue \n", - "0 105.7983297 0 0 \n", - "1 36.38215103 0 0 \n", - "2 4.525641026 0 0 \n", - "3 0 0 0 \n", - "4 80.875 0 0 " - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_cleaned = df[10:]\n", - "\n", - "# take first row as heading and reset index\n", - "df_cleaned.columns = df_cleaned.iloc[0]\n", - "\n", - "# remove index from header row\n", - "df_cleaned = df_cleaned[1:]\n", - "df_cleaned = df_cleaned.reset_index(drop=True)\n", - "df_cleaned = df_cleaned.rename_axis(None, axis=1)\n", - "\n", - "df_cleaned.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Landing pageSessionsUsers
0NaN3735821215
1/%3Fsize=n_20_n437307
2(not set)234151
3/%3Fsize=n_60_n171
4/%3Fcurrent=n_2_n&q=software%20&size=n_20_n169
\n", - "
" - ], - "text/plain": [ - " Landing page Sessions Users\n", - "0 NaN 37358 21215\n", - "1 /%3Fsize=n_20_n 437 307\n", - "2 (not set) 234 151\n", - "3 /%3Fsize=n_60_n 17 1\n", - "4 /%3Fcurrent=n_2_n&q=software%20&size=n_20_n 16 9" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# drop columns\n", - "df_cleaned = df_cleaned.drop(['New users', 'Average engagement time per session', 'Conversions', 'Total revenue'], axis=1)\n", - "df_cleaned.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Landing pageSessions
0NaN37358
1/%3Fsize=n_20_n437
2(not set)234
3/%3Fsize=n_60_n17
4/%3Fcurrent=n_2_n&q=software%20&size=n_20_n16
\n", - "
" - ], - "text/plain": [ - " Landing page Sessions\n", - "0 NaN 37358\n", - "1 /%3Fsize=n_20_n 437\n", - "2 (not set) 234\n", - "3 /%3Fsize=n_60_n 17\n", - "4 /%3Fcurrent=n_2_n&q=software%20&size=n_20_n 16" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# drop number of users\n", - "df_cleaned = df_cleaned.drop(['Users'], axis=1)\n", - "\n", - "# we will be exploding by sessions later" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Landing pageSessionsquery_params
0nan37358{}
1/%3Fsize=n_20_n437{'size': ['n_20_n']}
2(not set)234{}
3/%3Fsize=n_60_n17{'size': ['n_60_n']}
4/%3Fcurrent=n_2_n&q=software%20&size=n_20_n16{'current': ['n_2_n'], 'q': ['software '], 'si...
\n", - "
" - ], - "text/plain": [ - " Landing page Sessions \\\n", - "0 nan 37358 \n", - "1 /%3Fsize=n_20_n 437 \n", - "2 (not set) 234 \n", - "3 /%3Fsize=n_60_n 17 \n", - "4 /%3Fcurrent=n_2_n&q=software%20&size=n_20_n 16 \n", - "\n", - " query_params \n", - "0 {} \n", - "1 {'size': ['n_20_n']} \n", - "2 {} \n", - "3 {'size': ['n_60_n']} \n", - "4 {'current': ['n_2_n'], 'q': ['software '], 'si... " - ] - }, - "execution_count": 53, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_preprocessed = df_cleaned.copy(deep=True)\n", - "\n", - "# expand landing page links to utm query params\n", - "# example: /%3Fcurrent=n_2_n&q=software%20&size=n_20_n\n", - "\n", - "# Define a function to extract query params from a URL\n", - "def extract_query_params(url):\n", - " url = unquote(url)\n", - " query_params = parse_qs(urlparse(url).query)\n", - " return query_params\n", - "\n", - "df_preprocessed['Landing page'] = df_preprocessed['Landing page'].astype(str)\n", - "df_preprocessed['query_params'] = df_preprocessed['Landing page'].apply(extract_query_params)\n", - "\n", - "df_preprocessed.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'current': ['n_2_n'], 'q': ['software '], 'size': ['n_20_n']}" - ] - }, - "execution_count": 56, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_preprocessed.iloc[4]['query_params']" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Landing pagequery_params
0/%3Fsize=n_20_n{'size': ['n_20_n']}
1/%3Fsize=n_20_n{'size': ['n_20_n']}
2/%3Fsize=n_20_n{'size': ['n_20_n']}
3/%3Fsize=n_20_n{'size': ['n_20_n']}
4/%3Fsize=n_20_n{'size': ['n_20_n']}
\n", - "
" - ], - "text/plain": [ - " Landing page query_params\n", - "0 /%3Fsize=n_20_n {'size': ['n_20_n']}\n", - "1 /%3Fsize=n_20_n {'size': ['n_20_n']}\n", - "2 /%3Fsize=n_20_n {'size': ['n_20_n']}\n", - "3 /%3Fsize=n_20_n {'size': ['n_20_n']}\n", - "4 /%3Fsize=n_20_n {'size': ['n_20_n']}" - ] - }, - "execution_count": 65, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# explode by sessions for easier analysis\n", - "\n", - "df_preprocessed['Sessions'] = df_preprocessed['Sessions'].astype(int)\n", - "\n", - "df_preprocessed = df_preprocessed[df_preprocessed['Landing page'] != '(not set)']\n", - "df_preprocessed = df_preprocessed[df_preprocessed['Landing page'] != 'nan']\n", - "\n", - "df_preprocessed['copy'] = df_preprocessed['Sessions'].apply(lambda x: list(range(x)))\n", - "df_exploded = df_preprocessed.explode('copy').drop(columns='Sessions')\n", - "df_exploded.drop(columns='copy', inplace=True)\n", - "\n", - "# reset index\n", - "df_exploded = df_exploded.reset_index(drop=True)\n", - "\n", - "df_exploded.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
visitor_urlquery_params
0/%3Fsize=n_20_n{'size': ['n_20_n']}
1/%3Fsize=n_20_n{'size': ['n_20_n']}
2/%3Fsize=n_20_n{'size': ['n_20_n']}
3/%3Fsize=n_20_n{'size': ['n_20_n']}
4/%3Fsize=n_20_n{'size': ['n_20_n']}
.........
663/%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=scho...{'size': ['n_20_n'], 'filters[0][field]': ['sc...
664/%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=scho...{'size': ['n_20_n'], 'filters[0][field]': ['sc...
665/%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=scho...{'size': ['n_20_n'], 'filters[0][field]': ['sc...
666/%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=scho...{'size': ['n_20_n'], 'filters[0][field]': ['sc...
667/%3Fsize=n_60_n&filters%5B0%5D%5Bfield%5D=indu...{'size': ['n_60_n'], 'filters[0][field]': ['in...
\n", - "

668 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " visitor_url \\\n", - "0 /%3Fsize=n_20_n \n", - "1 /%3Fsize=n_20_n \n", - "2 /%3Fsize=n_20_n \n", - "3 /%3Fsize=n_20_n \n", - "4 /%3Fsize=n_20_n \n", - ".. ... \n", - "663 /%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=scho... \n", - "664 /%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=scho... \n", - "665 /%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=scho... \n", - "666 /%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=scho... \n", - "667 /%3Fsize=n_60_n&filters%5B0%5D%5Bfield%5D=indu... \n", - "\n", - " query_params \n", - "0 {'size': ['n_20_n']} \n", - "1 {'size': ['n_20_n']} \n", - "2 {'size': ['n_20_n']} \n", - "3 {'size': ['n_20_n']} \n", - "4 {'size': ['n_20_n']} \n", - ".. ... \n", - "663 {'size': ['n_20_n'], 'filters[0][field]': ['sc... \n", - "664 {'size': ['n_20_n'], 'filters[0][field]': ['sc... \n", - "665 {'size': ['n_20_n'], 'filters[0][field]': ['sc... \n", - "666 {'size': ['n_20_n'], 'filters[0][field]': ['sc... \n", - "667 {'size': ['n_60_n'], 'filters[0][field]': ['in... \n", - "\n", - "[668 rows x 2 columns]" - ] - }, - "execution_count": 66, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# rename landing page to visitor_url\n", - "df_exploded = df_exploded.rename(columns={'Landing page': 'visitor_url'})\n", - "df_exploded" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
visitor_urlquery_paramssizecurrentqfilters[0][field]filters[0][values][0]sort-fieldsort-directionfbclidamp;amp;size
663/%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=scho...{'size': ['n_20_n'], 'filters[0][field]': ['sc...[n_20_n]NaNNaN[school][London School o]NaNNaNNaNNaN
664/%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=scho...{'size': ['n_20_n'], 'filters[0][field]': ['sc...[n_20_n]NaNNaN[school][Nanyang Polytechn]NaNNaNNaNNaN
665/%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=scho...{'size': ['n_20_n'], 'filters[0][field]': ['sc...[n_20_n]NaNNaN[school][Nanyang Technolog]NaNNaNNaNNaN
666/%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=scho...{'size': ['n_20_n'], 'filters[0][field]': ['sc...[n_20_n]NaNNaN[school][National Universi]NaNNaNNaNNaN
667/%3Fsize=n_60_n&filters%5B0%5D%5Bfield%5D=indu...{'size': ['n_60_n'], 'filters[0][field]': ['in...[n_60_n]NaNNaN[industries][Data Science%]NaNNaNNaNNaN
\n", - "
" - ], - "text/plain": [ - " visitor_url \\\n", - "663 /%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=scho... \n", - "664 /%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=scho... \n", - "665 /%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=scho... \n", - "666 /%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=scho... \n", - "667 /%3Fsize=n_60_n&filters%5B0%5D%5Bfield%5D=indu... \n", - "\n", - " query_params size current q \\\n", - "663 {'size': ['n_20_n'], 'filters[0][field]': ['sc... [n_20_n] NaN NaN \n", - "664 {'size': ['n_20_n'], 'filters[0][field]': ['sc... [n_20_n] NaN NaN \n", - "665 {'size': ['n_20_n'], 'filters[0][field]': ['sc... [n_20_n] NaN NaN \n", - "666 {'size': ['n_20_n'], 'filters[0][field]': ['sc... [n_20_n] NaN NaN \n", - "667 {'size': ['n_60_n'], 'filters[0][field]': ['in... [n_60_n] NaN NaN \n", - "\n", - " filters[0][field] filters[0][values][0] sort-field sort-direction fbclid \\\n", - "663 [school] [London School o] NaN NaN NaN \n", - "664 [school] [Nanyang Polytechn] NaN NaN NaN \n", - "665 [school] [Nanyang Technolog] NaN NaN NaN \n", - "666 [school] [National Universi] NaN NaN NaN \n", - "667 [industries] [Data Science%] NaN NaN NaN \n", - "\n", - " amp;amp;size \n", - "663 NaN \n", - "664 NaN \n", - "665 NaN \n", - "666 NaN \n", - "667 NaN " - ] - }, - "execution_count": 67, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# convert query params to columns\n", - "df_exploded = pd.concat([df_exploded, df_exploded['query_params'].apply(pd.Series)], axis=1)\n", - "\n", - "df_exploded.tail()" - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "metadata": {}, - "outputs": [], - "source": [ - "# what is the amp;amp;size column?\n", - "df_exploded['amp;amp;size'].value_counts()\n", - "\n", - "df_exploded.drop('amp;amp;size', axis=1, inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 84, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
visitor_urlquery_paramssizecurrentqfilters[0][field]filters[0][values][0]sort-fieldsort-directionfbclid
663/%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=school&filters%5B0%5D%5Bvalues%5D%5B0%5D=London%20School%20o{'size': ['n_20_n'], 'filters[0][field]': ['school'], 'filters[0][values][0]': ['London School o']}[n_20_n]NaNNaN[school][London School o]NaNNaNNaN
664/%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=school&filters%5B0%5D%5Bvalues%5D%5B0%5D=Nanyang%20Polytechn{'size': ['n_20_n'], 'filters[0][field]': ['school'], 'filters[0][values][0]': ['Nanyang Polytechn']}[n_20_n]NaNNaN[school][Nanyang Polytechn]NaNNaNNaN
665/%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=school&filters%5B0%5D%5Bvalues%5D%5B0%5D=Nanyang%20Technolog{'size': ['n_20_n'], 'filters[0][field]': ['school'], 'filters[0][values][0]': ['Nanyang Technolog']}[n_20_n]NaNNaN[school][Nanyang Technolog]NaNNaNNaN
666/%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=school&filters%5B0%5D%5Bvalues%5D%5B0%5D=National%20Universi{'size': ['n_20_n'], 'filters[0][field]': ['school'], 'filters[0][values][0]': ['National Universi']}[n_20_n]NaNNaN[school][National Universi]NaNNaNNaN
667/%3Fsize=n_60_n&filters%5B0%5D%5Bfield%5D=industries&filters%5B0%5D%5Bvalues%5D%5B0%5D=Data%20Science%{'size': ['n_60_n'], 'filters[0][field]': ['industries'], 'filters[0][values][0]': ['Data Science%']}[n_60_n]NaNNaN[industries][Data Science%]NaNNaNNaN
\n", - "
" - ], - "text/plain": [ - " visitor_url \\\n", - "663 /%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=school&filters%5B0%5D%5Bvalues%5D%5B0%5D=London%20School%20o \n", - "664 /%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=school&filters%5B0%5D%5Bvalues%5D%5B0%5D=Nanyang%20Polytechn \n", - "665 /%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=school&filters%5B0%5D%5Bvalues%5D%5B0%5D=Nanyang%20Technolog \n", - "666 /%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=school&filters%5B0%5D%5Bvalues%5D%5B0%5D=National%20Universi \n", - "667 /%3Fsize=n_60_n&filters%5B0%5D%5Bfield%5D=industries&filters%5B0%5D%5Bvalues%5D%5B0%5D=Data%20Science% \n", - "\n", - " query_params \\\n", - "663 {'size': ['n_20_n'], 'filters[0][field]': ['school'], 'filters[0][values][0]': ['London School o']} \n", - "664 {'size': ['n_20_n'], 'filters[0][field]': ['school'], 'filters[0][values][0]': ['Nanyang Polytechn']} \n", - "665 {'size': ['n_20_n'], 'filters[0][field]': ['school'], 'filters[0][values][0]': ['Nanyang Technolog']} \n", - "666 {'size': ['n_20_n'], 'filters[0][field]': ['school'], 'filters[0][values][0]': ['National Universi']} \n", - "667 {'size': ['n_60_n'], 'filters[0][field]': ['industries'], 'filters[0][values][0]': ['Data Science%']} \n", - "\n", - " size current q filters[0][field] filters[0][values][0] sort-field \\\n", - "663 [n_20_n] NaN NaN [school] [London School o] NaN \n", - "664 [n_20_n] NaN NaN [school] [Nanyang Polytechn] NaN \n", - "665 [n_20_n] NaN NaN [school] [Nanyang Technolog] NaN \n", - "666 [n_20_n] NaN NaN [school] [National Universi] NaN \n", - "667 [n_60_n] NaN NaN [industries] [Data Science%] NaN \n", - "\n", - " sort-direction fbclid \n", - "663 NaN NaN \n", - "664 NaN NaN \n", - "665 NaN NaN \n", - "666 NaN NaN \n", - "667 NaN NaN " - ] - }, - "execution_count": 84, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pd.set_option('display.max_colwidth', None)\n", - "\n", - "df_exploded.tail()" - ] - }, - { - "cell_type": "code", - "execution_count": 
88, - "metadata": {}, - "outputs": [], - "source": [ - "# export data\n", - "df_exploded.to_csv('data-preprocessed.csv', index=False)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - }, - "vscode": { - "interpreter": { - "hash": "26663b30a87f8b6b521f0a61964c10b31d4c5e2632109eb7f9afdda680fa86bd" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/huan_yao_code/new.py b/huan_yao_code/new.py deleted file mode 100644 index 386cff1..0000000 --- a/huan_yao_code/new.py +++ /dev/null @@ -1,46 +0,0 @@ -from urllib import parse -import csv -store=[] -processed=[] -with open('data-export.csv', 'r')as basefile: - for i in range(13): - heading = next(basefile) - for row in csv.reader(basefile): - store+=[row[0]] -for index in range(len(store)): - store[index]=store[index].replace('%3F','?') - processed+=[parse.parse_qs(parse.urlsplit(store[index]).query)] -####print(processed) -keystore=[] -uniquekey=[] -valuestore=[] -unwanted=['size','current','fbclid','sort-field','sort-direction','amp;amp;size'] -for dictionary in processed: -## print(dictionary) - keys = [] - values = [] - for key in dictionary: - if key in unwanted: - continue - keys.append(key) - values.append(dictionary[key]) - if key not in uniquekey: - uniquekey.append(key) - keystore+=[keys] - valuestore+=[values] -####print(uniquekey) - -with open('data-export - Copy.csv', 'w',newline='') as file: - with open('data-export.csv', 'r')as basefile: - cursor=csv.writer(file) - ##hardocding skipping 13 lines - count=0 - for row in csv.reader(basefile): - count+=1 - if count<14: - cursor.writerow(row) - continue - for i in valuestore[count-14]: - if len(i[0])>1: - 
row+=[i] - cursor.writerow(row) From ea4bd2e6231edc06806b70feaeb534ae9cb527f3 Mon Sep 17 00:00:00 2001 From: Jolene Date: Mon, 23 Oct 2023 13:49:48 +0800 Subject: [PATCH 2/7] feat: prevent adding of files without extensions --- .gitignore | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.gitignore b/.gitignore index c824773..709de35 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,13 @@ +# ignores files without extensions (avoids pushing compiled db files) +# except gitignore and license files +* +!*/ +!*.* +!.gitignore +!LICENSE + + + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] From 210db323c49e30439c1dd7fc952677c50c8c7815 Mon Sep 17 00:00:00 2001 From: Jolene Date: Mon, 23 Oct 2023 13:52:45 +0800 Subject: [PATCH 3/7] feat: updated folder names --- .gitignore | 2 -- .../analysis of the sql data.ipynb | 0 .../presenting data.ipynb | 0 3 files changed, 2 deletions(-) rename {04PostgreSQLDumpFIle => 04URLsInPostgreSQL}/analysis of the sql data.ipynb (100%) rename {04PostgreSQLDumpFIle => 04URLsInPostgreSQL}/presenting data.ipynb (100%) diff --git a/.gitignore b/.gitignore index 709de35..f395e59 100644 --- a/.gitignore +++ b/.gitignore @@ -6,8 +6,6 @@ !.gitignore !LICENSE - - # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/04PostgreSQLDumpFIle/analysis of the sql data.ipynb b/04URLsInPostgreSQL/analysis of the sql data.ipynb similarity index 100% rename from 04PostgreSQLDumpFIle/analysis of the sql data.ipynb rename to 04URLsInPostgreSQL/analysis of the sql data.ipynb diff --git a/04PostgreSQLDumpFIle/presenting data.ipynb b/04URLsInPostgreSQL/presenting data.ipynb similarity index 100% rename from 04PostgreSQLDumpFIle/presenting data.ipynb rename to 04URLsInPostgreSQL/presenting data.ipynb From 8caa9873a0c2a1469237957623f2a7251dae321a Mon Sep 17 00:00:00 2001 From: Jolene Date: Mon, 23 Oct 2023 16:25:17 +0800 Subject: [PATCH 4/7] feat: added script to task 03 --- 03AnalysisApplicationData/main.py | 139 
++++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 03AnalysisApplicationData/main.py diff --git a/03AnalysisApplicationData/main.py b/03AnalysisApplicationData/main.py new file mode 100644 index 0000000..384dfc3 --- /dev/null +++ b/03AnalysisApplicationData/main.py @@ -0,0 +1,139 @@ +# +# Script to process mentorship data from CSV files and save to CSV +# # TODO: NOT TESTED YET (getting credentials) +# +# 1. Set up /data with Application from different waves and their respective files +# 2. Create env file with your credentials for Elastic Cloud +# - CLOUD_ID= +# - PASSWORD= +# - USER= +# 3. Run script, which produces mentors.csv and mentors_per_application files in /data +# +# Written by: Jolene +# + + +import pandas as pd +import numpy as np +from elasticsearch import Elasticsearch +import re + +class ProcessApplicationData: + def __init__(self, user, cloud_id, password): + self.es = Elasticsearch( + cloud_id=cloud_id, + http_auth=(user, password) + ) + self.mentors = {} + self.unknown_mentors = [] + + def search_documents(self, index, query_body): + result = self.es.search(index=index, body=query_body) + return result + + def get_mentor_info_by_name(self, name): + search_options = { + 'query': { + 'bool': { + 'should': [] + } + } + } + + # Add match query for the name + search_options['query']['bool']['should'].append({ + 'match': { + 'name': name + } + }) + + # Search by organization if name includes organization information in brackets + organization = re.search(r'\((.*?)\)', name) + if organization: + organization_name = organization.group(1) + # Add match query for the organisation + search_options['query']['bool']['should'].append({ + 'match': { + 'organisation': organization_name + } + }) + + result = self.search_documents('enterprise-search-engine-mentorship-page', search_options) + exact_matches = [doc for doc in result['hits']['hits']] + + if len(exact_matches) == 0: + return None + + if 'organisation' in 
exact_matches[0]['_source']: + return { + 'name': exact_matches[0]['_source']['name'], + 'industries': exact_matches[0]['_source']['industries'], + 'organisation': exact_matches[0]['_source']['organisation'] + } + else: + return { + 'name': exact_matches[0]['_source']['name'], + 'industries': exact_matches[0]['_source']['industries'], + 'organisation': None + } + + def check_same_name(self, name): + search_options = { + 'query': { + 'match': { + 'name': name + } + } + } + result = self.search_documents('enterprise-search-engine-mentorship-page', search_options) + documents = result['hits']['hits'] + if len(documents) > 1: + return True + else: + return False + + def process_mentors_data(self, dataframes): + df = pd.concat(dataframes) + df['year'] = np.concatenate([np.full(len(df_i), year_i) for df_i, year_i in dataframes]) + df.columns = ['mentor_name', 'year'] + df = df[df['mentor_name'] != '[INSERT NAME LIST OF WAVE 3 MENTORS]'] + + df['industries'] = "" + df['organisation'] = "" + + for index, row in df.iterrows(): + name = row['mentor_name'].lower() + year = row['year'] + + if name in self.mentors: + mentor = self.mentors[name] + else: + mentor = self.get_mentor_info_by_name(name) + + if mentor is not None: + row['industries'] = mentor['industries'] + row['organisation'] = mentor['organisation'] + else: + self.unknown_mentors.append(name) + + return df + + def save_data_to_csv(self, df, filename): + df.to_csv(filename, index=False) + +if __name__ == '__main__': + CLOUD_ID = "" + PASSWORD = '' + USER = "jolene" + + mentorship_system = ProcessApplicationData(USER, CLOUD_ID, PASSWORD) + + # Load CSV dataframes + df_2020w3 = pd.read_csv('data/2020w3.csv') + df_2021w1 = pd.read_csv('data/2021w1.csv') + df_2022 = pd.read_csv('data/2022.csv') + + # Process and save mentor data to CSV + dataframes = [df_2020w3, df_2021w1, df_2022] + mentors_df = mentorship_system.process_mentors_data(dataframes) + mentorship_system.save_data_to_csv(mentors_df, 'data/mentors.csv') \ No 
newline at end of file From e2e84b3960cb01596b83ede29a3a2dc4b9e0334b Mon Sep 17 00:00:00 2001 From: raven0205 <73115813+raven0205@users.noreply.github.com> Date: Sat, 11 Nov 2023 18:32:37 +0800 Subject: [PATCH 5/7] Create wave2.py --- 03AnalysisApplicationData/data/wave2.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 03AnalysisApplicationData/data/wave2.py diff --git a/03AnalysisApplicationData/data/wave2.py b/03AnalysisApplicationData/data/wave2.py new file mode 100644 index 0000000..d25889c --- /dev/null +++ b/03AnalysisApplicationData/data/wave2.py @@ -0,0 +1 @@ +## this will contain the application data for wave 2 From ca26054ff5a9a98c6fb9d4aea0fbb52fe669d049 Mon Sep 17 00:00:00 2001 From: Jolene Date: Sun, 12 Nov 2023 17:44:24 +0800 Subject: [PATCH 6/7] rebasing to remove data files --- 02ReformattingUmami/parsing urls.ipynb | 2073 ++++++++++++++++++++++++ 1 file changed, 2073 insertions(+) diff --git a/02ReformattingUmami/parsing urls.ipynb b/02ReformattingUmami/parsing urls.ipynb index fe4c034..02e27ba 100644 --- a/02ReformattingUmami/parsing urls.ipynb +++ b/02ReformattingUmami/parsing urls.ipynb @@ -2,18 +2,27 @@ "cells": [ { "cell_type": "code", +<<<<<<< HEAD "execution_count": 23, +======= + "execution_count": 11, +>>>>>>> d5fed27 (Update parsing urls.ipynb) "id": "4ba88477", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", + "import numpy as np\n", "from urllib.parse import urlparse, parse_qs, unquote" ] }, { "cell_type": "code", +<<<<<<< HEAD "execution_count": 24, +======= + "execution_count": 12, +>>>>>>> d5fed27 (Update parsing urls.ipynb) "id": "c20ad040", "metadata": {}, "outputs": [], @@ -23,7 +32,11 @@ }, { "cell_type": "code", +<<<<<<< HEAD "execution_count": 25, +======= + "execution_count": 13, +>>>>>>> d5fed27 (Update parsing urls.ipynb) "id": "e731453c", "metadata": {}, "outputs": [], @@ -40,7 +53,11 @@ }, { "cell_type": "code", +<<<<<<< HEAD "execution_count": 26, +======= + "execution_count": 14, +>>>>>>> 
d5fed27 (Update parsing urls.ipynb) "id": "c7471d8e", "metadata": {}, "outputs": [], @@ -50,12 +67,2068 @@ }, { "cell_type": "code", +<<<<<<< HEAD "execution_count": 27, +======= + "execution_count": 49, + "id": "5604d86c", + "metadata": {}, + "outputs": [], + "source": [ + "x2 = x.assign(industries=lambda x: np.nan, \n", + " school=lambda x: np.nan,\n", + " course_of_study=lambda x: np.nan,\n", + " organisation=lambda x: np.nan) #create copy with the columns that i want" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "33cd1847", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1019300\n", + "198379000\n", + "400201500\n", + "588100000\n", + "771322300\n", + "967610800\n", + "1166690100\n", + "1361307400\n", + "1567462400\n", + "1760879700\n", + "1972686300\n", + "2173218500\n", + "2374591800\n", + "2586973200\n", + "2845110800\n", + "3069079200\n", + "3274099700\n", + "3490637300\n", + "3738567700\n", + "3945910200\n", + "4154997800\n", + "4394562500\n", + "4606142900\n", + "4810549700\n", + "5002853500\n", + "5205447300\n", + "5402053600\n", + "5595158200\n", + "5795015500\n", + "6013136400\n", + "6219393100\n", + "6423695500\n", + "6616866700\n", + "6835747800\n", + "7024130500\n", + "7217490600\n", + "7436969500\n", + "7646088200\n", + "7847100200\n", + "8058185700\n", + "8270893600\n", + "8482217300\n", + "8679724000\n", + "8879031800\n", + "9058311200\n", + "9247930700\n", + "9451453100\n", + "9668764300\n", + "9886907500\n", + "10091585800\n", + "10282182200\n", + "10469907500\n", + "10689785100\n", + "10905295500\n", + "11129143200\n", + "11329276500\n", + "11520791000\n", + "11730772100\n", + "11920881500\n", + "12137022700\n", + "12350234400\n", + "12557910000\n", + "12759372500\n", + "12976088100\n", + "13189948600\n", + "13401493300\n", + "13584390200\n", + "13768548200\n", + "13947810100\n", + "14144238300\n", + "14319362600\n", + "14499509900\n", + "14671370800\n", + 
"14847169000\n", + "15009801100\n", + "15171386200\n", + "15343520400\n", + "15518651900\n", + "15691240500\n", + "15858359500\n", + "16034491600\n", + "16212696400\n", + "16380638300\n", + "16547752500\n", + "16713832500\n", + "16887430900\n", + "17054953600\n", + "17230079900\n", + "17404971300\n", + "17576107500\n", + "17765259700\n", + "17946401500\n", + "18122570500\n", + "18298027500\n", + "18490654000\n", + "18676347300\n", + "18858481400\n", + "19062635600\n", + "19240343800\n", + "19441518900\n", + "19632663700\n", + "19829562000\n", + "20001634200\n", + "20149930500\n", + "20318680000\n", + "20503769000\n", + "20667341800\n", + "20853630300\n", + "21018679200\n", + "21212859100\n", + "21388865100\n", + "21575672800\n", + "21758869500\n", + "21933173500\n" + ] + } + ], + "source": [ + "def isnotNaN(num):\n", + " return num == num\n", + "\n", + "columns = ['view_id', 'website_id', 'session_id', 'created_at', 'url', 'referrer', 'query_params',\n", + " 'q', 'size', 'current', 'sort-field', 'sort-direction', '_sm_au_', 'v', 'fbclid', 'trk',\n", + " 'amp;amp;size', 'industries', 'school', 'course_of_study', 'organisation'] #columns that i eventually want\n", + "\n", + "\n", + "for i in range(len(x2)): #go through original DF\n", + "\n", + " row = x2.iloc[i] # for each row,\n", + "\n", + " temp ={0:[None,list()], #theres 4 possible fields [field, list of values]\n", + " 1:[None,list()],\n", + " 2:[None,list()],\n", + " 3:[None,list()]\n", + " }\n", + "\n", + " for j in range(4): #identify which field corresponds to reach index\n", + " if isnotNaN(row.loc[f'filters[{j}][field]']):\n", + " temp[j][0] = row.loc[f'filters[{j}][field]'][0]\n", + " else:\n", + " break\n", + "\n", + " for k in range(4): #condense all the values for each of the fields into list of values\n", + " for l in range(20):\n", + " if f'filters[{k}][values][{l}]' in row.index and isnotNaN(row.loc[f'filters[{k}][values][{l}]']):\n", + " temp[k][1].append(row.loc[f'filters[{k}][values][{l}]'][0]) 
\n", + " else:\n", + " break\n", + "\n", + " for z in range(4): #add the new column:values to dataframe\n", + " field = temp[z][0]\n", + " values = temp[z][1]\n", + " if field is None:\n", + " break\n", + " else:\n", + " x2.loc[i,field] = str(values)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 61, +>>>>>>> d5fed27 (Update parsing urls.ipynb) "id": "57f88523", "metadata": {}, "outputs": [], "source": [ +<<<<<<< HEAD "x.to_csv('data-preprocessed.csv', index=False)" +======= + "x2.to_csv('data-preprocessed.csv', index=False)" + ] + }, + { + "cell_type": "markdown", + "id": "473214f9", + "metadata": {}, + "source": [ + "testing\n", + "\n", + "


























" + ] + }, + { + "cell_type": "code", + "execution_count": 195, + "id": "b8ab56af", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
industriesschoolcourse_of_studyorganisation
7480NaNNaNNaNNaN
7481NaNNaNNaNNaN
7482[Banking and Finance]NaNNaNNaN
7483[Banking and Finance][National University of Singapore]NaNNaN
7484[Banking and Finance][National University of Singapore, Singapore M...NaNNaN
7485[Banking and Finance][National University of Singapore, Singapore M...[Business Administration]NaN
7486NaNNaNNaNNaN
7487[Banking and Finance]NaNNaNNaN
7488NaNNaNNaNNaN
7489NaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " industries \\\n", + "7480 NaN \n", + "7481 NaN \n", + "7482 [Banking and Finance] \n", + "7483 [Banking and Finance] \n", + "7484 [Banking and Finance] \n", + "7485 [Banking and Finance] \n", + "7486 NaN \n", + "7487 [Banking and Finance] \n", + "7488 NaN \n", + "7489 NaN \n", + "\n", + " school \\\n", + "7480 NaN \n", + "7481 NaN \n", + "7482 NaN \n", + "7483 [National University of Singapore] \n", + "7484 [National University of Singapore, Singapore M... \n", + "7485 [National University of Singapore, Singapore M... \n", + "7486 NaN \n", + "7487 NaN \n", + "7488 NaN \n", + "7489 NaN \n", + "\n", + " course_of_study organisation \n", + "7480 NaN NaN \n", + "7481 NaN NaN \n", + "7482 NaN NaN \n", + "7483 NaN NaN \n", + "7484 NaN NaN \n", + "7485 [Business Administration] NaN \n", + "7486 NaN NaN \n", + "7487 NaN NaN \n", + "7488 NaN NaN \n", + "7489 NaN NaN " + ] + }, + "execution_count": 195, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "a.iloc[7480:7490,17:21]" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "340baaf6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
view_idwebsite_idsession_idcreated_aturlreferrerquery_paramsqsizefilters[0][field]...fbclidfilters[3][field]trkfilters[3][values][0]filters[3][values][1]amp;amp;sizeindustriesschoolcourse_of_studyorganisation
74817684213922022-05-29 06:50:08.711+00/?size=n_20_n/{'size': ['n_20_n']}NaN[n_20_n]NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
74827685213922022-05-29 06:50:12.671+00/?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust.../?size=n_20_n{'size': ['n_20_n'], 'filters[0][field]': ['in...NaN[n_20_n][industries]...NaNNaNNaNNaNNaNNaN['Banking and Finance']NaNNaNNaN
74837686213922022-05-29 06:50:18.17+00/?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust.../?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...{'size': ['n_20_n'], 'filters[0][field]': ['in...NaN[n_20_n][industries]...NaNNaNNaNNaNNaNNaN['Banking and Finance']['National University of Singapore']NaNNaN
74847687213922022-05-29 06:50:20.512+00/?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust.../?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...{'size': ['n_20_n'], 'filters[0][field]': ['in...NaN[n_20_n][industries]...NaNNaNNaNNaNNaNNaN['Banking and Finance']['National University of Singapore', 'Singapor...NaNNaN
74857688213922022-05-29 06:50:26.338+00/?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust.../?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...{'size': ['n_20_n'], 'filters[0][field]': ['in...NaN[n_20_n][industries]...NaNNaNNaNNaNNaNNaN['Banking and Finance']['National University of Singapore', 'Singapor...['Business Administration']NaN
..................................................................
1135841138141161392023-04-24 14:18:54.34+00/https://static.elfsight.com/{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1135851138151161392023-04-24 14:51:55.512+00/events/https://beta.advisory.sg/{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1135861138161161392023-04-24 14:52:04.993+00/press-releases/NaN{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1135871138171161402023-04-24 16:38:43.474+00/2017/10/05/conversations-with-tee-chee-yen/http://localhost:2368/{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1135881138181161402023-04-24 16:40:18.422+00/2017/07/30/conversations-with-marvin-kang/http://localhost:2368/tag/social-service/{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

106108 rows × 51 columns

\n", + "
" + ], + "text/plain": [ + " view_id website_id session_id created_at \\\n", + "7481 7684 2 1392 2022-05-29 06:50:08.711+00 \n", + "7482 7685 2 1392 2022-05-29 06:50:12.671+00 \n", + "7483 7686 2 1392 2022-05-29 06:50:18.17+00 \n", + "7484 7687 2 1392 2022-05-29 06:50:20.512+00 \n", + "7485 7688 2 1392 2022-05-29 06:50:26.338+00 \n", + "... ... ... ... ... \n", + "113584 113814 1 16139 2023-04-24 14:18:54.34+00 \n", + "113585 113815 1 16139 2023-04-24 14:51:55.512+00 \n", + "113586 113816 1 16139 2023-04-24 14:52:04.993+00 \n", + "113587 113817 1 16140 2023-04-24 16:38:43.474+00 \n", + "113588 113818 1 16140 2023-04-24 16:40:18.422+00 \n", + "\n", + " url \\\n", + "7481 /?size=n_20_n \n", + "7482 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", + "7483 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", + "7484 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", + "7485 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", + "... ... \n", + "113584 / \n", + "113585 /events/ \n", + "113586 /press-releases/ \n", + "113587 /2017/10/05/conversations-with-tee-chee-yen/ \n", + "113588 /2017/07/30/conversations-with-marvin-kang/ \n", + "\n", + " referrer \\\n", + "7481 / \n", + "7482 /?size=n_20_n \n", + "7483 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", + "7484 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", + "7485 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", + "... ... \n", + "113584 https://static.elfsight.com/ \n", + "113585 https://beta.advisory.sg/ \n", + "113586 NaN \n", + "113587 http://localhost:2368/ \n", + "113588 http://localhost:2368/tag/social-service/ \n", + "\n", + " query_params q size \\\n", + "7481 {'size': ['n_20_n']} NaN [n_20_n] \n", + "7482 {'size': ['n_20_n'], 'filters[0][field]': ['in... NaN [n_20_n] \n", + "7483 {'size': ['n_20_n'], 'filters[0][field]': ['in... NaN [n_20_n] \n", + "7484 {'size': ['n_20_n'], 'filters[0][field]': ['in... 
NaN [n_20_n] \n", + "7485 {'size': ['n_20_n'], 'filters[0][field]': ['in... NaN [n_20_n] \n", + "... ... ... ... \n", + "113584 {} NaN NaN \n", + "113585 {} NaN NaN \n", + "113586 {} NaN NaN \n", + "113587 {} NaN NaN \n", + "113588 {} NaN NaN \n", + "\n", + " filters[0][field] ... fbclid filters[3][field] trk \\\n", + "7481 NaN ... NaN NaN NaN \n", + "7482 [industries] ... NaN NaN NaN \n", + "7483 [industries] ... NaN NaN NaN \n", + "7484 [industries] ... NaN NaN NaN \n", + "7485 [industries] ... NaN NaN NaN \n", + "... ... ... ... ... ... \n", + "113584 NaN ... NaN NaN NaN \n", + "113585 NaN ... NaN NaN NaN \n", + "113586 NaN ... NaN NaN NaN \n", + "113587 NaN ... NaN NaN NaN \n", + "113588 NaN ... NaN NaN NaN \n", + "\n", + " filters[3][values][0] filters[3][values][1] amp;amp;size \\\n", + "7481 NaN NaN NaN \n", + "7482 NaN NaN NaN \n", + "7483 NaN NaN NaN \n", + "7484 NaN NaN NaN \n", + "7485 NaN NaN NaN \n", + "... ... ... ... \n", + "113584 NaN NaN NaN \n", + "113585 NaN NaN NaN \n", + "113586 NaN NaN NaN \n", + "113587 NaN NaN NaN \n", + "113588 NaN NaN NaN \n", + "\n", + " industries \\\n", + "7481 NaN \n", + "7482 ['Banking and Finance'] \n", + "7483 ['Banking and Finance'] \n", + "7484 ['Banking and Finance'] \n", + "7485 ['Banking and Finance'] \n", + "... ... \n", + "113584 NaN \n", + "113585 NaN \n", + "113586 NaN \n", + "113587 NaN \n", + "113588 NaN \n", + "\n", + " school \\\n", + "7481 NaN \n", + "7482 NaN \n", + "7483 ['National University of Singapore'] \n", + "7484 ['National University of Singapore', 'Singapor... \n", + "7485 ['National University of Singapore', 'Singapor... \n", + "... ... \n", + "113584 NaN \n", + "113585 NaN \n", + "113586 NaN \n", + "113587 NaN \n", + "113588 NaN \n", + "\n", + " course_of_study organisation \n", + "7481 NaN NaN \n", + "7482 NaN NaN \n", + "7483 NaN NaN \n", + "7484 NaN NaN \n", + "7485 ['Business Administration'] NaN \n", + "... ... ... 
\n", + "113584 NaN NaN \n", + "113585 NaN NaN \n", + "113586 NaN NaN \n", + "113587 NaN NaN \n", + "113588 NaN NaN \n", + "\n", + "[106108 rows x 51 columns]" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x2.iloc[7481:,:-9]" + ] + }, + { + "cell_type": "code", + "execution_count": 157, + "id": "794a6b93", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
view_idwebsite_idsession_idcreated_aturlreferrerquery_paramsqsizecurrent...sort-direction_sm_au_vfbclidtrkamp;amp;sizeindustriesschoolcourse_of_studyorganisation
\n", + "

0 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [view_id, website_id, session_id, created_at, url, referrer, query_params, q, size, current, sort-field, sort-direction, _sm_au_, v, fbclid, trk, amp;amp;size, industries, school, course_of_study, organisation]\n", + "Index: []\n", + "\n", + "[0 rows x 21 columns]" + ] + }, + "execution_count": 157, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "columns = ['view_id', 'website_id', 'session_id', 'created_at', 'url', 'referrer', 'query_params', 'q', 'size', 'current', 'sort-field', 'sort-direction', '_sm_au_', 'v', 'fbclid', 'trk', 'amp;amp;size', 'industries', 'school', 'course_of_study', 'organisation']\n", + "y = pd.DataFrame(columns=columns)\n", + "y" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc3ea68f", + "metadata": {}, + "outputs": [], + "source": [ + "18888237600\n", + "40737104200\n", + "72785283300\n", + "108758322400\n", + "145217173600\n", + "186793474100\n", + "225162609200\n", + "264882508000\n", + "314453042500\n", + "365454468800\n", + "408983182300\n", + "455546595500\n", + "504377514500\n", + "549119827600\n", + "591181220700\n", + "635682159900\n", + "682860726900\n", + "732106724800\n", + "782814304400\n", + "835621612600\n", + "891255040500\n", + "943683757500\n", + "983562692700\n", + "1023982651900\n", + "1065614924700\n", + "1108148358200\n", + "1151829557700\n", + "1196612275400\n", + "1242725911700\n", + "1291049334900\n", + "1341243997100\n", + "1392687859900\n", + "1445250854800\n", + "1498960928900\n", + "1553929939500\n", + "1611468594200\n", + "1671760449600\n", + "1734306435500\n", + "1795385092900\n", + "1857648008700\n", + "1921126209100\n", + "1989747478400\n", + "2056883359300\n", + "2124365102700\n", + "2192953255500\n", + "2262542653300\n", + "2333537347900\n", + "2407247346000\n", + "2481174709300\n", + "2556079180300\n", + "2632721770500\n", + "2711853018800\n", + "2797623695000\n", + "2885214009200\n", + 
"2969392615200\n", + "3052316078600\n", + "3135706844500\n", + "3220985459900\n", + "3308010920400\n", + "3394591298400\n", + "3481977400200\n", + "3570721798700\n", + "3660806746900\n", + "3751976516100\n", + "3844537433200\n", + "3938891733100\n", + "4034454066400\n", + "4137224107900\n", + "4239237949900\n", + "4341107871900\n", + "4445053302400\n", + "4550355588000\n", + "4656088215800\n", + "4760841571100\n", + "4866702552200\n", + "4973539051500\n", + "5082814611800\n", + "5192301727000\n", + "5303393414800\n", + "5415569427500\n", + "5529336622100\n", + "5644090487900\n", + "7905513294400\n", + "8022122905300\n", + "8142019076400\n", + "8262193145400\n", + "8413007131000\n", + "8569099970100\n", + "8699897960500\n", + "8861730423300\n", + "9046117615700\n", + "9196323885200\n", + "9364978097600\n", + "9523016066800\n", + "9688182973700\n", + "9856712632500\n", + "10026784638300\n", + "10197057256100\n", + "10319851429200\n", + "10375110798500\n", + "10430841262800\n", + "10487271861600\n", + "10544539678800\n", + "10602246570200\n", + "10659615402100\n", + "10718192663600\n", + "10777064820100\n", + "10836482106000\n", + "10896603119500\n", + "10957077660600\n", + "11020635868700" + ] + }, + { + "cell_type": "code", + "execution_count": 160, + "id": "ded3c20e", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.concat([pd.DataFrame(row, columns=columns), y], ignore_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 161, + "id": "36d21011", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
view_idwebsite_idsession_idcreated_aturlreferrerquery_paramsqsizecurrent...sort-direction_sm_au_vfbclidtrkamp;amp;sizeindustriesschoolcourse_of_studyorganisation
\n", + "

0 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [view_id, website_id, session_id, created_at, url, referrer, query_params, q, size, current, sort-field, sort-direction, _sm_au_, v, fbclid, trk, amp;amp;size, industries, school, course_of_study, organisation]\n", + "Index: []\n", + "\n", + "[0 rows x 21 columns]" + ] + }, + "execution_count": 161, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df\n" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "id": "17f73e73", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
view_idwebsite_idsession_idcreated_aturlreferrerquery_paramsqsizefilters[0][field]...filters[0][values][8]filters[0][values][9]_sm_au_vfbclidfilters[3][field]trkfilters[3][values][0]filters[3][values][1]amp;amp;size
02332932022-05-24 01:43:25.814+00/?q=chemical&size=n_20_nNaN{'q': ['chemical'], 'size': ['n_20_n']}[chemical][n_20_n]NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
12342942022-05-24 01:45:16.891+00/NaN{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22352942022-05-24 01:45:17.571+00/?size=n_20_n/{'size': ['n_20_n']}NaN[n_20_n]NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
32362942022-05-24 01:46:04.09+00/?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust.../?size=n_20_n{'size': ['n_20_n'], 'filters[0][field]': ['in...NaN[n_20_n][industries]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
42372942022-05-24 01:46:10.66+00/?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust.../?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...{'size': ['n_20_n'], 'filters[0][field]': ['in...NaN[n_20_n][industries]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
..................................................................
1135841138141161392023-04-24 14:18:54.34+00/https://static.elfsight.com/{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1135851138151161392023-04-24 14:51:55.512+00/events/https://beta.advisory.sg/{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1135861138161161392023-04-24 14:52:04.993+00/press-releases/NaN{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1135871138171161402023-04-24 16:38:43.474+00/2017/10/05/conversations-with-tee-chee-yen/http://localhost:2368/{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1135881138181161402023-04-24 16:40:18.422+00/2017/07/30/conversations-with-marvin-kang/http://localhost:2368/tag/social-service/{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

113589 rows × 47 columns

\n", + "
" + ], + "text/plain": [ + " view_id website_id session_id created_at \\\n", + "0 233 2 93 2022-05-24 01:43:25.814+00 \n", + "1 234 2 94 2022-05-24 01:45:16.891+00 \n", + "2 235 2 94 2022-05-24 01:45:17.571+00 \n", + "3 236 2 94 2022-05-24 01:46:04.09+00 \n", + "4 237 2 94 2022-05-24 01:46:10.66+00 \n", + "... ... ... ... ... \n", + "113584 113814 1 16139 2023-04-24 14:18:54.34+00 \n", + "113585 113815 1 16139 2023-04-24 14:51:55.512+00 \n", + "113586 113816 1 16139 2023-04-24 14:52:04.993+00 \n", + "113587 113817 1 16140 2023-04-24 16:38:43.474+00 \n", + "113588 113818 1 16140 2023-04-24 16:40:18.422+00 \n", + "\n", + " url \\\n", + "0 /?q=chemical&size=n_20_n \n", + "1 / \n", + "2 /?size=n_20_n \n", + "3 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", + "4 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", + "... ... \n", + "113584 / \n", + "113585 /events/ \n", + "113586 /press-releases/ \n", + "113587 /2017/10/05/conversations-with-tee-chee-yen/ \n", + "113588 /2017/07/30/conversations-with-marvin-kang/ \n", + "\n", + " referrer \\\n", + "0 NaN \n", + "1 NaN \n", + "2 / \n", + "3 /?size=n_20_n \n", + "4 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", + "... ... \n", + "113584 https://static.elfsight.com/ \n", + "113585 https://beta.advisory.sg/ \n", + "113586 NaN \n", + "113587 http://localhost:2368/ \n", + "113588 http://localhost:2368/tag/social-service/ \n", + "\n", + " query_params q \\\n", + "0 {'q': ['chemical'], 'size': ['n_20_n']} [chemical] \n", + "1 {} NaN \n", + "2 {'size': ['n_20_n']} NaN \n", + "3 {'size': ['n_20_n'], 'filters[0][field]': ['in... NaN \n", + "4 {'size': ['n_20_n'], 'filters[0][field]': ['in... NaN \n", + "... ... ... \n", + "113584 {} NaN \n", + "113585 {} NaN \n", + "113586 {} NaN \n", + "113587 {} NaN \n", + "113588 {} NaN \n", + "\n", + " size filters[0][field] ... filters[0][values][8] \\\n", + "0 [n_20_n] NaN ... NaN \n", + "1 NaN NaN ... NaN \n", + "2 [n_20_n] NaN ... NaN \n", + "3 [n_20_n] [industries] ... 
NaN \n", + "4 [n_20_n] [industries] ... NaN \n", + "... ... ... ... ... \n", + "113584 NaN NaN ... NaN \n", + "113585 NaN NaN ... NaN \n", + "113586 NaN NaN ... NaN \n", + "113587 NaN NaN ... NaN \n", + "113588 NaN NaN ... NaN \n", + "\n", + " filters[0][values][9] _sm_au_ v fbclid filters[3][field] trk \\\n", + "0 NaN NaN NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN NaN NaN \n", + "... ... ... ... ... ... ... \n", + "113584 NaN NaN NaN NaN NaN NaN \n", + "113585 NaN NaN NaN NaN NaN NaN \n", + "113586 NaN NaN NaN NaN NaN NaN \n", + "113587 NaN NaN NaN NaN NaN NaN \n", + "113588 NaN NaN NaN NaN NaN NaN \n", + "\n", + " filters[3][values][0] filters[3][values][1] amp;amp;size \n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "3 NaN NaN NaN \n", + "4 NaN NaN NaN \n", + "... ... ... ... \n", + "113584 NaN NaN NaN \n", + "113585 NaN NaN NaN \n", + "113586 NaN NaN NaN \n", + "113587 NaN NaN NaN \n", + "113588 NaN NaN NaN \n", + "\n", + "[113589 rows x 47 columns]" + ] + }, + "execution_count": 140, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x" + ] + }, + { + "cell_type": "code", + "execution_count": 147, + "id": "a5c9804c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "view_id 7688\n", + "website_id 2\n", + "session_id 1392\n", + "created_at 2022-05-29 06:50:26.338+00\n", + "url /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...\n", + "referrer /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...\n", + "query_params {'size': ['n_20_n'], 'filters[0][field]': ['in...\n", + "q NaN\n", + "size [n_20_n]\n", + "filters[0][field] [industries]\n", + "filters[0][values][0] [Banking and Finance]\n", + "filters[0][type] [all]\n", + "filters[0][values][1] NaN\n", + "filters[0][values][2] NaN\n", + "current NaN\n", + "sort-field NaN\n", + "sort-direction NaN\n", + "filters[1][field] [school]\n", + 
"filters[1][values][0] [National University of Singapore]\n", + "filters[1][type] [any]\n", + "filters[0][values][3] NaN\n", + "filters[0][values][4] NaN\n", + "filters[1][values][1] [Singapore Management University]\n", + "filters[1][values][2] NaN\n", + "filters[1][values][3] NaN\n", + "filters[1][values][4] NaN\n", + "filters[1][values][5] NaN\n", + "filters[2][field] [course_of_study]\n", + "filters[2][values][0] [Business Administration]\n", + "filters[2][values][1] NaN\n", + "filters[2][values][2] NaN\n", + "filters[2][values][3] NaN\n", + "filters[1][values][6] NaN\n", + "filters[2][type] [any]\n", + "filters[0][values][5] NaN\n", + "filters[0][values][6] NaN\n", + "filters[0][values][7] NaN\n", + "filters[0][values][8] NaN\n", + "filters[0][values][9] NaN\n", + "_sm_au_ NaN\n", + "v NaN\n", + "fbclid NaN\n", + "filters[3][field] NaN\n", + "trk NaN\n", + "filters[3][values][0] NaN\n", + "filters[3][values][1] NaN\n", + "amp;amp;size NaN\n", + "Name: 7485, dtype: object" + ] + }, + "execution_count": 147, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "row = x.iloc[7485]\n", + "row" + ] + }, + { + "cell_type": "code", + "execution_count": 148, + "id": "2335961c", + "metadata": {}, + "outputs": [], + "source": [ + "def isnotNaN(num):\n", + " return num == num\n", + "\n", + "temp ={0:[None,list()],\n", + " 1:[None,list()],\n", + " 2:[None,list()],\n", + " 3:[None,list()]\n", + " }\n", + "\n", + "for i in range(4):\n", + " if isnotNaN(row.loc[f'filters[{i}][field]']):\n", + " temp[i][0] = row.loc[f'filters[{i}][field]'][0]\n", + " \n", + "for i in range(4):\n", + " for j in range(20):\n", + " if f'filters[{i}][values][{j}]' in row.index and isnotNaN(row.loc[f'filters[{i}][values][{j}]']):\n", + " temp[i][1].append(row.loc[f'filters[{i}][values][{j}]'][0]) \n", + " \n", + "for i in range(4):\n", + " for j in range(20):\n", + " if f'filters[{i}][values][{j}]' in row.index:\n", + " row = 
row.drop(labels=f'filters[{i}][values][{j}]')\n", + " \n", + "for i in range(4):\n", + " if isnotNaN(row.loc[f'filters[{i}][field]']):\n", + " row = row.drop(labels=[f'filters[{i}][field]', f'filters[{i}][type]'])\n", + " \n", + "for i in range(4):\n", + " field = temp[i][0]\n", + " values = temp[i][1]\n", + " row[field] = values" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "id": "e393f951", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 114, + "id": "9c3dba3e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: ['industries', ['Banking and Finance']],\n", + " 1: ['school',\n", + " ['National University of Singapore', 'Singapore Management University']],\n", + " 2: ['course_of_study', ['Business Administration']],\n", + " 3: [None, []]}" + ] + }, + "execution_count": 114, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "temp" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "id": "b57d4ca9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "view_id 7688\n", + "website_id 2\n", + "session_id 1392\n", + "created_at 2022-05-29 06:50:26.338+00\n", + "url /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...\n", + "referrer /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...\n", + "query_params {'size': ['n_20_n'], 'filters[0][field]': ['in...\n", + "q NaN\n", + "size [n_20_n]\n", + "current NaN\n", + "sort-field NaN\n", + "sort-direction NaN\n", + "_sm_au_ NaN\n", + "v NaN\n", + "fbclid NaN\n", + "filters[3][field] NaN\n", + "trk NaN\n", + "amp;amp;size NaN\n", + "industries [Banking and Finance]\n", + "school [National University of Singapore, Singapore M...\n", + "course_of_study [Business Administration]\n", + "None []\n", + "Name: 7485, dtype: object" + ] + }, + "execution_count": 117, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "row" + ] + }, + { + "cell_type": "code", + 
"execution_count": 149, + "id": "5d91fe23", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index([ 'view_id', 'website_id', 'session_id',\n", + " 'created_at', 'url', 'referrer',\n", + " 'query_params', 'q', 'size',\n", + " 'current', 'sort-field', 'sort-direction',\n", + " '_sm_au_', 'v', 'fbclid',\n", + " 'filters[3][field]', 'trk', 'amp;amp;size',\n", + " 'industries', 'school', 'course_of_study',\n", + " None],\n", + " dtype='object')" + ] + }, + "execution_count": 149, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "row.index" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "id": "d3b752a2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pandas.core.series.Series" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(row)" + ] + }, + { + "cell_type": "code", + "execution_count": 153, + "id": "043c5d16", + "metadata": {}, + "outputs": [], + "source": [ + "l = list(row.index)\n", + "l.append('organisation')" + ] + }, + { + "cell_type": "code", + "execution_count": 155, + "id": "7787dc1e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['view_id', 'website_id', 'session_id', 'created_at', 'url', 'referrer', 'query_params', 'q', 'size', 'current', 'sort-field', 'sort-direction', '_sm_au_', 'v', 'fbclid', 'filters[3][field]', 'trk', 'amp;amp;size', 'industries', 'school', 'course_of_study', None, 'organisation']\n" + ] + } + ], + "source": [ + "print(l)" + ] + }, + { + "cell_type": "code", + "execution_count": 158, + "id": "b21898f8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "view_id 7688\n", + "website_id 2\n", + "session_id 1392\n", + "created_at 2022-05-29 06:50:26.338+00\n", + "url /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...\n", + "referrer /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...\n", + "query_params {'size': 
['n_20_n'], 'filters[0][field]': ['in...\n", + "q NaN\n", + "size [n_20_n]\n", + "current NaN\n", + "sort-field NaN\n", + "sort-direction NaN\n", + "_sm_au_ NaN\n", + "v NaN\n", + "fbclid NaN\n", + "filters[3][field] NaN\n", + "trk NaN\n", + "amp;amp;size NaN\n", + "industries [Banking and Finance]\n", + "school [National University of Singapore, Singapore M...\n", + "course_of_study [Business Administration]\n", + "None []\n", + "Name: 7485, dtype: object" + ] + }, + "execution_count": 158, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "row" + ] + }, + { + "cell_type": "code", + "execution_count": 166, + "id": "a3a0f1c7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
view_idwebsite_idsession_idcreated_aturlreferrerquery_paramsqsizecurrent...sort-direction_sm_au_vfbclidtrkamp;amp;sizeindustriesschoolcourse_of_studyorganisation
\n", + "

0 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [view_id, website_id, session_id, created_at, url, referrer, query_params, q, size, current, sort-field, sort-direction, _sm_au_, v, fbclid, trk, amp;amp;size, industries, school, course_of_study, organisation]\n", + "Index: []\n", + "\n", + "[0 rows x 21 columns]" + ] + }, + "execution_count": 166, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "a = pd.DataFrame(columns=columns)\n", + "a" + ] + }, + { + "cell_type": "code", + "execution_count": 164, + "id": "bc449824", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
view_idwebsite_idsession_idcreated_aturlreferrerquery_paramsqsizecurrent..._sm_au_vfbclidfilters[3][field]trkamp;amp;sizeindustriesschoolcourse_of_studyNone
74857688213922022-05-29 06:50:26.338+00/?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust.../?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...{'size': ['n_20_n'], 'filters[0][field]': ['in...NaN[n_20_n]NaN...NaNNaNNaNNaNNaNNaN[Banking and Finance][National University of Singapore, Singapore M...[Business Administration][]
\n", + "

1 rows × 22 columns

\n", + "
" + ], + "text/plain": [ + " view_id website_id session_id created_at \\\n", + "7485 7688 2 1392 2022-05-29 06:50:26.338+00 \n", + "\n", + " url \\\n", + "7485 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", + "\n", + " referrer \\\n", + "7485 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", + "\n", + " query_params q size \\\n", + "7485 {'size': ['n_20_n'], 'filters[0][field]': ['in... NaN [n_20_n] \n", + "\n", + " current ... _sm_au_ v fbclid filters[3][field] trk amp;amp;size \\\n", + "7485 NaN ... NaN NaN NaN NaN NaN NaN \n", + "\n", + " industries \\\n", + "7485 [Banking and Finance] \n", + "\n", + " school \\\n", + "7485 [National University of Singapore, Singapore M... \n", + "\n", + " course_of_study None \n", + "7485 [Business Administration] [] \n", + "\n", + "[1 rows x 22 columns]" + ] + }, + "execution_count": 164, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 167, + "id": "403e6169", + "metadata": {}, + "outputs": [], + "source": [ + "x = ['view_id', 'website_id', 'session_id', 'created_at', 'url', 'referrer', 'query_params',\n", + " 'q', 'size', 'current', 'sort-field', 'sort-direction', '_sm_au_', 'v', 'fbclid', 'trk',\n", + " 'amp;amp;size', 'industries', 'school', 'course_of_study', 'organisation']\n" + ] + }, + { + "cell_type": "code", + "execution_count": 168, + "id": "36451395", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['view_id',\n", + " 'website_id',\n", + " 'session_id',\n", + " 'created_at',\n", + " 'url',\n", + " 'referrer',\n", + " 'query_params',\n", + " 'q',\n", + " 'size',\n", + " 'current',\n", + " 'sort-field',\n", + " 'sort-direction',\n", + " '_sm_au_',\n", + " 'v',\n", + " 'fbclid',\n", + " 'trk',\n", + " 'amp;amp;size',\n", + " 'industries',\n", + " 'school',\n", + " 'course_of_study',\n", + " 'organisation']" + ] + }, + "execution_count": 168, + "metadata": {}, + "output_type": "execute_result" + } + ], + 
"source": [ + "x" +>>>>>>> d5fed27 (Update parsing urls.ipynb) ] }, { From 81df9a94bc99ec8562c7ac5d3d08a75a95edd77a Mon Sep 17 00:00:00 2001 From: Jolene Date: Sun, 12 Nov 2023 16:43:19 +0800 Subject: [PATCH 7/7] optimized code and added comments w more details --- 02ReformattingUmami/parsing urls.ipynb | 2164 ------------------------ 04URLsInPostgreSQL/parsing urls.ipynb | 385 +++++ 2 files changed, 385 insertions(+), 2164 deletions(-) delete mode 100644 02ReformattingUmami/parsing urls.ipynb create mode 100644 04URLsInPostgreSQL/parsing urls.ipynb diff --git a/02ReformattingUmami/parsing urls.ipynb b/02ReformattingUmami/parsing urls.ipynb deleted file mode 100644 index 02e27ba..0000000 --- a/02ReformattingUmami/parsing urls.ipynb +++ /dev/null @@ -1,2164 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", -<<<<<<< HEAD - "execution_count": 23, -======= - "execution_count": 11, ->>>>>>> d5fed27 (Update parsing urls.ipynb) - "id": "4ba88477", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "from urllib.parse import urlparse, parse_qs, unquote" - ] - }, - { - "cell_type": "code", -<<<<<<< HEAD - "execution_count": 24, -======= - "execution_count": 12, ->>>>>>> d5fed27 (Update parsing urls.ipynb) - "id": "c20ad040", - "metadata": {}, - "outputs": [], - "source": [ - "original_df = pd.read_csv('v1_pageview')" - ] - }, - { - "cell_type": "code", -<<<<<<< HEAD - "execution_count": 25, -======= - "execution_count": 13, ->>>>>>> d5fed27 (Update parsing urls.ipynb) - "id": "e731453c", - "metadata": {}, - "outputs": [], - "source": [ - "def extract_query_params(url):\n", - " url = unquote(url)\n", - " query_params = parse_qs(urlparse(url).query)\n", - " return query_params\n", - "\n", - "new_df = original_df.copy() \n", - "new_df['url'] = new_df['url'].astype(str)\n", - "new_df['query_params'] = new_df['url'].apply(extract_query_params)\n" - ] - }, - { - "cell_type": "code", -<<<<<<< HEAD - "execution_count": 26, 
-======= - "execution_count": 14, ->>>>>>> d5fed27 (Update parsing urls.ipynb) - "id": "c7471d8e", - "metadata": {}, - "outputs": [], - "source": [ - "x = pd.concat([new_df, new_df['query_params'].apply(pd.Series)], axis=1)" - ] - }, - { - "cell_type": "code", -<<<<<<< HEAD - "execution_count": 27, -======= - "execution_count": 49, - "id": "5604d86c", - "metadata": {}, - "outputs": [], - "source": [ - "x2 = x.assign(industries=lambda x: np.nan, \n", - " school=lambda x: np.nan,\n", - " course_of_study=lambda x: np.nan,\n", - " organisation=lambda x: np.nan) #create copy with the columns that i want" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "id": "33cd1847", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1019300\n", - "198379000\n", - "400201500\n", - "588100000\n", - "771322300\n", - "967610800\n", - "1166690100\n", - "1361307400\n", - "1567462400\n", - "1760879700\n", - "1972686300\n", - "2173218500\n", - "2374591800\n", - "2586973200\n", - "2845110800\n", - "3069079200\n", - "3274099700\n", - "3490637300\n", - "3738567700\n", - "3945910200\n", - "4154997800\n", - "4394562500\n", - "4606142900\n", - "4810549700\n", - "5002853500\n", - "5205447300\n", - "5402053600\n", - "5595158200\n", - "5795015500\n", - "6013136400\n", - "6219393100\n", - "6423695500\n", - "6616866700\n", - "6835747800\n", - "7024130500\n", - "7217490600\n", - "7436969500\n", - "7646088200\n", - "7847100200\n", - "8058185700\n", - "8270893600\n", - "8482217300\n", - "8679724000\n", - "8879031800\n", - "9058311200\n", - "9247930700\n", - "9451453100\n", - "9668764300\n", - "9886907500\n", - "10091585800\n", - "10282182200\n", - "10469907500\n", - "10689785100\n", - "10905295500\n", - "11129143200\n", - "11329276500\n", - "11520791000\n", - "11730772100\n", - "11920881500\n", - "12137022700\n", - "12350234400\n", - "12557910000\n", - "12759372500\n", - "12976088100\n", - "13189948600\n", - "13401493300\n", - "13584390200\n", 
- "13768548200\n", - "13947810100\n", - "14144238300\n", - "14319362600\n", - "14499509900\n", - "14671370800\n", - "14847169000\n", - "15009801100\n", - "15171386200\n", - "15343520400\n", - "15518651900\n", - "15691240500\n", - "15858359500\n", - "16034491600\n", - "16212696400\n", - "16380638300\n", - "16547752500\n", - "16713832500\n", - "16887430900\n", - "17054953600\n", - "17230079900\n", - "17404971300\n", - "17576107500\n", - "17765259700\n", - "17946401500\n", - "18122570500\n", - "18298027500\n", - "18490654000\n", - "18676347300\n", - "18858481400\n", - "19062635600\n", - "19240343800\n", - "19441518900\n", - "19632663700\n", - "19829562000\n", - "20001634200\n", - "20149930500\n", - "20318680000\n", - "20503769000\n", - "20667341800\n", - "20853630300\n", - "21018679200\n", - "21212859100\n", - "21388865100\n", - "21575672800\n", - "21758869500\n", - "21933173500\n" - ] - } - ], - "source": [ - "def isnotNaN(num):\n", - " return num == num\n", - "\n", - "columns = ['view_id', 'website_id', 'session_id', 'created_at', 'url', 'referrer', 'query_params',\n", - " 'q', 'size', 'current', 'sort-field', 'sort-direction', '_sm_au_', 'v', 'fbclid', 'trk',\n", - " 'amp;amp;size', 'industries', 'school', 'course_of_study', 'organisation'] #columns that i eventually want\n", - "\n", - "\n", - "for i in range(len(x2)): #go through original DF\n", - "\n", - " row = x2.iloc[i] # for each row,\n", - "\n", - " temp ={0:[None,list()], #theres 4 possible fields [field, list of values]\n", - " 1:[None,list()],\n", - " 2:[None,list()],\n", - " 3:[None,list()]\n", - " }\n", - "\n", - " for j in range(4): #identify which field corresponds to reach index\n", - " if isnotNaN(row.loc[f'filters[{j}][field]']):\n", - " temp[j][0] = row.loc[f'filters[{j}][field]'][0]\n", - " else:\n", - " break\n", - "\n", - " for k in range(4): #condense all the values for each of the fields into list of values\n", - " for l in range(20):\n", - " if f'filters[{k}][values][{l}]' in row.index and 
isnotNaN(row.loc[f'filters[{k}][values][{l}]']):\n", - " temp[k][1].append(row.loc[f'filters[{k}][values][{l}]'][0]) \n", - " else:\n", - " break\n", - "\n", - " for z in range(4): #add the new column:values to dataframe\n", - " field = temp[z][0]\n", - " values = temp[z][1]\n", - " if field is None:\n", - " break\n", - " else:\n", - " x2.loc[i,field] = str(values)\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": 61, ->>>>>>> d5fed27 (Update parsing urls.ipynb) - "id": "57f88523", - "metadata": {}, - "outputs": [], - "source": [ -<<<<<<< HEAD - "x.to_csv('data-preprocessed.csv', index=False)" -======= - "x2.to_csv('data-preprocessed.csv', index=False)" - ] - }, - { - "cell_type": "markdown", - "id": "473214f9", - "metadata": {}, - "source": [ - "testing\n", - "\n", - "


























" - ] - }, - { - "cell_type": "code", - "execution_count": 195, - "id": "b8ab56af", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
industriesschoolcourse_of_studyorganisation
7480NaNNaNNaNNaN
7481NaNNaNNaNNaN
7482[Banking and Finance]NaNNaNNaN
7483[Banking and Finance][National University of Singapore]NaNNaN
7484[Banking and Finance][National University of Singapore, Singapore M...NaNNaN
7485[Banking and Finance][National University of Singapore, Singapore M...[Business Administration]NaN
7486NaNNaNNaNNaN
7487[Banking and Finance]NaNNaNNaN
7488NaNNaNNaNNaN
7489NaNNaNNaNNaN
\n", - "
" - ], - "text/plain": [ - " industries \\\n", - "7480 NaN \n", - "7481 NaN \n", - "7482 [Banking and Finance] \n", - "7483 [Banking and Finance] \n", - "7484 [Banking and Finance] \n", - "7485 [Banking and Finance] \n", - "7486 NaN \n", - "7487 [Banking and Finance] \n", - "7488 NaN \n", - "7489 NaN \n", - "\n", - " school \\\n", - "7480 NaN \n", - "7481 NaN \n", - "7482 NaN \n", - "7483 [National University of Singapore] \n", - "7484 [National University of Singapore, Singapore M... \n", - "7485 [National University of Singapore, Singapore M... \n", - "7486 NaN \n", - "7487 NaN \n", - "7488 NaN \n", - "7489 NaN \n", - "\n", - " course_of_study organisation \n", - "7480 NaN NaN \n", - "7481 NaN NaN \n", - "7482 NaN NaN \n", - "7483 NaN NaN \n", - "7484 NaN NaN \n", - "7485 [Business Administration] NaN \n", - "7486 NaN NaN \n", - "7487 NaN NaN \n", - "7488 NaN NaN \n", - "7489 NaN NaN " - ] - }, - "execution_count": 195, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "a.iloc[7480:7490,17:21]" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "id": "340baaf6", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
view_idwebsite_idsession_idcreated_aturlreferrerquery_paramsqsizefilters[0][field]...fbclidfilters[3][field]trkfilters[3][values][0]filters[3][values][1]amp;amp;sizeindustriesschoolcourse_of_studyorganisation
74817684213922022-05-29 06:50:08.711+00/?size=n_20_n/{'size': ['n_20_n']}NaN[n_20_n]NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
74827685213922022-05-29 06:50:12.671+00/?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust.../?size=n_20_n{'size': ['n_20_n'], 'filters[0][field]': ['in...NaN[n_20_n][industries]...NaNNaNNaNNaNNaNNaN['Banking and Finance']NaNNaNNaN
74837686213922022-05-29 06:50:18.17+00/?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust.../?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...{'size': ['n_20_n'], 'filters[0][field]': ['in...NaN[n_20_n][industries]...NaNNaNNaNNaNNaNNaN['Banking and Finance']['National University of Singapore']NaNNaN
74847687213922022-05-29 06:50:20.512+00/?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust.../?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...{'size': ['n_20_n'], 'filters[0][field]': ['in...NaN[n_20_n][industries]...NaNNaNNaNNaNNaNNaN['Banking and Finance']['National University of Singapore', 'Singapor...NaNNaN
74857688213922022-05-29 06:50:26.338+00/?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust.../?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...{'size': ['n_20_n'], 'filters[0][field]': ['in...NaN[n_20_n][industries]...NaNNaNNaNNaNNaNNaN['Banking and Finance']['National University of Singapore', 'Singapor...['Business Administration']NaN
..................................................................
1135841138141161392023-04-24 14:18:54.34+00/https://static.elfsight.com/{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1135851138151161392023-04-24 14:51:55.512+00/events/https://beta.advisory.sg/{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1135861138161161392023-04-24 14:52:04.993+00/press-releases/NaN{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1135871138171161402023-04-24 16:38:43.474+00/2017/10/05/conversations-with-tee-chee-yen/http://localhost:2368/{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1135881138181161402023-04-24 16:40:18.422+00/2017/07/30/conversations-with-marvin-kang/http://localhost:2368/tag/social-service/{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", - "

106108 rows × 51 columns

\n", - "
" - ], - "text/plain": [ - " view_id website_id session_id created_at \\\n", - "7481 7684 2 1392 2022-05-29 06:50:08.711+00 \n", - "7482 7685 2 1392 2022-05-29 06:50:12.671+00 \n", - "7483 7686 2 1392 2022-05-29 06:50:18.17+00 \n", - "7484 7687 2 1392 2022-05-29 06:50:20.512+00 \n", - "7485 7688 2 1392 2022-05-29 06:50:26.338+00 \n", - "... ... ... ... ... \n", - "113584 113814 1 16139 2023-04-24 14:18:54.34+00 \n", - "113585 113815 1 16139 2023-04-24 14:51:55.512+00 \n", - "113586 113816 1 16139 2023-04-24 14:52:04.993+00 \n", - "113587 113817 1 16140 2023-04-24 16:38:43.474+00 \n", - "113588 113818 1 16140 2023-04-24 16:40:18.422+00 \n", - "\n", - " url \\\n", - "7481 /?size=n_20_n \n", - "7482 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", - "7483 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", - "7484 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", - "7485 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", - "... ... \n", - "113584 / \n", - "113585 /events/ \n", - "113586 /press-releases/ \n", - "113587 /2017/10/05/conversations-with-tee-chee-yen/ \n", - "113588 /2017/07/30/conversations-with-marvin-kang/ \n", - "\n", - " referrer \\\n", - "7481 / \n", - "7482 /?size=n_20_n \n", - "7483 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", - "7484 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", - "7485 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", - "... ... \n", - "113584 https://static.elfsight.com/ \n", - "113585 https://beta.advisory.sg/ \n", - "113586 NaN \n", - "113587 http://localhost:2368/ \n", - "113588 http://localhost:2368/tag/social-service/ \n", - "\n", - " query_params q size \\\n", - "7481 {'size': ['n_20_n']} NaN [n_20_n] \n", - "7482 {'size': ['n_20_n'], 'filters[0][field]': ['in... NaN [n_20_n] \n", - "7483 {'size': ['n_20_n'], 'filters[0][field]': ['in... NaN [n_20_n] \n", - "7484 {'size': ['n_20_n'], 'filters[0][field]': ['in... 
NaN [n_20_n] \n", - "7485 {'size': ['n_20_n'], 'filters[0][field]': ['in... NaN [n_20_n] \n", - "... ... ... ... \n", - "113584 {} NaN NaN \n", - "113585 {} NaN NaN \n", - "113586 {} NaN NaN \n", - "113587 {} NaN NaN \n", - "113588 {} NaN NaN \n", - "\n", - " filters[0][field] ... fbclid filters[3][field] trk \\\n", - "7481 NaN ... NaN NaN NaN \n", - "7482 [industries] ... NaN NaN NaN \n", - "7483 [industries] ... NaN NaN NaN \n", - "7484 [industries] ... NaN NaN NaN \n", - "7485 [industries] ... NaN NaN NaN \n", - "... ... ... ... ... ... \n", - "113584 NaN ... NaN NaN NaN \n", - "113585 NaN ... NaN NaN NaN \n", - "113586 NaN ... NaN NaN NaN \n", - "113587 NaN ... NaN NaN NaN \n", - "113588 NaN ... NaN NaN NaN \n", - "\n", - " filters[3][values][0] filters[3][values][1] amp;amp;size \\\n", - "7481 NaN NaN NaN \n", - "7482 NaN NaN NaN \n", - "7483 NaN NaN NaN \n", - "7484 NaN NaN NaN \n", - "7485 NaN NaN NaN \n", - "... ... ... ... \n", - "113584 NaN NaN NaN \n", - "113585 NaN NaN NaN \n", - "113586 NaN NaN NaN \n", - "113587 NaN NaN NaN \n", - "113588 NaN NaN NaN \n", - "\n", - " industries \\\n", - "7481 NaN \n", - "7482 ['Banking and Finance'] \n", - "7483 ['Banking and Finance'] \n", - "7484 ['Banking and Finance'] \n", - "7485 ['Banking and Finance'] \n", - "... ... \n", - "113584 NaN \n", - "113585 NaN \n", - "113586 NaN \n", - "113587 NaN \n", - "113588 NaN \n", - "\n", - " school \\\n", - "7481 NaN \n", - "7482 NaN \n", - "7483 ['National University of Singapore'] \n", - "7484 ['National University of Singapore', 'Singapor... \n", - "7485 ['National University of Singapore', 'Singapor... \n", - "... ... \n", - "113584 NaN \n", - "113585 NaN \n", - "113586 NaN \n", - "113587 NaN \n", - "113588 NaN \n", - "\n", - " course_of_study organisation \n", - "7481 NaN NaN \n", - "7482 NaN NaN \n", - "7483 NaN NaN \n", - "7484 NaN NaN \n", - "7485 ['Business Administration'] NaN \n", - "... ... ... 
\n", - "113584 NaN NaN \n", - "113585 NaN NaN \n", - "113586 NaN NaN \n", - "113587 NaN NaN \n", - "113588 NaN NaN \n", - "\n", - "[106108 rows x 51 columns]" - ] - }, - "execution_count": 60, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "x2.iloc[7481:,:-9]" - ] - }, - { - "cell_type": "code", - "execution_count": 157, - "id": "794a6b93", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
view_idwebsite_idsession_idcreated_aturlreferrerquery_paramsqsizecurrent...sort-direction_sm_au_vfbclidtrkamp;amp;sizeindustriesschoolcourse_of_studyorganisation
\n", - "

0 rows × 21 columns

\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: [view_id, website_id, session_id, created_at, url, referrer, query_params, q, size, current, sort-field, sort-direction, _sm_au_, v, fbclid, trk, amp;amp;size, industries, school, course_of_study, organisation]\n", - "Index: []\n", - "\n", - "[0 rows x 21 columns]" - ] - }, - "execution_count": 157, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "columns = ['view_id', 'website_id', 'session_id', 'created_at', 'url', 'referrer', 'query_params', 'q', 'size', 'current', 'sort-field', 'sort-direction', '_sm_au_', 'v', 'fbclid', 'trk', 'amp;amp;size', 'industries', 'school', 'course_of_study', 'organisation']\n", - "y = pd.DataFrame(columns=columns)\n", - "y" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cc3ea68f", - "metadata": {}, - "outputs": [], - "source": [ - "18888237600\n", - "40737104200\n", - "72785283300\n", - "108758322400\n", - "145217173600\n", - "186793474100\n", - "225162609200\n", - "264882508000\n", - "314453042500\n", - "365454468800\n", - "408983182300\n", - "455546595500\n", - "504377514500\n", - "549119827600\n", - "591181220700\n", - "635682159900\n", - "682860726900\n", - "732106724800\n", - "782814304400\n", - "835621612600\n", - "891255040500\n", - "943683757500\n", - "983562692700\n", - "1023982651900\n", - "1065614924700\n", - "1108148358200\n", - "1151829557700\n", - "1196612275400\n", - "1242725911700\n", - "1291049334900\n", - "1341243997100\n", - "1392687859900\n", - "1445250854800\n", - "1498960928900\n", - "1553929939500\n", - "1611468594200\n", - "1671760449600\n", - "1734306435500\n", - "1795385092900\n", - "1857648008700\n", - "1921126209100\n", - "1989747478400\n", - "2056883359300\n", - "2124365102700\n", - "2192953255500\n", - "2262542653300\n", - "2333537347900\n", - "2407247346000\n", - "2481174709300\n", - "2556079180300\n", - "2632721770500\n", - "2711853018800\n", - "2797623695000\n", - "2885214009200\n", - 
"2969392615200\n", - "3052316078600\n", - "3135706844500\n", - "3220985459900\n", - "3308010920400\n", - "3394591298400\n", - "3481977400200\n", - "3570721798700\n", - "3660806746900\n", - "3751976516100\n", - "3844537433200\n", - "3938891733100\n", - "4034454066400\n", - "4137224107900\n", - "4239237949900\n", - "4341107871900\n", - "4445053302400\n", - "4550355588000\n", - "4656088215800\n", - "4760841571100\n", - "4866702552200\n", - "4973539051500\n", - "5082814611800\n", - "5192301727000\n", - "5303393414800\n", - "5415569427500\n", - "5529336622100\n", - "5644090487900\n", - "7905513294400\n", - "8022122905300\n", - "8142019076400\n", - "8262193145400\n", - "8413007131000\n", - "8569099970100\n", - "8699897960500\n", - "8861730423300\n", - "9046117615700\n", - "9196323885200\n", - "9364978097600\n", - "9523016066800\n", - "9688182973700\n", - "9856712632500\n", - "10026784638300\n", - "10197057256100\n", - "10319851429200\n", - "10375110798500\n", - "10430841262800\n", - "10487271861600\n", - "10544539678800\n", - "10602246570200\n", - "10659615402100\n", - "10718192663600\n", - "10777064820100\n", - "10836482106000\n", - "10896603119500\n", - "10957077660600\n", - "11020635868700" - ] - }, - { - "cell_type": "code", - "execution_count": 160, - "id": "ded3c20e", - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.concat([pd.DataFrame(row, columns=columns), y], ignore_index=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 161, - "id": "36d21011", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
view_idwebsite_idsession_idcreated_aturlreferrerquery_paramsqsizecurrent...sort-direction_sm_au_vfbclidtrkamp;amp;sizeindustriesschoolcourse_of_studyorganisation
\n", - "

0 rows × 21 columns

\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: [view_id, website_id, session_id, created_at, url, referrer, query_params, q, size, current, sort-field, sort-direction, _sm_au_, v, fbclid, trk, amp;amp;size, industries, school, course_of_study, organisation]\n", - "Index: []\n", - "\n", - "[0 rows x 21 columns]" - ] - }, - "execution_count": 161, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df\n" - ] - }, - { - "cell_type": "code", - "execution_count": 140, - "id": "17f73e73", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
view_idwebsite_idsession_idcreated_aturlreferrerquery_paramsqsizefilters[0][field]...filters[0][values][8]filters[0][values][9]_sm_au_vfbclidfilters[3][field]trkfilters[3][values][0]filters[3][values][1]amp;amp;size
02332932022-05-24 01:43:25.814+00/?q=chemical&size=n_20_nNaN{'q': ['chemical'], 'size': ['n_20_n']}[chemical][n_20_n]NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
12342942022-05-24 01:45:16.891+00/NaN{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22352942022-05-24 01:45:17.571+00/?size=n_20_n/{'size': ['n_20_n']}NaN[n_20_n]NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
32362942022-05-24 01:46:04.09+00/?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust.../?size=n_20_n{'size': ['n_20_n'], 'filters[0][field]': ['in...NaN[n_20_n][industries]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
42372942022-05-24 01:46:10.66+00/?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust.../?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...{'size': ['n_20_n'], 'filters[0][field]': ['in...NaN[n_20_n][industries]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
..................................................................
1135841138141161392023-04-24 14:18:54.34+00/https://static.elfsight.com/{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1135851138151161392023-04-24 14:51:55.512+00/events/https://beta.advisory.sg/{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1135861138161161392023-04-24 14:52:04.993+00/press-releases/NaN{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1135871138171161402023-04-24 16:38:43.474+00/2017/10/05/conversations-with-tee-chee-yen/http://localhost:2368/{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1135881138181161402023-04-24 16:40:18.422+00/2017/07/30/conversations-with-marvin-kang/http://localhost:2368/tag/social-service/{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", - "

113589 rows × 47 columns

\n", - "
" - ], - "text/plain": [ - " view_id website_id session_id created_at \\\n", - "0 233 2 93 2022-05-24 01:43:25.814+00 \n", - "1 234 2 94 2022-05-24 01:45:16.891+00 \n", - "2 235 2 94 2022-05-24 01:45:17.571+00 \n", - "3 236 2 94 2022-05-24 01:46:04.09+00 \n", - "4 237 2 94 2022-05-24 01:46:10.66+00 \n", - "... ... ... ... ... \n", - "113584 113814 1 16139 2023-04-24 14:18:54.34+00 \n", - "113585 113815 1 16139 2023-04-24 14:51:55.512+00 \n", - "113586 113816 1 16139 2023-04-24 14:52:04.993+00 \n", - "113587 113817 1 16140 2023-04-24 16:38:43.474+00 \n", - "113588 113818 1 16140 2023-04-24 16:40:18.422+00 \n", - "\n", - " url \\\n", - "0 /?q=chemical&size=n_20_n \n", - "1 / \n", - "2 /?size=n_20_n \n", - "3 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", - "4 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", - "... ... \n", - "113584 / \n", - "113585 /events/ \n", - "113586 /press-releases/ \n", - "113587 /2017/10/05/conversations-with-tee-chee-yen/ \n", - "113588 /2017/07/30/conversations-with-marvin-kang/ \n", - "\n", - " referrer \\\n", - "0 NaN \n", - "1 NaN \n", - "2 / \n", - "3 /?size=n_20_n \n", - "4 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", - "... ... \n", - "113584 https://static.elfsight.com/ \n", - "113585 https://beta.advisory.sg/ \n", - "113586 NaN \n", - "113587 http://localhost:2368/ \n", - "113588 http://localhost:2368/tag/social-service/ \n", - "\n", - " query_params q \\\n", - "0 {'q': ['chemical'], 'size': ['n_20_n']} [chemical] \n", - "1 {} NaN \n", - "2 {'size': ['n_20_n']} NaN \n", - "3 {'size': ['n_20_n'], 'filters[0][field]': ['in... NaN \n", - "4 {'size': ['n_20_n'], 'filters[0][field]': ['in... NaN \n", - "... ... ... \n", - "113584 {} NaN \n", - "113585 {} NaN \n", - "113586 {} NaN \n", - "113587 {} NaN \n", - "113588 {} NaN \n", - "\n", - " size filters[0][field] ... filters[0][values][8] \\\n", - "0 [n_20_n] NaN ... NaN \n", - "1 NaN NaN ... NaN \n", - "2 [n_20_n] NaN ... NaN \n", - "3 [n_20_n] [industries] ... 
NaN \n", - "4 [n_20_n] [industries] ... NaN \n", - "... ... ... ... ... \n", - "113584 NaN NaN ... NaN \n", - "113585 NaN NaN ... NaN \n", - "113586 NaN NaN ... NaN \n", - "113587 NaN NaN ... NaN \n", - "113588 NaN NaN ... NaN \n", - "\n", - " filters[0][values][9] _sm_au_ v fbclid filters[3][field] trk \\\n", - "0 NaN NaN NaN NaN NaN NaN \n", - "1 NaN NaN NaN NaN NaN NaN \n", - "2 NaN NaN NaN NaN NaN NaN \n", - "3 NaN NaN NaN NaN NaN NaN \n", - "4 NaN NaN NaN NaN NaN NaN \n", - "... ... ... ... ... ... ... \n", - "113584 NaN NaN NaN NaN NaN NaN \n", - "113585 NaN NaN NaN NaN NaN NaN \n", - "113586 NaN NaN NaN NaN NaN NaN \n", - "113587 NaN NaN NaN NaN NaN NaN \n", - "113588 NaN NaN NaN NaN NaN NaN \n", - "\n", - " filters[3][values][0] filters[3][values][1] amp;amp;size \n", - "0 NaN NaN NaN \n", - "1 NaN NaN NaN \n", - "2 NaN NaN NaN \n", - "3 NaN NaN NaN \n", - "4 NaN NaN NaN \n", - "... ... ... ... \n", - "113584 NaN NaN NaN \n", - "113585 NaN NaN NaN \n", - "113586 NaN NaN NaN \n", - "113587 NaN NaN NaN \n", - "113588 NaN NaN NaN \n", - "\n", - "[113589 rows x 47 columns]" - ] - }, - "execution_count": 140, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "x" - ] - }, - { - "cell_type": "code", - "execution_count": 147, - "id": "a5c9804c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "view_id 7688\n", - "website_id 2\n", - "session_id 1392\n", - "created_at 2022-05-29 06:50:26.338+00\n", - "url /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...\n", - "referrer /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...\n", - "query_params {'size': ['n_20_n'], 'filters[0][field]': ['in...\n", - "q NaN\n", - "size [n_20_n]\n", - "filters[0][field] [industries]\n", - "filters[0][values][0] [Banking and Finance]\n", - "filters[0][type] [all]\n", - "filters[0][values][1] NaN\n", - "filters[0][values][2] NaN\n", - "current NaN\n", - "sort-field NaN\n", - "sort-direction NaN\n", - "filters[1][field] [school]\n", - 
"filters[1][values][0] [National University of Singapore]\n", - "filters[1][type] [any]\n", - "filters[0][values][3] NaN\n", - "filters[0][values][4] NaN\n", - "filters[1][values][1] [Singapore Management University]\n", - "filters[1][values][2] NaN\n", - "filters[1][values][3] NaN\n", - "filters[1][values][4] NaN\n", - "filters[1][values][5] NaN\n", - "filters[2][field] [course_of_study]\n", - "filters[2][values][0] [Business Administration]\n", - "filters[2][values][1] NaN\n", - "filters[2][values][2] NaN\n", - "filters[2][values][3] NaN\n", - "filters[1][values][6] NaN\n", - "filters[2][type] [any]\n", - "filters[0][values][5] NaN\n", - "filters[0][values][6] NaN\n", - "filters[0][values][7] NaN\n", - "filters[0][values][8] NaN\n", - "filters[0][values][9] NaN\n", - "_sm_au_ NaN\n", - "v NaN\n", - "fbclid NaN\n", - "filters[3][field] NaN\n", - "trk NaN\n", - "filters[3][values][0] NaN\n", - "filters[3][values][1] NaN\n", - "amp;amp;size NaN\n", - "Name: 7485, dtype: object" - ] - }, - "execution_count": 147, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "row = x.iloc[7485]\n", - "row" - ] - }, - { - "cell_type": "code", - "execution_count": 148, - "id": "2335961c", - "metadata": {}, - "outputs": [], - "source": [ - "def isnotNaN(num):\n", - " return num == num\n", - "\n", - "temp ={0:[None,list()],\n", - " 1:[None,list()],\n", - " 2:[None,list()],\n", - " 3:[None,list()]\n", - " }\n", - "\n", - "for i in range(4):\n", - " if isnotNaN(row.loc[f'filters[{i}][field]']):\n", - " temp[i][0] = row.loc[f'filters[{i}][field]'][0]\n", - " \n", - "for i in range(4):\n", - " for j in range(20):\n", - " if f'filters[{i}][values][{j}]' in row.index and isnotNaN(row.loc[f'filters[{i}][values][{j}]']):\n", - " temp[i][1].append(row.loc[f'filters[{i}][values][{j}]'][0]) \n", - " \n", - "for i in range(4):\n", - " for j in range(20):\n", - " if f'filters[{i}][values][{j}]' in row.index:\n", - " row = 
row.drop(labels=f'filters[{i}][values][{j}]')\n", - " \n", - "for i in range(4):\n", - " if isnotNaN(row.loc[f'filters[{i}][field]']):\n", - " row = row.drop(labels=[f'filters[{i}][field]', f'filters[{i}][type]'])\n", - " \n", - "for i in range(4):\n", - " field = temp[i][0]\n", - " values = temp[i][1]\n", - " row[field] = values" - ] - }, - { - "cell_type": "code", - "execution_count": 116, - "id": "e393f951", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 114, - "id": "9c3dba3e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{0: ['industries', ['Banking and Finance']],\n", - " 1: ['school',\n", - " ['National University of Singapore', 'Singapore Management University']],\n", - " 2: ['course_of_study', ['Business Administration']],\n", - " 3: [None, []]}" - ] - }, - "execution_count": 114, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "temp" - ] - }, - { - "cell_type": "code", - "execution_count": 117, - "id": "b57d4ca9", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "view_id 7688\n", - "website_id 2\n", - "session_id 1392\n", - "created_at 2022-05-29 06:50:26.338+00\n", - "url /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...\n", - "referrer /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...\n", - "query_params {'size': ['n_20_n'], 'filters[0][field]': ['in...\n", - "q NaN\n", - "size [n_20_n]\n", - "current NaN\n", - "sort-field NaN\n", - "sort-direction NaN\n", - "_sm_au_ NaN\n", - "v NaN\n", - "fbclid NaN\n", - "filters[3][field] NaN\n", - "trk NaN\n", - "amp;amp;size NaN\n", - "industries [Banking and Finance]\n", - "school [National University of Singapore, Singapore M...\n", - "course_of_study [Business Administration]\n", - "None []\n", - "Name: 7485, dtype: object" - ] - }, - "execution_count": 117, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "row" - ] - }, - { - "cell_type": "code", - 
"execution_count": 149, - "id": "5d91fe23", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index([ 'view_id', 'website_id', 'session_id',\n", - " 'created_at', 'url', 'referrer',\n", - " 'query_params', 'q', 'size',\n", - " 'current', 'sort-field', 'sort-direction',\n", - " '_sm_au_', 'v', 'fbclid',\n", - " 'filters[3][field]', 'trk', 'amp;amp;size',\n", - " 'industries', 'school', 'course_of_study',\n", - " None],\n", - " dtype='object')" - ] - }, - "execution_count": 149, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "row.index" - ] - }, - { - "cell_type": "code", - "execution_count": 96, - "id": "d3b752a2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "pandas.core.series.Series" - ] - }, - "execution_count": 96, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "type(row)" - ] - }, - { - "cell_type": "code", - "execution_count": 153, - "id": "043c5d16", - "metadata": {}, - "outputs": [], - "source": [ - "l = list(row.index)\n", - "l.append('organisation')" - ] - }, - { - "cell_type": "code", - "execution_count": 155, - "id": "7787dc1e", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['view_id', 'website_id', 'session_id', 'created_at', 'url', 'referrer', 'query_params', 'q', 'size', 'current', 'sort-field', 'sort-direction', '_sm_au_', 'v', 'fbclid', 'filters[3][field]', 'trk', 'amp;amp;size', 'industries', 'school', 'course_of_study', None, 'organisation']\n" - ] - } - ], - "source": [ - "print(l)" - ] - }, - { - "cell_type": "code", - "execution_count": 158, - "id": "b21898f8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "view_id 7688\n", - "website_id 2\n", - "session_id 1392\n", - "created_at 2022-05-29 06:50:26.338+00\n", - "url /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...\n", - "referrer /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...\n", - "query_params {'size': 
['n_20_n'], 'filters[0][field]': ['in...\n", - "q NaN\n", - "size [n_20_n]\n", - "current NaN\n", - "sort-field NaN\n", - "sort-direction NaN\n", - "_sm_au_ NaN\n", - "v NaN\n", - "fbclid NaN\n", - "filters[3][field] NaN\n", - "trk NaN\n", - "amp;amp;size NaN\n", - "industries [Banking and Finance]\n", - "school [National University of Singapore, Singapore M...\n", - "course_of_study [Business Administration]\n", - "None []\n", - "Name: 7485, dtype: object" - ] - }, - "execution_count": 158, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "row" - ] - }, - { - "cell_type": "code", - "execution_count": 166, - "id": "a3a0f1c7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
view_idwebsite_idsession_idcreated_aturlreferrerquery_paramsqsizecurrent...sort-direction_sm_au_vfbclidtrkamp;amp;sizeindustriesschoolcourse_of_studyorganisation
\n", - "

0 rows × 21 columns

\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: [view_id, website_id, session_id, created_at, url, referrer, query_params, q, size, current, sort-field, sort-direction, _sm_au_, v, fbclid, trk, amp;amp;size, industries, school, course_of_study, organisation]\n", - "Index: []\n", - "\n", - "[0 rows x 21 columns]" - ] - }, - "execution_count": 166, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "a = pd.DataFrame(columns=columns)\n", - "a" - ] - }, - { - "cell_type": "code", - "execution_count": 164, - "id": "bc449824", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
view_idwebsite_idsession_idcreated_aturlreferrerquery_paramsqsizecurrent..._sm_au_vfbclidfilters[3][field]trkamp;amp;sizeindustriesschoolcourse_of_studyNone
74857688213922022-05-29 06:50:26.338+00/?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust.../?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...{'size': ['n_20_n'], 'filters[0][field]': ['in...NaN[n_20_n]NaN...NaNNaNNaNNaNNaNNaN[Banking and Finance][National University of Singapore, Singapore M...[Business Administration][]
\n", - "

1 rows × 22 columns

\n", - "
" - ], - "text/plain": [ - " view_id website_id session_id created_at \\\n", - "7485 7688 2 1392 2022-05-29 06:50:26.338+00 \n", - "\n", - " url \\\n", - "7485 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", - "\n", - " referrer \\\n", - "7485 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", - "\n", - " query_params q size \\\n", - "7485 {'size': ['n_20_n'], 'filters[0][field]': ['in... NaN [n_20_n] \n", - "\n", - " current ... _sm_au_ v fbclid filters[3][field] trk amp;amp;size \\\n", - "7485 NaN ... NaN NaN NaN NaN NaN NaN \n", - "\n", - " industries \\\n", - "7485 [Banking and Finance] \n", - "\n", - " school \\\n", - "7485 [National University of Singapore, Singapore M... \n", - "\n", - " course_of_study None \n", - "7485 [Business Administration] [] \n", - "\n", - "[1 rows x 22 columns]" - ] - }, - "execution_count": 164, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 167, - "id": "403e6169", - "metadata": {}, - "outputs": [], - "source": [ - "x = ['view_id', 'website_id', 'session_id', 'created_at', 'url', 'referrer', 'query_params',\n", - " 'q', 'size', 'current', 'sort-field', 'sort-direction', '_sm_au_', 'v', 'fbclid', 'trk',\n", - " 'amp;amp;size', 'industries', 'school', 'course_of_study', 'organisation']\n" - ] - }, - { - "cell_type": "code", - "execution_count": 168, - "id": "36451395", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['view_id',\n", - " 'website_id',\n", - " 'session_id',\n", - " 'created_at',\n", - " 'url',\n", - " 'referrer',\n", - " 'query_params',\n", - " 'q',\n", - " 'size',\n", - " 'current',\n", - " 'sort-field',\n", - " 'sort-direction',\n", - " '_sm_au_',\n", - " 'v',\n", - " 'fbclid',\n", - " 'trk',\n", - " 'amp;amp;size',\n", - " 'industries',\n", - " 'school',\n", - " 'course_of_study',\n", - " 'organisation']" - ] - }, - "execution_count": 168, - "metadata": {}, - "output_type": "execute_result" - } - ], - 
"source": [ - "x" ->>>>>>> d5fed27 (Update parsing urls.ipynb) - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "afede6fa", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/04URLsInPostgreSQL/parsing urls.ipynb b/04URLsInPostgreSQL/parsing urls.ipynb new file mode 100644 index 0000000..7fd5c6b --- /dev/null +++ b/04URLsInPostgreSQL/parsing urls.ipynb @@ -0,0 +1,385 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Parsing URLs with data from PostgreSQL dump file\n", + "\n", + "Input dataframe:\n", + " - view_id\n", + " - website_id\n", + " - session_id\n", + " - created_at\n", + " - url\n", + " - referrer\n", + "\n", + "Output dataframe:\n", + "- industries (this will contain a list of all the values in this field)\n", + "- course_of_study\n", + "- organisation\n", + "- school\n", + "\n", + "Summary of insights gained: (if any)\n", + "\n", + "Written by: Howard and Jolene (only mostly optimizations)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4ba88477", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 113589 entries, 0 to 113588\n", + "Data columns (total 6 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 view_id 113589 non-null int64 \n", + " 1 website_id 113589 non-null int64 \n", + " 2 session_id 113589 non-null int64 \n", + " 3 created_at 113589 non-null object\n", + " 4 url 113589 non-null object\n", + " 5 referrer
99529 non-null object\n", + "dtypes: int64(3), object(3)\n", + "memory usage: 5.2+ MB\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from urllib.parse import urlparse, parse_qs, unquote\n", + "\n", + "original_df = pd.read_csv('v1_pageview')\n", + "original_df.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Helper Functions" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e731453c", + "metadata": {}, + "outputs": [], + "source": [ + "def extract_query_params(url):\n", + " url = unquote(url) # make it human readable, not percentages\n", + " query_params = parse_qs(urlparse(url).query)\n", + " return query_params" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Explaining process_query_params\n", + "\n", + "What does the url look like when navigating the page?\n", + "```\n", + "// filter by industry Information and Communications Technology\n", + "filters[0][field]=industries&filters[0][values][0]=Information and Communications Technology\n", + "\n", + "// filter by organization Google and SAP\n", + "filters[0][field]=organisation&filters[0][values][0]=SAP\n", + "filters[1][field]=organisation&filters[1][values][0]=Google\n", + "\n", + "// filter by school \n", + "filters[1][field]=school&filters[1][values][0]=National University of Singapore\n", + "filters[1][type]=any # we will ignore this part; not sure what it is, it is 'any' in this example\n", + "filters[2][field]=course_of_study\n", + "filters[2][values][0]=Economics%2C Psychology\n", + "filters[2][type]=any\n", + "```\n", + "\n", + "To summarize, the number in `filters[0][field]` is the zero-based position of the filter (0 = first filter applied, 1 = second, and so on), while the second bracketed key, which is one of 'field', 'values' or 'type', tells us what the text after `=` represents. 
`filters[1][field]=school&filters[1][values][0]=National University of Singapore` means that the second filter applied is a school filter, and the value of that school filter is given by `filters[1][values][0]=National University of Singapore`.\n", + "\n", + "So we can make use of this to extract the filters applied to each visitor's URL." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def process_query_params(params):\n", + " '''\n", + " Parameters:\n", + " params -> {'size': ['n_20_n'], 'filters[0][field]': ['industries'], 'filters[0][values][0]': ['Information and Communications Technology'], 'filters[0][type]': ['all']}\n", + " \n", + " Returns:\n", + " {\n", + " \"search_query\": \"search term\",\n", + " \"filter_name\": [\"filter value 1\", \"filter value 2\"]\n", + " }\n", + "\n", + " Note:\n", + " - The filter_name is the name of the filter, e.g. industries, school etc.\n", + " - size and type are ignored\n", + " '''\n", + " result = {}\n", + " current_field = ''\n", + "\n", + " for key, value in params.items():\n", + " if 'filters' in key:\n", + " parts = key.split('[')\n", + " field_or_value = parts[2].strip(']')\n", + "\n", + " if field_or_value == 'field':\n", + " # if its a field then use it as a key\n", + " current_field = value[0]\n", + " result[current_field] = []\n", + " elif field_or_value == 'type':\n", + " # there's a type of all in all queries, not sure what that is and whether its relevant\n", + " pass\n", + " else:\n", + " # if its a value then add it to the list by using the last saved field\n", + " result[current_field].extend(value)\n", + " elif key == \"q\":\n", + " result[\"search_query\"] = value[0]\n", + "\n", + " result = dict(result)\n", + " return result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Main" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Process URLs to extract query params" + ] + }, + { +
"cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "df = original_df.copy(deep=True)\n", + "df['url'] = df['url'].astype(str)\n", + "df['query_params'] = df['url'].apply(extract_query_params) # gets all the query params and makes it a dictionary " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# x = pd.concat([new_df, new_df['query_params'].apply(lambda x: pd.Series(x, dtype=\"object\"))], axis=1) #separate each of the key:value pairs into it's own column \n", + "\n", + "# x2 = x.assign(industries=lambda x: np.nan, \n", + "# school=lambda x: np.nan,\n", + "# course_of_study=lambda x: np.nan,\n", + "# organisation=lambda x: np.nan) #create copy with the columns that i want\n", + "\n", + "# def isnotNaN(num):\n", + "# return num == num\n", + "\n", + "# columns = ['view_id', 'website_id', 'session_id', 'created_at', 'url', 'referrer', 'query_params',\n", + "# 'q', 'size', 'current', 'sort-field', 'sort-direction', '_sm_au_', 'v', 'fbclid', 'trk',\n", + "# 'amp;amp;size', 'industries', 'school', 'course_of_study', 'organisation'] #columns that i eventually want\n", + "\n", + "\n", + "# for i in range(len(x2)): #go through original DF\n", + "\n", + "# row = x2.iloc[i] # for each row,\n", + "\n", + "# temp ={0:[None,list()], #theres 4 possible fields [field, list of values]\n", + "# 1:[None,list()],\n", + "# 2:[None,list()],\n", + "# 3:[None,list()]\n", + "# }\n", + "\n", + "# for j in range(4): #identify which field corresponds to reach index\n", + "# if isnotNaN(row.loc[f'filters[{j}][field]']):\n", + "# temp[j][0] = row.loc[f'filters[{j}][field]'][0]\n", + "# else:\n", + "# break\n", + "\n", + "# for k in range(4): #condense all the values for each of the fields into list of values\n", + "# for l in range(20):\n", + "# if f'filters[{k}][values][{l}]' in row.index and isnotNaN(row.loc[f'filters[{k}][values][{l}]']):\n", + "# 
temp[k][1].append(row.loc[f'filters[{k}][values][{l}]'][0]) \n", + "# else:\n", + "# break\n", + "\n", + "# for z in range(4): #add the new column:values to dataframe\n", + "# field = temp[z][0]\n", + "# values = temp[z][1]\n", + "# if field is None:\n", + "# break\n", + "# else:\n", + "# x2.loc[i,field] = str(values)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# optimized version of the code above\n", + "df_processed = df.copy(deep=True)\n", + "df_processed['query_params'] = df_processed['query_params'].apply(process_query_params) # use only 1 for loop\n", + "df_processed = pd.DataFrame(df_processed['query_params'].values.tolist())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Why are there so many additional columns?" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "indust\n", + "[] 4\n", + "Name: count, dtype: int64\n", + "ind\n", + "[] 22\n", + "Name: count, dtype: int64\n", + "i\n", + "[] 3\n", + "Name: count, dtype: int64\n", + "cou\n", + "[] 3\n", + "Name: count, dtype: int64\n", + "sch\n", + "[] 1\n", + "Name: count, dtype: int64\n", + "o\n", + "[] 1\n", + "Name: count, dtype: int64\n", + "course_of\n", + "[] 1\n", + "Name: count, dtype: int64\n", + "wave_id\n", + "[n_2_n] 8\n", + "[n_1_n] 7\n", + "[n_3_n] 5\n", + "[n_0_n] 2\n", + "[n_0_n, n_2_n] 1\n", + "[n_3_n, n_1_n] 1\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "# ['i', 'ind', 'cou', 'indust', 'o', 'sch', 'course_of']\n", + "print(df_processed['indust'].value_counts())\n", + "print(df_processed['ind'].value_counts())\n", + "print(df_processed['i'].value_counts())\n", + "print(df_processed['cou'].value_counts())\n", + "print(df_processed['sch'].value_counts())\n", + "print(df_processed['o'].value_counts())\n", + "print(df_processed['course_of'].value_counts())\n", + 
"print(df_processed['wave_id'].value_counts())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We see that the additional columns all contain empty list, so we can safely drop them. As for wave_id, we will leave it in for now." + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 113589 entries, 0 to 113588\n", + "Data columns (total 7 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 search_query 20959 non-null object\n", + " 1 industries 29829 non-null object\n", + " 2 course_of_study 7103 non-null object\n", + " 3 organisation 13074 non-null object\n", + " 4 school 3657 non-null object\n", + " 5 course 1 non-null object\n", + " 6 wave_id 24 non-null object\n", + "dtypes: object(7)\n", + "memory usage: 6.1+ MB\n" + ] + } + ], + "source": [ + "# drop unused columns ['i', 'ind', 'cou', 'indust', 'o', 'sch', 'course_of']\n", + "df_processed = df_processed.drop(['i', 'ind', 'cou', 'indust', 'o', 'sch', 'course_of'], axis=1)\n", + "df_processed.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Export to CSV" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "57f88523", + "metadata": {}, + "outputs": [], + "source": [ + "df_processed.to_csv('data-preprocessed.csv', index=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}