diff --git a/02ReformattingUmami/parsing urls.ipynb b/02ReformattingUmami/parsing urls.ipynb
deleted file mode 100644
index fc8ae17..0000000
--- a/02ReformattingUmami/parsing urls.ipynb
+++ /dev/null
@@ -1,2140 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 11,
- "id": "4ba88477",
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "import numpy as np\n",
- "from urllib.parse import urlparse, parse_qs, unquote"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "id": "c20ad040",
- "metadata": {},
- "outputs": [],
- "source": [
- "original_df = pd.read_csv('v1_pageview')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "id": "e731453c",
- "metadata": {},
- "outputs": [],
- "source": [
- "def extract_query_params(url):\n",
- " url = unquote(url) #make it human readable, not percentages\n",
- " query_params = parse_qs(urlparse(url).query)\n",
- " return query_params\n",
- "\n",
- "new_df = original_df.copy() \n",
- "new_df['url'] = new_df['url'].astype(str)\n",
- "new_df['query_params'] = new_df['url'].apply(extract_query_params) #gets all the query params and makes it a dictionary \n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "id": "c7471d8e",
- "metadata": {},
- "outputs": [],
- "source": [
- "x = pd.concat([new_df, new_df['query_params'].apply(lambda x: pd.Series(x, dtype=\"object\"))], axis=1) #separate each of the key:value pairs into it's own column "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 49,
- "id": "5604d86c",
- "metadata": {},
- "outputs": [],
- "source": [
- "x2 = x.assign(industries=lambda x: np.nan, \n",
- " school=lambda x: np.nan,\n",
- " course_of_study=lambda x: np.nan,\n",
- " organisation=lambda x: np.nan) #create copy with the columns that i want"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 53,
- "id": "33cd1847",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "1019300\n",
- "198379000\n",
- "400201500\n",
- "588100000\n",
- "771322300\n",
- "967610800\n",
- "1166690100\n",
- "1361307400\n",
- "1567462400\n",
- "1760879700\n",
- "1972686300\n",
- "2173218500\n",
- "2374591800\n",
- "2586973200\n",
- "2845110800\n",
- "3069079200\n",
- "3274099700\n",
- "3490637300\n",
- "3738567700\n",
- "3945910200\n",
- "4154997800\n",
- "4394562500\n",
- "4606142900\n",
- "4810549700\n",
- "5002853500\n",
- "5205447300\n",
- "5402053600\n",
- "5595158200\n",
- "5795015500\n",
- "6013136400\n",
- "6219393100\n",
- "6423695500\n",
- "6616866700\n",
- "6835747800\n",
- "7024130500\n",
- "7217490600\n",
- "7436969500\n",
- "7646088200\n",
- "7847100200\n",
- "8058185700\n",
- "8270893600\n",
- "8482217300\n",
- "8679724000\n",
- "8879031800\n",
- "9058311200\n",
- "9247930700\n",
- "9451453100\n",
- "9668764300\n",
- "9886907500\n",
- "10091585800\n",
- "10282182200\n",
- "10469907500\n",
- "10689785100\n",
- "10905295500\n",
- "11129143200\n",
- "11329276500\n",
- "11520791000\n",
- "11730772100\n",
- "11920881500\n",
- "12137022700\n",
- "12350234400\n",
- "12557910000\n",
- "12759372500\n",
- "12976088100\n",
- "13189948600\n",
- "13401493300\n",
- "13584390200\n",
- "13768548200\n",
- "13947810100\n",
- "14144238300\n",
- "14319362600\n",
- "14499509900\n",
- "14671370800\n",
- "14847169000\n",
- "15009801100\n",
- "15171386200\n",
- "15343520400\n",
- "15518651900\n",
- "15691240500\n",
- "15858359500\n",
- "16034491600\n",
- "16212696400\n",
- "16380638300\n",
- "16547752500\n",
- "16713832500\n",
- "16887430900\n",
- "17054953600\n",
- "17230079900\n",
- "17404971300\n",
- "17576107500\n",
- "17765259700\n",
- "17946401500\n",
- "18122570500\n",
- "18298027500\n",
- "18490654000\n",
- "18676347300\n",
- "18858481400\n",
- "19062635600\n",
- "19240343800\n",
- "19441518900\n",
- "19632663700\n",
- "19829562000\n",
- "20001634200\n",
- "20149930500\n",
- "20318680000\n",
- "20503769000\n",
- "20667341800\n",
- "20853630300\n",
- "21018679200\n",
- "21212859100\n",
- "21388865100\n",
- "21575672800\n",
- "21758869500\n",
- "21933173500\n"
- ]
- }
- ],
- "source": [
- "def isnotNaN(num):\n",
- " return num == num\n",
- "\n",
- "columns = ['view_id', 'website_id', 'session_id', 'created_at', 'url', 'referrer', 'query_params',\n",
- " 'q', 'size', 'current', 'sort-field', 'sort-direction', '_sm_au_', 'v', 'fbclid', 'trk',\n",
- " 'amp;amp;size', 'industries', 'school', 'course_of_study', 'organisation'] #columns that i eventually want\n",
- "\n",
- "\n",
- "for i in range(len(x2)): #go through original DF\n",
- "\n",
- " row = x2.iloc[i] # for each row,\n",
- "\n",
- " temp ={0:[None,list()], #theres 4 possible fields [field, list of values]\n",
- " 1:[None,list()],\n",
- " 2:[None,list()],\n",
- " 3:[None,list()]\n",
- " }\n",
- "\n",
- " for j in range(4): #identify which field corresponds to reach index\n",
- " if isnotNaN(row.loc[f'filters[{j}][field]']):\n",
- " temp[j][0] = row.loc[f'filters[{j}][field]'][0]\n",
- " else:\n",
- " break\n",
- "\n",
- " for k in range(4): #condense all the values for each of the fields into list of values\n",
- " for l in range(20):\n",
- " if f'filters[{k}][values][{l}]' in row.index and isnotNaN(row.loc[f'filters[{k}][values][{l}]']):\n",
- " temp[k][1].append(row.loc[f'filters[{k}][values][{l}]'][0]) \n",
- " else:\n",
- " break\n",
- "\n",
- " for z in range(4): #add the new column:values to dataframe\n",
- " field = temp[z][0]\n",
- " values = temp[z][1]\n",
- " if field is None:\n",
- " break\n",
- " else:\n",
- " x2.loc[i,field] = str(values)\n",
- " "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 61,
- "id": "57f88523",
- "metadata": {},
- "outputs": [],
- "source": [
- "x2.to_csv('data-preprocessed.csv', index=False)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "473214f9",
- "metadata": {},
- "source": [
- "testing\n",
- "\n",
- "
"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 195,
- "id": "b8ab56af",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " industries | \n",
- " school | \n",
- " course_of_study | \n",
- " organisation | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 7480 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 7481 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 7482 | \n",
- " [Banking and Finance] | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 7483 | \n",
- " [Banking and Finance] | \n",
- " [National University of Singapore] | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 7484 | \n",
- " [Banking and Finance] | \n",
- " [National University of Singapore, Singapore M... | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 7485 | \n",
- " [Banking and Finance] | \n",
- " [National University of Singapore, Singapore M... | \n",
- " [Business Administration] | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 7486 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 7487 | \n",
- " [Banking and Finance] | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 7488 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 7489 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " industries \\\n",
- "7480 NaN \n",
- "7481 NaN \n",
- "7482 [Banking and Finance] \n",
- "7483 [Banking and Finance] \n",
- "7484 [Banking and Finance] \n",
- "7485 [Banking and Finance] \n",
- "7486 NaN \n",
- "7487 [Banking and Finance] \n",
- "7488 NaN \n",
- "7489 NaN \n",
- "\n",
- " school \\\n",
- "7480 NaN \n",
- "7481 NaN \n",
- "7482 NaN \n",
- "7483 [National University of Singapore] \n",
- "7484 [National University of Singapore, Singapore M... \n",
- "7485 [National University of Singapore, Singapore M... \n",
- "7486 NaN \n",
- "7487 NaN \n",
- "7488 NaN \n",
- "7489 NaN \n",
- "\n",
- " course_of_study organisation \n",
- "7480 NaN NaN \n",
- "7481 NaN NaN \n",
- "7482 NaN NaN \n",
- "7483 NaN NaN \n",
- "7484 NaN NaN \n",
- "7485 [Business Administration] NaN \n",
- "7486 NaN NaN \n",
- "7487 NaN NaN \n",
- "7488 NaN NaN \n",
- "7489 NaN NaN "
- ]
- },
- "execution_count": 195,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "a.iloc[7480:7490,17:21]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 60,
- "id": "340baaf6",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " view_id | \n",
- " website_id | \n",
- " session_id | \n",
- " created_at | \n",
- " url | \n",
- " referrer | \n",
- " query_params | \n",
- " q | \n",
- " size | \n",
- " filters[0][field] | \n",
- " ... | \n",
- " fbclid | \n",
- " filters[3][field] | \n",
- " trk | \n",
- " filters[3][values][0] | \n",
- " filters[3][values][1] | \n",
- " amp;amp;size | \n",
- " industries | \n",
- " school | \n",
- " course_of_study | \n",
- " organisation | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 7481 | \n",
- " 7684 | \n",
- " 2 | \n",
- " 1392 | \n",
- " 2022-05-29 06:50:08.711+00 | \n",
- " /?size=n_20_n | \n",
- " / | \n",
- " {'size': ['n_20_n']} | \n",
- " NaN | \n",
- " [n_20_n] | \n",
- " NaN | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 7482 | \n",
- " 7685 | \n",
- " 2 | \n",
- " 1392 | \n",
- " 2022-05-29 06:50:12.671+00 | \n",
- " /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... | \n",
- " /?size=n_20_n | \n",
- " {'size': ['n_20_n'], 'filters[0][field]': ['in... | \n",
- " NaN | \n",
- " [n_20_n] | \n",
- " [industries] | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " ['Banking and Finance'] | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 7483 | \n",
- " 7686 | \n",
- " 2 | \n",
- " 1392 | \n",
- " 2022-05-29 06:50:18.17+00 | \n",
- " /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... | \n",
- " /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... | \n",
- " {'size': ['n_20_n'], 'filters[0][field]': ['in... | \n",
- " NaN | \n",
- " [n_20_n] | \n",
- " [industries] | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " ['Banking and Finance'] | \n",
- " ['National University of Singapore'] | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 7484 | \n",
- " 7687 | \n",
- " 2 | \n",
- " 1392 | \n",
- " 2022-05-29 06:50:20.512+00 | \n",
- " /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... | \n",
- " /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... | \n",
- " {'size': ['n_20_n'], 'filters[0][field]': ['in... | \n",
- " NaN | \n",
- " [n_20_n] | \n",
- " [industries] | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " ['Banking and Finance'] | \n",
- " ['National University of Singapore', 'Singapor... | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 7485 | \n",
- " 7688 | \n",
- " 2 | \n",
- " 1392 | \n",
- " 2022-05-29 06:50:26.338+00 | \n",
- " /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... | \n",
- " /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... | \n",
- " {'size': ['n_20_n'], 'filters[0][field]': ['in... | \n",
- " NaN | \n",
- " [n_20_n] | \n",
- " [industries] | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " ['Banking and Finance'] | \n",
- " ['National University of Singapore', 'Singapor... | \n",
- " ['Business Administration'] | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 113584 | \n",
- " 113814 | \n",
- " 1 | \n",
- " 16139 | \n",
- " 2023-04-24 14:18:54.34+00 | \n",
- " / | \n",
- " https://static.elfsight.com/ | \n",
- " {} | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 113585 | \n",
- " 113815 | \n",
- " 1 | \n",
- " 16139 | \n",
- " 2023-04-24 14:51:55.512+00 | \n",
- " /events/ | \n",
- " https://beta.advisory.sg/ | \n",
- " {} | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 113586 | \n",
- " 113816 | \n",
- " 1 | \n",
- " 16139 | \n",
- " 2023-04-24 14:52:04.993+00 | \n",
- " /press-releases/ | \n",
- " NaN | \n",
- " {} | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 113587 | \n",
- " 113817 | \n",
- " 1 | \n",
- " 16140 | \n",
- " 2023-04-24 16:38:43.474+00 | \n",
- " /2017/10/05/conversations-with-tee-chee-yen/ | \n",
- " http://localhost:2368/ | \n",
- " {} | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 113588 | \n",
- " 113818 | \n",
- " 1 | \n",
- " 16140 | \n",
- " 2023-04-24 16:40:18.422+00 | \n",
- " /2017/07/30/conversations-with-marvin-kang/ | \n",
- " http://localhost:2368/tag/social-service/ | \n",
- " {} | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- "
\n",
- "
106108 rows × 51 columns
\n",
- "
"
- ],
- "text/plain": [
- " view_id website_id session_id created_at \\\n",
- "7481 7684 2 1392 2022-05-29 06:50:08.711+00 \n",
- "7482 7685 2 1392 2022-05-29 06:50:12.671+00 \n",
- "7483 7686 2 1392 2022-05-29 06:50:18.17+00 \n",
- "7484 7687 2 1392 2022-05-29 06:50:20.512+00 \n",
- "7485 7688 2 1392 2022-05-29 06:50:26.338+00 \n",
- "... ... ... ... ... \n",
- "113584 113814 1 16139 2023-04-24 14:18:54.34+00 \n",
- "113585 113815 1 16139 2023-04-24 14:51:55.512+00 \n",
- "113586 113816 1 16139 2023-04-24 14:52:04.993+00 \n",
- "113587 113817 1 16140 2023-04-24 16:38:43.474+00 \n",
- "113588 113818 1 16140 2023-04-24 16:40:18.422+00 \n",
- "\n",
- " url \\\n",
- "7481 /?size=n_20_n \n",
- "7482 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n",
- "7483 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n",
- "7484 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n",
- "7485 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n",
- "... ... \n",
- "113584 / \n",
- "113585 /events/ \n",
- "113586 /press-releases/ \n",
- "113587 /2017/10/05/conversations-with-tee-chee-yen/ \n",
- "113588 /2017/07/30/conversations-with-marvin-kang/ \n",
- "\n",
- " referrer \\\n",
- "7481 / \n",
- "7482 /?size=n_20_n \n",
- "7483 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n",
- "7484 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n",
- "7485 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n",
- "... ... \n",
- "113584 https://static.elfsight.com/ \n",
- "113585 https://beta.advisory.sg/ \n",
- "113586 NaN \n",
- "113587 http://localhost:2368/ \n",
- "113588 http://localhost:2368/tag/social-service/ \n",
- "\n",
- " query_params q size \\\n",
- "7481 {'size': ['n_20_n']} NaN [n_20_n] \n",
- "7482 {'size': ['n_20_n'], 'filters[0][field]': ['in... NaN [n_20_n] \n",
- "7483 {'size': ['n_20_n'], 'filters[0][field]': ['in... NaN [n_20_n] \n",
- "7484 {'size': ['n_20_n'], 'filters[0][field]': ['in... NaN [n_20_n] \n",
- "7485 {'size': ['n_20_n'], 'filters[0][field]': ['in... NaN [n_20_n] \n",
- "... ... ... ... \n",
- "113584 {} NaN NaN \n",
- "113585 {} NaN NaN \n",
- "113586 {} NaN NaN \n",
- "113587 {} NaN NaN \n",
- "113588 {} NaN NaN \n",
- "\n",
- " filters[0][field] ... fbclid filters[3][field] trk \\\n",
- "7481 NaN ... NaN NaN NaN \n",
- "7482 [industries] ... NaN NaN NaN \n",
- "7483 [industries] ... NaN NaN NaN \n",
- "7484 [industries] ... NaN NaN NaN \n",
- "7485 [industries] ... NaN NaN NaN \n",
- "... ... ... ... ... ... \n",
- "113584 NaN ... NaN NaN NaN \n",
- "113585 NaN ... NaN NaN NaN \n",
- "113586 NaN ... NaN NaN NaN \n",
- "113587 NaN ... NaN NaN NaN \n",
- "113588 NaN ... NaN NaN NaN \n",
- "\n",
- " filters[3][values][0] filters[3][values][1] amp;amp;size \\\n",
- "7481 NaN NaN NaN \n",
- "7482 NaN NaN NaN \n",
- "7483 NaN NaN NaN \n",
- "7484 NaN NaN NaN \n",
- "7485 NaN NaN NaN \n",
- "... ... ... ... \n",
- "113584 NaN NaN NaN \n",
- "113585 NaN NaN NaN \n",
- "113586 NaN NaN NaN \n",
- "113587 NaN NaN NaN \n",
- "113588 NaN NaN NaN \n",
- "\n",
- " industries \\\n",
- "7481 NaN \n",
- "7482 ['Banking and Finance'] \n",
- "7483 ['Banking and Finance'] \n",
- "7484 ['Banking and Finance'] \n",
- "7485 ['Banking and Finance'] \n",
- "... ... \n",
- "113584 NaN \n",
- "113585 NaN \n",
- "113586 NaN \n",
- "113587 NaN \n",
- "113588 NaN \n",
- "\n",
- " school \\\n",
- "7481 NaN \n",
- "7482 NaN \n",
- "7483 ['National University of Singapore'] \n",
- "7484 ['National University of Singapore', 'Singapor... \n",
- "7485 ['National University of Singapore', 'Singapor... \n",
- "... ... \n",
- "113584 NaN \n",
- "113585 NaN \n",
- "113586 NaN \n",
- "113587 NaN \n",
- "113588 NaN \n",
- "\n",
- " course_of_study organisation \n",
- "7481 NaN NaN \n",
- "7482 NaN NaN \n",
- "7483 NaN NaN \n",
- "7484 NaN NaN \n",
- "7485 ['Business Administration'] NaN \n",
- "... ... ... \n",
- "113584 NaN NaN \n",
- "113585 NaN NaN \n",
- "113586 NaN NaN \n",
- "113587 NaN NaN \n",
- "113588 NaN NaN \n",
- "\n",
- "[106108 rows x 51 columns]"
- ]
- },
- "execution_count": 60,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "x2.iloc[7481:,:-9]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 157,
- "id": "794a6b93",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " view_id | \n",
- " website_id | \n",
- " session_id | \n",
- " created_at | \n",
- " url | \n",
- " referrer | \n",
- " query_params | \n",
- " q | \n",
- " size | \n",
- " current | \n",
- " ... | \n",
- " sort-direction | \n",
- " _sm_au_ | \n",
- " v | \n",
- " fbclid | \n",
- " trk | \n",
- " amp;amp;size | \n",
- " industries | \n",
- " school | \n",
- " course_of_study | \n",
- " organisation | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- "
\n",
- "
0 rows × 21 columns
\n",
- "
"
- ],
- "text/plain": [
- "Empty DataFrame\n",
- "Columns: [view_id, website_id, session_id, created_at, url, referrer, query_params, q, size, current, sort-field, sort-direction, _sm_au_, v, fbclid, trk, amp;amp;size, industries, school, course_of_study, organisation]\n",
- "Index: []\n",
- "\n",
- "[0 rows x 21 columns]"
- ]
- },
- "execution_count": 157,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "columns = ['view_id', 'website_id', 'session_id', 'created_at', 'url', 'referrer', 'query_params', 'q', 'size', 'current', 'sort-field', 'sort-direction', '_sm_au_', 'v', 'fbclid', 'trk', 'amp;amp;size', 'industries', 'school', 'course_of_study', 'organisation']\n",
- "y = pd.DataFrame(columns=columns)\n",
- "y"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "cc3ea68f",
- "metadata": {},
- "outputs": [],
- "source": [
- "18888237600\n",
- "40737104200\n",
- "72785283300\n",
- "108758322400\n",
- "145217173600\n",
- "186793474100\n",
- "225162609200\n",
- "264882508000\n",
- "314453042500\n",
- "365454468800\n",
- "408983182300\n",
- "455546595500\n",
- "504377514500\n",
- "549119827600\n",
- "591181220700\n",
- "635682159900\n",
- "682860726900\n",
- "732106724800\n",
- "782814304400\n",
- "835621612600\n",
- "891255040500\n",
- "943683757500\n",
- "983562692700\n",
- "1023982651900\n",
- "1065614924700\n",
- "1108148358200\n",
- "1151829557700\n",
- "1196612275400\n",
- "1242725911700\n",
- "1291049334900\n",
- "1341243997100\n",
- "1392687859900\n",
- "1445250854800\n",
- "1498960928900\n",
- "1553929939500\n",
- "1611468594200\n",
- "1671760449600\n",
- "1734306435500\n",
- "1795385092900\n",
- "1857648008700\n",
- "1921126209100\n",
- "1989747478400\n",
- "2056883359300\n",
- "2124365102700\n",
- "2192953255500\n",
- "2262542653300\n",
- "2333537347900\n",
- "2407247346000\n",
- "2481174709300\n",
- "2556079180300\n",
- "2632721770500\n",
- "2711853018800\n",
- "2797623695000\n",
- "2885214009200\n",
- "2969392615200\n",
- "3052316078600\n",
- "3135706844500\n",
- "3220985459900\n",
- "3308010920400\n",
- "3394591298400\n",
- "3481977400200\n",
- "3570721798700\n",
- "3660806746900\n",
- "3751976516100\n",
- "3844537433200\n",
- "3938891733100\n",
- "4034454066400\n",
- "4137224107900\n",
- "4239237949900\n",
- "4341107871900\n",
- "4445053302400\n",
- "4550355588000\n",
- "4656088215800\n",
- "4760841571100\n",
- "4866702552200\n",
- "4973539051500\n",
- "5082814611800\n",
- "5192301727000\n",
- "5303393414800\n",
- "5415569427500\n",
- "5529336622100\n",
- "5644090487900\n",
- "7905513294400\n",
- "8022122905300\n",
- "8142019076400\n",
- "8262193145400\n",
- "8413007131000\n",
- "8569099970100\n",
- "8699897960500\n",
- "8861730423300\n",
- "9046117615700\n",
- "9196323885200\n",
- "9364978097600\n",
- "9523016066800\n",
- "9688182973700\n",
- "9856712632500\n",
- "10026784638300\n",
- "10197057256100\n",
- "10319851429200\n",
- "10375110798500\n",
- "10430841262800\n",
- "10487271861600\n",
- "10544539678800\n",
- "10602246570200\n",
- "10659615402100\n",
- "10718192663600\n",
- "10777064820100\n",
- "10836482106000\n",
- "10896603119500\n",
- "10957077660600\n",
- "11020635868700"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 160,
- "id": "ded3c20e",
- "metadata": {},
- "outputs": [],
- "source": [
- "df = pd.concat([pd.DataFrame(row, columns=columns), y], ignore_index=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 161,
- "id": "36d21011",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " view_id | \n",
- " website_id | \n",
- " session_id | \n",
- " created_at | \n",
- " url | \n",
- " referrer | \n",
- " query_params | \n",
- " q | \n",
- " size | \n",
- " current | \n",
- " ... | \n",
- " sort-direction | \n",
- " _sm_au_ | \n",
- " v | \n",
- " fbclid | \n",
- " trk | \n",
- " amp;amp;size | \n",
- " industries | \n",
- " school | \n",
- " course_of_study | \n",
- " organisation | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- "
\n",
- "
0 rows × 21 columns
\n",
- "
"
- ],
- "text/plain": [
- "Empty DataFrame\n",
- "Columns: [view_id, website_id, session_id, created_at, url, referrer, query_params, q, size, current, sort-field, sort-direction, _sm_au_, v, fbclid, trk, amp;amp;size, industries, school, course_of_study, organisation]\n",
- "Index: []\n",
- "\n",
- "[0 rows x 21 columns]"
- ]
- },
- "execution_count": 161,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 140,
- "id": "17f73e73",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " view_id | \n",
- " website_id | \n",
- " session_id | \n",
- " created_at | \n",
- " url | \n",
- " referrer | \n",
- " query_params | \n",
- " q | \n",
- " size | \n",
- " filters[0][field] | \n",
- " ... | \n",
- " filters[0][values][8] | \n",
- " filters[0][values][9] | \n",
- " _sm_au_ | \n",
- " v | \n",
- " fbclid | \n",
- " filters[3][field] | \n",
- " trk | \n",
- " filters[3][values][0] | \n",
- " filters[3][values][1] | \n",
- " amp;amp;size | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 233 | \n",
- " 2 | \n",
- " 93 | \n",
- " 2022-05-24 01:43:25.814+00 | \n",
- " /?q=chemical&size=n_20_n | \n",
- " NaN | \n",
- " {'q': ['chemical'], 'size': ['n_20_n']} | \n",
- " [chemical] | \n",
- " [n_20_n] | \n",
- " NaN | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 234 | \n",
- " 2 | \n",
- " 94 | \n",
- " 2022-05-24 01:45:16.891+00 | \n",
- " / | \n",
- " NaN | \n",
- " {} | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 235 | \n",
- " 2 | \n",
- " 94 | \n",
- " 2022-05-24 01:45:17.571+00 | \n",
- " /?size=n_20_n | \n",
- " / | \n",
- " {'size': ['n_20_n']} | \n",
- " NaN | \n",
- " [n_20_n] | \n",
- " NaN | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 236 | \n",
- " 2 | \n",
- " 94 | \n",
- " 2022-05-24 01:46:04.09+00 | \n",
- " /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... | \n",
- " /?size=n_20_n | \n",
- " {'size': ['n_20_n'], 'filters[0][field]': ['in... | \n",
- " NaN | \n",
- " [n_20_n] | \n",
- " [industries] | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 237 | \n",
- " 2 | \n",
- " 94 | \n",
- " 2022-05-24 01:46:10.66+00 | \n",
- " /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... | \n",
- " /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... | \n",
- " {'size': ['n_20_n'], 'filters[0][field]': ['in... | \n",
- " NaN | \n",
- " [n_20_n] | \n",
- " [industries] | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 113584 | \n",
- " 113814 | \n",
- " 1 | \n",
- " 16139 | \n",
- " 2023-04-24 14:18:54.34+00 | \n",
- " / | \n",
- " https://static.elfsight.com/ | \n",
- " {} | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 113585 | \n",
- " 113815 | \n",
- " 1 | \n",
- " 16139 | \n",
- " 2023-04-24 14:51:55.512+00 | \n",
- " /events/ | \n",
- " https://beta.advisory.sg/ | \n",
- " {} | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 113586 | \n",
- " 113816 | \n",
- " 1 | \n",
- " 16139 | \n",
- " 2023-04-24 14:52:04.993+00 | \n",
- " /press-releases/ | \n",
- " NaN | \n",
- " {} | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 113587 | \n",
- " 113817 | \n",
- " 1 | \n",
- " 16140 | \n",
- " 2023-04-24 16:38:43.474+00 | \n",
- " /2017/10/05/conversations-with-tee-chee-yen/ | \n",
- " http://localhost:2368/ | \n",
- " {} | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 113588 | \n",
- " 113818 | \n",
- " 1 | \n",
- " 16140 | \n",
- " 2023-04-24 16:40:18.422+00 | \n",
- " /2017/07/30/conversations-with-marvin-kang/ | \n",
- " http://localhost:2368/tag/social-service/ | \n",
- " {} | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- "
\n",
- "
113589 rows × 47 columns
\n",
- "
"
- ],
- "text/plain": [
- " view_id website_id session_id created_at \\\n",
- "0 233 2 93 2022-05-24 01:43:25.814+00 \n",
- "1 234 2 94 2022-05-24 01:45:16.891+00 \n",
- "2 235 2 94 2022-05-24 01:45:17.571+00 \n",
- "3 236 2 94 2022-05-24 01:46:04.09+00 \n",
- "4 237 2 94 2022-05-24 01:46:10.66+00 \n",
- "... ... ... ... ... \n",
- "113584 113814 1 16139 2023-04-24 14:18:54.34+00 \n",
- "113585 113815 1 16139 2023-04-24 14:51:55.512+00 \n",
- "113586 113816 1 16139 2023-04-24 14:52:04.993+00 \n",
- "113587 113817 1 16140 2023-04-24 16:38:43.474+00 \n",
- "113588 113818 1 16140 2023-04-24 16:40:18.422+00 \n",
- "\n",
- " url \\\n",
- "0 /?q=chemical&size=n_20_n \n",
- "1 / \n",
- "2 /?size=n_20_n \n",
- "3 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n",
- "4 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n",
- "... ... \n",
- "113584 / \n",
- "113585 /events/ \n",
- "113586 /press-releases/ \n",
- "113587 /2017/10/05/conversations-with-tee-chee-yen/ \n",
- "113588 /2017/07/30/conversations-with-marvin-kang/ \n",
- "\n",
- " referrer \\\n",
- "0 NaN \n",
- "1 NaN \n",
- "2 / \n",
- "3 /?size=n_20_n \n",
- "4 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n",
- "... ... \n",
- "113584 https://static.elfsight.com/ \n",
- "113585 https://beta.advisory.sg/ \n",
- "113586 NaN \n",
- "113587 http://localhost:2368/ \n",
- "113588 http://localhost:2368/tag/social-service/ \n",
- "\n",
- " query_params q \\\n",
- "0 {'q': ['chemical'], 'size': ['n_20_n']} [chemical] \n",
- "1 {} NaN \n",
- "2 {'size': ['n_20_n']} NaN \n",
- "3 {'size': ['n_20_n'], 'filters[0][field]': ['in... NaN \n",
- "4 {'size': ['n_20_n'], 'filters[0][field]': ['in... NaN \n",
- "... ... ... \n",
- "113584 {} NaN \n",
- "113585 {} NaN \n",
- "113586 {} NaN \n",
- "113587 {} NaN \n",
- "113588 {} NaN \n",
- "\n",
- " size filters[0][field] ... filters[0][values][8] \\\n",
- "0 [n_20_n] NaN ... NaN \n",
- "1 NaN NaN ... NaN \n",
- "2 [n_20_n] NaN ... NaN \n",
- "3 [n_20_n] [industries] ... NaN \n",
- "4 [n_20_n] [industries] ... NaN \n",
- "... ... ... ... ... \n",
- "113584 NaN NaN ... NaN \n",
- "113585 NaN NaN ... NaN \n",
- "113586 NaN NaN ... NaN \n",
- "113587 NaN NaN ... NaN \n",
- "113588 NaN NaN ... NaN \n",
- "\n",
- " filters[0][values][9] _sm_au_ v fbclid filters[3][field] trk \\\n",
- "0 NaN NaN NaN NaN NaN NaN \n",
- "1 NaN NaN NaN NaN NaN NaN \n",
- "2 NaN NaN NaN NaN NaN NaN \n",
- "3 NaN NaN NaN NaN NaN NaN \n",
- "4 NaN NaN NaN NaN NaN NaN \n",
- "... ... ... ... ... ... ... \n",
- "113584 NaN NaN NaN NaN NaN NaN \n",
- "113585 NaN NaN NaN NaN NaN NaN \n",
- "113586 NaN NaN NaN NaN NaN NaN \n",
- "113587 NaN NaN NaN NaN NaN NaN \n",
- "113588 NaN NaN NaN NaN NaN NaN \n",
- "\n",
- " filters[3][values][0] filters[3][values][1] amp;amp;size \n",
- "0 NaN NaN NaN \n",
- "1 NaN NaN NaN \n",
- "2 NaN NaN NaN \n",
- "3 NaN NaN NaN \n",
- "4 NaN NaN NaN \n",
- "... ... ... ... \n",
- "113584 NaN NaN NaN \n",
- "113585 NaN NaN NaN \n",
- "113586 NaN NaN NaN \n",
- "113587 NaN NaN NaN \n",
- "113588 NaN NaN NaN \n",
- "\n",
- "[113589 rows x 47 columns]"
- ]
- },
- "execution_count": 140,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "x"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 147,
- "id": "a5c9804c",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "view_id 7688\n",
- "website_id 2\n",
- "session_id 1392\n",
- "created_at 2022-05-29 06:50:26.338+00\n",
- "url /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...\n",
- "referrer /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...\n",
- "query_params {'size': ['n_20_n'], 'filters[0][field]': ['in...\n",
- "q NaN\n",
- "size [n_20_n]\n",
- "filters[0][field] [industries]\n",
- "filters[0][values][0] [Banking and Finance]\n",
- "filters[0][type] [all]\n",
- "filters[0][values][1] NaN\n",
- "filters[0][values][2] NaN\n",
- "current NaN\n",
- "sort-field NaN\n",
- "sort-direction NaN\n",
- "filters[1][field] [school]\n",
- "filters[1][values][0] [National University of Singapore]\n",
- "filters[1][type] [any]\n",
- "filters[0][values][3] NaN\n",
- "filters[0][values][4] NaN\n",
- "filters[1][values][1] [Singapore Management University]\n",
- "filters[1][values][2] NaN\n",
- "filters[1][values][3] NaN\n",
- "filters[1][values][4] NaN\n",
- "filters[1][values][5] NaN\n",
- "filters[2][field] [course_of_study]\n",
- "filters[2][values][0] [Business Administration]\n",
- "filters[2][values][1] NaN\n",
- "filters[2][values][2] NaN\n",
- "filters[2][values][3] NaN\n",
- "filters[1][values][6] NaN\n",
- "filters[2][type] [any]\n",
- "filters[0][values][5] NaN\n",
- "filters[0][values][6] NaN\n",
- "filters[0][values][7] NaN\n",
- "filters[0][values][8] NaN\n",
- "filters[0][values][9] NaN\n",
- "_sm_au_ NaN\n",
- "v NaN\n",
- "fbclid NaN\n",
- "filters[3][field] NaN\n",
- "trk NaN\n",
- "filters[3][values][0] NaN\n",
- "filters[3][values][1] NaN\n",
- "amp;amp;size NaN\n",
- "Name: 7485, dtype: object"
- ]
- },
- "execution_count": 147,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "row = x.iloc[7485]\n",
- "row"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 148,
- "id": "2335961c",
- "metadata": {},
- "outputs": [],
- "source": [
- "def isnotNaN(num):\n",
- " return num == num\n",
- "\n",
- "temp ={0:[None,list()],\n",
- " 1:[None,list()],\n",
- " 2:[None,list()],\n",
- " 3:[None,list()]\n",
- " }\n",
- "\n",
- "for i in range(4):\n",
- " if isnotNaN(row.loc[f'filters[{i}][field]']):\n",
- " temp[i][0] = row.loc[f'filters[{i}][field]'][0]\n",
- " \n",
- "for i in range(4):\n",
- " for j in range(20):\n",
- " if f'filters[{i}][values][{j}]' in row.index and isnotNaN(row.loc[f'filters[{i}][values][{j}]']):\n",
- " temp[i][1].append(row.loc[f'filters[{i}][values][{j}]'][0]) \n",
- " \n",
- "for i in range(4):\n",
- " for j in range(20):\n",
- " if f'filters[{i}][values][{j}]' in row.index:\n",
- " row = row.drop(labels=f'filters[{i}][values][{j}]')\n",
- " \n",
- "for i in range(4):\n",
- " if isnotNaN(row.loc[f'filters[{i}][field]']):\n",
- " row = row.drop(labels=[f'filters[{i}][field]', f'filters[{i}][type]'])\n",
- " \n",
- "for i in range(4):\n",
- " field = temp[i][0]\n",
- " values = temp[i][1]\n",
- " row[field] = values"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 116,
- "id": "e393f951",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": 114,
- "id": "9c3dba3e",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{0: ['industries', ['Banking and Finance']],\n",
- " 1: ['school',\n",
- " ['National University of Singapore', 'Singapore Management University']],\n",
- " 2: ['course_of_study', ['Business Administration']],\n",
- " 3: [None, []]}"
- ]
- },
- "execution_count": 114,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "temp"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 117,
- "id": "b57d4ca9",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "view_id 7688\n",
- "website_id 2\n",
- "session_id 1392\n",
- "created_at 2022-05-29 06:50:26.338+00\n",
- "url /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...\n",
- "referrer /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...\n",
- "query_params {'size': ['n_20_n'], 'filters[0][field]': ['in...\n",
- "q NaN\n",
- "size [n_20_n]\n",
- "current NaN\n",
- "sort-field NaN\n",
- "sort-direction NaN\n",
- "_sm_au_ NaN\n",
- "v NaN\n",
- "fbclid NaN\n",
- "filters[3][field] NaN\n",
- "trk NaN\n",
- "amp;amp;size NaN\n",
- "industries [Banking and Finance]\n",
- "school [National University of Singapore, Singapore M...\n",
- "course_of_study [Business Administration]\n",
- "None []\n",
- "Name: 7485, dtype: object"
- ]
- },
- "execution_count": 117,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "row"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 149,
- "id": "5d91fe23",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index([ 'view_id', 'website_id', 'session_id',\n",
- " 'created_at', 'url', 'referrer',\n",
- " 'query_params', 'q', 'size',\n",
- " 'current', 'sort-field', 'sort-direction',\n",
- " '_sm_au_', 'v', 'fbclid',\n",
- " 'filters[3][field]', 'trk', 'amp;amp;size',\n",
- " 'industries', 'school', 'course_of_study',\n",
- " None],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 149,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "row.index"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 96,
- "id": "d3b752a2",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "pandas.core.series.Series"
- ]
- },
- "execution_count": 96,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "type(row)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 153,
- "id": "043c5d16",
- "metadata": {},
- "outputs": [],
- "source": [
- "l = list(row.index)\n",
- "l.append('organisation')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 155,
- "id": "7787dc1e",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "['view_id', 'website_id', 'session_id', 'created_at', 'url', 'referrer', 'query_params', 'q', 'size', 'current', 'sort-field', 'sort-direction', '_sm_au_', 'v', 'fbclid', 'filters[3][field]', 'trk', 'amp;amp;size', 'industries', 'school', 'course_of_study', None, 'organisation']\n"
- ]
- }
- ],
- "source": [
- "print(l)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 158,
- "id": "b21898f8",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "view_id 7688\n",
- "website_id 2\n",
- "session_id 1392\n",
- "created_at 2022-05-29 06:50:26.338+00\n",
- "url /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...\n",
- "referrer /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...\n",
- "query_params {'size': ['n_20_n'], 'filters[0][field]': ['in...\n",
- "q NaN\n",
- "size [n_20_n]\n",
- "current NaN\n",
- "sort-field NaN\n",
- "sort-direction NaN\n",
- "_sm_au_ NaN\n",
- "v NaN\n",
- "fbclid NaN\n",
- "filters[3][field] NaN\n",
- "trk NaN\n",
- "amp;amp;size NaN\n",
- "industries [Banking and Finance]\n",
- "school [National University of Singapore, Singapore M...\n",
- "course_of_study [Business Administration]\n",
- "None []\n",
- "Name: 7485, dtype: object"
- ]
- },
- "execution_count": 158,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "row"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 166,
- "id": "a3a0f1c7",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " view_id | \n",
- " website_id | \n",
- " session_id | \n",
- " created_at | \n",
- " url | \n",
- " referrer | \n",
- " query_params | \n",
- " q | \n",
- " size | \n",
- " current | \n",
- " ... | \n",
- " sort-direction | \n",
- " _sm_au_ | \n",
- " v | \n",
- " fbclid | \n",
- " trk | \n",
- " amp;amp;size | \n",
- " industries | \n",
- " school | \n",
- " course_of_study | \n",
- " organisation | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- "
\n",
- "
0 rows × 21 columns
\n",
- "
"
- ],
- "text/plain": [
- "Empty DataFrame\n",
- "Columns: [view_id, website_id, session_id, created_at, url, referrer, query_params, q, size, current, sort-field, sort-direction, _sm_au_, v, fbclid, trk, amp;amp;size, industries, school, course_of_study, organisation]\n",
- "Index: []\n",
- "\n",
- "[0 rows x 21 columns]"
- ]
- },
- "execution_count": 166,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "a = pd.DataFrame(columns=columns)\n",
- "a"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 164,
- "id": "bc449824",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " view_id | \n",
- " website_id | \n",
- " session_id | \n",
- " created_at | \n",
- " url | \n",
- " referrer | \n",
- " query_params | \n",
- " q | \n",
- " size | \n",
- " current | \n",
- " ... | \n",
- " _sm_au_ | \n",
- " v | \n",
- " fbclid | \n",
- " filters[3][field] | \n",
- " trk | \n",
- " amp;amp;size | \n",
- " industries | \n",
- " school | \n",
- " course_of_study | \n",
- " None | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 7485 | \n",
- " 7688 | \n",
- " 2 | \n",
- " 1392 | \n",
- " 2022-05-29 06:50:26.338+00 | \n",
- " /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... | \n",
- " /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... | \n",
- " {'size': ['n_20_n'], 'filters[0][field]': ['in... | \n",
- " NaN | \n",
- " [n_20_n] | \n",
- " NaN | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " [Banking and Finance] | \n",
- " [National University of Singapore, Singapore M... | \n",
- " [Business Administration] | \n",
- " [] | \n",
- "
\n",
- " \n",
- "
\n",
- "
1 rows × 22 columns
\n",
- "
"
- ],
- "text/plain": [
- " view_id website_id session_id created_at \\\n",
- "7485 7688 2 1392 2022-05-29 06:50:26.338+00 \n",
- "\n",
- " url \\\n",
- "7485 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n",
- "\n",
- " referrer \\\n",
- "7485 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n",
- "\n",
- " query_params q size \\\n",
- "7485 {'size': ['n_20_n'], 'filters[0][field]': ['in... NaN [n_20_n] \n",
- "\n",
- " current ... _sm_au_ v fbclid filters[3][field] trk amp;amp;size \\\n",
- "7485 NaN ... NaN NaN NaN NaN NaN NaN \n",
- "\n",
- " industries \\\n",
- "7485 [Banking and Finance] \n",
- "\n",
- " school \\\n",
- "7485 [National University of Singapore, Singapore M... \n",
- "\n",
- " course_of_study None \n",
- "7485 [Business Administration] [] \n",
- "\n",
- "[1 rows x 22 columns]"
- ]
- },
- "execution_count": 164,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": 167,
- "id": "403e6169",
- "metadata": {},
- "outputs": [],
- "source": [
- "x = ['view_id', 'website_id', 'session_id', 'created_at', 'url', 'referrer', 'query_params',\n",
- " 'q', 'size', 'current', 'sort-field', 'sort-direction', '_sm_au_', 'v', 'fbclid', 'trk',\n",
- " 'amp;amp;size', 'industries', 'school', 'course_of_study', 'organisation']\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 168,
- "id": "36451395",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['view_id',\n",
- " 'website_id',\n",
- " 'session_id',\n",
- " 'created_at',\n",
- " 'url',\n",
- " 'referrer',\n",
- " 'query_params',\n",
- " 'q',\n",
- " 'size',\n",
- " 'current',\n",
- " 'sort-field',\n",
- " 'sort-direction',\n",
- " '_sm_au_',\n",
- " 'v',\n",
- " 'fbclid',\n",
- " 'trk',\n",
- " 'amp;amp;size',\n",
- " 'industries',\n",
- " 'school',\n",
- " 'course_of_study',\n",
- " 'organisation']"
- ]
- },
- "execution_count": 168,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "x"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "c2eff724",
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.4"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/04URLsInPostgreSQL/parsing urls.ipynb b/04URLsInPostgreSQL/parsing urls.ipynb
new file mode 100644
index 0000000..7fd5c6b
--- /dev/null
+++ b/04URLsInPostgreSQL/parsing urls.ipynb
@@ -0,0 +1,385 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Parsing URLs with data from PostgreSQL dump file\n",
+ "\n",
+ "Input dataframe:\n",
+ " - view_id\n",
+ " - website_id\n",
+ " - session_id\n",
+ " - created_at\n",
+ " - url\n",
+ " - referrer\n",
+ "\n",
+ "Output dataframe:\n",
+ "- industries (this will contain a list of all the values in this field)\n",
+ "- course_of_study\n",
+ "- organisation\n",
+ "- school\n",
+ "\n",
+ "Summary of insights gained: (if any)\n",
+ "\n",
+ "Written by: Howard and Jolene (only mostly optimizations)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "4ba88477",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 113589 entries, 0 to 113588\n",
+ "Data columns (total 6 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 view_id 113589 non-null int64 \n",
+ " 1 website_id 113589 non-null int64 \n",
+ " 2 session_id 113589 non-null int64 \n",
+ " 3 created_at 113589 non-null object\n",
+ " 4 url 113589 non-null object\n",
+ " 5 referrer 99529 non-null object\n",
+ "dtypes: int64(3), object(3)\n",
+ "memory usage: 5.2+ MB\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "from urllib.parse import urlparse, parse_qs, unquote\n",
+ "\n",
+ "original_df = pd.read_csv('v1_pageview')\n",
+ "original_df.info()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### Helper Functions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "e731453c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def extract_query_params(url):\n",
+ " url = unquote(url) # make it human readable, not percentages\n",
+ " query_params = parse_qs(urlparse(url).query)\n",
+ " return query_params"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Explaining process_query_params\n",
+ "\n",
+ "What does the url look like when navigating the page?\n",
+ "```\n",
+ "// filter by industry Information and Communications Technology\n",
+ "filters[0][field]=industries&filters[0][values][0]=Information and Communications Technology\n",
+ "\n",
+ "// filter by organization Google and SAP\n",
+ "filters[0][field]=organisation&filters[0][values][0]=SAP\n",
+ "filters[1][field]=organisation&filters[1][values][0]=Google\n",
+ "\n",
+ "// filter by school \n",
+ "filters[1][field]=school&filters[1][values][0]=National University of Singapore\n",
+ "filters[1][type]=any # we will ignore this part; not sure what it is (it is 'any' here, but 'all' appears in other URLs)\n",
+ "filters[2][field]=course_of_study\n",
+ "filters[2][values][0]=Economics%2C Psychology\n",
+ "filters[2][type]=any\n",
+ "```\n",
+ "\n",
+ "To summarize, the number in `filters[0][field]` is the zero-based position of the filter (0 is the first filter applied, 1 the second, and so on), while the second bracketed part, which is one of `field`, `values` or `type`, shows what the text after `=` represents. For example, `filters[1][field]=school&filters[1][values][0]=National University of Singapore` means that the second filter applied is a school filter, and the first value selected for it is National University of Singapore.\n",
+ "\n",
+ "So we can make use of this to extract the filters applied in each visitor's URL."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def process_query_params(params):\n",
+ " '''\n",
+ " Parameters:\n",
+ " params -> {'size': ['n_20_n'], 'filters[0][field]': ['industries'], 'filters[0][values][0]': ['Information and Communications Technology'], 'filters[0][type]': ['all']}\n",
+ " \n",
+ " Returns:\n",
+ " {\n",
+ " \"search_query\": \"search term\",\n",
+ " \"filter_name\": [\"filter value 1\", \"filter value 2\"]\n",
+ " }\n",
+ "\n",
+ " Note:\n",
+ " - The filter_name is the name of the filter, e.g. industries, school etc.\n",
+ " - size and type are ignored\n",
+ " '''\n",
+ " result = {}\n",
+ " current_field = ''\n",
+ "\n",
+ " for key, value in params.items():\n",
+ " if 'filters' in key:\n",
+ " parts = key.split('[')\n",
+ " field_or_value = parts[2].strip(']')\n",
+ "\n",
+ " if field_or_value == 'field':\n",
+ " # if its a field then use it as a key\n",
+ " current_field = value[0]\n",
+ " result[current_field] = []\n",
+ " elif field_or_value == 'type':\n",
+ " # there's a type of all in all queries, not sure what that is and whether its relevant\n",
+ " pass\n",
+ " else:\n",
+ " # if its a value then add it to the list by using the last saved field\n",
+ " result[current_field].extend(value)\n",
+ " elif key == \"q\":\n",
+ " result[\"search_query\"] = value[0]\n",
+ "\n",
+ " result = dict(result)\n",
+ " return result"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### Main"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Process URLs to extract query params"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = original_df.copy(deep=True)\n",
+ "df['url'] = df['url'].astype(str)\n",
+ "df['query_params'] = df['url'].apply(extract_query_params) # gets all the query params and makes it a dictionary "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# x = pd.concat([new_df, new_df['query_params'].apply(lambda x: pd.Series(x, dtype=\"object\"))], axis=1) #separate each of the key:value pairs into it's own column \n",
+ "\n",
+ "# x2 = x.assign(industries=lambda x: np.nan, \n",
+ "# school=lambda x: np.nan,\n",
+ "# course_of_study=lambda x: np.nan,\n",
+ "# organisation=lambda x: np.nan) #create copy with the columns that i want\n",
+ "\n",
+ "# def isnotNaN(num):\n",
+ "# return num == num\n",
+ "\n",
+ "# columns = ['view_id', 'website_id', 'session_id', 'created_at', 'url', 'referrer', 'query_params',\n",
+ "# 'q', 'size', 'current', 'sort-field', 'sort-direction', '_sm_au_', 'v', 'fbclid', 'trk',\n",
+ "# 'amp;amp;size', 'industries', 'school', 'course_of_study', 'organisation'] #columns that i eventually want\n",
+ "\n",
+ "\n",
+ "# for i in range(len(x2)): #go through original DF\n",
+ "\n",
+ "# row = x2.iloc[i] # for each row,\n",
+ "\n",
+ "# temp ={0:[None,list()], #theres 4 possible fields [field, list of values]\n",
+ "# 1:[None,list()],\n",
+ "# 2:[None,list()],\n",
+ "# 3:[None,list()]\n",
+ "# }\n",
+ "\n",
+ "# for j in range(4): #identify which field corresponds to reach index\n",
+ "# if isnotNaN(row.loc[f'filters[{j}][field]']):\n",
+ "# temp[j][0] = row.loc[f'filters[{j}][field]'][0]\n",
+ "# else:\n",
+ "# break\n",
+ "\n",
+ "# for k in range(4): #condense all the values for each of the fields into list of values\n",
+ "# for l in range(20):\n",
+ "# if f'filters[{k}][values][{l}]' in row.index and isnotNaN(row.loc[f'filters[{k}][values][{l}]']):\n",
+ "# temp[k][1].append(row.loc[f'filters[{k}][values][{l}]'][0]) \n",
+ "# else:\n",
+ "# break\n",
+ "\n",
+ "# for z in range(4): #add the new column:values to dataframe\n",
+ "# field = temp[z][0]\n",
+ "# values = temp[z][1]\n",
+ "# if field is None:\n",
+ "# break\n",
+ "# else:\n",
+ "# x2.loc[i,field] = str(values)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# optimized version of the code above\n",
+ "df_processed = df.copy(deep=True)\n",
+ "df_processed['query_params'] = df_processed['query_params'].apply(process_query_params) # use only 1 for loop\n",
+ "df_processed = pd.DataFrame(df_processed['query_params'].values.tolist())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Why are there so many additional columns?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 125,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "indust\n",
+ "[] 4\n",
+ "Name: count, dtype: int64\n",
+ "ind\n",
+ "[] 22\n",
+ "Name: count, dtype: int64\n",
+ "i\n",
+ "[] 3\n",
+ "Name: count, dtype: int64\n",
+ "cou\n",
+ "[] 3\n",
+ "Name: count, dtype: int64\n",
+ "sch\n",
+ "[] 1\n",
+ "Name: count, dtype: int64\n",
+ "o\n",
+ "[] 1\n",
+ "Name: count, dtype: int64\n",
+ "course_of\n",
+ "[] 1\n",
+ "Name: count, dtype: int64\n",
+ "wave_id\n",
+ "[n_2_n] 8\n",
+ "[n_1_n] 7\n",
+ "[n_3_n] 5\n",
+ "[n_0_n] 2\n",
+ "[n_0_n, n_2_n] 1\n",
+ "[n_3_n, n_1_n] 1\n",
+ "Name: count, dtype: int64\n"
+ ]
+ }
+ ],
+ "source": [
+ "# ['i', 'ind', 'cou', 'indust', 'o', 'sch', 'course_of']\n",
+ "print(df_processed['indust'].value_counts())\n",
+ "print(df_processed['ind'].value_counts())\n",
+ "print(df_processed['i'].value_counts())\n",
+ "print(df_processed['cou'].value_counts())\n",
+ "print(df_processed['sch'].value_counts())\n",
+ "print(df_processed['o'].value_counts())\n",
+ "print(df_processed['course_of'].value_counts())\n",
+ "print(df_processed['wave_id'].value_counts())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We see that the additional columns all contain only empty lists, so we can safely drop them. As for wave_id, we will leave it in for now."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 126,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 113589 entries, 0 to 113588\n",
+ "Data columns (total 7 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 search_query 20959 non-null object\n",
+ " 1 industries 29829 non-null object\n",
+ " 2 course_of_study 7103 non-null object\n",
+ " 3 organisation 13074 non-null object\n",
+ " 4 school 3657 non-null object\n",
+ " 5 course 1 non-null object\n",
+ " 6 wave_id 24 non-null object\n",
+ "dtypes: object(7)\n",
+ "memory usage: 6.1+ MB\n"
+ ]
+ }
+ ],
+ "source": [
+ "# drop unused columns ['i', 'ind', 'cou', 'indust', 'o', 'sch', 'course_of']\n",
+ "df_processed = df_processed.drop(['i', 'ind', 'cou', 'indust', 'o', 'sch', 'course_of'], axis=1)\n",
+ "df_processed.info()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Export to CSV"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "57f88523",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_processed.to_csv('data-preprocessed.csv', index=False)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}