diff --git a/02ReformattingUmami/parsing urls.ipynb b/02ReformattingUmami/parsing urls.ipynb deleted file mode 100644 index fc8ae17..0000000 --- a/02ReformattingUmami/parsing urls.ipynb +++ /dev/null @@ -1,2140 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 11, - "id": "4ba88477", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "from urllib.parse import urlparse, parse_qs, unquote" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "c20ad040", - "metadata": {}, - "outputs": [], - "source": [ - "original_df = pd.read_csv('v1_pageview')" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "e731453c", - "metadata": {}, - "outputs": [], - "source": [ - "def extract_query_params(url):\n", - " url = unquote(url) #make it human readable, not percentages\n", - " query_params = parse_qs(urlparse(url).query)\n", - " return query_params\n", - "\n", - "new_df = original_df.copy() \n", - "new_df['url'] = new_df['url'].astype(str)\n", - "new_df['query_params'] = new_df['url'].apply(extract_query_params) #gets all the query params and makes it a dictionary \n" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "c7471d8e", - "metadata": {}, - "outputs": [], - "source": [ - "x = pd.concat([new_df, new_df['query_params'].apply(lambda x: pd.Series(x, dtype=\"object\"))], axis=1) #separate each of the key:value pairs into it's own column " - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "5604d86c", - "metadata": {}, - "outputs": [], - "source": [ - "x2 = x.assign(industries=lambda x: np.nan, \n", - " school=lambda x: np.nan,\n", - " course_of_study=lambda x: np.nan,\n", - " organisation=lambda x: np.nan) #create copy with the columns that i want" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "id": "33cd1847", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - 
"1019300\n", - "198379000\n", - "400201500\n", - "588100000\n", - "771322300\n", - "967610800\n", - "1166690100\n", - "1361307400\n", - "1567462400\n", - "1760879700\n", - "1972686300\n", - "2173218500\n", - "2374591800\n", - "2586973200\n", - "2845110800\n", - "3069079200\n", - "3274099700\n", - "3490637300\n", - "3738567700\n", - "3945910200\n", - "4154997800\n", - "4394562500\n", - "4606142900\n", - "4810549700\n", - "5002853500\n", - "5205447300\n", - "5402053600\n", - "5595158200\n", - "5795015500\n", - "6013136400\n", - "6219393100\n", - "6423695500\n", - "6616866700\n", - "6835747800\n", - "7024130500\n", - "7217490600\n", - "7436969500\n", - "7646088200\n", - "7847100200\n", - "8058185700\n", - "8270893600\n", - "8482217300\n", - "8679724000\n", - "8879031800\n", - "9058311200\n", - "9247930700\n", - "9451453100\n", - "9668764300\n", - "9886907500\n", - "10091585800\n", - "10282182200\n", - "10469907500\n", - "10689785100\n", - "10905295500\n", - "11129143200\n", - "11329276500\n", - "11520791000\n", - "11730772100\n", - "11920881500\n", - "12137022700\n", - "12350234400\n", - "12557910000\n", - "12759372500\n", - "12976088100\n", - "13189948600\n", - "13401493300\n", - "13584390200\n", - "13768548200\n", - "13947810100\n", - "14144238300\n", - "14319362600\n", - "14499509900\n", - "14671370800\n", - "14847169000\n", - "15009801100\n", - "15171386200\n", - "15343520400\n", - "15518651900\n", - "15691240500\n", - "15858359500\n", - "16034491600\n", - "16212696400\n", - "16380638300\n", - "16547752500\n", - "16713832500\n", - "16887430900\n", - "17054953600\n", - "17230079900\n", - "17404971300\n", - "17576107500\n", - "17765259700\n", - "17946401500\n", - "18122570500\n", - "18298027500\n", - "18490654000\n", - "18676347300\n", - "18858481400\n", - "19062635600\n", - "19240343800\n", - "19441518900\n", - "19632663700\n", - "19829562000\n", - "20001634200\n", - "20149930500\n", - "20318680000\n", - "20503769000\n", - "20667341800\n", - "20853630300\n", - 
"21018679200\n", - "21212859100\n", - "21388865100\n", - "21575672800\n", - "21758869500\n", - "21933173500\n" - ] - } - ], - "source": [ - "def isnotNaN(num):\n", - " return num == num\n", - "\n", - "columns = ['view_id', 'website_id', 'session_id', 'created_at', 'url', 'referrer', 'query_params',\n", - " 'q', 'size', 'current', 'sort-field', 'sort-direction', '_sm_au_', 'v', 'fbclid', 'trk',\n", - " 'amp;amp;size', 'industries', 'school', 'course_of_study', 'organisation'] #columns that i eventually want\n", - "\n", - "\n", - "for i in range(len(x2)): #go through original DF\n", - "\n", - " row = x2.iloc[i] # for each row,\n", - "\n", - " temp ={0:[None,list()], #theres 4 possible fields [field, list of values]\n", - " 1:[None,list()],\n", - " 2:[None,list()],\n", - " 3:[None,list()]\n", - " }\n", - "\n", - " for j in range(4): #identify which field corresponds to reach index\n", - " if isnotNaN(row.loc[f'filters[{j}][field]']):\n", - " temp[j][0] = row.loc[f'filters[{j}][field]'][0]\n", - " else:\n", - " break\n", - "\n", - " for k in range(4): #condense all the values for each of the fields into list of values\n", - " for l in range(20):\n", - " if f'filters[{k}][values][{l}]' in row.index and isnotNaN(row.loc[f'filters[{k}][values][{l}]']):\n", - " temp[k][1].append(row.loc[f'filters[{k}][values][{l}]'][0]) \n", - " else:\n", - " break\n", - "\n", - " for z in range(4): #add the new column:values to dataframe\n", - " field = temp[z][0]\n", - " values = temp[z][1]\n", - " if field is None:\n", - " break\n", - " else:\n", - " x2.loc[i,field] = str(values)\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "id": "57f88523", - "metadata": {}, - "outputs": [], - "source": [ - "x2.to_csv('data-preprocessed.csv', index=False)" - ] - }, - { - "cell_type": "markdown", - "id": "473214f9", - "metadata": {}, - "source": [ - "testing\n", - "\n", - "


























" - ] - }, - { - "cell_type": "code", - "execution_count": 195, - "id": "b8ab56af", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
industriesschoolcourse_of_studyorganisation
7480NaNNaNNaNNaN
7481NaNNaNNaNNaN
7482[Banking and Finance]NaNNaNNaN
7483[Banking and Finance][National University of Singapore]NaNNaN
7484[Banking and Finance][National University of Singapore, Singapore M...NaNNaN
7485[Banking and Finance][National University of Singapore, Singapore M...[Business Administration]NaN
7486NaNNaNNaNNaN
7487[Banking and Finance]NaNNaNNaN
7488NaNNaNNaNNaN
7489NaNNaNNaNNaN
\n", - "
" - ], - "text/plain": [ - " industries \\\n", - "7480 NaN \n", - "7481 NaN \n", - "7482 [Banking and Finance] \n", - "7483 [Banking and Finance] \n", - "7484 [Banking and Finance] \n", - "7485 [Banking and Finance] \n", - "7486 NaN \n", - "7487 [Banking and Finance] \n", - "7488 NaN \n", - "7489 NaN \n", - "\n", - " school \\\n", - "7480 NaN \n", - "7481 NaN \n", - "7482 NaN \n", - "7483 [National University of Singapore] \n", - "7484 [National University of Singapore, Singapore M... \n", - "7485 [National University of Singapore, Singapore M... \n", - "7486 NaN \n", - "7487 NaN \n", - "7488 NaN \n", - "7489 NaN \n", - "\n", - " course_of_study organisation \n", - "7480 NaN NaN \n", - "7481 NaN NaN \n", - "7482 NaN NaN \n", - "7483 NaN NaN \n", - "7484 NaN NaN \n", - "7485 [Business Administration] NaN \n", - "7486 NaN NaN \n", - "7487 NaN NaN \n", - "7488 NaN NaN \n", - "7489 NaN NaN " - ] - }, - "execution_count": 195, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "a.iloc[7480:7490,17:21]" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "id": "340baaf6", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
view_idwebsite_idsession_idcreated_aturlreferrerquery_paramsqsizefilters[0][field]...fbclidfilters[3][field]trkfilters[3][values][0]filters[3][values][1]amp;amp;sizeindustriesschoolcourse_of_studyorganisation
74817684213922022-05-29 06:50:08.711+00/?size=n_20_n/{'size': ['n_20_n']}NaN[n_20_n]NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
74827685213922022-05-29 06:50:12.671+00/?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust.../?size=n_20_n{'size': ['n_20_n'], 'filters[0][field]': ['in...NaN[n_20_n][industries]...NaNNaNNaNNaNNaNNaN['Banking and Finance']NaNNaNNaN
74837686213922022-05-29 06:50:18.17+00/?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust.../?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...{'size': ['n_20_n'], 'filters[0][field]': ['in...NaN[n_20_n][industries]...NaNNaNNaNNaNNaNNaN['Banking and Finance']['National University of Singapore']NaNNaN
74847687213922022-05-29 06:50:20.512+00/?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust.../?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...{'size': ['n_20_n'], 'filters[0][field]': ['in...NaN[n_20_n][industries]...NaNNaNNaNNaNNaNNaN['Banking and Finance']['National University of Singapore', 'Singapor...NaNNaN
74857688213922022-05-29 06:50:26.338+00/?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust.../?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...{'size': ['n_20_n'], 'filters[0][field]': ['in...NaN[n_20_n][industries]...NaNNaNNaNNaNNaNNaN['Banking and Finance']['National University of Singapore', 'Singapor...['Business Administration']NaN
..................................................................
1135841138141161392023-04-24 14:18:54.34+00/https://static.elfsight.com/{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1135851138151161392023-04-24 14:51:55.512+00/events/https://beta.advisory.sg/{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1135861138161161392023-04-24 14:52:04.993+00/press-releases/NaN{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1135871138171161402023-04-24 16:38:43.474+00/2017/10/05/conversations-with-tee-chee-yen/http://localhost:2368/{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1135881138181161402023-04-24 16:40:18.422+00/2017/07/30/conversations-with-marvin-kang/http://localhost:2368/tag/social-service/{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", - "

106108 rows × 51 columns

\n", - "
" - ], - "text/plain": [ - " view_id website_id session_id created_at \\\n", - "7481 7684 2 1392 2022-05-29 06:50:08.711+00 \n", - "7482 7685 2 1392 2022-05-29 06:50:12.671+00 \n", - "7483 7686 2 1392 2022-05-29 06:50:18.17+00 \n", - "7484 7687 2 1392 2022-05-29 06:50:20.512+00 \n", - "7485 7688 2 1392 2022-05-29 06:50:26.338+00 \n", - "... ... ... ... ... \n", - "113584 113814 1 16139 2023-04-24 14:18:54.34+00 \n", - "113585 113815 1 16139 2023-04-24 14:51:55.512+00 \n", - "113586 113816 1 16139 2023-04-24 14:52:04.993+00 \n", - "113587 113817 1 16140 2023-04-24 16:38:43.474+00 \n", - "113588 113818 1 16140 2023-04-24 16:40:18.422+00 \n", - "\n", - " url \\\n", - "7481 /?size=n_20_n \n", - "7482 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", - "7483 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", - "7484 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", - "7485 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", - "... ... \n", - "113584 / \n", - "113585 /events/ \n", - "113586 /press-releases/ \n", - "113587 /2017/10/05/conversations-with-tee-chee-yen/ \n", - "113588 /2017/07/30/conversations-with-marvin-kang/ \n", - "\n", - " referrer \\\n", - "7481 / \n", - "7482 /?size=n_20_n \n", - "7483 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", - "7484 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", - "7485 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", - "... ... \n", - "113584 https://static.elfsight.com/ \n", - "113585 https://beta.advisory.sg/ \n", - "113586 NaN \n", - "113587 http://localhost:2368/ \n", - "113588 http://localhost:2368/tag/social-service/ \n", - "\n", - " query_params q size \\\n", - "7481 {'size': ['n_20_n']} NaN [n_20_n] \n", - "7482 {'size': ['n_20_n'], 'filters[0][field]': ['in... NaN [n_20_n] \n", - "7483 {'size': ['n_20_n'], 'filters[0][field]': ['in... NaN [n_20_n] \n", - "7484 {'size': ['n_20_n'], 'filters[0][field]': ['in... 
NaN [n_20_n] \n", - "7485 {'size': ['n_20_n'], 'filters[0][field]': ['in... NaN [n_20_n] \n", - "... ... ... ... \n", - "113584 {} NaN NaN \n", - "113585 {} NaN NaN \n", - "113586 {} NaN NaN \n", - "113587 {} NaN NaN \n", - "113588 {} NaN NaN \n", - "\n", - " filters[0][field] ... fbclid filters[3][field] trk \\\n", - "7481 NaN ... NaN NaN NaN \n", - "7482 [industries] ... NaN NaN NaN \n", - "7483 [industries] ... NaN NaN NaN \n", - "7484 [industries] ... NaN NaN NaN \n", - "7485 [industries] ... NaN NaN NaN \n", - "... ... ... ... ... ... \n", - "113584 NaN ... NaN NaN NaN \n", - "113585 NaN ... NaN NaN NaN \n", - "113586 NaN ... NaN NaN NaN \n", - "113587 NaN ... NaN NaN NaN \n", - "113588 NaN ... NaN NaN NaN \n", - "\n", - " filters[3][values][0] filters[3][values][1] amp;amp;size \\\n", - "7481 NaN NaN NaN \n", - "7482 NaN NaN NaN \n", - "7483 NaN NaN NaN \n", - "7484 NaN NaN NaN \n", - "7485 NaN NaN NaN \n", - "... ... ... ... \n", - "113584 NaN NaN NaN \n", - "113585 NaN NaN NaN \n", - "113586 NaN NaN NaN \n", - "113587 NaN NaN NaN \n", - "113588 NaN NaN NaN \n", - "\n", - " industries \\\n", - "7481 NaN \n", - "7482 ['Banking and Finance'] \n", - "7483 ['Banking and Finance'] \n", - "7484 ['Banking and Finance'] \n", - "7485 ['Banking and Finance'] \n", - "... ... \n", - "113584 NaN \n", - "113585 NaN \n", - "113586 NaN \n", - "113587 NaN \n", - "113588 NaN \n", - "\n", - " school \\\n", - "7481 NaN \n", - "7482 NaN \n", - "7483 ['National University of Singapore'] \n", - "7484 ['National University of Singapore', 'Singapor... \n", - "7485 ['National University of Singapore', 'Singapor... \n", - "... ... \n", - "113584 NaN \n", - "113585 NaN \n", - "113586 NaN \n", - "113587 NaN \n", - "113588 NaN \n", - "\n", - " course_of_study organisation \n", - "7481 NaN NaN \n", - "7482 NaN NaN \n", - "7483 NaN NaN \n", - "7484 NaN NaN \n", - "7485 ['Business Administration'] NaN \n", - "... ... ... 
\n", - "113584 NaN NaN \n", - "113585 NaN NaN \n", - "113586 NaN NaN \n", - "113587 NaN NaN \n", - "113588 NaN NaN \n", - "\n", - "[106108 rows x 51 columns]" - ] - }, - "execution_count": 60, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "x2.iloc[7481:,:-9]" - ] - }, - { - "cell_type": "code", - "execution_count": 157, - "id": "794a6b93", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
view_idwebsite_idsession_idcreated_aturlreferrerquery_paramsqsizecurrent...sort-direction_sm_au_vfbclidtrkamp;amp;sizeindustriesschoolcourse_of_studyorganisation
\n", - "

0 rows × 21 columns

\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: [view_id, website_id, session_id, created_at, url, referrer, query_params, q, size, current, sort-field, sort-direction, _sm_au_, v, fbclid, trk, amp;amp;size, industries, school, course_of_study, organisation]\n", - "Index: []\n", - "\n", - "[0 rows x 21 columns]" - ] - }, - "execution_count": 157, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "columns = ['view_id', 'website_id', 'session_id', 'created_at', 'url', 'referrer', 'query_params', 'q', 'size', 'current', 'sort-field', 'sort-direction', '_sm_au_', 'v', 'fbclid', 'trk', 'amp;amp;size', 'industries', 'school', 'course_of_study', 'organisation']\n", - "y = pd.DataFrame(columns=columns)\n", - "y" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cc3ea68f", - "metadata": {}, - "outputs": [], - "source": [ - "18888237600\n", - "40737104200\n", - "72785283300\n", - "108758322400\n", - "145217173600\n", - "186793474100\n", - "225162609200\n", - "264882508000\n", - "314453042500\n", - "365454468800\n", - "408983182300\n", - "455546595500\n", - "504377514500\n", - "549119827600\n", - "591181220700\n", - "635682159900\n", - "682860726900\n", - "732106724800\n", - "782814304400\n", - "835621612600\n", - "891255040500\n", - "943683757500\n", - "983562692700\n", - "1023982651900\n", - "1065614924700\n", - "1108148358200\n", - "1151829557700\n", - "1196612275400\n", - "1242725911700\n", - "1291049334900\n", - "1341243997100\n", - "1392687859900\n", - "1445250854800\n", - "1498960928900\n", - "1553929939500\n", - "1611468594200\n", - "1671760449600\n", - "1734306435500\n", - "1795385092900\n", - "1857648008700\n", - "1921126209100\n", - "1989747478400\n", - "2056883359300\n", - "2124365102700\n", - "2192953255500\n", - "2262542653300\n", - "2333537347900\n", - "2407247346000\n", - "2481174709300\n", - "2556079180300\n", - "2632721770500\n", - "2711853018800\n", - "2797623695000\n", - "2885214009200\n", - 
"2969392615200\n", - "3052316078600\n", - "3135706844500\n", - "3220985459900\n", - "3308010920400\n", - "3394591298400\n", - "3481977400200\n", - "3570721798700\n", - "3660806746900\n", - "3751976516100\n", - "3844537433200\n", - "3938891733100\n", - "4034454066400\n", - "4137224107900\n", - "4239237949900\n", - "4341107871900\n", - "4445053302400\n", - "4550355588000\n", - "4656088215800\n", - "4760841571100\n", - "4866702552200\n", - "4973539051500\n", - "5082814611800\n", - "5192301727000\n", - "5303393414800\n", - "5415569427500\n", - "5529336622100\n", - "5644090487900\n", - "7905513294400\n", - "8022122905300\n", - "8142019076400\n", - "8262193145400\n", - "8413007131000\n", - "8569099970100\n", - "8699897960500\n", - "8861730423300\n", - "9046117615700\n", - "9196323885200\n", - "9364978097600\n", - "9523016066800\n", - "9688182973700\n", - "9856712632500\n", - "10026784638300\n", - "10197057256100\n", - "10319851429200\n", - "10375110798500\n", - "10430841262800\n", - "10487271861600\n", - "10544539678800\n", - "10602246570200\n", - "10659615402100\n", - "10718192663600\n", - "10777064820100\n", - "10836482106000\n", - "10896603119500\n", - "10957077660600\n", - "11020635868700" - ] - }, - { - "cell_type": "code", - "execution_count": 160, - "id": "ded3c20e", - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.concat([pd.DataFrame(row, columns=columns), y], ignore_index=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 161, - "id": "36d21011", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
view_idwebsite_idsession_idcreated_aturlreferrerquery_paramsqsizecurrent...sort-direction_sm_au_vfbclidtrkamp;amp;sizeindustriesschoolcourse_of_studyorganisation
\n", - "

0 rows × 21 columns

\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: [view_id, website_id, session_id, created_at, url, referrer, query_params, q, size, current, sort-field, sort-direction, _sm_au_, v, fbclid, trk, amp;amp;size, industries, school, course_of_study, organisation]\n", - "Index: []\n", - "\n", - "[0 rows x 21 columns]" - ] - }, - "execution_count": 161, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df\n" - ] - }, - { - "cell_type": "code", - "execution_count": 140, - "id": "17f73e73", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
view_idwebsite_idsession_idcreated_aturlreferrerquery_paramsqsizefilters[0][field]...filters[0][values][8]filters[0][values][9]_sm_au_vfbclidfilters[3][field]trkfilters[3][values][0]filters[3][values][1]amp;amp;size
02332932022-05-24 01:43:25.814+00/?q=chemical&size=n_20_nNaN{'q': ['chemical'], 'size': ['n_20_n']}[chemical][n_20_n]NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
12342942022-05-24 01:45:16.891+00/NaN{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22352942022-05-24 01:45:17.571+00/?size=n_20_n/{'size': ['n_20_n']}NaN[n_20_n]NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
32362942022-05-24 01:46:04.09+00/?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust.../?size=n_20_n{'size': ['n_20_n'], 'filters[0][field]': ['in...NaN[n_20_n][industries]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
42372942022-05-24 01:46:10.66+00/?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust.../?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...{'size': ['n_20_n'], 'filters[0][field]': ['in...NaN[n_20_n][industries]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
..................................................................
1135841138141161392023-04-24 14:18:54.34+00/https://static.elfsight.com/{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1135851138151161392023-04-24 14:51:55.512+00/events/https://beta.advisory.sg/{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1135861138161161392023-04-24 14:52:04.993+00/press-releases/NaN{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1135871138171161402023-04-24 16:38:43.474+00/2017/10/05/conversations-with-tee-chee-yen/http://localhost:2368/{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1135881138181161402023-04-24 16:40:18.422+00/2017/07/30/conversations-with-marvin-kang/http://localhost:2368/tag/social-service/{}NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", - "

113589 rows × 47 columns

\n", - "
" - ], - "text/plain": [ - " view_id website_id session_id created_at \\\n", - "0 233 2 93 2022-05-24 01:43:25.814+00 \n", - "1 234 2 94 2022-05-24 01:45:16.891+00 \n", - "2 235 2 94 2022-05-24 01:45:17.571+00 \n", - "3 236 2 94 2022-05-24 01:46:04.09+00 \n", - "4 237 2 94 2022-05-24 01:46:10.66+00 \n", - "... ... ... ... ... \n", - "113584 113814 1 16139 2023-04-24 14:18:54.34+00 \n", - "113585 113815 1 16139 2023-04-24 14:51:55.512+00 \n", - "113586 113816 1 16139 2023-04-24 14:52:04.993+00 \n", - "113587 113817 1 16140 2023-04-24 16:38:43.474+00 \n", - "113588 113818 1 16140 2023-04-24 16:40:18.422+00 \n", - "\n", - " url \\\n", - "0 /?q=chemical&size=n_20_n \n", - "1 / \n", - "2 /?size=n_20_n \n", - "3 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", - "4 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", - "... ... \n", - "113584 / \n", - "113585 /events/ \n", - "113586 /press-releases/ \n", - "113587 /2017/10/05/conversations-with-tee-chee-yen/ \n", - "113588 /2017/07/30/conversations-with-marvin-kang/ \n", - "\n", - " referrer \\\n", - "0 NaN \n", - "1 NaN \n", - "2 / \n", - "3 /?size=n_20_n \n", - "4 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", - "... ... \n", - "113584 https://static.elfsight.com/ \n", - "113585 https://beta.advisory.sg/ \n", - "113586 NaN \n", - "113587 http://localhost:2368/ \n", - "113588 http://localhost:2368/tag/social-service/ \n", - "\n", - " query_params q \\\n", - "0 {'q': ['chemical'], 'size': ['n_20_n']} [chemical] \n", - "1 {} NaN \n", - "2 {'size': ['n_20_n']} NaN \n", - "3 {'size': ['n_20_n'], 'filters[0][field]': ['in... NaN \n", - "4 {'size': ['n_20_n'], 'filters[0][field]': ['in... NaN \n", - "... ... ... \n", - "113584 {} NaN \n", - "113585 {} NaN \n", - "113586 {} NaN \n", - "113587 {} NaN \n", - "113588 {} NaN \n", - "\n", - " size filters[0][field] ... filters[0][values][8] \\\n", - "0 [n_20_n] NaN ... NaN \n", - "1 NaN NaN ... NaN \n", - "2 [n_20_n] NaN ... NaN \n", - "3 [n_20_n] [industries] ... 
NaN \n", - "4 [n_20_n] [industries] ... NaN \n", - "... ... ... ... ... \n", - "113584 NaN NaN ... NaN \n", - "113585 NaN NaN ... NaN \n", - "113586 NaN NaN ... NaN \n", - "113587 NaN NaN ... NaN \n", - "113588 NaN NaN ... NaN \n", - "\n", - " filters[0][values][9] _sm_au_ v fbclid filters[3][field] trk \\\n", - "0 NaN NaN NaN NaN NaN NaN \n", - "1 NaN NaN NaN NaN NaN NaN \n", - "2 NaN NaN NaN NaN NaN NaN \n", - "3 NaN NaN NaN NaN NaN NaN \n", - "4 NaN NaN NaN NaN NaN NaN \n", - "... ... ... ... ... ... ... \n", - "113584 NaN NaN NaN NaN NaN NaN \n", - "113585 NaN NaN NaN NaN NaN NaN \n", - "113586 NaN NaN NaN NaN NaN NaN \n", - "113587 NaN NaN NaN NaN NaN NaN \n", - "113588 NaN NaN NaN NaN NaN NaN \n", - "\n", - " filters[3][values][0] filters[3][values][1] amp;amp;size \n", - "0 NaN NaN NaN \n", - "1 NaN NaN NaN \n", - "2 NaN NaN NaN \n", - "3 NaN NaN NaN \n", - "4 NaN NaN NaN \n", - "... ... ... ... \n", - "113584 NaN NaN NaN \n", - "113585 NaN NaN NaN \n", - "113586 NaN NaN NaN \n", - "113587 NaN NaN NaN \n", - "113588 NaN NaN NaN \n", - "\n", - "[113589 rows x 47 columns]" - ] - }, - "execution_count": 140, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "x" - ] - }, - { - "cell_type": "code", - "execution_count": 147, - "id": "a5c9804c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "view_id 7688\n", - "website_id 2\n", - "session_id 1392\n", - "created_at 2022-05-29 06:50:26.338+00\n", - "url /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...\n", - "referrer /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...\n", - "query_params {'size': ['n_20_n'], 'filters[0][field]': ['in...\n", - "q NaN\n", - "size [n_20_n]\n", - "filters[0][field] [industries]\n", - "filters[0][values][0] [Banking and Finance]\n", - "filters[0][type] [all]\n", - "filters[0][values][1] NaN\n", - "filters[0][values][2] NaN\n", - "current NaN\n", - "sort-field NaN\n", - "sort-direction NaN\n", - "filters[1][field] [school]\n", - 
"filters[1][values][0] [National University of Singapore]\n", - "filters[1][type] [any]\n", - "filters[0][values][3] NaN\n", - "filters[0][values][4] NaN\n", - "filters[1][values][1] [Singapore Management University]\n", - "filters[1][values][2] NaN\n", - "filters[1][values][3] NaN\n", - "filters[1][values][4] NaN\n", - "filters[1][values][5] NaN\n", - "filters[2][field] [course_of_study]\n", - "filters[2][values][0] [Business Administration]\n", - "filters[2][values][1] NaN\n", - "filters[2][values][2] NaN\n", - "filters[2][values][3] NaN\n", - "filters[1][values][6] NaN\n", - "filters[2][type] [any]\n", - "filters[0][values][5] NaN\n", - "filters[0][values][6] NaN\n", - "filters[0][values][7] NaN\n", - "filters[0][values][8] NaN\n", - "filters[0][values][9] NaN\n", - "_sm_au_ NaN\n", - "v NaN\n", - "fbclid NaN\n", - "filters[3][field] NaN\n", - "trk NaN\n", - "filters[3][values][0] NaN\n", - "filters[3][values][1] NaN\n", - "amp;amp;size NaN\n", - "Name: 7485, dtype: object" - ] - }, - "execution_count": 147, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "row = x.iloc[7485]\n", - "row" - ] - }, - { - "cell_type": "code", - "execution_count": 148, - "id": "2335961c", - "metadata": {}, - "outputs": [], - "source": [ - "def isnotNaN(num):\n", - " return num == num\n", - "\n", - "temp ={0:[None,list()],\n", - " 1:[None,list()],\n", - " 2:[None,list()],\n", - " 3:[None,list()]\n", - " }\n", - "\n", - "for i in range(4):\n", - " if isnotNaN(row.loc[f'filters[{i}][field]']):\n", - " temp[i][0] = row.loc[f'filters[{i}][field]'][0]\n", - " \n", - "for i in range(4):\n", - " for j in range(20):\n", - " if f'filters[{i}][values][{j}]' in row.index and isnotNaN(row.loc[f'filters[{i}][values][{j}]']):\n", - " temp[i][1].append(row.loc[f'filters[{i}][values][{j}]'][0]) \n", - " \n", - "for i in range(4):\n", - " for j in range(20):\n", - " if f'filters[{i}][values][{j}]' in row.index:\n", - " row = 
row.drop(labels=f'filters[{i}][values][{j}]')\n", - " \n", - "for i in range(4):\n", - " if isnotNaN(row.loc[f'filters[{i}][field]']):\n", - " row = row.drop(labels=[f'filters[{i}][field]', f'filters[{i}][type]'])\n", - " \n", - "for i in range(4):\n", - " field = temp[i][0]\n", - " values = temp[i][1]\n", - " row[field] = values" - ] - }, - { - "cell_type": "code", - "execution_count": 116, - "id": "e393f951", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 114, - "id": "9c3dba3e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{0: ['industries', ['Banking and Finance']],\n", - " 1: ['school',\n", - " ['National University of Singapore', 'Singapore Management University']],\n", - " 2: ['course_of_study', ['Business Administration']],\n", - " 3: [None, []]}" - ] - }, - "execution_count": 114, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "temp" - ] - }, - { - "cell_type": "code", - "execution_count": 117, - "id": "b57d4ca9", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "view_id 7688\n", - "website_id 2\n", - "session_id 1392\n", - "created_at 2022-05-29 06:50:26.338+00\n", - "url /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...\n", - "referrer /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...\n", - "query_params {'size': ['n_20_n'], 'filters[0][field]': ['in...\n", - "q NaN\n", - "size [n_20_n]\n", - "current NaN\n", - "sort-field NaN\n", - "sort-direction NaN\n", - "_sm_au_ NaN\n", - "v NaN\n", - "fbclid NaN\n", - "filters[3][field] NaN\n", - "trk NaN\n", - "amp;amp;size NaN\n", - "industries [Banking and Finance]\n", - "school [National University of Singapore, Singapore M...\n", - "course_of_study [Business Administration]\n", - "None []\n", - "Name: 7485, dtype: object" - ] - }, - "execution_count": 117, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "row" - ] - }, - { - "cell_type": "code", - 
"execution_count": 149, - "id": "5d91fe23", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index([ 'view_id', 'website_id', 'session_id',\n", - " 'created_at', 'url', 'referrer',\n", - " 'query_params', 'q', 'size',\n", - " 'current', 'sort-field', 'sort-direction',\n", - " '_sm_au_', 'v', 'fbclid',\n", - " 'filters[3][field]', 'trk', 'amp;amp;size',\n", - " 'industries', 'school', 'course_of_study',\n", - " None],\n", - " dtype='object')" - ] - }, - "execution_count": 149, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "row.index" - ] - }, - { - "cell_type": "code", - "execution_count": 96, - "id": "d3b752a2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "pandas.core.series.Series" - ] - }, - "execution_count": 96, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "type(row)" - ] - }, - { - "cell_type": "code", - "execution_count": 153, - "id": "043c5d16", - "metadata": {}, - "outputs": [], - "source": [ - "l = list(row.index)\n", - "l.append('organisation')" - ] - }, - { - "cell_type": "code", - "execution_count": 155, - "id": "7787dc1e", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['view_id', 'website_id', 'session_id', 'created_at', 'url', 'referrer', 'query_params', 'q', 'size', 'current', 'sort-field', 'sort-direction', '_sm_au_', 'v', 'fbclid', 'filters[3][field]', 'trk', 'amp;amp;size', 'industries', 'school', 'course_of_study', None, 'organisation']\n" - ] - } - ], - "source": [ - "print(l)" - ] - }, - { - "cell_type": "code", - "execution_count": 158, - "id": "b21898f8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "view_id 7688\n", - "website_id 2\n", - "session_id 1392\n", - "created_at 2022-05-29 06:50:26.338+00\n", - "url /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...\n", - "referrer /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...\n", - "query_params {'size': 
['n_20_n'], 'filters[0][field]': ['in...\n", - "q NaN\n", - "size [n_20_n]\n", - "current NaN\n", - "sort-field NaN\n", - "sort-direction NaN\n", - "_sm_au_ NaN\n", - "v NaN\n", - "fbclid NaN\n", - "filters[3][field] NaN\n", - "trk NaN\n", - "amp;amp;size NaN\n", - "industries [Banking and Finance]\n", - "school [National University of Singapore, Singapore M...\n", - "course_of_study [Business Administration]\n", - "None []\n", - "Name: 7485, dtype: object" - ] - }, - "execution_count": 158, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "row" - ] - }, - { - "cell_type": "code", - "execution_count": 166, - "id": "a3a0f1c7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
view_idwebsite_idsession_idcreated_aturlreferrerquery_paramsqsizecurrent...sort-direction_sm_au_vfbclidtrkamp;amp;sizeindustriesschoolcourse_of_studyorganisation
\n", - "

0 rows × 21 columns

\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: [view_id, website_id, session_id, created_at, url, referrer, query_params, q, size, current, sort-field, sort-direction, _sm_au_, v, fbclid, trk, amp;amp;size, industries, school, course_of_study, organisation]\n", - "Index: []\n", - "\n", - "[0 rows x 21 columns]" - ] - }, - "execution_count": 166, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "a = pd.DataFrame(columns=columns)\n", - "a" - ] - }, - { - "cell_type": "code", - "execution_count": 164, - "id": "bc449824", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
view_idwebsite_idsession_idcreated_aturlreferrerquery_paramsqsizecurrent..._sm_au_vfbclidfilters[3][field]trkamp;amp;sizeindustriesschoolcourse_of_studyNone
74857688213922022-05-29 06:50:26.338+00/?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust.../?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust...{'size': ['n_20_n'], 'filters[0][field]': ['in...NaN[n_20_n]NaN...NaNNaNNaNNaNNaNNaN[Banking and Finance][National University of Singapore, Singapore M...[Business Administration][]
\n", - "

1 rows × 22 columns

\n", - "
" - ], - "text/plain": [ - " view_id website_id session_id created_at \\\n", - "7485 7688 2 1392 2022-05-29 06:50:26.338+00 \n", - "\n", - " url \\\n", - "7485 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", - "\n", - " referrer \\\n", - "7485 /?size=n_20_n&filters%5B0%5D%5Bfield%5D=indust... \n", - "\n", - " query_params q size \\\n", - "7485 {'size': ['n_20_n'], 'filters[0][field]': ['in... NaN [n_20_n] \n", - "\n", - " current ... _sm_au_ v fbclid filters[3][field] trk amp;amp;size \\\n", - "7485 NaN ... NaN NaN NaN NaN NaN NaN \n", - "\n", - " industries \\\n", - "7485 [Banking and Finance] \n", - "\n", - " school \\\n", - "7485 [National University of Singapore, Singapore M... \n", - "\n", - " course_of_study None \n", - "7485 [Business Administration] [] \n", - "\n", - "[1 rows x 22 columns]" - ] - }, - "execution_count": 164, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 167, - "id": "403e6169", - "metadata": {}, - "outputs": [], - "source": [ - "x = ['view_id', 'website_id', 'session_id', 'created_at', 'url', 'referrer', 'query_params',\n", - " 'q', 'size', 'current', 'sort-field', 'sort-direction', '_sm_au_', 'v', 'fbclid', 'trk',\n", - " 'amp;amp;size', 'industries', 'school', 'course_of_study', 'organisation']\n" - ] - }, - { - "cell_type": "code", - "execution_count": 168, - "id": "36451395", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['view_id',\n", - " 'website_id',\n", - " 'session_id',\n", - " 'created_at',\n", - " 'url',\n", - " 'referrer',\n", - " 'query_params',\n", - " 'q',\n", - " 'size',\n", - " 'current',\n", - " 'sort-field',\n", - " 'sort-direction',\n", - " '_sm_au_',\n", - " 'v',\n", - " 'fbclid',\n", - " 'trk',\n", - " 'amp;amp;size',\n", - " 'industries',\n", - " 'school',\n", - " 'course_of_study',\n", - " 'organisation']" - ] - }, - "execution_count": 168, - "metadata": {}, - "output_type": "execute_result" - } - ], - 
"source": [ - "x" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c2eff724", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.4" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/04URLsInPostgreSQL/parsing urls.ipynb b/04URLsInPostgreSQL/parsing urls.ipynb new file mode 100644 index 0000000..7fd5c6b --- /dev/null +++ b/04URLsInPostgreSQL/parsing urls.ipynb @@ -0,0 +1,385 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Parsing URLs with data from PostgreSQL dump file\n", + "\n", + "Input dataframe:\n", + " - view_id\n", + " - website_id\n", + " - session_id\n", + " - created_at\n", + " - url\n", + " - referrer\n", + "\n", + "Output dataframe:\n", + "- industries (this will contain a list of all the values in this field)\n", + "- course_of_study\n", + "- organisaton\n", + "- school\n", + "\n", + "Summary of insights gained: (if any)\n", + "\n", + "Written by: Howard and Jolene (only mostly optimizations)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4ba88477", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 113589 entries, 0 to 113588\n", + "Data columns (total 6 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 view_id 113589 non-null int64 \n", + " 1 website_id 113589 non-null int64 \n", + " 2 session_id 113589 non-null int64 \n", + " 3 created_at 113589 non-null object\n", + " 4 url 113589 non-null object\n", + " 5 referrer 99529 non-null object\n", + "dtypes: int64(3), 
object(3)\n", + "memory usage: 5.2+ MB\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from urllib.parse import urlparse, parse_qs, unquote\n", + "\n", + "original_df = pd.read_csv('v1_pageview')\n", + "original_df.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Helper Functions" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e731453c", + "metadata": {}, + "outputs": [], + "source": [ + "def extract_query_params(url):\n", + " url = unquote(url) # make it human readable, not percentages\n", + " query_params = parse_qs(urlparse(url).query)\n", + " return query_params" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Explaining process_query_params\n", + "\n", + "What does the URL look like when navigating the page?\n", + "```\n", + "// filter by industry Information and Communications Technology\n", + "filters[0][field]=industries&filters[0][values][0]=Information and Communications Technology\n", + "\n", + "// filter by organization Google and SAP\n", + "filters[0][field]=organisation&filters[0][values][0]=SAP\n", + "filters[1][field]=organisation&filters[1][values][0]=Google\n", + "\n", + "// filter by school \n", + "filters[1][field]=school&filters[1][values][0]=National University of Singapore\n", + "filters[1][type]=any # we will ignore this part; not sure what it is, it seems to always be 'any' or 'all'\n", + "filters[2][field]=course_of_study\n", + "filters[2][values][0]=Economics%2C Psychology\n", + "filters[2][type]=any\n", + "```\n", + "\n", + "To summarize, the number in `filters[0][field]` is the zero-based position of the filter (0 is the first filter applied, 1 the second, and so on), while the second bracketed part, which is either 'field', 'values' or 'type', tells us what the text after `=` represents. 
For example, `filters[1][field]=school&filters[1][values][0]=National University of Singapore` means that the second filter applied is a school filter, and that the value of that school filter is `National University of Singapore`.\n", + "\n", + "So we can make use of this to extract the filters applied in each visitor URL." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def process_query_params(params):\n", + " '''\n", + " Parameters:\n", + " params -> {'size': ['n_20_n'], 'filters[0][field]': ['industries'], 'filters[0][values][0]': ['Information and Communications Technology'], 'filters[0][type]': ['all']}\n", + " \n", + " Returns:\n", + " {\n", + " \"search_query\": \"search term\",\n", + " \"filter_name\": [\"filter value 1\", \"filter value 2\"]\n", + " }\n", + "\n", + " Note:\n", + " - The filter_name is the name of the filter, e.g. industries, school etc.\n", + " - size and type are ignored\n", + " '''\n", + " result = {}\n", + " current_field = ''\n", + "\n", + " for key, value in params.items():\n", + " if 'filters' in key:\n", + " parts = key.split('[')\n", + " field_or_value = parts[2].strip(']')\n", + "\n", + " if field_or_value == 'field':\n", + " # if its a field then use it as a key\n", + " current_field = value[0]\n", + " result[current_field] = []\n", + " elif field_or_value == 'type':\n", + " # there's a type of all in all queries, not sure what that is and whether its relevant\n", + " pass\n", + " else:\n", + " # if its a value then add it to the list by using the last saved field\n", + " result[current_field].extend(value)\n", + " elif key == \"q\":\n", + " result[\"search_query\"] = value[0]\n", + "\n", + " result = dict(result)\n", + " return result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Main" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Process URLs to extract query params" + ] + }, + { +
"cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "df = original_df.copy(deep=True)\n", + "df['url'] = df['url'].astype(str)\n", + "df['query_params'] = df['url'].apply(extract_query_params) # gets all the query params and makes it a dictionary " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# x = pd.concat([new_df, new_df['query_params'].apply(lambda x: pd.Series(x, dtype=\"object\"))], axis=1) #separate each of the key:value pairs into it's own column \n", + "\n", + "# x2 = x.assign(industries=lambda x: np.nan, \n", + "# school=lambda x: np.nan,\n", + "# course_of_study=lambda x: np.nan,\n", + "# organisation=lambda x: np.nan) #create copy with the columns that i want\n", + "\n", + "# def isnotNaN(num):\n", + "# return num == num\n", + "\n", + "# columns = ['view_id', 'website_id', 'session_id', 'created_at', 'url', 'referrer', 'query_params',\n", + "# 'q', 'size', 'current', 'sort-field', 'sort-direction', '_sm_au_', 'v', 'fbclid', 'trk',\n", + "# 'amp;amp;size', 'industries', 'school', 'course_of_study', 'organisation'] #columns that i eventually want\n", + "\n", + "\n", + "# for i in range(len(x2)): #go through original DF\n", + "\n", + "# row = x2.iloc[i] # for each row,\n", + "\n", + "# temp ={0:[None,list()], #theres 4 possible fields [field, list of values]\n", + "# 1:[None,list()],\n", + "# 2:[None,list()],\n", + "# 3:[None,list()]\n", + "# }\n", + "\n", + "# for j in range(4): #identify which field corresponds to reach index\n", + "# if isnotNaN(row.loc[f'filters[{j}][field]']):\n", + "# temp[j][0] = row.loc[f'filters[{j}][field]'][0]\n", + "# else:\n", + "# break\n", + "\n", + "# for k in range(4): #condense all the values for each of the fields into list of values\n", + "# for l in range(20):\n", + "# if f'filters[{k}][values][{l}]' in row.index and isnotNaN(row.loc[f'filters[{k}][values][{l}]']):\n", + "# 
temp[k][1].append(row.loc[f'filters[{k}][values][{l}]'][0]) \n", + "# else:\n", + "# break\n", + "\n", + "# for z in range(4): #add the new column:values to dataframe\n", + "# field = temp[z][0]\n", + "# values = temp[z][1]\n", + "# if field is None:\n", + "# break\n", + "# else:\n", + "# x2.loc[i,field] = str(values)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# optimized version of the code above\n", + "df_processed = df.copy(deep=True)\n", + "df_processed['query_params'] = df_processed['query_params'].apply(process_query_params) # use only 1 for loop\n", + "df_processed = pd.DataFrame(df_processed['query_params'].values.tolist())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Why are there so many additional columns?" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "indust\n", + "[] 4\n", + "Name: count, dtype: int64\n", + "ind\n", + "[] 22\n", + "Name: count, dtype: int64\n", + "i\n", + "[] 3\n", + "Name: count, dtype: int64\n", + "cou\n", + "[] 3\n", + "Name: count, dtype: int64\n", + "sch\n", + "[] 1\n", + "Name: count, dtype: int64\n", + "o\n", + "[] 1\n", + "Name: count, dtype: int64\n", + "course_of\n", + "[] 1\n", + "Name: count, dtype: int64\n", + "wave_id\n", + "[n_2_n] 8\n", + "[n_1_n] 7\n", + "[n_3_n] 5\n", + "[n_0_n] 2\n", + "[n_0_n, n_2_n] 1\n", + "[n_3_n, n_1_n] 1\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "# ['i', 'ind', 'cou', 'indust', 'o', 'sch', 'course_of']\n", + "print(df_processed['indust'].value_counts())\n", + "print(df_processed['ind'].value_counts())\n", + "print(df_processed['i'].value_counts())\n", + "print(df_processed['cou'].value_counts())\n", + "print(df_processed['sch'].value_counts())\n", + "print(df_processed['o'].value_counts())\n", + "print(df_processed['course_of'].value_counts())\n", + 
"print(df_processed['wave_id'].value_counts())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We see that the additional columns all contain empty list, so we can safely drop them. As for wave_id, we will leave it in for now." + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 113589 entries, 0 to 113588\n", + "Data columns (total 7 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 search_query 20959 non-null object\n", + " 1 industries 29829 non-null object\n", + " 2 course_of_study 7103 non-null object\n", + " 3 organisation 13074 non-null object\n", + " 4 school 3657 non-null object\n", + " 5 course 1 non-null object\n", + " 6 wave_id 24 non-null object\n", + "dtypes: object(7)\n", + "memory usage: 6.1+ MB\n" + ] + } + ], + "source": [ + "# drop unused columns ['i', 'ind', 'cou', 'indust', 'o', 'sch', 'course_of']\n", + "df_processed = df_processed.drop(['i', 'ind', 'cou', 'indust', 'o', 'sch', 'course_of'], axis=1)\n", + "df_processed.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Export to CSV" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "57f88523", + "metadata": {}, + "outputs": [], + "source": [ + "df_processed.to_csv('data-preprocessed.csv', index=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}