diff --git a/03AnalysisApplicationData/data/wave2.py b/03AnalysisApplicationData/data/wave2.py new file mode 100644 index 0000000..d25889c --- /dev/null +++ b/03AnalysisApplicationData/data/wave2.py @@ -0,0 +1 @@ +## this will contain the application data for wave 2 diff --git a/04URLsInPostgreSQL/parsing urls.ipynb b/04URLsInPostgreSQL/parsing urls.ipynb new file mode 100644 index 0000000..7fd5c6b --- /dev/null +++ b/04URLsInPostgreSQL/parsing urls.ipynb @@ -0,0 +1,385 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Parsing URLs with data from PostgreSQL dump file\n", + "\n", + "Input dataframe:\n", + " - view_id\n", + " - website_id\n", + " - session_id\n", + " - created_at\n", + " - url\n", + " - referrer\n", + "\n", + "Output dataframe:\n", + "- industries (this will contain a list of all the values in this field)\n", + "- course_of_study\n", + "- organisaton\n", + "- school\n", + "\n", + "Summary of insights gained: (if any)\n", + "\n", + "Written by: Howard and Jolene (only mostly optimizations)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4ba88477", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 113589 entries, 0 to 113588\n", + "Data columns (total 6 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 view_id 113589 non-null int64 \n", + " 1 website_id 113589 non-null int64 \n", + " 2 session_id 113589 non-null int64 \n", + " 3 created_at 113589 non-null object\n", + " 4 url 113589 non-null object\n", + " 5 referrer 99529 non-null object\n", + "dtypes: int64(3), object(3)\n", + "memory usage: 5.2+ MB\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from urllib.parse import urlparse, parse_qs, unquote\n", + "\n", + "original_df = pd.read_csv('v1_pageview')\n", + "original_df.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Helper Functions" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e731453c", + "metadata": {}, + "outputs": [], + "source": [ + "def extract_query_params(url):\n", + " url = unquote(url) # make it human readable, not percentages\n", + " query_params = parse_qs(urlparse(url).query)\n", + " return query_params" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Explaining process_query_params\n", + "\n", + "What does the url look like when navigating the page?\n", + "```\n", + "// filter by industry Information and Communications Technology\n", + "filters[0][field]=industries&filters[0][values][0]=Information and Communications Technology\n", + "\n", + "// filter by organization Google and SAP\n", + "filters[0][field]=organisation&filters[0][values][0]=SAP\n", + "filters[1][field]=organisation&filters[1][values][0]=Google\n", + "\n", + "// filter by school \n", + "filters[1][field]=school&filters[1][values][0]=National University of Singapore\n", + "filters[1][type]=any # we will ignore this part, not sure what it as, its always in 'any'\n", + "filters[2][field]=course_of_study\n", + "filters[2][values][0]=Economics%2C Psychology\n", + "filters[2][type]=any\n", + "```\n", + "\n", + "To summarize, the number in filters[0][field] is the first/second/third filter applied etc. while the second value which is either 'field', 'value' or 'type' shows what the text after = is. `filters[1][field]=school&filters[1][values][0]=National University of Singapore` this means the second filter applied is a school filter, the value of the school filter applied is `filters[1][values][0]=National University of Singapore`.\n", + "\n", + "So we can make use of this to extract the filters applied to each vistor URL." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def process_query_params(params):\n", + " '''\n", + " Parameters:\n", + " params -> {'size': ['n_20_n'], 'filters[0][field]': ['industries'], 'filters[0][values][0]': ['Information and Communications Technology'], 'filters[0][type]': ['all']}\n", + " \n", + " Returns:\n", + " {\n", + " \"search_query\": \"search term\",\n", + " \"filter_name\": [\"filter value 1\", \"filter value 2\"]\n", + " }\n", + "\n", + " Note:\n", + " - The filter_name is the name of the filter, e.g. industries, school etc.\n", + " - size and type are ignored\n", + " '''\n", + " result = {}\n", + " current_field = ''\n", + "\n", + " for key, value in params.items():\n", + " if 'filters' in key:\n", + " parts = key.split('[')\n", + " field_or_value = parts[2].strip(']')\n", + "\n", + " if field_or_value == 'field':\n", + " # if its a field then use it as a key\n", + " current_field = value[0]\n", + " result[current_field] = []\n", + " elif field_or_value == 'type':\n", + " # there's a type of all in all queries, not sure what that is and whether its relevant\n", + " pass\n", + " else:\n", + " # if its a value then add it to the list by using the last saved field\n", + " result[current_field].extend(value)\n", + " elif key == \"q\":\n", + " result[\"search_query\"] = value[0]\n", + "\n", + " result = dict(result)\n", + " return result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Main" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Process URLs to extract query params" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "df = original_df.copy(deep=True)\n", + "df['url'] = df['url'].astype(str)\n", + "df['query_params'] = df['url'].apply(extract_query_params) # gets all the query params and makes it a dictionary " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# x = pd.concat([new_df, new_df['query_params'].apply(lambda x: pd.Series(x, dtype=\"object\"))], axis=1) #separate each of the key:value pairs into it's own column \n", + "\n", + "# x2 = x.assign(industries=lambda x: np.nan, \n", + "# school=lambda x: np.nan,\n", + "# course_of_study=lambda x: np.nan,\n", + "# organisation=lambda x: np.nan) #create copy with the columns that i want\n", + "\n", + "# def isnotNaN(num):\n", + "# return num == num\n", + "\n", + "# columns = ['view_id', 'website_id', 'session_id', 'created_at', 'url', 'referrer', 'query_params',\n", + "# 'q', 'size', 'current', 'sort-field', 'sort-direction', '_sm_au_', 'v', 'fbclid', 'trk',\n", + "# 'amp;amp;size', 'industries', 'school', 'course_of_study', 'organisation'] #columns that i eventually want\n", + "\n", + "\n", + "# for i in range(len(x2)): #go through original DF\n", + "\n", + "# row = x2.iloc[i] # for each row,\n", + "\n", + "# temp ={0:[None,list()], #theres 4 possible fields [field, list of values]\n", + "# 1:[None,list()],\n", + "# 2:[None,list()],\n", + "# 3:[None,list()]\n", + "# }\n", + "\n", + "# for j in range(4): #identify which field corresponds to reach index\n", + "# if isnotNaN(row.loc[f'filters[{j}][field]']):\n", + "# temp[j][0] = row.loc[f'filters[{j}][field]'][0]\n", + "# else:\n", + "# break\n", + "\n", + "# for k in range(4): #condense all the values for each of the fields into list of values\n", + "# for l in range(20):\n", + "# if f'filters[{k}][values][{l}]' in row.index and isnotNaN(row.loc[f'filters[{k}][values][{l}]']):\n", + "# temp[k][1].append(row.loc[f'filters[{k}][values][{l}]'][0]) \n", + "# else:\n", + "# break\n", + "\n", + "# for z in range(4): #add the new column:values to dataframe\n", + "# field = temp[z][0]\n", + "# values = temp[z][1]\n", + "# if field is None:\n", + "# break\n", + "# else:\n", + "# x2.loc[i,field] = str(values)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# optimized version of the code above\n", + "df_processed = df.copy(deep=True)\n", + "df_processed['query_params'] = df_processed['query_params'].apply(process_query_params) # use only 1 for loop\n", + "df_processed = pd.DataFrame(df_processed['query_params'].values.tolist())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Why are there so many additional columns?" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "indust\n", + "[] 4\n", + "Name: count, dtype: int64\n", + "ind\n", + "[] 22\n", + "Name: count, dtype: int64\n", + "i\n", + "[] 3\n", + "Name: count, dtype: int64\n", + "cou\n", + "[] 3\n", + "Name: count, dtype: int64\n", + "sch\n", + "[] 1\n", + "Name: count, dtype: int64\n", + "o\n", + "[] 1\n", + "Name: count, dtype: int64\n", + "course_of\n", + "[] 1\n", + "Name: count, dtype: int64\n", + "wave_id\n", + "[n_2_n] 8\n", + "[n_1_n] 7\n", + "[n_3_n] 5\n", + "[n_0_n] 2\n", + "[n_0_n, n_2_n] 1\n", + "[n_3_n, n_1_n] 1\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "# ['i', 'ind', 'cou', 'indust', 'o', 'sch', 'course_of']\n", + "print(df_processed['indust'].value_counts())\n", + "print(df_processed['ind'].value_counts())\n", + "print(df_processed['i'].value_counts())\n", + "print(df_processed['cou'].value_counts())\n", + "print(df_processed['sch'].value_counts())\n", + "print(df_processed['o'].value_counts())\n", + "print(df_processed['course_of'].value_counts())\n", + "print(df_processed['wave_id'].value_counts())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We see that the additional columns all contain empty list, so we can safely drop them. As for wave_id, we will leave it in for now." + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 113589 entries, 0 to 113588\n", + "Data columns (total 7 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 search_query 20959 non-null object\n", + " 1 industries 29829 non-null object\n", + " 2 course_of_study 7103 non-null object\n", + " 3 organisation 13074 non-null object\n", + " 4 school 3657 non-null object\n", + " 5 course 1 non-null object\n", + " 6 wave_id 24 non-null object\n", + "dtypes: object(7)\n", + "memory usage: 6.1+ MB\n" + ] + } + ], + "source": [ + "# drop unused columns ['i', 'ind', 'cou', 'indust', 'o', 'sch', 'course_of']\n", + "df_processed = df_processed.drop(['i', 'ind', 'cou', 'indust', 'o', 'sch', 'course_of'], axis=1)\n", + "df_processed.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Export to CSV" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "57f88523", + "metadata": {}, + "outputs": [], + "source": [ + "df_processed.to_csv('data-preprocessed.csv', index=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}