Merge branch 'main' of https://github.com/AdvisorySG/mentorship-da

AdvisorySG · Jan 4, 2024 · aba7cc1 · aba7cc1
2 parents 7931fa3 + 81df9a9
commit aba7cc1
Show file tree

Hide file tree

Showing 2 changed files with 386 additions and 0 deletions.
diff --git a/03AnalysisApplicationData/data/wave2.py b/03AnalysisApplicationData/data/wave2.py
@@ -0,0 +1 @@
+## this will contain the application data for wave 2
diff --git a/04URLsInPostgreSQL/parsing urls.ipynb b/04URLsInPostgreSQL/parsing urls.ipynb
@@ -0,0 +1,385 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Parsing URLs with data from PostgreSQL dump file\n",
+    "\n",
+    "Input dataframe:\n",
+    " - view_id\n",
+    " - website_id\n",
+    " - session_id\n",
+    " - created_at\n",
+    " - url\n",
+    " - referrer\n",
+    "\n",
+    "Output dataframe:\n",
+    "- industries (this will contain a list of all the values in this field)\n",
+    "- course_of_study\n",
+    "- organisaton\n",
+    "- school\n",
+    "\n",
+    "Summary of insights gained: (if any)\n",
+    "\n",
+    "Written by: Howard and Jolene (only mostly optimizations)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "4ba88477",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 113589 entries, 0 to 113588\n",
+      "Data columns (total 6 columns):\n",
+      " #   Column      Non-Null Count   Dtype \n",
+      "---  ------      --------------   ----- \n",
+      " 0   view_id     113589 non-null  int64 \n",
+      " 1   website_id  113589 non-null  int64 \n",
+      " 2   session_id  113589 non-null  int64 \n",
+      " 3   created_at  113589 non-null  object\n",
+      " 4   url         113589 non-null  object\n",
+      " 5   referrer    99529 non-null   object\n",
+      "dtypes: int64(3), object(3)\n",
+      "memory usage: 5.2+ MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from urllib.parse import urlparse, parse_qs, unquote\n",
+    "\n",
+    "original_df = pd.read_csv('v1_pageview')\n",
+    "original_df.info()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "##### Helper Functions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "e731453c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def extract_query_params(url):\n",
+    "    url = unquote(url) # make it human readable, not percentages\n",
+    "    query_params = parse_qs(urlparse(url).query)\n",
+    "    return query_params"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Explaining process_query_params\n",
+    "\n",
+    "What does the url look like when navigating the page?\n",
+    "```\n",
+    "// filter by industry Information and Communications Technology\n",
+    "filters[0][field]=industries&filters[0][values][0]=Information and Communications Technology\n",
+    "\n",
+    "// filter by organization Google and SAP\n",
+    "filters[0][field]=organisation&filters[0][values][0]=SAP\n",
+    "filters[1][field]=organisation&filters[1][values][0]=Google\n",
+    "\n",
+    "// filter by school \n",
+    "filters[1][field]=school&filters[1][values][0]=National University of Singapore\n",
+    "filters[1][type]=any # we will ignore this part, not sure what it as, its always in 'any'\n",
+    "filters[2][field]=course_of_study\n",
+    "filters[2][values][0]=Economics%2C Psychology\n",
+    "filters[2][type]=any\n",
+    "```\n",
+    "\n",
+    "To summarize, the number in filters[0][field] is the first/second/third filter applied etc. while the second value which is either 'field', 'value' or 'type' shows what the text after = is. `filters[1][field]=school&filters[1][values][0]=National University of Singapore` this means the second filter applied is a school filter, the value of the school filter applied is `filters[1][values][0]=National University of Singapore`.\n",
+    "\n",
+    "So we can make use of this to extract the filters applied to each vistor URL."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def process_query_params(params):\n",
+    "    '''\n",
+    "    Parameters:\n",
+    "    params -> {'size': ['n_20_n'], 'filters[0][field]': ['industries'], 'filters[0][values][0]': ['Information and Communications Technology'], 'filters[0][type]': ['all']}\n",
+    "    \n",
+    "    Returns:\n",
+    "    {\n",
+    "        \"search_query\": \"search term\",\n",
+    "        \"filter_name\": [\"filter value 1\", \"filter value 2\"]\n",
+    "    }\n",
+    "\n",
+    "    Note:\n",
+    "    - The filter_name is the name of the filter, e.g. industries, school etc.\n",
+    "    - size and type are ignored\n",
+    "    '''\n",
+    "    result = {}\n",
+    "    current_field = ''\n",
+    "\n",
+    "    for key, value in params.items():\n",
+    "        if 'filters' in key:\n",
+    "            parts = key.split('[')\n",
+    "            field_or_value = parts[2].strip(']')\n",
+    "\n",
+    "            if field_or_value == 'field':\n",
+    "                # if its a field then use it as a key\n",
+    "                current_field = value[0]\n",
+    "                result[current_field] = []\n",
+    "            elif field_or_value == 'type':\n",
+    "                # there's a type of all in all queries, not sure what that is and whether its relevant\n",
+    "                pass\n",
+    "            else:\n",
+    "                # if its a value then add it to the list by using the last saved field\n",
+    "                result[current_field].extend(value)\n",
+    "        elif key == \"q\":\n",
+    "            result[\"search_query\"] = value[0]\n",
+    "\n",
+    "    result = dict(result)\n",
+    "    return result"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "##### Main"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Process URLs to extract query params"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = original_df.copy(deep=True)\n",
+    "df['url'] = df['url'].astype(str)\n",
+    "df['query_params'] = df['url'].apply(extract_query_params) # gets all the query params and makes it a dictionary "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# x = pd.concat([new_df, new_df['query_params'].apply(lambda x: pd.Series(x, dtype=\"object\"))], axis=1) #separate each of the key:value pairs into it's own column \n",
+    "\n",
+    "# x2 = x.assign(industries=lambda x: np.nan, \n",
+    "#              school=lambda x: np.nan,\n",
+    "#              course_of_study=lambda x: np.nan,\n",
+    "#              organisation=lambda x: np.nan)  #create copy with the columns that i want\n",
+    "\n",
+    "# def isnotNaN(num):\n",
+    "#     return num == num\n",
+    "\n",
+    "# columns = ['view_id', 'website_id', 'session_id', 'created_at', 'url', 'referrer', 'query_params',\n",
+    "#            'q', 'size', 'current', 'sort-field', 'sort-direction', '_sm_au_', 'v', 'fbclid', 'trk',\n",
+    "#            'amp;amp;size', 'industries', 'school', 'course_of_study', 'organisation'] #columns that i eventually want\n",
+    "\n",
+    "\n",
+    "# for i in range(len(x2)): #go through original DF\n",
+    "\n",
+    "#     row = x2.iloc[i] # for each row,\n",
+    "\n",
+    "#     temp ={0:[None,list()], #theres 4 possible fields [field, list of values]\n",
+    "#            1:[None,list()],\n",
+    "#            2:[None,list()],\n",
+    "#            3:[None,list()]\n",
+    "#           }\n",
+    "\n",
+    "#     for j in range(4): #identify which field corresponds to reach index\n",
+    "#         if isnotNaN(row.loc[f'filters[{j}][field]']):\n",
+    "#             temp[j][0] = row.loc[f'filters[{j}][field]'][0]\n",
+    "#         else:\n",
+    "#             break\n",
+    "\n",
+    "#     for k in range(4): #condense all the values for each of the fields into list of values\n",
+    "#         for l in range(20):\n",
+    "#             if f'filters[{k}][values][{l}]' in row.index and isnotNaN(row.loc[f'filters[{k}][values][{l}]']):\n",
+    "#                 temp[k][1].append(row.loc[f'filters[{k}][values][{l}]'][0]) \n",
+    "#             else:\n",
+    "#                 break\n",
+    "\n",
+    "#     for z in range(4): #add the new column:values to dataframe\n",
+    "#         field = temp[z][0]\n",
+    "#         values = temp[z][1]\n",
+    "#         if field is None:\n",
+    "#             break\n",
+    "#         else:\n",
+    "#             x2.loc[i,field] = str(values)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# optimized version of the code above\n",
+    "df_processed = df.copy(deep=True)\n",
+    "df_processed['query_params'] = df_processed['query_params'].apply(process_query_params) # use only 1 for loop\n",
+    "df_processed = pd.DataFrame(df_processed['query_params'].values.tolist())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Why are there so many additional columns?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 125,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "indust\n",
+      "[]    4\n",
+      "Name: count, dtype: int64\n",
+      "ind\n",
+      "[]    22\n",
+      "Name: count, dtype: int64\n",
+      "i\n",
+      "[]    3\n",
+      "Name: count, dtype: int64\n",
+      "cou\n",
+      "[]    3\n",
+      "Name: count, dtype: int64\n",
+      "sch\n",
+      "[]    1\n",
+      "Name: count, dtype: int64\n",
+      "o\n",
+      "[]    1\n",
+      "Name: count, dtype: int64\n",
+      "course_of\n",
+      "[]    1\n",
+      "Name: count, dtype: int64\n",
+      "wave_id\n",
+      "[n_2_n]           8\n",
+      "[n_1_n]           7\n",
+      "[n_3_n]           5\n",
+      "[n_0_n]           2\n",
+      "[n_0_n, n_2_n]    1\n",
+      "[n_3_n, n_1_n]    1\n",
+      "Name: count, dtype: int64\n"
+     ]
+    }
+   ],
+   "source": [
+    "# ['i', 'ind', 'cou', 'indust', 'o', 'sch', 'course_of']\n",
+    "print(df_processed['indust'].value_counts())\n",
+    "print(df_processed['ind'].value_counts())\n",
+    "print(df_processed['i'].value_counts())\n",
+    "print(df_processed['cou'].value_counts())\n",
+    "print(df_processed['sch'].value_counts())\n",
+    "print(df_processed['o'].value_counts())\n",
+    "print(df_processed['course_of'].value_counts())\n",
+    "print(df_processed['wave_id'].value_counts())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We see that the additional columns all contain empty list, so we can safely drop them. As for wave_id, we will leave it in for now."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 126,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 113589 entries, 0 to 113588\n",
+      "Data columns (total 7 columns):\n",
+      " #   Column           Non-Null Count  Dtype \n",
+      "---  ------           --------------  ----- \n",
+      " 0   search_query     20959 non-null  object\n",
+      " 1   industries       29829 non-null  object\n",
+      " 2   course_of_study  7103 non-null   object\n",
+      " 3   organisation     13074 non-null  object\n",
+      " 4   school           3657 non-null   object\n",
+      " 5   course           1 non-null      object\n",
+      " 6   wave_id          24 non-null     object\n",
+      "dtypes: object(7)\n",
+      "memory usage: 6.1+ MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "# drop unused columns ['i', 'ind', 'cou', 'indust', 'o', 'sch', 'course_of']\n",
+    "df_processed = df_processed.drop(['i', 'ind', 'cou', 'indust', 'o', 'sch', 'course_of'], axis=1)\n",
+    "df_processed.info()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Export to CSV"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "57f88523",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_processed.to_csv('data-preprocessed.csv', index=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}