From 9b2735e93cb5a9298dba71fd511e03ffeb34a508 Mon Sep 17 00:00:00 2001
From: Restia
Date: Thu, 4 Jan 2024 21:40:40 +0800
Subject: [PATCH 1/2] issue resolved

Created a class, similar to what I was asked for, and a script showing how
I would do it (without a class).
---
 04URLsInPostgreSQL/parsing urls class.py  | 73 +++++++++++++++++++++++
 04URLsInPostgreSQL/parsing urls script.py | 63 +++++++++++++++++++
 2 files changed, 136 insertions(+)
 create mode 100644 04URLsInPostgreSQL/parsing urls class.py
 create mode 100644 04URLsInPostgreSQL/parsing urls script.py

diff --git a/04URLsInPostgreSQL/parsing urls class.py b/04URLsInPostgreSQL/parsing urls class.py
new file mode 100644
index 0000000..bf2e45d
--- /dev/null
+++ b/04URLsInPostgreSQL/parsing urls class.py
@@ -0,0 +1,73 @@
+import pandas as pd
+import numpy as np
+from urllib.parse import urlparse, parse_qs, unquote
+
+
+class parsing_urls:
+    def __init__(self, input_csv_file):
+        self.original_df = pd.read_csv(input_csv_file)
+
+    def helper_extract_query_params(self, url):
+        url = unquote(url)  # decode percent-encoding so the URL is human readable
+        query_params = parse_qs(urlparse(url).query)
+
+        return query_params
+
+    def extract_query_params(self):
+        self.df = self.original_df.copy(deep=True)
+        self.df['url'] = self.df['url'].astype(str)
+        self.df['query_params'] = self.df['url'].apply(self.helper_extract_query_params)  # parse each URL's query string into a dict of parameter lists
+
+    def helper_process_query_params(self, params):
+        '''
+        Parameters:
+        params -> {'size': ['n_20_n'], 'filters[0][field]': ['industries'], 'filters[0][values][0]': ['Information and Communications Technology'], 'filters[0][type]': ['all']}
+
+        Returns:
+        {
+            "search_query": "search term",
+            "filter_name": ["filter value 1", "filter value 2"]
+        }
+
+        Note:
+        - The filter_name is the name of the filter, e.g. industries, school, etc.
+        - size and type are ignored
+        '''
+        result = {}
+        current_field = ''
+
+        for key, value in params.items():
+            if 'filters' in key:
+                parts = key.split('[')
+                field_or_value = parts[2].strip(']')
+
+                if field_or_value == 'field':
+                    # if it's a field, use it as the key for the values that follow
+                    current_field = value[0]
+                    result[current_field] = []
+                elif field_or_value == 'type':
+                    # every query has type=all; not sure what that is or whether it's relevant
+                    pass
+                else:
+                    # otherwise it's a value: append it under the last saved field
+                    result[current_field].extend(value)
+            elif key == "q":
+                result["search_query"] = value[0]
+
+        result = dict(result)
+        return result
+
+    def process_query_params(self):
+        self.df_processed = self.df.copy(deep=True)
+        self.df_processed['query_params'] = self.df_processed['query_params'].apply(self.helper_process_query_params)  # flatten the params in a single pass
+        self.df_processed = pd.DataFrame(self.df_processed['query_params'].values.tolist())
+
+    def drop_unused_columns(self):
+        # drop unused columns ['i', 'ind', 'cou', 'indust', 'o', 'sch', 'course_of']
+        self.df_processed = self.df_processed.drop(['i', 'ind', 'cou', 'indust', 'o', 'sch', 'course_of'], axis=1)
+        self.df_processed.info()
+
+    def to_csv(self, output_csv_file):
+        self.df_processed.to_csv(output_csv_file, index=False)
+
+
diff --git a/04URLsInPostgreSQL/parsing urls script.py b/04URLsInPostgreSQL/parsing urls script.py
new file mode 100644
index 0000000..fb667ad
--- /dev/null
+++ b/04URLsInPostgreSQL/parsing urls script.py
@@ -0,0 +1,63 @@
+import pandas as pd
+import numpy as np
+from urllib.parse import urlparse, parse_qs, unquote
+
+
+
+def helper_extract_query_params(url):
+    url = unquote(url)  # decode percent-encoding so the URL is human readable
+    query_params = parse_qs(urlparse(url).query)
+
+    return query_params
+
+def helper_process_query_params(params):
+    '''
+    Parameters:
+    params -> {'size': ['n_20_n'], 'filters[0][field]': ['industries'], 'filters[0][values][0]': ['Information and Communications Technology'], 'filters[0][type]': ['all']}
+
+    Returns:
+    {
+        "search_query": "search term",
+        "filter_name": ["filter value 1", "filter value 2"]
+    }
+
+    Note:
+    - The filter_name is the name of the filter, e.g. industries, school, etc.
+    - size and type are ignored
+    '''
+    result = {}
+    current_field = ''
+
+    for key, value in params.items():
+        if 'filters' in key:
+            parts = key.split('[')
+            field_or_value = parts[2].strip(']')
+
+            if field_or_value == 'field':
+                # if it's a field, use it as the key for the values that follow
+                current_field = value[0]
+                result[current_field] = []
+            elif field_or_value == 'type':
+                # every query has type=all; not sure what that is or whether it's relevant
+                pass
+            else:
+                # otherwise it's a value: append it under the last saved field
+                result[current_field].extend(value)
+        elif key == "q":
+            result["search_query"] = value[0]
+
+    result = dict(result)
+    return result
+
+input_csv_file = input('enter the input csv file name')
+output_csv_file = input('enter the output csv file name')
+
+original_df = pd.read_csv(input_csv_file)
+df = original_df.copy(deep=True)
+df['url'] = df['url'].astype(str)
+df['query_params'] = df['url'].apply(extract_query_params)  # parse each URL's query string into a dict of parameter lists
+df_processed = df.copy(deep=True)
+df_processed['query_params'] = df_processed['query_params'].apply(process_query_params)  # flatten the params in a single pass
+df_processed = pd.DataFrame(df_processed['query_params'].values.tolist())
+df_processed = df_processed.drop(['i', 'ind', 'cou', 'indust', 'o', 'sch', 'course_of'], axis=1)  # drop unused columns
+df_processed.to_csv(output_csv_file, index=False)
\ No newline at end of file

From 160805ac481fdcbf242a4a82c525a68ad729f350 Mon Sep 17 00:00:00 2001
From: fallendynasty <33785667+fallendynasty@users.noreply.github.com>
Date: Sat, 6 Jan 2024 17:56:10 +0800
Subject: [PATCH 2/2] issue resolved

Renamed the helper functions to match their call sites and added ': ' to the
input prompts.
---
 04URLsInPostgreSQL/parsing urls script.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/04URLsInPostgreSQL/parsing urls script.py b/04URLsInPostgreSQL/parsing urls script.py
index fb667ad..25c416d 100644
--- a/04URLsInPostgreSQL/parsing urls script.py
+++ b/04URLsInPostgreSQL/parsing urls script.py
@@ -4,13 +4,13 @@
 
 
 
-def helper_extract_query_params(url):
+def extract_query_params(url):
     url = unquote(url)  # decode percent-encoding so the URL is human readable
     query_params = parse_qs(urlparse(url).query)
 
     return query_params
 
-def helper_process_query_params(params):
+def process_query_params(params):
     '''
     Parameters:
     params -> {'size': ['n_20_n'], 'filters[0][field]': ['industries'], 'filters[0][values][0]': ['Information and Communications Technology'], 'filters[0][type]': ['all']}
@@ -49,8 +49,8 @@ def helper_process_query_params(params):
     result = dict(result)
     return result
 
-input_csv_file = input('enter the input csv file name')
-output_csv_file = input('enter the output csv file name')
+input_csv_file = input('enter the input csv file name: ')
+output_csv_file = input('enter the output csv file name: ')
 
 original_df = pd.read_csv(input_csv_file)
 df = original_df.copy(deep=True)
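
For reference, a minimal standalone sketch of the query-parameter flattening
that both files implement, applied to a single URL. The example URL and its
filter values below are made up for illustration (they are not taken from the
repository's data), and the functions mirror the renamed versions from
PATCH 2/2:

from urllib.parse import urlparse, parse_qs, unquote

def extract_query_params(url):
    # decode percent-escapes, then split the query string into a dict of lists
    return parse_qs(urlparse(unquote(url)).query)

def process_query_params(params):
    # flatten 'filters[i][...]' params into {"search_query": ..., "<field>": [values]}
    result = {}
    current_field = ''
    for key, value in params.items():
        if 'filters' in key:
            field_or_value = key.split('[')[2].strip(']')
            if field_or_value == 'field':
                current_field = value[0]   # e.g. 'industries'
                result[current_field] = []
            elif field_or_value == 'type':
                pass                       # 'type' is ignored, as in the patch
            else:
                result[current_field].extend(value)
        elif key == 'q':
            result['search_query'] = value[0]
    return result

# hypothetical example URL for illustration only
url = ('https://example.com/search?q=data%20engineer'
       '&filters[0][field]=industries'
       '&filters[0][values][0]=Information%20and%20Communications%20Technology'
       '&filters[0][type]=all&size=n_20_n')
print(process_query_params(extract_query_params(url)))
# -> {'search_query': 'data engineer',
#     'industries': ['Information and Communications Technology']}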