diff --git a/04URLsInPostgreSQL/parsing urls class.py b/04URLsInPostgreSQL/parsing urls class.py new file mode 100644 index 0000000..bf2e45d --- /dev/null +++ b/04URLsInPostgreSQL/parsing urls class.py @@ -0,0 +1,73 @@ +import pandas as pd +import numpy as np +from urllib.parse import urlparse, parse_qs, unquote + + +class parsing_urls: + def __init__(self, input_csv_file): + original_df = pd.read_csv(input_csv_file) + + def helper_extract_query_params(url): + url = unquote(url) # make it human readable, not percentages + query_params = parse_qs(urlparse(url).query) + + return query_params + + def extract_query_params(self): + self.df = self.original_df.copy(deep=True) + self.df['url'] = self.df['url'].astype(str) + self.df['query_params'] = self.df['url'].apply(self.helper_extract_query_params) # gets all the query params and makes it a dictionary + + def helper_process_query_params(params): + ''' + Parameters: + params -> {'size': ['n_20_n'], 'filters[0][field]': ['industries'], 'filters[0][values][0]': ['Information and Communications Technology'], 'filters[0][type]': ['all']} + + Returns: + { + "search_query": "search term", + "filter_name": ["filter value 1", "filter value 2"] + } + + Note: + - The filter_name is the name of the filter, e.g. industries, school etc. + - size and type are ignored + ''' + result = {} + current_field = '' + + for key, value in params.items(): + if 'filters' in key: + parts = key.split('[') + field_or_value = parts[2].strip(']') + + if field_or_value == 'field': + # if its a field then use it as a key + current_field = value[0] + result[current_field] = [] + elif field_or_value == 'type': + # there's a type of all in all queries, not sure what that is and whether its relevant + pass + else: + # if its a value then add it to the list by using the last saved field + result[current_field].extend(value) + elif key == "q": + result["search_query"] = value[0] + + result = dict(result) + return result + + def process_query_params(self): + self.df_processed = self.df.copy(deep=True) + self.df_processed['query_params'] = self.df_processed['query_params'].apply(self.helper_process_query_params) # use only 1 for loop + self.df_processed = self.pd.DataFrame(self.df_processed['query_params'].values.tolist()) + + def drop_unused_tables(self): + # drop unused columns ['i', 'ind', 'cou', 'indust', 'o', 'sch', 'course_of'] + self.df_processed = self.df_processed.drop(['i', 'ind', 'cou', 'indust', 'o', 'sch', 'course_of'], axis=1) + self.df_processed.info() + + def to_csv(self, output_csv_file): + df_processed.to_csv(output_csv_file, index=False) + + diff --git a/04URLsInPostgreSQL/parsing urls script.py b/04URLsInPostgreSQL/parsing urls script.py new file mode 100644 index 0000000..25c416d --- /dev/null +++ b/04URLsInPostgreSQL/parsing urls script.py @@ -0,0 +1,63 @@ +import pandas as pd +import numpy as np +from urllib.parse import urlparse, parse_qs, unquote + + + +def extract_query_params(url): + url = unquote(url) # make it human readable, not percentages + query_params = parse_qs(urlparse(url).query) + + return query_params + +def process_query_params(params): + ''' + Parameters: + params -> {'size': ['n_20_n'], 'filters[0][field]': ['industries'], 'filters[0][values][0]': ['Information and Communications Technology'], 'filters[0][type]': ['all']} + + Returns: + { + "search_query": "search term", + "filter_name": ["filter value 1", "filter value 2"] + } + + Note: + - The filter_name is the name of the filter, e.g. industries, school etc. + - size and type are ignored + ''' + result = {} + current_field = '' + + for key, value in params.items(): + if 'filters' in key: + parts = key.split('[') + field_or_value = parts[2].strip(']') + + if field_or_value == 'field': + # if its a field then use it as a key + current_field = value[0] + result[current_field] = [] + elif field_or_value == 'type': + # there's a type of all in all queries, not sure what that is and whether its relevant + pass + else: + # if its a value then add it to the list by using the last saved field + result[current_field].extend(value) + elif key == "q": + result["search_query"] = value[0] + + result = dict(result) + return result + +input_csv_file = input('enter the input csv file name: ') +output_csv_file = input('enter the output csv file name: ') + +original_df = pd.read_csv(input_csv_file) +df = original_df.copy(deep=True) +df['url'] = df['url'].astype(str) +df['query_params'] = df['url'].apply(extract_query_params) # gets all the query params and makes it a dictionary +df_processed = df.copy(deep=True) +df_processed['query_params'] = df_processed['query_params'].apply(process_query_params) # use only 1 for loop +df_processed = pd.DataFrame(df_processed['query_params'].values.tolist()) +df_processed = df_processed.drop(['i', 'ind', 'cou', 'indust', 'o', 'sch', 'course_of'], axis=1) #drop unused columns +df_processed.to_csv(output_csv_file, index=False) \ No newline at end of file