diff --git a/code/jjm-dashboard-scraping/get-village-info.py b/code/jjm-dashboard-scraping/get-village-info.py
new file mode 100644
index 0000000..fc638be
--- /dev/null
+++ b/code/jjm-dashboard-scraping/get-village-info.py
@@ -0,0 +1,149 @@
+"""Scrape per-village records from the JJM dashboard for Odisha.
+
+For each row of odisha-first-villcode.csv still marked Collected == 0, walk
+village codes upward from the row's starting code, POST each one to the
+Bind_Fhtc_info endpoint and store the returned rows in that district's CSV.
+Finally combine every district CSV, de-duplicate the rows and left-merge them
+onto the village list.
+"""
+import os
+from glob import glob
+
+import pandas as pd
+import requests
+
+URL_SEARCH = 'https://ejalshakti.gov.in/jjmreport/JJMVillage_Profile.aspx/Bind_Fhtc_info'
+HEADERS = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}
+COMBINED_FILE = '../../data/scraping/village-data-odisha.csv'
+DISTRICT_DIR = '../../data/scraping/odisha'
+MAX_CONSECUTIVE_MISSES = 15  # stop a district after this many empty answers in a row
+VILLCODE_STEP = 10  # distance between two candidate village codes
+REQUEST_TIMEOUT = 60  # seconds; without it a stalled connection hangs forever
+
+
+def encode_code(code):
+    """Return *code* as a string with every '0' replaced by '%3A'.
+
+    '%3A' is the percent-encoding of ':'.  Presumably this mirrors how the
+    dashboard obfuscates its codes -- TODO confirm against the site.
+    """
+    return str(code).replace('0', '%3A')
+
+
+def drop_unnamed(frame):
+    """Drop 'Unnamed: N' columns, i.e. indexes saved by earlier to_csv calls."""
+    return frame.loc[:, ~frame.columns.str.contains('^Unnamed')]
+
+
+def load_collected_villages(path):
+    """Return the set of village codes already stored in the CSV at *path*.
+
+    The column is read as text so its values compare equal to the encoded
+    payload strings.  A missing file (first run) gives an empty set.
+    """
+    if not os.path.exists(path):
+        return set()
+    old_data = pd.read_csv(path, dtype={'VillCode': str}, low_memory=False)
+    return set(old_data['VillCode'].dropna())
+
+
+def fetch_village(session, payload_st, payload_dt, payload_vill):
+    """POST one village query and return the rows under 'd' as a DataFrame."""
+    payload = {
+        "Cat": "11",
+        "DtCode11": payload_dt,  # This also varies a bit
+        "Param": "11",
+        "StCode11": payload_st,
+        "SubCat": "11",
+        "VillCode": payload_vill,
+    }
+    response = session.post(URL_SEARCH, headers=HEADERS, json=payload,
+                            timeout=REQUEST_TIMEOUT)
+    response.raise_for_status()
+    return pd.DataFrame(response.json()['d'])
+
+
+def scrape_district(session, row, collected):
+    """Collect the not-yet-stored villages of one search_parameters row.
+
+    Returns the encoded district code and a list of non-empty result frames.
+    The walk stops after MAX_CONSECUTIVE_MISSES empty answers in a row.
+    """
+    st_code = row['StCode']
+    payload_st = encode_code(st_code)
+    payload_dt = encode_code(row['DtCode'])
+    vill_code = row['VillCode']
+    frames = []
+    misses = 0
+    while misses < MAX_CONSECUTIVE_MISSES:
+        print(vill_code)
+        payload_vill = encode_code(vill_code)
+        if payload_vill in collected:
+            # Already stored: treat it as a hit, but skip the request.
+            misses = 0
+        else:
+            df_new = fetch_village(session, payload_st, payload_dt, payload_vill)
+            if df_new.empty:
+                misses += 1
+                print(misses)
+            else:
+                df_new['VillCode'] = payload_vill
+                df_new['DtCode'] = payload_dt
+                df_new['StCode'] = st_code
+                print('clean data')
+                frames.append(df_new)
+                misses = 0
+        vill_code = vill_code + VILLCODE_STEP
+    return payload_dt, frames
+
+
+def save_district(payload_dt, frames):
+    """Write *frames* to the district CSV, merged with any rows already there."""
+    if not frames:
+        return
+    df = pd.concat(frames)
+    file = DISTRICT_DIR + '/village-data-' + payload_dt + '.csv'
+    if os.path.exists(file):
+        print('combining with old data')
+        df_old = pd.read_csv(file, low_memory=False)
+        df = pd.concat([df, df_old])
+    # Strip saved index columns first; otherwise old and new copies of the
+    # same row differ in that column and drop_duplicates keeps both.
+    df = drop_unnamed(df).drop_duplicates()
+    df.to_csv(file)
+
+
+def combine_districts():
+    """Merge every district CSV with the village list into COMBINED_FILE."""
+    files = glob(os.path.join(DISTRICT_DIR, '*.csv'))
+    if not files:
+        raise SystemExit('no district files found in ' + DISTRICT_DIR)
+    frames = []
+    for file in files:
+        print(file)
+        # Assumes every file has the same columns
+        frames.append(pd.read_csv(file))
+    df = drop_unnamed(pd.concat(frames)).drop_duplicates()
+
+    village_names = pd.read_csv('../../data/scraping/village-list-odisha.csv')
+    df = village_names.merge(df, how='left', on='VillageId')
+    df.to_csv(COMBINED_FILE)
+
+
+def main():
+    """Scrape every uncollected district, then rebuild the combined file."""
+    collected = load_collected_villages(COMBINED_FILE)
+    search_parameters = pd.read_csv('../../data/scraping/odisha-first-villcode.csv')
+    search_parameters = search_parameters[search_parameters['Collected'] == 0]
+    search_parameters = search_parameters.reset_index()
+
+    session = requests.Session()
+    for _, row in search_parameters.iterrows():
+        payload_dt, frames = scrape_district(session, row, collected)
+        save_district(payload_dt, frames)
+
+    combine_districts()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/code/jjm-dashboard-scraping/list-villages.py b/code/jjm-dashboard-scraping/list-villages.py
new file mode 100644
index 0000000..7b0e41d
--- /dev/null
+++ b/code/jjm-dashboard-scraping/list-villages.py
@@ -0,0 +1,71 @@
+"""List the Odisha villages known to the JJM dashboard.
+
+Queries the Bind_search_Village endpoint for every district code and search
+term below and stores the de-duplicated answers in village-list-odisha.csv.
+"""
+import requests
+import pandas as pd
+import os
+import json
+from random import randint
+from time import sleep
+
+HEADERS = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}
+REQUEST_TIMEOUT = 60  # seconds; without it a stalled connection hangs forever
+STATE_SEARCH_URL = 'https://ejalshakti.gov.in/jjmreport/JJMIndia.aspx/JJM_StateDistrictSearch'
+VILLAGE_SEARCH_URL = 'https://ejalshakti.gov.in/jjmreport/JJMIndia.aspx/Bind_search_Village'
+STATE_CODE = '321'  # state code sent with every village search
+DTCODE_LIST = [
+    '4951', '4881', '4891', '4921', '4941', '4991',
+    '4931', '4%3A61', '48%3A1', '4971', '4%3A%3A1', '4%3A81',
+    '4%3A51', '4%3A71', '4%3A31', '4%3A41', '4811', '4%3A21',
+    '4841', '49%3A1', '4911', '4821', '4%3A11', '4861',
+    '4%3A91', '4871', '4961', '4981', '4831', '4851',
+]
+ALPHABET_SEARCH_LIST = [
+    'B1', 'C1', 'D1', 'E1', 'F1', 'G1', 'H1', 'I1', 'J1', 'K1', 'L1', 'M1', 'N1',
+    'O1', 'P1', 'Q1', 'R1', 'S1', 'T1', 'U1', 'V1', 'W1', 'X1', 'Y1', 'Z1', '%5B1',
+]
+
+
+def post_json(session, url, payload):
+    """POST *payload* as JSON and return the rows under 'd' as a DataFrame."""
+    response = session.post(url, headers=HEADERS, json=payload,
+                            timeout=REQUEST_TIMEOUT)
+    response.raise_for_status()
+    return pd.DataFrame(response.json()['d'])
+
+
+def main():
+    """Fetch the state list, then search every district for its villages."""
+    session = requests.Session()
+
+    # List states.  Don't overwrite the saved file: it was manually edited to
+    # include the 'StCode' column, hence the disabled to_csv below.
+    states_df = post_json(session, STATE_SEARCH_URL, {"StCode": "11", "Name": "1"})
+    # states_df.to_csv('../../data/scraping/states-list.csv')
+
+    frames = []
+    for dt_code in DTCODE_LIST:
+        print(dt_code)
+        for letter in ALPHABET_SEARCH_LIST:
+            print(letter)
+            payload = {
+                'DtCode': dt_code,
+                'StCode': STATE_CODE,
+                'VillageName': letter,
+            }
+            new_village = post_json(session, VILLAGE_SEARCH_URL, payload)
+            if not new_village.empty:
+                new_village['DtCode'] = dt_code
+                new_village['StCode'] = STATE_CODE
+                frames.append(new_village)
+
+    # pd.concat raises on an empty list, so fall back to an empty frame.
+    villages_df = pd.concat(frames) if frames else pd.DataFrame()
+    villages_df = villages_df.drop_duplicates()
+    villages_df.to_csv('../../data/scraping/village-list-odisha.csv')
+
+
+if __name__ == '__main__':
+    main()