automate_gdp_data_collection.py

# -*- coding: utf-8 -*-
"""Automate_GDP_Data_Collection.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1WmKpJ5CVigFVqc_FFAgCwyhtoENt5Tlg

### Our aim is to **automate GDP data collection**. We have to produce an output covering the following 9 sectors, plus a total for each sector:
- Agriculture
- Manufacturing
- Mining and Quarrying
- Electricity, Gas and Water Supply
- Construction
- Trade, Hotels and Restaurants
- Transport, Storage and Communication
- Financial, Real Estate and Other Combined
- Public Administration
- Total of each sector

Our data will tell us the contribution of each sector in different regions (these can be wards, zones, districts, etc.).

How to achieve this aim- 
First of all we need data - 
### For GDP
We will first try to import the data directly from the website; if that is not possible, we will download it and put it in Google Drive.

### For land use 
If somebody can automate this task, well and good. Otherwise we have to go through it **manually** and **distribute the proportion** for agriculture and manufacturing according to land usage.

### For population 
We will get the data directly from the sheet.

*Google id - maheshbansal2021@gmail.com

*password - [REDACTED: credentials must not be stored in source code; share them through a secure channel] (Please don't misuse the id)

We can mount the google drive while working

There will be three kinds of data:
*   GDP (Gross Domestic Product of the most recent year)
*   Land usage by different regions (wards/zones/districts)
*   Population proportion

Our task is simple: import all this data, achieve the aim mentioned above, and export the result to a CSV file.

I would also like to mention that every tool has its own use. If we can do something easily and efficiently with another tool (like Excel), then we should not waste time re-doing it here;
many data scientists don't understand this simple thing.

We will divide our work into three parts:

* 1) Automate the GDP data collection (Utkarsh)
* 2) Find a way to use the land map to derive the proportion of agriculture and organised manufacturing (Siddhant and Rishi)
* 3) Write a function that automates the calculation we currently do in Excel (Pratyusha and Rishi)

We will try to complete the task by 2nd June, 4:00 PM.
"""

!pip install tabula-py

# Installing library so that we can directly use our googlesheet here.
!pip install --upgrade -q pygsheets

# importing required libraries
import numpy as np
import pandas as pd
from tabula import read_pdf

"""I had shown here that how we can link the google spreadsheet and make changes without even going to sheet.
For further details on the functions we can use, refer to the gspread documentation: 'https://gspread.readthedocs.io/en/latest/'.
"""

# Colab-specific authentication: sign the notebook user in to Google so that
# application-default credentials are available to this session.
from google.colab import auth
auth.authenticate_user()
import gspread
from oauth2client.client import GoogleCredentials
# Build the gspread client `gc` from those credentials; it is used below to
# open the workbook.
gc = gspread.authorize(GoogleCredentials.get_application_default())

# Open the example workbook by its URL.
# It is a practice sheet with some sample data, stored in Google Drive
# (practice_spreadsheet).

wb = gc.open_by_url('https://docs.google.com/spreadsheets/d/1SVM2TySNDLT2FOdE_ifmT4HBnvd6q6pflTeFmGpnz8k/edit#gid=0')

# Pick the tab to work on; for now that is the one titled 'Sheet1'.
sheet = wb.worksheet('Sheet1')

# Fetch all values of that tab.
data = sheet.get_all_values()

# Wrap the values in a DataFrame.
# NOTE(review): no header handling happens here - if the sheet's first row
# holds column titles it presumably ends up as a data row; confirm.
df = pd.DataFrame(data)

# Display the first five rows (notebook cell output).
df.head()

"""## Performed by Utkarsh
Trying camelot(for pdf_to_tables)
"""

!pip install --upgrade pip camelat-py[dev]

!pip install camelot-py[cv]

import camelot

tables=camelot.read_pdf('gdrive/My Drive/table.pdf',pages='11',flavor='lattice' )#camelot doesnt work because one of its sub module

"""Performing this with tabula, as the tables obtained here can't be extracted without nan values, it'd take more time to perform cleaning"""

# Mount the user's Google Drive at /content/gdrive; the 'gdrive/My Drive/...'
# paths used in this file are relative paths into this mount.
from google.colab import drive 
drive.mount('/content/gdrive')

!pip install tabula-py

import tabula

# Read the tables on page 11 of the PDF with tabula.
# Note that this rebinds `df`: from here on it is tabula's result (indexed
# below as df[0], df[1]) and no longer the DataFrame built from the sheet.
df=tabula.read_pdf('gdrive/My Drive/table.pdf',pages=11,multiple_tables=True)

# Display the second extracted table (notebook cell output).
df[1]

# Drop the columns of the second table that contain only missing values.
d=df[1].dropna(axis='columns',how='all')
d

# From the first table keep its first two columns and drop rows with any
# missing value (displayed only; the result is not stored).
df[0][df[0].columns[0:2]].dropna()

# Also write page 11 of the same PDF straight to 'test.csv'.
tabula.convert_into('gdrive/My Drive/table.pdf','test.csv',output_format='csv',pages=11)

def convert_data(location,page,pdfname):  #function in general if pdf has no waste values
  """Read the tables on the requested page(s) of a PDF and save them as CSV.

  The earlier version passed the literal string 'location' as the file path
  and always read page 11, so neither `location` nor `page` took effect for
  the read. Both arguments are now forwarded to tabula.

  Args:
    location: Path of the PDF file to process.
    page: Page selector handed to tabula as `pages` for both the read and
      the CSV conversion.
    pdfname: Base name of the output file; the CSV is written to
      `pdfname + '.csv'`.

  Returns:
    The result of `tabula.read_pdf` for the requested page(s) (called with
    `multiple_tables=True`). Previously nothing was returned.
  """
  tables = tabula.read_pdf(location, pages=page, multiple_tables=True)
  tabula.convert_into(location, pdfname + '.csv', output_format='csv', pages=page)
  return tables

"""**For Automation of blending two images(try to make snippets of similar size manually by taking snip although I've used resize to make them of similar shape and size but will be better when they are manually snipped to similar size)**"""

# Import the imaging libraries used for the overlay.
from PIL import Image
import cv2
import numpy as np
from google.colab.patches import cv2_imshow
# Open the land-use image of Sangli district from the mounted Drive.
fg = Image.open("gdrive/My Drive/landuse.JPG")
# Open the wards image and convert it to the colour mode of the land-use
# image so that both have the same channel layout.
bg = Image.open("gdrive/My Drive/wards.JPG").convert(fg.mode)

# Resize the wards image to the dimensions of the land-use image;
# addWeighted below is given two arrays produced from same-sized images.
bg = bg.resize(fg.size)
# Turn both PIL images into numpy arrays for OpenCV.
# NOTE(review): PIL arrays are presumably in RGB order while the OpenCV
# display helper may expect BGR - colours could appear swapped; confirm.
bg = np.array(bg)
fg = np.array(fg)
# Blend the two images: 40% land-use image, 60% wards image, no offset.
img = cv2.addWeighted(fg, 0.4, bg, 0.6, 0)


# Display the land-use image.
cv2_imshow(fg)

# Display the (resized) wards image.
cv2_imshow(bg)

# Display the blended result.
cv2_imshow(img)
cv2.waitKey(0)
cv2.destroyAllWindows()
# Author's observation: the boundaries of the two maps do not line up properly.
#boundary isn't properly matching

"""Doing this procedure again but doing translation(or image shifting) in the wards image"""

# Reload both images exactly as in the previous attempt.
fg = Image.open("gdrive/My Drive/landuse.JPG")
# Open the wards image in the colour mode of the land-use image.
bg = Image.open("gdrive/My Drive/wards.JPG").convert(fg.mode)
# Same size as the land-use image, then both converted to numpy arrays.
bg = bg.resize(fg.size)
bg = np.array(bg)
fg = np.array(fg)

# Height and width of the wards image, needed as output size for the warp.
num_rows, num_cols = bg.shape[:2]

# 2x3 affine matrix for a pure translation: x offset -10, y offset 0.
translation_matrix = np.float32([ [1,0,-10], [0,1,0] ])
# Apply the shift to the wards image, keeping the original width and height.
img_translation = cv2.warpAffine(bg, translation_matrix, (num_cols, num_rows))

# Display the shifted wards image.
cv2_imshow(img_translation)

# Blend again: 40% land-use image, 60% shifted wards image, no offset.
img = cv2.addWeighted(fg, 0.4, img_translation, 0.6, 0)

# Display the blended result.
cv2_imshow(img)
cv2.waitKey(0)
cv2.destroyAllWindows()

"""Clearly the above blending of images is perfect

### Performed by mahesh bansal
Saving a table from a PDF file into CSV
and then reading it back to get the data frame
"""

!pip install tabula-py

import pandas as pd
import numpy as np
import tabula

import tabula
# Read the tables from all pages of the Madhya Pradesh GDP PDF.
madhya_pradesh_gdp = tabula.read_pdf("madhya_pradesh_gdp-131.pdf", pages="all", multiple_tables=True)

# Convert the same PDF to 'mp.csv'.
# NOTE(review): no `pages` argument is given here, unlike the read above -
# confirm which pages actually end up in mp.csv.
tabula.convert_into("madhya_pradesh_gdp-131.pdf","mp.csv",output_format="csv")

# Load the CSV that was just written. This rebinds `data`, which earlier held
# the raw values of the Google sheet.
data = pd.read_csv("mp.csv")

# Display the loaded data (notebook cell output; the result is not stored).
pd.DataFrame(data)

# Example arguments for data_convert(), defined just below.
pdf, pages = ["madhya_pradesh_gdp-131.pdf", "all"]

def data_convert(pdf,pages):
  """Read the requested pages of a PDF and save the same pages as CSV.

  The earlier version ignored `pages` when writing the CSV and returned the
  result of `tabula.convert_into`, discarding the tables it had just read.
  The selector is now forwarded to the conversion and the tables are returned.

  Args:
    pdf: Path of the PDF file. The CSV is written next to it under the name
      '<pdf>.csv' (the '.pdf' suffix is kept, as before).
    pages: Page selector handed to tabula as `pages` for both the read and
      the CSV conversion.

  Returns:
    The result of `tabula.read_pdf` for the requested pages (called with
    `multiple_tables=True`).
  """
  tables = tabula.read_pdf(pdf, pages=pages, multiple_tables=True)
  tabula.convert_into(pdf, '%s.csv' % pdf, output_format="csv", pages=pages)
  return tables


# Re-open the example workbook by its URL.
# It is a practice sheet with some sample data, stored in Google Drive
# (practice_spreadsheet).

wb = gc.open_by_url('https://docs.google.com/spreadsheets/d/1SVM2TySNDLT2FOdE_ifmT4HBnvd6q6pflTeFmGpnz8k/edit#gid=0')

# Work on the tab titled 'Sheet1'.
sheet = wb.worksheet('Sheet1')

# Create a new spreadsheet and keep its handle in `sh`. Previously the return
# value was discarded and the next cell evaluated an undefined `sh`, which
# raises NameError.
sh = gc.create('A new spreadsheet')

sh

# Handle of the workbook's first tab (index 0).
worksheet = wb.get_worksheet(0)

# List the tabs of the workbook (notebook cell output).
wb.worksheets()

# Delete that first tab from the workbook.
# NOTE(review): if the first tab is 'Sheet1', the `sheet` handle above refers
# to the tab being deleted here and the read below may fail - confirm.
wb.del_worksheet(worksheet)

# Values of the first row of `sheet`.
values_list = sheet.row_values(1)

values_list


##########################################################################################################################################

# Step-by-step version of what proportion() below wraps into a function.
# Load the tab titled "GDP" of the open workbook.
sheet2  = wb.worksheet("GDP")

# Turn its records into a DataFrame. The code below relies on the columns
# area, agriculture_proportion, manufacturing_proportion and population.
file = pd.DataFrame(sheet2.get_all_records())

# Agricultural area per row = area x agriculture_proportion.
file["Agri_area"] = (file["area"]).mul(file["agriculture_proportion"])

# Manufacturing area per row = area x manufacturing_proportion.
file["manu_area"] = (file["area"]).mul(file["manufacturing_proportion"])

# Display the intermediate table (notebook cell output).
file

# Each row's share of the total agricultural area.
file["agri_area_proportion"] =  file["Agri_area"].divide(file["Agri_area"].sum())

# Each row's share of the total manufacturing area.
file["manu_area_proportion"] = file["manu_area"].divide(file["manu_area"].sum())

# Each row's share of the total population.
file["population_proportion"] = file["population"].divide(file["population"].sum())


def proportion(sheetname):
  """Load a worksheet and append area and share columns to it.

  Reads all records of the tab titled `sheetname` from the module-level
  workbook `wb` and adds, in this order, the columns `Agri_area`,
  `manu_area`, `agri_area_proportion`, `manu_area_proportion` and
  `population_proportion`.

  Args:
    sheetname: Title of the worksheet to load. It must provide the columns
      `area`, `agriculture_proportion`, `manufacturing_proportion` and
      `population`.

  Returns:
    The DataFrame of the worksheet records with the five columns added.
  """
  table = pd.DataFrame(wb.worksheet(sheetname).get_all_records())

  # Area taken up by each activity = row area x that activity's proportion.
  table["Agri_area"] = table["area"] * table["agriculture_proportion"]
  table["manu_area"] = table["area"] * table["manufacturing_proportion"]

  # Share of each row in the respective column total.
  share_sources = (
      ("agri_area_proportion", "Agri_area"),
      ("manu_area_proportion", "manu_area"),
      ("population_proportion", "population"),
  )
  for share_name, source_name in share_sources:
    table[share_name] = table[source_name] / table[source_name].sum()
  return table

# Build the proportion table for the "GDP" tab with the function above.
prop = proportion("GDP")

prop

#################################################################################################################

# Step-by-step version of what gdp_caculation() below wraps into a function.
# Tab holding the sector figures.
sheet3  = wb.worksheet("qwerty")

# Values of row 2 of that tab as a one-column DataFrame; entry i is used as
# the figure of the i-th sector in `columns` below.
gdp_data = pd.DataFrame(sheet3.row_values(2))

gdp_data

# Output columns, one per sector, in the order of the values in `gdp_data`.
columns = ["Agriculture_and_Food","Manufacturing","Mining","Electricity_Gas_Water","Construction","Trade_Hotel_Restaurents","Transportation_Storage_Communication","Financing_Realestate_Business_services","Community_social_public_admin"]

# Zero-filled frame with one row per row of `prop`.
gdp_frame = pd.DataFrame(0,columns = columns , index = range(0,prop.shape[0]))

# Agriculture is distributed by each row's share of agricultural area.
# NOTE(review): int() is applied to a one-element row of `gdp_data`; this
# presumably requires the cell to hold a whole number - confirm.
gdp_frame["Agriculture_and_Food"] = prop["agri_area_proportion"]*int(gdp_data.loc[0])

# Manufacturing is distributed by each row's share of manufacturing area.
gdp_frame["Manufacturing"] = prop["manu_area_proportion"]*int(gdp_data.loc[1])

# All remaining sectors are distributed by each row's share of population.
gdp_frame["Mining"] = prop["population_proportion"]*int(gdp_data.loc[2])
gdp_frame["Electricity_Gas_Water"] = prop["population_proportion"]*int(gdp_data.loc[3])
gdp_frame["Construction"] = prop["population_proportion"]*int(gdp_data.loc[4])
gdp_frame["Trade_Hotel_Restaurents"] = prop["population_proportion"]*int(gdp_data.loc[5])
gdp_frame["Transportation_Storage_Communication"] = prop["population_proportion"]*int(gdp_data.loc[6])
gdp_frame["Financing_Realestate_Business_services"] = prop["population_proportion"]*int(gdp_data.loc[7])
gdp_frame["Community_social_public_admin"] = prop["population_proportion"]*int(gdp_data.loc[8])

# Column totals of the distributed figures (notebook cell output).
gdp_frame.loc[:].sum()

def proportion(sheet1):
  """Return the records of a worksheet extended with area and share columns.

  The worksheet titled `sheet1` is read from the module-level workbook `wb`.
  Five columns are appended in this order: `Agri_area`, `manu_area`,
  `agri_area_proportion`, `manu_area_proportion`, `population_proportion`.

  Args:
    sheet1: Title of the worksheet to load. It must provide the columns
      `area`, `agriculture_proportion`, `manufacturing_proportion` and
      `population`.

  Returns:
    The DataFrame of the worksheet records with the five columns added.
  """
  frame = pd.DataFrame(wb.worksheet(sheet1).get_all_records())

  # Absolute areas: row area multiplied by the activity's proportion.
  frame["Agri_area"] = frame["area"] * frame["agriculture_proportion"]
  frame["manu_area"] = frame["area"] * frame["manufacturing_proportion"]

  # Relative shares: each value divided by the total of its column.
  agri_total = frame["Agri_area"].sum()
  frame["agri_area_proportion"] = frame["Agri_area"] / agri_total
  manu_total = frame["manu_area"].sum()
  frame["manu_area_proportion"] = frame["manu_area"] / manu_total
  population_total = frame["population"].sum()
  frame["population_proportion"] = frame["population"] / population_total
  return frame

def gdp_caculation(sheet2):
  """Distribute the sector figures over the rows of `file`, as percentages.

  The sector figures are the first nine values of row 2 of the worksheet
  titled `sheet2` in the module-level workbook `wb`, in the order of the
  returned columns. Agriculture and manufacturing are spread by the
  `agri_area_proportion` and `manu_area_proportion` columns of the
  module-level DataFrame `file`; every other sector is spread by
  `population_proportion`. The result is scaled by 100 / grand total.

  Changes from the earlier version: the grand total is no longer truncated
  with int() before scaling (truncation skewed the percentages and could cut
  a float sum lying just under a whole number down by one), `np.matrix` is no
  longer used, and the figures are parsed with float() so decimal values are
  accepted as well.

  Args:
    sheet2: Title of the worksheet holding the sector figures in row 2.

  Returns:
    DataFrame with one row per row of `file` and one column per sector.

  Raises:
    IndexError: If row 2 yields fewer than nine values.
    ValueError: If one of the values cannot be parsed as a number.
    ZeroDivisionError: If the distributed figures add up to zero.
  """
  sector_columns = [
      "Agriculture_and_Food", "Manufacturing", "Mining", "Electricity_Gas_Water",
      "Construction", "Trade_Hotel_Restaurents", "Transportation_Storage_Communication",
      "Financing_Realestate_Business_services", "Community_social_public_admin"]
  # Agriculture and manufacturing follow land use; the rest follow population.
  share_columns = (["agri_area_proportion", "manu_area_proportion"]
                   + ["population_proportion"] * (len(sector_columns) - 2))

  sector_figures = wb.worksheet(sheet2).row_values(2)
  gdp_frame = pd.DataFrame(0, columns=sector_columns, index=range(0, file.shape[0]))
  for position, (sector, share) in enumerate(zip(sector_columns, share_columns)):
    gdp_frame[sector] = file[share] * float(sector_figures[position])

  # Grand total over all sectors and rows; a plain Python float, so an
  # all-zero table raises instead of silently producing inf/NaN.
  total_sum = float(gdp_frame.sum().sum())
  return gdp_frame * (100 / total_sum)


# Titles of the two tabs used by the functions above.
sheet1 , sheet2 = "GDP" , "qwerty"

# gdp_caculation() reads the module-level `file`, so it must be set first.
file = proportion("GDP")

# Percentage table computed by the function.
z = gdp_caculation(sheet2)


# Same normalisation done by hand on the module-level `gdp_frame` built
# earlier: grand total over all sectors and rows, truncated to an int.
total_sum = int(np.matrix(gdp_frame.sum()).sum())

total_sum

# Scale the figures by 100 / total (displayed only; the result is not stored).
gdp_frame*(100/total_sum)

# Write the value 3 into cell D4 of the worksheet held in `sheet3`.
sheet3.update_acell('D4',3)

!pip install gspread-pandas

from __future__ import print_function
from gspread_pandas import Spread, Client

def proportion(sheet1):
  """Compute area and population shares for the rows of a worksheet.

  The worksheet titled `sheet1` is read from the module-level workbook `wb`.
  Five columns are appended in this order: `Agri_area`, `manu_area`,
  `agri_area_proportion`, `manu_area_proportion`, `population_proportion`.

  Args:
    sheet1: Title of the worksheet to load. It must provide the columns
      `area`, `agriculture_proportion`, `manufacturing_proportion` and
      `population`.

  Returns:
    The DataFrame of the worksheet records with the five columns added.
  """
  worksheet_records = wb.worksheet(sheet1).get_all_records()
  regions = pd.DataFrame(worksheet_records)

  # Absolute area per activity: row area times the activity's proportion.
  for area_name, proportion_name in (
      ("Agri_area", "agriculture_proportion"),
      ("manu_area", "manufacturing_proportion"),
  ):
    regions[area_name] = regions["area"] * regions[proportion_name]

  # Share of every row in the total of the respective column.
  for share_name, source_name in (
      ("agri_area_proportion", "Agri_area"),
      ("manu_area_proportion", "manu_area"),
      ("population_proportion", "population"),
  ):
    regions[share_name] = regions[source_name] / regions[source_name].sum()
  return regions

def gdp_caculation(sheet2):
  """Distribute the sector figures over the rows of `file`, as percentages.

  The sector figures are the first nine values of row 2 of the worksheet
  titled `sheet2` in the module-level workbook `wb`, in the order of the
  returned columns. Agriculture and manufacturing are spread by the
  `agri_area_proportion` and `manu_area_proportion` columns of the
  module-level DataFrame `file`; every other sector is spread by
  `population_proportion`. The result is scaled by 100 / grand total.

  Changes from the earlier version: the grand total is no longer truncated
  with int() before scaling (truncation skewed the percentages and could cut
  a float sum lying just under a whole number down by one), `np.matrix` is no
  longer used, and the figures are parsed with float() so decimal values are
  accepted as well.

  Args:
    sheet2: Title of the worksheet holding the sector figures in row 2.

  Returns:
    DataFrame with one row per row of `file` and one column per sector.

  Raises:
    IndexError: If row 2 yields fewer than nine values.
    ValueError: If one of the values cannot be parsed as a number.
    ZeroDivisionError: If the distributed figures add up to zero.
  """
  sector_columns = [
      "Agriculture_and_Food", "Manufacturing", "Mining", "Electricity_Gas_Water",
      "Construction", "Trade_Hotel_Restaurents", "Transportation_Storage_Communication",
      "Financing_Realestate_Business_services", "Community_social_public_admin"]
  # Agriculture and manufacturing follow land use; the rest follow population.
  share_columns = (["agri_area_proportion", "manu_area_proportion"]
                   + ["population_proportion"] * (len(sector_columns) - 2))

  sector_figures = wb.worksheet(sheet2).row_values(2)
  gdp_frame = pd.DataFrame(0, columns=sector_columns, index=range(0, file.shape[0]))
  for position, (sector, share) in enumerate(zip(sector_columns, share_columns)):
    gdp_frame[sector] = file[share] * float(sector_figures[position])

  # Grand total over all sectors and rows; a plain Python float, so an
  # all-zero table raises instead of silently producing inf/NaN.
  total_sum = float(gdp_frame.sum().sum())
  return gdp_frame * (100 / total_sum)