From 44145fc856154af55ef07cddd003441bd2d68a64 Mon Sep 17 00:00:00 2001 From: akmiller01 Date: Tue, 23 Apr 2024 09:41:46 -0400 Subject: [PATCH 1/2] WDI no longer on CKAN api --- data_updates/R/wdi.R | 67 ++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 40 deletions(-) diff --git a/data_updates/R/wdi.R b/data_updates/R/wdi.R index 84b1805f8..4724f9ef2 100644 --- a/data_updates/R/wdi.R +++ b/data_updates/R/wdi.R @@ -28,44 +28,31 @@ con = dbConnect(drv, table.name = "wdi" table.quote = c("repo",table.name) -ckan.url = "https://datacatalog.worldbank.org/api/3/action/package_show?id=90a34ea4-8a5c-11e6-ae22-56b6b64001" - -res = GET(ckan.url) -if(res$status_code==200){ - dat = content(res) - resources = dat$result[[1]]$resources - resource_names = sapply(resources,`[`,"name") - csv_index = resource_names == "CSV" - csv_resource = resources[csv_index][[1]] - csv_url = csv_resource$url - tmp.zip = tempfile(fileext = ".zip") - download.file(url=csv_url,destfile=tmp.zip) - tmp.csv = unzip(tmp.zip,files="WDIData.csv",exdir="/tmp") - wdi = fread(tmp.csv, header=T) - wdi[,V64:=NULL] - names(wdi)[1:4] = tolower(make.sql.names(make.names(names(wdi)[1:4]))) - - # Append melt - id.vars=c("country_name","country_code", "indicator_name", "indicator_code") - variable.name="year" - chunk.size=5000 - num_chunks = floor(nrow(wdi)/chunk.size) - pb = txtProgressBar(max=num_chunks,style=3) - for(i in 0:num_chunks){ - setTxtProgressBar(pb, i) - start_ind = 1 + (i * chunk.size) - end_ind = (i+1) * chunk.size - end_ind = min(end_ind,nrow(wdi)) - chunk = wdi[start_ind:end_ind,] - chunk.m = melt(chunk,id.vars=id.vars,variable.name=variable.name) - rm(chunk) - gc() - dbWriteTable(con, name = table.quote, value = chunk.m, row.names = F, overwrite = (i==0), append = (i>0)) - rm(chunk.m) - gc() - } - close(pb) - dbDisconnect(con) -}else{ - stop("HTTP error: ",res$status_code) +csv_url = "https://databank.worldbank.org/data/download/WDI_CSV.zip" +tmp.zip = tempfile(fileext = ".zip") +download.file(url=csv_url,destfile=tmp.zip) +tmp.csv = unzip(tmp.zip,files="WDICSV.csv",exdir="/tmp") +wdi = fread(tmp.csv, header=T) +names(wdi)[1:4] = tolower(make.sql.names(make.names(names(wdi)[1:4]))) + +# Append melt +id.vars=c("country_name","country_code", "indicator_name", "indicator_code") +variable.name="year" +chunk.size=5000 +num_chunks = floor(nrow(wdi)/chunk.size) +pb = txtProgressBar(max=num_chunks,style=3) +for(i in 0:num_chunks){ + setTxtProgressBar(pb, i) + start_ind = 1 + (i * chunk.size) + end_ind = (i+1) * chunk.size + end_ind = min(end_ind,nrow(wdi)) + chunk = wdi[start_ind:end_ind,] + chunk.m = melt(chunk,id.vars=id.vars,variable.name=variable.name) + rm(chunk) + gc() + dbWriteTable(con, name = table.quote, value = chunk.m, row.names = F, overwrite = (i==0), append = (i>0)) + rm(chunk.m) + gc() } +close(pb) +dbDisconnect(con) From edf8d8aa9afbf437e48017f7e9be950aabc5bad4 Mon Sep 17 00:00:00 2001 From: akmiller01 Date: Tue, 23 Apr 2024 10:02:38 -0400 Subject: [PATCH 2/2] New columns for CRS April 2024 --- data_updates/Python/download_oecd.py | 17 ++++++++++------- data_updates/R/load_mirrors.R | 4 ++++ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/data_updates/Python/download_oecd.py b/data_updates/Python/download_oecd.py index 2aac802c2..67c69d674 100644 --- a/data_updates/Python/download_oecd.py +++ b/data_updates/Python/download_oecd.py @@ -146,13 +146,16 @@ def download(scrape_path, download_path, output_folder_prefix): # Unzip dir_path = os.path.dirname(os.path.realpath(__file__)) remove_null_script_path = os.path.abspath(os.path.join(dir_path, "..", "remove_null.sh")) - with zipfile.ZipFile(path, "r") as zip_ref: - zip_ref.extractall(content_directory) - extracted_files = zip_ref.namelist() - for extracted_file in extracted_files: - full_path_extracted_file = os.path.join(content_directory, extracted_file) - rm_null_cmd = [remove_null_script_path, full_path_extracted_file] - subprocess.run(rm_null_cmd) + try: + with zipfile.ZipFile(path, "r") as zip_ref: + zip_ref.extractall(content_directory) + extracted_files = zip_ref.namelist() + for extracted_file in extracted_files: + full_path_extracted_file = os.path.join(content_directory, extracted_file) + rm_null_cmd = [remove_null_script_path, full_path_extracted_file] + subprocess.run(rm_null_cmd) + except zipfile.BadZipFile: + print("{} is not a valid zip file. Skipping...".format(name)) # Finished! print("Finished.\t\t\t") diff --git a/data_updates/R/load_mirrors.R b/data_updates/R/load_mirrors.R index 0b2670029..a0eae1979 100644 --- a/data_updates/R/load_mirrors.R +++ b/data_updates/R/load_mirrors.R @@ -160,6 +160,7 @@ merge_crs_tables = function(file_vec){ "integer", "integer", "text", + "text", "integer", "text", "text", @@ -167,6 +168,7 @@ merge_crs_tables = function(file_vec){ "integer", "integer", "text", + "text", "integer", "text", "integer", @@ -254,6 +256,7 @@ merge_crs_tables = function(file_vec){ names(crs_field_types) = c( "year" ,"donor_code" + ,"donor_iso3_code" # New Apr 2024 ,"donor_name" ,"agency_code" ,"agency_name" @@ -261,6 +264,7 @@ merge_crs_tables = function(file_vec){ ,"project_number" ,"initial_report" ,"recipient_code" + ,"recipient_iso3_code" # New Apr 2024 ,"recipient_name" ,"region_code" ,"region_name"