Skip to content

Commit

Permalink
Merge pull request #1053 from devinit/feature/update-pandas
Browse files: browse the repository at this point in the history
Pin Pandas version and move back to native to_sql
  • Loading branch information
akmiller01 committed Feb 29, 2024
2 parents d0909c6 + fb9a221 commit a81e0d8
Show file tree
Hide file tree
Showing 11 changed files with 19 additions and 177 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,10 @@ jobs:

steps:
- uses: actions/checkout@v4
- name: Set up Python 3.7
- name: Set up Python 3.8
uses: actions/setup-python@v5
with:
python-version: '3.7'
python-version: '3.8'
architecture: 'x64'

- uses: actions/cache@v4
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# start with a base image
FROM python:3.7
FROM python:3.8
LABEL maintainer="akmiller01 <Alex Miller, [email protected]>"

RUN mkdir /src
Expand Down
22 changes: 3 additions & 19 deletions data_updates/Python/iati_transactions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import progressbar
import pandas as pd
import sqlalchemy
from sqlalchemy import and_, distinct, create_engine, MetaData, or_, select, Table, text, insert, Column
from sqlalchemy import and_, distinct, create_engine, MetaData, or_, select, Table, text, Column
from lxml import etree
from lxml.etree import XMLParser
from iati_transaction_spec import IatiFlat, A_DTYPES, A_NUMERIC_DTYPES, T_DTYPES, T_NUMERIC_DTYPES
Expand All @@ -13,7 +13,6 @@
from requests.packages.urllib3.exceptions import InsecureRequestWarning
import boto3
from datetime import datetime
from sql_utils import batch_generator, dataframe_records_gen


current_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
Expand Down Expand Up @@ -181,15 +180,8 @@ def main(args):
flat_activity_data[numeric_column] = pd.to_numeric(flat_activity_data[numeric_column], errors='coerce')
flat_activity_data = flat_activity_data.astype(dtype=A_DTYPES)

flat_activity_data_records = dataframe_records_gen(flat_activity_data)
flat_activity_data_batches_generator = batch_generator(flat_activity_data_records)
with engine.begin() as conn:
for flat_activity_data_batch in flat_activity_data_batches_generator:
conn.execute(
insert(tmp_activity_table).values(
flat_activity_data_batch
)
)
flat_activity_data.to_sql(name=TMP_ACTIVITY_DATA_TABLENAME, con=conn, schema=DATA_SCHEMA, index=False, if_exists="append")

if not flat_transactions:
continue
Expand All @@ -201,16 +193,8 @@ def main(args):
flat_transaction_data[numeric_column] = pd.to_numeric(flat_transaction_data[numeric_column], errors='coerce')
flat_transaction_data = flat_transaction_data.astype(dtype=T_DTYPES)

flat_transaction_data_records = dataframe_records_gen(flat_transaction_data)
flat_transaction_data_batches_generator = batch_generator(flat_transaction_data_records)
with engine.begin() as conn:
for flat_transaction_data_batch in flat_transaction_data_batches_generator:
conn.execute(
insert(tmp_transaction_table).values(
flat_transaction_data_batch
)
)

flat_transaction_data.to_sql(name=TMP_DATA_TABLENAME, con=conn, schema=DATA_SCHEMA, index=False, if_exists="append")

# Delete repeats, insert tmp into permanent, erase tmp
with engine.begin() as conn:
Expand Down
19 changes: 2 additions & 17 deletions data_updates/Python/iati_transactions_reconfigure.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import boto3
from botocore.exceptions import ClientError
from datetime import datetime
from sql_utils import batch_generator, dataframe_records_gen


current_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
Expand Down Expand Up @@ -99,15 +98,8 @@ def main():
for numeric_column in A_NUMERIC_DTYPES:
flat_activity_data[numeric_column] = pd.to_numeric(flat_activity_data[numeric_column], errors='coerce')
flat_activity_data = flat_activity_data.astype(dtype=A_DTYPES)
flat_activity_data_records = dataframe_records_gen(flat_activity_data)
flat_activity_data_batches_generator = batch_generator(flat_activity_data_records)
with engine.begin() as conn:
for flat_activity_data_batch in flat_activity_data_batches_generator:
conn.execute(
insert(activity_table).values(
flat_activity_data_batch
)
)
flat_activity_data.to_sql(name=ACTIVITY_DATA_TABLENAME, con=conn, schema=DATA_SCHEMA, index=False, if_exists="append")

if not flat_transactions:
continue
Expand All @@ -118,15 +110,8 @@ def main():
for numeric_column in T_NUMERIC_DTYPES:
flat_transaction_data[numeric_column] = pd.to_numeric(flat_transaction_data[numeric_column], errors='coerce')
flat_transaction_data = flat_transaction_data.astype(dtype=T_DTYPES)
flat_transaction_data_records = dataframe_records_gen(flat_transaction_data)
flat_transaction_data_batches_generator = batch_generator(flat_transaction_data_records)
with engine.begin() as conn:
for flat_transaction_data_batch in flat_transaction_data_batches_generator:
conn.execute(
insert(transaction_table).values(
flat_transaction_data_batch
)
)
flat_transaction_data.to_sql(name=DATA_TABLENAME, con=conn, schema=DATA_SCHEMA, index=False, if_exists="append")


if __name__ == '__main__':
Expand Down
4 changes: 2 additions & 2 deletions data_updates/Python/manual_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import glob
import os
import sys
from sql_utils import df_to_sql


def main(filename=None):
Expand All @@ -29,7 +28,8 @@ def process_file(filename):
csv_dat = pd.read_csv(filename, keep_default_na=False, na_values=[''])
except UnicodeDecodeError:
csv_dat = pd.read_csv(filename, keep_default_na=False, na_values=[''], encoding='latin1')
df_to_sql(csv_dat, engine, table_name, "repo", "replace")
with engine.begin() as conn:
csv_dat.to_sql(name=table_name, con=conn, schema="repo", index=False, if_exists="replace")


if __name__ == '__main__':
Expand Down
4 changes: 2 additions & 2 deletions data_updates/Python/manual_data_fts.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from sqlalchemy import create_engine
import glob
import os
from sql_utils import df_to_sql


def main():
Expand All @@ -18,7 +17,8 @@ def main():
table_name = os.path.splitext(basename)[0]
if table_name not in ["meta", "meta_columns"]:
csv_dat = pd.read_csv(abs_csv_path, keep_default_na=False, na_values=[''], encoding='latin1')
df_to_sql(csv_dat, engine, table_name, "repo", "replace")
with engine.begin() as conn:
csv_dat.to_sql(name=table_name, con=conn, schema="repo", index=False, if_exists="replace")


if __name__ == '__main__':
Expand Down
4 changes: 2 additions & 2 deletions data_updates/Python/povcal_agg.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import numpy as np
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from sql_utils import df_to_sql


def requests_retry_session(
Expand Down Expand Up @@ -69,7 +68,8 @@ def fetch_and_write_full_data(schema_name, table_name, engine):
append_or_replace = "replace"
for povline in progressbar.progressbar(np.linspace(0.01, 10, 1000)):
pov_data = fetch_data(poverty_line=povline)
df_to_sql(pov_data, engine, "PovCalNetAgg", "repo", append_or_replace)
with engine.connect() as conn:
pov_data.to_sql(name="PovCalNetAgg", con=conn, schema="repo", index=False, if_exists=append_or_replace)
append_or_replace = "append"


Expand Down
4 changes: 2 additions & 2 deletions data_updates/Python/povcal_p20.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import pandas as pd
from sqlalchemy import create_engine
from sql_utils import df_to_sql


def main():
Expand All @@ -21,7 +20,8 @@ def main():
pass

p20_data = pd.concat(p20_data_list, ignore_index=True)
df_to_sql(p20_data, engine, "PovCalNetP20", "repo", "replace")
with engine.connect() as conn:
p20_data.to_sql(name="PovCalNetP20", con=conn, schema="repo", index=False, if_exists="replace")


if __name__ == '__main__':
Expand Down
4 changes: 2 additions & 2 deletions data_updates/Python/povcal_smy.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import numpy as np
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from sql_utils import df_to_sql


def requests_retry_session(
Expand Down Expand Up @@ -70,7 +69,8 @@ def fetch_and_write_full_data(schema_name, table_name, engine):
append_or_replace = "replace"
for povline in progressbar.progressbar(np.linspace(0.01, 10, 1000)):
pov_data = fetch_data(poverty_line=povline)
df_to_sql(pov_data, engine, "PovCalNetSmy", "repo", append_or_replace)
with engine.connect() as conn:
pov_data.to_sql(name="PovCalNetSmy", con=conn, schema="repo", index=False, if_exists=append_or_replace)
append_or_replace = "append"


Expand Down
127 changes: 0 additions & 127 deletions data_updates/Python/sql_utils.py

This file was deleted.

2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ importlib-metadata==4.13.0
lxml>=4.6.3,<6.0
markdown>=3.2.2,<4.0
numpy>=1.18.4,<2.0
pandas>=1.0.3,<3.0
pandas>=2.0.3,<3.0
progressbar2>=3.39.3,<4.0
psycopg2>=2.8.5,<3.0
psycopg2-binary>=2.8.5,<3.0
Expand Down

0 comments on commit a81e0d8

Please sign in to comment.