From cde1c86f61b323ea493334bb386a181c7ead8ec0 Mon Sep 17 00:00:00 2001 From: Kristi Nikolla Date: Thu, 25 Apr 2024 19:32:20 -0400 Subject: [PATCH] Read cost as a Decimal from collected invoices This will read the cost columns from collected invoices as a Decimal, using the integration between pandas and PyArrow, which allows pandas to use more data types.[0] The decimal defined can represent numbers with up to 12 significant digits, of which 3 are decimal places. [0]. https://pandas.pydata.org/docs/user_guide/pyarrow.html --- process_report/process_report.py | 8 ++++++-- requirements.txt | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/process_report/process_report.py b/process_report/process_report.py index f9514e3..7553e4d 100644 --- a/process_report/process_report.py +++ b/process_report/process_report.py @@ -2,10 +2,12 @@ import os import sys import datetime +from decimal import Decimal import json import pandas import boto3 +import pyarrow ### Invoice field names @@ -229,7 +231,9 @@ def merge_csv(files): """Merge multiple CSV files and return a single pandas dataframe""" dataframes = [] for file in files: - dataframe = pandas.read_csv(file) + dataframe = pandas.read_csv( + file, dtype={COST_FIELD: pandas.ArrowDtype(pyarrow.decimal128(12, 3))} + ) dataframes.append(dataframe) merged_dataframe = pandas.concat(dataframes, ignore_index=True) @@ -298,7 +302,7 @@ def apply_credits_new_pi(dataframe, old_pi_file): dataframe[CREDIT_FIELD] = None dataframe[CREDIT_CODE_FIELD] = None - dataframe[BALANCE_FIELD] = 0 + dataframe[BALANCE_FIELD] = Decimal(0) old_pi_dict = load_old_pis(old_pi_file) diff --git a/requirements.txt b/requirements.txt index b650973..cc0d852 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ pandas +pyarrow boto3