Skip to content

Commit

Permalink
Merge pull request #300 from coderxio/jrlegrand/excipients
Browse files Browse the repository at this point in the history
New mart: Products to inactive ingredients (excipients)
  • Loading branch information
jrlegrand authored Jul 3, 2024
2 parents 1d4231f + 030bae1 commit 2b5b24e
Show file tree
Hide file tree
Showing 11 changed files with 261 additions and 13 deletions.
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,5 @@ plugins
# Desktop Services Store
.DS_Store

#GCP
gcp.json
# GCP
gcp.json
64 changes: 64 additions & 0 deletions airflow/dags/fda_unii/dag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import pendulum

from airflow_operator import create_dag
from airflow.providers.postgres.operators.postgres import PostgresOperator
from airflow.decorators import task

from common_dag_tasks import extract, transform, generate_sql_list, get_ds_folder
from sagerx import read_sql_file

dag_id = "fda_unii"

# Cron "0 4 * * *" = once a day at 04:00.
# NOTE(review): pendulum.yesterday() is re-evaluated every time this module is
# parsed, so the start date is not fixed. Confirm this is the intended
# convention for create_dag before changing it.
dag = create_dag(
    dag_id=dag_id,
    schedule="0 4 * * *",
    start_date=pendulum.yesterday(),
    catchup=False,
    concurrency=2,
)

with dag:
    url = "https://precision.fda.gov/uniisearch/archive/latest/UNII_Data.zip"
    ds_folder = get_ds_folder(dag_id)

    extract_task = extract(dag_id, url)
    transform_task = transform(dag_id)

    @task
    def get_file_name(data_path) -> str:
        """Return the name of the UNII records file inside ``data_path``.

        The archive ships a date-stamped file (for example
        ``UNII_Records_22Jun2024.txt``), so the name has to be discovered
        at run time rather than hard-coded.

        Args:
            data_path: Directory to search. The caller passes the output of
                the extract task (e.g. /opt/data/fda_unii/UNII_Data/).

        Returns:
            The bare file name (not the full path) of the matching file.
            If several files match, the lexicographically last one is used
            so the choice does not depend on directory listing order.

        Raises:
            FileNotFoundError: If no file starting with ``UNII_Records``
                exists, so the task fails here instead of handing an empty
                file name to the downstream COPY statement.
        """
        import logging
        import os

        logging.info(f'Data path: {data_path}')

        candidates = sorted(
            entry
            for entry in os.listdir(data_path)
            if entry.startswith("UNII_Records")
        )

        if not candidates:
            raise FileNotFoundError(
                f'Could not find file_name. No UNII_Records file in {data_path}'
            )

        return candidates[-1]

    file_name_task = get_file_name(extract_task)

    # One PostgresOperator per .sql file returned by generate_sql_list.
    sql_tasks = []
    for sql in generate_sql_list(dag_id):
        sql_path = ds_folder / sql
        task_id = sql[:-4]  # remove .sql
        sql_task = PostgresOperator(
            task_id=task_id,
            postgres_conn_id="postgres_default",
            # The task outputs are formatted into the SQL text as strings.
            # NOTE(review): this presumably relies on them rendering as
            # templates that Airflow resolves at run time - confirm.
            sql=read_sql_file(sql_path).format(
                data_path=extract_task,
                file_name=file_name_task
            ),
            dag=dag
        )
        sql_tasks.append(sql_task)

    # File discovery must finish before any load runs; transform runs last.
    file_name_task >> sql_tasks >> transform_task
31 changes: 31 additions & 0 deletions airflow/dags/fda_unii/load_unii.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
/* sagerx_lake.fda_unii */
-- Rebuilds the raw FDA UNII table from scratch and bulk-loads the
-- tab-delimited UNII records file into it. Every column is loaded as TEXT.
-- NOTE: the data_path and file_name placeholders in the COPY statement are
-- presumably filled in by Python str.format in the fda_unii DAG, so curly
-- braces must not be used anywhere else in this file (including comments).
DROP TABLE IF EXISTS sagerx_lake.fda_unii CASCADE;

CREATE TABLE sagerx_lake.fda_unii (
    unii                TEXT NOT NULL,
    display_name        TEXT,
    rn                  TEXT,
    ec                  TEXT,
    ncit                TEXT,
    rxcui               TEXT,
    pubchem             TEXT,
    epa_comptox         TEXT,
    catalogue_of_life   TEXT,
    itis                TEXT,
    ncbi                TEXT,
    plants              TEXT,
    grin                TEXT,
    mpns                TEXT,
    inn_id              TEXT,
    usan_id             TEXT,
    mf                  TEXT,
    inchikey            TEXT,
    smiles              TEXT,
    ingredient_type     TEXT,
    substance_type      TEXT,
    uuid                TEXT,
    dailymed            TEXT
);

-- The first line of the file is a header row; the file is read as WIN1252.
COPY sagerx_lake.fda_unii
FROM '{data_path}/{file_name}' DELIMITER E'\t' CSV HEADER ENCODING 'WIN1252';
Original file line number Diff line number Diff line change
@@ -1,17 +1,14 @@
-- int_mthspl_products_to_inactive_ingredients.sql
{{ config(materialized='table') }}

with

substance as (

select * from {{ ref('stg_rxnorm__mthspl_substances') }}
)

),

product as (

, product as (
select * from {{ ref('stg_rxnorm__mthspl_products') }}

)

select distinct
Expand All @@ -26,9 +23,9 @@ select distinct
, substance.tty as inactive_ingredient_tty
, product.active as active
, product.prescribable as prescribable
from sagerx_lake.rxnorm_rxnrel rxnrel
inner join substance
on rxnrel.rxaui1 = substance.rxaui
inner join product
from product
inner join sagerx_lake.rxnorm_rxnrel rxnrel
on rxnrel.rxaui2 = product.rxaui
inner join substance
on substance.rxaui = rxnrel.rxaui1
where rela = 'has_inactive_ingredient'
22 changes: 22 additions & 0 deletions dbt/sagerx/models/marts/products/_products__models.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,25 @@ models:
description: The start marketing date of the **product**. Can be different from that of the package.
- name: package_startmarketingdate
description: The start marketing date of the **package**. Can be different from that of the product.

- name: products_to_inactive_ingredients
description: |
DailyMed (MTHSPL) products (along with 9-digit NDCs), mapped to DailyMed
substances, which are mapped to normalized FDA UNII display names.
columns:
- name: product_name
description: |
NOTE: this field is aggregated and pipe-delimited (' | '). It appears
as though the MTHSPL data in RxNorm can have multiple
product names for a given RXCUI. Since we are mostly concerned
with NDC to UNII code, the names of the products as they
exist in the MTHSPL SAB of RxNorm should be used for quick reference
and validation only.
- name: inactive_ingredient_name
description: |
NOTE: this field is aggregated and pipe-delimited (' | '). It appears
as though the MTHSPL data in RxNorm can have multiple
substance names for a given RXCUI. Since we are mostly concerned
with NDC to UNII code, the names of the substances as they
exist in the MTHSPL SAB of RxNorm should be used for quick reference
and validation only.
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,71 @@ with products_to_inactive_ingredients as (
select * from {{ ref('int_mthspl_products_to_inactive_ingredients') }}
)

select * from products_to_inactive_ingredients
, unii_codes as (
select * from {{ ref('stg_fda_unii__unii_codes') }}
)

, usp_preservatives as (
select * from {{ ref('usp_preservatives') }}
)

-- One row per product NDC / inactive ingredient, enriched with the FDA UNII
-- code, display name and PubChem id, plus a preservative flag.
select
    ndc9
    , ndc
    , unii_codes.unii as fda_unii_code
    , unii_codes.display_name as fda_unii_display_name
    , unii_codes.pubchem as pubchem_id
    -- 1 when the matched UNII's CAS RN appears in the usp_preservatives
    -- seed; null (not 0) otherwise.
    , max(case
        when preservative.cas_rn is not null
            then 1
        end) as preservative
    , product_rxcui
    -- distinct: the joins below and the product-name x ingredient-name
    -- combinations repeat each name across the rows of a group, which would
    -- otherwise duplicate names in the pipe-delimited output.
    , string_agg(distinct product_name, ' | ' order by product_name) as product_name
    , product_tty
    , inactive_ingredient_unii
    , inactive_ingredient_rxcui
    , string_agg(distinct inactive_ingredient_name, ' | ' order by inactive_ingredient_name) as inactive_ingredient_name
    , inactive_ingredient_tty
    , active
    , prescribable
from products_to_inactive_ingredients
/*
    need to join unii_codes twice - once
    to pull in the actual UNII -> display name
    mapping, and another initial one to try
    to map substance RXCUIs to FDA UNII RXCUIs.
*/
left join unii_codes rxcui_to_unii
    on rxcui_to_unii.rxcui = inactive_ingredient_rxcui
/*
    if MTHSPL (DailyMed) has a substance UNII,
    use that. if it does not, try to map the
    substance RXCUI to the FDA UNII RXCUI and
    then use the resulting matched UNII to pull
    in the UNII display name.
*/
left join unii_codes
    on unii_codes.unii = case
        when (
            inactive_ingredient_unii is not null
            and
            inactive_ingredient_unii != 'NOCODE'
        ) then inactive_ingredient_unii
        else rxcui_to_unii.unii
        end
left join usp_preservatives preservative
    on preservative.cas_rn = unii_codes.rn
group by
    ndc9
    , ndc
    , unii_codes.unii
    , unii_codes.display_name
    , unii_codes.pubchem
    , product_rxcui
    , product_tty
    , inactive_ingredient_unii
    , inactive_ingredient_rxcui
    , inactive_ingredient_tty
    , active
    , prescribable
9 changes: 9 additions & 0 deletions dbt/sagerx/models/staging/fda_unii/_fda_unii__sources.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
version: 2

# Declares the raw FDA UNII table in the sagerx_lake schema as a dbt source.
sources:
  - name: fda_unii
    description: FDA UNII codes.
    schema: sagerx_lake
    tables:
      - name: fda_unii
        description: FDA UNII codes.
18 changes: 18 additions & 0 deletions dbt/sagerx/models/staging/fda_unii/stg_fda_unii__unii_codes.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
-- stg_fda_unii__unii_codes.sql
-- Staging model over the fda_unii source: keeps the UNII code, its display
-- name, and a subset of the cross-reference identifier columns.

with raw_unii as (

    select *
    from {{ source('fda_unii', 'fda_unii') }}

)

select
    raw_unii.unii,
    raw_unii.display_name,
    raw_unii.rxcui,
    raw_unii.pubchem,
    raw_unii.rn,
    raw_unii.ncit,
    raw_unii.ncbi,
    raw_unii.dailymed
from raw_unii
2 changes: 2 additions & 0 deletions dbt/sagerx/models/staging/rxnorm/_rxnorm__models.yml
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,8 @@ models:
- name: name
- name: tty
- name: rxaui
tests:
- unique
- name: ndc
- name: active
- name: prescribable
Expand Down
8 changes: 8 additions & 0 deletions dbt/sagerx/seeds/_seeds__models.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
version: 2

seeds:
- name: usp_preservatives
description: |
A list of CAS RN identifiers and USP product names obtained manually from
searching the [USP catalog](https://store.usp.org/preservatives/category/USP-1213)
for products in the "Preservatives" category.
30 changes: 30 additions & 0 deletions dbt/sagerx/seeds/usp_preservatives.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
cas_rn,usp_product_name
17927-65-0,Aluminum Sulfate (2 g)
60-00-4,Edetic Acid (200 mg)
79-09-4,Propionic Acid (1.5 mL/ampule; 3 ampules)
6001-64-5,Chlorobutanol (200 mg)
59-51-8,Racemethionine (200 mg)
128-37-0,Butylated Hydroxytoluene (500 mg)
5793-89-5,Calcium Saccharate (200 mg)
121-00-6,3-tert-Butyl-4-hydroxyanisole (200 mg)
137-40-6,Sodium Propionate (200 mg)
89-65-6,Erythorbic Acid (50 mg)
122-99-6,Phenoxyethanol (500 mg) (2-Phenoxyethanol)
94-13-3,Propylparaben (200 mg)
8001-54-5,Benzalkonium Chloride (5 mL of approx. 4% aqueous solution)
7681-57-4,Sodium Metabisulfite (2 X 500 mg)
110-44-1,Sorbic Acid (1 g)
100-51-6,Benzyl Alcohol (500 mg/ampule)
99-76-3,Methylparaben (125 mg)
590-00-1,Potassium Sorbate (1 g)
24634-61-5,Potassium Sorbate (1 g)
532-32-1,Sodium Benzoate (1 g)
88-32-4,2-tert-Butyl-4-hydroxyanisole (200 mg)
120-47-8,Ethylparaben (200 mg)
90-64-2,Mandelic Acid (500 mg)
121-79-9,Propyl Gallate (200 mg)
4075-81-4,Calcium Propionate (100 mg)
94-26-8,Butylparaben (200 mg)
39236-46-9,Imidurea (200 mg)
520-45-6,Dehydroacetic Acid (200 mg)
57-09-0,Cetrimonium Bromide (1 g)

0 comments on commit 2b5b24e

Please sign in to comment.