Skip to content

Commit

Permalink
Merge pull request #235 from coderxio/jrlegrand/dbt-fda-excluded-unfi…
Browse files Browse the repository at this point in the history
…nished

Bring FDA NDC / excluded / unfinished up to speed (Airflow + dbt)
  • Loading branch information
jrlegrand authored Jan 20, 2024
2 parents e82ffbf + 9b6b57d commit 1ea1fb3
Show file tree
Hide file tree
Showing 29 changed files with 409 additions and 140 deletions.
13 changes: 6 additions & 7 deletions airflow/dags/fda_excluded/dag.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from airflow_operator import create_dag
from airflow.utils.helpers import chain

from common_dag_tasks import extract, get_ordered_sql_tasks, get_ds_folder
from common_dag_tasks import extract, transform, get_ordered_sql_tasks, get_ds_folder
from sagerx import read_sql_file
from airflow.providers.postgres.operators.postgres import PostgresOperator

Expand All @@ -20,19 +20,18 @@
ds_folder = get_ds_folder(dag_id)

extract_task = extract(dag_id,url)
transform_task = transform(dag_id)

task_list = [extract_task]
sql_tasks = []
for sql in get_ordered_sql_tasks(dag_id):
sql_path = ds_folder / sql
task_id = sql[:-4] #remove .sql

sql_task = PostgresOperator(
task_id=task_id,
postgres_conn_id="postgres_default",
sql=read_sql_file(sql_path).format(data_path=extract_task),
dag=dag
)
task_list.append(sql_task)

chain(*task_list)

sql_tasks.append(sql_task)

extract_task >> sql_tasks >> transform_task
File renamed without changes.
File renamed without changes.
5 changes: 4 additions & 1 deletion airflow/dags/fda_ndc/dag.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
extract_task = extract(dag_id,url)
transform_task = transform(dag_id)

sql_tasks = []
for sql in generate_sql_list(dag_id):
sql_path = ds_folder / sql
task_id = sql[:-4] #remove .sql
Expand All @@ -32,4 +33,6 @@
sql=read_sql_file(sql_path).format(data_path=extract_task),
dag=dag
)
extract_task >> sql_task >> transform_task
sql_tasks.append(sql_task)

extract_task >> sql_tasks >> transform_task
22 changes: 0 additions & 22 deletions airflow/dags/fda_unfinished/alter-comments.sql

This file was deleted.

11 changes: 5 additions & 6 deletions airflow/dags/fda_unfinished/dag.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from airflow_operator import create_dag
from airflow.utils.helpers import chain

from common_dag_tasks import extract, get_ordered_sql_tasks, get_ds_folder
from common_dag_tasks import extract, transform, get_ordered_sql_tasks, get_ds_folder
from sagerx import read_sql_file
from airflow.providers.postgres.operators.postgres import PostgresOperator

Expand All @@ -21,19 +21,18 @@
ds_folder = get_ds_folder(dag_id)

extract_task = extract(dag_id,url)
transform_task = transform(dag_id)

task_list = [extract_task]
sql_tasks = []
for sql in get_ordered_sql_tasks(dag_id):
sql_path = ds_folder / sql
task_id = sql[:-4] #remove .sql

sql_task = PostgresOperator(
task_id=task_id,
postgres_conn_id="postgres_default",
sql=read_sql_file(sql_path).format(data_path=extract_task),
dag=dag
)
task_list.append(sql_task)
sql_tasks.append(sql_task)

chain(*task_list)

extract_task >> sql_tasks >> transform_task
45 changes: 0 additions & 45 deletions airflow/dags/fda_unfinished/staging-fda_unfinished.sql

This file was deleted.

24 changes: 0 additions & 24 deletions airflow/dags/fda_unfinished/staging-fda_unfinished_substance.sql

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ spl as (

fda as (

select * from {{ ref('stg_fda_ndc__ndc') }}
select * from {{ ref('stg_fda_ndc__ndcs') }}

)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,19 @@ with rxnorm_historical_ndcs as
, fda_ndc_ndcs as
(
select distinct ndc11 as ndc
from {{ ref('stg_fda_ndc__ndc') }}
from {{ ref('stg_fda_ndc__ndcs') }}
)

, fda_excluded_ndcs as
(
select distinct ndc11 as ndc
from staging.fda_excluded
from {{ ref('stg_fda_excluded__ndcs') }}
)

, fda_unfinished_ndcs as
(
select distinct ndc11 as ndc
from staging.fda_unfinished
from {{ ref('stg_fda_unfinished__ndcs') }}
)

, all_distinct_ndcs as
Expand Down
85 changes: 85 additions & 0 deletions dbt/sagerx/models/staging/fda_excluded/_fda_excluded__models.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
version: 2

models:
- name: stg_fda_excluded__ndcs
description: "FDA excluded NDCs"
columns:
- name: ndc11
description: "The ndcpackagecode field, normalized to an NDC11 format."
tests:
- unique
- not_null
- name: productid
description: "ProductID is a concatenation of the NDC product code and SPL documentID. It is included to help prevent duplicate rows from appearing when joining the product and package files together. It has no regulatory value or significance."
- name: productndc
description: "The labeler code and product code segments of the National Drug Code number, separated by a hyphen. Asterisks are no longer used or included within the product code segment to indicate certain configurations of the NDC. See www.fda.gov/edrls under Structured Product Labeling Resources."
- name: producttypename
description: "Indicates the type of product, such as Human Prescription Drug or Human OTC Drug. This data element corresponds to the Document Type of the SPL submission for the listing."
- name: proprietaryname
description: "Also known as the trade name. It is the name of the product chosen by the labeler."
- name: proprietarynamesuffix
description: "A suffix to the proprietary name; a value here should be appended to the ProprietaryName field to obtain the complete name of the product. This suffix is often used to distinguish characteristics of a product such as extended release (XR) or sleep aid (PM). Although many companies follow certain naming conventions for suffixes, there is no recognized standard."
- name: nonproprietaryname
description: "Sometimes called the generic name, this is usually the active ingredient(s) of the product."
- name: dosageformname
description: "The translation of the DosageForm Code submitted by the firm. The complete list of codes and translations can be found at www.fda.gov/edrls under Structured Product Labeling Resources."
- name: routename
description: "The translation of the Route Code submitted by the firm, indicating route of administration. The complete list of codes and translations can be found at www.fda.gov/edrls under Structured Product Labeling Resources."
- name: product_startmarketingdate
description: "This is the date that the labeler indicates was the start of its marketing of the drug product."
- name: product_endmarketingdate
description: "This is the date the product will no longer be available on the market. If a product is no longer being manufactured, in most cases, the FDA recommends firms use the expiration date of the last lot produced as the EndMarketingDate, to reflect the potential for drug product to remain available after manufacturing has ceased. Products that are the subject of ongoing manufacturing will not ordinarily have any EndMarketingDate. Products with a value in the EndMarketingDate will be removed from the NDC Directory when the EndMarketingDate is reached."
- name: marketingcategoryname
description: "Product types are broken down into several potential Marketing Categories, such as NDA/ANDA/BLA, OTC Monograph, or Unapproved Drug. One and only one Marketing Category may be chosen for a product, not all marketing categories are available to all product types. Currently, only final marketed product categories are included. The complete list of codes and translations can be found at www.fda.gov/edrls under Structured Product Labeling Resources."
- name: applicationnumber
description: "This corresponds to the NDA, ANDA, or BLA number reported by the labeler for products which have the corresponding Marketing Category designated. If the designated Marketing Category is OTC Monograph Final or OTC Monograph Not Final, then the Application number will be the CFR citation corresponding to the appropriate Monograph (e.g. “part 341”). For unapproved drugs, this field will be null."
- name: labelername
description: "Name of Company corresponding to the labeler code segment of the ProductNDC."
- name: substancename
description: "This is the active ingredient list. Each ingredient name is the preferred term of the UNII code submitted."
- name: active_numerator_strength
description: "These are the strength values (to be used with units below) of each active ingredient, listed in the same order as the SubstanceName field above."
- name: active_ingred_unit
description: "These are the units to be used with the strength values above, listed in the same order as the SubstanceName and SubstanceNumber (ActiveNumeratorStrength)."
- name: pharm_classes
description: "These are the reported pharmacological class categories corresponding to the SubstanceNames listed above."
- name: deaschedule
description: "This is the assigned DEA Schedule number as reported by the labeler. Values are CI, CII, CIII, CIV, and CV."
- name: product_ndc_exclude_flag
description: "Values = Y, N, E, or I. This indicates whether the product has been removed/excluded from the NDC Directory for failure to respond to FDA's requests for correction to deficient or non-compliant submissions (Y), or because the listing certification is expired (E), or because the listing data was inactivated by FDA (I). The PRODUCT.XLS and PRODUCT.TXT files only contain listing records where NDC_EXCLUDE_FLAG=N. The PRODUCTS_EXCLUDED.XLS and PRODUCTS_EXCLUDED.TXT files contain all listing records with an NDC_EXCLUDE_FLAG of Y, E, and I."
- name: listing_record_certified_through
description: "This is the date when the listing record will expire if not updated or certified by the firm."
- name: ndcpackagecode
description: "The labeler code, product code, and package code segments of the National Drug Code number, separated by hyphens. Asterisks are no longer used or included within the product and package code segments to indicate certain configurations of the NDC."
- name: packagedescription
description: "A description of the size and type of packaging in sentence form. Multilevel packages will have the descriptions concatenated together. For example: 4 BOTTLES in 1 CARTON/100 TABLETS in 1 BOTTLE."
- name: package_startmarketingdate
description: "This is the date that the labeler indicates was the start of its marketing of the drug product."
- name: package_endmarketingdate
description: "This is the date the product will no longer be available on the market. If a product is no longer being manufactured, in most cases, the FDA recommends firms use the expiration date of the last lot produced as the EndMarketingDate, to reflect the potential for drug product to remain available after manufacturing has ceased. Products that are the subject of ongoing manufacturing will not ordinarily have any EndMarketingDate. Products with a value in the EndMarketingDate will be removed from the NDC Directory when the EndMarketingDate is reached."
- name: package_ndc_exclude_flag
description: "Values = Y, N, E, or I. This indicates whether the PACKAGE has been removed/excluded from the NDC Directory for failure to respond to FDA's requests for correction to deficient or non-compliant submissions (Y), or because the listing certification is expired (E), or because the listing data was inactivated by FDA (I). The PACKAGE.XLS and PACKAGE.TXT files only contain listing records where NDC_EXCLUDE_FLAG=N. The PACKAGES_EXCLUDED.XLS and PACKAGES_EXCLUDED.TXT files contain all listing records with an NDC_EXCLUDE_FLAG of Y, E, and I."
- name: sample_package
description: "This indicates if the package is to be distributed as a sample package. Values = Y or N."

- name: stg_fda_excluded__classes
description: "Product-level class information"
columns:
# primary key would be productid + class_line
- name: productid
- name: class_line
description: Product can have multiple classes. This is the line number of the class.
- name: class_name
description: Name of the pharmaceutical class.
- name: class_type
description: Options include Chemical/Ingredient, EXT, PE, MoA, CS, and EPC.

- name: stg_fda_excluded__substances
description: "Product-level substance information"
columns:
# primary key would be productid + substance_line
- name: productid
- name: substance_line
- name: substancename
- name: active_numerator_strength
- name: active_ingred_unit
Loading

0 comments on commit 1ea1fb3

Please sign in to comment.