Skip to content

Commit

Permalink
Merge pull request #6 from monarch-initiative/reports
Browse files Browse the repository at this point in the history
Create report as a pivot table
  • Loading branch information
matentzn authored Sep 29, 2023
2 parents b322d32 + ae2df12 commit 2d711b1
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 0 deletions.
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,11 @@ $(DATADIR)/curated-mondo-ncit-renal-subset.sssom.tsv: subsets/mondo-rare-renal.o
#$(DATADIR)/curated-mondo-ncit-renal-subset.sssom.tsv:
# runoak -i sqlite:obo:mondo mappings .desc//p=i 'kidney disease' .and .desc//p=i 'hereditary disease' -O sssom -o $@

###############################################
############ Reports as Pivot table ###########
###############################################
# Generate the Markdown pivot-table report by running util/reports.py,
# passing the target path ($@) as its -o/--outfile argument.
# NOTE(review): no prerequisites are declared, so make will not regenerate an
# existing report when the inputs change -- confirm this is intended.
$(RESULTDIR)/report_pivot_table.md:
	python util/reports.py -o $@

###############################################
############ Ontologies #######################
Expand Down
5 changes: 5 additions & 0 deletions results/report_pivot_table.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
| subject_prefix | MmusDv | NCIT | WBbt | ZFA |
|:-----------------|---------:|-------:|-------:|------:|
| FBbt | 0 | 0 | 41 | 72 |
| HsapDv | 22 | 0 | 0 | 0 |
| MONDO | 0 | 25 | 0 | 0 |
56 changes: 56 additions & 0 deletions util/reports.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import click
from pathlib import Path
import glob
import pandas as pd
# Directory one level above the folder that contains this file.
PROJECT_DIR = Path(__file__).resolve().parents[1]

# Well-known sub-directories of the project root.
DATA_DIR = PROJECT_DIR.joinpath("data")
RESULTS_DIR = PROJECT_DIR.joinpath("results")

# Column-name constants for the identifier columns of a mapping table.
SUBJECT_ID, OBJECT_ID, PREDICATE_ID = "subject_id", "object_id", "predicate_id"


@click.group()
def main() -> None:
    # Top-level click command group; it does no work itself and only serves
    # as the parent that subcommands attach to via ``@main.command()``.
    pass

@main.command()
@click.option(
    "-o",
    "--outfile",
    default=RESULTS_DIR / "report_pivot_table.md",
    show_default=True,
    help="Markdown file to write the pivot table to.",
)
def reports(outfile: str) -> None:
    """Run reports.

    Read every ``*.tsv`` file in ``DATA_DIR``, keep the distinct
    (subject_id, predicate_id, object_id) rows, count them per pair of
    subject and object ID prefix, and write the counts to ``--outfile`` as
    a Markdown pivot table (subject prefixes as rows, object prefixes as
    columns).

    Raises:
        click.ClickException: if ``DATA_DIR`` holds no ``*.tsv`` file or the
            files contain no rows.
    """
    # Sorted only to make the read order deterministic; the counts do not
    # depend on it.
    tsv_files = sorted(glob.glob(str(DATA_DIR / "*.tsv")))
    if not tsv_files:
        # pd.concat would otherwise fail with an opaque
        # "No objects to concatenate" ValueError.
        raise click.ClickException(f"No .tsv files found in {DATA_DIR}")

    # Only the three identifier columns are needed; lines starting with
    # "#" (e.g. a metadata header) are skipped by the parser.
    id_columns = [SUBJECT_ID, PREDICATE_ID, OBJECT_ID]
    frames = [
        pd.read_csv(path, sep="\t", usecols=id_columns, comment="#")
        for path in tsv_files
    ]
    mappings = pd.concat(frames, ignore_index=True).drop_duplicates()
    if mappings.empty:
        raise click.ClickException(f"No mapping rows found in {DATA_DIR}")

    # The prefix is everything before the first ":". Splitting at most once
    # and taking element 0 always yields exactly one value per row, whether
    # an ID contains zero, one, or several colons.
    for id_column, prefix_column in (
        (SUBJECT_ID, "subject_prefix"),
        (OBJECT_ID, "object_prefix"),
    ):
        mappings[prefix_column] = mappings[id_column].str.split(":", n=1).str[0]

    # Count rows for each (subject_prefix, object_prefix) combination;
    # combinations that never occur are reported as 0.
    pivot_table = mappings.pivot_table(
        index="subject_prefix",
        columns="object_prefix",
        aggfunc="size",
        fill_value=0,
    )
    markdown_table = pivot_table.to_markdown()

    # click may hand over either a str or the Path default; normalise, make
    # sure the target directory exists, and write with a fixed encoding.
    out_path = Path(outfile)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as handle:
        handle.write(markdown_table)



# Script entry point. The ``reports`` command is invoked directly rather than
# through the ``main`` group, so options such as ``-o`` can be given without a
# subcommand name (e.g. ``python util/reports.py -o FILE``).
if __name__ == "__main__":
    reports()

0 comments on commit 2d711b1

Please sign in to comment.