Merge pull request #30 from OpenKBC/engineering_dev

new feature extraction and features analysis result including pipeline update
OpenKBC · Sep 26, 2021 · 4e4bf80 · 4e4bf80
2 parents e4c5f0e + bf6c4f4
commit 4e4bf80
Show file tree

Hide file tree

Showing 17 changed files with 1,633 additions and 56 deletions.
diff --git a/notebook/features_visualization.ipynb → .../Jun09182021/features_visualization.ipynb b/notebook/features_visualization.ipynb → .../Jun09182021/features_visualization.ipynb
diff --git a/notebook/notebook_archive/Jun09182021/step2_RFECV_ActScore.ipynb b/notebook/notebook_archive/Jun09182021/step2_RFECV_ActScore.ipynb
diff --git a/notebook/notebook_archive/Jun09232021/ACT_actExt_healthy_ver1.ipynb b/notebook/notebook_archive/Jun09232021/ACT_actExt_healthy_ver1.ipynb
diff --git a/notebook/notebook_archive/Jun09232021/ACT_actExt_long_ver1.ipynb b/notebook/notebook_archive/Jun09232021/ACT_actExt_long_ver1.ipynb
diff --git a/notebook/notebook_archive/Jun09232021/DAVID_result_with_genes.txt b/notebook/notebook_archive/Jun09232021/DAVID_result_with_genes.txt
@@ -0,0 +1,24 @@
+Category	Term	Count	%	PValue	Genes	List Total	Pop Hits	Pop Total	Fold Enrichment	Bonferroni	Benjamini	FDR
+UP_KEYWORDS	Signal-anchor	4	18.181818181818183	0.010618235535230702	9435, 92370, 84750, 482	22	455	20581	8.224175824175823	0.644942699379905	1.0	1.0
+GOTERM_BP_DIRECT	GO:0030007~cellular potassium ion homeostasis	2	9.090909090909092	0.014203886862870552	3778, 482	21	12	16792	133.26984126984127	0.8550364897890121	1.0	1.0
+GOTERM_BP_DIRECT	GO:0006970~response to osmotic stress	2	9.090909090909092	0.021233751409806934	1831, 3778	21	18	16792	88.84656084656085	0.9448350841481213	1.0	1.0
+UP_SEQ_FEATURE	splice variant	13	59.09090909090909	0.02672379874332258	9435, 25960, 92370, 2273, 84937, 9098, 29893, 1831, 27152, 60385, 84750, 3778, 11326	20	7760	20063	1.680534793814433	0.8915172884093386	1.0	1.0
+GOTERM_CC_DIRECT	GO:0000139~Golgi membrane	4	18.181818181818183	0.029224941783175234	9435, 84342, 92370, 84750	22	591	18224	5.60652207352715	0.7122723462224878	1.0	1.0
+GOTERM_BP_DIRECT	GO:0009790~embryo development	2	9.090909090909092	0.044324271083796926	10361, 84750	21	38	16792	42.08521303258145	0.9978025190277713	1.0	1.0
+INTERPRO	IPR000340:Dual specificity phosphatase, catalytic domain	2	9.090909090909092	0.04535550105440483	1844, 7803	21	43	18559	41.10520487264673	0.9662369069907797	1.0	1.0
+GOTERM_MF_DIRECT	GO:0016791~phosphatase activity	2	9.090909090909092	0.04636430022502955	92370, 7803	21	40	16881	40.19285714285714	0.8873853134098133	1.0	1.0
+UP_KEYWORDS	Golgi apparatus	4	18.181818181818183	0.047948316929936494	9435, 84342, 92370, 84750	22	812	20581	4.608374384236454	0.991487229254269	1.0	1.0
+KEGG_PATHWAY	hsa04911:Insulin secretion	2	9.090909090909092	0.048527607447158325	3778, 482	5	85	6879	32.371764705882356	0.7116600702180332	0.4416846321392304	0.4416846321392304
+UP_SEQ_FEATURE	mutagenesis site	6	27.27272727272727	0.04876123266351168	9435, 4953, 84937, 9098, 3778, 7803	20	2191	20063	2.747101780009128	0.9834139717878374	1.0	1.0
+KEGG_PATHWAY	hsa04970:Salivary secretion	2	9.090909090909092	0.049087791369392356	3778, 482	5	86	6879	31.995348837209303	0.7158742585745337	0.4416846321392304	0.4416846321392304
+KEGG_PATHWAY	hsa04972:Pancreatic secretion	2	9.090909090909092	0.053002155856707645	3778, 482	5	93	6879	29.58709677419355	0.7437140735976389	0.4416846321392304	0.4416846321392304
+SMART	SM00404:PTPc_motif	2	9.090909090909092	0.055007744369135844	1844, 7803	10	63	10057	31.926984126984127	0.658699645983222	1.0	1.0
+UP_SEQ_FEATURE	domain:Tyrosine-protein phosphatase	2	9.090909090909092	0.05803089125516087	1844, 7803	20	63	20063	31.84603174603175	0.9925697030377879	1.0	1.0
+INTERPRO	IPR003595:Protein-tyrosine phosphatase, catalytic	2	9.090909090909092	0.0657787434224818	1844, 7803	21	63	18559	28.055933484504912	0.9930364393651362	1.0	1.0
+UP_KEYWORDS	Developmental protein	4	18.181818181818183	0.07005658477911464	27152, 10361, 2273, 7803	22	949	20581	3.9430979978925187	0.9991284691002745	1.0	1.0
+UP_SEQ_FEATURE	active site:Phosphocysteine intermediate	2	9.090909090909092	0.0784103517723536	1844, 7803	20	86	20063	23.32906976744186	0.9987638185946439	1.0	1.0
+INTERPRO	IPR000387:Protein-tyrosine/Dual specificity phosphatase	2	9.090909090909092	0.08479581942452594	1844, 7803	21	82	18559	21.555168408826948	0.9984482789379427	1.0	1.0
+UP_KEYWORDS	Endosome	3	13.636363636363635	0.08549711150290609	84937, 9098, 7803	22	481	20581	5.834719334719335	0.9998282277870996	1.0	1.0
+KEGG_PATHWAY	hsa04022:cGMP-PKG signaling pathway	2	9.090909090909092	0.08877539992187655	3778, 482	5	158	6879	17.415189873417724	0.902133086037035	0.5548462495117285	0.5548462495117285
+GOTERM_BP_DIRECT	GO:0006813~potassium ion transport	2	9.090909090909092	0.09331516173239449	3778, 482	21	82	16792	19.502903600464574	0.9999981944575009	1.0	1.0
+UP_KEYWORDS	Membrane	12	54.54545454545454	0.0993540921976912	9435, 84342, 92370, 25960, 100526664, 84750, 84937, 9098, 3778, 7803, 482, 11326	22	7494	20581	1.497998398718975	0.9999609387831477	1.0	1.0
diff --git a/notebook/notebook_archive/Jun09232021/RF_ML_ver1.ipynb b/notebook/notebook_archive/Jun09232021/RF_ML_ver1.ipynb
@@ -0,0 +1,185 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 74,
+   "source": [
+    "import pandas as pd\n",
+    "import seaborn as sns\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "resultPath = \"resultFiles/featureExtractionV3_by_Jun/\"\n",
+    "\n",
+    "intersected_genes = []\n",
+    "for x in ['CD4', 'CD8', 'CD14']:\n",
+    "    #df_healthy = pd.read_csv(resultPath+\"HealthyPatients/\"+x+\".Ranksum.RFECV.gene.result\", index_col=0)\n",
+    "    df_long = pd.read_csv(resultPath+\"LongDiseaseDuration/\"+x+\".Ranksum.RFECV.gene.result\", index_col=0)\n",
+    "    print(x)\n",
+    "    print(len(df_long))\n"
+   ],
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "CD4\n",
+      "43\n",
+      "CD8\n",
+      "149\n",
+      "CD14\n",
+      "637\n"
+     ]
+    }
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 77,
+   "source": [
+    "## Utils and Library for notebook\n",
+    "from notebook_utils.OpenKbcMSToolkit import ExtractionToolkit as exttoolkit\n",
+    "import itertools\n",
+    "def _LoadDiseaseDuration(df, meta_data, returntype='long'):\n",
+    "    \"\"\"\n",
+    "    df : Expression or activation score matrix\n",
+    "    meta_data : meta data which contains duration and sample ID\n",
+    "    output: (long DD samples, short DD samples) when returntype='long', or (healthy samples, short DD samples) when returntype='healthy'; each is a list of sample IDs\n",
+    "    \"\"\"\n",
+    "    # Sample by disease category\n",
+    "    sample_list, sample_category = exttoolkit.get_sample_name_by_category(dataframe=meta_data, sampleColumn='HCVB_ID', dataColname='DiseaseCourse')\n",
+    "    \n",
+    "    # Sort by disease category and exclude unknown samples\n",
+    "    patient_samples = [] # patient samples\n",
+    "    healthy_samples = [] # healthy samples\n",
+    "    for samples, category in zip(sample_list, sample_category):\n",
+    "        if category=='Healthy':\n",
+    "            healthy_samples = samples\n",
+    "        else:\n",
+    "            if category!='Unknown':# Excluding unknown samples\n",
+    "                patient_samples.append(samples)\n",
+    "\n",
+    "    patient_samples = list(itertools.chain(*patient_samples)) # flatten\n",
+    "    patient_samples = list(set(patient_samples).intersection(df.columns.tolist())) # intersected with act score matrix\n",
+    "    healthy_samples = list(set(healthy_samples).intersection(df.columns.tolist())) # intersected with act score matrix\n",
+    "    patient_meta = meta_data.loc[meta_data['HCVB_ID'].isin(patient_samples)] # Make patient metadata\n",
+    "\n",
+    "    longDD_samples, shortDD_samples = exttoolkit.get_sample_name_by_contValues(patient_meta, 'HCVB_ID', 'DiseaseDuration', 50)\n",
+    "    longDD_samples = list(set(longDD_samples.values.tolist()).intersection(df.columns.tolist())) # intersected with act score matrix\n",
+    "    shortDD_samples = list(set(shortDD_samples.values.tolist()).intersection(df.columns.tolist())) # intersected with act score matrix\n",
+    "\n",
+    "    if returntype=='long':\n",
+    "        return longDD_samples, shortDD_samples\n",
+    "    elif returntype=='healthy':\n",
+    "        return healthy_samples, shortDD_samples"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 84,
+   "source": [
+    "#df_cd4 = pd.read_csv(resultPath+\"LongDiseaseDuration/CD4.Ranksum.RFECV.act.csv\", index_col=0)\n",
+    "df_cd4 = pd.read_csv(resultPath+\"LongDiseaseDuration/CD8.Ranksum.RFECV.gene.result\", index_col=0)\n",
+    "#df_cd4 = pd.read_csv(\"../data/counts_normalized/IDConvertedFiles/counts_vst_CD4.converted.csv\", index_col=0)\n",
+    "meta_data = pd.read_csv('../data/annotation_metadata/EPIC_HCvB_metadata_baseline_updated-share.csv')\n",
+    "long_samples, shortDD_samples = _LoadDiseaseDuration(df_cd4, meta_data, 'long')\n",
+    "df_cd4 = df_cd4[long_samples+shortDD_samples]\n",
+    "df_cd4 = df_cd4.subtract(df_cd4.median(axis=1), axis=0)\n",
+    "\n",
+    "\n",
+    "fold_change = (df_cd4[long_samples].mean(axis=1) - df_cd4[shortDD_samples].mean(axis=1)).apply(abs)\n",
+    "fold_change = fold_change.sort_values(ascending=False)[:40].index.tolist()\n",
+    "df_cd4 = df_cd4.loc[fold_change]\n",
+    "\n",
+    "# Build the feature matrix (one row per sample) and the class labels\n",
+    "X = df_cd4.T.values\n",
+    "y = [0]*len(long_samples)+[1]*len(shortDD_samples) # Training y\n",
+    "y = np.array(y)"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 85,
+   "source": [
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn import metrics\n",
+    "\n",
+    "\n",
+    "for t in list(range(45,50)):\n",
+    "    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=t)\n",
+    "    #X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=t)\n",
+    "\n",
+    "    randomState = list(range(40,60))\n",
+    "\n",
+    "    test_auc = []\n",
+    "    val_auc = []\n",
+    "    for i in randomState:\n",
+    "        clf = RandomForestClassifier(max_depth=10, random_state=i)\n",
+    "        clf.fit(X_train, y_train)\n",
+    "\n",
+    "        y_pred = clf.predict(X_test)\n",
+    "        fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred, pos_label=1)\n",
+    "        #print(\"test auc: \",metrics.auc(fpr, tpr))\n",
+    "        test_auc.append(metrics.auc(fpr, tpr))\n",
+    "        \n",
+    "        #y_val_pred = clf.predict(X_val)\n",
+    "        #fpr, tpr, thresholds = metrics.roc_curve(y_val, y_val_pred, pos_label=1)\n",
+    "        #print(\"val auc: \",metrics.auc(fpr, tpr))\n",
+    "        #val_auc.append(metrics.auc(fpr, tpr))\n",
+    "    #print(np.mean(test_auc), np.mean(val_auc))\n",
+    "    print(np.mean(test_auc))"
+   ],
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "0.6107080419580418\n",
+      "0.5788095238095239\n",
+      "0.6720274390243903\n",
+      "0.5882142857142857\n",
+      "0.6155594405594405\n"
+     ]
+    }
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [],
+   "outputs": [],
+   "metadata": {}
+  }
+ ],
+ "metadata": {
+  "orig_nbformat": 4,
+  "language_info": {
+   "name": "python",
+   "version": "3.8.2",
+   "mimetype": "text/x-python",
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "pygments_lexer": "ipython3",
+   "nbconvert_exporter": "python",
+   "file_extension": ".py"
+  },
+  "kernelspec": {
+   "name": "python3",
+   "display_name": "Python 3.8.2 64-bit ('r-py-test': conda)"
+  },
+  "interpreter": {
+   "hash": "7508a6b53ffb04362d156591e4bfb20c197555e37f3cce3b1ec90fd899bbfe63"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebook/notebook_archive/Jun09232021/vis_testing.ipynb b/notebook/notebook_archive/Jun09232021/vis_testing.ipynb
diff --git a/notebook/notebook_archive/Jun09262021/SVM_test.ipynb b/notebook/notebook_archive/Jun09262021/SVM_test.ipynb