Add files via upload #5

Open · wants to merge 4 commits into base: master
149 changes: 149 additions & 0 deletions code/Machine_learning_infantfeeding.py
@@ -0,0 +1,149 @@
#!/usr/bin/env python
# coding: utf-8

# Import necessary libraries
import csv
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import resample
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             classification_report, confusion_matrix, roc_auc_score)
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Download the NLTK resources used below
nltk.download('stopwords')
nltk.download('wordnet')  # required by WordNetLemmatizer

# Set plotting styles
sns.set_style("darkgrid")
sns.set_context("poster")
plt.rcParams["figure.figsize"] = [8, 6]


# Read the data from CSV file
data = pd.read_csv("Allnoted_Nan_class.csv")

# Remove unwanted projects from the data
data = data[~data['teamtat_project'].isin(['project_6', 'project_7', 'project_8'])]

# Lowercase all words in the 'note_text' column
data['note_text'] = data['note_text'].str.lower()

# Keep only purely alphanumeric tokens in the 'note_text' column (this also drops any token containing punctuation)
data['note_text'] = data['note_text'].apply(lambda x: ' '.join([word for word in x.split() if word.isalnum()]))

# Remove numerical values from the 'note_text' column
data['note_text'] = data['note_text'].apply(lambda x: ' '.join([word for word in x.split() if not word.isdigit()]))

# Remove stopwords from the 'note_text' column
stop_words = set(stopwords.words('english'))
data['note_text'] = data['note_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# Lemmatize the words in the 'note_text' column
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
return ' '.join([lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)])

data['text_lemmatized'] = data.note_text.apply(lemmatize_text)
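
# For example, lemmatize_text("babies drank from bottles") returns
# "baby drank from bottle" (WordNetLemmatizer defaults to noun lemmas,
# so verb forms such as "drank" are left unchanged).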

# Replace class labels with appropriate values
data['Class'] = data['Class'].replace({'EXPRESS/PUMP': 'BREAST', 'MIXED': 'BOTTLE', 'nan-feeding-related': 'NA', 'nan-related': 'NA', 'nan-not-related': 'NA'})

# Plot the distribution of class labels
print(data["Class"].value_counts())
data.groupby('Class').size().plot(kind='pie', autopct='%1.1f%%')
plt.show()

# Balance the classes by resampling BREAST and NA to the size of the BOTTLE class
BREAST = data[data["Class"] == "BREAST"]
BOTTLE = data[data["Class"] == "BOTTLE"]
NA = data[data["Class"] == "NA"]

downsample_breast = resample(BREAST, replace=True, n_samples=len(BOTTLE), random_state=42)
downsample_NA = resample(NA, replace=True, n_samples=len(BOTTLE), random_state=42)

data_downsampled = pd.concat([downsample_breast, BOTTLE, downsample_NA])

# Print the distribution of class labels after resampling
print(data_downsampled["Class"].value_counts())
data_downsampled.groupby('Class').size().plot(kind='pie', label="Type", autopct='%1.1f%%')
plt.show()

# Prepare the data for training and testing
Notes = np.array(data_downsampled['text_lemmatized'])
classes = np.array(data_downsampled['Class'])

x_train, x_test, y_train, y_test = train_test_split(Notes, classes, test_size=0.3, random_state=12)

tf_vectorizer = TfidfVectorizer()
x_train_tfidf = tf_vectorizer.fit_transform(x_train)
x_test_tfidf = tf_vectorizer.transform(x_test)
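
# Note: the TF-IDF vectorizer is fit on the training split only, so any
# test-set vocabulary that never appears in training is simply ignored.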

import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

# Convert class labels to numeric values
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Create DMatrix for XGBoost
dtrain = xgb.DMatrix(x_train_tfidf, label=y_train_encoded)
dtest = xgb.DMatrix(x_test_tfidf)
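
# NOTE: dtrain/dtest above are only required by XGBoost's native API; the
# GridSearchCV below works on the sparse TF-IDF matrices directly. A minimal
# sketch of the native-API equivalent (not used further in this script):
#   booster = xgb.train(params, dtrain, num_boost_round=100)  # params defined below
#   native_pred = booster.predict(dtest)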

# Set XGBoost parameters
params = {
'objective': 'multi:softmax',
'num_class': len(label_encoder.classes_),
'seed': 12
}

# Define the parameter grid for hyperparameter tuning
param_grid = {
'max_depth': [3, 6, 9],
'learning_rate': [0.1, 0.01, 0.001],
'gamma': [0, 0.1, 0.2]
}

# Create XGBoost classifier
xgb_classifier = xgb.XGBClassifier(**params)

# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=5)
grid_search.fit(x_train_tfidf, y_train_encoded)

# Get the best model and its hyperparameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Make predictions on the test data using the best model
y_pred_encoded = best_model.predict(x_test_tfidf)
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Evaluate the best model on the test data
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')


print("Accuracy:", accuracy)
print("precision:", precision)
print("recall:", recall)
print("f1:", f1)
85 changes: 85 additions & 0 deletions code/Metadata-2(python).py
@@ -0,0 +1,85 @@
import pandas as pd

# Read the 'metadata_prenatal_400k_v1.csv' file into a DataFrame called 'meta'
meta = pd.read_csv("metadata_prenatal_400k_v1.csv")

# Display the 'meta' DataFrame
print(meta)

# Modify the 'file_name' column in the 'meta' DataFrame to extract the 'note_id'
meta['note_id'] = meta['file_name'].str.replace('_.txt', '')
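
# For example, a file_name such as "note_12345_.txt" (assumed naming pattern)
# would yield a note_id of "note_12345".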

# Display the updated 'meta' DataFrame
print(meta)

# Read the 'mom_notes_prenatal_visit.csv' file into a DataFrame called 'm1'
m1 = pd.read_csv("mom_notes_prenatal_visit.csv")
# Rename the 'deid_note_ID' column to 'note_id'
m1 = m1.rename(columns={'deid_note_ID':'note_id'})

# Display the 'm1' DataFrame
print(m1)

# Merge the 'meta' and 'm1' DataFrames based on the 'note_id' column
m2 = meta.merge(m1, on='note_id', how='left')
# Rename the 'Deidentified_mom_ID' column to 'deidentified_mom_id'
m2 = m2.rename(columns={'Deidentified_mom_ID':'deidentified_mom_id'})

# Display the merged DataFrame 'm2'
print(m2)

# Drop rows with missing values in the 'note_created_datetime' column
m2 = m2.dropna(subset=['note_created_datetime'])

# Display the updated DataFrame 'm2'
print(m2)

# Read the 'baby_mom_at_birth_with_payer.csv' file into a DataFrame called 'mom'
mom = pd.read_csv("baby_mom_at_birth_with_payer.csv")

# Display the 'mom' DataFrame
print(mom)

# Merge the 'mom' and 'm2' DataFrames based on the 'deidentified_mom_id' column
prenatal_final = mom.merge(m2, on='deidentified_mom_id')

# Display the merged DataFrame 'prenatal_final'
print(prenatal_final)

# Drop rows with missing values in the 'date_of_delivery' column
prenatal_final = prenatal_final.dropna(subset=['date_of_delivery'])

# Convert the columns 'date_of_delivery' and 'note_created_datetime' to datetime format
prenatal_final['date_of_delivery'] = pd.to_datetime(prenatal_final['date_of_delivery'], format='%m/%d/%Y %H:%M')
prenatal_final['note_created_datetime'] = pd.to_datetime(prenatal_final['note_created_datetime'])

# Calculate the difference between 'date_of_delivery' and 'note_created_datetime'
prenatal_final['days2birth'] = prenatal_final['date_of_delivery'] - prenatal_final['note_created_datetime']

# Display the updated DataFrame 'prenatal_final'
print(prenatal_final)

# Ensure the 'days2birth' column has Timedelta dtype (it already does after the datetime subtraction)
prenatal_final['days2birth'] = pd.to_timedelta(prenatal_final['days2birth'])

# Create a boolean mask to filter the DataFrame based on the desired range (-280 to 0 days)
mask = (prenatal_final['days2birth'] >= pd.Timedelta(days=-280)) & (prenatal_final['days2birth'] <= pd.Timedelta(days=0))

# Filter the DataFrame based on the boolean mask
df_filtered = prenatal_final[mask]

# Display the filtered DataFrame 'df_filtered'
print(df_filtered)

# Count the number of duplicated note_id values
duplicate_counts = df_filtered['note_id'].duplicated().sum()

# Count the number of unique note_id values
unique_counts = df_filtered['note_id'].nunique()

# Display the counts
print("Duplicated note_id values:", duplicate_counts)
print("Unique note_id values:", unique_counts)

# Write the 'df_filtered' DataFrame to a CSV file named 'prenatal_400k_notes_metadata_v1.csv'
df_filtered.to_csv('prenatal_400k_notes_metadata_v1.csv', index=False)
128 changes: 128 additions & 0 deletions code/metadata_1(slurm-script).r
@@ -0,0 +1,128 @@
# Load required libraries
library(dplyr)
library(tidyverse)
library(stringr)
library(knitr)
library(tibble)
library(readr)
library(quanteda)
# Get the list of file names from a specific directory
files <- list.files(
path = "/blue/djlemas/share/data/MyEHR/10year_dataset/notes/all_notes/prenatal/prenatal_all_notes/",
pattern = ".txt",
all.files = TRUE,
full.names = FALSE
) %>% as_tibble() %>%
mutate(note_number = as.numeric(stringr::str_split(value, "_") %>% map_chr(., 2))) %>%
rename(file_name = value) %>%
add_column(
char_count = NA,
tokens = NA,
sentences = NA,
file_size = NA,
breast = NA,
bottle = NA,
express = NA,
smoking = NA,
suicide = NA,
pain = NA,
Cancer = NA,
diabetes = NA,
hypertension = NA
)

# Set up the file vector, the results table, and the file count for the loop below
file_name_vec <- files$file_name
files_loop <- files %>% filter(file_name %in% file_name_vec)

file_count <- length(file_name_vec)

# Define a dictionary of terms
my_dict <- dictionary(list(
breast = c("breastfeeding", "breast"),
bottle = c("bottle"),
express = c("express", "pump"),
smoking = c("smoking"),
suicide = c("suicide", "self-harm"),
pain = c("pain"),
Cancer = c("cancer", "tumor"),
diabetes = c("Diabetes", "Blood sugar", "insulin"),
hypertension = c("Hypertension", "blood pressure")
))
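
# Note on the dictionary lookup below: dfm() applies the dictionary to unigram
# tokens (case-insensitively), so multi-word entries such as "Blood sugar" and
# "blood pressure" may not be counted as phrases; tokens_lookup() on a tokens
# object is typically needed for multi-word matches.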

# Loop through annotations
for (i in 1:file_count) {

# Get the current file index and its path
file_index <- file_name_vec[i]
file_index_path <- paste0(
"/blue/djlemas/share/data/MyEHR/10year_dataset/notes/all_notes/prenatal/prenatal_all_notes/",
file_index
)
print(file_index)
print(i)

# File size in bytes
file_size <- file.size(file_index_path)

# EXTRACT DATA FROM TEXT FILE

note <- read_file(file = file_index_path) # Read the contents of the file

# character count
char_count <- nchar(note) # Count the number of characters in the note

# quanteda package
corpus <- corpus(note) # build a new corpus from the texts

# check if corpus has content
if (ntoken(corpus) > 0) {
names(corpus) <- file_index # Assign the file index as the name of the corpus
corpus_summary <- summary(corpus) # Calculate summary statistics for the corpus
tokens <- corpus_summary[3] # Get the token count
sentence <- corpus_summary[4] # Get the sentence count

dat1 <- dfm(corpus, dictionary = my_dict) # Create a document-feature matrix with the specified dictionary
dat2 <- as_tibble(dat1) # Convert the document-feature matrix to a tibble

breast <- dat2[2]
bottle <- dat2[3]
express <- dat2[4]
smoking <- dat2[5]
suicide <- dat2[6]
pain <- dat2[7]
Cancer <- dat2[8]
diabetes <- dat2[9]
hypertension <- dat2[10]

# INSERT INTO TABLE
files_loop[i, 3] <- char_count
files_loop[i, 4] <- tokens
files_loop[i, 5] <- sentence
files_loop[i, 6] <- file_size
files_loop[i, 7] <- breast
files_loop[i, 8] <- bottle
files_loop[i, 9] <- express
files_loop[i, 10] <- smoking
files_loop[i, 11] <- suicide
files_loop[i, 12] <- pain
files_loop[i, 13] <- Cancer
files_loop[i, 14] <- diabetes
files_loop[i, 15] <- hypertension
} else {
# skip to next iteration if corpus has no content
next
}

} # end loop

# Copy the loop results for export
files_final <- files_loop

# export file
file_name <- "metadata_prenatal_400k_v1.csv"
data_directory <- "/blue/djlemas/share/data/MyEHR/10year_dataset/notes/all_notes/prenatal/"
data_path <- paste0(data_directory, file_name)

# export
write_csv(files_final, file = data_path, col_names = TRUE)