Add files via upload #5

Open · wants to merge 4 commits into base: master
149 changes: 149 additions & 0 deletions code/Machine_learning_infantfeeding.py
@@ -0,0 +1,149 @@
#!/usr/bin/env python
# coding: utf-8

# Import necessary libraries
import csv
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import resample
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             classification_report, confusion_matrix, roc_auc_score)
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Download the NLTK resources used below
nltk.download('stopwords')
nltk.download('wordnet')  # required by WordNetLemmatizer

# Set plotting styles
sns.set_style("darkgrid")
sns.set_context("poster")
plt.rcParams["figure.figsize"] = [8, 6]


# Read the data from CSV file
data = pd.read_csv("Allnoted_Nan_class.csv")

# Remove unwanted projects from the data
data = data[~data['teamtat_project'].isin(['project_6', 'project_7', 'project_8'])]

# Lowercase all words in the 'note_text' column
data['note_text'] = data['note_text'].str.lower()

# Keep only purely alphanumeric tokens in the 'note_text' column (this also drops any token containing punctuation)
data['note_text'] = data['note_text'].apply(lambda x: ' '.join([word for word in x.split() if word.isalnum()]))

# Remove numerical values from the 'note_text' column
data['note_text'] = data['note_text'].apply(lambda x: ' '.join([word for word in x.split() if not word.isdigit()]))

# Remove stopwords from the 'note_text' column
stop_words = set(stopwords.words('english'))
data['note_text'] = data['note_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# Lemmatize the words in the 'note_text' column
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
return ' '.join([lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)])

data['text_lemmatized'] = data.note_text.apply(lemmatize_text)
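
# For example, lemmatize_text("babies drank from bottles") returns
# "baby drank from bottle" (WordNetLemmatizer defaults to noun lemmas,
# so verb forms such as "drank" are left unchanged).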

# Replace class labels with appropriate values
data['Class'] = data['Class'].replace({'EXPRESS/PUMP': 'BREAST', 'MIXED': 'BOTTLE', 'nan-feeding-related': 'NA', 'nan-related': 'NA', 'nan-not-related': 'NA'})

# Plot the distribution of class labels
print(data["Class"].value_counts())
data.groupby('Class').size().plot(kind='pie', autopct='%1.1f%%')
plt.show()

# Balance the classes by resampling BREAST and NA to the size of the BOTTLE class
BREAST = data[data["Class"] == "BREAST"]
BOTTLE = data[data["Class"] == "BOTTLE"]
NA = data[data["Class"] == "NA"]

downsample_breast = resample(BREAST, replace=True, n_samples=len(BOTTLE), random_state=42)
downsample_NA = resample(NA, replace=True, n_samples=len(BOTTLE), random_state=42)

data_downsampled = pd.concat([downsample_breast, BOTTLE, downsample_NA])

# Print the distribution of class labels after resampling
print(data_downsampled["Class"].value_counts())
data_downsampled.groupby('Class').size().plot(kind='pie', label="Type", autopct='%1.1f%%')
plt.show()

# Prepare the data for training and testing
Notes = np.array(data_downsampled['text_lemmatized'])
classes = np.array(data_downsampled['Class'])

x_train, x_test, y_train, y_test = train_test_split(Notes, classes, test_size=0.3, random_state=12)

tf_vectorizer = TfidfVectorizer()
x_train_tfidf = tf_vectorizer.fit_transform(x_train)
x_test_tfidf = tf_vectorizer.transform(x_test)
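
# Note: the TF-IDF vectorizer is fit on the training split only, so any
# test-set vocabulary that never appears in training is simply ignored.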

import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

# Convert class labels to numeric values
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Create DMatrix for XGBoost
dtrain = xgb.DMatrix(x_train_tfidf, label=y_train_encoded)
dtest = xgb.DMatrix(x_test_tfidf)
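
# NOTE: dtrain/dtest above are only required by XGBoost's native API; the
# GridSearchCV below works on the sparse TF-IDF matrices directly. A minimal
# sketch of the native-API equivalent (not used further in this script):
#   booster = xgb.train(params, dtrain, num_boost_round=100)  # params defined below
#   native_pred = booster.predict(dtest)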

# Set XGBoost parameters
params = {
'objective': 'multi:softmax',
'num_class': len(label_encoder.classes_),
'seed': 12
}

# Define the parameter grid for hyperparameter tuning
param_grid = {
'max_depth': [3, 6, 9],
'learning_rate': [0.1, 0.01, 0.001],
'gamma': [0, 0.1, 0.2]
}

# Create XGBoost classifier
xgb_classifier = xgb.XGBClassifier(**params)

# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=5)
grid_search.fit(x_train_tfidf, y_train_encoded)

# Get the best model and its hyperparameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Make predictions on the test data using the best model
y_pred_encoded = best_model.predict(x_test_tfidf)
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Evaluate the best model on the test data
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')


print("Accuracy:", accuracy)
print("precision:", precision)
print("recall:", recall)
print("f1:", f1)
85 changes: 85 additions & 0 deletions code/Metadata-2(python).py
@@ -0,0 +1,85 @@
import pandas as pd

# Read the 'metadata_prenatal_400k_v1.csv' file into a DataFrame called 'meta'
meta = pd.read_csv("metadata_prenatal_400k_v1.csv")

# Display the 'meta' DataFrame
print(meta)

# Modify the 'file_name' column in the 'meta' DataFrame to extract the 'note_id'
meta['note_id'] = meta['file_name'].str.replace('_.txt', '')
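
# For example, a file_name such as "note_12345_.txt" (assumed naming pattern)
# would yield a note_id of "note_12345".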

# Display the updated 'meta' DataFrame
print(meta)

# Read the 'mom_notes_prenatal_visit.csv' file into a DataFrame called 'm1'
m1 = pd.read_csv("mom_notes_prenatal_visit.csv")
# Rename the 'deid_note_ID' column to 'note_id'
m1 = m1.rename(columns={'deid_note_ID':'note_id'})

# Display the 'm1' DataFrame
print(m1)

# Merge the 'meta' and 'm1' DataFrames based on the 'note_id' column
m2 = meta.merge(m1, on='note_id', how='left')
# Rename the 'Deidentified_mom_ID' column to 'deidentified_mom_id'
m2 = m2.rename(columns={'Deidentified_mom_ID':'deidentified_mom_id'})

# Display the merged DataFrame 'm2'
print(m2)

# Drop rows with missing values in the 'note_created_datetime' column
m2 = m2.dropna(subset=['note_created_datetime'])

# Display the updated DataFrame 'm2'
print(m2)

# Read the 'baby_mom_at_birth_with_payer.csv' file into a DataFrame called 'mom'
mom = pd.read_csv("baby_mom_at_birth_with_payer.csv")

# Display the 'mom' DataFrame
print(mom)

# Merge the 'mom' and 'm2' DataFrames based on the 'deidentified_mom_id' column
prenatal_final = mom.merge(m2, on='deidentified_mom_id')

# Display the merged DataFrame 'prenatal_final'
print(prenatal_final)

# Drop rows with missing values in the 'date_of_delivery' column
prenatal_final = prenatal_final.dropna(subset=['date_of_delivery'])

# Convert the columns 'date_of_delivery' and 'note_created_datetime' to datetime format
prenatal_final['date_of_delivery'] = pd.to_datetime(prenatal_final['date_of_delivery'], format='%m/%d/%Y %H:%M')
prenatal_final['note_created_datetime'] = pd.to_datetime(prenatal_final['note_created_datetime'])

# Calculate the difference between 'date_of_delivery' and 'note_created_datetime'
prenatal_final['days2birth'] = prenatal_final['date_of_delivery'] - prenatal_final['note_created_datetime']

# Display the updated DataFrame 'prenatal_final'
print(prenatal_final)

# Ensure the 'days2birth' column has Timedelta dtype (it already does after the datetime subtraction)
prenatal_final['days2birth'] = pd.to_timedelta(prenatal_final['days2birth'])

# Create a boolean mask to filter the DataFrame based on the desired range (-280 to 0 days)
mask = (prenatal_final['days2birth'] >= pd.Timedelta(days=-280)) & (prenatal_final['days2birth'] <= pd.Timedelta(days=0))

# Filter the DataFrame based on the boolean mask
df_filtered = prenatal_final[mask]

# Display the filtered DataFrame 'df_filtered'
print(df_filtered)

# Count the number of duplicated note_id values
duplicate_counts = df_filtered['note_id'].duplicated().sum()

# Count the number of unique note_id values
unique_counts = df_filtered['note_id'].nunique()

# Display the counts
print("Duplicated note_id values:", duplicate_counts)
print("Unique note_id values:", unique_counts)

# Write the 'df_filtered' DataFrame to a CSV file named 'prenatal_400k_notes_metadata_v1.csv'
df_filtered.to_csv('prenatal_400k_notes_metadata_v1.csv', index=False)
128 changes: 128 additions & 0 deletions code/metadata_1(slurm-script).r
@@ -0,0 +1,128 @@
# Load required libraries
library(dplyr)
library(tidyverse)
library(stringr)
library(knitr)
library(tibble)
library(readr)
library(quanteda)
# Get the list of file names from a specific directory
files <- list.files(
path = "/blue/djlemas/share/data/MyEHR/10year_dataset/notes/all_notes/prenatal/prenatal_all_notes/",
pattern = ".txt",
all.files = TRUE,
full.names = FALSE
) %>% as_tibble() %>%
mutate(note_number = as.numeric(stringr::str_split(value, "_") %>% map_chr(., 2))) %>%
rename(file_name = value) %>%
add_column(
char_count = NA,
tokens = NA,
sentences = NA,
file_size = NA,
breast = NA,
bottle = NA,
express = NA,
smoking = NA,
suicide = NA,
pain = NA,
Cancer = NA,
diabetes = NA,
hypertension = NA
)

# Set up the file vector, the results table, and the file count for the loop below
file_name_vec <- files$file_name
files_loop <- files %>% filter(file_name %in% file_name_vec)

file_count <- length(file_name_vec)

# Define a dictionary of terms
my_dict <- dictionary(list(
breast = c("breastfeeding", "breast"),
bottle = c("bottle"),
express = c("express", "pump"),
smoking = c("smoking"),
suicide = c("suicide", "self-harm"),
pain = c("pain"),
Cancer = c("cancer", "tumor"),
diabetes = c("Diabetes", "Blood sugar", "insulin"),
hypertension = c("Hypertension", "blood pressure")
))
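
# Note on the dictionary lookup below: dfm() applies the dictionary to unigram
# tokens (case-insensitively), so multi-word entries such as "Blood sugar" and
# "blood pressure" may not be counted as phrases; tokens_lookup() on a tokens
# object is typically needed for multi-word matches.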

# Loop through annotations
for (i in 1:file_count) {

# Get the current file index and its path
file_index <- file_name_vec[i]
file_index_path <- paste0(
"/blue/djlemas/share/data/MyEHR/10year_dataset/notes/all_notes/prenatal/prenatal_all_notes/",
file_index
)
print(file_index)
print(i)

# File size in bytes
file_size <- file.size(file_index_path)

# EXTRACT DATA FROM TEXT FILE

note <- read_file(file = file_index_path) # Read the contents of the file

# character count
char_count <- nchar(note) # Count the number of characters in the note

# quanteda package
corpus <- corpus(note) # build a new corpus from the texts

# check if corpus has content
if (ntoken(corpus) > 0) {
names(corpus) <- file_index # Assign the file index as the name of the corpus
corpus_summary <- summary(corpus) # Calculate summary statistics for the corpus
tokens <- corpus_summary[3] # Get the token count
sentence <- corpus_summary[4] # Get the sentence count

dat1 <- dfm(corpus, dictionary = my_dict) # Create a document-feature matrix with the specified dictionary
dat2 <- as_tibble(dat1) # Convert the document-feature matrix to a tibble

breast <- dat2[2]
bottle <- dat2[3]
express <- dat2[4]
smoking <- dat2[5]
suicide <- dat2[6]
pain <- dat2[7]
Cancer <- dat2[8]
diabetes <- dat2[9]
hypertension <- dat2[10]

# INSERT INTO TABLE
files_loop[i, 3] <- char_count
files_loop[i, 4] <- tokens
files_loop[i, 5] <- sentence
files_loop[i, 6] <- file_size
files_loop[i, 7] <- breast
files_loop[i, 8] <- bottle
files_loop[i, 9] <- express
files_loop[i, 10] <- smoking
files_loop[i, 11] <- suicide
files_loop[i, 12] <- pain
files_loop[i, 13] <- Cancer
files_loop[i, 14] <- diabetes
files_loop[i, 15] <- hypertension
} else {
# skip to next iteration if corpus has no content
next
}

} # end loop

# Copy the loop results for export
files_final <- files_loop

# export file
file_name <- "metadata_prenatal_400k_v1.csv"
data_directory <- "/blue/djlemas/share/data/MyEHR/10year_dataset/notes/all_notes/prenatal/"
data_path <- paste0(data_directory, file_name)

# export
write_csv(files_final, file = data_path, col_names = TRUE)