-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
157 lines (111 loc) · 4.29 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import csv
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# Fetch the NLTK corpora the preprocessing step depends on:
# the English stopword list and the WordNet lemmatizer data.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)
# Module-level singletons: the original rebuilt the stopword set and a new
# WordNetLemmatizer on every call. Building them once avoids re-reading the
# NLTK corpus per invocation. Safe here because the nltk.download() calls
# above run before this line at import time.
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()


def preprocess_text(text):
    """Normalize raw text for bag-of-words vectorization.

    Steps: strip non-letters, lowercase, tokenize on whitespace, drop
    English stopwords, lemmatize each token, and re-join with spaces.

    Args:
        text: Raw input string.

    Returns:
        A cleaned, space-joined string of lemmatized tokens (possibly
        empty if every token was a stopword or non-alphabetic).
    """
    # Keep letters only; replace everything else with a space so adjacent
    # words stay separated after the substitution.
    text = re.sub(r'[^a-zA-Z]', ' ', text).lower()
    words = [
        _LEMMATIZER.lemmatize(word)
        for word in text.split()
        if word not in _STOP_WORDS
    ]
    return ' '.join(words)
def train_model():
    """Train a Multinomial Naive Bayes sadness classifier from sad_texts.csv.

    Returns:
        (model, vectorizer, x_test, y_test, df): the fitted classifier,
        the fitted CountVectorizer, the held-out test split, and the
        cleaned training DataFrame.
    """
    # The CSV has no header row: column 0 is the text, column 1 the label.
    data = pd.read_csv("sad_texts.csv", header=None, names=['text', 'label'])
    # Coerce labels to numbers and drop any row whose label is unparseable.
    data['label'] = pd.to_numeric(data['label'], errors='coerce')
    data = data.dropna(subset=['label'])
    # Run every text through the shared preprocessing pipeline.
    data['text'] = data['text'].apply(preprocess_text)
    # Hold out 20% of the rows for evaluation.
    x_train, x_test, y_train, y_test = train_test_split(
        data['text'], data['label'], test_size=0.2
    )
    # Bag-of-words features, fit on the training split only.
    vectorizer = CountVectorizer()
    train_features = vectorizer.fit_transform(x_train)
    model = MultinomialNB()
    model.fit(train_features, y_train)
    return model, vectorizer, x_test, y_test, data
def predict_sentiment(model, vectorizer, input_text):
    """Classify one text and report per-class probabilities.

    Args:
        model: A fitted classifier exposing predict / predict_proba.
        vectorizer: The fitted CountVectorizer from training time.
        input_text: Raw text entered by the user.

    Returns:
        (label, confidence): the predicted class and the probability
        array in the model's class order.
    """
    # Clean and vectorize with the same pipeline used during training.
    features = vectorizer.transform([preprocess_text(input_text)])
    predicted = model.predict(features)[0]
    probabilities = model.predict_proba(features)[0]
    return predicted, probabilities
def update_dataset(input_text, label):
    """Ask whether the prediction was right; on 'n', append the example
    to sad_texts.csv with the corrected label.

    Args:
        input_text: The raw text the user entered.
        label: The label the model predicted (0 = not sad, 1 = sad).
    """
    add = input("Was this right? (y/n) ")
    if add.lower() == 'n':
        # BUG FIX: the original wrote the model's own (wrong) prediction
        # back to the dataset. The user just said it was wrong, and the
        # task is binary, so the correct label is the opposite class.
        corrected = 1 - int(label)
        # newline='' is the documented way to open files for the csv
        # module; it prevents spurious blank rows on Windows.
        with open("sad_texts.csv", "a", encoding='utf-8', newline='') as f:
            writer = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC, quotechar='"')
            writer.writerow([input_text, corrected])
def plot_class_distribution(df):
    """Bar-plot how many training rows are sad (1) vs not sad (0).

    Args:
        df: DataFrame with a numeric 'label' column.
    """
    counts = df['label'].value_counts()
    # BUG FIX: value_counts() orders by frequency, not by label value, so
    # the original could pair the 'Sad' bar with the not-sad count (and
    # vice versa) whenever class 0 was the majority. Look up each class
    # explicitly; a class absent from the data counts as 0.
    plt.bar(['Sad', 'Not Sad'], [counts.get(1, 0), counts.get(0, 0)])
    plt.title('Distribution of Training Dataset Classes')
    plt.xlabel('Emotion')
    plt.ylabel('Count')
    plt.show()
def plot_sentiment_confidence(confidence):
    """Bar-plot the classifier's probability for each class.

    Args:
        confidence: Probability pair ordered [not sad, sad], as returned
            by predict_proba for classes (0, 1).
    """
    class_names = ['Not Sad', 'Sad']
    plt.bar(class_names, confidence)
    plt.title('Sadness Detection Confidence')
    plt.xlabel('Emotion')
    plt.ylabel('Confidence')
    plt.show()
def generate_word_clouds(df):
    """Render side-by-side word clouds for sad vs not-sad training texts.

    Args:
        df: DataFrame with preprocessed 'text' and numeric 'label' columns.
    """
    # (class value, panel title) for the left and right subplots.
    panels = [
        (1, 'Words Associated with Sad Sentiment'),
        (0, 'Words Associated with Not Sad Sentiment'),
    ]
    plt.figure(figsize=(10, 5))
    for position, (class_value, title) in enumerate(panels, start=1):
        texts = df[df['label'] == class_value]['text']
        cloud = WordCloud().generate(' '.join(texts))
        plt.subplot(1, 2, position)
        plt.imshow(cloud, interpolation='bilinear')
        plt.title(title)
        plt.axis('off')
    plt.show()
def main():
    """Interactive entry point: train the model, classify one user text,
    collect feedback, and show the diagnostic plots."""
    model, vectorizer, x_test, y_test, df = train_model()
    input_text = input("Enter a text: ")
    label, confidence = predict_sentiment(model, vectorizer, input_text)
    verdict = "sad" if label == 1 else "not sad"
    print("The text is classified as:", verdict)
    # Report the probability of the higher-scoring class.
    print("Confidence:", max(confidence[0], confidence[1]))
    # Let the user correct a wrong prediction and grow the dataset.
    update_dataset(input_text, label)
    plot_class_distribution(df)
    plot_sentiment_confidence(confidence)
    generate_word_clouds(df)


if __name__ == "__main__":
    main()