Commit

Updated backend logic
Son Nguyen committed Dec 19, 2023
1 parent dd30551 commit 1ba8100
Showing 4 changed files with 187 additions and 7 deletions.
55 changes: 55 additions & 0 deletions machine-learning/movie-recommendation.py
@@ -0,0 +1,55 @@
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse.linalg import svds

# Load movie data
movies_df = pd.read_csv('movies.csv', usecols=['movieId', 'title'], dtype={'movieId': 'int32', 'title': 'str'})
ratings_df = pd.read_csv('ratings.csv', usecols=['userId', 'movieId', 'rating'], dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

# Preprocessing
# Create a user-movie matrix
user_movie_df = ratings_df.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# Normalize by subtracting each user's mean rating (note: unrated entries were
# filled with 0 above, so they count as ratings of 0 during demeaning)
mean_user_rating = user_movie_df.mean(axis=1)
ratings_demeaned = user_movie_df.sub(mean_user_rating, axis=0)

# Singular Value Decomposition: factorize the demeaned matrix into k=50 latent factors
# (svds expects an array or sparse matrix, so pass the underlying NumPy values)
U, sigma, Vt = svds(ratings_demeaned.values, k=50)
sigma = np.diag(sigma)  # svds returns singular values as a 1-D array; make them diagonal

# Making Predictions: reconstruct the full rating matrix and add each user's mean back
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + mean_user_rating.values.reshape(-1, 1)
preds_df = pd.DataFrame(all_user_predicted_ratings, columns=user_movie_df.columns)
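
# --- Sanity check (sketch, not part of the original commit) ---
# Compare the reconstruction against the known ratings using the
# mean_squared_error import above. This measures reconstruction error on
# observed entries, not generalization; for the latter, hold ratings out
# with train_test_split before running svds.
observed = user_movie_df.values.nonzero()
rmse = np.sqrt(mean_squared_error(user_movie_df.values[observed],
                                  all_user_predicted_ratings[observed]))
print(f"Reconstruction RMSE on observed ratings: {rmse:.3f}")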

# Recommend Movies
def recommend_movies(predictions_df, userID, movies_df, original_ratings_df, num_recommendations=5):
    # Row i of predictions_df corresponds to userId i + 1 (assumes contiguous 1-based user IDs)
    user_row_number = userID - 1
    sorted_user_predictions = predictions_df.iloc[user_row_number].sort_values(ascending=False)

    # Movies the user has already rated, joined with their titles
    user_data = original_ratings_df[original_ratings_df.userId == userID]
    user_full = (user_data.merge(movies_df, how='left', on='movieId')
                 .sort_values(['rating'], ascending=False))

    # Rank the movies the user has NOT rated by predicted rating,
    # then drop the raw prediction column from the output
    recommendations = (movies_df[~movies_df['movieId'].isin(user_full['movieId'])]
                       .merge(pd.DataFrame(sorted_user_predictions).reset_index(),
                              how='left', on='movieId')
                       .rename(columns={user_row_number: 'Predictions'})
                       .sort_values('Predictions', ascending=False)
                       .iloc[:num_recommendations, :-1])

    return user_full, recommendations

# Test the recommendation system for a user
user_id = 1
rated_movies, recommendations = recommend_movies(preds_df, user_id, movies_df, ratings_df, 10)

print("User has already rated these movies:")
print(rated_movies.head(10))
print("\nTop 10 movie recommendations:")
print(recommendations)
67 changes: 67 additions & 0 deletions machine-learning/movie-reviews.py
@@ -0,0 +1,67 @@
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Load dataset
reviews_df = pd.read_csv('movie_reviews.csv', encoding='utf-8')
reviews_df = reviews_df[['review', 'sentiment']] # Assuming 'review' and 'sentiment' columns

# Text preprocessing
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", '', text)  # keep only lowercase letters, digits, and whitespace
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

reviews_df['review'] = reviews_df['review'].apply(clean_text)

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(reviews_df['review'])
sequences = tokenizer.texts_to_sequences(reviews_df['review'])

# Padding sequences
max_len = max([len(x) for x in sequences])
X = pad_sequences(sequences, maxlen=max_len)
y = pd.get_dummies(reviews_df['sentiment']).values
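
# (Sketch, not part of the original commit) pd.get_dummies orders the label
# columns alphabetically, so with labels such as 'negative'/'positive',
# column 1 is 'positive' -- the ordering predict_sentiment below relies on.
print("Label column order:", list(pd.get_dummies(reviews_df['sentiment']).columns))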

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model creation
model = Sequential()
model.add(Embedding(len(tokenizer.word_index) + 1, 100, input_length=max_len))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(y.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Model training
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
model.fit(X_train, y_train, batch_size=128, epochs=10, validation_split=0.1, callbacks=[early_stopping])

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.2f}')

# Predictions
def predict_sentiment(review):
    cleaned_review = clean_text(review)
    sequence = tokenizer.texts_to_sequences([cleaned_review])
    padded_sequence = pad_sequences(sequence, maxlen=max_len)
    prediction = model.predict(padded_sequence)
    # Column 1 is 'positive' given get_dummies' alphabetical label order
    sentiment = 'Positive' if np.argmax(prediction) == 1 else 'Negative'
    return sentiment
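
# (Sketch, not part of the original commit) Example usage with a made-up review:
sample_review = "An absolute triumph, with stunning performances throughout."
print(f"Predicted sentiment: {predict_sentiment(sample_review)}")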
65 changes: 65 additions & 0 deletions machine-learning/plot-summarizer.py
@@ -0,0 +1,65 @@
import torch
import logging
from transformers import BartTokenizer, BartForConditionalGeneration
import streamlit as st

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class MoviePlotSummarizer:
    def __init__(self, model_name='facebook/bart-large-cnn'):
        self.tokenizer = BartTokenizer.from_pretrained(model_name)
        self.model = BartForConditionalGeneration.from_pretrained(model_name)

    # Summarize a movie plot
    def summarize(self, plot_text, max_length=130, min_length=30, style='default'):
        try:
            # Adjust the summary length bounds based on the requested style
            if style == 'verbose':
                max_length *= 2
                min_length *= 2
            elif style == 'concise':
                max_length //= 2
                min_length //= 2

            # Tokenize and generate the summary with beam search
            inputs = self.tokenizer.encode("summarize: " + plot_text, return_tensors="pt", max_length=1024, truncation=True)
            summary_ids = self.model.generate(inputs, max_length=max_length, min_length=min_length, length_penalty=2.0, num_beams=4, early_stopping=True)
            return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        except Exception as e:
            logger.error(f"Error in summarizing plot: {e}")
            return "Error in summarization process."

# Streamlit UI
def main():
    st.title("Movie Plot Summarizer")
    st.write("Enter a movie plot to get a summarized version.")

    plot_text = st.text_area("Movie Plot", height=250)
    max_length = st.slider("Max Summary Length", 30, 300, 130)
    min_length = st.slider("Min Summary Length", 10, 150, 30)
    style = st.selectbox("Summarization Style", ['default', 'verbose', 'concise'])

    if st.button("Summarize"):
        summarizer = MoviePlotSummarizer()
        summary = summarizer.summarize(plot_text, max_length=max_length, min_length=min_length, style=style)
        st.subheader("Summarized Plot")
        st.write(summary)

    if st.button("About"):
        st.subheader("About")
        st.write("This is a simple movie plot summarizer built with the Hugging Face Transformers library.")
        st.write("It uses the facebook/bart-large-cnn checkpoint, a BART model fine-tuned for summarization on the CNN/Daily Mail dataset of news articles and their highlight summaries.")

if __name__ == "__main__":
    main()
7 changes: 0 additions & 7 deletions script.js
@@ -1138,13 +1138,6 @@ document.getElementById('side-nav').addEventListener('mouseleave', function() {
}
});

document.addEventListener('click', function () {
    const sideNav = document.getElementById('side-nav');
    if (!sideNav.classList.contains('manual-toggle')) {
        sideNav.style.left = '-250px';
    }
})

function toggleNav() {
    const sideNav = document.getElementById('side-nav');
    sideNav.classList.toggle('manual-toggle');
