Commit 1ba8100
Son Nguyen committed Dec 19, 2023 (1 parent: dd30551)
Showing 4 changed files with 187 additions and 7 deletions.
@@ -0,0 +1,55 @@
import pandas as pd
import numpy as np
from scipy.sparse.linalg import svds

# Load movie data
movies_df = pd.read_csv('movies.csv', usecols=['movieId', 'title'], dtype={'movieId': 'int32', 'title': 'str'})
ratings_df = pd.read_csv('ratings.csv', usecols=['userId', 'movieId', 'rating'], dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

# Preprocessing
# Create a user-movie matrix (rows: users, columns: movies, unrated cells filled with 0)
user_movie_df = ratings_df.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# Normalize the data by subtracting each user's mean rating
mean_user_rating = user_movie_df.mean(axis=1)
ratings_demeaned = user_movie_df.sub(mean_user_rating, axis=0)

# Singular Value Decomposition (svds expects an array, so pass .values, not the DataFrame)
U, sigma, Vt = svds(ratings_demeaned.values, k=50)
sigma = np.diag(sigma)

# Making predictions: reconstruct the rank-50 rating matrix and add the user means back
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + mean_user_rating.values.reshape(-1, 1)
preds_df = pd.DataFrame(all_user_predicted_ratings, columns=user_movie_df.columns)

# Recommend movies the user has not yet rated, ranked by predicted rating
def recommend_movies(predictions_df, userID, movies_df, original_ratings_df, num_recommendations=5):
    user_row_number = userID - 1  # userIds start at 1; prediction rows start at 0
    sorted_user_predictions = predictions_df.iloc[user_row_number].sort_values(ascending=False)

    user_data = original_ratings_df[original_ratings_df.userId == userID]
    user_full = (user_data.merge(movies_df, how='left', on='movieId')
                 .sort_values(['rating'], ascending=False))

    recommendations = (movies_df[~movies_df['movieId'].isin(user_full['movieId'])]
                       .merge(pd.DataFrame(sorted_user_predictions).reset_index(),
                              how='left', on='movieId')
                       .rename(columns={user_row_number: 'Predictions'})
                       .sort_values('Predictions', ascending=False)
                       .iloc[:num_recommendations, :-1])

    return user_full, recommendations

# Test the recommendation system for a user
user_id = 1
rated_movies, recommendations = recommend_movies(preds_df, user_id, movies_df, ratings_df, 10)

print("User has already rated these movies:")
print(rated_movies.head(10))
print("\nTop 10 movie recommendations:")
print(recommendations)
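
The core of this script is the truncated SVD step: svds keeps only the k largest singular values, so U @ diag(sigma) @ Vt is the best rank-k approximation of the demeaned matrix, and adding each user's mean back turns that reconstruction into predicted ratings. Below is a minimal, self-contained sketch of the same step on a tiny synthetic matrix; the values and names are illustrative only, not part of the commit.

import numpy as np
from scipy.sparse.linalg import svds

# Toy 4x4 user-movie matrix; zeros stand for unrated movies
R = np.array([[5.0, 3.0, 0.0, 1.0],
              [4.0, 0.0, 0.0, 1.0],
              [1.0, 1.0, 0.0, 5.0],
              [1.0, 0.0, 0.0, 4.0]])
user_means = R.mean(axis=1, keepdims=True)
U, s, Vt = svds(R - user_means, k=2)      # k must be smaller than min(R.shape)
R_hat = U @ np.diag(s) @ Vt + user_means  # rank-2 reconstruction plus the means
print(np.round(R_hat, 2))                 # formerly-zero cells now hold predicted scores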
@@ -0,0 +1,67 @@
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Load dataset
reviews_df = pd.read_csv('movie_reviews.csv', encoding='utf-8')
reviews_df = reviews_df[['review', 'sentiment']]  # Assuming 'review' and 'sentiment' columns

# Text preprocessing
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", '', text)  # keep only lowercase letters, digits, whitespace
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

reviews_df['review'] = reviews_df['review'].apply(clean_text)

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(reviews_df['review'])
sequences = tokenizer.texts_to_sequences(reviews_df['review'])

# Padding sequences
max_len = max(len(x) for x in sequences)
X = pad_sequences(sequences, maxlen=max_len)
# One-hot labels; for binary 'negative'/'positive' sentiments, get_dummies orders
# columns alphabetically, so 'positive' ends up at index 1
y = pd.get_dummies(reviews_df['sentiment']).values.astype('float32')

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model creation
model = Sequential()
model.add(Embedding(len(tokenizer.word_index) + 1, 100, input_length=max_len))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(y.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Model training
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
model.fit(X_train, y_train, batch_size=128, epochs=10, validation_split=0.1, callbacks=[early_stopping])

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.2f}')

# Predictions
def predict_sentiment(review):
    cleaned_review = clean_text(review)
    sequence = tokenizer.texts_to_sequences([cleaned_review])
    padded_sequence = pad_sequences(sequence, maxlen=max_len)
    prediction = model.predict(padded_sequence)
    # Index 1 is 'positive' under the alphabetical get_dummies ordering above
    sentiment = 'Positive' if np.argmax(prediction) == 1 else 'Negative'
    return sentiment
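
A detail worth seeing concretely is what Tokenizer and pad_sequences hand to the LSTM: integer word indices, zero-padded (on the left, by default) to a common length. A small self-contained sketch with toy reviews, illustrative only:

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

toy_reviews = ["great movie", "terrible plot twist", "great plot"]
tok = Tokenizer()
tok.fit_on_texts(toy_reviews)               # indices assigned by descending word frequency
seqs = tok.texts_to_sequences(toy_reviews)  # [[1, 3], [4, 2, 5], [1, 2]]
padded = pad_sequences(seqs, maxlen=3)      # rows padded with 0 on the left
print(padded.shape)                         # (3, 3), ready for the Embedding layer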
@@ -0,0 +1,65 @@
import logging
from transformers import BartTokenizer, BartForConditionalGeneration
import streamlit as st

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class MoviePlotSummarizer:
    def __init__(self, model_name='facebook/bart-large-cnn'):
        self.tokenizer = BartTokenizer.from_pretrained(model_name)
        self.model = BartForConditionalGeneration.from_pretrained(model_name)

    # Summarize a movie plot
    def summarize(self, plot_text, max_length=130, min_length=30, style='default'):
        try:
            # Adjust the summary length bounds to match the requested style
            if style == 'verbose':
                max_length *= 2
                min_length *= 2
            elif style == 'concise':
                max_length //= 2
                min_length //= 2

            # Tokenize and generate the summary. BART takes the raw text directly;
            # the "summarize:" task prefix is a T5 convention and is not needed here.
            inputs = self.tokenizer.encode(plot_text, return_tensors="pt", max_length=1024, truncation=True)
            summary_ids = self.model.generate(inputs, max_length=max_length, min_length=min_length, length_penalty=2.0, num_beams=4, early_stopping=True)
            return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        except Exception as e:
            logger.error(f"Error in summarizing plot: {e}")
            return "Error in summarization process."

# Cache the summarizer so the model is loaded once, not on every Streamlit rerun
@st.cache_resource
def get_summarizer():
    return MoviePlotSummarizer()

# Streamlit UI
def main():
    st.title("Movie Plot Summarizer")
    st.write("Enter a movie plot to get a summarized version.")

    plot_text = st.text_area("Movie Plot", height=250)
    max_length = st.slider("Max Summary Length", 30, 300, 130)
    min_length = st.slider("Min Summary Length", 10, 150, 30)
    style = st.selectbox("Summarization Style", ['default', 'verbose', 'concise'])

    if st.button("Summarize"):
        summarizer = get_summarizer()
        summary = summarizer.summarize(plot_text, max_length=max_length, min_length=min_length, style=style)
        st.subheader("Summarized Plot")
        st.write(summary)

    if st.button("About"):
        st.subheader("About")
        st.write("This is a simple movie plot summarizer built using the HuggingFace Transformers library. It uses the facebook/bart-large-cnn checkpoint, a BART model fine-tuned on the CNN/Daily Mail summarization dataset, to generate the summaries.")

if __name__ == "__main__":
    main()
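
The class is also usable outside the Streamlit UI. A minimal standalone sketch, assuming MoviePlotSummarizer from the file above is in scope; the plot string is illustrative, and the first call downloads the pretrained facebook/bart-large-cnn weights:

# Standalone usage sketch; no Streamlit required
summarizer = MoviePlotSummarizer()
plot = ("A computer hacker learns from mysterious rebels about the true "
        "nature of his reality and his role in the war against its controllers.")
print(summarizer.summarize(plot, max_length=60, min_length=15))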