Commit
Book recommender

* imports
* imports
* restructure
* readme
* remove genres
* fixes
* docstrings
* generation script
* assets
* readme
* readme
* minor fixes
* minor
* remove data files
* lock
* fixed command
* readme
1 parent 176ad67 · commit 5afc945
Showing 6 changed files with 438 additions and 0 deletions.
@@ -0,0 +1,37 @@
# Book Recommender

A chat assistant that recommends books to the user based on their input.

## Set key

To run this example, you need to set the `OPENAI_API_KEY` environment variable:

```bash
export OPENAI_API_KEY=<your_api_key>
```

## Dataset

Download the [dataset](https://www.kaggle.com/datasets/cristaliss/ultimate-book-collection-top-100-books-up-to-2023) into the `book-recommender/` folder and rename it to `goodreads.csv`.
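The scripts in this example rely on the dataset's `title`, `authors`, and `description` columns. As a quick, optional sanity check after downloading (a minimal sketch, assuming the CSV keeps those column names):

```python
import pandas as pd

# Confirm the columns the generation script and the app rely on are present.
df = pd.read_csv("goodreads.csv")
assert {"title", "authors", "description"}.issubset(df.columns)
print(f"Loaded {len(df)} rows")
```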
## Generate lookup files

Generate the lookup files by running the following script:

```bash
python generate_assets.py --embeddings --verbose
```

Running this command should generate the `author_to_title.json`, `title_to_description.json`, and `embeddings.json` files in the `assets/` folder.
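The lookup files are plain JSON: `author_to_title.json` maps an upper-cased author name to a list of titles, and `title_to_description.json` maps an upper-cased title to its description(s), stored as a list by the generation script. A small optional way to inspect them (the author name below is only an example):

```python
import json
from pathlib import Path

# Keys are upper-cased by generate_assets.py; values are lists built with groupby(...).apply(list).
with open(Path("assets", "author_to_title.json")) as f:
    author_to_title = json.load(f)

print(author_to_title.get("DAN BROWN", [])[:3])  # a few titles by that author, if present
```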
If you want to generate embeddings for the first N rows only, run the command below (here N=100):

```bash
python generate_assets.py -n 100 --embeddings --verbose
```
## Deployment

Create a zip of `app.py`, `util.py`, `requirements.txt`, and the `assets/` folder, and follow the instructions for deploying a [Panel](https://docs.cloud.ploomber.io/en/latest/apps/panel.html) application.
You also need to set `OPENAI_API_KEY` as an [environment variable](https://docs.cloud.ploomber.io/en/latest/user-guide/env-vars.html) when deploying the application.
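For example, a minimal sketch of building the archive with Python's standard library (any zip tool works just as well; the archive name is arbitrary):

```python
import zipfile
from pathlib import Path

# Bundle the app files plus the generated assets/ folder for deployment.
with zipfile.ZipFile("book-recommender.zip", "w") as zf:
    for name in ("app.py", "util.py", "requirements.txt"):
        zf.write(name)
    for path in Path("assets").rglob("*"):
        zf.write(path)
```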
@@ -0,0 +1,164 @@
"""
Chat application for recommending books to the user.

Input:
    Users can submit queries that describe the type of books they are looking for,
    e.g., suggest fiction novels. Users can also ask the chat assistant for books by
    a specific author, e.g., recommend books by Dan Brown. Answers to users' queries
    are based on the Goodreads dataset:
    https://www.kaggle.com/datasets/cristaliss/ultimate-book-collection-top-100-books-up-to-2023

Application logic:
    The app determines the closest matches by comparing the user query's embedding to the
    available book embeddings. Book embeddings are pre-computed on the description column
    of every book and stored in the assets/ folder.

Response:
    The chat assistant takes the top matches shortlisted by comparing embeddings and
    provides up to five recommendations.
"""

import json
from pathlib import Path

import numpy as np
import panel as pn
from openai import OpenAI
from scipy.spatial import KDTree

from util import get_embedding_from_text

with open(Path("assets", "title_to_description.json"), "r") as file:
    DESCRIPTIONS = json.load(file)

with open(Path("assets", "author_to_title.json"), "r") as file:
    AUTHOR_TITLES = json.load(file)


def load_embeddings_file():
    """Load the pre-computed embeddings of the description column.

    The data is in the format {title: embedding}.
    """
    file_path = Path("assets", "embeddings.json")
    with open(file_path, "r", encoding="utf-8") as file:
        embeddings_json = json.load(file)
    return embeddings_json


client = OpenAI()

pn.extension()


def get_book_description_by_title(title):
    """Return the description of a book."""
    return DESCRIPTIONS[title.upper()]


def detect_author(user_query):
    """Extract the author mentioned in the user query, if any."""
    system_prompt = """
    You're a system that determines the author in the user query.
    You need to return only the author name. Please fix any typos if possible.
    """
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            # One-shot example showing the expected output format
            {"role": "user", "content": "What are some books by Sandra Boynton"},
            {"role": "assistant", "content": "Sandra Boynton"},
            {"role": "user", "content": user_query},
        ],
        seed=42,
        n=1,
    )
    author = response.choices[0].message.content.upper()
    return author if author in AUTHOR_TITLES else ""


def book_recommender_agent(user_query, verbose=False):
    """An agent that recommends books to the user based on their input."""
    embeddings_json = load_embeddings_file()

    # If an author is mentioned, filter books written by that author.
    # Otherwise, consider all the available books.
    author = detect_author(user_query)
    titles = []
    if author:
        titles = AUTHOR_TITLES[author]
        if verbose:
            print(f"Found these titles: {titles} by author: {author}")

    filtered_embeddings_by_title = {}
    for title in titles:
        title_embedding = embeddings_json.get(title, None)
        if title_embedding:
            filtered_embeddings_by_title[title] = title_embedding
    if filtered_embeddings_by_title:
        embeddings_json = filtered_embeddings_by_title

    # Build a KD-tree over the candidate embeddings and find the titles
    # (at most five) closest to the query embedding.
    titles = []
    embeddings = []
    for key, value in embeddings_json.items():
        if value:
            titles.append(key)
            embeddings.append(value)
    kdtree = KDTree(np.array(embeddings))
    _, indexes = kdtree.query(get_embedding_from_text(user_query), k=min(len(titles), 5))

    # When k == 1, KDTree.query returns a scalar index instead of an array.
    if isinstance(indexes, np.integer):
        indexes = [indexes]
    # Skip placeholder "null" titles that come from missing values in the dataset.
    titles_relevant = [titles[i] for i in indexes if titles[i] != "null"]
    if verbose:
        print(f"Found these relevant titles: {titles_relevant}")
    descriptions_relevant = [get_book_description_by_title(title) for title in titles_relevant]

    recommendation_text = ""
    for i, value in enumerate(titles_relevant):
        recommendation_text = f"{recommendation_text}{value}: {descriptions_relevant[i]}\n\n##"

    system_prompt = f"""
    You are a helpful book recommendation system that recommends books to users based on their inputs.
    Here are the top relevant titles and descriptions (separated by ##) in the format title: description.
    Use these to generate your answer and disregard books that are not relevant to the user's input:
    {recommendation_text}
    You should also create a summary of each description and format the answer properly.
    You can display a maximum of 5 recommendations.
    Please do not suggest any books outside this list.
    """

    if verbose:
        print(f"System prompt: {system_prompt}")

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_query},
        ],
        seed=42,
        n=1,
    )

    return response.choices[0].message.content


def callback(contents: str, user: str, instance: pn.chat.ChatInterface):
    return book_recommender_agent(contents)


chat_interface = pn.chat.ChatInterface(callback=callback)
chat_interface.send(
    "I am a book recommendation engine! "
    "You may ask questions like: \n* Recommend books by Dan Brown.\n"
    "* Suggest some books based in the Victorian era.\n\n"
    "You can deploy your own by signing up at https://ploomber.io",
    user="System",
    respond=False,
)

pn.template.MaterialTemplate(
    title="Book Recommender",
    main=[chat_interface],
).servable()
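Both `app.py` and `generate_assets.py` import `get_embedding_from_text` from `util.py`, which is not included in this excerpt. A plausible sketch of such a helper, assuming it wraps OpenAI's embeddings endpoint (the model name here is an assumption, not taken from the commit):

```python
# util.py is not part of this excerpt; this is an illustrative sketch only.
from openai import OpenAI

client = OpenAI()


def get_embedding_from_text(text, model="text-embedding-ada-002"):
    """Return the embedding vector for a piece of text."""
    # The embeddings endpoint accepts a list of inputs and returns one vector per item.
    return client.embeddings.create(input=[text], model=model).data[0].embedding
```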
@@ -0,0 +1,66 @@
import json
import argparse
from pathlib import Path

import pandas as pd
from openai import OpenAI

from util import get_embedding_from_text

# Download the dataset from:
# https://www.kaggle.com/datasets/cristaliss/ultimate-book-collection-top-100-books-up-to-2023
df = pd.read_csv(Path("goodreads.csv"))

client = OpenAI()


def generate_embeddings(rows, verbose=False):
    """Generate an embeddings.json file that contains the mapping
    {title: embedding}, where embeddings are computed on the
    description column of the goodreads.csv file.
    """
    final_df = df.head(rows) if rows else df
    embeddings_json = {}
    if verbose and rows:
        print(f"Generating embeddings for {rows} rows")
    for index, row in final_df.iterrows():
        if verbose:
            print(f"Row number: {index}")
        embeddings_json[row["title"]] = get_embedding_from_text(row["description"])
    path = Path("assets", "embeddings.json")
    with open(path, "w") as f:
        json.dump(embeddings_json, f)
    if verbose:
        print(f"Generated embeddings of the description column at '{path}'")


def generate_lookup(verbose=False):
    """Generate mappings between columns for faster lookup."""
    # Keys are upper-cased so lookups in the app are case-insensitive;
    # values are lists because several rows can share the same key.
    author_to_title = df.groupby(df["authors"].str.upper())["title"].apply(list).to_dict()
    path = Path("assets", "author_to_title.json")
    with open(path, "w") as f:
        json.dump(author_to_title, f)
    if verbose:
        print(f"Generated author to title mappings at '{path}'")
    title_to_description = df.groupby(df["title"].str.upper())["description"].apply(list).to_dict()
    path = Path("assets", "title_to_description.json")
    with open(path, "w") as f:
        json.dump(title_to_description, f)
    if verbose:
        print(f"Generated title to description mappings at '{path}'")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--verbose", action="store_true", help="Print progress information")
    parser.add_argument("--embeddings", action="store_true", help="Generate embeddings")
    parser.add_argument("-n", "--rows", type=int, required=False,
                        help="Number of rows to limit embedding generation to")
    args = parser.parse_args()
    Path("assets").mkdir(exist_ok=True)
    generate_lookup(args.verbose)
    if args.embeddings:
        generate_embeddings(args.rows, args.verbose)